Class: MicroSpider
- Inherits: Object (inheritance chain: Object → MicroSpider)
- Includes:
- Capybara::DSL, SpiderCore::Behavior, SpiderCore::FieldDSL, SpiderCore::FollowDSL, SpiderCore::PaginationDSL
- Defined in:
- lib/micro_spider.rb
Instance Attribute Summary collapse
-
#actions ⇒ Object
Returns the value of attribute actions.
-
#broken_paths ⇒ Object
readonly
Returns the value of attribute broken_paths.
-
#current_location ⇒ Object
readonly
Returns the value of attribute current_location.
-
#delay ⇒ Object
Returns the value of attribute delay.
-
#excretion ⇒ Object
readonly
Returns the value of attribute excretion.
-
#logger ⇒ Object
Returns the value of attribute logger.
-
#paths ⇒ Object
readonly
Returns the value of attribute paths.
-
#recipe ⇒ Object
Returns the value of attribute recipe.
-
#skip_set_entrance ⇒ Object
Returns the value of attribute skip_set_entrance.
-
#timeout ⇒ Object
Returns the value of attribute timeout.
-
#visited_paths ⇒ Object
readonly
Returns the value of attribute visited_paths.
Attributes included from SpiderCore::PaginationDSL
Attributes included from SpiderCore::FollowDSL
Instance Method Summary collapse
-
#click(locator, opts = {}, &block) ⇒ Object
Click the locator.
- #completed? ⇒ Boolean
- #crawl(&block) ⇒ Object
-
#create_action(name, &block) ⇒ Object
The spider can define custom actions while it is crawling.
-
#entrance(*path_or_paths) ⇒ Object
This will be the first path for spider to visit.
-
#entrance_on_path(path, pattern, opts = {}, &block) ⇒ Object
Sometimes the entrances are on the page.
- #execute_actions ⇒ Object
-
#initialize(excretion = nil) ⇒ MicroSpider
constructor
A new instance of MicroSpider.
-
#learn(recipe = nil, &block) ⇒ Object
Teach the spider behaviors and it will repeat to the end.
- #metaclass ⇒ Object
- #reset ⇒ Object
- #results ⇒ Object
-
#set(name, value, opts = {}, &block) ⇒ Object
Set a variable.
- #site(url) ⇒ Object
- #spawn ⇒ Object
-
#visit(path) ⇒ Object
Visit the path.
Methods included from SpiderCore::PaginationDSL
Methods included from SpiderCore::FollowDSL
Methods included from SpiderCore::FieldDSL
#css_field, #css_fields, #field, #fields, #foreach, #xpath_field, #xpath_fields
Constructor Details
#initialize(excretion = nil) ⇒ MicroSpider
Returns a new instance of MicroSpider.
37 38 39 40 41 42 43 44 45 46 |
# File 'lib/micro_spider.rb', line 37
# Builds a spider with an empty path queue, no learned actions and a
# 120-second timeout (applied per action by #execute_actions).
#
# @param excretion [Hash, nil] container that the crawl writes its status and
#   results into; when omitted (or falsy) a fresh "inprogress" container with
#   an empty results list is created.
def initialize(excretion = nil)
  @excretion = excretion ? excretion : { status: 'inprogress', results: [] }

  # Work queue and bookkeeping for the crawl.
  @paths, @actions, @broken_paths = [], [], []
  @visited_paths = Set.new

  # Values stored by #set, keyed by variable name.
  @setted_variables = {}

  @timeout = 120
  @logger = Logger.new(STDOUT)
end
Instance Attribute Details
#actions ⇒ Object
Returns the value of attribute actions.
35 36 37 |
# File 'lib/micro_spider.rb', line 35 def actions @actions end |
#broken_paths ⇒ Object (readonly)
Returns the value of attribute broken_paths.
34 35 36 |
# File 'lib/micro_spider.rb', line 34 def broken_paths @broken_paths end |
#current_location ⇒ Object (readonly)
Returns the value of attribute current_location.
34 35 36 |
# File 'lib/micro_spider.rb', line 34 def current_location @current_location end |
#delay ⇒ Object
Returns the value of attribute delay.
34 35 36 |
# File 'lib/micro_spider.rb', line 34 def delay @delay end |
#excretion ⇒ Object (readonly)
Returns the value of attribute excretion.
34 35 36 |
# File 'lib/micro_spider.rb', line 34 def excretion @excretion end |
#logger ⇒ Object
Returns the value of attribute logger.
35 36 37 |
# File 'lib/micro_spider.rb', line 35 def logger @logger end |
#paths ⇒ Object (readonly)
Returns the value of attribute paths.
34 35 36 |
# File 'lib/micro_spider.rb', line 34 def paths @paths end |
#recipe ⇒ Object
Returns the value of attribute recipe.
35 36 37 |
# File 'lib/micro_spider.rb', line 35 def recipe @recipe end |
#skip_set_entrance ⇒ Object
Returns the value of attribute skip_set_entrance.
35 36 37 |
# File 'lib/micro_spider.rb', line 35 def skip_set_entrance @skip_set_entrance end |
#timeout ⇒ Object
Returns the value of attribute timeout.
35 36 37 |
# File 'lib/micro_spider.rb', line 35 def timeout @timeout end |
#visited_paths ⇒ Object (readonly)
Returns the value of attribute visited_paths.
34 35 36 |
# File 'lib/micro_spider.rb', line 34 def visited_paths @visited_paths end |
Instance Method Details
#click(locator, opts = {}, &block) ⇒ Object
Click the locator. This will trigger visit action and change current location.
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
# File 'lib/micro_spider.rb', line 102
# Queues an action that follows the link matched by +locator+.
#
# When the action runs it resolves the link's href; any StandardError raised
# while looking the link up is treated the same as "no link".
# * Without a block the spider itself visits the href.
# * With a block a spawned spider is pointed at the href, taught the block,
#   crawled, and its results are appended to current_location[:click].
#
# @param locator [Object] passed to find_link
# @param opts [Hash] passed to find_link
# @return [Array] the action queue
# @raise [SpiderCore::ClickPathNotFound] (when the action runs) if no href
#   could be resolved
def click(locator, opts = {}, &block)
  follow_link = lambda do
    href = begin
      find_link(locator, opts)[:href]
    rescue StandardError
      nil
    end
    raise SpiderCore::ClickPathNotFound, "#{locator} not found" if href.nil?

    # The lambda's value is significant: #execute_actions drops an action
    # from the queue only when its call returns something truthy.
    next visit(href) unless block_given?

    child = spawn
    child.entrance(href)
    child.learn(&block)
    (current_location[:click] ||= []) << child.crawl[:results]
  end

  actions << follow_link
end
#completed? ⇒ Boolean
295 296 297 |
# File 'lib/micro_spider.rb', line 295
# Whether the crawl has finished.
#
# @return [Boolean] true once the excretion's :status entry equals 'completed'
def completed?
  status = excretion[:status]
  status == 'completed'
end
#crawl(&block) ⇒ Object
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 |
# File 'lib/micro_spider.rb', line 192
# Visits the queued paths one after another, runs the learned actions on each
# page and collects every page's @current_location in excretion[:results].
#
# Each call handles a single path and then calls itself again from the
# +ensure+ clause, so the call depth grows with the number of queued paths.
# NOTE(review): an exception raised by #execute_actions or by the caller's
# block is not rescued here; the +ensure+ clause still crawls the remaining
# paths before that exception propagates.
#
# @yield [location] the current location hash after the actions have run
# @return [Hash] the excretion (status plus results)
def crawl(&block)
  return excretion if completed?

  @paths.compact!

  # Pull paths off the queue until one turns up that has not been visited yet,
  # or the queue runs dry (target stays nil).
  target = nil
  loop do
    target = @paths.shift
    break if target.nil? || !@visited_paths.include?(target)
  end

  if target.nil?
    excretion[:status] = 'completed'
    return excretion
  end

  # @actions is emptied after every page, so the recipe is replayed here.
  learn(@recipe) if @actions.empty?

  begin
    visit(target)
  rescue Timeout::Error => error
    @broken_paths << target
    logger.fatal("Timeout!!! execution expired when visit `#{target}`")
    logger.fatal(error)
  rescue SystemExit, Interrupt
    logger.fatal("SystemExit && Interrupt")
    exit!
  rescue Exception => error
    @broken_paths << target
    logger.fatal("Caught exception when visit `#{target}`")
    logger.fatal(error)
  else
    @visited_paths << target
    execute_actions
    yield(@current_location) if block_given?
    excretion[:results] << @current_location
  ensure
    @actions = []
    # From now on #entrance / #entrance_on_path calls in the recipe are ignored.
    @skip_set_entrance = true
    crawl(&block)
  end

  excretion
end
#create_action(name, &block) ⇒ Object
The spider can define custom actions while it is crawling.
258 259 260 261 |
# File 'lib/micro_spider.rb', line 258
# Defines a custom action as a method on this spider instance only.
#
# Calling the new method queues an action; when that action runs, +block+ is
# called with the current location.
#
# @param name [Symbol, String] name of the method to define
# @yield [location] the current location at the time the action executes
# @return [Symbol] the name of the defined method
def create_action(name, &block)
  enqueue = proc { actions << lambda { block.call(current_location) } }
  # The method goes on the singleton class so other spiders are unaffected.
  define_singleton_method(name, &enqueue)
end
#entrance(*path_or_paths) ⇒ Object
This will be the first path for the spider to visit. If there is more than one entrance, the spider will crawl them one by one.
169 170 171 172 |
# File 'lib/micro_spider.rb', line 169
# Appends one or more starting paths to the queue.
#
# Paths may be given as separate arguments, as an array, or as a mix of both;
# nested arrays are flattened so every queued entry is a single path.
# The call is ignored once @skip_set_entrance is set (#crawl sets it after
# the first page so that replaying the recipe does not re-queue entrances).
#
# @param path_or_paths [Array<String, Array<String>>] paths to queue
# @return [Array, nil] the updated queue, or nil when the call was ignored
def entrance(*path_or_paths)
  return if @skip_set_entrance

  @paths += path_or_paths.flatten
end
#entrance_on_path(path, pattern, opts = {}, &block) ⇒ Object
Sometimes the entrances are on the page.
182 183 184 185 186 187 188 189 190 |
# File 'lib/micro_spider.rb', line 182
# Collects starting paths from a page: visits +path+, scans it for +pattern+
# and queues one path per matched element.
#
# @param path [String] page to visit in order to find the entrances
# @param pattern [String] selector handed to scan_all
# @param opts [Hash] handed to scan_all; :kind picks the selector kind and
#   defaults to :css
# @yield [element] optional; its return value is queued instead of the
#   element's :href
# @return [Array, nil] the updated queue, or nil when @skip_set_entrance is set
def entrance_on_path(path, pattern, opts = {}, &block)
  unless @skip_set_entrance
    kind = opts[:kind] || :css
    visit(path)
    matches = scan_all(kind, pattern, opts)
    discovered = matches.map do |element|
      if block_given?
        yield(element)
      else
        element[:href]
      end
    end
    @paths += discovered.to_a
  end
end
#execute_actions ⇒ Object
263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 |
# File 'lib/micro_spider.rb', line 263
# Runs the queued actions in order, each under a @timeout-second limit.
#
# An action is removed from the queue when its call returns a truthy value.
# If an action times out or raises SpiderCore::ClickPathNotFound, the error
# message and backtrace are logged and the remaining actions are skipped.
#
# @return [Array, nil] the action queue, or nil when the run was cut short
def execute_actions
  actions.delete_if do |action|
    begin
      Timeout.timeout(@timeout) { action.call }
    rescue Timeout::Error => err
      logger.fatal('Timeout!!! execution expired when execute action')
      logger.fatal(err.message)
      logger.fatal(err.backtrace.inspect)
      break
    rescue SpiderCore::ClickPathNotFound => err
      logger.fatal(err.message)
      logger.fatal(err.backtrace.inspect)
      break
    end
  end
end
#learn(recipe = nil, &block) ⇒ Object
Teach the spider behaviors and it will repeat to the end.
139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
# File 'lib/micro_spider.rb', line 139
# Teaches the spider a recipe by evaluating it in the spider's own context
# and remembers it in @recipe.
#
# A block takes precedence over the +recipe+ argument. A String recipe is
# evaluated as Ruby source, so it must come from a trusted origin.
#
# @param recipe [Proc, String, nil] the recipe to evaluate
# @return [Proc, String, MicroSpider] the remembered recipe, or self when
#   nothing usable was given (in which case @recipe is left untouched)
def learn(recipe = nil, &block)
  lesson = block_given? ? block : recipe

  case lesson
  when Proc   then instance_eval(&lesson)
  when String then instance_eval(lesson)
  else return self
  end

  @recipe = lesson
end
#metaclass ⇒ Object
299 300 301 |
# File 'lib/micro_spider.rb', line 299
# Returns this object's singleton class, i.e. the class that holds methods
# defined on this one instance only.
#
# @return [Class] the singleton class of the receiver
def metaclass
  singleton_class
end
#reset ⇒ Object
237 238 239 240 241 242 243 |
# File 'lib/micro_spider.rb', line 237
# Prepares a finished spider for another run: the paths it visited become the
# new queue, and the excretion, visited set and current location start over.
# Does nothing unless the crawl has completed.
#
# @return [nil]
def reset
  if completed?
    # Read the old visited set before replacing it.
    @paths, @visited_paths = visited_paths.to_a, Set.new
    @excretion = { status: 'inprogress', results: [] }
    @current_location = nil
  end
end
#results ⇒ Object
291 292 293 |
# File 'lib/micro_spider.rb', line 291
# The results collected so far.
#
# @return [Object] whatever is stored under :results in the excretion
def results
  output = excretion
  output[:results]
end
#set(name, value, opts = {}, &block) ⇒ Object
Set a variable. You can use it later.
87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/micro_spider.rb', line 87
# Stores a named variable for later use.
#
# Without a :selector option, +value+ is stored immediately. With one, an
# action is queued that scans the page (scan_all with the selector, +value+
# as the pattern and the remaining options) and stores either the block's
# result for the matched elements or handle_element of the first match.
#
# @param name [#to_s] variable name
# @param value [Object] the literal value, or the pattern when :selector is given
# @param opts [Hash] :selector plus options forwarded to scan_all; the
#   caller's hash is not modified
# @yield [elements] optional; computes the stored value from the matches
# @return [Object, Array] the stored value, or the action queue
def set(name, value, opts = {}, &block)
  # Work on a copy: deleting :selector from the caller's hash would make a
  # reused options hash behave differently on its second use.
  opts = opts.dup
  selector = opts.delete(:selector)

  if selector.nil?
    @setted_variables[name.to_s] = value
  else
    actions << lambda {
      elements = scan_all(selector, value, opts)
      @setted_variables[name.to_s] = block_given? ? yield(elements) : handle_element(elements.first)
    }
  end
end
#site(url) ⇒ Object
154 155 156 157 |
# File 'lib/micro_spider.rb', line 154
# Sets the site the spider crawls; only the first call has an effect.
# The URL is remembered in @site, recorded under :site in the excretion and
# installed as Capybara.app_host.
#
# @param url [String] base URL of the site
# @return [String, nil] the URL, or nil when a site was already set
def site(url)
  unless @site
    @site = url
    @excretion[:site] = url
    Capybara.app_host = url
  end
end
#spawn ⇒ Object
280 281 282 283 284 285 286 287 288 289 |
# File 'lib/micro_spider.rb', line 280
# Creates a child spider: a clone of this one with its own empty path queue,
# action queue, visited set, broken-path list and a fresh "inprogress"
# excretion. All other state is whatever #clone copies from the parent.
# The child accepts entrances again (skip_set_entrance is false).
#
# @return [MicroSpider] the child spider
def spawn
  child = clone
  fresh_state = {
    :@paths         => [],
    :@actions       => [],
    :@visited_paths => Set.new,
    # An Array, matching what the constructor sets up for @broken_paths.
    :@broken_paths  => [],
    :@excretion     => { status: 'inprogress', results: [] }
  }
  fresh_state.each { |ivar, value| child.instance_variable_set(ivar, value) }
  child.skip_set_entrance = false
  child
end
#visit(path) ⇒ Object
Visit the path.
65 66 67 68 69 70 71 72 |
# File 'lib/micro_spider.rb', line 65
# Visits +path+ through the inherited visit implementation and makes it the
# current location.
#
# @param path [String] the path to open
# @return [Object] the result of the final logger call
# @raise [ArgumentError] if +path+ is nil or empty
def visit(path)
  unusable = path.nil? || path.empty?
  raise ArgumentError, "Path can't be nil or empty" if unusable

  sleep_or_not
  logger.info("Begin to visit #{path}.")
  super(path)
  # Every visit starts a new location record keyed by its entrance path.
  @current_location = { entrance: path }
  logger.info("Current location is #{path}.")
end