Class: HttpProxyPool::Basetask
- Inherits: Object
- Inheritance chain: Object → HttpProxyPool::Basetask
- Defined in:
- lib/http_proxy_pool/basetask.rb
Instance Attribute Summary
-
#agent ⇒ Object
Returns the value of attribute agent.
-
#logger ⇒ Object
Returns the value of attribute logger.
-
#next_page ⇒ Object
Returns the value of attribute next_page.
-
#page_parser ⇒ Object
Returns the value of attribute page_parser.
-
#url ⇒ Object
Returns the value of attribute url.
Instance Method Summary
- #curr_page ⇒ Object
-
#initialize(opts = {}) ⇒ Basetask
constructor
A new instance of Basetask.
- #ips(lastest = true) ⇒ Object
- #nextpage(&block) ⇒ Object
- #parser(&block) ⇒ Object
- #rand_sleep(max_tick = 2) ⇒ Object
- #sitename ⇒ Object
- #sitetask(url, opts = {}) ⇒ Object
Constructor Details
#initialize(opts = {}) ⇒ Basetask
Returns a new instance of Basetask.
11 12 13 14 15 |
# File 'lib/http_proxy_pool/basetask.rb', line 11
# Builds a task from an options hash.
#
# Reads the :agent, :logger and :url entries of +opts+ (in that order) and
# stores them in the instance variables of the same names. Entries that are
# absent end up as whatever +opts[key]+ returns for them (nil for a plain Hash).
#
# @param opts [Hash] options; only :agent, :logger and :url are consulted
def initialize(opts = {})
  agent, logger, url = [:agent, :logger, :url].map { |key| opts[key] }

  @agent = agent
  @logger = logger
  @url = url
end
Instance Attribute Details
#agent ⇒ Object
Returns the value of attribute agent.
5 6 7 |
# File 'lib/http_proxy_pool/basetask.rb', line 5 def agent @agent end |
#logger ⇒ Object
Returns the value of attribute logger.
5 6 7 |
# File 'lib/http_proxy_pool/basetask.rb', line 5 def logger @logger end |
#next_page ⇒ Object
Returns the value of attribute next_page.
5 6 7 |
# File 'lib/http_proxy_pool/basetask.rb', line 5 def next_page @next_page end |
#page_parser ⇒ Object
Returns the value of attribute page_parser.
5 6 7 |
# File 'lib/http_proxy_pool/basetask.rb', line 5 def page_parser @page_parser end |
#url ⇒ Object
Returns the value of attribute url.
5 6 7 |
# File 'lib/http_proxy_pool/basetask.rb', line 5 def url @url end |
Instance Method Details
#curr_page ⇒ Object
69 70 71 |
# File 'lib/http_proxy_pool/basetask.rb', line 69
# URI of the page the agent is currently on.
#
# Returns nil instead of raising NoMethodError when there is no agent or the
# agent has no current page (presumably the case before the first fetch —
# confirm against the agent implementation in use).
#
# @return [Object, nil] whatever +@agent.page.uri+ returns, or nil
def curr_page
  page = @agent && @agent.page
  page && page.uri
end
#ips(lastest = true) ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/http_proxy_pool/basetask.rb', line 30
# Crawls the site starting at @url and yields every element produced by the
# page parser block.
#
# For each page the method logs the URI, fetches it with +@agent.get+,
# evaluates the block returned by #page_parser in the context of this object
# and yields each element of the result. When +lastest+ is false it then
# evaluates the #next_page block to obtain the next URI and keeps going until
# that block is missing or returns nil/false.
#
# Errors (StandardError and subclasses) raised while parsing a page, while the
# caller's block handles a yielded element, or while computing the next page
# are logged through +@logger.error+ and end the crawl. Interrupt, SystemExit
# and other non-StandardError exceptions are deliberately NOT rescued, so
# Ctrl-C and +exit+ still work during a crawl. Errors raised by +@agent.get+
# itself are not rescued here.
#
# @param lastest [Boolean] true (default) crawls only the first page; false
#   follows the next-page block and sleeps a random interval after each fetch
# @yield [field] each element returned by the page parser block
# @return [Enumerator, nil] an Enumerator over the same elements when called
#   without a block, otherwise nil
def ips(lastest = true)
  # Without a block there is nothing to yield to; hand back a lazy enumerator
  # instead of fetching a page only to fail on the first yield.
  return enum_for(:ips, lastest) unless block_given?

  uri = @url
  loop do
    @logger.info("start crawling page [#{uri}] ...")
    @agent.get(uri)
    # When crawling every page, pause a random time between requests.
    rand_sleep unless lastest

    begin
      instance_eval(&page_parser).each do |field|
        yield field
      end
    rescue StandardError => e
      @logger.error("parsing page error[#{uri}]. #{e.to_s}")
      break
    end

    begin
      break unless @next_page
      uri = instance_eval(&next_page)
      break unless uri
    rescue => e
      @logger.error("error occoured when get next page[#{uri}]. #{e.to_s}")
      break
    end

    break if lastest
  end
end
#nextpage(&block) ⇒ Object
65 66 67 |
# File 'lib/http_proxy_pool/basetask.rb', line 65
# Registers the block that #ips evaluates to find the URI of the next page.
#
# Called without a block this is a no-op: the previously stored block is kept
# and nil is returned.
#
# @yield block stored in @next_page
# @return [Proc, nil] the stored block, or nil when no block was given
def nextpage(&block)
  return unless block_given?

  @next_page = block
end
#parser(&block) ⇒ Object
61 62 63 |
# File 'lib/http_proxy_pool/basetask.rb', line 61
# Registers the block that #ips evaluates to extract records from a page.
#
# Called without a block this is a no-op: the previously stored block is kept
# and nil is returned.
#
# @yield block stored in @page_parser
# @return [Proc, nil] the stored block, or nil when no block was given
def parser(&block)
  return unless block_given?

  @page_parser = block
end
#rand_sleep(max_tick = 2) ⇒ Object
77 78 79 |
# File 'lib/http_proxy_pool/basetask.rb', line 77
# Sleeps for a random duration drawn with +rand(max_tick)+.
#
# @param max_tick [Numeric] upper bound handed to Kernel#rand (default 2)
# @return [Integer] the value returned by Kernel#sleep
def rand_sleep(max_tick = 2)
  pause = rand(max_tick)
  sleep(pause)
end
#sitename ⇒ Object
73 74 75 |
# File 'lib/http_proxy_pool/basetask.rb', line 73
# Host part of @url.
#
# The URL is percent-escaped before parsing so that URLs containing spaces or
# non-ASCII characters do not make URI.parse fail.
#
# @return [String, nil] the host of @url, or nil when no URL has been set
def sitename
  return nil unless @url

  # URI.encode was removed in Ruby 3.0, so call the RFC 2396 parser's escape
  # directly. Prefer the explicit RFC2396_PARSER constant where the installed
  # uri library defines it, and fall back to DEFAULT_PARSER otherwise.
  escaper = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
  URI.parse(escaper.escape(@url)).host
end
#sitetask(url, opts = {}) ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/http_proxy_pool/basetask.rb', line 17
# Declares the site this task crawls and runs the given configuration block.
#
# Stores +url+ in @url, selects the agent and logger, then yields so the
# caller can register its #parser / #nextpage blocks.
#
# Agent selection: +opts[:agent]+ wins; otherwise an agent that is already set
# (for example through #initialize) is kept; only when neither exists is a new
# Mechanize instance created. This mirrors how @logger is handled, where an
# existing logger is kept and +opts[:logger]+ is only a fallback.
#
# @param url [String] start URL of the site; must not be nil/false
# @param opts [Hash] optional :agent and :logger
# @yield configuration block; optional
# @return [Object, nil] the block's return value, or nil without a block
# @raise [ScriptError] when +url+ is nil or false
def sitetask(url, opts = {})
  raise ScriptError, "script do not specify a url!" unless url

  @url = url
  @agent = opts[:agent] || @agent || Mechanize.new
  @logger ||= opts[:logger]

  # for debug
  # @agent.set_proxy '127.0.0.1', 8888

  yield if block_given?
end