Class: Meiriyigua::ListCrawl
- Inherits:
-
Object
- Object
- Meiriyigua::ListCrawl
- Defined in:
- lib/meiriyigua/list_crawl.rb
Instance Attribute Summary
-
#detail_urls ⇒ Object
readonly
Returns the value of attribute detail_urls.
Instance Method Summary
- #handle_1234wg(page) ⇒ Object
- #handle_dongdongwg(page) ⇒ Object
- #handle_gg1z(page) ⇒ Object
- #handle_nanawg(page) ⇒ Object
- #handle_qh24(page) ⇒ Object
- #handle_ucbug(page) ⇒ Object
- #handle_url(uri) ⇒ Object
- #handle_uuuwg(page) ⇒ Object
- #handle_xiaolinzi(page) ⇒ Object
- #handle_xixiwg(page) ⇒ Object
-
#initialize ⇒ ListCrawl
constructor
A new instance of ListCrawl.
- #run ⇒ Object
Constructor Details
#initialize ⇒ ListCrawl
Returns a new instance of ListCrawl.
8 9 10 11 12 13 14 |
# File 'lib/meiriyigua/list_crawl.rb', line 8
#
# Sets up a fresh crawler: an empty queue of list-page URLs, an empty queue
# of collected detail-page URLs, and an HTTP agent from CrawlClient.
# Finishes by calling init_url (defined outside this view; presumably it
# seeds @list_urls -- confirm against the full source).
def initialize
  @list_urls = Queue.new
  @detail_urls = Queue.new
  @agent = CrawlClient.create_agent
  init_url
end
Instance Attribute Details
#detail_urls ⇒ Object (readonly)
Returns the value of attribute detail_urls.
6 7 8 |
# File 'lib/meiriyigua/list_crawl.rb', line 6
#
# Read-only accessor for the collection that handle_url appends
# detail-page URLs to.
#
# @return [Object] the current value of @detail_urls
def detail_urls
  @detail_urls
end
Instance Method Details
#handle_1234wg(page) ⇒ Object
41 42 43 44 |
# File 'lib/meiriyigua/list_crawl.rb', line 41
#
# Extracts detail-page URLs from a 1234wg list page.
#
# @param page [#search] page object whose +search+ accepts a CSS selector
#   and returns elements that respond to +['href']+
# @return [Array<String>] URLs prefixed with http://www.1234wg.com
def handle_1234wg(page)
  # The selector itself requires an href starting with "/1234/", so every
  # match carries a site-relative path.
  links = page.search('td[width="470"] a[href^="/1234/"]')
  links.map { |a| "http://www.1234wg.com#{a['href']}" }
end
#handle_dongdongwg(page) ⇒ Object
66 67 68 69 |
# File 'lib/meiriyigua/list_crawl.rb', line 66
#
# Extracts detail-page URLs from a dongdongwg list page.
#
# @param page [#search] page object whose +search+ accepts a CSS selector
#   and returns elements that respond to +['href']+
# @return [Array<String>] URLs prefixed with http://www.dongdongwg.com
def handle_dongdongwg(page)
  links = page.search('span.list_title > a')
  # Anchors without an href would otherwise produce the bare host URL.
  hrefs = links.map { |a| a['href'] }.compact
  hrefs.map { |href| "http://www.dongdongwg.com#{href}" }
end
#handle_gg1z(page) ⇒ Object
61 62 63 64 |
# File 'lib/meiriyigua/list_crawl.rb', line 61
#
# Extracts detail-page URLs from a gg1z list page.
#
# @param page [#search] page object whose +search+ accepts a CSS selector
#   and returns elements that respond to +['href']+
# @return [Array<String>] URLs prefixed with http://www.gg1z.com
def handle_gg1z(page)
  links = page.search('span.app-name a')
  # Anchors without an href would otherwise produce the bare host URL.
  hrefs = links.map { |a| a['href'] }.compact
  hrefs.map { |href| "http://www.gg1z.com#{href}" }
end
#handle_nanawg(page) ⇒ Object
51 52 53 54 |
# File 'lib/meiriyigua/list_crawl.rb', line 51
#
# Extracts detail-page URLs from a nanawg list page.
#
# @param page [#search] page object whose +search+ accepts a CSS selector
#   and returns elements that respond to +['href']+
# @return [Array<String>] URLs prefixed with http://www.nanawg.com
def handle_nanawg(page)
  links = page.search('td[width="362"] a:last-of-type')
  # Anchors without an href would otherwise produce the bare host URL.
  hrefs = links.map { |a| a['href'] }.compact
  hrefs.map { |href| "http://www.nanawg.com#{href}" }
end
#handle_qh24(page) ⇒ Object
46 47 48 49 |
# File 'lib/meiriyigua/list_crawl.rb', line 46
#
# Extracts detail-page URLs from a qh24 list page.
#
# @param page [#search] page object whose +search+ accepts a CSS selector
#   and returns elements that respond to +['href']+
# @return [Array<String>] URLs prefixed with http://www.qh24.com
def handle_qh24(page)
  links = page.search('div#downhot table a')
  # Anchors without an href would otherwise produce the bare host URL.
  hrefs = links.map { |a| a['href'] }.compact
  hrefs.map { |href| "http://www.qh24.com#{href}" }
end
#handle_ucbug(page) ⇒ Object
56 57 58 59 |
# File 'lib/meiriyigua/list_crawl.rb', line 56
#
# Extracts detail-page URLs from a ucbug list page. Unlike the other site
# handlers, the href values are returned as-is, with no host prefix.
#
# @param page [#search] page object whose +search+ accepts a CSS selector
#   and returns elements that respond to +['href']+
# @return [Array<String>] href values of the matched anchors
def handle_ucbug(page)
  links = page.search('li.slmain2_2_2 a')
  # Drop anchors without an href so nil is never queued as a URL.
  links.map { |a| a['href'] }.compact
end
#handle_url(uri) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/meiriyigua/list_crawl.rb', line 23
#
# Fetches one list page, dispatches it to the matching site-specific
# +handle_<site>+ extractor, and appends every URL it returns to
# @detail_urls. A status line is printed for the page either way.
#
# Any StandardError raised along the way is caught and reported on stdout
# (class, message, backtrace) rather than propagated to the caller.
#
# @param uri [URI] list-page address; the second dot-separated label of its
#   host (e.g. "1234wg" in "www.1234wg.com") selects the handler
# @return [void]
def handle_url(uri)
  page = @agent.get(uri)
  CrawlClient.set_page_encoding(page)

  # "www.1234wg.com" -> "1234wg" -> handle_1234wg
  site = uri.host.to_s.split('.')[1]
  urls = send("handle_#{site}", page)

  urls.each { |url| @detail_urls << url }
  # An empty result is reported as a failure ("失败"), otherwise success ("成功").
  puts "抓取列表页 #{uri} #{urls.empty? ? '失败' : '成功'}"

  CrawlClient.random_sleep
rescue StandardError => e
  puts "抓取列表出错了 #{e.class} #{e.message}\n#{e.backtrace.join("\n")}"
end
#handle_uuuwg(page) ⇒ Object
71 72 73 74 |
# File 'lib/meiriyigua/list_crawl.rb', line 71
#
# Extracts detail-page URLs from a uuuwg list page.
#
# @param page [#search] page object whose +search+ accepts a CSS selector
#   and returns elements that respond to +['href']+
# @return [Array<String>] URLs prefixed with http://www.uuuwg.com
def handle_uuuwg(page)
  links = page.search('table.main_table tr > td:nth-child(2) a')
  # Anchors without an href would otherwise produce the bare host URL.
  hrefs = links.map { |a| a['href'] }.compact
  hrefs.map { |href| "http://www.uuuwg.com#{href}" }
end
#handle_xiaolinzi(page) ⇒ Object
81 82 83 84 |
# File 'lib/meiriyigua/list_crawl.rb', line 81
#
# Extracts detail-page URLs from a xiaolinzi list page.
#
# @param page [#search] page object whose +search+ accepts a CSS selector
#   and returns elements that respond to +['href']+
# @return [Array<String>] URLs prefixed with http://www.xiaolinzi.com
def handle_xiaolinzi(page)
  links = page.search('td.rewid1 > a')
  # Anchors without an href would otherwise produce the bare host URL.
  hrefs = links.map { |a| a['href'] }.compact
  hrefs.map { |href| "http://www.xiaolinzi.com#{href}" }
end
#handle_xixiwg(page) ⇒ Object
76 77 78 79 |
# File 'lib/meiriyigua/list_crawl.rb', line 76
#
# Extracts detail-page URLs from a xixiwg list page.
#
# @param page [#search] page object whose +search+ accepts a CSS selector
#   and returns elements that respond to +['href']+
# @return [Array<String>] URLs prefixed with http://www.xixiwg.com
def handle_xixiwg(page)
  links = page.search('div.entry > h2 > a')
  # Anchors without an href would otherwise produce the bare host URL.
  hrefs = links.map { |a| a['href'] }.compact
  hrefs.map { |href| "http://www.xixiwg.com#{href}" }
end
#run ⇒ Object
16 17 18 19 20 21 |
# File 'lib/meiriyigua/list_crawl.rb', line 16
#
# Drains @list_urls: each entry is popped, converted with URI() and passed
# to handle_url, until the queue reports empty.
#
# @return [nil]
def run
  until @list_urls.empty?
    handle_url(URI(@list_urls.pop))
  end
end