Class: Meiriyigua::ListCrawl
- Inherits:
- Object
  - Object
    - Meiriyigua::ListCrawl
- Defined in:
- lib/meiriyigua/list_crawl.rb
Instance Attribute Summary collapse
- #detail_urls ⇒ Object (readonly): Returns the value of attribute detail_urls.
Instance Method Summary collapse
- #handle_1234wg(page) ⇒ Object
- #handle_dongdongwg(page) ⇒ Object
- #handle_gg1z(page) ⇒ Object
- #handle_nanawg(page) ⇒ Object
- #handle_qh24(page) ⇒ Object
- #handle_ucbug(page) ⇒ Object
- #handle_url(uri) ⇒ Object
- #handle_uuuwg(page) ⇒ Object
- #handle_xiaolinzi(page) ⇒ Object
- #handle_xixiwg(page) ⇒ Object
- #initialize ⇒ ListCrawl (constructor): Returns a new instance of ListCrawl.
- #run ⇒ Object
Constructor Details
#initialize ⇒ ListCrawl
Returns a new instance of ListCrawl.
8 9 10 11 12 13 14 |
# File 'lib/meiriyigua/list_crawl.rb', line 8
#
# Builds a crawler with two empty queues and an HTTP agent obtained from
# CrawlClient.create_agent, then calls init_url.
#
# NOTE(review): init_url is not shown here; presumably it seeds @list_urls
# with the list pages to crawl -- confirm against its definition.
def initialize
  # URLs of list pages that #run still has to visit.
  @list_urls = Queue.new
  # Detail-page URLs discovered so far (exposed through #detail_urls).
  @detail_urls = Queue.new
  @agent = CrawlClient.create_agent
  init_url
end
Instance Attribute Details
#detail_urls ⇒ Object (readonly)
Returns the value of attribute detail_urls.
6 7 8 |
# File 'lib/meiriyigua/list_crawl.rb', line 6
#
# Read-only accessor for the detail-page URLs collected so far.
#
# @return [Object] the current value of @detail_urls (a Queue when set up by
#   #initialize)
def detail_urls
  @detail_urls
end
Instance Method Details
#handle_1234wg(page) ⇒ Object
39 40 41 42 |
# File 'lib/meiriyigua/list_crawl.rb', line 39
#
# Extracts detail-page URLs from a 1234wg.com list page.
#
# The selector only matches anchors whose href starts with "/1234/", so every
# match has a site-relative href that can be appended to the site root.
#
# @param page [#search] list page; must answer `search(css)` with a
#   collection of elements that expose attributes through `[]`
# @return [Array<String>] absolute detail-page URLs, in document order
def handle_1234wg(page)
  page.search('td[width="470"] a[href^="/1234/"]').map do |link|
    "http://www.1234wg.com#{link['href']}"
  end
end
#handle_dongdongwg(page) ⇒ Object
64 65 66 67 |
# File 'lib/meiriyigua/list_crawl.rb', line 64
#
# Extracts detail-page URLs from a dongdongwg.com list page.
#
# Anchors without an href (or with an empty one) are skipped; otherwise they
# would be reported as the bare site root instead of a detail page.
#
# @param page [#search] list page; must answer `search(css)` with a
#   collection of elements that expose attributes through `[]`
# @return [Array<String>] absolute detail-page URLs, in document order
def handle_dongdongwg(page)
  page.search('span.list_title > a')
      .map { |link| link['href'].to_s }
      .reject(&:empty?)
      .map { |href| "http://www.dongdongwg.com#{href}" }
end
#handle_gg1z(page) ⇒ Object
59 60 61 62 |
# File 'lib/meiriyigua/list_crawl.rb', line 59
#
# Extracts detail-page URLs from a gg1z.com list page.
#
# Anchors without an href (or with an empty one) are skipped; otherwise they
# would be reported as the bare site root instead of a detail page.
#
# @param page [#search] list page; must answer `search(css)` with a
#   collection of elements that expose attributes through `[]`
# @return [Array<String>] absolute detail-page URLs, in document order
def handle_gg1z(page)
  page.search('span.app-name a')
      .map { |link| link['href'].to_s }
      .reject(&:empty?)
      .map { |href| "http://www.gg1z.com#{href}" }
end
#handle_nanawg(page) ⇒ Object
49 50 51 52 |
# File 'lib/meiriyigua/list_crawl.rb', line 49
#
# Extracts detail-page URLs from a nanawg.com list page.
#
# Anchors without an href (or with an empty one) are skipped; otherwise they
# would be reported as the bare site root instead of a detail page.
#
# @param page [#search] list page; must answer `search(css)` with a
#   collection of elements that expose attributes through `[]`
# @return [Array<String>] absolute detail-page URLs, in document order
def handle_nanawg(page)
  page.search('td[width="362"] a:last-of-type')
      .map { |link| link['href'].to_s }
      .reject(&:empty?)
      .map { |href| "http://www.nanawg.com#{href}" }
end
#handle_qh24(page) ⇒ Object
44 45 46 47 |
# File 'lib/meiriyigua/list_crawl.rb', line 44
#
# Extracts detail-page URLs from a qh24.com list page.
#
# Anchors without an href (or with an empty one) are skipped; otherwise they
# would be reported as the bare site root instead of a detail page.
#
# @param page [#search] list page; must answer `search(css)` with a
#   collection of elements that expose attributes through `[]`
# @return [Array<String>] absolute detail-page URLs, in document order
def handle_qh24(page)
  page.search('div#downhot table a')
      .map { |link| link['href'].to_s }
      .reject(&:empty?)
      .map { |href| "http://www.qh24.com#{href}" }
end
#handle_ucbug(page) ⇒ Object
54 55 56 57 |
# File 'lib/meiriyigua/list_crawl.rb', line 54
#
# Extracts detail-page URLs from a ucbug list page.
#
# Unlike the other site handlers, the hrefs are returned as-is with no site
# prefix. Anchors without an href (or with an empty one) are skipped so that
# no nil or blank entry reaches the detail queue.
#
# @param page [#search] list page; must answer `search(css)` with a
#   collection of elements that expose attributes through `[]`
# @return [Array<String>] href values of the matched anchors, in document order
def handle_ucbug(page)
  page.search('li.slmain2_2_2 a')
      .map { |link| link['href'].to_s }
      .reject(&:empty?)
end
#handle_url(uri) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/meiriyigua/list_crawl.rb', line 23
#
# Fetches one list page, hands it to the site-specific `handle_<name>` method
# and appends every URL that handler returns to @detail_urls. A one-line
# status is printed for the page, and CrawlClient.random_sleep is called after
# each fetched page.
#
# A host with no matching `handle_<name>` method is reported as a failure
# without fetching anything, instead of raising NoMethodError from `send`.
#
# @param uri [URI::Generic] list page to crawl; only `host` and `to_s` are
#   used here, the object itself is passed to `@agent.get`
# @return [Object, nil] the result of CrawlClient.random_sleep, or nil when
#   the host has no handler
def handle_url(uri)
  # The handler is named after the second dot-separated label of the host,
  # e.g. "www.1234wg.com" -> handle_1234wg.
  name = uri.host.to_s.split('.')[1]
  handler = "handle_#{name}"

  # `true` also looks at private methods, matching what `send` can reach.
  unless respond_to?(handler, true)
    puts "抓取列表页 #{uri} 失败"
    return
  end

  page = @agent.get(uri)
  CrawlClient.set_page_encoding(page)

  urls = send(handler, page)
  urls.each { |url| @detail_urls << url }

  # An empty result counts as a failed list page.
  puts "抓取列表页 #{uri} #{urls.empty? ? '失败' : '成功'}"
  CrawlClient.random_sleep
end
#handle_uuuwg(page) ⇒ Object
69 70 71 72 |
# File 'lib/meiriyigua/list_crawl.rb', line 69
#
# Extracts detail-page URLs from a uuuwg.com list page.
#
# Anchors without an href (or with an empty one) are skipped; otherwise they
# would be reported as the bare site root instead of a detail page.
#
# @param page [#search] list page; must answer `search(css)` with a
#   collection of elements that expose attributes through `[]`
# @return [Array<String>] absolute detail-page URLs, in document order
def handle_uuuwg(page)
  page.search('table.main_table tr > td:nth-child(2) a')
      .map { |link| link['href'].to_s }
      .reject(&:empty?)
      .map { |href| "http://www.uuuwg.com#{href}" }
end
#handle_xiaolinzi(page) ⇒ Object
79 80 81 82 |
# File 'lib/meiriyigua/list_crawl.rb', line 79
#
# Extracts detail-page URLs from a xiaolinzi.com list page.
#
# Anchors without an href (or with an empty one) are skipped; otherwise they
# would be reported as the bare site root instead of a detail page.
#
# @param page [#search] list page; must answer `search(css)` with a
#   collection of elements that expose attributes through `[]`
# @return [Array<String>] absolute detail-page URLs, in document order
def handle_xiaolinzi(page)
  page.search('td.rewid1 > a')
      .map { |link| link['href'].to_s }
      .reject(&:empty?)
      .map { |href| "http://www.xiaolinzi.com#{href}" }
end
#handle_xixiwg(page) ⇒ Object
74 75 76 77 |
# File 'lib/meiriyigua/list_crawl.rb', line 74
#
# Extracts detail-page URLs from a xixiwg.com list page.
#
# Anchors without an href (or with an empty one) are skipped; otherwise they
# would be reported as the bare site root instead of a detail page.
#
# @param page [#search] list page; must answer `search(css)` with a
#   collection of elements that expose attributes through `[]`
# @return [Array<String>] absolute detail-page URLs, in document order
def handle_xixiwg(page)
  page.search('div.entry > h2 > a')
      .map { |link| link['href'].to_s }
      .reject(&:empty?)
      .map { |href| "http://www.xixiwg.com#{href}" }
end
#run ⇒ Object
16 17 18 19 20 21 |
# File 'lib/meiriyigua/list_crawl.rb', line 16
#
# Drains @list_urls: each entry is popped, converted with Kernel#URI and
# passed to #handle_url, until the queue reports empty.
#
# @return [nil]
def run
  until @list_urls.empty?
    uri = URI(@list_urls.pop)
    handle_url(uri)
  end
end