Class: Meiriyigua::ListCrawl

Inherits:
Object
  • Object
show all
Defined in:
lib/meiriyigua/list_crawl.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ ListCrawl

Returns a new instance of ListCrawl.



8
9
10
11
12
13
14
# File 'lib/meiriyigua/list_crawl.rb', line 8

# Sets up a crawler: an agent obtained from CrawlClient.create_agent plus two
# empty queues (list-page URLs and detail-page URLs), then calls init_url.
def initialize
  @agent = CrawlClient.create_agent
  @detail_urls = Queue.new
  @list_urls = Queue.new

  # init_url is defined outside this view; presumably it seeds @list_urls
  # with the list pages to crawl -- TODO confirm.
  init_url
end

Instance Attribute Details

#detail_urls ⇒ Object (readonly)

Returns the value of attribute detail_urls.



6
7
8
# File 'lib/meiriyigua/list_crawl.rb', line 6

# Read-only accessor for the collection of detail-page URLs.
#
# The constructor assigns a Queue to @detail_urls and #handle_url appends
# every URL extracted from a list page to it.
#
# @return [Object] the current value of @detail_urls
def detail_urls
  @detail_urls
end

Instance Method Details

#handle_1234wg(page) ⇒ Object



39
40
41
42
# File 'lib/meiriyigua/list_crawl.rb', line 39

# Extracts detail-page URLs from a 1234wg list page.
#
# Selects anchors inside 470px-wide table cells whose href starts with
# "/1234/" and prefixes each href with the site root.
#
# @param page [#search] page object queried with a CSS selector
#   (presumably a Mechanize page -- confirm against the caller)
# @return [Array<String>] one absolute URL per matched anchor
def handle_1234wg(page)
  page.search('td[width="470"] a[href^="/1234/"]').map do |link|
    "http://www.1234wg.com#{link['href']}"
  end
end

#handle_dongdongwg(page) ⇒ Object



64
65
66
67
# File 'lib/meiriyigua/list_crawl.rb', line 64

# Extracts detail-page URLs from a dongdongwg list page.
#
# Selects anchors that are direct children of span.list_title and prefixes
# each href with the site root.
#
# @param page [#search] page object queried with a CSS selector
#   (presumably a Mechanize page -- confirm against the caller)
# @return [Array<String>] one absolute URL per matched anchor
def handle_dongdongwg(page)
  page.search('span.list_title > a').map do |link|
    "http://www.dongdongwg.com#{link['href']}"
  end
end

#handle_gg1z(page) ⇒ Object



59
60
61
62
# File 'lib/meiriyigua/list_crawl.rb', line 59

# Extracts detail-page URLs from a gg1z list page.
#
# Selects anchors nested under span.app-name and prefixes each href with
# the site root.
#
# @param page [#search] page object queried with a CSS selector
#   (presumably a Mechanize page -- confirm against the caller)
# @return [Array<String>] one absolute URL per matched anchor
def handle_gg1z(page)
  page.search('span.app-name a').map do |link|
    "http://www.gg1z.com#{link['href']}"
  end
end

#handle_nanawg(page) ⇒ Object



49
50
51
52
# File 'lib/meiriyigua/list_crawl.rb', line 49

# Extracts detail-page URLs from a nanawg list page.
#
# Selects the last anchor (a:last-of-type) inside 362px-wide table cells and
# prefixes each href with the site root.
#
# @param page [#search] page object queried with a CSS selector
#   (presumably a Mechanize page -- confirm against the caller)
# @return [Array<String>] one absolute URL per matched anchor
def handle_nanawg(page)
  page.search('td[width="362"] a:last-of-type').map do |link|
    "http://www.nanawg.com#{link['href']}"
  end
end

#handle_qh24(page) ⇒ Object



44
45
46
47
# File 'lib/meiriyigua/list_crawl.rb', line 44

# Extracts detail-page URLs from a qh24 list page.
#
# Selects anchors inside tables under div#downhot and prefixes each href
# with the site root.
#
# @param page [#search] page object queried with a CSS selector
#   (presumably a Mechanize page -- confirm against the caller)
# @return [Array<String>] one absolute URL per matched anchor
def handle_qh24(page)
  page.search('div#downhot table a').map do |link|
    "http://www.qh24.com#{link['href']}"
  end
end

#handle_ucbug(page) ⇒ Object



54
55
56
57
# File 'lib/meiriyigua/list_crawl.rb', line 54

# Extracts detail-page URLs from a ucbug list page.
#
# Selects anchors under li.slmain2_2_2 and returns each href unchanged; no
# site root is prepended here, unlike the sibling handlers.
#
# @param page [#search] page object queried with a CSS selector
#   (presumably a Mechanize page -- confirm against the caller)
# @return [Array] the href value of every matched anchor
def handle_ucbug(page)
  page.search('li.slmain2_2_2 a').map { |link| link['href'] }
end

#handle_url(uri) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/meiriyigua/list_crawl.rb', line 23

# Fetches one list page, extracts its detail-page URLs with the matching
# site handler, queues them, and prints a success/failure line to stdout.
#
# The handler is chosen from the second dot-separated label of the host
# (e.g. "www.qh24.com" -> handle_qh24), so the host is assumed to carry a
# leading subdomain such as "www".
#
# @param uri [URI] address of the list page
# @return [Object] whatever CrawlClient.random_sleep returns
def handle_url(uri)
  page = @agent.get(uri)
  CrawlClient.set_page_encoding(page)

  site = uri.host.to_s.split('.')[1]
  found = send("handle_#{site}", page)

  # An empty result is reported as a failure ("失败"); otherwise every
  # extracted URL is queued before the success line ("成功") is printed.
  outcome = "失败"
  unless found.empty?
    found.each { |detail| @detail_urls << detail }
    outcome = "成功"
  end

  print "抓取列表页 #{uri.to_s} "
  puts outcome
  CrawlClient.random_sleep
end

#handle_uuuwg(page) ⇒ Object



69
70
71
72
# File 'lib/meiriyigua/list_crawl.rb', line 69

# Extracts detail-page URLs from a uuuwg list page.
#
# Selects anchors in the second cell of each row of table.main_table and
# prefixes each href with the site root.
#
# @param page [#search] page object queried with a CSS selector
#   (presumably a Mechanize page -- confirm against the caller)
# @return [Array<String>] one absolute URL per matched anchor
def handle_uuuwg(page)
  page.search('table.main_table tr > td:nth-child(2) a').map do |link|
    "http://www.uuuwg.com#{link['href']}"
  end
end

#handle_xiaolinzi(page) ⇒ Object



79
80
81
82
# File 'lib/meiriyigua/list_crawl.rb', line 79

# Extracts detail-page URLs from a xiaolinzi list page.
#
# Selects anchors that are direct children of td.rewid1 and prefixes each
# href with the site root.
#
# @param page [#search] page object queried with a CSS selector
#   (presumably a Mechanize page -- confirm against the caller)
# @return [Array<String>] one absolute URL per matched anchor
def handle_xiaolinzi(page)
  page.search('td.rewid1 > a').map do |link|
    "http://www.xiaolinzi.com#{link['href']}"
  end
end

#handle_xixiwg(page) ⇒ Object



74
75
76
77
# File 'lib/meiriyigua/list_crawl.rb', line 74

# Extracts detail-page URLs from a xixiwg list page.
#
# Selects the anchor directly inside each h2 of div.entry and prefixes each
# href with the site root.
#
# @param page [#search] page object queried with a CSS selector
#   (presumably a Mechanize page -- confirm against the caller)
# @return [Array<String>] one absolute URL per matched anchor
def handle_xixiwg(page)
  page.search('div.entry > h2 > a').map do |link|
    "http://www.xixiwg.com#{link['href']}"
  end
end

#run ⇒ Object



16
17
18
19
20
21
# File 'lib/meiriyigua/list_crawl.rb', line 16

# Drains the list-URL queue: each queued entry is popped, converted with
# URI(), and handed to #handle_url, until the queue reports empty.
#
# @return [nil]
def run
  until @list_urls.empty?
    handle_url(URI(@list_urls.pop))
  end
end