Class: Meiriyigua::DetailCrawl

Inherits:
Object
  • Object
show all
Includes:
Models
Defined in:
lib/meiriyigua/detail_crawl.rb

Instance Method Summary collapse

Constructor Details

#initialize(detail_urls) ⇒ DetailCrawl

Returns a new instance of DetailCrawl.



9
10
11
12
# File 'lib/meiriyigua/detail_crawl.rb', line 9

def initialize(detail_urls)
  # Keep the shared crawl agent and the queue of detail-page URLs to visit.
  @agent = CrawlClient.create_agent
  @detail_urls = detail_urls
end

Instance Method Details

#handle_1234wg(page, page_record) ⇒ Object



58
59
60
61
62
63
64
65
66
# File 'lib/meiriyigua/detail_crawl.rb', line 58

def handle_1234wg(page, page_record)
  # Parse a 1234wg.com detail page into page_record.
  # Returns page_record on success, nil when the title selector matches
  # nothing (layout mismatch / unexpected page).
  page_record.title = page.search('td[width="583"] > font > strong font').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('body > table[background="/images/hgf-4.gif"] td[style="padding-left:6px;"] a:last-of-type').text
  page_record.content = strip_content(page.search('td#intro'))
  # The download filename is embedded in an inline <script> call; take the
  # second comma-separated argument and slice the quoted filename out of it.
  filename = page.search('td[valign="top"] > script:last-of-type').text.split(',')[1][6..-2]
  # FIX: URI.escape was deprecated in Ruby 2.7 and removed in 3.0.
  # URI::DEFAULT_PARSER.escape has the same escaping semantics.
  page_record.downloads = "http://dx2down.bugwg.com:801/#{URI::DEFAULT_PARSER.escape(filename)}"
  page_record
end

#handle_dongdongwg(page, page_record) ⇒ Object



115
116
117
118
119
120
121
122
123
124
# File 'lib/meiriyigua/detail_crawl.rb', line 115

def handle_dongdongwg(page, page_record)
  # Parse a dongdongwg detail page into page_record; nil when no title found.
  page_record.title = page.search('//div[@class="pageMainArea"]/h1/text()').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('span.current1 a:last-of-type').text
  intro = page.search('div#mainSoftIntro')
  # Drop the trailing paragraph (site boilerplate) before capturing the text.
  intro.search('p:last-of-type').remove
  page_record.content = strip_content(intro)
  hrefs = page.search('ul.downlistbox a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end

#handle_gg1z(page, page_record) ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/meiriyigua/detail_crawl.rb', line 95

def handle_gg1z(page, page_record)
  # Parse a gg1z.com detail page; follows intermediate download pages to
  # resolve the final mirror links. Returns nil when no title was found.
  page_record.title = page.search('div.software-info > div.cp-top > h3').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.nav-breadcrumb a:nth-last-of-type(2)').text
  intro = page.search('div.cp-main > div.cp-main')
  intro.search('font[color="red"]').remove
  page_record.content = strip_content(intro)

  # Only the first and last list entries are visited (deduplicated).
  list_links = page.search('ul.download-list a').map { |anchor| "http://www.gg1z.com#{anchor['href']}" }
  list_links = [list_links.first, list_links.last].uniq
  resolved = list_links.flat_map do |link|
    mirror_page = @agent.get(link, nil, page.uri.to_s)
    CrawlClient.set_page_encoding(mirror_page)
    mirror_page.search('div.downarea a').map do |anchor|
      href = anchor['href']
      href =~ /^http/ ? href : "http://www.gg1z.com#{href}"
    end
  end
  page_record.downloads = join_downloads(resolved)
  page_record
end

#handle_nanawg(page, page_record) ⇒ Object



77
78
79
80
81
82
83
84
# File 'lib/meiriyigua/detail_crawl.rb', line 77

def handle_nanawg(page, page_record)
  # Parse a nanawg.com detail page into page_record; nil when no title found.
  page_record.title = page.search('div.right_tit').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div#index3 a:last-of-type').text
  page_record.content = page.search('div.rightsum_text4').text
  links = page.search('ul.ul2 a').map do |anchor|
    href = anchor['href']
    # Relative links are rooted at the site's host.
    href =~ /^http/ ? href : "http://www.nanawg.com#{href}"
  end
  page_record.downloads = join_downloads(links)
  page_record
end

#handle_qh24(page, page_record) ⇒ Object



68
69
70
71
72
73
74
75
# File 'lib/meiriyigua/detail_crawl.rb', line 68

def handle_qh24(page, page_record)
  # Parse a qh24 detail page into page_record; nil when no title found.
  page_record.title = page.search('//*[@id="sintro"]/h1/text()').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('h2.classname > a:last-of-type').text
  page_record.content = strip_content(page.search('div.cnt'))
  hrefs = page.search('div#intext dd a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end

#handle_ucbug(page, page_record) ⇒ Object



86
87
88
89
90
91
92
93
# File 'lib/meiriyigua/detail_crawl.rb', line 86

def handle_ucbug(page, page_record)
  # Parse a ucbug detail page into page_record; nil when no title found.
  page_record.title = page.search('div.spmain_1 a').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = page.search('div.spmain_5').text
  hrefs = page.search('ul.ul_Address a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end

#handle_url(uri) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/meiriyigua/detail_crawl.rb', line 21

# Crawl a single detail-page URI: skip known URLs, fetch the page,
# dispatch to the per-site handler (handle_<host-second-label>), and
# persist the result. Errors are logged and swallowed so the crawl
# loop in #run keeps going.
def handle_url(uri)

  # Already crawled — nothing to do.
  if UrlRecord.exist_url?(uri.to_s)
    #print "抓取详情页 #{uri.to_s} "
    #puts "重复,跳过"
    return
  end

  page = @agent.get(uri)
  CrawlClient.set_page_encoding(page)
  # Second label of the host picks the handler, e.g. "www.nanawg.com"
  # -> "nanawg" -> handle_nanawg.
  name = uri.host.to_s.split('.')[1]

  url_record = UrlRecord.new
  url_record.url = uri.to_s
  url_record.detail_at = Time.now

  page_record = PageRecord.new
  url_record.page_record = page_record

  # Handlers return nil when the page could not be parsed (no title).
  page_record = send("handle_#{name}", page, page_record)
  if page_record.nil?
    print "抓取详情页 #{uri.to_s} "
    puts "失败"
  else
    print "抓取详情页 #{uri.to_s} "
    if url_record.save
      puts "成功"
    else
      puts "保存失败"
    end
  end

  # Be polite to the target sites between requests.
  CrawlClient.random_sleep
rescue
    # Rescues StandardError only; logs class, message and backtrace,
    # then returns normally so the crawl continues with the next URL.
    puts "抓取详情出错了 #{$!.class} #{$!.message}\n#{$!.backtrace.join("\n")}"
end

#handle_uuuwg(page, page_record) ⇒ Object



126
127
128
129
130
131
132
133
# File 'lib/meiriyigua/detail_crawl.rb', line 126

def handle_uuuwg(page, page_record)
  # Parse a uuuwg detail page into page_record; nil when no title found.
  page_record.title = page.search('div.spmain_1').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = strip_content(page.search('div.spmain_5'))
  hrefs = page.search('ul.spmain_3_2 > li:last-of-type a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end

#handle_xiaolinzi(page, page_record) ⇒ Object



145
146
147
148
149
150
151
152
# File 'lib/meiriyigua/detail_crawl.rb', line 145

def handle_xiaolinzi(page, page_record)
  # Parse a xiaolinzi detail page into page_record; nil when no title found.
  page_record.title = page.search('div.dlbt_wz').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.head_dh a:last-of-type').text
  page_record.content = strip_content(page.search('div#content_all'))
  hrefs = page.search('div.dl_link_bd a[target="_blank"]').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end

#handle_xixiwg(page, page_record) ⇒ Object



135
136
137
138
139
140
141
142
143
# File 'lib/meiriyigua/detail_crawl.rb', line 135

def handle_xixiwg(page, page_record)
  # Parse a xixiwg.com detail page into page_record.
  # Returns page_record on success, nil when the title selector matches
  # nothing (layout mismatch / unexpected page).
  page_record.title = page.search('div.r2 h2').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.location a:last-of-type').text
  page_record.content = strip_content(page.search('div#intro'))
  # Download filename is embedded in an inline <script>; take the second
  # comma-separated argument and slice the quoted filename out of it.
  filename = page.search('div.xzk script:last-of-type').text.split(',')[1].strip[6..-3]
  # FIX: URI.escape was deprecated in Ruby 2.7 and removed in 3.0.
  # URI::DEFAULT_PARSER.escape has the same escaping semantics.
  page_record.downloads = "http://dxdown1.xixiwg.com/#{URI::DEFAULT_PARSER.escape(filename)}"
  page_record
end

#join_downloads(downloads) ⇒ Object



158
159
160
# File 'lib/meiriyigua/detail_crawl.rb', line 158

# De-duplicate download links (order preserved) and pack them into a
# single '#!#'-separated string for storage in PageRecord#downloads.
def join_downloads(downloads)
  unique_links = downloads.uniq
  unique_links.join('#!#')
end

#run ⇒ Object



14
15
16
17
18
19
# File 'lib/meiriyigua/detail_crawl.rb', line 14

def run
  # Drain the shared URL queue, crawling one detail page per iteration.
  until @detail_urls.empty?
    handle_url(URI(@detail_urls.pop))
  end
end

#strip_content(content) ⇒ Object



154
155
156
# File 'lib/meiriyigua/detail_crawl.rb', line 154

# Collapse a Nokogiri node set (or anything responding to #text) to its
# plain-text content. Note: despite the name, surrounding whitespace is
# NOT stripped — only markup is removed by #text.
def strip_content(content)
  plain = content.text
  plain
end