Class: Meiriyigua::DetailCrawl

Inherits: Object
Includes:
Models
Defined in:
lib/meiriyigua/detail_crawl.rb

Instance Method Summary collapse

Constructor Details

#initialize(detail_urls) ⇒ DetailCrawl

Returns a new instance of DetailCrawl.



9
10
11
12
# File 'lib/meiriyigua/detail_crawl.rb', line 9

# Builds a detail-page crawler over a queue of URLs.
#
# @param detail_urls [#pop, #empty?] queue of detail-page URL strings to drain
def initialize(detail_urls)
  # Shared HTTP agent for every page fetch (see CrawlClient).
  @agent       = CrawlClient.create_agent
  @detail_urls = detail_urls
end

Instance Method Details

#handle_1234wg(page, page_record) ⇒ Object



56
57
58
59
60
61
62
63
64
# File 'lib/meiriyigua/detail_crawl.rb', line 56

# Extracts title/category/content/download link from a 1234wg.com detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to fill in
# @return [PageRecord, nil] the filled record, or nil when no title was found
def handle_1234wg(page, page_record)
  page_record.title = page.search('td[width="583"] > font > strong font').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('body > table[background="/images/hgf-4.gif"] td[style="padding-left:6px;"] a:last-of-type').text
  page_record.content = strip_content(page.search('td#intro'))
  # The filename is embedded in an inline <script>; slice off the surrounding tokens.
  filename = page.search('td[valign="top"] > script:last-of-type').text.split(',')[1][6..-2]
  # URI.escape was deprecated in Ruby 2.7 and removed in 3.0;
  # URI::DEFAULT_PARSER.escape keeps the old escaping semantics.
  page_record.downloads = "http://dx2down.bugwg.com:801/#{URI::DEFAULT_PARSER.escape(filename)}"
  page_record
end

#handle_dongdongwg(page, page_record) ⇒ Object



113
114
115
116
117
118
119
120
121
122
# File 'lib/meiriyigua/detail_crawl.rb', line 113

# Scrapes a dongdongwg detail page into page_record.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to fill in
# @return [PageRecord, nil] nil when the title selector matched nothing
def handle_dongdongwg(page, page_record)
  heading = page.search('//div[@class="pageMainArea"]/h1/text()')
  page_record.title = heading.text.strip
  return if page_record.title.empty?

  page_record.category = page.search('span.current1 a:last-of-type').text

  intro = page.search('div#mainSoftIntro')
  # Drop the trailing paragraph before flattening to text.
  intro.search('p:last-of-type').remove
  page_record.content = strip_content(intro)

  links = page.search('ul.downlistbox a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(links)
  page_record
end

#handle_gg1z(page, page_record) ⇒ Object



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/meiriyigua/detail_crawl.rb', line 93

# Scrapes a gg1z.com detail page, following the first and last mirror pages
# to collect the real download links.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to fill in
# @return [PageRecord, nil] nil when the title selector matched nothing
def handle_gg1z(page, page_record)
  page_record.title = page.search('div.software-info > div.cp-top > h3').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.nav-breadcrumb a:nth-last-of-type(2)').text
  content = page.search('div.cp-main > div.cp-main')
  content.search('font[color="red"]').remove
  page_record.content = strip_content(content)

  downloads = page.search('ul.download-list a').collect{|a| "http://www.gg1z.com#{a['href']}"}
  # Keep only the first and last mirror pages. compact guards against an empty
  # anchor list, where first/last are both nil and @agent.get(nil, ...) would raise.
  downloads = [downloads.first, downloads.last].compact.uniq
  final_downloads = []
  downloads.each do |down|
    down_page = @agent.get(down, nil, page.uri.to_s)
    CrawlClient.set_page_encoding(down_page)
    # Relative links on the mirror page are rooted at the site's domain.
    final_downloads.concat( down_page.search('div.downarea a').collect{|a| a['href'] =~ /^http/ ? a['href'] : "http://www.gg1z.com#{a['href']}"} )
  end
  page_record.downloads = join_downloads(final_downloads)
  page_record
end

#handle_nanawg(page, page_record) ⇒ Object



75
76
77
78
79
80
81
82
# File 'lib/meiriyigua/detail_crawl.rb', line 75

# Scrapes a nanawg.com detail page into page_record.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to fill in
# @return [PageRecord, nil] nil when the title selector matched nothing
def handle_nanawg(page, page_record)
  page_record.title = page.search('div.right_tit').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div#index3 a:last-of-type').text
  page_record.content = page.search('div.rightsum_text4').text

  links = page.search('ul.ul2 a').map do |anchor|
    href = anchor['href']
    # Relative download links are rooted at the site's domain.
    href =~ /^http/ ? href : "http://www.nanawg.com#{href}"
  end
  page_record.downloads = join_downloads(links)
  page_record
end

#handle_qh24(page, page_record) ⇒ Object



66
67
68
69
70
71
72
73
# File 'lib/meiriyigua/detail_crawl.rb', line 66

# Scrapes a qh24 detail page into page_record.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to fill in
# @return [PageRecord, nil] nil when the title selector matched nothing
def handle_qh24(page, page_record)
  heading = page.search('//*[@id="sintro"]/h1/text()')
  page_record.title = heading.text.strip
  return if page_record.title.empty?

  page_record.category = page.search('h2.classname > a:last-of-type').text
  page_record.content = strip_content(page.search('div.cnt'))

  hrefs = page.search('div#intext dd a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end

#handle_ucbug(page, page_record) ⇒ Object



84
85
86
87
88
89
90
91
# File 'lib/meiriyigua/detail_crawl.rb', line 84

# Scrapes a ucbug detail page into page_record.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to fill in
# @return [PageRecord, nil] nil when the title selector matched nothing
def handle_ucbug(page, page_record)
  page_record.title = page.search('div.spmain_1 a').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = page.search('div.spmain_5').text

  hrefs = page.search('ul.ul_Address a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end

#handle_url(uri) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/meiriyigua/detail_crawl.rb', line 21

# Fetches one detail page and dispatches it to the site-specific handler
# (handle_<sitename>), then persists the resulting records.
#
# @param uri [URI] the detail-page URL to crawl
# @return [void]
def handle_url(uri)
  # Skip URLs that were already crawled.
  if UrlRecord.exist_url?(uri.to_s)
    #print "抓取详情页 #{uri.to_s} "
    #puts "重复,跳过"
    return
  end

  page = @agent.get(uri)
  CrawlClient.set_page_encoding(page)
  # Site name is the second host label, e.g. "www.ucbug.com" -> "ucbug".
  name = uri.host.to_s.split('.')[1]

  # Guard the dynamic dispatch: an unrecognized site would otherwise
  # raise NoMethodError and abort the whole crawl loop.
  handler = "handle_#{name}"
  unless respond_to?(handler, true)
    print "抓取详情页 #{uri.to_s} "
    puts "失败"
    return
  end

  url_record = UrlRecord.new
  url_record.url = uri.to_s
  url_record.detail_at = Time.now

  page_record = PageRecord.new
  url_record.page_record = page_record

  # Handlers return nil when the page yields no usable title.
  page_record = send(handler, page, page_record)
  if page_record.nil?
    print "抓取详情页 #{uri.to_s} "
    puts "失败"
  else
    print "抓取详情页 #{uri.to_s} "
    if url_record.save
      puts "成功"
    else
      puts "保存失败"
    end
  end

  # Throttle between requests to avoid hammering the target site.
  CrawlClient.random_sleep
end

#handle_uuuwg(page, page_record) ⇒ Object



124
125
126
127
128
129
130
131
# File 'lib/meiriyigua/detail_crawl.rb', line 124

# Scrapes a uuuwg detail page into page_record.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to fill in
# @return [PageRecord, nil] nil when the title selector matched nothing
def handle_uuuwg(page, page_record)
  page_record.title = page.search('div.spmain_1').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = strip_content(page.search('div.spmain_5'))

  hrefs = page.search('ul.spmain_3_2 > li:last-of-type a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end

#handle_xiaolinzi(page, page_record) ⇒ Object



143
144
145
146
147
148
149
150
# File 'lib/meiriyigua/detail_crawl.rb', line 143

# Scrapes a xiaolinzi detail page into page_record.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to fill in
# @return [PageRecord, nil] nil when the title selector matched nothing
def handle_xiaolinzi(page, page_record)
  page_record.title = page.search('div.dlbt_wz').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.head_dh a:last-of-type').text
  page_record.content = strip_content(page.search('div#content_all'))

  hrefs = page.search('div.dl_link_bd a[target="_blank"]').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end

#handle_xixiwg(page, page_record) ⇒ Object



133
134
135
136
137
138
139
140
141
# File 'lib/meiriyigua/detail_crawl.rb', line 133

# Extracts title/category/content/download link from a xixiwg.com detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to fill in
# @return [PageRecord, nil] the filled record, or nil when no title was found
def handle_xixiwg(page, page_record)
  page_record.title = page.search('div.r2 h2').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.location a:last-of-type').text
  page_record.content = strip_content(page.search('div#intro'))
  # The filename is embedded in an inline <script>; slice off the surrounding tokens.
  filename = page.search('div.xzk script:last-of-type').text.split(',')[1].strip[6..-3]
  # URI.escape was deprecated in Ruby 2.7 and removed in 3.0;
  # URI::DEFAULT_PARSER.escape keeps the old escaping semantics.
  page_record.downloads = "http://dxdown1.xixiwg.com/#{URI::DEFAULT_PARSER.escape(filename)}"
  page_record
end

#join_downloads(downloads) ⇒ Object



156
157
158
# File 'lib/meiriyigua/detail_crawl.rb', line 156

# Joins de-duplicated download URLs with the "#!#" separator used by the records.
#
# @param downloads [Array<String>] raw download URLs (may contain duplicates)
# @return [String] unique URLs joined by "#!#"
def join_downloads(downloads)
  unique_links = downloads.uniq
  unique_links.join('#!#')
end

#runObject



14
15
16
17
18
19
# File 'lib/meiriyigua/detail_crawl.rb', line 14

# Drains the URL queue, crawling each detail page in turn.
#
# @return [void]
def run
  # `until` reads better than `while !` (Ruby style guide).
  until @detail_urls.empty?
    uri = URI(@detail_urls.pop)
    handle_url(uri)
  end
end

#strip_content(content) ⇒ Object



152
153
154
# File 'lib/meiriyigua/detail_crawl.rb', line 152

# Flattens a node set to its plain-text content.
#
# @param content [#text] Nokogiri node set (anything responding to #text)
# @return [String] the concatenated text of the nodes
def strip_content(content)
  text = content.text
  text
end