Class: Meiriyigua::DetailCrawl
- Inherits:
-
Object
- Object
- Meiriyigua::DetailCrawl
- Includes:
- Models
- Defined in:
- lib/meiriyigua/detail_crawl.rb
Instance Method Summary collapse
- #handle_1234wg(page, page_record) ⇒ Object
- #handle_dongdongwg(page, page_record) ⇒ Object
- #handle_gg1z(page, page_record) ⇒ Object
- #handle_nanawg(page, page_record) ⇒ Object
- #handle_qh24(page, page_record) ⇒ Object
- #handle_ucbug(page, page_record) ⇒ Object
- #handle_url(uri) ⇒ Object
- #handle_uuuwg(page, page_record) ⇒ Object
- #handle_xiaolinzi(page, page_record) ⇒ Object
- #handle_xixiwg(page, page_record) ⇒ Object
-
#initialize(detail_urls) ⇒ DetailCrawl
constructor
A new instance of DetailCrawl.
- #join_downloads(downloads) ⇒ Object
- #run ⇒ Object
- #strip_content(content) ⇒ Object
Constructor Details
#initialize(detail_urls) ⇒ DetailCrawl
Returns a new instance of DetailCrawl.
9 10 11 12 |
# File 'lib/meiriyigua/detail_crawl.rb', line 9

# Build a detail-page crawler over a shared work queue.
#
# @param detail_urls [#pop, #empty?] queue of detail-page URL strings
#   still to be crawled (consumed destructively by #run)
def initialize(detail_urls)
  @detail_urls = detail_urls
  @agent       = CrawlClient.create_agent
end
Instance Method Details
#handle_1234wg(page, page_record) ⇒ Object
56 57 58 59 60 61 62 63 64 |
# File 'lib/meiriyigua/detail_crawl.rb', line 56

# Parse a 1234wg detail page into +page_record+.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] the populated record, or nil when the page
#   has no usable title or no recognizable download script
def handle_1234wg(page, page_record)
  page_record.title = page.search('td[width="583"] > font > strong font').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('body > table[background="/images/hgf-4.gif"] td[style="padding-left:6px;"] a:last-of-type').text
  page_record.content = strip_content(page.search('td#intro'))
  # The download filename is the second comma-separated argument of an
  # inline <script> call; strip the 6-char prefix and trailing quote.
  raw = page.search('td[valign="top"] > script:last-of-type').text.split(',')[1]
  return if raw.nil? # page layout changed / no download script — treat as parse failure
  filename = raw[6..-2]
  # URI.escape was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape is the
  # drop-in replacement with the same default unsafe-character set.
  page_record.downloads = "http://dx2down.bugwg.com:801/#{URI::DEFAULT_PARSER.escape(filename)}"
  page_record
end
#handle_dongdongwg(page, page_record) ⇒ Object
113 114 115 116 117 118 119 120 121 122 |
# File 'lib/meiriyigua/detail_crawl.rb', line 113

# Parse a dongdongwg detail page into +page_record+.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] populated record, or nil when no title found
def handle_dongdongwg(page, page_record)
  page_record.title = page.search('//div[@class="pageMainArea"]/h1/text()').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('span.current1 a:last-of-type').text

  # Drop the boilerplate trailing paragraph before extracting the intro text.
  intro = page.search('div#mainSoftIntro')
  intro.search('p:last-of-type').remove
  page_record.content = strip_content(intro)

  hrefs = page.search('ul.downlistbox a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
#handle_gg1z(page, page_record) ⇒ Object
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'lib/meiriyigua/detail_crawl.rb', line 93

# Parse a gg1z detail page into +page_record+.
#
# Download links are one hop away: the detail page lists mirror pages,
# and each mirror page carries the actual download anchors. Only the
# first and last mirror are fetched (deduplicated when they coincide).
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] populated record, or nil when no title found
def handle_gg1z(page, page_record)
  page_record.title = page.search('div.software-info > div.cp-top > h3').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.nav-breadcrumb a:nth-last-of-type(2)').text

  body = page.search('div.cp-main > div.cp-main')
  body.search('font[color="red"]').remove # strip inline ad/warning text
  page_record.content = strip_content(body)

  mirror_urls = page.search('ul.download-list a').map { |a| "http://www.gg1z.com#{a['href']}" }
  mirror_urls = [mirror_urls.first, mirror_urls.last].uniq

  resolved = mirror_urls.flat_map do |mirror_url|
    mirror = @agent.get(mirror_url, nil, page.uri.to_s)
    CrawlClient.set_page_encoding(mirror)
    mirror.search('div.downarea a').map do |a|
      a['href'] =~ /^http/ ? a['href'] : "http://www.gg1z.com#{a['href']}"
    end
  end

  page_record.downloads = join_downloads(resolved)
  page_record
end
#handle_nanawg(page, page_record) ⇒ Object
75 76 77 78 79 80 81 82 |
# File 'lib/meiriyigua/detail_crawl.rb', line 75

# Parse a nanawg detail page into +page_record+.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] populated record, or nil when no title found
def handle_nanawg(page, page_record)
  page_record.title = page.search('div.right_tit').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div#index3 a:last-of-type').text
  page_record.content = page.search('div.rightsum_text4').text

  # Relative download links get the site host prepended.
  links = page.search('ul.ul2 a').map do |anchor|
    href = anchor['href']
    href =~ /^http/ ? href : "http://www.nanawg.com#{href}"
  end
  page_record.downloads = join_downloads(links)
  page_record
end
#handle_qh24(page, page_record) ⇒ Object
66 67 68 69 70 71 72 73 |
# File 'lib/meiriyigua/detail_crawl.rb', line 66

# Parse a qh24 detail page into +page_record+.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] populated record, or nil when no title found
def handle_qh24(page, page_record)
  page_record.title = page.search('//*[@id="sintro"]/h1/text()').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('h2.classname > a:last-of-type').text
  page_record.content = strip_content(page.search('div.cnt'))

  hrefs = page.search('div#intext dd a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
#handle_ucbug(page, page_record) ⇒ Object
84 85 86 87 88 89 90 91 |
# File 'lib/meiriyigua/detail_crawl.rb', line 84

# Parse a ucbug detail page into +page_record+.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] populated record, or nil when no title found
def handle_ucbug(page, page_record)
  page_record.title = page.search('div.spmain_1 a').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = page.search('div.spmain_5').text

  hrefs = page.search('ul.ul_Address a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
#handle_url(uri) ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/meiriyigua/detail_crawl.rb', line 21

# Crawl one detail page and persist the result.
#
# Skips URLs that already have a UrlRecord, then dispatches to the
# site-specific parser +handle_<name>+ where +name+ is the second label
# of the host (e.g. "www.nanawg.com" -> handle_nanawg). Parsers return
# nil when the page yields no usable title, which is reported as a
# failure. Sleeps a random interval after every attempt to throttle.
#
# @param uri [URI] detail-page URL to fetch
# @return [void]
def handle_url(uri)
  url = uri.to_s
  # Already crawled — skip silently.
  return if UrlRecord.exist_url?(url)

  page = @agent.get(uri)
  CrawlClient.set_page_encoding(page)

  # Site key used for dynamic dispatch; URLs come from our own crawl
  # queue, so the handle_* target set is closed.
  name = uri.host.to_s.split('.')[1]

  url_record = UrlRecord.new
  url_record.url = url
  url_record.detail_at = Time.now
  page_record = PageRecord.new
  url_record.page_record = page_record

  page_record = send("handle_#{name}", page, page_record)

  # Single status line per URL (was duplicated across both branches).
  print "抓取详情页 #{url} "
  if page_record.nil?
    puts "失败"
  elsif url_record.save
    puts "成功"
  else
    puts "保存失败"
  end

  CrawlClient.random_sleep
end
#handle_uuuwg(page, page_record) ⇒ Object
124 125 126 127 128 129 130 131 |
# File 'lib/meiriyigua/detail_crawl.rb', line 124

# Parse a uuuwg detail page into +page_record+.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] populated record, or nil when no title found
def handle_uuuwg(page, page_record)
  page_record.title = page.search('div.spmain_1').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = strip_content(page.search('div.spmain_5'))

  hrefs = page.search('ul.spmain_3_2 > li:last-of-type a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
#handle_xiaolinzi(page, page_record) ⇒ Object
143 144 145 146 147 148 149 150 |
# File 'lib/meiriyigua/detail_crawl.rb', line 143

# Parse a xiaolinzi detail page into +page_record+.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] populated record, or nil when no title found
def handle_xiaolinzi(page, page_record)
  page_record.title = page.search('div.dlbt_wz').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.head_dh a:last-of-type').text
  page_record.content = strip_content(page.search('div#content_all'))

  hrefs = page.search('div.dl_link_bd a[target="_blank"]').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
#handle_xixiwg(page, page_record) ⇒ Object
133 134 135 136 137 138 139 140 141 |
# File 'lib/meiriyigua/detail_crawl.rb', line 133

# Parse a xixiwg detail page into +page_record+.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] the populated record, or nil when the page
#   has no usable title or no recognizable download script
def handle_xixiwg(page, page_record)
  page_record.title = page.search('div.r2 h2').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.location a:last-of-type').text
  page_record.content = strip_content(page.search('div#intro'))
  # Download filename is embedded in an inline <script> call: second
  # comma-separated argument, minus a 6-char prefix and 2-char suffix.
  raw = page.search('div.xzk script:last-of-type').text.split(',')[1]
  return if raw.nil? # page layout changed / no download script — treat as parse failure
  filename = raw.strip[6..-3]
  # URI.escape was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape is the
  # drop-in replacement with the same default unsafe-character set.
  page_record.downloads = "http://dxdown1.xixiwg.com/#{URI::DEFAULT_PARSER.escape(filename)}"
  page_record
end
#join_downloads(downloads) ⇒ Object
156 157 158 |
# File 'lib/meiriyigua/detail_crawl.rb', line 156

# Collapse a list of download URLs into the single-string storage format.
#
# Duplicates are dropped (first occurrence wins) and the survivors are
# glued together with the "#!#" separator used by PageRecord#downloads.
#
# @param downloads [Array<String>] download URLs, possibly with repeats
# @return [String] de-duplicated URLs joined by "#!#"
def join_downloads(downloads)
  unique = downloads.uniq
  unique.join('#!#')
end
#run ⇒ Object
14 15 16 17 18 19 |
# File 'lib/meiriyigua/detail_crawl.rb', line 14

# Drain the URL queue, crawling each detail page in turn.
#
# @return [void]
def run
  handle_url(URI(@detail_urls.pop)) until @detail_urls.empty?
end
#strip_content(content) ⇒ Object
152 153 154 |
# File 'lib/meiriyigua/detail_crawl.rb', line 152

# Reduce a parsed HTML fragment to its plain text.
#
# @param content [#text] node set (anything responding to #text,
#   e.g. a Nokogiri::XML::NodeSet)
# @return [String] the concatenated text content
def strip_content(content)
  content.text
end