Class: Baiduserp::Parser
- Inherits: Object
- Inheritance chain: Object → Baiduserp::Parser
- Defined in:
- lib/baiduserp/parser.rb,
lib/baiduserp/parser/ranks.rb,
lib/baiduserp/parser/con_ar.rb,
lib/baiduserp/parser/zhixin.rb,
lib/baiduserp/parser/ads_top.rb,
lib/baiduserp/parser/ads_right.rb,
lib/baiduserp/parser/result_num.rb,
lib/baiduserp/parser/right_hotel.rb,
lib/baiduserp/parser/pinpaizhuanqu.rb,
lib/baiduserp/parser/right_weather.rb,
lib/baiduserp/parser/related_keywords.rb,
lib/baiduserp/parser/right_personinfo.rb,
lib/baiduserp/parser/right_relaperson.rb
Instance Method Summary
- #_parse_ads_right(file) ⇒ Object
- #_parse_ads_top(file) ⇒ Object
- #_parse_con_ar(file) ⇒ Object
- #_parse_pinpaizhuanqu(file) ⇒ Object
- #_parse_ranks(file) ⇒ Object
- #_parse_related_keywords(file) ⇒ Object
- #_parse_result_num(file) ⇒ Object
- #_parse_right_hotel(file) ⇒ Object
- #_parse_right_personinfo(file) ⇒ Object
- #_parse_right_relaperson(file) ⇒ Object
- #_parse_right_weather(file) ⇒ Object
- #_parse_zhixin(file) ⇒ Object
- #get_search_html(keyword, page = 1) ⇒ Object
- #parse(html) ⇒ Object
- #parse_file(file_path) ⇒ Object
- #search(keyword, page = 1) ⇒ Object
Instance Method Details
#_parse_ads_right(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/baiduserp/parser/ads_right.rb', line 2
#
# Collects the ads found in `div.EC_im` elements of the parsed page.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Array<Hash>] one hash per ad with :rank, :title, :content and :url
def _parse_ads_right(file)
  file[:doc].search('div.EC_im').map do |ad|
    # Strip the 'bdfs' prefix from the element id and shift it by one.
    position = ad['id'].sub('bdfs', '').to_i + 1
    {
      :rank    => position,
      :title   => Baiduserp::Helper.get_content_safe(ad.search('a.EC_t')),
      :content => Baiduserp::Helper.get_content_safe(ad.search('a.EC_desc/font')),
      :url     => Baiduserp::Helper.get_content_safe(ad.search('font.EC_url'))
    }
  end
end
#_parse_ads_top(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/baiduserp/parser/ads_top.rb', line 2
#
# Collects the ads that precede the numbered results inside `div#content_left`.
#
# Children are walked in document order. The walk stops at the first child
# whose numeric id lies strictly between 0 and 3000, and only children whose
# class contains 'ec_pp_f' are counted as ads.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Array<Hash>] one hash per ad with :rank, :id, :title, :content and :url
def _parse_ads_top(file)
  ads = []
  container = file[:doc].search('div#content_left').first
  return ads if container.nil?

  position = 0
  container.children.each do |node|
    node_id = node['id'].to_i
    break if node_id.between?(1, 2999)
    next unless node['class'].to_s.include?('ec_pp_f')

    position += 1
    # Ads are rendered either as a <div> or (otherwise) as a <table>;
    # each layout uses its own selectors for the same three fields.
    title_sel, content_sel, url_sel =
      if node.name == 'div'
        ['div.ec_title', 'div.ec_desc', 'span.ec_url']
      else
        ['td.EC_header/a', 'a.EC_desc', 'a.EC_url']
      end

    ad = { rank: position, id: node_id }
    ad[:title]   = Baiduserp::Helper.get_content_safe(node.search(title_sel))
    ad[:content] = Baiduserp::Helper.get_content_safe(node.search(content_sel))
    ad[:url]     = Baiduserp::Helper.get_content_safe(node.search(url_sel))
    ads << ad
  end
  ads
end
#_parse_con_ar(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 |
# File 'lib/baiduserp/parser/con_ar.rb', line 2
#
# Lists the 'result-op' children of `div#content_right div#con-ar`.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Array<Hash>] one hash per matching child with :tpl and :data_click;
#   empty when the container is missing
def _parse_con_ar(file)
  container = file[:doc].search("div#content_right div#con-ar").first
  return [] if container.nil?

  cards = container.children.select do |node|
    node['class'].to_s.include?('result-op')
  end

  cards.map do |node|
    {
      :tpl        => node['tpl'],
      :data_click => Baiduserp::Helper.parse_data_click(node['data-click'])
    }
  end
end
#_parse_pinpaizhuanqu(file) ⇒ Object
2 3 4 5 6 7 |
# File 'lib/baiduserp/parser/pinpaizhuanqu.rb', line 2
#
# Reports whether the third child node of `div#content_left` is a <script>
# element.
# NOTE(review): presumably this marks a "pinpaizhuanqu" block at the top of
# the results; confirm against real pages.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Boolean] false when the container is missing or has fewer than
#   three children
def _parse_pinpaizhuanqu(file)
  part = file[:doc].search("div[@id='content_left']").first
  return false if part.nil?

  # children[2] is nil for short containers; guard instead of raising.
  third = part.children[2]
  !third.nil? && third.name == 'script'
end
#_parse_ranks(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/baiduserp/parser/ranks.rb', line 2
#
# Extracts the numbered results from `div#content_left`.
#
# A child counts as a result when its numeric id lies strictly between 0 and
# 3000. For every result that has an `h3/a` link the method sleeps for a
# random fraction of a second, then, if the href is a
# 'http://www.baidu.com/link?' redirect, replaces it with the 'location'
# header returned by Baiduserp::Client.get_rank_url.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Array<Hash>] one hash per result with :rank, :result_op, :fk,
#   :srcid, :tpl, :mu, :url, :title, :content and :baiduopen
def _parse_ranks(file)
  ranks = []
  container = file[:doc].search("div[@id='content_left']").first
  return ranks if container.nil?

  container.children.each do |node|
    next if node.nil?

    position = node['id'].to_i
    next if position <= 0 || position >= 3000

    entry = {
      :rank      => position,
      :result_op => node['class'].to_s.include?('result-op'),
      :fk        => node['fk'],
      :srcid     => node['srcid'],
      :tpl       => node['tpl'],
      :mu        => node['mu']
    }

    target = node.search('h3/a').first
    unless target.nil?
      target = target['href']
      sleep(rand)
      if target.include?('http://www.baidu.com/link?')
        target = Baiduserp::Client.get_rank_url(target).headers['location']
      end
    end
    entry[:url] = target

    entry[:title]   = Baiduserp::Helper.get_content_safe(node.search('h3'))
    entry[:content] = Baiduserp::Helper.get_content_safe(node.search('div.c-abstract'))

    # True as soon as any link inside the result points at open.baidu.com.
    entry[:baiduopen] = node.search('a').any? do |anchor|
      anchor['href'].to_s.include?('open.baidu.com')
    end

    ranks << entry
  end
  ranks
end
#_parse_related_keywords(file) ⇒ Object
2 3 4 5 6 7 8 9 10 |
# File 'lib/baiduserp/parser/related_keywords.rb', line 2
#
# Collects the text of every link inside `div#rs`.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Array<String>] link texts in document order; empty when there is
#   no `div#rs`
def _parse_related_keywords(file)
  file[:doc].search('div[@id="rs"]').flat_map do |rs|
    rs.css('a').map { |link| link.content }
  end
end
#_parse_result_num(file) ⇒ Object
4 5 6 7 8 9 10 11 12 13 14 15 16 |
# File 'lib/baiduserp/parser/result_num.rb', line 4
#
# Reads the result counter from the raw page text, i.e. the number that
# appears between '百度为您找到相关结果' and the following '个'.
#
# The '约' prefix and thousands separators are dropped. A '万' suffix
# multiplies the part before it by 10000; digits after it are added.
#
# @param file [Hash] parse context; `file[:html]` is the page as a String
# @return [Integer] the parsed count, or 0 when the counter is not present
def _parse_result_num(file)
  # Non-greedy capture: stop at the first '个' after the marker, not the last
  # one on the line.
  raw = file[:html][/百度为您找到相关结果(.*?)个/, 1]
  return 0 if raw.nil?

  # Strip separators up front so "1,234万" is read as 1234万, not 1万.
  digits = raw.gsub(/[约,]/, '')
  if digits.include?('万')
    whole, rest = digits.split('万')
    whole.to_i * 10000 + rest.to_i
  else
    digits.to_i
  end
end
#_parse_right_hotel(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 |
# File 'lib/baiduserp/parser/right_hotel.rb', line 2
#
# Extracts the title of the first `div[tpl="right_hotel"]` element.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Hash, nil] `{:title => ...}`, or nil when no such element exists
def _parse_right_hotel(file)
  matches = file[:doc].search('div[@tpl="right_hotel"]')
  hotel = matches.nil? ? nil : matches.first
  return nil if hotel.nil?

  heading = hotel.search('div.opr-hotel-title')
  { :title => Baiduserp::Helper.get_content_safe(heading) }
end
#_parse_right_personinfo(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 |
# File 'lib/baiduserp/parser/right_personinfo.rb', line 2
#
# Extracts title, summary, info and source from the
# `div[tpl="right_personinfo"]` element(s).
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Hash, nil] hash with :title, :info_summary, :info and :source, or
#   nil when title, info and source are all nil (the summary is not considered)
def _parse_right_personinfo(file)
  card = file[:doc].search('div[@tpl="right_personinfo"]')
  return nil if card.nil?

  text_of = lambda do |selector|
    Baiduserp::Helper.get_content_safe(card.search(selector))
  end

  title   = text_of.call('span.opr-personinfo-subtitle-large')
  summary = text_of.call('div.opr-personinfo-summary')
  info    = text_of.call('div.opr-personinfo-info')
  source  = text_of.call('div.opr-personinfo-source a')

  return nil if [title, info, source].all?(&:nil?)

  { :title => title, :info_summary => summary, :info => info, :source => source }
end
#_parse_right_relaperson(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/baiduserp/parser/right_relaperson.rb', line 2
#
# Lists every `div[tpl="right_relaperson"]` element with its heading and the
# `title` attributes of its `p.opr-relaperson-name/a` links.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Array<Hash>, nil] one `{:title, :names}` hash per element; nil only
#   when the search itself returns nil
def _parse_right_relaperson(file)
  blocks = file[:doc].search('div[@tpl="right_relaperson"]')
  return nil if blocks.nil?

  blocks.map do |block|
    heading = block.search('div.cr-title/span').first
    names = block.search('p.opr-relaperson-name/a').map { |anchor| anchor['title'] }
    {
      :title => heading.nil? ? nil : heading.content,
      :names => names
    }
  end
end
#_parse_right_weather(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 |
# File 'lib/baiduserp/parser/right_weather.rb', line 2
#
# Extracts the title and the "week" link target from the first
# `div[tpl="right_weather"]` element.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Hash, nil] `{:title => ..., :week => href}`, or nil when no such
#   element exists; :week is nil when the element has no `a.opr-weather-week`
#   link
def _parse_right_weather(file)
  rw = file[:doc].search('div[@tpl="right_weather"]')
  return nil if rw.nil?

  rw = rw.first
  return nil if rw.nil?

  title = Baiduserp::Helper.get_content_safe(rw.search('div.opr-weather-title'))

  # The week link may be absent; report nil instead of raising NoMethodError.
  week_link = rw.search('a.opr-weather-week').first
  week = week_link.nil? ? nil : week_link['href']

  { :title => title, :week => week }
end
#_parse_zhixin(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 |
# File 'lib/baiduserp/parser/zhixin.rb', line 2
#
# Lists the `.result-zxl` elements found under `div#content_left`.
#
# @param file [Hash] parse context; `file[:doc]` is the parsed document
# @return [Array<Hash>] one hash per element with :id, :srcid, :fk, :tpl, :mu
#   (raw attribute values) and :data_click (the parsed data-click attribute)
def _parse_zhixin(file)
  file[:doc].search("div#content_left .result-zxl").map do |node|
    entry = {}
    entry[:id]         = node['id']
    entry[:srcid]      = node['srcid']
    entry[:fk]         = node['fk']
    entry[:tpl]        = node['tpl']
    entry[:mu]         = node['mu']
    entry[:data_click] = Baiduserp::Helper.parse_data_click(node['data-click'])
    entry
  end
end
#get_search_html(keyword, page = 1) ⇒ Object
48 49 50 51 52 53 54 |
# File 'lib/baiduserp/parser.rb', line 48
#
# Builds the Baidu search URL for a keyword and page, fetches it through
# Client.get_serp and returns the response body.
#
# @param keyword [String] search terms; spaces and reserved characters
#   (&, #, +, ...) are form-encoded so they cannot corrupt the query string
# @param page [Integer, #to_i] 1-based page number; pages above 1 add a
#   `pn` offset of (page - 1) * 10
# @return [Object] whatever `Client.get_serp(url).body` returns
def get_search_html(keyword, page = 1)
  # URI.escape is obsolete (removed in Ruby 3.0) and left '&', '#', '+' in the
  # keyword untouched; form-encoding handles them and turns spaces into '+'.
  query = URI.encode_www_form_component(keyword)

  page_number = page.to_i
  offset = page_number > 1 ? "&pn=#{(page_number - 1) * 10}" : ""

  # inputT is randomised per request (1000..1999).
  serp_url = "http://www.baidu.com/s?wd=#{query}#{offset}&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=24&inputT=#{1000 + rand(1000)}"
  Client.get_serp(serp_url).body
end
#parse(html) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/baiduserp/parser.rb', line 13
#
# Parses a result page by running every public method of this object whose
# name starts with `_parse_` and storing each return value in a
# Baiduserp::Result under the method name without that prefix.
#
# If any parser raises, the page is dumped to a timestamped file under /tmp,
# a bug-report notice is printed to stdout and a RuntimeError is raised.
#
# @param html [String] raw page markup; it is not modified
# @return [Baiduserp::Result] the collected results
# @raise [RuntimeError] when one of the `_parse_*` methods fails
def parse(html)
  # Non-destructive encode: leaves the caller's string alone and also works
  # when it is frozen.
  html = html.encode('UTF-8', 'UTF-8', :invalid => :replace)

  @file = { :html => html, :doc => Nokogiri::HTML(html) }
  @serp = Baiduserp::Result.new

  methods.each do |m|
    next unless m =~ /^_parse_/

    begin
      @serp[m.to_s.sub('_parse_', '').to_sym] = send(m, @file)
    rescue StandardError => e
      # StandardError only: Interrupt, SystemExit and the like must propagate.
      issue_file = "/tmp/baiduserp_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
      # Block form closes (and flushes) the dump before the error is reported.
      File.open(issue_file, 'w') { |dump| dump.puts(html) }
      puts "Notice:"
      puts "Baiduserp gem have a bug, please email to [email protected] to report it."
      puts "Please attach file #{issue_file} in the email and the error information below, thanks!"
      puts e.message
      puts e.inspect
      puts e.backtrace
      raise "Baiduserp Parser Get An Error!"
    end
  end
  @serp
end
#parse_file(file_path) ⇒ Object
56 57 58 59 60 61 62 63 |
# File 'lib/baiduserp/parser.rb', line 56
#
# Parses a page taken from a local file when `file_path` exists on disk;
# otherwise treats `file_path` as something Client.get_serp can fetch and
# parses the response body.
#
# @param file_path [String] local path, or a location passed to Client.get_serp
# @return [Object] the result of #parse
def parse_file(file_path)
  html = if File.exist?(file_path) # File.exists? was removed in Ruby 3.2
           # File.read closes the handle; open(path).read left it open.
           File.read(file_path)
         else
           Client.get_serp(file_path).body
         end
  parse(html)
end
#search(keyword, page = 1) ⇒ Object
43 44 45 46 |
# File 'lib/baiduserp/parser.rb', line 43
#
# Fetches the result page for a keyword and parses it.
#
# @param keyword [String] search terms, passed to #get_search_html
# @param page [Integer] page number, passed to #get_search_html
# @return [Object] the result of #parse
def search(keyword, page = 1)
  parse(get_search_html(keyword, page))
end