Class: Baiduserp::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/baiduserp/parser.rb,
lib/baiduserp/parser/ranks.rb,
lib/baiduserp/parser/con_ar.rb,
lib/baiduserp/parser/zhixin.rb,
lib/baiduserp/parser/ads_top.rb,
lib/baiduserp/parser/ads_right.rb,
lib/baiduserp/parser/result_num.rb,
lib/baiduserp/parser/right_hotel.rb,
lib/baiduserp/parser/pinpaizhuanqu.rb,
lib/baiduserp/parser/right_weather.rb,
lib/baiduserp/parser/related_keywords.rb,
lib/baiduserp/parser/right_personinfo.rb,
lib/baiduserp/parser/right_relaperson.rb

Instance Method Summary

Instance Method Details

#_parse_ads_right(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/baiduserp/parser/ads_right.rb', line 2

# Builds one entry per `div.EC_im` element found in the parsed document.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Array<Hash>] hashes with :rank, :title, :content and :url
def _parse_ads_right(file)
  helper = Baiduserp::Helper

  file[:doc].search('div.EC_im').map do |ad|
    {
      # The first "bdfs" is removed from the id attribute; the remainder is
      # read as an integer and shifted by one to give the rank.
      :rank => ad['id'].sub('bdfs', '').to_i + 1,
      :title => helper.get_content_safe(ad.search('a.EC_t')),
      :content => helper.get_content_safe(ad.search('a.EC_desc/font')),
      :url => helper.get_content_safe(ad.search('font.EC_url'))
    }
  end
end

#_parse_ads_top(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/baiduserp/parser/ads_top.rb', line 2

# Collects the children of `div#content_left` whose class contains
# "ec_pp_f", stopping at the first child whose numeric id is in 1..2999.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Array<Hash>] hashes with :rank, :id, :title, :content and :url;
#   empty when `div#content_left` is not present
def _parse_ads_top(file)
  container = file[:doc].search('div#content_left').first
  return [] if container.nil?

  helper = Baiduserp::Helper
  # [title, content, url] selectors, chosen by the element name of the child.
  div_selectors = ['div.ec_title', 'div.ec_desc', 'span.ec_url']
  other_selectors = ['td.EC_header/a', 'a.EC_desc', 'a.EC_url']

  ads = []
  container.children.each do |node|
    id = node['id'].to_i
    break if id.between?(1, 2999)
    next unless node['class'].to_s.include?('ec_pp_f')

    title_sel, content_sel, url_sel = node.name == 'div' ? div_selectors : other_selectors
    ads << {
      # Every matching child is appended, so the rank is the running count.
      rank: ads.size + 1,
      id: id,
      title: helper.get_content_safe(node.search(title_sel)),
      content: helper.get_content_safe(node.search(content_sel)),
      url: helper.get_content_safe(node.search(url_sel))
    }
  end

  ads
end

#_parse_con_ar(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
# File 'lib/baiduserp/parser/con_ar.rb', line 2

# Lists the children of `div#content_right div#con-ar` whose class contains
# "result-op".
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Array<Hash>] hashes with :tpl (the raw attribute) and :data_click
#   (the data-click attribute passed through Helper.parse_data_click);
#   empty when the container is not present
def _parse_con_ar(file)
  container = file[:doc].search("div#content_right div#con-ar").first
  return [] if container.nil?

  ops = container.children.select { |node| node['class'].to_s.include?('result-op') }
  ops.map do |node|
    {
      :tpl => node['tpl'],
      :data_click => Baiduserp::Helper.parse_data_click(node['data-click'])
    }
  end
end

#_parse_pinpaizhuanqu(file) ⇒ Object



2
3
4
5
6
7
# File 'lib/baiduserp/parser/pinpaizhuanqu.rb', line 2

# Reports whether the third child of `div[@id='content_left']` is a
# `script` element.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Boolean] false when the container is missing or has fewer than
#   three children
def _parse_pinpaizhuanqu(file)
  part = file[:doc].search("div[@id='content_left']").first
  return false if part.nil?

  # A short or empty container has no third child; treat that as "not
  # present" instead of calling #name on nil.
  third = part.children[2]
  !third.nil? && third.name == 'script'
end

#_parse_ranks(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/baiduserp/parser/ranks.rb', line 2

# Extracts the children of `div[@id='content_left']` whose numeric id lies
# in 1..2999.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Array<Hash>] one hash per matching child with :rank (the id),
#   :result_op, :fk, :srcid, :tpl, :mu, :url, :title, :content and
#   :baiduopen; empty when the container is not present
def _parse_ranks(file)
  result = []
  part = file[:doc].search("div[@id='content_left']").first
  return result if part.nil?

  part.children.each do |table|
    next if table.nil?
    id = table['id'].to_i
    next unless id > 0 && id < 3000

    r = {:rank => id}
    r[:result_op] = table['class'].to_s.include?('result-op')
    r[:fk] = table['fk']
    r[:srcid] = table['srcid']
    r[:tpl] = table['tpl']
    r[:mu] = table['mu']

    anchor = table.search('h3/a').first
    # An anchor without an href yields nil here rather than raising.
    url = anchor.nil? ? nil : anchor['href']
    if !url.nil? && url.include?('http://www.baidu.com/link?')
      # Random pause (< 1s) before each redirect lookup; it is only taken
      # when a lookup is actually performed.
      sleep(rand)
      url = Baiduserp::Client.get_rank_url(url).headers['location']
    end
    r[:url] = url

    r[:title] = Baiduserp::Helper.get_content_safe(table.search('h3'))
    r[:content] = Baiduserp::Helper.get_content_safe(table.search('div.c-abstract'))

    # True when any link inside the entry points at open.baidu.com.
    r[:baiduopen] = table.search('a').any? do |link|
      link['href'].to_s.include?('open.baidu.com')
    end

    result << r
  end
  result
end


2
3
4
5
6
7
8
9
10
# File 'lib/baiduserp/parser/related_keywords.rb', line 2

# Gathers the text content of every link inside each `div[@id="rs"]`.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Array] link contents in document order; empty when nothing matches
def _parse_related_keywords(file)
  file[:doc].search('div[@id="rs"]').flat_map do |section|
    section.css('a').map { |anchor| anchor.content }
  end
end

#_parse_result_num(file) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
# File 'lib/baiduserp/parser/result_num.rb', line 4

# Reads the result count that follows "百度为您找到相关结果" in the raw HTML.
#
# Only ASCII digits and an optional "万" (x10,000) unit are kept from the
# text between that phrase and the next "个"; thousands separators and any
# other prefix characters are discarded. Supported shapes are therefore
# "1,234,000" and "12万3456" (-> 123456).
# NOTE(review): the "万" unit is an assumption about the page format -
# confirm against live pages.
#
# @param file [Hash] parse context; the raw HTML is read from :html
# @return [Integer] the parsed count, or 0 when the phrase is not found
def _parse_result_num(file)
  # Non-greedy so the capture stops at the first "个" after the phrase.
  raw = file[:html].to_s[/百度为您找到相关结果(.*?)个/, 1]
  return 0 if raw.nil?

  digits = raw.gsub(/[^\d万]/, '')
  if digits.include?('万')
    wan, rest = digits.split('万', 2)
    wan.to_i * 10000 + rest.to_i
  else
    digits.to_i
  end
end

#_parse_right_hotel(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
# File 'lib/baiduserp/parser/right_hotel.rb', line 2

# Reads the title of the first `div[@tpl="right_hotel"]` element.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Hash, nil] `{:title => ...}`, or nil when no such element exists
def _parse_right_hotel(file)
  matches = file[:doc].search('div[@tpl="right_hotel"]')
  hotel = matches.nil? ? nil : matches.first
  return nil if hotel.nil?

  { :title => Baiduserp::Helper.get_content_safe(hotel.search('div.opr-hotel-title')) }
end

#_parse_right_personinfo(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
# File 'lib/baiduserp/parser/right_personinfo.rb', line 2

# Reads four fields from the `div[@tpl="right_personinfo"]` match set.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Hash, nil] hash with :title, :info_summary, :info and :source;
#   nil when :title, :info and :source are all nil (:info_summary is not
#   part of that check)
def _parse_right_personinfo(file)
  panel = file[:doc].search('div[@tpl="right_personinfo"]')
  return nil if panel.nil?

  helper = Baiduserp::Helper
  title = helper.get_content_safe(panel.search('span.opr-personinfo-subtitle-large'))
  summary = helper.get_content_safe(panel.search('div.opr-personinfo-summary'))
  info = helper.get_content_safe(panel.search('div.opr-personinfo-info'))
  source = helper.get_content_safe(panel.search('div.opr-personinfo-source a'))

  return nil if [title, info, source].all? { |value| value.nil? }

  { :title => title, :info_summary => summary, :info => info, :source => source }
end

#_parse_right_relaperson(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/baiduserp/parser/right_relaperson.rb', line 2

# Describes every `div[@tpl="right_relaperson"]` element.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Array<Hash>, nil] per element: :title (content of the first
#   `div.cr-title/span`, or nil when absent) and :names (the title
#   attribute of each `p.opr-relaperson-name/a`)
def _parse_right_relaperson(file)
  panels = file[:doc].search('div[@tpl="right_relaperson"]')
  return nil if panels.nil?

  panels.map do |panel|
    heading = panel.search('div.cr-title/span').first
    {
      :title => heading.nil? ? nil : heading.content,
      :names => panel.search('p.opr-relaperson-name/a').map { |anchor| anchor['title'] }
    }
  end
end

#_parse_right_weather(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
# File 'lib/baiduserp/parser/right_weather.rb', line 2

# Reads the title and the week link of the first
# `div[@tpl="right_weather"]` element.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Hash, nil] `{:title => ..., :week => href}`; :week is nil when
#   the element has no `a.opr-weather-week` link; nil when no weather
#   element exists
def _parse_right_weather(file)
  rw = file[:doc].search('div[@tpl="right_weather"]')
  return nil if rw.nil?

  rw = rw.first
  return nil if rw.nil?

  title = Baiduserp::Helper.get_content_safe(rw.search('div.opr-weather-title'))

  # The link may be missing; report nil instead of indexing into nil.
  week_link = rw.search('a.opr-weather-week').first
  week = week_link.nil? ? nil : week_link['href']

  {:title => title, :week => week}
end

#_parse_zhixin(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
# File 'lib/baiduserp/parser/zhixin.rb', line 2

# Copies selected attributes of every `div#content_left .result-zxl` element.
#
# @param file [Hash] parse context; the parsed document is read from :doc
# @return [Array<Hash>] per element: the raw id, srcid, fk, tpl and mu
#   attributes plus :data_click (the data-click attribute passed through
#   Helper.parse_data_click)
def _parse_zhixin(file)
  copied = %w[id srcid fk tpl mu]

  file[:doc].search("div#content_left .result-zxl").map do |node|
    entry = {}
    copied.each { |attr| entry[attr.to_sym] = node[attr] }
    entry[:data_click] = Baiduserp::Helper.parse_data_click(node['data-click'])
    entry
  end
end

#get_search_html(keyword, page = 1) ⇒ Object



48
49
50
51
52
53
54
# File 'lib/baiduserp/parser.rb', line 48

# Fetches the raw HTML of a Baidu result page through Client.get_serp.
#
# The keyword is form-encoded on its own (spaces become "+", reserved
# characters such as "&", "#" and "+" are percent-encoded) so it cannot
# alter the rest of the query string.
#
# @param keyword [#to_s] the search phrase
# @param page [#to_i] 1-based page number; pages above 1 add `pn=<(page-1)*10>`
# @return the `body` of the response returned by Client.get_serp
def get_search_html(keyword,page=1)
  query = URI.encode_www_form_component(keyword.to_s)
  offset = page.to_i > 1 ? "&pn=#{page.to_i - 1}0" : ""
  # inputT receives a random value in 1000..1999 on every call.
  serp_url = "http://www.baidu.com/s?wd=#{query}#{offset}&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=2&rsv_sug=0&rsv_sug1=2&rsv_sug4=24&inputT=#{1000 + rand(1000)}"
  Client.get_serp(serp_url).body
end

#parse(html) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/baiduserp/parser.rb', line 13

# Parses a result page by running every public method of this object whose
# name starts with "_parse_" and storing each return value in a
# Baiduserp::Result under the method name without that prefix.
#
# The input string is not modified; invalid UTF-8 sequences are replaced in
# a copy. When a parser method raises a StandardError, the HTML is dumped
# to a timestamped file under /tmp, a notice is printed, and a RuntimeError
# is raised. Non-StandardError exceptions (e.g. Interrupt) propagate
# untouched.
#
# @param html [String] raw page HTML
# @return [Baiduserp::Result] the collected results
# @raise [RuntimeError] when any `_parse_*` method fails
def parse(html)
  html = html.encode('UTF-8', 'UTF-8', :invalid => :replace)
  @file = Hash.new
  @serp = Baiduserp::Result.new

  @file[:html] = html
  @file[:doc] = Nokogiri::HTML(html)

  methods.each do |m|
    name = m.to_s
    next unless name.start_with?('_parse_')

    begin
      @serp[name.sub('_parse_', '').to_sym] = send(m, @file)
    rescue StandardError => e
      issue_file = "/tmp/baiduserp_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
      # Block form closes (and flushes) the dump file before we raise.
      File.open(issue_file, 'w') { |f| f.puts(html) }
      puts "Notice:"
      puts "Baiduserp gem have a bug, please email to [email protected] to report it."
      puts "Please attach file #{issue_file} in the email and the error information below, thanks!"
      puts e.message
      puts e.inspect
      puts e.backtrace
      raise "Baiduserp Parser Get An Error!"
    end
  end

  @serp
end

#parse_file(file_path) ⇒ Object



56
57
58
59
60
61
62
63
# File 'lib/baiduserp/parser.rb', line 56

# Parses a page taken from a local file or, when no such file exists,
# fetched through Client.get_serp using the argument as the URL.
#
# @param file_path [String] path of an existing file, or a URL
# @return whatever #parse returns for the loaded HTML
def parse_file(file_path)
  html = if File.exist?(file_path)
    # File.read opens, reads and closes the file in one call.
    File.read(file_path)
  else
    Client.get_serp(file_path).body
  end
  parse html
end

#search(keyword, page = 1) ⇒ Object



43
44
45
46
# File 'lib/baiduserp/parser.rb', line 43

# Fetches the result page for a keyword via #get_search_html and hands the
# HTML to #parse.
#
# @param keyword the search phrase, passed through to #get_search_html
# @param page the page number, passed through to #get_search_html
# @return whatever #parse returns
def search(keyword,page=1)
  parse(get_search_html(keyword, page))
end