Class: Baidu

Inherits:
Object
  • Object
show all
Defined in:
lib/baidu.rb

Constant Summary collapse

BaseUri =
'http://www.baidu.com/s?'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Baidu

Returns a new instance of Baidu.



9
10
11
12
13
14
15
16
17
18
# File 'lib/baidu.rb', line 9

# Prepares the HTTP agent and the default search state.
#
# @a is a Mechanize agent using the 'Linux Mozilla' user-agent alias, an idle
# timeout of 2 and a history limited to one page. Searches ask for 100 results
# per page, no page is loaded yet, debug is off and the result cache is empty.
def initialize
  @data = {}
  @debug = false
  @page = nil
  @perpage = 100
  @a = Mechanize.new do |agent|
    agent.user_agent_alias = 'Linux Mozilla'
  end
  @a.max_history = 1
  @a.idle_timeout = 2
end

Instance Attribute Details

#data ⇒ Object (readonly)

Returns the value of attribute data.



7
8
9
# File 'lib/baidu.rb', line 7

# Reader for the @data hash.
#
# @data starts out as an empty Hash in #initialize, is emptied at the start of
# every #query, and is where #ranks stores its result under :ranks.
#
# @return [Hash] the hash itself, not a copy
def data
  @data
end

#debug ⇒ Object

Returns the value of attribute debug.



6
7
8
# File 'lib/baidu.rb', line 6

# Reader for the @debug flag.
#
# #initialize sets the flag to false. NOTE(review): the only visible use of
# @debug is inside commented-out code in #rank — confirm whether it is still
# consulted anywhere.
#
# @return [Boolean] current value of @debug
def debug
  @debug
end

#page ⇒ Object (readonly)

Returns the value of attribute page.



7
8
9
# File 'lib/baidu.rb', line 7

# Reader for the page currently being inspected.
#
# @page is nil after #initialize, becomes the result of the agent's GET in
# #query, and is replaced by the result of the agent's click in #next.
# #how_many, #ranks and #related_keywords refuse to work unless it is a
# Mechanize::Page.
#
# @return [Object, nil] current value of @page
def page
  @page
end

#pagenumber ⇒ Object

Returns the value of attribute pagenumber.



6
7
8
# File 'lib/baidu.rb', line 6

# Reader for @pagenumber.
#
# NOTE(review): none of the methods shown in this file assign @pagenumber
# (#query and #next work with @currpage instead), so this presumably returns
# nil unless the value is set through a writer — confirm.
#
# @return [Object, nil] current value of @pagenumber
def pagenumber
  @pagenumber
end

#perpage ⇒ Object

Returns the value of attribute perpage.



6
7
8
# File 'lib/baidu.rb', line 6

# Reader for the number of results requested per page.
#
# #initialize sets it to 100; #query sends it as the rn= parameter and divides
# the result count by it to compute @maxpage.
#
# @return [Integer] current value of @perpage
def perpage
  @perpage
end

#wd ⇒ Object (readonly)

Returns the value of attribute wd.



7
8
9
# File 'lib/baidu.rb', line 7

# Reader for the keyword of the most recent search.
#
# #query stores its argument in @wd before fetching the results.
#
# @return [String, nil] the last value passed to #query, nil before any query
def wd
  @wd
end

Instance Method Details

#how_many ⇒ Object

Raises:

  • (StandardError)


125
126
127
128
129
130
131
132
133
# File 'lib/baidu.rb', line 125

# Reads the result count shown on the current results page.
#
# An existing @data['how_many'] entry takes precedence over the page content.
#
# @return [Integer, false] every digit found in the first span with class
#   "nums", joined and converted to an Integer; false when there is no such span
# @raise [StandardError] when @page is not a Mechanize::Page
def how_many
  return @data['how_many'] if @data.key?('how_many')
  unless @page.instance_of?(Mechanize::Page)
    raise StandardError, 'wrong with @page'
  end

  count_node = @page.search("//span[@class='nums']").first
  count_node.nil? ? false : count_node.content.gsub(/\D/, '').to_i
end

#how_many_links(uri) ⇒ Object

domain:xxx.yyy.com/path/file.html



65
66
67
68
69
# File 'lib/baidu.rb', line 65

# Counts the results Baidu reports for a domain: search on +uri+.
#
# Always runs a fresh query. The former early return on a cached 'how_many'
# entry ignored +uri+, so it could only hand back a count that belonged to
# some other search.
#
# @param uri [String] e.g. "xxx.yyy.com/path/file.html"; sent as domain:"<uri>"
# @return [Integer, false] whatever #how_many reports for the fetched page
def how_many_links(uri)
  query("domain:\"#{uri}\"")
  how_many
end

#how_many_pages(host) ⇒ Object

site:xxx.yyy.com



58
59
60
61
62
# File 'lib/baidu.rb', line 58

# Counts the results Baidu reports for a site: search on +host+.
#
# Always runs a fresh query. The former early return on a cached 'how_many'
# entry ignored +host+, so it could only hand back a count that belonged to
# some other search.
#
# @param host [String] e.g. "xxx.yyy.com"; sent as site:<host>
# @return [Integer, false] whatever #how_many reports for the fetched page
def how_many_pages(host)
  query("site:#{host}")
  how_many
end

#how_many_pages_with(host, string) ⇒ Object

site:xxx.yyy.com inurl:zzz



72
73
74
75
76
# File 'lib/baidu.rb', line 72

# Counts the results Baidu reports for a site: search on +host+ restricted
# with inurl:+string+.
#
# Always runs a fresh query. The former early return on a cached 'how_many'
# entry ignored both arguments, so it could only hand back a count that
# belonged to some other search.
#
# @param host [String] e.g. "xxx.yyy.com"
# @param string [String] text sent as the inurl: term
# @return [Integer, false] whatever #how_many reports for the fetched page
def how_many_pages_with(host, string)
  query("site:#{host} inurl:#{string}")
  how_many
end

#next ⇒ Object



135
136
137
138
139
140
141
# File 'lib/baidu.rb', line 135

# Moves to the following results page by clicking the "next page" link.
#
# @return [Boolean] true after the next page has been loaded; false when the
#   current page has no such link or @currpage has already reached @maxpage
def next
  next_link = @page.link_with(:text => /下一页/)
  return false if next_link.nil? || @currpage >= @maxpage

  @page = @a.click(next_link)
  # Count the page turn. Without this @currpage stays at the 0 set by #query,
  # so the @maxpage limit above would only ever trigger when @maxpage is 0.
  @currpage += 1
  # Called without a receiver, the same way #query calls it.
  clean
  true
end

#popular?(wd) ⇒ Boolean

Returns:

  • (Boolean)


27
28
29
# File 'lib/baidu.rb', line 27

# Checks the Baidu Index page for +wd+.
#
# The keyword is converted to GBK and percent-escaped as a form value, so
# spaces and characters such as "&" or "=" cannot alter the query string.
# (URI.encode, used previously, no longer exists on Ruby 3.0+.)
#
# @param wd [String] keyword to look up
# @return [Boolean] true when the response body contains "boxFlash"
#   (presumably the chart container — confirm against the live page)
def popular?(wd)
  escaped_wd = URI.encode_www_form_component(wd.encode('GBK'))
  @a.get("http://index.baidu.com/main/word.php?word=#{escaped_wd}").body.include?('boxFlash')
end

#query(wd) ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/baidu.rb', line 31

# Runs a Baidu search for +wd+ and makes its first results page current.
#
# Empties @data, remembers the keyword in @wd, fetches BaseUri with the
# keyword and rn=@perpage, cleans the page and resets the paging state
# (@number, @maxpage, @currpage).
#
# @param wd [String] search expression, e.g. "site:xxx.yyy.com"
# @return [Integer] 0, the value @currpage is reset to
def query(wd)
  @data.clear
  @wd = wd
  # Escape the keyword on its own so characters such as "&", "#" or "=" inside
  # it cannot alter the query string; the escaped bytes are its GBK encoding.
  escaped_wd = URI.encode_www_form_component(wd.encode('GBK'))
  @page = @a.get("#{BaseUri}wd=#{escaped_wd}&rn=#{@perpage}")
  clean
  @number = how_many
  # how_many returns false when the page shows no result count; treat that as
  # 0 results rather than failing on false / Float.
  @maxpage = ((@number || 0) / @perpage.to_f).round
  @currpage = 0
end

#rank(host) ⇒ Object

Returns the 1-based rank of the first result URI whose host is `host`, based on the list from #ranks.



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/baidu.rb', line 79

# Finds the position of the first result whose URI host equals +host+.
#
# Works on the list returned by #ranks. Each answer is remembered in
# @data[:rank], keyed by host, and served from there on later calls.
#
# @param host [String] host name to look for, compared with URI#host
# @return [Integer, false] 1-based position of the first matching result, or
#   false when no result matches
def rank(host)
  return @data[:rank][host] if @data.key?(:rank) && @data[:rank].key?(host)

  ranks.each_with_index do |uri, index|
    begin
      # Escape first so hrefs containing spaces or non-ASCII characters parse.
      next unless URI.parse(URI::DEFAULT_PARSER.escape(uri)).host == host
    rescue URI::InvalidURIError
      next # an href that cannot be parsed cannot match any host
    end
    (@data[:rank] ||= {})[host] = index + 1
    return index + 1
  end
  false
end

#ranks ⇒ Object

(keyword=false)

Raises:

  • (StandardError)


100
101
102
103
104
105
106
107
108
109
110
# File 'lib/baidu.rb', line 100

# Lists the link target of every result on the current page, in page order.
#
# For each table with class "result" the page is searched again by that
# table's id and the href of the first anchor found is taken. The list is
# kept in @data[:ranks] and returned from there on later calls.
#
# @return [Array<String>] hrefs of the results
# @raise [StandardError] when @page is not a Mechanize::Page
def ranks
  return @data[:ranks] if @data.key?(:ranks)
  unless @page.instance_of?(Mechanize::Page)
    raise StandardError, 'wrong with @page'
  end

  hrefs = @page.search("//table[@class=\"result\"]").map do |table|
    @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
  end
  @data[:ranks] = hrefs
  hrefs
end

#related_keywords ⇒ Object

Raises:

  • (StandardError)


112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/baidu.rb', line 112

# Lists the related-search keywords shown on the current results page.
#
# Takes the text of every anchor inside a table row of the element with id
# "rs". The list is kept in @data[:related_keywords] and served from there on
# later calls, the same way #ranks keeps its result. (The former cache check
# used a misspelled key that nothing ever wrote, so it never hit.)
#
# @return [Array<String>] keyword texts; empty when the page has none
# @raise [StandardError] when @page is not a Mechanize::Page
def related_keywords
  return @data[:related_keywords] if @data.key?(:related_keywords)
  unless @page.instance_of?(Mechanize::Page)
    raise StandardError, 'wrong with @page'
  end

  @data[:related_keywords] = @page.search("//div[@id=\"rs\"]//tr//a").map(&:text)
end

#suggestions(wd) ⇒ Object



21
22
23
24
25
# File 'lib/baidu.rb', line 21

# Fetches Baidu's search suggestions for +wd+.
#
# The response body is treated as GBK, converted to UTF-8, and the first
# bracketed list found in it is parsed as JSON.
#
# @param wd [String] keyword typed so far; percent-escaped as a form value so
#   spaces, "&" or "=" cannot alter the query string
# @return [Array] parsed suggestion list; empty when the response contains no
#   bracketed list
def suggestions(wd)
  url = "http://suggestion.baidu.com/su?wd=#{URI.encode_www_form_component(wd)}&cb=callback"
  json = @a.get(url).body.force_encoding('GBK').encode('UTF-8')
  list = json[/\[[^\]]*\]/]
  list ? JSON.parse(list) : []
end