Class: Meiriyigua::BaiduCrawl

Inherits:
Object
  • Object
show all
Includes:
Models
Defined in:
lib/meiriyigua/baidu_crawl.rb

Instance Method Summary

Constructor Details

#initialize ⇒ BaiduCrawl

Returns a new instance of BaiduCrawl.



9
10
11
# File 'lib/meiriyigua/baidu_crawl.rb', line 9

# Creates the crawler and stores the agent that get_intro uses for its requests.
# NOTE(review): CrawlClient.create_agent is defined outside this file; it
# presumably returns a Mechanize-style agent, since get_intro calls #get and
# #submit on it — confirm against CrawlClient.
def initialize
  @agent = CrawlClient.create_agent
end

Instance Method Details

#get_intro(title) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/meiriyigua/baidu_crawl.rb', line 32

# Searches Baidu for +title+ and returns the result abstracts as one string.
#
# Loads the Baidu home page, waits one second, fills the "wd" field of the
# form named "f1" with +title+ and submits it. Every node matching
# 'div.c-container div.c-abstract' on the result page contributes one
# numbered entry ("提示N", counted from 1) to the returned text.
#
# @param title [String] the text placed in the search field
# @return [String] the concatenated entries; "" when nothing matched or when
#   any StandardError was raised (the error and its backtrace are printed)
def get_intro(title)
  home_page = @agent.get('http://www.baidu.com/')
  sleep 1
  form = home_page.form_with(:name => "f1")
  form.field_with(:name => "wd").value = title
  results_page = @agent.submit(form)

  abstracts = results_page.search('div.c-container div.c-abstract')
  # Fold from a "" literal so an empty result set still yields a plain "".
  abstracts.each_with_index.inject("") do |intro, (node, idx)|
    intro + "提示#{idx + 1}\r\n #{node.text}\r\n\r\n"
  end
rescue => e
  # Best effort: report the failure and let the caller see an empty intro.
  puts "抓取百度简介出错了 #{e.class} #{e.message}\n#{e.backtrace.join("\n")}"
  ""
end

#run ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/meiriyigua/baidu_crawl.rb', line 13

# Fetches a Baidu intro for every UrlRecord returned by
# UrlRecord.all(:baidu_at => nil) and prints one status line per record.
#
# For each record the title of its page_record is passed to get_intro, then
# CrawlClient.random_sleep is called. A non-empty intro is stored on the page
# record, both records are saved and baidu_at is set to the current time; an
# empty intro leaves both records untouched. Either way the record's URL is
# printed followed by "成功" (success) or "失败" (failure).
#
# NOTE(review): the query presumably selects records not yet crawled, which
# would mean failed records are picked up again on the next run — confirm
# against the UrlRecord model.
def run
  pending_records = UrlRecord.all(:baidu_at => nil)

  pending_records.each do |url_record|
    page_record = url_record.page_record
    intro = get_intro(page_record.title)
    CrawlClient.random_sleep

    fetched = !intro.empty?
    if fetched
      page_record.baidu_intro = intro
      page_record.save
      url_record.baidu_at = Time.now
      url_record.save
    end

    # Report only after any saves, as a single line per record.
    print "抓取百度简介 #{url_record.url} "
    puts(fetched ? "成功" : "失败")
  end
end