Class: Meiriyigua::BaiduCrawl
- Inherits: Object
  - Object
  - Meiriyigua::BaiduCrawl
- Includes:
- Models
- Defined in:
- lib/meiriyigua/baidu_crawl.rb
Instance Method Summary
- #get_intro(title) ⇒ Object
- #initialize ⇒ BaiduCrawl (constructor): A new instance of BaiduCrawl.
- #run ⇒ Object
Constructor Details
#initialize ⇒ BaiduCrawl
Returns a new instance of BaiduCrawl.
9 10 11 |
# File 'lib/meiriyigua/baidu_crawl.rb', line 9
#
# Builds a new BaiduCrawl and stores the agent returned by
# CrawlClient.create_agent in @agent, which #get_intro uses for its
# HTTP requests.
def initialize
  @agent = CrawlClient.create_agent
end
Instance Method Details
#get_intro(title) ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/meiriyigua/baidu_crawl.rb', line 32
#
# Searches Baidu for +title+ and returns the result abstracts as one string.
#
# Loads the Baidu home page through @agent, fills the "wd" field of the form
# named "f1" with +title+, submits it, and collects the text of every node
# matching 'div.c-container div.c-abstract'. Each abstract is emitted as a
# numbered entry ("提示1", "提示2", ...) followed by a blank line.
#
# NOTE(review): @agent looks like a Mechanize agent (get / submit /
# form_with / field_with) -- confirm against CrawlClient.create_agent.
#
# @param title [String] the search query typed into Baidu's search box
# @return [String] the concatenated numbered abstracts; "" when there are no
#   matches or when any StandardError is raised while crawling (the error is
#   printed to stdout and swallowed so the caller can treat it as a failure)
def get_intro(title)
  page = @agent.get('http://www.baidu.com/')
  sleep 1 # brief pause between loading the home page and submitting the search

  search_form = page.form_with(:name => "f1")
  search_form.field_with(:name => "wd").value = title
  search_results = @agent.submit(search_form)

  abstracts = search_results.search('div.c-container div.c-abstract')
  # Build the result without mutating a string literal, so this keeps working
  # if the file is ever switched to frozen string literals.
  abstracts.each_with_index.map { |a, i| "提示#{i+1}\r\n #{a.text}\r\n\r\n" }.join
rescue => e
  # Best-effort crawl: report the failure and return an empty intro.
  puts "抓取百度简介出错了 #{e.class} #{e.message}\n#{Array(e.backtrace).join("\n")}"
  ""
end
#run ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/meiriyigua/baidu_crawl.rb', line 13
#
# Fetches a Baidu intro for every UrlRecord whose baidu_at is nil.
#
# For each such record, the title of its page_record is passed to #get_intro,
# followed by CrawlClient.random_sleep. An empty intro is reported as a
# failure ("失败") and the record is left untouched. Otherwise the intro is
# stored on the page record, baidu_at is stamped with the current time, both
# records are saved, and success ("成功") is printed.
#
# @return [Object] whatever `each` returns for the UrlRecord.all collection
def run
  pending = UrlRecord.all(:baidu_at => nil)

  pending.each do |record|
    page = record.page_record
    intro = get_intro(page.title)
    CrawlClient.random_sleep

    # Nothing fetched: report and move on, leaving baidu_at nil.
    if intro.empty?
      print "抓取百度简介 #{record.url} "
      puts "失败"
      next
    end

    page.baidu_intro = intro
    page.save
    record.baidu_at = Time.now
    record.save
    print "抓取百度简介 #{record.url} "
    puts "成功"
  end
end