Class: Baiduserp::Client
- Inherits:
-
Object
- Object
- Baiduserp::Client
- Includes:
- HTTParty
- Defined in:
- lib/baiduserp/client.rb
Constant Summary collapse
- AllUserAgents =
YAML.load(open(File.expand_path('../user_agents.yml',__FILE__)))
Class Method Summary collapse
Instance Method Summary collapse
Class Method Details
.get_rank_url(url) ⇒ Object
20 21 22 |
# File 'lib/baiduserp/client.rb', line 20
# Class-level shortcut: builds a fresh client and forwards +url+ to the
# instance method of the same name.
#
# @param url [String] URL handed to the instance-level get_rank_url
# @return [Object] whatever the instance-level get_rank_url returns
def self.get_rank_url(url)
  client = new
  client.get_rank_url(url)
end
.get_serp(url, retries = 3) ⇒ Object
16 17 18 |
# File 'lib/baiduserp/client.rb', line 16
# Class-level shortcut: builds a fresh client and forwards both arguments to
# the instance method of the same name.
#
# @param url [String] URL handed to the instance-level get_serp
# @param retries [Integer] retry budget passed through unchanged (default 3)
# @return [Object, nil] whatever the instance-level get_serp returns
def self.get_serp(url, retries = 3)
  client = new
  client.get_serp(url, retries)
end
.rand_ua ⇒ Object
7 8 9 |
# File 'lib/baiduserp/client.rb', line 7
# Picks one entry of AllUserAgents at a randomly chosen index.
#
# @return [Object] the element found at the random index
#   (presumably a user-agent string; depends on what user_agents.yml holds)
def self.rand_ua
  index = rand(AllUserAgents.size)
  AllUserAgents[index]
end
Instance Method Details
#get_rank_url(url) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/baiduserp/client.rb', line 24
# Requests +url+ through the class-level +get+ and keeps trying until the
# request stops raising.
#
# Every StandardError raised by the request is reported on stdout (exception
# class, then message); the call then pauses 10 seconds and repeats the
# request. There is deliberately no cap on the number of attempts, so this
# method blocks for as long as the request keeps failing.
#
# @param url [String] URL to request
# @return [Object] the value returned by +self.class.get+
def get_rank_url(url)
  begin
    response = self.class.get(url)
  rescue StandardError => e
    puts e.class
    # Log the message explicitly; a bare `e.` would swallow the next line as
    # a method call on the exception and raise from inside the handler.
    puts e.message
    sleep(10)
    retry
  end
  response
end
#get_serp(url, retries = 3) ⇒ Object
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/baiduserp/client.rb', line 36
# Fetches +url+ through the class-level +get+, retrying on request errors and
# on responses whose code is not 200.
#
# Flow:
# 1. With no retries left (+retries+ not greater than 0) nothing is requested
#    and nil is returned.
# 2. A StandardError raised by the request is reported on stdout, followed by
#    a 10 second pause and another request; this inner loop has no cap.
# 3. A response whose code is not 200 is printed, then after a pause of
#    1200-1259 seconds the fetch is handed to the class-level get_serp with
#    one retry fewer.
# 4. If that hand-off yields nil (its retries ran out), the call sleeps 3600
#    seconds and starts over through the class-level get_serp with the
#    default retry budget.
#
# @param url [String] URL to request
# @param retries [Integer] remaining retry budget (default 3)
# @return [Object, nil] the final response, or nil when +retries+ is not positive
def get_serp(url, retries = 3)
  return nil unless retries > 0

  begin
    response = self.class.get(url)
  rescue StandardError => e
    puts e.class
    # Log the message explicitly; a bare `e.` would swallow the next line as
    # a method call on the exception and raise from inside the handler.
    puts e.message
    sleep(10)
    retry
  end

  if response.code != 200
    puts response
    puts "Retry on URL: #{url}"
    sleep(rand(60)+1200)
    response = self.class.get_serp(url, retries - 1)
  end

  # nil here means the delegated call above exhausted its retries.
  if response.nil?
    puts "Still error after 3 tries, sleep 3600s now."
    sleep(3600)
    response = self.class.get_serp(url)
  end

  ## Baidu stopped sending Content-Length in the response headers, so the
  ## size check below is disabled.
  #if response.headers['Content-Length'].nil?
  #  puts "Can't read Content-Length from response, retry."
  #  response = self.class.get_serp(url,retries-1)
  #end
  #
  #if response.headers['Content-Length'].to_i != response.body.bytesize
  #  issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
  #  open(issue_file,'w').puts(response.body)
  #  puts "Notice:"
  #  puts "Baiduserp get an error when crawl SERP: response size (#{response.headers['Content-Length']}) not match body size."
  #  puts "Please see file #{issue_file} for body content."
  #  puts "Sleep 10s and retry"
  #  sleep(10)
  #  response = self.class.get_serp(url)
  #end

  response
end