20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
# File 'lib/baiduserp/client.rb', line 20
# Fetches +url+ and keeps trying until a complete HTTP 200 response arrives.
#
# The method is deliberately persistent: it sleeps and re-fetches instead of
# raising. Pacing of the individual failure modes:
#
# * the request raises a StandardError: print class and message, sleep 10s and
#   request again (unbounded on purpose);
# * non-200 status: sleep 1200-1259s and spend one retry; once the retries are
#   used up, sleep 3600s and start over with a fresh budget of 3;
# * no Content-Length header: sleep 10s and request again;
# * Content-Length differs from the body's byte size: dump the body to a file
#   under /tmp, sleep 10s and start over with a fresh budget of 3.
#
# @param url [String] address handed to the class-level +get+
# @param retries [Integer] number of non-200 responses tolerated before the
#   one-hour pause
# @return [Object, nil] the response returned by +get+, or nil when +retries+
#   is not positive
def get_serp(url, retries = 3)
  return nil unless retries > 0

  default_retries = 3
  remaining = retries

  # Iterate instead of recursing so that a long run of failures cannot grow
  # the call stack.
  loop do
    begin
      response = self.class.get(url)
    rescue StandardError => e
      puts e.class
      puts e.message
      sleep(10)
      retry
    end

    if response.code != 200
      puts response
      puts "Retry on URL: #{url}"
      sleep(rand(60) + 1200)
      remaining -= 1
      next if remaining > 0

      puts "Still error after 3 tries, sleep 3600s now."
      sleep(3600)
      remaining = default_retries
      next
    end

    # NOTE(review): assumes the response exposes its headers through #headers
    # (as HTTParty responses do) -- confirm against the HTTP client in use.
    content_length = response.headers['Content-Length']
    if content_length.nil?
      # Without the header the body cannot be verified; pause briefly so the
      # re-fetch does not hammer the server in a tight loop.
      sleep(10)
      next
    end

    return response if content_length.to_i == response.body.bytesize

    # Truncated or otherwise inconsistent body: keep a copy for inspection.
    issue_file = "/tmp/baiduserp_crawler_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
    File.open(issue_file, 'w') { |file| file.puts(response.body) }
    puts "Notice:"
    puts "Baiduserp get an error when crawl SERP: response size (#{content_length}) not match body size."
    puts "Please see file #{issue_file} for body content."
    puts "Sleep 10s and retry"
    sleep(10)
    remaining = default_retries
  end
end
|