64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
# File 'lib/crawl.rb', line 64
# Retrieves the content for @options[:url] and updates crawl state.
#
# The fetch is skipped (returning false) when the crawl has been cancelled,
# when a duplicate job for the same URL is already running, when the URL has
# already been crawled, or when the crawl limits have been reached. On a
# successful fetch the URL is marked as currently running, counters and
# statistics are updated, and the resolved base URL is recorded the first
# time the original base URL is fetched.
#
# @return [Boolean] true when content was retrieved and its content type is
#   permitted; false in every other case (cancelled, duplicate job, already
#   crawled, outwith crawl limits, or non-permitted content type).
def retrieve
  # Guard: the whole crawl was cancelled — do nothing, leave counters alone.
  if cancelled?
    puts "======================================="
    puts "CRAWL CANCELLED"
    puts "======================================="
    return false
  end

  # Guard: another worker is already processing this exact URL.
  if already_running? @options[:url]
    debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
    debug_ap @redis.smembers("currently_running")
    decrement_queue_counter
    return false
  end

  # Guard: this URL was fetched earlier in the crawl.
  if already_crawled? @options[:url]
    puts "======================================="
    puts "ALREADY CRAWLED"
    puts "======================================="
    decrement_queue_counter
    return false
  end

  # Queue bookkeeping happens before the limit check, matching the original
  # ordering — within_crawl_limits? depends on up-to-date queue state.
  update_queues

  # Guard: crawl page/queue limits reached.
  unless within_crawl_limits?
    puts "======================================="
    puts "OUTWITH CRAWL LIMITS"
    puts "======================================="
    decrement_queue_counter
    return false
  end

  @redis.sadd("currently_running", @options[:url])
  @stats.update_status("Retrieving #{@options[:url]}...")
  @content = Cobweb.new(@options).get(@options[:url], @options)
  update_counters

  # Record the resolved base URL when fetching the original base URL, so
  # later requests can map redirects back to the crawled base.
  if @options[:url] == @redis.get("original_base_url")
    @redis.set("crawled_base_url", @content[:base_url])
  end

  # Only permitted content types contribute to crawl statistics and count as
  # a successful retrieval.
  if content.permitted_type?
    @stats.update_statistics(@content)
    return true
  end

  false
end
|