85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
# File 'lib/web2text.rb', line 85
# Run a crawl described by +options+, converting every in-focus page to
# plain text and streaming it through the configured formatter.
#
# Keys read from +options+ (as used in this method):
#   :url, :avoid, :focus   -- forwarded to Crawl.new
#   :query                 -- forwarded to Crawler.new
#   :formatter, :out       -- formatter class, instantiated as .new(crawl, out)
#   :ignore_robots_txt     -- negated into Anemone's :obey_robots_txt option
#   :sleep                 -- seconds to pause after each processed page
#
# Pages that are redirects (3xx), out of the crawl's focus, or failed to
# retrieve (nil doc) are skipped; failures are reported on STDERR.
def self.do_crawl(options)
  crawl     = Crawl.new(options[:url], options[:avoid], options[:focus])
  crawler   = Crawler.new(crawl, options[:query])
  formatter = options[:formatter].new(crawl, options[:out])
  Anemone.crawl(crawl.url, obey_robots_txt: !options[:ignore_robots_txt]) do |anemone|
    # Let the crawl decide which outgoing links Anemone may follow.
    anemone.focus_crawl do |page|
      crawl.filter page.links
    end
    anemone.on_every_page do |page|
      STDERR.puts page.url
      # Anemone may report no status code; treat that as a plain 200.
      code = page.code || 200
      # Range check replaces `300 <= code and code < 400` -- `and` is a
      # low-precedence control-flow keyword, not a boolean operator.
      if (300...400).cover?(code)
        next # redirect: the target page will be visited separately
      elsif !crawl.focus? page.url
        next # outside the configured focus
      elsif page.doc.nil?
        STDERR.puts "ERR: Failed to retrieve #{page.url}"
        next
      end
      plain = crawler.doc_as_plaintext(page.doc)
      formatter.append plain, page.url
      # Polite throttling between pages.
      sleep options[:sleep]
    end
    anemone.after_crawl do
      formatter.close
    end
  end
end
|