Method: Web2Text.do_crawl

Defined in:
lib/web2text.rb

.do_crawl(options) ⇒ Object



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/web2text.rb', line 85

# Crawls a site with Anemone and hands the plain text of each accepted page
# to a formatter.
#
# A page is accepted when its status code is not a redirect (3xx), the crawl
# reports it as in focus, and it has a parsed document. Every visited URL is
# echoed to STDERR before those checks are applied.
#
# @param options [Hash] crawl settings, read with symbol keys:
#   - +:url+, +:avoid+, +:focus+ are passed to +Crawl.new+
#   - +:query+ is passed to +Crawler.new+
#   - +:formatter+ is a class instantiated with the crawl and +:out+
#   - +:ignore_robots_txt+, when truthy, turns off Anemone's
#     +:obey_robots_txt+ setting
#   - +:sleep+ is the pause handed to +Kernel#sleep+ after each accepted
#     page; no pause is taken when it is nil
# @return [Object] whatever +Anemone.crawl+ returns
def self.do_crawl(options)
  crawl = Crawl.new options[:url], options[:avoid], options[:focus]
  crawler = Crawler.new crawl, options[:query]
  formatter = options[:formatter].new crawl, options[:out]

  begin
    Anemone.crawl(crawl.url, :obey_robots_txt => !options[:ignore_robots_txt]) do |anemone|
      # Only follow the links the crawl configuration lets through.
      anemone.focus_crawl { |page| crawl.filter page.links }

      anemone.on_every_page do |page|
        STDERR.puts page.url

        # ignore redirects; a page without a status code is treated as a 200
        code = page.code || 200
        next if code >= 300 && code < 400
        next unless crawl.focus?(page.url)

        if page.doc.nil?
          STDERR.puts "ERR: Failed to retrieve #{page.url}"
          next
        end

        plain = crawler.doc_as_plaintext page.doc
        formatter.append plain, page.url

        # Skip the pause when no delay is configured instead of handing nil
        # to Kernel#sleep.
        pause = options[:sleep]
        sleep pause unless pause.nil?
      end
    end
  ensure
    # Close in an ensure (rather than an after_crawl hook) so the formatter
    # is released even when the crawl aborts with an exception.
    formatter.close
  end
end