Class: Vore::Crawler
Overview
This is the class that starts and controls the crawling process.
Constant Summary
- PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
- FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
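For reference, PLATFORM is derived from RubyGems' description of the local machine. A minimal illustration (values are machine-dependent; these are what an Apple Silicon Mac would report):

  require "rubygems"

  Gem::Platform.local.cpu # => "arm64"
  Gem::Platform.local.os  # => "darwin"
  [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-") # => "arm64-darwin"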
Instance Attribute Summary
- #handlers ⇒ Object (readonly)
  Returns the value of attribute handlers.
- #output_dir ⇒ Object (readonly)
  Returns the value of attribute output_dir.
Instance Method Summary
- #initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {}) ⇒ Crawler (constructor)
  Creates a crawler. denylist: sets a denylist filter; accepts a Regexp, a String, or an Array of either to match.
- #process_file(path) {|page| ... } ⇒ Object
- #rewrite(html_file) ⇒ Object
- #run_command(website, delay: 0) ⇒ Object
- #scrape_each_page(website, &block) ⇒ Object
- #user_agent ⇒ Object
Methods included from TestHelper
#content, #generate_path, #generate_sentence, #generate_word, #loop_times, #loop_times=, #meta_tag_count, #meta_tag_count=
Constructor Details
#initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {}) ⇒ Crawler
Creates a crawler. denylist: sets a denylist filter; accepts a Regexp, a String, or an Array of either to match.
# File 'lib/vore/crawler.rb', line 18

def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {})
  @meta_extractor = Vore::Handlers::MetaExtractor.new
  @handlers = if handlers.nil?
    [@meta_extractor, Vore::Handlers::TagRemover.new]
  else
    handlers.unshift(@meta_extractor)
  end

  @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: @handlers)

  ext = PLATFORM.include?("windows") ? ".exe" : ""
  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))

  @options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
  @parent_output_dir = @options[:output_dir]
  @parent_output_dir_len = @parent_output_dir.to_s.split(FILE_SEPERATOR).size

  Vore.logger.level = @options[:log_level]
  Listen.logger = Vore.logger

  @results = {
    pages_visited: 0,
    unprocessed_pages: [],
  }

  return if File.exist?(@executable)

  warn("ERROR: Unsupported platform: `#{PLATFORM}`")
  exit(1)
end
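As a construction sketch, assuming Selma's handler interface (a selector plus handle_element); the AsideRemover class and the option values below are hypothetical, not part of Vore:

  require "vore"
  require "selma"

  # Hypothetical handler that strips <aside> elements; any object
  # implementing Selma's handler interface can be passed in.
  class AsideRemover
    SELECTOR = Selma::Selector.new(match_element: "aside")

    def selector
      SELECTOR
    end

    def handle_element(element)
      element.remove
    end
  end

  # MetaExtractor is always prepended to whatever handlers you pass in.
  crawler = Vore::Crawler.new(handlers: [AsideRemover.new], options: { delay: 100 })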
Instance Attribute Details
#handlers ⇒ Object (readonly)
Returns the value of attribute handlers.
# File 'lib/vore/crawler.rb', line 14

def handlers
  @handlers
end
#output_dir ⇒ Object (readonly)
Returns the value of attribute output_dir.
# File 'lib/vore/crawler.rb', line 14

def output_dir
  @output_dir
end
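Note that @output_dir is only assigned inside #scrape_each_page, so this reader returns nil until a crawl has started. A small sketch (the resulting directory name assumes the default parent output dir of tmp/vore; the URL is illustrative):

  crawler = Vore::Crawler.new
  crawler.output_dir # => nil (not yet assigned)
  crawler.scrape_each_page("https://example.com") { |page| p page.title }
  crawler.output_dir # => "tmp/vore/https_example_com"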
Instance Method Details
#process_file(path) {|page| ... } ⇒ Object
# File 'lib/vore/crawler.rb', line 77

def process_file(path, &block)
  @results[:pages_visited] += 1
  html_file = File.read(path).force_encoding("UTF-8")

  if html_file.empty?
    @results[:unprocessed_pages] << path
    return
  end

  rewritten_html_file = @selma.rewrite(html_file)
  return if rewritten_html_file.empty?

  # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
  url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")

  page = Vore::PageData.new(
    content: rewritten_html_file,
    title: @meta_extractor.title,
    meta: @meta_extractor.meta,
    path: url_path,
  )

  yield page
end
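To illustrate the path-to-URL mapping, a sketch assuming the default parent output dir of tmp/vore (so @parent_output_dir_len is 2, and the slice starts at index 3):

  path = "tmp/vore/https_example_com/about/team.html"
  # [3..] drops "tmp", "vore", and the site directory:
  path.split("/")[3..].join("/") # => "about/team.html"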
#rewrite(html_file) ⇒ Object
# File 'lib/vore/crawler.rb', line 103

def rewrite(html_file)
  @selma.rewrite(html_file)
rescue StandardError => e
  Vore.logger.warn("Error rewriting HTML: #{e}")
  @results[:unprocessed_pages] << html_file
  ""
end
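A rough usage sketch; the exact output depends on the sanitization config and the handlers in play, and the HTML string here is illustrative:

  crawler = Vore::Crawler.new
  clean = crawler.rewrite("<main><h1>Hello</h1><script>alert(1)</script></main>")
  # With the default sanitization config, disallowed elements such as
  # <script> are typically stripped; on error this method returns "".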
#run_command(website, delay: 0) ⇒ Object
# File 'lib/vore/crawler.rb', line 111

def run_command(website, delay: 0)
  pid = Process.spawn(
    @executable,
    "--user-agent",
    user_agent,
    "--delay",
    delay.to_s,
    "--url",
    website,
    "download",
    "-t",
    @output_dir,
  )

  _, _status = Process.waitpid2(pid)
rescue StandardError => e
  Vore.logger.error(e)
end
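For reference, the spawned invocation has roughly this shape (the user agent and output directory vary per run; the URL is illustrative):

  vore-spider --user-agent "<user_agent>" --delay 0 --url https://example.com download -t tmp/vore/https_example_com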
#scrape_each_page(website, &block) ⇒ Object
# File 'lib/vore/crawler.rb', line 48

def scrape_each_page(website, &block)
  @output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
  FileUtils.rm_rf(@output_dir)
  FileUtils.mkdir_p(@output_dir)

  listener = Listen.to(@output_dir) do |_modified, added, _removed|
    if added.any?
      added.each do |path|
        process_file(path, &block)
        File.delete(path) if @options[:delete_after_yield]
      end
    end
  end
  listener.start

  Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")

  begin
    run_command(website, delay: @options[:delay])
  ensure
    sleep(0.5) # give listener time to clean up
    listener.stop
  end

  Vore.logger.info("Vore finished crawling #{website}")

  @results
end
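Putting it together, a minimal end-to-end sketch; the option keys shown are ones this class reads, the URL is illustrative, and the block receives the Vore::PageData instances yielded by #process_file:

  require "vore"

  crawler = Vore::Crawler.new(options: { delay: 250, delete_after_yield: true })

  results = crawler.scrape_each_page("https://example.com") do |page|
    # page carries content, title, meta, and path
    puts "#{page.path}: #{page.title}"
  end

  puts "Visited #{results[:pages_visited]} pages"
  puts "Failed to process: #{results[:unprocessed_pages].inspect}"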