Class: Vore::Crawler

Inherits:
Object
  • Object
show all
Includes:
TestHelper
Defined in:
lib/vore/crawler.rb

Overview

This is the class that starts and controls the crawling

Constant Summary collapse

# "<cpu>-<os>" string for the local RubyGems platform, e.g. "x86_64-linux".
# NOTE(review): RubyGems appears to report Windows builds with os values such
# as "mingw"/"mswin" rather than "windows" -- confirm the checks below can match.
PLATFORM =
  %i[cpu os].map { |part| Gem::Platform.local.public_send(part) }.join("-")

# Separator used when splitting and joining filesystem paths in this class.
FILE_SEPERATOR =
  if PLATFORM.include?("windows")
    File::ALT_SEPARATOR
  else
    File::SEPARATOR
  end

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from TestHelper

#content, #generate_path, #generate_sentence, #generate_word, #loop_times, #loop_times=, #meta_tag_count, #meta_tag_count=

Constructor Details

#initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {}) ⇒ Crawler

Creates a crawler. `denylist`: sets a denylist filter; accepts a regexp, a string, or an array of either to match against. (Note: no `denylist` parameter appears in the constructor signature shown above.)



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/vore/crawler.rb', line 18

# Builds a crawler: sets up the Selma rewriter, locates the bundled
# `vore-spider` executable and prepares the result counters.
#
# Side effects visible here: sets `Vore.logger.level`, points `Listen.logger`
# at `Vore.logger`, and terminates the process with status 1 (after a warning
# on stderr) when the executable is not found on disk.
#
# @param sanitization_config [Object] passed straight to `Selma::Sanitizer.new`
# @param handlers [Array, nil] extra rewriter handlers; the meta extractor is
#   always placed first. When nil, a `Vore::Handlers::TagRemover` is used.
# @param options [Hash] merged over `Vore::Configuration::DEFAULT_OPTIONS`;
#   this method reads `:output_dir` and `:log_level`
def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {})
  @meta_extractor = Vore::Handlers::MetaExtractor.new

  # Build a fresh array instead of `unshift`-ing into the argument: mutating
  # the caller's array would leak an extra extractor into it on every call.
  @handlers = if handlers.nil?
    [@meta_extractor, Vore::Handlers::TagRemover.new]
  else
    [@meta_extractor, *handlers]
  end

  @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: @handlers)
  ext = PLATFORM.include?("windows") ? ".exe" : ""
  # The executable lives in `exe/`, three levels up from this source file.
  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
  @options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
  @parent_output_dir = @options[:output_dir]
  # Number of path components in the parent output dir; process_file uses it
  # to strip the on-disk prefix from downloaded file paths.
  @parent_output_dir_len = @parent_output_dir.to_s.split(FILE_SEPERATOR).size

  Vore.logger.level = @options[:log_level]
  Listen.logger = Vore.logger

  @results = {
    pages_visited: 0,
    unprocessed_pages: [],
  }

  return if File.exist?(@executable)

  warn("ERROR: Unsupported platform: `#{PLATFORM}`")
  exit(1)
end

Instance Attribute Details

#handlers ⇒ Object (readonly)

Returns the value of attribute handlers.



14
15
16
# File 'lib/vore/crawler.rb', line 14

# Returns the handler list stored by #initialize: the meta extractor first,
# followed by either the default TagRemover or the handlers the caller passed.
def handlers
  @handlers
end

#output_dir ⇒ Object (readonly)

Returns the value of attribute output_dir.



14
15
16
# File 'lib/vore/crawler.rb', line 14

# Returns @output_dir, the per-site directory assigned by #scrape_each_page.
# #initialize does not set it, so this is nil until a crawl has been started.
def output_dir
  @output_dir
end

Instance Method Details

#process_file(path) {|page| ... } ⇒ Object

Yields:

  • (page)


77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/vore/crawler.rb', line 77

# Reads one downloaded HTML file, runs it through the Selma rewriter and
# yields the result to the caller as a Vore::PageData.
#
# Every call increments @results[:pages_visited]. A file that is empty, or
# that the rewriter raises on, is appended to @results[:unprocessed_pages]
# and skipped. A file whose rewritten output is empty is skipped silently.
#
# @param path [String] location of the downloaded file on disk
# @yieldparam page [Vore::PageData] rewritten content plus title, meta and URL path
# @return [Object, nil] the block's return value, or nil when the file is skipped
def process_file(path, &block)
  @results[:pages_visited] += 1

  html_file = File.read(path).force_encoding("UTF-8")

  if html_file.empty?
    @results[:unprocessed_pages] << path
    return
  end

  begin
    rewritten_html_file = @selma.rewrite(html_file)
  rescue StandardError => e
    # A single page the rewriter cannot handle must not abort the whole
    # crawl: log it, record it as unprocessed and move on.
    Vore.logger.warn("Error rewriting #{path}: #{e}")
    @results[:unprocessed_pages] << path
    return
  end
  return if rewritten_html_file.empty?

  # Strip the on-disk prefix: the components of the parent output directory
  # plus one more for the per-site directory. What remains is the URL path.
  # NOTE(review): assumes `path` starts with the parent output dir exactly as
  # it was configured (same absolute/relative form) -- confirm against Listen.
  url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")

  page = Vore::PageData.new(
    content: rewritten_html_file,
    title: @meta_extractor.title,
    meta: @meta_extractor.meta,
    path: url_path,
  )

  yield page
end

#rewrite(html_file) ⇒ Object



103
104
105
106
107
108
109
# File 'lib/vore/crawler.rb', line 103

# Runs `html_file` through the Selma rewriter, converting any rewriter error
# into a logged warning and an empty string.
#
# The original rescue branch referenced a `path` that was not in scope, so
# handling an error raised NameError. `path` is now an optional argument.
#
# @param html_file [String] HTML source to rewrite
# @param path [String, nil] file the HTML came from; when given, it is named
#   in the warning and appended to @results[:unprocessed_pages] on failure
# @return [String] the rewritten HTML, or "" when rewriting raised
def rewrite(html_file, path = nil)
  @selma.rewrite(html_file)
rescue StandardError => e
  Vore.logger.warn("Error rewriting #{path || "document"}: #{e}")
  @results[:unprocessed_pages] << path if path
  ""
end

#run_command(website, delay: 0) ⇒ Object



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/vore/crawler.rb', line 111

# Spawns the spider executable for `website` and blocks until it exits.
#
# The arguments are passed as separate argv entries. A non-successful exit
# status is logged as an error instead of being silently discarded. Any
# StandardError raised while spawning or waiting is logged, not propagated.
#
# @param website [String] URL handed to the spider via `--url`
# @param delay [#to_s] value handed to the spider via `--delay`
# @return [Array, Object] `[pid, Process::Status]` when the child was waited
#   on; otherwise whatever `Vore.logger.error` returns
def run_command(website, delay: 0)
  pid = Process.spawn(
    @executable,
    "--user-agent",
    user_agent,
    "--delay",
    delay.to_s,
    "--url",
    website,
    "download",
    "-t",
    @output_dir,
  )

  result = Process.waitpid2(pid)
  status = result.last
  # `success?` is nil when the child was killed by a signal; treat that as failure too.
  Vore.logger.error("vore-spider failed while crawling #{website}: #{status}") unless status.success?
  result
rescue StandardError => e
  Vore.logger.error(e)
end

#scrape_each_page(website, &block) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/vore/crawler.rb', line 48

# Crawls `website` and hands each downloaded page to the given block.
#
# A per-site directory under the parent output dir is wiped and recreated.
# A Listen watcher then passes every newly added file to #process_file,
# deleting it afterwards when `:delete_after_yield` is set. The spider runs
# via #run_command, and the watcher is stopped once it returns.
#
# @param website [String] URL to crawl
# @yield forwarded to #process_file for every added file
# @return [Hash] the @results hash
def scrape_each_page(website, &block)
  site_slug = website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")
  @output_dir = "#{@parent_output_dir}/#{site_slug}"
  FileUtils.rm_rf(@output_dir)
  FileUtils.mkdir_p(@output_dir)

  listener = Listen.to(@output_dir) do |_modified, added, _removed|
    next unless added.any?

    added.each do |new_file|
      process_file(new_file, &block)
      File.delete(new_file) if @options[:delete_after_yield]
    end
  end
  listener.start

  Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")

  begin
    run_command(website, delay: @options[:delay])
  ensure
    # Give the listener time to clean up before shutting it down.
    sleep(0.5)
    listener.stop
  end

  Vore.logger.info("Vore finished crawling #{website}")

  @results
end

#user_agent ⇒ Object



130
131
132
# File 'lib/vore/crawler.rb', line 130

# User-Agent value handed to the spider, identifying Vore and its version.
#
# NOTE(review): the single quotes are part of the returned string. Since
# #run_command passes it as its own argv entry, they are presumably sent
# verbatim -- confirm this is intended.
#
# @return [String]
def user_agent
  product_token = "Vore/#{Vore::VERSION}"
  "'Mozilla/5.0 (compatible; #{product_token}; +https://github.com/gjtorikian/vore)'"
end