Class: SiteDiff::Crawler
Inherits: Object
Defined in: lib/sitediff/crawler.rb
Defined Under Namespace
Classes: Info
Constant Summary
DEFAULT_DEPTH = 3
Instance Method Summary
- #add_uri(rel, depth) ⇒ Object
  Handle a newly found relative URI.
- #fetched_uri(rel, depth, res) ⇒ Object
  Handle the fetch of a URI.
- #filter_links(uris) ⇒ Object
  Filter out links we don’t want.
- #find_links(doc) ⇒ Object
  Return a list of string links found on a page.
- #initialize(hydra, base, interval, whitelist, blacklist, depth = DEFAULT_DEPTH, curl_opts = UriWrapper::DEFAULT_CURL_OPTS, debug = true, &block) ⇒ Crawler (constructor)
  Create a crawler with a base URL.
- #relativize_link(uri) ⇒ Object
  Make a link relative to @base_uri.
- #resolve_link(base, rel) ⇒ Object
  Resolve a potentially-relative link.
Constructor Details
#initialize(hydra, base, interval, whitelist, blacklist, depth = DEFAULT_DEPTH, curl_opts = UriWrapper::DEFAULT_CURL_OPTS, debug = true, &block) ⇒ Crawler
Create a crawler with a base URL.

# File 'lib/sitediff/crawler.rb', line 17

def initialize(hydra, base, interval, whitelist, blacklist,
               depth = DEFAULT_DEPTH,
               curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
               debug = true,
               &block)
  @hydra = hydra
  @base_uri = Addressable::URI.parse(base)
  @base = base
  @interval = interval
  @whitelist = whitelist
  @blacklist = blacklist
  @found = Set.new
  @callback = block
  @curl_opts = curl_opts
  @debug = debug

  add_uri('', depth)
end
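As a usage sketch: the constructor queues the base page on the given Typhoeus::Hydra, and the crawl actually runs when the hydra does. The URL, regexes, and concurrency value below are illustrative assumptions, not values from the source.

require 'set'
require 'typhoeus'
require 'sitediff'
require 'sitediff/crawler'

hydra = Typhoeus::Hydra.new(max_concurrency: 3)

# Hypothetical crawl: no whitelist, blacklist anything under /private,
# follow links two hops deep, no rate-limiting interval. The block
# receives a Crawler::Info struct for each fetched page.
SiteDiff::Crawler.new(hydra, 'https://example.com', 0,
                      nil, %r{/private}, 2) do |info|
  puts "fetched #{info.relative} (#{info.document.css('a').size} links)"
end

hydra.run # blocks until every queued request has completed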
Instance Method Details
#add_uri(rel, depth) ⇒ Object
Handle a newly found relative URI.

# File 'lib/sitediff/crawler.rb', line 40

def add_uri(rel, depth)
  return if @found.include? rel

  @found << rel

  wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
  wrapper.queue(@hydra) do |res|
    fetched_uri(rel, depth, res)
  end
end
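Because @found records each relative URI before its request is queued, every URL is fetched at most once per crawl no matter how many pages link to it; the matching check in #fetched_uri is just a cheap guard before re-queuing.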
#fetched_uri(rel, depth, res) ⇒ Object
Handle the fetch of a URI.

# File 'lib/sitediff/crawler.rb', line 52

def fetched_uri(rel, depth, res)
  if res.error
    SiteDiff.log(res.error, :error)
    return
  elsif !res.content
    SiteDiff.log('Response is missing content. Treating as an error.', :error)
    return
  end

  base = Addressable::URI.parse(@base + rel)
  doc = Nokogiri::HTML(res.content)

  # Call the callback
  info = Info.new(
    relative: rel,
    uri: base,
    read_result: res,
    document: doc
  )

  # Insert delay to limit fetching rate
  if @interval != 0
    SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
    sleep(@interval / 1000.0)
  end
  @callback[info]

  return unless depth >= 1

  # Find links
  links = find_links(doc)
  uris = links.map { |l| resolve_link(base, l) }.compact
  uris = filter_links(uris)

  # Make them relative
  rels = uris.map { |u| relativize_link(u) }

  # Queue them in turn
  rels.each do |r|
    next if @found.include? r

    add_uri(r, depth - 1)
  end
end
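Two details worth noting: @interval is expressed in milliseconds (hence the division by 1000.0 before sleep), and depth counts remaining hops, so a crawl started at DEFAULT_DEPTH visits the base page plus pages up to three links away. A minimal standalone sketch of the same depth-limited, deduplicated traversal, with a hypothetical in-memory link graph standing in for the UriWrapper/Hydra machinery:

require 'set'

# Hypothetical link graph standing in for real HTTP fetches.
PAGES = {
  ''     => ['/a', '/b'],
  '/a'   => ['/a/1'],
  '/b'   => [],
  '/a/1' => ['']
}.freeze

def fetch_links(rel)
  PAGES.fetch(rel, [])
end

def crawl(rel, depth, found = Set.new)
  return if found.include?(rel)

  found << rel
  links = fetch_links(rel)  # the page itself is always fetched
  return unless depth >= 1  # but its links are followed only while depth remains

  links.each { |link| crawl(link, depth - 1, found) }
end

crawl('', 2) # visits '', /a, /b, and /a/1 (one and two hops away)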
#filter_links(uris) ⇒ Object
Filter out links we don’t want. Links passed in are absolute URIs.
# File 'lib/sitediff/crawler.rb', line 115

def filter_links(uris)
  uris.find_all do |u|
    is_sub_uri = (u.host == @base_uri.host) &&
                 u.path.start_with?(@base_uri.path)
    if is_sub_uri
      is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
      is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
      if is_blacklisted && !is_whitelisted
        SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
      end
      is_whitelisted || !is_blacklisted
    end
  end
end
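The precedence is: links outside the base host and path never pass, a whitelist match always keeps a link, and a blacklist match drops it otherwise. A hedged illustration of that logic with assumed regexes and URLs:

require 'addressable/uri'

base      = Addressable::URI.parse('https://example.com/blog')
whitelist = %r{/blog/keep}  # assumed patterns for illustration
blacklist = %r{/blog/}

candidates = %w[
  https://example.com/blog/post-1
  https://example.com/blog/keep/me
  https://example.com/about
  https://other.example/blog/post-2
].map { |u| Addressable::URI.parse(u) }

candidates.find_all do |u|
  next false unless u.host == base.host && u.path.start_with?(base.path)

  whitelist.match(u.path) || !blacklist.match(u.path)
end
# => only https://example.com/blog/keep/me survives: post-1 is
#    blacklisted, /about is outside the base path, and the last
#    candidate is on a different host.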
#find_links(doc) ⇒ Object
Return a list of string links found on a page.
# File 'lib/sitediff/crawler.rb', line 110

def find_links(doc)
  doc.xpath('//a[@href]').map { |e| e['href'] }
end
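The XPath predicate [@href] skips anchors that have no href attribute at all, such as named anchors. For example:

require 'nokogiri'

html = '<a href="/a">A</a><a name="anchor">B</a><a href="/c">C</a>'
doc = Nokogiri::HTML(html)
doc.xpath('//a[@href]').map { |e| e['href'] }
# => ["/a", "/c"]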
#relativize_link(uri) ⇒ Object
Make a link relative to @base_uri.

# File 'lib/sitediff/crawler.rb', line 105

def relativize_link(uri)
  uri.path.slice(@base_uri.path.length, uri.path.length)
end
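This strips the base path prefix by position; the second argument to String#slice is a length, and asking for more characters than remain is harmless. For a hypothetical base path of '/sub':

base_path = '/sub'            # stands in for @base_uri.path
path      = '/sub/page.html'  # stands in for uri.path
path.slice(base_path.length, path.length)
# => "/page.html"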
#resolve_link(base, rel) ⇒ Object
Resolve a potentially-relative link. Return nil on error.
# File 'lib/sitediff/crawler.rb', line 97

def resolve_link(base, rel)
  base + rel
rescue Addressable::URI::InvalidURIError
  SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warn
  nil
end
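Addressable::URI#+ performs RFC 3986 reference resolution: relative hrefs resolve against the current page, absolute paths against the host, and a malformed href raises Addressable::URI::InvalidURIError, which this method rescues into a warning and nil (later dropped by compact in #fetched_uri). A small illustration with an assumed base URL:

require 'addressable/uri'

base = Addressable::URI.parse('https://example.com/blog/post')
(base + 'images/a.png').to_s  # => "https://example.com/blog/images/a.png"
(base + '/about').to_s        # => "https://example.com/about"
(base + '#top').to_s          # => "https://example.com/blog/post#top"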