Module: Sitemaps

Defined in:
lib/sitemaps.rb,
lib/sitemaps/parser.rb,
lib/sitemaps/fetcher.rb,
lib/sitemaps/version.rb

Overview

Discover, fetch and parse XML sitemaps as defined by the `sitemaps.org` spec.

Defined Under Namespace

Modules: Fetcher, Parser
Classes: Entry, Sitemap, Submap

Constant Summary collapse

VERSION =
"0.1.1".freeze

Class Method Summary collapse

Class Method Details

.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block) ⇒ Object



22
23
24
25
26
27
# File 'lib/sitemaps.rb', line 22

# Fetch and parse the sitemap at +url+.
#
# @param url [String, URI] location of the sitemap; normalized via parse_url
# @param fetch [#call, nil] callable invoked with the URL to obtain the raw
#   source; when nil, a lambda delegating to Sitemaps::Fetcher.fetch is used
# @param recurse [Boolean] when truthy the work is handed to fetch_recursive,
#   otherwise to fetch_single
# @param max_entries [Integer, nil] entry limit forwarded to the chosen helper
# @param block optional block, forwarded unchanged to the chosen helper
# @return [Object] whatever the chosen helper returns
def self.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block)
  fetcher = fetch || ->(location) { Sitemaps::Fetcher.fetch(location) }
  target  = parse_url(url)

  if recurse
    fetch_recursive(target, fetcher, max_entries, &block)
  else
    fetch_single(target, fetcher, max_entries, &block)
  end
end

.fetch_recursive(url, fetch, max_entries, &block) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/sitemaps.rb', line 36

# Fetch the sitemap at +url+ plus every sub-sitemap it references, and merge
# everything into one Sitemap.
#
# Pending locations are kept in an array and taken from its end. A location
# that already has a fetched map is skipped, so cyclic references are not
# fetched twice. Any StandardError raised while handling a location is
# reported on $stderr and that location is skipped.
#
# @param url [String, URI] starting location; normalized via parse_url
# @param fetch [#call] callable handed through to fetch_single
# @param max_entries [Integer, nil] total entry budget across all maps; the
#   remaining budget is passed to each fetch_single call, nil means unlimited
# @param block optional block forwarded to fetch_single
# @return [Sitemap] a new Sitemap holding the concatenated sitemaps and
#   entries of every map that was fetched
def self.fetch_recursive(url, fetch, max_entries, &block)
  pending = [parse_url(url)]
  fetched = {}

  # drain the pending list; a nil item (empty list included) ends the walk
  until (current = pending.pop).nil?
    begin
      next unless fetched[current].nil?

      # fetch this location and schedule every sub map it references
      result = fetch_single(current, fetch, max_entries, &block)
      fetched[current] = result
      pending.push(*result.sitemaps.map(&:loc))

      # the limit is a running total, so charge this map's entries against it
      next if max_entries.nil?

      max_entries -= result.entries.length
      break if max_entries <= 0
    rescue => ex
      $stderr.puts "ERROR FETCHING: #{current}, #{ex.message}, ignoring..."
      next
    end
  end

  # fold every fetched map into a single combined one
  combined = Sitemap.new([], [])
  fetched.each_value do |map|
    combined.sitemaps.concat(map.sitemaps)
    combined.entries.concat(map.entries)
  end
  combined
end

.fetch_single(url, fetch, max_entries, &block) ⇒ Object



29
30
31
32
33
34
# File 'lib/sitemaps.rb', line 29

# Fetch one sitemap and parse it, without following any sub-sitemaps.
#
# @param url [String, URI] location to fetch; normalized via parse_url
# @param fetch [#call] callable invoked with the normalized URL; its return
#   value is handed to the parser as the source
# @param max_entries [Integer, nil] passed to the parser as +max_entries+
# @param block optional block, passed to the parser as +filter+
# @return [Object] the result of Sitemaps::Parser.parse
def self.fetch_single(url, fetch, max_entries, &block)
  body = fetch.call(parse_url(url))
  Sitemaps::Parser.parse(body, filter: block, max_entries: max_entries)
end

.parse(source) ⇒ Object



18
19
20
# File 'lib/sitemaps.rb', line 18

# Parse already-fetched sitemap +source+.
#
# Accepts the same limiting options that fetch_single hands to the parser,
# so callers that already hold the source can cap or filter entries too.
#
# @param source [Object] raw sitemap source, passed straight to the parser
# @param max_entries [Integer, nil] optional limit forwarded to the parser
# @param block optional block forwarded to the parser as +filter+
# @return [Object] the result of Sitemaps::Parser.parse
def self.parse(source, max_entries: nil, &block)
  # with no options given, call the parser exactly as before so its own
  # defaults stay in effect
  return Sitemaps::Parser.parse(source) if max_entries.nil? && block.nil?

  Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
end

.parse_url(url) ⇒ Object



70
71
72
73
74
75
# File 'lib/sitemaps.rb', line 70

# Coerce +url+ into a URI.
#
# A value that is already a URI is returned untouched. Otherwise "http://"
# is prepended unless the string already starts with an http or https
# scheme, and the result is handed to URI.parse.
#
# @param url [String, URI] location to normalize
# @return [URI] the given URI, or the parsed result
# @raise [URI::InvalidURIError] when URI.parse rejects the string
def self.parse_url(url)
  return url if url.is_a? URI

  # \A anchors at the very start of the string (^ would also match after a
  # newline), and /i is used because URI schemes are case-insensitive, so
  # "HTTP://host" must not receive a second "http://" prefix.
  url = "http://#{url}" unless url =~ %r{\Ahttps?://}i
  URI.parse(url)
end