Class: SitemapParser

Inherits:
Object
  • Object
show all
Defined in:
lib/sitemap-parser.rb

Instance Method Summary collapse

Constructor Details

#initialize(url, opts = {}) ⇒ SitemapParser

Returns a new instance of SitemapParser.



9
10
11
12
# File 'lib/sitemap-parser.rb', line 9

# Builds a parser for the sitemap found at +url+.
#
# @param url [String] stored as-is in @url; the other methods decide how to
#   read it
# @param opts [Hash] overrides merged on top of the defaults
#   (followlocation: true, recurse: false, url_regex: nil)
def initialize(url, opts = {})
  defaults = { followlocation: true, recurse: false, url_regex: nil }

  @url = url
  @options = defaults.merge(opts)
end

Instance Method Details

#raw_sitemap ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/sitemap-parser.rb', line 14

# Returns the raw sitemap source, loading it on first use and memoizing it in
# @raw_sitemap.
#
# * When @url starts with "http" (case-insensitive) the document is fetched
#   with Typhoeus and passed through inflate_body_if_needed.
# * When @url is an existing file whose path ends in "/sitemap.xml" or
#   "\sitemap.xml" (case-insensitive) the file contents are read.
# * Otherwise the result is nil.
#
# @return [Object, nil] whatever inflate_body_if_needed returns for an HTTP
#   source, the file contents for a local source, or nil
# @raise [RuntimeError] when the HTTP response is not successful
def raw_sitemap
  @raw_sitemap ||=
    if /\Ahttp/i.match?(@url)
      # :recurse and :url_regex configure this parser; they are stripped before
      # the remaining options are handed to Typhoeus. reject leaves @options
      # itself untouched.
      request_options = @options.reject { |key, _| %i[recurse url_regex].include?(key) }
      request = Typhoeus::Request.new(@url, request_options)

      body = nil
      request.on_complete do |response|
        raise "HTTP request to #{@url} failed" unless response.success?

        # Assign rather than `return`: a `return` inside this block would exit
        # raw_sitemap itself, skipping the `||=` assignment above, so the
        # sitemap would be downloaded again on every call.
        body = inflate_body_if_needed(response)
      end
      request.run
      body
    elsif File.exist?(@url) && @url =~ %r{[\\/]sitemap\.xml\Z}i
      File.read(@url)
    end
end

#sitemap ⇒ Object



31
32
33
# File 'lib/sitemap-parser.rb', line 31

# Returns the document produced by Nokogiri::XML from raw_sitemap, parsing it
# on first use and reusing the stored @sitemap afterwards.
def sitemap
  return @sitemap if @sitemap

  @sitemap = Nokogiri::XML(raw_sitemap)
end

#to_a ⇒ Object



53
54
55
56
57
# File 'lib/sitemap-parser.rb', line 53

# Collects the content of the 'loc' child of every entry returned by urls.
#
# Any NoMethodError raised while doing so (for example an entry with no 'loc'
# child, where at('loc') is nil) is reported as a malformed-sitemap error.
# The rescue covers the whole method body, so a NoMethodError raised inside
# urls is reported the same way.
#
# @return [Array] one content value per entry
# @raise [RuntimeError] 'Malformed sitemap, url without loc'
def to_a
  urls.map do |entry|
    location = entry.at('loc')
    location.content
  end
rescue NoMethodError
  raise 'Malformed sitemap, url without loc'
end

#urls ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/sitemap-parser.rb', line 35

# Returns the entries of the parsed sitemap.
#
# * A document with a 'urlset' element yields its 'url' nodes, passed through
#   filter_sitemap_urls (defined outside this listing).
# * A document with a 'sitemapindex' element yields an empty array unless the
#   :recurse option is set. With :recurse, each 'sitemap' entry that survives
#   filter_sitemap_urls is loaded through a new parser of the same class
#   (created with recurse: false only) and all results are flattened into one
#   array.
#
# @raise [RuntimeError] 'Malformed sitemap, no urlset' when neither element
#   is present
def urls
  url_set = sitemap.at('urlset')
  return filter_sitemap_urls(url_set.search('url')) if url_set

  index = sitemap.at('sitemapindex')
  raise 'Malformed sitemap, no urlset' unless index
  return [] unless @options[:recurse]

  index_entries = filter_sitemap_urls(index.search('sitemap'))
  per_child = index_entries.map do |entry|
    self.class.new(entry.at('loc').content, recurse: false).urls
  end
  per_child.flatten
end