Class: SitemapParser

Inherits:
Object
  • Object
show all
Defined in:
lib/sitemap-parser.rb

Instance Method Summary collapse

Constructor Details

#initialize(url, opts = {}) ⇒ SitemapParser


6
7
8
9
# File 'lib/sitemap-parser.rb', line 6

# Sets up a parser for the sitemap located at +url+.
#
# @param url [String] sitemap location, stored unchanged in @url
# @param opts [Hash] caller overrides, merged on top of the defaults
#   (:followlocation => true, :recurse => false); the hash passed in is
#   not modified
def initialize(url, opts = {})
  defaults = { :followlocation => true, :recurse => false }
  @options = defaults.merge(opts)
  @url = url
end

Instance Method Details

#raw_sitemap ⇒ Object


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/sitemap-parser.rb', line 11

# Returns the raw sitemap text, loading it at most once per instance.
#
# * When @url starts with "http" (case-insensitive) the document is fetched
#   with Typhoeus, passing every entry of @options except :recurse.
# * Otherwise, when @url names an existing file whose path ends in
#   "/sitemap.xml" or "\sitemap.xml" (case-insensitive), the file is read.
# * In any other case the result is nil (and nothing is memoized).
#
# The response is taken from the return value of Request#run rather than
# from a `return` inside an on_complete callback: a `return` in the callback
# exits this method before the `||=` assignment runs, which silently disabled
# the memoization and re-downloaded the sitemap on every call.
#
# @return [String, nil] sitemap body, or nil when @url matches neither source
# @raise [RuntimeError] when the HTTP response is not successful
def raw_sitemap
  @raw_sitemap ||=
    if @url =~ /\Ahttp/i
      # :recurse is this parser's own switch, so it is dropped from the copy
      # handed to Typhoeus; @options itself is left untouched.
      request_options = @options.reject { |key, _value| key == :recurse }
      response = Typhoeus::Request.new(@url, request_options).run
      raise "HTTP request to #{@url} failed" unless response.success?

      response.body
    elsif File.exist?(@url) && @url =~ /[\\\/]sitemap\.xml\Z/i
      # File.read never treats the path as a command, unlike Kernel#open.
      File.read(@url)
    end
end

#sitemap ⇒ Object


30
31
32
# File 'lib/sitemap-parser.rb', line 30

# Parsed form of the sitemap: the result of handing raw_sitemap to
# Nokogiri::XML, computed on first use and cached in @sitemap afterwards.
#
# @return [Object] the cached value returned by Nokogiri::XML
def sitemap
  return @sitemap if @sitemap

  @sitemap = Nokogiri::XML(raw_sitemap)
end

#to_a ⇒ Object


51
52
53
54
55
# File 'lib/sitemap-parser.rb', line 51

# Lists the text content of the <loc> child of every entry returned by #urls.
#
# A missing <loc> is detected with an explicit nil check instead of a
# method-wide `rescue NoMethodError`, so an unrelated NoMethodError raised
# while fetching or parsing is no longer reported as a malformed sitemap.
#
# @return [Array<String>] one location per entry, in the order #urls yields them
# @raise [RuntimeError] "Malformed sitemap, url without loc" when an entry
#   has no loc element
def to_a
  urls.map do |url|
    loc = url.at('loc')
    raise 'Malformed sitemap, url without loc' if loc.nil?

    loc.content
  end
end

#urls ⇒ Object


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/sitemap-parser.rb', line 34

# Collects the <url> entries described by this sitemap.
#
# * For a <urlset> document: the result of searching it for "url" elements.
# * For a <sitemapindex> document: an empty array unless @options[:recurse]
#   is truthy; otherwise each <sitemap> child's <loc> is loaded with a new
#   parser of the same class and the children's entries are concatenated.
#
# Child parsers receive this parser's options (so request settings supplied
# by the caller also apply to child sitemaps) with :recurse forced to false,
# which means an index nested inside an index is not followed.
#
# @return [Object] the "url" search result for a urlset, or an Array of
#   entries gathered from child sitemaps for an index
# @raise [RuntimeError] "Malformed sitemap, no urlset" when the document has
#   neither a urlset nor a sitemapindex element
def urls
  if (urlset = sitemap.at('urlset'))
    urlset.search('url')
  elsif (index = sitemap.at('sitemapindex'))
    return [] unless @options[:recurse]

    child_options = @options.merge(:recurse => false)
    index.search('sitemap').flat_map do |entry|
      child_location = entry.at('loc').content
      # to_a so node sets and plain arrays are concatenated the same way
      self.class.new(child_location, child_options).urls.to_a
    end
  else
    raise 'Malformed sitemap, no urlset'
  end
end