Class: SitemapParser
- Inherits:
- Object
- Inheritance chain: Object → SitemapParser
- Defined in:
- lib/sitemap-parser.rb,
lib/sitemap-parser/version.rb
Constant Summary
- VERSION =
'0.5.6'
Instance Method Summary
-
#initialize(url, opts = {}) ⇒ SitemapParser
constructor
A new instance of SitemapParser.
- #raw_sitemap ⇒ Object
- #sitemap ⇒ Object
- #to_a ⇒ Object
- #urls ⇒ Object
Constructor Details
#initialize(url, opts = {}) ⇒ SitemapParser
Returns a new instance of SitemapParser.
9 10 11 12 |
# File 'lib/sitemap-parser.rb', line 9
#
# Stores the sitemap location and the effective parser options.
#
# Defaults are followlocation: true, recurse: false and url_regex: nil; any
# key given in +opts+ overrides (or adds to) those defaults.
#
# @param url [String] location of the sitemap (stored as-is in @url)
# @param opts [Hash, nil] option overrides; +nil+ is treated as "no overrides"
# @return [SitemapParser] a new instance of SitemapParser
def initialize(url, opts = {})
  @url = url
  defaults = { followlocation: true, recurse: false, url_regex: nil }
  # `opts || {}` keeps an explicit nil from raising TypeError inside Hash#merge.
  @options = defaults.merge(opts || {})
end
Instance Method Details
#raw_sitemap ⇒ Object
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/sitemap-parser.rb', line 14
#
# Returns the raw (unparsed) sitemap document, loading it on first use and
# caching the result in @raw_sitemap.
#
# * When @url starts with "http" (case-insensitive) the document is fetched
#   with Typhoeus and the response is passed through inflate_body_if_needed.
# * Otherwise, when @url is an existing file whose path ends in
#   "/sitemap.xml" or "\sitemap.xml" (case-insensitive), the file is read.
# * In every other case the result is nil.
#
# @return [String, nil] the sitemap body, or nil when @url is neither an HTTP
#   URL nor a readable sitemap.xml path
# @raise [RuntimeError] when the HTTP response is not successful
def raw_sitemap
  @raw_sitemap ||=
    if /\Ahttp/i.match?(@url)
      # :recurse and :url_regex are parser-level settings; every other option
      # is handed to Typhoeus::Request. A filtered copy leaves @options intact.
      request_options = @options.reject { |key, _value| %i[recurse url_regex].include?(key) }
      request = Typhoeus::Request.new(@url, request_options)

      body = nil
      request.on_complete do |response|
        raise "HTTP request to #{@url} failed" unless response.success?

        # Assign instead of `return`: a `return` inside this block would exit
        # raw_sitemap itself and skip the `||=` above, so the result would
        # never be cached and every call would issue a new request.
        body = inflate_body_if_needed(response)
      end
      request.run
      body
    elsif File.exist?(@url) && %r{[\\/]sitemap\.xml\Z}i.match?(@url)
      File.read(@url)
    end
end
#sitemap ⇒ Object
31 32 33 |
# File 'lib/sitemap-parser.rb', line 31
#
# Parsed form of the raw sitemap. The document is built once by passing
# raw_sitemap to Nokogiri::XML and is then cached in @sitemap.
#
# @return [Object] whatever Nokogiri::XML returns for the raw document
def sitemap
  @sitemap ||= begin
    xml_source = raw_sitemap
    Nokogiri::XML(xml_source)
  end
end
#to_a ⇒ Object
53 54 55 56 57 |
# File 'lib/sitemap-parser.rb', line 53
#
# Lists the content of the 'loc' child of every entry returned by urls.
#
# @return [Array] one 'loc' content value per entry
# @raise [RuntimeError] 'Malformed sitemap, url without loc' whenever a
#   NoMethodError is raised while collecting the locations
def to_a
  urls.map do |entry|
    location = entry.at('loc')
    location.content
  end
rescue NoMethodError
  # Presumably triggered when an entry has no 'loc' child (so `at` yields
  # nil); note the rescue spans the whole method, including the urls call.
  raise 'Malformed sitemap, url without loc'
end
#urls ⇒ Object
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/sitemap-parser.rb', line 35
#
# Collects the url entries of the sitemap.
#
# * A document with a 'urlset' yields its 'url' nodes, passed through
#   filter_sitemap_urls.
# * A document with a 'sitemapindex' yields an empty array unless the
#   :recurse option is set; with :recurse, each (filtered) 'sitemap' entry is
#   loaded through a new parser created with recurse: false and the results
#   are flattened into a single array.
#
# @return [Object] the filtered url nodes, or a flattened Array for an index
# @raise [RuntimeError] 'Malformed sitemap, no urlset' when the document has
#   neither a 'urlset' nor a 'sitemapindex'
def urls
  urlset = sitemap.at('urlset')
  return filter_sitemap_urls(urlset.search('url')) if urlset

  index = sitemap.at('sitemapindex')
  raise 'Malformed sitemap, no urlset' unless index

  collected = []
  return collected.flatten unless @options[:recurse]

  index_entries = index.search('sitemap')
  filter_sitemap_urls(index_entries).each do |index_entry|
    child_location = index_entry.at('loc').content
    # Children are parsed with recurse: false, so nested indexes are not followed.
    child_parser = self.class.new(child_location, recurse: false)
    collected << child_parser.urls
  end
  collected.flatten
end