Class: Retriever::FetchSitemap

Inherits:
Fetch
  • Object
show all
Defined in:
lib/retriever/fetchsitemap.rb

Constant Summary

Constants inherited from Fetch

Retriever::Fetch::HR

Instance Attribute Summary

Attributes inherited from Fetch

#max_pages, #result, #t

Instance Method Summary collapse

Methods inherited from Fetch

#dump, #errlog, #filter_out_querystrings, #good_response?, #lg, #start, #write

Constructor Details

#initialize(url, options) ⇒ FetchSitemap

Receives the target URL and RR options; returns an array of all unique pages found on the site.



6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/retriever/fetchsitemap.rb', line 6

# Builds the page collection for a sitemap run.
#
# Delegates to the parent constructor with the same arguments, calls +start+,
# seeds @result with @t.target followed by the contents of @link_stack, and
# then runs async_crawl_and_collect. Afterwards @result is ordered by string
# length and de-duplicated in place.
#
# @param url the target URL, forwarded unchanged to the parent constructor
# @param options the options, forwarded unchanged to the parent constructor
def initialize(url, options)
  super
  start
  # The target itself goes first, then whatever links are already on the stack.
  @result.push(@t.target).concat(@link_stack)

  async_crawl_and_collect

  # Crawl is over: make sure the progress bar shows completion when enabled.
  if @progress
    @progressbar.finish
  end

  # Shortest entries first; sorting is skipped for zero or one entries.
  @result.sort_by!(&:length) if @result.size > 1
  @result.uniq!
end

Instance Method Details

#gen_xml ⇒ Object

Produces a valid XML sitemap from the collection of fetched pages and writes it to the current directory.



21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/retriever/fetchsitemap.rb', line 21

# Writes an XML sitemap containing one <url><loc>…</loc></url> entry per
# element of @result.
#
# The output file is "sitemap-<name>.xml" in the current working directory,
# where <name> is the second dot-separated label of @t.host. Once the file has
# been written and closed, print_file_info is called with <name>.
#
# @return whatever print_file_info returns
def gen_xml
  filename = @t.host.split('.')[1]
  # Block form guarantees the file handle is closed even when a write (or the
  # iteration over @result) raises part-way through.
  File.open("sitemap-#{filename}.xml", 'w+') do |f|
    f << "<?xml version='1.0' encoding='UTF-8'?>"
    f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
    # NOTE(review): URLs are interpolated verbatim; presumably they are already
    # entity-escaped (e.g. & as &amp;) — confirm against the collector.
    @result.each { |url| f << "<url><loc>#{url}</loc></url>" }
    f << '</urlset>'
  end
  print_file_info(filename)
end