Class: Rake::BlacklightSitemapTask

Inherits:
Object
  • Object
show all
Defined in:
lib/blacklight-sitemap.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize {|_self| ... } ⇒ BlacklightSitemapTask

Returns a new instance of BlacklightSitemapTask.

Yields:

  • (_self)

Yield Parameters:



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/blacklight-sitemap.rb', line 47

# Sets every option to its default, yields the new instance to an optional
# block so the caller can adjust it, and finally registers the rake tasks
# by calling #define.
#
# @yield [_self] the instance being configured, before tasks are defined
def initialize
  # URLs written into the generated XML.
  @public_url   = 'http://localhost:3000'
  @resource_url = 'http://localhost:3000/catalog'

  # Output file options.
  @base_filename = 'blacklight'
  @gzip          = false
  @max           = 50_000 # default cap on the number of locs per sitemap file

  # Optional per-url elements and the Solr fields that feed them.
  @changefreq     = nil
  @lastmod_field  = 'timestamp'
  @priority_field = nil

  # Solr request options.
  @qt   = 'standard'
  @sort = '_docid_ asc' # http://osdir.com/ml/solr-user.lucene.apache.org/2010-03/msg01371.html

  yield(self) if block_given?
  define
end

Instance Attribute Details

#base_filename ⇒ Object

Base filename to use for the sitemap files. Set this when the files will be moved to a location that hosts other sitemaps, so that these sitemaps do not overwrite the others.



22
23
24
# File 'lib/blacklight-sitemap.rb', line 22

# Base filename prefix for the generated sitemap files
# (initialized to 'blacklight' by the constructor).
#
# @return [Object] the current value of @base_filename
def base_filename
  @base_filename
end

#changefreq ⇒ Object

value for changefreq for each page listed



28
29
30
# File 'lib/blacklight-sitemap.rb', line 28

# Value written as the changefreq element of every listed page
# (initialized to nil by the constructor, in which case none is written).
#
# @return [Object] the current value of @changefreq
def changefreq
  @changefreq
end

#gzip ⇒ Object

should the files be gzipped? requires the commandline tool gzip



25
26
27
# File 'lib/blacklight-sitemap.rb', line 25

# Whether each sitemap file is compressed with the command-line gzip tool
# (initialized to false by the constructor).
#
# @return [Object] the current value of @gzip
def gzip
  @gzip
end

#lastmod_field ⇒ Object

Solr field that contains a date used to create the lastmod date for the page. Currently the value must be a string, either in W3C Datetime format or as YYYY-MM-DD.



36
37
38
# File 'lib/blacklight-sitemap.rb', line 36

# Name of the Solr field whose value is written as each page's lastmod
# (initialized to 'timestamp' by the constructor).
#
# @return [Object] the current value of @lastmod_field
def lastmod_field
  @lastmod_field
end

#max ⇒ Object

The maximum number of resources to list within a single sitemap. Defaults to 50,000.



32
33
34
# File 'lib/blacklight-sitemap.rb', line 32

# Maximum number of resources listed in a single sitemap file
# (initialized to 50000 by the constructor).
#
# @return [Object] the current value of @max
def max
  @max
end

#priority_field ⇒ Object

Solr field to use to provide a priority for this resource



39
40
41
# File 'lib/blacklight-sitemap.rb', line 39

# Name of the Solr field used to provide a priority for each resource
# (initialized to nil by the constructor, in which case none is written).
#
# @return [Object] the current value of @priority_field
def priority_field
  @priority_field
end

#public_url ⇒ Object

base url used for public directory where sitemaps will be placed



18
19
20
# File 'lib/blacklight-sitemap.rb', line 18

# Base URL of the public directory where the sitemap files are placed
# (initialized to 'http://localhost:3000' by the constructor).
#
# @return [Object] the current value of @public_url
def public_url
  @public_url
end

#qt ⇒ Object

pick a request handler.



45
46
47
# File 'lib/blacklight-sitemap.rb', line 45

# Solr request handler sent as the qt parameter
# (initialized to 'standard' by the constructor).
#
# @return [Object] the current value of @qt
def qt
  @qt
end

#resource_url ⇒ Object

base url used for locations of resources



15
16
17
# File 'lib/blacklight-sitemap.rb', line 15

# Base URL joined with each document id to form a resource location
# (initialized to 'http://localhost:3000/catalog' by the constructor).
#
# @return [Object] the current value of @resource_url
def resource_url
  @resource_url
end

#sort ⇒ Object

Solr sort option



42
43
44
# File 'lib/blacklight-sitemap.rb', line 42

# Solr sort option added to the batch queries when set
# (initialized to '_docid_ asc' by the constructor).
#
# @return [Object] the current value of @sort
def sort
  @sort
end

Instance Method Details

#define ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/blacklight-sitemap.rb', line 63

# Registers the sitemap rake tasks:
#
# * blacklight:sitemap         - runs sitemap:clobber, then sitemap:create
# * blacklight:sitemap:create  - queries Solr in batches of @max documents,
#   writes one "<base_filename>-sitemap<N>.xml" file per batch into
#   Rails.root/public (gzipped when @gzip is set), then writes the index
#   file "<base_filename>-sitemap.xml" that lists every batch file
# * blacklight:sitemap:clobber - deletes every file in Rails.root/public
#   matching "<base_filename>-sitemap*"
#
# Problems that should not abort the run (oversized files, gzip failures)
# are collected and printed at the end instead of being raised.
def define
  namespace :blacklight do
    desc 'clobber then create sitemap files for blacklight'
    task :sitemap => ['sitemap:clobber', 'sitemap:create']

    namespace :sitemap do

      desc 'create a sitemap for blacklight'
      task :create => :environment do
        start_time = Time.now

        # size above which a warning is recorded for a written file (10MB)
        max_bytes = 10485760

        # collect warnings here rather than raise an error
        warnings = []

        blacklight_config = CatalogController.blacklight_config

        puts 'Creating a sitemap...'
        fl = ['id', @lastmod_field, @priority_field].compact.join(',')
        base_solr_parameters = {:qt => @qt, :fq => 'id:[* TO *]', :fl => fl}

        # A one-row query is enough to learn how many documents match.
        response = Blacklight.solr.get(blacklight_config.solr_path, :params => base_solr_parameters.merge(:rows => 1))
        number_of_resources = response['response']['numFound']
        puts "Number of resources: #{number_of_resources}"
        batches = (number_of_resources / @max.to_f).ceil
        puts "Total sitemap to create: #{batches}"
        base_solr_parameters.merge!(:sort => @sort) if @sort

        # create a hash of batches with lastmod dates so that the most recent
        # lastmod date shows up associated with that batch. This will feed
        # into the lastmod for each sitemap in the index sitemap.
        batch_lastmods = {}

        batches.times do |batch_number|
          current_page = batch_number + 1
          start = batch_number * @max
          puts "Processing batch # #{current_page}"
          response = Blacklight.solr.get(blacklight_config.solr_path, :params => base_solr_parameters.merge(:rows => @max, :start => start))['response']
          sitemap_builder = Nokogiri::XML::Builder.new do |xml|
            xml.urlset "xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9" do
              response['docs'].each do |doc|
                xml.url do
                  # FIXME through config
                  # to_s: File.join raises TypeError when the id is not a String
                  xml.loc File.join(@resource_url.to_s, doc['id'].to_s)
                  if @lastmod_field && doc[@lastmod_field]
                    xml.lastmod doc[@lastmod_field].to_s
                    if batch_lastmods[batch_number].blank? || batch_lastmods[batch_number] < doc[@lastmod_field]
                      batch_lastmods[batch_number] = doc[@lastmod_field]
                    end
                  end
                  # changefreq is written before priority: the sitemap 0.9
                  # schema lists the children of <url> in that order.
                  xml.changefreq @changefreq if @changefreq
                  xml.priority doc[@priority_field] if @priority_field && doc[@priority_field]
                end
              end
            end
          end
          sitemap_filename = File.join(Rails.root, 'public', "#{@base_filename}-sitemap#{batch_number}.xml")
          File.open(sitemap_filename, 'w') do |fh|
            fh.puts sitemap_builder.to_xml
          end
          if File.size(sitemap_filename) > max_bytes
            warnings << 'WARNING Sitemap is over 10MB limit: ' + sitemap_filename
          end
          if @gzip
            # Argument-list form bypasses the shell, so a path containing
            # spaces or shell metacharacters is passed to gzip unchanged.
            # system returns false/nil when gzip fails or is not installed.
            unless system('gzip', sitemap_filename)
              warnings << 'WARNING gzip failed for: ' + sitemap_filename
            end
          end
        end

        puts 'Creating sitemap index...'
        # Fallback lastmod for batches in which no document had one.
        rake_run_lastmod = Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S+00:00")
        sitemap_index_builder = Nokogiri::XML::Builder.new do |xml|
          xml.sitemapindex 'xmlns' => 'http://www.sitemaps.org/schemas/sitemap/0.9' do
            batches.times do |batch|
              sitemap_url = File.join(@public_url.to_s, "#{@base_filename}-sitemap#{batch}.xml")
              sitemap_url << '.gz' if @gzip
              xml.sitemap do
                xml.loc sitemap_url
                xml.lastmod(batch_lastmods[batch] || rake_run_lastmod)
              end
            end
          end
        end # sitemap_index_builder
        index_sitemap_filename = File.join(Rails.root, 'public', "#{@base_filename}-sitemap.xml")
        File.open(index_sitemap_filename, 'w') do |fh|
          fh.puts sitemap_index_builder.to_xml
        end
        if File.size(index_sitemap_filename) > max_bytes
          warnings << 'WARNING Index sitemap is over 10MB limit: ' + index_sitemap_filename
        end
        puts 'Done.'
        end_time = Time.now
        puts "Create start time: #{start_time}"
        puts "Create end time:   #{end_time}"
        puts "Execution time in seconds: #{end_time - start_time}"
        puts warnings.join("\n") unless warnings.empty?
      end # task :create

      desc 'clobber sitemap files'
      task :clobber do
        puts "Deleting all sitemap files..."
        Dir.glob(File.join(Rails.root, 'public', "#{@base_filename}-sitemap*")).each do |sitemap|
          FileUtils.rm(sitemap)
        end
      end

    end # namespace :sitemap
  end # namespace :blacklight
end