Class: Maltese::Sitemap

Inherits:
Object
  • Object
show all
Defined in:
lib/maltese/sitemap.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(attributes = {}) ⇒ Sitemap

Returns a new instance of Sitemap.



20
21
22
# File 'lib/maltese/sitemap.rb', line 20

# Creates the sitemap generator and records the bucket it targets.
#
# @param attributes [Hash] options; only +:sitemap_bucket+ is read here
# @return [Sitemap]
def initialize(attributes={})
  # `presence` turns a blank value into nil so the default below applies.
  bucket = attributes[:sitemap_bucket].presence
  @sitemap_bucket = bucket || "search.test.datacite.org"
end

Instance Attribute Details

#sitemap_bucket ⇒ Object (readonly)

Returns the value of attribute sitemap_bucket.



3
4
5
# File 'lib/maltese/sitemap.rb', line 3

# Reader for the bucket name stored in +@sitemap_bucket+ by the constructor.
# Within this file it is passed as the first argument to the adapter built
# in #s3_adapter.
#
# @return [Object] the value of +@sitemap_bucket+
def sitemap_bucket
  @sitemap_bucket
end

Instance Method Details

#get_data(url) ⇒ Object



108
109
110
# File 'lib/maltese/sitemap.rb', line 108

# Fetches +url+ via Maremma.get with a fixed +timeout: 300+ option.
# NOTE(review): the timeout unit is presumably seconds — confirm against Maremma.
#
# @param url [String] the URL to request
# @return [Object] whatever Maremma.get returns; callers in this file read
#   +body+ from it
def get_data(url)
  Maremma.get(url, timeout: 300)
end

#get_query_url(options = {}) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
# File 'lib/maltese/sitemap.rb', line 80

# Builds the DOI search URL for one page of results.
#
# The defaults are resolved into locals rather than written back into
# +options+, so the caller's hash is never modified (and may be frozen).
#
# @param options [Hash] paging options
# @option options [Object] :cursor value for +page[cursor]+ (defaults to 1)
# @option options [Integer] :size value for +page[size]+ (defaults to
#   #job_batch_size; an explicit 0 is kept as 0)
# @return [String] #search_path followed by the form-encoded query string
def get_query_url(options={})
  cursor = options[:cursor] || 1
  size = options[:size] || job_batch_size

  params = {
    "fields[dois]" => "doi,updated",
    "page[cursor]" => cursor,
    "page[size]" => size
  }
  search_path + URI.encode_www_form(params)
end

#get_total(options = {}) ⇒ Object



73
74
75
76
77
78
# File 'lib/maltese/sitemap.rb', line 73

# Asks the API how many records match, without fetching any of them.
#
# The query URL is built from a copy of +options+ with +size: 0+; the
# original +options+ hash is passed unchanged as the second argument to
# Maremma.get.
#
# @param options [Hash] forwarded to #get_query_url (with size 0) and to Maremma.get
# @return [Object, nil] the value at +meta.total+ in the response body, or
#   nil when that path is absent
def get_total(options={})
  count_only_options = options.merge(size: 0)
  response = Maremma.get(get_query_url(count_only_options), options)
  response.body.dig("meta", "total")
end

#job_batch_size ⇒ Object



40
41
42
# File 'lib/maltese/sitemap.rb', line 40

# Default page size: #get_query_url uses this value for +page[size]+ when
# no +:size+ option is supplied.
#
# @return [Integer] 1000
def job_batch_size
  1000
end

#parse_data(result) ⇒ Object



112
113
114
115
116
117
118
119
120
# File 'lib/maltese/sitemap.rb', line 112

# Adds one sitemap entry per record in an API response.
#
# Each item in the response's +data+ array becomes a "/works/<doi>" link
# with a monthly change frequency and the record's +updated+ attribute as
# +lastmod+.
#
# @param result [#body] response whose +body+ is a Hash
# @return [Object] the response's +errors+ value when it is present;
#   otherwise the link count of the current sitemap after the additions
def parse_data(result)
  errors = result.body.fetch("errors", nil)
  return errors if errors.present?

  result.body.fetch("data", []).each do |item|
    loc = "/works/" + item.dig("attributes", "doi")
    # Read "updated" from the "attributes" key; a misspelled key here would
    # make lastmod nil for every entry.
    sitemap.add loc, changefreq: "monthly", lastmod: item.dig("attributes", "updated")
  end
  sitemap.sitemap.link_count
end

#process_data(options = {}) ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/maltese/sitemap.rb', line 92

# Walks the paginated API results starting at +options[:url]+, feeding each
# page to #parse_data, then finalizes via #push_data.
#
# +options+ is updated in place: +:start_time+ is set to the current time
# and +:url+ is replaced with each response's +links.next+ value until that
# value is nil.
#
# @param options [Hash] must carry +:url+ for any page to be fetched
# @return [Object] the return value of #push_data
def process_data(options = {})
  options[:start_time] = Time.now

  # walk through paginated results
  while options[:url]
    response = get_data(options[:url])
    parse_data(response)
    options[:url] = response.body.dig("links", "next")

    # Don't loop when testing. The rest of this file selects its environment
    # through RACK_ENV, so honour that here; the previously checked RACK
    # variable is still accepted for backward compatibility.
    break if [ENV['RACK_ENV'], ENV['RACK']].include?("test")
  end

  push_data(options)
end

#push_data(options = {}) ⇒ Object



122
123
124
125
126
127
# File 'lib/maltese/sitemap.rb', line 122

# Finalizes the sitemap and prints a stats summary including elapsed time.
#
# @param options [Hash] +:start_time+ is used as the start of the timing
#   window; when missing it is set (in place) to the current time
# @return [Object] the link count of the current sitemap
def push_data(options={})
  link_set = sitemap
  link_set.finalize!

  started_at = (options[:start_time] ||= Time.now)
  elapsed = Time.now - started_at
  link_set.sitemap_index.stats_summary(time_taken: elapsed)

  link_set.sitemap.link_count
end

#queue_jobs(options = {}) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/maltese/sitemap.rb', line 60

# Entry point: looks up how many works exist and, if any, builds the sitemap.
#
# Prints the result of #process_data (or "No works found.") to stdout.
#
# @param options [Hash] forwarded to #get_total and, merged with +:total+
#   and the first-page +:url+, to #process_data
# @return [Integer] number of works reported by the API (0 when the
#   response carried no total)
def queue_jobs(options={})
  # get_total returns nil when the response has no meta.total (for example
  # an error payload); coerce so the comparison below cannot raise.
  total = get_total(options).to_i

  if total > 0
    puts process_data(options.merge(total: total, url: get_query_url))
  else
    puts "No works found."
  end

  # return number of works queued
  total
end

#s3_adapter ⇒ Object



53
54
55
56
57
58
# File 'lib/maltese/sitemap.rb', line 53

# Builds the SitemapGenerator AWS SDK adapter for #sitemap_bucket.
#
# Credentials and region are read from the AWS_ACCESS_KEY_ID,
# AWS_SECRET_ACCESS_KEY and AWS_REGION environment variables on every call.
#
# @return [SitemapGenerator::AwsSdkAdapter] a new adapter instance
def s3_adapter
  aws_settings = {
    aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
    aws_region: ENV['AWS_REGION']
  }
  SitemapGenerator::AwsSdkAdapter.new(sitemap_bucket, **aws_settings)
end

#search_path ⇒ Object



32
33
34
# File 'lib/maltese/sitemap.rb', line 32

# Base URL (ending in "?") of the DOI search endpoint.
#
# @return [String] the production API URL when RACK_ENV is "production",
#   the test API URL for any other value (including unset)
def search_path
  if ENV['RACK_ENV'] == "production"
    "https://api.datacite.org/dois?"
  else
    "https://api.test.datacite.org/dois?"
  end
end

#sitemap ⇒ Object



44
45
46
47
48
49
50
51
# File 'lib/maltese/sitemap.rb', line 44

# Lazily builds and caches the SitemapGenerator link set in +@sitemap+.
#
# The link set uses #sitemap_url as both default host and sitemaps host,
# writes under #sitemaps_path through #s3_adapter, and is created with
# +finalize: false+ (#push_data calls +finalize!+ explicitly).
#
# @return [SitemapGenerator::LinkSet] the same instance on every call
def sitemap
  return @sitemap if @sitemap

  link_set_options = {
    default_host: sitemap_url,
    sitemaps_host: sitemap_url,
    sitemaps_path: sitemaps_path,
    adapter: s3_adapter,
    finalize: false
  }
  @sitemap = SitemapGenerator::LinkSet.new(**link_set_options)
end

#sitemap_url ⇒ Object



24
25
26
# File 'lib/maltese/sitemap.rb', line 24

# Public host under which the sitemap links live.
#
# @return [String] the production search URL when RACK_ENV is "production",
#   the test search URL for any other value (including unset)
def sitemap_url
  if ENV['RACK_ENV'] == "production"
    "https://search.datacite.org/"
  else
    "https://search.test.datacite.org/"
  end
end

#sitemaps_path ⇒ Object



28
29
30
# File 'lib/maltese/sitemap.rb', line 28

# Path segment handed to SitemapGenerator::LinkSet as +sitemaps_path+ in #sitemap.
#
# @return [String] "sitemaps/"
def sitemaps_path
  "sitemaps/"
end

#timeout ⇒ Object



36
37
38
# File 'lib/maltese/sitemap.rb', line 36

# Fixed timeout value.
# NOTE(review): not referenced by the other methods shown for this class
# (#get_data hard-codes its own timeout); the unit is presumably seconds —
# confirm against callers.
#
# @return [Integer] 60
def timeout
  60
end