Class: Maltese::Sitemap

Inherits:
Object
  • Object
show all
Defined in:
lib/maltese/sitemap.rb

Constant Summary collapse

SLACK_ICON_URL =

icon for Slack messages

"https://raw.githubusercontent.com/datacite/homepage/master/source/images/fabrica.png"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(attributes = {}) ⇒ Sitemap

Returns a new instance of Sitemap.



30
31
32
33
34
35
36
37
38
39
# File 'lib/maltese/sitemap.rb', line 30

# Builds a Sitemap, taking each setting from +attributes+ and falling back to
# the environment (or a built-in default) when the given value is blank.
#
# @param attributes [Hash] optional overrides keyed by :sitemap_bucket,
#   :rack_env, :access_key, :secret_key, :region and :slack_webhook_url
def initialize(attributes={})
  # an explicit, non-blank attribute always wins over its fallback
  given = ->(key) { attributes[key].presence }

  @sitemap_bucket    = given.call(:sitemap_bucket) || "search.test.datacite.org"
  @rack_env          = given.call(:rack_env) || ENV['RACK_ENV'] || "stage"
  @access_key        = given.call(:access_key) || ENV['AWS_ACCESS_KEY_ID']
  @secret_key        = given.call(:secret_key) || ENV['AWS_SECRET_ACCESS_KEY']
  @region            = given.call(:region) || ENV['AWS_REGION']
  @slack_webhook_url = given.call(:slack_webhook_url) || ENV['SLACK_WEBHOOK_URL']

  # log to stdout
  @logger = LogStashLogger.new(type: :stdout)
end

Instance Attribute Details

#access_key ⇒ Object (readonly)

Returns the value of attribute access_key.



10
11
12
# File 'lib/maltese/sitemap.rb', line 10

# AWS access key ID stored by the constructor
# (attributes[:access_key], else ENV['AWS_ACCESS_KEY_ID']).
def access_key
  @access_key
end

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



10
11
12
# File 'lib/maltese/sitemap.rb', line 10

# Logger created by the constructor (a LogStashLogger of type :stdout).
def logger
  @logger
end

#rack_env ⇒ Object (readonly)

Returns the value of attribute rack_env.



10
11
12
# File 'lib/maltese/sitemap.rb', line 10

# Environment name stored by the constructor
# (attributes[:rack_env], else ENV['RACK_ENV'], else "stage").
def rack_env
  @rack_env
end

#region ⇒ Object (readonly)

Returns the value of attribute region.



10
11
12
# File 'lib/maltese/sitemap.rb', line 10

# AWS region stored by the constructor
# (attributes[:region], else ENV['AWS_REGION']).
def region
  @region
end

#secret_key ⇒ Object (readonly)

Returns the value of attribute secret_key.



10
11
12
# File 'lib/maltese/sitemap.rb', line 10

# AWS secret access key stored by the constructor
# (attributes[:secret_key], else ENV['AWS_SECRET_ACCESS_KEY']).
def secret_key
  @secret_key
end

#sitemap_bucket ⇒ Object (readonly)

Returns the value of attribute sitemap_bucket.



10
11
12
# File 'lib/maltese/sitemap.rb', line 10

# Bucket name stored by the constructor
# (attributes[:sitemap_bucket], else "search.test.datacite.org").
def sitemap_bucket
  @sitemap_bucket
end

#slack_webhook_url ⇒ Object (readonly)

Returns the value of attribute slack_webhook_url.



10
11
12
# File 'lib/maltese/sitemap.rb', line 10

# Slack webhook URL stored by the constructor
# (attributes[:slack_webhook_url], else ENV['SLACK_WEBHOOK_URL']).
def slack_webhook_url
  @slack_webhook_url
end

Instance Method Details

#get_data(url) ⇒ Object



164
165
166
# File 'lib/maltese/sitemap.rb', line 164

# Fetches +url+ through Maremma with a timeout of 300
# (units are whatever Maremma defines — presumably seconds; TODO confirm).
#
# NOTE(review): the #timeout helper (60) is not used here; confirm that the
# hard-coded 300 is intentional.
#
# @param url [String] URL to request
# @return [Object] whatever Maremma.get returns; #process_data reads
#   +status+ and +body+ from it
def get_data(url)
  Maremma.get(url, timeout: 300)
end

#get_query_url(options = {}) ⇒ Object



103
104
105
106
107
108
109
110
111
112
113
# File 'lib/maltese/sitemap.rb', line 103

# Builds the DOI search URL used to start a scrolling query.
#
# The caller's +options+ hash is only read, never written to, so a shared or
# frozen hash can be passed safely.
#
# @param options [Hash] optional settings
# @option options [Integer] :size page size; defaults to #job_batch_size
# @return [String] #search_path followed by the URL-encoded query parameters
def get_query_url(options={})
  # keep the resolved size local instead of storing it back into +options+
  page_size = options[:size] || job_batch_size

  params = {
    "fields[dois]" => "doi,updated",
    "exclude-registration-agencies" => "true",
    "page[scroll]" => "7m",
    "page[size]" => page_size
  }
  search_path + URI.encode_www_form(params)
end

#get_total(options = {}) ⇒ Object



96
97
98
99
100
101
# File 'lib/maltese/sitemap.rb', line 96

# Asks the API how many DOIs match by requesting a one-record page and reading
# the total from the response metadata.
#
# @param options [Hash] passed to #get_query_url (with :size forced to 1) and,
#   unchanged, to Maremma.get
# @return [Object, nil] the value at "meta" -> "total" in the response body,
#   or nil when it is absent
def get_total(options={})
  count_url = get_query_url(options.merge(size: 1))
  Maremma.get(count_url, options).body.dig("meta", "total")
end

#job_batch_size ⇒ Object



61
62
63
# File 'lib/maltese/sitemap.rb', line 61

# Default page size for DOI queries; #get_query_url uses it when no :size
# option is given.
def job_batch_size
  1000
end

#parse_data(result) ⇒ Object



168
169
170
171
172
173
174
# File 'lib/maltese/sitemap.rb', line 168

# Adds one sitemap entry per record found under "data" in the response body.
#
# Each entry points at "/doi.org/<doi>", is marked as changing weekly and
# carries the record's "updated" attribute as its last-modified value.
#
# @param result [Object] API response whose +body+ is a Hash
# @return [Object] link count of the current sitemap file (sitemap.sitemap.link_count)
def parse_data(result)
  records = Array.wrap(result.body.fetch("data", nil))

  records.each do |record|
    location = "/doi.org/" + record.dig("attributes", "doi")
    modified = record.dig("attributes", "updated")
    sitemap.add location, changefreq: "weekly", lastmod: modified
  end

  sitemap.sitemap.link_count
end

#process_data(options = {}) ⇒ Object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/maltese/sitemap.rb', line 115

# Walks the paginated API results starting at +options[:url]+, adds every page
# to the sitemap via #parse_data, then hands over to #push_data.
#
# A 200 response is parsed and the loop continues with the "next" link from
# the response body. Any other status, or any StandardError, is logged and
# ends the loop by clearing +options[:url]+.
#
# NOTE(review): #push_data is called (and announces "Sitemaps Updated") even
# after a failed page — confirm that publishing a partial sitemap is intended.
#
# @param options [Hash] run options; :url is the first page to fetch and is
#   updated as pages are walked, :start_time is overwritten with the current time
# @return [Object] the return value of #push_data
def process_data(options = {})
  options[:start_time] = Time.now
  link_count = 0

  # walk through paginated results
  while options[:url] do
    begin
      response = nil

      # speed up tests
      base_interval = rack_env == "test" ? 0.1 : 10

      # retry on transient errors (status codes 408, 500 and 502):
      # raising inside the block is what lets Retriable repeat the request
      Retriable.retriable(base_interval: base_interval, multiplier: 2) do
        response = get_data(options[:url])

        raise Timeout::Error, "A timeout error occured for URL #{options[:url]}." if response.status == 408
        raise InternalServerError, "An internal server error occured for URL #{options[:url]}." if response.status == 500
        raise BadGatewayError, "A bad gateway error occured for URL #{options[:url]}." if response.status == 502
      end

      if response.status == 200
        link_count = parse_data(response)
        logger.info "#{(link_count + sitemap.sitemap_index.total_link_count).to_s(:delimited)} DOIs parsed."
        # a nil "next" link ends the loop
        options[:url] = response.body.dig("links", "next")
      else
        # any other status is not retried: log it and stop paging
        logger.error "An error occured for URL #{options[:url]}."
        logger.error "Error: #{response.body.fetch("errors").inspect}" if response.body.fetch("errors", nil).present?
        options[:url] = nil
      end
    rescue => exception
      # reached when the block above raises, including after Retriable gives up
      logger.error "Error: #{exception.message}"
      fields = [
        { title: "Error", value: exception.message },
        { title: "Number of DOIs", value: sitemap.sitemap_index.total_link_count.to_s(:delimited), short: true },
        { title: "Number of Sitemaps", value: sitemap.sitemap_index.link_count.to_s(:delimited), short: true },
        { title: "Time Taken", value: "#{((Time.now - options[:start_time])/ 60.0).ceil} min", short: true }
      ]
      send_notification_to_slack(nil, title: slack_title + ": Sitemaps Not Updated", level: "danger", fields: fields) unless rack_env == "test"
      options[:url] = nil
    ensure
      # don't loop when testing: leave the while loop after the first pass
      break if rack_env == "test"
    end  
  end

  push_data(options)
end

#push_data(options = {}) ⇒ Object



176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/maltese/sitemap.rb', line 176

# Finalizes the sitemap, records the run statistics and announces the result
# on Slack.
#
# Sets +options[:start_time]+ to the current time when the caller did not
# supply one. The Slack notification is skipped when rack_env is "test".
#
# @param options [Hash] run options; :start_time is used to compute the time taken
# @return [Object] link count of the current sitemap file (sitemap.sitemap.link_count)
def push_data(options={})
  sitemap.finalize!
  options[:start_time] ||= Time.now
  sitemap.sitemap_index.stats_summary(time_taken: Time.now - options[:start_time])

  # one Slack attachment field per reported figure
  index_url = { title: "URL", value: sitemap.sitemap_index_url }
  doi_total = { title: "Number of DOIs", value: sitemap.sitemap_index.total_link_count.to_s(:delimited), short: true }
  file_total = { title: "Number of Sitemaps", value: sitemap.sitemap_index.link_count.to_s(:delimited), short: true }
  minutes = ((Time.now - options[:start_time])/ 60.0).ceil
  duration = { title: "Time Taken", value: "#{minutes} min", short: true }
  summary = [index_url, doi_total, file_total, duration]

  if rack_env != "test"
    send_notification_to_slack(nil, title: slack_title + ": Sitemaps Updated", level: "good", fields: summary)
  end

  sitemap.sitemap.link_count
end

#queue_jobs(options = {}) ⇒ Object



81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/maltese/sitemap.rb', line 81

# Looks up the number of matching DOIs and, when there are any, starts
# #process_data on the first page of results.
#
# @param options [Hash] passed to #get_total; merged with :total and :url
#   before being handed to #process_data
# @return [Integer] the total reported by #get_total (0 when it is nil)
def queue_jobs(options={})
  total = get_total(options)

  # no total available: report and return the same 0 that nil.to_i would give
  if total.nil?
    logger.error "An error occured."
    return 0
  end

  if total > 0
    process_data(options.merge(total: total, url: get_query_url))
  else
    logger.info "No works found."
  end

  # return number of works queued
  total.to_i
end

#s3_adapter ⇒ Object



74
75
76
77
78
79
# File 'lib/maltese/sitemap.rb', line 74

# Builds the SitemapGenerator AWS SDK adapter for the configured bucket,
# using the access key, secret key and region held by this instance.
#
# @return [SitemapGenerator::AwsSdkAdapter] a new adapter on every call
def s3_adapter
  credentials = {
    aws_access_key_id: access_key,
    aws_secret_access_key: secret_key,
    aws_region: region
  }
  SitemapGenerator::AwsSdkAdapter.new(sitemap_bucket, **credentials)
end

#search_path ⇒ Object



53
54
55
# File 'lib/maltese/sitemap.rb', line 53

# Base URL (ending in "?") of the DOI search endpoint for the current environment.
#
# @return [String] the production API when rack_env is "production",
#   otherwise the stage API
def search_path
  return "https://api.datacite.org/dois?" if rack_env == "production"

  "https://api.stage.datacite.org/dois?"
end

#send_notification_to_slack(text, options = {}) ⇒ Object



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/maltese/sitemap.rb', line 191

# Posts a single-attachment message to the configured Slack webhook.
#
# Does nothing when no webhook URL is configured. Errors raised while
# notifying are logged rather than propagated.
#
# @param text [String, nil] attachment text; left out of the attachment when nil
# @param options [Hash] optional settings
# @option options [String] :title attachment title (default "Fabrica Message")
# @option options [String] :level attachment color (default "good")
# @option options [Array<Hash>] :fields attachment fields
# @return [Object, nil] body of the first response, nil when no webhook is
#   configured, or the result of logger.error when notifying failed
def send_notification_to_slack(text, options={})
  webhook = slack_webhook_url
  return nil unless webhook.present?

  title = options[:title] || "Fabrica Message"
  color = options[:level] || "good"
  # compact drops the keys whose value is nil (e.g. a nil text)
  attachment = { title: title, text: text, color: color, fields: options[:fields] }.compact

  begin
    slack = Slack::Notifier.new(webhook, username: "Fabrica", icon_url: SLACK_ICON_URL)
    slack.ping(attachments: [attachment]).first.body
  rescue => error
    logger.error error.message
  end
end

#sitemap ⇒ Object



65
66
67
68
69
70
71
72
# File 'lib/maltese/sitemap.rb', line 65

# Lazily builds and memoizes the SitemapGenerator::LinkSet used for this run.
#
# The same URL serves as default host and sitemaps host, and the set is
# created with finalize: false (#push_data calls finalize! explicitly).
#
# @return [SitemapGenerator::LinkSet] the shared link set
def sitemap
  return @sitemap if @sitemap

  host = sitemap_url
  @sitemap = SitemapGenerator::LinkSet.new(
    default_host: host,
    sitemaps_host: host,
    sitemaps_path: sitemaps_path,
    adapter: s3_adapter,
    finalize: false
  )
end

#sitemap_url ⇒ Object



41
42
43
# File 'lib/maltese/sitemap.rb', line 41

# Host URL the sitemap is generated for.
#
# @return [String] the production Commons URL when rack_env is "production",
#   otherwise the stage Commons URL
def sitemap_url
  if rack_env == "production"
    "https://commons.datacite.org/"
  else
    "https://commons.stage.datacite.org/"
  end
end

#sitemaps_path ⇒ Object



49
50
51
# File 'lib/maltese/sitemap.rb', line 49

# Path handed to the LinkSet as +sitemaps_path+ when #sitemap builds it.
def sitemaps_path
  "sitemaps/"
end

#slack_title ⇒ Object



45
46
47
# File 'lib/maltese/sitemap.rb', line 45

# Prefix used for the titles of Slack notifications.
#
# @return [String] "DataCite Commons" when rack_env is "production",
#   otherwise "DataCite Commons Stage"
def slack_title
  case rack_env
  when "production" then "DataCite Commons"
  else "DataCite Commons Stage"
  end
end

#timeout ⇒ Object



57
58
59
# File 'lib/maltese/sitemap.rb', line 57

# Returns 60.
#
# NOTE(review): no caller is visible in this part of the file, and #get_data
# passes its own timeout of 300 — confirm whether this helper is still used
# and which unit it is meant to be in.
def timeout
  60
end