Class: Maltese::Sitemap

Inherits:
Object
  • Object
show all
Defined in:
lib/maltese/sitemap.rb

Constant Summary collapse

SLACK_ICON_URL =

icon for Slack messages

"https://github.com/datacite/segugio/blob/master/source/images/fabrica.png"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(attributes = {}) ⇒ Sitemap

Returns a new instance of Sitemap.



29
30
31
32
33
34
35
36
37
38
# File 'lib/maltese/sitemap.rb', line 29

# Creates a sitemap generator configured from +attributes+.
#
# Each setting is taken from +attributes+ when a non-blank value is given;
# otherwise it falls back to the matching environment variable (where one
# exists) and finally to a built-in default.
#
# @param attributes [Hash] optional overrides; the keys read here are
#   :sitemap_bucket, :rack_env, :access_key, :secret_key, :region and
#   :slack_webhook_url
def initialize(attributes={})
  # deployment settings
  @rack_env = attributes[:rack_env].presence || ENV.fetch('RACK_ENV') { "stage" }
  @sitemap_bucket = attributes[:sitemap_bucket].presence || "search.test.datacite.org"

  # AWS credentials and region
  @access_key = attributes[:access_key].presence || ENV.fetch('AWS_ACCESS_KEY_ID', nil)
  @secret_key = attributes[:secret_key].presence || ENV.fetch('AWS_SECRET_ACCESS_KEY', nil)
  @region = attributes[:region].presence || ENV.fetch('AWS_REGION', nil)

  # notifications and logging
  @slack_webhook_url = attributes[:slack_webhook_url].presence || ENV.fetch('SLACK_WEBHOOK_URL', nil)
  @logger = LogStashLogger.new(type: :stdout)
end

Instance Attribute Details

#access_key ⇒ Object (readonly)

Returns the value of attribute access_key.



9
10
11
# File 'lib/maltese/sitemap.rb', line 9

# Returns the access key held in @access_key, which #initialize takes from
# the :access_key attribute or ENV['AWS_ACCESS_KEY_ID'].
def access_key
  @access_key
end

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



9
10
11
# File 'lib/maltese/sitemap.rb', line 9

# Returns the logger held in @logger, which #initialize creates with
# LogStashLogger.new(type: :stdout).
def logger
  @logger
end

#rack_env ⇒ Object (readonly)

Returns the value of attribute rack_env.



9
10
11
# File 'lib/maltese/sitemap.rb', line 9

# Returns the environment name held in @rack_env, which #initialize takes
# from the :rack_env attribute, ENV['RACK_ENV'] or the default "stage".
def rack_env
  @rack_env
end

#region ⇒ Object (readonly)

Returns the value of attribute region.



9
10
11
# File 'lib/maltese/sitemap.rb', line 9

# Returns the region held in @region, which #initialize takes from the
# :region attribute or ENV['AWS_REGION'].
def region
  @region
end

#secret_key ⇒ Object (readonly)

Returns the value of attribute secret_key.



9
10
11
# File 'lib/maltese/sitemap.rb', line 9

# Returns the secret key held in @secret_key, which #initialize takes from
# the :secret_key attribute or ENV['AWS_SECRET_ACCESS_KEY'].
def secret_key
  @secret_key
end

#sitemap_bucket ⇒ Object (readonly)

Returns the value of attribute sitemap_bucket.



9
10
11
# File 'lib/maltese/sitemap.rb', line 9

# Returns the bucket name held in @sitemap_bucket, which #initialize takes
# from the :sitemap_bucket attribute or the default
# "search.test.datacite.org".
def sitemap_bucket
  @sitemap_bucket
end

#slack_webhook_url ⇒ Object (readonly)

Returns the value of attribute slack_webhook_url.



9
10
11
# File 'lib/maltese/sitemap.rb', line 9

# Returns the webhook URL held in @slack_webhook_url, which #initialize
# takes from the :slack_webhook_url attribute or ENV['SLACK_WEBHOOK_URL'].
def slack_webhook_url
  @slack_webhook_url
end

Instance Method Details

#format_number(number) ⇒ Object



209
210
211
# File 'lib/maltese/sitemap.rb', line 209

# Formats a number with comma thousands separators,
# e.g. 1234567 => "1,234,567".
#
# Only the text before the first "." is grouped, so the digits of a decimal
# fraction are left untouched (1234.5678 => "1,234.5678").
#
# @param number [#to_s] the value to format
# @return [String] the formatted value; "" when +number+ converts to ""
def format_number(number)
  whole, point, fraction = number.to_s.partition(".")

  # Reverse so that groups of three are counted from the least significant
  # digit; the lookahead prevents a separator after the leading group.
  grouped = whole.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse

  grouped + point + fraction
end

#get_data(url) ⇒ Object



161
162
163
# File 'lib/maltese/sitemap.rb', line 161

# Fetches +url+ through Maremma.get with a timeout option of 300 and
# returns whatever Maremma.get returns.
# NOTE(review): the timeout is presumably in seconds — confirm against the
# Maremma documentation.
def get_data(url)
  Maremma.get(url, timeout: 300)
end

#get_query_url(options = {}) ⇒ Object



102
103
104
105
106
107
108
109
110
111
# File 'lib/maltese/sitemap.rb', line 102

# Builds the query URL for the DOI search endpoint.
#
# The +options+ hash is only read: the effective page size is kept in a
# local variable so the caller's hash is never modified (and may be frozen).
#
# @param options [Hash] optional settings
# @option options [Integer] :size page size; defaults to #job_batch_size
# @return [String] #search_path followed by the form-encoded parameters
def get_query_url(options={})
  page_size = options[:size] || job_batch_size

  params = {
    "fields[dois]" => "doi,updated",
    "page[scroll]" => "7m",
    "page[size]" => page_size
  }
  search_path + URI.encode_www_form(params)
end

#get_total(options = {}) ⇒ Object



95
96
97
98
99
100
# File 'lib/maltese/sitemap.rb', line 95

# Asks the API how many records match by requesting a single-item page and
# reading "meta" -> "total" from the response body.
#
# @param options [Hash] passed to #get_query_url (with :size forced to 1)
#   and, unchanged, to Maremma.get
# @return [Object, nil] the value found at meta.total, or nil when absent
def get_total(options={})
  count_url = get_query_url(options.merge(size: 1))
  Maremma.get(count_url, options).body.dig("meta", "total")
end

#job_batch_size ⇒ Object



60
61
62
# File 'lib/maltese/sitemap.rb', line 60

# Page size used by #get_query_url when no :size option is supplied.
#
# @return [Integer] always 1000
def job_batch_size
  1_000
end

#parse_data(result) ⇒ Object



165
166
167
168
169
170
171
# File 'lib/maltese/sitemap.rb', line 165

# Adds one sitemap entry per item found under "data" in the response body.
#
# Each entry points at "/works/<doi>" with a weekly change frequency and
# the item's "updated" attribute as its last-modified value.
#
# @param result [#body] response whose body is read with fetch("data", nil)
# @return [Object] sitemap.sitemap.link_count after the items were added
def parse_data(result)
  records = Array.wrap(result.body.fetch("data", nil))

  records.each do |record|
    doi = record.dig("attributes", "doi")
    last_modified = record.dig("attributes", "updated")
    sitemap.add("/works/" + doi, changefreq: "weekly", lastmod: last_modified)
  end

  sitemap.sitemap.link_count
end

#process_data(options = {}) ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/maltese/sitemap.rb', line 113

# Walks the paginated API results starting at options[:url], feeding each
# page to #parse_data, and publishes the sitemap via #push_data when no
# error was recorded.
#
# Responses with status 408 or 502 are raised inside the Retriable block so
# that the request is retried. Any other non-200 status, or an exception
# that is still raised after the retries, is logged, counted and ends the
# walk; exceptions additionally trigger a Slack notification outside the
# "test" environment. In the "test" environment only one page is processed.
#
# @param options [Hash] :url is the first page to fetch (it is replaced by
#   the "next" link of each page); :start_time is overwritten with the
#   current time
# @return [Object] the link count reported for the last parsed page (0 if
#   none) when an error was recorded, otherwise the result of #push_data
def process_data(options = {})
  options[:start_time] = Time.now
  link_count = 0
  error_count = 0

  # walk through paginated results
  while options[:url] do
    begin
      response = nil

      # retry on transient errors (status codes 408 and 502)
      Retriable.retriable(base_interval: 10, multiplier: 2) do
        response = get_data(options[:url])

        raise Timeout::Error, "A timeout error occured for URL #{options[:url]}." if response.status == 408
        raise BadGatewayError, "A bad gateway error occured for URL #{options[:url]}." if response.status == 502
      end

      if response.status == 200
        link_count = parse_data(response)
        logger.info "#{link_count} DOIs parsed."
        options[:url] = response.body.dig("links", "next")
      else
        logger.error "An error occured for URL #{options[:url]}."
        logger.error "Error: #{response.body.fetch("errors").inspect}" if response.body.fetch("errors", nil).present?
        error_count += 1
        options[:url] = nil
      end
    rescue => exception
      logger.error "Error: #{exception.message}"
      error_count += 1
      fields = [
        { title: "Error", value: exception.message },
        { title: "Time Taken", value: "#{((Time.now - options[:start_time])/ 60.0).ceil} min", short: true }
      ]
      send_notification_to_slack(nil, title: slack_title + ": Sitemaps Not Updated", level: "danger", fields: fields) unless rack_env == "test"
      options[:url] = nil
    end

    # don't loop when testing
    # Deliberately placed after the begin/rescue rather than in an ensure
    # clause: a break inside ensure would silently discard an in-flight
    # exception that the rescue above does not handle (e.g. Interrupt).
    break if rack_env == "test"
  end

  return link_count if error_count > 0

  push_data(options)
end

#push_data(options = {}) ⇒ Object



173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/maltese/sitemap.rb', line 173

# Finalizes the sitemap, records the elapsed time in the sitemap index
# statistics and reports the result to Slack (skipped when rack_env is
# "test").
#
# @param options [Hash] :start_time marks when processing began; it is set
#   to the current time when missing
# @return [Object] sitemap.sitemap.link_count
def push_data(options={})
  sitemap.finalize!
  options[:start_time] ||= Time.now
  sitemap.sitemap_index.stats_summary(time_taken: Time.now - options[:start_time])

  summary_fields = [
    { title: "URL", value: "#{sitemap_url}sitemaps/sitemap.xml.gz" },
    { title: "Number of DOIs", value: format_number(sitemap.sitemap.link_count), short: true },
    { title: "Time Taken", value: "#{((Time.now - options[:start_time])/ 60.0).ceil} min", short: true }
  ]

  unless rack_env == "test"
    send_notification_to_slack(nil, title: slack_title + ": Sitemaps Updated", level: "good", fields: summary_fields)
  end

  sitemap.sitemap.link_count
end

#queue_jobs(options = {}) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/maltese/sitemap.rb', line 80

# Looks up the number of matching works and, when there are any, starts
# processing them from the first query URL.
#
# @param options [Hash] passed to #get_total and merged into the options
#   given to #process_data
# @return [Integer] the number of works found (0 when the total is nil)
def queue_jobs(options={})
  work_count = get_total(options)

  if work_count.nil?
    logger.error "An error occured."
    return work_count.to_i
  end

  if work_count > 0
    process_data(options.merge(total: work_count, url: get_query_url))
  else
    logger.info "No works found."
  end

  # return number of works queued
  work_count.to_i
end

#s3_adapter ⇒ Object



73
74
75
76
77
78
# File 'lib/maltese/sitemap.rb', line 73

# Builds a new SitemapGenerator::AwsSdkAdapter for #sitemap_bucket using
# the configured access key, secret key and region.
#
# @return [SitemapGenerator::AwsSdkAdapter] a fresh adapter on every call
def s3_adapter
  credentials = {
    aws_access_key_id: access_key,
    aws_secret_access_key: secret_key,
    aws_region: region
  }

  SitemapGenerator::AwsSdkAdapter.new(sitemap_bucket, **credentials)
end

#search_path ⇒ Object



52
53
54
# File 'lib/maltese/sitemap.rb', line 52

# Base URL of the DOI search endpoint, ending in "?" so that query
# parameters can be appended directly.
#
# @return [String] the production endpoint when rack_env is "production",
#   otherwise the test endpoint
def search_path
  if rack_env == "production"
    "https://api.datacite.org/dois?"
  else
    "https://api.test.datacite.org/dois?"
  end
end

#send_notification_to_slack(text, options = {}) ⇒ Object



187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/maltese/sitemap.rb', line 187

# Posts a single-attachment message to the configured Slack webhook.
#
# Nothing is sent when no webhook URL is configured. Entries of the
# attachment whose value is nil are dropped before sending.
#
# @param text [String, nil] attachment text
# @param options [Hash] :title (default "Fabrica Message"), :level used as
#   the attachment color (default "good") and :fields
# @return [Object, nil] the body of the first response; nil when no webhook
#   URL is configured; the result of logger.error when
#   Slack::Notifier::APIError is raised
def send_notification_to_slack(text, options={})
  return nil unless slack_webhook_url.present?

  title = options[:title] || "Fabrica Message"
  color = options[:level] || "good"
  attachment = { title: title, text: text, color: color, fields: options[:fields] }.compact

  notifier = Slack::Notifier.new(slack_webhook_url, username: "Fabrica", icon_url: SLACK_ICON_URL)
  responses = notifier.ping(attachments: [attachment])
  responses.first.body
rescue Slack::Notifier::APIError => exception
  logger.error exception.message
end

#sitemap ⇒ Object



64
65
66
67
68
69
70
71
# File 'lib/maltese/sitemap.rb', line 64

# Returns the SitemapGenerator::LinkSet used to collect links, creating it
# on first use and caching it in @sitemap.
#
# The link set is created with finalize: false, and #sitemap_url serves as
# both the default host and the sitemaps host.
#
# @return [SitemapGenerator::LinkSet] the cached link set
def sitemap
  return @sitemap if @sitemap

  host = sitemap_url
  @sitemap = SitemapGenerator::LinkSet.new(
    default_host: host,
    sitemaps_host: sitemap_url,
    sitemaps_path: sitemaps_path,
    adapter: s3_adapter,
    finalize: false
  )
end

#sitemap_url ⇒ Object



40
41
42
# File 'lib/maltese/sitemap.rb', line 40

# Base URL, with trailing slash, of the site the sitemap is generated for.
#
# @return [String] the production site when rack_env is "production",
#   otherwise the test site
def sitemap_url
  if rack_env == "production"
    "https://search.datacite.org/"
  else
    "https://search.test.datacite.org/"
  end
end

#sitemaps_path ⇒ Object



48
49
50
# File 'lib/maltese/sitemap.rb', line 48

# Returns "sitemaps/", the value passed as sitemaps_path when #sitemap
# creates its link set.
def sitemaps_path
  "sitemaps/"
end

#slack_title ⇒ Object



44
45
46
# File 'lib/maltese/sitemap.rb', line 44

# Title prefix for Slack notifications.
#
# @return [String] "DataCite Fabrica" when rack_env is "production",
#   otherwise "DataCite Fabrica Test"
def slack_title
  if rack_env == "production"
    "DataCite Fabrica"
  else
    "DataCite Fabrica Test"
  end
end

#timeout ⇒ Object



56
57
58
# File 'lib/maltese/sitemap.rb', line 56

# Returns 60.
# NOTE(review): presumably a request timeout in seconds, but no caller is
# visible on this page and #get_data passes its own timeout of 300 —
# confirm where this value is used.
def timeout
  60
end