Class: Toccatore::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/toccatore/base.rb

Direct Known Subclasses

DataciteRelated, OrcidUpdate, UsageUpdate

Constant Summary collapse

ICON_URL =

icon for Slack messages

"https://raw.githubusercontent.com/datacite/toccatore/master/lib/toccatore/images/toccatore.png"

Instance Method Summary collapse

Instance Method Details

#cleanup_author(author) ⇒ Object



212
213
214
215
216
217
218
219
220
# File 'lib/toccatore/base.rb', line 212

# Normalizes a raw author string before it is handed to the name parser.
#
# When the string contains no comma, a trailing block of initials
# ("Smith J.") is rewritten into sort-order form ("Smith, J."). The result
# is then passed through String#my_titleize (defined outside this method)
# and every whitespace character is replaced with a plain space.
#
# @param author [String] raw author name
# @return [String] cleaned-up author name
def cleanup_author(author)
  trailing_initials = /[[:space:]]([A-Z]\.)?(-?[A-Z]\.)$/
  already_sorted = author.include?(",")

  # "Smith J." becomes "Smith, J."; "Smith, John K." is left untouched
  author = author.gsub(trailing_initials, ', \1\2') unless already_sorted

  titleized = author.my_titleize

  # replace non-standard space characters with a regular space
  titleized.gsub(/[[:space:]]/, ' ')
end

#get_authors(authors, options = {}) ⇒ Object

parse array of author strings into CSL format



230
231
232
# File 'lib/toccatore/base.rb', line 230

# Parses a list of author strings into CSL format.
#
# @param authors [Array<String>, String, nil] coerced with Array(), so a
#   single value or nil is accepted as well
# @param options [Hash] accepted but not used by this method
# @return [Array<Hash>] one get_one_author result per entry, in input order
def get_authors(authors, options={})
  Array(authors).each_with_object([]) do |name, parsed|
    parsed << get_one_author(name)
  end
end

#get_data(options = {}) ⇒ Object



102
103
104
105
# File 'lib/toccatore/base.rb', line 102

# Fetches search results for the given options.
#
# The query URL is built by get_query_url; the same options hash is also
# passed on to Maremma.get.
#
# @param options [Hash] query and request options
# @return [Object] whatever Maremma.get returns
def get_data(options={})
  Maremma.get(get_query_url(options), options)
end

#get_doi_ra(prefix) ⇒ Object



152
153
154
155
156
157
158
159
160
161
# File 'lib/toccatore/base.rb', line 152

# Looks up the registration agency for a DOI prefix via the DataCite
# prefixes API.
#
# @param prefix [String, nil] DOI prefix
# @return [Object, nil] nil for a blank prefix; the "errors" entry of the
#   response body when it is present; otherwise the "registration-agency"
#   attribute of the "data" entry (nil when missing)
def get_doi_ra(prefix)
  return nil if prefix.blank?

  response = Maremma.get("https://api.datacite.org/prefixes/#{prefix}")
  body = response.body

  # hand API errors back to the caller instead of an agency
  errors = body.fetch("errors", nil)
  return errors if errors.present?

  attributes = body.fetch("data", {}).fetch('attributes', {})
  attributes.fetch('registration-agency', nil)
end

#get_hashed_authors(authors) ⇒ Object

parse array of author hashes into CSL format



235
236
237
# File 'lib/toccatore/base.rb', line 235

# Parses a list of author hashes into CSL format.
#
# @param authors [Array<Hash>, nil] coerced with Array()
# @return [Array<Hash>] one get_one_hashed_author result per entry
def get_hashed_authors(authors)
  Array(authors).each_with_object([]) do |entry, parsed|
    parsed << get_one_hashed_author(entry)
  end
end

#get_name_identifier(author) ⇒ Object



247
248
249
250
251
252
253
254
255
# File 'lib/toccatore/base.rb', line 247

# Returns the author's ORCID as a URL, if the author hash carries one.
#
# The scheme is read from "nameIdentifierScheme" and compared
# case-insensitively with "orcid". A missing scheme, or one explicitly set
# to nil, is treated as "orcid". The identifier itself must pass
# validate_orcid.
#
# @param author [Hash] author hash with optional "nameIdentifier" and
#   "nameIdentifierScheme" keys
# @return [String, nil] "http://orcid.org/<id>", or nil when the scheme is
#   not ORCID or the identifier does not validate
def get_name_identifier(author)
  # `|| "orcid"` also covers a key that is present with a nil value, which
  # would otherwise raise NoMethodError on #downcase
  scheme = (author.fetch("nameIdentifierScheme", nil) || "orcid").to_s.downcase
  return nil unless scheme == "orcid"

  orcid = validate_orcid(author.fetch("nameIdentifier", nil))
  "http://orcid.org/#{orcid}" if orcid
end

#get_one_author(author) ⇒ Object

Parse an author string into CSL format. Only assume a personal name when the name is given in sort order: “Turing, Alan”.



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/toccatore/base.rb', line 196

# Parses a single author string into a CSL name hash.
#
# The string is cleaned with cleanup_author and parsed with Namae. A
# family/given hash is returned only when parsing produced a result and
# is_personal_name? accepts the cleaned string; otherwise the cleaned string
# is returned as a "literal" name.
#
# @param author [String, nil] raw author name
# @return [Hash] { "literal" => "" } for nil or blank input,
#   { "literal" => name } for names not treated as personal, otherwise
#   { "family" => ..., "given" => ... } with nil entries removed
def get_one_author(author)
  # nil (e.g. a missing "creatorName") is handled like an empty name instead
  # of raising NoMethodError on #strip
  return { "literal" => "" } if author.nil? || author.strip.blank?

  author = cleanup_author(author)
  names = Namae.parse(author)

  if names.blank? || is_personal_name?(author).blank?
    { "literal" => author }
  else
    name = names.first

    { "family" => name.family,
      "given" => name.given }.compact
  end
end

#get_one_hashed_author(author) ⇒ Object



239
240
241
242
243
244
245
# File 'lib/toccatore/base.rb', line 239

# Parses a single author hash into a CSL name hash.
#
# The name is taken from "creatorName" and parsed with get_one_author; the
# result of get_name_identifier is added under "ORCID". Entries whose value
# is nil are dropped from the returned hash.
#
# @param author [Hash] author hash
# @return [Hash] CSL name hash, with "ORCID" only when an identifier was found
def get_one_hashed_author(author)
  csl_name = get_one_author(author.fetch("creatorName", nil))
  orcid_url = get_name_identifier(author)

  csl_name.merge("ORCID" => orcid_url).compact
end

#get_query_url(options = {}) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/toccatore/base.rb', line 28

# Builds the search API URL for the given options.
#
# The "q" parameter is chosen from the first option that is present, in this
# order: :doi, :orcid, :related_identifier, :query. When none is present the
# value of the `query` method is used. Results are always filtered by the
# :from_date/:until_date update range and by has_metadata/is_active.
#
# @param options [Hash] :doi, :orcid, :related_identifier, :query,
#   :from_date, :until_date, :offset, :rows
# @return [String] `url` followed by the form-encoded query parameters
def get_query_url(options={})
  date_range = "updated:[#{options[:from_date]}T00:00:00Z TO #{options[:until_date]}T23:59:59Z]"
  filter_query = "#{date_range} AND has_metadata:true AND is_active:true"

  search_term =
    if options[:doi].present?
      "doi:#{options[:doi]}"
    elsif options[:orcid].present?
      "nameIdentifier:ORCID\\:#{options[:orcid]}"
    elsif options[:related_identifier].present?
      "relatedIdentifier:DOI\\:#{options[:related_identifier]}"
    elsif options[:query].present?
      options[:query]
    else
      query
    end

  params = {
    q: search_term,
    start: options[:offset],
    rows: options[:rows],
    fl: "doi,resourceTypeGeneral,relatedIdentifier,nameIdentifier,minted,updated",
    fq: filter_query,
    wt: "json"
  }

  query_string = URI.encode_www_form(params)
  url + query_string
end

#get_total(options = {}) ⇒ Object



53
54
55
56
57
# File 'lib/toccatore/base.rb', line 53

# Asks the search API how many records match the given options.
#
# The query is sent with rows set to 0; the count is read from
# data.response.numFound in the response body.
#
# @param options [Hash] query and request options
# @return [Integer] value of "numFound", or 0 when any level of the
#   response body is missing
def get_total(options={})
  count_only_url = get_query_url(options.merge(rows: 0))
  response = Maremma.get(count_only_url, options)

  solr_response = response.body.fetch("data", {}).fetch("response", {})
  solr_response.fetch("numFound", 0)
end

#is_personal_name?(author) ⇒ Boolean

Returns:

  • (Boolean)


222
223
224
225
226
227
# File 'lib/toccatore/base.rb', line 222

# Decides whether an author string should be treated as a personal name.
#
# A string containing a comma (sort-order form such as "Turing, Alan") is
# always accepted. Otherwise the first whitespace-separated word is looked up
# as a given name with name_detector.
#
# @param author [String] cleaned author name
# @return [Boolean] true for a comma-separated name, otherwise the result of
#   name_detector.name_exists?
def is_personal_name?(author)
  author.include?(",") || name_detector.name_exists?(author.split.first)
end

#job_batch_size ⇒ Object



132
133
134
# File 'lib/toccatore/base.rb', line 132

# Default page size used by queue_jobs when no :rows option is given.
#
# @return [Integer]
def job_batch_size
  1_000
end

#name_detector ⇒ Object



257
258
259
# File 'lib/toccatore/base.rb', line 257

# Returns the GenderDetector used to look up given names.
#
# The detector is created on first use and then reused for the lifetime of
# this object, because is_personal_name? calls this method once per author.
# NOTE(review): assumes GenderDetector instances are safe to reuse across
# lookups and that construction is the expensive part (it presumably loads
# the name dictionary) - confirm against the gender_detector gem.
#
# @return [GenderDetector]
def name_detector
  @name_detector ||= GenderDetector.new
end

#normalize_doi(doi) ⇒ Object



171
172
173
174
175
176
177
178
179
180
# File 'lib/toccatore/base.rb', line 171

# Turns a DOI (bare, "doi:"-prefixed or as a doi.org URL) into a normalized
# https://doi.org/ URL.
#
# @param doi [String, nil] DOI in any form accepted by validate_doi
# @return [String, nil] normalized URL, or nil when validate_doi rejects
#   the input
def normalize_doi(doi)
  valid_doi = validate_doi(doi)
  return nil if valid_doi.blank?

  # strip zero-width spaces (U+200B) and lower-case the DOI
  cleaned = valid_doi.delete("\u200B").downcase

  # escape unsafe characters before appending to the resolver URL
  "https://doi.org/" + Addressable::URI.encode(cleaned)
end

#orcid_as_url(orcid) ⇒ Object



186
187
188
# File 'lib/toccatore/base.rb', line 186

# Formats an ORCID identifier as an orcid.org URL.
#
# @param orcid [String, nil] ORCID identifier
# @return [String, nil] "http://orcid.org/<orcid>", or nil for blank input
def orcid_as_url(orcid)
  return nil unless orcid.present?

  "http://orcid.org/#{orcid}"
end

#orcid_from_url(url) ⇒ Object



182
183
184
# File 'lib/toccatore/base.rb', line 182

# Extracts the identifier part from an orcid.org URL.
#
# Both the http:// and the https:// form of the URL are accepted.
#
# @param url [String, nil] ORCID URL
# @return [String, nil] everything after "orcid.org/", or nil when the input
#   is not an orcid.org URL
def orcid_from_url(url)
  # Array(nil) is [], so a failed match yields nil via #last
  Array(%r{\Ahttps?://orcid\.org/(.+)}.match(url)).last
end

#process_data(options = {}) ⇒ Object



93
94
95
96
97
98
99
100
# File 'lib/toccatore/base.rb', line 93

# Fetches, parses and pushes one page of results.
#
# get_data is called with the options extended by :timeout and :source_id;
# its result is passed through parse_data and then handed to push_data.
#
# @param options [Hash] query and request options
# @return [Array<OpenStruct>, Object] a one-element array wrapping an
#   OpenStruct with an empty "data" body when parsing yields nothing;
#   otherwise the return value of push_data
def process_data(options = {})
  request_options = options.merge(timeout: timeout, source_id: source_id)
  items = parse_data(get_data(request_options), options)

  if items.empty?
    [OpenStruct.new(body: { "data" => [] })]
  else
    push_data(items, options)
  end
end

#push_data(items, options = {}) ⇒ Object

method returns number of errors



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/toccatore/base.rb', line 108

# Pushes every item with push_item and reports the number of errors.
#
# @param items [Array] parsed items to push
# @param options [Hash] :access_token is required for pushing; :from_date
#   and :until_date are only used in the "nothing found" message; :total is
#   returned as the error count when the access token is missing
# @return [Integer, nil] 0 when items is empty, options[:total] when the
#   access token is blank, otherwise the sum of the push_item results
def push_data(items, options={})
  if items.empty?
    puts "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
    return 0
  end

  if options[:access_token].blank?
    puts "An error occured: Access token missing."
    return options[:total]
  end

  Array(items).inject(0) do |error_count, item|
    error_count + push_item(item, options)
  end
end

#queue_jobs(options = {}) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/toccatore/base.rb', line 59

# Processes all works matching the given options, page by page, and reports
# the outcome on stdout and (optionally) Slack.
#
# The options hash is modified in place: :offset, :rows, :from_date,
# :until_date, :total, :level and :title are set on it.
#
# @param options [Hash] :rows (page size, defaults to job_batch_size),
#   :from_date / :until_date (default to yesterday / today),
#   :slack_webhook_url (enables the Slack notification), plus any query
#   options understood by get_query_url
# @return [Integer] number of works reported by get_total
def queue_jobs(options={})
  options[:offset] = options[:offset].to_i

  # fall back to the default page size for missing or non-positive values
  rows = options[:rows].to_i
  options[:rows] = rows > 0 ? rows : job_batch_size

  options[:from_date] = options[:from_date].presence || (Time.now.to_date - 1.day).iso8601
  options[:until_date] = options[:until_date].presence || Time.now.to_date.iso8601

  total = get_total(options)

  if total > 0
    # walk through paginated results; the step must equal the number of rows
    # requested per page, otherwise records are skipped or fetched twice
    page_size = options[:rows]
    total_pages = (total.to_f / page_size).ceil
    error_total = 0

    (0...total_pages).each do |page|
      options[:offset] = page * page_size
      options[:total] = total

      # process_data returns an Array instead of a count when a page yields
      # no parsed data; such a page contributes no errors
      page_errors = process_data(options)
      error_total += page_errors if page_errors.is_a?(Integer)
    end
    text = "#{total} works processed with #{error_total} errors for date range #{options[:from_date]} - #{options[:until_date]}."
  else
    text = "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
  end

  puts text

  # send slack notification
  options[:level] = total > 0 ? "good" : "warning"
  options[:title] = "Report for #{source_id}"
  send_notification_to_slack(text, options) if options[:slack_webhook_url].present?

  # return number of works queued
  total
end

#send_notification_to_slack(text, options = {}) ⇒ Object



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/toccatore/base.rb', line 136

# Posts a report to Slack through an incoming webhook.
#
# @param text [String] message body
# @param options [Hash] :slack_webhook_url (required), :title (defaults to
#   "Report"), :level (attachment color, defaults to "good")
# @return [Object, nil] body of the notifier response, or nil when no
#   webhook URL is configured
def send_notification_to_slack(text, options={})
  webhook_url = options[:slack_webhook_url]
  return nil unless webhook_url.present?

  attachment = {
    title: options[:title] || "Report",
    text: text,
    color: options[:level] || "good"
  }

  notifier = Slack::Notifier.new(webhook_url,
                                 username: "Event Data Agent",
                                 icon_url: ICON_URL)
  notifier.ping(attachments: [attachment]).body
end

#timeout ⇒ Object



128
129
130
# File 'lib/toccatore/base.rb', line 128

# Timeout value that process_data passes to get_data as :timeout.
# NOTE(review): presumably seconds - confirm against Maremma.
#
# @return [Integer]
def timeout
  2 * 60
end

#unfreeze(hsh) ⇒ Object



261
262
263
264
265
# File 'lib/toccatore/base.rb', line 261

# Copies a hash into a new Hash with downcased, symbolized keys.
#
# @param hsh [#each_pair] source hash; keys must respond to #downcase
# @return [Hash{Symbol => Object}] new hash, values unchanged
def unfreeze(hsh)
  copy = {}
  hsh.each_pair do |key, value|
    copy[key.downcase.to_sym] = value
  end
  copy
end

#url ⇒ Object



124
125
126
# File 'lib/toccatore/base.rb', line 124

# Base URL of the search API, ending in "?" so that get_query_url can append
# the encoded query string directly.
#
# @return [String]
def url
  'https://search.datacite.org/api?'
end

#validate_doi(doi) ⇒ Object



163
164
165
# File 'lib/toccatore/base.rb', line 163

# Extracts the DOI from a bare DOI, a "doi:"-prefixed DOI or a
# (dx.)doi.org URL.
#
# The dot in "doi.org" is matched literally, so look-alike hosts such as
# "doiXorg" are rejected.
#
# @param doi [String, nil] DOI in one of the accepted forms
# @return [String, nil] the DOI starting with "10.", or nil when the input
#   does not match
def validate_doi(doi)
  # the DOI is the last capture group; Array(nil) is [], so no match => nil
  Array(%r{\A(?:(http|https)://(dx\.)?doi\.org/)?(doi:)?(10\.\d{4,5}/.+)\z}.match(doi)).last
end

#validate_orcid(orcid) ⇒ Object



190
191
192
# File 'lib/toccatore/base.rb', line 190

# Extracts a well-formed ORCID identifier from a bare identifier or an
# orcid.org URL (http or https).
#
# The identifier must be exactly four dash-separated groups of four
# characters, the last character being a digit or "X"; longer strings are
# rejected.
#
# @param orcid [String, nil] ORCID identifier or URL
# @return [String, nil] the bare identifier, or nil when the input does not
#   match
def validate_orcid(orcid)
  # Array(nil) is [], so a failed match yields nil via #last
  Array(%r{\A(?:https?://orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z}.match(orcid)).last
end

#validate_prefix(doi) ⇒ Object



167
168
169
# File 'lib/toccatore/base.rb', line 167

# Extracts the DOI prefix from a bare DOI, a "doi:"-prefixed DOI or a
# (dx.)doi.org URL.
#
# The dot in "doi.org" is matched literally, so look-alike hosts such as
# "doiXorg" are rejected.
#
# @param doi [String, nil] DOI in one of the accepted forms
# @return [String, nil] the part before the first slash (e.g. "10.5061"), or
#   nil when the input does not match
def validate_prefix(doi)
  # the prefix is the last capture group; Array(nil) is [], so no match => nil
  Array(%r{\A(?:(http|https)://(dx\.)?doi\.org/)?(doi:)?(10\.\d{4,5})/.+\z}.match(doi)).last
end