Class: Toccatore::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/toccatore/base.rb

Direct Known Subclasses

OrcidUpdate

Instance Method Summary collapse

Instance Method Details

#clean_doi(doi) ⇒ Object

remove non-printing whitespace



275
276
277
# File 'lib/toccatore/base.rb', line 275

# Strip zero-width space characters (U+200B) from a DOI.
#
# @param doi [String] DOI that may contain zero-width spaces
# @return [String] copy of the DOI with every U+200B removed
def clean_doi(doi)
  doi.delete("\u200B")
end

#cleanup_author(author) ⇒ Object



322
323
324
325
326
327
328
329
330
# File 'lib/toccatore/base.rb', line 322

# Normalise a raw author string before it is parsed into name parts.
#
# @param author [String] raw author name
# @return [String] the name after String#my_titleize (defined elsewhere),
#   with every whitespace character replaced by a plain space
def cleanup_author(author)
  # detect pattern "Smith J.", but not "Smith, John K.":
  # put a comma in front of the trailing initial(s)
  unless author.include?(",")
    author = author.gsub(/[[:space:]]([A-Z]\.)?(-?[A-Z]\.)$/, ', \1\2')
  end

  # titleize first, then replace non-standard space characters
  titleized = author.my_titleize
  titleized.gsub(/[[:space:]]/, ' ')
end

#config_fields ⇒ Object



258
259
260
# File 'lib/toccatore/base.rb', line 258

# Names of the configuration settings used by this class.
#
# @return [Array<Symbol>]
def config_fields
  %i[url push_url access_token]
end

#doi_as_url(doi) ⇒ Object



288
289
290
# File 'lib/toccatore/base.rb', line 288

# Express a DOI as an encoded https://doi.org/ URL.
#
# @param doi [String, nil] DOI name
# @return [String, nil] encoded URL, or nil when the DOI is not present
def doi_as_url(doi)
  return unless doi.present?

  Addressable::URI.encode("https://doi.org/#{clean_doi(doi)}")
end

#doi_from_url(url) ⇒ Object



279
280
281
282
283
284
285
286
# File 'lib/toccatore/base.rb', line 279

# Extract an upper-cased DOI from a doi.org / dx.doi.org URL or a "doi:" string.
#
# @param url [String, nil] URL or "doi:"-prefixed identifier
# @return [String, nil] upper-cased DOI, or nil when the input is nil or in
#   neither recognised form
def doi_from_url(url)
  # coerce so that a nil input yields nil instead of raising
  url = url.to_s

  if /(http|https):\/\/(dx\.)?doi\.org\/(\w+)/.match(url)
    # the DOI is the URL path without its leading slash
    Addressable::URI.parse(url).path[1..-1].upcase
  elsif url.start_with?("doi:")
    url[4..-1].upcase
  end
end

#get_authors(authors, options = {}) ⇒ Object

parse array of author strings into CSL format



340
341
342
# File 'lib/toccatore/base.rb', line 340

# Parse a list of author strings, delegating each one to get_one_author.
#
# @param authors [Array<String>, String, nil] author names
# @param options [Hash] passed through unchanged to get_one_author
# @return [Array] one get_one_author result per author, in input order
def get_authors(authors, options={})
  Array(authors).each_with_object([]) do |author, parsed|
    parsed << get_one_author(author, options)
  end
end

#get_contributions(obj, items) ⇒ Object

we are flipping subj and obj for contributions



236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# File 'lib/toccatore/base.rb', line 236

# Build one "contribution" message per valid ORCID name identifier.
# Subject and object are flipped here: the ORCID is the subject and the
# work passed in as +obj+ is the object.
#
# @param obj [Hash] work metadata; reads "DOI", "pid", "publisher_id", "issued"
# @param items [Array<String>, nil] name identifiers such as "ORCID:0000-0002-1825-0097"
# @return [Array<Hash>] contribution messages for the items whose ORCID validates
def get_contributions(obj, items)
  prefix = obj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, contributions|
    # everything after the first colon is the identifier itself
    orcid = validate_orcid(item.split(':', 2).last)

    # skip invalid identifiers; a `return` here would leave the whole method
    # and drop every item that follows
    next if orcid.nil?

    contributions << { prefix: prefix,
                       message_type: "contribution",
                       relation: { "subj_id" => orcid_as_url(orcid),
                                   "obj_id" => obj["pid"],
                                   "relation_type_id" => nil,
                                   "source_id" => source_id,
                                   "publisher_id" => obj["publisher_id"],
                                   "registration_agency_id" => "datacite",
                                   "occurred_at" => obj["issued"] },
                       obj: obj }
  end
end

#get_data(options = {}) ⇒ Object



85
86
87
88
# File 'lib/toccatore/base.rb', line 85

# Fetch one page of results: builds the query URL from the options and
# requests it with Maremma.get.
#
# @param options [Hash] query options; also handed to Maremma.get
# @return [Object] whatever Maremma.get returns
def get_data(options={})
  Maremma.get(get_query_url(options), options)
end

#get_doi_relations(subj, items) ⇒ Object



208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# File 'lib/toccatore/base.rb', line 208

# Turn related DOI identifiers of a work into relation messages.
#
# @param subj [Hash] work metadata; reads "DOI", "pid", "publisher_id", "issued"
# @param items [Array<String>, nil] strings of the form
#   "<relationType>:<identifierType>:<doi>"
# @return [Array<Hash>] one message per item that is not skipped
def get_doi_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    raw_relation_type, _related_identifier_type, related_identifier = item.split(':', 3)
    doi = related_identifier.strip.upcase
    registration_agency = get_doi_ra(doi)

    # the datacite_crossref source ignores DOIs registered with datacite
    next if source_id == "datacite_crossref" && registration_agency == "datacite"

    relation_source_id = if registration_agency == "crossref"
                           "datacite_crossref"
                         else
                           "datacite_related"
                         end
    pid = doi_as_url(doi)

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "obj_id" => pid,
                               "relation_type_id" => raw_relation_type.underscore,
                               "source_id" => relation_source_id,
                               "publisher_id" => subj["publisher_id"],
                               "registration_agency_id" => registration_agency,
                               "occurred_at" => subj["issued"] },
                   subj: subj }
  end
end

#get_github_relations(subj, items) ⇒ Object



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/toccatore/base.rb', line 167

# Turn related GitHub URLs of a work into relation and contribution messages.
#
# For each item this emits the work -> URL relation, an "is_part_of"
# relation when the URL differs from the repository URL, and an
# owner -> repository contribution.
#
# @param subj [Hash] work metadata; reads "DOI", "pid", "publisher_id", "issued"
# @param items [Array<String>, nil] strings of the form
#   "<relationType>:<identifierType>:<url>"
# @return [Array<Hash>] two or three messages per item
def get_github_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    raw_relation_type, _related_identifier_type, identifier = item.split(':', 3)

    # get parent repo
    # code from https://github.com/octokit/octokit.rb/blob/master/lib/octokit/repository.rb
    identifier = PostRank::URI.clean(identifier)
    github = github_from_url(identifier)
    owner_url = github_as_owner_url(github)
    repo_url = github_as_repo_url(github)

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "obj_id" => identifier,
                               "relation_type_id" => raw_relation_type.underscore,
                               "source_id" => source_id,
                               "publisher_id" => subj["publisher_id"],
                               "registration_agency_id" => "github",
                               "occurred_at" => subj["issued"] },
                   subj: subj }

    # if relatedIdentifier is release URL rather than repo URL
    unless identifier == repo_url
      relations << { relation: { "subj_id" => identifier,
                                 "obj_id" => repo_url,
                                 "relation_type_id" => "is_part_of",
                                 "source_id" => source_id,
                                 "publisher_id" => "github",
                                 "registration_agency_id" => "github" } }
    end

    relations << { message_type: "contribution",
                   relation: { "subj_id" => owner_url,
                               "obj_id" => repo_url,
                               "source_id" => "github_contributor",
                               "registration_agency_id" => "github" } }
  end
end

#get_hashed_authors(authors) ⇒ Object

parse array of author hashes into CSL format



345
346
347
# File 'lib/toccatore/base.rb', line 345

# Parse a list of author hashes, delegating each one to get_one_hashed_author.
#
# @param authors [Array<Hash>, nil] author hashes
# @return [Array] one get_one_hashed_author result per author, in input order
def get_hashed_authors(authors)
  Array(authors).each_with_object([]) do |author, parsed|
    parsed << get_one_hashed_author(author)
  end
end

#get_name_identifier(author) ⇒ Object



357
358
359
360
361
362
363
364
365
# File 'lib/toccatore/base.rb', line 357

# Return the author's ORCID as a URL when the name identifier is a valid ORCID.
#
# @param author [Hash] reads "nameIdentifier" and "nameIdentifierScheme"
#   (the scheme defaults to "orcid" when the key is absent)
# @return [String, nil] "http://orcid.org/<id>", or nil when the scheme is
#   not ORCID or the identifier does not validate
def get_name_identifier(author)
  scheme = author.fetch("nameIdentifierScheme", "orcid").downcase
  return nil unless scheme == "orcid"

  orcid = validate_orcid(author.fetch("nameIdentifier", nil))
  orcid ? "http://orcid.org/#{orcid}" : nil
end

#get_one_author(author, options = {}) ⇒ Object

Parse an author string into CSL format. Only assume a personal name when using sort-order: “Turing, Alan”.



306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# File 'lib/toccatore/base.rb', line 306

# Parse a single author string into a CSL-style name hash.
#
# @param author [String, nil] raw author name
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Hash] { "literal" => "" } for nil or blank input,
#   { "literal" => name } when Namae finds no name or the string is not
#   recognised as a personal name, otherwise a hash holding whichever of
#   "family" / "given" are not nil
def get_one_author(author, options = {})
  # tolerate a missing name (e.g. a creator hash without "creatorName")
  author = author.to_s
  return { "literal" => "" } if author.strip.blank?

  author = cleanup_author(author)
  names = Namae.parse(author)

  if names.blank? || is_personal_name?(author).blank?
    { "literal" => author }
  else
    name = names.first

    { "family" => name.family,
      "given" => name.given }.compact
  end
end

#get_one_hashed_author(author) ⇒ Object



349
350
351
352
353
354
355
# File 'lib/toccatore/base.rb', line 349

# Parse a single author hash: the "creatorName" value goes through
# get_one_author and the result is extended with an "ORCID" entry.
#
# @param author [Hash] reads "creatorName"; the whole hash is handed to
#   get_name_identifier
# @return [Hash] parsed name plus "ORCID", with nil values removed
def get_one_hashed_author(author)
  parsed = get_one_author(author.fetch("creatorName", nil))
  parsed.merge("ORCID" => get_name_identifier(author)).compact
end

#get_query_url(options = {}) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/toccatore/base.rb', line 23

# Build the search URL for one page of results.
#
# The query term is chosen from the first of options[:doi],
# options[:orcid], options[:query] that is present, falling back to the
# +query+ method; results are filtered by the updated date range.
#
# @param options [Hash] reads :from_date, :until_date, :doi, :orcid,
#   :query, :offset and :rows
# @return [String] +url+ followed by the form-encoded query parameters
def get_query_url(options={})
  date_range = "updated:[#{options[:from_date]}T00:00:00Z TO #{options[:until_date]}T23:59:59Z]"

  q = if options[:doi].present?
        "doi:#{options[:doi]}"
      elsif options[:orcid].present?
        "nameIdentifier:ORCID\\:#{options[:orcid]}"
      elsif options[:query].present?
        options[:query]
      else
        query
      end

  params = {
    q: q,
    start: options[:offset],
    rows: options[:rows],
    fl: "doi,creator,title,publisher,publicationYear,resourceTypeGeneral,datacentre_symbol,relatedIdentifier,nameIdentifier,xml,minted,updated",
    fq: "#{date_range} AND has_metadata:true AND is_active:true",
    wt: "json"
  }

  url + URI.encode_www_form(params)
end


120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/toccatore/base.rb', line 120

# Convert search result documents into relation/contribution messages.
#
# For every document a subject hash is built from its fields and from the
# creators in its Base64-encoded XML; the messages produced by
# get_doi_relations, get_github_relations and get_contributions are then
# collected, plus one import message when source_id is "datacite_import".
#
# @param items [Array<Hash>, nil] result documents
# @return [Array<Hash>] all messages, in document order
def get_relations_with_related_works(items)
  Array(items).each_with_object([]) do |item, relations|
    doi = item.fetch("doi", nil)
    prefix = doi[/^10\.\d{4,5}/]
    pid = doi_as_url(doi)
    type = item.fetch("resourceTypeGeneral", nil)
    publisher_id = item.fetch("datacentre_symbol", nil)

    # the default decodes to an empty <hsh></hsh> document
    decoded = Base64.decode64(item.fetch('xml', "PGhzaD48L2hzaD4=\n"))
    resource = Hash.from_xml(decoded).fetch("resource", {})
    creators = resource.fetch("creators", {}).fetch("creator", [])
    # a single creator arrives as a Hash rather than an Array
    creators = [creators] if creators.is_a?(Hash)

    subj = { "pid" => pid,
             "DOI" => doi,
             "author" => get_hashed_authors(creators),
             "title" => item.fetch("title", []).first,
             "container-title" => item.fetch("publisher", nil),
             "published" => item.fetch("publicationYear", nil),
             "issued" => item.fetch("minted", nil),
             "publisher_id" => publisher_id,
             "registration_agency_id" => "datacite",
             "tracked" => true,
             "type" => type }

    related_identifiers = item.fetch('relatedIdentifier', [])

    doi_identifiers = related_identifiers.select { |id| id =~ /:DOI:.+/ }
    relations.concat(get_doi_relations(subj, doi_identifiers))

    github_identifiers = related_identifiers.select { |id| id =~ /:URL:https:\/\/github.com.+/ }
    relations.concat(get_github_relations(subj, github_identifiers))

    orcid_identifiers = item.fetch('nameIdentifier', []).select { |id| id =~ /^ORCID:.+/ }
    relations.concat(get_contributions(subj, orcid_identifiers))

    next unless source_id == "datacite_import"

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "source_id" => source_id,
                               "publisher_id" => subj["publisher_id"],
                               "occurred_at" => subj["issued"] },
                   subj: subj }
  end
end

#get_total(options = {}) ⇒ Object



46
47
48
49
50
# File 'lib/toccatore/base.rb', line 46

# Ask the search API how many records match, without fetching any rows.
#
# @param options [Hash] query options; :rows is overridden with 0 for the URL
# @return [Integer] "numFound" from the response body, or 0 when absent
def get_total(options={})
  response = Maremma.get(get_query_url(options.merge(rows: 0)), options)
  data = response.body.fetch("data", {})
  data.fetch("response", {}).fetch("numFound", 0)
end

#is_personal_name?(author) ⇒ Boolean



332
333
334
335
336
337
# File 'lib/toccatore/base.rb', line 332

# Decide whether an author string looks like a personal name.
#
# A comma (sort-order form) is taken as sufficient; otherwise the first
# word is looked up as a given name via name_detector.
#
# @param author [String] author name
# @return [Boolean] true when a comma is present, else the lookup result
def is_personal_name?(author)
  author.include?(",") || name_detector.name_exists?(author.split.first)
end

#job_batch_size ⇒ Object



270
271
272
# File 'lib/toccatore/base.rb', line 270

# Default number of records requested per page.
#
# @return [Integer]
def job_batch_size
  1_000
end

#name_detector ⇒ Object



367
368
369
# File 'lib/toccatore/base.rb', line 367

# Given-name lookup helper used by is_personal_name?.
#
# The detector is memoized so it is built once per object instead of on
# every lookup (presumably construction loads name data — confirm against
# the gender_detector gem).
#
# @return [GenderDetector]
def name_detector
  @name_detector ||= GenderDetector.new
end

#orcid_as_url(orcid) ⇒ Object



296
297
298
# File 'lib/toccatore/base.rb', line 296

# Express an ORCID identifier as an http://orcid.org/ URL.
#
# @param orcid [String, nil] ORCID identifier
# @return [String, nil] the URL, or nil when the identifier is not present
def orcid_as_url(orcid)
  return unless orcid.present?

  "http://orcid.org/#{orcid}"
end

#orcid_from_url(url) ⇒ Object



292
293
294
# File 'lib/toccatore/base.rb', line 292

# Extract the identifier part of an orcid.org URL.
#
# Both the http:// and the https:// form are accepted.
#
# @param url [String, nil] ORCID URL
# @return [String, nil] everything after "orcid.org/", or nil when the
#   input is nil or not an orcid.org URL
def orcid_from_url(url)
  match = %r{\Ahttps?://orcid\.org/(.+)}.match(url)
  match[1] if match
end

#parse_data(result, options = {}) ⇒ Object



90
91
92
93
94
95
# File 'lib/toccatore/base.rb', line 90

# Turn a search response into relation messages, or return its errors.
#
# @param result [#body] response object whose +body+ is a Hash
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Object] the "errors" value from the body when present,
#   otherwise the result of get_relations_with_related_works for the
#   documents under "data" -> "response" -> "docs"
def parse_data(result, options={})
  body = result.body
  return body.fetch("errors") if body.fetch("errors", nil).present?

  # read the documents from the same body hash the errors were checked on
  items = body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
  get_relations_with_related_works(items)
end

#process_data(options = {}) ⇒ Object



76
77
78
79
80
81
82
83
# File 'lib/toccatore/base.rb', line 76

# Fetch, parse and push one page of results.
#
# @param options [Hash] query options; :timeout and :source_id are added
#   for the fetch, :source_id for the parse
# @return [Array] a single placeholder response with an empty "data" list
#   when parsing yields nothing, otherwise the result of push_data
def process_data(options = {})
  fetched = get_data(options.merge(timeout: timeout, source_id: source_id))
  parsed = parse_data(fetched, options.merge(source_id: source_id))

  if parsed.empty?
    [OpenStruct.new(body: { "data" => [] })]
  else
    push_data(parsed, options)
  end
end

#push_data(items, options = {}) ⇒ Object

push to Lagotto deposit API if no error and we have collected works



98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/toccatore/base.rb', line 98

# Post every collected item to push_url as a JSON "deposit".
#
# @param items [Array<Hash>] messages with optional :relation, :subj,
#   :obj, :message_type and :prefix entries
# @param options [Hash] reads :from_date and :until_date for the
#   "nothing found" message
# @return [Array, nil] one Maremma.post result per item, or nil (after
#   printing a message) when there are no items
def push_data(items, options={})
  if items.empty?
    puts "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
    return
  end

  Array(items).map do |item|
    relation = item.fetch(:relation, {})

    # copy the relation fields first, in the order the deposit lists them
    payload = %w[subj_id obj_id relation_type_id source_id publisher_id].each_with_object({}) do |key, acc|
      acc[key] = relation.fetch(key, nil)
    end
    payload["subj"] = item.fetch(:subj, {})
    payload["obj"] = item.fetch(:obj, {})
    payload["message_type"] = item.fetch(:message_type, "relation")
    payload["prefix"] = item.fetch(:prefix, nil)
    payload["source_token"] = uuid

    deposit = { "deposit" => payload }

    Maremma.post(push_url, data: deposit.to_json, content_type: 'json', token: access_token)
  end
end

#queue_jobs(options = {}) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/toccatore/base.rb', line 52

# Process every record matching the options, one page at a time.
#
# Missing options are defaulted in place: :rows to job_batch_size,
# :from_date to yesterday and :until_date to today. Pages are stepped by
# the requested :rows value so that no record is skipped or fetched twice
# when :rows differs from job_batch_size.
#
# @param options [Hash] query options; mutated (defaults and :offset)
# @return [Integer] total number of matching works reported by get_total
def queue_jobs(options={})
  options[:offset] = options[:offset].to_i
  options[:rows] = options[:rows].presence || job_batch_size
  options[:from_date] = options[:from_date].presence || (Time.now.to_date - 1.day).iso8601
  options[:until_date] = options[:until_date].presence || Time.now.to_date.iso8601

  total = get_total(options)

  if total > 0
    # page size must match the number of rows actually requested per call;
    # fall back to the default for non-positive or non-numeric values
    page_size = options[:rows].to_i
    page_size = job_batch_size unless page_size > 0

    # walk through paginated results
    total_pages = (total.to_f / page_size).ceil

    (0...total_pages).each do |page|
      options[:offset] = page * page_size
      process_data(options)
    end
  else
    puts "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
  end

  # return number of works queued
  total
end

#timeout ⇒ Object



266
267
268
# File 'lib/toccatore/base.rb', line 266

# Timeout value passed as the :timeout option when fetching data
# (presumably seconds — confirm against the HTTP client).
#
# @return [Integer]
def timeout
  2 * 60
end

#unfreeze(hsh) ⇒ Object



371
372
373
374
375
# File 'lib/toccatore/base.rb', line 371

# Copy a hash-like object into a new Hash with downcased symbol keys.
#
# @param hsh [#each_pair] source pairs; the source itself is not modified
# @return [Hash{Symbol => Object}] new hash; when keys collide after
#   downcasing, the later pair wins
def unfreeze(hsh)
  hsh.each_pair.map { |key, value| [key.downcase.to_sym, value] }.to_h
end

#url ⇒ Object



262
263
264
# File 'lib/toccatore/base.rb', line 262

# Base URL of the search API, ending in "?" so that encoded query
# parameters can be appended directly.
#
# @return [String]
def url
  'https://search.datacite.org/api?'
end

#validate_orcid(orcid) ⇒ Object



300
301
302
# File 'lib/toccatore/base.rb', line 300

# Validate the shape of an ORCID identifier and return its bare form.
#
# Accepts the bare identifier or its http(s)://orcid.org/ URL. The
# identifier must be exactly four dash-separated groups of four
# characters: digits, with "X" allowed only in the last position.
# The checksum is not verified.
#
# @param orcid [String, nil] identifier or ORCID URL
# @return [String, nil] the bare identifier, or nil when it does not match
def validate_orcid(orcid)
  match = %r{\A(?:https?://orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z}.match(orcid)
  match[1] if match
end