Class: Toccatore::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/toccatore/base.rb

Direct Known Subclasses

OrcidUpdate

Instance Method Summary collapse

Instance Method Details

#clean_doi(doi) ⇒ Object

Remove zero-width space characters (U+200B) from the DOI.



266
267
268
# File 'lib/toccatore/base.rb', line 266

# Strip zero-width space characters (U+200B) from a DOI string.
#
# @param doi [String] DOI that may contain zero-width spaces
# @return [String] a copy of +doi+ with every U+200B character removed
def clean_doi(doi)
  doi.delete("\u200B")
end

#cleanup_author(author) ⇒ Object



313
314
315
316
317
318
319
320
321
# File 'lib/toccatore/base.rb', line 313

# Normalize a raw author string.
#
# When the string contains no comma, trailing initials such as "Smith J."
# are rewritten into sort-order form ("Smith, J."). The result is then passed
# through String#my_titleize and every whitespace character (including
# non-standard ones) is replaced by a plain space.
#
# @param author [String] raw author name
# @return [String] cleaned-up author name
def cleanup_author(author)
  # detect pattern "Smith J.", but leave "Smith, John K." alone
  unless author.include?(",")
    author = author.gsub(/[[:space:]]([A-Z]\.)?(-?[A-Z]\.)$/, ', \1\2')
  end

  titleized = author.my_titleize

  # replace non-standard space characters with a regular space
  titleized.gsub(/[[:space:]]/, ' ')
end

#config_fields ⇒ Object



249
250
251
# File 'lib/toccatore/base.rb', line 249

# List the configuration field names.
#
# @return [Array<Symbol>] :url, :push_url and :access_token, in that order
def config_fields
  %i[url push_url access_token]
end

#doi_as_url(doi) ⇒ Object



279
280
281
# File 'lib/toccatore/base.rb', line 279

# Turn a DOI into an encoded https://doi.org/ URL.
#
# The DOI is passed through clean_doi first and the resulting URL is encoded
# with Addressable::URI.encode.
#
# @param doi [String, nil] the DOI
# @return [String, nil] the DOI URL, or nil when +doi+ is not present
def doi_as_url(doi)
  return nil unless doi.present?

  cleaned = clean_doi(doi)
  Addressable::URI.encode("https://doi.org/#{cleaned}")
end

#doi_from_url(url) ⇒ Object



270
271
272
273
274
275
276
277
# File 'lib/toccatore/base.rb', line 270

# Extract an upper-cased DOI from a doi.org URL or a "doi:" prefixed string.
#
# Recognized inputs are http(s)://doi.org/..., http(s)://dx.doi.org/... and
# strings starting with "doi:". A nil input is treated as an empty string, so
# it yields nil instead of raising NoMethodError.
#
# @param url [String, nil] DOI expressed as URL or with "doi:" prefix
# @return [String, nil] the upper-cased DOI, or nil when no DOI is recognized
def doi_from_url(url)
  url = url.to_s

  if %r{(http|https)://(dx\.)?doi\.org/(\w+)}.match(url)
    # the DOI is the URL path without its leading slash
    Addressable::URI.parse(url).path[1..-1].upcase
  elsif url.start_with?("doi:")
    url[4..-1].upcase
  end
end

#get_authors(authors, options = {}) ⇒ Object

parse array of author strings into CSL format



331
332
333
# File 'lib/toccatore/base.rb', line 331

# Parse author strings into CSL format by delegating each one to
# get_one_author.
#
# @param authors [Array<String>, String, nil] one or more author strings
# @param options [Hash] passed through unchanged to get_one_author
# @return [Array] one parsed entry per author, empty when +authors+ is nil
def get_authors(authors, options = {})
  Array(authors).map do |raw_author|
    get_one_author(raw_author, options)
  end
end

#get_contributions(obj, items) ⇒ Object

we are flipping subj and obj for contributions



227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# File 'lib/toccatore/base.rb', line 227

# Build contribution messages linking ORCID identifiers to a work.
#
# Subject and object are flipped for contributions: the ORCID URL becomes
# "subj_id" and the work's pid becomes "obj_id".
#
# Each item is split on its first ":" and the remainder is validated with
# validate_orcid. Items without a valid ORCID are skipped; the remaining
# items are still processed.
#
# @param obj [Hash] the work; "DOI", "pid", "publisher_id" and "issued" are read
# @param items [Array<String>, nil] name identifiers such as "ORCID:0000-0002-1825-0097"
# @return [Array<Hash>] one contribution message per valid ORCID
def get_contributions(obj, items)
  prefix = obj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, sum|
    orcid = validate_orcid(item.split(':', 2).last)

    # skip only this entry; a `return` here would abandon all later items
    next if orcid.nil?

    sum << { prefix: prefix,
             message_type: "contribution",
             relation: { "subj_id" => orcid_as_url(orcid),
                         "obj_id" => obj["pid"],
                         "relation_type_id" => nil,
                         "source_id" => source_id,
                         "publisher_id" => obj["publisher_id"],
                         "registration_agency_id" => "datacite",
                         "occurred_at" => obj["issued"] },
             obj: obj }
  end
end

#get_data(options = {}) ⇒ Object



76
77
78
79
# File 'lib/toccatore/base.rb', line 76

# Fetch one page of results: builds the query URL with get_query_url and
# requests it through Maremma.get.
#
# @param options [Hash] used both to build the query URL and as the options
#   handed to Maremma.get
# @return [Object] whatever Maremma.get returns
def get_data(options = {})
  Maremma.get(get_query_url(options), options)
end

#get_doi_relations(subj, items) ⇒ Object



199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/toccatore/base.rb', line 199

# Build relation messages between a work and its related DOIs.
#
# Each item is split into relation type, identifier type and identifier on
# the first two ":" characters. The registration agency of the related DOI is
# looked up with get_doi_ra. When this agent's source_id is
# "datacite_crossref" and the related DOI is registered with "datacite", the
# item is skipped.
#
# @param subj [Hash] the work; "DOI", "pid", "publisher_id" and "issued" are read
# @param items [Array<String>, nil] entries such as "IsReferencedBy:DOI:10.1371/..."
# @return [Array<Hash>] one relation message per retained item
def get_doi_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    relation_type, _identifier_type, identifier = item.split(':', 3)
    doi = identifier.strip.upcase
    registration_agency = get_doi_ra(doi)

    next if source_id == "datacite_crossref" && registration_agency == "datacite"

    relation_source = if registration_agency == "crossref"
                        "datacite_crossref"
                      else
                        "datacite_related"
                      end
    related_pid = doi_as_url(doi)

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "obj_id" => related_pid,
                               "relation_type_id" => relation_type.underscore,
                               "source_id" => relation_source,
                               "publisher_id" => subj["publisher_id"],
                               "registration_agency_id" => registration_agency,
                               "occurred_at" => subj["issued"] },
                   subj: subj }
  end
end

#get_github_relations(subj, items) ⇒ Object



158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# File 'lib/toccatore/base.rb', line 158

# Build relation messages between a work and related GitHub URLs.
#
# For every item up to three messages are produced:
# 1. the relation between the work and the cleaned related identifier,
# 2. an "is_part_of" relation from the identifier to its repository, only
#    when the identifier differs from the repository URL,
# 3. a "contribution" message from the repository owner to the repository.
#
# @param subj [Hash] the work; "DOI", "pid", "publisher_id" and "issued" are read
# @param items [Array<String>, nil] entries such as "IsSupplementTo:URL:https://github.com/..."
# @return [Array<Hash>] the collected messages
def get_github_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    relation_type, _identifier_type, target = item.split(':', 3)

    # get parent repo
    # code from https://github.com/octokit/octokit.rb/blob/master/lib/octokit/repository.rb
    target = PostRank::URI.clean(target)
    repo_parts = github_from_url(target)
    owner_url = github_as_owner_url(repo_parts)
    repo_url = github_as_repo_url(repo_parts)

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "obj_id" => target,
                               "relation_type_id" => relation_type.underscore,
                               "source_id" => source_id,
                               "publisher_id" => subj["publisher_id"],
                               "registration_agency_id" => "github",
                               "occurred_at" => subj["issued"] },
                   subj: subj }

    # the related identifier is a release URL rather than the repo URL
    unless target == repo_url
      relations << { relation: { "subj_id" => target,
                                 "obj_id" => repo_url,
                                 "relation_type_id" => "is_part_of",
                                 "source_id" => source_id,
                                 "publisher_id" => "github",
                                 "registration_agency_id" => "github" } }
    end

    relations << { message_type: "contribution",
                   relation: { "subj_id" => owner_url,
                               "obj_id" => repo_url,
                               "source_id" => "github_contributor",
                               "registration_agency_id" => "github" } }
  end
end

#get_hashed_authors(authors) ⇒ Object

parse array of author hashes into CSL format



336
337
338
# File 'lib/toccatore/base.rb', line 336

# Parse author hashes into CSL format by delegating each one to
# get_one_hashed_author.
#
# @param authors [Array<Hash>, nil] author hashes
# @return [Array] one parsed entry per author, empty when +authors+ is nil
def get_hashed_authors(authors)
  Array(authors).map do |author_hash|
    get_one_hashed_author(author_hash)
  end
end

#get_name_identifier(author) ⇒ Object



348
349
350
351
352
353
354
355
356
# File 'lib/toccatore/base.rb', line 348

# Return the ORCID URL for an author hash, if it carries a valid ORCID.
#
# The scheme is read from "nameIdentifierScheme" (defaulting to "orcid" when
# the key is absent) and compared case-insensitively. Only the "orcid" scheme
# is handled; the identifier itself is checked with validate_orcid.
#
# @param author [Hash] author hash; "nameIdentifier" and "nameIdentifierScheme" are read
# @return [String, nil] "http://orcid.org/<id>", or nil for another scheme
#   or an invalid identifier
def get_name_identifier(author)
  raw_identifier = author.fetch("nameIdentifier", nil)
  scheme = author.fetch("nameIdentifierScheme", "orcid").downcase
  return nil unless scheme == "orcid"

  orcid = validate_orcid(raw_identifier)
  orcid ? "http://orcid.org/#{orcid}" : nil
end

#get_one_author(author, options = {}) ⇒ Object

Parse an author string into CSL format. Only assume a personal name when using sort order: “Turing, Alan”.



297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
# File 'lib/toccatore/base.rb', line 297

# Parse a single author string into CSL format.
#
# A nil or blank author yields an empty literal. This matters because
# get_one_hashed_author passes the result of fetch("creatorName", nil), which
# may be nil. Otherwise the string is normalized with cleanup_author and
# parsed with Namae. When Namae finds no name, or is_personal_name? does not
# recognize the string as a personal name, the cleaned string is returned as
# a literal.
#
# @param author [String, nil] raw author name
# @param options [Hash] currently unused
# @return [Hash] { "literal" => ... }, or a hash with "family" and/or "given"
def get_one_author(author, options = {})
  return { "literal" => "" } if author.nil? || author.strip.blank?

  author = cleanup_author(author)
  names = Namae.parse(author)

  if names.blank? || is_personal_name?(author).blank?
    { "literal" => author }
  else
    name = names.first

    # drop whichever of family/given Namae left unset
    { "family" => name.family,
      "given" => name.given }.compact
  end
end

#get_one_hashed_author(author) ⇒ Object



340
341
342
343
344
345
346
# File 'lib/toccatore/base.rb', line 340

# Parse a single author hash into CSL format.
#
# The "creatorName" value is parsed with get_one_author, and the result of
# get_name_identifier is attached under "ORCID". Keys with nil values are
# removed, so "ORCID" is absent when no identifier was found.
#
# @param author [Hash] author hash; "creatorName" is read here
# @return [Hash] the parsed author without nil values
def get_one_hashed_author(author)
  parsed = get_one_author(author.fetch("creatorName", nil))
  parsed.merge("ORCID" => get_name_identifier(author)).compact
end

#get_query_url(options = {}) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/toccatore/base.rb', line 23

# Build the search URL for one page of results.
#
# The filter query restricts results to records updated between
# options[:from_date] 00:00:00Z and options[:until_date] 23:59:59Z that have
# metadata and are active. The search query is options[:query] when present,
# otherwise the value returned by #query.
#
# @param options [Hash] :from_date, :until_date, :query, :offset and :rows are read
# @return [String] #url followed by the form-encoded query parameters
def get_query_url(options = {})
  date_range = "updated:[#{options[:from_date]}T00:00:00Z TO #{options[:until_date]}T23:59:59Z]"
  filter_query = "#{date_range} AND has_metadata:true AND is_active:true"
  search_query = options[:query].presence || query

  params = {
    q: search_query,
    start: options[:offset],
    rows: options[:rows],
    fl: "doi,creator,title,publisher,publicationYear,resourceTypeGeneral,datacentre_symbol,relatedIdentifier,nameIdentifier,xml,minted,updated",
    fq: filter_query,
    wt: "json"
  }

  url + URI.encode_www_form(params)
end


111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/toccatore/base.rb', line 111

# Turn search result documents into relation and contribution messages.
#
# For every document a subject hash is assembled from the document fields and
# the creators found in its Base64-encoded XML. Messages are then collected
# from:
# * related identifiers of type DOI (get_doi_relations),
# * related identifiers that are https://github.com URLs (get_github_relations),
# * ORCID name identifiers (get_contributions),
# * and, when source_id is "datacite_import", one message for the work itself.
#
# @param items [Array<Hash>, nil] result documents
# @return [Array<Hash>] all collected messages, in document order
def get_relations_with_related_works(items)
  Array(items).each_with_object([]) do |item, relations|
    doi = item.fetch("doi", nil)
    prefix = doi[/^10\.\d{4,5}/]
    pid = doi_as_url(doi)
    type = item.fetch("resourceTypeGeneral", nil)
    publisher_id = item.fetch("datacentre_symbol", nil)

    # the fallback decodes to "<hsh></hsh>", i.e. an empty XML document
    xml = Base64.decode64(item.fetch('xml', "PGhzaD48L2hzaD4=\n"))
    xml = Hash.from_xml(xml).fetch("resource", {})
    authors = xml.fetch("creators", {}).fetch("creator", [])
    # a single creator is parsed as a Hash rather than an Array of hashes
    authors = [authors] if authors.is_a?(Hash)

    subj = { "pid" => pid,
             "DOI" => doi,
             "author" => get_hashed_authors(authors),
             "title" => item.fetch("title", []).first,
             "container-title" => item.fetch("publisher", nil),
             "published" => item.fetch("publicationYear", nil),
             "issued" => item.fetch("minted", nil),
             "publisher_id" => publisher_id,
             "registration_agency_id" => "datacite",
             "tracked" => true,
             "type" => type }

    related_identifiers = item.fetch('relatedIdentifier', [])

    related_doi_identifiers = related_identifiers.select { |id| id =~ /:DOI:.+/ }
    relations.concat(get_doi_relations(subj, related_doi_identifiers))

    # the dot in "github.com" is escaped so it only matches a literal dot
    related_github_identifiers = related_identifiers.select { |id| id =~ %r{:URL:https://github\.com.+} }
    relations.concat(get_github_relations(subj, related_github_identifiers))

    name_identifiers = item.fetch('nameIdentifier', []).select { |id| id =~ /^ORCID:.+/ }
    relations.concat(get_contributions(subj, name_identifiers))

    if source_id == "datacite_import"
      relations << { prefix: prefix,
                     relation: { "subj_id" => subj["pid"],
                                 "source_id" => source_id,
                                 "publisher_id" => subj["publisher_id"],
                                 "occurred_at" => subj["issued"] },
                     subj: subj }
    end
  end
end

#get_total(options = {}) ⇒ Object



37
38
39
40
41
# File 'lib/toccatore/base.rb', line 37

# Ask the search API how many records match, without fetching any rows.
#
# The query URL is built with rows forced to 0, and "numFound" is read from
# the "data" -> "response" section of the response body.
#
# @param options [Hash] query options; also passed to Maremma.get
# @return [Integer] the reported count, or 0 when it is missing from the response
def get_total(options = {})
  count_url = get_query_url(options.merge(rows: 0))
  response = Maremma.get(count_url, options)

  data = response.body.fetch("data", {})
  data.fetch("response", {}).fetch("numFound", 0)
end

#is_personal_name?(author) ⇒ Boolean

Returns:

  • (Boolean)


323
324
325
326
327
328
# File 'lib/toccatore/base.rb', line 323

# Decide whether an author string looks like a personal name.
#
# A string containing a comma is always accepted. Otherwise the first
# whitespace-separated word is looked up with name_detector.name_exists?.
#
# @param author [String] author name
# @return [Boolean] true for comma-separated names, otherwise the lookup result
def is_personal_name?(author)
  if author.include?(",")
    true
  else
    # lookup given name
    given_name = author.split.first
    name_detector.name_exists?(given_name)
  end
end

#job_batch_size ⇒ Object



261
262
263
# File 'lib/toccatore/base.rb', line 261

# Default number of rows requested per page; queue_jobs falls back to this
# value when no :rows option is supplied.
#
# @return [Integer] 1000
def job_batch_size
  1_000
end

#name_detector ⇒ Object



358
359
360
# File 'lib/toccatore/base.rb', line 358

# Return the GenderDetector instance used to look up given names.
#
# The detector is created on first use and then reused, so that
# is_personal_name? does not build a new one for every author.
# NOTE(review): construction is presumably expensive (name dictionary load) —
# confirm against the gender_detector gem.
#
# @return [GenderDetector] the memoized detector
def name_detector
  @name_detector ||= GenderDetector.new
end

#orcid_as_url(orcid) ⇒ Object



287
288
289
# File 'lib/toccatore/base.rb', line 287

# Express an ORCID identifier as an http://orcid.org/ URL.
#
# @param orcid [String, nil] bare ORCID identifier
# @return [String, nil] the URL, or nil when +orcid+ is not present
def orcid_as_url(orcid)
  return nil unless orcid.present?

  "http://orcid.org/#{orcid}"
end

#orcid_from_url(url) ⇒ Object



283
284
285
# File 'lib/toccatore/base.rb', line 283

# Extract the identifier part from an orcid.org URL.
#
# Both the http:// and https:// forms are accepted. Everything after
# "orcid.org/" up to the end of the line is returned; the identifier is not
# validated here.
#
# @param url [String, nil] ORCID URL
# @return [String, nil] the part after "orcid.org/", or nil when +url+ is
#   not an orcid.org URL
def orcid_from_url(url)
  match = %r{\Ahttps?://orcid\.org/(.+)}.match(url)
  match && match[1]
end

#parse_data(result, options = {}) ⇒ Object



81
82
83
84
85
86
# File 'lib/toccatore/base.rb', line 81

# Convert a search response into relation messages.
#
# When the response body carries errors, those are returned unchanged.
# Otherwise the documents under "data" -> "response" -> "docs" of the body
# are handed to get_relations_with_related_works. The documents are read from
# result.body, matching how get_total reads the same response structure.
#
# @param result [#body] response whose body is a Hash
# @param options [Hash] currently unused
# @return [Object] the body's "errors" value when present, otherwise the
#   array returned by get_relations_with_related_works
def parse_data(result, options={})
  body = result.body

  errors = body.fetch("errors", nil)
  return errors if errors.present?

  items = body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
  get_relations_with_related_works(items)
end

#process_data(options = {}) ⇒ Object



67
68
69
70
71
72
73
74
# File 'lib/toccatore/base.rb', line 67

# Fetch one page of results, convert it into messages and push them.
#
# The request options are extended with this agent's timeout and source_id.
# When parsing yields nothing, a single placeholder response with an empty
# "data" array is returned instead of calling push_data.
#
# @param options [Hash] query options, passed on to get_data, parse_data and push_data
# @return [Array] the result of push_data, or one OpenStruct whose body is
#   { "data" => [] } when there is nothing to push
def process_data(options = {})
  response = get_data(options.merge(timeout: timeout, source_id: source_id))
  messages = parse_data(response, options.merge(source_id: source_id))

  if messages.empty?
    [OpenStruct.new(body: { "data" => [] })]
  else
    push_data(messages, options)
  end
end

#push_data(items, options = {}) ⇒ Object

push to Lagotto deposit API if no error and we have collected works



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/toccatore/base.rb', line 89

# Push collected messages to the deposit API, one POST per message.
#
# When +items+ is empty, a notice naming the date range is printed and
# nothing is posted. Otherwise each item is wrapped into a "deposit" payload
# and posted as JSON to push_url with access_token.
#
# @param items [Array<Hash>] messages with :relation and optional :subj,
#   :obj, :message_type and :prefix entries
# @param options [Hash] :from_date and :until_date are used in the notice
# @return [Array, nil] the Maremma.post results, or nil when nothing was pushed
def push_data(items, options = {})
  if items.empty?
    puts "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
    return
  end

  Array(items).map do |item|
    relation = item.fetch(:relation, {})

    payload = {
      "subj_id" => relation.fetch("subj_id", nil),
      "obj_id" => relation.fetch("obj_id", nil),
      "relation_type_id" => relation.fetch("relation_type_id", nil),
      "source_id" => relation.fetch("source_id", nil),
      "publisher_id" => relation.fetch("publisher_id", nil),
      "subj" => item.fetch(:subj, {}),
      "obj" => item.fetch(:obj, {}),
      "message_type" => item.fetch(:message_type, "relation"),
      "prefix" => item.fetch(:prefix, nil),
      "source_token" => uuid
    }
    deposit = { "deposit" => payload }

    Maremma.post push_url, data: deposit.to_json, content_type: 'json', token: access_token
  end
end

#queue_jobs(options = {}) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/toccatore/base.rb', line 43

# Walk through all matching records page by page and process each page.
#
# Missing options are filled in on the given hash: :rows defaults to
# job_batch_size, :from_date to yesterday and :until_date to today. The page
# size used for pagination is the effective :rows value, so a caller-supplied
# :rows smaller than job_batch_size no longer causes records to be skipped.
# Pagination always starts at offset 0.
#
# @param options [Hash] :offset, :rows, :from_date, :until_date and any
#   option understood by get_total / process_data
# @return [Integer] the total number of matching records
def queue_jobs(options={})
  options[:offset] = options[:offset].to_i

  # fall back to the default batch size for missing or non-positive values
  rows = options[:rows].to_i
  options[:rows] = rows > 0 ? rows : job_batch_size

  options[:from_date] = options[:from_date].presence || (Time.now.to_date - 1.day).iso8601
  options[:until_date] = options[:until_date].presence || Time.now.to_date.iso8601

  total = get_total(options)

  if total > 0
    # walk through paginated results, one page of options[:rows] at a time
    page_size = options[:rows]
    total_pages = (total.to_f / page_size).ceil

    total_pages.times do |page|
      options[:offset] = page * page_size
      process_data(options)
    end
  else
    puts "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
  end

  # return number of works queued
  total
end

#timeout ⇒ Object



257
258
259
# File 'lib/toccatore/base.rb', line 257

# Timeout value that process_data adds to the request options handed to
# get_data.
# NOTE(review): presumably seconds — confirm against Maremma.
#
# @return [Integer] 120
def timeout
  120
end

#url ⇒ Object



253
254
255
# File 'lib/toccatore/base.rb', line 253

# Base URL of the search API; get_query_url appends the form-encoded query
# parameters directly after the trailing "?".
#
# @return [String] the base URL
def url
  'https://search.datacite.org/api?'
end

#validate_orcid(orcid) ⇒ Object



291
292
293
# File 'lib/toccatore/base.rb', line 291

# Validate an ORCID identifier and return it in bare form.
#
# Accepts the bare identifier or the identifier prefixed with
# http://orcid.org/ or https://orcid.org/. The identifier must consist of
# exactly four dash-separated groups of four characters: digits only, except
# that the very last character may also be "X". Longer strings are rejected.
#
# @param orcid [String, nil] candidate identifier or ORCID URL
# @return [String, nil] the bare identifier, or nil when +orcid+ is invalid
def validate_orcid(orcid)
  match = %r{\A(?:https?://orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z}.match(orcid)
  match && match[1]
end