Class: Toccatore::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/toccatore/base.rb

Direct Known Subclasses

OrcidUpdate

Instance Method Summary collapse

Instance Method Details

#clean_doi(doi) ⇒ Object

remove non-printing whitespace



261
262
263
# File 'lib/toccatore/base.rb', line 261

# Strips zero-width space characters (U+200B) from a DOI string.
#
# @param doi [String] DOI that may contain invisible zero-width spaces
# @return [String] a copy of the DOI without any U+200B characters
def clean_doi(doi)
  doi.delete("\u200B")
end

#cleanup_author(author) ⇒ Object



308
309
310
311
312
313
314
315
316
# File 'lib/toccatore/base.rb', line 308

# Normalizes a raw author string before it is parsed into name parts.
#
# When the string contains no comma and ends in one or two initials
# ("Smith J.", "Smith J.K.", "Smith J.-P."), a comma is inserted in front of
# the initials ("Smith, J."). The result is then titleized and every
# character matched by [[:space:]] is replaced with a plain ASCII space.
#
# @param author [String] raw author name
# @return [String] normalized author name
def cleanup_author(author)
  unless author.include?(",")
    # "Smith J." becomes "Smith, J."; names that already have a comma are left alone
    author = author.gsub(/[[:space:]]([A-Z]\.)?(-?[A-Z]\.)$/, ', \1\2')
  end

  # my_titleize is not defined in this file (presumably a project String extension)
  titleized = author.my_titleize

  # collapse non-standard space characters into regular spaces
  titleized.gsub(/[[:space:]]/, ' ')
end

#config_fields ⇒ Object



244
245
246
# File 'lib/toccatore/base.rb', line 244

# Names of the configuration fields used by this agent.
#
# @return [Array<Symbol>] :url, :push_url and :access_token, in that order
def config_fields
  %i[url push_url access_token]
end

#doi_as_url(doi) ⇒ Object



274
275
276
# File 'lib/toccatore/base.rb', line 274

# Builds the encoded https://doi.org/ URL for a DOI.
#
# The DOI is passed through clean_doi before being interpolated into the URL.
#
# @param doi [String, nil] the DOI
# @return [String, nil] encoded URL, or nil when the DOI is not present
def doi_as_url(doi)
  return unless doi.present?

  Addressable::URI.encode("https://doi.org/#{clean_doi(doi)}")
end

#doi_from_url(url) ⇒ Object



265
266
267
268
269
270
271
272
# File 'lib/toccatore/base.rb', line 265

# Extracts an upper-cased DOI from a doi.org URL or a "doi:" prefixed string.
#
# Recognised inputs:
# * http(s)://doi.org/... and http(s)://dx.doi.org/... URLs - the DOI is the
#   URL path without its leading slash
# * strings starting with "doi:" - the DOI is everything after the prefix
#
# @param url [String, nil] DOI URL or "doi:" string
# @return [String, nil] the upper-cased DOI, or nil when url is nil or has
#   neither of the recognised forms
def doi_from_url(url)
  # a missing URL is simply "no DOI" instead of a NoMethodError further down
  return if url.nil?

  if %r{(http|https)://(dx\.)?doi\.org/(\w+)}.match(url)
    uri = Addressable::URI.parse(url)
    uri.path[1..-1].upcase
  elsif url.start_with?("doi:")
    url[4..-1].upcase
  end
end

#get_authors(authors, options = {}) ⇒ Object

parse array of author strings into CSL format



326
327
328
# File 'lib/toccatore/base.rb', line 326

# Converts author strings into author hashes via get_one_author.
#
# @param authors [Array<String>, String, nil] one or more author strings;
#   wrapped with Array(), so nil yields an empty result
# @param options [Hash] passed through unchanged to get_one_author
# @return [Array<Hash>] one hash per author, in input order
def get_authors(authors, options={})
  author_list = Array(authors)

  author_list.map do |raw_author|
    get_one_author(raw_author, options)
  end
end

#get_contributions(obj, items) ⇒ Object

we are flipping subj and obj for contributions



222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/toccatore/base.rb', line 222

# Builds "contribution" messages linking ORCID identifiers to a work.
#
# Subject and object are flipped for contributions: the ORCID URL becomes
# the subj_id and the work becomes the obj_id.
#
# Each item is split on the first ":" and the remainder is validated with
# validate_orcid. Items that do not yield a valid ORCID are skipped; they
# do not stop the remaining items from being processed.
#
# @param obj [Hash] work metadata; reads "DOI", "pid", "publisher_id" and "issued"
# @param items [Array<String>, nil] name identifiers such as "ORCID:0000-0002-1825-0097"
# @return [Array<Hash>] one message per valid ORCID, in input order
def get_contributions(obj, items)
  prefix = obj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, contributions|
    orcid = validate_orcid(item.split(':', 2).last)

    # `next`, not `return`: a `return` here would leave the whole method and
    # silently drop every identifier that follows an invalid one
    next if orcid.nil?

    contributions << { prefix: prefix,
                       message_type: "contribution",
                       relation: { "subj_id" => orcid_as_url(orcid),
                                   "obj_id" => obj["pid"],
                                   "relation_type_id" => nil,
                                   "source_id" => source_id,
                                   "publisher_id" => obj["publisher_id"],
                                   "registration_agency_id" => "datacite",
                                   "occurred_at" => obj["issued"] },
                       obj: obj }
  end
end

#get_data(options = {}) ⇒ Object



73
74
75
76
# File 'lib/toccatore/base.rb', line 73

# Fetches one page of results from the search API.
#
# The URL comes from get_query_url; the same options hash is also handed to
# Maremma.get.
#
# @param options [Hash] query and request options
# @return [Object] whatever Maremma.get returns for the query URL
def get_data(options={})
  Maremma.get(get_query_url(options), options)
end

#get_doi_relations(subj, items) ⇒ Object



194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/toccatore/base.rb', line 194

# Builds relation messages between a work and the DOIs it references.
#
# Each item is split into three ":"-separated parts: relation type,
# identifier type (ignored) and the related DOI. The registration agency of
# the related DOI is looked up with get_doi_ra. When this agent's source_id
# is "datacite_crossref" and the agency is "datacite", the item is skipped.
#
# @param subj [Hash] work metadata; reads "DOI", "pid", "publisher_id" and "issued"
# @param items [Array<String>, nil] related identifiers such as "IsPartOf:DOI:10.1234/abc"
# @return [Array<Hash>] relation messages, in input order
def get_doi_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    relation_type, _identifier_type, identifier = item.split(':', 3)
    doi = identifier.strip.upcase
    agency = get_doi_ra(doi)

    next if source_id == "datacite_crossref" && agency == "datacite"

    # the source recorded on the relation depends on the related DOI's agency
    relation_source = agency == "crossref" ? "datacite_crossref" : "datacite_related"
    related_pid = doi_as_url(doi)

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "obj_id" => related_pid,
                               "relation_type_id" => relation_type.underscore,
                               "source_id" => relation_source,
                               "publisher_id" => subj["publisher_id"],
                               "registration_agency_id" => agency,
                               "occurred_at" => subj["issued"] },
                   subj: subj }
  end
end

#get_github_relations(subj, items) ⇒ Object



153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/toccatore/base.rb', line 153

# Builds relation messages between a work and the GitHub URLs it references.
#
# For every item up to three messages are appended, in this order:
# 1. work -> cleaned GitHub URL, using the item's relation type
# 2. GitHub URL -> repository URL ("is_part_of"), only when the URL is not
#    the repository URL itself (e.g. a release URL)
# 3. owner URL -> repository URL, as a "contribution" message
#
# @param subj [Hash] work metadata; reads "DOI", "pid", "publisher_id" and "issued"
# @param items [Array<String>, nil] related identifiers, split on ":" into
#   relation type, identifier type (ignored) and URL
# @return [Array<Hash>] messages for all items, in input order
def get_github_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, messages|
    relation_type, _identifier_type, github_url = item.split(':', 3)

    # get parent repo
    # code from https://github.com/octokit/octokit.rb/blob/master/lib/octokit/repository.rb
    github_url = PostRank::URI.clean(github_url)
    github_parts = github_from_url(github_url)
    owner_url = github_as_owner_url(github_parts)
    repo_url = github_as_repo_url(github_parts)

    messages << { prefix: prefix,
                  relation: { "subj_id" => subj["pid"],
                              "obj_id" => github_url,
                              "relation_type_id" => relation_type.underscore,
                              "source_id" => source_id,
                              "publisher_id" => subj["publisher_id"],
                              "registration_agency_id" => "github",
                              "occurred_at" => subj["issued"] },
                  subj: subj }

    # if relatedIdentifier is release URL rather than repo URL
    unless github_url == repo_url
      messages << { relation: { "subj_id" => github_url,
                                "obj_id" => repo_url,
                                "relation_type_id" => "is_part_of",
                                "source_id" => source_id,
                                "publisher_id" => "github",
                                "registration_agency_id" => "github" } }
    end

    messages << { message_type: "contribution",
                  relation: { "subj_id" => owner_url,
                              "obj_id" => repo_url,
                              "source_id" => "github_contributor",
                              "registration_agency_id" => "github" } }
  end
end

#get_hashed_authors(authors) ⇒ Object

parse array of author hashes into CSL format



331
332
333
# File 'lib/toccatore/base.rb', line 331

# Converts author hashes into author hashes via get_one_hashed_author.
#
# @param authors [Array<Hash>, nil] author hashes; wrapped with Array(), so
#   nil yields an empty result
# @return [Array<Hash>] one converted hash per author, in input order
def get_hashed_authors(authors)
  author_list = Array(authors)

  author_list.map do |author_hash|
    get_one_hashed_author(author_hash)
  end
end

#get_name_identifier(author) ⇒ Object



343
344
345
346
347
348
349
350
351
# File 'lib/toccatore/base.rb', line 343

# Returns the ORCID URL for an author hash, if it carries a valid ORCID.
#
# The scheme is read from "nameIdentifierScheme" (defaulting to "orcid" when
# the key is absent) and compared case-insensitively. The identifier itself
# is only validated when the scheme is "orcid".
#
# @param author [Hash] author hash; reads "nameIdentifier" and "nameIdentifierScheme"
# @return [String, nil] "http://orcid.org/<id>", or nil when the scheme is
#   not ORCID or validate_orcid rejects the identifier
def get_name_identifier(author)
  raw_identifier = author.fetch("nameIdentifier", nil)
  scheme = author.fetch("nameIdentifierScheme", "orcid").downcase
  return unless scheme == "orcid"

  orcid = validate_orcid(raw_identifier)
  "http://orcid.org/#{orcid}" if orcid
end

#get_one_author(author, options = {}) ⇒ Object

parse author string into CSL format; only assume personal name when using sort-order: “Turing, Alan”



292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'lib/toccatore/base.rb', line 292

# Converts a single author string into an author hash.
#
# The string is normalized with cleanup_author and parsed with Namae. When
# parsing yields a name and is_personal_name? accepts the string, the result
# has "family"/"given" keys (nil values dropped); otherwise the cleaned
# string is returned as "literal".
#
# @param author [String] raw author name
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Hash] { "family" => ..., "given" => ... } or { "literal" => ... };
#   { "literal" => "" } for blank input
def get_one_author(author, options = {})
  return { "literal" => "" } if author.strip.blank?

  cleaned = cleanup_author(author)
  parsed_names = Namae.parse(cleaned)

  if parsed_names.present? && is_personal_name?(cleaned).present?
    person = parsed_names.first

    { "family" => person.family,
      "given" => person.given }.compact
  else
    { "literal" => cleaned }
  end
end

#get_one_hashed_author(author) ⇒ Object



335
336
337
338
339
340
341
# File 'lib/toccatore/base.rb', line 335

# Converts a single author hash into an author hash with an optional ORCID.
#
# The name comes from "creatorName" via get_one_author; the "ORCID" entry
# comes from get_name_identifier and is dropped when it is nil.
#
# @param author [Hash] author hash; reads "creatorName" here
# @return [Hash] author hash without nil values
def get_one_hashed_author(author)
  creator_name = author.fetch("creatorName", nil)

  get_one_author(creator_name)
    .tap { |csl_author| csl_author["ORCID"] = get_name_identifier(author) }
    .compact
end

#get_query_url(options = {}) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/toccatore/base.rb', line 23

# Builds the search query URL for one page of updated records.
#
# The filter query restricts results to records updated between from_date
# (00:00:00Z) and until_date (23:59:59Z) that have metadata and are active.
# `q`, `url` and `job_batch_size` are read from methods on this object; `q`
# is not defined in this part of the file.
#
# @param options [Hash]
# @option options [Integer, String, nil] :offset start offset; nil becomes 0
# @option options [Integer, nil] :rows page size; defaults to job_batch_size
# @option options [String, nil] :from_date ISO 8601 date; defaults to yesterday
# @option options [String, nil] :until_date ISO 8601 date; defaults to today
# @return [String] `url` followed by the form-encoded query parameters
def get_query_url(options={})
  # to_i already maps nil to 0, so no fallback is needed
  offset = options[:offset].to_i
  rows = options[:rows].presence || job_batch_size
  from_date = options[:from_date].presence || (Time.now.to_date - 1.day).iso8601
  until_date = options[:until_date].presence || Time.now.to_date.iso8601

  updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
  fq = "#{updated} AND has_metadata:true AND is_active:true"

  params = { q: q,
             start: offset,
             rows: rows,
             fl: "doi,creator,title,publisher,publicationYear,resourceTypeGeneral,datacentre_symbol,relatedIdentifier,nameIdentifier,xml,minted,updated",
             fq: fq,
             wt: "json" }
  url + URI.encode_www_form(params)
end


106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/toccatore/base.rb', line 106

# Turns search result documents into relation and contribution messages.
#
# For every document a subject hash is built from the document fields and
# the creators found in its Base64-encoded XML. Messages are then collected,
# in this order:
# 1. DOI relations for related identifiers containing ":DOI:"
# 2. GitHub relations for related identifiers containing ":URL:https://github.com"
# 3. contributions for name identifiers starting with "ORCID:"
# 4. one import message for the work itself when source_id is "datacite_import"
#
# @param items [Array<Hash>, nil] result documents; each must have a "doi"
# @return [Array<Hash>] all messages for all documents
def get_relations_with_related_works(items)
  Array(items).reduce([]) do |sum, item|
    doi = item.fetch("doi", nil)
    prefix = doi[/^10\.\d{4,5}/]
    pid = doi_as_url(doi)
    type = item.fetch("resourceTypeGeneral", nil)
    publisher_id = item.fetch("datacentre_symbol", nil)

    # the default is the Base64 encoding of "<hsh></hsh>", i.e. an empty document
    xml = Base64.decode64(item.fetch('xml', "PGhzaD48L2hzaD4=\n"))
    xml = Hash.from_xml(xml).fetch("resource", {})
    authors = xml.fetch("creators", {}).fetch("creator", [])
    # a single creator arrives as a Hash rather than an Array of hashes
    authors = [authors] if authors.is_a?(Hash)

    subj = { "pid" => pid,
             "DOI" => doi,
             "author" => get_hashed_authors(authors),
             "title" => item.fetch("title", []).first,
             "container-title" => item.fetch("publisher", nil),
             "published" => item.fetch("publicationYear", nil),
             "issued" => item.fetch("minted", nil),
             "publisher_id" => publisher_id,
             "registration_agency_id" => "datacite",
             "tracked" => true,
             "type" => type }

    related_identifiers = item.fetch('relatedIdentifier', [])

    related_doi_identifiers = related_identifiers.select { |id| id =~ /:DOI:.+/ }
    sum += get_doi_relations(subj, related_doi_identifiers)

    # the dot in "github.com" is escaped so only that literal host prefix matches
    related_github_identifiers = related_identifiers.select { |id| id =~ %r{:URL:https://github\.com.+} }
    sum += get_github_relations(subj, related_github_identifiers)

    name_identifiers = item.fetch('nameIdentifier', []).select { |id| id =~ /^ORCID:.+/ }
    sum += get_contributions(subj, name_identifiers)

    if source_id == "datacite_import"
      sum += [{ prefix: prefix,
                relation: { "subj_id" => subj["pid"],
                            "source_id" => source_id,
                            "publisher_id" => subj["publisher_id"],
                            "occurred_at" => subj["issued"] },
                subj: subj }]
    end

    sum
  end
end

#get_total(options = {}) ⇒ Object



41
42
43
44
45
# File 'lib/toccatore/base.rb', line 41

# Asks the search API how many records match the query.
#
# The query is issued with rows: 0, and the count is read from
# body["data"]["response"]["numFound"].
#
# @param options [Hash] query and request options
# @return [Integer] the "numFound" value, or 0 when any level of the
#   response is missing
def get_total(options={})
  count_url = get_query_url(options.merge(rows: 0))
  response = Maremma.get(count_url, options)

  data = response.body.fetch("data", {})
  data.fetch("response", {}).fetch("numFound", 0)
end

#is_personal_name?(author) ⇒ Boolean

Returns:

  • (Boolean)


318
319
320
321
322
323
# File 'lib/toccatore/base.rb', line 318

# Decides whether an author string should be treated as a personal name.
#
# A string containing a comma is always accepted. Otherwise the first
# whitespace-separated word is looked up as a given name via name_detector.
#
# @param author [String] author name
# @return [Boolean] true for a string with a comma, otherwise the result of
#   the given-name lookup
def is_personal_name?(author)
  author.include?(",") || name_detector.name_exists?(author.split.first)
end

#job_batch_size ⇒ Object



256
257
258
# File 'lib/toccatore/base.rb', line 256

# Default page size for queries and pagination.
#
# @return [Integer] 1000
def job_batch_size
  1_000
end

#name_detector ⇒ Object



353
354
355
# File 'lib/toccatore/base.rb', line 353

# Returns the detector used to look up given names.
#
# The detector is created on first use and then reused, so repeated lookups
# do not construct a new GenderDetector for every author.
#
# @return [GenderDetector] the memoized detector instance
def name_detector
  @name_detector ||= GenderDetector.new
end

#orcid_as_url(orcid) ⇒ Object



282
283
284
# File 'lib/toccatore/base.rb', line 282

# Builds the http://orcid.org/ URL for an ORCID identifier.
#
# @param orcid [String, nil] ORCID identifier
# @return [String, nil] the URL, or nil when the identifier is not present
def orcid_as_url(orcid)
  return unless orcid.present?

  "http://orcid.org/#{orcid}"
end

#orcid_from_url(url) ⇒ Object



278
279
280
# File 'lib/toccatore/base.rb', line 278

# Extracts the identifier part from an orcid.org URL.
#
# Both http://orcid.org/ and https://orcid.org/ prefixes are recognised.
# The remainder of the URL is returned as-is; it is not validated.
#
# @param url [String, nil] ORCID URL
# @return [String, nil] everything after the host, or nil when url is nil or
#   does not start with an orcid.org URL prefix
def orcid_from_url(url)
  match = %r{\Ahttps?://orcid\.org/(.+)}.match(url)
  match && match[1]
end

#parse_data(result, options = {}) ⇒ Object



78
79
80
81
82
83
# File 'lib/toccatore/base.rb', line 78

# Converts a search response into relation messages.
#
# Errors and documents are both read from the response body. When the body
# has a non-blank "errors" entry, it is returned as-is; otherwise the
# documents under body["data"]["response"]["docs"] are converted with
# get_relations_with_related_works.
#
# @param result [#body] response object whose body is a Hash
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Object] the "errors" value, or the array of relation messages
def parse_data(result, options={})
  body = result.body

  errors = body.fetch("errors", nil)
  return errors if errors.present?

  # read the documents from the body, the same place the errors come from
  items = body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
  get_relations_with_related_works(items)
end

#process_data(options = {}) ⇒ Object



64
65
66
67
68
69
70
71
# File 'lib/toccatore/base.rb', line 64

# Fetches, parses and pushes one page of results.
#
# The request is made with the given options plus this agent's timeout and
# source_id; parsing receives the options plus source_id. When parsing
# yields nothing, a single placeholder response with an empty "data" list is
# returned instead of pushing.
#
# @param options [Hash] query options (e.g. :offset)
# @return [Array] the result of push_data, or a one-element array holding an
#   OpenStruct with body { "data" => [] } when there is nothing to push
def process_data(options = {})
  raw = get_data(options.merge(timeout: timeout, source_id: source_id))
  relations = parse_data(raw, options.merge(source_id: source_id))

  if relations.empty?
    [OpenStruct.new(body: { "data" => [] })]
  else
    push_data(relations, options)
  end
end

#push_data(items, options = {}) ⇒ Object

push to Lagotto deposit API if no error and we have collected works



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/toccatore/base.rb', line 86

# Posts every collected item as a deposit to push_url.
#
# Only these relation fields are forwarded: subj_id, obj_id,
# relation_type_id, source_id and publisher_id. The deposit also carries the
# item's :subj, :obj, :message_type (default "relation") and :prefix, plus
# this agent's uuid as "source_token".
#
# @param items [Array<Hash>] messages with a :relation hash
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Array] one Maremma.post result per item; [] when items is empty
def push_data(items, options={})
  return [] if items.empty?

  relation_keys = %w[subj_id obj_id relation_type_id source_id publisher_id]

  Array(items).map do |item|
    relation = item.fetch(:relation, {})

    payload = relation_keys.each_with_object({}) do |key, fields|
      fields[key] = relation.fetch(key, nil)
    end
    payload["subj"] = item.fetch(:subj, {})
    payload["obj"] = item.fetch(:obj, {})
    payload["message_type"] = item.fetch(:message_type, "relation")
    payload["prefix"] = item.fetch(:prefix, nil)
    payload["source_token"] = uuid

    deposit = { "deposit" => payload }

    Maremma.post push_url, data: deposit.to_json, content_type: 'json', token: access_token
  end
end

#queue_jobs(options = {}) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/toccatore/base.rb', line 47

# Processes every page of results for the given query.
#
# The total comes from get_total; pages of job_batch_size records are then
# processed one after another. The passed-in options hash is modified:
# :offset is set for each page before process_data is called.
#
# @param options [Hash] query options; :offset is overwritten per page
# @return [Integer] the total number of matching records
def queue_jobs(options={})
  total = get_total(options)
  return total unless total > 0

  # walk through paginated results
  page_count = (total.to_f / job_batch_size).ceil

  page_count.times do |page|
    options[:offset] = page * job_batch_size
    process_data(options)
  end

  # return number of works queued
  total
end

#timeout ⇒ Object



252
253
254
# File 'lib/toccatore/base.rb', line 252

# Timeout value merged into the request options by process_data
# (presumably seconds - confirm against Maremma).
#
# @return [Integer] 120
def timeout
  120
end

#url ⇒ Object



248
249
250
# File 'lib/toccatore/base.rb', line 248

# Base URL of the search API; query parameters are appended directly after
# the trailing "?".
#
# @return [String] the base URL
def url
  'https://search.datacite.org/api?'
end

#validate_orcid(orcid) ⇒ Object



286
287
288
# File 'lib/toccatore/base.rb', line 286

# Validates an ORCID identifier and returns it in bare form.
#
# Accepts the bare identifier or the identifier prefixed with
# http://orcid.org/ or https://orcid.org/. The identifier must be exactly
# four hyphen-separated groups of four characters: digits throughout, except
# that the final character may be "X". No checksum verification is done.
#
# @param orcid [String, nil] bare ORCID or ORCID URL
# @return [String, nil] the bare identifier, or nil when the input is nil or
#   does not have the expected form
def validate_orcid(orcid)
  match = %r{\A(?:https?://orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z}.match(orcid)
  match && match[1]
end