Class: Toccatore::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/toccatore/base.rb

Direct Known Subclasses

DataciteRelated, OrcidUpdate

Instance Method Summary collapse

Instance Method Details

#cleanup_author(author) ⇒ Object



174
175
176
177
178
179
180
181
182
# File 'lib/toccatore/base.rb', line 174

# Normalize a raw author string before it is parsed into name parts.
#
# @param author [String] raw author name
# @return [String] the titleized name (via String#my_titleize, defined outside
#   this method) with every [[:space:]] character replaced by a plain space
def cleanup_author(author)
  # "Smith J." (trailing initials, no comma) becomes "Smith, J.";
  # a name that already contains a comma, e.g. "Smith, John K.", is left as is
  unless author.include?(",")
    author = author.gsub(/[[:space:]]([A-Z]\.)?(-?[A-Z]\.)$/, ', \1\2')
  end

  # titleize first, then swap non-standard space characters for plain spaces
  titleized = author.my_titleize
  titleized.gsub(/[[:space:]]/, ' ')
end

#get_authors(authors, options = {}) ⇒ Object

parse array of author strings into CSL format



192
193
194
# File 'lib/toccatore/base.rb', line 192

# Parse an array of author strings into CSL format.
#
# @param authors [Array<String>, String, nil] coerced with Array()
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Array<Hash>] one get_one_author result per entry
def get_authors(authors, options={})
  Array(authors).map do |name|
    get_one_author(name)
  end
end

#get_data(options = {}) ⇒ Object



87
88
89
90
# File 'lib/toccatore/base.rb', line 87

# Fetch one page of results for the query described by options.
#
# @param options [Hash] used to build the query URL and also passed
#   through to Maremma.get
# @return [Object] whatever Maremma.get returns
def get_data(options={})
  Maremma.get(get_query_url(options), options)
end

#get_doi_ra(prefix) ⇒ Object



114
115
116
117
118
119
120
121
122
123
# File 'lib/toccatore/base.rb', line 114

# Look up the "registration-agency" attribute of a DOI prefix via the
# DataCite prefixes API.
#
# @param prefix [String, nil] DOI prefix
# @return [Object, nil] nil for a blank prefix; the "errors" payload when the
#   response body contains one; otherwise the "registration-agency" attribute
#   (nil when absent)
def get_doi_ra(prefix)
  return nil if prefix.blank?

  body = Maremma.get("https://api.datacite.org/prefixes/#{prefix}").body

  errors = body.fetch("errors", nil)
  return errors if errors.present?

  attributes = body.fetch("data", {}).fetch('attributes', {})
  attributes.fetch('registration-agency', nil)
end

#get_hashed_authors(authors) ⇒ Object

parse array of author hashes into CSL format



197
198
199
# File 'lib/toccatore/base.rb', line 197

# Parse an array of author hashes into CSL format.
#
# @param authors [Array<Hash>, nil] coerced with Array()
# @return [Array<Hash>] one get_one_hashed_author result per entry
def get_hashed_authors(authors)
  Array(authors).map do |entry|
    get_one_hashed_author(entry)
  end
end

#get_name_identifier(author) ⇒ Object



209
210
211
212
213
214
215
216
217
# File 'lib/toccatore/base.rb', line 209

# Build an ORCID URL from an author hash's name identifier.
#
# @param author [Hash] reads "nameIdentifier" and "nameIdentifierScheme"
# @return [String, nil] "http://orcid.org/<id>" when the scheme is ORCID
#   (case-insensitive) and validate_orcid accepts the identifier, otherwise nil
def get_name_identifier(author)
  # A scheme that is missing OR explicitly nil falls back to "orcid";
  # previously a nil value raised NoMethodError on nil.downcase.
  scheme = (author.fetch("nameIdentifierScheme", nil) || "orcid").downcase
  return nil unless scheme == "orcid"

  orcid = validate_orcid(author.fetch("nameIdentifier", nil))
  "http://orcid.org/#{orcid}" if orcid
end

#get_one_author(author) ⇒ Object

Parse an author string into CSL format. Only assume a personal name when using sort-order: “Turing, Alan”.



158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/toccatore/base.rb', line 158

# Parse a single author string into a CSL name hash.
#
# @param author [String, nil] raw author name
# @return [Hash] { "literal" => "" } for a nil or blank name;
#   { "literal" => name } when Namae finds no name or is_personal_name?
#   returns a blank value; otherwise "family"/"given" from the first parsed
#   name, with nil parts dropped
def get_one_author(author)
  # to_s guards against nil, e.g. a creator hash without "creatorName";
  # previously nil.strip raised NoMethodError here
  author = author.to_s
  return { "literal" => "" } if author.strip.blank?

  author = cleanup_author(author)
  names = Namae.parse(author)

  if names.blank? || is_personal_name?(author).blank?
    { "literal" => author }
  else
    name = names.first

    { "family" => name.family,
      "given" => name.given }.compact
  end
end

#get_one_hashed_author(author) ⇒ Object



201
202
203
204
205
206
207
# File 'lib/toccatore/base.rb', line 201

# Parse a single author hash into a CSL name hash, adding an "ORCID" entry
# when get_name_identifier returns one.
#
# @param author [Hash] reads "creatorName"; the whole hash is handed to
#   get_name_identifier
# @return [Hash] CSL name hash with nil values removed
def get_one_hashed_author(author)
  csl = get_one_author(author.fetch("creatorName", nil))
  csl.tap { |hsh| hsh["ORCID"] = get_name_identifier(author) }.compact
end

#get_query_url(options = {}) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/toccatore/base.rb', line 23

# Build the search URL for the given options.
#
# The q parameter is chosen from the first option present, in this order:
# :doi, :orcid, :related_identifier, :query; otherwise the value of `query`
# (defined outside this method) is used.
#
# @param options [Hash] reads :from_date, :until_date, :doi, :orcid,
#   :related_identifier, :query, :offset and :rows
# @return [String] `url` followed by the form-encoded query parameters
def get_query_url(options={})
  date_range = "updated:[#{options[:from_date]}T00:00:00Z TO #{options[:until_date]}T23:59:59Z]"
  filter = "#{date_range} AND has_metadata:true AND is_active:true"

  search = if options[:doi].present?
             "doi:#{options[:doi]}"
           elsif options[:orcid].present?
             "nameIdentifier:ORCID\\:#{options[:orcid]}"
           elsif options[:related_identifier].present?
             "relatedIdentifier:DOI\\:#{options[:related_identifier]}"
           elsif options[:query].present?
             options[:query]
           else
             query
           end

  url + URI.encode_www_form(q: search,
                            start: options[:offset],
                            rows: options[:rows],
                            fl: "doi,resourceTypeGeneral,relatedIdentifier,nameIdentifier,minted,updated",
                            fq: filter,
                            wt: "json")
end

#get_total(options = {}) ⇒ Object



48
49
50
51
52
# File 'lib/toccatore/base.rb', line 48

# Ask the search API how many records match, without fetching any rows.
#
# @param options [Hash] query options; :rows is overridden with 0 for the URL
# @return [Object] "numFound" from body["data"]["response"], or 0 when any
#   of those keys is missing
def get_total(options={})
  count_url = get_query_url(options.merge(rows: 0))
  body = Maremma.get(count_url, options).body
  body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
end

#is_personal_name?(author) ⇒ Boolean



184
185
186
187
188
189
# File 'lib/toccatore/base.rb', line 184

# Decide whether an author string looks like a personal name.
#
# @param author [String] author name
# @return [Object] true when the string contains a comma; otherwise the
#   result of looking up the first whitespace-separated word with
#   name_detector.name_exists?
def is_personal_name?(author)
  # a comma signals sort order ("Turing, Alan"); otherwise look up the given name
  author.include?(",") || name_detector.name_exists?(author.split.first)
end

#job_batch_size ⇒ Object



110
111
112
# File 'lib/toccatore/base.rb', line 110

# Default number of rows requested per page; queue_jobs uses it as the
# fallback for options[:rows] and as the pagination step.
#
# @return [Integer]
def job_batch_size
  1_000
end

#name_detector ⇒ Object



219
220
221
# File 'lib/toccatore/base.rb', line 219

# Detector used by is_personal_name? to look up given names.
#
# Memoized so that one detector is built per object instead of one per
# author lookup.
# NOTE(review): presumably GenderDetector.new is costly because it loads its
# name data — confirm against the gender_detector gem.
#
# @return [GenderDetector]
def name_detector
  @name_detector ||= GenderDetector.new
end

#normalize_doi(doi) ⇒ Object



133
134
135
136
137
138
139
140
141
142
# File 'lib/toccatore/base.rb', line 133

# Turn a DOI (bare, "doi:"-prefixed or as a doi.org URL) into a normalized
# https://doi.org/ URL.
#
# @param doi [String, nil] DOI in any form validate_doi accepts
# @return [String, nil] normalized URL, or nil when validate_doi rejects the input
def normalize_doi(doi)
  valid = validate_doi(doi)
  return nil unless valid.present?

  # strip zero-width spaces (U+200B) and lowercase
  cleaned = valid.delete("\u200B").downcase

  # build the URL, escaping unsafe characters
  "https://doi.org/" + Addressable::URI.encode(cleaned)
end

#orcid_as_url(orcid) ⇒ Object



148
149
150
# File 'lib/toccatore/base.rb', line 148

# Turn a bare ORCID identifier into an http://orcid.org/ URL.
#
# @param orcid [String, nil] ORCID identifier
# @return [String, nil] the URL, or nil when orcid is not present
def orcid_as_url(orcid)
  return nil unless orcid.present?

  "http://orcid.org/#{orcid}"
end

#orcid_from_url(url) ⇒ Object



144
145
146
# File 'lib/toccatore/base.rb', line 144

# Extract the identifier part from an orcid.org URL.
#
# Both http:// and https:// URLs are accepted (previously only http://
# matched, so https URLs returned nil).
#
# @param url [String, nil] ORCID URL
# @return [String, nil] everything after "orcid.org/", or nil when url does
#   not start with an orcid.org URL prefix
def orcid_from_url(url)
  # Array(nil) is [], so a non-match yields nil; otherwise .last is the capture
  Array(%r{\Ahttps?://orcid\.org/(.+)}.match(url)).last
end

#process_data(options = {}) ⇒ Object



78
79
80
81
82
83
84
85
# File 'lib/toccatore/base.rb', line 78

# Fetch, parse and push one page of results.
#
# @param options [Hash] query/push options; :timeout and :source_id are
#   added for the fetch only
# @return [Object] a one-element array holding an OpenStruct whose body is
#   { "data" => [] } when parsing yields nothing, otherwise the result of push_data
def process_data(options = {})
  response = get_data(options.merge(timeout: timeout, source_id: source_id))
  items = parse_data(response, options)

  if items.empty?
    [OpenStruct.new(body: { "data" => [] })]
  else
    push_data(items, options)
  end
end

#push_data(items, options = {}) ⇒ Object



92
93
94
95
96
97
98
99
100
# File 'lib/toccatore/base.rb', line 92

# Push each item with push_item, after checking that there is something to
# push and that an access token was supplied.
#
# @param items [Array, Object, nil] items to push; coerced with Array()
# @param options [Hash] reads :access_token, :from_date and :until_date;
#   passed through to push_item
# @return [Array, nil] the item array after pushing, or nil when a message
#   was printed instead
def push_data(items, options={})
  # coerce before the emptiness check so nil (or a single item) does not
  # raise on items.empty?
  items = Array(items)

  if items.empty?
    puts "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
  elsif options[:access_token].blank?
    puts "An error occured: Access token missing."
  else
    items.each { |item| push_item(item, options) }
  end
end

#queue_jobs(options = {}) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/toccatore/base.rb', line 54

# Walk through all result pages for a date range and process each page.
#
# Fills in defaults for :offset, :rows, :from_date and :until_date on the
# options hash that was passed in (the hash is modified in place).
#
# @param options [Hash] reads/writes :offset, :rows, :from_date, :until_date;
#   passed on to get_total and process_data
# @return [Object] the total reported by get_total
def queue_jobs(options={})
  options[:offset] = options[:offset].to_i

  # Page size: honour a caller-supplied :rows, fall back to job_batch_size.
  # Pagination below must step by the same size that each request asks for,
  # otherwise records between pages are skipped when :rows != job_batch_size.
  rows = options[:rows].to_i
  rows = job_batch_size unless rows > 0
  options[:rows] = rows

  options[:from_date] = options[:from_date].presence || (Time.now.to_date - 1.day).iso8601
  options[:until_date] = options[:until_date].presence || Time.now.to_date.iso8601

  total = get_total(options)

  if total > 0
    # walk through paginated results; the offset restarts at 0 for the first page
    total_pages = (total.to_f / rows).ceil

    (0...total_pages).each do |page|
      options[:offset] = page * rows
      process_data(options)
    end
  else
    puts "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
  end

  # return number of works queued
  total
end

#timeout ⇒ Object



106
107
108
# File 'lib/toccatore/base.rb', line 106

# Value passed as the :timeout option when process_data fetches data.
# NOTE(review): presumably seconds — confirm against Maremma.
#
# @return [Integer]
def timeout
  120
end

#unfreeze(hsh) ⇒ Object



223
224
225
226
227
# File 'lib/toccatore/base.rb', line 223

# Copy a hash-like object into a new Hash whose keys are downcased symbols.
# When two keys collide after downcasing, the later pair wins.
#
# @param hsh [#each_pair] source pairs; keys must respond to downcase
# @return [Hash{Symbol => Object}] new hash; values are not copied
def unfreeze(hsh)
  hsh.each_pair.each_with_object({}) do |(key, value), result|
    result[key.downcase.to_sym] = value
  end
end

#url ⇒ Object



102
103
104
# File 'lib/toccatore/base.rb', line 102

# Base URL of the search API. It ends with "?" so that get_query_url can
# append the form-encoded parameters directly.
#
# @return [String]
def url
  "https://search.datacite.org/api?"
end

#validate_doi(doi) ⇒ Object



125
126
127
# File 'lib/toccatore/base.rb', line 125

# Extract the DOI from a bare DOI, a "doi:"-prefixed DOI or a
# (dx.)doi.org URL.
#
# The dot in "doi.org" is now escaped; previously it matched any character,
# so hosts such as "doiXorg" were accepted.
#
# @param doi [String, nil] candidate DOI
# @return [String, nil] the "10.<4-5 digits>/<suffix>" part, or nil when the
#   input does not match
def validate_doi(doi)
  # Array(nil) is [], so a non-match yields nil; otherwise .last is the DOI capture
  Array(%r{\A(?:(http|https)://(dx\.)?doi\.org/)?(doi:)?(10\.\d{4,5}/.+)\z}.match(doi)).last
end

#validate_orcid(orcid) ⇒ Object



152
153
154
# File 'lib/toccatore/base.rb', line 152

# Extract the ORCID identifier from a bare identifier or an orcid.org URL.
#
# Both http:// and https:// URL prefixes are accepted (previously only
# http:// matched).
# NOTE(review): the trailing `[0-9X]+` accepts more than one final character,
# so over-long identifiers pass — kept as is for compatibility; confirm
# whether it should be a single character.
#
# @param orcid [String, nil] candidate identifier or URL
# @return [String, nil] the dash-separated identifier, or nil when the input
#   does not match
def validate_orcid(orcid)
  # Array(nil) is [], so a non-match yields nil; otherwise .last is the capture
  Array(%r{\A(?:https?://orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z}.match(orcid)).last
end

#validate_prefix(doi) ⇒ Object



129
130
131
# File 'lib/toccatore/base.rb', line 129

# Extract the prefix ("10.<4-5 digits>") from a bare DOI, a "doi:"-prefixed
# DOI or a (dx.)doi.org URL.
#
# The dot in "doi.org" is now escaped; previously it matched any character,
# so hosts such as "doiXorg" were accepted.
#
# @param doi [String, nil] candidate DOI
# @return [String, nil] the prefix, or nil when the input does not match
def validate_prefix(doi)
  # Array(nil) is [], so a non-match yields nil; otherwise .last is the prefix capture
  Array(%r{\A(?:(http|https)://(dx\.)?doi\.org/)?(doi:)?(10\.\d{4,5})/.+\z}.match(doi)).last
end