Class: Toccatore::Base
- Inherits:
-
Object
show all
- Defined in:
- lib/toccatore/base.rb
Instance Method Summary
collapse
Instance Method Details
#cleanup_author(author) ⇒ Object
174
175
176
177
178
179
180
181
182
|
# File 'lib/toccatore/base.rb', line 174
# Normalizes a raw author string before it is parsed into name parts.
#
# When the string contains no comma, a trailing block of one or two dotted
# capital initials (the second optionally hyphenated, e.g. "Smith J." or
# "Smith J.-P.") has the whitespace in front of it replaced by ", ", giving
# sort-order form ("Smith, J."). The result is then passed through
# String#my_titleize (defined outside this method) and every character
# matched by [[:space:]] is replaced with a plain space.
#
# @param author [String] raw author name
# @return [String] cleaned-up author name
def cleanup_author(author)
  unless author.include?(",")
    # "Smith J." style input: move the initials behind a comma
    author = author.gsub(/[[:space:]]([A-Z]\.)?(-?[A-Z]\.)$/, ', \1\2')
  end

  titleized = author.my_titleize
  titleized.gsub(/[[:space:]]/, ' ')
end
|
#get_authors(authors, options = {}) ⇒ Object
parse array of author strings into CSL format
192
193
194
|
# File 'lib/toccatore/base.rb', line 192
# Parses a collection of author strings into CSL-style author hashes.
#
# The input is wrapped with Array(), so nil yields an empty array and a
# single string is treated as a one-element list.
#
# @param authors [Array<String>, String, nil] author name strings
# @param options [Hash] accepted by the signature but not used in this method
# @return [Array<Hash>] one #get_one_author result per input entry
def get_authors(authors, options={})
  author_list = Array(authors)
  author_list.map do |entry|
    get_one_author(entry)
  end
end
|
#get_data(options = {}) ⇒ Object
87
88
89
90
|
# File 'lib/toccatore/base.rb', line 87
# Fetches search results for the given options.
#
# The request URL is built by #get_query_url; the same options hash is then
# handed to Maremma.get as its second argument.
#
# @param options [Hash] query options (see #get_query_url)
# @return [Object] whatever Maremma.get returns
def get_data(options={})
  Maremma.get(get_query_url(options), options)
end
|
#get_doi_ra(prefix) ⇒ Object
114
115
116
117
118
119
120
121
122
123
|
# File 'lib/toccatore/base.rb', line 114
# Looks up a DOI prefix via the DataCite prefixes API and returns its
# registration agency.
#
# @param prefix [String, nil] DOI prefix, interpolated into the request URL
# @return [Object, nil] nil for a blank prefix; the "errors" entry of the
#   response body when that entry is present; otherwise the value stored
#   under data -> attributes -> registration-agency (nil when absent)
def get_doi_ra(prefix)
  return nil if prefix.blank?

  response = Maremma.get("https://api.datacite.org/prefixes/#{prefix}")
  payload = response.body

  errors = payload.fetch("errors", nil)
  return errors if errors.present?

  attributes = payload.fetch("data", {}).fetch('attributes', {})
  attributes.fetch('registration-agency', nil)
end
|
#get_hashed_authors(authors) ⇒ Object
parse array of author hashes into CSL format
197
198
199
|
# File 'lib/toccatore/base.rb', line 197
# Parses a collection of author hashes into CSL-style author hashes.
#
# The input is wrapped with Array(), so nil yields an empty array.
#
# @param authors [Array<Hash>, nil] author hashes
# @return [Array<Hash>] one #get_one_hashed_author result per input entry
def get_hashed_authors(authors)
  author_list = Array(authors)
  author_list.map do |entry|
    get_one_hashed_author(entry)
  end
end
|
#get_name_identifier(author) ⇒ Object
209
210
211
212
213
214
215
216
217
|
# File 'lib/toccatore/base.rb', line 209
# Builds an ORCID URL from the name identifier of an author hash.
#
# The scheme is read from "nameIdentifierScheme" and compared
# case-insensitively with "orcid". A missing key or a nil value both fall
# back to "orcid", so a nil scheme no longer raises on #downcase. The
# identifier itself must pass #validate_orcid.
#
# @param author [Hash] author hash with optional "nameIdentifier" and
#   "nameIdentifierScheme" keys
# @return [String, nil] "http://orcid.org/<id>" when the scheme is ORCID and
#   the identifier validates, otherwise nil
def get_name_identifier(author)
  scheme = (author.fetch("nameIdentifierScheme", nil) || "orcid").to_s.downcase
  return nil unless scheme == "orcid"

  orcid = validate_orcid(author.fetch("nameIdentifier", nil))
  orcid ? "http://orcid.org/#{orcid}" : nil
end
|
#get_one_author(author) ⇒ Object
Parse an author string into CSL format. Only assume a personal name when the string uses sort order, e.g. “Turing, Alan”.
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
# File 'lib/toccatore/base.rb', line 158
# Parses one author string into a CSL-style hash.
#
# A nil, empty or whitespace-only author yields { "literal" => "" }. nil is
# handled explicitly because #get_one_hashed_author passes the result of
# fetch("creatorName", nil) straight in. Otherwise the string is cleaned
# with #cleanup_author and parsed with Namae.parse. When parsing yields
# nothing, or #is_personal_name? is blank for the cleaned string, the
# cleaned string is returned as a literal name.
#
# @param author [String, nil] raw author name
# @return [Hash] { "literal" => ... } or a hash with "family" / "given"
#   keys (nil parts are dropped)
def get_one_author(author)
  return { "literal" => "" } if author.nil? || author.strip.blank?

  author = cleanup_author(author)
  names = Namae.parse(author)
  return { "literal" => author } if names.blank? || is_personal_name?(author).blank?

  name = names.first
  { "family" => name.family,
    "given" => name.given }.compact
end
|
#get_one_hashed_author(author) ⇒ Object
201
202
203
204
205
206
207
|
# File 'lib/toccatore/base.rb', line 201
# Converts one author hash into a CSL-style hash.
#
# The "creatorName" value is parsed by #get_one_author, and the result of
# #get_name_identifier is stored under "ORCID". Entries whose value is nil
# are dropped from the returned hash.
#
# @param author [Hash] author hash with an optional "creatorName" key
# @return [Hash] CSL-style author hash without nil values
def get_one_hashed_author(author)
  csl = get_one_author(author.fetch("creatorName", nil))
  csl.store("ORCID", get_name_identifier(author))
  csl.compact
end
|
#get_query_url(options = {}) ⇒ Object
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
# File 'lib/toccatore/base.rb', line 23
# Builds the full search URL for a set of query options.
#
# The search term is chosen from the first option that is present, in this
# order: :doi, :orcid, :related_identifier, :query. When none is present
# the value of #query (not defined in this method) is used. The filter
# query always restricts on the updated date range built from :from_date
# and :until_date, plus has_metadata:true and is_active:true.
#
# @param options [Hash] supports :doi, :orcid, :related_identifier, :query,
#   :offset, :rows, :from_date and :until_date
# @return [String] #url followed by the form-encoded parameters
def get_query_url(options={})
  date_range = "updated:[#{options[:from_date]}T00:00:00Z TO #{options[:until_date]}T23:59:59Z]"
  filter = "#{date_range} AND has_metadata:true AND is_active:true"

  search_term =
    if options[:doi].present?
      "doi:#{options[:doi]}"
    elsif options[:orcid].present?
      "nameIdentifier:ORCID\\:#{options[:orcid]}"
    elsif options[:related_identifier].present?
      "relatedIdentifier:DOI\\:#{options[:related_identifier]}"
    elsif options[:query].present?
      options[:query]
    else
      query
    end

  params = {
    q: search_term,
    start: options[:offset],
    rows: options[:rows],
    fl: "doi,resourceTypeGeneral,relatedIdentifier,nameIdentifier,minted,updated",
    fq: filter,
    wt: "json"
  }

  url + URI.encode_www_form(params)
end
|
#get_total(options = {}) ⇒ Object
48
49
50
51
52
|
# File 'lib/toccatore/base.rb', line 48
# Returns the number of matching records for the given options.
#
# The query URL is built with rows forced to 0, so the request asks for no
# records. The count is read from data -> response -> numFound of the
# response body and defaults to 0 when any of those keys is missing.
#
# @param options [Hash] query options (see #get_query_url); also passed
#   unchanged to Maremma.get
# @return [Integer] value of numFound, or 0
def get_total(options={})
  count_options = options.merge(rows: 0)
  response = Maremma.get(get_query_url(count_options), options)

  search_response = response.body.fetch("data", {}).fetch("response", {})
  search_response.fetch("numFound", 0)
end
|
#is_personal_name?(author) ⇒ Boolean
184
185
186
187
188
189
|
# File 'lib/toccatore/base.rb', line 184
# Decides whether an author string should be treated as a personal name.
#
# A string containing a comma is always treated as personal. Otherwise the
# first whitespace-separated token is looked up with
# name_detector.name_exists? and that result is returned as-is.
#
# @param author [String] author name
# @return [Boolean, Object] true, or the result of name_exists?
def is_personal_name?(author)
  if author.include?(",")
    true
  else
    first_token = author.split.first
    name_detector.name_exists?(first_token)
  end
end
|
#job_batch_size ⇒ Object
110
111
112
|
# File 'lib/toccatore/base.rb', line 110
# Default number of records per batch; #queue_jobs uses it as the default
# :rows value and as the pagination step.
#
# @return [Integer] 1000
def job_batch_size
  1_000
end
|
#name_detector ⇒ Object
219
220
221
|
# File 'lib/toccatore/base.rb', line 219
# Returns the GenderDetector used by #is_personal_name?.
#
# The detector is memoized per instance: #is_personal_name? asks for it
# once per author, and building a fresh detector each time is wasted work.
# NOTE(review): presumably construction is expensive because the gem loads
# its name dictionary; confirm against the gender_detector gem.
#
# @return [GenderDetector] the same instance on every call for this object
def name_detector
  @name_detector ||= GenderDetector.new
end
|
#normalize_doi(doi) ⇒ Object
133
134
135
136
137
138
139
140
141
142
|
# File 'lib/toccatore/base.rb', line 133
# Normalizes a DOI into an https://doi.org/ URL.
#
# The input is first reduced to the bare DOI by #validate_doi. Zero-width
# space characters (U+200B) are removed, the DOI is lower-cased, and the
# result is encoded with Addressable::URI.encode.
#
# @param doi [String, nil] DOI in any form #validate_doi accepts
# @return [String, nil] normalized DOI URL, or nil when validation fails
def normalize_doi(doi)
  bare_doi = validate_doi(doi)
  return nil unless bare_doi.present?

  cleaned = bare_doi.delete("\u200B").downcase
  "https://doi.org/" + Addressable::URI.encode(cleaned)
end
|
#orcid_as_url(orcid) ⇒ Object
148
149
150
|
# File 'lib/toccatore/base.rb', line 148
# Turns an ORCID identifier into its http://orcid.org/ URL.
#
# @param orcid [String, nil] ORCID identifier
# @return [String, nil] the URL, or nil when the identifier is not present
def orcid_as_url(orcid)
  return nil unless orcid.present?

  "http://orcid.org/#{orcid}"
end
|
#orcid_from_url(url) ⇒ Object
144
145
146
|
# File 'lib/toccatore/base.rb', line 144
# Extracts the identifier part from an ORCID URL.
#
# Both http://orcid.org/ and https://orcid.org/ prefixes are accepted; the
# earlier pattern matched only the http form. Everything after the prefix
# is returned without further validation.
#
# @param url [String, nil] ORCID URL
# @return [String, nil] the text after the prefix, or nil when the input
#   does not start with an ORCID URL prefix
def orcid_from_url(url)
  match = %r{\Ahttps?://orcid\.org/(.+)}.match(url)
  match && match[1]
end
|
#process_data(options = {}) ⇒ Object
78
79
80
81
82
83
84
85
|
# File 'lib/toccatore/base.rb', line 78
# Fetches, parses and pushes one batch of data.
#
# #get_data is called with the options extended by :timeout and
# :source_id; the response is handed to #parse_data together with the
# original options. When parsing yields nothing, a single placeholder
# response with an empty "data" list is returned instead of pushing.
#
# @param options [Hash] query and push options
# @return [Array<OpenStruct>, Object] the placeholder array when there is
#   nothing to push, otherwise the result of #push_data
def process_data(options = {})
  fetch_options = options.merge(timeout: timeout, source_id: source_id)
  parsed = parse_data(get_data(fetch_options), options)

  if parsed.empty?
    [OpenStruct.new(body: { "data" => [] })]
  else
    push_data(parsed, options)
  end
end
|
#push_data(items, options = {}) ⇒ Object
92
93
94
95
96
97
98
99
100
|
# File 'lib/toccatore/base.rb', line 92
# Pushes every item via #push_item, after checking the preconditions.
#
# The items are wrapped with Array() before anything else. The previous
# code called #empty? on the raw argument and only coerced it afterwards,
# so nil (or a scalar without #empty?) raised instead of being handled.
#
# @param items [Array, Object, nil] items to push
# @param options [Hash] :access_token is required for pushing; :from_date
#   and :until_date are only used in the "no works" message
# @return [Array, nil] the items after pushing, or nil when a message was
#   printed instead (no items, or missing access token)
def push_data(items, options={})
  items = Array(items)

  if items.empty?
    puts "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
  elsif options[:access_token].blank?
    puts "An error occured: Access token missing."
  else
    items.each { |item| push_item(item, options) }
  end
end
|
#queue_jobs(options = {}) ⇒ Object
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
# File 'lib/toccatore/base.rb', line 54
# Walks through all matching records in batches and processes each batch.
#
# Defaults are written into the options hash before the total is fetched:
#   :offset     - starting offset; non-numeric or negative values become 0
#   :rows       - batch size; missing or non-positive values fall back to
#                 #job_batch_size
#   :from_date  - defaults to yesterday (ISO 8601)
#   :until_date - defaults to today (ISO 8601)
#
# Pagination steps by the same :rows value that is sent with each request.
# The previous code always stepped by #job_batch_size, which skipped or
# duplicated records whenever :rows differed, and it discarded the starting
# offset it had just normalized.
#
# @param options [Hash] query options; modified in place
# @return [Integer] total number of matching records reported by #get_total
def queue_jobs(options={})
  options[:offset] = [options[:offset].to_i, 0].max

  requested_rows = options[:rows].to_i
  options[:rows] = requested_rows > 0 ? requested_rows : job_batch_size

  options[:from_date] = options[:from_date].presence || (Time.now.to_date - 1.day).iso8601
  options[:until_date] = options[:until_date].presence || Time.now.to_date.iso8601

  total = get_total(options)

  if total > 0
    # the range and step are evaluated once, before options[:offset] changes
    (options[:offset]...total).step(options[:rows]) do |offset|
      options[:offset] = offset
      process_data(options)
    end
  else
    puts "No works found for date range #{options[:from_date]} - #{options[:until_date]}."
  end

  total
end
|
#timeout ⇒ Object
106
107
108
|
# File 'lib/toccatore/base.rb', line 106
# Timeout value that #process_data passes on as the :timeout option.
# NOTE(review): presumably seconds; confirm against the HTTP client.
#
# @return [Integer] 120
def timeout
  120
end
|
#unfreeze(hsh) ⇒ Object
223
224
225
226
227
|
# File 'lib/toccatore/base.rb', line 223
# Copies a hash into a new plain Hash whose keys are down-cased symbols.
#
# Values are carried over unchanged. When two keys collapse to the same
# symbol, the later pair wins.
#
# @param hsh [Hash] source hash; keys must respond to #downcase
# @return [Hash{Symbol => Object}] new hash with normalized keys
def unfreeze(hsh)
  hsh.each_pair.each_with_object({}) do |(key, value), result|
    result[key.downcase.to_sym] = value
  end
end
|
#url ⇒ Object
102
103
104
|
# File 'lib/toccatore/base.rb', line 102
# Base URL of the search API, ending in "?" so that #get_query_url can
# append the encoded query string directly.
#
# @return [String]
def url
  "https://search.datacite.org/api?"
end
|
#validate_doi(doi) ⇒ Object
125
126
127
|
# File 'lib/toccatore/base.rb', line 125
# Extracts the bare DOI from a DOI string.
#
# Accepted forms: the bare DOI, a "doi:" prefix, and http(s) URLs on
# doi.org or dx.doi.org. The dot in "doi.org" is escaped; the previous
# pattern let it match any character, so hosts such as "doiXorg" were
# accepted.
#
# @param doi [String, nil] DOI in one of the accepted forms
# @return [String, nil] the DOI ("10.<4-5 digits>/<suffix>"), or nil when
#   the input does not match
def validate_doi(doi)
  match = %r{\A(?:https?://(?:dx\.)?doi\.org/)?(?:doi:)?(10\.\d{4,5}/.+)\z}.match(doi)
  match && match[1]
end
|
#validate_orcid(orcid) ⇒ Object
152
153
154
|
# File 'lib/toccatore/base.rb', line 152
# Extracts a well-formed ORCID identifier from an identifier or ORCID URL.
#
# The identifier must be exactly four dash-separated groups of four
# characters: digits throughout, with the final character a digit or "X".
# The previous pattern ended in "[0-9X]+" and therefore accepted over-long
# identifiers. An optional http://orcid.org/ or https://orcid.org/ prefix
# is stripped; previously only the http form was accepted.
#
# @param orcid [String, nil] ORCID identifier or URL
# @return [String, nil] the bare identifier, or nil when the input is not
#   well-formed
def validate_orcid(orcid)
  match = %r{\A(?:https?://orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z}.match(orcid)
  match && match[1]
end
|
#validate_prefix(doi) ⇒ Object
129
130
131
|
# File 'lib/toccatore/base.rb', line 129
# Extracts the DOI prefix ("10.<4-5 digits>") from a DOI string.
#
# Accepts the same forms as #validate_doi: the bare DOI, a "doi:" prefix,
# and http(s) URLs on doi.org or dx.doi.org. A suffix after the slash is
# required. The dot in "doi.org" is escaped; the previous pattern let it
# match any character.
#
# @param doi [String, nil] DOI in one of the accepted forms
# @return [String, nil] the prefix, or nil when the input does not match
def validate_prefix(doi)
  match = %r{\A(?:https?://(?:dx\.)?doi\.org/)?(?:doi:)?(10\.\d{4,5})/.+\z}.match(doi)
  match && match[1]
end
|