Class: RelatonNist::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_nist/data_fetcher.rb

Constant Summary collapse

# Lookup table used by #fetch_relation: keys are the values of the
# `relationship-type` attribute read from the source XML, values are the
# relation type names stored in the generated relation hashes.
RELATION_TYPES =
{
  "replaces" => "obsoletes",
  "isVersionOf" => "editionOf",
  "hasTranslation" => "hasTranslation",
  "isTranslationOf" => "translatedFrom",
  "hasPreprint" => "hasReprint",
  "isPreprintOf" => "reprintOf",
  "isSupplementTo" => "complements",
  "isPartOf" => "partOf",
  "hasPart" => "hasPart",
}.freeze
# Location of the XML dataset that #fetch downloads and parses.
URL =
"https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml"

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



20
21
22
23
24
25
# File 'lib/relaton_nist/data_fetcher.rb', line 20

#
# Set up a fetcher instance.
#
# @param output [String] directory the generated files are written to
# @param format [String] output format; a leading "bib" is stripped to
#   obtain the file extension (e.g. "bibxml" -> "xml")
#
def initialize(output, format)
  @files = []
  @output, @format = output, format
  @ext = format.sub(/^bib/, "")
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object

Fetch all the documents from the dataset.

Parameters:

  • output (String) (defaults to: "data")

    folder name to save the documents

  • format (String) (defaults to: "yaml")

    format to save the documents (yaml, xml, bibxml)



354
355
356
# File 'lib/relaton_nist/data_fetcher.rb', line 354

#
# Convenience entry point: build a fetcher and run it.
#
# @param output [String] directory to save the documents to
# @param format [String] format to save the documents in (yaml, xml, bibxml)
#
def self.fetch(output: "data", format: "yaml")
  fetcher = new(output, format)
  fetcher.fetch
end

Instance Method Details

#affiliation(doc) ⇒ Array<RelatonBib::Affiliation>

Create affiliation organization

Parameters:

  • doc (Nokogiri::XML::Element)

    affiliation element

Returns:

  • (Array<RelatonBib::Affiliation>)

    affiliation



251
252
253
254
255
256
# File 'lib/relaton_nist/data_fetcher.rb', line 251

#
# Build one affiliation per `institution/institution_department` element.
#
# @param doc [Nokogiri::XML::Element] element that is searched for departments
#
# @return [Array<RelatonBib::Affiliation>] affiliations, one per department
#
def affiliation(doc)
  departments = doc.xpath("./institution/institution_department")
  departments.map do |department|
    RelatonBib::Affiliation.new(
      organization: RelatonBib::Organization.new(name: department.text),
    )
  end
end

#create_org(pub) ⇒ RelatonBib::Organization

Create publisher organization

Parameters:

  • pub (Nokogiri::XML::Element)

    publisher element

Returns:

  • (RelatonBib::Organization)

    publisher organization



231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/relaton_nist/data_fetcher.rb', line 231

#
# Build the publisher organization from a publisher element.
#
# The abbreviation, and the place when the publisher has none of its own,
# are read from the sibling `institution` element whose `institution_name`
# equals the publisher name.
#
# @param pub [Nokogiri::XML::Element] publisher element
#
# @return [RelatonBib::Organization] publisher organization
#
def create_org(pub)
  name = pub.at("publisher_name").text
  institution = "../institution[institution_name[.='#{name}']]"
  abbr = pub.at("#{institution}/institution_acronym")&.text
  place = pub.at("./publisher_place") || pub.at("#{institution}/institution_place")

  contact =
    if place
      # The place text is split on ", " into city and state.
      parts = place.text.split(", ")
      [RelatonBib::Address.new(street: [], city: parts[0], state: parts[1], country: "US")]
    else
      []
    end
  RelatonBib::Organization.new name: name, abbreviation: abbr, contact: contact
end

#fetch ⇒ Object

Fetch all the documents from the dataset.



330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# File 'lib/relaton_nist/data_fetcher.rb', line 330

#
# Download the dataset, remove previously generated files with the current
# extension from the output directory and convert every
# `report-paper_metadata` record into a document file.
#
# Start/stop times are printed to stdout. Any StandardError is reported on
# stderr (message plus the first backtrace lines) instead of being raised.
#
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  t1 = Time.now
  puts "Started at: #{t1}"

  # The block form closes the downloaded IO as soon as parsing is finished
  # instead of leaving the handle open until it is garbage collected.
  docs = OpenURI.open_uri(URL) { |io| Nokogiri::XML io }
  FileUtils.mkdir_p @output
  FileUtils.rm Dir[File.join(@output, "*.#{@ext}")]
  docs.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
    .each { |doc| parse_doc doc }

  t2 = Time.now
  puts "Stopped at: #{t2}"
  puts "Done in: #{(t2 - t1).round} sec."
rescue StandardError => e
  warn e.message
  warn e.backtrace[0..5].join("\n")
end

#fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::FormattedString>)


133
134
135
136
137
# File 'lib/relaton_nist/data_fetcher.rb', line 133

#
# Collect the JATS abstract paragraphs of a document.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<RelatonBib::FormattedString>] one entry per `jats:p`
#
def fetch_abstract(doc)
  namespaces = { "jats" => "http://www.ncbi.nlm.nih.gov/JATS1" }
  paragraphs = doc.xpath("jats:abstract/jats:p", namespaces)
  paragraphs.map do |paragraph|
    RelatonBib::FormattedString.new(
      content: paragraph.text, language: doc["language"], script: "Latn",
    )
  end
end

#fetch_contributor(doc) ⇒ Array<Hash>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<Hash>)


141
142
143
144
145
146
147
148
149
150
# File 'lib/relaton_nist/data_fetcher.rb', line 141

#
# Collect contributors: every `contributors/person_name` element becomes a
# person whose role is its `contributor_role` attribute, followed by every
# `publisher` element as an organization with the "publisher" role.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<Hash>] hashes with :entity and :role keys
#
def fetch_contributor(doc)
  people = doc.xpath("contributors/person_name").map do |name_el|
    entity = RelatonBib::Person.new(name: fullname(name_el, doc),
                                    affiliation: affiliation(doc))
    { entity: entity, role: [{ type: name_el["contributor_role"] }] }
  end
  publishers = doc.xpath("publisher").map do |publisher_el|
    { entity: create_org(publisher_el), role: [{ type: "publisher" }] }
  end
  people + publishers
end

#fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::BibliographicDate>)


86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/relaton_nist/data_fetcher.rb', line 86

#
# Convert `publication_date` and `approval_date` elements into dates.
#
# The value is the year, extended with "-month" when a month is present and
# with "-day" when, additionally, a day is present. Publication dates are
# typed "published", approval dates "confirmed".
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<RelatonBib::BibliographicDate>]
#
def fetch_date(doc)
  doc.xpath("publication_date|approval_date").map do |date_el|
    parts = [date_el.at("year").text]
    month = date_el.at("month")
    if month
      parts << month.text
      day = date_el.at("day")
      parts << day.text if day
    end
    kind = date_el.name == "publication_date" ? "published" : "confirmed"
    RelatonBib::BibliographicDate.new(type: kind, on: parts.join("-"))
  end
end

#fetch_docid(doc) ⇒ Array<RelatonBib::DocumentIdentifier>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


67
68
69
70
71
# File 'lib/relaton_nist/data_fetcher.rb', line 67

#
# Wrap the attribute hashes produced by #parse_docid in identifier objects.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<RelatonBib::DocumentIdentifier>]
#
def fetch_docid(doc)
  parse_docid(doc).map { |attrs| RelatonBib::DocumentIdentifier.new(**attrs) }
end

#fetch_doi(doc) ⇒ Object

Read the DOI of a document, replacing a fixed set of DOI values with their substitutes.



48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/relaton_nist/data_fetcher.rb', line 48

#
# Read the text of `doi_data/doi`. A fixed set of DOI values is replaced by
# the substitutes listed below; every other value is returned unchanged.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [String] DOI
#
def fetch_doi(doc)
  substitutes = {
    "10.6028/NBS.CIRC.e2e" => "10.6028/NBS.CIRC.2e2",
    "10.6028/NBS.CIRC.sup" => "10.6028/NBS.CIRC.24e7sup",
    "10.6028/NBS.CIRC.supJun1925-Jun1926" => "10.6028/NBS.CIRC.24e7sup2",
    "10.6028/NBS.CIRC.supJun1925-Jun1927" => "10.6028/NBS.CIRC.24e7sup3",
    "10.6028/NBS.CIRC.24supJuly1922" => "10.6028/NBS.CIRC.24e6sup",
    "10.6028/NBS.CIRC.24supJan1924" => "10.6028/NBS.CIRC.24e6sup2",
  }
  raw = doc.at("doi_data/doi").text
  substitutes.fetch(raw, raw)
end

#fetch_edition(doc) ⇒ String

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (String)


101
102
103
# File 'lib/relaton_nist/data_fetcher.rb', line 101

#
# Read the text of the `edition_number` element.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [String, nil] edition, or nil when the element is absent
#
def fetch_edition(doc)
  edition = doc.at("edition_number")
  return if edition.nil?

  edition.text
end

#fetch_link(doc) ⇒ Array<RelatonBib::TypedUri>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::TypedUri>)


123
124
125
126
127
128
129
# File 'lib/relaton_nist/data_fetcher.rb', line 123

#
# Build the document links: a "doi" link (https://doi.org/ plus the DOI
# from #fetch_doi) followed by a "pdf" link taken from `doi_data/resource`.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<RelatonBib::TypedUri>]
#
def fetch_link(doc)
  pdf_uri = doc.at("doi_data/resource").text
  doi_uri = "https://doi.org/#{fetch_doi(doc)}"
  [
    RelatonBib::TypedUri.new(type: "doi", content: doi_uri),
    RelatonBib::TypedUri.new(type: "pdf", content: pdf_uri),
  ]
end

#fetch_place(doc) ⇒ Array<String>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<String>)


260
261
262
# File 'lib/relaton_nist/data_fetcher.rb', line 260

#
# Collect the text of every `institution/institution_place` element.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<String>] places
#
def fetch_place(doc)
  places = doc.xpath("institution/institution_place")
  places.map { |place| place.text }
end

#fetch_relation(doc) ⇒ Array<Hash>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<Hash>)


107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/relaton_nist/data_fetcher.rb', line 107

#
# Convert the Crossref `related_item` elements of a document into relations.
#
# The related identifier is the relation text with everything up to the
# first "/" dropped and dots replaced by spaces. The relation type is looked
# up in RELATION_TYPES; an unknown type is reported on stderr and the
# relation is still returned with a nil type.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<Hash>] hashes with :type and :bibitem keys
#
def fetch_relation(doc) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  uri = "http://www.crossref.org/relations.xsd"
  related_items = doc.xpath("./ns:program/ns:related_item", ns: uri)
  related_items.map do |item|
    relation = item.at_xpath("ns:intra_work_relation|ns:inter_work_relation", ns: uri)
    segments = relation.text.split("/")
    related_id = segments[1..].join("/").gsub(".", " ")
    bibitem = RelatonBib::BibliographicItem.new(
      formattedref: RelatonBib::FormattedRef.new(content: related_id),
      docid: [RelatonBib::DocumentIdentifier.new(type: "NIST", id: related_id, primary: true)],
    )
    kind = RELATION_TYPES[relation["relationship-type"]]
    warn "Relation type #{relation['relationship-type']} not found" unless kind
    { type: kind, bibitem: bibitem }
  end
end

#fetch_series(doc) ⇒ Array<RelatonBib::Series>

Fetches series

Parameters:

  • doc (Nokogiri::XML::Element)

    document element

Returns:

  • (Array<RelatonBib::Series>)

    series



271
272
273
274
275
276
277
278
279
# File 'lib/relaton_nist/data_fetcher.rb', line 271

#
# Fetches series
#
# The publication id from #pub_id is split on whitespace into prefix,
# series abbreviation and number. The series name is looked up in the
# `series.yaml` file next to this source file, falling back to the
# abbreviation itself when there is no entry.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<RelatonBib::Series>] series
#
def fetch_series(doc)
  # series.yaml is the same for every document, so it is parsed once per
  # fetcher instance rather than on every call.
  @series_names ||= YAML.load_file(File.expand_path("series.yaml", __dir__))
  prf, srs, num = pub_id(doc).split
  sname = @series_names[srs] || srs
  title = RelatonBib::TypedTitleString.new(content: "#{prf} #{sname}")
  abbr = RelatonBib::LocalizedString.new srs
  [RelatonBib::Series.new(title: title, abbreviation: abbr, number: num)]
end

#fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (RelatonBib::TypedTitleStringCollection, Array)


75
76
77
78
79
80
81
82
# File 'lib/relaton_nist/data_fetcher.rb', line 75

#
# Build the title from the `titles/title` and `titles/subtitle` elements,
# whose texts are concatenated without a separator.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<Hash>] a single title hash, or an empty array when the
#   document has no title elements
#
def fetch_title(doc)
  nodes = doc.xpath("titles/title|titles/subtitle")
  if nodes.any?
    joined = nodes.map { |node| node.text }.join
    [{ content: joined, language: "en", script: "Latn", format: "text/plain" }]
  else
    []
  end
end

#forename(doc, cnt, init = nil) ⇒ RelatonBib::Forename

Create forename object

Parameters:

  • doc (Nokogiri::XML::Element)

    document element

  • cnt (String, nil)

    forename content

  • init (String, nil) (defaults to: nil)

    initial content

Returns:

  • (RelatonBib::Forename)

    forename object



204
205
206
207
208
209
210
# File 'lib/relaton_nist/data_fetcher.rb', line 204

#
# Create forename object
#
# Nothing is created when both the content and the initial are nil or empty.
#
# @param doc [Nokogiri::XML::Element] document element (supplies the language)
# @param cnt [String, nil] forename content
# @param init [String, nil] initial content
#
# @return [RelatonBib::Forename, nil] forename object
#
def forename(doc, cnt, init = nil)
  blank = ->(value) { value.nil? || value.empty? }
  return if blank.call(cnt) && blank.call(init)

  RelatonBib::Forename.new(
    content: cnt,
    language: doc["language"],
    script: "Latn",
    initial: init,
  )
end

#forename_initial(person, doc) ⇒ Array<Array<RelatonBib::LocalizedString>>

Create forename and initials objects from person name element.

Parameters:

  • person (Nokogiri::XML::Element)

    person name element

  • doc (Nokogiri::XML::Element)

    document element

Returns:

  • (Array<Array<RelatonBib::LocalizedString>>)

    forename and initials



178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/relaton_nist/data_fetcher.rb', line 178

#
# Create forename and initials objects from person name element.
#
# When `given_name` consists only of initials (single word characters each
# followed by dots/whitespace, or one or two trailing capitals), one
# forename is created per initial: the first carries the whole given name
# as content, the others carry the initial only. Otherwise the given name
# becomes a single forename and no initials are returned.
#
# @param person [Nokogiri::XML::Element] person name element
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array] pair of forenames array and initials (nil when the given
#   name is missing or is not made of initials)
#
def forename_initial(person, doc) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  given = person.at("given_name")&.text
  return [[], nil] unless given

  matched = /^(?<inits>(?:\w[.\s]+|[A-Z]{1,2}$)+)$/.match(given)
  unless matched
    plain = forename(doc, given)
    return [plain ? [plain] : [], nil]
  end

  inits = matched[:inits]
  # Splitting on an optionally-empty separator yields one letter per entry.
  first, *others = inits.split(/[.\s]*/)
  names = [forename(doc, given, first)]
  names.concat(others.map { |letter| forename(doc, nil, letter) })
  initials = inits.empty? ? nil : localized_string(inits, doc)
  [names, initials]
end

#fullname(person, doc) ⇒ RelatonBib::FullName

Create full name object from person name element.

Parameters:

  • person (Nokogiri::XML::Element)

    name element

  • doc (Nokogiri::XML::Element)

    document element

Returns:

  • (RelatonBib::FullName)

    full name object



160
161
162
163
164
165
166
167
168
# File 'lib/relaton_nist/data_fetcher.rb', line 160

#
# Create full name object from person name element.
#
# Forenames and initials come from #forename_initial, the surname from the
# `surname` element, and every `ORCID` element becomes an "orcid" identifier.
#
# @param person [Nokogiri::XML::Element] name element
# @param doc [Nokogiri::XML::Element] document element
#
# @return [RelatonBib::FullName] full name object
#
def fullname(person, doc)
  given_names, initials = forename_initial(person, doc)
  family_name = localized_string(person.at("surname").text, doc)
  orcids = person.xpath("ORCID").map do |orcid|
    RelatonBib::PersonIdentifier.new("orcid", orcid.text)
  end
  RelatonBib::FullName.new(
    surname: family_name, forename: given_names,
    initials: initials, identifier: orcids
  )
end

#localized_string(content, doc) ⇒ RelatonBib::LocalizedString

Create localized string

Parameters:

  • content (String)

    content of string

  • doc (Nokogiri::XML::Element)

    XML element

Returns:

  • (RelatonBib::LocalizedString)

    localized string



220
221
222
# File 'lib/relaton_nist/data_fetcher.rb', line 220

#
# Create localized string
#
# @param content [String] content of string
# @param doc [Nokogiri::XML::Element] XML element whose `language`
#   attribute supplies the language; the script is always "Latn"
#
# @return [RelatonBib::LocalizedString] localized string
#
def localized_string(content, doc)
  language = doc["language"]
  RelatonBib::LocalizedString.new(content, language, "Latn")
end

#parse_doc(doc) ⇒ Object

Create a document instance and save it.

Parameters:

  • doc (Nokogiri::XML::Element)

Raises:

  • (StandardError)


309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# File 'lib/relaton_nist/data_fetcher.rb', line 309

#
# Create a document instance and save it.
#
# Any StandardError raised while building or writing the document is
# reported on stderr (the document DOI when available, the message and the
# first backtrace lines) and then swallowed so that the remaining documents
# can still be processed.
#
# @param doc [Nokogiri::XML::Element] document element
#
def parse_doc(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  # mtd = doc.at('doi_record/report-paper/report-paper_metadata')
  item = RelatonNist::NistBibliographicItem.new(
    type: "standard", docid: fetch_docid(doc),
    title: fetch_title(doc), link: fetch_link(doc), abstract: fetch_abstract(doc),
    date: fetch_date(doc), edition: fetch_edition(doc),
    contributor: fetch_contributor(doc), relation: fetch_relation(doc),
    place: fetch_place(doc), series: fetch_series(doc),
    language: [doc["language"]], script: ["Latn"], doctype: "standard"
  )
  write_file item
rescue StandardError => e
  # Safe navigation: a record without a `doi` element must not make the
  # error handler itself raise, which would abort the whole run.
  warn "Document: #{doc.at('doi')&.text}"
  warn e.message
  warn e.backtrace[0..5].join("\n")
  # raise e
end

#parse_docid(doc) ⇒ Object

Build the attribute hashes for the document identifiers (primary NIST identifier and DOI).



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/relaton_nist/data_fetcher.rb', line 27

#
# Build the attribute hashes for the identifiers of a document: the primary
# NIST identifier from #pub_id followed by the DOI from #fetch_doi.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [Array<Hash>] identifier attributes
#
def parse_docid(doc)
  # Disabled DOI fix-ups and anchor identifier, kept for reference:
  # case doi
  # when "10.6028/NBS.CIRC.12e2revjune" then doi.sub!("13e", "12e")
  # when "10.6028/NBS.CIRC.36e2" then doi.sub!("46e", "36e")
  # when "10.6028/NBS.HB.67suppJune1967" then doi.sub!("1965", "1967")
  # when "10.6028/NBS.HB.105-1r1990" then doi.sub!("105-1-1990", "105-1r1990")
  # when "10.6028/NIST.HB.150-10-1995" then doi.sub!(/150-10$/, "150-10-1995")
  # end
  # anchor = doi.split("/")[1..-1].join "/"
  # { type: "NIST", id: anchor(doc), scope: "anchor" }
  primary = { type: "NIST", id: pub_id(doc), primary: true }
  doi = { type: "DOI", id: fetch_doi(doc) }
  [primary, doi]
end

#pub_id(doc) ⇒ Object



43
44
45
46
# File 'lib/relaton_nist/data_fetcher.rb', line 43

#
# Derive the publication id from the DOI: everything up to the first "/" is
# dropped and dots are replaced by spaces.
#
# @param doc [Nokogiri::XML::Element] document element
#
# @return [String] publication id
#
def pub_id(doc)
  # anchor(doc).gsub(".", " ")
  segments = fetch_doi(doc).split("/")
  suffix = segments[1..].join("/")
  suffix.gsub(".", " ")
end

#write_file(bib) ⇒ Object

Save document



286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# File 'lib/relaton_nist/data_fetcher.rb', line 286

#
# Save document
#
# The file name is the first document identifier with "/", whitespace, ":"
# and "." replaced by "_", upcased, and a leading "NIST_IR" turned into
# "NISTIR". A path that was already written by this instance is reported on
# stderr and then overwritten.
#
# @param bib [RelatonNist::NistBibliographicItem] document to serialize
#
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = bib.docidentifier[0].id
  basename = docid.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
  path = File.join(@output, "#{basename}.#{@ext}")

  if @files.include?(path)
    warn "File #{path} exists. Docid: #{docid}"
    # warn "Link: #{bib.link.detect { |l| l.type == 'src' }.content}"
  else
    @files << path
  end

  serialized =
    case @format
    when "yaml" then bib.to_hash.to_yaml
    when "xml" then bib.to_xml bibdata: true
    else bib.send "to_#{@format}"
    end
  File.write path, serialized, encoding: "UTF-8"
end