Class: RelatonNist::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_nist/data_fetcher.rb

Constant Summary collapse

# Lookup table used by #fetch_relation: the key is the "relationship-type"
# attribute read from a related-item element, the value is the relation type
# stored in the resulting relation hash. A key that is missing here makes
# #fetch_relation emit a warning and use nil as the type.
RELATION_TYPES =
{
  "replaces" => "obsoletes",
  "isVersionOf" => "editionOf",
  "hasTranslation" => "hasTranslation",
  "isTranslationOf" => "translatedFrom",
  "hasPreprint" => "hasReprint",
  "isPreprintOf" => "reprintOf",
  "isSupplementTo" => "complements",
  "isPartOf" => "partOf",
  "hasPart" => "hasPart",
}.freeze
# Address of the XML file that #fetch downloads and parses.
URL =
"https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml"

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



20
21
22
23
24
25
# File 'lib/relaton_nist/data_fetcher.rb', line 20

# Prepare a fetcher that writes into +output+ using +format+.
#
# @param output [String] directory the document files are written to
# @param format [String] serialization format name; a leading "bib" is
#   stripped from it to obtain the file extension (e.g. "bibxml" -> "xml")
def initialize(output, format)
  @files = [] # paths already written during this run
  @output, @format = output, format
  @ext = @format.sub(/^bib/, "")
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object

Fetch all the documents from the dataset

Parameters:

  • output (String) (defaults to: "data")

    folder name to save the documents

  • format (String) (defaults to: "yaml")

    format to save the documents (yaml, xml, bibxml)



365
366
367
# File 'lib/relaton_nist/data_fetcher.rb', line 365

# Convenience entry point: build a fetcher and run it.
#
# @param output [String] folder name to save the documents
# @param format [String] format to save the documents (yaml, xml, bibxml)
# @return [Object] the value returned by the instance-level #fetch
def self.fetch(output: "data", format: "yaml")
  fetcher = new(output, format)
  fetcher.fetch
end

Instance Method Details

#affiliation(doc) ⇒ Array<RelatonBib::Affiliation>

Create affiliation organization

Parameters:

  • doc (Nokogiri::XML::Element)

    affiliation element

Returns:

  • (Array<RelatonBib::Affiliation>)

    affiliation



258
259
260
261
262
263
# File 'lib/relaton_nist/data_fetcher.rb', line 258

# Build one affiliation per "institution/institution_department" element
# found under the given element.
#
# @param doc [Nokogiri::XML::Element] element that is searched
# @return [Array<RelatonBib::Affiliation>] affiliations, each wrapping an
#   organization named after the department text
def affiliation(doc)
  departments = doc.xpath("./institution/institution_department")
  departments.map do |department|
    RelatonBib::Affiliation.new(
      organization: RelatonBib::Organization.new(name: department.text),
    )
  end
end

#create_org(pub) ⇒ RelatonBib::Organization

Create publisher organization

Parameters:

  • pub (Nokogiri::XML::Element)

    publisher element

Returns:

  • (RelatonBib::Organization)

    publisher organization



238
239
240
241
242
243
244
245
246
247
248
249
# File 'lib/relaton_nist/data_fetcher.rb', line 238

# Create the publisher organization for a publisher element.
#
# The acronym and, when the publisher has no place of its own, the place are
# taken from a sibling institution whose institution_name equals the
# publisher name.
#
# @param pub [Nokogiri::XML::Element] publisher element
# @return [RelatonBib::Organization] publisher organization
def create_org(pub)
  name = pub.at("publisher_name").text
  # The name is passed as an XPath variable rather than interpolated into the
  # expression: a name containing an apostrophe would otherwise terminate the
  # quoted literal and make the query invalid.
  same_name = '../institution[institution_name[.=$name]]'
  binds = { name: name }
  abbr = pub.at_xpath("#{same_name}/institution_acronym", nil, binds)&.text
  place = pub.at("./publisher_place") ||
    pub.at_xpath("#{same_name}/institution_place", nil, binds)
  cont = []
  if place
    # Place text is expected as "City, State"; a missing state stays nil.
    city, state = place.text.split(", ")
    cont << RelatonBib::Address.new(street: [], city: city, state: state, country: "US")
  end
  RelatonBib::Organization.new name: name, abbreviation: abbr, contact: cont
end

#fetch ⇒ Object

Fetch all the documents from the dataset



340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# File 'lib/relaton_nist/data_fetcher.rb', line 340

# Download the XML file at URL, convert every report-paper metadata record
# into a document file inside the output directory and save the index.
#
# Existing files in the output directory that carry the target extension are
# removed first. Start/stop times and the elapsed seconds are printed to
# stdout. Any StandardError is reported on stderr (message plus the first six
# backtrace lines) instead of being raised.
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  t1 = Time.now
  puts "Started at: #{t1}"

  # Block form: open-uri closes the downloaded stream as soon as parsing is
  # done instead of leaving it open until garbage collection.
  docs = OpenURI.open_uri(URL) { |io| Nokogiri::XML(io) }
  FileUtils.mkdir_p @output
  FileUtils.rm Dir[File.join(@output, "*.#{@ext}")]
  docs.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
    .each { |doc| parse_doc doc }

  index.save
  t2 = Time.now
  puts "Stopped at: #{t2}"
  puts "Done in: #{(t2 - t1).round} sec."
rescue StandardError => e
  warn e.message
  warn e.backtrace[0..5].join("\n")
end

#fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::FormattedString>)


136
137
138
139
140
141
142
143
144
# File 'lib/relaton_nist/data_fetcher.rb', line 136

# Collect the non-empty JATS abstract paragraphs of a document.
#
# @param doc [Nokogiri::XML::Element] document element; its "language"
#   attribute is copied onto every returned string
# @return [Array<RelatonBib::FormattedString>] one entry per paragraph whose
#   text is not empty
def fetch_abstract(doc)
  namespaces = { "jats" => "http://www.ncbi.nlm.nih.gov/JATS1" }
  paragraphs = doc.xpath("jats:abstract/jats:p", namespaces)
  paragraphs.reject { |para| para.text.empty? }.map do |para|
    RelatonBib::FormattedString.new(content: para.text, language: doc["language"], script: "Latn")
  end
end

#fetch_contributor(doc) ⇒ Array<Hash>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<Hash>)


148
149
150
151
152
153
154
155
156
157
# File 'lib/relaton_nist/data_fetcher.rb', line 148

# Build the contributor list: persons first, then publishers.
#
# Each person's role type is the "contributor_role" attribute of its
# person_name element. Affiliations are computed from the whole document, so
# every person receives the same list.
#
# @param doc [Nokogiri::XML::Element] document element
# @return [Array<Hash>] hashes with :entity and :role keys
def fetch_contributor(doc)
  people = doc.xpath("contributors/person_name").map do |name_el|
    entity = RelatonBib::Person.new(name: fullname(name_el, doc),
                                    affiliation: affiliation(doc))
    { entity: entity, role: [{ type: name_el["contributor_role"] }] }
  end
  publishers = doc.xpath("publisher").map do |pub_el|
    { entity: create_org(pub_el), role: [{ type: "publisher" }] }
  end
  people + publishers
end

#fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::BibliographicDate>)


89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/relaton_nist/data_fetcher.rb', line 89

# Convert publication_date and approval_date elements into dates.
#
# The date string is "year", "year-month" or "year-month-day"; the day is
# only considered when a month is present. publication_date becomes type
# "published", anything else "confirmed".
#
# @param doc [Nokogiri::XML::Element] document element
# @return [Array<RelatonBib::BibliographicDate>]
def fetch_date(doc)
  doc.xpath("publication_date|approval_date").map do |node|
    parts = [node.at("year").text]
    month = node.at("month")
    if month
      parts << month.text
      day = node.at("day")
      parts << day.text if day
    end
    kind = node.name == "publication_date" ? "published" : "confirmed"
    RelatonBib::BibliographicDate.new(type: kind, on: parts.join("-"))
  end
end

#fetch_docid(doc) ⇒ Array<RelatonBib::DocumentIdentifier>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


70
71
72
73
74
# File 'lib/relaton_nist/data_fetcher.rb', line 70

# Wrap every identifier hash produced by #parse_docid in a
# RelatonBib::DocumentIdentifier.
#
# @param doc [Nokogiri::XML::Element] document element
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(doc)
  identifiers = parse_docid(doc)
  identifiers.map { |attrs| RelatonBib::DocumentIdentifier.new(**attrs) }
end

#fetch_doi(doc) ⇒ Object

rubocop:disable Metrics/CyclomaticComplexity



51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/relaton_nist/data_fetcher.rb', line 51

# Read the DOI from "doi_data/doi", substituting a fixed replacement for six
# specific DOI values (presumably corrections of bad source records —
# confirm with the dataset maintainers).
#
# @param doc [Nokogiri::XML::Element] document element
# @return [String] the DOI, replaced when it is one of the listed values
def fetch_doi(doc)
  raw = doc.at("doi_data/doi").text
  replacements = {
    "10.6028/NBS.CIRC.e2e" => "10.6028/NBS.CIRC.2e2",
    "10.6028/NBS.CIRC.sup" => "10.6028/NBS.CIRC.24e7sup",
    "10.6028/NBS.CIRC.supJun1925-Jun1926" => "10.6028/NBS.CIRC.24e7sup2",
    "10.6028/NBS.CIRC.supJun1925-Jun1927" => "10.6028/NBS.CIRC.24e7sup3",
    "10.6028/NBS.CIRC.24supJuly1922" => "10.6028/NBS.CIRC.24e6sup",
    "10.6028/NBS.CIRC.24supJan1924" => "10.6028/NBS.CIRC.24e6sup2",
  }
  replacements.fetch(raw, raw)
end

#fetch_edition(doc) ⇒ String

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (String)


104
105
106
# File 'lib/relaton_nist/data_fetcher.rb', line 104

# Read the text of the "edition_number" element, if there is one.
#
# @param doc [Nokogiri::XML::Element] document element
# @return [String, nil] edition text, or nil when the element is absent
def fetch_edition(doc)
  edition = doc.at("edition_number")
  edition.nil? ? nil : edition.text
end

#fetch_link(doc) ⇒ Array<RelatonBib::TypedUri>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::TypedUri>)


126
127
128
129
130
131
132
# File 'lib/relaton_nist/data_fetcher.rb', line 126

# Build the two links of a document: the DOI resolver URL and the resource
# URL read from "doi_data/resource" (stored with type "pdf").
#
# @param doc [Nokogiri::XML::Element] document element
# @return [Array<RelatonBib::TypedUri>] the "doi" link followed by the "pdf" link
def fetch_link(doc)
  pdf_uri = doc.at("doi_data/resource").text
  doi_uri = "https://doi.org/#{fetch_doi(doc)}"
  [
    RelatonBib::TypedUri.new(type: "doi", content: doi_uri),
    RelatonBib::TypedUri.new(type: "pdf", content: pdf_uri),
  ]
end

#fetch_place(doc) ⇒ Array<String>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<String>)


267
268
269
# File 'lib/relaton_nist/data_fetcher.rb', line 267

# List the text of every "institution/institution_place" element.
#
# @param doc [Nokogiri::XML::Element] document element
# @return [Array<String>] place names in document order
def fetch_place(doc)
  places = doc.xpath("institution/institution_place")
  places.map { |place| place.text }
end

#fetch_relation(doc) ⇒ Array<Hash>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<Hash>)


110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/relaton_nist/data_fetcher.rb', line 110

# Convert the Crossref related_item elements of a document into relations.
#
# The related document id is the relation's DOI text with everything up to
# the first "/" removed and dots replaced by spaces. The relation type is
# looked up in RELATION_TYPES; an unknown type is reported with a warning
# and the relation is still returned with a nil type.
#
# @param doc [Nokogiri::XML::Element] document element
# @return [Array<Hash>] hashes with :type and :bibitem keys
def fetch_relation(doc) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  bindings = { ns: "http://www.crossref.org/relations.xsd" }
  doc.xpath("./ns:program/ns:related_item", bindings).map do |item|
    target = item.at_xpath("ns:intra_work_relation|ns:inter_work_relation", bindings)
    related_id = target.text.split("/")[1..].join("/").gsub(".", " ")
    bibitem = RelatonBib::BibliographicItem.new(
      formattedref: RelatonBib::FormattedRef.new(content: related_id),
      docid: [RelatonBib::DocumentIdentifier.new(type: "NIST", id: related_id, primary: true)],
    )
    source_type = target["relationship-type"]
    type = RELATION_TYPES[source_type]
    warn "Relation type #{source_type} not found" unless type
    { type: type, bibitem: bibitem }
  end
end

#fetch_series(doc) ⇒ Array<RelatonBib::Series>

Fetches series

Parameters:

  • doc (Nokogiri::XML::Element)

    document element

Returns:

  • (Array<RelatonBib::Series>)

    series



278
279
280
281
282
283
284
# File 'lib/relaton_nist/data_fetcher.rb', line 278

# Build the series entry of a document from its publication id.
#
# The id is split on whitespace into prefix, series abbreviation and number.
# The series title is the prefix followed by the name found for the
# abbreviation in the #series table, or by the abbreviation itself when the
# table has no entry.
#
# @param doc [Nokogiri::XML::Element] document element
# @return [Array<RelatonBib::Series>] a single-element array
def fetch_series(doc)
  prefix, abbreviation, number = pub_id(doc).split
  full_name = series[abbreviation] || abbreviation
  [
    RelatonBib::Series.new(
      title: RelatonBib::TypedTitleString.new(content: "#{prefix} #{full_name}"),
      abbreviation: RelatonBib::LocalizedString.new(abbreviation),
      number: number,
    ),
  ]
end

#fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (RelatonBib::TypedTitleStringCollection, Array)


78
79
80
81
82
83
84
85
# File 'lib/relaton_nist/data_fetcher.rb', line 78

# Build the title of a document by concatenating the text of its
# "titles/title" and "titles/subtitle" elements.
#
# @param doc [Nokogiri::XML::Element] document element
# @return [Array<Hash>] empty when no title element exists, otherwise a
#   single hash with :content, :language, :script and :format
def fetch_title(doc)
  parts = doc.xpath("titles/title|titles/subtitle")
  return [] if parts.none?

  text = parts.map(&:text).join
  [{ content: text, language: "en", script: "Latn", format: "text/plain" }]
end

#forename(doc, cnt, init = nil) ⇒ RelatonBib::Forename

Create forename object

Parameters:

  • doc (Nokogiri::XML::Element)

    document element

  • cnt (String, nil)

    forename content

  • init (String, nil) (defaults to: nil)

    initial content

Returns:

  • (RelatonBib::Forename)

    forename object



211
212
213
214
215
216
217
# File 'lib/relaton_nist/data_fetcher.rb', line 211

# Create a forename object, unless both the content and the initial are
# missing (nil or empty).
#
# @param doc [Nokogiri::XML::Element] document element; supplies "language"
# @param cnt [String, nil] forename content
# @param init [String, nil] initial content
# @return [RelatonBib::Forename, nil] forename object, or nil when there is
#   neither content nor initial
def forename(doc, cnt, init = nil)
  return nil if [cnt, init].all? { |value| value.nil? || value.empty? }

  RelatonBib::Forename.new(
    content: cnt,
    initial: init,
    language: doc["language"],
    script: "Latn",
  )
end

#forename_initial(person, doc) ⇒ Array<Array<RelatonBib::LocalizedString>>

Create forename and initials objects from person name element.

Parameters:

  • person (Nokogiri::XML::Element)

    person name element

  • doc (Nokogiri::XML::Element)

    document element

Returns:

  • (Array<Array<RelatonBib::LocalizedString>>)

    forename and initials



185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/relaton_nist/data_fetcher.rb', line 185

# Create forename and initials objects from a person name element.
#
# When the given name consists only of initials (single word characters
# followed by dots/whitespace, or a trailing run of one or two capitals), the
# first forename carries the whole given name plus the first initial, and one
# further forename is added per remaining initial. Otherwise the given name
# becomes a single forename without an initial.
#
# @param person [Nokogiri::XML::Element] person name element
# @param doc [Nokogiri::XML::Element] document element
# @return [Array] two-element array: the list of forenames and the localized
#   initials string (nil when the given name is not made of initials)
def forename_initial(person, doc) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  given = person.at("given_name")&.text
  matched = given&.match(/^(?<inits>(?:\w[.\s]+|[A-Z]{1,2}$)+)$/)
  letters = matched && matched[:inits]

  names =
    if letters
      # Splitting on an optionally-empty separator yields one entry per initial.
      first, *others = letters.split(/[.\s]*/)
      [forename(doc, given, first)] + others.map { |letter| forename(doc, nil, letter) }
    elsif given
      [forename(doc, given)].compact
    else
      []
    end

  initials = localized_string(letters, doc) unless letters.nil? || letters.empty?
  [names, initials]
end

#fullname(person, doc) ⇒ RelatonBib::FullName

Create full name object from person name element.

Parameters:

  • person (Nokogiri::XML::Element)

    name element

  • doc (Nokogiri::XML::Element)

    document element

Returns:

  • (RelatonBib::FullName)

    full name object



167
168
169
170
171
172
173
174
175
# File 'lib/relaton_nist/data_fetcher.rb', line 167

# Create a full name object from a person name element.
#
# @param person [Nokogiri::XML::Element] name element; its "surname" child
#   must exist, its "ORCID" children become identifiers of type "orcid"
# @param doc [Nokogiri::XML::Element] document element
# @return [RelatonBib::FullName] full name object
def fullname(person, doc)
  forenames, initials = forename_initial(person, doc)
  family = localized_string(person.at("surname").text, doc)
  orcids = person.xpath("ORCID").map do |node|
    RelatonBib::PersonIdentifier.new("orcid", node.text)
  end
  RelatonBib::FullName.new(
    surname: family,
    forename: forenames,
    initials: initials,
    identifier: orcids,
  )
end

#index ⇒ Object



27
28
29
# File 'lib/relaton_nist/data_fetcher.rb', line 27

# Return the :nist index backed by "index-v1.yaml", creating or loading it on
# first use and reusing the same object afterwards.
#
# @return [Object] whatever Relaton::Index.find_or_create returns
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:nist, file: "index-v1.yaml")
end

#localized_string(content, doc) ⇒ RelatonBib::LocalizedString

Create localized string

Parameters:

  • content (String)

    content of string

  • doc (Nokogiri::XML::Element)

    XML element

Returns:

  • (RelatonBib::LocalizedString)

    localized string



227
228
229
# File 'lib/relaton_nist/data_fetcher.rb', line 227

# Create a localized string in Latin script, using the "language" attribute
# of the given element.
#
# @param content [String] content of the string
# @param doc [Nokogiri::XML::Element] element supplying the language
# @return [RelatonBib::LocalizedString] localized string
def localized_string(content, doc)
  language = doc["language"]
  RelatonBib::LocalizedString.new(content, language, "Latn")
end

#parse_doc(doc) ⇒ Object

Create a document instance and save it.

Parameters:

  • doc (Nokogiri::XML::Element)

Raises:

  • (StandardError)


319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# File 'lib/relaton_nist/data_fetcher.rb', line 319

# Create a bibliographic item from one metadata record and save it.
#
# Failures are deliberately not propagated: any StandardError raised while
# building or writing the item is reported on stderr (the record's DOI, the
# message and the first six backtrace lines) so that one bad record does not
# stop the caller's loop over the remaining records.
#
# @param doc [Nokogiri::XML::Element] report-paper metadata element
# @return [Object, nil] result of #write_file, or nil when the record failed
def parse_doc(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  item = RelatonNist::NistBibliographicItem.new(
    type: "standard", docid: fetch_docid(doc),
    title: fetch_title(doc), link: fetch_link(doc), abstract: fetch_abstract(doc),
    date: fetch_date(doc), edition: fetch_edition(doc),
    contributor: fetch_contributor(doc), relation: fetch_relation(doc),
    place: fetch_place(doc), series: fetch_series(doc),
    language: [doc["language"]], script: ["Latn"], doctype: "standard"
  )
  write_file item
rescue StandardError => e
  # Safe navigation: the record may be the very one that lacks a <doi>
  # element; the handler itself must not raise in that case.
  warn "Document: #{doc.at('doi')&.text}"
  warn e.message
  warn e.backtrace[0..5].join("\n")
end

#parse_docid(doc) ⇒ Object

rubocop:disable Metrics/AbcSize, Metrics/MethodLength



31
32
33
34
35
36
37
# File 'lib/relaton_nist/data_fetcher.rb', line 31

# Describe the identifiers of a document as attribute hashes: the primary
# "NIST" identifier (from #pub_id) followed by the "DOI" identifier.
#
# @param doc [Nokogiri::XML::Element] document element
# @return [Array<Hash>] identifier attributes
def parse_docid(doc)
  primary = { type: "NIST", id: pub_id(doc), primary: true }
  doi = { type: "DOI", id: fetch_doi(doc) }
  [primary, doi]
end

#pub_id(doc) ⇒ String

Parse document’s ID from XML

Parameters:

  • doc (Nokogiri::XML::Element)

    XML element

Returns:

  • (String)

    document’s ID



46
47
48
49
# File 'lib/relaton_nist/data_fetcher.rb', line 46

# Derive the document's ID from its DOI: drop everything up to the first "/",
# turn dots into spaces and upcase a leading "nist ir" to "NIST IR".
#
# @param doc [Nokogiri::XML::Element] XML element
# @return [String] document's ID
def pub_id(doc)
  segments = fetch_doi(doc).split("/")
  suffix = segments[1..].join("/")
  suffix.tr(".", " ").sub(/^nist\sir/, "NIST IR")
end

#series ⇒ Object



286
287
288
# File 'lib/relaton_nist/data_fetcher.rb', line 286

# Return the content of "series.yaml" (located next to this source file),
# loading it on first use and reusing the loaded value afterwards.
#
# @return [Object] parsed YAML; #fetch_series indexes it by series abbreviation
def series
  return @series if @series

  path = File.expand_path("series.yaml", __dir__)
  @series = YAML.load_file(path)
end

#write_file(bib) ⇒ Object

Save document



295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# File 'lib/relaton_nist/data_fetcher.rb', line 295

# Serialize a document, register it in the index and write it to disk.
#
# The file name is the first document identifier with "/", whitespace, ":"
# and "." replaced by "_", upcased, and a leading "NIST_IR" collapsed to
# "NISTIR". When the same path was already written during this run a warning
# is printed and the file is overwritten anyway.
#
# @param bib [RelatonNist::NistBibliographicItem] document to save
# @return [Integer] result of File.write
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = bib.docidentifier[0].id
  basename = docid.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
  path = File.join(@output, "#{basename}.#{@ext}")

  if @files.include?(path)
    warn "File #{path} exists. Docid: #{docid}"
  else
    @files << path
  end

  content =
    if @format == "yaml"
      bib.to_hash.to_yaml
    elsif @format == "xml"
      bib.to_xml(bibdata: true)
    else
      # Any other format is dispatched to the matching to_<format> method.
      bib.send("to_#{@format}")
    end

  index.add_or_update(docid, path)
  File.write(path, content, encoding: "UTF-8")
end