Class: RelatonNist::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_nist/data_fetcher.rb

Constant Summary collapse

# Lookup table used by #fetch_relation: the key is the "relationship-type"
# attribute read from a related-item element of the source XML, the value is
# the relation type stored in the resulting relation hash. A key that is
# missing here makes #fetch_relation emit a warning and yield a nil type.
RELATION_TYPES =
{
  "replaces" => "obsoletes",
  "isVersionOf" => "editionOf",
  "hasTranslation" => "hasTranslation",
  "isTranslationOf" => "translatedFrom",
  "hasPreprint" => "hasReprint",
  "isSupplementTo" => "complements",
  "isPartOf" => "partOf",
  "hasPart" => "hasPart",
}.freeze
# Address of the XML dataset that #fetch downloads and parses.
URL =
"https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml"

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



19
20
21
22
23
24
# File 'lib/relaton_nist/data_fetcher.rb', line 19

# Prepare a fetcher instance.
#
# @param output [String] value stored as the output location
# @param format [String] serialization format; a leading "bib" is removed
#   to obtain the file extension (e.g. "bibxml" -> "xml")
def initialize(output, format)
  @output, @format = output, format
  # The extension is the format name without its "bib" prefix.
  @ext = format.sub(/^bib/, "")
  # Starts empty; no files are tracked yet.
  @files = []
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object

Fetch all the documents from the dataset

Parameters:

  • output (String) (defaults to: "data")

    folder name to save the documents

  • format (String) (defaults to: "yaml")

    format to save the documents (yaml, xml, bibxml)



260
261
262
# File 'lib/relaton_nist/data_fetcher.rb', line 260

# Convenience entry point: build a fetcher and run it.
#
# @param output [String] passed to the constructor as the output location
# @param format [String] passed to the constructor as the format
# @return [Object] whatever the instance-level #fetch returns
def self.fetch(output: "data", format: "yaml")
  fetcher = new(output, format)
  fetcher.fetch
end

Instance Method Details

#affiliation(doc) ⇒ Object



169
170
171
172
173
174
# File 'lib/relaton_nist/data_fetcher.rb', line 169

# Build one affiliation per institution department element of the document.
#
# @param doc [#xpath] element queried with "./institution/institution_department"
# @return [Array<RelatonBib::Affiliation>] one entry per matched element,
#   each wrapping an organization named after the element's text
def affiliation(doc)
  departments = doc.xpath("./institution/institution_department")
  departments.map do |department|
    RelatonBib::Affiliation.new(
      organization: RelatonBib::Organization.new(name: department.text),
    )
  end
end

#anchor(doc) ⇒ Object



59
60
61
# File 'lib/relaton_nist/data_fetcher.rb', line 59

# Derive the anchor from the document DOI by dropping everything up to and
# including the first "/" (e.g. "10.6028/NIST.IR.8200" -> "NIST.IR.8200").
#
# @param doc [Object] document element, passed through to #doi
# @return [String] the DOI without its first slash-separated segment
def anchor(doc)
  segments = doi(doc).split("/")
  # Skip the first segment and glue the remainder back together.
  segments[1, segments.size].join("/")
end

#doi(doc) ⇒ Object

rubocop:disable Metrics/CyclomaticComplexity



46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/relaton_nist/data_fetcher.rb', line 46

# DOIs that are replaced by a different value before use; any DOI that is
# not a key of this table is returned as read.
DOI_CORRECTIONS = {
  "10.6028/NBS.CIRC.e2e" => "10.6028/NBS.CIRC.2e2",
  "10.6028/NBS.CIRC.sup" => "10.6028/NBS.CIRC.24e7sup",
  "10.6028/NBS.CIRC.supJun1925-Jun1926" => "10.6028/NBS.CIRC.24e7sup2",
  "10.6028/NBS.CIRC.supJun1925-Jun1927" => "10.6028/NBS.CIRC.24e7sup3",
  "10.6028/NBS.CIRC.24supJuly1922" => "10.6028/NBS.CIRC.24e6sup",
  "10.6028/NBS.CIRC.24supJan1924" => "10.6028/NBS.CIRC.24e6sup2",
}.freeze

# Read the DOI of a document, substituting the replacement value for the
# handful of DOIs listed in DOI_CORRECTIONS.
#
# @param doc [#at] element queried with "doi_data/doi"
# @return [String] the replacement DOI when one is listed, otherwise the
#   text of the DOI element unchanged
def doi(doc)
  raw = doc.at("doi_data/doi").text
  DOI_CORRECTIONS.fetch(raw, raw)
end

#fetch ⇒ Object

Fetch all the documents from the dataset



236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/relaton_nist/data_fetcher.rb', line 236

# Download the dataset at URL, parse it and hand every report metadata
# element to #parse_doc.
#
# Before parsing, the output directory is created if needed and files in it
# matching "*.<ext>" are removed. Start time, stop time and elapsed seconds
# are printed to standard output. Any StandardError raised along the way is
# reported with +warn+ (message plus the first backtrace lines) and is not
# re-raised.
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  t1 = Time.now
  puts "Started at: #{t1}"

  docs = Nokogiri::XML OpenURI.open_uri URL
  # mkdir_p also creates missing parent directories and does nothing when the
  # directory already exists. Plain mkdir raised for a nested output path,
  # and the rescue below then silently turned that into "nothing fetched".
  FileUtils.mkdir_p @output
  FileUtils.rm Dir[File.join(@output, "*.#{@ext}")]
  docs.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
    .each { |doc| parse_doc doc }

  t2 = Time.now
  puts "Stopped at: #{t2}"
  puts "Done in: #{(t2 - t1).round} sec."
rescue StandardError => e
  warn e.message
  warn e.backtrace[0..5].join("\n")
end

#fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::FormattedString>)


124
125
126
127
128
# File 'lib/relaton_nist/data_fetcher.rb', line 124

# Collect the abstract paragraphs of a document.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<RelatonBib::FormattedString>] one formatted string per
#   JATS abstract paragraph, carrying the document's "language" attribute
#   and the "Latn" script
def fetch_abstract(doc)
  jats = { "jats" => "http://www.ncbi.nlm.nih.gov/JATS1" }
  paragraphs = doc.xpath("jats:abstract/jats:p", jats)
  paragraphs.map do |paragraph|
    RelatonBib::FormattedString.new(
      content: paragraph.text, language: doc["language"], script: "Latn",
    )
  end
end

#fetch_contributor(doc) ⇒ Array<Hash>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<Hash>)


132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/relaton_nist/data_fetcher.rb', line 132

# Collect the contributors of a document: every person listed under
# "contributors/person_name" followed by every "publisher" element.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<Hash>] hashes with an :entity (person or organization) and
#   a :role array; persons come first, publishers after them
def fetch_contributor(doc)
  persons = doc.xpath("contributors/person_name").map do |person_node|
    person_contributor(person_node, doc)
  end
  publishers = doc.xpath("publisher").map do |publisher_node|
    publisher_contributor(publisher_node)
  end
  persons + publishers
end

# Build the contributor hash for a single person element.
#
# Each whitespace-separated part of the given name that is a single word
# character, optionally followed by a dot, is stored as an initial (without
# the dot); every other part is stored as a forename.
def person_contributor(node, doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  lang = doc["language"]
  forename = []
  initial = []
  given_parts = node.at("given_name")&.text&.split || []
  given_parts.each do |part|
    single = /^(?<init>\w)\.?$/.match(part)
    if single
      initial << RelatonBib::LocalizedString.new(single[:init], lang, "Latn")
    else
      forename << RelatonBib::LocalizedString.new(part, lang, "Latn")
    end
  end
  surname = RelatonBib::LocalizedString.new(node.at("surname").text, lang, "Latn")
  orcids = node.xpath("ORCID").map do |orcid|
    RelatonBib::PersonIdentifier.new("orcid", orcid.text)
  end
  fullname = RelatonBib::FullName.new(
    surname: surname, forename: forename, initial: initial, identifier: orcids,
  )
  person = RelatonBib::Person.new(name: fullname, affiliation: affiliation(doc))
  { entity: person, role: [{ type: node["contributor_role"] }] }
end

# Build the contributor hash for a single publisher element.
#
# The acronym is looked up relative to the publisher's parent. When a
# publisher place is present its text is split on ", " into city and state
# and recorded as an address whose country is "US".
def publisher_contributor(node)
  acronym = node.at("../institution/institution_acronym")&.text
  place = node.at("./publisher_place")
  contact = []
  if place
    city, state = place.text.split(", ")
    contact << RelatonBib::Address.new(street: [], city: city, state: state, country: "US")
  end
  organization = RelatonBib::Organization.new(
    name: node.at("publisher_name").text, abbreviation: acronym, contact: contact,
  )
  { entity: organization, role: [{ type: "publisher" }] }
end

#fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::BibliographicDate>)


82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/relaton_nist/data_fetcher.rb', line 82

# Collect publication and approval dates of a document.
#
# The date string is the year, followed by "-month" when a month element
# exists, followed by "-day" when a day element exists as well. A day is
# only looked at when a month is present.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<RelatonBib::BibliographicDate>] "published" for
#   publication_date elements, "confirmed" for the others
def fetch_date(doc)
  doc.xpath("publication_date|approval_date").map do |node|
    parts = [node.at("year").text]
    month = node.at("month")
    if month
      parts << month.text
      day = node.at("day")
      parts << day.text if day
    end
    kind = node.name == "publication_date" ? "published" : "confirmed"
    RelatonBib::BibliographicDate.new(type: kind, on: parts.join("-"))
  end
end

#fetch_docid(doc) ⇒ Array<RelatonBib::DocumentIdentifier>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


65
66
67
68
69
# File 'lib/relaton_nist/data_fetcher.rb', line 65

# Turn the identifier attribute hashes produced by #parse_docid into
# document identifier objects.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(doc)
  parse_docid(doc).map { |attrs| RelatonBib::DocumentIdentifier.new(**attrs) }
end

#fetch_edition(doc) ⇒ String

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (String)


97
98
99
# File 'lib/relaton_nist/data_fetcher.rb', line 97

# Read the edition number of a document.
#
# @param doc [Nokogiri::XML::Element]
# @return [String, nil] text of the "edition_number" element, or nil when
#   the document has no such element
def fetch_edition(doc)
  edition_node = doc.at("edition_number")
  edition_node&.text
end

#fetch_link(doc) ⇒ Array<RelatonBib::TypedUri>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<RelatonBib::TypedUri>)


117
118
119
120
# File 'lib/relaton_nist/data_fetcher.rb', line 117

# Build the link list of a document from its DOI resource element.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<RelatonBib::TypedUri>] a single URI of type "doi" whose
#   content is the text of "doi_data/resource"
def fetch_link(doc)
  resource = doc.at("doi_data/resource")
  [RelatonBib::TypedUri.new(type: "doi", content: resource.text)]
end

#fetch_place(doc) ⇒ Array<String>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<String>)


178
179
180
# File 'lib/relaton_nist/data_fetcher.rb', line 178

# List the institution places of a document.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<String>] text of every "institution/institution_place"
#   element, in document order
def fetch_place(doc)
  places = doc.xpath("institution/institution_place")
  places.map { |place| place.text }
end

#fetch_relation(doc) ⇒ Array<Hash>

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (Array<Hash>)


103
104
105
106
107
108
109
110
111
112
113
# File 'lib/relaton_nist/data_fetcher.rb', line 103

# Collect the relations declared for a document.
#
# For every related item the intra- or inter-work relation element is read:
# its text becomes the formatted reference of a new bibliographic item and
# its "relationship-type" attribute is translated through RELATION_TYPES.
# An attribute value missing from that table triggers a warning and results
# in a nil :type.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<Hash>] hashes with :type and :bibitem keys
def fetch_relation(doc)
  xmlns = "http://www.crossref.org/relations.xsd"
  related_items = doc.xpath("./ns:program/ns:related_item", ns: xmlns)
  related_items.map do |item|
    relation = item.at_xpath("ns:intra_work_relation|ns:inter_work_relation", ns: xmlns)
    bibitem = RelatonBib::BibliographicItem.new(
      formattedref: RelatonBib::FormattedRef.new(content: relation.text),
    )
    source_type = relation["relationship-type"]
    type = RELATION_TYPES[source_type]
    warn "Relation type #{source_type} not found" unless type
    { type: type, bibitem: bibitem }
  end
end

#fetch_series(doc) ⇒ Object



182
183
184
185
# File 'lib/relaton_nist/data_fetcher.rb', line 182

# Build the series list of a document: a single series titled "NIST" whose
# number is the document's publication id.
#
# @param doc [Object] document element, passed through to #pub_id
# @return [Array<RelatonBib::Series>] a one-element array
def fetch_series(doc)
  series = RelatonBib::Series.new(
    title: RelatonBib::TypedTitleString.new(content: "NIST"),
    number: pub_id(doc),
  )
  [series]
end

#fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array

Parameters:

  • doc (Nokogiri::XML::Element)

Returns:

  • (RelatonBib::TypedTitleStringCollection, Array)


73
74
75
76
77
78
# File 'lib/relaton_nist/data_fetcher.rb', line 73

# Build the titles of a document from its title and subtitle elements.
#
# The texts of all matched elements are concatenated without a separator
# and passed to TypedTitleString.from_string together with "en" and "Latn".
#
# @param doc [Nokogiri::XML::Element]
# @return [RelatonBib::TypedTitleStringCollection, Array] an empty array
#   when the document has neither a title nor a subtitle
def fetch_title(doc)
  nodes = doc.xpath("titles/title|titles/subtitle")
  if nodes.any?
    joined = nodes.map { |node| node.text }.join
    RelatonBib::TypedTitleString.from_string(joined, "en", "Latn")
  else
    []
  end
end

#parse_doc(doc) ⇒ Object

Create a document instance and save it.

Parameters:

  • doc (Nokogiri::XML::Element)

Raises:

  • (StandardError)


215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# File 'lib/relaton_nist/data_fetcher.rb', line 215

# Create a bibliographic item from one report metadata element and save it
# with #write_file.
#
# Any StandardError raised while building or writing the item is reported
# with +warn+ (document DOI, message, first backtrace lines) and is NOT
# re-raised, so one broken record does not stop the remaining ones.
#
# @param doc [Nokogiri::XML::Element]
def parse_doc(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  item = RelatonNist::NistBibliographicItem.new(
    type: "standard", docid: fetch_docid(doc), title: fetch_title(doc),
    link: fetch_link(doc), abstract: fetch_abstract(doc),
    date: fetch_date(doc), edition: fetch_edition(doc),
    contributor: fetch_contributor(doc), relation: fetch_relation(doc),
    place: fetch_place(doc), series: fetch_series(doc),
    language: [doc["language"]], script: ["Latn"], doctype: "standard"
  )
  write_file item
rescue StandardError => e
  # A missing DOI element is one of the ways a record can be broken. Safe
  # navigation keeps the error report itself from raising; otherwise the
  # exception would escape this handler and abort the caller's loop.
  warn "Document: #{doc.at('doi')&.text}"
  warn e.message
  warn e.backtrace[0..5].join("\n")
end

#parse_docid(doc) ⇒ Object

rubocop:disable Metrics/AbcSize, Metrics/MethodLength



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/relaton_nist/data_fetcher.rb', line 26

# Describe the identifiers of a document as attribute hashes.
#
# @param doc [Object] document element, passed through to #pub_id, #doi
#   and #anchor
# @return [Array<Hash>] three hashes in this order: the primary "NIST"
#   identifier, the "DOI" identifier and the "NIST" identifier with scope
#   "anchor"
def parse_docid(doc)
  primary_id = { type: "NIST", id: pub_id(doc), primary: true }
  doi_id = { type: "DOI", id: doi(doc) }
  anchor_id = { type: "NIST", id: anchor(doc), scope: "anchor" }
  [primary_id, doi_id, anchor_id]
end

#pub_id(doc) ⇒ Object



42
43
44
# File 'lib/relaton_nist/data_fetcher.rb', line 42

# Derive the publication id from the anchor by turning every "." into a
# space (e.g. "NIST.IR.8200" -> "NIST IR 8200").
#
# @param doc [Object] document element, passed through to #anchor
# @return [String]
def pub_id(doc)
  anchor(doc).tr(".", " ")
end

#write_file(bib) ⇒ Object

Save document



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/relaton_nist/data_fetcher.rb', line 192

# Serialize a bibliographic item and write it into the output directory.
#
# The file name is the first document identifier with "/", whitespace, ":"
# and "." replaced by "_", upcased, with a leading "NIST_IR" collapsed to
# "NISTIR", plus the configured extension. When a path was already written
# during this run a warning is printed; the file is still written again.
#
# @param bib [#docidentifier] item to save; must respond to the serializer
#   matching the configured format (to_hash, to_xml or "to_<format>")
# @return [Integer] result of File.write
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = bib.docidentifier[0].id
  basename = docid.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
  path = File.join(@output, "#{basename}.#{@ext}")
  if @files.include?(path)
    warn "File #{path} exists. Docid: #{docid}"
  else
    @files << path
  end
  content =
    if @format == "yaml" then bib.to_hash.to_yaml
    elsif @format == "xml" then bib.to_xml(bibdata: true)
    else bib.send("to_#{@format}")
    end
  File.write(path, content, encoding: "UTF-8")
end