Class: RelatonNist::DataFetcher
- Inherits:
-
Object
- Object
- RelatonNist::DataFetcher
- Defined in:
- lib/relaton_nist/data_fetcher.rb
Constant Summary collapse
- RELATION_TYPES =
{ "replaces" => "obsoletes", "isVersionOf" => "editionOf", "hasTranslation" => "hasTranslation", "isTranslationOf" => "translatedFrom", "hasPreprint" => "hasReprint", "isSupplementTo" => "complements", "isPartOf" => "partOf", "hasPart" => "hasPart", }.freeze
- URL =
"https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml"
Class Method Summary collapse
-
.fetch(output: "data", format: "yaml") ⇒ Object
Fetch all the documents from the dataset.
Instance Method Summary collapse
-
#affiliation(doc) ⇒ Array<RelatonBib::Affiliation>
Create affiliation organization.
-
#create_org(pub) ⇒ RelatonBib::Organization
Create publisher organization.
-
#fetch ⇒ Object
Fetch all the documents from the dataset.
- #fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>
- #fetch_contributor(doc) ⇒ Array<Hash>
- #fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>
- #fetch_docid(doc) ⇒ Array<RelatonBib::DocumentIdentifier>
-
#fetch_doi(doc) ⇒ Object
Return the document's DOI, substituting replacement values for a fixed set of DOIs.
- #fetch_edition(doc) ⇒ String
- #fetch_link(doc) ⇒ Array<RelatonBib::TypedUri>
- #fetch_place(doc) ⇒ Array<String>
- #fetch_relation(doc) ⇒ Array<Hash>
-
#fetch_series(doc) ⇒ Array<RelatonBib::Series>
Fetches series.
- #fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array
-
#initialize(output, format) ⇒ DataFetcher
constructor
A new instance of DataFetcher.
-
#parse_doc(doc) ⇒ Object
Create a document instance and save it.
-
#parse_docid(doc) ⇒ Object
Build the document identifier attributes: the primary NIST id and the DOI.
- #pub_id(doc) ⇒ Object
-
#write_file(bib) ⇒ Object
Save document.
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Returns a new instance of DataFetcher.
19 20 21 22 23 24 |
# File 'lib/relaton_nist/data_fetcher.rb', line 19
#
# Set up a fetcher that writes documents into +output+ using +format+.
#
# @param output [String] directory the document files are written to
# @param format [String] output format name; a leading "bib" is removed to
#   obtain the file extension (e.g. "bibxml" gives "xml")
def initialize(output, format)
  @output = output
  @format = format
  # \A anchors at the very start of the string; ^ would also match right
  # after any embedded newline.
  @ext = format.sub(/\Abib/, "")
  @files = []
end
Class Method Details
.fetch(output: "data", format: "yaml") ⇒ Object
Fetch all the documents from the dataset
296 297 298 |
# File 'lib/relaton_nist/data_fetcher.rb', line 296
#
# Build a fetcher for the given destination and run it.
#
# @param output [String] directory passed to the constructor
# @param format [String] format name passed to the constructor
# @return [Object] whatever the instance-level #fetch returns
def self.fetch(output: "data", format: "yaml")
  fetcher = new(output, format)
  fetcher.fetch
end
Instance Method Details
#affiliation(doc) ⇒ Array<RelatonBib::Affiliation>
Create affiliation organization
194 195 196 197 198 199 |
# File 'lib/relaton_nist/data_fetcher.rb', line 194
#
# Build one affiliation per <institution_department> found under the
# document's <institution> elements.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<RelatonBib::Affiliation>]
def affiliation(doc)
  departments = doc.xpath("./institution/institution_department")
  departments.map do |dept|
    RelatonBib::Affiliation.new(
      organization: RelatonBib::Organization.new(name: dept.text),
    )
  end
end
#create_org(pub) ⇒ RelatonBib::Organization
Create publisher organization
174 175 176 177 178 179 180 181 182 183 184 185 |
# File 'lib/relaton_nist/data_fetcher.rb', line 174
#
# Build the publisher organization from a <publisher> node.
#
# The acronym, and the place when the publisher has no <publisher_place>,
# are read from the sibling <institution> whose <institution_name> equals
# the publisher name.
#
# @param pub [Nokogiri::XML::Element] publisher node
# @return [RelatonBib::Organization]
def create_org(pub)
  name = pub.at("publisher_name").text
  # NOTE(review): the name is interpolated into the XPath inside single
  # quotes, so a name containing ' would yield an invalid expression --
  # confirm whether such names occur in the dataset.
  institution = "../institution[institution_name[.='#{name}']]"
  abbr = pub.at("#{institution}/institution_acronym")&.text
  place = pub.at("./publisher_place") || pub.at("#{institution}/institution_place")
  contact =
    if place
      # The place text is split on ", " into city and state.
      city, state = place.text.split(", ")
      [RelatonBib::Address.new(street: [], city: city, state: state, country: "US")]
    else
      []
    end
  RelatonBib::Organization.new name: name, abbreviation: abbr, contact: contact
end
#fetch ⇒ Object
Fetch all the documents from the dataset
272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 |
# File 'lib/relaton_nist/data_fetcher.rb', line 272
#
# Download the dataset at URL, clear previously generated files with the
# current extension from the output directory, and hand every
# report-paper_metadata node to #parse_doc. Start/stop times and the elapsed
# seconds are printed to stdout.
#
# Any StandardError is reported on stderr (message plus the first six
# backtrace lines) instead of being raised.
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  t1 = Time.now
  puts "Started at: #{t1}"

  # The block form closes the download stream once parsing is done.
  docs = OpenURI.open_uri(URL) { |io| Nokogiri::XML(io) }

  # mkdir_p is a no-op for an existing directory and also creates parents.
  FileUtils.mkdir_p @output
  FileUtils.rm Dir[File.join(@output, "*.#{@ext}")]

  docs.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
    .each { |doc| parse_doc doc }

  t2 = Time.now
  puts "Stopped at: #{t2}"
  puts "Done in: #{(t2 - t1).round} sec."
rescue StandardError => e
  warn e.message
  warn e.backtrace[0..5].join("\n")
end
#fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>
132 133 134 135 136 |
# File 'lib/relaton_nist/data_fetcher.rb', line 132
#
# Turn every JATS abstract paragraph of the document into a formatted string
# tagged with the document's "language" attribute and the "Latn" script.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<RelatonBib::FormattedString>]
def fetch_abstract(doc)
  jats_ns = { "jats" => "http://www.ncbi.nlm.nih.gov/JATS1" }
  paragraphs = doc.xpath("jats:abstract/jats:p", jats_ns)
  paragraphs.map do |para|
    RelatonBib::FormattedString.new(
      content: para.text, language: doc["language"], script: "Latn",
    )
  end
end
#fetch_contributor(doc) ⇒ Array<Hash>
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
# File 'lib/relaton_nist/data_fetcher.rb', line 140
#
# Collect the document's contributors: one entry per <person_name> (with the
# role taken from its "contributor_role" attribute) followed by one
# "publisher" entry per <publisher>.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<Hash>] hashes with :entity and :role keys
def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  people = doc.xpath("contributors/person_name").map do |person_node|
    forenames = []
    initials = []
    given_words = person_node.at("given_name")&.text&.split
    given_words&.each do |word|
      # A single word character with an optional trailing dot is an initial;
      # every other word of the given name is a forename.
      if /^(?<init>\w)\.?$/ =~ word
        initials << RelatonBib::LocalizedString.new(init, doc["language"], "Latn")
      else
        forenames << RelatonBib::LocalizedString.new(word, doc["language"], "Latn")
      end
    end

    family = person_node.at("surname").text
    surname = RelatonBib::LocalizedString.new family, doc["language"], "Latn"
    orcids = person_node.xpath("ORCID").map do |orcid|
      RelatonBib::PersonIdentifier.new "orcid", orcid.text
    end
    fullname = RelatonBib::FullName.new(
      surname: surname, forename: forenames, initial: initials, identifier: orcids,
    )
    person = RelatonBib::Person.new name: fullname, affiliation: affiliation(doc)
    { entity: person, role: [{ type: person_node["contributor_role"] }] }
  end

  publishers = doc.xpath("publisher").map do |pub|
    { entity: create_org(pub), role: [{ type: "publisher" }] }
  end

  people + publishers
end
#fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>
85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/relaton_nist/data_fetcher.rb', line 85
#
# Convert <publication_date> and <approval_date> nodes into bibliographic
# dates of type "published" and "confirmed" respectively. The date string is
# the year, plus "-month" when a month is present, plus "-day" when a day is
# present as well.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<RelatonBib::BibliographicDate>]
def fetch_date(doc)
  doc.xpath("publication_date|approval_date").map do |date_node|
    parts = [date_node.at("year").text]
    month = date_node.at("month")
    if month
      parts << month.text
      # A day is only considered when a month exists.
      day = date_node.at("day")
      parts << day.text if day
    end
    type = date_node.name == "publication_date" ? "published" : "confirmed"
    RelatonBib::BibliographicDate.new(type: type, on: parts.join("-"))
  end
end
#fetch_docid(doc) ⇒ Array<RelatonBib::DocumentIdentifier>
66 67 68 69 70 |
# File 'lib/relaton_nist/data_fetcher.rb', line 66
#
# Wrap each attribute hash produced by #parse_docid in a document identifier.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(doc)
  parse_docid(doc).map { |attrs| RelatonBib::DocumentIdentifier.new(**attrs) }
end
#fetch_doi(doc) ⇒ Object
Return the document's DOI, substituting replacement values for a fixed set of DOIs
47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/relaton_nist/data_fetcher.rb', line 47
#
# Read the DOI from doi_data/doi. A fixed set of DOI values is replaced by
# the listed substitutes; any other value is returned unchanged.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [String] the DOI
def fetch_doi(doc)
  raw = doc.at("doi_data/doi").text
  replacements = {
    "10.6028/NBS.CIRC.e2e" => "10.6028/NBS.CIRC.2e2",
    "10.6028/NBS.CIRC.sup" => "10.6028/NBS.CIRC.24e7sup",
    "10.6028/NBS.CIRC.supJun1925-Jun1926" => "10.6028/NBS.CIRC.24e7sup2",
    "10.6028/NBS.CIRC.supJun1925-Jun1927" => "10.6028/NBS.CIRC.24e7sup3",
    "10.6028/NBS.CIRC.24supJuly1922" => "10.6028/NBS.CIRC.24e6sup",
    "10.6028/NBS.CIRC.24supJan1924" => "10.6028/NBS.CIRC.24e6sup2",
  }
  replacements.fetch(raw, raw)
end
#fetch_edition(doc) ⇒ String
100 101 102 |
# File 'lib/relaton_nist/data_fetcher.rb', line 100
#
# Return the text of the document's <edition_number>, if there is one.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [String, nil] nil when the element is absent
def fetch_edition(doc)
  edition = doc.at("edition_number")
  edition&.text
end
#fetch_link(doc) ⇒ Array<RelatonBib::TypedUri>
122 123 124 125 126 127 128 |
# File 'lib/relaton_nist/data_fetcher.rb', line 122
#
# Build the document links: a "doi" link made from "https://doi.org/" plus
# the DOI, followed by a "pdf" link holding the doi_data/resource text.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<RelatonBib::TypedUri>]
def fetch_link(doc)
  pdf_url = doc.at("doi_data/resource").text
  doi_url = "https://doi.org/#{fetch_doi(doc)}"
  [
    RelatonBib::TypedUri.new(type: "doi", content: doi_url),
    RelatonBib::TypedUri.new(type: "pdf", content: pdf_url),
  ]
end
#fetch_place(doc) ⇒ Array<String>
203 204 205 |
# File 'lib/relaton_nist/data_fetcher.rb', line 203
#
# List the text of every institution/institution_place node of the document.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<String>]
def fetch_place(doc)
  places = doc.xpath("institution/institution_place")
  places.map { |place| place.text }
end
#fetch_relation(doc) ⇒ Array<Hash>
106 107 108 109 110 111 112 113 114 115 116 117 118 |
# File 'lib/relaton_nist/data_fetcher.rb', line 106
#
# Build relations from the Crossref <related_item> entries of the document.
#
# The related document id is the relation text with everything up to the
# first "/" removed and dots replaced by spaces. The relation type is looked
# up in RELATION_TYPES; an unknown type produces a warning and a nil :type.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<Hash>] hashes with :type and :bibitem keys
def fetch_relation(doc) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  rel_ns = "http://www.crossref.org/relations.xsd"
  related_items = doc.xpath("./ns:program/ns:related_item", ns: rel_ns)
  related_items.map do |item|
    relation = item.at_xpath("ns:intra_work_relation|ns:inter_work_relation", ns: rel_ns)
    segments = relation.text.split("/")
    related_id = segments[1..].join("/").tr(".", " ")
    bibitem = RelatonBib::BibliographicItem.new(
      formattedref: RelatonBib::FormattedRef.new(content: related_id),
      docid: [RelatonBib::DocumentIdentifier.new(type: "NIST", id: related_id, primary: true)],
    )
    type = RELATION_TYPES[relation["relationship-type"]]
    warn "Relation type #{relation['relationship-type']} not found" unless type
    { type: type, bibitem: bibitem }
  end
end
#fetch_series(doc) ⇒ Array<RelatonBib::Series>
Fetches series
214 215 216 217 218 219 220 221 |
# File 'lib/relaton_nist/data_fetcher.rb', line 214
#
# Build the series entry of a document from its publication id.
#
# The first two whitespace-separated words of the id are used: the series
# title is the first word plus the name found for the second word in
# series.yaml (or the second word itself when the file has no entry for it);
# the series number is the two words joined by a space.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<RelatonBib::Series>] a one-element array
def fetch_series(doc)
  # series.yaml sits next to this source file. It is loaded once and reused,
  # since it does not depend on the document being processed.
  @series_names ||= YAML.load_file(File.expand_path("series.yaml", __dir__))
  prf, srs, = pub_id(doc).split
  sname = @series_names[srs] || srs
  title = RelatonBib::TypedTitleString.new(content: "#{prf} #{sname}")
  [RelatonBib::Series.new(title: title, number: "#{prf} #{srs}")]
end
#fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array
74 75 76 77 78 79 80 81 |
# File 'lib/relaton_nist/data_fetcher.rb', line 74
#
# Build the title of the document by concatenating the text of its <title>
# and <subtitle> nodes (no separator is inserted).
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<Hash>] one plain-text English title hash, or an empty array
#   when the document has neither node
def fetch_title(doc)
  nodes = doc.xpath("titles/title|titles/subtitle")
  return [] if nodes.none?

  text = nodes.map { |node| node.text }.join
  [{ content: text, language: "en", script: "Latn", format: "text/plain" }]
end
#parse_doc(doc) ⇒ Object
Create a document instance and save it.
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 |
# File 'lib/relaton_nist/data_fetcher.rb', line 251
#
# Build a bibliographic item from one document metadata node and write it to
# a file.
#
# A StandardError raised while building or writing the item is reported on
# stderr (document DOI, error message, first six backtrace lines) and is not
# re-raised.
#
# @param doc [Nokogiri::XML::Element] document metadata node
def parse_doc(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  item = RelatonNist::NistBibliographicItem.new(
    fetched: Date.today.to_s,
    type: "standard",
    docid: fetch_docid(doc),
    title: fetch_title(doc),
    link: fetch_link(doc),
    abstract: fetch_abstract(doc),
    date: fetch_date(doc),
    edition: fetch_edition(doc),
    contributor: fetch_contributor(doc),
    relation: fetch_relation(doc),
    place: fetch_place(doc),
    series: fetch_series(doc),
    language: [doc["language"]],
    script: ["Latn"],
    doctype: "standard",
  )
  write_file item
rescue StandardError => e
  # Safe navigation keeps the handler itself from raising when the node has
  # no <doi> descendant.
  warn "Document: #{doc.at('doi')&.text}"
  warn e.message
  warn e.backtrace[0..5].join("\n")
end
#parse_docid(doc) ⇒ Object
Build the document identifier attributes: the primary NIST id and the DOI
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/relaton_nist/data_fetcher.rb', line 26
#
# Produce the attribute hashes for the document's identifiers: the primary
# "NIST" id from #pub_id followed by the "DOI" id from #fetch_doi.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [Array<Hash>] identifier attributes with :type, :id and, for the
#   NIST id, :primary
def parse_docid(doc)
  nist_id = { type: "NIST", id: pub_id(doc), primary: true }
  doi_id = { type: "DOI", id: fetch_doi(doc) }
  [nist_id, doi_id]
end
#pub_id(doc) ⇒ Object
42 43 44 45 |
# File 'lib/relaton_nist/data_fetcher.rb', line 42
#
# Derive the publication id from the DOI: everything up to the first "/" is
# dropped and each "." in the remainder becomes a space.
#
# @param doc [Nokogiri::XML::Element] document metadata node
# @return [String]
def pub_id(doc)
  segments = fetch_doi(doc).split("/")
  segments[1..].join("/").tr(".", " ")
end
#write_file(bib) ⇒ Object
Save document
228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
# File 'lib/relaton_nist/data_fetcher.rb', line 228
#
# Serialize a bibliographic item and write it into the output directory.
#
# The file name is the first document identifier with "/", whitespace, ":"
# and "." replaced by "_", upcased, and a leading "NIST_IR" turned into
# "NISTIR". A warning is printed when the same path was already written
# during this run; the file is written (overwritten) regardless.
#
# @param bib [RelatonNist::NistBibliographicItem] item to save
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = bib.docidentifier[0].id
  basename = docid.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
  path = File.join(@output, "#{basename}.#{@ext}")

  if @files.include? path
    warn "File #{path} exists. Docid: #{docid}"
  else
    @files << path
  end

  # "yaml" and "xml" have dedicated serializers; any other format is sent to
  # the item as "to_<format>".
  content =
    case @format
    when "yaml"
      bib.to_hash.to_yaml
    when "xml"
      bib.to_xml(bibdata: true)
    else
      bib.send("to_#{@format}")
    end
  File.write path, content, encoding: "UTF-8"
end