Class: RelatonNist::DataFetcher
- Inherits:
-
Object
- Object
- RelatonNist::DataFetcher
- Defined in:
- lib/relaton_nist/data_fetcher.rb
Constant Summary collapse
- RELATION_TYPES =
{ "replaces" => "obsoletes", "isVersionOf" => "editionOf", "hasTranslation" => "hasTranslation", "isTranslationOf" => "translatedFrom", "hasPreprint" => "hasReprint", "isSupplementTo" => "complements", "isPartOf" => "partOf", "hasPart" => "hasPart", }.freeze
- URL =
"https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml"
Class Method Summary collapse
-
.fetch(output: "data", format: "yaml") ⇒ Object
Fetch all the documents from the dataset.
Instance Method Summary collapse
- #affiliation(doc) ⇒ Object
- #anchor(doc) ⇒ Object
-
#doi(doc) ⇒ Object
rubocop:disable Metrics/CyclomaticComplexity.
-
#fetch ⇒ Object
Fetch all the documents from the dataset.
- #fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>
- #fetch_contributor(doc) ⇒ Array<Hash>
- #fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>
- #fetch_docid(doc) ⇒ Array<RelatonBib::DocumentIdentifier>
- #fetch_edition(doc) ⇒ String
- #fetch_link(doc) ⇒ Array<RelatonBib::TypedUri>
- #fetch_place(doc) ⇒ Array<String>
- #fetch_relation(doc) ⇒ Array<Hash>
- #fetch_series(doc) ⇒ Object
- #fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array
-
#initialize(output, format) ⇒ DataFetcher
constructor
A new instance of DataFetcher.
-
#parse_doc(doc) ⇒ Object
Create a document instance and save it.
-
#parse_docid(doc) ⇒ Object
rubocop:disable Metrics/AbcSize, Metrics/MethodLength.
- #pub_id(doc) ⇒ Object
-
#write_file(bib) ⇒ Object
Save document.
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Returns a new instance of DataFetcher.
19 20 21 22 23 24 |
# File 'lib/relaton_nist/data_fetcher.rb', line 19
#
# Build a fetcher that writes its results into a directory.
#
# @param output [String] directory the generated files are written to
# @param format [String] output format name; also used to derive the
#   file extension
def initialize(output, format)
  # Paths written so far; starts empty.
  @files = []
  @output = output
  @format = format
  # A leading "bib" is removed to obtain the extension ("bibxml" -> "xml").
  @ext = format.sub(/^bib/, "")
end
Class Method Details
.fetch(output: "data", format: "yaml") ⇒ Object
Fetch all the documents from the dataset
260 261 262 |
# File 'lib/relaton_nist/data_fetcher.rb', line 260
#
# Convenience entry point: create a fetcher and run it.
#
# @param output [String] output directory, "data" by default
# @param format [String] output format, "yaml" by default
# @return [Object] whatever the instance-level #fetch returns
def self.fetch(output: "data", format: "yaml")
  fetcher = new(output, format)
  fetcher.fetch
end
Instance Method Details
#affiliation(doc) ⇒ Object
169 170 171 172 173 174 |
# File 'lib/relaton_nist/data_fetcher.rb', line 169
#
# Build one affiliation per "institution/institution_department" element
# found under the given node.
#
# @param doc [Object] node responding to #xpath (NOTE(review): presumably a
#   Nokogiri element - confirm against the caller)
# @return [Array<RelatonBib::Affiliation>]
def affiliation(doc)
  departments = doc.xpath("./institution/institution_department")
  departments.map do |department|
    organization = RelatonBib::Organization.new(name: department.text)
    RelatonBib::Affiliation.new(organization: organization)
  end
end
#anchor(doc) ⇒ Object
59 60 61 |
# File 'lib/relaton_nist/data_fetcher.rb', line 59
#
# Return the DOI without its first "/"-separated segment.
#
# @param doc [Object] node passed through to #doi
# @return [String] every segment after the first, re-joined with "/"
def anchor(doc)
  segments = doi(doc).split("/")
  suffix = segments[1..-1]
  suffix.join("/")
end
#doi(doc) ⇒ Object
rubocop:disable Metrics/CyclomaticComplexity
46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/relaton_nist/data_fetcher.rb', line 46
#
# Read the DOI from "doi_data/doi" and substitute a replacement value for
# a fixed set of known identifiers. Any other DOI is returned unchanged.
#
# @param doc [Object] node responding to #at
# @return [String] the (possibly substituted) DOI
def doi(doc)
  # Source DOI => DOI to use instead.
  substitutions = {
    "10.6028/NBS.CIRC.e2e" => "10.6028/NBS.CIRC.2e2",
    "10.6028/NBS.CIRC.sup" => "10.6028/NBS.CIRC.24e7sup",
    "10.6028/NBS.CIRC.supJun1925-Jun1926" => "10.6028/NBS.CIRC.24e7sup2",
    "10.6028/NBS.CIRC.supJun1925-Jun1927" => "10.6028/NBS.CIRC.24e7sup3",
    "10.6028/NBS.CIRC.24supJuly1922" => "10.6028/NBS.CIRC.24e6sup",
    "10.6028/NBS.CIRC.24supJan1924" => "10.6028/NBS.CIRC.24e6sup2",
  }
  raw = doc.at("doi_data/doi").text
  substitutions.fetch(raw, raw)
end
#fetch ⇒ Object
Fetch all the documents from the dataset
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 |
# File 'lib/relaton_nist/data_fetcher.rb', line 236
#
# Fetch all the documents from the dataset.
#
# Downloads the XML at URL, makes sure the output directory exists, removes
# files in it that have the current extension, and passes every
# "report-paper_metadata" element to #parse_doc. Start and stop times and
# the elapsed seconds are printed to stdout.
#
# Any StandardError is reported on stderr (message plus the first six
# backtrace lines) instead of being raised.
#
# @return [nil] the last statement on both paths is a puts/warn call
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  t1 = Time.now
  puts "Started at: #{t1}"

  docs = Nokogiri::XML OpenURI.open_uri URL
  # mkdir_p is a no-op for an existing directory and also creates missing
  # parents, so a nested output path works.
  FileUtils.mkdir_p @output
  FileUtils.rm Dir[File.join(@output, "*.#{@ext}")]
  docs.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
    .each { |doc| parse_doc doc }

  t2 = Time.now
  puts "Stopped at: #{t2}"
  puts "Done in: #{(t2 - t1).round} sec."
rescue StandardError => e
  warn e.message
  warn e.backtrace[0..5].join("\n")
end
#fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>
124 125 126 127 128 |
# File 'lib/relaton_nist/data_fetcher.rb', line 124
#
# Collect the JATS abstract paragraphs ("jats:abstract/jats:p") as
# formatted strings. The language comes from the node's "language"
# attribute; the script is always "Latn".
#
# @param doc [Object] node responding to #xpath and #[]
# @return [Array<RelatonBib::FormattedString>]
def fetch_abstract(doc)
  paragraphs = doc.xpath(
    "jats:abstract/jats:p", "jats" => "http://www.ncbi.nlm.nih.gov/JATS1"
  )
  paragraphs.map do |paragraph|
    RelatonBib::FormattedString.new(
      content: paragraph.text,
      language: doc["language"],
      script: "Latn",
    )
  end
end
#fetch_contributor(doc) ⇒ Array<Hash>
132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# File 'lib/relaton_nist/data_fetcher.rb', line 132
#
# Build the contributor list: one entry per "contributors/person_name"
# element, followed by one entry per "publisher" element.
#
# @param doc [Object] node responding to #xpath and #[]
# @return [Array<Hash>] hashes with :entity and :role keys
def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  people = doc.xpath("contributors/person_name").map do |person_node|
    forename = []
    initial = []
    given_parts = person_node.at("given_name")&.text&.split || []
    given_parts.each do |part|
      # A single word character with an optional trailing dot is an initial;
      # only the captured character is stored, without the dot.
      single_letter = part.match(/^(?<init>\w)\.?$/)
      bucket, content =
        if single_letter
          [initial, single_letter[:init]]
        else
          [forename, part]
        end
      bucket << RelatonBib::LocalizedString.new(content, doc["language"], "Latn")
    end

    surname_text = person_node.at("surname").text
    surname = RelatonBib::LocalizedString.new(surname_text, doc["language"], "Latn")
    identifiers = person_node.xpath("ORCID").map do |orcid|
      RelatonBib::PersonIdentifier.new("orcid", orcid.text)
    end
    fullname = RelatonBib::FullName.new(
      surname: surname, forename: forename, initial: initial,
      identifier: identifiers
    )
    person = RelatonBib::Person.new(name: fullname, affiliation: affiliation(doc))
    { entity: person, role: [{ type: person_node["contributor_role"] }] }
  end

  publishers = doc.xpath("publisher").map do |publisher|
    abbreviation = publisher.at("../institution/institution_acronym")&.text
    place = publisher.at("./publisher_place")
    contact =
      if place
        # The place text is split on ", " into city and state.
        city, state = place.text.split(", ")
        [RelatonBib::Address.new(street: [], city: city, state: state, country: "US")]
      else
        []
      end
    organization = RelatonBib::Organization.new(
      name: publisher.at("publisher_name").text,
      abbreviation: abbreviation,
      contact: contact,
    )
    { entity: organization, role: [{ type: "publisher" }] }
  end

  people + publishers
end
#fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>
82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/relaton_nist/data_fetcher.rb', line 82
#
# Convert "publication_date" and "approval_date" elements to bibliographic
# dates. The date string is "year", "year-month" or "year-month-day"; a day
# is only used when a month is present.
#
# @param doc [Object] node responding to #xpath
# @return [Array<RelatonBib::BibliographicDate>]
def fetch_date(doc)
  doc.xpath("publication_date|approval_date").map do |date_node|
    parts = [date_node.at("year").text]
    month = date_node.at("month")
    if month
      parts << month.text
      day = date_node.at("day")
      parts << day.text if day
    end

    type =
      if date_node.name == "publication_date"
        "published"
      else
        "confirmed"
      end
    RelatonBib::BibliographicDate.new(type: type, on: parts.join("-"))
  end
end
#fetch_docid(doc) ⇒ Array<RelatonBib::DocumentIdentifier>
65 66 67 68 69 |
# File 'lib/relaton_nist/data_fetcher.rb', line 65
#
# Wrap every attribute hash produced by #parse_docid in a document
# identifier object.
#
# @param doc [Object] node passed through to #parse_docid
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(doc)
  parse_docid(doc).map { |attrs| RelatonBib::DocumentIdentifier.new(**attrs) }
end
#fetch_edition(doc) ⇒ String
97 98 99 |
# File 'lib/relaton_nist/data_fetcher.rb', line 97
#
# Return the text of the "edition_number" element.
#
# @param doc [Object] node responding to #at
# @return [String, nil] nil when the element is absent
def fetch_edition(doc)
  edition_node = doc.at("edition_number")
  edition_node&.text
end
#fetch_link(doc) ⇒ Array<RelatonBib::TypedUri>
117 118 119 120 |
# File 'lib/relaton_nist/data_fetcher.rb', line 117
#
# Build a single "doi" typed link from the "doi_data/resource" element.
#
# @param doc [Object] node responding to #at
# @return [Array<RelatonBib::TypedUri>] one-element array
def fetch_link(doc)
  resource = doc.at("doi_data/resource")
  link = RelatonBib::TypedUri.new(type: "doi", content: resource.text)
  [link]
end
#fetch_place(doc) ⇒ Array<String>
178 179 180 |
# File 'lib/relaton_nist/data_fetcher.rb', line 178
#
# Return the text of every "institution/institution_place" element.
#
# @param doc [Object] node responding to #xpath
# @return [Array<String>]
def fetch_place(doc)
  place_nodes = doc.xpath("institution/institution_place")
  place_nodes.map(&:text)
end
#fetch_relation(doc) ⇒ Array<Hash>
103 104 105 106 107 108 109 110 111 112 113 |
# File 'lib/relaton_nist/data_fetcher.rb', line 103
#
# Turn Crossref "related_item" elements into relation hashes. The relation
# type is looked up in RELATION_TYPES by the "relationship-type" attribute;
# a warning is printed when there is no mapping, and the relation is still
# returned with a nil type.
#
# @param doc [Object] node responding to #xpath
# @return [Array<Hash>] hashes with :type and :bibitem keys
def fetch_relation(doc)
  relations_ns = "http://www.crossref.org/relations.xsd"
  related_items = doc.xpath("./ns:program/ns:related_item", ns: relations_ns)
  related_items.map do |item|
    target = item.at_xpath(
      "ns:intra_work_relation|ns:inter_work_relation", ns: relations_ns
    )
    formatted_ref = RelatonBib::FormattedRef.new(content: target.text)
    bibitem = RelatonBib::BibliographicItem.new(formattedref: formatted_ref)
    type = RELATION_TYPES[target["relationship-type"]]
    warn "Relation type #{target['relationship-type']} not found" if type.nil?
    { type: type, bibitem: bibitem }
  end
end
#fetch_series(doc) ⇒ Object
182 183 184 185 |
# File 'lib/relaton_nist/data_fetcher.rb', line 182
#
# Build the single series entry: title "NIST", number taken from #pub_id.
#
# @param doc [Object] node passed through to #pub_id
# @return [Array<RelatonBib::Series>] one-element array
def fetch_series(doc)
  series = RelatonBib::Series.new(
    title: RelatonBib::TypedTitleString.new(content: "NIST"),
    number: pub_id(doc),
  )
  [series]
end
#fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array
73 74 75 76 77 78 |
# File 'lib/relaton_nist/data_fetcher.rb', line 73
#
# Build the title collection from "titles/title" and "titles/subtitle"
# elements. Their texts are concatenated with no separator and parsed as an
# "en"/"Latn" title string.
#
# @param doc [Object] node responding to #xpath
# @return [RelatonBib::TypedTitleStringCollection, Array] an empty array
#   when no title element is present
def fetch_title(doc)
  title_nodes = doc.xpath("titles/title|titles/subtitle")
  if title_nodes.any?
    combined = title_nodes.map(&:text).join
    RelatonBib::TypedTitleString.from_string(combined, "en", "Latn")
  else
    []
  end
end
#parse_doc(doc) ⇒ Object
Create a document instance and save it.
215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
# File 'lib/relaton_nist/data_fetcher.rb', line 215
#
# Create a document instance and save it.
#
# Builds a NistBibliographicItem from the fetch_* helpers and hands it to
# #write_file. A StandardError raised while building or writing is reported
# on stderr (DOI of the record, message, first six backtrace lines) and is
# not re-raised, so the caller can carry on with the next record.
#
# @param doc [Object] node responding to #at and #[] (NOTE(review):
#   presumably a "report-paper_metadata" element - confirm against #fetch)
# @return [Object] the result of #write_file, or nil after a handled error
def parse_doc(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  # mtd = doc.at('doi_record/report-paper/report-paper_metadata')
  item = RelatonNist::NistBibliographicItem.new(
    type: "standard", docid: fetch_docid(doc), title: fetch_title(doc),
    link: fetch_link(doc), abstract: fetch_abstract(doc),
    date: fetch_date(doc), edition: fetch_edition(doc),
    contributor: fetch_contributor(doc), relation: fetch_relation(doc),
    place: fetch_place(doc), series: fetch_series(doc),
    language: [doc["language"]], script: ["Latn"], doctype: "standard"
  )
  write_file item
rescue StandardError => e
  # Safe navigation: a record without a DOI element must not raise a second
  # error here and hide the one being reported.
  warn "Document: #{doc.at('doi')&.text}"
  warn e.message
  warn e.backtrace[0..5].join("\n")
  # raise e
end
#parse_docid(doc) ⇒ Object
rubocop:disable Metrics/AbcSize, Metrics/MethodLength
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/relaton_nist/data_fetcher.rb', line 26
#
# Produce the attribute hashes for the three identifiers of a document:
# the primary NIST id (#pub_id), the DOI (#doi) and the NIST anchor
# (#anchor).
#
# @param doc [Object] node passed through to #pub_id, #doi and #anchor
# @return [Array<Hash>] three hashes, in the order described above
def parse_docid(doc)
  # Disabled DOI adjustments kept from the original source:
  # case doi
  # when "10.6028/NBS.CIRC.12e2revjune" then doi.sub!("13e", "12e")
  # when "10.6028/NBS.CIRC.36e2" then doi.sub!("46e", "36e")
  # when "10.6028/NBS.HB.67suppJune1967" then doi.sub!("1965", "1967")
  # when "10.6028/NBS.HB.105-1r1990" then doi.sub!("105-1-1990", "105-1r1990")
  # when "10.6028/NIST.HB.150-10-1995" then doi.sub!(/150-10$/, "150-10-1995")
  # end
  # anchor = doi.split("/")[1..-1].join "/"
  primary_id = { type: "NIST", id: pub_id(doc), primary: true }
  doi_id = { type: "DOI", id: doi(doc) }
  anchor_id = { type: "NIST", id: anchor(doc), scope: "anchor" }
  [primary_id, doi_id, anchor_id]
end
#pub_id(doc) ⇒ Object
42 43 44 |
# File 'lib/relaton_nist/data_fetcher.rb', line 42
#
# Return the anchor with every "." replaced by a space.
#
# @param doc [Object] node passed through to #anchor
# @return [String]
def pub_id(doc)
  dotted = anchor(doc)
  dotted.tr(".", " ")
end
#write_file(bib) ⇒ Object
Save document
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
# File 'lib/relaton_nist/data_fetcher.rb', line 192
#
# Save document.
#
# The file name is the first document identifier with "/", whitespace, ":"
# and "." replaced by "_", upper-cased, and a leading "NIST_IR" collapsed
# to "NISTIR". If that path was already written during this run a warning
# is printed and the file is overwritten.
#
# @param bib [Object] item responding to #docidentifier and to the
#   serializer selected by @format
# @return [Integer] the value returned by File.write
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = bib.docidentifier[0].id
  basename = docid.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
  file = File.join(@output, "#{basename}.#{@ext}")

  if @files.include?(file)
    warn "File #{file} exists. Docid: #{docid}"
    # warn "Link: #{bib.link.detect { |l| l.type == 'src' }.content}"
  else
    @files << file
  end

  # "yaml" and "xml" have dedicated serializers; any other format name is
  # dispatched to the matching to_<format> method.
  content =
    if @format == "yaml"
      bib.to_hash.to_yaml
    elsif @format == "xml"
      bib.to_xml(bibdata: true)
    else
      bib.send("to_#{@format}")
    end
  File.write(file, content, encoding: "UTF-8")
end