Class: RelatonW3c::DataFetcher
- Inherits:
-
Object
- Object
- RelatonW3c::DataFetcher
- Defined in:
- lib/relaton_w3c/data_fetcher.rb
Instance Attribute Summary collapse
-
#data ⇒ Object
readonly
Returns the value of attribute data.
-
#group_names ⇒ Object
readonly
Returns the value of attribute group_names.
-
#rdf_archive ⇒ Object
readonly
Returns the value of attribute rdf_archive.
Class Method Summary collapse
-
.fetch(output: "data", format: "yaml") ⇒ Object
Initialize fetcher and run fetch.
Instance Method Summary collapse
-
#add_has_edition_relation(bib) ⇒ Object
Add hasEdition relations from the previously parsed document.
-
#fetch ⇒ Object
Parse documents.
-
#file_name(id) ⇒ String
Generate file name.
-
#initialize(output, format) ⇒ DataFetcher
constructor
Data fetcher initializer.
-
#query_unversioned_docs(rdf) ⇒ Array<RDF::Query::Solution>
Query RDF source for unversioned documents.
-
#query_versioned_docs(rdf) ⇒ RDF::Query::Solutions
Query RDF source for versioned documents.
-
#read_bibxml(file) ⇒ RelatonW3c::W3cBibliographicItem
Read BibXML file.
-
#read_xml(file) ⇒ RelatonW3c::W3cBibliographicItem
Read XML file.
-
#read_yaml(file) ⇒ RelatonW3c::W3cBibliographicItem
Read YAML file.
-
#same_edition?(rel1, rel2) ⇒ Boolean
Compare two relations.
-
#save_doc(bib, warn_duplicate: true) ⇒ Object
Save document to file.
- #serialize(bib) ⇒ Object
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Data fetcher initializer
18 19 20 21 22 23 24 25 26 27 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 18
#
# Data fetcher initializer.
#
# Stores the output settings, loads the working-group names from the
# "workgroups.yaml" file that sits next to this source file, and prepares
# the set of written files plus the two indexes.
#
# @param output [String] directory the generated files are written to
# @param format [String] output format, e.g. "yaml", "xml" or "bibxml"
def initialize(output, format)
  @output = output
  @format = format
  # File extension: a leading "bib" is dropped, so "bibxml" becomes "xml".
  @ext = format.sub(/^bib/, "")
  # Absolute directory of this source file; workgroups.yaml is read from it.
  dir = File.dirname(File.expand_path(__FILE__))
  @group_names = YAML.load_file(File.join(dir, "workgroups.yaml"))
  @files = Set.new
  @index = DataIndex.create_from_file
  @index1 = Relaton::Index.find_or_create :W3C, file: "index1.yaml"
end
Instance Attribute Details
#data ⇒ Object (readonly)
Returns the value of attribute data.
10 11 12 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 10
#
# Read-only accessor for the +data+ attribute.
#
# @return [Object] the current value of @data
def data
  @data
end
#group_names ⇒ Object (readonly)
Returns the value of attribute group_names.
10 11 12 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 10
#
# Read-only accessor for the +group_names+ attribute.
#
# @return [Object] the current value of @group_names
def group_names
  @group_names
end
#rdf_archive ⇒ Object (readonly)
Returns the value of attribute rdf_archive.
10 11 12 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 10
#
# Read-only accessor for the +rdf_archive+ attribute.
#
# @return [Object] the current value of @rdf_archive
def rdf_archive
  @rdf_archive
end
Class Method Details
.fetch(output: "data", format: "yaml") ⇒ Object
Initialize fetcher and run fetch
36 37 38 39 40 41 42 43 44 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 36
#
# Initialize fetcher and run fetch.
#
# Creates the output directory if needed, runs a new fetcher instance and
# prints the start time, stop time and rounded duration to standard output.
#
# @param output [String] directory to save files to, default is "data"
# @param format [String] output format, default is "yaml"
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p(output)
  new(output, format).fetch
  stopped_at = Time.now
  puts "Stopped at: #{stopped_at}"
  elapsed = (stopped_at - started_at).round
  puts "Done in: #{elapsed} sec."
end
Instance Method Details
#add_has_edition_relation(bib) ⇒ Object
Add hasEdition relations from the previously parsed document
76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 76
#
# Merge relations from the file already saved for this document into +bib+,
# then mark the newest edition as "instanceOf".
#
# If a file for bib.docnumber exists it is read with the reader matching
# @format. Each of its relations typed "instanceOf" is retyped to
# "hasEdition", and every relation is appended to bib.relation unless
# same_edition? reports a match with one already there. Afterwards, among
# the "hasEdition" relations, the one whose bibitem id ends in the greatest
# 8-digit suffix following a hyphen is retyped to "instanceOf".
#
# @param bib [RelatonW3c::W3cBibliographicItem] item whose relations are updated
def add_has_edition_relation(bib) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
  path = file_name(bib.docnumber)
  if File.exist?(path)
    previous = send("read_#{@format}", path)
    previous.relation.each do |old_rel|
      old_rel.type = "hasEdition" if old_rel.type == "instanceOf"
      next if bib.relation.detect { |rel| same_edition?(old_rel, rel) }

      bib.relation << old_rel
    end
  end
  editions = bib.relation.select { |rel| rel.type == "hasEdition" }
  # Ids without the suffix compare as "" and therefore sort lowest.
  latest = editions.max_by { |rel| rel.bibitem.id.match(/(?<=-)\d{8}$/).to_s }
  latest&.type = "instanceOf"
end
#fetch ⇒ Object
Parse documents
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 55
#
# Parse documents.
#
# Gets the RDF data from rdf_archive, runs the versioned and then the
# unversioned query, parses every solution with DataParser and saves the
# result. For unversioned documents the hasEdition relations are merged in
# before saving. A failure on one document is reported through Util.error
# and does not stop the run. Both indexes are saved at the end.
def fetch # (source) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  rdf = rdf_archive.get_data
  %i[versioned unversioned].each do |type|
    send("query_#{type}_docs", rdf).each do |sl|
      bib = DataParser.parse(rdf, sl, self)
      add_has_edition_relation(bib) if type == :unversioned
      save_doc bib
    rescue StandardError => e
      # Versioned solutions carry ?link, unversioned ones only ?version_of.
      link = sl.respond_to?(:link) ? sl.link : sl.version_of
      Util.error "Error: document #{link} #{e.message}\n#{e.backtrace.join("\n")}"
    end
  end
  @index.sort!.save
  @index1.save
end
#file_name(id) ⇒ String
Generate file name
209 210 211 212 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 209
#
# Generate file name.
#
# Drops a leading "W3C " prefix, replaces whitespace and the characters
# , : / + with underscores, collapses runs of underscores, downcases the
# result, and joins it with the output directory and file extension.
#
# @param id [String] document number
#
# @return [String] path of the file for the document
def file_name(id)
  # \A anchors at the start of the string; ^ would also match after an
  # embedded newline and strip a "W3C " found in the middle of the id.
  name = id.sub(/\AW3C\s/, "").gsub(%r{[\s,:/+]}, "_").squeeze("_").downcase
  File.join @output, "#{name}.#{@ext}"
end
#query_unversioned_docs(rdf) ⇒ Array<RDF::Query::Solution>
Query RDF source for unversioned documents
162 163 164 165 166 167 168 169 170 171 172 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 162
#
# Query RDF source for unversioned documents.
#
# Selects every ?version_of that a different URI points to through
# doc:versionOf, then keeps one solution per target, comparing targets
# without the http(s):// scheme and without a trailing slash.
#
# @param rdf [Object] RDF source responding to #query
#
# @return [Array<RDF::Query::Solution>] query results
def query_unversioned_docs(rdf)
  sparql = ' PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>' \
           ' SELECT ?version_of WHERE {' \
           ' ?link doc:versionOf ?version_of .' \
           ' FILTER ( isURI(?link) && isURI(?version_of) && ?link != ?version_of )' \
           ' } '
  solutions = rdf.query(SPARQL.parse(sparql))
  solutions.uniq do |solution|
    solution.version_of.to_s.sub(/^https?:\/\//, "").sub(/\/$/, "")
  end
end
#query_versioned_docs(rdf) ⇒ RDF::Query::Solutions
Query RDF source for versioned documents
145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 145
#
# Query RDF source for versioned documents.
#
# Selects ?link, ?title and ?date for every subject that has both a
# dc:title and a dc:date.
#
# @param rdf [Object] RDF source responding to #query
#
# @return [RDF::Query::Solutions] query results
def query_versioned_docs(rdf)
  sparql = ' PREFIX : <http://www.w3.org/2001/02pd/rec54#>' \
           ' PREFIX dc: <http://purl.org/dc/elements/1.1/>' \
           ' PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>' \
           ' PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>' \
           ' SELECT ?link ?title ?date WHERE {' \
           ' ?link dc:title ?title ; dc:date ?date .' \
           ' } '
  parsed = SPARQL.parse(sparql)
  rdf.query(parsed)
end
#read_bibxml(file) ⇒ RelatonW3c::W3cBibliographicItem
Read BibXML file
120 121 122 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 120
#
# Read BibXML file.
#
# @param file [String] path of the file, read as UTF-8
#
# @return [RelatonW3c::W3cBibliographicItem] item parsed by BibXMLParser
def read_bibxml(file)
  content = File.read(file, encoding: "UTF-8")
  BibXMLParser.parse(content)
end
#read_xml(file) ⇒ RelatonW3c::W3cBibliographicItem
Read XML file
97 98 99 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 97
#
# Read XML file.
#
# @param file [String] path of the file, read as UTF-8
#
# @return [RelatonW3c::W3cBibliographicItem] item built by XMLParser.from_xml
def read_xml(file)
  content = File.read(file, encoding: "UTF-8")
  XMLParser.from_xml(content)
end
#read_yaml(file) ⇒ RelatonW3c::W3cBibliographicItem
Read YAML file
108 109 110 111 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 108
#
# Read YAML file.
#
# @param file [String] path of the YAML file
#
# @return [RelatonW3c::W3cBibliographicItem] item built from the loaded hash
def read_yaml(file)
  W3cBibliographicItem.from_hash(YAML.load_file(file))
end
#same_edition?(rel1, rel2) ⇒ Boolean
Compare two relations
132 133 134 135 136 137 138 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 132
#
# Compare two relations.
#
# Two relations count as the same edition when both are typed "hasEdition"
# and their bibitems share at least one document identifier id.
#
# @param rel1 [Object] relation responding to #type and #bibitem
# @param rel2 [Object] relation responding to #type and #bibitem
#
# @return [Boolean] true if both relations refer to the same edition
def same_edition?(rel1, rel2)
  return false if rel1.type != "hasEdition" || rel1.type != rel2.type

  ids_of = ->(rel) { rel.bibitem.docidentifier.map(&:id) }
  (ids_of.call(rel1) & ids_of.call(rel2)).any?
end
#save_doc(bib, warn_duplicate: true) ⇒ Object
Save document to file
179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 179
#
# Save document to file.
#
# Does nothing when +bib+ is nil or false. The first time a file name is
# seen in this run, the document is added to both indexes and the name is
# remembered; on later occurrences a warning is issued instead (unless
# suppressed). In both cases the serialized document is written to the
# file as UTF-8, so a duplicate overwrites the earlier content.
#
# @param bib [RelatonW3c::W3cBibliographicItem, nil] document to save
# @param warn_duplicate [Boolean] warn when the file was already written
def save_doc(bib, warn_duplicate: true)
  return unless bib

  path = file_name(bib.docnumber)
  already_written = @files.include?(path)
  Util.warn "File #{path} already exists. Document: #{bib.docnumber}" if already_written && warn_duplicate
  unless already_written
    pubid = PubId.parse(bib.docnumber)
    @index.add(pubid, path)
    @index1.add_or_update(pubid.to_hash, path)
    @files << path
  end
  File.write(path, serialize(bib), encoding: "UTF-8")
end
#serialize(bib) ⇒ Object
194 195 196 197 198 199 200 |
# File 'lib/relaton_w3c/data_fetcher.rb', line 194
#
# Serialize a document according to @format.
#
# "xml" uses to_xml(bibdata: true), "yaml" dumps to_hash as YAML, and any
# other format calls the matching to_<format> method on the document.
#
# @param bib [RelatonW3c::W3cBibliographicItem] document to serialize
#
# @return [Object] whatever the selected serializer returns
def serialize(bib)
  return bib.to_xml(bibdata: true) if @format == "xml"
  return bib.to_hash.to_yaml if @format == "yaml"

  bib.send("to_#{@format}")
end