Class: RelatonW3c::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_w3c/data_fetcher.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Data fetcher initializer

Parameters:

  • output (String)

    directory to save files

  • format (String)

    format of output files (xml, yaml, bibxml)



18
19
20
21
22
23
24
25
26
27
# File 'lib/relaton_w3c/data_fetcher.rb', line 18

# Set up a fetcher that writes documents into +output+ using +format+.
#
# @param output [String] directory to save files
# @param format [String] format of output files (xml, yaml, bibxml)
def initialize(output, format)
  @output = output
  @format = format
  # The file extension is the format name with a leading "bib" removed.
  @ext = format.sub(/^bib/, "")
  # workgroups.yaml is looked up in the directory that contains this source file.
  source_dir = File.dirname(File.expand_path(__FILE__))
  workgroups_path = File.join(source_dir, "workgroups.yaml")
  @group_names = YAML.load_file(workgroups_path)
  @files = Set.new
  @index = DataIndex.create_from_file
  @index1 = Relaton::Index.find_or_create(:W3C, file: "index1.yaml")
end

Instance Attribute Details

#data ⇒ Object (readonly)

Returns the value of attribute data.



10
11
12
# File 'lib/relaton_w3c/data_fetcher.rb', line 10

# Reader for the @data instance variable.
#
# @return [Object] the current value of @data
def data
  @data
end

#group_names ⇒ Object (readonly)

Returns the value of attribute group_names.



10
11
12
# File 'lib/relaton_w3c/data_fetcher.rb', line 10

# Reader for the @group_names instance variable, which the constructor
# loads from workgroups.yaml.
#
# @return [Object] the current value of @group_names
def group_names
  @group_names
end

#rdf_archive ⇒ Object (readonly)

Returns the value of attribute rdf_archive.



10
11
12
# File 'lib/relaton_w3c/data_fetcher.rb', line 10

# Reader for the @rdf_archive instance variable; #fetch calls +get_data+ on
# the returned object.
#
# NOTE(review): no assignment to @rdf_archive is visible in the constructor
# shown for this class - confirm where it is initialised.
#
# @return [Object] the current value of @rdf_archive
def rdf_archive
  @rdf_archive
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object

Initialize fetcher and run fetch

Parameters:

  • source (String)

    source name “w3c-tr-archive” or “w3c-rdf”

  • output (String) (defaults to: "data")

    directory to save files, default: “data”

  • format (String) (defaults to: "yaml")

    format of output files (xml, yaml, bibxml), default: yaml



36
37
38
39
40
41
42
43
44
# File 'lib/relaton_w3c/data_fetcher.rb', line 36

# Create the output directory, build a fetcher, run it, and print the start
# time, stop time and rounded elapsed seconds to standard output.
#
# @param output [String] directory to save files, default: "data"
# @param format [String] format of output files (xml, yaml, bibxml), default: "yaml"
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p(output)
  fetcher = new(output, format)
  fetcher.fetch
  stopped_at = Time.now
  puts "Stopped at: #{stopped_at}"
  elapsed_seconds = (stopped_at - started_at).round
  puts "Done in: #{elapsed_seconds} sec."
end

Instance Method Details

#add_has_edition_relation(bib) ⇒ Object

Add hasEdition relations from the previously parsed document

Parameters:



76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/relaton_w3c/data_fetcher.rb', line 76

# Merge edition relations from an already saved copy of the document into +bib+.
#
# When the file for bib.docnumber exists it is read with the reader for the
# current format; each stored "instanceOf" relation is retyped to "hasEdition"
# and appended to bib.relation unless same_edition? finds a match there.
# Afterwards, among the "hasEdition" relations of +bib+, the one whose bibitem
# id has the greatest trailing 8-digit suffix (after a hyphen) is retyped to
# "instanceOf".
#
# @param bib [Object] document whose relation list is updated in place
def add_has_edition_relation(bib) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
  path = file_name(bib.docnumber)
  if File.exist?(path)
    stored = send("read_#{@format}", path)
    stored.relation.each do |stored_rel|
      stored_rel.type = "hasEdition" if stored_rel.type == "instanceOf"
      next if bib.relation.detect { |rel| same_edition?(stored_rel, rel) }

      bib.relation << stored_rel
    end
  end
  editions = bib.relation.select { |rel| rel.type == "hasEdition" }
  # Relations without an 8-digit suffix compare as the empty string.
  latest = editions.max_by { |rel| rel.bibitem.id.match(/(?<=-)\d{8}$/).to_s }
  latest&.type = "instanceOf"
end

#fetch ⇒ Object

Parse documents

Parameters:

  • source (String)

    source name “w3c-tr-archive” or “w3c-rdf”



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/relaton_w3c/data_fetcher.rb', line 55

# Parse and save every document found in the RDF data.
#
# Versioned documents are processed first, then unversioned ones; unversioned
# documents additionally go through add_has_edition_relation before saving.
# A StandardError raised for one document is reported through Util.error and
# processing continues with the next one. Both indexes are saved at the end.
def fetch # (source) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  graph = rdf_archive.get_data
  %i[versioned unversioned].each do |kind|
    solutions = send("query_#{kind}_docs", graph)
    solutions.each do |solution|
      begin
        document = DataParser.parse(graph, solution, self)
        add_has_edition_relation(document) if kind == :unversioned
        save_doc(document)
      rescue StandardError => e
        # Versioned solutions expose #link; otherwise fall back to #version_of.
        source_link = solution.respond_to?(:link) ? solution.link : solution.version_of
        Util.error "Error: document #{source_link} #{e.message}\n#{e.backtrace.join("\n")}"
      end
    end
  end
  @index.sort!.save
  @index1.save
end

#file_name(id) ⇒ String

Generate file name

Parameters:

  • id (String)

    document id

Returns:

  • (String)

    file name



209
210
211
212
# File 'lib/relaton_w3c/data_fetcher.rb', line 209

# Build the output path for a document id.
#
# A leading "W3C " prefix is dropped; whitespace, commas, colons, slashes and
# plus signs become underscores; runs of underscores collapse to one; and the
# result is lower-cased before the output directory and extension are applied.
#
# @param id [String] document id
# @return [String] file name
def file_name(id)
  without_prefix = id.sub(/^W3C\s/, "")
  slug = without_prefix.gsub(/[\s,:\/+]/, "_").squeeze("_").downcase
  File.join(@output, "#{slug}.#{@ext}")
end

#query_unversioned_docs(rdf) ⇒ Array<RDF::Query::Solution>

Query RDF source for unversioned documents

Returns:

  • (Array<RDF::Query::Solution>)

    query results



162
163
164
165
166
167
168
169
170
171
172
# File 'lib/relaton_w3c/data_fetcher.rb', line 162

# Query the RDF source for doc:versionOf targets (unversioned documents).
#
# Results are de-duplicated by the version_of value compared without its
# http(s):// scheme and without a trailing slash.
#
# @param rdf [Object] RDF source responding to #query
# @return [Array<RDF::Query::Solution>] query results
def query_unversioned_docs(rdf)
  query = SPARQL.parse(%(
    PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
    SELECT ?version_of
    WHERE {
      ?link doc:versionOf ?version_of .
      FILTER ( isURI(?link) && isURI(?version_of) && ?link != ?version_of )
    }
  ))
  solutions = rdf.query(query)
  solutions.uniq do |solution|
    solution.version_of.to_s.sub(/^https?:\/\//, "").sub(/\/$/, "")
  end
end

#query_versioned_docs(rdf) ⇒ RDF::Query::Solutions

Query RDF source for versioned documents

Returns:

  • (RDF::Query::Solutions)

    query results



145
146
147
148
149
150
151
152
153
154
155
# File 'lib/relaton_w3c/data_fetcher.rb', line 145

# Query the RDF source for versioned documents: every subject that has both
# a dc:title and a dc:date, selecting ?link, ?title and ?date.
#
# @param rdf [Object] RDF source responding to #query
# @return [RDF::Query::Solutions] query results
def query_versioned_docs(rdf)
  query = SPARQL.parse(%(
    PREFIX : <http://www.w3.org/2001/02pd/rec54#>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    SELECT ?link ?title ?date
    WHERE { ?link dc:title ?title ; dc:date ?date . }
  ))
  rdf.query(query)
end

#read_bibxml(file) ⇒ RelatonW3c::W3cBibliographicItem

Read BibXML file

Parameters:

  • file (String)

    file name

Returns:



120
121
122
# File 'lib/relaton_w3c/data_fetcher.rb', line 120

# Read a BibXML file as UTF-8 and hand its content to BibXMLParser.parse.
#
# @param file [String] file name
# @return [RelatonW3c::W3cBibliographicItem] parsed document
def read_bibxml(file)
  content = File.read(file, encoding: "UTF-8")
  BibXMLParser.parse(content)
end

#read_xml(file) ⇒ RelatonW3c::W3cBibliographicItem

Read XML file

Parameters:

  • file (String)

    file name

Returns:



97
98
99
# File 'lib/relaton_w3c/data_fetcher.rb', line 97

# Read an XML file as UTF-8 and hand its content to XMLParser.from_xml.
#
# @param file [String] file name
# @return [RelatonW3c::W3cBibliographicItem] parsed document
def read_xml(file)
  content = File.read(file, encoding: "UTF-8")
  XMLParser.from_xml(content)
end

#read_yaml(file) ⇒ RelatonW3c::W3cBibliographicItem

Read YAML file

Parameters:

  • file (String)

    file name

Returns:



108
109
110
111
# File 'lib/relaton_w3c/data_fetcher.rb', line 108

# Load a YAML file and build a bibliographic item from the resulting hash
# via W3cBibliographicItem.from_hash.
#
# @param file [String] file name
# @return [RelatonW3c::W3cBibliographicItem] parsed document
def read_yaml(file)
  W3cBibliographicItem.from_hash(YAML.load_file(file))
end

#same_edition?(rel1, rel2) ⇒ Boolean

Compare two relations

Parameters:

Returns:

  • (Boolean)

    true if relations are same



132
133
134
135
136
137
138
# File 'lib/relaton_w3c/data_fetcher.rb', line 132

# Tell whether two relations describe the same edition.
#
# Both relations must have type "hasEdition"; they match when their bibitems
# share at least one document identifier id.
#
# @param rel1 [Object] first relation
# @param rel2 [Object] second relation
# @return [Boolean] true if relations are same
def same_edition?(rel1, rel2)
  return false if rel1.type != "hasEdition" || rel1.type != rel2.type

  first_ids, second_ids = [rel1, rel2].map do |rel|
    rel.bibitem.docidentifier.map(&:id)
  end
  (first_ids & second_ids).any?
end

#save_doc(bib, warn_duplicate: true) ⇒ Object

Save document to file

Parameters:



179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/relaton_w3c/data_fetcher.rb', line 179

# Write a document to its output file and register it in both indexes.
#
# The first time a file name is seen, the parsed publication id is added to
# the indexes and the name is remembered. On later calls for the same file
# name a warning is emitted (unless suppressed). The file is written in both
# cases, so a later document overwrites an earlier one with the same name.
#
# @param bib [Object, nil] document to save; nothing happens when it is nil
# @param warn_duplicate [Boolean] warn when the file was already written
def save_doc(bib, warn_duplicate: true)
  return unless bib

  number = bib.docnumber
  path = file_name(number)
  if @files.include?(path)
    Util.warn "File #{path} already exists. Document: #{number}" if warn_duplicate
  else
    parsed_id = PubId.parse(number)
    @index.add(parsed_id, path)
    @index1.add_or_update(parsed_id.to_hash, path)
    @files << path
  end
  File.write(path, serialize(bib), encoding: "UTF-8")
end

#serialize(bib) ⇒ Object



194
195
196
197
198
199
200
# File 'lib/relaton_w3c/data_fetcher.rb', line 194

# Render a document in the configured output format.
#
# "xml" uses to_xml with bibdata: true, "yaml" dumps to_hash as YAML, and any
# other format calls the matching to_<format> method on the document.
#
# @param bib [Object] document to serialize
# @return [Object] result of the selected serializer
def serialize(bib)
  if @format == "xml"
    bib.to_xml(bibdata: true)
  elsif @format == "yaml"
    bib.to_hash.to_yaml
  else
    bib.send("to_#{@format}")
  end
end