Class: RelatonW3c::DataFetcher

Inherits:
Object
  • Object
show all
Includes:
RateLimitHandler
Defined in:
lib/relaton_w3c/data_fetcher.rb

Class Method Summary collapse

Instance Method Summary collapse

Methods included from RateLimitHandler

fetched_objects, #realize

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Data fetcher initializer

Parameters:

  • output (String)

    directory to save files

  • format (String)

    format of output files (xml, yaml, bibxml)



15
16
17
18
19
20
21
22
23
# File 'lib/relaton_w3c/data_fetcher.rb', line 15

# Set up the state used by a single fetch run.
#
# @param output [String] directory to save files
# @param format [String] format of output files (xml, yaml, bibxml)
def initialize(output, format)
  @output, @format = output, format
  # The file extension is the format name without a leading "bib"
  # (e.g. "bibxml" -> "xml").
  @ext = format.sub(/^bib/, "")
  @fetched_urls = {}
  # Paths already written during this run.
  @files = Set.new
  @index = DataIndex.create_from_file
  @index1 = Relaton::Index.find_or_create :W3C, file: "index1.yaml"
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object

Initialize fetcher and run fetch

Parameters:

  • output (String) (defaults to: "data")

    directory to save files, default: “data”

  • format (String) (defaults to: "yaml")

    format of output files (xml, yaml, bibxml), default: yaml



31
32
33
34
35
36
37
38
39
# File 'lib/relaton_w3c/data_fetcher.rb', line 31

# Initialize fetcher and run fetch.
#
# Creates the output directory if needed, runs a fetcher instance and prints
# the wall-clock start/stop times plus the elapsed time in whole seconds.
#
# @param output [String] directory to save files, default: "data"
# @param format [String] format of output files (xml, yaml, bibxml),
#   default: yaml
def self.fetch(output: "data", format: "yaml")
  # The duration is measured on the monotonic clock so that a wall-clock
  # adjustment during a long run cannot skew (or negate) the reported time.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  puts "Started at: #{Time.now}"
  FileUtils.mkdir_p output
  new(output, format).fetch
  puts "Stopped at: #{Time.now}"
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  puts "Done in: #{elapsed.round} sec."
end

Instance Method Details

#client ⇒ Object



41
42
43
# File 'lib/relaton_w3c/data_fetcher.rb', line 41

# Build the W3C API client on first use and reuse it afterwards.
#
# @return [W3cApi::Client] memoized client instance
def client
  return @client if @client

  @client = W3cApi::Client.new
end

#fetch ⇒ Object

Parse documents



48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/relaton_w3c/data_fetcher.rb', line 48

# Fetch every specification, page by page, then persist both indexes.
#
# Starts from the client's specifications listing, hands each linked
# specification to #fetch_spec, and follows `next` while the current page
# reports `next?`. Afterwards the main index is sorted and saved and the
# second index is saved.
def fetch
  page = client.specifications
  loop do
    page.links.specifications.each { |item| fetch_spec item }
    break unless page.next?

    page = page.next
  end
  @index.sort!.save
  @index1.save
end

#fetch_spec(unrealized_spec) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/relaton_w3c/data_fetcher.rb', line 63

# Realize a specification, save it, then save every document reachable
# through its version links.
#
# For each link the spec exposes (and that is non-nil), the link is realized
# and every entry of the corresponding collection on the realized object is
# itself realized, parsed and saved. Links are processed in this order:
# version_history, predecessor_versions, successor_versions.
#
# @param unrealized_spec [Object] specification link accepted by `realize`
def fetch_spec(unrealized_spec)
  spec = realize unrealized_spec
  save_doc DataParser.parse(spec)

  # Link name on the spec => collection name on the realized link target.
  related = {
    version_history: :spec_versions,
    predecessor_versions: :predecessor_versions,
    successor_versions: :successor_versions,
  }

  related.each do |link, collection|
    next unless spec.links.respond_to?(link) && spec.links.public_send(link)

    listing = realize spec.links.public_send(link)
    listing.links.public_send(collection).each do |version|
      save_doc DataParser.parse(realize(version))
    end
  end
end

#file_name(id) ⇒ String

Generate file name

Parameters:

  • id (String)

    document id

Returns:

  • (String)

    file name



118
119
120
121
# File 'lib/relaton_w3c/data_fetcher.rb', line 118

# Generate the output file path for a document id.
#
# A leading "W3C " is dropped; whitespace, commas, colons, slashes and plus
# signs become underscores (runs of underscores are collapsed) and the
# result is lower-cased before the extension is appended.
#
# @param id [String] document id
# @return [String] file name inside the output directory
def file_name(id)
  stem = id.sub(/^W3C\s/, "")
  stem = stem.gsub(/[\s,:\/+]/, "_").squeeze("_")
  File.join(@output, "#{stem.downcase}.#{@ext}")
end

#save_doc(bib, warn_duplicate: true) ⇒ Object

Save document to file

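Parameters:

  • bib

    document to save; nothing is done when it is nil

  • warn_duplicate (Boolean) (defaults to: true)

    warn when the target file has already been written during this run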



88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/relaton_w3c/data_fetcher.rb', line 88

# Save document to file.
#
# The first time a path is seen it is registered in both indexes and
# remembered; when the path was already written during this run a warning is
# issued instead (unless suppressed). In either case the file is
# (re)written with the serialized document.
#
# @param bib [#docnumber, nil] document to save; nil/false is ignored
# @param warn_duplicate [Boolean] warn when the file was already written
# @return [Integer, nil] result of File.write, or nil when bib is missing
def save_doc(bib, warn_duplicate: true)
  return unless bib

  path = file_name(bib.docnumber)
  seen_before = @files.include?(path)
  Util.warn "File #{path} already exists. Document: #{bib.docnumber}" if seen_before && warn_duplicate

  unless seen_before
    pubid = PubId.parse bib.docnumber
    @index.add pubid, path
    @index1.add_or_update pubid.to_hash, path
    @files << path
  end

  File.write path, serialize(bib), encoding: "UTF-8"
end

#serialize(bib) ⇒ Object



103
104
105
106
107
108
109
# File 'lib/relaton_w3c/data_fetcher.rb', line 103

# Serialize a document in the configured output format.
#
# "xml" and "yaml" are handled explicitly; any other format is dispatched to
# the document's `to_<format>` method (e.g. "bibxml" -> `to_bibxml`).
#
# @param bib [#to_xml, #to_hash] document to serialize
# @return [Object] serialized document as returned by the serializer
# @raise [NoMethodError] if the document has no public `to_<format>` method
def serialize(bib)
  case @format
  when "xml" then bib.to_xml(bibdata: true)
  when "yaml" then bib.to_hash.to_yaml
  else
    # The method name is built from the caller-supplied format, so dispatch
    # with public_send: unlike send, it cannot reach private methods.
    bib.public_send("to_#{@format}")
  end
end