Class: RelatonIso::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_iso/data_fetcher.rb

Overview

Fetch all the documents from ISO website.

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Initialize data fetcher.

Parameters:

  • output (String)

    output directory

  • format (String)

    format of output files (yaml, bibxml, xml)



10
11
12
13
14
15
16
17
18
19
20
# File 'lib/relaton_iso/data_fetcher.rb', line 10

#
# Set up the fetcher state and register an error logger backed by a GitHub issue channel.
#
# @param [String] output output directory
# @param [String] format format of output files (yaml, bibxml, xml)
#
def initialize(output, format) # rubocop:disable Metrics/AbcSize
  # Destination settings; the extension is the format with a leading "bib" stripped ("bibxml" -> "xml").
  @output = output
  @format = format
  @ext = format.sub(/^bib/, "")

  # Bookkeeping shared by the fetch steps.
  @files = Set.new
  @queue = ::Queue.new
  @mutex = Mutex.new
  # Keys that were never written read as true.
  @errors = Hash.new(true)

  # Logger registered under :gh_issue for the :error level, writing to the issue channel.
  issue_channel = Relaton::Logger::Channels::GhIssue.new(
    "relaton/relaton-iso", "Error fetching ISO documents"
  )
  @gh_issue = issue_channel
  Relaton.logger_pool[:gh_issue] = Relaton::Logger::Log.new(issue_channel, levels: [:error])
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ void

This method returns an undefined value.

Initialize data fetcher and fetch data.

Parameters:

  • output (String) (defaults to: "data")

    output directory (default: “data”)

  • format (String) (defaults to: "yaml")

    format of output files. Allowed: yaml (default), bibxml, xml



38
39
40
41
42
43
44
45
46
# File 'lib/relaton_iso/data_fetcher.rb', line 38

#
# Create the output directory, run a fetcher instance and log start/stop times.
#
# @param [String] output output directory (default: "data")
# @param [String] format format of output files. Allowed: yaml (default), bibxml, xml
#
# @return [void]
#
def self.fetch(output: "data", format: "yaml")
  started = Time.now
  Util.info "Started at: #{started}"

  FileUtils.mkdir_p(output)
  new(output, format).fetch

  finished = Time.now
  elapsed = (finished - started).round
  Util.info "Stopped at: #{finished}"
  Util.info "Done in: #{elapsed} sec."
end

Instance Method Details

#check_try(try, uri) ⇒ Object



138
139
140
141
142
143
144
# File 'lib/relaton_iso/data_fetcher.rb', line 138

#
# Decide whether a failed request may be attempted again.
# When it may, a warning is logged and the call pauses for one second.
#
# @param [Integer] try number of attempts made so far
# @param [URI, String] uri address being fetched (used in the log message only)
#
# @return [true, nil] true when another attempt is allowed, nil otherwise
#
def check_try(try, uri)
  return if try >= 3

  Util.warn "Timeout fetching #{uri}, retrying..."
  sleep 1
  true
end

#edition_greater?(doc, bib) ⇒ Boolean

Returns:

  • (Boolean)


201
202
203
# File 'lib/relaton_iso/data_fetcher.rb', line 201

#
# Check whether doc carries a numerically higher edition than bib.
# Edition contents are compared as integers (via to_i).
#
# @param [#edition] doc candidate item
# @param [#edition] bib item to compare against
#
# @return [Boolean, nil] nil when either item has no edition
#
def edition_greater?(doc, bib)
  doc_edition = doc.edition
  bib_edition = bib.edition
  doc_edition && bib_edition && doc_edition.content.to_i > bib_edition.content.to_i
end

#fetch ⇒ void

This method returns an undefined value.

Go through all ICS and fetch all documents.



53
54
55
56
57
58
59
60
61
62
# File 'lib/relaton_iso/data_fetcher.rb', line 53

#
# Run the full pipeline in order: collect links from the ICS pages, fetch the
# queued documents, save the queue and the index, then report errors.
# The steps are order-dependent and are executed strictly in sequence.
#
# @return [void]
#
def fetch # rubocop:disable Metrics/AbcSize
  Util.info "Scrapping ICS pages..."
  fetch_ics
  Util.info "(#{Time.now}) Scrapping documents..."
  fetch_docs
  # Save the queue state before the index.
  iso_queue.save
  # index.sort! { |a, b| compare_docids a, b }
  index.save
  repot_errors
end

#fetch_doc(docpath) ⇒ void

This method returns an undefined value.

Fetch document from ISO website.

Parameters:

  • docpath (String)

    document page path



160
161
162
163
164
165
# File 'lib/relaton_iso/data_fetcher.rb', line 160

#
# Parse one document page and save the result.
# Saving runs while holding @mutex; any StandardError is logged as a warning
# instead of being raised.
#
# @param [String] docpath document page path
#
# @return [void]
#
def fetch_doc(docpath)
  parsed = Scrapper.parse_page(docpath, errors: @errors)
  @mutex.synchronize do
    save_doc(parsed, docpath)
  end
rescue StandardError => e
  details = "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
  Util.warn details
end

#fetch_docs ⇒ Object



146
147
148
149
150
151
# File 'lib/relaton_iso/data_fetcher.rb', line 146

#
# Fetch queued documents using three workers.
# At most the first 10_001 entries of iso_queue (indexes 0..10_000) are handed
# to the workers through @queue; one :END marker per worker follows, then the
# workers are joined.
#
# @return [Array] the workers created by the `thread` helper
#
def fetch_docs
  workers = Array.new(3) do
    thread { |path| fetch_doc(path) }
  end
  iso_queue[0..10_000].each { |docpath| @queue.push(docpath) }
  workers.size.times { @queue.push(:END) }
  workers.each(&:join)
end

#fetch_ics ⇒ Object

Fetch ICS page recursively and store all the links to documents in the iso_queue.

Parameters:

  • path (String)

    path to ICS page



76
77
78
79
80
81
82
# File 'lib/relaton_iso/data_fetcher.rb', line 76

#
# Crawl the ICS catalogue: start three workers that handle paths taken from
# @queue, fetch the root ICS page on the calling thread, wait until @queue is
# empty, then send one :END marker per worker and join them.
#
# NOTE(review): an empty @queue only shows that nothing is waiting; whether a
# worker can still enqueue links after the :END markers depends on the `thread`
# helper, which is not visible here - confirm.
#
# @return [Array] the workers created by the `thread` helper
#
def fetch_ics
  workers = Array.new(3) do
    thread { |path| fetch_ics_page(path) }
  end
  fetch_ics_page "/standards-catalogue/browse-by-ics.html"

  # Poll once per second until no path is waiting in the queue.
  loop do
    break if @queue.empty?

    sleep(1)
  end

  workers.size.times { @queue.push(:END) }
  workers.each(&:join)
end

#fetch_ics_page(path) ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
# File 'lib/relaton_iso/data_fetcher.rb', line 84

#
# Download one ICS page and hand its parsed HTML to the link extractors.
# When no response is obtained an error is logged and nothing is parsed.
#
# @param [String] path path to ICS page
#
# @return [Object, nil] result of parse_ics_links, or nil when the page could not be fetched
#
def fetch_ics_page(path)
  resp = get_redirection(path)
  if resp
    page = Nokogiri::HTML(resp.body)
    parse_doc_links(page)
    parse_ics_links(page)
  else
    Util.error "Failed fetching ICS page #{url(path)}"
    nil
  end
end

#get_redirection(path) ⇒ Net::HTTPOK?

Get the page from the given path. If the page is redirected, get the page from the new path.

Parameters:

  • path (String)

    path to the page

Returns:

  • (Net::HTTPOK, nil)

    HTTP response



120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/relaton_iso/data_fetcher.rb', line 120

#
# Get the page from the given path. If the page is redirected, get the page
# from the new path (redirects are handled by get_response).
# Open/read timeouts and refused connections are retried for as long as
# check_try allows; after that a warning is logged and nil is returned.
#
# @param [String] path path to the page
#
# @return [Net::HTTPOK, nil] HTTP response, or nil when every attempt failed
#
def get_redirection(path) # rubocop:disable Metrics/MethodLength
  try = 0
  uri = URI url(path)
  begin
    get_response uri
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
    try += 1
    retry if check_try try, uri

    Util.warn "Failed fetching #{uri}, #{e.message}"
    # Return nil explicitly: otherwise the logger's return value would be the
    # method result, and callers test the result for truthiness.
    nil
  end
end

#get_response(uri) ⇒ Object



133
134
135
136
# File 'lib/relaton_iso/data_fetcher.rb', line 133

#
# Perform a GET request and follow a 302 redirect through get_redirection.
# A 302 that carries no Location header is returned as-is: following it would
# pass nil to url, which cannot be appended to the domain string.
#
# @param [URI] uri address to request
#
# @return [Net::HTTPResponse, nil] the response, or the result of following the redirect
#
def get_response(uri)
  resp = Net::HTTP.get_response(uri)
  location = resp["location"] if resp.code == "302"
  location ? get_redirection(location) : resp
end

#index ⇒ Object



22
23
24
# File 'lib/relaton_iso/data_fetcher.rb', line 22

#
# Lazily obtain the :iso index via Relaton::Index.find_or_create (using the
# file named by HitCollection::INDEXFILE) and memoize it in @index.
#
# @return [Object] the index returned by Relaton::Index.find_or_create
#
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:iso, file: HitCollection::INDEXFILE)
end

#iso_queue ⇒ Object



26
27
28
# File 'lib/relaton_iso/data_fetcher.rb', line 26

#
# Lazily create the RelatonIso::Queue instance and memoize it in @iso_queue.
#
# @return [RelatonIso::Queue]
#
def iso_queue
  return @iso_queue if @iso_queue

  @iso_queue = RelatonIso::Queue.new
end


96
97
98
99
100
# File 'lib/relaton_iso/data_fetcher.rb', line 96

#
# Collect document links from an ICS page and put each one, with its query
# string removed, at the front of iso_queue.
# @errors[:doc_links] is only overwritten while it is still truthy, so once a
# page has yielded links it stays false.
#
# @param [#xpath] page parsed page
#
# @return [Object] the node set returned by the XPath query
#
def parse_doc_links(page)
  anchors = page.xpath("//td[@data-title='Standard and/or project']/div/div/a")
  @errors[:doc_links] = anchors.empty? if @errors[:doc_links]
  anchors.each do |anchor|
    href_without_query = anchor[:href].split("?").first
    iso_queue.add_first(href_without_query)
  end
end


102
103
104
105
106
# File 'lib/relaton_iso/data_fetcher.rb', line 102

#
# Collect links to nested ICS pages and push each href onto @queue.
# @errors[:ics_links] is only overwritten while it is still truthy, so once a
# page has yielded links it stays false.
#
# @param [#xpath] page parsed page
#
# @return [Object] the node set returned by the XPath query
#
def parse_ics_links(page)
  anchors = page.xpath("//td[@data-title='ICS']/a")
  @errors[:ics_links] = anchors.empty? if @errors[:ics_links]
  anchors.each do |anchor|
    @queue << anchor[:href]
  end
end

#replace_substage98?(doc, bib) ⇒ Boolean

rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity

Returns:

  • (Boolean)


205
206
207
208
# File 'lib/relaton_iso/data_fetcher.rb', line 205

#
# Decide whether doc may replace bib when both carry the same edition.
# True when the edition contents are equal and either doc's substage is not
# "98" or bib's substage is "98" as well.
#
# @param [#edition, #status] doc candidate item
# @param [#edition, #status] bib stored item
#
# @return [Boolean]
#
def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  same_edition = doc.edition&.content == bib.edition&.content
  return false unless same_edition
  return true unless doc.status&.substage&.value == "98"

  bib.status&.substage&.value == "98"
end

#repot_errors ⇒ Object



64
65
66
67
68
69
# File 'lib/relaton_iso/data_fetcher.rb', line 64

#
# Log an error for every key in @errors whose flag is still truthy, then ask
# the GitHub issue channel to create an issue.
#
# @return [Object] result of @gh_issue.create_issue
#
def repot_errors
  @errors.each do |key, failed|
    Util.error "Failed to fetch #{key}" if failed
  end
  @gh_issue.create_issue
end

#rewrite_with_same_or_newer(doc, docid, file, docpath) ⇒ Object



190
191
192
193
194
195
196
197
198
199
# File 'lib/relaton_iso/data_fetcher.rb', line 190

#
# Compare a freshly fetched document with the one already stored in file and
# overwrite the file when the new one has a greater edition or passes
# replace_substage98?. Otherwise, when the file was already written during
# this run and the stored edition is not greater, warn about a duplicate.
#
# NOTE(review): the stored file is always read as YAML, while the file name is
# built from @ext - confirm this path is only reached for yaml output.
#
# @param [RelatonIsoBib::IsoBibliographicItem] doc fetched document
# @param [#id] docid primary document identifier
# @param [String] file path of the existing file
# @param [String] docpath document page path (used in the warning only)
#
# @return [Object, nil] result of write_file or Util.warn, nil when nothing was done
#
def rewrite_with_same_or_newer(doc, docid, file, docpath)
  stored_attrs = HashConverter.hash_to_bib(YAML.load_file(file))
  stored = ::RelatonIsoBib::IsoBibliographicItem.new(**stored_attrs)

  if edition_greater?(doc, stored) || replace_substage98?(doc, stored)
    return write_file(file, doc, docid)
  end
  return unless @files.include?(file) && !edition_greater?(stored, doc)

  Util.warn "Duplicate file `#{file}` for `#{docid.id}` from #{url(docpath)}"
end

#save_doc(doc, docpath) ⇒ void

This method returns an undefined value.

Save document to file.

Parameters:

  • doc (RelatonIsoBib::IsoBibliographicItem)

    document



178
179
180
181
182
183
184
185
186
187
188
# File 'lib/relaton_iso/data_fetcher.rb', line 178

#
# Save document to file.
# The file name is the primary identifier, lower-cased, with every run of
# whitespace, "/" or ":" replaced by "-". An existing file goes through
# rewrite_with_same_or_newer; otherwise the file is written directly.
# Afterwards docpath is moved to the end of iso_queue.
#
# @param [RelatonIsoBib::IsoBibliographicItem] doc document
# @param [String] docpath document page path
#
# @return [void]
#
def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = doc.docidentifier.find(&:primary)
  basename = docid.id.gsub(%r{[\s/:]+}, "-").downcase
  file = File.join(@output, "#{basename}.#{@ext}")

  if File.exist?(file)
    rewrite_with_same_or_newer(doc, docid, file, docpath)
  else
    write_file(file, doc, docid)
  end

  iso_queue.move_last(docpath)
end

#serialize(doc) ⇒ String

Serialize document to string.

Parameters:

  • doc (RelatonIsoBib::IsoBibliographicItem)

    document

Returns:

  • (String)

    serialized document



223
224
225
226
227
228
229
# File 'lib/relaton_iso/data_fetcher.rb', line 223

#
# Serialize document to string according to @format.
#
# @param [RelatonIsoBib::IsoBibliographicItem] doc document
#
# @return [String, nil] serialized document; nil when @format is none of
#   "yaml", "bibxml" or "xml"
#
def serialize(doc)
  if @format == "yaml"
    doc.to_hash.to_yaml
  elsif @format == "bibxml"
    doc.to_bibxml
  elsif @format == "xml"
    doc.to_xml(bibdata: true)
  end
end

#url(path) ⇒ Object



108
109
110
# File 'lib/relaton_iso/data_fetcher.rb', line 108

#
# Build a full URL by appending the path to Scrapper::DOMAIN.
#
# @param [String] path path to append to the domain
#
# @return [String] domain followed by path
#
def url(path)
  Scrapper::DOMAIN + path
end

#write_file(file, doc, docid) ⇒ Object



210
211
212
213
214
# File 'lib/relaton_iso/data_fetcher.rb', line 210

#
# Record the file as written, register the identifier in the index and write
# the serialized document to disk as UTF-8.
#
# @param [String] file path of the file to write
# @param [RelatonIsoBib::IsoBibliographicItem] doc document
# @param [#to_h] docid primary document identifier
#
# @return [Integer] number of bytes written
#
def write_file(file, doc, docid)
  @files.add(file)
  index.add_or_update(docid.to_h, file)
  content = serialize(doc)
  File.write(file, content, encoding: "UTF-8")
end