Class: RelatonIso::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_iso/data_fetcher.rb

Overview

Fetch all the documents from ISO website.

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Initialize data fetcher.

Parameters:

  • output (String)

    output directory

  • format (String)

    format of output files (yaml, bibxml, xml)



10
11
12
13
14
15
16
17
# File 'lib/relaton_iso/data_fetcher.rb', line 10

# Set up the state shared by the fetch steps.
#
# @param output [String] output directory
# @param format [String] format of output files (yaml, bibxml, xml)
def initialize(output, format)
  @output, @format = output, format
  # The file extension is the format name without a leading "bib"
  # ("bibxml" -> "xml"); other formats are used unchanged.
  @ext = format.sub(/^bib/, "")
  # Paths already written; save_doc consults this list to detect duplicates.
  @files = []
  # Guards save_doc when documents are fetched from several threads.
  @mutex = Mutex.new
  # Work queue consumed by the worker threads.
  @queue = ::Queue.new
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ void

This method returns an undefined value.

Initialize data fetcher and fetch data.

Parameters:

  • output (String) (defaults to: "data")

    output directory (default: “data”)

  • format (String) (defaults to: "yaml")

    format of output files. Allowed: yaml (default), bibxml, xml



35
36
37
38
39
40
41
42
43
# File 'lib/relaton_iso/data_fetcher.rb', line 35

# Create the output directory, run a complete fetch and report timing.
#
# @param output [String] output directory (default: "data")
# @param format [String] format of output files. Allowed: yaml (default), bibxml, xml
#
# @return [void]
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"

  FileUtils.mkdir_p(output)
  fetcher = new(output, format)
  fetcher.fetch

  finished_at = Time.now
  puts "Stopped at: #{finished_at}"
  elapsed = (finished_at - started_at).round
  puts "Done in: #{elapsed} sec."
end

Instance Method Details

#check_try(try, uri) ⇒ Object



112
113
114
115
116
117
118
# File 'lib/relaton_iso/data_fetcher.rb', line 112

# Decide whether a timed-out request should be attempted again.
#
# @param try [Integer] number of failed attempts so far
# @param uri [URI, String] address being fetched (only used in the message)
#
# @return [true, nil] true after warning and pausing for a second when
#   fewer than 3 attempts have failed, nil otherwise
def check_try(try, uri)
  return unless try < 3

  warn "Timeout fetching #{uri}, retrying..."
  sleep 1
  true
end

#fetch ⇒ void

This method returns an undefined value.

Go through all ICS and fetch all documents.



50
51
52
53
54
55
56
57
58
# File 'lib/relaton_iso/data_fetcher.rb', line 50

# Run the whole pipeline: collect document links from the ICS pages,
# fetch the documents, then persist the queue and the index.
#
# @return [void]
def fetch # rubocop:disable Metrics/AbcSize
  puts("Scrapping ICS pages...")
  fetch_ics

  puts("[#{Time.now}] Scrapping documents...")
  fetch_docs

  # The queue is saved before the index is touched.
  iso_queue.save
  # index.sort! { |a, b| compare_docids a, b }
  index.save
end

#fetch_doc(docpath) ⇒ void

This method returns an undefined value.

Fetch document from ISO website.

Parameters:

  • docpath (String)

    document page path



134
135
136
137
138
139
140
141
142
143
# File 'lib/relaton_iso/data_fetcher.rb', line 134

# Parse one document page and save the result.
#
# Any StandardError raised while parsing or saving is reported with
# warn (URL, message, backtrace) instead of being propagated.
#
# @param docpath [String] document page path
#
# @return [void]
def fetch_doc(docpath)
  begin
    parsed = Scrapper.parse_page(docpath)
    # save_doc runs under the mutex so only one thread saves at a time.
    @mutex.synchronize { save_doc(parsed, docpath) }
  rescue StandardError => error
    warn "Error fetching document: #{Scrapper::DOMAIN}#{docpath}"
    warn error.message
    warn error.backtrace
  end
end

#fetch_docs ⇒ Object



120
121
122
123
124
125
# File 'lib/relaton_iso/data_fetcher.rb', line 120

# Fetch the queued documents using three worker threads.
#
# The slice 0..10_000 of iso_queue is pushed onto @queue, followed by
# one :END marker per worker; the method returns once every worker has
# been joined.
# NOTE(review): `thread` is defined outside this listing; presumably each
# worker pops paths from @queue until it receives :END — confirm.
#
# @return [Array] the worker threads
def fetch_docs
  workers = Array.new(3) do
    thread { |path| fetch_doc(path) }
  end
  iso_queue[0..10_000].each { |docpath| @queue.push(docpath) }
  workers.each { @queue.push(:END) }
  workers.each(&:join)
end

#fetch_ics ⇒ Object

Fetch ICS page recursively and store all the links to documents in the iso_queue.

Parameters:

  • path (String)

    path to ICS page



65
66
67
68
69
70
71
# File 'lib/relaton_iso/data_fetcher.rb', line 65

# Crawl the ICS catalogue starting from the browse-by-ICS page.
#
# The root page is fetched on the calling thread; the paths it pushes
# onto @queue are handled by three worker threads. Once @queue is
# observed empty, one :END marker per worker is queued and the workers
# are joined.
# NOTE(review): `thread` is defined outside this listing; presumably each
# worker pops paths from @queue until it receives :END — confirm.
# NOTE(review): an empty queue does not show that every worker is idle; a
# page still being processed could enqueue more paths after the :END
# markers — verify against the `thread` helper.
#
# @return [Array] the worker threads
def fetch_ics
  workers = Array.new(3) do
    thread { |path| fetch_ics_page(path) }
  end
  fetch_ics_page("/standards-catalogue/browse-by-ics.html")
  until @queue.empty?
    sleep(1)
  end
  workers.each { @queue.push(:END) }
  workers.each(&:join)
end

#fetch_ics_page(path) ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
# File 'lib/relaton_iso/data_fetcher.rb', line 73

# Fetch one ICS page and enqueue everything it links to.
#
# Links to standards are added to iso_queue (with any query string
# removed); links to nested ICS pages are pushed onto @queue.
#
# @param path [String] path to the ICS page
#
# @return [Object, nil] nil when the page could not be fetched
def fetch_ics_page(path)
  resp = get_redirection path
  # get_redirection returns nil once its timeout retries are exhausted (it
  # has already warned about the failure). Skip the page instead of raising
  # NoMethodError on nil.
  return unless resp

  page = Nokogiri::HTML(resp.body)
  page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
    iso_queue.add_first item[:href].split("?").first
  end

  page.xpath("//td[@data-title='ICS']/a").each do |item|
    @queue << item[:href]
  end
end

#get_redirection(path) ⇒ Net::HTTPOK

Get the page from the given path. If the page is redirected, get the page from the new path.

Parameters:

  • path (String)

    path to the page

Returns:

  • (Net::HTTPOK)

    HTTP response



93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/relaton_iso/data_fetcher.rb', line 93

# Request the page at the given path on Scrapper::DOMAIN.
#
# Open/read timeouts are retried for as long as check_try allows. When
# it stops allowing retries, the failure is reported with warn and nil
# is returned.
#
# @param path [String] path to the page
#
# @return [Object, nil] whatever get_response returns, or nil after the
#   final timeout
def get_redirection(path) # rubocop:disable Metrics/MethodLength
  uri = URI(Scrapper::DOMAIN + path)
  attempts = 0
  begin
    get_response(uri)
  rescue Net::OpenTimeout, Net::ReadTimeout => error
    attempts += 1
    retry if check_try(attempts, uri)

    warn "Error fetching #{uri}"
    warn error.message
  end
end

#get_response(uri) ⇒ Object



107
108
109
110
# File 'lib/relaton_iso/data_fetcher.rb', line 107

# Perform a GET request and follow a 302 redirect.
#
# @param uri [URI] address to request
#
# @return [Object] the response itself, or for a "302" response the
#   result of get_redirection on its "location" header
def get_response(uri)
  response = Net::HTTP.get_response(uri)
  return response unless response.code == "302"

  get_redirection(response["location"])
end

#index ⇒ Object



19
20
21
# File 'lib/relaton_iso/data_fetcher.rb', line 19

# Memoized index, obtained on first use from
# Relaton::Index.find_or_create for :iso with HitCollection::INDEXFILE.
#
# @return [Object] the index returned by Relaton::Index.find_or_create
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:iso, file: HitCollection::INDEXFILE)
end

#iso_queue ⇒ Object



23
24
25
# File 'lib/relaton_iso/data_fetcher.rb', line 23

# Memoized RelatonIso::Queue instance, created on first use.
#
# @return [RelatonIso::Queue]
def iso_queue
  return @iso_queue if @iso_queue

  @iso_queue = RelatonIso::Queue.new
end

#save_doc(doc, docpath) ⇒ void

This method returns an undefined value.

Save document to file.

Parameters:

  • doc (RelatonIsoBib::IsoBibliographicItem)

    document



156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/relaton_iso/data_fetcher.rb', line 156

# Write a document to the output directory and record it in the index.
#
# The file name is the primary document identifier, lower-cased, with
# each run of whitespace, "/" or ":" replaced by "-". If that file was
# already written during this run, only a warning is emitted. In both
# cases docpath is moved to the end of iso_queue.
#
# @param doc [RelatonIsoBib::IsoBibliographicItem] document
# @param docpath [String] document page path
#
# @return [void]
def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  primary_id = doc.docidentifier.detect(&:primary)
  basename = primary_id.id.gsub(/[\s\/:]+/, "-").downcase
  target = File.join(@output, "#{basename}.#{@ext}")

  if @files.include?(target)
    warn "Duplicate file #{target} for #{primary_id.id} from #{Scrapper::DOMAIN}#{docpath}"
  else
    @files.push(target)
    index.add_or_update(primary_id.to_h, target)
    File.write(target, serialize(doc), encoding: "UTF-8")
  end

  iso_queue.move_last(docpath)
end

#serialize(doc) ⇒ String

Serialize document to string.

Parameters:

  • doc (RelatonIsoBib::IsoBibliographicItem)

    document

Returns:

  • (String)

    serialized document



177
178
179
180
181
182
183
# File 'lib/relaton_iso/data_fetcher.rb', line 177

# Serialize a document according to @format.
#
# @param doc [RelatonIsoBib::IsoBibliographicItem] document
#
# @return [String, nil] serialized document; nil when @format is not
#   "yaml", "bibxml" or "xml"
def serialize(doc)
  if @format == "yaml"
    doc.to_hash.to_yaml
  elsif @format == "bibxml"
    doc.to_bibxml
  elsif @format == "xml"
    doc.to_xml(bibdata: true)
  end
end