Class: RelatonIso::DataFetcher
- Inherits:
-
Object
- Object
- RelatonIso::DataFetcher
- Defined in:
- lib/relaton_iso/data_fetcher.rb
Overview
Fetches all documents from the ISO website.
Class Method Summary collapse
-
.fetch(output: "data", format: "yaml") ⇒ void
Initialize data fetcher and fetch data.
Instance Method Summary collapse
- #check_try(try, uri) ⇒ Object
- #edition_greater?(doc, bib) ⇒ Boolean
-
#fetch ⇒ void
Go through all ICS and fetch all documents.
-
#fetch_doc(docpath) ⇒ void
Fetch document from ISO website.
- #fetch_docs ⇒ Object
-
#fetch_ics ⇒ Object
Fetch ICS page recursively and store all the links to documents in the iso_queue.
- #fetch_ics_page(path) ⇒ Object
-
#get_redirection(path) ⇒ Net::HTTPOK?
Get the page from the given path.
- #get_response(uri) ⇒ Object
- #index ⇒ Object
-
#initialize(output, format) ⇒ DataFetcher
constructor
Initialize data fetcher.
- #iso_queue ⇒ Object
- #parse_doc_links(page) ⇒ Object
- #parse_ics_links(page) ⇒ Object
-
#replace_substage98?(doc, bib) ⇒ Boolean
Check whether a fetched document should replace a stored one of the same edition (true unless the fetched document is at substage 98 and the stored one is not).
- #repot_errors ⇒ Object
- #rewrite_with_same_or_newer(doc, docid, file, docpath) ⇒ Object
-
#save_doc(doc, docpath) ⇒ void
Save document to file.
-
#serialize(doc) ⇒ String
Serialize document to string.
- #url(path) ⇒ Object
- #write_file(file, doc, docid) ⇒ Object
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Initialize data fetcher.
10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/relaton_iso/data_fetcher.rb', line 10
#
# Initialize data fetcher.
#
# @param output [String] directory the fetched documents are written to
# @param format [String] output format; a leading "bib" is stripped to get
#   the file extension (e.g. "bibxml" gives "xml")
def initialize(output, format) # rubocop:disable Metrics/AbcSize
  @output = output
  @format = format
  @ext = format.sub(/^bib/, "")

  # Bookkeeping shared by the fetch steps.
  @files = Set.new
  @queue = ::Queue.new
  @mutex = Mutex.new

  # Route :error level log records to a GitHub issue channel.
  repo = "relaton/relaton-iso"
  title = "Error fetching ISO documents"
  @gh_issue = Relaton::Logger::Channels::GhIssue.new(repo, title)
  Relaton.logger_pool[:gh_issue] = Relaton::Logger::Log.new(@gh_issue, levels: [:error])

  # Any key that was never written reads as true.
  @errors = Hash.new(true)
end
Class Method Details
.fetch(output: "data", format: "yaml") ⇒ void
This method returns an undefined value.
Initialize data fetcher and fetch data.
38 39 40 41 42 43 44 45 46 |
# File 'lib/relaton_iso/data_fetcher.rb', line 38
#
# Initialize data fetcher and fetch data, logging start time, stop time and
# the elapsed time in whole seconds.
#
# @param output [String] output directory, created if it does not exist
# @param format [String] output format passed to the constructor
# @return [void]
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  Util.info "Started at: #{started_at}"

  FileUtils.mkdir_p(output)
  new(output, format).fetch

  stopped_at = Time.now
  Util.info "Stopped at: #{stopped_at}"
  elapsed = (stopped_at - started_at).round
  Util.info "Done in: #{elapsed} sec."
end
Instance Method Details
#check_try(try, uri) ⇒ Object
138 139 140 141 142 143 144 |
# File 'lib/relaton_iso/data_fetcher.rb', line 138
#
# Decide whether another attempt should be made after a failed request.
# When it should, a warning is logged and the call pauses for one second.
#
# @param try [Integer] number of attempts made so far
# @param uri [URI, String] address being fetched (used in the log message)
# @return [true, nil] true when the caller should retry, nil otherwise
def check_try(try, uri)
  return unless try < 3

  Util.warn "Timeout fetching #{uri}, retrying..."
  sleep 1
  true
end
#edition_greater?(doc, bib) ⇒ Boolean
201 202 203 |
# File 'lib/relaton_iso/data_fetcher.rb', line 201
#
# Compare the editions of two documents as integers.
#
# @param doc [Object] document whose edition is expected to be greater
# @param bib [Object] document to compare against
# @return [Boolean, nil] true when both have an edition and the integer value
#   of doc's edition content exceeds bib's; falsy when either edition is missing
def edition_greater?(doc, bib)
  doc_edition = doc.edition
  bib_edition = doc_edition && bib.edition
  bib_edition && doc_edition.content.to_i > bib_edition.content.to_i
end
#fetch ⇒ void
This method returns an undefined value.
Go through all ICS and fetch all documents.
53 54 55 56 57 58 59 60 61 62 |
# File 'lib/relaton_iso/data_fetcher.rb', line 53
#
# Go through all ICS and fetch all documents, then save the queue and the
# index and report collected errors.
#
# @return [void]
def fetch # rubocop:disable Metrics/AbcSize
  # Phase 1: collect document links from the ICS pages.
  Util.info("Scrapping ICS pages...")
  fetch_ics

  # Phase 2: fetch the documents themselves.
  Util.info("(#{Time.now}) Scrapping documents...")
  fetch_docs

  # Phase 3: persist state and report.
  iso_queue.save
  # index.sort! { |a, b| compare_docids a, b }
  index.save
  repot_errors
end
#fetch_doc(docpath) ⇒ void
This method returns an undefined value.
Fetch document from ISO website.
160 161 162 163 164 165 |
# File 'lib/relaton_iso/data_fetcher.rb', line 160
#
# Fetch document from ISO website and save it. Any StandardError raised while
# parsing or saving is logged as a warning instead of being propagated.
#
# @param docpath [String] path of the document page
# @return [void]
def fetch_doc(docpath)
  parsed = Scrapper.parse_page(docpath, errors: @errors)
  # Saving goes through the shared mutex.
  @mutex.synchronize do
    save_doc(parsed, docpath)
  end
rescue StandardError => e
  Util.warn "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
end
#fetch_docs ⇒ Object
146 147 148 149 150 151 |
# File 'lib/relaton_iso/data_fetcher.rb', line 146
#
# Feed document paths from iso_queue to three workers that call #fetch_doc,
# then signal the workers to stop and wait for them.
#
# NOTE(review): `thread` is defined outside this listing; presumably it starts
# a worker that takes paths from @queue until it receives :END - confirm.
def fetch_docs
  workers = Array.new(3) { thread { |path| fetch_doc(path) } }

  # Only the 0..10_000 slice of the queue is processed per run.
  iso_queue[0..10_000].each do |docpath|
    @queue << docpath
  end

  # One :END marker per worker.
  workers.each { @queue << :END }
  workers.each { |worker| worker.join }
end
#fetch_ics ⇒ Object
Fetch ICS page recursively and store all the links to documents in the iso_queue.
76 77 78 79 80 81 82 |
# File 'lib/relaton_iso/data_fetcher.rb', line 76
#
# Fetch ICS page recursively and store all the links to documents in the
# iso_queue. The root page is fetched on the calling thread; three workers
# handle the ICS paths that get pushed to @queue.
#
# NOTE(review): `thread` is defined outside this listing; presumably it starts
# a worker that takes paths from @queue until it receives :END - confirm.
# NOTE(review): an empty queue does not by itself show that the workers are
# idle; verify that no worker can still enqueue paths after :END is pushed.
def fetch_ics
  workers = Array.new(3) { thread { |path| fetch_ics_page(path) } }
  fetch_ics_page("/standards-catalogue/browse-by-ics.html")

  # Poll once per second until the queue has been drained.
  loop do
    break if @queue.empty?

    sleep(1)
  end

  # One :END marker per worker.
  workers.each { @queue << :END }
  workers.each { |worker| worker.join }
end
#fetch_ics_page(path) ⇒ Object
84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/relaton_iso/data_fetcher.rb', line 84
#
# Fetch one ICS page and hand its HTML to the document-link and ICS-link
# parsers. Logs an error and returns nil when the page cannot be fetched.
#
# @param path [String] path of the ICS page
def fetch_ics_page(path)
  resp = get_redirection(path)

  if resp
    page = Nokogiri::HTML(resp.body)
    parse_doc_links(page)
    parse_ics_links(page)
  else
    Util.error "Failed fetching ICS page #{url(path)}"
    nil
  end
end
#get_redirection(path) ⇒ Net::HTTPOK?
Get the page from the given path. If the page is redirected, get the page from the new path.
120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/relaton_iso/data_fetcher.rb', line 120
#
# Get the page from the given path. If the page is redirected, get the page
# from the new path. Timeouts and refused connections are retried for as long
# as #check_try allows; after that a warning is logged.
#
# @param path [String] path of the page, appended to the domain by #url
# @return [Net::HTTPResponse, nil] the response from #get_response, or nil
#   when every attempt failed
def get_redirection(path) # rubocop:disable Metrics/MethodLength
  try = 0
  uri = URI url(path)
  begin
    get_response uri
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
    try += 1
    retry if check_try try, uri

    Util.warn "Failed fetching #{uri}, #{e.message}"
    # Callers test the result for truthiness, so the logger's return value
    # must not leak out as if it were a response.
    nil
  end
end
#get_response(uri) ⇒ Object
133 134 135 136 |
# File 'lib/relaton_iso/data_fetcher.rb', line 133
#
# Request the URI and follow a 302 response to the path given in its
# "location" header.
#
# @param uri [URI] address to request
# @return [Object] the response itself, or the result of #get_redirection
#   for the redirect target when the status code is "302"
def get_response(uri)
  response = Net::HTTP.get_response(uri)
  return response unless response.code == "302"

  get_redirection(response["location"])
end
#index ⇒ Object
22 23 24 |
# File 'lib/relaton_iso/data_fetcher.rb', line 22
#
# Memoized :iso index backed by HitCollection::INDEXFILE.
#
# @return [Object] the object returned by Relaton::Index.find_or_create
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:iso, file: HitCollection::INDEXFILE)
end
#iso_queue ⇒ Object
26 27 28 |
# File 'lib/relaton_iso/data_fetcher.rb', line 26
#
# Memoized RelatonIso::Queue instance.
#
# @return [RelatonIso::Queue]
def iso_queue
  return @iso_queue if @iso_queue

  @iso_queue = RelatonIso::Queue.new
end
#parse_doc_links(page) ⇒ Object
96 97 98 99 100 |
# File 'lib/relaton_iso/data_fetcher.rb', line 96
#
# Collect the document links of an ICS page and add each one, without its
# query string, to the front of iso_queue.
#
# @param page [Object] parsed page responding to #xpath
# @return [Object] the node set returned by the XPath query
def parse_doc_links(page)
  doc_links = page.xpath("//td[@data-title='Standard and/or project']/div/div/a")

  # The flag is only rewritten while it is still truthy, so it stays false
  # once any page has yielded links.
  @errors[:doc_links] = doc_links.empty? if @errors[:doc_links]

  doc_links.each do |link|
    href = link[:href]
    iso_queue.add_first(href.split("?").first)
  end
end
#parse_ics_links(page) ⇒ Object
102 103 104 105 106 |
# File 'lib/relaton_iso/data_fetcher.rb', line 102
#
# Collect the ICS links of a page and push each href onto @queue.
#
# @param page [Object] parsed page responding to #xpath
# @return [Object] the node set returned by the XPath query
def parse_ics_links(page)
  ics_links = page.xpath("//td[@data-title='ICS']/a")

  # The flag is only rewritten while it is still truthy, so it stays false
  # once any page has yielded links.
  @errors[:ics_links] = ics_links.empty? if @errors[:ics_links]

  ics_links.each do |link|
    @queue << link[:href]
  end
end
#replace_substage98?(doc, bib) ⇒ Boolean
Check whether a fetched document should replace a stored one of the same edition (true unless the fetched document is at substage 98 and the stored one is not).
205 206 207 208 |
# File 'lib/relaton_iso/data_fetcher.rb', line 205
#
# Check whether a fetched document should replace a stored one of the same
# edition.
#
# @param doc [Object] fetched document
# @param bib [Object] stored document
# @return [Boolean] true when both edition contents are equal, unless doc is
#   at substage "98" while bib is not
def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  same_edition = doc.edition&.content == bib.edition&.content
  return same_edition unless same_edition

  !(doc.status&.substage&.value == "98" && bib.status&.substage&.value != "98")
end
#repot_errors ⇒ Object
64 65 66 67 68 69 |
# File 'lib/relaton_iso/data_fetcher.rb', line 64
#
# Log an error for every key in @errors whose flag is still truthy, then ask
# the GitHub issue channel to create an issue.
#
# @return [Object] whatever @gh_issue.create_issue returns
def repot_errors
  @errors.each do |key, failed|
    Util.error "Failed to fetch #{key}" if failed
  end
  @gh_issue.create_issue
end
#rewrite_with_same_or_newer(doc, docid, file, docpath) ⇒ Object
190 191 192 193 194 195 196 197 198 199 |
# File 'lib/relaton_iso/data_fetcher.rb', line 190
#
# Decide what to do when a file for the document already exists: overwrite
# it when the fetched document has a greater edition or #replace_substage98?
# approves, otherwise warn about a duplicate if the file was written during
# this run and the stored edition is not greater than the fetched one.
#
# NOTE(review): the stored file is always parsed as YAML, while #serialize
# can also produce "bibxml" and "xml" - confirm this path is only reached for
# YAML output.
#
# @param doc [Object] fetched document
# @param docid [Object] primary document identifier
# @param file [String] path of the existing file
# @param docpath [String] path of the document page (used in the warning)
def rewrite_with_same_or_newer(doc, docid, file, docpath)
  stored_hash = YAML.load_file(file)
  stored_args = HashConverter.hash_to_bib(stored_hash)
  stored = ::RelatonIsoBib::IsoBibliographicItem.new(**stored_args)

  return write_file(file, doc, docid) if edition_greater?(doc, stored) || replace_substage98?(doc, stored)
  return unless @files.include?(file)
  return if edition_greater?(stored, doc)

  Util.warn "Duplicate file `#{file}` for `#{docid.id}` from #{url(docpath)}"
end
#save_doc(doc, docpath) ⇒ void
This method returns an undefined value.
Save document to file.
178 179 180 181 182 183 184 185 186 187 188 |
# File 'lib/relaton_iso/data_fetcher.rb', line 178
#
# Save document to file. The file name is the primary identifier with runs
# of whitespace, "/" and ":" replaced by "-", lowercased. An existing file
# goes through #rewrite_with_same_or_newer; otherwise the file is written
# directly. Finally the path is moved to the end of iso_queue.
#
# @param doc [Object] fetched document
# @param docpath [String] path of the document page
# @return [void]
def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = doc.docidentifier.detect { |id| id.primary }
  basename = docid.id.gsub(/[\s\/:]+/, "-").downcase
  file = File.join(@output, "#{basename}.#{@ext}")

  already_saved = File.exist?(file)
  rewrite_with_same_or_newer(doc, docid, file, docpath) if already_saved
  write_file(file, doc, docid) unless already_saved

  iso_queue.move_last(docpath)
end
#serialize(doc) ⇒ String
Serialize document to string.
223 224 225 226 227 228 229 |
# File 'lib/relaton_iso/data_fetcher.rb', line 223
#
# Serialize document to string according to @format.
#
# @param doc [Object] document to serialize
# @return [String, nil] YAML for "yaml", BibXML for "bibxml", XML with
#   bibdata for "xml"; nil for any other format
def serialize(doc)
  if @format == "yaml"
    doc.to_hash.to_yaml
  elsif @format == "bibxml"
    doc.to_bibxml
  elsif @format == "xml"
    doc.to_xml(bibdata: true)
  end
end
#url(path) ⇒ Object
108 109 110 |
# File 'lib/relaton_iso/data_fetcher.rb', line 108
#
# Build an address by appending the path to Scrapper::DOMAIN.
#
# @param path [String] path to append
# @return [String] domain followed by path
def url(path)
  domain = Scrapper::DOMAIN
  domain + path
end
#write_file(file, doc, docid) ⇒ Object
210 211 212 213 214 |
# File 'lib/relaton_iso/data_fetcher.rb', line 210
#
# Record the file as written, register the identifier in the index and write
# the serialized document to disk as UTF-8.
#
# @param file [String] destination path
# @param doc [Object] document to serialize
# @param docid [Object] primary document identifier (its #to_h goes to the index)
# @return [Integer] number of bytes written, as returned by File.write
def write_file(file, doc, docid)
  @files.add(file)
  index.add_or_update(docid.to_h, file)
  File.write(file, serialize(doc), encoding: "UTF-8")
end