Class: RelatonIso::DataFetcher
- Inherits:
-
Object
- Object
- RelatonIso::DataFetcher
- Defined in:
- lib/relaton_iso/data_fetcher.rb
Overview
Fetch all the documents from ISO website.
Class Method Summary collapse
-
.fetch(output: "data", format: "yaml") ⇒ void
Initialize data fetcher and fetch data.
Instance Method Summary collapse
- #check_try(try, uri) ⇒ Object
-
#fetch ⇒ void
Go through all ICS and fetch all documents.
-
#fetch_doc(docpath) ⇒ void
Fetch document from ISO website.
- #fetch_docs ⇒ Object
-
#fetch_ics ⇒ Object
Fetch ICS page recursively and store all the links to documents in the iso_queue.
- #fetch_ics_page(path) ⇒ Object
-
#get_redirection(path) ⇒ Net::HTTPOK
Get the page from the given path.
- #get_response(uri) ⇒ Object
- #index ⇒ Object
-
#initialize(output, format) ⇒ DataFetcher
constructor
Initialize data fetcher.
- #iso_queue ⇒ Object
-
#save_doc(doc, docpath) ⇒ void
Save document to file.
-
#serialize(doc) ⇒ String
Serialize document to string.
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Initialize data fetcher.
10 11 12 13 14 15 16 17 |
# File 'lib/relaton_iso/data_fetcher.rb', line 10
#
# Initialize data fetcher.
#
# @param output [String] directory that serialized documents are written to
# @param format [String] output format name; "yaml", "bibxml" and "xml" are
#   the values handled by #serialize
#
def initialize(output, format)
  @output = output
  @format = format
  # File extension is the format name without a leading "bib"
  # ("bibxml" -> "xml"). delete_prefix only touches the very start of the
  # string; the former /^bib/ pattern could also match after a newline.
  @ext = format.delete_prefix("bib")
  # Paths already written; consulted by #save_doc to detect duplicates.
  @files = []
  # Paths are pushed onto this queue by #fetch_ics_page and #fetch_docs.
  @queue = ::Queue.new
  # Guards #save_doc so only one thread writes at a time.
  @mutex = Mutex.new
end
Class Method Details
.fetch(output: "data", format: "yaml") ⇒ void
This method returns an undefined value.
Initialize data fetcher and fetch data.
35 36 37 38 39 40 41 42 43 |
# File 'lib/relaton_iso/data_fetcher.rb', line 35
#
# Create the output directory, build a fetcher and run it, printing the start
# time, stop time and the elapsed time in whole seconds.
#
# @param output [String] output directory; created if it does not exist
# @param format [String] format handed to the new fetcher
# @return [void]
#
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p(output)
  new(output, format).fetch
  stopped_at = Time.now
  puts "Stopped at: #{stopped_at}"
  elapsed = (stopped_at - started_at).round
  puts "Done in: #{elapsed} sec."
end
Instance Method Details
#check_try(try, uri) ⇒ Object
112 113 114 115 116 117 118 |
# File 'lib/relaton_iso/data_fetcher.rb', line 112
#
# Decide whether a timed-out request should be attempted again. When another
# attempt is allowed, a warning is printed and the method pauses before
# returning.
#
# @param try [Integer] number of attempts made so far
# @param uri [URI, String] address being fetched; only used in the warning
# @param max_tries [Integer] attempt limit (defaults to the former fixed 3)
# @param delay [Numeric] seconds to sleep before a retry (defaults to the
#   former fixed 1)
# @return [true, nil] true when the caller should retry, nil when the limit
#   has been reached
#
def check_try(try, uri, max_tries: 3, delay: 1)
  return unless try < max_tries

  warn "Timeout fetching #{uri}, retrying..."
  sleep delay
  true
end
#fetch ⇒ void
This method returns an undefined value.
Go through all ICS and fetch all documents.
50 51 52 53 54 55 56 57 58 |
# File 'lib/relaton_iso/data_fetcher.rb', line 50
#
# Run the whole job: collect document links from the ICS pages, fetch the
# documents, then persist the ISO queue and the index.
#
# @return [void]
#
def fetch # rubocop:disable Metrics/AbcSize
  puts "Scrapping ICS pages..."
  fetch_ics

  timestamp = Time.now
  puts "[#{timestamp}] Scrapping documents..."
  fetch_docs

  iso_queue.save
  # index.sort! { |a, b| compare_docids a, b }
  index.save
end
#fetch_doc(docpath) ⇒ void
This method returns an undefined value.
Fetch document from ISO website.
134 135 136 137 138 139 140 141 142 143 |
# File 'lib/relaton_iso/data_fetcher.rb', line 134
#
# Fetch one document from the ISO website and save it. Any StandardError
# raised while parsing or saving is reported on stderr (URL, message,
# backtrace) and not re-raised.
#
# @param docpath [String] document path, appended to Scrapper::DOMAIN
# @return [void]
#
def fetch_doc(docpath)
  doc = Scrapper.parse_page docpath
  # Saving goes through the shared mutex so only one thread runs save_doc.
  @mutex.synchronize { save_doc doc, docpath }
rescue StandardError => e
  warn "Error fetching document: #{Scrapper::DOMAIN}#{docpath}"
  warn e.message
  warn e.backtrace
end
#fetch_docs ⇒ Object
120 121 122 123 124 125 |
# File 'lib/relaton_iso/data_fetcher.rb', line 120
#
# Fetch documents with three workers created through the `thread` helper
# (defined outside this listing). The entries of iso_queue at indices
# 0..10_000 are pushed onto @queue, followed by one :END marker per worker
# (presumably the stop signal for `thread` -- confirm against that helper),
# and then every worker is joined.
#
def fetch_docs
  workers = Array.new(3) do
    thread { |path| fetch_doc(path) }
  end

  batch = iso_queue[0..10_000]
  batch.each { |docpath| @queue << docpath }

  workers.size.times { @queue << :END }
  workers.each(&:join)
end
#fetch_ics ⇒ Object
Fetch ICS page recursively and store all the links to documents in the iso_queue.
65 66 67 68 69 70 71 |
# File 'lib/relaton_iso/data_fetcher.rb', line 65
#
# Fetch ICS pages, starting from the catalogue root, and store the document
# links in the iso_queue. Three workers created through the `thread` helper
# (defined outside this listing) handle the paths that #fetch_ics_page pushes
# onto @queue.
#
# NOTE(review): completion is detected by polling @queue.empty? once a
# second; a worker that is still processing a page may push more paths after
# the queue was seen empty -- confirm how `thread` handles that.
#
def fetch_ics
  workers = Array.new(3) do
    thread { |path| fetch_ics_page(path) }
  end

  fetch_ics_page "/standards-catalogue/browse-by-ics.html"

  until @queue.empty?
    sleep(1)
  end

  # One :END marker per worker, then wait for all of them.
  workers.size.times { @queue << :END }
  workers.each(&:join)
end
#fetch_ics_page(path) ⇒ Object
73 74 75 76 77 78 79 80 81 82 83 |
# File 'lib/relaton_iso/data_fetcher.rb', line 73
#
# Fetch a single ICS page. Links to standards are added to the iso_queue
# (with any query string removed) and links to further ICS pages are pushed
# onto @queue.
#
# @param path [String] page path, resolved by #get_redirection
# @return [nil, Object] nil when the page could not be fetched
#
def fetch_ics_page(path)
  resp = get_redirection path
  # get_redirection returns nil once its retries are exhausted (it has
  # already warned); skip the page instead of failing on nil.body.
  return unless resp

  page = Nokogiri::HTML(resp.body)
  page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
    iso_queue.add_first item[:href].split("?").first
  end

  page.xpath("//td[@data-title='ICS']/a").each do |item|
    @queue << item[:href]
  end
end
#get_redirection(path) ⇒ Net::HTTPOK
Get the page from the given path. If the page is redirected, get the page from the new path.
93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/relaton_iso/data_fetcher.rb', line 93
#
# Get the page at the given path on Scrapper::DOMAIN. Redirects are followed
# by #get_response. Open/read timeouts are retried for as long as #check_try
# allows; after that the failure is reported on stderr.
#
# @param path [String] path appended to Scrapper::DOMAIN
# @return [Net::HTTPOK, nil] the response, or nil when every attempt timed out
#
def get_redirection(path) # rubocop:disable Metrics/MethodLength
  try = 0
  uri = URI(Scrapper::DOMAIN + path)
  begin
    get_response uri
  rescue Net::OpenTimeout, Net::ReadTimeout => e
    try += 1
    retry if check_try try, uri

    warn "Error fetching #{uri}"
    warn e.message
    nil
  end
end
#get_response(uri) ⇒ Object
107 108 109 110 |
# File 'lib/relaton_iso/data_fetcher.rb', line 107
#
# Perform a GET request. A response with status code "302" is followed by
# passing its Location header to #get_redirection; any other response is
# returned as is.
#
# @param uri [URI] address to request
# @return [Object] the final response (whatever #get_redirection returns for
#   a redirect)
#
def get_response(uri)
  response = Net::HTTP.get_response(uri)
  return response unless response.code == "302"

  get_redirection(response["location"])
end
#index ⇒ Object
19 20 21 |
# File 'lib/relaton_iso/data_fetcher.rb', line 19
#
# Memoized index obtained from Relaton::Index.find_or_create for :iso, backed
# by HitCollection::INDEXFILE.
#
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:iso, file: HitCollection::INDEXFILE)
end
#iso_queue ⇒ Object
23 24 25 |
# File 'lib/relaton_iso/data_fetcher.rb', line 23
#
# Memoized RelatonIso::Queue instance that holds the document paths.
#
def iso_queue
  return @iso_queue if @iso_queue

  @iso_queue = RelatonIso::Queue.new
end
#save_doc(doc, docpath) ⇒ void
This method returns an undefined value.
save document to file.
156 157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/relaton_iso/data_fetcher.rb', line 156
#
# Save a document to a file named after its primary identifier (runs of
# whitespace, "/" and ":" become "-", lowercased). A path that was already
# written during this run is reported as a duplicate and not written again.
# In both cases the document path is moved to the end of the iso_queue.
#
# @param doc [Object] document exposing #docidentifier
# @param docpath [String] path the document was fetched from
# @return [void]
#
def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  primary_id = doc.docidentifier.detect(&:primary)
  base_name = primary_id.id.gsub(/[\s\/:]+/, "-").downcase
  target = File.join(@output, "#{base_name}.#{@ext}")

  if @files.include?(target)
    warn "Duplicate file #{target} for #{primary_id.id} from #{Scrapper::DOMAIN}#{docpath}"
  else
    @files << target
    index.add_or_update(primary_id.to_h, target)
    File.write(target, serialize(doc), encoding: "UTF-8")
  end

  iso_queue.move_last(docpath)
end
#serialize(doc) ⇒ String
Serialize document to string.
177 178 179 180 181 182 183 |
# File 'lib/relaton_iso/data_fetcher.rb', line 177
#
# Serialize document to string according to @format.
#
# @param doc [Object] document responding to #to_hash, #to_bibxml and #to_xml
# @return [String] serialized document
# @raise [ArgumentError] when @format is not "yaml", "bibxml" or "xml";
#   previously this case returned nil, which the caller would have written
#   out as an empty file
#
def serialize(doc)
  case @format
  when "yaml" then doc.to_hash.to_yaml
  when "bibxml" then doc.to_bibxml
  when "xml" then doc.to_xml bibdata: true
  else raise ArgumentError, "Unsupported format: #{@format.inspect}"
  end
end