Class: RelatonJis::DataFetcher
- Inherits:
- Object
- Inheritance hierarchy:
- Object → RelatonJis::DataFetcher
- Defined in:
- lib/relaton_jis/data_fetcher.rb
Constant Summary collapse
- URL =
"https://webdesk.jsa.or.jp/books/".freeze
- INDEX_FILE =
"index-v1.yaml".freeze
Class Method Summary collapse
- .fetch(output: "data", format: "yaml") ⇒ Object
Instance Method Summary collapse
- #agent ⇒ Object
- #count ⇒ Object
- #create_thread_pool(size) ⇒ Object
- #end_threads_and_wait ⇒ Object
- #fetch ⇒ Object
-
#fetch_doc(url) ⇒ Object
Fetches the document at the given URL, retrying failed attempts, and saves it.
- #file(id) ⇒ Object
-
#get_next_page(offset) ⇒ Object
Posts a request for the next result page at the given offset, retrying failed attempts.
- #index ⇒ Object
- #initial_post ⇒ Object
-
#initialize(output, format) ⇒ DataFetcher
constructor
A new instance of DataFetcher.
-
#parse_offset(resp) ⇒ Object
Extracts the paging offset from a result page; on the first page it also records the total count.
- #parse_page(resp) ⇒ Object
-
#save_doc(bib, url) ⇒ Object
Writes the serialized document to its file and adds it to the index, skipping duplicates.
- #serialize(bib) ⇒ Object
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Returns a new instance of DataFetcher.
6 7 8 9 10 11 12 13 14 |
# File 'lib/relaton_jis/data_fetcher.rb', line 6

#
# Store the output settings and start the worker threads.
#
# @param output [String] directory that document files are joined onto
# @param format [String] output format; "bibxml" is mapped to the "xml"
#   file extension, any other format is used as the extension unchanged
#
def initialize(output, format)
  @output = output
  @format = format
  @ext = format.sub("bibxml", "xml")
  @files = Set.new
  # Guards @files and the file/index writes performed by the workers.
  @mutex = Mutex.new
  # Bounded queue of document URLs consumed by the worker threads.
  @queue = SizedQueue.new(10)
  @threads = create_thread_pool(5)
end
Class Method Details
.fetch(output: "data", format: "yaml") ⇒ Object
16 17 18 19 20 21 22 23 |
# File 'lib/relaton_jis/data_fetcher.rb', line 16

#
# Create the output directory, run a full fetch and print start/finish
# timestamps together with the elapsed time.
#
# @param output [String] directory to create and write into
# @param format [String] format passed on to the new fetcher instance
#
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Start fetching JIS data at #{started_at}"
  FileUtils.mkdir_p(output)
  new(output, format).fetch
  finished_at = Time.now
  elapsed = finished_at - started_at
  puts "Fetching JIS data finished at #{finished_at}. It took #{elapsed} seconds."
end
Instance Method Details
#agent ⇒ Object
72 73 74 |
# File 'lib/relaton_jis/data_fetcher.rb', line 72

#
# Memoized Mechanize instance; created on first use.
#
# @return [Mechanize]
#
def agent
  return @agent if @agent

  @agent = Mechanize.new
end
#count ⇒ Object
103 104 105 |
# File 'lib/relaton_jis/data_fetcher.rb', line 103

#
# Value of @count converted with #to_i (0 while @count is still nil).
#
# @return [Integer]
#
def count
  total = @count
  total.to_i
end
#create_thread_pool(size) ⇒ Object
25 26 27 28 29 30 31 32 33 |
# File 'lib/relaton_jis/data_fetcher.rb', line 25

#
# Start +size+ worker threads. Each worker takes URLs from @queue and
# passes them to #fetch_doc until it receives the :END marker.
#
# @param size [Integer] number of threads to start
#
# @return [Array<Thread>] the started threads
#
def create_thread_pool(size)
  worker = proc do
    while (url = @queue.shift) != :END
      fetch_doc(url)
    end
  end
  Array.new(size) { Thread.new(&worker) }
end
#end_threads_and_wait ⇒ Object
97 98 99 100 101 |
# File 'lib/relaton_jis/data_fetcher.rb', line 97

#
# Push one :END marker per worker thread, close the queue and wait for
# every worker to finish.
#
# @return [Array<Thread>] the joined threads
#
def end_threads_and_wait
  @threads.each { |_worker| @queue.push(:END) }
  @queue.close
  @threads.each { |worker| worker.join }
end
#fetch ⇒ Object
53 54 55 56 57 58 59 |
# File 'lib/relaton_jis/data_fetcher.rb', line 53

#
# Run the fetch: perform the initial POST, load the first result page,
# process all pages and save the index.
#
# When the initial POST does not succeed nothing is fetched. The worker
# threads started in #initialize are still blocked on the queue at that
# point, so they are shut down before returning instead of being left
# waiting forever.
#
# @return [Object, nil] result of index.save, or nil when the initial
#   POST did not succeed
#
def fetch
  unless initial_post
    # #parse_page normally stops the workers; it is never reached here.
    end_threads_and_wait
    return
  end

  resp = agent.get "#{URL}W11M0070/index"
  parse_page resp
  index.save
end
#fetch_doc(url) ⇒ Object
Fetches the document at the given URL, retrying failed attempts, and saves it.
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/relaton_jis/data_fetcher.rb', line 35

#
# Scrape a single document and save it. A failing scrape is retried
# after a 2 second pause; after the fifth failed attempt the URL and the
# error (message plus the first backtrace lines) are logged and the
# document is skipped.
#
# @param url [String] document URL handed to the Scraper
#
def fetch_doc(url) # rubocop:disable Metrics/MethodLength
  tries = 0
  begin
    bib = Scraper.new(url).fetch
  rescue StandardError => e
    tries += 1
    if tries >= 5
      Util.warn "URL: #{url}"
      return Util.warn("#{e.message}\n#{e.backtrace[0..6].join("\n")}")
    end
    sleep 2
    retry
  end
  # Deliberately outside the rescue: errors raised while saving are not retried.
  save_doc bib, url
end
#file(id) ⇒ Object
145 146 147 148 |
# File 'lib/relaton_jis/data_fetcher.rb', line 145

#
# Build the output file path for a document ID. Colons, slashes and
# whitespace characters in the ID are replaced with underscores.
#
# @param id [String] document identifier
#
# @return [String] path inside the output directory, with the format extension
#
def file(id)
  basename = id.gsub(%r{[:/\s]}, "_")
  File.join(@output, "#{basename}.#{@ext}")
end
#get_next_page(offset) ⇒ Object
Posts a request for the next result page at the given offset, retrying failed attempts.
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# File 'lib/relaton_jis/data_fetcher.rb', line 107

#
# Request the next page of results starting at +offset+. #initial_post is
# called first on every attempt. A failing request is retried after a
# 2 second pause; after the fifth failed attempt the error is logged.
#
# The failure path returns nil explicitly. #parse_page loops on
# `while resp` and calls resp.xpath, so the result must be falsy on
# failure and must not depend on what Util.warn happens to return.
#
# @param offset [Integer] offset of the first item of the requested page
#
# @return [Object, nil] the response of agent.post, or nil when
#   #initial_post did not succeed or all attempts failed
#
def get_next_page(offset) # rubocop:disable Metrics/MethodLength
  attempts = 0
  begin
    if initial_post
      agent.post "#{URL}W11M0070/getAddList", search_type: "JIS", offset: offset
      # agent.post "#{URL}W11M0070/getAddList", search_type: "KOKUNAI", all_search_flg: "all_search", offset: offset
    end
  rescue StandardError => e
    attempts += 1
    if attempts < 5
      sleep 2
      retry
    end
    Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
    nil
  end
end
#index ⇒ Object
141 142 143 |
# File 'lib/relaton_jis/data_fetcher.rb', line 141

#
# Memoized index for :jis, found or created with INDEX_FILE as its file.
#
# @return [Object] the value returned by Relaton::Index.find_or_create
#
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:jis, file: INDEX_FILE)
end
#initial_post ⇒ Object
61 62 63 64 65 66 67 68 69 70 |
# File 'lib/relaton_jis/data_fetcher.rb', line 61

#
# Send the initial search POST unless a successful one was made less than
# 600 seconds ago.
#
# The timestamp is recorded only when the response carries a truthy
# "status", so a failed post is not treated as a fresh session by later
# calls. On failure a warning is logged and nil is returned explicitly,
# independent of what Util.warn returns.
#
# @return [Object, nil] true when a recent successful post exists, the
#   truthy "status" value of the response otherwise, or nil on failure
#
def initial_post
  return true if @initial_time && Time.now - @initial_time < 600

  body = { record: 0, dantai: "JIS", searchtype2: 1, status_1: 1, status_2: 2 }
  # body = { search_type: "KOKUNAI", all_search_flg: "all_search" }
  resp = agent.post "#{URL}W11M0270/index", body
  status = JSON.parse(resp.body)["status"]
  unless status
    Util.warn("No results found for JIS")
    return nil
  end

  @initial_time = Time.now
  status
end
#parse_offset(resp) ⇒ Object
Extracts the paging offset from a result page; on the first page it also records the total count.
87 88 89 90 91 92 93 94 95 |
# File 'lib/relaton_jis/data_fetcher.rb', line 87

#
# Read the paging offset from a response.
#
# A response containing the element with id "btnPaging" is treated as the
# first page: the total is taken from the script that declares
# `var count = N;` and stored in @count, and the offset comes from the
# "value" attribute of the element with id "offset". Any other response
# is expected to carry the offset in its first script as
# `("offset").value = 'N'`.
#
# @param resp [#at] page responding to #at with an XPath expression
#
# @return [Integer] the offset
#
def parse_offset(resp) # rubocop:disable Metrics/AbcSize
  unless resp.at('//*[@id="btnPaging"]')
    script_text = resp.at("//script").text
    return script_text.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
  end

  # first page
  count_script = resp.at('//script[contains(.,"var count =")]').text
  @count = count_script.match(/var count = (\d+);/)[1]
  offset_field = resp.at("//*[@id='offset']")
  offset_field[:value].to_i
end
#parse_page(resp) ⇒ Object
76 77 78 79 80 81 82 83 84 85 |
# File 'lib/relaton_jis/data_fetcher.rb', line 76

#
# Walk through the result pages starting with +resp+. The href of every
# link found under div.blockGenaral is queued for the worker threads.
# Paging stops when the parsed offset reaches #count or when no further
# page is returned; afterwards the workers are stopped and awaited.
#
# @param resp [#xpath, nil] first result page
#
def parse_page(resp)
  page = resp
  while page
    links = page.xpath('//div[@class="blockGenaral"]/a')
    links.each { |link| @queue << link[:href] }
    next_offset = parse_offset page
    break if next_offset >= count # no more pages

    page = get_next_page(next_offset)
  end
  end_threads_and_wait
end
#save_doc(bib, url) ⇒ Object
Writes the serialized document to its file and adds it to the index, skipping duplicates.
125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# File 'lib/relaton_jis/data_fetcher.rb', line 125

#
# Write a document to its file and register it in the index. The ID of
# the primary document identifier determines the file name. When the same
# file was already written during this run a warning is logged and
# nothing is written. The check, the write and the index update run
# inside @mutex.
#
# @param bib [Object, nil] document to save; nil is ignored
# @param url [String] source URL, used in the duplication warning
#
def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
  return unless bib

  id = bib.docidentifier.find(&:primary).id
  path = file(id)
  @mutex.synchronize do
    # Set#add? returns nil when the path has been recorded before.
    if @files.add?(path)
      File.write path, serialize(bib), encoding: "UTF-8"
      index.add_or_update id, path
    else
      Util.warn "File #{path} already exists. Duplication URL: #{url}"
    end
  end
end
#serialize(bib) ⇒ Object
150 151 152 153 154 155 156 |
# File 'lib/relaton_jis/data_fetcher.rb', line 150

#
# Serialize a document according to @format: "yaml" dumps the document's
# hash as YAML, "xml" calls to_xml with bibdata: true, and any other
# format calls the matching to_<format> method on the document.
#
# @param bib [Object] document to serialize
#
# @return [String] serialized document
#
def serialize(bib)
  if @format == "yaml"
    bib.to_hash.to_yaml
  elsif @format == "xml"
    bib.to_xml(bibdata: true)
  else
    bib.send("to_#{@format}")
  end
end