Class: RelatonJis::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_jis/data_fetcher.rb

Constant Summary collapse

# Base URL; the fetcher appends request paths such as "W11M0070/index" to it.
URL =
"https://webdesk.jsa.or.jp/books/".freeze
# File name passed to Relaton::Index.find_or_create for the :jis index.
INDEX_FILE =
"index-v1.yaml".freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



6
7
8
9
10
11
12
13
14
# File 'lib/relaton_jis/data_fetcher.rb', line 6

# Sets up fetcher state and starts the worker threads.
#
# @param output [String] directory that document files are written into
# @param format [String] serialization format; "yaml" and "xml" are handled
#   explicitly by #serialize, anything else is dispatched to `to_<format>`
def initialize(output, format)
  @output, @format = output, format
  # File extension: the first "bibxml" in the format name is replaced by "xml".
  @ext = format.sub("bibxml", "xml")
  # Paths already written during this run (used to detect duplicates).
  @files = Set.new
  # Guards @files, file writes and index updates; created before the workers.
  @mutex = Mutex.new
  # Bounded queue of document URLs consumed by the worker threads.
  @queue = SizedQueue.new 10
  @threads = create_thread_pool 5
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object



16
17
18
19
20
21
22
23
# File 'lib/relaton_jis/data_fetcher.rb', line 16

# Runs a complete fetch and prints start/finish timestamps to stdout.
#
# @param output [String] directory for the fetched documents (created if missing)
# @param format [String] format name forwarded to the fetcher instance
# @return [nil] the result of the final `puts`
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Start fetching JIS data at #{started_at}"
  FileUtils.mkdir_p output
  fetcher = new(output, format)
  fetcher.fetch
  finished_at = Time.now
  elapsed = finished_at - started_at
  puts "Fetching JIS data finished at #{finished_at}. It took #{elapsed} seconds."
end

Instance Method Details

#agent ⇒ Object



72
73
74
# File 'lib/relaton_jis/data_fetcher.rb', line 72

# Memoized Mechanize client used for every HTTP request of this fetcher.
#
# @return [Mechanize] the same instance on every call
def agent
  return @agent if @agent

  @agent = Mechanize.new
end

#count ⇒ Object



103
104
105
# File 'lib/relaton_jis/data_fetcher.rb', line 103

# Total number of results as an Integer.
#
# @count is assigned by #parse_offset from a regexp capture (a String);
# until then it is nil and `nil.to_i` yields 0.
#
# @return [Integer]
def count
  @count.to_i
end

#create_thread_pool(size) ⇒ Object



25
26
27
28
29
30
31
32
33
# File 'lib/relaton_jis/data_fetcher.rb', line 25

# Starts worker threads that take URLs off @queue and pass them to #fetch_doc.
# A worker stops as soon as it takes the :END sentinel off the queue.
#
# @param size [Integer] number of worker threads to start
# @return [Array<Thread>] the started threads
def create_thread_pool(size)
  Array.new(size) do
    Thread.new do
      while (job = @queue.shift) != :END
        fetch_doc job
      end
    end
  end
end

#end_threads_and_wait ⇒ Object



97
98
99
100
101
# File 'lib/relaton_jis/data_fetcher.rb', line 97

# Shuts the worker pool down: enqueues one :END sentinel per worker,
# closes the queue and blocks until every worker thread has finished.
#
# @return [Array<Thread>] the (now finished) worker threads
def end_threads_and_wait
  @threads.each { @queue.push :END }
  @queue.close
  @threads.each { |worker| worker.join }
end

#fetch ⇒ Object



53
54
55
56
57
58
59
# File 'lib/relaton_jis/data_fetcher.rb', line 53

# Fetches the result listing, processes every page and saves the index.
# Does nothing (and returns nil) when #initial_post reports failure.
#
# @return [Object, nil] the result of `index.save`, or nil when skipped
def fetch
  if initial_post
    listing = agent.get "#{URL}W11M0070/index"
    parse_page listing
    index.save
  end
end

#fetch_doc(url) ⇒ Object

rubocop:disable Metrics/MethodLength



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/relaton_jis/data_fetcher.rb', line 35

# Scrapes one document and saves it, logging (never raising) on failure.
#
# Scraping is attempted up to 5 times with a 2 second pause between attempts.
# Saving is attempted once. Any StandardError from either step is reported
# through Util.warn and the document is skipped: this method runs inside the
# worker threads, and an exception escaping it would terminate the worker.
#
# @param url [String] document URL handed to Scraper
# @return [Object, nil] result of #save_doc, or of the last Util.warn on failure
def fetch_doc(url) # rubocop:disable Metrics/MethodLength
  attempts = 0
  begin
    bib = Scraper.new(url).fetch
  rescue StandardError
    attempts += 1
    raise if attempts >= 5 # out of retries: let the outer rescue log it

    sleep 2
    retry
  end
  # Outside the retried section so a failed save does not trigger a re-scrape.
  save_doc bib, url
rescue StandardError => e
  Util.warn "URL: #{url}"
  Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
end

#file(id) ⇒ Object



145
146
147
148
# File 'lib/relaton_jis/data_fetcher.rb', line 145

# Builds the output path for a document identifier.
# Colons, slashes and whitespace characters in the id become underscores.
#
# @param id [String] document identifier
# @return [String] "<@output>/<sanitized id>.<@ext>"
def file(id)
  basename = id.gsub(%r{[:/\s]}, "_")
  File.join(@output, "#{basename}.#{@ext}")
end

#get_next_page(offset) ⇒ Object

rubocop:disable Metrics/MethodLength



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/relaton_jis/data_fetcher.rb', line 107

# Requests the next chunk of the result listing.
#
# The request is attempted up to 5 times with a 2 second pause in between.
# After the last failure the error is logged and nil is returned explicitly,
# so the caller's `while resp` loop never sees the logger's return value.
#
# @param offset [Integer] offset of the first result to request
# @return [Object, nil] response of `agent.post`; nil when #initial_post
#   reports failure or every attempt raised
def get_next_page(offset) # rubocop:disable Metrics/MethodLength
  attempts = 0
  begin
    return unless initial_post

    # agent.post "#{URL}W11M0070/getAddList", search_type: "KOKUNAI", all_search_flg: "all_search", offset: offset
    agent.post "#{URL}W11M0070/getAddList", search_type: "JIS", offset: offset
  rescue StandardError => e
    attempts += 1
    if attempts < 5
      sleep 2
      retry
    end
    Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
    nil
  end
end

#index ⇒ Object



141
142
143
# File 'lib/relaton_jis/data_fetcher.rb', line 141

# Memoized :jis index, obtained from Relaton::Index.find_or_create with
# INDEX_FILE as its file.
#
# @return [Object] the same index object on every call
def index
  return @index if @index

  @index = Relaton::Index.find_or_create :jis, file: INDEX_FILE
end

#initial_post ⇒ Object



61
62
63
64
65
66
67
68
69
70
# File 'lib/relaton_jis/data_fetcher.rb', line 61

# Posts the JIS search form unless a successful post happened less than
# 600 seconds ago.
#
# The timestamp is recorded only when the response carries a truthy
# "status", so a failed search is never treated as a fresh one by later
# calls. On failure a warning is logged and nil is returned explicitly,
# independent of what the logger returns.
#
# @return [Object, nil] true when a recent successful post exists, the
#   response's "status" value on success, nil otherwise
def initial_post
  return true if @initial_time && Time.now - @initial_time < 600

  body = { record: 0, dantai: "JIS", searchtype2: 1, status_1: 1, status_2: 2 }
  # body = { search_type: "KOKUNAI", all_search_flg: "all_search" }
  resp = agent.post "#{URL}W11M0270/index", body
  status = JSON.parse(resp.body)["status"]
  unless status
    Util.warn("No results found for JIS")
    return nil
  end

  @initial_time = Time.now
  status
end

#parse_offset(resp) ⇒ Object

rubocop:disable Metrics/AbcSize



87
88
89
90
91
92
93
94
95
# File 'lib/relaton_jis/data_fetcher.rb', line 87

# Extracts the next offset from a listing response.
#
# A response containing an element with id "btnPaging" is treated as the
# first page: the total count is captured into @count from the inline script
# and the offset is read from the "offset" element's value attribute.
# Otherwise the offset is taken from the first script element's text.
#
# @param resp [#at] response supporting XPath lookup via `at`
# @return [Integer] the parsed offset
def parse_offset(resp) # rubocop:disable Metrics/AbcSize
  unless resp.at('//*[@id="btnPaging"]')
    js = resp.at("//script").text
    return js.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
  end

  # first page
  counter_js = resp.at('//script[contains(.,"var count =")]').text
  @count = counter_js.match(/var count = (\d+);/)[1]
  resp.at("//*[@id='offset']")[:value].to_i
end

#parse_page(resp) ⇒ Object



76
77
78
79
80
81
82
83
84
85
# File 'lib/relaton_jis/data_fetcher.rb', line 76

# Walks the listing page by page, enqueueing every document link for the
# workers, then shuts the worker pool down.
#
# Paging stops when the parsed offset reaches #count or when no further
# page could be obtained.
#
# @param resp [Object, nil] first listing response
# @return [Object] result of #end_threads_and_wait
def parse_page(resp)
  page = resp
  while page
    page.xpath('//div[@class="blockGenaral"]/a').each { |link| @queue << link[:href] }
    next_offset = parse_offset page
    # nil ends the loop: there are no more pages
    page = next_offset < count ? get_next_page(next_offset) : nil
  end
  end_threads_and_wait
end

#save_doc(bib, url) ⇒ Object

rubocop:disable Metrics/MethodLength



125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/relaton_jis/data_fetcher.rb', line 125

# Writes a scraped document to disk and registers it in the index.
#
# The path is derived from the id of the primary document identifier. The
# whole check-write-index sequence runs under @mutex. A path that was
# already written during this run is not written again; a warning naming
# the duplicate URL is logged instead.
#
# @param bib [Object, nil] scraped item; nil is ignored
# @param url [String] source URL, used only in the duplicate warning
# @return [Object, nil] nil when bib is nil, otherwise the result of the
#   index update or of the warning
def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
  return unless bib

  primary_id = bib.docidentifier.find(&:primary).id
  path = file primary_id
  @mutex.synchronize do
    # Set#add? returns nil when the path is already recorded
    if @files.add?(path)
      File.write path, serialize(bib), encoding: "UTF-8"
      index.add_or_update primary_id, path
    else
      Util.warn "File #{path} already exists. Duplication URL: #{url}"
    end
  end
end

#serialize(bib) ⇒ Object



150
151
152
153
154
155
156
# File 'lib/relaton_jis/data_fetcher.rb', line 150

# Serializes a bibliographic item according to @format.
#
# "yaml" produces the YAML of `bib.to_hash`, "xml" calls `to_xml` with
# `bibdata: true`, and any other format name is dispatched to `to_<format>`.
#
# @param bib [Object] item to serialize
# @return [Object] whatever the selected serializer returns
def serialize(bib)
  return bib.to_hash.to_yaml if @format == "yaml"
  return bib.to_xml(bibdata: true) if @format == "xml"

  bib.send "to_#{@format}"
end