Class: BookmeterScraper::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/bookmeter_scraper/scraper.rb

Defined Under Namespace

Classes: Book, Books, Profile, User

Constant Summary collapse

# Profile attribute keys, in the order #scrape_profile returns their values.
PROFILE_ATTRIBUTES =
%i(
  name
  gender
  age
  blood_type
  job
  address
  url
  description
  first_day
  elapsed_days
  read_books_count
  read_pages_count
  reviews_count
  bookshelfs_count
).freeze
# Japanese labels used to look up each attribute on the profile page.
# :name has no entry; #scrape_profile reads it from the page heading instead.
JP_ATTRIBUTE_NAMES =
{
  gender: '性別',
  age: '年齢',
  blood_type: '血液型',
  job: '職業',
  address: '現住所',
  url: 'URL / ブログ',
  description: '自己紹介',
  first_day: '記録初日',
  elapsed_days: '経過日数',
  read_books_count: '読んだ本',
  read_pages_count: '読んだページ',
  reviews_count: '感想/レビュー',
  bookshelfs_count: '本棚',
}.freeze
# Same order as the arguments passed to Book.new in this class.
BOOK_ATTRIBUTES =
%i(name author read_dates uri image_uri).freeze
# Same order as the arguments passed to User.new in this class.
USER_ATTRIBUTES =
%i(name id uri).freeze
# Number of book slots probed on one listing page.
NUM_BOOKS_PER_PAGE =
40
# Number of user slots probed on one listing page.
NUM_USERS_PER_PAGE =
20

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(agent = nil) ⇒ Scraper

Returns a new instance of Scraper.



74
75
76
77
# File 'lib/bookmeter_scraper/scraper.rb', line 74

# Sets up a scraper with an empty book-page cache.
#
# @param agent [Object, nil] agent kept in @agent and used as the default
#   agent of the other methods; may be omitted
def initialize(agent = nil)
  # Pages already fetched by #get_book_page, keyed by book URI.
  @book_pages = {}
  @agent = agent
end

Instance Attribute Details

#agent ⇒ Object

Returns the value of attribute agent.



71
72
73
# File 'lib/bookmeter_scraper/scraper.rb', line 71

# Reader for the agent handed to the constructor.
#
# @return [Object, nil] the current value of @agent
def agent
  @agent
end

Instance Method Details

#extract_books(page) ⇒ Object

Raises:

  • (ArgumentError)



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/bookmeter_scraper/scraper.rb', line 151

# Builds a Book for each populated slot of a scraped listing page.
#
# Slots are visited from 1 upwards and the scan stops at the first slot
# whose link is empty.
#
# @param page [#[]] scraped listing, read through "book_<n>_link" keys
# @return [Array<Book>] books in slot order, each carrying its first read
#   date (when one was scraped) followed by its re-read dates
# @raise [ArgumentError] if page is nil
def extract_books(page)
  raise ArgumentError if page.nil?

  collected = []
  (1..NUM_BOOKS_PER_PAGE).each do |slot|
    link = page["book_#{slot}_link"]
    break if link.empty?

    first_read = scrape_read_date(link)
    dates =
      if first_read.empty?
        []
      else
        [Time.local(first_read['year'], first_read['month'], first_read['day'])]
      end

    # The re-read scrape may give one record or a list of them; wrapping and
    # flattening yields a flat list either way.
    [scrape_reread_date(link)].flatten.each do |reread|
      dates << Time.local(reread['reread_year'], reread['reread_month'], reread['reread_day'])
    end

    title  = scrape_book_name(link)
    author = scrape_book_author(link)
    cover  = scrape_book_image_uri(link)
    collected << Book.new(title, author, dates, ROOT_URI + link, cover)
  end

  collected
end

#extract_users(page) ⇒ Object

Raises:

  • (ArgumentError)



371
372
373
374
375
376
377
378
379
380
381
382
383
384
# File 'lib/bookmeter_scraper/scraper.rb', line 371

# Builds a User for each populated slot of a scraped user-listing page.
#
# Slots are visited from 1 upwards and the scan stops at the first slot
# whose name is empty. The user id is the run of digits at the end of the
# slot's "/u/<digits>" link.
#
# @param page [#[]] scraped listing, read through "user_<n>_name" and
#   "user_<n>_link" keys
# @return [Array<User>] users in slot order
# @raise [ArgumentError] if page is nil
def extract_users(page)
  raise ArgumentError if page.nil?

  found = []
  (1..NUM_USERS_PER_PAGE).each do |slot|
    name = page["user_#{slot}_name"]
    break if name.empty?

    id = page["user_#{slot}_link"].match(/\/u\/(\d+)$/)[1]
    found.push(User.new(name, id, ROOT_URI + "/u/#{id}"))
  end

  found
end

#fetch_books(user_id, uri_method, agent = @agent) ⇒ Object

Raises:

  • (ArgumentError)
  • (ScraperError)



105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/bookmeter_scraper/scraper.rb', line 105

# Collects every book from all listing pages of one kind for a user.
#
# Note that +agent+ is only validated here; the page scraping itself is
# delegated to #scrape_books_pages and #extract_books without passing it on,
# so those helpers use their own default agent.
#
# @param user_id [String] must match USER_ID_REGEX
# @param uri_method [Symbol] name of a BookmeterScraper module method that
#   builds the listing URI
# @param agent [Object, nil] agent checked for presence and login state
# @return [Books, Array] the collected books, or a plain empty Array when
#   the agent is not logged in
# @raise [ArgumentError] if user_id or uri_method is not acceptable
# @raise [ScraperError] if agent is nil
def fetch_books(user_id, uri_method, agent = @agent)
  raise ArgumentError unless (user_id =~ USER_ID_REGEX) && BookmeterScraper.methods.include?(uri_method)
  raise ScraperError if agent.nil?
  return [] unless agent.logged_in?

  shelf = Books.new
  scrape_books_pages(user_id, uri_method).each do |listing|
    shelf << extract_books(listing)
    shelf.flatten!
  end
  shelf
end

#fetch_followers(user_id, agent = @agent) ⇒ Object

Raises:

  • (ArgumentError)
  • (ScraperError)



318
319
320
321
322
323
324
325
326
327
328
329
330
# File 'lib/bookmeter_scraper/scraper.rb', line 318

# Lists the users following the given user.
#
# +agent+ is only validated here; #scrape_followers_page is called without
# it and therefore relies on its own default agent.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [Object, nil] agent checked for presence and login state
# @return [Array<User>] followers from every scraped page, or an empty
#   Array when the agent is not logged in
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
# @raise [ScraperError] if agent is nil
def fetch_followers(user_id, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ScraperError if agent.nil?
  return [] unless agent.logged_in?

  scrape_followers_page(user_id).map { |listing| extract_users(listing) }.flatten
end

#fetch_followings(user_id, agent = @agent) ⇒ Object

Raises:

  • (ArgumentError)
  • (ScraperError)



303
304
305
306
307
308
309
310
311
312
313
314
315
316
# File 'lib/bookmeter_scraper/scraper.rb', line 303

# Lists the users the given user is following.
#
# The logged-in user's own followings page and another user's followings
# page are scraped by different helpers, so the user id is compared with the
# agent's own id to pick one.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [Object, nil] agent checked for presence and login state
# @return [Array<User>] followed users, or an empty Array when the agent is
#   not logged in
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
# @raise [ScraperError] if agent is nil
def fetch_followings(user_id, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ScraperError if agent.nil?
  return [] unless agent.logged_in?

  # NOTE(review): the method name after `agent.` was missing from the
  # rendered listing (leaving a syntax error). `log_in_user_id` is a
  # reconstruction that fits the 14-column gap implied by the original
  # alignment — confirm against the agent class.
  scraped_pages =
    if user_id == agent.log_in_user_id
      scrape_followings_page(user_id)
    else
      scrape_others_followings_page(user_id)
    end

  scraped_pages.map { |page| extract_users(page) }.flatten
end

#fetch_profile(user_id, agent = @agent) ⇒ Object

Raises:

  • (ArgumentError)
  • (ScraperError)



79
80
81
82
83
84
# File 'lib/bookmeter_scraper/scraper.rb', line 79

# Scrapes a user's profile and wraps it in a Profile.
#
# Unlike the other fetch_* methods shown here, this one does not check
# whether the agent is logged in.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [Object, nil] agent forwarded to #scrape_profile
# @return [Profile] built from the values #scrape_profile returns, splatted
#   as positional arguments
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
# @raise [ScraperError] if agent is nil
def fetch_profile(user_id, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ScraperError if agent.nil?

  field_values = scrape_profile(user_id, agent)
  Profile.new(*field_values)
end

#fetch_read_books(user_id, target_year_month) ⇒ Object

Raises:

  • (ArgumentError)



189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# File 'lib/bookmeter_scraper/scraper.rb', line 189

# Collects the books a user read in one particular month.
#
# Each listing page is summarised by the month of its first book and the
# month of its last populated book. A page is skipped while the target month
# is earlier than its last book, and scanning stops once the target month is
# later than a page's first book or strictly later than its last book.
# NOTE(review): these early exits presume pages and books are ordered newest
# first — confirm against the site's ordering.
#
# @param user_id [String] must match USER_ID_REGEX
# @param target_year_month [Time] month to look for; compared against
#   Time.local(year, month) values
# @return [Books] books gathered through #fetch_target_books
# @raise [ArgumentError] if user_id is malformed or target_year_month is nil
def fetch_read_books(user_id, target_year_month)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ArgumentError if target_year_month.nil?

  matched = Books.new
  scrape_books_pages(user_id, :read_books_uri).each do |listing|
    newest = scrape_read_date(listing['book_1_link'])
    oldest = get_last_book_date(listing)

    newest_month = Time.local(newest['year'].to_i, newest['month'].to_i)
    oldest_month = Time.local(oldest['year'].to_i, oldest['month'].to_i)

    # Target month lies before everything on this page: try the next page.
    next if target_year_month < oldest_month
    # Target month lies after everything on this page: nothing more to find.
    break if target_year_month > newest_month

    matched.concat(fetch_target_books(target_year_month, listing))

    # Only when the page ends exactly in the target month can the next page
    # still hold books from it.
    break if target_year_month > oldest_month
  end
  matched
end

#fetch_target_books(target_year_month, page) ⇒ Object

Raises:

  • (ArgumentError)



228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# File 'lib/bookmeter_scraper/scraper.rb', line 228

# Picks the books on one listing page that were read, or re-read, in the
# target month.
#
# Every slot of the page is inspected; empty slots are skipped rather than
# ending the scan.
#
# @param target_year_month [Time] month to match, compared against
#   Time.local(year, month) values
# @param page [#[]] scraped listing, read through "book_<n>_link" keys
# @return [Books] matching books, each with its first read date (when one
#   was scraped) followed by its re-read dates
# @raise [ArgumentError] if either argument is nil
def fetch_target_books(target_year_month, page)
  raise ArgumentError if target_year_month.nil?
  raise ArgumentError if page.nil?

  target_books = Books.new
  1.upto(NUM_BOOKS_PER_PAGE) do |i|
    book_path = page["book_#{i}_link"]
    next if book_path.empty?

    read_dates = []
    read_year_months = []

    # A book may have no first read date scraped; apply the same guard as
    # extract_books so Time.local is never called with nil components.
    read_date = scrape_read_date(book_path)
    unless read_date.empty?
      read_dates << Time.local(read_date['year'], read_date['month'], read_date['day'])
      read_year_months << Time.local(read_date['year'], read_date['month'])
    end

    # The re-read scrape may give one record or a list of them; wrapping and
    # flattening yields a flat list either way.
    reread_dates = [scrape_reread_date(book_path)].flatten
    reread_dates.each do |date|
      read_year_months << Time.local(date['reread_year'], date['reread_month'])
    end

    next unless read_year_months.include?(target_year_month)

    reread_dates.each do |date|
      read_dates << Time.local(date['reread_year'], date['reread_month'], date['reread_day'])
    end

    book_name      = scrape_book_name(book_path)
    book_author    = scrape_book_author(book_path)
    book_image_uri = scrape_book_image_uri(book_path)
    target_books << Book.new(book_name, book_author, read_dates, ROOT_URI + book_path, book_image_uri)
  end

  target_books
end

#get_book_page(book_uri, agent = @agent) ⇒ Object



268
269
270
271
# File 'lib/bookmeter_scraper/scraper.rb', line 268

# Returns the page for a book, fetching it at most once per book URI.
#
# Fetched pages are kept in @book_pages keyed by book_uri; the agent is only
# asked again while the cached value is missing or falsy.
#
# @param book_uri [String] book path, appended to ROOT_URI for the request
# @param agent [#get] agent used to fetch an uncached page
# @return [Object] whatever agent.get returned for this book
def get_book_page(book_uri, agent = @agent)
  @book_pages[book_uri] ||= agent.get(ROOT_URI + book_uri)
end

#get_last_book_date(page) ⇒ Object

Raises:

  • (ArgumentError)



218
219
220
221
222
223
224
225
226
# File 'lib/bookmeter_scraper/scraper.rb', line 218

# Finds the read date of the last populated book slot on a listing page.
#
# Slots are checked from NUM_BOOKS_PER_PAGE down to 1 and the first one with
# a non-empty link wins.
#
# @param page [#[]] scraped listing, read through "book_<n>_link" keys
# @return [Object, nil] the result of #scrape_read_date for that slot, or
#   nil when every slot is empty
# @raise [ArgumentError] if page is nil
def get_last_book_date(page)
  raise ArgumentError if page.nil?

  NUM_BOOKS_PER_PAGE.downto(1) do |i|
    link = page["book_#{i}_link"]
    return scrape_read_date(link) unless link.empty?
  end

  # Without this, the method would fall through and return the Integer that
  # `downto` was called on.
  nil
end

#scrape_book_author(book_uri) ⇒ Object



277
278
279
# File 'lib/bookmeter_scraper/scraper.rb', line 277

# Reads the author text from a book page.
#
# @param book_uri [String] book path handed to #get_book_page
# @return [String] text of the nodes matching '#author_name'
def scrape_book_author(book_uri)
  book_page = get_book_page(book_uri)
  author_nodes = book_page.search('#author_name')
  author_nodes.text
end

#scrape_book_image_uri(book_uri) ⇒ Object



281
282
283
# File 'lib/bookmeter_scraper/scraper.rb', line 281

# Reads the cover image URI from a book page.
#
# @param book_uri [String] book path handed to #get_book_page
# @return [String] text of the src attribute of the element with id
#   "book_image"
def scrape_book_image_uri(book_uri)
  book_page = get_book_page(book_uri)
  src_attributes = book_page.search('//*[@id="book_image"]/@src')
  src_attributes.text
end

#scrape_book_name(book_uri) ⇒ Object



273
274
275
# File 'lib/bookmeter_scraper/scraper.rb', line 273

# Reads the title text from a book page.
#
# @param book_uri [String] book path handed to #get_book_page
# @return [String] text of the nodes matching '#title'
def scrape_book_name(book_uri)
  book_page = get_book_page(book_uri)
  title_nodes = book_page.search('#title')
  title_nodes.text
end

#scrape_books_pages(user_id, uri_method, agent = @agent) ⇒ Object

Raises:

  • (ArgumentError)
  • (ScraperError)



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/bookmeter_scraper/scraper.rb', line 120

# Scrapes the name and link of every book slot from a user's book listing.
#
# The listing URI comes from calling the BookmeterScraper module method named
# by +uri_method+ with the user id. Each result exposes "book_<n>_name" and
# "book_<n>_link" entries for n in 1..NUM_BOOKS_PER_PAGE.
#
# @param user_id [String] must match USER_ID_REGEX
# @param uri_method [Symbol] name of a BookmeterScraper module method
# @param agent [Object, nil] agent used to fetch the listing and passed to
#   the Yasuri tree's inject
# @return [Array] scraped listing pages; empty when the agent is not logged
#   in or the listing shows no books
# @raise [ArgumentError] if user_id or uri_method is not acceptable
# @raise [ScraperError] if agent is nil
def scrape_books_pages(user_id, uri_method, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ArgumentError unless BookmeterScraper.methods.include?(uri_method)
  raise ScraperError if agent.nil?
  return [] unless agent.logged_in?

  books_page = agent.get(BookmeterScraper.method(uri_method).call(user_id))

  # No books are listed at all.
  return [] if books_page.search('#main_left > div > center > a').empty?

  # No 'span.now_page' element — presumably the listing is not paginated
  # (TODO confirm) — so scrape this single page and wrap it in an Array to
  # keep the return shape uniform.
  if books_page.search('span.now_page').empty?
    books_root = Yasuri.struct_books '//*[@id="main_left"]/div' do
      1.upto(NUM_BOOKS_PER_PAGE) do |i|
        # Slot i is read from the (i + 1)-th child div.
        send("text_book_#{i}_name", "//*[@id=\"main_left\"]/div/div[#{i + 1}]/div[2]/a")
        send("text_book_#{i}_link", "//*[@id=\"main_left\"]/div/div[#{i + 1}]/div[2]/a/@href")
      end
    end
    return [books_root.inject(agent, books_page)]
  end

  # Paginated listing: the tree is rooted at the link in the span that
  # follows the current-page span; NOTE(review): presumably Yasuri follows
  # that link page by page — confirm against Yasuri's pages_root docs.
  books_root = Yasuri.pages_root '//span[@class="now_page"]/following-sibling::span[1]/a' do
    text_page_index '//span[@class="now_page"]/a'
    1.upto(NUM_BOOKS_PER_PAGE) do |i|
      send("text_book_#{i}_name", "//*[@id=\"main_left\"]/div/div[#{i + 1}]/div[2]/a")
      send("text_book_#{i}_link", "//*[@id=\"main_left\"]/div/div[#{i + 1}]/div[2]/a/@href")
    end
  end
  books_root.inject(agent, books_page)
end

#scrape_followers_page(user_id) ⇒ Object

Raises:

  • (ArgumentError)



351
352
353
354
# File 'lib/bookmeter_scraper/scraper.rb', line 351

# Scrapes the followers listing of a user.
#
# @param user_id [String] must match USER_ID_REGEX
# @return [Array] whatever #scrape_users_listing_page returns for
#   :followers_uri
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
def scrape_followers_page(user_id)
  return scrape_users_listing_page(user_id, :followers_uri) if user_id =~ USER_ID_REGEX

  raise ArgumentError
end

#scrape_followings_page(user_id, agent = @agent) ⇒ Object

Raises:

  • (ArgumentError)



332
333
334
335
336
337
338
339
340
341
342
343
344
# File 'lib/bookmeter_scraper/scraper.rb', line 332

# Scrapes the followings listing fetched from
# BookmeterScraper.followings_uri(user_id).
#
# The result exposes "user_<n>_name" (from the link's title attribute) and
# "user_<n>_link" (from its href) for n in 1..NUM_USERS_PER_PAGE. Unlike
# several sibling methods, a nil agent is not checked for here.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [#logged_in?, #get] agent used to fetch the page
# @return [Array] one-element Array holding the scraped listing, or an empty
#   Array when the agent is not logged in
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
def scrape_followings_page(user_id, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  return [] unless agent.logged_in?

  followings_page = agent.get(BookmeterScraper.followings_uri(user_id))
  # The tree is named "books" even though it collects user entries.
  followings_root = Yasuri.struct_books '//*[@id="main_left"]/div' do
    1.upto(NUM_USERS_PER_PAGE) do |i|
      send("text_user_#{i}_name", "//*[@id=\"main_left\"]/div/div[#{i}]/a/@title")
      send("text_user_#{i}_link", "//*[@id=\"main_left\"]/div/div[#{i}]/a/@href")
    end
  end
  # Wrapped in an Array so callers can iterate it like a list of pages.
  [followings_root.inject(agent, followings_page)]
end

#scrape_others_followings_page(user_id) ⇒ Object

Raises:

  • (ArgumentError)



346
347
348
349
# File 'lib/bookmeter_scraper/scraper.rb', line 346

# Scrapes a followings listing through the generic user-listing scraper.
#
# @param user_id [String] must match USER_ID_REGEX
# @return [Array] whatever #scrape_users_listing_page returns for
#   :followings_uri
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
def scrape_others_followings_page(user_id)
  return scrape_users_listing_page(user_id, :followings_uri) if user_id =~ USER_ID_REGEX

  raise ArgumentError
end

#scrape_profile(user_id, agent) ⇒ Object

Raises:

  • (ArgumentError)
  • (ScraperError)



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/bookmeter_scraper/scraper.rb', line 86

# Scrapes a user's profile values from the page at
# BookmeterScraper.mypage_uri(user_id).
#
# Each profile <dl> contributes one label/value pair, taken from the text of
# its first and second child nodes. Values are then picked by the Japanese
# label of each PROFILE_ATTRIBUTES key; attributes without a matching label
# on the page come out as nil.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [#get, nil] agent used to fetch the page
# @return [Array<String, nil>] one value per PROFILE_ATTRIBUTES entry, in
#   that order, with the first replaced by the page heading text
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
# @raise [ScraperError] if agent is nil
def scrape_profile(user_id, agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ScraperError if agent.nil?

  mypage = agent.get(BookmeterScraper.mypage_uri(user_id))

  values_by_label = {}
  mypage.search('#side_left > div.inner > div.profile > dl').each do |dl|
    label_node = dl.children[0]
    value_node = dl.children[1]
    values_by_label[label_node.children.text] = value_node.children.text
  end

  values = PROFILE_ATTRIBUTES.map { |key| values_by_label[JP_ATTRIBUTE_NAMES[key]] }
  # The first attribute has no label in JP_ATTRIBUTE_NAMES; it comes from the
  # sidebar heading instead.
  values[0] = mypage.at_css('#side_left > div.inner > h3').text

  values
end

#scrape_read_date(book_uri, agent = @agent) ⇒ Object



285
286
287
288
289
290
291
292
# File 'lib/bookmeter_scraper/scraper.rb', line 285

# Scrapes the first read date shown on a book page.
#
# The Yasuri tree declares three text fields — year, month and day — each
# taken from the first option of the matching read_date_* select, cut down
# to its digits and converted with to_i. Callers index the result with
# 'year', 'month' and 'day'.
#
# Note that +agent+ is only handed to Yasuri's inject; the page itself comes
# from #get_book_page(book_uri), which uses its own default agent.
#
# @param book_uri [String] book path handed to #get_book_page
# @param agent [Object] agent passed to the Yasuri tree's inject
# @return [Object] result of injecting the tree into the book page
def scrape_read_date(book_uri, agent = @agent)
  book_date = Yasuri.struct_date '//*[@id="book_edit_area"]/form[1]/div[2]' do
    text_year  '//*[@id="read_date_y"]/option[1]', truncate: /\d+/, proc: :to_i
    text_month '//*[@id="read_date_m"]/option[1]', truncate: /\d+/, proc: :to_i
    text_day   '//*[@id="read_date_d"]/option[1]', truncate: /\d+/, proc: :to_i
  end
  book_date.inject(agent, get_book_page(book_uri))
end

#scrape_reread_date(book_uri, agent = @agent) ⇒ Object



294
295
296
297
298
299
300
301
# File 'lib/bookmeter_scraper/scraper.rb', line 294

# Scrapes the re-read dates shown on a book page.
#
# The Yasuri tree declares reread_year, reread_month and reread_day text
# fields, each taken from the first option of the first, second and third
# select inside a "reread_box" form, cut down to its digits and converted
# with to_i. Callers wrap and flatten the result before iterating, so it may
# be a single record or a list — TODO confirm against Yasuri's behaviour.
#
# Note that +agent+ is only handed to Yasuri's inject; the page itself comes
# from #get_book_page(book_uri), which uses its own default agent.
#
# @param book_uri [String] book path handed to #get_book_page
# @param agent [Object] agent passed to the Yasuri tree's inject
# @return [Object] result of injecting the tree into the book page
def scrape_reread_date(book_uri, agent = @agent)
  book_reread_date = Yasuri.struct_reread_date '//*[@id="book_edit_area"]/div/form[1]/div[2]' do
    text_reread_year  '//div[@class="reread_box"]/form[1]/div[2]/select[1]/option[1]', truncate: /\d+/, proc: :to_i
    text_reread_month '//div[@class="reread_box"]/form[1]/div[2]/select[2]/option[1]', truncate: /\d+/, proc: :to_i
    text_reread_day   '//div[@class="reread_box"]/form[1]/div[2]/select[3]/option[1]', truncate: /\d+/, proc: :to_i
  end
  book_reread_date.inject(agent, get_book_page(book_uri))
end

#scrape_users_listing_page(user_id, uri_method, agent = @agent) ⇒ Object

Raises:

  • (ArgumentError)



356
357
358
359
360
361
362
363
364
365
366
367
368
369
# File 'lib/bookmeter_scraper/scraper.rb', line 356

# Scrapes a user listing (followers or followings) for the given user.
#
# The listing URI comes from calling the BookmeterScraper module method named
# by +uri_method+ with the user id. The result exposes "user_<n>_name" (from
# the link's title attribute) and "user_<n>_link" (from its href) for n in
# 1..NUM_USERS_PER_PAGE. A nil agent is not checked for here.
#
# @param user_id [String] must match USER_ID_REGEX
# @param uri_method [Symbol] name of a BookmeterScraper module method
# @param agent [#logged_in?, #get] agent used to fetch the page
# @return [Array] one-element Array holding the scraped listing, or an empty
#   Array when the agent is not logged in
# @raise [ArgumentError] if user_id or uri_method is not acceptable
def scrape_users_listing_page(user_id, uri_method, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ArgumentError unless BookmeterScraper.methods.include?(uri_method)
  return [] unless agent.logged_in?

  page = agent.get(BookmeterScraper.method(uri_method).call(user_id))
  root = Yasuri.struct_users '//*[@id="main_left"]/div' do
    1.upto(NUM_USERS_PER_PAGE) do |i|
      send("text_user_#{i}_name", "//*[@id=\"main_left\"]/div/div[#{i}]/div/div[2]/a/@title")
      send("text_user_#{i}_link", "//*[@id=\"main_left\"]/div/div[#{i}]/div/div[2]/a/@href")
    end
  end
  # Wrapped in an Array so callers can iterate it like a list of pages.
  [root.inject(agent, page)]
end