Class: BookmeterScraper::Scraper
- Inherits:
-
Object
- Object
- BookmeterScraper::Scraper
show all
- Defined in:
- lib/bookmeter_scraper/scraper.rb
Defined Under Namespace
Classes: Book, Books, Profile, User
Constant Summary
collapse
- PROFILE_ATTRIBUTES =
%i(
name
gender
age
blood_type
job
address
url
description
first_day
elapsed_days
read_books_count
read_pages_count
reviews_count
bookshelfs_count
)
- JP_ATTRIBUTE_NAMES =
{
gender: '性別',
age: '年齢',
blood_type: '血液型',
job: '職業',
address: '現住所',
url: 'URL / ブログ',
description: '自己紹介',
first_day: '記録初日',
elapsed_days: '経過日数',
read_books_count: '読んだ本',
read_pages_count: '読んだページ',
reviews_count: '感想/レビュー',
bookshelfs_count: '本棚',
}
- BOOK_ATTRIBUTES =
%i(name author read_dates uri image_uri)
- USER_ATTRIBUTES =
%i(name id uri)
- NUM_BOOKS_PER_PAGE =
40
- NUM_USERS_PER_PAGE =
20
Instance Attribute Summary collapse
Instance Method Summary
collapse
-
#extract_books(page) ⇒ Object
-
#extract_users(page) ⇒ Object
-
#fetch_books(user_id, uri_method, agent = @agent) ⇒ Object
-
#fetch_followers(user_id, agent = @agent) ⇒ Object
-
#fetch_followings(user_id, agent = @agent) ⇒ Object
-
#fetch_profile(user_id, agent = @agent) ⇒ Object
-
#fetch_read_books(user_id, target_year_month) ⇒ Object
-
#fetch_target_books(target_year_month, page) ⇒ Object
-
#get_book_page(book_uri, agent = @agent) ⇒ Object
-
#get_last_book_date(page) ⇒ Object
-
#initialize(agent = nil) ⇒ Scraper
constructor
A new instance of Scraper.
-
#scrape_book_author(book_uri) ⇒ Object
-
#scrape_book_image_uri(book_uri) ⇒ Object
-
#scrape_book_name(book_uri) ⇒ Object
-
#scrape_books_pages(user_id, uri_method, agent = @agent) ⇒ Object
-
#scrape_followers_page(user_id) ⇒ Object
-
#scrape_followings_page(user_id, agent = @agent) ⇒ Object
-
#scrape_others_followings_page(user_id) ⇒ Object
-
#scrape_profile(user_id, agent) ⇒ Object
-
#scrape_read_date(book_uri, agent = @agent) ⇒ Object
-
#scrape_reread_date(book_uri, agent = @agent) ⇒ Object
-
#scrape_users_listing_page(user_id, uri_method, agent = @agent) ⇒ Object
Constructor Details
#initialize(agent = nil) ⇒ Scraper
Returns a new instance of Scraper.
74
75
76
77
|
# File 'lib/bookmeter_scraper/scraper.rb', line 74
# Sets up a new scraper.
#
# @param agent [Object, nil] kept in @agent; the other methods of this class
#   use @agent as the default value of their own `agent` parameter
def initialize(agent = nil)
  # @book_pages starts empty; #get_book_page stores fetched pages in it,
  # keyed by book URI.
  @agent, @book_pages = agent, {}
end
|
Instance Attribute Details
#agent ⇒ Object
Returns the value of attribute agent.
71
72
73
|
# File 'lib/bookmeter_scraper/scraper.rb', line 71
# Reader for the agent that was handed to #initialize.
#
# @return [Object, nil] the current value of @agent
def agent
@agent
end
|
Instance Method Details
#extract_books(page) ⇒ Object
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
|
# File 'lib/bookmeter_scraper/scraper.rb', line 151
# Builds a Book for every book link found in one scraped listing page.
#
# Links are read from the keys "book_1_link" .. "book_<NUM_BOOKS_PER_PAGE>_link";
# iteration stops at the first empty link. For each book the first read date
# (when present) and every re-read date are collected into `read_dates`.
#
# @param page [#[]] scraped listing page; each "book_<n>_link" value must respond to #empty?
# @return [Array<Book>] one Book per non-empty link, in page order
# @raise [ArgumentError] if page is nil
def extract_books(page)
  raise ArgumentError if page.nil?

  books = []
  1.upto(NUM_BOOKS_PER_PAGE) do |i|
    book_path = page["book_#{i}_link"]
    break if book_path.empty?

    read_dates = []
    read_date = scrape_read_date(book_path)
    unless read_date.empty?
      read_dates << Time.local(read_date['year'], read_date['month'], read_date['day'])
    end

    # scrape_reread_date may hand back a single entry or a list of entries;
    # wrapping and flattening treats both the same way.
    [scrape_reread_date(book_path)].flatten.each do |date|
      read_dates << Time.local(date['reread_year'], date['reread_month'], date['reread_day'])
    end

    book_name = scrape_book_name(book_path)
    book_author = scrape_book_author(book_path)
    book_image_uri = scrape_book_image_uri(book_path)
    books << Book.new(book_name, book_author, read_dates, ROOT_URI + book_path, book_image_uri)
  end
  books
end
|
#extract_users(page) ⇒ Object
371
372
373
374
375
376
377
378
379
380
381
382
383
384
|
# File 'lib/bookmeter_scraper/scraper.rb', line 371
# Builds a User for every user entry found in one scraped listing page.
#
# Entries are read from "user_<n>_name" / "user_<n>_link" for
# n = 1..NUM_USERS_PER_PAGE; iteration stops at the first empty name.
# The user id is the run of digits after "/u/" at the end of the link.
#
# @param page [#[]] scraped listing page
# @return [Array<User>] users in page order
# @raise [ArgumentError] if page is nil
def extract_users(page)
  raise ArgumentError if page.nil?

  users = []
  1.upto(NUM_USERS_PER_PAGE) do |i|
    user_name = page["user_#{i}_name"]
    break if user_name.empty?

    user_id = page["user_#{i}_link"].match(/\/u\/(\d+)$/)[1]
    users << User.new(user_name, user_id, ROOT_URI + "/u/#{user_id}")
  end
  users
end
|
#fetch_books(user_id, uri_method, agent = @agent) ⇒ Object
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
# File 'lib/bookmeter_scraper/scraper.rb', line 105
# Collects the books listed on every scraped page of one of a user's book lists.
#
# NOTE(review): `agent` is only used for the nil / logged-in guards here;
# #scrape_books_pages is called without it and therefore falls back to @agent.
#
# @param user_id [String] must match USER_ID_REGEX
# @param uri_method [Symbol] name of a method BookmeterScraper responds to
# @param agent [Object] defaults to @agent; must respond to #logged_in?
# @return [Books, Array] all extracted books, or [] when the agent is not logged in
# @raise [ArgumentError] if user_id or uri_method is rejected
# @raise [ScraperError] if agent is nil
def fetch_books(user_id, uri_method, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ArgumentError unless BookmeterScraper.methods.include?(uri_method)
  raise ScraperError if agent.nil?
  return [] unless agent.logged_in?

  books = Books.new
  scrape_books_pages(user_id, uri_method).each do |page|
    books.concat(extract_books(page))
  end
  books
end
|
#fetch_followers(user_id, agent = @agent) ⇒ Object
318
319
320
321
322
323
324
325
326
327
328
329
330
|
# File 'lib/bookmeter_scraper/scraper.rb', line 318
# Collects the users listed on a user's followers page(s).
#
# NOTE(review): `agent` is only used for the nil / logged-in guards here;
# #scrape_followers_page is called without it.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [Object] defaults to @agent; must respond to #logged_in?
# @return [Array<User>] extracted users, or [] when the agent is not logged in
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
# @raise [ScraperError] if agent is nil
def fetch_followers(user_id, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ScraperError if agent.nil?
  return [] unless agent.logged_in?

  scrape_followers_page(user_id).flat_map { |page| extract_users(page) }
end
|
#fetch_followings(user_id, agent = @agent) ⇒ Object
303
304
305
306
307
308
309
310
311
312
313
314
315
316
|
# File 'lib/bookmeter_scraper/scraper.rb', line 303
# Collects the users that `user_id` follows.
#
# The logged-in user's own followings page is scraped with
# #scrape_followings_page; any other user's with #scrape_others_followings_page.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [Object] defaults to @agent; must respond to #logged_in? and #log_in_user_id
# @return [Array<User>] extracted users, or [] when the agent is not logged in
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
# @raise [ScraperError] if agent is nil
def fetch_followings(user_id, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ScraperError if agent.nil?
  return [] unless agent.logged_in?

  scraped_pages =
    if user_id == agent.log_in_user_id
      scrape_followings_page(user_id)
    else
      scrape_others_followings_page(user_id)
    end
  scraped_pages.flat_map { |page| extract_users(page) }
end
|
#fetch_profile(user_id, agent = @agent) ⇒ Object
79
80
81
82
83
84
|
# File 'lib/bookmeter_scraper/scraper.rb', line 79
# Scrapes a user's profile and wraps the result in a Profile.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [Object] defaults to @agent; passed on to #scrape_profile
# @return [Profile] built from the values returned by #scrape_profile, splatted
#   as positional arguments
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
# @raise [ScraperError] if agent is nil
def fetch_profile(user_id, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ScraperError if agent.nil?

  profile_values = scrape_profile(user_id, agent)
  Profile.new(*profile_values)
end
|
#fetch_read_books(user_id, target_year_month) ⇒ Object
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
|
# File 'lib/bookmeter_scraper/scraper.rb', line 189
# Collects the books a user read in a given year/month.
#
# Walks the pages returned by #scrape_books_pages for :read_books_uri and,
# for each page, compares the target month against the month of the page's
# first book and of its last book (see #get_last_book_date):
#
# * target earlier than the last book's month  -> skip to the next page
# * target later than the first book's month   -> stop
# * otherwise the page's matching books are added; the walk continues only
#   while the target equals the last book's month
#
# NOTE(review): this only works if pages and books are ordered newest
# first — confirm against the site.
#
# @param user_id [String] must match USER_ID_REGEX
# @param target_year_month [Time] compared with Time.local(year, month) values
# @return [Books] books gathered via #fetch_target_books
# @raise [ArgumentError] if user_id is rejected or target_year_month is nil
def fetch_read_books(user_id, target_year_month)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ArgumentError if target_year_month.nil?

  collected = Books.new
  scrape_books_pages(user_id, :read_books_uri).each do |page|
    head_date = scrape_read_date(page['book_1_link'])
    tail_date = get_last_book_date(page)
    head_month = Time.local(head_date['year'].to_i, head_date['month'].to_i)
    tail_month = Time.local(tail_date['year'].to_i, tail_date['month'].to_i)

    next if target_year_month < tail_month
    break if target_year_month > head_month

    collected.concat(fetch_target_books(target_year_month, page))
    # A page whose last book is older than the target is the final page
    # that is consulted for the target month.
    break if target_year_month > tail_month
  end
  collected
end
|
#fetch_target_books(target_year_month, page) ⇒ Object
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
|
# File 'lib/bookmeter_scraper/scraper.rb', line 228
# Picks, from one scraped listing page, the books that were read or re-read
# in the target year/month.
#
# Every "book_<n>_link" slot (n = 1..NUM_BOOKS_PER_PAGE) is inspected; empty
# slots are skipped. A book matches when the month of its first read date or
# of any re-read date equals `target_year_month`.
#
# A book whose first read date is empty no longer raises: it is matched on
# its re-read dates only, mirroring the guard used when extracting books
# from a page.
#
# @param target_year_month [Time] compared against Time.local(year, month) values
# @param page [#[]] scraped listing page
# @return [Books] matching books; each carries all of its read dates
# @raise [ArgumentError] if either argument is nil
def fetch_target_books(target_year_month, page)
  raise ArgumentError if target_year_month.nil?
  raise ArgumentError if page.nil?

  target_books = Books.new
  1.upto(NUM_BOOKS_PER_PAGE) do |i|
    book_path = page["book_#{i}_link"]
    next if book_path.empty?

    read_dates = []
    read_year_months = []
    read_date = scrape_read_date(book_path)
    unless read_date.empty?
      read_dates << Time.local(read_date['year'], read_date['month'], read_date['day'])
      read_year_months << Time.local(read_date['year'], read_date['month'])
    end

    # scrape_reread_date may hand back a single entry or a list of entries.
    reread_dates = [scrape_reread_date(book_path)].flatten
    reread_dates.each do |date|
      read_year_months << Time.local(date['reread_year'], date['reread_month'])
    end
    next unless read_year_months.include?(target_year_month)

    # Full re-read timestamps are only built for books that matched.
    reread_dates.each do |date|
      read_dates << Time.local(date['reread_year'], date['reread_month'], date['reread_day'])
    end

    book_name = scrape_book_name(book_path)
    book_author = scrape_book_author(book_path)
    book_image_uri = scrape_book_image_uri(book_path)
    target_books << Book.new(book_name, book_author, read_dates, ROOT_URI + book_path, book_image_uri)
  end
  target_books
end
|
#get_book_page(book_uri, agent = @agent) ⇒ Object
268
269
270
271
|
# File 'lib/bookmeter_scraper/scraper.rb', line 268
# Returns the page for a book, fetching it at most once per book URI.
#
# Pages are memoized in @book_pages under `book_uri`; a fetch happens via
# `agent.get(ROOT_URI + book_uri)` only while no truthy entry is stored.
#
# @param book_uri [String] path appended to ROOT_URI
# @param agent [Object] defaults to @agent; must respond to #get
# @return [Object] whatever `agent.get` returned for this book
def get_book_page(book_uri, agent = @agent)
  @book_pages[book_uri] ||= agent.get(ROOT_URI + book_uri)
end
|
#get_last_book_date(page) ⇒ Object
218
219
220
221
222
223
224
225
226
|
# File 'lib/bookmeter_scraper/scraper.rb', line 218
# Returns the read date of the last book present on a scraped listing page.
#
# Slots are scanned from "book_<NUM_BOOKS_PER_PAGE>_link" down to
# "book_1_link"; the first non-empty link found is handed to #scrape_read_date.
#
# @param page [#[]] scraped listing page
# @return [Object, nil] the result of #scrape_read_date for the last book,
#   or nil when the page has no book link at all (previously the loop's own
#   return value, the Integer NUM_BOOKS_PER_PAGE, leaked out in that case)
# @raise [ArgumentError] if page is nil
def get_last_book_date(page)
  raise ArgumentError if page.nil?

  NUM_BOOKS_PER_PAGE.downto(1) do |i|
    link = page["book_#{i}_link"]
    return scrape_read_date(link) unless link.empty?
  end
  nil
end
|
#scrape_book_author(book_uri) ⇒ Object
277
278
279
|
# File 'lib/bookmeter_scraper/scraper.rb', line 277
# Reads the author from a book page.
#
# @param book_uri [String] passed to #get_book_page
# @return [Object] the `text` of the page's '#author_name' search result
def scrape_book_author(book_uri)
  book_page = get_book_page(book_uri)
  author_nodes = book_page.search('#author_name')
  author_nodes.text
end
|
#scrape_book_image_uri(book_uri) ⇒ Object
281
282
283
|
# File 'lib/bookmeter_scraper/scraper.rb', line 281
# Reads the cover image URI from a book page.
#
# @param book_uri [String] passed to #get_book_page
# @return [Object] the `text` of the `src` attribute of the element with id "book_image"
def scrape_book_image_uri(book_uri)
  book_page = get_book_page(book_uri)
  src_attribute = book_page.search('//*[@id="book_image"]/@src')
  src_attribute.text
end
|
#scrape_book_name(book_uri) ⇒ Object
273
274
275
|
# File 'lib/bookmeter_scraper/scraper.rb', line 273
# Reads the title from a book page.
#
# @param book_uri [String] passed to #get_book_page
# @return [Object] the `text` of the page's '#title' search result
def scrape_book_name(book_uri)
  book_page = get_book_page(book_uri)
  title_nodes = book_page.search('#title')
  title_nodes.text
end
|
#scrape_books_pages(user_id, uri_method, agent = @agent) ⇒ Object
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
# File 'lib/bookmeter_scraper/scraper.rb', line 120
# Scrapes the listing page(s) of one of a user's book lists.
#
# The first page is fetched from the URI produced by calling `uri_method`
# on BookmeterScraper with `user_id`. Each page is parsed into a structure
# with up to NUM_BOOKS_PER_PAGE "book_<n>_name" / "book_<n>_link" entries.
#
# @param user_id [String] must match USER_ID_REGEX
# @param uri_method [Symbol] name of a method BookmeterScraper responds to
# @param agent [Object] defaults to @agent; must respond to #logged_in? and #get
# @return [Array] [] when the agent is not logged in or the page has no book
#   anchors; a one-element array for an unpaginated list; otherwise the result
#   of the paginated Yasuri tree (presumably one entry per page — TODO confirm)
# @raise [ArgumentError] if user_id or uri_method is rejected
# @raise [ScraperError] if agent is nil
def scrape_books_pages(user_id, uri_method, agent = @agent)
raise ArgumentError unless user_id =~ USER_ID_REGEX
raise ArgumentError unless BookmeterScraper.methods.include?(uri_method)
raise ScraperError if agent.nil?
return [] unless agent.logged_in?
books_page = agent.get(BookmeterScraper.method(uri_method).call(user_id))
# No book anchors on the page: nothing to scrape.
return [] if books_page.search('#main_left > div > center > a').empty?
# No "now_page" marker means there is no pager, so only this single page is parsed.
if books_page.search('span.now_page').empty?
books_root = Yasuri.struct_books '//*[@id="main_left"]/div' do
1.upto(NUM_BOOKS_PER_PAGE) do |i|
# div[i + 1]: book entries start at the second child div.
send("text_book_#{i}_name", "//*[@id=\"main_left\"]/div/div[#{i + 1}]/div[2]/a")
send("text_book_#{i}_link", "//*[@id=\"main_left\"]/div/div[#{i + 1}]/div[2]/a/@href")
end
end
return [books_root.inject(agent, books_page)]
end
# Paginated list: the pager link taken is the anchor in the span right after "now_page".
books_root = Yasuri.pages_root '//span[@class="now_page"]/following-sibling::span[1]/a' do
text_page_index '//span[@class="now_page"]/a'
1.upto(NUM_BOOKS_PER_PAGE) do |i|
send("text_book_#{i}_name", "//*[@id=\"main_left\"]/div/div[#{i + 1}]/div[2]/a")
send("text_book_#{i}_link", "//*[@id=\"main_left\"]/div/div[#{i + 1}]/div[2]/a/@href")
end
end
books_root.inject(agent, books_page)
end
|
#scrape_followers_page(user_id) ⇒ Object
351
352
353
354
|
# File 'lib/bookmeter_scraper/scraper.rb', line 351
# Scrapes the followers listing of a user.
#
# @param user_id [String] must match USER_ID_REGEX
# @return [Object] the result of #scrape_users_listing_page for :followers_uri
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
def scrape_followers_page(user_id)
  return scrape_users_listing_page(user_id, :followers_uri) if user_id =~ USER_ID_REGEX

  raise ArgumentError
end
|
#scrape_followings_page(user_id, agent = @agent) ⇒ Object
332
333
334
335
336
337
338
339
340
341
342
343
344
|
# File 'lib/bookmeter_scraper/scraper.rb', line 332
# Scrapes the followings page at BookmeterScraper.followings_uri(user_id)
# using the XPath layout div[n]/a for each user entry.
#
# The page is parsed into up to NUM_USERS_PER_PAGE "user_<n>_name" /
# "user_<n>_link" entries taken from each anchor's title and href.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [Object] defaults to @agent; must respond to #logged_in? and #get
# @return [Array] [] when the agent is not logged in, otherwise a one-element
#   array holding the parsed page
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
def scrape_followings_page(user_id, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  return [] unless agent.logged_in?

  listing = agent.get(BookmeterScraper.followings_uri(user_id))
  user_tree = Yasuri.struct_books('//*[@id="main_left"]/div') do
    (1..NUM_USERS_PER_PAGE).each do |n|
      send("text_user_#{n}_name", "//*[@id=\"main_left\"]/div/div[#{n}]/a/@title")
      send("text_user_#{n}_link", "//*[@id=\"main_left\"]/div/div[#{n}]/a/@href")
    end
  end
  [user_tree.inject(agent, listing)]
end
|
#scrape_others_followings_page(user_id) ⇒ Object
346
347
348
349
|
# File 'lib/bookmeter_scraper/scraper.rb', line 346
# Scrapes the followings listing of a user via the generic users listing scraper.
#
# @param user_id [String] must match USER_ID_REGEX
# @return [Object] the result of #scrape_users_listing_page for :followings_uri
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
def scrape_others_followings_page(user_id)
  return scrape_users_listing_page(user_id, :followings_uri) if user_id =~ USER_ID_REGEX

  raise ArgumentError
end
|
#scrape_profile(user_id, agent) ⇒ Object
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
# File 'lib/bookmeter_scraper/scraper.rb', line 86
# Scrapes the profile values of a user from their "mypage".
#
# Each profile <dl> contributes one label/value pair (text of its first and
# second child). The pairs are looked up through JP_ATTRIBUTE_NAMES in
# PROFILE_ATTRIBUTES order; attributes without a matching label yield nil.
# Slot 0 is then overwritten with the text of the page's
# '#side_left > div.inner > h3' element.
#
# @param user_id [String] must match USER_ID_REGEX
# @param agent [Object] must respond to #get
# @return [Array] one value per PROFILE_ATTRIBUTES entry
# @raise [ArgumentError] if user_id does not match USER_ID_REGEX
# @raise [ScraperError] if agent is nil
def scrape_profile(user_id, agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ScraperError if agent.nil?

  mypage = agent.get(BookmeterScraper.mypage_uri(user_id))

  # Label => value table built from the profile definition lists.
  values_by_label = {}
  mypage.search('#side_left > div.inner > div.profile > dl').each do |dl|
    values_by_label[dl.children[0].children.text] = dl.children[1].children.text
  end

  labels = PROFILE_ATTRIBUTES.map { |attribute| JP_ATTRIBUTE_NAMES[attribute] }
  attributes = values_by_label.values_at(*labels)
  attributes[0] = mypage.at_css('#side_left > div.inner > h3').text
  attributes
end
|
#scrape_read_date(book_uri, agent = @agent) ⇒ Object
285
286
287
288
289
290
291
292
|
# File 'lib/bookmeter_scraper/scraper.rb', line 285
# Extracts the read date shown on a book page.
#
# Builds a Yasuri structure with "year", "month" and "day" text nodes taken
# from the first option of the read_date_y / read_date_m / read_date_d
# selects; each is truncated with /\d+/ and converted with :to_i.
#
# NOTE(review): the page itself comes from #get_book_page, which is called
# without `agent` and so uses its own default.
#
# @param book_uri [String] passed to #get_book_page
# @param agent [Object] defaults to @agent; handed to the Yasuri tree's #inject
# @return [Object] the result of injecting the book page into the structure
def scrape_read_date(book_uri, agent = @agent)
  date_tree = Yasuri.struct_date('//*[@id="book_edit_area"]/form[1]/div[2]') do
    text_year('//*[@id="read_date_y"]/option[1]', truncate: /\d+/, proc: :to_i)
    text_month('//*[@id="read_date_m"]/option[1]', truncate: /\d+/, proc: :to_i)
    text_day('//*[@id="read_date_d"]/option[1]', truncate: /\d+/, proc: :to_i)
  end
  date_tree.inject(agent, get_book_page(book_uri))
end
|
#scrape_reread_date(book_uri, agent = @agent) ⇒ Object
294
295
296
297
298
299
300
301
|
# File 'lib/bookmeter_scraper/scraper.rb', line 294
# Extracts the re-read date(s) shown on a book page.
#
# Builds a Yasuri structure with "reread_year", "reread_month" and
# "reread_day" text nodes taken from the first option of the three selects
# inside each "reread_box" form; each is truncated with /\d+/ and converted
# with :to_i.
#
# NOTE(review): the page itself comes from #get_book_page, which is called
# without `agent` and so uses its own default.
#
# @param book_uri [String] passed to #get_book_page
# @param agent [Object] defaults to @agent; handed to the Yasuri tree's #inject
# @return [Object] the result of injecting the book page into the structure
def scrape_reread_date(book_uri, agent = @agent)
  reread_tree = Yasuri.struct_reread_date('//*[@id="book_edit_area"]/div/form[1]/div[2]') do
    text_reread_year('//div[@class="reread_box"]/form[1]/div[2]/select[1]/option[1]', truncate: /\d+/, proc: :to_i)
    text_reread_month('//div[@class="reread_box"]/form[1]/div[2]/select[2]/option[1]', truncate: /\d+/, proc: :to_i)
    text_reread_day('//div[@class="reread_box"]/form[1]/div[2]/select[3]/option[1]', truncate: /\d+/, proc: :to_i)
  end
  reread_tree.inject(agent, get_book_page(book_uri))
end
|
#scrape_users_listing_page(user_id, uri_method, agent = @agent) ⇒ Object
356
357
358
359
360
361
362
363
364
365
366
367
368
369
|
# File 'lib/bookmeter_scraper/scraper.rb', line 356
# Scrapes a users listing page using the XPath layout div[n]/div/div[2]/a
# for each user entry.
#
# The page is fetched from the URI produced by calling `uri_method` on
# BookmeterScraper with `user_id`, then parsed into up to NUM_USERS_PER_PAGE
# "user_<n>_name" / "user_<n>_link" entries (anchor title and href).
#
# @param user_id [String] must match USER_ID_REGEX
# @param uri_method [Symbol] name of a method BookmeterScraper responds to
# @param agent [Object] defaults to @agent; must respond to #logged_in? and #get
# @return [Array] [] when the agent is not logged in, otherwise a one-element
#   array holding the parsed page
# @raise [ArgumentError] if user_id or uri_method is rejected
def scrape_users_listing_page(user_id, uri_method, agent = @agent)
  raise ArgumentError unless user_id =~ USER_ID_REGEX
  raise ArgumentError unless BookmeterScraper.methods.include?(uri_method)
  return [] unless agent.logged_in?

  listing = agent.get(BookmeterScraper.method(uri_method).call(user_id))
  user_tree = Yasuri.struct_users('//*[@id="main_left"]/div') do
    (1..NUM_USERS_PER_PAGE).each do |n|
      send("text_user_#{n}_name", "//*[@id=\"main_left\"]/div/div[#{n}]/div/div[2]/a/@title")
      send("text_user_#{n}_link", "//*[@id=\"main_left\"]/div/div[#{n}]/div/div[2]/a/@href")
    end
  end
  [user_tree.inject(agent, listing)]
end
|