Class: GoodreadsBooks::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/goodreads_books/scraper.rb

Constant Summary collapse

BASE_URL =
"https://www.goodreads.com"
PAGE_URL =
"/choiceawards"

Class Method Summary collapse

Class Method Details

.scrape_awards_year ⇒ Object



5
6
7
8
9
10
11
12
# File 'lib/goodreads_books/scraper.rb', line 5

# Determines the most recent Choice Awards year.
#
# When the awards year is missing from the URL, goodreads.com defaults to the
# latest Choice Awards page (".../best-books-<latest awards year>"), so the
# year is read from the URL the request finally resolved to, not from the
# page body.
#
# @return [Integer] the number at the end of the resolved URL; String#to_i
#   yields 0 when the last "-"-separated segment does not start with digits
def self.scrape_awards_year
  main_url = "#{BASE_URL}#{PAGE_URL}"

  # URI.open is used instead of a bare `open`: Kernel#open stopped accepting
  # URLs in Ruby 3.0. The block form also closes the response handle as soon
  # as the resolved URL has been read.
  URI.open(main_url) do |page|
    page.base_uri.to_s.split("-").last.to_i
  end
end

.scrape_book_details(book) ⇒ Object

– self.scrape_books –



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/goodreads_books/scraper.rb', line 36

# Second level of scraping: loads the category page referenced by
# +book.category_url+ and fills in the winning book's details.
#
# Sets +vote+, +author+, +url+ and +description+ on the given book.
#
# @param book [Object] must respond to +category_url+ and to the +vote=+,
#   +author=+, +url=+ and +description=+ writers
# @return [String] the description that was assigned to the book
def self.scrape_book_details(book)
  # URI.open is used instead of a bare `open`: Kernel#open stopped accepting
  # URLs in Ruby 3.0. Parsing inside the block lets the response handle be
  # closed as soon as the document has been built.
  book_doc = URI.open(book.category_url) { |page| Nokogiri::HTML(page) }

  # The second whitespace-separated token of the winner header is taken as the vote figure.
  book.vote = book_doc.css(".gcaRightContainer .gcaWinnerHeader").text.split(" ")[1]
  book.author = book_doc.css(".gcaRightContainer h3 .gcaAuthor a.authorName").text
  book.url = "#{BASE_URL}#{book_doc.css(".gcaRightContainer h3 a.winningTitle").attr("href").text}"

  # Pages for awards years before 2017 wrap the description in <span> tags
  # (the second span is the one used); newer pages have no span, so fall back
  # to the whole ".readable.stacked" container.
  description_source =
    book_doc.css(".gcaRightContainer .readable.stacked span")[1] ||
    book_doc.css(".gcaRightContainer .readable.stacked")

  # The goodreads description arrives mis-encoded (e.g. "â\u0080\u0099s" for
  # an apostrophe); encoding to ISO-8859-1 makes those characters printable.
  book.description = description_source.text.encode("ISO-8859-1")
end

.scrape_books(awards_year) ⇒ Object

– self.scrape_awards_year



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/goodreads_books/scraper.rb', line 14

# First level of scraping: loads the category winners page for the given
# awards year and builds one book per category.
#
# @param awards_year [Integer, String] year interpolated into the
#   "best-books-<year>" page URL
# @return [Array] the results of GoodreadsBooks::Book.new_from_web_page, one
#   per ".category.clearFix" element found on the page
def self.scrape_books(awards_year)
  main_url = "#{BASE_URL}#{PAGE_URL}/best-books-#{awards_year}"

  # URI.open is used instead of a bare `open`: Kernel#open stopped accepting
  # URLs in Ruby 3.0. Parsing inside the block lets the response handle be
  # closed as soon as the document has been built.
  doc = URI.open(main_url) { |page| Nokogiri::HTML(page) }

  # Category winners page: each category element carries the category name
  # (h4), the link to the category page (a) and the winning title (img alt).
  doc.css(".category.clearFix").map do |category|
    category_path = category.css("a").attr("href").text

    GoodreadsBooks::Book.new_from_web_page(
      awards_year: awards_year,
      category: category.css("h4").text,
      title: category.css("img").attr("alt").text,
      category_url: "#{BASE_URL}#{category_path}"
    )
  end
end