Class: GoodreadsBooks::Scraper
- Inherits:
-
Object
- Object
- GoodreadsBooks::Scraper
- Defined in:
- lib/goodreads_books/scraper.rb
Constant Summary collapse
- BASE_URL =
"https://www.goodreads.com"
- PAGE_URL =
"/choiceawards"
Class Method Summary collapse
- .scrape_awards_year ⇒ Object
-
.scrape_book_details(book) ⇒ Object
– self.scrape_books –.
-
.scrape_books(awards_year) ⇒ Object
– self.scrape_awards_year.
Class Method Details
.scrape_awards_year ⇒ Object
5 6 7 8 9 10 11 12 |
# File 'lib/goodreads_books/scraper.rb', line 5
# Determines the most recent Choice Awards year.
#
# When the awards year is missing from the URL, goodreads.com defaults to the
# latest awards year and the final URL ends in "/best-books-<latest awards
# year>". The year is therefore read from the last "-"-separated segment of
# the response's base URI.
#
# @return [Integer] the trailing URL segment converted with +to_i+
#   (0 when that segment is not numeric)
def self.scrape_awards_year
  main_url = "#{BASE_URL}#{PAGE_URL}"

  # URI.open (open-uri) replaces the bare Kernel#open call: Kernel#open no
  # longer accepts URLs as of Ruby 3.0. The block form also closes the
  # response once the final URL has been read.
  URI.open(main_url) do |page|
    page.base_uri.to_s.split("-").last.to_i
  end
end
.scrape_book_details(book) ⇒ Object
– self.scrape_books –
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/goodreads_books/scraper.rb', line 39
# Next level of scraping: fetches the category page referenced by
# +book.category_url+ and fills in the details of that category's best book.
#
# Sets +vote+, +author+, +url+ and +description+ on +book+.
#
# @param book [Object] must respond to +category_url+ and to the writers
#   +vote=+, +author=+, +url=+ and +description=+
# @return [String] the description that was assigned
def self.scrape_book_details(book)
  # URI.open (open-uri) replaces the bare Kernel#open call: Kernel#open no
  # longer accepts URLs as of Ruby 3.0. The block form closes the response
  # after Nokogiri has parsed it.
  book_doc = URI.open(book.category_url) { |page| Nokogiri::HTML(page) }

  # The winner header text is split on spaces and the second token is kept.
  book.vote = book_doc.css(".gcaRightContainer .gcaWinnerHeader").text.split(" ")[1]
  # NOTE(review): the attribute name was missing from the listing
  # ("book. = ..."); `author` is inferred from the ".gcaAuthor a.authorName"
  # selector -- confirm against GoodreadsBooks::Book.
  book.author = book_doc.css(".gcaRightContainer h3 .gcaAuthor a.authorName").text
  book.url = "#{BASE_URL}#{book_doc.css(".gcaRightContainer h3 a.winningTitle").attr("href").text}"

  # Pages for awards years before 2017 wrap the description in a second
  # <span>; later pages have no span, so fall back to the container itself.
  description = book_doc.css(".gcaRightContainer .readable.stacked span")[1]
  description ||= book_doc.css(".gcaRightContainer .readable.stacked")

  # The goodreads description is encoded, so .encode("ISO-8859-1") is needed
  # for special characters (e.g. "â\u0080\u0099s") to print correctly.
  book.description = description.text.encode("ISO-8859-1")
end
.scrape_books(awards_year) ⇒ Object
– self.scrape_awards_year
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/goodreads_books/scraper.rb', line 14
# Scrapes the category winners page of the given Choice Awards year and
# builds one book per category.
#
# @param awards_year [Integer, String] year interpolated into the
#   "/best-books-<year>" URL
# @return [Array] one result of GoodreadsBooks::Book.new_from_web_page per
#   ".category.clearFix" element, in page order (empty when none match)
def self.scrape_books(awards_year)
  main_url = "#{BASE_URL}#{PAGE_URL}/best-books-#{awards_year}"

  # URI.open (open-uri) replaces the bare Kernel#open call: Kernel#open no
  # longer accepts URLs as of Ruby 3.0. The block form closes the response
  # after Nokogiri has parsed it.
  doc = URI.open(main_url) { |page| Nokogiri::HTML(page) }

  # Category winners page: iterate through the best book of each category.
  # `map` collects every book; the previous `books = ...` inside the loop
  # overwrote the accumulator, so only the last book was returned.
  doc.css(".category.clearFix").map do |category|
    category_name = category.css("h4").text
    category_url = category.css("a").attr("href").text
    category_title = category.css("img").attr("alt").text

    # For each winner element, assemble the book_details hash.
    book_details = {
      awards_year: awards_year,
      category: category_name,
      title: category_title,
      category_url: "#{BASE_URL}#{category_url}"
    }

    GoodreadsBooks::Book.new_from_web_page(book_details)
  end
end