Class: Wordwise::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/scraper.rb

Overview

Scrapes the web pages containing the word lists and the individual word pages with their definitions.

Constant Summary collapse

BASEPATH =
'https://en.oxforddictionaries.com'

Class Method Summary collapse

Class Method Details

.scrape_entry_pages ⇒ Object

Samples 4 urls to words’ pages and parses the question word, its origin and definition, and 3 more definitions.



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/scraper.rb', line 46

# Fetches the dictionary entry page of every question word and extracts the
# origin text of the first one.
#
# The words come from Wordwise::CLI.question_words; each page URL is
# "#{BASEPATH}/definition/<word>". Every page is downloaded and parsed, but
# only the first document is inspected here.
#
# Requires open-uri to be loaded (the URL-opening call depends on it).
#
# @return [String] text of the last '.senseInnerWrapper p' node on the first
#   word's page, or 'Origin not available.' when there is no such node (or no
#   question words at all).
def self.scrape_entry_pages
  # URI.open replaces Kernel#open, which stopped handling URLs in Ruby 3.0.
  # The block form closes the download handle once Nokogiri has parsed it.
  docs = Wordwise::CLI.question_words.map do |word|
    URI.open("#{BASEPATH}/definition/#{word}") { |page| Nokogiri::HTML(page) }
  end

  # The origin, when present, is the last paragraph inside the sense wrapper.
  # Safe navigation covers an empty list of question words.
  origin_wrapper = docs.first&.css('.senseInnerWrapper p')&.last
  origin_wrapper ? origin_wrapper.text : 'Origin not available.'
end

.scrape_word_list(page_idx) ⇒ Object

Scrapes a page with a word list.



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/scraper.rb', line 26

# Scrapes one word-list page into word/definition pairs.
#
# Reads the page URL from @list_urls (populated by .scrape_word_lists), so
# that method must have run first. Stores the filtered pairs in @words_defs
# (Hash) and @words_defs_ary (Array).
#
# Requires open-uri to be loaded (the URL-opening call depends on it).
#
# @param page_idx [Integer] index into @list_urls of the list to scrape
# @return [Array<Array(String, String)>] [word, definition] pairs
def self.scrape_word_list(page_idx)
  # URI.open replaces Kernel#open, which stopped handling URLs in Ruby 3.0.
  # The block form closes the download handle once Nokogiri has parsed it.
  doc = URI.open(@list_urls[page_idx]) { |page| Nokogiri::HTML(page) }
  @words_defs = {}

  # Creates hash of word-definition pairs. The rows are queried once, and rows
  # without both a word cell and a definition cell (e.g. header rows) are
  # skipped instead of raising on nil.
  doc.css('tr').each do |row|
    cells = row.css('td')
    word_cell = cells[0]
    definition_cell = cells[1]
    next unless word_cell && definition_cell

    @words_defs[word_cell.text] = definition_cell.text
  end

  # Removes invalid entries: blank words, words containing a non-word
  # character, and words matching the hard-coded exclusions.
  # NOTE(review): /do/ is unanchored, so it also drops any word that merely
  # contains "do" - confirm that this is intended.
  @words_defs.delete('')
  @words_defs.delete_if do |word, _definition|
    word.match?(/\W/) || word.match?(/xylene/) || word.match?(/do/)
  end

  # Converts hash to array for use in .scrape_entry_pages.
  @words_defs_ary = @words_defs.to_a
end

.scrape_word_lists ⇒ Object

Scrapes the page that lists the word lists.



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/scraper.rb', line 8

def self.scrape_word_lists
  html = Nokogiri::HTML(open(BASEPATH + '/explore/word-lists'))
  @list_urls, lists = [], []

  # Populates arrays of word list names and urls.
  (0..html.css('.record').size - 1).each do |i|
    @list_urls << BASEPATH + html.css('.record a')[i].attribute('href').value
    lists << html.css('.record h2')[i].text
  end

  # Removes list not fitting format.
  @list_urls.delete_if { |u| u =~ /phobias/ }
  lists.delete_if { |l| l =~ /Phobias/ }

  lists
end