Class: Whatsa::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/whatsa/scraper.rb

Constant Summary collapse

WIKISEARCH =
'https://en.wikipedia.org/w/index.php?search='

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(term) ⇒ Scraper

Returns a new instance of Scraper.



7
8
9
10
11
12
13
14
15
# File 'lib/whatsa/scraper.rb', line 7

# Builds the search query from +term+ and fetches the matching Wikipedia page.
#
# @param term [String] free-text search term (anything responding to #gsub)
def initialize(term)
  # Keep only ASCII letters, digits, underscores and parentheses; every other
  # run of characters collapses to a single '+', and a leading or trailing '+'
  # is dropped. The letter ranges are spelled out separately because a single
  # A-z range also spans the ASCII punctuation [ \ ] ^ ` that sits between
  # 'Z' and 'a', which would then leak into the URL.
  @query = term.gsub(/[^A-Za-z0-9_()]+/, '+').gsub(/(\A\+|\+\z)/, '')

  # store the page in an instance variable so we don't keep polling the site
  url = WIKISEARCH + query
  # Kernel#open no longer fetches URLs on Ruby 3.0+; use URI.open when
  # open-uri provides it and fall back to Kernel#open on older Rubies.
  source = URI.respond_to?(:open) ? URI.open(url) : open(url)
  @page = Nokogiri::HTML(source)
end

Instance Attribute Details

#page ⇒ Object (readonly)

Returns the value of attribute page.



5
6
7
# File 'lib/whatsa/scraper.rb', line 5

# Reader for the @page instance variable.
#
# @return [Object] whatever is currently stored in @page
def page
  @page
end

#query ⇒ Object (readonly)

Returns the value of attribute query.



5
6
7
# File 'lib/whatsa/scraper.rb', line 5

# Reader for the @query instance variable.
#
# @return [Object] whatever is currently stored in @query
def query
  @query
end

Instance Method Details

#article? ⇒ Boolean

Returns:

  • (Boolean)


25
26
27
# File 'lib/whatsa/scraper.rb', line 25

# Tells whether the fetched page is a regular article: it must contain an
# element matching '#ca-nstab-main' and must not be a disambiguation page.
#
# @return [Boolean]
def article?
  # Without the main-namespace tab there is nothing further to check.
  return false if page.css('#ca-nstab-main').empty?

  !disambig?
end

#disambig? ⇒ Boolean

Returns:

  • (Boolean)


29
30
31
# File 'lib/whatsa/scraper.rb', line 29

# Tells whether the fetched page contains an element matching '#disambigbox'.
#
# @return [Boolean]
def disambig?
  disambig_boxes = page.css('#disambigbox')
  !disambig_boxes.empty?
end

#make_article ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/whatsa/scraper.rb', line 33

# Resolves the fetched page to an article.
#
# * On an article page, wraps the page in a Whatsa::Article.
# * On a search-results page that is not a "none found" page, starts a new
#   scraper for the text of the first result link and resolves that instead.
# * On a disambiguation page, does the same with the first disambiguation
#   choice.
# * Otherwise, or when there is no first link/choice to follow, returns nil.
#
# NOTE(review): each follow-up constructs a new scraper and recurses; nothing
# here bounds the depth if a followed title keeps resolving to a results or
# disambiguation page.
#
# @return [Whatsa::Article, nil]
def make_article
  return Whatsa::Article.new(page) if article?

  if results_page? && !not_found?
    # A results page may carry no result links; treat that as "nothing found"
    # instead of calling #text on nil.
    first_link = page.css('.mw-search-results li a').first
    return first_link && self.class.new(first_link.text).make_article
  end

  return nil unless disambig?

  # Likewise, an empty choice list yields nil rather than building a scraper
  # from a nil term.
  first_choice = make_disambig.choices.first
  first_choice && self.class.new(first_choice).make_article
end

#make_disambig ⇒ Object



46
47
48
# File 'lib/whatsa/scraper.rb', line 46

# Wraps the fetched page in a Whatsa::Disambig when it is a disambiguation
# page; returns nil for any other page.
#
# @return [Whatsa::Disambig, nil]
def make_disambig
  Whatsa::Disambig.new(page) if disambig?
end

#not_found? ⇒ Boolean

Returns:

  • (Boolean)


21
22
23
# File 'lib/whatsa/scraper.rb', line 21

# Tells whether the fetched page contains an element matching
# '.mw-search-nonefound'.
#
# @return [Boolean]
def not_found?
  none_found_markers = page.css('.mw-search-nonefound')
  !none_found_markers.empty?
end

#results_page? ⇒ Boolean

Returns:

  • (Boolean)


17
18
19
# File 'lib/whatsa/scraper.rb', line 17

# Tells whether the fetched page contains an element matching
# '.searchresults'.
#
# @return [Boolean]
def results_page?
  result_containers = page.css('.searchresults')
  !result_containers.empty?
end