Class: Proto::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/proto/scraper.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Scraper

Returns a new instance of Scraper.



5
6
7
8
9
10
# File 'lib/proto/scraper.rb', line 5

# Builds a scraper for the page at +url+.
#
# Stores the URL with one trailing slash removed, fetches and parses the
# page, and resets the pagination counter and the collected-URL list.
#
# @param url [String] address of the first page to scrape
def initialize(url)
  @url = url.chomp('/') # remove trailing slash
  # URI.open instead of Kernel#open: Kernel#open runs a subprocess when the
  # string starts with "|", and stopped opening URLs in Ruby 3.0.
  # NOTE(review): assumes 'open-uri' is already required by this file, as the
  # previous open(url) call needed it too - confirm.
  # The page is fetched from +url+ exactly as given (before the chomp).
  @doc = Nokogiri::HTML(URI.open(url))
  @page_count     = 1
  @url_collection = []
end

Instance Attribute Details

#doc ⇒ Object

Returns the value of attribute doc.



3
4
5
# File 'lib/proto/scraper.rb', line 3

# Reader for the +doc+ attribute.
#
# @return [Object] whatever is currently stored in @doc
def doc; @doc; end

#page_count ⇒ Object

Returns the value of attribute page_count.



3
4
5
# File 'lib/proto/scraper.rb', line 3

# Reader for the +page_count+ attribute.
#
# @return [Object] whatever is currently stored in @page_count
def page_count; @page_count; end

#traverse ⇒ Object

Returns the value of attribute traverse.



3
4
5
# File 'lib/proto/scraper.rb', line 3

# Reader for the +traverse+ attribute.
#
# @return [Object] whatever is currently stored in @traverse
def traverse; @traverse; end

#url ⇒ Object

Returns the value of attribute url.



3
4
5
# File 'lib/proto/scraper.rb', line 3

# Reader for the +url+ attribute.
#
# @return [Object] whatever is currently stored in @url
def url; @url; end

#url_collection ⇒ Object

Returns the value of attribute url_collection.



3
4
5
# File 'lib/proto/scraper.rb', line 3

# Reader for the +url_collection+ attribute.
#
# @return [Object] whatever is currently stored in @url_collection
def url_collection; @url_collection; end

Instance Method Details

#collect_urls(base_url = self.url, pagination_selector = nil, url_selector) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/proto/scraper.rb', line 12

# Gathers link URLs from the current page and, when a pagination selector is
# given, from each following page as well.
#
# Every element matched by +url_selector+ contributes
# "#{base_url}#{href}" to +url_collection+. While +@page_count+ is lower than
# the number of elements matched by +pagination_selector+, the element at
# index +@page_count+ supplies the href of the next page, which is fetched,
# parsed and stored as the new +doc+.
#
# @param base_url [String] prefix prepended to every href (defaults to +url+)
# @param pagination_selector [String, nil] CSS selector for pagination links;
#   nil means only the current page is read
# @param url_selector [String] CSS selector for the links to collect
# @return [Array] +url_collection+ with the newly found URLs appended
def collect_urls(base_url=self.url, pagination_selector=nil, url_selector)
  loop do
    page_urls = doc.css(url_selector).map { |link| "#{base_url}#{link['href']}" }
    url_collection.concat(page_urls)
    break unless pagination_selector

    pagination_links = doc.css(pagination_selector)
    break unless @page_count < pagination_links.count

    # Build a fresh string: appending with << would modify base_url in place
    # (and @url when the default is used), so each later page would get an
    # ever longer, wrong prefix.
    next_url = "#{base_url}#{pagination_links[@page_count]['href']}"
    # URI.open instead of Kernel#open: no subprocess for "|..." strings, and
    # it still opens URLs on Ruby 3.0+.
    # NOTE(review): assumes 'open-uri' is already required by this file - confirm.
    self.doc = Nokogiri::HTML(URI.open(next_url))
    @page_count += 1
  end

  url_collection
end

#fetch(name = 'Type', args) ⇒ Object Also known as: fetch_and_create!



29
30
31
32
33
34
35
36
37
# File 'lib/proto/scraper.rb', line 29

# Scrapes attributes and turns them into return objects.
#
# Uses +scrape_single_page+ when +url_collection+ is empty and
# +scrape_multiple_pages+ otherwise, then passes the result together with
# +name+ to +create_return_objects+.
#
# @param name [String] passed through to +create_return_objects+
# @param args [Object] passed through to the scrape method
# @return [Object] whatever +create_return_objects+ returns
def fetch(name='Type', args)
  scraped = url_collection.empty? ? scrape_single_page(args) : scrape_multiple_pages(args)
  create_return_objects(name, scraped)
end