Class: Proto::Scraper
- Inherits:
-
Object
- Object
- Proto::Scraper
- Defined in:
- lib/proto/scraper.rb
Instance Attribute Summary collapse
-
#doc ⇒ Object
Returns the value of attribute doc.
-
#page_count ⇒ Object
Returns the value of attribute page_count.
-
#traverse ⇒ Object
Returns the value of attribute traverse.
-
#url ⇒ Object
Returns the value of attribute url.
-
#url_collection ⇒ Object
Returns the value of attribute url_collection.
Instance Method Summary collapse
- #collect_urls(base_url = self.url, pagination_selector = nil, url_selector) ⇒ Object
- #fetch(name = 'Type', args) ⇒ Object (also: #fetch_and_create!)
-
#initialize(url) ⇒ Scraper
constructor
A new instance of Scraper.
Constructor Details
#initialize(url) ⇒ Scraper
Returns a new instance of Scraper.
5 6 7 8 9 10 |
# File 'lib/proto/scraper.rb', line 5
#
# Builds a scraper for +url+: remembers the URL without a trailing slash,
# downloads and parses the page, and resets the pagination state.
#
# @param url [String] address of the page to download
def initialize(url)
  @url = url.chomp('/') # remove trailing slash
  # URI.open rather than Kernel#open: open-uri stopped patching Kernel#open in
  # Ruby 3.0, and Kernel#open spawns a subprocess for strings starting with "|".
  @doc = Nokogiri::HTML(URI.open(url))
  @page_count = 1
  @url_collection = []
end
Instance Attribute Details
#doc ⇒ Object
Returns the value of attribute doc.
3 4 5 |
# Reader for @doc, the parsed page that #initialize builds with Nokogiri::HTML
# and that #collect_urls replaces each time it moves to the next page.
# File 'lib/proto/scraper.rb', line 3 def doc @doc end |
#page_count ⇒ Object
Returns the value of attribute page_count.
3 4 5 |
# Reader for @page_count, which #initialize sets to 1 and #collect_urls
# increments each time it loads another page.
# File 'lib/proto/scraper.rb', line 3 def page_count @page_count end |
#traverse ⇒ Object
Returns the value of attribute traverse.
3 4 5 |
# Reader for @traverse. Nothing in the listings shown here assigns @traverse,
# so it reads as nil unless it is set elsewhere — TODO confirm its purpose.
# File 'lib/proto/scraper.rb', line 3 def traverse @traverse end |
#url ⇒ Object
Returns the value of attribute url.
3 4 5 |
# Reader for @url, the URL given to #initialize with one trailing "/" removed.
# #collect_urls uses it as the default base_url.
# File 'lib/proto/scraper.rb', line 3 def url @url end |
#url_collection ⇒ Object
Returns the value of attribute url_collection.
3 4 5 |
# Reader for @url_collection, the array that #initialize creates empty and
# #collect_urls fills; #fetch checks whether it is empty.
# File 'lib/proto/scraper.rb', line 3 def url_collection @url_collection end |
Instance Method Details
#collect_urls(base_url = self.url, pagination_selector = nil, url_selector) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/proto/scraper.rb', line 12
#
# Gathers the links matched by +url_selector+ on the current page and, when a
# +pagination_selector+ is given, on each following page, appending the
# resulting URLs to +url_collection+.
#
# @param base_url [String] prefix joined to every collected href (defaults to #url)
# @param pagination_selector [String, nil] CSS selector for the pagination
#   links; when nil only the current page is scanned
# @param url_selector [String] CSS selector for the links to collect
# @return [Array<String>] the accumulated +url_collection+
def collect_urls(base_url=self.url, pagination_selector=nil, url_selector)
  loop do
    url_collection.concat(doc.css(url_selector).map { |link| "#{base_url}#{link['href']}" })
    break unless pagination_selector

    pagination_links = doc.css(pagination_selector)
    break unless @page_count < pagination_links.count

    # The pagination link at index @page_count is treated as the next page.
    # Build a new string: appending with << would mutate base_url (and @url
    # when it is the default), corrupting every URL built afterwards.
    next_url = "#{base_url}#{pagination_links[@page_count]['href']}"
    # URI.open rather than Kernel#open, which no longer fetches URLs on Ruby 3.0+.
    self.doc = Nokogiri::HTML(URI.open(next_url))
    @page_count += 1
  end
  url_collection
end
#fetch(name = 'Type', args) ⇒ Object Also known as: fetch_and_create!
29 30 31 32 33 34 35 36 37 |
# File 'lib/proto/scraper.rb', line 29
#
# Scrapes attributes and turns them into return objects. When no URLs have
# been collected, +args+ goes to scrape_single_page; otherwise it goes to
# scrape_multiple_pages. The result is handed to create_return_objects.
#
# @param name [String] forwarded to create_return_objects (defaults to 'Type')
# @param args [Object] forwarded unchanged to the scrape helper; its expected
#   shape is defined by those helpers, which are not shown here
# @return [Object] whatever create_return_objects returns
def fetch(name='Type', args)
  attributes =
    if url_collection.empty?
      scrape_single_page(args)
    else
      scrape_multiple_pages(args)
    end
  create_return_objects(name, attributes)
end