Class: WebscraperFramework::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/webscraper_framework/page.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html: nil) ⇒ Page

Returns a new instance of Page.



22
23
24
# File 'lib/webscraper_framework/page.rb', line 22

# Builds a page around an optional HTML value.
#
# @param html [Object, nil] value handed to the html= writer; defaults to nil
def initialize(html: nil)
  self.html = html
end

Instance Attribute Details

#html ⇒ Object

Returns the value of attribute html.



6
7
8
# File 'lib/webscraper_framework/page.rb', line 6

# Reader for the html attribute.
#
# @return [Object, nil] whatever is currently stored in @html
def html
  @html
end

Class Method Details

.by_html(html) ⇒ Object

Factory helper: builds a Page directly from the given html value, which is passed to the constructor unchanged.



37
38
39
# File 'lib/webscraper_framework/page.rb', line 37

# Factory: wraps the given html value in a new instance without any
# further processing.
#
# @param html [Object] value forwarded as the html: keyword of .new
# @return [Object] the instance returned by .new
def self.by_html(html)
  new(html: html)
end

.by_html_string(html_string) ⇒ Object

Factory helper: parses the given HTML string with Nokogiri::HTML and builds a Page from the result.



32
33
34
# File 'lib/webscraper_framework/page.rb', line 32

# Factory: parses raw markup with Nokogiri::HTML and wraps the parsed
# result in a new instance.
#
# @param html_string [String] markup handed to Nokogiri::HTML
# @return [Object] the instance returned by .new
def self.by_html_string(html_string)
  document = Nokogiri::HTML(html_string)
  new(html: document)
end

.by_url(url) ⇒ Object

Factory helper: fetches the page at the given URL via get_page, parses it with Nokogiri::HTML and builds a Page from the result.



27
28
29
# File 'lib/webscraper_framework/page.rb', line 27

# Factory: obtains the body for +url+ through get_page, parses it with
# Nokogiri::HTML and wraps the parsed result in a new instance.
#
# @param url [String] address passed to get_page
# @return [Object] the instance returned by .new
def self.by_url(url)
  body = get_page(url)
  document = Nokogiri::HTML(body)
  new(html: document)
end

.get_page(url, from_cache = true) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/webscraper_framework/page.rb', line 8

# Returns the body found at +url+, using an on-disk cache stored at
# "cache/<SHA-256 hex digest of url>".
#
# When +from_cache+ is truthy and the cache file exists, the cached copy is
# returned. Otherwise the body is fetched, written to the cache file
# (overwriting any previous copy) and returned.
#
# NOTE(review): the fetch goes through Kernel#open, which only understands
# URLs when open-uri has patched it and which treats strings starting with
# "|" as commands. Consider URI.open if untrusted URLs can reach this method.
#
# @param url [String] address (or path) handed to Kernel#open
# @param from_cache [Boolean] whether an existing cache file may be used
# @return [String] the page body
def self.get_page(url, from_cache = true)
  filename = "cache/#{Digest::SHA256.hexdigest(url)}"

  if from_cache && File.file?(filename)
    body = File.read(filename)
    puts "Gotten #{filename} from cache"
  else
    # Read the stream exactly once and keep the result: a second #read on the
    # same handle is already at EOF and would yield an empty string. The block
    # form also guarantees the handle is closed.
    body = open(url) { |io| io.read }
    # File.write does not create missing parent directories.
    Dir.mkdir('cache') unless Dir.exist?('cache')
    File.write(filename, body)
    puts "Written cache file #{filename}"
  end

  body
end

Instance Method Details

#collection_by_selector(selector) ⇒ Object



41
42
43
# File 'lib/webscraper_framework/page.rb', line 41

# Runs a CSS query against this page's html and wraps every match in its
# own WebscraperFramework::Page.
#
# @param selector [String] selector passed to html.css
# @return [Array] one Page (built via Page.by_html) per matched item
def collection_by_selector(selector)
  matches = html.css(selector)
  matches.map do |node|
    WebscraperFramework::Page.by_html(node)
  end
end