Class: Snapcrawl::Page
- Inherits:
- Object
- Inheritance hierarchy:
- Object
- Snapcrawl::Page
- Defined in:
- lib/snapcrawl/page.rb
Constant Summary
- EXTENSION_BLACKLIST =
"png|gif|jpg|pdf|zip"
- PROTOCOL_BLACKLIST =
"mailto|tel"
Instance Attribute Summary
-
#depth ⇒ Object
readonly
Returns the value of attribute depth.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary
-
#initialize(url, depth: 0) ⇒ Page
constructor
A new instance of Page.
- #links ⇒ Object
- #pages ⇒ Object
- #path ⇒ Object
- #save_screenshot(outfile) ⇒ Object
- #site ⇒ Object
- #valid? ⇒ Boolean
Constructor Details
#initialize(url, depth: 0) ⇒ Page
16 17 18 |
# File 'lib/snapcrawl/page.rb', line 16
#
# Builds a page from a URL and a depth value.
#
# @param url [#protocolize] address of the page; the value stored in +@url+
#   is whatever +url.protocolize+ returns
#   (NOTE(review): +protocolize+ is defined outside this view — presumably
#   it adds a missing protocol; confirm)
# @param depth [Object] value stored as the page's depth (defaults to 0)
def initialize(url, depth: 0)
  @url = url.protocolize
  @depth = depth
end
Instance Attribute Details
#depth ⇒ Object (readonly)
Returns the value of attribute depth.
11 12 13 |
# File 'lib/snapcrawl/page.rb', line 11
#
# Reader for the depth given to the constructor.
#
# @return [Object] the current value of +@depth+
def depth
  @depth
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
11 12 13 |
# File 'lib/snapcrawl/page.rb', line 11
#
# Reader for the URL stored by the constructor.
#
# @return [Object] the current value of +@url+
def url
  @url
end
Instance Method Details
#links ⇒ Object
32 33 34 35 36 |
# File 'lib/snapcrawl/page.rb', line 32
#
# Extracts the links of the page from the +a+ elements of its HTML body.
#
# @return [Object, nil] the result of +normalize_links+ applied to the
#   page's +a+ elements, or nil when the page is not valid
def links
  return unless valid?

  # Parse the response body and hand every anchor element to the normalizer.
  anchors = Nokogiri::HTML(http_response.body).css('a')
  normalize_links(anchors)
end
#pages ⇒ Object
38 39 40 41 |
# File 'lib/snapcrawl/page.rb', line 38
#
# Wraps every link of this page in a new Page one level deeper.
#
# @return [Array<Page>, nil] one Page per entry of +links+, each created
#   with +depth + 1+; nil when the page is not valid
def pages
  return unless valid?

  child_depth = depth + 1
  links.map { |href| Page.new(href, depth: child_depth) }
end
#path ⇒ Object
28 29 30 |
# File 'lib/snapcrawl/page.rb', line 28
#
# Returns the +request_uri+ of the page URL as parsed by Addressable.
# The value is cached in +@path+ after the first truthy result.
#
# @return [Object] +Addressable::URI.parse(url).request_uri+
def path
  return @path if @path

  @path = Addressable::URI.parse(url).request_uri
end
#save_screenshot(outfile) ⇒ Object
43 44 45 46 |
# File 'lib/snapcrawl/page.rb', line 43
#
# Saves a screenshot of the page URL to the given output file.
#
# @param outfile [#to_s] destination of the screenshot; converted to a
#   String before being handed to +Screenshot#save+
# @return [Object, false] the result of +Screenshot#save+, or false when the
#   page is not valid
def save_screenshot(outfile)
  return false unless valid?

  # Explicit conversion instead of a string that only interpolates outfile.
  Screenshot.new(url).save(outfile.to_s)
end
#site ⇒ Object
24 25 26 |
# File 'lib/snapcrawl/page.rb', line 24
#
# Returns the +site+ of the page URL as parsed by Addressable.
# The value is cached in +@site+ after the first truthy result.
#
# @return [Object] +Addressable::URI.parse(url).site+
def site
  return @site if @site

  @site = Addressable::URI.parse(url).site
end
#valid? ⇒ Boolean
20 21 22 |
# File 'lib/snapcrawl/page.rb', line 20
#
# Tells whether the HTTP response for this page reports success.
#
# @return [Boolean, nil] the result of +success?+ on +http_response+, or nil
#   when +http_response+ is nil
def valid?
  response = http_response
  response.nil? ? nil : response.success?
end