Class: Snapcrawl::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/snapcrawl/page.rb

Constant Summary collapse

EXTENSION_BLACKLIST =
"png|gif|jpg|pdf|zip"
PROTOCOL_BLACKLIST =
"mailto|tel"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, depth: 0) ⇒ Page



16
17
18
# File 'lib/snapcrawl/page.rb', line 16

# Builds a page for the given address at the given crawl depth.
#
# @param url [#protocolize] address of the page; what gets stored is the
#   result of calling +protocolize+ on it (helper defined outside this
#   listing -- presumably it makes sure the URL carries a scheme; confirm)
# @param depth [Integer] crawl depth of this page, 0 unless specified
def initialize(url, depth: 0)
  @url = url.protocolize
  @depth = depth
end

Instance Attribute Details

#depth ⇒ Object (readonly)

Returns the value of attribute depth.



11
12
13
# File 'lib/snapcrawl/page.rb', line 11

# Reader for the page's crawl depth.
#
# @return [Object] whatever is currently stored in @depth (the constructor
#   stores its +depth+ keyword there, which defaults to 0)
def depth
  @depth
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



11
12
13
# File 'lib/snapcrawl/page.rb', line 11

# Reader for the page's address.
#
# @return [Object] whatever is currently stored in @url (the constructor
#   stores the result of +url.protocolize+ there)
def url
  @url
end

Instance Method Details



32
33
34
35
36
# File 'lib/snapcrawl/page.rb', line 32

# Collects the links found in this page's HTML.
#
# The response body is parsed with Nokogiri, every +a+ element is selected,
# and the selection is handed to +normalize_links+ (defined outside this
# listing), whose result is returned as-is.
#
# @return [Object, nil] the result of +normalize_links+, or nil when the
#   page is not valid
def links
  return unless valid?

  anchors = Nokogiri::HTML(http_response.body).css('a')
  normalize_links(anchors)
end

#pages ⇒ Object



38
39
40
41
# File 'lib/snapcrawl/page.rb', line 38

# Wraps every link of this page in a new Page one level deeper.
#
# @return [Array<Page>, nil] one Page per entry of +links+, each created
#   with +depth + 1+; nil when the page is not valid
def pages
  return unless valid?

  links.map do |href|
    Page.new(href, depth: depth + 1)
  end
end

#path ⇒ Object



28
29
30
# File 'lib/snapcrawl/page.rb', line 28

# Request URI of this page's URL, as reported by Addressable.
#
# The value is computed on first use and cached in @path; later calls reuse
# the cached value as long as it is truthy.
#
# @return [Object] +Addressable::URI.parse(url).request_uri+
def path
  return @path if @path

  @path = Addressable::URI.parse(url).request_uri
end

#save_screenshot(outfile) ⇒ Object



43
44
45
46
# File 'lib/snapcrawl/page.rb', line 43

# Captures a screenshot of this page into the given file.
#
# @param outfile [#to_s] destination; it is interpolated into a fresh String
#   before being passed to +Screenshot#save+
# @return [Object, false] whatever +Screenshot#save+ returns, or false when
#   the page is not valid (no screenshot is attempted in that case)
def save_screenshot(outfile)
  if valid?
    Screenshot.new(url).save("#{outfile}")
  else
    false
  end
end

#site ⇒ Object



24
25
26
# File 'lib/snapcrawl/page.rb', line 24

# Site portion of this page's URL, as reported by Addressable.
#
# The value is computed on first use and cached in @site; later calls reuse
# the cached value as long as it is truthy.
#
# @return [Object] +Addressable::URI.parse(url).site+
def site
  return @site if @site

  @site = Addressable::URI.parse(url).site
end

#valid? ⇒ Boolean



20
21
22
# File 'lib/snapcrawl/page.rb', line 20

# Reports whether the page was fetched successfully.
#
# Always answers with a strict boolean: a missing response (nil) yields
# false instead of leaking nil out of a predicate method, so the result
# matches the documented Boolean return type. Truthiness is unchanged, so
# callers that only branch on the result behave as before.
#
# @return [Boolean] true only when +http_response+ is present and its
#   +success?+ is truthy
def valid?
  !!http_response&.success?
end