Class: Snapcrawl::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/snapcrawl/page.rb

Constant Summary collapse

# Pipe-separated list of file extensions.
# NOTE(review): presumably used to skip links to non-HTML assets when
# normalizing links — the consumer is not visible here, confirm.
# Frozen so the shared constant cannot be mutated in place.
EXTENSION_BLACKLIST = 'png|gif|jpg|pdf|zip'.freeze

# Pipe-separated list of URL schemes.
# NOTE(review): presumably used to skip non-HTTP links (mail, phone) —
# the consumer is not visible here, confirm.
# Frozen so the shared constant cannot be mutated in place.
PROTOCOL_BLACKLIST = 'mailto|tel'.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, depth: 0) ⇒ Page

Returns a new instance of Page.



16
17
18
19
# File 'lib/snapcrawl/page.rb', line 16

# Builds a page for the given URL.
#
# @param url [#protocolize] address of the page; stored as the result of
#   calling +protocolize+ on it (helper defined elsewhere — presumably it
#   ensures the URL carries a scheme, confirm)
# @param depth [Integer] value exposed through {#depth}; defaults to 0
def initialize(url, depth: 0)
  normalized_url = url.protocolize
  @url = normalized_url
  @depth = depth
end

Instance Attribute Details

#depth ⇒ Object (readonly)

Returns the value of attribute depth.



11
12
13
# File 'lib/snapcrawl/page.rb', line 11

# Read-only accessor for the depth given to the constructor
# (0 when none was passed).
# NOTE(review): the docs report this reader and #url at the same source
# line, so it is likely generated by attr_reader — confirm.
#
# @return [Object] the stored @depth value
def depth
  @depth
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



11
12
13
# File 'lib/snapcrawl/page.rb', line 11

# Read-only accessor for the page URL, i.e. the value the constructor
# stored after calling +protocolize+ on its +url+ argument.
# NOTE(review): the docs report this reader and #depth at the same source
# line, so it is likely generated by attr_reader — confirm.
#
# @return [Object] the stored @url value
def url
  @url
end

Instance Method Details

#links ⇒ Object



33
34
35
36
37
38
# File 'lib/snapcrawl/page.rb', line 33

# Collects the links found on this page.
#
# Parses the HTTP response body as HTML, selects every +a+ element and
# hands the node set to +normalize_links+ (defined elsewhere).
#
# @return [Object, nil] whatever +normalize_links+ returns, or nil when
#   the page is not valid
def links
  return nil unless valid?

  anchors = Nokogiri::HTML(http_response.body).css('a')
  normalize_links(anchors)
end

#pages ⇒ Object



40
41
42
43
44
# File 'lib/snapcrawl/page.rb', line 40

# Wraps every link of this page in a new Page one level deeper.
#
# @return [Array<Page>, nil] one Page per entry of {#links}, each created
#   with +depth + 1+; nil when the page is not valid
def pages
  return nil unless valid?

  child_depth = depth + 1
  links.map { |href| Page.new(href, depth: child_depth) }
end

#path ⇒ Object



29
30
31
# File 'lib/snapcrawl/page.rb', line 29

# The +request_uri+ of this page's URL, as reported by Addressable::URI.
# The value is computed on first use and cached in @path.
#
# @return [Object] the +request_uri+ of the parsed URL
def path
  return @path if @path

  @path = Addressable::URI.parse(url).request_uri
end

#save_screenshot(outfile) ⇒ Object



46
47
48
49
50
# File 'lib/snapcrawl/page.rb', line 46

# Captures a screenshot of this page into +outfile+.
#
# Delegates to +Screenshot.new(url).save+ (defined elsewhere) and is a
# no-op for pages that are not valid.
#
# @param outfile [Object] destination passed through to +Screenshot#save+
# @return [Object, false] the result of +Screenshot#save+, or false when
#   the page is not valid
def save_screenshot(outfile)
  if valid?
    Screenshot.new(url).save(outfile)
  else
    false
  end
end

#site ⇒ Object



25
26
27
# File 'lib/snapcrawl/page.rb', line 25

# The +site+ of this page's URL, as reported by Addressable::URI.
# The value is computed on first use and cached in @site.
#
# @return [Object] the +site+ of the parsed URL
def site
  unless @site
    parsed = Addressable::URI.parse(url)
    @site = parsed.site
  end
  @site
end

#valid? ⇒ Boolean

Returns:

  • (Boolean)


21
22
23
# File 'lib/snapcrawl/page.rb', line 21

# Whether the page was fetched successfully.
#
# Asks +http_response+ (defined elsewhere) for +success?+. The safe
# navigation operator makes this return nil, rather than raise, when no
# response is available.
#
# @return [Boolean, nil] the response's +success?+ value, or nil when
#   +http_response+ is nil
def valid?
  response = http_response
  response&.success?
end