Class: Grell::Page

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Defined in:
lib/grell/page.rb

Overview

This class contains the logic for working with each page we crawl. It is also the interface we use to access the information of each page. This information comes from the private result classes below.

Defined Under Namespace

Classes: ErroredPage, UnvisitedPage, VisitedPage

Constant Summary collapse

WAIT_TIME =
10
WAIT_INTERVAL =
0.5

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, id, parent_id) ⇒ Page

Returns a new instance of Page.



18
19
20
21
22
23
24
25
26
# File 'lib/grell/page.rb', line 18

# Sets up a page that has not been visited yet: a fresh RawPage, no
# timestamp, zero visits and an UnvisitedPage as the current result.
#
# @param url the address this page refers to
# @param id identifier assigned to this page
# @param parent_id identifier associated with this page's parent
def initialize(url, id, parent_id)
  @rawpage = RawPage.new
  # Plain attributes handed in by the caller.
  @url, @id, @parent_id = url, id, parent_id
  # Visit bookkeeping starts out empty.
  @timestamp = nil
  @times_visited = 0
  @result_page = UnvisitedPage.new
end

Instance Attribute Details

#idObject (readonly)

Returns the value of attribute id.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for @id, the identifier handed to the constructor.
def id
  @id
end

#parent_idObject (readonly)

Returns the value of attribute parent_id.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for @parent_id, the parent identifier handed to the constructor.
def parent_id
  @parent_id
end

#rawpageObject (readonly)

Returns the value of attribute rawpage.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for @rawpage, the RawPage instance created in the constructor.
def rawpage
  @rawpage
end

#timestampObject (readonly)

Returns the value of attribute timestamp.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for @timestamp: nil on a new page, then set to Time.now by
# navigate (on success) and by unavailable_page.
def timestamp
  @timestamp
end

#urlObject (readonly)

Returns the value of attribute url.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for @url, the URL handed to the constructor.
def url
  @url
end

Instance Method Details

#current_urlObject

The current URL; this may differ from the URL we asked for if there was a redirect.



50
51
52
# File 'lib/grell/page.rb', line 50

# Asks the underlying raw page for the URL it is currently on.
def current_url
  @rawpage.current_url
end

#error?Boolean

True if the page responded with an error

Returns:

  • (Boolean)


60
61
62
# File 'lib/grell/page.rb', line 60

# True if the page responded with an error, i.e. its status is a
# three-digit 4xx or 5xx code.
#
# @return [Boolean]
def error?
  # The match is anchored at both ends and the character class holds only
  # "4" and "5", so values such as 1404 or "|00" are not treated as errors.
  !!(status.to_s =~ /\A[45]\d\d\z/)
end

#followed_redirects?Boolean

True if we followed a redirect to get the current contents

Returns:

  • (Boolean)


55
56
57
# File 'lib/grell/page.rb', line 55

# Tells whether the page we ended up on differs from the URL this page
# was created with.
#
# @return [Boolean]
def followed_redirects?
  landed_on = current_url
  landed_on != @url
end


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/grell/page.rb', line 28

# Navigates the raw page to the URL and records the outcome.
#
# On success @result_page becomes a VisitedPage wrapping the raw page. If one
# of the rescued browser, network or URI errors is raised, the page is
# recorded through unavailable_page with status 404. Both paths set
# @timestamp, and @times_visited is incremented whether or not the
# navigation succeeded.
def navigate
  # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to work around inconsistencies on poltergeist
  # NOTE(review): presumably Reader.wait_for re-runs the lambda every WAIT_INTERVAL seconds until the block is truthy - confirm against Reader.
  Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do
    # Ready once there is a status, the headers are non-empty and the
    # Content-Type header includes 'text/html'.
    @rawpage.status && !@rawpage.headers.empty? &&
      @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true)
  end
  @result_page = VisitedPage.new(@rawpage)
  @timestamp = Time.now
rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
       Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
       Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
  # Every one of these failures is reported as a 404 errored page.
  unavailable_page(404, e)
ensure
  # Count the attempt even when it failed; retries is derived from this counter.
  @times_visited += 1
end

#pathObject

Extracts the path (e.g. /actions/test_action) from the URL



65
66
67
68
69
# File 'lib/grell/page.rb', line 65

# Path component of the page URL (e.g. /actions/test_action).
#
# When the URL cannot be parsed the raw URL is returned unchanged; such
# URLs are allowed in and only fail later, when we try to navigate to them.
def path
  begin
    parsed = URI.parse(@url)
    parsed.path
  rescue URI::InvalidURIError
    @url
  end
end

#retriesObject

Number of times we have retried the current page



45
46
47
# File 'lib/grell/page.rb', line 45

# Number of times the current page has been retried: every visit after
# the first one counts, and the value never drops below zero.
def retries
  extra_visits = @times_visited - 1
  extra_visits < 0 ? 0 : extra_visits
end

#unavailable_page(status, exception) ⇒ Object



71
72
73
74
75
# File 'lib/grell/page.rb', line 71

# Records that the page could not be fetched: logs a warning, stores an
# ErroredPage built from the given status and exception as the result, and
# stamps the time of the failure.
#
# @param status the status to report for the failed page
# @param exception the error that made the page unavailable
def unavailable_page(status, exception)
  warning = "The page with the URL #{@url} was not available. Exception #{exception}"
  Grell.logger.warn(warning)
  @result_page = ErroredPage.new(status, exception)
  @timestamp = Time.now
end