Class: Grell::Page

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Defined in:
lib/grell/page.rb

Overview

This class contains the logic for working with each page we crawl. It is also the interface we use to access the information of each page. This information comes from the private result classes listed below.

Defined Under Namespace

Classes: ErroredPage, UnvisitedPage, VisitedPage

Constant Summary collapse

WAIT_TIME =
10
WAIT_INTERVAL =
0.5

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, id, parent_id) ⇒ Page

Returns a new instance of Page.



18
19
20
21
22
23
24
25
26
# File 'lib/grell/page.rb', line 18

# Sets up a page that has not been visited yet: zero visits, no timestamp,
# a fresh RawPage and an UnvisitedPage as the current result.
#
# @param url [Object] address of the page (presumably a String; it is handed to URI.parse in #path)
# @param id [Object] identifier stored for this page
# @param parent_id [Object] identifier stored as the parent (presumably the page that linked here — confirm with caller)
def initialize(url, id, parent_id)
  @rawpage = RawPage.new
  @times_visited = 0
  @timestamp = nil
  @parent_id = parent_id
  @id = id
  @url = url
  @result_page = UnvisitedPage.new
end

Instance Attribute Details

#idObject (readonly)

Returns the value of attribute id.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for the identifier passed to the constructor.
#
# @return [Object] the current value of @id
def id; @id; end

#parent_idObject (readonly)

Returns the value of attribute parent_id.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for the parent identifier passed to the constructor.
#
# @return [Object] the current value of @parent_id
def parent_id; @parent_id; end

#rawpageObject (readonly)

Returns the value of attribute rawpage.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for the RawPage instance created in the constructor.
#
# @return [Object] the current value of @rawpage
def rawpage; @rawpage; end

#timestampObject (readonly)

Returns the value of attribute timestamp.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for the time of the last recorded outcome. The constructor sets it to
# nil; #navigate and #unavailable_page assign Time.now.
#
# @return [Time, nil] the current value of @timestamp
def timestamp; @timestamp; end

#urlObject (readonly)

Returns the value of attribute url.



13
14
15
# File 'lib/grell/page.rb', line 13

# Reader for the URL passed to the constructor (the address we asked for,
# as opposed to #current_url).
#
# @return [Object] the current value of @url
def url; @url; end

Instance Method Details

#current_urlObject

The current URL. This may differ from the URL we asked for if there was a redirect.



51
52
53
# File 'lib/grell/page.rb', line 51

# Asks the underlying raw page for the URL it is currently on.
#
# @return [Object] whatever @rawpage.current_url returns
def current_url; @rawpage.current_url; end

#error?Boolean

True if the page responded with an error

Returns:

  • (Boolean)


61
62
63
# File 'lib/grell/page.rb', line 61

# Tells whether the page's status is an HTTP client (4xx) or server (5xx) error.
#
# The status is converted with #to_s, so a nil status yields false.
# The pattern is anchored and uses the character class [45]: the previous
# /[4|5]\d\d/ also accepted a literal "|" as the first character and matched
# anywhere inside a longer string.
#
# @return [Boolean] true only when the status is exactly three digits starting with 4 or 5
def error?
  !!(status.to_s =~ /\A[45]\d\d\z/)
end

#followed_redirects?Boolean

True if we followed a redirect to get the current contents

Returns:

  • (Boolean)


56
57
58
# File 'lib/grell/page.rb', line 56

# Compares where we ended up with the URL we originally asked for.
#
# @return [Boolean] true when #current_url differs from @url
def followed_redirects?
  landed_on = current_url
  landed_on != @url
end


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/grell/page.rb', line 28

# Visits the page's URL through @rawpage and records the outcome.
#
# On success @result_page becomes a VisitedPage wrapping @rawpage and
# @timestamp is set to the current time. If one of the rescued Poltergeist,
# connection-reset or invalid-URI errors is raised, the page is handed to
# #unavailable_page with status 404. Any other exception propagates.
# @times_visited is incremented in every case (ensure clause).
def navigate
  # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to work around inconsistencies in poltergeist.
  # NOTE(review): presumably Reader.wait_for runs the lambda and then re-checks the block every
  # WAIT_INTERVAL seconds until it is truthy or WAIT_TIME elapses — confirm in Reader.
  Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do
    # Truthy only when a status is present, the headers are not empty and the
    # Content-Type header contains 'text/html'. equal?(true) forces the last term
    # to be the literal true object rather than any truthy value.
    @rawpage.status && !@rawpage.headers.empty? &&
      @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true)
  end
  @rawpage.wait_for_all_ajax_requests(WAIT_TIME, WAIT_INTERVAL)
  @result_page = VisitedPage.new(@rawpage)
  @timestamp = Time.now
rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
       Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
       Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
  # Every rescued failure is reported with the same 404 status, whatever its cause.
  unavailable_page(404, e)
ensure
  # Counts the attempt whether it succeeded, was rescued or raised something else.
  @times_visited += 1
end

#pathObject

Extracts the path (e.g. /actions/test_action) from the URL



66
67
68
69
70
# File 'lib/grell/page.rb', line 66

# Returns the path component of @url (e.g. /actions/test_action).
#
# @return [Object] the parsed path, or @url itself when it cannot be parsed
def path
  begin
    parsed = URI.parse(@url)
    parsed.path
  rescue URI::InvalidURIError
    # Invalid URLs are accepted here; they are caught later, when we try to navigate to them.
    @url
  end
end

#retriesObject

Number of times we have retried the current page



46
47
48
# File 'lib/grell/page.rb', line 46

# Number of visits beyond the first one, never below zero.
#
# @return [Integer] @times_visited minus one, floored at 0
def retries
  extra_visits = @times_visited - 1
  extra_visits > 0 ? extra_visits : 0
end

#unavailable_page(status, exception) ⇒ Object



72
73
74
75
76
# File 'lib/grell/page.rb', line 72

# Marks the page as unavailable: logs a warning, stores an ErroredPage built
# from the given status and exception, and stamps the current time.
#
# @param status [Object] status handed to ErroredPage (#navigate passes 404)
# @param exception [Object] the error that made the page unavailable; interpolated into the log line
# @return [Time] the timestamp that was just assigned
def unavailable_page(status, exception)
  warning = "The page with the URL #{@url} was not available. Exception #{exception}"
  Grell.logger.warn(warning)
  @result_page = ErroredPage.new(status, exception)
  @timestamp = Time.now
end