Class: Grell::Page

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Defined in:
lib/grell/page.rb

Overview

This class contains the logic for working with each page we crawl. It is also the interface we use to access the information of each page. This information comes from the private result classes below.

Defined Under Namespace

Classes: ErroredPage, UnvisitedPage, VisitedPage

Constant Summary collapse

WAIT_TIME =
10
WAIT_INTERVAL =
0.5

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, id, parent_id) ⇒ Page

Returns a new instance of Page.



18
19
20
21
22
23
24
25
26
# File 'lib/grell/page.rb', line 18

# Builds a page that has not been visited yet.
#
# @param url [String] address this page will be asked to navigate to
# @param id [Object] identifier of this page
# @param parent_id [Object] identifier of the page this one was reached from
def initialize(url, id, parent_id)
  # Identity of the page, exactly as handed in by the caller.
  @url, @id, @parent_id = url, id, parent_id

  # Nothing has been fetched so far: no visit time, no visits, placeholder result.
  @rawpage = RawPage.new
  @result_page = UnvisitedPage.new
  @times_visited = 0
  @timestamp = nil
end

Instance Attribute Details

#idObject (readonly)

Returns the value of attribute id.



13
14
15
# File 'lib/grell/page.rb', line 13

# Identifier of this page, as passed to the constructor.
#
# @return [Object] the value stored in @id
def id
  @id
end

#parent_idObject (readonly)

Returns the value of attribute parent_id.



13
14
15
# File 'lib/grell/page.rb', line 13

# Parent identifier of this page, as passed to the constructor.
#
# @return [Object] the value stored in @parent_id
def parent_id
  @parent_id
end

#rawpageObject (readonly)

Returns the value of attribute rawpage.



13
14
15
# File 'lib/grell/page.rb', line 13

# The RawPage instance created in the constructor; #navigate and #current_url
# operate on it.
#
# @return [RawPage]
def rawpage
  @rawpage
end

#timestampObject (readonly)

Returns the value of attribute timestamp.



13
14
15
# File 'lib/grell/page.rb', line 13

# Time of the last navigation outcome. It is nil until the page is processed,
# and is set to Time.now by #navigate on success and by #unavailable_page on failure.
#
# @return [Time, nil]
def timestamp
  @timestamp
end

#urlObject (readonly)

Returns the value of attribute url.



13
14
15
# File 'lib/grell/page.rb', line 13

# The URL this page was created with. #current_url may differ from it
# (see #followed_redirects?).
#
# @return [String] the value stored in @url
def url
  @url
end

Instance Method Details

#current_urlObject

The current URL. This may differ from the URL we asked for if there was a redirect.



49
50
51
# File 'lib/grell/page.rb', line 49

# The current URL as reported by the underlying raw page. This may be different
# from the URL we asked for if there was some redirect.
#
# @return [Object] whatever @rawpage.current_url returns (presumably a String — confirm against RawPage)
def current_url
  @rawpage.current_url
end

#error?Boolean

True if the page responded with an error

Returns:

  • (Boolean)


59
60
61
# File 'lib/grell/page.rb', line 59

# True if the page responded with a client (4xx) or server (5xx) error status.
#
# The status is converted to a String first, so a nil status yields false.
#
# @return [Boolean]
def error?
  # Anchored with \A and \z so only a complete three-digit code starting with
  # 4 or 5 matches; `[45]` is a plain character class with no stray "|" member.
  !!(status.to_s =~ /\A[45]\d\d\z/)
end

#followed_redirects?Boolean

True if we followed a redirect to get the current contents

Returns:

  • (Boolean)


54
55
56
# File 'lib/grell/page.rb', line 54

# True if we followed a redirect to get the current contents.
# Decided by comparing #current_url with the URL this page was created with;
# any difference is treated as a redirect.
#
# @return [Boolean]
def followed_redirects?
  current_url != @url
end


28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/grell/page.rb', line 28

# Navigates the underlying raw page to #url and records the outcome.
#
# On success the result becomes a VisitedPage wrapping the raw page and the
# timestamp is refreshed. If one of the listed browser/network/URI errors is
# raised, the page is marked as unavailable with status 404 via #unavailable_page.
#
# Every attempt, successful or not, increments the visit counter, so #retries
# also reflects attempts that ended in an error.
#
# @return [Object] not meaningful; callers should not rely on the return value
def navigate
  # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to workaround inconsistencies on poltergeist
  Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do
    # Ready once there is a status, headers are present and the Content-Type mentions HTML.
    # `.equal?(true)` makes the block yield a strict true/false.
    @rawpage.status && !@rawpage.headers.empty? &&
      @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true)
  end
  @result_page = VisitedPage.new(@rawpage)
  @timestamp = Time.now
rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
       Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
       Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
  unavailable_page(404, e)
ensure
  # Count the attempt even when it failed; otherwise a page that errored and
  # was navigated again would never show up as retried.
  @times_visited += 1
end

#pathObject

Extracts the path (e.g. /actions/test_action) from the URL



64
65
66
67
68
# File 'lib/grell/page.rb', line 64

# Extracts the path component (e.g. /actions/test_action) of the URL this page
# was created with.
#
# @return [String] the path, or the untouched URL when it cannot be parsed
def path
  begin
    parsed_url = URI.parse(@url)
    parsed_url.path
  rescue URI::InvalidURIError
    # Invalid URLs are allowed in here; they get caught later, when we try to navigate to them.
    @url
  end
end

#retriesObject

Number of times we have retried the current page



44
45
46
# File 'lib/grell/page.rb', line 44

# Number of times we have retried the current page: every counted visit after
# the first one, never less than zero.
#
# @return [Integer]
def retries
  visits_after_first = @times_visited - 1
  visits_after_first < 0 ? 0 : visits_after_first
end

#unavailable_page(status, exception) ⇒ Object



70
71
72
73
74
# File 'lib/grell/page.rb', line 70

# Marks this page as unavailable: logs a warning, stores an ErroredPage built
# from the given status and exception as the result, and refreshes the timestamp.
#
# @param status [Integer] status code to report for the page
# @param exception [Exception] the error that made the page unavailable
# @return [Time] the new timestamp
def unavailable_page(status, exception)
  warning = "The page with the URL #{@url} was not available. Exception #{exception}"
  Grell.logger.warn(warning)

  @result_page = ErroredPage.new(status, exception)
  @timestamp = Time.now
end