Class: Anemone::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/anemone/page.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, params = {}) ⇒ Page

Create a new page



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/anemone/page.rb', line 38

# Create a new page.
#
# url    - the URL of the page; stored as given.
# params - optional Hash of fetch results. Keys read here: :code, :headers,
#          :aka, :referer, :depth, :redirect_to, :response_time, :body, :error.
def initialize(url, params = {})
  @url  = url
  @data = OpenStruct.new

  # Response payload and bookkeeping copied straight from params.
  @code          = params[:code]
  @body          = params[:body]
  @error         = params[:error]
  @response_time = params[:response_time]
  @referer       = params[:referer]
  @depth         = params[:depth] || 0
  @aliases       = Array(params[:aka]).compact

  # Headers fall back to an empty Hash and always end up with a
  # 'content-type' entry, so content_type can call #first on it.
  @headers = params[:headers] || {}
  @headers['content-type'] = [''] unless @headers['content-type']

  # Needs @url to be set already: to_absolute resolves against it.
  @redirect_to = to_absolute(params[:redirect_to])

  # A page counts as fetched whenever a response code was supplied.
  @fetched = !@code.nil?
end

Instance Attribute Details

#body ⇒ Object (readonly)

HTML body



11
12
13
# File 'lib/anemone/page.rb', line 11

# HTML body of the page.
# Set from params[:body] in the constructor and reset to nil by discard_doc!.
def body
  @body
end

#code ⇒ Object

Integer response code of the page



24
25
26
# File 'lib/anemone/page.rb', line 24

# Integer response code of the page, taken from params[:code] in the
# constructor; nil when no code was supplied.
def code
  @code
end

#data ⇒ Object

OpenStruct for user-stored data



22
23
24
# File 'lib/anemone/page.rb', line 22

# OpenStruct for user-stored data; the constructor creates a fresh, empty one.
def data
  @data
end

#depth ⇒ Object

Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.



29
30
31
# File 'lib/anemone/page.rb', line 29

# Depth of this page from the root of the crawl. The constructor takes it
# from params[:depth] and falls back to 0 when that is missing.
def depth
  @depth
end

#error ⇒ Object (readonly)

Exception object, if one was raised during HTTP#fetch_page



17
18
19
# File 'lib/anemone/page.rb', line 17

# Exception object recorded for this page, if any: the constructor stores
# params[:error] here, so it is nil when no error was passed in.
def error
  @error
end

#headers ⇒ Object (readonly)

Headers of the HTTP response



13
14
15
# File 'lib/anemone/page.rb', line 13

# Headers of the HTTP response, as passed in params[:headers] (an empty Hash
# when none were given). The constructor guarantees a 'content-type' entry,
# defaulting it to [''].
def headers
  @headers
end

#redirect_to ⇒ Object (readonly)

URL of the page this one redirected to, if any



15
16
17
# File 'lib/anemone/page.rb', line 15

# URL of the page this one redirected to, if any: params[:redirect_to] run
# through to_absolute in the constructor, or nil when that param was nil.
def redirect_to
  @redirect_to
end

#referer ⇒ Object

URL of the page that brought us to this page



31
32
33
# File 'lib/anemone/page.rb', line 31

# URL of the page that brought us to this page (params[:referer] in the
# constructor; nil when not supplied).
def referer
  @referer
end

#response_time ⇒ Object

Response time of the request for this page in milliseconds



33
34
35
# File 'lib/anemone/page.rb', line 33

# Response time of the request for this page, stored unchanged from
# params[:response_time]; documented for this attribute as milliseconds.
def response_time
  @response_time
end

#url ⇒ Object (readonly)

The URL of the page



9
10
11
# File 'lib/anemone/page.rb', line 9

# The URL of the page, exactly as passed to the constructor.
# in_domain? and to_absolute call #host and #merge on it, so it is presumably
# a URI object — confirm against callers.
def url
  @url
end

#visited ⇒ Object

Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!



26
27
28
# File 'lib/anemone/page.rb', line 26

# Flag indicating whether this page has been visited in
# PageStore#shortest_paths!. The constructor does not assign it, so it is nil
# on a freshly built page.
def visited
  @visited
end

Instance Method Details

#content_type ⇒ Object

The content-type returned by the HTTP request for this page



108
109
110
# File 'lib/anemone/page.rb', line 108

# The content-type returned by the HTTP request for this page: the first
# element of the 'content-type' header entry.
def content_type
  type_values = headers['content-type']
  type_values.first
end

#cookies ⇒ Object

Array of cookies received with this page as WEBrick::Cookie objects.



101
102
103
# File 'lib/anemone/page.rb', line 101

# Array of cookies received with this page as WEBrick::Cookie objects.
# Best-effort: any StandardError raised while parsing yields an empty Array.
#
# NOTE(review): the constructor defaults the 'content-type' header with a
# lowercase key and an Array value, while this reads 'Set-Cookie' — confirm
# the key casing and value shape the fetcher actually stores.
def cookies
  WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie'])
rescue StandardError
  []
end

#discard_doc! ⇒ Object

Delete the Nokogiri document and response body to conserve memory



85
86
87
88
# File 'lib/anemone/page.rb', line 85

# Delete the Nokogiri document and response body to conserve memory.
# Always returns nil.
def discard_doc!
  # Extract the links first: they are parsed out of the document we are
  # about to drop.
  links

  @body = nil
  @doc = nil
end

#doc ⇒ Object

Nokogiri document for the HTML body



77
78
79
80
# File 'lib/anemone/page.rb', line 77

# Nokogiri document for the HTML body.
#
# The parsed document is memoized in @doc. Returns nil when there is no body,
# when the page is not HTML, or when the check or the parse raises a
# StandardError.
def doc
  return @doc if @doc
  return nil unless @body && html?

  @doc = Nokogiri::HTML(@body)
rescue StandardError
  nil
end

#fetched? ⇒ Boolean

Was the page successfully fetched? true if the page was fetched with no error, false otherwise.

Returns:

  • (Boolean)


94
95
96
# File 'lib/anemone/page.rb', line 94

# Was the page fetched? The constructor sets this to true exactly when it
# received a non-nil :code, and to false otherwise.
def fetched?
  @fetched
end

#html? ⇒ Boolean

Returns true if the page is a HTML document, returns false otherwise.

Returns:

  • (Boolean)


116
117
118
# File 'lib/anemone/page.rb', line 116

# Returns true if the page is a HTML document, returns false otherwise.
#
# A page is HTML when its content type starts with "text/html" or
# "application/xhtml+xml"; parameters such as "; charset=utf-8" may follow.
def html?
  # The "+" must be escaped: unescaped it is a quantifier on the preceding
  # "l", so the real XHTML MIME type would never match. \A anchors at the
  # start of the string rather than at the start of any line.
  !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
end

#in_domain?(uri) ⇒ Boolean

Returns true if uri is in the same domain as the page, returns false otherwise

Returns:

  • (Boolean)


158
159
160
# File 'lib/anemone/page.rb', line 158

# Returns true if uri is in the same domain as the page, returns false
# otherwise. The two hosts are compared for exact equality.
def in_domain?(uri)
  page_host = @url.host
  page_host == uri.host
end

#links ⇒ Object

Array of distinct A tag HREFs from the page



59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/anemone/page.rb', line 59

# Array of distinct A tag HREFs from the page.
#
# Each href is made absolute with to_absolute and kept only when in_domain?
# accepts it. The result is memoized in @links, so it remains available after
# the document has been discarded. Returns an empty Array when there is no
# parsed document.
def links
  return @links unless @links.nil?

  @links = []
  return @links unless doc

  doc.css('a').each do |a|
    href_attr = a.attributes['href']
    href = href_attr && href_attr.content
    next if href.nil? || href.empty?

    begin
      # Pass the raw href through: to_absolute escapes it before parsing.
      # Parsing it here first would raise on hrefs that merely need escaping
      # (for example ones containing a space) and silently drop them.
      abs = to_absolute(href)
    rescue StandardError
      # Best-effort crawl: skip hrefs that still cannot be turned into a URI.
      next
    end

    @links << abs if in_domain?(abs)
  end
  @links.uniq!
  @links
end

#marshal_dump ⇒ Object



162
163
164
# File 'lib/anemone/page.rb', line 162

# Marshal support: returns the page state as a positional Array.
#
# The order must stay in sync with marshal_load. @doc, @aliases and @error
# are not part of the dump.
def marshal_dump
  [
    :@url, :@headers, :@data, :@body, :@links, :@code,
    :@visited, :@depth, :@referer, :@redirect_to, :@response_time, :@fetched
  ].map { |ivar| instance_variable_get(ivar) }
end

#marshal_load(ary) ⇒ Object



166
167
168
# File 'lib/anemone/page.rb', line 166

# Marshal support: restores the page state from the positional Array produced
# by marshal_dump. Positions missing from ary are restored as nil.
def marshal_load(ary)
  slots = [
    :@url, :@headers, :@data, :@body, :@links, :@code,
    :@visited, :@depth, :@referer, :@redirect_to, :@response_time, :@fetched
  ]
  slots.each_with_index do |ivar, position|
    instance_variable_set(ivar, ary[position])
  end
  ary
end

#not_found? ⇒ Boolean

Returns true if the page was not found (returned 404 code), returns false otherwise.

Returns:

  • (Boolean)


132
133
134
# File 'lib/anemone/page.rb', line 132

# Returns true if the page was not found (returned 404 code), returns false
# otherwise.
def not_found?
  @code == 404
end

#redirect? ⇒ Boolean

Returns true if the page is a HTTP redirect, returns false otherwise.

Returns:

  • (Boolean)


124
125
126
# File 'lib/anemone/page.rb', line 124

# Returns true if the page is a HTTP redirect (response code 300 through
# 399), returns false otherwise.
def redirect?
  redirect_codes = 300..399
  redirect_codes.cover?(@code)
end

#to_absolute(link) ⇒ Object

Converts relative URL link into an absolute URL based on the location of the page



140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/anemone/page.rb', line 140

# Converts relative URL link into an absolute URL based on the location of
# the page.
#
# link - a String or URI (anything whose #to_s is the link text), or nil.
#
# Returns the absolute URI, with an empty path normalised to "/", or nil when
# link is nil. Raises whatever URI() or URI#merge raise for links that are
# still invalid after escaping.
def to_absolute(link)
  return nil if link.nil?

  # Remove a trailing anchor, then percent-escape characters that are not
  # legal in a URI. URI.encode is obsolete and was removed in Ruby 3.0;
  # URI::DEFAULT_PARSER.escape is the equivalent call that still exists.
  # NOTE(review): "%" itself appears to be escaped too, so links that are
  # already percent-encoded get encoded twice — confirm whether that is wanted.
  link = URI::DEFAULT_PARSER.escape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, ''))

  relative = URI(link)
  absolute = @url.merge(relative)

  absolute.path = '/' if absolute.path.empty?

  absolute
end