Class: Spidr::Page

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/spidr/page.rb,
lib/spidr/page/html.rb,
lib/spidr/page/cookies.rb,
lib/spidr/page/status_codes.rb,
lib/spidr/page/content_types.rb

Overview

Represents a requested page from a website.

Constant Summary collapse

RESERVED_COOKIE_NAMES = /^(?:Path|Expires|Domain|Secure|HTTPOnly)$/i

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, response) ⇒ Page

Creates a new Page object.


25
26
27
28
29
30
# File 'lib/spidr/page.rb', line 25

#
# Creates a new Page object.
#
# @param [Object] url
#   The URL of the page; stored unchanged.
#
# @param [#to_hash] response
#   The response for the page. Its `to_hash` result is kept as the headers.
#
def initialize(url,response)
  @url, @response = url, response

  # snapshot of the response headers, exactly as response.to_hash returns them
  @headers = response.to_hash

  # no parsed document yet
  @doc = nil
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *arguments, &block) ⇒ String (protected)

Provides transparent access to the values in #headers.

Raises:

  • (NoMethodError)

    The missing method did not map to a header in #headers.


134
135
136
137
138
139
140
141
142
143
144
# File 'lib/spidr/page.rb', line 134

#
# Provides transparent access to the values in the response headers.
#
# A call made with no arguments and no block has its name converted to a
# header name by replacing every `_` with `-` (`last_modified` becomes
# `last-modified`). If the response has that key, its value is returned.
# Every other call is passed on to `super`.
#
# @param [Symbol] name
#   The name of the missing method.
#
# @param [Array] arguments
#   Arguments given to the missing method.
#
# @return [String]
#   The value of the matching header.
#
# @raise [NoMethodError]
#   The missing method did not map to a header.
#
def method_missing(name,*arguments,&block)
  plain_call = arguments.empty? && block.nil?

  if plain_call
    header = name.to_s.tr('_','-')

    return @response[header] if @response.key?(header)
  end

  super(name,*arguments,&block)
end

Instance Attribute Details

#headersObject (readonly)

Headers returned with the body


14
15
16
# File 'lib/spidr/page.rb', line 14

# Headers returned with the body.
#
# @return [Object] whatever is stored in @headers
def headers
  @headers
end

#responseObject (readonly)

HTTP Response


11
12
13
# File 'lib/spidr/page.rb', line 11

# The HTTP response this page wraps.
#
# @return [Object] whatever is stored in @response
def response
  @response
end

#urlObject (readonly)

URL of the page


8
9
10
# File 'lib/spidr/page.rb', line 8

# URL of the page.
#
# @return [Object] whatever is stored in @url
def url
  @url
end

Instance Method Details

#at(*arguments) ⇒ Nokogiri::HTML::Node, ... Also known as: %

Searches for the first occurrence of an XPath or CSS Path expression.


108
109
110
111
112
# File 'lib/spidr/page.rb', line 108

#
# Searches for the first occurrence of an XPath or CSS Path expression.
#
# @param [Array] arguments
#   Passed straight through to the parsed document's `at`.
#
# @return [Object, nil]
#   The result of `doc.at`, or `nil` when there is no parsed document.
#
def at(*arguments)
  return unless doc

  doc.at(*arguments)
end

#atom?Boolean

Determines if the page is an Atom feed.


191
192
193
# File 'lib/spidr/page/content_types.rb', line 191

# Determines if the page is an Atom feed, i.e. whether
# is_content_type? accepts 'application/atom+xml'.
#
# @return [Boolean]
def atom?
  is_content_type?('application/atom+xml')
end

#bad_request?Boolean

Determines if the response code is 400.


41
42
43
# File 'lib/spidr/page/status_codes.rb', line 41

# Determines if the response code is 400.
#
# @return [Boolean]
def bad_request?
  code == 400
end

#bodyString Also known as: to_s

The body of the response.


38
39
40
# File 'lib/spidr/page.rb', line 38

#
# The body of the response.
#
# @return [String]
#   The response body, or an empty String when the response has none.
#
def body
  content = response.body

  content || ''
end

#codeInteger

The response code from the page.


9
10
11
# File 'lib/spidr/page/status_codes.rb', line 9

# The response code from the page, converted with to_i.
#
# @return [Integer]
def code
  @response.code.to_i
end

#content_charsetString?

The charset included in the Content-Type.

Since:

  • 0.4.0


33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/spidr/page/content_types.rb', line 33

#
# The charset included in the Content-Type.
#
# Every Content-Type value containing a `;` is split on `;`, and the first
# parameter that starts with `charset=` supplies the result.
#
# @return [String, nil]
#   The text after `charset=`, or `nil` if no Content-Type declares one.
#
def content_charset
  content_types.each do |header|
    next unless header.include?(';')

    header.split(';').each do |segment|
      segment.strip!

      return segment.split('=',2).last if segment.start_with?('charset=')
    end
  end

  nil
end

#content_typeString

The Content-Type of the page.


9
10
11
# File 'lib/spidr/page/content_types.rb', line 9

# The Content-Type of the page.
#
# @return [String]
#   The 'Content-Type' value of the response, or an empty String when the
#   lookup yields nothing.
def content_type
  @response['Content-Type'] || ''
end

#content_typesArray<String>

The content types of the page.

Since:

  • 0.2.2


21
22
23
# File 'lib/spidr/page/content_types.rb', line 21

# The content types of the page.
#
# @return [Array<String>]
#   The 'content-type' fields of the response, or an empty Array when
#   get_fields returns nothing.
def content_types
  @response.get_fields('content-type') || []
end

#cookie ⇒ String

The raw Cookie String sent along with the page.

Since:

  • 0.2.7


16
17
18
# File 'lib/spidr/page/cookies.rb', line 16

# The raw Cookie String sent along with the page.
#
# @return [String]
#   The 'Set-Cookie' value of the response, or an empty String when the
#   lookup yields nothing.
def cookie
  @response['Set-Cookie'] || ''
end

#cookie_params ⇒ Hash{String => String}

The Cookie key -> value pairs returned with the response.

Since:

  • 0.2.2


42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/spidr/page/cookies.rb', line 42

#
# The Cookie key -> value pairs returned with the response.
#
# Each cookie is split on `;` and every segment on its first `=`.
# Segments whose name matches RESERVED_COOKIE_NAMES are left out, and a
# segment without `=` is stored with an empty String value.
#
# @return [Hash{String => String}]
#   The cookie names and their values.
#
def cookie_params
  params = {}

  cookies.each do |cookie|
    cookie.split(';').each do |param|
      param.strip!

      # a blank segment (e.g. between ";;") has no name; without this
      # guard it would be stored under a nil key
      next if param.empty?

      name, value = param.split('=',2)

      unless name =~ RESERVED_COOKIE_NAMES
        params[name] = (value || '')
      end
    end
  end

  return params
end

#cookiesArray<String>

The Cookie values sent along with the page.

Since:

  • 0.2.2


30
31
32
# File 'lib/spidr/page/cookies.rb', line 30

# The Cookie values sent along with the page.
#
# @return [Array<String>]
#   The 'Set-Cookie' fields of the response, or an empty Array when
#   get_fields returns nothing.
def cookies
  (@response.get_fields('Set-Cookie') || [])
end

#css?Boolean

Determines if the page is a CSS stylesheet.


170
171
172
# File 'lib/spidr/page/content_types.rb', line 170

# Determines if the page is a CSS stylesheet, i.e. whether
# is_content_type? accepts 'text/css'.
#
# @return [Boolean]
def css?
  is_content_type?('text/css')
end

#directory?Boolean

Determines if the page is a Directory Listing.

Since:

  • 0.3.0


106
107
108
# File 'lib/spidr/page/content_types.rb', line 106

# Determines if the page is a Directory Listing, i.e. whether
# is_content_type? accepts 'text/directory'.
#
# @return [Boolean]
def directory?
  is_content_type?('text/directory')
end

#docNokogiri::HTML::Document, ...

Returns a parsed document object for HTML, XML, RSS and Atom pages.


55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/spidr/page.rb', line 55

#
# Returns a parsed document object for HTML, XML, RSS and Atom pages.
#
# HTML pages are parsed with Nokogiri::HTML::Document; RSS, Atom, XML and
# XSL pages with Nokogiri::XML::Document. The parsed document is cached in
# @doc. Any error raised while parsing is swallowed.
#
# @return [Object, nil]
#   The parsed document, or `nil` when the body is empty, the content type
#   is not one of the above, or parsing failed.
#
def doc
  return if body.empty?

  parser = if html?
             Nokogiri::HTML::Document
           elsif rss? || atom? || xml? || xsl?
             Nokogiri::XML::Document
           end

  return unless parser

  begin
    @doc ||= parser.parse(body, @url.to_s, content_charset)
  rescue
    # best effort: an unparsable body simply yields no document
    nil
  end
end

#each_link {|link| ... } ⇒ Enumerator

Enumerates over every link in the page.

Yields:

  • (link)

    The given block will be passed every non-empty link in the page.

Yield Parameters:

  • link (String)

    A link in the page.

Since:

  • 0.3.0


178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# File 'lib/spidr/page/html.rb', line 178

#
# Enumerates over every link in the page.
#
# Redirect links come first (only when the page is a redirect), followed
# by, for HTML pages with a parsed document: `a[href]`, `frame[src]`,
# `iframe[src]`, `link[href]` and `script[src]`, in that order.
#
# @yield [link]
#   The given block will be passed every non-empty link in the page.
#
# @yieldparam [String] link
#   A link in the page.
#
# @return [Enumerator]
#   If no block is given, an Enumerator for this method is returned.
#
def each_link
  return enum_for(__method__) unless block_given?

  # nil and empty links are never passed on
  filter = lambda { |url|
    yield url unless (url.nil? || url.empty?)
  }

  each_redirect(&filter) if is_redirect?

  # fetch the parsed document once instead of once per element type
  if html? && (document = doc)
    [
      ['//a[@href]',      'href'],
      ['//frame[@src]',   'src'],
      ['//iframe[@src]',  'src'],
      ['//link[@href]',   'href'],
      ['//script[@src]',  'src']
    ].each do |xpath, attribute|
      document.search(xpath).each do |node|
        filter.call(node.get_attribute(attribute))
      end
    end
  end
end

#each_mailto {|link| ... } ⇒ Enumerator

Enumerates over every mailto: link in the page.

Yields:

  • (link)

    The given block will be passed every mailto: link from the page.

Yield Parameters:

  • link (String)

    A mailto: link from the page.

Since:

  • 0.5.0


142
143
144
145
146
147
148
149
150
# File 'lib/spidr/page/html.rb', line 142

#
# Enumerates over every mailto: link in the page.
#
# @yield [link]
#   The given block will be passed every mailto: link from the page.
#
# @yieldparam [String] link
#   The `href` value with its first seven characters ("mailto:") removed.
#
# @return [Enumerator]
#   If no block is given, an Enumerator for this method is returned.
#
def each_mailto
  return enum_for(__method__) unless block_given?
  return unless html? && doc

  doc.search('//a[starts-with(@href,"mailto:")]').each do |anchor|
    address = anchor.get_attribute('href')[7..-1]

    yield address
  end
end

#each_meta_redirect {|link| ... } ⇒ Enumerator

Enumerates over the meta-redirect links in the page.

Yields:

  • (link)

    If a block is given, it will be passed every meta-redirect link from the page.

Yield Parameters:

  • link (String)

    A meta-redirect link from the page.

Since:

  • 0.3.0


35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/spidr/page/html.rb', line 35

#
# Enumerates over the meta-redirect links in the page.
#
# Looks at every `meta` element that has both `http-equiv` and `content`
# attributes. When `http-equiv` contains "refresh" (any case), the link is
# the run of non-whitespace characters following `url=` at the end of the
# `content` value.
#
# @yield [link]
#   If a block is given, it will be passed every meta-redirect link from
#   the page.
#
# @yieldparam [String] link
#   A meta-redirect link from the page.
#
# @return [Enumerator]
#   If no block is given, an Enumerator for this method is returned.
#
def each_meta_redirect
  return enum_for(__method__) unless block_given?

  if (html? && doc)
    search('//meta[@http-equiv and @content]').each do |node|
      if node.get_attribute('http-equiv') =~ /refresh/i
        content = node.get_attribute('content')

        # match the "url" keyword in any case: pages commonly write
        # "URL=" or "Url=", which a case-sensitive pattern would miss
        if (redirect = content.match(/url=(\S+)$/i))
          yield redirect[1]
        end
      end
    end
  end
end

#each_redirect {|link| ... } ⇒ Enumerator

Enumerates over every HTTP or meta-redirect link in the page.

Yields:

  • (link)

    The given block will be passed every redirection link from the page.

Yield Parameters:

  • link (String)

    An HTTP or meta-redirect link from the page.

Since:

  • 0.3.0


105
106
107
108
109
110
111
112
113
114
115
# File 'lib/spidr/page/html.rb', line 105

#
# Enumerates over every HTTP or meta-redirect link in the page.
#
# @yield [link]
#   The given block will be passed every redirection link from the page.
#
# @yieldparam [String] link
#   An HTTP or meta-redirect link from the page.
#
# @return [Enumerator]
#   If no block is given, an Enumerator for this method is returned.
#
def each_redirect(&block)
  return enum_for(__method__) unless block

  locations = @response.get_fields('Location')

  # without a Location header, fall back to page-level meta redirects
  return each_meta_redirect(&block) unless locations

  # Location headers take precedence over any meta-refresh in the HTML
  locations.each(&block)
end

#each_url {|url| ... } ⇒ Enumerator Also known as: each

Enumerates over every absolute URL in the page.

Yields:

  • (url)

    The given block will be passed every URL in the page.

Yield Parameters:

  • url (URI::HTTP)

    An absolute URL in the page.

Since:

  • 0.3.0


235
236
237
238
239
240
241
242
243
# File 'lib/spidr/page/html.rb', line 235

#
# Enumerates over every absolute URL in the page.
#
# Each link from each_link is run through to_absolute; links for which
# that returns nothing are skipped.
#
# @yield [url]
#   The given block will be passed every URL in the page.
#
# @yieldparam [Object] url
#   The value returned by to_absolute for a link.
#
# @return [Enumerator]
#   If no block is given, an Enumerator for this method is returned.
#
def each_url
  return enum_for(__method__) unless block_given?

  each_link do |link|
    absolute = to_absolute(link)

    yield absolute if absolute
  end
end

#had_internal_server_error?Boolean

Determines if the response code is 500.


87
88
89
# File 'lib/spidr/page/status_codes.rb', line 87

# Determines if the response code is 500.
#
# @return [Boolean]
def had_internal_server_error?
  code == 500
end

#html?Boolean

Determines if the page is an HTML document.


116
117
118
# File 'lib/spidr/page/content_types.rb', line 116

# Determines if the page is an HTML document, i.e. whether
# is_content_type? accepts 'text/html'.
#
# @return [Boolean]
def html?
  is_content_type?('text/html')
end

#is_content_type?(type) ⇒ Boolean

Determines if any of the content-types of the page include a given type.

Examples:

Match the Content-Type

page.is_content_type?('application/json')

Match the sub-type of the Content-Type

page.is_content_type?('json')

Since:

  • 0.4.0


67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/spidr/page/content_types.rb', line 67

#
# Determines if any of the content-types of the page include a given type.
#
# Parameters after `;` are ignored, surrounding whitespace is stripped and
# the comparison is case-insensitive. An empty header value never raises.
#
# @param [String] type
#   Either a full type such as `application/json`, or just a sub-type such
#   as `json`. A value without `/` is compared against the sub-type only.
#
# @return [Boolean]
#   Whether any Content-Type value of the page matches.
#
# @example Match the Content-Type
#   page.is_content_type?('application/json')
#
# @example Match the sub-type of the Content-Type
#   page.is_content_type?('json')
#
def is_content_type?(type)
  wanted        = type.downcase
  sub_type_only = !type.include?('/')

  content_types.any? do |value|
    # drop parameters such as ";charset=..."; to_s covers an empty header
    # value, for which split returns an empty Array
    media_type = value.split(';',2).first.to_s.strip.downcase

    # only the part after "type/" matters when a bare sub-type was given
    media_type = media_type.split('/',2).last.to_s if sub_type_only

    media_type == wanted
  end
end

#is_forbidden?Boolean Also known as: forbidden?

Determines if the response code is 403.


63
64
65
# File 'lib/spidr/page/status_codes.rb', line 63

# Determines if the response code is 403.
#
# @return [Boolean]
def is_forbidden?
  code == 403
end

#is_missing?Boolean Also known as: missing?

Determines if the response code is 404.


75
76
77
# File 'lib/spidr/page/status_codes.rb', line 75

# Determines if the response code is 404.
#
# @return [Boolean]
def is_missing?
  code == 404
end

#is_ok?Boolean Also known as: ok?

Determines if the response code is 200.


19
20
21
# File 'lib/spidr/page/status_codes.rb', line 19

# Determines if the response code is 200.
#
# @return [Boolean]
def is_ok?
  code == 200
end

#is_redirect?Boolean Also known as: redirect?

Determines if the response code is 300, 301, 302, 303 or 307. Also checks for "soft" redirects added at the page level by a meta refresh tag.


99
100
101
102
103
104
105
106
107
108
# File 'lib/spidr/page/status_codes.rb', line 99

#
# Determines if the response code is 300, 301, 302, 303 or 307. A 200
# response also counts when the page carries a meta-refresh redirect.
#
# @return [Boolean]
#   Whether the page redirects elsewhere.
#
def is_redirect?
  status = code

  return true if (300..303).cover?(status) || status == 307

  # "soft" redirect: a successful response may still redirect via a meta tag
  return meta_redirect? if status == 200

  false
end

#is_unauthorized?Boolean Also known as: unauthorized?

Determines if the response code is 401.


51
52
53
# File 'lib/spidr/page/status_codes.rb', line 51

# Determines if the response code is 401.
#
# @return [Boolean]
def is_unauthorized?
  code == 401
end

#javascript?Boolean

Determines if the page is JavaScript.


147
148
149
150
# File 'lib/spidr/page/content_types.rb', line 147

#
# Determines if the page is JavaScript: either `text/javascript` or
# `application/javascript`.
#
# @return [Boolean]
#
def javascript?
  %w[text/javascript application/javascript].any? do |mime|
    is_content_type?(mime)
  end
end

#json?Boolean

Determines if the page is JSON.

Since:

  • 0.3.0


160
161
162
# File 'lib/spidr/page/content_types.rb', line 160

# Determines if the page is JSON, i.e. whether is_content_type? accepts
# 'application/json'.
#
# @return [Boolean]
def json?
  is_content_type?('application/json')
end

#links ⇒ Array<String>

The links from within the page.


217
218
219
# File 'lib/spidr/page/html.rb', line 217

# The links from within the page: everything each_link enumerates,
# collected into an Array.
#
# @return [Array<String>]
def links
  each_link.to_a
end

#mailtosArray<String>

mailto: links in the page.

Since:

  • 0.5.0


160
161
162
# File 'lib/spidr/page/html.rb', line 160

# mailto: links in the page: everything each_mailto enumerates, collected
# into an Array.
#
# @return [Array<String>]
def mailtos
  each_mailto.to_a
end

#meta_redirectArray<String>

Deprecated.

Deprecated in 0.3.0 and will be removed in 0.4.0. Use #meta_redirects instead.

The meta-redirect links of the page.


84
85
86
87
88
89
# File 'lib/spidr/page/html.rb', line 84

#
# The meta-redirect links of the page.
#
# Prints two deprecation warnings and then returns the result of
# meta_redirects.
#
# @return [Array<String>]
#   All meta-redirect links in the page.
#
# @deprecated
#   Deprecated in 0.3.0 and will be removed in 0.4.0.
#   Use meta_redirects instead.
#
def meta_redirect
  # the method was deprecated in 0.3.0, so removal is announced for 0.4.0
  warn 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.4.0'
  warn 'DEPRECATION: Use Spidr::Page#meta_redirects instead'

  meta_redirects
end

#meta_redirect?Boolean

Returns a boolean indicating whether or not page-level meta redirects are present in this page.


58
59
60
# File 'lib/spidr/page/html.rb', line 58

# Returns a boolean indicating whether or not page-level meta redirects
# are present in this page, by checking that each_meta_redirect yields at
# least one non-nil link.
#
# @return [Boolean]
def meta_redirect?
  !each_meta_redirect.first.nil?
end

#meta_redirectsArray<String>

The meta-redirect links of the page.

Since:

  • 0.3.0


70
71
72
# File 'lib/spidr/page/html.rb', line 70

# The meta-redirect links of the page: everything each_meta_redirect
# enumerates, collected into an Array.
#
# @return [Array<String>]
def meta_redirects
  each_meta_redirect.to_a
end

#ms_word?Boolean

Determines if the page is an MS Word document.


201
202
203
# File 'lib/spidr/page/content_types.rb', line 201

# Determines if the page is an MS Word document, i.e. whether
# is_content_type? accepts 'application/msword'.
#
# @return [Boolean]
def ms_word?
  is_content_type?('application/msword')
end

#pdf?Boolean

Determines if the page is a PDF document.


211
212
213
# File 'lib/spidr/page/content_types.rb', line 211

# Determines if the page is a PDF document, i.e. whether
# is_content_type? accepts 'application/pdf'.
#
# @return [Boolean]
def pdf?
  is_content_type?('application/pdf')
end

#plain_text?Boolean Also known as: txt?

Determines if the page is plain-text.


92
93
94
# File 'lib/spidr/page/content_types.rb', line 92

# Determines if the page is plain-text, i.e. whether is_content_type?
# accepts 'text/plain'.
#
# @return [Boolean]
def plain_text?
  is_content_type?('text/plain')
end

#redirects_toArray<String>

URLs that this document redirects to.


124
125
126
# File 'lib/spidr/page/html.rb', line 124

# URLs that this document redirects to: everything each_redirect
# enumerates, collected into an Array.
#
# @return [Array<String>]
def redirects_to
  each_redirect.to_a
end

#rss?Boolean

Determines if the page is an RSS feed.


180
181
182
183
# File 'lib/spidr/page/content_types.rb', line 180

#
# Determines if the page is an RSS feed: either `application/rss+xml` or
# `application/rdf+xml`.
#
# @return [Boolean]
#
def rss?
  %w[application/rss+xml application/rdf+xml].any? do |mime|
    is_content_type?(mime)
  end
end

#search(*paths) ⇒ Array Also known as: /

Searches the document for XPath or CSS Path paths.

Examples:

page.search('//a[@href]')

See Also:


88
89
90
91
92
93
94
# File 'lib/spidr/page.rb', line 88

#
# Searches the document for XPath or CSS Path paths.
#
# @param [Array] paths
#   Passed straight through to the parsed document's `search`.
#
# @return [Object]
#   The result of `doc.search`, or an empty Array when there is no parsed
#   document.
#
# @example
#   page.search('//a[@href]')
#
def search(*paths)
  return [] unless doc

  doc.search(*paths)
end

#timedout?Boolean

Determines if the response code is 308.


31
32
33
# File 'lib/spidr/page/status_codes.rb', line 31

#
# Determines if the response code is 408 (Request Timeout).
#
# The check used to compare against 308, which is the Permanent Redirect
# status rather than a timeout.
#
# @return [Boolean]
#   Whether the response code is 408.
#
def timedout?
  code == 408
end

#titleString

The title of the HTML page.


14
15
16
17
18
# File 'lib/spidr/page/html.rb', line 14

#
# The title of the HTML page.
#
# @return [String, nil]
#   The inner text of the first `//title` match, or `nil` when there is
#   none.
#
def title
  node = at('//title')

  node.inner_text if node
end

#to_absolute(link) ⇒ URI::HTTP

Normalizes and expands a given link into a proper URI.


266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
# File 'lib/spidr/page/html.rb', line 266

#
# Normalizes and expands a given link into a proper URI.
#
# The link is converted with `to_s` and merged onto the page URL. Unless
# the result is opaque (as with `mailto:`), its path is then rewritten
# through URI.expand_path.
#
# @param [#to_s] link
#   The link to normalize and expand.
#
# @return [Object, nil]
#   The merged URI, or `nil` if the link could not be merged onto the page
#   URL.
#
def to_absolute(link)
  link    = link.to_s
  new_url = begin
              url.merge(link)
            rescue StandardError
              # an unparsable or unmergeable link is simply unusable; only
              # ordinary errors are swallowed, so interrupts and exit
              # requests still propagate
              return
            end

  if (!new_url.opaque) && (path = new_url.path)
    # ensure that paths begin with a leading '/' for URI::FTP
    if (new_url.scheme == 'ftp' && !path.start_with?('/'))
      path.insert(0,'/')
    end

    # make sure the path does not contain any .. or . directories,
    # since URI::Generic#merge cannot normalize paths such as
    # "/stuff/../"
    # NOTE(review): URI.expand_path is not in Ruby's standard URI library;
    # presumably a project extension - confirm it is loaded.
    new_url.path = URI.expand_path(path)
  end

  return new_url
end

#urlsArray<URI::HTTP>

Absolute URIs from within the page.


253
254
255
# File 'lib/spidr/page/html.rb', line 253

# Absolute URIs from within the page: everything each_url enumerates,
# collected into an Array.
#
# @return [Array]
def urls
  each_url.to_a
end

#xml?Boolean

Determines if the page is an XML document.


126
127
128
129
# File 'lib/spidr/page/content_types.rb', line 126

#
# Determines if the page is an XML document: either `text/xml` or
# `application/xml`.
#
# @return [Boolean]
#
def xml?
  %w[text/xml application/xml].any? do |mime|
    is_content_type?(mime)
  end
end

#xsl?Boolean

Determines if the page is an XML Stylesheet (XSL).


137
138
139
# File 'lib/spidr/page/content_types.rb', line 137

# Determines if the page is an XML Stylesheet (XSL), i.e. whether
# is_content_type? accepts 'text/xsl'.
#
# @return [Boolean]
def xsl?
  is_content_type?('text/xsl')
end

#zip?Boolean

Determines if the page is a ZIP archive.


221
222
223
# File 'lib/spidr/page/content_types.rb', line 221

# Determines if the page is a ZIP archive, i.e. whether is_content_type?
# accepts 'application/zip'.
#
# @return [Boolean]
def zip?
  is_content_type?('application/zip')
end