Class: WebInspector::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/web_inspector/page.rb

Constant Summary collapse

# Fallback request timeout (documented as seconds) used by #initialize when
# options[:timeout] is not supplied.
DEFAULT_TIMEOUT =
30
# Fallback retry count used by #initialize when options[:retries] is not supplied.
DEFAULT_RETRIES =
3
# Lambda that builds the default User-Agent string. It is invoked by
# #initialize only when neither options[:headers] nor options[:user_agent]
# is given, so WebInspector::VERSION is interpolated at call time.
DEFAULT_USER_AGENT =
-> { "WebInspector/#{WebInspector::VERSION} (+https://github.com/davidesantangelo/webinspector)" }

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, options = {}) ⇒ Page

Initialize a new WebInspector Page

Parameters:

  • url (String)

    The URL to inspect

  • options (Hash) (defaults to: {})

    Optional parameters

Options Hash (options):

  • :timeout (Integer)

    Request timeout in seconds

  • :retries (Integer)

    Number of retries for failed requests

  • :headers (Hash)

    Custom HTTP headers

  • :allow_redirections (Boolean)

    Whether to follow redirects

  • :user_agent (String)

    Custom user agent



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/web_inspector/page.rb', line 37

# Set up a page for +url+ and immediately try to build its inspector.
#
# Any StandardError raised while building the inspector is captured in
# @error instead of propagating; @status_code then reflects the failure.
#
# @param url [String] the URL to inspect
# @param options [Hash] optional settings
# @option options [Integer] :timeout request timeout (falls back to DEFAULT_TIMEOUT)
# @option options [Integer] :retries retry count (falls back to DEFAULT_RETRIES)
# @option options [Hash] :headers custom HTTP headers; when given they are used
#   as-is and :user_agent is not consulted
# @option options [Boolean] :allow_redirections follow redirects (true when omitted)
# @option options [String] :user_agent custom user agent for the default headers
def initialize(url, options = {})
  @url = url
  @options = options
  @retries = options[:retries] || DEFAULT_RETRIES
  @timeout = options[:timeout] || DEFAULT_TIMEOUT

  # Caller-supplied headers win wholesale; the User-Agent-only default is
  # built lazily so the lambda runs only when it is actually needed.
  @headers = options[:headers]
  @headers ||= { 'User-Agent' => (options[:user_agent] || DEFAULT_USER_AGENT.call) }

  # A missing (nil) flag means "allow"; any explicit value is kept untouched.
  redirect_flag = options[:allow_redirections]
  @allow_redirections = redirect_flag.nil? ? true : redirect_flag

  @request = WebInspector::Request.new(url)

  begin
    @inspector = WebInspector::Inspector.new(page)
    @inspector.set_url(url, host)
    @status_code = 200
  rescue StandardError => e
    @error = e
    # Prefer the status carried by the exception, if it exposes one.
    @status_code = if e.respond_to?(:status_code)
                     e.status_code
                   else
                     500
                   end
  end
end

Instance Attribute Details

#load_time ⇒ Float? (readonly)

Get the load time of the page in seconds

Returns:

  • (Float, nil)

    Load time in seconds



175
176
177
# File 'lib/web_inspector/page.rb', line 175

# Reader for the page load time (documented as seconds).
#
# @return [Float, nil] the current value of @load_time; where it is assigned
#   is not visible in this listing
def load_time
  @load_time
end

#status_code ⇒ Object (readonly)

Returns the value of attribute status_code.



22
23
24
# File 'lib/web_inspector/page.rb', line 22

# Reader for the status recorded by #initialize: 200 when the inspector was
# built, otherwise the rescued exception's own status_code (if it responds to
# it) or 500.
#
# @return [Object] the value of @status_code
def status_code
  @status_code
end

Instance Method Details

#content_type ⇒ String?

Get the content type of the page

Returns:

  • (String, nil)

    Content type



157
158
159
# File 'lib/web_inspector/page.rb', line 157

# Look up the 'content-type' entry of the response headers.
#
# @return [String, nil] the header value, or nil when there is no response,
#   no headers, or no such header
def content_type
  headers = response&.headers
  headers && headers['content-type']
end

#domain_images(u = domain) ⇒ Object



129
130
131
132
133
# File 'lib/web_inspector/page.rb', line 129

# Ask the inspector for the images belonging to a domain.
#
# @param u [Object] domain to filter by (defaults to the result of #domain)
# @return [Object] whatever the inspector returns for (u, host), or an empty
#   array when the page did not load successfully
def domain_images(u = domain)
  success? ? @inspector.domain_images(u, host) : []
end


123
124
125
126
127
# File 'lib/web_inspector/page.rb', line 123

# Ask the inspector for the links belonging to a domain.
#
# @param u [Object] domain to filter by (defaults to the result of #domain)
# @return [Object] whatever the inspector returns for (u, host), or an empty
#   array when the page did not load successfully
def domain_links(u = domain)
  success? ? @inspector.domain_links(u, host) : []
end

#error_message ⇒ String?

Get the error message if any

Returns:

  • (String, nil)

    The error message or nil if no error



67
68
69
# File 'lib/web_inspector/page.rb', line 67

# Message of the captured error, if one was recorded in @error.
#
# @return [String, nil] the error's message, or nil when @error is nil
def error_message
  return nil if @error.nil?

  @error.message
end

#favicon ⇒ String?

Get the favicon URL if available

Returns:

  • (String, nil)

    The favicon URL or nil if not found



98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/web_inspector/page.rb', line 98

# Resolve the page's favicon URL and memoize the result.
#
# Lookup order:
# 1. the first <link> tag with rel 'shortcut icon', 'icon' or
#    'apple-touch-icon', resolved against the page URL;
# 2. the conventional "<scheme>://<host>/favicon.ico" location.
#
# The result is cached in @favicon for every outcome reached after the
# success? guard. No early `return` is used inside the memoized expression,
# because that would leave the method before @favicon is assigned.
#
# @return [String, nil] the favicon URL; nil when the page did not load or an
#   unexpected error occurred during lookup
def favicon
  return @favicon if defined?(@favicon)

  # Not memoized on purpose: mirrors the guard used by the other accessors.
  return nil unless success?

  @favicon = begin
    icon_link = @inspector.page.css("link[rel='shortcut icon'], link[rel='icon'], link[rel='apple-touch-icon']").first
    href = icon_link && icon_link['href']

    # An href that cannot be parsed falls through to the default location.
    from_link_tag = begin
      URI.join(url, href).to_s if href
    rescue URI::InvalidURIError
      nil
    end

    from_link_tag || "#{scheme}://#{host}/favicon.ico"
  rescue StandardError
    nil
  end
end

#find(words) ⇒ Object

Special case for find method that takes arguments



82
83
84
85
86
# File 'lib/web_inspector/page.rb', line 82

# Delegate a word search to the inspector.
#
# @param words [Object] passed through unchanged to the inspector's #find
# @return [Object, nil] the inspector's result, or nil when the page did not
#   load successfully
def find(words)
  @inspector.find(words) if success?
end

#json_ld ⇒ Array<Hash>

Get all JSON-LD structured data as an array of hashes

Returns:

  • (Array<Hash>)

    Structured data



179
180
181
# File 'lib/web_inspector/page.rb', line 179

# Convenience name for #structured_data; returns exactly what that method
# returns (documented as Array<Hash>).
def json_ld
  structured_data
end

#response ⇒ Object



261
262
263
264
265
266
# File 'lib/web_inspector/page.rb', line 261

# Return the cached response, fetching it on first use.
#
# A truthy result of #fetch is memoized in @response. If fetching raises a
# StandardError, the exception is stored in @error and nil is returned;
# nothing is memoized in that case, so a later call will fetch again.
#
# @return [Object, nil] the response, or nil when fetching failed
def response
  return @response if @response

  @response = fetch
rescue StandardError => e
  @error = e
  nil
end

#security_info ⇒ Hash

Get information about the page’s security

Returns:

  • (Hash)

    Security information



137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/web_inspector/page.rb', line 137

# Summarize security-related facts about the page and memoize them.
#
# The response is looked up exactly once. #response does not memoize a failed
# fetch, so consulting it repeatedly here would re-trigger the fetch for every
# field when the page is unreachable.
#
# @return [Hash] with keys
#   - :secure [Boolean] true when the scheme is 'https'
#   - :hsts [Boolean] a 'strict-transport-security' header is present
#   - :content_security_policy [Boolean] a 'content-security-policy' header is present
#   - :ssl_version, :cipher_suite — only for https pages whose response env
#     exposes response headers; values are read from the env as-is and may be nil
def security_info
  return @security_info if defined?(@security_info)

  https = scheme == 'https'
  page_response = response
  headers = page_response&.headers

  @security_info = {
    secure: https,
    hsts: headers && headers['strict-transport-security'] ? true : false,
    content_security_policy: headers && headers['content-security-policy'] ? true : false
  }

  # Extract SSL/TLS info if available and using HTTPS
  if https && page_response&.env&.response_headers
    env = page_response.env
    @security_info[:ssl_version] = env[:ssl_version]
    @security_info[:cipher_suite] = env[:cipher_suite]
  end

  @security_info
end

#size ⇒ Integer?

Get the size of the page in bytes

Returns:

  • (Integer, nil)

    Size in bytes



163
164
165
166
167
168
169
170
171
# File 'lib/web_inspector/page.rb', line 163

# Size of the page in bytes, memoized in @size.
#
# Prefers the 'content-length' response header (converted with to_i) and
# falls back to the byte size of the response body. The response is looked up
# exactly once: #response does not memoize a failed fetch, so asking for it
# in each branch would repeat the fetch when the page is unreachable.
#
# @return [Integer, nil] size in bytes, or nil when neither the header nor a
#   body is available
def size
  return @size if defined?(@size)

  page_response = response
  headers = page_response&.headers

  @size = if headers && headers['content-length']
            headers['content-length'].to_i
          elsif page_response&.body
            page_response.body.bytesize
          end
end

#success? ⇒ Boolean

Check if the page was successfully loaded

Returns:

  • (Boolean)

    true if the page was loaded, false otherwise



60
61
62
# File 'lib/web_inspector/page.rb', line 60

# Whether the page loaded: an inspector exists and no error was recorded.
#
# @return [Boolean] true when @inspector is set and @error is nil/false
def success?
  return false if @inspector.nil?

  !@error
end

#technologies ⇒ Hash

Get a hash of all technologies detected on the page

Returns:

  • (Hash)

    Detected technologies



185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/web_inspector/page.rb', line 185

# Detect technologies used by the page with simple substring heuristics over
# script/stylesheet URLs, the page body, meta tags and response headers.
#
# Only detected technologies appear in the result (value true), plus :server
# holding the raw 'server' header when one is present. The checks are
# heuristic and can produce false positives.
#
# Express is detected from the 'x-powered-by' header alone, whether or not a
# 'server' header is present, matching how Rails and PHP are detected.
#
# @return [Hash] detected technologies keyed by symbol
def technologies
  techs = {}
  js_files = javascripts || []
  css_files = stylesheets || []
  page_body = body || ''
  page_meta = meta || {}
  response_headers = response&.headers || {}
  powered_by = response_headers['x-powered-by']

  # Frameworks and Libraries
  techs[:jquery] = true if js_files.any? { |js| js.include?('jquery') } || page_body.include?('jQuery')
  techs[:react] = true if page_body.include?('data-reactroot') || js_files.any? { |js| js.include?('react') }
  techs[:vue] = true if page_body.include?('data-v-app') || js_files.any? { |js| js.include?('vue') }
  techs[:angular] = true if page_body.include?('ng-version') || js_files.any? { |js| js.include?('angular') }

  bootstrap_css = css_files.any? { |css| css.include?('bootstrap') }
  techs[:bootstrap] = true if bootstrap_css || page_body.include?('class="container"')

  techs[:rails] = true if powered_by&.include?('Rails') || response_headers.key?('x-rails-env')
  techs[:php] = true if powered_by&.include?('PHP')

  # CMS
  techs[:wordpress] = true if page_meta['generator']&.include?('WordPress') || page_body.include?('/wp-content/')
  techs[:shopify] = true if page_body.include?('Shopify.shop')

  # Analytics
  techs[:google_analytics] = true if js_files.any? { |js| js.include?('google-analytics.com') }

  # Server
  server = response_headers['server']
  if server
    techs[:server] = server
    techs[:nginx] = true if server.include?('nginx')
    techs[:apache] = true if server.include?('Apache')
    techs[:iis] = true if server.include?('IIS')
  end

  # Kept after the server entries so the key order is unchanged for responses
  # that carry both headers.
  techs[:express] = true if powered_by&.include?('Express')

  techs
end

#to_hash ⇒ Hash

Get the full representation of the page as a Hash, including all extended data

Returns:

  • (Hash)

    Hash representation of the page



229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/web_inspector/page.rb', line 229

# Collect every page attribute into a single string-keyed Hash.
#
# Attributes are gathered in the order listed, followed by a nested
# 'response' summary and finally the 'error' message.
#
# @return [Hash] string-keyed representation of the page
def to_hash
  page_attributes = {
    'url' => url,
    'scheme' => scheme,
    'host' => host,
    'port' => port,
    'title' => title,
    'description' => description,
    'meta' => meta,
    'links' => links,
    'images' => images,
    'javascripts' => javascripts,
    'stylesheets' => stylesheets,
    'favicon' => favicon,
    'language' => language,
    'structured_data' => structured_data,
    'microdata' => microdata,
    'security_info' => security_info,
    'content_type' => content_type,
    'size' => size,
    'load_time' => load_time,
    'technologies' => technologies,
    'tag_count' => tag_count
  }

  # Built after the attributes so the evaluation order stays the same.
  response_summary = {
    'status' => status_code,
    'headers' => response&.headers || {},
    'success' => success?
  }

  page_attributes.merge('response' => response_summary, 'error' => error_message)
end