Class: WebInspector::Inspector

Inherits:
Object
  • Object
show all
Defined in:
lib/web_inspector/inspector.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(page) ⇒ Inspector

Returns a new instance of Inspector.



9
10
11
12
13
# File 'lib/web_inspector/inspector.rb', line 9

# Build an inspector around an already-parsed page.
#
# @param page [Object] parsed document; other methods in this class call
#   `css` and `at` on it (presumably a Nokogiri document — TODO confirm)
def initialize(page)
  @page = page
  # Meta information is extracted once, up front, via WebInspector::Meta.
  @meta = WebInspector::Meta.new(page).meta
  # Starts out unset. NOTE(review): presumably filled in later by a URL
  # helper that is not visible here — confirm.
  @base_url = nil
end

Instance Attribute Details

#host ⇒ Object (readonly)

Returns the value of attribute host.



7
8
9
# File 'lib/web_inspector/inspector.rb', line 7

# Reader for @host. The value is assigned by set_url, or taken from the
# `host` argument of domain_images / domain_links when still unset.
def host
  @host
end

#meta ⇒ Object (readonly)

Returns the value of attribute meta.



7
8
9
# File 'lib/web_inspector/inspector.rb', line 7

# Reader for @meta, which initialize fills from
# WebInspector::Meta.new(page).meta.
def meta
  @meta
end

#page ⇒ Object (readonly)

Returns the value of attribute page.



7
8
9
# File 'lib/web_inspector/inspector.rb', line 7

# Reader for @page, the document object handed to initialize.
def page
  @page
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



7
8
9
# File 'lib/web_inspector/inspector.rb', line 7

# Reader for @url. The value is assigned by set_url; initialize does not
# set it, so it is nil until set_url has been called.
def url
  @url
end

Instance Method Details

#body ⇒ Object



30
31
32
# File 'lib/web_inspector/inspector.rb', line 30

# Serialize the page's <body> selection to HTML.
#
# @return [Object] whatever `to_html` yields for the `body` selection
def body
  body_selection = @page.css('body')
  body_selection.to_html
end

#description ⇒ Object



26
27
28
# File 'lib/web_inspector/inspector.rb', line 26

# Best available description for the page.
#
# Tries the `description` meta entry, then `og:description`, and finally
# falls back to `snippet`. Entries that are missing, empty, or contain only
# whitespace are treated as absent so that an empty
# <meta name="description" content=""> does not hide a usable fallback.
#
# @return [Object] the first non-blank meta value (returned unmodified),
#   otherwise the result of `snippet`
def description
  present = [@meta['description'], @meta['og:description']].find do |value|
    value && !value.to_s.strip.empty?
  end
  present || snippet
end

#domain_images(user_domain, host = nil) ⇒ Array<String>

Get images from a specific domain

Parameters:

  • user_domain (String)

    Domain to filter images by

  • host (String) (defaults to: nil)

    Current host

Returns:

  • (Array<String>)

    Filtered images



104
105
106
107
# File 'lib/web_inspector/inspector.rb', line 104

# Images restricted to one domain.
#
# @param user_domain [String] domain handed to filter_by_domain
# @param host [String, nil] stored in @host only when @host is not yet set
# @return [Array<String>] result of filter_by_domain over `images`
def domain_images(user_domain, host = nil)
  @host = host unless @host
  all_images = images
  filter_by_domain(all_images, user_domain)
end

#domain_links(user_domain, host = nil) ⇒ Array<String>

Get links from a specific domain

Parameters:

  • user_domain (String)

    Domain to filter links by

  • host (String) (defaults to: nil)

    Current host

Returns:

  • (Array<String>)

    Filtered links



72
73
74
75
# File 'lib/web_inspector/inspector.rb', line 72

# Links restricted to one domain.
#
# @param user_domain [String] domain handed to filter_by_domain
# @param host [String, nil] stored in @host only when @host is not yet set
# @return [Array<String>] result of filter_by_domain over `links`
def domain_links(user_domain, host = nil)
  @host = host unless @host
  all_links = links
  filter_by_domain(all_links, user_domain)
end

#find(words) ⇒ Array<Hash>

Search for specific words in the page content

Parameters:

  • words (Array<String>)

    List of words to search for

Returns:

  • (Array<Hash>)

    Counts of word occurrences



37
38
39
40
# File 'lib/web_inspector/inspector.rb', line 37

# Search for specific words in the page content.
#
# The text of the <html> element is lower-cased and handed to `counter`
# together with `words`. When the document has no <html> element (for
# example an empty document) the search runs against an empty string
# instead of raising NoMethodError.
#
# @param words [Array<String>] list of words to search for
# @return [Array<Hash>] whatever `counter` returns for the page text
def find(words)
  root = @page.at('html')
  text = root ? root.inner_text : ''
  counter(text.downcase, words)
end

#images ⇒ Array<String>

Get all images from the page

Returns:

  • (Array<String>)

    Array of image URLs



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/web_inspector/inspector.rb', line 79

# Collect the image URLs referenced by <img> elements.
#
# Each `src` is stripped and resolved through make_absolute_url. Elements
# without a `src`, values that resolve to nothing, and values that raise a
# URI error are skipped. The result is memoized in @images.
#
# @return [Array<String>] unique image URLs
def images
  return @images if @images

  found = []
  @page.css('img').each do |node|
    raw = node[:src]
    next unless raw

    resolved = make_absolute_url(raw.strip)
    found << resolved if resolved
  rescue URI::InvalidURIError, URI::BadURIError
    # Unparseable source: ignore this element
    next
  end
  @images = found.uniq.compact
end

#javascripts ⇒ Array<String>

Get all JavaScript files used by the page

Returns:

  • (Array<String>)

    Array of JavaScript file URLs



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/web_inspector/inspector.rb', line 111

# Collect the URLs of external scripts (<script src="...">).
#
# Each `src` is stripped and resolved through make_absolute_url; values
# that resolve to nothing or raise a URI error are left out. The result is
# memoized in @javascripts.
#
# @return [Array<String>] unique script URLs
def javascripts
  return @javascripts if @javascripts

  script_urls = []
  @page.css('script[src]').each do |tag|
    location = tag[:src]
    if location
      begin
        full = make_absolute_url(location.strip)
        script_urls.push(full) if full
      rescue URI::InvalidURIError, URI::BadURIError
        # Unparseable source: leave it out
      end
    end
  end
  @javascripts = script_urls.uniq.compact
end

#language ⇒ String?

Detect the page language

Returns:

  • (String, nil)

    Language code if detected, nil otherwise



157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/web_inspector/inspector.rb', line 157

# Detect the page language.
#
# Looks at the `lang` attribute of the <html> element first, then at the
# `content-language` meta entry. Empty values are treated as missing.
#
# @return [String, nil] language code if one is declared, nil otherwise
def language
  root = @page.at('html')
  declared_on_root = root && root['lang']
  return declared_on_root if declared_on_root && !declared_on_root.empty?

  from_meta = @meta['content-language']
  # Evaluates to nil when neither source declares a language
  from_meta if from_meta && !from_meta.empty?
end

#links ⇒ Array<String>

Get all links from the page

Returns:

  • (Array<String>)

    Array of URLs



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/web_inspector/inspector.rb', line 44

# Get all links from the page.
#
# Anchors without an href, and anchors using the javascript:, mailto: or
# tel: schemes, are skipped. Remaining hrefs are resolved through
# make_absolute_url; hrefs that resolve to nothing or raise a URI error are
# skipped as well. The result is memoized in @links.
#
# @return [Array<String>] unique link URLs, in the order first encountered
def links
  @links ||= begin
    collected = []
    @page.css('a').each do |anchor|
      href = anchor[:href]
      next unless href

      # Strip before the scheme check so padded values such as
      # " javascript:void(0)" are recognised; schemes are case-insensitive.
      href = href.strip
      next if href.match?(/\A(?:javascript|mailto|tel):/i)

      begin
        absolute_url = make_absolute_url(href)
        collected << absolute_url if absolute_url
      rescue URI::InvalidURIError, URI::BadURIError
        # Skip hrefs that cannot be parsed or resolved
      end
    end
    collected.uniq
  end
end

#microdata ⇒ Array<Hash>

Extract microdata from the page

Returns:

  • (Array<Hash>)

    Array of microdata items



187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/web_inspector/inspector.rb', line 187

# Extract microdata from the page.
#
# Every element carrying `itemscope` yields one hash with its `itemtype`
# under :type and, under :properties, a hash mapping each descendant
# `itemprop` name to its value. A later property with the same name
# replaces an earlier one. The result is memoized in @microdata.
#
# A missing or malformed URL attribute on one property no longer aborts the
# whole extraction: a missing attribute yields nil and an unresolvable one
# is kept as its raw value.
#
# @return [Array<Hash>] array of `{ type:, properties: }` items
def microdata
  @microdata ||= begin
    items = []
    @page.css('[itemscope]').each do |scope|
      properties = {}
      scope.css('[itemprop]').each do |prop|
        properties[prop['itemprop']] = microdata_property_value(prop)
      end
      items << { type: scope['itemtype'], properties: properties }
    end
    items
  end
end

# Read the value of one itemprop element; the source depends on its tag.
private def microdata_property_value(prop)
  case prop.name.downcase
  when 'meta'
    prop['content']
  when 'img', 'audio', 'embed', 'iframe', 'source', 'track', 'video'
    microdata_url(prop['src'])
  when 'a', 'area', 'link'
    microdata_url(prop['href'])
  when 'time'
    prop['datetime'] || prop.text.strip
  else
    prop.text.strip
  end
end

# Resolve a URL-valued attribute, tolerating absent or malformed values.
private def microdata_url(raw)
  return nil if raw.nil?

  make_absolute_url(raw)
rescue URI::InvalidURIError, URI::BadURIError
  raw
end

#set_url(url, host) ⇒ Object



15
16
17
18
# File 'lib/web_inspector/inspector.rb', line 15

# Record the URL and host associated with the inspected page.
#
# @param url [Object] value stored in @url
# @param host [Object] value stored in @host, replacing any previous value
# @return [Object] the `host` argument (value of the last assignment)
def set_url(url, host)
  @url = url
  @host = host
end

#structured_data ⇒ Array<Hash>

Extract structured data (JSON-LD) from the page

Returns:

  • (Array<Hash>)

    Array of structured data objects



172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/web_inspector/inspector.rb', line 172

# Extract structured data (JSON-LD) from the page.
#
# Parses the text of every <script type="application/ld+json"> element.
# Scripts whose content is not valid JSON, or that parse to nil/false, are
# skipped. The result is memoized in @structured_data.
#
# @return [Array] parsed JSON-LD documents in page order
def structured_data
  return @structured_data if @structured_data

  documents = []
  @page.css('script[type="application/ld+json"]').each do |node|
    begin
      payload = JSON.parse(node.text)
    rescue JSON::ParserError
      # Invalid JSON: move on to the next script element
      next
    end
    documents.push(payload) if payload
  end
  @structured_data = documents
end

#stylesheets ⇒ Array<String>

Get stylesheets used by the page

Returns:

  • (Array<String>)

    Array of CSS file URLs



134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/web_inspector/inspector.rb', line 134

# Collect the URLs of stylesheets linked with <link rel="stylesheet">.
#
# Each `href` is stripped and resolved through make_absolute_url; links
# without an href, values that resolve to nothing, and values that raise a
# URI error are skipped. The result is memoized in @stylesheets.
#
# @return [Array<String>] unique stylesheet URLs
def stylesheets
  return @stylesheets if @stylesheets

  sheet_urls = []
  @page.css('link[rel="stylesheet"]').each do |link_tag|
    target = link_tag[:href]
    next unless target

    begin
      full = make_absolute_url(target.strip)
    rescue URI::InvalidURIError, URI::BadURIError
      # Unparseable href: ignore this link
      next
    end
    sheet_urls.push(full) if full
  end
  @stylesheets = sheet_urls.uniq.compact
end

#tag_count ⇒ Hash

Count all tag types on the page

Returns:

  • (Hash)

    Counts of different HTML elements



221
222
223
224
225
226
227
228
229
# File 'lib/web_inspector/inspector.rb', line 221

# Count how often each tag occurs on the page.
#
# Tag names are lower-cased before counting. The returned hash has no
# default value, so tags that never occur look up as nil.
#
# @return [Hash] lower-cased tag name => number of elements
def tag_count
  counts = {}
  @page.css('*').each do |node|
    key = node.name.downcase
    counts[key] = counts.fetch(key, 0) + 1
  end
  counts
end

#title ⇒ Object



20
21
22
23
24
# File 'lib/web_inspector/inspector.rb', line 20

# Text of the page's <title> selection, with surrounding whitespace removed.
#
# @return [String, nil] the stripped title text, or nil when reading it
#   raises any StandardError
def title
  title_selection = @page.css('title')
  raw_text = title_selection.inner_text
  raw_text.strip
rescue StandardError
  nil
end