Class: WebInspector::Inspector
- Inherits:
-
Object
- Object
- WebInspector::Inspector
- Defined in:
- lib/web_inspector/inspector.rb
Instance Attribute Summary collapse
-
#host ⇒ Object
readonly
Returns the value of attribute host.
-
#meta ⇒ Object
readonly
Returns the value of attribute meta.
-
#page ⇒ Object
readonly
Returns the value of attribute page.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #body ⇒ Object
- #description ⇒ Object
-
#domain_images(user_domain, host = nil) ⇒ Array<String>
Get images from a specific domain.
-
#domain_links(user_domain, host = nil) ⇒ Array<String>
Get links from a specific domain.
-
#find(words) ⇒ Array<Hash>
Search for specific words in the page content.
-
#images ⇒ Array<String>
Get all images from the page.
-
#initialize(page) ⇒ Inspector
constructor
A new instance of Inspector.
-
#javascripts ⇒ Array<String>
Get all JavaScript files used by the page.
-
#language ⇒ String?
Detect the page language.
-
#links ⇒ Array<String>
Get all links from the page.
-
#microdata ⇒ Array<Hash>
Extract microdata from the page.
- #set_url(url, host) ⇒ Object
-
#structured_data ⇒ Array<Hash>
Extract structured data (JSON-LD) from the page.
-
#stylesheets ⇒ Array<String>
Get stylesheets used by the page.
-
#tag_count ⇒ Hash
Count all tag types on the page.
- #title ⇒ Object
Constructor Details
#initialize(page) ⇒ Inspector
Returns a new instance of Inspector.
9 10 11 12 13 |
# File 'lib/web_inspector/inspector.rb', line 9 def initialize(page) @page = page @meta = WebInspector::Meta.new(page).meta @base_url = nil end |
Instance Attribute Details
#host ⇒ Object (readonly)
Returns the value of attribute host.
7 8 9 |
# File 'lib/web_inspector/inspector.rb', line 7 def host @host end |
#meta ⇒ Object (readonly)
Returns the value of attribute meta.
7 8 9 |
# File 'lib/web_inspector/inspector.rb', line 7 def meta @meta end |
#page ⇒ Object (readonly)
Returns the value of attribute page.
7 8 9 |
# File 'lib/web_inspector/inspector.rb', line 7 def page @page end |
#url ⇒ Object (readonly)
Returns the value of attribute url.
7 8 9 |
# File 'lib/web_inspector/inspector.rb', line 7 def url @url end |
Instance Method Details
#body ⇒ Object
30 31 32 |
# File 'lib/web_inspector/inspector.rb', line 30 def body @page.css('body').to_html end |
#description ⇒ Object
26 27 28 |
# File 'lib/web_inspector/inspector.rb', line 26 def description @meta['description'] || @meta['og:description'] || snippet end |
#domain_images(user_domain, host = nil) ⇒ Array<String>
Get images from a specific domain
104 105 106 107 |
# File 'lib/web_inspector/inspector.rb', line 104 def domain_images(user_domain, host = nil) @host ||= host filter_by_domain(images, user_domain) end |
#domain_links(user_domain, host = nil) ⇒ Array<String>
Get links from a specific domain
72 73 74 75 |
# File 'lib/web_inspector/inspector.rb', line 72 def domain_links(user_domain, host = nil) @host ||= host filter_by_domain(links, user_domain) end |
#find(words) ⇒ Array<Hash>
Search for specific words in the page content
37 38 39 40 |
# File 'lib/web_inspector/inspector.rb', line 37 def find(words) text = @page.at('html').inner_text counter(text.downcase, words) end |
#images ⇒ Array<String>
Get all images from the page
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/web_inspector/inspector.rb', line 79 def images @images ||= begin images = [] @page.css('img').each do |img| src = img[:src] next unless src # Clean and normalize URL src = src.strip begin absolute_url = make_absolute_url(src) images << absolute_url if absolute_url rescue URI::InvalidURIError, URI::BadURIError # Skip invalid URLs end end images.uniq.compact end end |
#javascripts ⇒ Array<String>
Get all JavaScript files used by the page
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# File 'lib/web_inspector/inspector.rb', line 111 def javascripts @javascripts ||= begin scripts = [] @page.css('script[src]').each do |script| src = script[:src] next unless src # Clean and normalize URL src = src.strip begin absolute_url = make_absolute_url(src) scripts << absolute_url if absolute_url rescue URI::InvalidURIError, URI::BadURIError # Skip invalid URLs end end scripts.uniq.compact end end |
#language ⇒ String?
Detect the page language
157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/web_inspector/inspector.rb', line 157 def language # Check for html lang attribute first html_tag = @page.at('html') return html_tag['lang'] if html_tag && html_tag['lang'] && !html_tag['lang'].empty? # Then check for language meta tag meta_lang = @meta['content-language'] return meta_lang if meta_lang && !meta_lang.empty? # Fallback to inspecting content headers if available nil end |
#links ⇒ Array<String>
Get all links from the page
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/web_inspector/inspector.rb', line 44 def links @links ||= begin links = [] @page.css('a').each do |a| href = a[:href] next unless href # Skip javascript and mailto links next if href.start_with?('javascript:', 'mailto:', 'tel:') # Clean and normalize URL href = href.strip begin absolute_url = make_absolute_url(href) links << absolute_url if absolute_url rescue URI::InvalidURIError # Skip invalid URLs end end links.uniq end end |
#microdata ⇒ Array<Hash>
Extract microdata from the page
187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
# File 'lib/web_inspector/inspector.rb', line 187 def microdata @microdata ||= begin items = [] @page.css('[itemscope]').each do |scope| item = { type: scope['itemtype'] } properties = {} scope.css('[itemprop]').each do |prop| name = prop['itemprop'] # Extract value based on tag value = case prop.name.downcase when 'meta' prop['content'] when 'img', 'audio', 'embed', 'iframe', 'source', 'track', 'video' make_absolute_url(prop['src']) when 'a', 'area', 'link' make_absolute_url(prop['href']) when 'time' prop['datetime'] || prop.text.strip else prop.text.strip end properties[name] = value end item[:properties] = properties items << item end items end end |
#set_url(url, host) ⇒ Object
15 16 17 18 |
# File 'lib/web_inspector/inspector.rb', line 15 def set_url(url, host) @url = url @host = host end |
#structured_data ⇒ Array<Hash>
Extract structured data (JSON-LD) from the page
172 173 174 175 176 177 178 179 180 181 182 183 |
# File 'lib/web_inspector/inspector.rb', line 172 def structured_data @structured_data ||= begin data = [] @page.css('script[type="application/ld+json"]').each do |script| parsed = JSON.parse(script.text) data << parsed if parsed rescue JSON::ParserError # Skip invalid JSON end data end end |
#stylesheets ⇒ Array<String>
Get stylesheets used by the page
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/web_inspector/inspector.rb', line 134 def stylesheets @stylesheets ||= begin styles = [] @page.css('link[rel="stylesheet"]').each do |style| href = style[:href] next unless href # Clean and normalize URL href = href.strip begin absolute_url = make_absolute_url(href) styles << absolute_url if absolute_url rescue URI::InvalidURIError, URI::BadURIError # Skip invalid URLs end end styles.uniq.compact end end |
#tag_count ⇒ Hash
Count all tag types on the page
221 222 223 224 225 226 227 228 229 |
# File 'lib/web_inspector/inspector.rb', line 221 def tag_count tags = {} @page.css('*').each do |element| tag_name = element.name.downcase tags[tag_name] ||= 0 tags[tag_name] += 1 end tags end |
#title ⇒ Object
20 21 22 23 24 |
# File 'lib/web_inspector/inspector.rb', line 20 def title @page.css('title').inner_text.strip rescue StandardError nil end |