Class: Curl::Html
- Inherits:
-
Object
- Object
- Curl::Html
- Defined in:
- lib/curly/curl/html.rb
Overview
Class for CURLing an HTML page
Instance Attribute Summary collapse
-
#body ⇒ Object
readonly
Returns the value of attribute body.
-
#body_images ⇒ Object
readonly
Returns the value of attribute body_images.
-
#body_links ⇒ Object
readonly
Returns the value of attribute body_links.
-
#browser ⇒ Object
Returns the value of attribute browser.
-
#clean ⇒ Object
Returns the value of attribute clean.
-
#code ⇒ Object
readonly
Returns the value of attribute code.
-
#compressed ⇒ Object
Returns the value of attribute compressed.
-
#description ⇒ Object
readonly
Returns the value of attribute description.
-
#external_links_only ⇒ Object
Returns the value of attribute external_links_only.
-
#fallback ⇒ Object
Returns the value of attribute fallback.
-
#head ⇒ Object
readonly
Returns the value of attribute head.
-
#headers ⇒ Object
Returns the value of attribute headers.
-
#headers_only ⇒ Object
Returns the value of attribute headers_only.
-
#ignore_fragment_links ⇒ Object
Returns the value of attribute ignore_fragment_links.
-
#ignore_local_links ⇒ Object
Returns the value of attribute ignore_local_links.
-
#links ⇒ Object
readonly
Returns the value of attribute links.
-
#local_links_only ⇒ Object
Returns the value of attribute local_links_only.
-
#meta ⇒ Object
readonly
Returns the value of attribute meta.
-
#settings ⇒ Object
Returns the value of attribute settings.
-
#source ⇒ Object
Returns the value of attribute source.
-
#title ⇒ Object
readonly
Returns the value of attribute title.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
-
#curl ⇒ Object
Curl a url, either with curl or Selenium based on browser settings.
- #execute(script, wait, element_id) ⇒ Object
-
#extract(before, after, inclusive: false) ⇒ Array
Extract text between two regular expressions.
-
#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ Hash, Array
Extract an array of tags or tag attributes.
-
#extract_tag_contents(tag, source: false) ⇒ Array
Extract tag contents or full tag source.
-
#h(level = '\d') ⇒ Array
Return all headlines (h1–h6 tags) of the given level.
-
#images(types: :all) ⇒ Array
Get all images from the page.
-
#initialize(url, options = {}) ⇒ HTMLCurl
constructor
Create a new page object from a URL.
-
#nokogiri_to_tag(el) ⇒ Object
Convert a nokogiri element to Curl::Html format.
-
#parse(source) ⇒ Hash
Parse raw HTML source instead of curling.
- #recurse_children(element) ⇒ Object
-
#screenshot(destination = nil, type: :full_page, script: nil, id: nil, wait: 0) ⇒ Object
Save a screenshot of the url.
-
#search(path, source: @source, return_source: false) ⇒ Array
Perform a CSS query using Nokogiri.
-
#tags(tag = nil) ⇒ Array
Return all tags in body, or a specific tag.
-
#to_data(url: nil) ⇒ Hash
Convert self to a hash of data.
-
#to_s ⇒ Object
String representation.
Constructor Details
#initialize(url, options = {}) ⇒ HTMLCurl
Create a new page object from a URL
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/curly/curl/html.rb', line 61 def initialize(url, options = {}) @browser = options[:browser] || :none @source = options[:source] @headers = options[:headers] || {} @headers_only = options[:headers_only] @compressed = options[:compressed] @clean = options[:clean] @fallback = options[:fallback] @ignore_local_links = options[:ignore_local_links] @ignore_fragment_links = options[:ignore_fragment_links] @external_links_only = options[:external_links_only] @local_links_only = options[:local_links_only] @curl = TTY::Which.which('curl') @url = url.nil? ? options[:url] : url end |
Instance Attribute Details
#body ⇒ Object (readonly)
Returns the value of attribute body.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def body @body end |
#body_images ⇒ Object (readonly)
Returns the value of attribute body_images.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def body_images @body_images end |
#body_links ⇒ Object (readonly)
Returns the value of attribute body_links.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def body_links @body_links end |
#browser ⇒ Object
Returns the value of attribute browser.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def browser @browser end |
#clean ⇒ Object
Returns the value of attribute clean.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def clean @clean end |
#code ⇒ Object (readonly)
Returns the value of attribute code.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def code @code end |
#compressed ⇒ Object
Returns the value of attribute compressed.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def compressed @compressed end |
#description ⇒ Object (readonly)
Returns the value of attribute description.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def description @description end |
#external_links_only ⇒ Object
Returns the value of attribute external_links_only.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def external_links_only @external_links_only end |
#fallback ⇒ Object
Returns the value of attribute fallback.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def fallback @fallback end |
#head ⇒ Object (readonly)
Returns the value of attribute head.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def head @head end |
#headers ⇒ Object
Returns the value of attribute headers.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def headers @headers end |
#headers_only ⇒ Object
Returns the value of attribute headers_only.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def headers_only @headers_only end |
#ignore_fragment_links ⇒ Object
Returns the value of attribute ignore_fragment_links.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def ignore_fragment_links @ignore_fragment_links end |
#ignore_local_links ⇒ Object
Returns the value of attribute ignore_local_links.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def ignore_local_links @ignore_local_links end |
#links ⇒ Object (readonly)
Returns the value of attribute links.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def links @links end |
#local_links_only ⇒ Object
Returns the value of attribute local_links_only.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def local_links_only @local_links_only end |
#meta ⇒ Object (readonly)
Returns the value of attribute meta.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def meta @meta end |
#settings ⇒ Object
Returns the value of attribute settings.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def settings @settings end |
#source ⇒ Object
Returns the value of attribute source.
13 14 15 |
# File 'lib/curly/curl/html.rb', line 13 def source @source end |
#title ⇒ Object (readonly)
Returns the value of attribute title.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def title @title end |
#url ⇒ Object (readonly)
Returns the value of attribute url.
16 17 18 |
# File 'lib/curly/curl/html.rb', line 16 def url @url end |
Instance Method Details
#curl ⇒ Object
Curl a url, either with curl or Selenium based on browser settings
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
# File 'lib/curly/curl/html.rb', line 95 def curl res = if @url && @browser && @browser != :none source = curl_dynamic_html curl_html(nil, source: source, headers: @headers) elsif url.nil? && !source.nil? curl_html(nil, source: @source, headers: @headers, headers_only: @headers_only, compressed: @compressed, fallback: false) else curl_html(@url, headers: @headers, headers_only: @headers_only, compressed: @compressed, fallback: @fallback) end @url = res[:url] @code = res[:code] @headers = res[:headers] @meta = res[:meta] @links = res[:links] @head = res[:head] unless res[:head].nil? @body = reencode(res[:body]) @source = res[:source] @title = @meta['og:title'] || @meta['title'] unless @meta.nil? @description = @meta['og:description'] || @meta['description'] unless @meta.nil? @body_links = content_links @body_images = content_images end |
#execute(script, wait, element_id) ⇒ Object
142 143 144 |
# File 'lib/curly/curl/html.rb', line 142 def execute(script, wait, element_id) run_js(script, wait, element_id) end |
#extract(before, after, inclusive: false) ⇒ Array
Extract text between two regular expressions
154 155 156 157 158 159 160 161 162 163 |
# File 'lib/curly/curl/html.rb', line 154 def extract(before, after, inclusive: false) before = /#{Regexp.escape(before)}/ unless before.is_a?(Regexp) after = /#{Regexp.escape(after)}/ unless after.is_a?(Regexp) rx = if inclusive /(#{before.source}.*?#{after.source})/m else /(?<=#{before.source})(.*?)(?=#{after.source})/m end @body.scan(rx).map { |r| @clean ? r[0].clean : r[0] } end |
#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ Hash, Array
Extract an array of tags or tag attributes
If attribute is not given, tag contents will be returned
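Examples:
page.extract_tag('h1') => [Array of h1 tag contents]
page.extract_tag('img', 'src') => [Array of img src attributes]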
187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
# File 'lib/curly/curl/html.rb', line 187 def extract_tag(tag, attribute = nil, source: false, content: false) res = extract_tag_contents(tag, source: true) return res if source res.map! do |tag_source| m = tag_source.to_enum(:scan, /(\S+)=(['"])(.*?)\2/).map { Regexp.last_match } attrs = m.each_with_object({}) { |at, a| a[at[1]] = at[3] } tags = tag_source.match(/<.*?>(?<content>.*?)</) contents = tags.nil? ? nil : tags['content'] { tag: tag, source: tag_source, attrs: attrs, content: @clean ? contents&.clean : contents } end return res.map { |r| r[:content] } if content return res if attribute.nil? res.map { |r| r[:attrs][attribute] } end |
#extract_tag_contents(tag, source: false) ⇒ Array
Extract tag contents or full tag source
219 220 221 222 223 |
# File 'lib/curly/curl/html.rb', line 219 def extract_tag_contents(tag, source: false) return @body.scan(%r{<#{tag}.*?>(?:.*?</#{tag}>)?}) if source @body.scan(/<#{tag}.*?>(.*?)</).map { |t| t[0] } end |
#h(level = '\d') ⇒ Array
Return all headlines (h1–h6 tags) of the given level
335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 |
# File 'lib/curly/curl/html.rb', line 335 def h(level = '\d') res = [] headlines = @body.to_enum(:scan, %r{<h(?<level>#{level})(?<tag> .*?)?>(?<text>.*?)</h#{level}>}i).map do Regexp.last_match end headlines.each do |m| headline = { level: m['level'] } if m['tag'].nil? attrs = nil else attrs = m['tag'].to_enum(:scan, /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/).map { Regexp.last_match } attrs.each { |a| headline[a['attr'].to_sym] = a['content'] } end headline[:text] = m['text'].remove_entities res << headline end res end |
#images(types: :all) ⇒ Array
Get all images from the page
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 |
# File 'lib/curly/curl/html.rb', line 250 def images(types: :all) output = [] types = [types] unless types.is_a?(Array) # types.map!(&:normalize_image_type) types.each do |type| if %i[all opengraph].include?(type) %w[og:image twitter:image].each do |src| next unless @meta.key?(src) output << { type: 'opengraph', attrs: nil, src: @meta[src] } end end images = tags(%w[img source]) images.each do |img| case img[:tag].downcase when /source/ next unless %i[all srcset].include?(type) srcsets = img[:attrs].filter { |k| k == 'srcset' } if srcsets.count.positive? srcset = [] srcsets.each do |k, v| v.split(/ *, */).each do |s| image, media = s.split(/ /) srcset << { src: image, media: media } end end output << { type: 'srcset', attrs: img[:attrs], images: srcset } end when /img/ next unless %i[all img].include?(type) width = img[:attrs]['width'] height = img[:attrs]['height'] alt = img[:attrs]['alt'] title = img[:attrs]['title'] output << { type: 'img', src: img[:attrs]['src'], width: width || 'unknown', height: height || 'unknown', alt: alt, title: title, attrs: img[:attrs] } end end end output end |
#nokogiri_to_tag(el) ⇒ Object
Convert a nokogiri element to Curl::Html format
359 360 361 362 363 364 365 366 367 368 369 370 371 372 |
# File 'lib/curly/curl/html.rb', line 359 def nokogiri_to_tag(el) attributes = {} attributes = el.attribute_nodes.each_with_object({}) do |a, hsh| hsh[a.name] = a.name =~ /^(class|rel)$/ ? a.value.split(/ /) : a.value end { tag: el.name, source: @clean ? el.to_html&.strip&.clean : el.to_html, attrs: attributes, content: @clean ? el.text&.strip&.clean : el.text.strip, tags: recurse_children(el) } end |
#parse(source) ⇒ Hash
Parse raw HTML source instead of curling
86 87 88 89 90 |
# File 'lib/curly/curl/html.rb', line 86 def parse(source) @body = source { url: @url, code: @code, headers: @headers, meta: @meta, links: @links, head: @head, body: source, source: source.strip, body_links: content_links, body_images: content_images } end |
#recurse_children(element) ⇒ Object
374 375 376 377 378 379 380 381 382 |
# File 'lib/curly/curl/html.rb', line 374 def recurse_children(element) children = [] element.children.each do |child| next if child.name == 'text' children.push(nokogiri_to_tag(child)) end children end |
#screenshot(destination = nil, type: :full_page, script: nil, id: nil, wait: 0) ⇒ Object
Save a screenshot of the url
131 132 133 134 135 |
# File 'lib/curly/curl/html.rb', line 131 def screenshot(destination = nil, type: :full_page, script: nil, id: nil, wait: 0) # full_page = type.to_sym == :full_page # print_page = type.to_sym == :print_page save_screenshot(destination, type: type, script: script, id: id, wait_seconds: wait) end |
#search(path, source: @source, return_source: false) ⇒ Array
Perform a CSS query using Nokogiri
391 392 393 394 395 396 397 398 399 400 401 402 403 |
# File 'lib/curly/curl/html.rb', line 391 def search(path, source: @source, return_source: false) doc = Nokogiri::HTML(source) output = [] if return_source output = doc.search(path).to_html else doc.search(path).each do |el| out = nokogiri_to_tag(el) output.push(out) end end output end |
#tags(tag = nil) ⇒ Array
Return all tags in body, or a specific tag
236 237 238 239 240 241 242 243 |
# File 'lib/curly/curl/html.rb', line 236 def tags(tag = nil) tags = content_tags(@body) return tags if tag.nil? tag = [tag] unless tag.is_a?(Array) tag.map!(&:downcase) flatten_tags(tags).dup.delete_if { |t| !tag.include?(t[:tag].downcase) } end |
#to_data(url: nil) ⇒ Hash
Convert self to a hash of data
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/curly/curl/html.rb', line 25 def to_data(url: nil) { url: @url || url, code: @code, headers: @headers, meta: @meta, meta_links: @links, head: @clean ? @head&.strip&.clean : @head, body: @clean ? @body&.strip&.clean : @body, source: @clean ? @source&.strip&.clean : @source, title: @title, description: @description, links: @body_links, images: @body_images } end |
#to_s ⇒ Object
String representation
318 319 320 321 322 323 324 325 326 |
# File 'lib/curly/curl/html.rb', line 318 def to_s headers = @headers.nil? ? 0 : @headers.count meta = @meta.nil? ? 0 : @meta.count links = @links.nil? ? 0 : @links.count [ %(<HTMLCurl: @code="#{@code}" @url="#{@url}" @title="#{@title}"), %(@description=#{@description} @headers:#{headers} @meta:#{meta} @links:#{links}>) ].join(' ') end |