Class: Curl::Html

Inherits:
Object
  • Object
show all
Defined in:
lib/curly/curl/html.rb

Overview

Class for CURLing an HTML page

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, options = {}) ⇒ HTMLCurl

Create a new page object from a URL

Parameters:

  • url (String)

    The url

  • options (Hash) (defaults to: {})

    The options

Options Hash (options):

  • :browser (Symbol)

    the browser to use instead of curl (:chrome, :firefox)

  • :source (String)

    source provided instead of curl

  • :headers (Hash)

    headers to send in the request

  • :headers_only (Boolean)

    whether to return just response headers

  • :compressed (Boolean)

    expect compressed response

  • :clean (Boolean)

    clean whitespace from response

  • :fallback (Symbol)

    browser to fall back to if curl doesn’t work (:chrome, :firefox)

  • :ignore_local_links (Boolean)

    when collecting links, ignore local/relative links

  • :ignore_fragment_links (Boolean)

    when collecting links, ignore links that are just #fragments

  • :external_links_only (Boolean)

    only collect links outside of current site



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/curly/curl/html.rb', line 61

# Build a page object from a URL and an options hash.
#
# Nothing is fetched here: the constructor only records the settings and
# looks up the curl executable.
#
# @param url [String, nil] address of the page; when nil, options[:url] is used
# @param options [Hash] settings. Besides the documented keys, the code also
#   reads :url (see above) and :local_links_only.
def initialize(url, options = {})
  # An explicit url argument wins; options[:url] is only a fallback for nil.
  @url = url.nil? ? options[:url] : url
  @curl = TTY::Which.which('curl')

  # Options that have a default when absent (or falsy).
  @browser = options[:browser] || :none
  @headers = options[:headers] || {}

  # Options copied verbatim into the instance variable of the same name.
  %i[
    source headers_only compressed clean fallback
    ignore_local_links ignore_fragment_links external_links_only local_links_only
  ].each do |key|
    instance_variable_set(:"@#{key}", options[key])
  end
end

Instance Attribute Details

#bodyObject (readonly)

Returns the value of attribute body.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @body. #curl stores the re-encoded response body here and
# #parse stores the source it was given.
def body
  @body
end

#body_imagesObject (readonly)

Returns the value of attribute body_images.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @body_images, which #curl fills with the result of content_images.
def body_images
  @body_images
end

#body_links ⇒ Object (readonly)

Returns the value of attribute body_links.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @body_links, which #curl fills with the result of content_links.
def body_links
  @body_links
end

#browserObject

Returns the value of attribute browser.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @browser. The constructor sets it from options[:browser],
# defaulting to :none; #curl only takes the browser path when it is not :none.
def browser
  @browser
end

#cleanObject

Returns the value of attribute clean.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @clean (options[:clean]). When truthy, several methods call
# String#clean on the text they return.
def clean
  @clean
end

#codeObject (readonly)

Returns the value of attribute code.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @code, which #curl takes from the :code entry of the fetch result.
def code
  @code
end

#compressedObject

Returns the value of attribute compressed.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @compressed (options[:compressed]); #curl forwards it to curl_html.
def compressed
  @compressed
end

#descriptionObject (readonly)

Returns the value of attribute description.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @description. #curl sets it to the 'og:description' meta value,
# falling back to 'description', when meta data is present.
def description
  @description
end

#external_links_only ⇒ Object

Returns the value of attribute external_links_only.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @external_links_only, set by the constructor from
# options[:external_links_only].
def external_links_only
  @external_links_only
end

#fallbackObject

Returns the value of attribute fallback.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @fallback (options[:fallback]); #curl forwards it to curl_html
# when fetching by URL.
def fallback
  @fallback
end

#headObject (readonly)

Returns the value of attribute head.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @head. #curl only overwrites it when the fetch result carries a
# non-nil :head.
def head
  @head
end

#headersObject

Returns the value of attribute headers.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @headers. Starts as options[:headers] (or an empty hash) and is
# replaced by #curl with the :headers entry of the fetch result.
def headers
  @headers
end

#headers_onlyObject

Returns the value of attribute headers_only.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @headers_only (options[:headers_only]); #curl forwards it to curl_html.
def headers_only
  @headers_only
end

#ignore_fragment_links ⇒ Object

Returns the value of attribute ignore_fragment_links.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @ignore_fragment_links, set by the constructor from
# options[:ignore_fragment_links].
def ignore_fragment_links
  @ignore_fragment_links
end

#ignore_local_links ⇒ Object

Returns the value of attribute ignore_local_links.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @ignore_local_links, set by the constructor from
# options[:ignore_local_links].
def ignore_local_links
  @ignore_local_links
end

#links ⇒ Object (readonly)

Returns the value of attribute links.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @links, which #curl takes from the :links entry of the fetch
# result (exposed as :meta_links by #to_data).
def links
  @links
end

#local_links_only ⇒ Object

Returns the value of attribute local_links_only.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @local_links_only, set by the constructor from
# options[:local_links_only].
def local_links_only
  @local_links_only
end

#metaObject (readonly)

Returns the value of attribute meta.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @meta, which #curl takes from the :meta entry of the fetch
# result. May be nil; #curl and #to_s both guard against that.
def meta
  @meta
end

#settingsObject

Returns the value of attribute settings.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @settings.
# NOTE(review): none of the code visible on this page assigns @settings;
# confirm where (or whether) it is populated.
def settings
  @settings
end

#sourceObject

Returns the value of attribute source.



13
14
15
# File 'lib/curly/curl/html.rb', line 13

# Reader for @source. Starts as options[:source] and is overwritten by #curl
# with the :source entry of the fetch result.
def source
  @source
end

#titleObject (readonly)

Returns the value of attribute title.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @title. #curl sets it to the 'og:title' meta value, falling back
# to 'title', when meta data is present.
def title
  @title
end

#urlObject (readonly)

Returns the value of attribute url.



16
17
18
# File 'lib/curly/curl/html.rb', line 16

# Reader for @url. Set by the constructor (argument, else options[:url]) and
# overwritten by #curl with the :url entry of the fetch result.
def url
  @url
end

Instance Method Details

#curlObject

Curl a url, either with curl or Selenium based on browser settings



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/curly/curl/html.rb', line 95

# Fetch (or parse) the page and populate the instance from the result.
#
# Three paths, chosen in this order:
# 1. a URL plus a browser other than :none - render the page with
#    curl_dynamic_html and hand the rendered source to curl_html;
# 2. no URL but a supplied source - hand @source to curl_html, never falling
#    back to a browser;
# 3. otherwise - fetch @url with curl_html using the configured options.
#
# The result hash then overwrites @url, @code, @headers, @meta, @links, @body
# and @source; @head is kept when the result has none. @title and
# @description are derived from the meta data (Open Graph value first).
#
# @return the value assigned to @body_images (result of content_images)
def curl
  res = if @url && @browser && @browser != :none
          # Named `rendered`, not `source`: assigning a local called `source`
          # here would shadow the #source reader for the rest of the method
          # and make the check in the next branch always see nil.
          rendered = curl_dynamic_html
          curl_html(nil, source: rendered, headers: @headers)
        elsif @url.nil? && !@source.nil?
          curl_html(nil, source: @source, headers: @headers, headers_only: @headers_only,
                         compressed: @compressed, fallback: false)
        else
          curl_html(@url, headers: @headers, headers_only: @headers_only,
                          compressed: @compressed, fallback: @fallback)
        end
  @url = res[:url]
  @code = res[:code]
  @headers = res[:headers]
  @meta = res[:meta]
  @links = res[:links]
  @head = res[:head] unless res[:head].nil?
  @body = reencode(res[:body])
  @source = res[:source]
  @title = @meta['og:title'] || @meta['title'] unless @meta.nil?
  @description = @meta['og:description'] || @meta['description'] unless @meta.nil?
  @body_links = content_links
  @body_images = content_images
end

#execute(script, wait, element_id) ⇒ Object

Parameters:

  • script

    The script to run



142
143
144
# File 'lib/curly/curl/html.rb', line 142

# Run a script by delegating to run_js; all three arguments are passed
# through unchanged and run_js's result is returned.
#
# @param script the script to run
# @param wait forwarded to run_js (NOTE(review): presumably a wait time - confirm against run_js)
# @param element_id forwarded to run_js (NOTE(review): presumably an element to wait for - confirm)
def execute(script, wait, element_id)
  run_js(script, wait, element_id)
end

#extract(before, after, inclusive: false) ⇒ Array

Extract text between two regular expressions

Parameters:

  • before (String, Regexp)

    The before

  • after (String, Regexp)

    The after

Returns:

  • (Array)

    array of matches



154
155
156
157
158
159
160
161
162
163
# File 'lib/curly/curl/html.rb', line 154

# Extract the text found between two delimiters in the page body.
#
# String delimiters are matched literally. Regexp delimiters are embedded as
# objects (not via #source), so their own flags such as /i are honoured.
#
# @param before [String, Regexp] opening delimiter
# @param after [String, Regexp] closing delimiter
# @param inclusive [Boolean] when true, the delimiters are part of each match
# @return [Array<String>] one entry per match, passed through String#clean
#   when the :clean option is set
def extract(before, after, inclusive: false)
  before = Regexp.new(Regexp.escape(before)) unless before.is_a?(Regexp)
  after = Regexp.new(Regexp.escape(after)) unless after.is_a?(Regexp)

  rx = if inclusive
         /(#{before}.*?#{after})/m
       else
         begin
           /(?<=#{before})(.*?)(?=#{after})/m
         rescue RegexpError
           # Look-behind only accepts fixed-width patterns; for anything else
           # (e.g. /a+/) consume the opening delimiter instead.
           /#{before}(.*?)(?=#{after})/m
         end
       end
  @body.scan(rx).map { |r| @clean ? r[0].clean : r[0] }
end

#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ Hash, Array

Extract an array of tags or tag attributes

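If attribute is not given, each matching tag is returned as a hash of its tag name, source, attributes and content. Pass content: true to get only the tag contents.

Examples:

page.extract_tag('h1', content: true) => [Array of h1 tag contents]

page.extract_tag('img', 'src') => [Array of img src attributes]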

Parameters:

  • tag (String)

    The tag

  • attribute (String) (defaults to: nil)

    The attribute

  • source (Boolean) (defaults to: false)

    Return full tag source (negates attribute if true)

  • content (Boolean) (defaults to: false)

    Return only tag contents

Returns:

  • (Hash, Array)

    if source, return array of full tags, if content, return array of tag contents, otherwise, return a hash of tags including attributes and content



187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/curly/curl/html.rb', line 187

# Collect every occurrence of a tag and return it in the requested shape.
#
# Precedence of the switches: source, then content, then attribute.
#
# @param tag [String] tag name to look for
# @param attribute [String, nil] attribute whose values should be returned
# @param source [Boolean] return the raw tag markup as found by
#   extract_tag_contents (nothing else is computed)
# @param content [Boolean] return only the text captured after the first tag
# @return [Array] raw markup strings, content strings, attribute values, or
#   hashes with :tag, :source, :attrs and :content
def extract_tag(tag, attribute = nil, source: false, content: false)
  raw_tags = extract_tag_contents(tag, source: true)
  return raw_tags if source

  parsed = raw_tags.map do |markup|
    # name="value" / name='value' pairs; the quote must close with itself.
    attrs = markup.scan(/(\S+)=(['"])(.*?)\2/).each_with_object({}) do |(name, _quote, value), found|
      found[name] = value
    end

    # Text between the first tag and the next '<'.
    inner = markup.match(/<.*?>(?<content>.*?)</)
    text = inner && inner['content']
    text = text&.clean if @clean

    { tag: tag, source: markup, attrs: attrs, content: text }
  end

  if content
    parsed.map { |entry| entry[:content] }
  elsif attribute.nil?
    parsed
  else
    parsed.map { |entry| entry[:attrs][attribute] }
  end
end

#extract_tag_contents(tag, source: false) ⇒ Array

Extract tag contents or full tag source

Parameters:

  • tag

    The tag

  • source (Boolean) (defaults to: false)

    Return full tag instead of contents

Returns:

  • (Array)

    array of tag matches/contents



219
220
221
222
223
# File 'lib/curly/curl/html.rb', line 219

# Find every occurrence of a tag in the body.
#
# The tag name must be followed by whitespace, '/' or '>', so asking for 'a'
# no longer picks up <abbr> or <article>.
#
# @param tag the tag name (interpolated into the pattern as-is)
# @param source [Boolean] when true return the full markup: the opening tag
#   plus, if a closing tag follows on the same line, everything up to it
# @return [Array<String>] full tag markup, or the text between each opening
#   tag and the next '<'
def extract_tag_contents(tag, source: false)
  # Look-ahead that ends the tag name without consuming anything.
  name_end = '(?=[\s/>])'

  return @body.scan(%r{<#{tag}#{name_end}.*?>(?:.*?</#{tag}>)?}) if source

  @body.scan(/<#{tag}#{name_end}.*?>(.*?)</).map { |t| t[0] }
end

#h(level = '\d') ⇒ Array

Return all headers of given level

Parameters:

  • level (Number) (defaults to: '\d')

    The level (1-6)

Returns:

  • (Array)

    array of headers with text and all tag attributes as symbols



335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
# File 'lib/curly/curl/html.rb', line 335

# List the headlines found in the body.
#
# @param level heading level to look for; it is interpolated into the pattern,
#   so the default '\d' matches any single digit
# @return [Array<Hash>] one hash per headline holding :level, every quoted
#   attribute of the tag keyed by its symbolized name, and :text (run through
#   String#remove_entities)
def h(level = '\d')
  pattern = %r{<h(?<level>#{level})(?<tag> .*?)?>(?<text>.*?)</h#{level}>}i
  # Collect the MatchData objects first so the attribute scan below cannot
  # disturb the outer iteration.
  matches = @body.to_enum(:scan, pattern).map { Regexp.last_match }

  matches.map do |found|
    entry = { level: found['level'] }
    unless found['tag'].nil?
      found['tag'].scan(/(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/) do |name, _quote, value|
        entry[name.to_sym] = value
      end
    end
    entry[:text] = found['text'].remove_entities
    entry
  end
end

#images(types: :all) ⇒ Array

Get all images from the page

Returns:

  • (Array)

    Array of images, both from picture sources and img tags



250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
# File 'lib/curly/curl/html.rb', line 250

# Collect the images referenced by the page.
#
# For every requested type, Open Graph / Twitter images (from the meta data)
# are listed first, followed by <source> srcsets and <img> tags in the order
# returned by #tags.
#
# @param types [Symbol, Array<Symbol>] any of :all, :opengraph, :srcset, :img
# @return [Array<Hash>] entries of type 'opengraph' (:src), 'srcset' (:attrs,
#   :images => [{src:, media:}]) or 'img' (:src, :width, :height, :alt,
#   :title, :attrs); missing width/height are reported as 'unknown'
def images(types: :all)
  output = []
  types = [types] unless types.is_a?(Array)
  # types.map!(&:normalize_image_type)

  # The tag list does not depend on the type being processed; fetch it once,
  # and only if at least one type is requested.
  candidates = nil

  types.each do |type|
    if %i[all opengraph].include?(type)
      %w[og:image twitter:image].each do |src|
        # @meta may be nil (e.g. before a successful fetch); treat that as
        # "no meta images" instead of raising.
        next unless @meta&.key?(src)

        output << {
          type: 'opengraph',
          attrs: nil,
          src: @meta[src]
        }
      end
    end

    candidates ||= tags(%w[img source])
    candidates.each do |img|
      case img[:tag].downcase
      when /source/
        next unless %i[all srcset].include?(type)

        srcset = img[:attrs]['srcset']
        next if srcset.nil?

        # "a.png 1x, b.png 2x" -> one entry per comma-separated candidate.
        sources = srcset.split(/ *, */).map do |candidate|
          image, media = candidate.split(/ /)
          { src: image, media: media }
        end

        output << {
          type: 'srcset',
          attrs: img[:attrs],
          images: sources
        }
      when /img/
        next unless %i[all img].include?(type)

        attrs = img[:attrs]
        output << {
          type: 'img',
          src: attrs['src'],
          width: attrs['width'] || 'unknown',
          height: attrs['height'] || 'unknown',
          alt: attrs['alt'],
          title: attrs['title'],
          attrs: attrs
        }
      end
    end
  end
  output
end

#nokogiri_to_tag(el) ⇒ Object

Convert a nokogiri element to Curl::Html format

Parameters:

  • el (Nokogiri)

    element to convert



359
360
361
362
363
364
365
366
367
368
369
370
371
372
# File 'lib/curly/curl/html.rb', line 359

# Convert a Nokogiri element into the hash format used by this class.
#
# @param el element responding to name, attribute_nodes, to_html and text
# @return [Hash] :tag (element name), :source (its HTML), :attrs (attribute
#   name => value, with class and rel split on spaces into arrays), :content
#   (stripped text) and :tags (converted child elements)
def nokogiri_to_tag(el)
  attrs = el.attribute_nodes.to_h do |node|
    value = node.name =~ /^(class|rel)$/ ? node.value.split(/ /) : node.value
    [node.name, value]
  end

  markup = el.to_html
  text = el.text
  if @clean
    markup = markup&.strip&.clean
    text = text&.strip&.clean
  else
    # Without :clean the markup is left untouched; only the text is stripped.
    text = text.strip
  end

  { tag: el.name, source: markup, attrs: attrs, content: text, tags: recurse_children(el) }
end

#parse(source) ⇒ Hash

Parse raw HTML source instead of curling

Parameters:

  • source (String)

    The source

Returns:

  • (Hash)

    Hash of data after processing



86
87
88
89
90
# File 'lib/curly/curl/html.rb', line 86

# Use raw HTML as the page body instead of fetching it.
#
# Only @body is updated; every other value in the returned hash is whatever
# the instance currently holds.
#
# @param source [String] the HTML source
# @return [Hash] :url, :code, :headers, :meta, :links, :head, :body (the
#   source as given), :source (stripped), :body_links and :body_images
def parse(source)
  @body = source

  result = { url: @url, code: @code, headers: @headers, meta: @meta, links: @links, head: @head }
  result[:body] = source
  result[:source] = source.strip
  # Computed after @body is set.
  result[:body_links] = content_links
  result[:body_images] = content_images
  result
end

#recurse_children(element) ⇒ Object



374
375
376
377
378
379
380
381
382
# File 'lib/curly/curl/html.rb', line 374

# Convert the child elements of a node, skipping children named 'text'.
#
# @param element node responding to children
# @return [Array<Hash>] one nokogiri_to_tag hash per remaining child, in order
def recurse_children(element)
  element.children
         .reject { |child| child.name == 'text' }
         .map { |child| nokogiri_to_tag(child) }
end

#screenshot(destination = nil, type: :full_page, script: nil, id: nil, wait: 0) ⇒ Object

Save a screenshot of the url

Parameters:

  • urls (Array)

    The urls

  • destination (defaults to: nil)

    The file destination

  • browser

    The browser (:firefox, :chrome)

  • type (defaults to: :full_page)

    The type of screenshot to save (:full_page, :print_page, :visible)



131
132
133
134
135
# File 'lib/curly/curl/html.rb', line 131

# Save a screenshot by delegating to save_screenshot.
#
# @param destination the file destination (defaults to nil)
# @param type the kind of screenshot, passed on as type: (documented values:
#   :full_page, :print_page, :visible)
# @param script passed on as script: (defaults to nil)
# @param id passed on as id: (defaults to nil)
# @param wait passed on as wait_seconds: (defaults to 0)
# @return whatever save_screenshot returns
def screenshot(destination = nil, type: :full_page, script: nil, id: nil, wait: 0)
  # full_page = type.to_sym == :full_page
  # print_page = type.to_sym == :print_page
  save_screenshot(destination, type: type, script: script, id: id, wait_seconds: wait)
end

#search(path, source: @source, return_source: false) ⇒ Array


Perform a CSS query using Nokogiri

Parameters:

  • path (String)

    The CSS path

Returns:

  • (Array)

    array of matched elements



391
392
393
394
395
396
397
398
399
400
401
402
403
# File 'lib/curly/curl/html.rb', line 391

# Run a query against HTML using Nokogiri.
#
# @param path [String] the query handed to Nokogiri's search
# @param source HTML to search (defaults to @source)
# @param return_source [Boolean] when true, return the matches as one HTML
#   string instead of converting them
# @return [Array<Hash>, String] nokogiri_to_tag hashes for each match, or the
#   matches' HTML when return_source is set
def search(path, source: @source, return_source: false)
  matches = Nokogiri::HTML(source).search(path)
  return matches.to_html if return_source

  matches.map { |el| nokogiri_to_tag(el) }
end

#tags(tag = nil) ⇒ Array

Return all tags in body, or a specific tag

Parameters:

  • tag (String, Array) (defaults to: nil)

    The tag to return, can be an array

Returns:

  • (Array)

    Array of tags. If no tag is specified, a hierarchical array of all tags in the document is returned. If one or more tags are specified, return a flattened list in document order.



236
237
238
239
240
241
242
243
# File 'lib/curly/curl/html.rb', line 236

# Return all tags in the body, or only those with the given name(s).
#
# The filter argument is never modified, so callers may pass frozen or
# shared arrays.
#
# @param tag [String, Array<String>, nil] tag name(s) to keep, compared
#   case-insensitively; nil returns the unfiltered value
# @return [Array] with a filter, the entries of flatten_tags whose :tag
#   matches, in their original order
def tags(tag = nil)
  # NOTE(review): as shown here this is just @body; the page rendering looks
  # like it dropped a call wrapped around it - confirm against the real file.
  all_tags = (@body)
  return all_tags if tag.nil?

  wanted = (tag.is_a?(Array) ? tag : [tag]).map(&:downcase)
  flatten_tags(all_tags).select { |t| wanted.include?(t[:tag].downcase) }
end

#to_data(url: nil) ⇒ Hash

Convert self to a hash of data

Parameters:

  • url (String) (defaults to: nil)

    A base url to fall back to

Returns:

  • (Hash)

    a hash of data



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/curly/curl/html.rb', line 25

# Snapshot the page as a plain hash.
#
# @param url [String, nil] base url used only when the instance has none
# @return [Hash] :url, :code, :headers, :meta, :meta_links (@links), :head,
#   :body, :source, :title, :description, :links (@body_links) and :images
#   (@body_images); head, body and source are stripped and cleaned when the
#   :clean option is set
def to_data(url: nil)
  # Applied to the three raw-markup fields only.
  tidy = ->(text) { @clean ? text&.strip&.clean : text }

  {
    url: @url || url,
    code: @code,
    headers: @headers,
    meta: @meta,
    meta_links: @links,
    head: tidy.call(@head),
    body: tidy.call(@body),
    source: tidy.call(@source),
    title: @title,
    description: @description,
    links: @body_links,
    images: @body_images
  }
end

#to_sObject

String representation

Returns:

  • String representation of the object.



318
319
320
321
322
323
324
325
326
# File 'lib/curly/curl/html.rb', line 318

# One-line summary of the page: status code, url, title and description plus
# the number of headers, meta entries and links (0 for any that are nil).
#
# @return [String]
def to_s
  header_count = @headers&.count || 0
  meta_count = @meta&.count || 0
  link_count = @links&.count || 0

  identity = %(<HTMLCurl: @code="#{@code}" @url="#{@url}" @title="#{@title}")
  details = %(@description=#{@description} @headers:#{header_count} @meta:#{meta_count} @links:#{link_count}>)
  "#{identity} #{details}"
end