Class: Curl::Html

Inherits:

Object

Object
Curl::Html

show all

Defined in: lib/searchlink/curl/html.rb

Overview

Class for CURLing an HTML page

Instance Attribute Summary collapse

#body ⇒ Object readonly

Returns the value of attribute body.
#body_images ⇒ Object readonly

Returns the value of attribute body_images.
#body_links ⇒ Object readonly

Returns the value of attribute body_links.
#code ⇒ Object readonly

Returns the value of attribute code.
#description ⇒ Object readonly

Returns the value of attribute description.
#head ⇒ Object readonly

Returns the value of attribute head.
#headers ⇒ Object readonly

Returns the value of attribute headers.
#links ⇒ Object readonly

Returns the value of attribute links.
#meta ⇒ Object readonly

Returns the value of attribute meta.
#source ⇒ Object readonly

Returns the value of attribute source.
#title ⇒ Object readonly

Returns the value of attribute title.
#url ⇒ Object readonly

Returns the value of attribute url.

Instance Method Summary collapse

#extract(before, after) ⇒ Array

Extract text between two regular expressions.
#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ Hash, Array

Extract an array of tags or tag attributes.
#extract_tag_contents(tag, source: false) ⇒ Object

Extract tag contents or full tag source.
#h(level = '\d') ⇒ Array

Return all headers of given level.
#images ⇒ Array

Get all images from the page.
#initialize(url, headers: nil, headers_only: false, compressed: false) ⇒ HTMLCurl constructor

Create a new page object from a URL.
#tags(tag = nil) ⇒ Array

Return all tags in body, or a specific tag.
#to_s ⇒ Object

Constructor Details

#initialize(url, headers: nil, headers_only: false, compressed: false) ⇒ `HTMLCurl`

Create a new page object from a URL

Parameters:

url (String) —

The url
headers (Hash) (defaults to: nil) —

The headers to use in the curl call
headers_only (Boolean) (defaults to: false) —

Return headers only
compressed (Boolean) (defaults to: false) —

Expect compressed result

# File 'lib/searchlink/curl/html.rb', line 26

# Create a new page object from a URL
#
# Locates the curl binary, fetches the page through curl_html and copies the
# pieces of the returned result into instance variables.
#
# @param url [String] The url
# @param headers [Hash] The headers to use in the curl call
# @param headers_only [Boolean] Return headers only
# @param compressed [Boolean] Expect compressed result
#
# @return [HTMLCurl]
def initialize(url, headers: nil, headers_only: false, compressed: false)
  @curl = TTY::Which.which('curl')
  page = curl_html(url, headers: headers, headers_only: headers_only, compressed: compressed)

  @url = page[:url]
  @code = page[:code]
  @headers = page[:headers]
  @meta = page[:meta]
  @links = page[:links]

  # @head is only assigned when the result actually carries a head section.
  head = page[:head]
  @head = head unless head.nil?

  @body = reencode(page[:body])
  @source = page[:source]

  # Open Graph values take precedence over the plain meta values.
  unless @meta.nil?
    @title = @meta['og:title'] || @meta['title']
    @description = @meta['og:description'] || @meta['description']
  end

  # NOTE(review): these helpers presumably read the state assigned above, so
  # they are kept last — confirm against their definitions.
  @body_links = content_links
  @body_images = content_images
end

Instance Attribute Details

#body ⇒ `Object` (readonly)

Returns the value of attribute body.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the page body.
#
# @return [Object] the value the constructor stored in @body, i.e. the :body
#   entry of the curl result after it has been passed through reencode
def body
  @body
end

#body_images ⇒ `Object` (readonly)

Returns the value of attribute body_images.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the images found in the page body.
#
# @return [Object] the value the constructor stored in @body_images, which is
#   whatever content_images returned at construction time
def body_images
  @body_images
end

#body_links ⇒ `Object` (readonly)

Returns the value of attribute body_links.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the links found in the page body.
#
# @return [Object] the value the constructor stored in @body_links, which is
#   whatever content_links returned at construction time
def body_links
  @body_links
end

#code ⇒ `Object` (readonly)

Returns the value of attribute code.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the :code entry of the curl result.
#
# @return [Object] the value the constructor stored in @code
#   (presumably the HTTP response code — confirm against curl_html)
def code
  @code
end

#description ⇒ `Object` (readonly)

Returns the value of attribute description.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the page description.
#
# @return [Object] the 'og:description' meta value, falling back to the
#   'description' meta value; nil when the page had no meta data
def description
  @description
end

#head ⇒ `Object` (readonly)

Returns the value of attribute head.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the :head entry of the curl result.
#
# @return [Object] the value the constructor stored in @head; nil when the
#   curl result had no :head entry
def head
  @head
end

#headers ⇒ `Object` (readonly)

Returns the value of attribute headers.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the :headers entry of the curl result.
#
# @return [Object] the value the constructor stored in @headers
#   (presumably the response headers — confirm against curl_html)
def headers
  @headers
end

#links ⇒ `Object` (readonly)

Returns the value of attribute links.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the :links entry of the curl result.
#
# @return [Object] the value the constructor stored in @links
def links
  @links
end

#meta ⇒ `Object` (readonly)

Returns the value of attribute meta.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the :meta entry of the curl result.
#
# @return [Object] the value the constructor stored in @meta; it is looked up
#   with string keys such as 'og:title' elsewhere in this class and may be nil
def meta
  @meta
end

#source ⇒ `Object` (readonly)

Returns the value of attribute source.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the :source entry of the curl result.
#
# @return [Object] the value the constructor stored in @source
def source
  @source
end

#title ⇒ `Object` (readonly)

Returns the value of attribute title.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the page title.
#
# @return [Object] the 'og:title' meta value, falling back to the 'title'
#   meta value; nil when the page had no meta data
def title
  @title
end

#url ⇒ `Object` (readonly)

Returns the value of attribute url.



13
14
15

# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for the :url entry of the curl result.
#
# @return [Object] the value the constructor stored in @url, taken from the
#   curl result rather than from the url argument itself
def url
  @url
end

Instance Method Details

#extract(before, after) ⇒ `Array`

Extract text between two regular expressions

Parameters:

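before (String, Regexp) —

Pattern that precedes the text to extract; a String is matched literally
after (String, Regexp) —

Pattern that follows the text to extract; a String is matched literally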

Returns:

(Array) —

array with one entry per match; each entry is itself an array holding the text captured between the two patterns

# File 'lib/searchlink/curl/html.rb', line 51

# Extract text between two regular expressions
#
# @param before [String, Regexp] pattern that precedes the wanted text; a
#   String is escaped and matched literally
# @param after [String, Regexp] pattern that follows the wanted text; a
#   String is escaped and matched literally
#
# @return [Array<Array<String>>] one entry per match, as produced by
#   String#scan; each entry holds the text captured between the two patterns
#   (preceded by any capture groups the caller's own patterns contain)
def extract(before, after)
  # Interpolating a Regexp object (rather than its #source) embeds it as a
  # self-contained `(?flags:...)` group, so flags such as /i are kept and a
  # top-level alternation like /a|b/ cannot leak into the combined pattern.
  opening = before.is_a?(Regexp) ? before : Regexp.escape(before)
  closing = after.is_a?(Regexp) ? after : Regexp.escape(after)
  @body.scan(/#{opening}(.*?)#{closing}/)
end

#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ `Hash`, `Array`

Extract an array of tags or tag attributes

If neither attribute nor content: is given, each tag is returned as a hash holding its tag name, full source, attributes and content

Examples:

page.extract_tag('h1', content: true) => [Array of h1 tag contents]

page.extract_tag('img', 'src') => [Array of img src attributes]

Parameters:

tag (String) —

The tag
attribute (String) (defaults to: nil) —

The attribute
source (Boolean) (defaults to: false) —

Return full tag source (negates attribute if true)
content (Boolean) (defaults to: false) —

Return only tag contents

Returns:

(Hash, Array) —

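if source, return an array of full tags; if content, return an array of tag contents; if attribute is given, return an array of that attribute's values; otherwise, return an array of hashes, one per tag, each including the tag's source, attributes and content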

# File 'lib/searchlink/curl/html.rb', line 79

# Extract an array of tags or tag attributes
#
# Every matching tag is first fetched as full source through
# extract_tag_contents, then broken down into attributes and content.
#
# @param tag [String] The tag
# @param attribute [String] The attribute whose values should be returned
# @param source [Boolean] Return full tag source (negates attribute if true)
# @param content [Boolean] Return only tag contents
#
# @return [Array] full tag sources if source; tag contents if content; one
#   hash per tag (:tag, :source, :attrs, :content) when no attribute is
#   given; otherwise the value of attribute for each tag
def extract_tag(tag, attribute = nil, source: false, content: false)
  raw_tags = extract_tag_contents(tag, source: true)
  return raw_tags if source

  parsed = raw_tags.map do |markup|
    # Collect every name="value" / name='value' pair found in the tag source.
    attributes = {}
    markup.scan(/(\S+)=(['"])(.*?)\2/) { |name, _quote, value| attributes[name] = value }

    {
      tag: tag,
      source: markup,
      attrs: attributes,
      # Text between the opening tag and the next "<"; nil when there is none.
      content: markup[/<.*?>(?<content>.*?)</, 'content']
    }
  end

  if content
    parsed.map { |entry| entry[:content] }
  elsif attribute.nil?
    parsed
  else
    parsed.map { |entry| entry[:attrs][attribute] }
  end
end

#extract_tag_contents(tag, source: false) ⇒ `Object`

Extract tag contents or full tag source

Parameters:

tag —

The tag
source (Boolean) (defaults to: false) —

Return full tag instead of contents

# File 'lib/searchlink/curl/html.rb', line 110

# Extract tag contents or full tag source
#
# @param tag [String] The tag name; it is interpolated into the pattern
#   unescaped
# @param source [Boolean] Return full tag instead of contents
#
# @return [Array<String>] with source, each opening tag plus everything up
#   to its closing tag when one follows on the same line; otherwise the text
#   between each opening tag and the next "<"
def extract_tag_contents(tag, source: false)
  # The lookahead pins the end of the tag name, so asking for `a` no longer
  # also picks up <abbr>, <article>, <aside> and friends.
  opening = "<#{tag}(?=[\\s/>]).*?>"
  return @body.scan(%r{#{opening}(?:.*?</#{tag}>)?}) if source

  @body.scan(/#{opening}(.*?)</).map(&:first)
end

#h(level = '\d') ⇒ `Array`

Return all headers of given level

Parameters:

level (Number) (defaults to: '\d') —

The level (1-6)

Returns:

(Array) —

array of headers with text and all tag attributes as symbols

# File 'lib/searchlink/curl/html.rb', line 202

# Return all headers of given level
#
# @param level [Number] The level (1-6); the default '\d' matches any digit
#
# @return [Array<Hash>] one hash per headline holding :level, :text (passed
#   through remove_entities) and every attribute of the tag under a symbol key
def h(level = '\d')
  headline_pattern = %r{<h(?<level>#{level})(?<tag> .*?)?>(?<text>.*?)</h#{level}>}i
  attribute_pattern = /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/

  # Collect the MatchData objects first, then build the result from them.
  found = @body.to_enum(:scan, headline_pattern).map { Regexp.last_match }

  found.map do |match|
    entry = { level: match['level'] }
    # 'tag' is nil for a bare <hN> without attributes.
    match['tag']&.scan(attribute_pattern) { |name, _quote, value| entry[name.to_sym] = value }
    entry[:text] = match['text'].remove_entities
    entry
  end
end

#images ⇒ `Array`

Get all images from the page

Returns:

(Array) —

Array of images, both from picture sources and img tags

# File 'lib/searchlink/curl/html.rb', line 141

# Get all images from the page
#
# Collects the Open Graph / Twitter card images from the page meta data,
# followed by every <source> srcset and <img> returned by #tags.
#
# Assumes each tag hash carries :attrs as an array of { key:, value: }
# hashes, which is what the lookups below require.
#
# @return [Array<Hash>] Array of images, both from picture sources and img
#   tags. Entries are { type: 'opengraph', attrs: nil, src: },
#   { type: 'srcset', attrs:, images: [{ src:, media: }] } or
#   { type: 'img', src:, attrs: }
def images
  output = []

  # The constructor tolerates a page without meta data, so must this method.
  unless @meta.nil?
    %w[og:image twitter:image].each do |src|
      next unless @meta.key?(src)

      output << {
        type: 'opengraph',
        attrs: nil,
        src: @meta[src]
      }
    end
  end

  tags(%w[img source]).each do |img|
    attrs = Array(img[:attrs])

    case img[:tag].downcase
    when /source/
      srcsets = attrs.select { |a| a[:key] =~ /srcset/i }
      next if srcsets.empty?

      srcset = []
      srcsets.each do |set|
        set[:value].split(',').each do |candidate|
          # Whitespace-aware split: tolerates newlines, indentation and
          # repeated spaces between the URL and its descriptor.
          image, media = candidate.split
          next if image.nil?

          srcset << {
            src: image,
            media: media
          }
        end
      end
      output << {
        type: 'srcset',
        attrs: img[:attrs],
        images: srcset
      }
    when /img/
      # Prefer the real src attribute; fall back to the first src-like one
      # (e.g. data-src) and skip images that have neither.
      src_attr = attrs.find { |a| a[:key] =~ /\Asrc\z/i } || attrs.find { |a| a[:key] =~ /src/i }
      next if src_attr.nil?

      output << {
        type: 'img',
        src: src_attr[:value],
        attrs: img[:attrs]
      }
    end
  end

  output
end

#tags(tag = nil) ⇒ `Array`

Return all tags in body, or a specific tag

Parameters:

tag (String, Array) (defaults to: nil) —

The tag to return, can be an array

Returns:

(Array) —

Array of tags. If no tag is specified, a hierarchical array of all tags in the document is returned. If one or more tags are specified, return a flattened list in document order.

# File 'lib/searchlink/curl/html.rb', line 127

# Return all tags in body, or a specific tag
#
# @param tag [String, Array] The tag to return, can be an array; matching is
#   case-insensitive and the argument is left untouched
#
# @return [Array] Array of tags. If no tag is specified, the result of
#   content_tags for the body is returned as-is. If one or more tags are
#   specified, the flattened list is filtered down to those tags.
def tags(tag = nil)
  all_tags = content_tags(@body)
  return all_tags if tag.nil?

  # Build a fresh lowercase list instead of downcasing the caller's array in
  # place, which altered their data and raised on frozen arrays.
  wanted = (tag.is_a?(Array) ? tag : [tag]).map(&:downcase)
  flatten_tags(all_tags).select { |t| wanted.include?(t[:tag].downcase) }
end

#to_s ⇒ `Object`

# File 'lib/searchlink/curl/html.rb', line 185

def to_s
  headers = @headers.nil? ? 0 : @headers.count
  meta = @meta.nil? ? 0 : @meta.count
  links = @links.nil? ? 0 : @links.count
  [
    %(<HTMLCurl: @code="#{@code}" @url="#{@url}" @title="#{@title}"),
    %(@description=#{@description} @headers:#{headers} @meta:#{meta} @links:#{links}>)
  ].join(' ')
end

Class: Curl::Html

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, headers: nil, headers_only: false, compressed: false) ⇒ HTMLCurl

Instance Attribute Details

#body ⇒ Object (readonly)

#body_images ⇒ Object (readonly)

#body_links ⇒ Object (readonly)

#code ⇒ Object (readonly)

#description ⇒ Object (readonly)

#head ⇒ Object (readonly)

#headers ⇒ Object (readonly)

#links ⇒ Object (readonly)

#meta ⇒ Object (readonly)

#source ⇒ Object (readonly)

#title ⇒ Object (readonly)

#url ⇒ Object (readonly)

Instance Method Details

#extract(before, after) ⇒ Array

#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ Hash, Array

#extract_tag_contents(tag, source: false) ⇒ Object

#h(level = '\d') ⇒ Array

#images ⇒ Array

#tags(tag = nil) ⇒ Array

#to_s ⇒ Object

#initialize(url, headers: nil, headers_only: false, compressed: false) ⇒ `HTMLCurl`

#body ⇒ `Object` (readonly)

#body_images ⇒ `Object` (readonly)

#body_links ⇒ `Object` (readonly)

#code ⇒ `Object` (readonly)

#description ⇒ `Object` (readonly)

#head ⇒ `Object` (readonly)

#headers ⇒ `Object` (readonly)

#links ⇒ `Object` (readonly)

#meta ⇒ `Object` (readonly)

#source ⇒ `Object` (readonly)

#title ⇒ `Object` (readonly)

#url ⇒ `Object` (readonly)

#extract(before, after) ⇒ `Array`

#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ `Hash`, `Array`

#extract_tag_contents(tag, source: false) ⇒ `Object`

#h(level = '\d') ⇒ `Array`

#images ⇒ `Array`

#tags(tag = nil) ⇒ `Array`

#to_s ⇒ `Object`