Class: Curl::Html

Inherits:
Object
  • Object
show all
Defined in:
lib/searchlink/curl/html.rb

Overview

Class for CURLing an HTML page

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, headers: nil, headers_only: false, compressed: false) ⇒ HTMLCurl

Create a new page object from a URL

Parameters:

  • url (String)

    The url

  • headers (Hash) (defaults to: nil)

    The headers to use in the curl call

  • headers_only (Boolean) (defaults to: false)

    Return headers only

  • compressed (Boolean) (defaults to: false)

    Expect compressed result



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/searchlink/curl/html.rb', line 26

# Fetch +url+ via curl and cache the pieces of the response on the instance.
#
# @param url [String] the URL to fetch
# @param headers [Hash, nil] headers forwarded to the curl call
# @param headers_only [Boolean] forwarded to curl_html
# @param compressed [Boolean] forwarded to curl_html
def initialize(url, headers: nil, headers_only: false, compressed: false)
  @curl = TTY::Which.which('curl')
  page = curl_html(url, headers: headers, headers_only: headers_only, compressed: compressed)

  # Plain copies of the fields returned by curl_html.
  @url = page[:url]
  @code = page[:code]
  @headers = page[:headers]
  @meta = page[:meta]
  @links = page[:links]

  # @head is only assigned when the response actually carried one.
  head_section = page[:head]
  @head = head_section unless head_section.nil?

  @body = reencode(page[:body])
  @source = page[:source]

  # Open Graph values win over the plain meta values when both exist.
  unless @meta.nil?
    @title = @meta['og:title'] || @meta['title']
    @description = @meta['og:description'] || @meta['description']
  end

  @body_links = content_links
  @body_images = content_images
end

Instance Attribute Details

#body ⇒ Object (readonly)

Returns the value of attribute body.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @body, which #initialize sets to the result of
# reencode(res[:body]).
#
# @return [Object] the stored body
def body
  @body
end

#body_images ⇒ Object (readonly)

Returns the value of attribute body_images.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @body_images, which #initialize fills from
# content_images.
#
# @return [Object] the stored body images
def body_images
  @body_images
end

#body_links ⇒ Object (readonly)

Returns the value of attribute body_links.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @body_links, which #initialize fills from
# content_links.
#
# @return [Object] the stored body links
def body_links
  @body_links
end

#code ⇒ Object (readonly)

Returns the value of attribute code.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @code, copied from res[:code] in #initialize.
# NOTE(review): presumably the HTTP status of the response; confirm against curl_html.
#
# @return [Object] the stored code
def code
  @code
end

#description ⇒ Object (readonly)

Returns the value of attribute description.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @description. #initialize sets it to
# @meta['og:description'], falling back to @meta['description'], and leaves
# it unset when @meta is nil.
#
# @return [Object, nil] the stored description
def description
  @description
end

#head ⇒ Object (readonly)

Returns the value of attribute head.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @head. #initialize only assigns it when res[:head]
# is not nil, so this returns nil otherwise.
#
# @return [Object, nil] the stored head
def head
  @head
end

#headers ⇒ Object (readonly)

Returns the value of attribute headers.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @headers, copied from res[:headers] in #initialize.
#
# @return [Object] the stored headers
def headers
  @headers
end

#links ⇒ Object (readonly)

Returns the value of attribute links.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @links, copied from res[:links] in #initialize.
#
# @return [Object] the stored links
def links
  @links
end

#meta ⇒ Object (readonly)

Returns the value of attribute meta.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @meta, copied from res[:meta] in #initialize.
# It may be nil; #initialize guards its own lookups with a nil check.
#
# @return [Object, nil] the stored meta data
def meta
  @meta
end

#source ⇒ Object (readonly)

Returns the value of attribute source.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @source, copied from res[:source] in #initialize.
#
# @return [Object] the stored source
def source
  @source
end

#title ⇒ Object (readonly)

Returns the value of attribute title.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @title. #initialize sets it to @meta['og:title'],
# falling back to @meta['title'], and leaves it unset when @meta is nil.
#
# @return [Object, nil] the stored title
def title
  @title
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



13
14
15
# File 'lib/searchlink/curl/html.rb', line 13

# Read-only accessor for @url, copied from res[:url] in #initialize (the
# value reported by curl_html, not necessarily the argument passed in).
#
# @return [Object] the stored URL
def url
  @url
end

Instance Method Details

#extract(before, after) ⇒ Array

Extract text between two regular expressions

Parameters:

  • before (String, Regexp)

    The before

  • after (String, Regexp)

    The after

Returns:

  • (Array)

    array of matches; each match is itself a one-element array holding the text captured between the two patterns



51
52
53
54
55
# File 'lib/searchlink/curl/html.rb', line 51

# Extract the text found between two delimiters in the page body.
#
# String delimiters are matched literally. Regexp delimiters are embedded
# whole, so their own flags (such as /i) and any alternation inside them
# stay scoped to the delimiter.
#
# @param before [String, Regexp] pattern that precedes the wanted text
# @param after [String, Regexp] pattern that follows the wanted text
# @return [Array<Array<String>>] one entry per match, as produced by String#scan
def extract(before, after)
  # Interpolating the Regexp itself (not its #source) wraps it in a
  # (?flags:...) group, which keeps its options and isolates alternation.
  opening = before.is_a?(Regexp) ? before : Regexp.escape(before)
  closing = after.is_a?(Regexp) ? after : Regexp.escape(after)
  @body.scan(/#{opening}(.*?)#{closing}/)
end

#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ Hash, Array

Extract an array of tags or tag attributes

If attribute is not given, a hash describing each tag (tag, source, attrs, content) is returned; pass content: true to get only the tag contents

Examples:

page.extract_tag('h1') => [Array of h1 tag contents]

page.extract_tag('img', 'src') => [Array of img src attributes]

Parameters:

  • tag (String)

    The tag

  • attribute (String) (defaults to: nil)

    The attribute

  • source (Boolean) (defaults to: false)

    Return full tag source (negates attribute if true)

  • content (Boolean) (defaults to: false)

    Return only tag contents

Returns:

  • (Hash, Array)

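    if source, return an array of full tags; if content, return an array of tag contents; if attribute is given, return an array with that attribute's value for each tag; otherwise, return an array of hashes, one per tag, holding the tag name, its source, its attributes and its content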



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/searchlink/curl/html.rb', line 79

# Extract tags from the body as raw source, contents, attribute values or
# full descriptions.
#
# @param tag [String] tag name handed to extract_tag_contents
# @param attribute [String, nil] when given (and neither flag is set), return
#   this attribute's value for every tag
# @param source [Boolean] return the raw tag strings; takes precedence over
#   +attribute+ and +content+
# @param content [Boolean] return only each tag's content
# @return [Array] raw strings, contents, attribute values, or hashes with
#   :tag, :source, :attrs and :content
def extract_tag(tag, attribute = nil, source: false, content: false)
  raw_tags = extract_tag_contents(tag, source: true)
  return raw_tags if source

  parsed = raw_tags.map do |markup|
    # Collect every name="value" / name='value' pair found in the markup.
    attributes = {}
    markup.scan(/(\S+)=(['"])(.*?)\2/) { |name, _quote, value| attributes[name] = value }

    {
      tag: tag,
      source: markup,
      attrs: attributes,
      # Text between the first closing ">" and the next "<"; nil if absent.
      content: markup[/<.*?>(?<content>.*?)</, 'content']
    }
  end

  if content
    parsed.map { |entry| entry[:content] }
  elsif attribute.nil?
    parsed
  else
    parsed.map { |entry| entry[:attrs][attribute] }
  end
end

#extract_tag_contents(tag, source: false) ⇒ Object

Extract tag contents or full tag source

Parameters:

  • tag

    The tag

  • source (Boolean) (defaults to: false)

    Return full tag instead of contents



110
111
112
113
114
# File 'lib/searchlink/curl/html.rb', line 110

# Extract either the contents or the full source of every +tag+ in the body.
#
# The tag name must be followed by whitespace, "/" or ">", so asking for "a"
# no longer also matches "<abbr>" or "<article>".
#
# @param tag [String] tag name; interpolated into the pattern unescaped
# @param source [Boolean] return the full tag (opening tag plus, when found on
#   the same line, everything up to its closing tag) instead of the contents
# @return [Array<String>] tag sources, or the text between each opening tag
#   and the next "<"
def extract_tag_contents(tag, source: false)
  # The lookahead ends the tag name without consuming the delimiter.
  opening = "<#{tag}(?=[\\s/>]).*?>"
  return @body.scan(%r{#{opening}(?:.*?</#{tag}>)?}) if source

  @body.scan(/#{opening}(.*?)</).map(&:first)
end

#h(level = '\d') ⇒ Array

Return all headers of given level

Parameters:

  • level (Number) (defaults to: '\d')

    The level (1-6)

Returns:

  • (Array)

    array of headers with text and all tag attributes as symbols



202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/searchlink/curl/html.rb', line 202

# Collect the headlines (h1-h6) found in the body.
#
# @param level [Integer, String] headline level to look for; it is
#   interpolated into the pattern, so the default '\d' matches any digit
# @return [Array<Hash>] one hash per headline holding :level, each tag
#   attribute under its name as a symbol, and :text
def h(level = '\d')
  headline_pattern = %r{<h(?<level>#{level})(?<tag> .*?)?>(?<text>.*?)</h#{level}>}i
  attribute_pattern = /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/

  # Gather the MatchData objects first so the attribute scan below cannot
  # interfere with the headline scan.
  matches = @body.to_enum(:scan, headline_pattern).map { Regexp.last_match }

  matches.map do |found|
    entry = { level: found['level'] }
    # With named groups, scan yields the captures in order: attr, quot, content.
    found['tag']&.scan(attribute_pattern) { |name, _quote, value| entry[name.to_sym] = value }
    # remove_entities is a String extension defined outside this method.
    entry[:text] = found['text'].remove_entities
    entry
  end
end

#images ⇒ Array

Get all images from the page

Returns:

  • (Array)

    Array of images, both from picture sources and img tags



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/searchlink/curl/html.rb', line 141

# Collect every image referenced by the page.
#
# Sources, in order: the og:image / twitter:image meta values (when meta data
# is available), then every <source> tag carrying a srcset and every <img>
# tag returned by #tags.
#
# Each tag hash is read through :tag and :attrs, where :attrs is used as an
# array of { key:, value: } hashes.
#
# @return [Array<Hash>] entries typed 'opengraph' (:src), 'srcset' (:images,
#   a list of { src:, media: }) or 'img' (:src, nil when the tag has no
#   src-like attribute); all carry :attrs
def images
  output = []

  # @meta is nil when the response carried no meta data; skip it then.
  unless @meta.nil?
    %w[og:image twitter:image].each do |src|
      next unless @meta.key?(src)

      output << { type: 'opengraph', attrs: nil, src: @meta[src] }
    end
  end

  tags(%w[img source]).each do |img|
    attrs = img[:attrs]

    case img[:tag].downcase
    when /source/
      srcsets = attrs.select { |a| a[:key] =~ /srcset/i }
      next if srcsets.empty?

      # Each srcset is a comma-separated list of "url descriptor" pairs.
      candidates = srcsets.flat_map do |srcset|
        srcset[:value].split(/ *, */).map do |candidate|
          image, media = candidate.split(/ /)
          { src: image, media: media }
        end
      end
      output << { type: 'srcset', attrs: attrs, images: candidates }
    when /img/
      # Prefer the real src attribute; fall back to anything src-like
      # (srcset, data-src, ...) as before.
      src_attr = attrs.find { |a| a[:key] =~ /\Asrc\z/i } || attrs.find { |a| a[:key] =~ /src/i }
      output << { type: 'img', src: src_attr && src_attr[:value], attrs: attrs }
    end
  end

  output
end

#tags(tag = nil) ⇒ Array

Return all tags in body, or a specific tag

Parameters:

  • tag (String, Array) (defaults to: nil)

    The tag to return, can be an array

Returns:

  • (Array)

    Array of tags. If no tag is specified, a hierarchical array of all tags in the document is returned. If one or more tags are specified, return a flattened list in document order.



127
128
129
130
131
132
133
134
# File 'lib/searchlink/curl/html.rb', line 127

# Return the tags of the body, optionally limited to one or more tag names.
#
# @param tag [String, Array<String>, nil] tag name(s) to keep, compared
#   case-insensitively; nil returns everything
# @return [Object, Array<Hash>] the unfiltered value when +tag+ is nil,
#   otherwise the entries of flatten_tags whose :tag matches
def tags(tag = nil)
  # NOTE(review): as written this only aliases @body. The documented contract
  # ("hierarchical array of all tags") suggests a parsing call was lost in
  # this listing; confirm against lib/searchlink/curl/html.rb.
  tags = (@body)
  return tags if tag.nil?

  # Build a fresh list so the caller's array is never mutated (and frozen
  # arrays are accepted).
  wanted = Array(tag).map(&:downcase)
  flatten_tags(tags).select { |t| wanted.include?(t[:tag].downcase) }
end

#to_s ⇒ Object



185
186
187
188
189
190
191
192
193
# File 'lib/searchlink/curl/html.rb', line 185

# Summarise the page in a single line: code, URL, title and description,
# followed by how many headers, meta entries and links were collected.
#
# @return [String] inspect-style summary
def to_s
  # A missing collection counts as zero entries.
  size_of = ->(collection) { collection.nil? ? 0 : collection.count }
  header_total = size_of.call(@headers)
  meta_total = size_of.call(@meta)
  link_total = size_of.call(@links)

  identity = %(<HTMLCurl: @code="#{@code}" @url="#{@url}" @title="#{@title}")
  details = %(@description=#{@description} @headers:#{header_total} @meta:#{meta_total} @links:#{link_total}>)
  "#{identity} #{details}"
end