Class: HRefPreview::Preview

Inherits:

Object

Object
HRefPreview::Preview

show all

Defined in: lib/href_preview/preview.rb

Constant Summary collapse

SANITIZE_OPTIONS =

options

Instance Attribute Summary collapse

#response ⇒ Object readonly

Returns the value of attribute response.

Instance Method Summary collapse

#article_html ⇒ Object
#article_node ⇒ Object
#article_text ⇒ Object
#canonical_uri ⇒ Object
#charset ⇒ String

Returns the charset declared in the HTTP headers or HTML meta tags.
#description ⇒ Object
#dom ⇒ Nokogiri::HTML::Document

The DOM for the response body.
#image_uri ⇒ Object
#images ⇒ Object
#initialize(response, connection = DEFAULT_CONNECTION) ⇒ Preview constructor

Initializes a `Preview` from an HTTP response.
#inspect ⇒ Object
#is_html? ⇒ true, false

Returns true if the response had a 2xx HTTP code and the mime type is either HTML or XHTML.
#item_type ⇒ Object
#language ⇒ Object
#mime_type ⇒ MIME::Type

Returns the MIME type declared in the HTTP headers or HTML meta tags.
#published ⇒ Object
#shortlink_uri ⇒ Object
#site_name ⇒ Object
#title ⇒ String

The title of the page.
#twitter ⇒ String

The Twitter handle used by the site.
#updated ⇒ Object

Constructor Details

#initialize(response, connection = DEFAULT_CONNECTION) ⇒ `Preview`

Initializes a `Preview` from an HTTP response.

# File 'lib/href_preview/preview.rb', line 28

# Wraps an already-fetched HTTP response so its metadata can be inspected.
#
# @param response [Object] the HTTP response; sibling methods read its
#   `status`, `headers`, `body` and `env.url`
# @param connection [Object] stored in `@connection`; falls back to
#   DEFAULT_CONNECTION when omitted
def initialize(response, connection = DEFAULT_CONNECTION)
  @response   = response
  @connection = connection
end

Instance Attribute Details

#response ⇒ `Object` (readonly)

Returns the value of attribute response.



33
34
35

# File 'lib/href_preview/preview.rb', line 33

# Reader for the HTTP response this object was constructed with.
#
# @return [Object] the value held in `@response`
def response
  @response
end

Instance Method Details

#article_html ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 342

# Returns cleaned-up HTML for the article body.
#
# The children of {#article_node} are serialized (children whose `class`
# attribute is exactly one of the inline share / related-link widget classes
# are skipped), run through `Sanitize.clean` with SANITIZE_OPTIONS, have
# their whitespace normalized, and are then re-parsed so `script` elements
# and empty elements can be removed.
#
# An element counts as empty when it has no attributes, all of its children
# are text nodes, and its text is blank. The blank test uses `strip.empty?`
# rather than a `^\s*$` match: `^`/`$` are line anchors in Ruby, so the old
# pattern also matched any blank line *inside* real content and deleted it.
#
# @return [String, nil] the cleaned markup, or nil when the response is not
#   HTML or no article node was found (a nil result is not memoized)
def article_html
  @article_html ||= begin
    html = nil
    if is_html? && article_node
      widget_classes = [
        'related_links_inline',
        'inline-share-btn-label',
        'inline-share-btn'
      ]
      kept = article_node.children.reject do |child|
        class_attr = child.attribute('class')
        class_attr && widget_classes.include?(class_attr.value)
      end
      html = kept.map(&:to_s).join('')
    end

    if html
      html = Sanitize.clean(html, SANITIZE_OPTIONS)
      html.gsub!("\r\n", "\n")
      html.gsub!("\t", "  ")
      html.gsub!(/ *\n */, "\n")
      html.gsub!(/\n\n+/, "\n\n")
      html.gsub!(/<p>\n+/, "<p>\n")
      html.gsub!(/\n+<\/p>/, "\n</p>")
      html.gsub!(/<\/p>\n+/, "</p>\n")
      html.strip!

      # Excise script elements and empty elements, children before parents so
      # that a wrapper left empty by the pruning is removed as well.
      fragment = Nokogiri::HTML.fragment(html)
      excise_empty = lambda do |node|
        if node.respond_to?(:name) && node.name == "script"
          node.unlink
        else
          node.children.each do |child|
            excise_empty.call(child) if child.element?
          end
          if node.respond_to?(:attribute_nodes) && node.respond_to?(:text)
            if node.attribute_nodes.size == 0 && node.text.to_s.strip.empty? &&
                node.children.all? { |child| child.text? }
              node.unlink
            end
          end
        end
      end
      excise_empty.call(fragment)
      html = fragment.to_s
    end

    html
  end
end

#article_node ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 297

# Locates the element that holds the article body.
#
# A fixed list of XPath and CSS lookups is tried in order against {#dom}; a
# lookup only counts when it matches exactly one node, and the first such
# node is returned.
#
# @return [Object, nil] the matched node, or nil when the response is not
#   HTML or no lookup matched exactly one node (nil is not memoized)
def article_node
  @article_node ||= begin
    if is_html?
      lookups = [
        [:xpath, "/html[@itemtype='http://schema.org/NewsArticle']//article[@id='story']"],
        [:xpath, "//*/*[@itemtype='http://schema.org/NewsArticle']"],
        [:xpath, "//*/*[@itemprop='articleBody']"],
        [:css,   "article div.article-entry"],
        [:css,   "article.post div.entry-content"],
        [:css,   "div.post div.postBody"],
        [:css,   ".pg_story div#leftcolumn div.body"]
      ]

      located = nil
      lookups.each do |finder, selector|
        matches = dom.public_send(finder, selector)
        next unless matches.size == 1
        located = matches.first
        break if located
      end
      located
    end
  end
end

#article_text ⇒ `Object`



392
393
394

# File 'lib/href_preview/preview.rb', line 392

# Returns {#article_html} passed through `Sanitize.clean` with no extra
# options.
#
# @return [Object, nil] the result of `Sanitize.clean(article_html)`, or nil
#   when the response is not HTML
def article_text
  return @article_text if @article_text
  return nil unless is_html?

  @article_text = Sanitize.clean(article_html)
end

#canonical_uri ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 203

# Determines the canonical URI of the page.
#
# For HTML responses the `<link rel="canonical">` href is preferred, then the
# `og:url` meta content; empty values are ignored. In every other case the
# URL of the response itself (`response.env.url`) is used.
#
# @return [Addressable::URI] the parsed canonical URI
def canonical_uri
  @canonical_uri ||= begin
    declared = nil
    if is_html?
      [
        "//*/link[@rel='canonical']/@href",
        "//*/meta[@property='og:url']/@content"
      ].each do |query|
        attr = dom.xpath(query).first
        next unless attr && attr.value && attr.value != ''
        declared = Addressable::URI.parse(attr.value)
        break if declared
      end
    end
    declared || Addressable::URI.parse(response.env.url.to_s)
  end
end

#charset ⇒ `String`

Returns the charset declared in the HTTP headers or HTML meta tags.

# File 'lib/href_preview/preview.rb', line 59

# Returns the charset declared in the HTTP headers or HTML meta tags.
#
# Sources, in order of preference:
# 1. the `charset=` parameter of the `Content-Type` response header,
# 2. the `charset=` parameter of `<meta http-equiv="Content-Type">`,
# 3. the `charset` attribute of any `<meta>` element.
#
# The fallbacks are joined with `||`. The earlier `x = a or b` form parsed as
# `(x = a) or b`, so the meta-tag results were computed and then discarded.
#
# @return [String, nil] the charset with surrounding whitespace stripped, or
#   nil when none is declared (nil is not memoized)
def charset
  @charset ||= begin
    charset_param = /;\s*charset=([^;,]*)/
    declared =
      response.headers['Content-Type'].to_s[charset_param, 1] ||
      begin
        node = dom.xpath("//*/meta[@http-equiv='Content-Type']/@content").first
        node.value.to_s[charset_param, 1] if node
      end ||
      begin
        node = dom.xpath("//*/meta/@charset").first
        node.value if node
      end
    declared.strip if declared
  end
end

#description ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 162

# Returns the page description taken from its meta tags.
#
# The `content` of the first of these `<meta>` elements that is present is
# used: `og:description`, `dc.description`, `itemprop="description"`,
# `description`, `dcterms.abstract`, `twitter:description`,
# `sailthru.description`.
#
# The lookups run in a loop that stops at the first hit. The earlier
# `description = a or b or ...` form parsed as `(description = a) or ...`,
# so every source after `og:description` was ignored.
#
# @return [String, nil] the description with literal `&nbsp;` replaced by a
#   space and surrounding whitespace stripped; nil when the response is not
#   HTML or no source is present (nil is not memoized)
def description
  @description ||= begin
    if is_html?
      queries = [
        "//*/meta[@property='og:description']/@content",
        "//*/meta[@name='dc.description']/@content",
        "//*/meta[@itemprop='description']/@content",
        "//*/meta[@name='description']/@content",
        "//*/meta[@name='dcterms.abstract']/@content",
        # The last two are unlikely to ever be reached.
        "//*/meta[@name='twitter:description']/@content",
        "//*/meta[@name='sailthru.description']/@content"
      ]

      found = nil
      queries.each do |query|
        node = dom.xpath(query).first
        found = node.value if node
        break if found
      end

      found.gsub(/&nbsp;/, ' ').strip if found
    end
  end
end

#dom ⇒ `Nokogiri::HTML::Document`

The DOM for the response body.



111
112
113

# File 'lib/href_preview/preview.rb', line 111

# The DOM for the response body, parsed on first use and then cached.
#
# @return [Nokogiri::HTML::Document] the result of
#   `Nokogiri::HTML(response.body)`
def dom
  return @dom if @dom

  @dom = Nokogiri::HTML(response.body)
end

#image_uri ⇒ `Object`



232
233
234

# File 'lib/href_preview/preview.rb', line 232

# Returns the URI of the first entry of {#images}.
#
# @return [Addressable::URI, nil] the parsed `uri` of the first image, or
#   nil when {#images} is empty
def image_uri
  @image_uri ||= begin
    lead = images.first
    lead ? Addressable::URI.parse(lead.uri) : nil
  end
end

#images ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 236

# Collects candidate images for the page.
#
# For HTML responses the non-empty `og:image` meta contents are gathered,
# followed by the `thumbnailurl` itemprop metas directly under
# {#article_node} when there is one. For a non-HTML response whose MIME
# media type is `image`, {#canonical_uri} is the only candidate. Duplicate
# URIs are dropped before each is wrapped in a `FastImage` (0.5 timeout).
#
# @return [Array<FastImage>] one entry per distinct URI; possibly empty
def images
  @images ||= begin
    collected = []
    gather = lambda do |attrs|
      attrs.each do |attr|
        next unless attr && attr.value && attr.value != ''
        collected << Addressable::URI.parse(attr.value)
      end
    end

    if is_html?
      gather.call(dom.xpath("//*/meta[@property='og:image']/@content"))
      gather.call(article_node.xpath("meta[@itemprop='thumbnailurl']/@content")) if article_node
    elsif mime_type && mime_type.media_type == 'image'
      collected << canonical_uri
    end

    collected.uniq.map { |uri| FastImage.new(uri, :timeout => 0.5) }
  end
end

#inspect ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 446

def inspect
  addr = '0x' + ('%x' % (object_id << 1)).rjust(14, '0')
  "#<HRefPreview::Preview:#{addr} TITLE=#{title.inspect}>"
end

#is_html? ⇒ `true`, `false`

Returns true if the response had a 2xx HTTP code and the mime type is either HTML or XHTML.

# File 'lib/href_preview/preview.rb', line 99

# Tells whether the response is a successful HTML or XHTML document: the
# status must be in 200...300 and the MIME sub-type must start with `html`
# or `xhtml`.
#
# @return [Integer, false, nil] truthy (the regex match index, 0) on
#   success; false when the status is outside the 2xx range; nil when there
#   is no MIME type or its sub-type does not match
def is_html?
  status = response.status
  return false unless status >= 200
  return false unless status < 300

  type = mime_type
  type && type.sub_type =~ /^x?html/
end

#item_type ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 261

# Returns the declared type of the page.
#
# The `og:type` meta content wins when present; otherwise `'article'` is
# reported if any element carries the schema.org NewsArticle itemtype.
#
# @return [String, nil] the type, or nil when the response is not HTML or
#   neither hint is present (nil is not memoized)
def item_type
  @item_type ||= begin
    if is_html?
      og_type = dom.xpath("//*/meta[@property='og:type']/@content").first
      declared = og_type ? og_type.value : nil

      if declared
        declared
      elsif !dom.xpath("//*[@itemtype='http://schema.org/NewsArticle']").first.nil?
        'article'
      end
    end
  end
end

#language ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 76

# Returns the two-letter language code declared for the page.
#
# Sources, in order of preference:
# 1. the `Content-Language` response header,
# 2. `<meta http-equiv="Content-Language">`,
# 3. `<meta name="dc.language">`.
#
# The fallbacks are joined with `||`. The earlier `language = a or b` form
# parsed as `(language = a) or b`, so both meta-tag sources were ignored.
# The prefix match is also case-insensitive now, so a tag such as `EN-US`
# yields `"en"` instead of an empty string.
#
# @return [String, nil] the lower-cased two-letter prefix of the declared
#   language (an empty string if the value does not start with two
#   letters), or nil when no source is present (nil is not memoized)
def language
  @language ||= begin
    declared =
      response.headers['Content-Language'] ||
      begin
        node = dom.xpath("//*/meta[@http-equiv='Content-Language']/@content").first
        node.value if node
      end ||
      begin
        node = dom.xpath("//*/meta[@name='dc.language']/@content").first
        node.value if node
      end

    # Strip the irrelevant '-US' from 'en-US' if it appears.
    declared[/^([a-z]{2})/i, 1].to_s.downcase if declared
  end
end

#mime_type ⇒ `MIME::Type`

Returns the MIME type declared in the HTTP headers or HTML meta tags.

# File 'lib/href_preview/preview.rb', line 40

# Returns the MIME type declared in the HTTP headers or HTML meta tags.
#
# The `Content-Type` response header is looked up in `MIME::Types` first;
# if that yields nothing, the `<meta http-equiv="Content-Type">` content is
# tried, then `<meta name="dc.format">`.
#
# @return [MIME::Type, nil] the first matching registered type, or nil when
#   no source resolves to one (nil is not memoized)
def mime_type
  @mime_type ||= begin
    from_meta = lambda do |query|
      attr = dom.xpath(query).first
      MIME::Types[attr.value].first if attr && attr.value
    end

    MIME::Types[response.headers['Content-Type']].first ||
      from_meta.call("//*/meta[@http-equiv='Content-Type']/@content") ||
      from_meta.call("//*/meta[@name='dc.format']/@content")
  end
end

#published ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 396

# Finds the publication time advertised in the page's metadata.
#
# Sources are tried in this order and the first non-empty value is parsed
# with `Time.parse`:
# 1. `article:published_time` meta, anywhere in the document,
# 2. `datepublished` itemprop meta directly under {#article_node},
# 3. `datepublished` itemprop meta, anywhere in the document,
# 4. `dcterms.created` meta,
# 5. `dc.date` meta (only a date, so a fallback at best).
#
# @return [Time, nil] the parsed time, or nil when no source has a value
#   (nil is not memoized)
def published
  @published ||= begin
    read_time = lambda do |scope, query|
      attr = scope.xpath(query).first
      Time.parse(attr.value) if attr && attr.value && attr.value != ''
    end

    read_time.call(dom, "//*/meta[@property='article:published_time']/@content") ||
      (read_time.call(article_node, "meta[@itemprop='datepublished']/@content") if article_node) ||
      read_time.call(dom, "//*/meta[@itemprop='datepublished']/@content") ||
      read_time.call(dom, "//*/meta[@name='dcterms.created']/@content") ||
      read_time.call(dom, "//*/meta[@name='dc.date']/@content")
  end
end

#shortlink_uri ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 219

# Returns the short link advertised by the page, if any.
#
# The `<link rel="shortlink">` href is tried first, then the href of an
# `<a>` directly inside an element whose class is `story-short-url`; empty
# values are ignored.
#
# @return [Addressable::URI, nil] the parsed short link, or nil when the
#   response is not HTML or no short link is declared (nil is not memoized)
def shortlink_uri
  @shortlink_uri ||= begin
    short = nil
    if is_html?
      [
        "//*/link[@rel='shortlink']/@href",
        "//*[@class='story-short-url']/a/@href"
      ].each do |query|
        attr = dom.xpath(query).first
        next unless attr && attr.value && attr.value != ''
        short = Addressable::URI.parse(attr.value)
        break if short
      end
    end
    short
  end
end

#site_name ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 273

# Returns the name of the site that published the page.
#
# The `og:site_name` meta content is preferred, falling back to the
# `dc.publisher` meta content.
#
# @return [String, nil] the site name, or nil when the response is not HTML
#   or neither meta tag is present (nil is not memoized)
def site_name
  @site_name ||= begin
    if is_html?
      og_site = dom.xpath("//*/meta[@property='og:site_name']/@content").first
      name = og_site.value if og_site

      unless name
        publisher = dom.xpath("//*/meta[@name='dc.publisher']/@content").first
        name = publisher.value if publisher
      end

      name
    end
  end
end

#title ⇒ `String`

# File 'lib/href_preview/preview.rb', line 117

# The title of the page.
#
# The first source that is present wins:
# 1. `og:title` meta content,
# 2. `dc.title` meta content,
# 3. text of a `headline` itemprop element directly under {#article_node},
# 4. text of an `h1`/`h2` carrying the `headline` itemprop,
# 5. text of `<head><title>`,
# 6. `twitter:title` meta content,
# 7. `sailthru.title` meta content.
#
# The fallbacks are joined with `||`. The earlier `title = a or b or ...`
# form parsed as `(title = a) or ...`, so only `og:title` was ever used.
# {#site_name} is also passed through `Regexp.escape` before being embedded
# in the trimming patterns, so names containing regex metacharacters (for
# example `C++` or parentheses) are matched literally.
#
# @return [String, nil] the title with literal `&nbsp;` replaced by a space,
#   a leading/trailing site name (plus whitespace, `|`, `-`, `:` separators)
#   removed, and surrounding whitespace stripped; nil when the response is
#   not HTML or no source is present (nil is not memoized)
def title
  @title ||= begin
    if is_html?
      meta_content = lambda do |query|
        node = dom.xpath(query).first
        node.value if node
      end
      node_text = lambda do |scope, query|
        node = scope.xpath(query).first
        node.text if node
      end

      raw =
        meta_content.call("//*/meta[@property='og:title']/@content") ||
        meta_content.call("//*/meta[@name='dc.title']/@content") ||
        (node_text.call(article_node, "*[@itemprop='headline']") if article_node) ||
        node_text.call(dom, "//*/*[(self::h1 or self::h2) and @itemprop='headline']") ||
        node_text.call(dom, "//*/head/title") ||
        # The last two are unlikely to ever be reached.
        meta_content.call("//*/meta[@name='twitter:title']/@content") ||
        meta_content.call("//*/meta[@name='sailthru.title']/@content")

      if raw
        site = Regexp.escape(site_name.to_s)
        cleaned = raw.gsub(/&nbsp;/, ' ')
        cleaned = cleaned.gsub(/^#{site}[\s\|\-\:]*/, '')
        cleaned = cleaned.gsub(/[\s\|\-\:]*#{site}$/, '')
        cleaned.strip
      end
    end
  end
end

#twitter ⇒ `String`

# File 'lib/href_preview/preview.rb', line 288

# The Twitter handle used by the site, read from the `twitter:site` meta
# tag. The value is only accepted when it starts with `@`.
#
# @return [String, nil] the handle, or nil when the response is not HTML,
#   the tag is absent, or its value does not start with `@`
def twitter
  return @twitter if @twitter
  return nil unless is_html?

  attr = dom.xpath("//*/meta[@name='twitter:site']/@content").first
  handle = attr && attr.value
  @twitter = handle if handle && handle =~ /^@/
end

#updated ⇒ `Object`

# File 'lib/href_preview/preview.rb', line 424

# Finds the last-modified time advertised in the page's metadata.
#
# Sources are tried in this order and the first non-empty value is parsed
# with `Time.parse`:
# 1. `article:modified_time` meta, anywhere in the document,
# 2. `datemodified` itemprop meta directly under {#article_node},
# 3. `datemodified` itemprop meta, anywhere in the document,
# 4. `dcterms.modified` meta.
#
# Source 3 previously used the relative XPath `meta[...]` against the
# document itself, which cannot match a `<meta>` nested inside `<html>`; it
# now uses the same `//*/meta[...]` form as the other document-wide lookups.
#
# @return [Time, nil] the parsed time, or nil when no source has a value
#   (nil is not memoized)
def updated
  @updated ||= begin
    read_time = lambda do |scope, query|
      node = scope.xpath(query).first
      Time.parse(node.value) if node && node.value && node.value != ''
    end

    read_time.call(dom, "//*/meta[@property='article:modified_time']/@content") ||
      (read_time.call(article_node, "meta[@itemprop='datemodified']/@content") if article_node) ||
      read_time.call(dom, "//*/meta[@itemprop='datemodified']/@content") ||
      read_time.call(dom, "//*/meta[@name='dcterms.modified']/@content")
  end
end

Class: HRefPreview::Preview

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(response, connection = DEFAULT_CONNECTION) ⇒ Preview

Instance Attribute Details

#response ⇒ Object (readonly)

Instance Method Details

#article_html ⇒ Object

#article_node ⇒ Object

#article_text ⇒ Object

#canonical_uri ⇒ Object

#charset ⇒ String

#description ⇒ Object

#dom ⇒ Nokogiri::HTML::Document

#image_uri ⇒ Object

#images ⇒ Object

#inspect ⇒ Object

#is_html? ⇒ true, false

#item_type ⇒ Object

#language ⇒ Object

#mime_type ⇒ MIME::Type

#published ⇒ Object

#shortlink_uri ⇒ Object

#site_name ⇒ Object

#title ⇒ String

#twitter ⇒ String

#updated ⇒ Object