Class: HtmlProcessor

Inherits:
Object
  • Object
show all
Defined in:
lib/html_processor.rb

Overview

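This utility class is used to work with HTML text.

It can be initialized with either HTML or Markdown text. Markdown input (`:format => 'markdown'`) is rendered to HTML and sanitized; any other input is stored unchanged.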

Constant Summary collapse

DESCRIPTION_PROCESSING_ORDER =

Constants

# Tag names tried, in this order, when looking for description text.
# Frozen so the shared constant cannot be mutated at runtime.
%w[p h1 h2 h3 h4 h5 h6].freeze
YOUTUBE_TRANSFORMER =

Sanitize transformer that whitelists YouTube iframe embeds

# Sanitize transformer: whitelists <iframe> nodes whose src points at
# youtube.com / youtube-nocookie.com, after stripping them down to a fixed
# set of attributes. Returns nil (no opinion) for every other node.
lambda do |env|
  node = env[:node]

  # Nothing to do for nodes that are already whitelisted...
  return if env[:is_whitelisted]
  # ...or that are not elements at all.
  return unless node.element?

  # Only iframes are of interest here.
  return unless env[:node_name] == 'iframe'

  # The iframe must point at a YouTube host (scheme optional).
  return unless node['src'] =~ %r|\A(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/|

  # It is a YouTube embed; clean the node itself so that only the iframe
  # element and the attributes an embed needs are left on it.
  embed_config = {
    :elements   => %w[iframe],
    :attributes => { 'iframe' => %w[allowfullscreen frameborder height src width] }
  }
  Sanitize.node!(node, embed_config)

  # Ask Sanitize to whitelist this (now cleaned) node.
  { :node_whitelist => [node] }
end
SANITIZER_OPTS =

Default options for Sanitize

# Sanitize's RELAXED preset with the attribute list for <a> overridden and
# the YouTube transformer plugged in.
Sanitize::Config::RELAXED.merge(
  :attributes   => Sanitize::Config::RELAXED[:attributes].merge(
    'a' => %w[href hreflang name rel target]
  ),
  :transformers => YOUTUBE_TRANSFORMER
)

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text, options = { }) ⇒ HtmlProcessor

Methods



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/html_processor.rb', line 60

# Build a processor from raw text.
#
# @param text [String, nil] HTML or Markdown source; always kept in @original
# @param options [Hash] when options[:format] is 'markdown' (String or Symbol)
#   and text is non-nil, text is rendered to HTML with Redcarpet and passed
#   through Sanitize; otherwise text is stored as-is (it is NOT sanitized)
def initialize(text, options = { })
  @original = text

  # Anything that is not (non-nil) Markdown is kept untouched.
  unless options[:format].to_s == 'markdown' && text
    @html = text
    return
  end

  # Options for the HTML renderer.
  render_options = { :safe_links_only => true, :hard_wrap => true, :filter_html => false }

  # Markdown parser extensions. Redcarpet 2+ names the extension
  # :no_intra_emphasis; the old :no_intraemphasis spelling is not recognised
  # by this API and was silently ignored.
  extensions = { :autolink => true, :no_intra_emphasis => true, :fenced_code_blocks => true, :superscript => true }

  markdown = Redcarpet::Markdown.new(Redcarpet::Render::HTML.new(render_options), extensions)
  @html = Sanitize.fragment(markdown.render(text), SANITIZER_OPTS)
end

Instance Attribute Details

#htmlObject (readonly)

Returns the value of attribute html.



10
11
12
# File 'lib/html_processor.rb', line 10

# Reader for the processed HTML held in @html.
def html
  return @html
end

#originalObject (readonly)

Returns the value of attribute original.



10
11
12
# File 'lib/html_processor.rb', line 10

# Reader for the untouched input text held in @original.
def original
  return @original
end

Instance Method Details

#descriptionObject

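Return a description of the document: up to the first two sentences of the first non-blank element found, trying the tags in DESCRIPTION_PROCESSING_ORDER in order. Returns an empty string when no such element exists.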



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/html_processor.rb', line 85

# Short description of the document: up to two leading sentences taken from
# the first element with non-blank content, trying the tags listed in
# DESCRIPTION_PROCESSING_ORDER one after another.
#
# The result is memoized in @description; it is an empty string when nothing
# suitable is found.
def description
  # An already computed value (even an empty string) is reused.
  return @description if @description

  @description = ''
  DESCRIPTION_PROCESSING_ORDER.each do |tag|
    candidate = document.css(tag).detect { |node| node && !node.content.blank? }
    next if candidate.blank? # no usable element for this tag

    # The pattern is exactly "sentence, optionally followed by a second
    # sentence", so the matched text is the two captures joined together.
    snippet = candidate.content[/([^.!?]+[.!?]?)([^.!?]+[.!?]?)?/]
    @description = snippet if snippet

    break unless @description.empty?
  end

  @description
end

#documentObject

Return a Nokogiri document based on processor html



78
79
80
# File 'lib/html_processor.rb', line 78

# Nokogiri document parsed from @html. The document is built on first use
# and the same object is handed back on later calls.
def document
  return @document if @document

  @document = Nokogiri::HTML(@html)
end