Module: Loofah::HTML5::Scrub

Defined in:
lib/loofah/html5/scrub.rb

Constant Summary collapse

# Characters that allowed_uri? strips from a URI string before inspecting its
# protocol: backtick, U+0000..U+0020, U+007F and U+0080..U+0101.
CONTROL_CHARACTERS =
/[`\u0000-\u0020\u007f\u0080-\u0101]/
# Pattern that scrub_css tests identifier values against (hex colors, rgb(...)
# fragments, and short numbers with an optional unit or trailing punctuation).
CSS_KEYWORDISH =

rubocop:disable Layout/LineLength

/\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
# Semicolon node that scrub_css appends after every sanitized property.
CRASS_SEMICOLON =
{ node: :semicolon, raw: ";" }
# Suffix that scrub_css re-attaches to a value when the parsed property is
# flagged as important.
CSS_IMPORTANT =
"!important"
# Single space that scrub_css substitutes for each whitespace node.
CSS_WHITESPACE =
" "
# scrub_css keeps a string token only when its raw text matches this pattern,
# i.e. it has no quote characters embedded in its contents.
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES =
/\A(["'])?[^"']+\1\z/
# Matches data-* attribute names, which scrub_attributes leaves in place.
DATA_ATTRIBUTE_NAME =
/\Adata-[\w-]+\z/
# Matches a leading URI scheme followed by a colon; allowed_uri? tests it
# against the lowercased, unescaped value.
URI_PROTOCOL_REGEX =

RFC 3986

/\A[a-z][a-z0-9+\-.]*:/
# Replacement table used by escape_tags for the characters <, > and &.
TABLE_FOR_ESCAPE_HTML__ =
{
  "<" => "&lt;",
  ">" => "&gt;",
  "&" => "&amp;",
}

Class Method Summary collapse

Class Method Details

.allowed_element?(element_name) ⇒ Boolean

Returns:

  • (Boolean)


20
21
22
# File 'lib/loofah/html5/scrub.rb', line 20

# Reports whether +element_name+ is present in the element safelist
# (SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2).
#
# @param element_name the tag name to look up
# @return [Boolean] result of the safelist membership check
def allowed_element?(element_name)
  permitted_elements = ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2
  permitted_elements.include?(element_name)
end

.allowed_uri?(uri_string) ⇒ Boolean

Returns true if the given URI string is safe, false otherwise. This method can be used to validate URI attribute values without requiring a Nokogiri DOM node.

Returns:

  • (Boolean)


147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/loofah/html5/scrub.rb', line 147

# Decides whether a URI string is acceptable.
#
# The value is normalized first (control characters removed, HTML entities
# and "&colon;" decoded, lowercased). A value without a leading protocol is
# accepted. Otherwise the protocol must be in SafeList::ALLOWED_PROTOCOLS,
# and a "data" URI that names a mediatype must use one listed in
# SafeList::ALLOWED_URI_DATA_MEDIATYPES.
#
# @param uri_string [String] the raw attribute value
# @return [Boolean] true when the URI is considered safe
def allowed_uri?(uri_string)
  # this logic lifted nearly verbatim from HTML5 sanitization
  without_controls = uri_string.gsub(CONTROL_CHARACTERS, "")
  normalized = CGI.unescapeHTML(without_controls).gsub("&colon;", ":").downcase
  return true unless URI_PROTOCOL_REGEX.match?(normalized)

  protocol, remainder = normalized.split(SafeList::PROTOCOL_SEPARATOR)
  return false unless SafeList::ALLOWED_PROTOCOLS.include?(protocol)
  return true unless protocol == "data" && remainder

  # permit only allowed data mediatypes; a data URI that yields no
  # mediatype at all is let through
  mediatype = remainder.split(/[;,]/).first
  mediatype.nil? || SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
end

.cdata_escape(node) ⇒ Object



204
205
206
207
208
209
210
211
# File 'lib/loofah/html5/scrub.rb', line 204

# Builds a replacement node holding the tag-escaped text of +node+.
#
# On JRuby a plain text node is created; otherwise a CDATA node is created.
# Both are created through the document that owns +node+.
#
# @param node the node whose text should be escaped
# @return the newly created text or CDATA node
def cdata_escape(node)
  safe_text = escape_tags(node.text)
  return node.document.create_text_node(safe_text) if Nokogiri.jruby?

  node.document.create_cdata(safe_text)
end

.cdata_needs_escaping?(node) ⇒ Boolean

Returns:

  • (Boolean)


199
200
201
202
# File 'lib/loofah/html5/scrub.rb', line 199

# Tells whether +node+ holds CDATA-like content that must be escaped.
#
# @param node the node to inspect
# @return [Boolean] true for CDATA nodes, and on JRuby also for text nodes
#   whose parent is a `style` element
def cdata_needs_escaping?(node)
  if node.cdata?
    true
  else
    # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
    Nokogiri.jruby? && node.text? && node.parent.name == "style"
  end
end

.escape_tags(string) ⇒ Object



219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/loofah/html5/scrub.rb', line 219

# Replaces "<", ">" and "&" in +string+ with their HTML entities, using
# TABLE_FOR_ESCAPE_HTML__. The argument itself is not modified.
#
# modified version of CGI.escapeHTML from ruby 3.1
#
# @param string [String] text to escape
# @return [String] a new, escaped string
def escape_tags(string)
  source_encoding = string.encoding

  # Fast path: work on a binary copy, then restore the encoding label.
  if source_encoding.ascii_compatible?
    binary_copy = string.b
    binary_copy.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
    return binary_copy.force_encoding(source_encoding)
  end

  working_encoding = source_encoding
  restore_encoding = nil
  if source_encoding.dummy?
    # Dummy encodings are transcoded for the substitution and converted back
    # to the original encoding afterwards.
    restore_encoding = source_encoding
    working_encoding = Encoding::Converter.asciicompat_encoding(source_encoding)
    string = working_encoding ? string.encode(working_encoding) : string.b
  end

  # The pattern and the replacement table must share the text's encoding.
  replacements = {}
  TABLE_FOR_ESCAPE_HTML__.each do |char, entity|
    replacements[char.encode(working_encoding)] = entity.encode(working_encoding)
  end

  escaped = string.gsub(/#{"[<>&]".encode(working_encoding)}/, replacements)
  restore_encoding ? escaped.encode!(restore_encoding) : escaped
end

.force_correct_attribute_escaping!(node) ⇒ Object

libxml2 >= 2.9.2 fails to escape comments within some attributes.

See the comments about CVE-2018-8048 within the tests for more information.


178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# File 'lib/loofah/html5/scrub.rb', line 178

# Percent-encodes spaces and double quotes in the attributes listed in
# LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES. Does nothing unless Nokogiri
# reports that it is running on libxml2.
#
# An attribute that has an entry in BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG
# is only rewritten when +node+ is that tag.
#
# @param node the element whose attributes are rewritten in place
def force_correct_attribute_escaping!(node)
  return unless Nokogiri::VersionInfo.instance.libxml2?

  node.attribute_nodes.each do |attr_node|
    attr_name = attr_node.name
    next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_name)

    required_tag = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_name]
    next if !required_tag.nil? && required_tag != node.name

    #
    #  this block is just like CGI.escape in Ruby 2.4, but
    #  only encodes space and double-quote, to mimic
    #  pre-2.9.2 behavior
    #
    current_value = attr_node.value
    value_encoding = current_value.encoding
    escaped_value = current_value.gsub(/[ "]/) do |char|
      hex_bytes = char.unpack("H2" * char.bytesize)
      "%" + hex_bytes.join("%").upcase
    end
    attr_node.value = escaped_value.force_encoding(value_encoding)
  end
end

.scrub_attribute_that_allows_local_ref(attr_node) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/loofah/html5/scrub.rb', line 125

# Rewrites the attribute's value so that it keeps only hash, ident and string
# tokens plus url() tokens whose target starts with "#". Every other token is
# dropped, and the kept tokens are joined with single spaces.
#
# @param attr_node the attribute to rewrite in place
# @return the new value, or nil when the attribute has no value
def scrub_attribute_that_allows_local_ref(attr_node)
  return unless attr_node.value

  passthrough_types = [:hash, :ident, :string]
  kept = []

  Crass::Parser.new(attr_node.value).parse_component_values.each do |token|
    type = token[:node]
    local_url = type == :url && token[:value].start_with?("#")
    next unless local_url || passthrough_types.include?(type)

    raw = token[:raw]
    kept << raw unless raw.nil?
  end

  attr_node.value = kept.join(" ")
end

.scrub_attributes(node) ⇒ Object

alternative implementation of the html5lib attribute scrubbing algorithm



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/loofah/html5/scrub.rb', line 25

# Scrubs every attribute of +node+ in place.
#
# First pass, per attribute:
# * data-* attributes are skipped,
# * attributes missing from SafeList::ALLOWED_ATTRIBUTES are removed,
# * URI-valued attributes are checked with scrub_uri_attribute,
# * attributes that allow local references are filtered,
# * on elements in SafeList::SVG_ALLOW_LOCAL_HREF, an xlink:href with a line
#   whose first non-blank character is not "#" is removed.
#
# Then the style attribute is sanitized, attributes left without any
# non-space character are removed (data-* excepted), and the libxml2
# escaping workaround is applied.
#
# @param node the element whose attributes are scrubbed
def scrub_attributes(node)
  node.attribute_nodes.each do |attr_node|
    namespace = attr_node.namespace
    attr_name = namespace ? "#{namespace.prefix}:#{attr_node.node_name}" : attr_node.node_name

    next if DATA_ATTRIBUTE_NAME.match?(attr_name)

    unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
      attr_node.remove
      next
    end

    # scrub_uri_attribute returns true when it removed the attribute
    next if SafeList::ATTR_VAL_IS_URI.include?(attr_name) && scrub_uri_attribute(attr_node)

    scrub_attribute_that_allows_local_ref(attr_node) if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)

    non_local_xlink = SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
      attr_name == "xlink:href" &&
      attr_node.value =~ /^\s*[^#\s].*/m
    attr_node.remove if non_local_xlink
  end

  scrub_css_attribute(node)

  node.attribute_nodes.each do |attr_node|
    blank_value = attr_node.value !~ /[^[:space:]]/
    next unless blank_value && attr_node.name !~ DATA_ATTRIBUTE_NAME

    node.remove_attribute(attr_node.name)
  end

  force_correct_attribute_escaping!(node)
end

.scrub_css(style) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/loofah/html5/scrub.rb', line 74

# Sanitizes a CSS declaration list and returns it re-serialized.
#
# A property is dropped when it contains a url or bad_url token, when its
# name is in none of the SafeList property lists (shorthand properties are
# matched on the part of the name before the first "-"), or when nothing
# remains of its value after filtering. Within a value, strings, functions
# and identifiers are kept only if they pass the checks below.
#
# @param style [String] raw CSS declarations
# @return [String] the sanitized declarations
def scrub_css(style)
  forbidden_tokens = [:url, :bad_url]
  sanitized_tree = []

  Crass.parse_properties(style).each do |property|
    next unless property[:node] == :property

    children = property[:children]
    next if children.any? { |child| forbidden_tokens.include?(child[:node]) }

    name = property[:name].downcase
    shorthand = SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
    next unless shorthand ||
      SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
      SafeList::ALLOWED_SVG_PROPERTIES.include?(name)

    pieces = children.map do |child|
      case child[:node]
      when :whitespace
        CSS_WHITESPACE
      when :string
        Crass::Parser.stringify(child) if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
      when :function
        Crass::Parser.stringify(child) if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
      when :ident
        # identifiers are only restricted inside shorthand properties
        keyword = child[:value]
        keyword if !shorthand ||
          SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
          (keyword =~ CSS_KEYWORDISH)
      else
        child[:raw]
      end
    end

    value = pieces.compact.join.strip
    next if value.empty?

    value << CSS_WHITESPACE << CSS_IMPORTANT if property[:important]

    # re-parse the rebuilt declaration so the output tree holds a clean node
    rebuilt = Crass.parse_properties(format("%s:%s", name, value)).first
    sanitized_tree << rebuilt << CRASS_SEMICOLON
  end

  Crass::Parser.stringify(sanitized_tree)
end

.scrub_css_attribute(node) ⇒ Object



69
70
71
72
# File 'lib/loofah/html5/scrub.rb', line 69

# Replaces the value of the node's "style" attribute, when present, with the
# output of scrub_css.
#
# @param node the element whose style attribute is sanitized
# @return the sanitized value, or nil when there is no style attribute
def scrub_css_attribute(node)
  style_attr = node.attributes["style"]
  return unless style_attr

  style_attr.value = scrub_css(style_attr.value)
end

.scrub_uri_attribute(attr_node) ⇒ Object



164
165
166
167
168
169
170
171
# File 'lib/loofah/html5/scrub.rb', line 164

# Removes the attribute when allowed_uri? rejects its value.
#
# @param attr_node the URI-valued attribute to check
# @return [Boolean] true when the attribute was removed, false when it was kept
def scrub_uri_attribute(attr_node)
  return false if allowed_uri?(attr_node.value)

  attr_node.remove
  true
end