Module: Loofah::HTML5::Scrub

Defined in:
lib/loofah/html5/scrub.rb

Constant Summary collapse

# Characters that scrub_uri_attribute strips from a URI attribute value before
# it inspects the protocol: backtick, U+0000-U+0020, U+007F and U+0080-U+0101.
CONTROL_CHARACTERS =
/[`\u0000-\u0020\u007f\u0080-\u0101]/
# Whole-string pattern (anchored with \A and \z) that scrub_css applies to
# :ident values. It accepts "#" followed by hex digits, an "rgb(" form with up
# to three optionally-percent digit groups, or an optionally negative number
# (up to 3 integer and 10 fractional digits) that may be followed by a unit,
# "%", "," or ")".
CSS_KEYWORDISH =

rubocop:disable Layout/LineLength

/\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
# Semicolon node that scrub_css appends after every property it keeps.
CRASS_SEMICOLON =
{ node: :semicolon, raw: ";" }
# Suffix that scrub_css re-attaches to a value whose property node is flagged
# :important.
CSS_IMPORTANT =
"!important"
# Text that scrub_css substitutes for every :whitespace child, and the
# separator it places before CSS_IMPORTANT.
CSS_WHITESPACE =
" "
# Whole-string pattern that scrub_css applies to the raw text of :string
# children: an optional opening quote, one or more characters that are not
# quotes, then a back-reference to the opening-quote capture.
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES =
/\A(["'])?[^"']+\1\z/
# Whole-string pattern for attribute names of the form "data-" followed by
# word characters or hyphens; scrub_attributes leaves matching attributes alone.
DATA_ATTRIBUTE_NAME =
/\Adata-[\w-]+\z/
# Replacement table used by escape_tags: maps "<", ">" and "&" to their HTML
# entities.
TABLE_FOR_ESCAPE_HTML__ =
{
  "<" => "&lt;",
  ">" => "&gt;",
  "&" => "&amp;",
}

Class Method Summary collapse

Class Method Details

.allowed_element?(element_name) ⇒ Boolean

Returns:

  • (Boolean)


18
19
20
# File 'lib/loofah/html5/scrub.rb', line 18

# Checks +element_name+ for membership in
# SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.
#
# @param element_name the element name to look up
# @return [Boolean] the result of the collection's +include?+ check
def allowed_element?(element_name)
  permitted_elements = ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2
  permitted_elements.include?(element_name)
end

.cdata_escape(node) ⇒ Object



192
193
194
195
196
197
198
199
# File 'lib/loofah/html5/scrub.rb', line 192

# Builds a replacement node holding the tag-escaped text of +node+.
#
# The text is passed through escape_tags and wrapped in a new node created
# from the node's own document: a text node when Nokogiri.jruby? is true,
# otherwise a CDATA node.
#
# @param node the node whose text is escaped
# @return the node created by the document
def cdata_escape(node)
  safe_text = escape_tags(node.text)
  return node.document.create_text_node(safe_text) if Nokogiri.jruby?

  node.document.create_cdata(safe_text)
end

.cdata_needs_escaping?(node) ⇒ Boolean

Returns:

  • (Boolean)


187
188
189
190
# File 'lib/loofah/html5/scrub.rb', line 187

# Decides whether the content of +node+ should be treated as CDATA that needs
# escaping.
#
# @param node the node to inspect
# @return [Boolean] truthy when the node reports itself as CDATA, or when
#   running on JRuby and the node is a text node whose parent is named "style"
def cdata_needs_escaping?(node)
  flagged_as_cdata = node.cdata?
  return flagged_as_cdata if flagged_as_cdata

  # Nokogiri's HTML4 parser on JRuby does not mark the child of a `style` tag
  # as cdata even though it behaves like cdata, so detect that case by hand.
  Nokogiri.jruby? && node.text? && node.parent.name == "style"
end

.escape_tags(string) ⇒ Object



207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/loofah/html5/scrub.rb', line 207

# Replaces "<", ">" and "&" in +string+ with the entities listed in
# TABLE_FOR_ESCAPE_HTML__ and returns the result as a new string; the argument
# itself is not modified.
#
# This is a modified version of CGI.escapeHTML from ruby 3.1.
#
# @param string [String] the text to escape
# @return [String] the escaped text
def escape_tags(string)
  source_encoding = string.encoding

  if source_encoding.ascii_compatible?
    # Substitute on a binary copy, then re-tag the copy with the caller's encoding.
    binary_copy = string.b
    binary_copy.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
    return binary_copy.force_encoding(source_encoding)
  end

  working_encoding = source_encoding
  restore_encoding = nil
  if source_encoding.dummy?
    # Dummy encodings are converted to their ASCII-compatible counterpart when
    # one exists (otherwise the bytes are used as binary); the result is
    # converted back to the original encoding at the end.
    restore_encoding = source_encoding
    working_encoding = Encoding::Converter.asciicompat_encoding(source_encoding)
    string = working_encoding ? string.encode(working_encoding) : string.b
  end

  # Both the replacement table and the pattern are expressed in the working encoding.
  replacements = TABLE_FOR_ESCAPE_HTML__.map do |pair|
    pair.map { |text| text.encode(working_encoding) }
  end.to_h
  pattern = /#{"[<>&]".encode(working_encoding)}/

  escaped = string.gsub(pattern, replacements)
  escaped.encode!(restore_encoding) if restore_encoding
  escaped
end

.force_correct_attribute_escaping!(node) ⇒ Object

libxml2 >= 2.9.2 fails to escape comments within some attributes.

See the comments about CVE-2018-8048 in the tests for more information.


166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/loofah/html5/scrub.rb', line 166

# Percent-encodes spaces and double quotes in the values of the attributes
# listed in LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.
#
# Does nothing unless Nokogiri reports that it is running on libxml2. When
# BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG names a tag for an attribute, that
# attribute is only rewritten on nodes with that tag name.
#
# @param node the element whose attributes are rewritten in place
def force_correct_attribute_escaping!(node)
  return unless Nokogiri::VersionInfo.instance.libxml2?

  node.attribute_nodes.each do |attr_node|
    attr_name = attr_node.name
    next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_name)

    qualifying_tag = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_name]
    next if !qualifying_tag.nil? && qualifying_tag != node.name

    # Same approach as CGI.escape in Ruby 2.4, except that only the space and
    # the double quote are encoded, which mimics pre-2.9.2 libxml2 output.
    current_value = attr_node.value
    value_encoding = current_value.encoding
    encoded_value = current_value.gsub(/[ "]/) do |char|
      "%" + char.unpack("H2" * char.bytesize).join("%").upcase
    end
    attr_node.value = encoded_value.force_encoding(value_encoding)
  end
end

.scrub_attribute_that_allows_local_ref(attr_node) ⇒ Object



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/loofah/html5/scrub.rb', line 123

# Rewrites the value of +attr_node+ so that it keeps only the raw text of
# selected component values, joined by single spaces.
#
# The value is parsed with Crass. :hash, :ident and :string components are
# kept; a :url component is kept only when its value starts with "#"; every
# other component is dropped. Returns without touching the attribute when its
# value is nil or false.
#
# @param attr_node the attribute node to rewrite in place
def scrub_attribute_that_allows_local_ref(attr_node)
  current_value = attr_node.value
  return unless current_value

  kept_fragments = []
  Crass::Parser.new(current_value).parse_component_values.each do |component|
    keep =
      case component[:node]
      when :url then component[:value].start_with?("#")
      when :hash, :ident, :string then true
      else false
      end
    next unless keep

    fragment = component[:raw]
    kept_fragments << fragment unless fragment.nil?
  end

  attr_node.value = kept_fragments.join(" ")
end

.scrub_attributes(node) ⇒ Object

An alternative implementation of the html5lib attribute scrubbing algorithm.



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/loofah/html5/scrub.rb', line 23

# Scrubs the attributes of +node+ in place.
#
# Steps, in order:
# 1. For each attribute: skip data-* names; remove names missing from
#    SafeList::ALLOWED_ATTRIBUTES; run the URI scrubber on URI-valued
#    attributes; run the local-reference scrubber on SVG attributes that allow
#    references; remove an xlink:href that does not start with "#" on the
#    elements listed in SafeList::SVG_ALLOW_LOCAL_HREF.
# 2. Scrub the style attribute.
# 3. Remove every non-data attribute whose value has no non-space character.
# 4. Apply the libxml2 attribute-escaping workaround.
#
# @param node the element whose attributes are scrubbed
def scrub_attributes(node)
  node.attribute_nodes.each do |attribute|
    namespace = attribute.namespace
    qualified_name =
      namespace ? "#{namespace.prefix}:#{attribute.node_name}" : attribute.node_name

    next if DATA_ATTRIBUTE_NAME.match?(qualified_name)

    unless SafeList::ALLOWED_ATTRIBUTES.include?(qualified_name)
      attribute.remove
      next
    end

    # scrub_uri_attribute returns true when it removed the attribute.
    next if SafeList::ATTR_VAL_IS_URI.include?(qualified_name) && scrub_uri_attribute(attribute)

    scrub_attribute_that_allows_local_ref(attribute) if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(qualified_name)

    non_local_xlink = SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
      qualified_name == "xlink:href" &&
      attribute.value =~ /^\s*[^#\s].*/m
    attribute.remove if non_local_xlink
  end

  scrub_css_attribute(node)

  # Second pass: drop attributes left blank, except data-* attributes.
  node.attribute_nodes.each do |attribute|
    next if attribute.value =~ /[^[:space:]]/
    next if attribute.name =~ DATA_ATTRIBUTE_NAME

    node.remove_attribute(attribute.name)
  end

  force_correct_attribute_escaping!(node)
end

.scrub_css(style) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/loofah/html5/scrub.rb', line 72

# Returns a sanitized copy of the CSS declaration list +style+.
#
# The input is parsed with Crass. A property is dropped when any of its
# children is a :url or :bad_url node, when its lower-cased name is in none of
# the SafeList property lists (for the shorthand list only the part before the
# first "-" is checked), or when nothing is left of its value after filtering.
# Each surviving property is re-parsed from "name:value" and followed by
# CRASS_SEMICOLON.
#
# @param style [String] the contents of a style attribute
# @return [String] the stringified sanitized properties
def scrub_css(style)
  forbidden_kinds = [:url, :bad_url]
  kept_nodes = []

  Crass.parse_properties(style).each do |declaration|
    next if declaration[:node] != :property

    components = declaration[:children]
    next if components.any? { |component| forbidden_kinds.include?(component[:node]) }

    property = declaration[:name].downcase
    shorthand = SafeList::SHORTHAND_CSS_PROPERTIES.include?(property.split("-").first)
    permitted = SafeList::ALLOWED_CSS_PROPERTIES.include?(property) ||
      SafeList::ALLOWED_SVG_PROPERTIES.include?(property) ||
      shorthand
    next unless permitted

    # Each child maps to the text it contributes, or nil when it is rejected.
    fragments = components.map do |component|
      case component[:node]
      when :whitespace
        CSS_WHITESPACE
      when :string
        Crass::Parser.stringify(component) if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(component[:raw])
      when :function
        Crass::Parser.stringify(component) if SafeList::ALLOWED_CSS_FUNCTIONS.include?(component[:name].downcase)
      when :ident
        # Identifiers are only restricted for shorthand properties.
        word = component[:value]
        word if !shorthand || SafeList::ALLOWED_CSS_KEYWORDS.include?(word) || (word =~ CSS_KEYWORDISH)
      else
        component[:raw]
      end
    end

    sanitized_value = fragments.compact.join.strip
    next if sanitized_value.empty?

    sanitized_value = "#{sanitized_value}#{CSS_WHITESPACE}#{CSS_IMPORTANT}" if declaration[:important]

    kept_nodes << Crass.parse_properties(format("%s:%s", property, sanitized_value)).first
    kept_nodes << CRASS_SEMICOLON
  end

  Crass::Parser.stringify(kept_nodes)
end

.scrub_css_attribute(node) ⇒ Object



67
68
69
70
# File 'lib/loofah/html5/scrub.rb', line 67

# Replaces the value of the "style" attribute of +node+ with the result of
# scrub_css. Does nothing when the node has no "style" attribute.
#
# @param node the element whose style attribute is scrubbed in place
def scrub_css_attribute(node)
  style_attribute = node.attributes["style"]
  return unless style_attribute

  style_attribute.value = scrub_css(style_attribute.value)
end

.scrub_uri_attribute(attr_node) ⇒ Object



142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/loofah/html5/scrub.rb', line 142

# Removes +attr_node+ when its URI value uses a disallowed protocol or a
# disallowed data: media type.
#
# The value is HTML-unescaped, stripped of CONTROL_CHARACTERS and lower-cased
# before it is inspected.
#
# @param attr_node the attribute node holding a URI
# @return [Boolean] true when the attribute was removed, false otherwise
def scrub_uri_attribute(attr_node)
  # this block lifted nearly verbatim from HTML5 sanitization
  normalized = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
  segments = normalized.split(SafeList::PROTOCOL_SEPARATOR)
  scheme = segments[0]

  if normalized =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(scheme)
    attr_node.remove
    return true
  end

  if scheme == "data"
    # permit only allowed data mediatypes
    mediatype = segments[1]
    mediatype = mediatype.split(";").first if mediatype
    if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
      attr_node.remove
      return true
    end
  end

  false
end