Class: Whitewash

Inherits:
Object
  • Object
show all
Defined in:
lib/whitewash.rb

Constant Summary collapse

# Pattern for a single CSS declaration ("property: value ..."), anchored to
# the whole string and case-insensitive. Capture group 1 is the property
# name. Values may only be made of plain tokens (letters, digits, "-", ".",
# "/"), "#"-prefixed hex strings, and percentages.
CSS = %r{
  \A\s*
  ([-a-z0-9]+) : \s*
  (?: (?: [-./a-z0-9]+ | \#[0-9a-f]+ | [0-9]+% ) \s* ) +
  \s*\z
}xi.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(whitelist = Whitewash.default_whitelist) ⇒ Whitewash

whitelist is expected to be loaded from xhtml.yaml.



49
50
51
# File 'lib/whitewash.rb', line 49

# Build a sanitizer around +whitelist+.
#
# whitelist:: the rule set to sanitize against; when omitted it is obtained
#             from Whitewash.default_whitelist.
def initialize(whitelist = Whitewash.default_whitelist)
  # Kept as the default +whitelist+ argument of #sanitize and #sanitize_element.
  @whitelist = whitelist
end

Instance Attribute Details

#xhtml ⇒ Object (readonly)

Returns the value of attribute xhtml.



53
54
55
# File 'lib/whitewash.rb', line 53

# Reader for the @xhtml instance variable.
# NOTE(review): nothing in the visible code assigns @xhtml, so this may
# always return nil -- confirm where (or whether) it is set.
def xhtml
  @xhtml
end

Class Method Details

.default_whitelist ⇒ Object



40
41
42
43
44
45
# File 'lib/whitewash.rb', line 40

# Locate and parse the default whitelist.
#
# Looks through the directories in PATH for the first one that holds a
# readable WHITELIST file, reads that file and hands its contents to
# Whitewash.load.
#
# Returns whatever Whitewash.load returns for the file contents.
# Raises RuntimeError when no directory in PATH has a readable WHITELIST.
def Whitewash.default_whitelist
  dir = PATH.find { |candidate| File.readable?(File.join(candidate, WHITELIST)) }
  raise RuntimeError, "Can't find default whitelist" unless dir

  yaml = File.read(File.join(dir, WHITELIST))
  # Object#untaint no longer exists on Ruby 3.2+; only call it on
  # interpreters that still provide it.
  yaml.untaint if yaml.respond_to?(:untaint)
  Whitewash.load(yaml)
end

.load(string) ⇒ Object

Use Syck to parse the whitelist, to work around Psych issue #36, which was present in some versions of Ruby 1.9.3.



29
30
31
# File 'lib/whitewash.rb', line 29

# Parse +string+ as YAML and return the resulting Ruby object.
#
# NOTE(review): YAML.load is being applied to whatever string the caller
# passes in. Its treatment of tagged/non-basic types differs between Psych
# versions, and it should not be fed untrusted input -- confirm that only
# trusted whitelist files reach this method, and check which object types
# the whitelist actually needs before switching to YAML.safe_load.
def Whitewash.load(string)
  YAML.load(string)
end

Instance Method Details

#check_style(whitelist, style) ⇒ Object



62
63
64
65
66
67
68
69
# File 'lib/whitewash.rb', line 62

# Decide whether a CSS string consists solely of permitted declarations.
#
# whitelist:: rule set; its '_css' entry lists the permitted property names.
#             When that entry is missing, any style is accepted.
# style::     CSS text, split into declarations on ';'.
#
# Returns true when every declaration matches the CSS pattern and names a
# property contained in whitelist['_css'], false otherwise.
def check_style(whitelist, style)
  permitted = whitelist['_css']
  return true unless permitted

  style.split(';').all? do |declaration|
    found = CSS.match(declaration)
    found ? permitted.include?(found[1]) : false
  end
end

#sanitize(html, whitelist = @whitelist, &p) ⇒ Object

Return sanitized HTML.

If a block is supplied, it will be invoked for each Nokogiri::XML::Element in the sanitized HTML.



113
114
115
116
117
118
119
120
# File 'lib/whitewash.rb', line 113

# Sanitize an HTML string and return the result serialized as XHTML.
#
# html::      markup to clean; parsed with Nokogiri::HTML (noblanks option).
# whitelist:: rule set handed to #sanitize_element (defaults to @whitelist).
# p::         optional block, forwarded to #sanitize_element.
#
# Returns the XHTML of the <body> element's children after sanitizing, or
# an empty string when the parsed document has no <body>.
def sanitize(html, whitelist = @whitelist, &p)
  document = Nokogiri::HTML(html) { |options| options.noblanks }
  body = document.xpath('//html/body').first
  return '' unless body

  sanitize_element(body, whitelist, &p)
  body.children.to_xhtml
end

#sanitize_element(xml, whitelist = @whitelist, &p) ⇒ Object

compare elements and attributes with the whitelist



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/whitewash.rb', line 73

# Recursively scrub the element +xml+ in place against +whitelist+.
#
# xml::       the element to check; it is modified (or replaced/removed)
#             in place.
# whitelist:: hash keyed by element name. The '_common' entry holds
#             attribute rules applied to every element, and the entry for
#             an element name holds that element's own attribute rules.
#             Each attribute rule is matched against the attribute value
#             with ===.
# p::         optional block, passed down the recursion and invoked with
#             every element that is kept, after its descendants.
#
# Behaviour:
# * An element whose name starts with "_" or is not a whitelist key is
#   replaced by its (already sanitized) children.
# * A whitelisted <style> element whose content fails #check_style is
#   removed entirely.
# * Attributes without a matching rule, and style="" attributes that fail
#   #check_style, are removed.
def sanitize_element(xml, whitelist = @whitelist, &p)
  # "_"-prefixed whitelist keys ('_css', '_common') are rule sections, not
  # element names, so such names are never treated as whitelisted.
  if xml.name.start_with?('_') || !whitelist.key?(xml.name)
    xml.element_children.each { |child| sanitize_element(child, whitelist, &p) }
    xml.replace(xml.children)
    return
  end

  # sanitize CSS in <style> elements
  if xml.name == 'style' && !check_style(whitelist, xml.content)
    xml.remove
    return
  end

  # Built lazily (only when the element has attributes) and then shared by
  # all attributes of this element, instead of being rebuilt for each one.
  allowed = nil
  xml.attribute_nodes.each do |attribute|
    allowed ||= whitelist['_common'].merge(whitelist[xml.name] || {})

    unless allowed[attribute.name] === attribute.to_s
      xml.remove_attribute(attribute.name)
      next
    end

    # sanitize CSS in style="" attributes
    if attribute.name == 'style' && !check_style(whitelist, attribute.value)
      xml.remove_attribute(attribute.name)
    end
  end

  # recurse
  xml.element_children.each { |child| sanitize_element(child, whitelist, &p) }

  yield xml if block_given?
end