Class: Zumobi::RemovingSanitize

Inherits:
Object
  • Object
show all
Defined in:
lib/zumobi/removing_sanitize.rb

Defined Under Namespace

Modules: Config

Class Method Summary collapse

Class Method Details

.clean(html, config = RemovingSanitize::Config::ZUMOBI) ⇒ Object



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/zumobi/removing_sanitize.rb', line 126

# Cleans an HTML string: strips CDATA markers and simple comments, decodes
# HTML entities (twice, to handle escaped markup), optionally removes the
# elements matched by config[:removals], then runs the result through
# Sanitize.clean.
#
# The caller's string is never modified; a new string is returned whenever
# any cleaning is performed.
#
# @param html [String, nil] the markup to clean
# @param config [Hash] passed through to Sanitize.clean; an optional
#   :removals entry lists expressions handed to Nokogiri's `search`, and
#   every matching element is removed before sanitizing
# @return [String, nil] the cleaned markup; blank input is returned as-is
def self.clean(html, config = RemovingSanitize::Config::ZUMOBI)
    return html if html.blank?

    # Non-destructive gsub (not gsub!) so the caller's string is left intact
    # and frozen strings do not raise FrozenError.
    # Remove CDATA escaping: per the original note, sanitize would otherwise
    # turn it into "<[CDATA[ ... ]]>", which is visible to the user.
    html = html.gsub(/<!\[CDATA\[/, '').gsub(/\]\]>/, '')

    # In one feed Nokogiri eats too much HTML when parsing it due to the
    # presence of a comment sequence, so comments are stripped with a regular
    # expression first. Note: this pattern only matches comments whose body
    # contains no hyphen.
    html = html.gsub(/<!--[^-]*-->/, '')

    # Decode HTML entities.
    html = HTMLEntities.new.decode(html)
    # Decode HTML that is escaped, e.g. "&lt;div&gt;test&lt;/div&gt;"
    html = CGI.unescapeHTML(html)
    return html if html.blank?

    removals = config[:removals]
    unless removals.nil?
        # Wrap in a div so plain text and multiple top-level nodes share a
        # single root that can be unwrapped again below.
        doc = Nokogiri::HTML.fragment "<div>#{html}</div>"
        removals.each do |removal|
            doc.search(removal).each(&:remove)
        end
        # Serialize the wrapper's children rather than the wrapper itself so
        # the returned markup does not carry the extra div (callers such as
        # EntryDecorator would otherwise see it).
        html = doc.children[0].children.map(&:to_html).join('')
    end

    Sanitize.clean(html, config)
end