Class: SiteDiff::Sanitizer
- Inherits:
-
Object
- Object
- SiteDiff::Sanitizer
- Defined in:
- lib/sitediff/sanitize.rb,
lib/sitediff/sanitize/regexp.rb,
lib/sitediff/sanitize/dom_transform.rb
Defined Under Namespace
Classes: DomTransform, InvalidSanitization, Regexp
Constant Summary collapse
- TOOLS =
{ array: %w[dom_transform sanitization], scalar: %w[selector remove_spacing] }.freeze
- DOM_TRANSFORMS =
Set.new(%w[remove unwrap_root unwrap remove_class])
Class Method Summary collapse
-
.domify(str, force_doc = false) ⇒ Object
Parse HTML into a node.
-
.prettify(obj) ⇒ Object
Pretty-print some HTML.
-
.remove_node_spacing(node) ⇒ Object
Remove double-spacing inside text nodes.
-
.select_fragments(node, sel) ⇒ Object
Get a fragment consisting of the elements matching the selector(s).
-
.to_document(obj) ⇒ Object
Force this object to be a document, so we can apply a stylesheet.
Instance Method Summary collapse
-
#canonicalize_rule(name) ⇒ Object
Canonicalize a simple rule, eg: ‘remove_spacing’ or ‘selector’.
-
#dom_transforms ⇒ Object
Perform DOM transforms.
-
#initialize(html, config, opts = {}) ⇒ Sanitizer
constructor
A new instance of Sanitizer.
-
#regexps ⇒ Object
Applies regexps.
-
#remove_spacing ⇒ Object
Perform ‘remove_spacing’ action.
- #sanitize ⇒ Object
-
#selector ⇒ Object
Perform ‘selector’ action, to choose a new root.
-
#want_rule(rule) ⇒ Object
Return whether or not we want to keep a rule.
Constructor Details
#initialize(html, config, opts = {}) ⇒ Sanitizer
Returns a new instance of Sanitizer.
20 21 22 23 24 |
# File 'lib/sitediff/sanitize.rb', line 20

# Create a sanitizer for a single HTML payload.
#
# @param html [String] raw HTML to be sanitized
# @param config [Hash, nil] sanitization settings; the methods in this class
#   read the string keys 'remove_spacing', 'selector', 'dom_transform' and
#   'sanitization'
# @param opts [Hash, nil] extra options; only +:path+ is read by this class
def initialize(html, config, opts = {})
  @html = html
  # Fall back to empty hashes so later lookups such as @config['selector']
  # or @opts[:path] do not raise NoMethodError when nil was passed in.
  @config = config || {}
  @opts = opts || {}
end
Class Method Details
.domify(str, force_doc = false) ⇒ Object
Parse HTML into a node
171 172 173 174 175 176 177 |
# File 'lib/sitediff/sanitize.rb', line 171

# Parse HTML into a Nokogiri node.
#
# A full document is produced when +force_doc+ is set or when a DOCTYPE
# declaration appears within the first 512 characters; otherwise the input
# is parsed as a fragment.
#
# @param str [String] HTML source
# @param force_doc [Boolean] always parse as a complete document
# @return [Object] result of Nokogiri::HTML(str) or Nokogiri::HTML.fragment(str)
def self.domify(str, force_doc = false)
  # The DOCTYPE keyword is case-insensitive in HTML ("<!doctype html>" is
  # valid), so the sniff must not depend on letter case.
  if force_doc || /<!DOCTYPE/i.match?(str[0, 512])
    Nokogiri::HTML(str)
  else
    Nokogiri::HTML.fragment(str)
  end
end
.prettify(obj) ⇒ Object
Pretty-print some HTML
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/sitediff/sanitize.rb', line 138

# Pretty-print some HTML.
#
# Runs the object through the bundled 'pretty_print.xsl' stylesheet, then
# trims the serialized output: the XML declaration, the wrapping <html>
# tags, the shared leading indentation and blank lines are all removed.
#
# @param obj [Object] anything accepted by to_document
# @return [String] the cleaned-up, pretty-printed markup
def self.prettify(obj)
  # Compile the stylesheet once and cache it on the class.
  @stylesheet ||= Nokogiri::XSLT(
    File.read(File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl'))
  )

  # Work on the serialized string rather than iterating over
  # pretty.css('html'): the original author notes that approach tends to
  # segfault Nokogiri.
  rendered = @stylesheet.apply_to(to_document(obj))

  # Re-encode from binary to UTF-8, dropping every byte that cannot be
  # converted, to avoid encoding errors further down the pipeline.
  text = rendered.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')

  # Strip the XML declaration and the enclosing <html> tags.
  text = text.sub(/\A<\?xml.*$\n/, '')
             .sub(/\A^<html>$\n/, '')
             .sub(%r{</html>\n\Z}, '')

  # Remove the top-level indentation (measured on the leading whitespace),
  # then discard blank lines.
  indent = text[/\A(\s*)/, 1].size
  text.gsub(/^\s{,#{indent}}/, '').gsub(/^\s*$\n/, '')
end
.remove_node_spacing(node) ⇒ Object
Remove double-spacing inside text nodes
120 121 122 123 124 125 |
# File 'lib/sitediff/sanitize.rb', line 120

# Remove double-spacing inside text nodes.
#
# Only text nodes are rewritten, so attribute values keep their spacing.
#
# @param node [Object] a node responding to +xpath+ (a Nokogiri node in
#   normal use)
# @return [Object] whatever +each+ on the xpath result returns
def self.remove_node_spacing(node)
  # Use a relative path: an absolute '//text()' is evaluated from the root
  # of the owning document instead of from the node that was passed in.
  node.xpath('.//text()').each do |text_node|
    # Collapse every run of spaces into a single space.
    text_node.content = text_node.content.squeeze(' ')
  end
end
.select_fragments(node, sel) ⇒ Object
Get a fragment consisting of the elements matching the selector(s)
128 129 130 131 132 133 134 135 |
# File 'lib/sitediff/sanitize.rb', line 128

# Get a fragment consisting of the elements matching the selector(s).
#
# Choosing a new root always yields a DocumentFragment, so any DOCTYPE and
# similar document-level content is lost.
#
# @param node [Object] node to search; reused as the container when it is
#   already a fragment
# @param sel [Object] CSS selector(s) handed to +node.css+
# @return [Object] fragment whose children are the matched elements
def self.select_fragments(node, sel)
  matches = node.css(sel)
  root = node.fragment? ? node : Nokogiri::HTML.fragment('')
  root.children = matches
  root
end
.to_document(obj) ⇒ Object
Force this object to be a document, so we can apply a stylesheet
180 181 182 183 184 185 186 187 188 189 |
# File 'lib/sitediff/sanitize.rb', line 180

# Force this object to be a document, so we can apply a stylesheet.
#
# @param obj [Object] a Nokogiri document, any other Nokogiri node
#   (element, fragment, text, ...), or a value domify can parse
# @return [Object] +obj+ itself when it already is a document, otherwise a
#   freshly parsed document
def self.to_document(obj)
  # case/when uses is_a? semantics, so subclasses are recognised too;
  # comparing obj.class with == missed e.g. Nokogiri::XML::Element, which
  # then fell through to the parsing branch as if it were a string.
  # Document must be tested before Node because documents are nodes as well.
  case obj
  when Nokogiri::XML::Document
    # Covers Nokogiri::HTML::Document, which subclasses XML::Document.
    obj
  when Nokogiri::XML::Node
    # Any node or fragment: serialize it and re-parse as a full document.
    domify(obj.to_s, true)
  else
    # Anything else: parse it first, then promote the result.
    to_document(domify(obj, false))
  end
end
Instance Method Details
#canonicalize_rule(name) ⇒ Object
Canonicalize a simple rule, e.g. ‘remove_spacing’ or ‘selector’. The configured value may be a scalar, a hash, or an array of hashes; it is normalized to an array of hashes, and the single rule that applies (if any) is returned.
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/sitediff/sanitize.rb', line 56

# Canonicalize a simple rule, e.g. 'remove_spacing' or 'selector'.
#
# The configured value may be a scalar, a hash with a 'value' key, or an
# array of such hashes. It is normalized to an array of hashes, filtered
# with want_rule, and the single surviving rule is returned.
#
# @param name [String] configuration key to look up in @config
# @return [Hash, nil] the one applicable rule, or nil when the key is unset
#   (nil/false) or no rule applies
# @raise [RuntimeError] when more than one rule applies
def canonicalize_rule(name)
  rules = @config[name]
  return nil unless rules

  # Decide by type instead of probing with #[]: booleans have no #[], and
  # String#[] does substring lookup, so a selector such as 'div.value' used
  # to be mistaken for a hash. key? (rather than truthiness) keeps
  # { 'value' => false } recognised as a rule hash.
  rules =
    if rules.is_a?(Array) && rules.first.is_a?(Hash) && rules.first.key?('value')
      rules # already an array of rule hashes
    elsif rules.is_a?(Hash) && rules.key?('value')
      [rules] # single rule hash
    else
      [{ 'value' => rules }] # bare value
    end

  want = rules.select { |r| want_rule(r) }
  return nil if want.empty?
  raise "Too many matching rules of type #{name}" if want.size > 1

  want.first
end
#dom_transforms ⇒ Object
Perform DOM transforms
107 108 109 110 111 112 113 114 115 |
# File 'lib/sitediff/sanitize.rb', line 107

# Perform DOM transforms.
#
# Every 'dom_transform' rule accepted by want_rule is turned into a
# DomTransform and applied to the current node, in configuration order.
# Does nothing when no 'dom_transform' rules are configured.
def dom_transforms
  configured = @config['dom_transform']
  return unless configured

  configured.select { |rule| want_rule(rule) }.each do |rule|
    DomTransform.create(rule).apply(@node)
  end
end
#regexps ⇒ Object
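Applies regexps. Selector-scoped rules are applied to the DOM node first; the node is then rendered to pretty-printed HTML (and discarded), and the remaining rules are applied to that HTML string.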
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/sitediff/sanitize.rb', line 89

# Applies regexps.
#
# Rules that carry a selector are applied to the DOM node. The node is then
# rendered to pretty-printed HTML and dropped, and the remaining rules are
# applied to that HTML string. Does nothing when no 'sanitization' rules
# are configured.
def regexps
  configured = @config['sanitization']
  return unless configured

  active = configured.select { |rule| want_rule(rule) }
                     .map { |rule| Regexp.create(rule) }
  node_rules, text_rules = active.partition(&:selector?)

  node_rules.each { |rule| rule.apply(@node) }

  # From here on we work with the serialized markup; the node is released.
  pretty = Sanitizer.prettify(@node)
  @node = nil

  # Re-encode from binary to UTF-8, dropping every byte that cannot be
  # converted, to avoid encoding errors when the regexps run. (Returning
  # the string unmodified would be an alternative.)
  @html = pretty.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')

  text_rules.each { |rule| rule.apply(@html) }
end
#remove_spacing ⇒ Object
Perform ‘remove_spacing’ action
77 78 79 80 |
# File 'lib/sitediff/sanitize.rb', line 77

# Perform 'remove_spacing' action.
#
# Collapses repeated spaces in the current node's text, but only when an
# applicable 'remove_spacing' rule exists and its 'value' is truthy.
def remove_spacing
  rule = canonicalize_rule('remove_spacing')
  return unless rule && rule['value']

  Sanitizer.remove_node_spacing(@node)
end
#sanitize ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/sitediff/sanitize.rb', line 26

# Run the whole sanitization pipeline over the HTML given at construction.
#
# Steps, in order: remove_spacing, selector, dom_transforms, regexps.
#
# @return [String] the sanitized markup: the string left in @html by the
#   pipeline if there is one, otherwise the pretty-printed node; '' for
#   empty input
def sanitize
  # Quick return on empty input.
  return '' if @html == ''

  @node = Sanitizer.domify(@html)
  @html = nil

  %i[remove_spacing selector dom_transforms regexps].each { |step| send(step) }

  # If no step produced a string, serialize the node ourselves.
  @html || Sanitizer.prettify(@node)
end
#selector ⇒ Object
Perform ‘selector’ action, to choose a new root
83 84 85 86 |
# File 'lib/sitediff/sanitize.rb', line 83

# Perform 'selector' action, to choose a new root.
#
# When an applicable 'selector' rule exists, the current node is replaced
# by a fragment holding only the elements matching the rule's 'value'.
def selector
  rule = canonicalize_rule('selector')
  return unless rule

  @node = Sanitizer.select_fragments(@node, rule['value'])
end
#want_rule(rule) ⇒ Object
Return whether or not we want to keep a rule
41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/sitediff/sanitize.rb', line 41

# Return whether or not we want to keep a rule.
#
# A rule is rejected when it is nil/false, when it is flagged 'disabled',
# or when it has a 'path' regexp that does not match the path being
# processed (@opts[:path]). A 'path' restriction is ignored when no path
# was supplied in the options.
#
# @param rule [Hash, nil] rule hash with optional 'disabled' and 'path' keys
# @return [Boolean] true when the rule should be applied
def want_rule(rule)
  return false unless rule
  return false if rule['disabled']

  # Filter out if path regexp doesn't match. match? gives a real boolean
  # (instead of MatchData/nil), so every branch returns the same type.
  if (pathre = rule['path']) && (path = @opts[:path])
    return ::Regexp.new(pathre).match?(path)
  end

  true
end