Class: SiteDiff::Sanitizer

Inherits:
Object
  • Object
show all
Defined in:
lib/sitediff/sanitize.rb,
lib/sitediff/sanitize/regexp.rb,
lib/sitediff/sanitize/dom_transform.rb

Defined Under Namespace

Classes: DomTransform, InvalidSanitization, Regexp

Constant Summary collapse

# Sanitization rule types, grouped by the shape of their configuration value:
# 'dom_transform' and 'sanitization' are read as lists of rules, while
# 'selector' and 'remove_spacing' are single (scalar-style) rules.
# NOTE(review): nothing in the visible code reads TOOLS — confirm its users.
TOOLS =
{
  array: %w[dom_transform sanitization],
  scalar: %w[selector remove_spacing]
}.freeze
# Names of DOM transform types.
# NOTE(review): presumably the types DomTransform.create accepts — confirm
# against lib/sitediff/sanitize/dom_transform.rb.
DOM_TRANSFORMS =
Set.new(%w[remove unwrap_root unwrap remove_class])

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html, config, opts = {}) ⇒ Sanitizer

Returns a new instance of Sanitizer.



20
21
22
23
24
# File 'lib/sitediff/sanitize.rb', line 20

# Build a sanitizer for one piece of HTML.
#
# @param html the HTML to sanitize (compared against '' and parsed later by
#   #sanitize)
# @param config rule configuration, indexed by rule type name
#   (e.g. 'selector', 'dom_transform', 'sanitization')
# @param opts extra options; #want_rule reads opts[:path]
def initialize(html, config, opts = {})
  @html, @config, @opts = html, config, opts
end

Class Method Details

.domify(str, force_doc = false) ⇒ Object

Parse HTML into a node



171
172
173
174
175
176
177
# File 'lib/sitediff/sanitize.rb', line 171

# Parse an HTML string into a Nokogiri node.
#
# Input that carries a DOCTYPE within its first 512 characters (or any input
# when +force_doc+ is true) is parsed as a full document; everything else is
# parsed as a document fragment.
#
# @param str [String] HTML source
# @param force_doc [Boolean] always parse as a full document
# @return the result of Nokogiri::HTML(str) or Nokogiri::HTML.fragment(str)
def self.domify(str, force_doc = false)
  # The DOCTYPE keyword is case-insensitive in HTML ("<!doctype html>" is as
  # valid as "<!DOCTYPE html>"), so it must be matched without regard to case.
  if force_doc || /<!DOCTYPE/i.match(str[0, 512])
    Nokogiri::HTML(str)
  else
    Nokogiri::HTML.fragment(str)
  end
end

.prettify(obj) ⇒ Object

Pretty-print some HTML



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/sitediff/sanitize.rb', line 138

# Pretty-print some HTML.
#
# The input is coerced to a document with to_document, run through the
# bundled pretty_print.xsl stylesheet, and the resulting text is tidied:
# invalid bytes, the XML declaration, the enclosing <html> tags, blank lines
# and the common top-level indentation are removed.
#
# @param obj anything accepted by to_document
# @return [String] the cleaned-up, pretty-printed markup
def self.prettify(obj)
  # The stylesheet is parsed on first use and memoized.
  @stylesheet ||= begin
    stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
    Nokogiri::XSLT(File.read(stylesheet_path))
  end

  # Pull out the html element's children
  # The obvious way to do this is to iterate over pretty.css('html'),
  # but that tends to segfault Nokogiri
  str = @stylesheet.apply_to(to_document(obj))

  # There's a lot of cruft left over that we don't want.

  # Prevent UTF-8 encoding errors by dropping invalid bytes only.
  # Transcoding from 'binary' (as was done before) treats every non-ASCII
  # byte as undefined and therefore strips all non-ASCII characters, valid
  # or not. Scrubbing keeps valid characters; strings in another encoding are
  # transcoded to UTF-8 first.
  unless str.encoding == Encoding::UTF_8
    str = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
  end
  str = str.scrub('') # always a fresh copy, so the in-place edits below are safe

  # Remove xml declaration and <html> tags
  str.sub!(/\A<\?xml.*$\n/, '')
  str.sub!(/\A^<html>$\n/, '')
  str.sub!(%r{</html>\n\Z}, '')

  # Remove blank lines. This happens before the indentation is measured so
  # that a leading or interior blank line cannot distort it.
  str.gsub!(/^\s*$\n/, '')

  # Remove top-level indentation. Only horizontal whitespace is matched:
  # \s would also match a newline and eat into the following line's indent.
  indent = str[/\A[ \t]*/].size
  str.gsub!(/^[ \t]{,#{indent}}/, '')

  str
end

.remove_node_spacing(node) ⇒ Object

Remove double-spacing inside text nodes



120
121
122
123
124
125
# File 'lib/sitediff/sanitize.rb', line 120

# Remove double-spacing inside text nodes.
#
# Every run of two or more spaces in a text node below +node+ is collapsed
# to a single space. Attributes are not touched.
#
# @param node a node responding to #xpath whose text descendants are
#   rewritten in place
# @return the set of text nodes returned by the XPath query
def self.remove_node_spacing(node)
  # Use a context-relative path ('.//'): an absolute '//text()' is evaluated
  # from the root of the owning document, not from +node+.
  # NOTE(review): for a detached document fragment that root presumably does
  # not contain the fragment's content, which would make the absolute query a
  # no-op — confirm against Nokogiri.
  node.xpath('.//text()').each do |text|
    text.content = text.content.squeeze(' ')
  end
end

.select_fragments(node, sel) ⇒ Object

Get a fragment consisting of the elements matching the selector(s)



128
129
130
131
132
133
134
135
# File 'lib/sitediff/sanitize.rb', line 128

# Get a fragment consisting of the elements matching the selector(s).
#
# If +node+ is already a fragment it is reused (and modified in place);
# otherwise a new empty HTML fragment is created, so any DOCTYPE and other
# document-level content is dropped.
#
# @param node the node to search; must respond to #css and #fragment?
# @param sel CSS selector(s) handed to node.css
# @return the fragment whose children are the matched elements
def self.select_fragments(node, sel)
  matches = node.css(sel)
  root = node.fragment? ? node : Nokogiri::HTML.fragment('')
  root.children = matches
  root
end

.to_document(obj) ⇒ Object

Force this object to be a document, so we can apply a stylesheet



180
181
182
183
184
185
186
187
188
189
# File 'lib/sitediff/sanitize.rb', line 180

# Force this object to be a document, so we can apply a stylesheet.
#
# * A document is returned unchanged.
# * Any other node (element, fragment, ...) is serialized with to_s and
#   re-parsed as a full document.
# * Anything else is parsed with domify first and then converted.
#
# @param obj a Nokogiri document, a Nokogiri node/fragment, or input
#   accepted by domify
# @return a document
def self.to_document(obj)
  # Match on the class hierarchy rather than on the exact class, so that
  # subclasses such as Nokogiri::XML::Element are handled as nodes instead
  # of falling through to the branch meant for raw input.
  # Documents are themselves nodes, so they must be tested first.
  case obj
  when Nokogiri::XML::Document
    obj
  when Nokogiri::XML::Node
    domify(obj.to_s, true)
  else
    to_document(domify(obj, false))
  end
end

Instance Method Details

#canonicalize_rule(name) ⇒ Object

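Canonicalize a simple rule, e.g. ‘remove_spacing’ or ‘selector’. The configured value may be a simple value, a hash, or an array of hashes. It is normalized to an array of hashes, and the single hash that applies is returned (nil if the rule is not configured or none applies; an error is raised if more than one applies).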



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/sitediff/sanitize.rb', line 56

# Canonicalize a simple rule, e.g. 'remove_spacing' or 'selector'.
#
# The configured value may be a simple value, a hash, or an array of hashes.
# It is normalized to an array of hashes, the hashes are filtered with
# want_rule, and the single remaining hash is returned.
#
# @param name [String] rule type to look up in @config
# @return [Hash, nil] the applicable rule (its setting is under 'value'), or
#   nil when the rule is not configured or no rule applies
# @raise [RuntimeError] if more than one rule of this type applies
def canonicalize_rule(name)
  (rules = @config[name]) || (return nil)

  # Decide by type instead of probing with rules[0] / rules['value']:
  # probing raises on scalars without #[] (e.g. true), mistakes a String
  # containing "value" for a hash, and misreads hashes whose 'value' is
  # false.
  rules =
    if rules.is_a?(Array)
      # Already an array
      rules
    elsif rules.is_a?(Hash) && rules.key?('value')
      # Hash, put it in an array
      [rules]
    else
      # Scalar (or a hash without a 'value' key), put it in a hash
      [{ 'value' => rules }]
    end

  want = rules.select { |r| want_rule(r) }
  return nil if want.empty?
  raise "Too many matching rules of type #{name}" if want.size > 1

  want.first
end

#dom_transforms ⇒ Object

Perform DOM transforms



107
108
109
110
111
112
113
114
115
# File 'lib/sitediff/sanitize.rb', line 107

# Perform DOM transforms.
#
# Every 'dom_transform' rule accepted by want_rule is turned into a
# DomTransform and applied to @node, in configuration order.
#
# @return the applied rules, or nil when no 'dom_transform' is configured
def dom_transforms
  configured = @config['dom_transform']
  return unless configured

  configured.select { |candidate| want_rule(candidate) }.each do |wanted|
    DomTransform.create(wanted).apply(@node)
  end
end

#regexps ⇒ Object

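Applies regexp sanitization rules. Also pretty-prints the DOM into an HTML string, so that rules without a selector can be applied to the text.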



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/sitediff/sanitize.rb', line 89

# Apply regexp sanitization rules.
#
# Rules that report selector? are applied to the DOM (@node). The DOM is
# then pretty-printed into @html, @node is cleared, and the remaining
# (global) rules are applied to the HTML string.
# Does nothing when no 'sanitization' rules are configured.
#
# Note: +Regexp+ here is resolved inside this class; the namespace declares
# its own Regexp class (lib/sitediff/sanitize/regexp.rb).
#
# @return the global rules that were applied, or nil when unconfigured
def regexps
  (rules = @config['sanitization']) || return
  rules = rules.select { |r| want_rule(r) }.map { |r| Regexp.create(r) }
  selector_rules, global_rules = rules.partition(&:selector?)

  selector_rules.each { |r| r.apply(@node) }
  @html = Sanitizer.prettify(@node)
  @node = nil
  # Prevent UTF-8 encoding errors by dropping invalid bytes only.
  # Transcoding from 'binary' (as was done before) treats every non-ASCII
  # byte as undefined and therefore strips all non-ASCII characters, valid
  # or not. Scrubbing keeps valid characters; strings in another encoding are
  # transcoded to UTF-8 first.
  unless @html.encoding == Encoding::UTF_8
    @html = @html.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
  end
  @html = @html.scrub('')
  global_rules.each { |r| r.apply(@html) }
end

#remove_spacing ⇒ Object

Perform ‘remove_spacing’ action



77
78
79
80
# File 'lib/sitediff/sanitize.rb', line 77

# Perform the 'remove_spacing' action.
#
# Collapses double spacing in the text nodes of @node when an applicable
# 'remove_spacing' rule exists and its 'value' is truthy.
#
# @return the result of Sanitizer.remove_node_spacing, or nil when skipped
def remove_spacing
  rule = canonicalize_rule('remove_spacing')
  return unless rule && rule['value']

  Sanitizer.remove_node_spacing(@node)
end

#sanitize ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/sitediff/sanitize.rb', line 26

# Run the whole sanitization pipeline over the HTML given to the
# constructor.
#
# The HTML is parsed into @node, then the remove_spacing, selector,
# dom_transforms and regexps steps run in that order. If a step left a
# string in @html (regexps does, when it has rules) that string is the
# result; otherwise @node is pretty-printed.
#
# @return [String] the sanitized HTML ('' for empty input)
def sanitize
  return '' if @html == '' # Nothing to do for empty input

  @node = Sanitizer.domify(@html)
  @html = nil

  # The order matters: each step works on what the previous one left behind.
  %i[remove_spacing selector dom_transforms regexps].each { |step| send(step) }

  @html || Sanitizer.prettify(@node)
end

#selector ⇒ Object

Perform ‘selector’ action, to choose a new root



83
84
85
86
# File 'lib/sitediff/sanitize.rb', line 83

# Perform the 'selector' action, to choose a new root.
#
# When an applicable 'selector' rule exists, @node is replaced by a fragment
# holding only the elements that match the rule's 'value'.
#
# @return the new @node, or nil when no selector rule applies
def selector
  rule = canonicalize_rule('selector')
  return unless rule

  @node = Sanitizer.select_fragments(@node, rule['value'])
end

#want_rule(rule) ⇒ Object

Return whether or not we want to keep a rule



41
42
43
44
45
46
47
48
49
50
51
# File 'lib/sitediff/sanitize.rb', line 41

# Return whether or not we want to keep a rule.
#
# A rule is rejected when it is missing or has a truthy 'disabled' entry.
# When the rule has a 'path' pattern and a path was supplied in
# @opts[:path], the rule is kept only if the pattern matches that path.
#
# @param rule the rule to check (indexed with 'disabled' and 'path')
# @return false, true, or the result of Regexp#match (MatchData or nil)
#   when a path comparison was made
def want_rule(rule)
  return false if !rule || rule['disabled']

  # Only filter by path when both the pattern and the current path are known.
  pattern = rule['path']
  current_path = pattern && @opts[:path]
  return true unless current_path

  ::Regexp.new(pattern).match(current_path)
end