Class: DiscourseDiff::HtmlTokenizer

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
lib/discourse_diff.rb

Constant Summary collapse

USELESS_TAGS =
%w[html body]
AUTOCLOSING_TAGS =
%w[area base br col embed hr img input meta]

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ HtmlTokenizer

Returns a new instance of HtmlTokenizer.



263
264
265
# File 'lib/discourse_diff.rb', line 263

# Builds a tokenizer whose token buffer (@tokens) starts out as an empty
# Array.
def initialize
  @tokens = []
end

Instance Attribute Details

#tokens ⇒ Object

Returns the value of attribute tokens.



261
262
263
# File 'lib/discourse_diff.rb', line 261

# Reader for the token buffer.
#
# @return [Array] the object stored in @tokens itself (not a copy)
def tokens
  @tokens
end

Class Method Details

.tokenize(html) ⇒ Object



267
268
269
270
271
272
# File 'lib/discourse_diff.rb', line 267

# Tokenizes an HTML fragment.
#
# The fragment is wrapped in <html><body>…</body></html> before being fed to
# a Nokogiri HTML SAX parser that uses a fresh instance of this class as its
# event handler.
#
# @param html [String] HTML fragment to tokenize
# @return [Object] the handler's +tokens+ after parsing has finished
def self.tokenize(html)
  handler = new
  wrapped_markup = "<html><body>#{html}</body></html>"
  Nokogiri::HTML::SAX::Parser.new(handler).parse(wrapped_markup)
  handler.tokens
end

Instance Method Details

#characters(string) ⇒ Object



287
288
289
# File 'lib/discourse_diff.rb', line 287

# Splits a run of text into tokens and appends them, HTML-escaped, to
# @tokens.
#
# Each token is either a single non-word character (\W) or a run of word
# characters (\w+) together with any spaces/tabs that directly follow it.
#
# @param string [String] text to split
# @return [Array] @tokens, after the new tokens have been appended
def characters(string)
  pieces = string.scan(/\W|\w+[ \t]*/)
  escaped_pieces = pieces.map { |piece| CGI.escapeHTML(piece) }
  @tokens.concat(escaped_pieces)
end

#end_element(name) ⇒ Object



282
283
284
285
# File 'lib/discourse_diff.rb', line 282

# Appends a closing-tag token ("</name>") to @tokens.
#
# Nothing is appended for names listed in USELESS_TAGS or AUTOCLOSING_TAGS.
#
# @param name [String] element name
# @return [Array, nil] @tokens when a token was appended, nil otherwise
def end_element(name)
  suppressed = USELESS_TAGS.include?(name) || AUTOCLOSING_TAGS.include?(name)
  @tokens << "</#{name}>" unless suppressed
end

#start_element(name, attributes = []) ⇒ Object



275
276
277
278
279
# File 'lib/discourse_diff.rb', line 275

# Appends an opening-tag token ("<name attr=\"value\" ...>") to @tokens.
#
# Nothing is appended for names listed in USELESS_TAGS. Attribute values are
# HTML-escaped. An attribute whose value is nil (a valueless attribute such
# as +disabled+) is rendered as a bare name instead of being passed to
# CGI.escapeHTML, which raises TypeError on nil.
#
# @param name [String] element name
# @param attributes [Array<Array>] list of [name, value] pairs
# @return [Array, nil] @tokens when a token was appended, nil otherwise
def start_element(name, attributes = [])
  return if USELESS_TAGS.include?(name)

  attrs = attributes.map do |attr_name, value|
    # Valueless (boolean) attribute: keep just the name.
    next " #{attr_name}" if value.nil?

    " #{attr_name}=\"#{CGI.escapeHTML(value)}\""
  end
  @tokens << "<#{name}#{attrs.join}>"
end