Class: Discourse::Diff::HtmlTokenizer

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
lib/discourse/diff.rb

Constant Summary collapse

# Wrapper tags that never reach the token stream: .tokenize wraps its input in
# them before parsing, and start_element/end_element skip them.
# Frozen so the shared list cannot be mutated by accident.
USELESS_TAGS = %w{html body}.freeze
# Tags for which end_element emits no closing token.
AUTOCLOSING_TAGS = %w{area base br col embed hr img input meta}.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ HtmlTokenizer

Returns a new instance of HtmlTokenizer.



245
246
247
# File 'lib/discourse/diff.rb', line 245

# Starts the handler with an empty token list; the element and character
# callbacks of this class append to it.
def initialize
  @tokens = []
end

Instance Attribute Details

#tokens ⇒ Object

Returns the value of attribute tokens.



243
244
245
# File 'lib/discourse/diff.rb', line 243

# Returns the token array accumulated so far. This is the same array the
# callbacks append to, not a copy.
def tokens
  @tokens
end

Class Method Details

.tokenize(html) ⇒ Object



249
250
251
252
253
254
# File 'lib/discourse/diff.rb', line 249

# Tokenizes an HTML fragment.
#
# The fragment is wrapped in <html><body>…</body></html> and fed to Nokogiri's
# HTML SAX parser with a fresh instance of this class as the handler.
#
# @param html [#to_s] the HTML fragment to tokenize
# @return [Array] the tokens collected by the handler
def self.tokenize(html)
  handler = new
  Nokogiri::HTML::SAX::Parser.new(handler).parse("<html><body>#{html}</body></html>")
  handler.tokens
end

Instance Method Details

#characters(string) ⇒ Object



269
270
271
# File 'lib/discourse/diff.rb', line 269

# SAX callback for a run of text.
#
# The text is split so that every non-word character is its own token and
# every word keeps its trailing spaces/tabs; each piece is HTML-escaped and
# appended to the token list.
#
# @param string [String] the text reported by the parser
# @return [Array] the token list, after appending
def characters(string)
  pieces = string.scan(/\W|\w+[ \t]*/)
  escaped = pieces.map { |piece| CGI.escapeHTML(piece) }
  @tokens.concat(escaped)
end

#end_element(name) ⇒ Object



264
265
266
267
# File 'lib/discourse/diff.rb', line 264

# SAX callback for a closing tag.
#
# Appends "</name>" to the token list, except for the wrapper tags in
# USELESS_TAGS and the tags in AUTOCLOSING_TAGS, which produce no token.
#
# @param name [String] the element name
# @return [Array, nil] the token list when a token was appended, otherwise nil
def end_element(name)
  skipped = USELESS_TAGS.include?(name) || AUTOCLOSING_TAGS.include?(name)
  @tokens << "</#{name}>" unless skipped
end

#start_element(name, attributes = []) ⇒ Object



257
258
259
260
261
# File 'lib/discourse/diff.rb', line 257

# SAX callback for an opening tag.
#
# Appends "<name attr="value" ...>" to the token list, except for the wrapper
# tags in USELESS_TAGS, which produce no token.
#
# Attribute values are HTML-escaped before being placed inside the double
# quotes: a value containing `"`, `<` or `&` would otherwise yield broken
# markup or let the value break out of the attribute.
#
# @param name [String] the element name
# @param attributes [Array<Array>] [name, value] pairs; a nil value is
#   rendered as an empty string
# @return [Array, nil] the token list when a token was appended, otherwise nil
def start_element(name, attributes = [])
  return if USELESS_TAGS.include?(name)

  # to_s keeps nil-valued attributes rendering as name="" instead of raising.
  attrs = attributes.map { |key, value| " #{key}=\"#{CGI.escapeHTML(value.to_s)}\"" }.join
  @tokens << "<#{name}#{attrs}>"
end