Class: Discourse::Diff::HtmlTokenizer

Inherits:

Nokogiri::XML::SAX::Document

Object
Nokogiri::XML::SAX::Document
Discourse::Diff::HtmlTokenizer

show all

Defined in: lib/discourse/diff.rb

Constant Summary collapse

USELESS_TAGS =

%w{html body}

AUTOCLOSING_TAGS =

%w{area base br col embed hr img input meta}

Instance Attribute Summary collapse

#tokens ⇒ Object

Returns the value of attribute tokens.

Class Method Summary collapse

.tokenize(html) ⇒ Object

Instance Method Summary collapse

#characters(string) ⇒ Object
#end_element(name) ⇒ Object
#initialize ⇒ HtmlTokenizer constructor

A new instance of HtmlTokenizer.
#start_element(name, attributes = []) ⇒ Object

Constructor Details

#initialize ⇒ `HtmlTokenizer`

Returns a new instance of HtmlTokenizer.



245
246
247

# File 'lib/discourse/diff.rb', line 245

# Creates a tokenizer whose token list starts out empty.
#
# @return [HtmlTokenizer] a new instance with @tokens set to an empty Array
def initialize
  # Accumulator that the element/character callbacks append to.
  @tokens = []
end

Instance Attribute Details

#tokens ⇒ `Object`

Returns the value of attribute tokens.



243
244
245

# File 'lib/discourse/diff.rb', line 243

# Returns the list of tokens collected so far.
#
# The list is filled by #start_element (opening tags), #end_element
# (closing tags) and #characters (HTML-escaped text fragments).
#
# @return [Array<String>] the same Array instance held in @tokens (not a copy)
def tokens
  @tokens
end

Class Method Details

.tokenize(html) ⇒ `Object`

# File 'lib/discourse/diff.rb', line 249

# Splits an HTML fragment into a flat list of tag and text tokens.
#
# The fragment is wrapped in <html><body>...</body></html> before being fed
# to Nokogiri's HTML SAX parser, with a fresh tokenizer instance acting as
# the SAX document handler.
#
# @param html [#to_s] HTML fragment to tokenize (interpolated into the wrapper)
# @return [Array<String>] the tokens gathered by the handler during parsing
def self.tokenize(html)
  tokenizer = new
  Nokogiri::HTML::SAX::Parser.new(tokenizer).parse("<html><body>#{html}</body></html>")
  tokenizer.tokens
end

Instance Method Details

#characters(string) ⇒ `Object`



269
270
271

# File 'lib/discourse/diff.rb', line 269

# SAX callback for text content: breaks the text into small fragments and
# appends each one, HTML-escaped, to the token list.
#
# Every non-word character becomes a token of its own; a run of word
# characters forms one token together with any spaces/tabs that directly
# follow it.
#
# @param string [String] raw text handed over by the parser
# @return [Array<String>] the token list after appending
def characters(string)
  fragments = string.scan(/\W|\w+[ \t]*/)
  escaped = fragments.map { |fragment| CGI.escapeHTML(fragment) }
  @tokens.concat(escaped)
end

#end_element(name) ⇒ `Object`

# File 'lib/discourse/diff.rb', line 264

# SAX callback for a closing tag: appends "</name>" to the token list.
#
# Nothing is recorded for tags listed in USELESS_TAGS or AUTOCLOSING_TAGS.
#
# @param name [String] element name reported by the parser
# @return [Array<String>, nil] the token list when a token was added,
#   nil when the tag was skipped
def end_element(name)
  skipped = USELESS_TAGS.include?(name) || AUTOCLOSING_TAGS.include?(name)
  @tokens.push("</#{name}>") unless skipped
end

#start_element(name, attributes = []) ⇒ `Object`

# File 'lib/discourse/diff.rb', line 257

# SAX callback for an opening tag: appends "<name attr="value" ...>" to the
# token list.
#
# Nothing is recorded for tags listed in USELESS_TAGS.
#
# Attribute values are HTML-escaped before being re-serialized. The parser
# hands over entity-decoded values, so a value containing a double quote
# would otherwise terminate the attribute early and could inject additional
# attributes into the emitted tag. Text content gets the same treatment in
# #characters.
#
# @param name [String] element name reported by the parser
# @param attributes [Array<Array(String, String)>] [name, value] pairs;
#   a nil value is rendered as an empty string (attr="")
# @return [Array<String>, nil] the token list when a token was added,
#   nil when the tag was skipped
def start_element(name, attributes = [])
  return if USELESS_TAGS.include?(name)
  attrs = attributes.map do |attr_name, attr_value|
    # to_s keeps value-less attributes working: escapeHTML rejects nil.
    " #{attr_name}=\"#{CGI.escapeHTML(attr_value.to_s)}\""
  end.join
  @tokens << "<#{name}#{attrs}>"
end

Class: Discourse::Diff::HtmlTokenizer

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ HtmlTokenizer

Instance Attribute Details

#tokens ⇒ Object

Class Method Details

.tokenize(html) ⇒ Object

Instance Method Details

#characters(string) ⇒ Object

#end_element(name) ⇒ Object

#start_element(name, attributes = []) ⇒ Object

#initialize ⇒ `HtmlTokenizer`

#tokens ⇒ `Object`

.tokenize(html) ⇒ `Object`

#characters(string) ⇒ `Object`

#end_element(name) ⇒ `Object`

#start_element(name, attributes = []) ⇒ `Object`