Class: GoogleTranslateDiff::Tokenizer
- Inherits:
-
Ox::Sax
- Object
- Ox::Sax
- GoogleTranslateDiff::Tokenizer
- Defined in:
- lib/google_translate_diff/tokenizer.rb
Class Method Summary collapse
Instance Method Summary collapse
- #attr(name, value) ⇒ Object
- #end_element(name) ⇒ Object
-
#initialize(source) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
- #start_element(name) ⇒ Object
- #text(_) ⇒ Object
-
#tokens ⇒ Object
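Merges adjacent raw tokens of the same type, passing each completed text run through `sentences` (the source comment here is only the lint directive `rubocop:disable Metrics/AbcSize`).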
Constructor Details
#initialize(source) ⇒ Tokenizer
Returns a new instance of Tokenizer.
2 3 4 5 6 7 8 9 |
# File 'lib/google_translate_diff/tokenizer.rb', line 2
#
# Sets up an empty tokenizer for the given markup.
#
# @param source [String] the markup to tokenize; its byte size is compared
#   against the parser position when an element closes
def initialize(source)
  # Keep the parent initializer in the chain (Ox::Sax#initialize takes no
  # arguments), as RuboCop's Lint/MissingSuper expects.
  super()
  @source = source
  # Declared as nil and only read by the callbacks in this class; presumably
  # Ox fills it in with the current parse position — TODO confirm against
  # the Ox::Sax documentation.
  @pos = nil
  @tokens = []
  @context = []  # stack of the element names that are currently open
  @sequence = [] # one token type per SAX event, in document order
  @indicies = [] # offsets (@pos - 1) recorded by the SAX callbacks
end
Class Method Details
.tokenize(value) ⇒ Object
94 95 96 97 98 99 100 101 |
# File 'lib/google_translate_diff/tokenizer.rb', line 94
#
# Runs Ox's SAX parser over +value+ with a fresh tokenizer as the handler
# and returns the tokens it produced. Nothing is written to stdout.
#
# @param value [String, nil] markup to tokenize
# @return [Array] the tokenizer's +tokens+; an empty array when +value+ is nil
def tokenize(value)
  return [] if value.nil?

  tokenizer = new(value)
  # The tokenizer instance is itself the SAX handler that receives the events.
  Ox.sax_parse(tokenizer, StringIO.new(value), HTML_OPTIONS)
  tokenizer.tokens
end
Instance Method Details
#attr(name, value) ⇒ Object
23 24 25 26 27 28 |
# File 'lib/google_translate_diff/tokenizer.rb', line 23
#
# SAX attribute callback. When the innermost open element is a +span+ and
# the attribute is class="notranslate", the most recent entry in @sequence
# is re-tagged as :notranslate. Any other attribute leaves state untouched.
#
# @param name [Symbol] attribute name (compared against :class)
# @param value [String] attribute value (compared against "notranslate")
# @return [Symbol, nil] :notranslate when the entry was re-tagged, else nil
def attr(name, value)
  inside_span = @context.last == :span
  opted_out = name == :class && value == "notranslate"
  @sequence[-1] = :notranslate if inside_span && opted_out
end
#end_element(name) ⇒ Object
17 18 19 20 21 |
# File 'lib/google_translate_diff/tokenizer.rb', line 17
#
# SAX callback for a closing tag. Pops the element off the context stack,
# records the event as :notranslate or :markup (decided by +nontranslate?+),
# and stores the offset @pos - 1 unless the parser position already equals
# the byte size of the source.
#
# @param name [Object] element name handed over by the parser
# @return [Array, nil] @indicies when an offset was stored, otherwise nil
def end_element(name)
  @context.pop
  kind = nontranslate?(name) ? :notranslate : :markup
  @sequence << kind
  # No offset is stored when the position is exactly the end of the source.
  return if @pos == @source.bytesize

  @indicies << @pos - 1
end
#start_element(name) ⇒ Object
11 12 13 14 15 |
# File 'lib/google_translate_diff/tokenizer.rb', line 11
#
# SAX callback for an opening tag. Pushes the element onto the context
# stack, records a :markup event, and stores the offset @pos - 1.
#
# @param name [Object] element name handed over by the parser
# @return [Array] @indicies after the new offset has been appended
def start_element(name)
  @context.push(name)
  @sequence.push(:markup)
  @indicies.push(@pos - 1)
end
#text(_) ⇒ Object
30 31 32 33 |
# File 'lib/google_translate_diff/tokenizer.rb', line 30
#
# SAX callback for character data. The text itself is ignored; the event is
# recorded as :markup when the innermost open element is listed in SKIP and
# as :text otherwise, together with the offset @pos - 1.
#
# @param _ [Object] the text value supplied by the parser (unused)
# @return [Array] @indicies after the new offset has been appended
def text(_)
  kind =
    if SKIP.include?(@context.last)
      :markup
    else
      :text
    end
  @sequence << kind
  @indicies << @pos - 1
end
#tokens ⇒ Object
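Merges adjacent raw tokens of the same type, passing each completed text run through `sentences` (the source comment here is only the lint directive `rubocop:disable Metrics/AbcSize`).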
36 37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/google_translate_diff/tokenizer.rb', line 36
#
# Folds +raw_tokens+ into a merged token list. Each token is a pair whose
# first element is its content and whose second element is its type:
#
# * a token with the same type as the previous one is appended to that
#   previous token's content in place;
# * when the type changes and the previous token is :text, that token is
#   replaced by whatever +sentences+ returns for its content.
#
# NOTE(review): a :text run at the very end is never followed by a type
# change, so it is not passed through +sentences+ — confirm this is intended.
#
# @return [Array] the merged tokens
def tokens
  merged = []
  raw_tokens.each do |token|
    if merged.empty?
      merged << token
      next
    end

    previous = merged.last
    if previous[1] == token[1]
      previous[0].concat(token[0])
      next
    end

    if previous[1] == :text
      # Swap the finished text run for its sentence-level tokens.
      merged.pop
      merged.concat(sentences(previous[0]))
    end
    merged << token
  end
  merged
end