Class: GoogleTranslateDiff::Tokenizer
- Inherits: Ox::Sax
- Inheritance chain: Object → Ox::Sax → GoogleTranslateDiff::Tokenizer
- Defined in:
- lib/google_translate_diff/tokenizer.rb
Class Method Summary
- .tokenize(value) ⇒ Object
Instance Method Summary
- #attr(name, value) ⇒ Object
- #end_element(name) ⇒ Object
- #end_instruct(target) ⇒ Object
- #initialize(source) ⇒ Tokenizer (constructor): returns a new instance of Tokenizer.
- #instruct(target) ⇒ Object
- #start_element(name) ⇒ Object
- #text(value) ⇒ Object
- #tokens ⇒ Object
Constructor Details
#initialize(source) ⇒ Tokenizer
Returns a new instance of Tokenizer.
2 3 4 5 6 7 8 9 |
# File 'lib/google_translate_diff/tokenizer.rb', line 2
#
# Sets up the state used by the SAX callbacks of this handler.
#
# @param source [Object] the value being tokenized, stored in @source
#   (.tokenize passes the same string it hands to the parser)
def initialize(source)
  # NOTE(review): @pos is only read here (see #text); presumably the Ox SAX
  # parser updates it because the variable is defined — confirm against Ox docs.
  @pos = nil
  @source = source
  # Memo slot filled lazily by #tokens.
  @tokens = nil
  # Three independent, initially empty arrays.
  @context, @sequence, @indicies = Array.new(3) { [] }
end
Class Method Details
.tokenize(value) ⇒ Object
133 134 135 136 137 138 139 |
# File 'lib/google_translate_diff/tokenizer.rb', line 133
#
# Builds a tokenizer for +value+, runs the Ox SAX parser over it with the
# tokenizer as handler, and returns the resulting tokens.
#
# @param value [String, nil] text to tokenize
# @return [Object] an empty array when +value+ is nil, otherwise whatever
#   #tokens returns for the populated handler
def tokenize(value)
  return [] if value.nil?

  handler = new(value)
  # HTML_OPTIONS is defined elsewhere in this file.
  Ox.sax_parse(handler, StringIO.new(value), HTML_OPTIONS)
  handler.tokens
end
Instance Method Details
#attr(name, value) ⇒ Object
27 28 29 30 31 32 33 |
# File 'lib/google_translate_diff/tokenizer.rb', line 27
#
# SAX attribute callback. Re-labels the latest @sequence entry as
# :notranslate when the element on top of @context is :span and the
# attribute is exactly class="notranslate". Does nothing when notranslate?
# is already truthy.
#
# @param name [Symbol] attribute name
# @param value [String] attribute value
# @return [Symbol, nil] :notranslate when the entry was re-labelled, else nil
def attr(name, value)
  notranslate_span = @context.last == :span && name == :class && value == "notranslate"
  return unless notranslate_span

  @sequence[-1] = :notranslate unless notranslate?
end
#end_element(name) ⇒ Object
23 24 25 |
# File 'lib/google_translate_diff/tokenizer.rb', line 23
#
# SAX callback for a closing element; hands the element name to end_markup
# (defined elsewhere in this file).
#
# @param name [Symbol] element name
# @return [Object] whatever end_markup returns
def end_element(name)
  end_markup(name)
end
#end_instruct(target) ⇒ Object
15 16 17 |
# File 'lib/google_translate_diff/tokenizer.rb', line 15
#
# SAX callback for the end of a processing instruction; hands the target to
# end_markup (defined elsewhere in this file).
#
# @param target [Object] instruction target
# @return [Object] whatever end_markup returns
def end_instruct(target)
  end_markup(target)
end
#instruct(target) ⇒ Object
11 12 13 |
# File 'lib/google_translate_diff/tokenizer.rb', line 11
#
# SAX callback for the start of a processing instruction; hands the target
# to start_markup (defined elsewhere in this file).
#
# @param target [Object] instruction target
# @return [Object] whatever start_markup returns
def instruct(target)
  start_markup(target)
end
#start_element(name) ⇒ Object
19 20 21 |
# File 'lib/google_translate_diff/tokenizer.rb', line 19
#
# SAX callback for an opening element; hands the element name to
# start_markup (defined elsewhere in this file).
#
# @param name [Symbol] element name
# @return [Object] whatever start_markup returns
def start_element(name)
  start_markup(name)
end
#text(value) ⇒ Object
35 36 37 38 39 |
# File 'lib/google_translate_diff/tokenizer.rb', line 35
#
# SAX text callback. Ignores empty strings; otherwise records one entry in
# @sequence and the matching position in @indicies.
#
# @param value [String] text content reported by the parser
# @return [Array, nil] @indicies after the append, or nil for empty text
def text(value)
  return if value == ""

  # Text inside an element listed in SKIP (defined elsewhere in this file)
  # is labelled as markup rather than text.
  label = SKIP.include?(@context.last) ? :markup : :text
  @sequence.push(label)
  # NOTE(review): @pos is presumably kept current by the Ox SAX parser and
  # the -1 converts it to a zero-based offset — confirm against Ox docs.
  @indicies.push(@pos - 1)
end
#tokens ⇒ Object
41 42 43 44 |
# File 'lib/google_translate_diff/tokenizer.rb', line 41
#
# Returns the token list, building it on first use and memoizing it in
# @tokens. The list comes from token_sequences_joined and is passed through
# make_sentences_from_last_token before being cached (both helpers are
# defined elsewhere in this file).
#
# @return [Object] the memoized result of token_sequences_joined
def tokens
  return @tokens if @tokens

  joined = token_sequences_joined
  # Called for its effect on +joined+; its return value is discarded.
  make_sentences_from_last_token(joined)
  @tokens = joined
end