Class: DeepLDiff::Tokenizer
- Inherits:
-
Ox::Sax
- Object
- Ox::Sax
- DeepLDiff::Tokenizer
- Defined in:
- lib/deepl_diff/tokenizer.rb
Class Method Summary collapse
Instance Method Summary collapse
- #attr(name, value) ⇒ Object
- #end_element(name) ⇒ Object
- #end_instruct(target) ⇒ Object
-
#initialize(source) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
- #instruct(target) ⇒ Object
- #start_element(name) ⇒ Object
- #text(value) ⇒ Object
- #tokens ⇒ Object
Constructor Details
#initialize(source) ⇒ Tokenizer
Returns a new instance of Tokenizer.
4 5 6 7 8 9 10 11 |
# File 'lib/deepl_diff/tokenizer.rb', line 4
#
# Builds a tokenizer for the given markup and resets all parsing state.
#
# @param source [Object] the value to tokenize; stored untouched in @source
#   (presumably an HTML/XML String -- confirm against callers)
def initialize(source)
  # NOTE(review): @pos looks like the position slot the Ox SAX parser fills in
  # while parsing; it starts out unset -- confirm against the Ox::Sax docs.
  @pos = nil
  @source = source

  # Memoized result of #tokens; nil until first requested.
  @tokens = nil

  # Parsing state accumulated by the handler callbacks.
  @context = []
  @sequence = []
  @indicies = []
end
Class Method Details
.tokenize(value) ⇒ Object
146 147 148 149 150 151 152 153 |
# File 'lib/deepl_diff/tokenizer.rb', line 146
#
# Convenience entry point: parses +value+ with a fresh tokenizer acting as the
# SAX handler and returns the tokens it produced.
#
# @param value [String, nil] markup to tokenize
# @return [Object] an empty Array when +value+ is nil, otherwise whatever
#   #tokens returns for the parsed input
def tokenize(value)
  return [] if value.nil?

  handler = new(value)
  Ox.sax_parse(handler, StringIO.new(value), HTML_OPTIONS)
  handler.tokens
end
Instance Method Details
#attr(name, value) ⇒ Object
29 30 31 32 33 34 35 |
# File 'lib/deepl_diff/tokenizer.rb', line 29
#
# Attribute callback. When the innermost context entry is :span and the
# attribute is class="notranslate", the most recent sequence entry is
# re-tagged as :notranslate -- unless notranslate? already reports true.
#
# @param name [Symbol] attribute name
# @param value [String] attribute value
# @return [Symbol, nil] :notranslate when the entry was re-tagged, else nil
def attr(name, value)
  span_open = @context.last == :span

  # Short-circuit keeps notranslate? from being called unless the attribute
  # actually matches.
  if span_open && name == :class && value == "notranslate" && !notranslate?
    @sequence[-1] = :notranslate
  end
end
#end_element(name) ⇒ Object
25 26 27 |
# File 'lib/deepl_diff/tokenizer.rb', line 25
#
# Element-close callback; hands the element name to end_markup.
#
# @param name [Object] name of the element being closed
# @return [Object] whatever end_markup returns
def end_element(name)
  end_markup(name)
end
#end_instruct(target) ⇒ Object
17 18 19 |
# File 'lib/deepl_diff/tokenizer.rb', line 17
#
# Instruction-close callback; treated the same way as a closing element by
# handing the target to end_markup.
#
# @param target [Object] target of the instruction being closed
# @return [Object] whatever end_markup returns
def end_instruct(target)
  end_markup(target)
end
#instruct(target) ⇒ Object
13 14 15 |
# File 'lib/deepl_diff/tokenizer.rb', line 13
#
# Instruction-open callback; treated the same way as an opening element by
# handing the target to start_markup.
#
# @param target [Object] target of the instruction being opened
# @return [Object] whatever start_markup returns
def instruct(target)
  start_markup(target)
end
#start_element(name) ⇒ Object
21 22 23 |
# File 'lib/deepl_diff/tokenizer.rb', line 21
#
# Element-open callback; hands the element name to start_markup.
#
# @param name [Object] name of the element being opened
# @return [Object] whatever start_markup returns
def start_element(name)
  start_markup(name)
end
#text(value) ⇒ Object
37 38 39 40 41 42 |
# File 'lib/deepl_diff/tokenizer.rb', line 37
#
# Text callback. Records one sequence entry per non-empty text node, tagged
# :markup when the innermost context entry is listed in SKIP and :text
# otherwise, together with the position @pos - 1.
#
# @param value [String] the text content reported by the parser
# @return [Array, nil] the @indicies array after appending, or nil when the
#   text is the empty string
def text(value)
  return if value == ""

  kind = SKIP.include?(@context.last) ? :markup : :text
  @sequence << kind
  # NOTE(review): @pos is presumably the 1-based position maintained by the
  # parser, hence the -1 -- confirm against the Ox::Sax docs.
  @indicies << @pos - 1
end
#tokens ⇒ Object
44 45 46 47 |
# File 'lib/deepl_diff/tokenizer.rb', line 44
#
# Returns the token list, building it on first use and memoizing it in
# @tokens. The list comes from token_sequences_joined and is then passed to
# make_sentences_from_last_token, whose return value is ignored.
#
# @return [Object] the memoized result of token_sequences_joined
def tokens
  @tokens ||= begin
    joined = token_sequences_joined
    make_sentences_from_last_token(joined)
    joined
  end
end