Class: DeepLDiff::Tokenizer

Inherits:
Ox::Sax
  • Object
show all
Defined in:
lib/deepl_diff/tokenizer.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ Tokenizer

Returns a new instance of Tokenizer.



4
5
6
7
8
9
10
11
# File 'lib/deepl_diff/tokenizer.rb', line 4

# Builds a tokenizer around the given markup and resets all parse state.
#
# @param source the raw input this tokenizer is created for; stored as-is
def initialize(source)
  # The input itself.
  @source = source

  # Scalar state: nothing has been parsed or memoized yet.
  # NOTE(review): @pos is read by #text but never assigned by the code
  # visible here — presumably Ox fills it in while parsing; confirm.
  @pos = nil
  @tokens = nil

  # Accumulators, each starting out as its own empty array.
  @context = []
  @sequence = []
  @indicies = []
end

Class Method Details

.tokenize(value) ⇒ Object



146
147
148
149
150
151
152
153
# File 'lib/deepl_diff/tokenizer.rb', line 146

# Convenience entry point: parses +value+ and returns the resulting tokens.
#
# A nil input short-circuits to an empty array. Otherwise a fresh tokenizer
# is created for +value+, handed to Ox.sax_parse as the SAX handler together
# with a StringIO over the same text and HTML_OPTIONS, and its #tokens
# result is returned.
#
# @param value the markup to tokenize, or nil
# @return [Array] an empty array for nil input, otherwise whatever #tokens yields
def tokenize(value)
  return [] if value.nil?

  handler = new(value)
  Ox.sax_parse(handler, StringIO.new(value), HTML_OPTIONS)
  handler.tokens
end

Instance Method Details

#attr(name, value) ⇒ Object



29
30
31
32
33
34
35
# File 'lib/deepl_diff/tokenizer.rb', line 29

# Attribute handler: flags the most recent sequence entry as :notranslate
# when the innermost open element is a span carrying class="notranslate".
#
# Nothing happens unless the last context entry is :span, the attribute is
# exactly :class with the value "notranslate", and notranslate? is false.
#
# @param name the attribute name (compared against :class)
# @param value the attribute value (compared against "notranslate")
# @return :notranslate when the entry was rewritten, nil otherwise
def attr(name, value)
  return unless @context.last == :span && name == :class && value == "notranslate"

  # Overwrite the last recorded entry, unless notranslate? already holds.
  @sequence[-1] = :notranslate unless notranslate?
end

#end_element(name) ⇒ Object



25
26
27
# File 'lib/deepl_diff/tokenizer.rb', line 25

# Element-close handler: forwards the element name to end_markup.
# NOTE(review): presumably invoked by Ox's SAX parser when an element ends
# (the class inherits from Ox::Sax) — confirm against the Ox documentation.
#
# @param name the element name, passed through unchanged
def end_element(name)
  end_markup(name)
end

#end_instruct(target) ⇒ Object



17
18
19
# File 'lib/deepl_diff/tokenizer.rb', line 17

# Instruction-close handler: forwards the target to end_markup, so it is
# closed through the same path as a regular element (see #end_element).
# NOTE(review): presumably invoked by Ox's SAX parser at the end of a
# processing instruction — confirm against the Ox documentation.
#
# @param target the instruction target, passed through unchanged
def end_instruct(target)
  end_markup(target)
end

#instruct(target) ⇒ Object



13
14
15
# File 'lib/deepl_diff/tokenizer.rb', line 13

# Instruction-open handler: forwards the target to start_markup, so it is
# opened through the same path as a regular element (see #start_element).
# NOTE(review): presumably invoked by Ox's SAX parser at the start of a
# processing instruction — confirm against the Ox documentation.
#
# @param target the instruction target, passed through unchanged
def instruct(target)
  start_markup(target)
end

#start_element(name) ⇒ Object



21
22
23
# File 'lib/deepl_diff/tokenizer.rb', line 21

# Element-open handler: forwards the element name to start_markup.
# NOTE(review): presumably invoked by Ox's SAX parser when an element
# starts (the class inherits from Ox::Sax) — confirm against the Ox docs.
#
# @param name the element name, passed through unchanged
def start_element(name)
  start_markup(name)
end

#text(value) ⇒ Object



37
38
39
40
41
42
# File 'lib/deepl_diff/tokenizer.rb', line 37

# Text handler: records one sequence entry and one index per non-empty
# text chunk.
#
# The entry is :markup when the innermost context entry is listed in SKIP,
# and :text otherwise. The recorded index is @pos - 1.
# NOTE(review): presumably this turns a 1-based parser position into a
# 0-based offset — confirm against the Ox documentation for @pos.
#
# @param value the text chunk; an empty string is ignored
# @return nil for an empty chunk, otherwise the @indicies array
def text(value)
  return if value == ""

  kind = SKIP.include?(@context.last) ? :markup : :text
  @sequence.push(kind)
  @indicies.push(@pos - 1)
end

#tokens ⇒ Object



44
45
46
47
# File 'lib/deepl_diff/tokenizer.rb', line 44

# Returns the token list, building it on first use and caching it in @tokens.
#
# On the first call the result of token_sequences_joined is passed to
# make_sentences_from_last_token (whose return value is ignored) and then
# memoized; later calls return the cached object.
#
# @return the memoized result of token_sequences_joined
def tokens
  return @tokens if @tokens

  joined = token_sequences_joined
  make_sentences_from_last_token(joined)
  @tokens = joined
end