Class: GoogleTranslateDiff::Tokenizer

Inherits:
Ox::Sax
  • Object
Defined in:
lib/google_translate_diff/tokenizer.rb

Class Method Summary

  .tokenize(value) ⇒ Object

Instance Method Summary

  #initialize(source) ⇒ Tokenizer constructor
  #attr(name, value) ⇒ Object
  #end_element(name) ⇒ Object
  #start_element(name) ⇒ Object
  #text(_) ⇒ Object
  #tokens ⇒ Object

Constructor Details

#initialize(source) ⇒ Tokenizer

Returns a new instance of Tokenizer.



# File 'lib/google_translate_diff/tokenizer.rb', line 2

def initialize(source)
  @pos = nil
  @source = source
  @tokens = []
  @context = []
  @sequence = []
  @indicies = []
end
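
The instance variables set up here hold the state that the SAX callbacks below accumulate: @context is a stack of the currently open element names, @sequence records a classification (:markup, :text or :notranslate) for each parse event, and @indicies collects byte offsets into @source marking event boundaries. @pos is special: when a handler defines a @pos instance variable, Ox::Sax keeps it updated with the current position in the input, which is what lets the callbacks record offsets. A minimal standalone sketch of that Ox convention (hypothetical handler, not part of this gem):

# Hypothetical handler demonstrating Ox's @pos tracking.
require "ox"
require "stringio"

class OffsetHandler < Ox::Sax
  def initialize
    @pos = nil # Ox updates this before each callback
  end

  def start_element(name)
    puts "<#{name}> seen around byte #{@pos}"
  end
end

Ox.sax_parse(OffsetHandler.new, StringIO.new("<p>hi</p>"))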

Class Method Details

.tokenize(value) ⇒ Object



# File 'lib/google_translate_diff/tokenizer.rb', line 94

def tokenize(value)
  return [] if value.nil?
  tokenizer = new(value).tap do |h|
    Ox.sax_parse(h, StringIO.new(value), HTML_OPTIONS)
  end
  tokenizer.tokens
end
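
A usage sketch; the exact token boundaries depend on HTML_OPTIONS and on how text runs are split into sentences, so the output below is only indicative:

require "google_translate_diff"

tokens = GoogleTranslateDiff::Tokenizer.tokenize("<p>Hello world</p>")
# => roughly [["<p>", :markup], ["Hello world", :text], ["</p>", :markup]]
# Each token is a [string, type] pair; type is :markup, :text or :notranslate.

GoogleTranslateDiff::Tokenizer.tokenize(nil)
# => []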

Instance Method Details

#attr(name, value) ⇒ Object



# File 'lib/google_translate_diff/tokenizer.rb', line 23

def attr(name, value)
  unless @context.last == :span && name == :class && value == "notranslate"
    return
  end
  @sequence[-1] = :notranslate
end
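
This callback fires for each attribute as an element is opened. It only acts when the open element is a span carrying class="notranslate": in that case the :markup entry that #start_element just pushed onto @sequence is retagged as :notranslate, presumably so the span is excluded from translation downstream. Markup of the following shape (a hypothetical input) is what triggers the branch:

<span class="notranslate">Acme Corp</span> shipped a new release.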

#end_element(name) ⇒ Object



# File 'lib/google_translate_diff/tokenizer.rb', line 17

def end_element(name)
  @context.pop
  @sequence << (nontranslate?(name) ? :notranslate : :markup)
  @indicies << @pos - 1 unless @pos == @source.bytesize
end

#start_element(name) ⇒ Object



# File 'lib/google_translate_diff/tokenizer.rb', line 11

def start_element(name)
  @context << name
  @sequence << :markup
  @indicies << @pos - 1
end

#text(_) ⇒ Object



# File 'lib/google_translate_diff/tokenizer.rb', line 30

def text(_)
  @sequence << (SKIP.include?(@context.last) ? :markup : :text)
  @indicies << @pos - 1
end
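
Taken together, these callbacks are driven by Ox.sax_parse in document order: each start tag pushes :markup, attribute handling may retag that entry as :notranslate, text nodes push :text unless the enclosing element is listed in SKIP (a constant defined elsewhere in the class, not shown in this listing), and each end tag pops the context stack and pushes :markup or, when nontranslate?(name) says so, :notranslate. A rough event trace for a small input (hypothetical, assuming neither p nor span is in SKIP, and intended only to show the order of callbacks and the state they touch):

# Input: '<p>Hi <span class="notranslate">ACME</span></p>'
#
# start_element(:p)            @context = [:p]          @sequence << :markup
# text("Hi ")                                           @sequence << :text
# start_element(:span)         @context = [:p, :span]   @sequence << :markup
# attr(:class, "notranslate")                           @sequence[-1] = :notranslate
# text("ACME")                                          @sequence << :text
# end_element(:span)           @context = [:p]          @sequence << :markup or :notranslate
# end_element(:p)              @context = []            @sequence << :markup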

#tokens ⇒ Object

rubocop:disable Metrics/AbcSize



# File 'lib/google_translate_diff/tokenizer.rb', line 36

def tokens
  raw_tokens.each_with_object([]) do |token, tokens|
    if tokens.empty?
      tokens << token
    elsif tokens.last[1] == token[1]
      tokens.last[0].concat(token[0])
    else
      tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text
      tokens << token
    end
  end
end
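
#tokens post-processes raw_tokens (a private helper not shown in this listing): adjacent raw tokens of the same type are merged by concatenating their strings, and whenever a merged :text run is followed by a token of a different type, the run is popped and re-split via sentences (also not shown) before the next token is appended. Illustratively, with hypothetical raw values:

# raw:   [["<p>", :markup], ["Hello. ", :text], ["How are you?", :text], ["</p>", :markup]]
# step:  the two :text entries are merged into "Hello. How are you?"
# step:  when "</p>" arrives, the merged run is popped and split by sentences(...)
#        before ["</p>", :markup] is appended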