Class: GoogleTranslateDiff::Tokenizer

Inherits:
Ox::Sax
  • Object
show all
Defined in:
lib/google_translate_diff/tokenizer.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ Tokenizer

Returns a new instance of Tokenizer.



2
3
4
5
6
7
8
9
# File 'lib/google_translate_diff/tokenizer.rb', line 2

# Builds a tokenizer around the given markup string with empty parse state.
#
# @param source [String] the raw text that will later be tokenized
def initialize(source)
  # @pos starts as nil; presumably the Ox SAX parser fills it in while
  # parsing (verify against the Ox::Sax documentation).
  @pos, @source, @tokens = nil, source, nil
  # Three independent arrays: the element stack, the token kinds, and the
  # recorded positions (the "indicies" spelling is kept because other
  # methods read the variable under that name).
  @context, @sequence, @indicies = Array.new(3) { [] }
end

Class Method Details

.tokenize(value) ⇒ Object



133
134
135
136
137
138
139
# File 'lib/google_translate_diff/tokenizer.rb', line 133

# Parses +value+ with Ox's SAX parser, using a fresh tokenizer instance as
# the handler, and returns the tokens that instance produced.
#
# @param value [String, nil] markup to tokenize
# @return [Array] an empty array when +value+ is nil, otherwise the result
#   of the handler's #tokens
def tokenize(value)
  return [] if value.nil?

  handler = new(value)
  Ox.sax_parse(handler, StringIO.new(value), HTML_OPTIONS)
  handler.tokens
end

Instance Method Details

#attr(name, value) ⇒ Object



27
28
29
30
31
32
33
# File 'lib/google_translate_diff/tokenizer.rb', line 27

# Attribute callback: relabels the most recent sequence entry as
# :notranslate when a span carries class="notranslate".
#
# @param name [Symbol] attribute name (compared against :class)
# @param value [String] attribute value (compared against "notranslate")
# @return [Symbol, nil] :notranslate when the entry was relabelled,
#   otherwise nil
def attr(name, value)
  inside_span = @context.last == :span
  opts_out = name == :class && value == "notranslate"
  return unless inside_span && opts_out

  # Leave the sequence untouched when notranslate? already reports true.
  @sequence[-1] = :notranslate unless notranslate?
end

#end_element(name) ⇒ Object



23
24
25
# File 'lib/google_translate_diff/tokenizer.rb', line 23

# Closing-element callback (presumably invoked by the Ox SAX parser, since
# the class inherits from Ox::Sax). Forwards the element name to end_markup.
#
# @param name the element name as supplied by the parser
# @return whatever end_markup returns
def end_element(name)
  end_markup(name)
end

#end_instruct(target) ⇒ Object



15
16
17
# File 'lib/google_translate_diff/tokenizer.rb', line 15

# End-of-processing-instruction callback (presumably invoked by the Ox SAX
# parser). Handled the same way as a closing element: the target is passed
# to end_markup.
#
# @param target the instruction target as supplied by the parser
# @return whatever end_markup returns
def end_instruct(target)
  end_markup(target)
end

#instruct(target) ⇒ Object



11
12
13
# File 'lib/google_translate_diff/tokenizer.rb', line 11

# Processing-instruction callback (presumably invoked by the Ox SAX parser).
# Handled the same way as an opening element: the target is passed to
# start_markup.
#
# @param target the instruction target as supplied by the parser
# @return whatever start_markup returns
def instruct(target)
  start_markup(target)
end

#start_element(name) ⇒ Object



19
20
21
# File 'lib/google_translate_diff/tokenizer.rb', line 19

# Opening-element callback (presumably invoked by the Ox SAX parser, since
# the class inherits from Ox::Sax). Forwards the element name to
# start_markup.
#
# @param name the element name as supplied by the parser
# @return whatever start_markup returns
def start_element(name)
  start_markup(name)
end

#text(value) ⇒ Object



35
36
37
38
39
# File 'lib/google_translate_diff/tokenizer.rb', line 35

# Text callback: records one sequence entry and one position for each
# non-empty text node.
#
# The entry is :markup when the innermost open element is listed in SKIP,
# and :text otherwise. The stored position is @pos - 1.
#
# @param value [String] the text content; an empty string is ignored
# @return [Array, nil] the positions array, or nil for empty text
def text(value)
  return if value == ""

  kind = SKIP.include?(@context.last) ? :markup : :text
  @sequence.push(kind)
  @indicies.push(@pos - 1)
end

#tokens ⇒ Object



41
42
43
44
# File 'lib/google_translate_diff/tokenizer.rb', line 41

# Returns the token list, building it on first use and caching it in
# @tokens.
#
# The list comes from token_sequences_joined and is handed to
# make_sentences_from_last_token before it is cached; that helper's return
# value is discarded.
#
# @return the cached result of token_sequences_joined
def tokens
  return @tokens if @tokens

  joined = token_sequences_joined
  make_sentences_from_last_token(joined)
  @tokens = joined
end