Class: GoogleTranslateDiff::Tokenizer

Inherits:
Ox::Sax
  • Object
show all
Defined in:
lib/google_translate_diff/tokenizer.rb

Constant Summary collapse

SKIP =
%i[script style].freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ Tokenizer

Returns a new instance of Tokenizer.



2
3
4
5
6
7
8
# File 'lib/google_translate_diff/tokenizer.rb', line 2

# Builds a tokenizer around one source string and resets all parser
# bookkeeping.
#
# @param source [String] the markup to tokenize; kept so that the markup
#   between text nodes can later be cut out of it by byte offset
def initialize(source)
  @source = source
  # Accumulated [string, kind] pairs.
  @tokens = []
  # Set while inside an element listed in SKIP (see #start_element).
  @skip = false
  # Start of the not-yet-emitted region of the source; slicing code uses
  # @prev - 1 as a byte index, so 1 means "from the first byte".
  @prev = 1
  # NOTE(review): presumably the Ox SAX parser fills this in before each
  # callback because the ivar is defined here - confirm against Ox docs.
  @pos = nil
end

Instance Attribute Details

#pos ⇒ Object (readonly)

Returns the value of attribute pos.



10
11
12
# File 'lib/google_translate_diff/tokenizer.rb', line 10

# Reader for @pos.
#
# @pos is nil after construction. #token slices the source up to byte index
# @pos - 2 and #text adds a byte size to it, so it is used as a 1-based byte
# position into the source.
# NOTE(review): no visible code assigns @pos after #initialize; presumably
# the Ox SAX parser updates it before each callback - confirm.
#
# @return [Integer, nil]
def pos
  @pos
end

#prev ⇒ Object (readonly)

Returns the value of attribute prev.



10
11
12
# File 'lib/google_translate_diff/tokenizer.rb', line 10

# Reader for @prev.
#
# @prev starts at 1 and is advanced by #text to @pos + value.bytesize.
# #token and #cut_last_token slice the source starting at byte index
# @prev - 1, so it marks where the not-yet-emitted markup begins.
#
# @return [Integer]
def prev
  @prev
end

#texts ⇒ Object (readonly)

Returns the value of attribute texts.



10
11
12
# File 'lib/google_translate_diff/tokenizer.rb', line 10

# Reader for @texts.
#
# NOTE(review): no visible code assigns @texts (the constructor does not set
# it), so this appears to always return nil - confirm whether the reader is
# still needed.
#
# @return [Object, nil]
def texts
  @texts
end

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



10
11
12
# File 'lib/google_translate_diff/tokenizer.rb', line 10

# Reader for the accumulated token list.
#
# Each entry is a two-element array [string, kind]; the kinds pushed by the
# visible code are :markup (#text, #cut_last_token) and :text (#sentences).
#
# @return [Array<Array(String, Symbol)>]
def tokens
  @tokens
end

Class Method Details

.tokenize(value) ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
# File 'lib/google_translate_diff/tokenizer.rb', line 65

# Tokenizes a markup string into [string, kind] pairs.
#
# A fresh handler is created for the input, driven by Ox's SAX parser, and
# then asked to flush whatever trails the last text node.
#
# Ox is run with its current default options; a generic/tolerant/smart
# configuration was sketched here in the past but is not active.
#
# @param value [String, nil] markup to tokenize
# @return [Array] the handler's tokens, or an empty array for nil input
def tokenize(value)
  return [] if value.nil?

  handler = new(value)
  Ox.sax_parse(handler, StringIO.new(value))
  handler.cut_last_token
  handler.tokens
end

Instance Method Details

#cut_last_token ⇒ Object



53
54
55
56
# File 'lib/google_translate_diff/tokenizer.rb', line 53

# Emits whatever remains of the source after the last consumed text node as
# a final :markup token.
#
# The remainder runs from byte index @prev - 1 to the end of @source and is
# passed through #fix_utf. Nothing is appended when it is empty.
#
# @return [Array, nil] the token list when a token was appended, else nil
def cut_last_token
  tail = fix_utf(@source.byteslice((@prev - 1)..-1))
  return if tail.empty?

  @tokens << [tail, :markup]
end

#end_element(name) ⇒ Object



16
17
18
# File 'lib/google_translate_diff/tokenizer.rb', line 16

# SAX callback for a closing tag: leaving an element listed in SKIP turns
# text collection back on.
#
# @param name [Object] element name as delivered by the parser; it is
#   compared against the entries of SKIP with include?
# @return [false, nil] false when the flag was cleared, nil otherwise
def end_element(name)
  return unless SKIP.include?(name)

  @skip = false
end

#fix_utf(value) ⇒ Object



58
59
60
61
62
# File 'lib/google_translate_diff/tokenizer.rb', line 58

# Re-encodes a string as UTF-8, substituting a single space for anything
# that is invalid or has no UTF-8 equivalent.
#
# @param value [String] text to clean up
# @return [String] the UTF-8 encoded result
def fix_utf(value)
  substitution = { undef: :replace, invalid: :replace, replace: " " }
  value.encode("UTF-8", **substitution)
end

#sentences(value) ⇒ Object

Splits text into sentences.



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/google_translate_diff/tokenizer.rb', line 37

# Splits a text node into sentence tokens.
#
# The Punkt tokenizer is asked for sentence boundaries, which this method
# treats as [first_index, last_index] pairs into value. When there is exactly
# one boundary the whole value is returned untouched. Otherwise every
# sentence except the last is stretched to end right before the next one
# starts, so the characters between two sentences stay attached to the
# earlier sentence.
#
# @param value [String] text to split
# @return [Array<Array(String, Symbol)>] [sentence, :text] pairs
def sentences(value)
  spans = Punkt::SentenceTokenizer.new(value).sentences_from_text(value)
  return [[value, :text]] if spans.size == 1

  spans.each_with_index.map do |(start, finish), position|
    successor = spans[position + 1]
    # Extend up to (not including) the next sentence's first character.
    finish = successor.first - 1 unless successor.nil?

    [value[start..finish], :text]
  end
end

#start_element(name) ⇒ Object



12
13
14
# File 'lib/google_translate_diff/tokenizer.rb', line 12

# SAX callback for an opening tag: entering an element listed in SKIP turns
# text collection off until the matching #end_element.
#
# @param name [Object] element name as delivered by the parser; it is
#   compared against the entries of SKIP with include?
# @return [true, nil] true when the flag was set, nil otherwise
def start_element(name)
  return unless SKIP.include?(name)

  @skip = true
end

#text(value) ⇒ Object



20
21
22
23
24
25
26
27
28
29
# File 'lib/google_translate_diff/tokenizer.rb', line 20

# SAX callback for a text node.
#
# Text is ignored while @skip is set and when it is blank after cleaning.
# Otherwise the markup that precedes this node (if any) is appended as a
# :markup token, the text itself is appended as one or more sentence tokens,
# and @prev is moved past the text.
#
# NOTE(review): @prev is advanced by the byte size of the text as delivered
# to this callback; if the parser hands over text that differs in length
# from the raw source (e.g. decoded entities), the next markup slice would be
# offset - confirm.
#
# @param value [String] text content reported by the parser
# @return [Integer, nil] the new @prev, or nil when the text was ignored
def text(value)
  return if @skip

  cleaned = fix_utf(value)
  return if cleaned.strip.empty?

  preceding_markup = token
  @tokens << [fix_utf(preceding_markup), :markup] if preceding_markup
  @tokens.concat(sentences(cleaned))

  @prev = @pos + cleaned.bytesize
end

#token ⇒ Object



31
32
33
34
# File 'lib/google_translate_diff/tokenizer.rb', line 31

# Cuts out the markup that sits between the previously consumed text and the
# current parser position.
#
# @prev and @pos are used as 1-based byte positions: the slice covers byte
# indexes @prev - 1 through @pos - 2 of @source and is cleaned with #fix_utf.
#
# @return [String, nil] the markup slice, or nil when @prev equals @pos
def token
  return if @prev == @pos

  first_byte = @prev - 1
  last_byte = @pos - 2
  fix_utf(@source.byteslice(first_byte..last_byte))
end