Class: GoogleTranslateDiff::Tokenizer
- Inherits:
-
Ox::Sax
- Object
- Ox::Sax
- GoogleTranslateDiff::Tokenizer
- Defined in:
- lib/google_translate_diff/tokenizer.rb
Constant Summary collapse
- SKIP =
%i(script style).freeze
Instance Attribute Summary collapse
-
#pos ⇒ Object
readonly
Returns the value of attribute pos.
-
#prev ⇒ Object
readonly
Returns the value of attribute prev.
-
#texts ⇒ Object
readonly
Returns the value of attribute texts.
-
#tokens ⇒ Object
readonly
Returns the value of attribute tokens.
Class Method Summary collapse
Instance Method Summary collapse
- #cut_last_token ⇒ Object
- #end_element(name) ⇒ Object
- #fix_utf(value) ⇒ Object
-
#initialize(source) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
-
#sentences(value) ⇒ Object
Splits text by sentences.
- #start_element(name) ⇒ Object
- #text(value) ⇒ Object
- #token ⇒ Object
Constructor Details
#initialize(source) ⇒ Tokenizer
Returns a new instance of Tokenizer.
2 3 4 5 6 7 8 |
# File 'lib/google_translate_diff/tokenizer.rb', line 2

# Sets up an empty tokenizer over the given markup.
#
# @param source [String] markup whose bytes are later sliced by #token and
#   #cut_last_token
def initialize(source)
  @source = source
  @tokens = []  # collected [string, kind] pairs
  @skip = false # switched by #start_element / #end_element
  @prev = 1     # #token and #cut_last_token slice starting at byte @prev - 1
  # @pos is not assigned anywhere else in this class.
  # NOTE(review): presumably maintained by the Ox SAX parser — confirm.
  @pos = nil
end
Instance Attribute Details
#pos ⇒ Object (readonly)
Returns the value of attribute pos.
10 11 12 |
# File 'lib/google_translate_diff/tokenizer.rb', line 10

# @return [Object] the current value of @pos (nil right after construction)
def pos
  @pos
end
#prev ⇒ Object (readonly)
Returns the value of attribute prev.
10 11 12 |
# File 'lib/google_translate_diff/tokenizer.rb', line 10

# @return [Object] the current value of @prev (1 right after construction)
def prev
  @prev
end
#texts ⇒ Object (readonly)
Returns the value of attribute texts.
10 11 12 |
# File 'lib/google_translate_diff/tokenizer.rb', line 10

# NOTE(review): nothing in the visible code assigns @texts, so this looks
# like it always returns nil — confirm whether the reader is still needed.
#
# @return [Object] the current value of @texts
def texts
  @texts
end
#tokens ⇒ Object (readonly)
Returns the value of attribute tokens.
10 11 12 |
# File 'lib/google_translate_diff/tokenizer.rb', line 10

# @return [Array<Array>] the [string, kind] pairs collected so far
def tokens
  @tokens
end
Class Method Details
.tokenize(value) ⇒ Object
65 66 67 68 69 70 71 72 73 |
# File 'lib/google_translate_diff/tokenizer.rb', line 65

# Tokenizes markup into [string, kind] pairs by running a fresh tokenizer
# instance as the SAX handler over the input.
#
# @param value [String, nil] markup to tokenize
# @return [Array<Array>] the handler's collected tokens; [] when value is nil
def tokenize(value)
  return [] if value.nil?

  tokenizer = new(value).tap do |h|
    # NOTE(review): this reassigns Ox's module-level defaults on every call —
    # confirm that affecting other Ox users in the process is intended.
    Ox.default_options = { mode: :generic, effort: :tolerant, smart: true }
    Ox.sax_parse(h, StringIO.new(value))
    # Flush whatever follows the last text node as a final markup token.
    h.cut_last_token
  end
  tokenizer.tokens
end
Instance Method Details
#cut_last_token ⇒ Object
53 54 55 56 |
# File 'lib/google_translate_diff/tokenizer.rb', line 53

# Appends everything from byte offset @prev - 1 to the end of the source as
# a final :markup token.
#
# Nothing is appended when that remainder is empty, or when @prev points past
# the end of the source (String#byteslice returns nil in that case).
#
# @return [Array, nil] @tokens when a token was appended, otherwise nil
def cut_last_token
  remainder = @source.byteslice((@prev - 1)..-1)
  return if remainder.nil?

  last_token = fix_utf(remainder)
  @tokens << [last_token, :markup] unless last_token.empty?
end
#end_element(name) ⇒ Object
16 17 18 |
# File 'lib/google_translate_diff/tokenizer.rb', line 16

# Handles a closing tag: leaving one of the SKIP elements turns skipping off.
#
# @param name [Object] element name, checked against the entries of SKIP
# @return [false, nil] false when skipping was switched off, otherwise nil
def end_element(name)
  return unless SKIP.include?(name)

  @skip = false
end
#fix_utf(value) ⇒ Object
58 59 60 61 62 |
# File 'lib/google_translate_diff/tokenizer.rb', line 58

# Re-encodes a string as UTF-8, using a space as the replacement for anything
# that is invalid or has no UTF-8 equivalent.
#
# @param value [String] text to sanitize
# @return [String] the UTF-8 result
def fix_utf(value)
  replacement_options = { undef: :replace, invalid: :replace, replace: " " }
  value.encode("UTF-8", **replacement_options)
end
#sentences(value) ⇒ Object
Splits text by sentences
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/google_translate_diff/tokenizer.rb', line 37

# Splits text into sentence tokens.
#
# The Punkt result is consumed as [left, right] index pairs. Each sentence is
# stretched to end just before the next one starts, so the characters between
# two sentences stay attached to the earlier sentence.
#
# @param value [String] text to split
# @return [Array<Array>] [sentence, :text] pairs in source order
def sentences(value)
  boundaries = Punkt::SentenceTokenizer.new(value).sentences_from_text(value)

  # Zero or one boundary means there is nothing to split. Returning the whole
  # value also keeps the text from being dropped if no boundary came back.
  return [[value, :text]] if boundaries.size <= 1

  boundaries.each_with_index.map do |(left, right), index|
    following = boundaries[index + 1]
    right = following[0] - 1 if following
    [value[left..right], :text]
  end
end
#start_element(name) ⇒ Object
12 13 14 |
# File 'lib/google_translate_diff/tokenizer.rb', line 12

# Handles an opening tag: entering one of the SKIP elements turns skipping on,
# which makes #text ignore text nodes until #end_element turns it off again.
#
# @param name [Object] element name, checked against the entries of SKIP
# @return [true, nil] true when skipping was switched on, otherwise nil
def start_element(name)
  return unless SKIP.include?(name)

  @skip = true
end
#text(value) ⇒ Object
20 21 22 23 24 25 26 27 28 29 |
# File 'lib/google_translate_diff/tokenizer.rb', line 20

# Handles a text node. Unless skipping is on or the text is blank, it records
# the markup between the previous text node and this one, then the node's
# sentences, and finally moves @prev to just past this node.
#
# @param value [String] text content of the node
# @return [Integer, nil] the new @prev, or nil when the node was ignored
def text(value)
  return if @skip

  # Measure the node before sanitizing: fix_utf may change the byte length,
  # while @prev has to keep tracking byte offsets in the untouched @source.
  raw_bytes = value.bytesize
  value = fix_utf(value)
  return if value.strip.empty?

  # #token already returns sanitized text, so it is stored as-is.
  markup = token
  @tokens << [markup, :markup] if markup
  @tokens.concat(sentences(value))
  @prev = @pos + raw_bytes
end
#token ⇒ Object
31 32 33 34 |
# File 'lib/google_translate_diff/tokenizer.rb', line 31

# Returns the stretch of source lying between the previous text node and the
# current position, sanitized with #fix_utf.
#
# @return [String, nil] bytes @prev - 1 through @pos - 2 of the source, or nil
#   when @prev equals @pos (nothing in between)
def token
  unless @prev == @pos
    gap = (@prev - 1)..(@pos - 2)
    fix_utf(@source.byteslice(gap))
  end
end