Class: Gliner::TextProcessor
- Inherits: Object
- Defined in: lib/gliner/text_processor.rb
Instance Method Summary
- #encode_pretokenized(tokens) ⇒ Object
- #initialize(tokenizer) ⇒ TextProcessor (constructor): A new instance of TextProcessor.
- #normalize_text(text) ⇒ Object
- #split_words(text) ⇒ Object
Constructor Details
#initialize(tokenizer) ⇒ TextProcessor
Returns a new instance of TextProcessor.
# File 'lib/gliner/text_processor.rb', line 5

def initialize(tokenizer)
  @tokenizer = tokenizer
  @word_pre_tokenizer = Tokenizers::PreTokenizers::BertPreTokenizer.new
end
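For illustration only, a minimal setup sketch follows. It assumes the tokenizers gem supplies the tokenizer and that "bert-base-uncased" (a hypothetical choice) matches the vocabulary of the model being used.

require "tokenizers"
require "gliner" # assumption: the library loads under this name

# Any tokenizer loadable by the tokenizers gem should work here;
# "bert-base-uncased" is only an illustrative choice.
tokenizer = Tokenizers.from_pretrained("bert-base-uncased")
processor = Gliner::TextProcessor.new(tokenizer)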
Instance Method Details
#encode_pretokenized(tokens) ⇒ Object
# File 'lib/gliner/text_processor.rb', line 36

def encode_pretokenized(tokens)
  enc = @tokenizer.encode(tokens, is_pretokenized: true, add_special_tokens: false)
  { ids: enc.ids, word_ids: enc.word_ids }
end
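A usage sketch, assuming the processor set up in the constructor example above and word-level tokens such as those returned by #split_words; the ids depend entirely on the tokenizer's vocabulary.

tokens, _starts, _ends = processor.split_words("GLiNER finds entities.")
enc = processor.encode_pretokenized(tokens)
enc[:ids]      # subword ids (vocabulary-dependent)
enc[:word_ids] # word index of each subword, e.g. [0, 0, 1, 2, 3] if the first word splits in two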
#normalize_text(text) ⇒ Object
# File 'lib/gliner/text_processor.rb', line 10

def normalize_text(text)
  str = text.to_s
  str = '.' if str.empty?
  str.end_with?('.', '!', '?') ? str : "#{str}."
end
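The behavior, sketched from the source above: existing terminal punctuation is kept, anything else gets a trailing period, and empty or nil input becomes a lone period.

processor.normalize_text("Hello world")   # => "Hello world."
processor.normalize_text("Already done!") # => "Already done!"
processor.normalize_text("")              # => "."
processor.normalize_text(nil)             # => "."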
#split_words(text) ⇒ Object
# File 'lib/gliner/text_processor.rb', line 16

def split_words(text)
  text = text.to_s
  tokens = []
  starts = []
  ends = []
  @word_pre_tokenizer.pre_tokenize_str(text).each do |(token, (start_pos, end_pos))|
    token = token.to_s.downcase
    next if token.empty?
    tokens << token
    starts << start_pos
    ends << end_pos
  end
  [tokens, starts, ends]
end
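A sketch of the return shape, assuming BertPreTokenizer's usual whitespace-and-punctuation splitting; tokens are lowercased and the offsets are character positions (end exclusive) in the original string. Exact values are shown for illustration.

tokens, starts, ends = processor.split_words("GLiNER finds entities.")
# tokens => ["gliner", "finds", "entities", "."]
# starts => [0, 7, 13, 21]
# ends   => [6, 12, 21, 22]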