Class: Gliner::TextProcessor

Inherits:
Object
  • Object
show all
Defined in:
lib/gliner/text_processor.rb

Instance Method Summary collapse

Constructor Details

#initialize(tokenizer) ⇒ TextProcessor

Returns a new instance of TextProcessor.



5
6
7
8
# File 'lib/gliner/text_processor.rb', line 5

# Builds a processor around the given tokenizer.
#
# @param tokenizer [Object] kept in +@tokenizer+; #encode_pretokenized calls
#   +encode+ on it with the +is_pretokenized:+ and +add_special_tokens:+ keywords
def initialize(tokenizer)
  # Word-level splitter consulted by #split_words.
  @word_pre_tokenizer = Tokenizers::PreTokenizers::BertPreTokenizer.new
  @tokenizer = tokenizer
end

Instance Method Details

#encode_pretokenized(tokens) ⇒ Object



36
37
38
39
40
# File 'lib/gliner/text_processor.rb', line 36

# Encodes an already-split token list with the wrapped tokenizer.
#
# The tokenizer is invoked with +is_pretokenized: true+ and
# +add_special_tokens: false+.
#
# @param tokens [Object] forwarded unchanged to +@tokenizer.encode+
#   (presumably an Array of word strings — confirm against callers)
# @return [Hash] +:ids+ and +:word_ids+, taken from the encoding's
#   +ids+ and +word_ids+ readers
def encode_pretokenized(tokens)
  encoding = @tokenizer.encode(tokens, is_pretokenized: true, add_special_tokens: false)
  ids = encoding.ids
  word_ids = encoding.word_ids

  { ids: ids, word_ids: word_ids }
end

#normalize_text(text) ⇒ Object



10
11
12
13
14
# File 'lib/gliner/text_processor.rb', line 10

# Coerces the input to a string that ends with sentence punctuation.
#
# @param text [#to_s] value to normalize; converted with +to_s+
# @return [String] +"."+ when the converted text is empty, the converted
#   text itself when it already ends with +.+, +!+ or +?+, otherwise the
#   text with a trailing +.+ appended
def normalize_text(text)
  normalized = text.to_s

  return '.' if normalized.empty?
  return normalized if normalized.end_with?('.', '!', '?')

  "#{normalized}."
end

#split_words(text) ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/gliner/text_processor.rb', line 16

# Splits text into lowercase word tokens together with their offsets.
#
# The text is converted with +to_s+ and handed to the word pre-tokenizer's
# +pre_tokenize_str+; each yielded entry is read as
# +[token, [start, end]]+. Tokens that are empty after +to_s.downcase+
# are dropped along with their offsets.
#
# @param text [#to_s] text to split
# @return [Array(Array<String>, Array, Array)] three parallel arrays:
#   lowercased tokens, start offsets, end offsets
def split_words(text)
  words = []
  word_starts = []
  word_ends = []

  pieces = @word_pre_tokenizer.pre_tokenize_str(text.to_s)
  pieces.each do |piece, span|
    lowered = piece.to_s.downcase
    next if lowered.empty?

    first, last = span
    words.push(lowered)
    word_starts.push(first)
    word_ends.push(last)
  end

  [words, word_starts, word_ends]
end