Class: Treat::Workers::Processors::Tokenizers::OpenNlp

Inherits:
Object
  • Object
show all
Defined in:
lib/treat/workers/processors/tokenizers/open_nlp.rb

Overview

Maximum entropy tokenization supplied by OpenNLP.

Constant Summary

@@tokenizers =
{}

Class Method Summary

Class Method Details

.tokenize(entity, options = {}) ⇒ Object

Maximum entropy tokenization.



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/treat/workers/processors/tokenizers/open_nlp.rb', line 7

# Maximum entropy tokenization.
#
# Splits the string form of +entity+ into tokens with an OpenNLP
# tokenizer and appends each one to +entity+ as a
# Treat::Entities::Token built via +from_string+.
#
# entity  - object responding to +language+, +to_s+ and +<<+.
# options - accepted for interface compatibility; not read by this method.
#
# Returns the Array of raw tokens produced by the tokenizer (the
# receiver of the final +each+), not the Token entities.
def self.tokenize(entity, options = {})
  Treat::Loaders::OpenNLP.load

  language = entity.language
  text = entity.to_s

  # One tokenizer is built per language and kept in @@tokenizers;
  # OpenNLP.use is only invoked the first time a language is seen.
  @@tokenizers[language] ||= begin
    OpenNLP.use(language.intern)
    OpenNLP::TokenizerME.new
  end

  @@tokenizers[language].tokenize(text).to_a.each do |piece|
    entity << Treat::Entities::Token.from_string(piece)
  end
end