module Tokenizer
  # Simple whitespace-based tokenizer that splits punctuation off as separate tokens.
  class WhitespaceTokenizer
    # Field separator: one or more blank characters.
    FS = Regexp.new('[[:blank:]]+')
    # Characters split off only as prefixes.
    SIMPLE_PRE = ['¿', '¡'].freeze
    # Characters split off only as suffixes.
    SIMPLE_POST = ['!', '?', ',', ':', ';', '.'].freeze
    # Opening halves of paired delimiters.
    PAIR_PRE = ['(', '{', '[', '<', '«', '„'].freeze
    # Closing halves of paired delimiters („ … “ in German typography).
    PAIR_POST = [')', '}', ']', '>', '»', '“'].freeze
    # Characters that can act as both prefix and suffix.
    PRE_N_POST = ['"', "'"].freeze

    private_constant :FS

    # @param lang [Symbol] language identifier, stored for callers.
    # @param options [Hash] overrides merged over the default :pre, :post and
    #   :pre_n_post sets. Note that #tokenize below works off the class
    #   constants rather than these merged options.
    def initialize(lang = :de, options = {})
      @lang = lang
      @options = {
        pre: SIMPLE_PRE + PAIR_PRE,
        post: SIMPLE_POST + PAIR_POST,
        pre_n_post: PRE_N_POST
      }.merge(options)
    end
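
    # Illustrative usage, hand-traced from the implementation below rather
    # than taken from any documented contract; results assume the default
    # constant sets defined above:
    #
    #   t = Tokenizer::WhitespaceTokenizer.new
    #   t.tokenize('Hello, world!')    #=> ["Hello", ",", "world", "!"]
    #   t.tokenize('¿Qué tal? („ja“)') #=> ["¿", "Qué", "tal", "?", "(", "„", "ja", "“", ")"]
    #
    # Word-internal punctuation gets no special treatment: the partition in
    # #tokenize stops at the first run of stem characters, so "don't" comes
    # back as ["don", "'", "t"].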

    # Splits the input on blank characters, then peels leading and trailing
    # punctuation off each token as separate single-character tokens.
    #
    # @param str [String] string to be tokenized.
    # @return [Array<String>] the resulting tokens.
    def tokenize(str)
      tokens = sanitize_input(str).split(FS)
      return [''] if tokens.empty?

      splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
      pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")
      output = []
      tokens.each do |token|
        # Partition around the first run of non-splittable characters:
        # leading punctuation, the stem, and everything after it.
        prefix, stem, suffix = token.partition(pattern)
        output << prefix.chars unless prefix.empty?
        output << stem unless stem.empty?
        output << suffix.chars unless suffix.empty?
      end
      output.flatten
    end

    alias process tokenize

    private

    # Trims the trailing record separator and surrounding whitespace.
    def sanitize_input(str)
      str.chomp.strip
    end
  end

  # @deprecated Use {WhitespaceTokenizer} instead.
  class Tokenizer < WhitespaceTokenizer
    def initialize(*args)
      warn '[DEPRECATION] Tokenizer::Tokenizer is deprecated; use Tokenizer::WhitespaceTokenizer instead.'
      super(*args)
    end
  end
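
  # Migration sketch for callers of the deprecated class (an assumption that
  # only the constant name needs to change; behaviour is inherited unchanged):
  #
  #   Tokenizer::Tokenizer.new(:de).tokenize('Hi!')           # warns first
  #   Tokenizer::WhitespaceTokenizer.new(:de).tokenize('Hi!') # preferred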
end