Module: Splitta::WordTokenizer

Included in:
Frag
Defined in:
lib/splitta/word_tokenizer.rb

Constant Summary

TOKENIZE_REGEXPS =
[
  # uniform quotes
  /
    '' |
    `` |
    “  |
    ”
  /ux,                          '"',

  # Separate punctuation (except period) from words:
  /(^|\s)(')/,                  '\1\2 ',
  /(?<=[("`{\[:;&#*@])(.)/,     ' \1',    # left-hand punctuation
  /(.)(?=[?!)";}\]*:@'])/,      '\1 ',    # right-hand punctuation
  /(?<=[)}\]])(.)/,             ' \1',    # left-hand close paren
  /(.)(?=[({\[])/,              '\1 ',    # right-hand open paren
  /((^|\s)-)(?=[^-])/,          '\1 ',    # starting hyphen/minus

  # Treat double-hyphen as one token:
  /([^-])(--+)([^-])/,          '\1 \2 \3',

  # Only separate comma if space follows:
  /(\s|^)(,)(?=(\S))/u,         '\1\2 ',
  /(.)(,)(\s|$)/u,              '\1 \2\3',

  # Combine dots separated by whitespace to be a single token:
  /\.\s\.\s\./u,                '...',

  # Separate "No.6"
  /([A-Za-z]\.)(\d+)/,          '\1 \2',

  # Separate words from ellipses
  /([^.]|^)(\.{2,})(.?)/,       '\1 \2 \3',
  /(^|\s)(\.{2,})([^.\s])/u,    '\1\2 \3',
/([^.\s])(\.{2,})($|\s)/u,    '\1 \2\3',

  # fix %, $, &
  /(\d)%/,                      '\1 %',
  /\$(\.?\d)/,                  '$ \1',
  /(\w)& (\w)/,                 '\1&\2',
  /(\w\w+)&(\w\w+)/,            '\1 & \2',

  # fix (n 't) --> ( n't)
  /n 't( |$)/,                  ' n\'t\1',
  /N 'T( |$)/,                  ' N\'T\1',

  # treebank tokenizer special words
  /([Cc])annot/,                '\1an not',
  /\s+/,                        ' ',
]
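
The flat array is consumed pairwise: each regexp is followed by the replacement string applied with it. To see how one pair behaves in isolation (an illustrative one-liner, not part of the library), the trailing-comma rule alone does:

"a, b".gsub(/(.)(,)(\s|$)/u, '\1 \2\3')  # => "a , b"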

Instance Method Summary

Instance Method Details

#tokenize(text) ⇒ Object

Tokenize a string using the rules above

# File 'lib/splitta/word_tokenizer.rb', line 62

def tokenize(text)
  text = text.dup
  TOKENIZE_REGEXPS.each_slice(2) do |regexp, repl|
    text.gsub!(regexp, repl)
  end
  text
end
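
A minimal usage sketch, assuming the module is mixed into a plain class the same way Frag includes it; the Demo class below is hypothetical:

require 'splitta/word_tokenizer'  # path per "Defined in" above

class Demo
  include Splitta::WordTokenizer
end

Demo.new.tokenize(%q{She said, "I won't go."})
# => "She said , \" I wo n't go. \""
# Quote marks and the comma become separate tokens and the contraction is
# split treebank-style into "wo n't", while the sentence-final period stays
# attached, since the rules above deliberately except periods.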