Module: WordTokenizer

Defined in:
lib/word_tokenizer.rb

Constant Summary

# Ordered list of [pattern, replacement] pairs used by #tokenize.
# Each pair is applied to the whole string with String#gsub!, in array order,
# so every rule operates on the output of the rules before it. Reordering or
# editing a rule changes the result of all later rules.
@@tokenize_regexps =
[
    # Uniform quotes: rewrite '' and `` as a single double-quote character.
    [/''|``/, '"'],
    # Separate punctuation (except for periods) from words.
    # NOTE(review): the pattern consists solely of groups 1 and 2 and the
    # replacement is '\1\2', so this rule re-emits what it matched and leaves
    # the string unchanged. Possibly a trailing space was intended -- confirm.
    [/(^|\s)(')/, '\1\2'],
    # Insert a space after any of: ( " ` { [ : ; & # * @
    [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],
     # First alternative: insert a space after any character that is followed
     # by one of ? ! ) " ; } ] * : @ ' (i.e. a space in front of that mark).
     # NOTE(review): the replacement only uses \1, but the remaining three
     # alternatives capture into groups 2, 3 and 4. When one of those matches,
     # \1 is empty and the matched text is replaced by a single space rather
     # than being kept -- confirm whether that is intended.
     [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
     # Treat double-hyphen as a single token: a run of two or more hyphens
     # between non-hyphen characters gets a space on each side.
    [/([^-])(--+)([^-])/, '\1 \2 \3'],
    # A comma at line start or after whitespace that is directly followed by
    # a non-space character gets a space after it.
    [/(\s|^)(,)(?=(\S))/, '\1\2 '],
     # Only separate a comma if a space follows (or the line ends): the space
     # is inserted between the preceding character and the comma.
    [/(.)(,)(\s|$)/, '\1 \2\3'],
     # Combine dots separated by whitespace to be a single token.
    [/\.\s\.\s\./, '...'],
     # Separate "No.6": a letter plus period directly followed by digits.
    [/([a-zA-Z]\.)(\d+)/, '\1 \2'],
     # Separate words from ellipses: surround any run of two or more dots
     # with spaces.
    [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
    # Ellipsis at line start or after whitespace, followed by a non-dot,
    # non-space character: put a space after the dots.
    [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
    # NOTE(review): same pattern as the rule directly above, with the space
    # placed before the dots instead. Once the rule above has inserted a space
    # after the dots this presumably has nothing left to match; a different
    # pattern (e.g. for a trailing ellipsis) may have been intended -- verify.
    [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
     ##### Some additional fixes.
     # Fix %, $, &
     # "5%" -> "5 %"
    [/(\d)%/, '\1 %'],
    # "$5" / "$.50" -> "$ 5" / "$ .50"
    [/\$(\.?\d)/, '$ \1'],
    # Remove the space after an ampersand that sits between two word
    # characters ("x& y" -> "x&y").
    [/(\w)& (\w)/, '\1&\2'],
    # An ampersand joining two words of at least two characters each is
    # spaced out on both sides.
    [/(\w\w+)&(\w\w+)/, '\1 & \2'],
     # Fix (n 't) -> ( n't)
    [/n 't( |$)/, " n't\\1"],
    [/N 'T( |$)/, " N'T\\1"],
     # Treebank tokenizer special words: "cannot"/"Cannot" -> "can not"/"Can not"
    [/([Cc])annot/, '\1an not']
 ]

Instance Method Summary

Instance Method Details

#tokenize(s) ⇒ Object



47
48
49
50
# File 'lib/word_tokenizer.rb', line 47

# Applies every rule in @@tokenize_regexps to +s+, in order, modifying +s+
# in place.
#
# @param s [String] the text to tokenize. It is rewritten in place with
#   String#gsub!, so pass a mutable (unfrozen) string.
# @return [Array] the rule list itself (the value returned by +each+), NOT
#   the tokenized text -- read the result from +s+ after the call.
def tokenize(s)
    # Each rule is a [pattern, replacement] pair; gsub!'s own return value is
    # ignored because it is nil whenever a rule does not match.
    @@tokenize_regexps.each { |pattern, replacement| s.gsub!(pattern, replacement) }
end