Class: PragmaticTokenizer::Languages::English::SingleQuotes

Inherits:
Object
  • Object
show all
Defined in:
lib/pragmatic_tokenizer/languages/english.rb

Constant Summary collapse

# An apostrophe that closes a token: preceded by one captured character
# (\w and \D together accept any character), not followed by a second
# apostrophe, and followed by a non-word character or end of line.
ALNUM_QUOTE =
/(\w|\D)'(?!')(?=\W|$)/
# An apostrophe that opens a word: preceded by a captured non-word character
# or start of line, and followed by a word character.
QUOTE_WORD =
/(\W|^)'(?=\w)/
# An ASCII apostrophe preceded by a captured non-word character or start of
# line, unless it is immediately followed by "twas" (case-insensitive).
QUOTE_NOT_TWAS1 =
/(\W|^)'(?!twas)/i
# Same as QUOTE_NOT_TWAS1, but for the left single quotation mark (U+2018).
QUOTE_NOT_TWAS2 =
/(\W|^)‘(?!twas)/i

Instance Method Summary collapse

Instance Method Details

#handle_single_quotes(text) ⇒ Object



106
107
108
109
110
111
112
113
114
# File 'lib/pragmatic_tokenizer/languages/english.rb', line 106

# Rewrites single-quote characters in +text+ as space-separated placeholder
# symbols taken from PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.
#
# The substitutions run in a fixed order:
# 1. ASCII apostrophes and left single quotation marks that follow a non-word
#    character (or start of line) and do not introduce "twas";
# 2. any remaining apostrophe that opens a word (after step 1 this is only
#    the "'twas" case);
# 3. apostrophes that close a token.
#
# @param text [String] mutable string; it is modified in place via gsub!
# @return [String] the same +text+ object that was passed in
def handle_single_quotes(text)
  punctuation = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
  apostrophe  = punctuation["'"]
  # The curly-quote regex must be paired with the curly-quote map entry; an
  # empty-string key here yields nil and breaks the replacement.
  left_quote  = punctuation["‘"]

  # Replacement strings are built by interpolation rather than by appending
  # to string literals, so this also works when string literals are frozen.
  # special treatment for "'twas"
  text.gsub!(QUOTE_NOT_TWAS1, "\\1 #{apostrophe} ")
  text.gsub!(QUOTE_NOT_TWAS2, "\\1 #{left_quote} ")

  # NOTE(review): this replacement omits \1, so the character captured before
  # the apostrophe is replaced by a single space. Kept exactly as it was;
  # confirm whether that is intended for non-space punctuation.
  text.gsub!(QUOTE_WORD,  " #{apostrophe}")
  text.gsub!(ALNUM_QUOTE, "\\1 #{apostrophe} ")
  text
end