Class: PragmaticTokenizer::Languages::English::SingleQuotes

Inherits:
Object
  • Object
show all
Defined in:
lib/pragmatic_tokenizer/languages/english.rb

Constant Summary collapse

# Straight apostrophe (') in opening position: preceded by a non-word
# character or a line start (captured as group 1), with at least one word
# character later on the same line, and NOT immediately followed by
# "twas" / "Twas".
REGEXP_LEFT_QUOTES1 =
/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
# Same pattern as REGEXP_LEFT_QUOTES1, but for the curly left single
# quotation mark (‘) instead of the straight apostrophe.
REGEXP_LEFT_QUOTES2 =
/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
# Same as REGEXP_LEFT_QUOTES1 without the "twas" / "Twas" exclusion, so it
# also matches the apostrophe in 'twas and 'Twas.
REGEXP_LEFT_QUOTES3 =
/(\W|^)'(?=.*\w)/o
# Straight apostrophe in closing position: preceded by one character
# (captured as group 1), not followed by another apostrophe, and followed by
# a non-word character or a line end.
# NOTE(review): (\w|\D) accepts every character, since anything that is not
# a digit matches \D and digits match \w — confirm this was intended.
REGEXP_RIGHT_SIDE_QUOTES =
/(\w|\D)'(?!')(?=\W|$)/o

Instance Method Summary collapse

Instance Method Details

#handle_single_quotes(text) ⇒ Object



106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/pragmatic_tokenizer/languages/english.rb', line 106

# Replaces single-quote characters that act as quotation marks with their
# placeholder symbols from Common::PUNCTUATION_MAP.
#
# The substitutions run in this order:
# 1. Opening straight quotes (REGEXP_LEFT_QUOTES1) become a space-padded
#    placeholder; quotes directly followed by "twas"/"Twas" are skipped.
# 2. Opening straight quotes still left (REGEXP_LEFT_QUOTES3 — in practice
#    the 'twas / 'Twas cases skipped by step 1) get the placeholder with a
#    leading space only, so it stays attached to the following word.
# 3. Closing straight quotes (REGEXP_RIGHT_SIDE_QUOTES) become a
#    space-padded placeholder.
# 4. Opening curly quotes ‘ (REGEXP_LEFT_QUOTES2) become their own
#    space-padded placeholder.
#
# @param text [String] the text to process; it is modified in place, so it
#   must not be frozen
# @return [String] the same String object that was passed in
def handle_single_quotes(text)
  punctuation_map = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP

  straight = punctuation_map["'"]
  text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{straight} ")
  # NOTE(review): unlike the other substitutions this one does not re-insert
  # the captured prefix (\1), so a non-space character right before the quote
  # is replaced by a space — confirm that this is intended.
  # Interpolation (rather than `' ' << ...`) avoids mutating a string
  # literal, which raises FrozenError under `frozen_string_literal: true`.
  text.gsub!(REGEXP_LEFT_QUOTES3, " #{straight}")
  text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{straight} ")

  # The key must be the curly quote matched by REGEXP_LEFT_QUOTES2; an empty
  # key finds no placeholder and the quote would be replaced by whitespace.
  curly = punctuation_map["‘"]
  text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{curly} ")

  text
end