Module: PragmaticSegmenter::Languages::Common

Included in:
Amharic, Arabic, Armenian, Bulgarian, Burmese, Chinese, Deutsch, Dutch, English, French, Greek, Hindi, Italian, Japanese, Persian, Polish, Russian, Spanish, Urdu
Defined in:
lib/pragmatic_segmenter/languages/common.rb,
lib/pragmatic_segmenter/languages/common/numbers.rb,
lib/pragmatic_segmenter/languages/common/ellipsis.rb

Defined Under Namespace

Modules: Abbreviation, Abbreviations, AmPmRules, DoublePunctuationRules, EllipsisRules, ExclamationPointRules, Numbers, ReinsertEllipsisRules, SingleLetterAbbreviationRules, SubSymbolsRules
Classes: AbbreviationReplacer

Constant Summary

Punctuations =

This constant holds the punctuation marks treated as sentence terminators.

['。', '.', '.', '!', '!', '?', '?'].freeze
GeoLocationRule =
Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '')
SingleNewLineRule =
Rule.new(/\n/, 'ȹ')
QuestionMarkInQuotationRule =
Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
ExtraWhiteSpaceRule =
Rule.new(/\s{3,}/, ' ')
SubSingleQuoteRule =
Rule.new(/&⎋&/, "'")
SENTENCE_BOUNDARY_REGEX =
/\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
QUOTATION_AT_END_OF_SENTENCE_REGEX =
/[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
PARENS_BETWEEN_DOUBLE_QUOTES_REGEX =
/["”]\s\(.*\)\s["“]/
BETWEEN_DOUBLE_QUOTES_REGEX =
/"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX =
/(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
CONTINUOUS_PUNCTUATION_REGEX =
/(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
PossessiveAbbreviationRule =
Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '')
KommanditgesellschaftRule =
Rule.new(/(?<=Co)\.(?=\sKG)/, '')
MULTI_PERIOD_ABBREVIATION_REGEX =
/\b[a-z](?:\.[a-z])+[.]/i