Module: PragmaticSegmenter::Rules

Included in:
Cleaner, Languages::Common, PunctuationReplacer
Defined in:
lib/pragmatic_segmenter/rules.rb,
lib/pragmatic_segmenter/rules/html.rb,
lib/pragmatic_segmenter/rules/ellipsis.rb

Defined Under Namespace

Modules: DoublePunctuationRules, EllipsisRules, EscapeRegexReservedCharacters, ExclamationPointRules, HTMLRules, ReinsertEllipsisRules, SubEscapedRegexReservedCharacters, SubSymbolsRules

Constant Summary

URL_EMAIL_KEYWORDS =
['@', 'http', '.com', 'net', 'www', '//']
NO_SPACE_BETWEEN_SENTENCES_REGEX =
/(?<=[a-z])\.(?=[A-Z])/
NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX =
/(?<=\d)\.(?=[A-Z])/
NewLineInMiddleOfWordRule =
Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX =
/(?<=\s)\n(?=([a-z]|\())/
PDF_NewLineInMiddleOfSentenceRule =
Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
PDF_NewLineInMiddleOfSentenceNoSpacesRule =
Rule.new(/\n(?=[a-z])/, ' ')
InlineFormattingRule =
Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')
DoubleNewLineWithSpaceRule =
Rule.new(/\n \n/, "\r")
DoubleNewLineRule =
Rule.new(/\n\n/, "\r")
NewLineFollowedByBulletRule =
Rule.new(/\n(?=•)/, "\r")
NewLineFollowedByPeriodRule =
Rule.new(/\n(?=\.(\s|\n))/, '')
TableOfContentsRule =
Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
ConsecutivePeriodsRule =
Rule.new(/\.{5,}/, ' ')
ConsecutiveForwardSlashRule =
Rule.new(/\/{3}/, '')
NoSpaceBetweenSentencesRule =
Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
NoSpaceBetweenSentencesDigitRule =
Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
EscapedCarriageReturnRule =
Rule.new(/\\r/, "\r")
TypoEscapedCarriageReturnRule =
Rule.new(/\\\ r/, "\r")
EscapedNewLineRule =
Rule.new(/\\n/, "\n")
TypoEscapedNewLineRule =
Rule.new(/\\\ n/, "\n")
ReplaceNewlineWithCarriageReturnRule =
Rule.new(/\n/, "\r")
QuotationsFirstRule =
Rule.new(/''/, '"')
QuotationsSecondRule =
Rule.new(/``/, '"')
AbbreviationsWithMultiplePeriodsAndEmailRule =
Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
GeoLocationRule =
Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '')
SingleNewLineRule =
Rule.new(/\n/, 'ȹ')
SubSingleQuoteRule =
Rule.new(/&⎋&/, "'")
ExtraWhiteSpaceRule =
Rule.new(/\s{3,}/, ' ')
QuestionMarkInQuotationRule =
Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')