Module: PragmaticSegmenter::Rules
- Included in:
- Cleaner, Languages::Common, PunctuationReplacer
- Defined in:
- lib/pragmatic_segmenter/rules.rb,
lib/pragmatic_segmenter/rules/html.rb,
lib/pragmatic_segmenter/rules/ellipsis.rb
Defined Under Namespace
Modules: DoublePunctuationRules, EllipsisRules, EscapeRegexReservedCharacters, ExclamationPointRules, HTMLRules, ReinsertEllipsisRules, SubEscapedRegexReservedCharacters, SubSymbolsRules
Constant Summary
- URL_EMAIL_KEYWORDS =
['@', 'http', '.com', 'net', 'www', '//']
- NO_SPACE_BETWEEN_SENTENCES_REGEX =
Rubular: rubular.com/r/6dt98uI76u
/(?<=[a-z])\.(?=[A-Z])/
- NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX =
Rubular: rubular.com/r/l6KN6rH5XE
/(?<=\d)\.(?=[A-Z])/
- NewLineInMiddleOfWordRule =
Rubular: rubular.com/r/V57WnM9Zut
Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
- NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX =
Rubular: rubular.com/r/3GiRiP2IbD
/(?<=\s)\n(?=([a-z]|\())/
- PDF_NewLineInMiddleOfSentenceRule =
Rubular: rubular.com/r/UZAVcwqck8
Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
- PDF_NewLineInMiddleOfSentenceNoSpacesRule =
Rubular: rubular.com/r/eaNwGavmdo
Rule.new(/\n(?=[a-z])/, ' ')
- InlineFormattingRule =
Rubular: rubular.com/r/bAJrhyLNeZ
Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')
- DoubleNewLineWithSpaceRule =
Rubular: rubular.com/r/dMxp5MixFS
Rule.new(/\n \n/, "\r")
- DoubleNewLineRule =
Rubular: rubular.com/r/H6HOJeA8bq
Rule.new(/\n\n/, "\r")
- NewLineFollowedByBulletRule =
Rubular: rubular.com/r/Gn18aAnLdZ
Rule.new(/\n(?=•)/, "\r")
- NewLineFollowedByPeriodRule =
Rubular: rubular.com/r/FseyMiiYFT
Rule.new(/\n(?=\.(\s|\n))/, '')
- TableOfContentsRule =
Rubular: rubular.com/r/8mc1ArOIGy
Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
- ConsecutivePeriodsRule =
Rubular: rubular.com/r/DwNSuZrNtk
Rule.new(/\.{5,}/, ' ')
- ConsecutiveForwardSlashRule =
Rubular: rubular.com/r/IQ4TPfsbd8
Rule.new(/\/{3}/, '')
- NoSpaceBetweenSentencesRule =
Rubular: rubular.com/r/6dt98uI76u
Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
- NoSpaceBetweenSentencesDigitRule =
Rubular: rubular.com/r/l6KN6rH5XE
Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
- EscapedCarriageReturnRule =
Rule.new(/\\r/, "\r")
- TypoEscapedCarriageReturnRule =
Rule.new(/\\\ r/, "\r")
- EscapedNewLineRule =
Rule.new(/\\n/, "\n")
- TypoEscapedNewLineRule =
Rule.new(/\\\ n/, "\n")
- ReplaceNewlineWithCarriageReturnRule =
Rule.new(/\n/, "\r")
- QuotationsFirstRule =
Rule.new(/''/, '"')
- QuotationsSecondRule =
Rule.new(/``/, '"')
- AbbreviationsWithMultiplePeriodsAndEmailRule =
Rubular: rubular.com/r/EUbZCNfgei
Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
- GeoLocationRule =
Rubular: rubular.com/r/G2opjedIm9
Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
- SingleNewLineRule =
Rule.new(/\n/, 'ȹ')
- SubSingleQuoteRule =
Rule.new(/&⎋&/, "'")
- ExtraWhiteSpaceRule =
Rule.new(/\s{3,}/, ' ')
- QuestionMarkInQuotationRule =
Rubular: rubular.com/r/aXPUGm6fQh
Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')