Class: PragmaticTokenizer::Regex

Inherits:
Object
  • Object
show all
Defined in:
lib/pragmatic_tokenizer/regex.rb

Constant Summary collapse

CHUNK_LONG_INPUT_TEXT =

Text that needs to be tokenized is initially split into chunks of this length:

/\S.{1,10000}(?!\S)/m
RANGE_DINGBATS =

Ranges

/[\u2701-\u27BE]/
RANGE_VARIATION_SELECTORS =

Variation selectors: these alter the appearance of the previous character

/[\uFE00-\uFE0F]/
RANGE_FULLWIDTH =

e.g. ！＂＃＇？

/[\uFF01-\ufF1F]/
RANGE_ALPHANUMERIC_SUPPLEMENT =
/[\u{1F100}-\u{1F1FF}]/
RANGE_UNUSUAL_AND_EMOJI =
/[\u203C-\u3299\u{1F000}-\u{1F644}]/
COLON1 =

Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance

/(?:(:)([[:print:]]{2,}))/
COLON2 =
/(?::)/
COMMAS =
/(?:([,‚])+)/
ENCLOSED_PLUS =
/(?:([[:print:]]+)\+([[:print:]]+))/
EMAIL =
/(?:[[:print:]]+[@＠][[:print:]]+\.[[:print:]]+)/
DIGIT =
/(?:[[:digit:]]+)/
ASTERISK =
/(?:\*+)/
UNDERSCORE =
/(?:_+)/
HYPHEN_OR_UNDERSCORE =
/(?:[-_])/
LONG_WORD_SPLIT =
/(?:[-_\/—–])/
PERIOD_AND_PRIOR =
/(?:(.+\.))/
PERIOD_ONLY =
/(?:(\.))/
CONTRACTIONS =
/(?:[‘’‚‛‹›'´`])/
PUNCTUATION1 =

all punctuation categories except Pc (Connector) and Po (other)

/(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/
PUNCTUATION2 =
/(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
PUNCTUATION3 =
/(?:[!%\-–\u00AD]+)/
PUNCTUATION4 =
/(?:[.．。]+)/
DINGBATS =
/(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
NO_BREAK_SPACE =
/(?:\u00A0+)/
HTTP =
/(?:https?:\/\/)/
TIME_WITH_COLON =
/(?:\d:\d)/
DOMAIN_PREFIX =
/(?:https?:\/\/|www\.|[[:alpha:]]\.)/
DOMAIN_SUFFIX =
/(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
DOMAIN1 =
/(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
DOMAIN2 =
/(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
NOT_URL =
/(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
HASHTAG_OR_MENTION =
/(?:[@#＠＃][[:print:]]+)/
HASHTAG =
/(?:[#＃][[:print:]]+)/
MENTION =
/(?:[@＠][[:print:]]+)/
HASHTAG_WITH_HYPHEN =
/(?:^([#＃][[:digit:]]+)-)/
ONE_AS_EXCLAMATION =
/(?:\D1+)/
ONES_EXCLAMATIONS =
/(?:!+(1*+!*+)*+)/
MANY_PERIODS =
/(?:^\.{2,}$)/
COPYRIGHT_TRADEMARK =
/(?:[®©™]+)/
CONTROL_CHARACTER =

Matches any character with hexadecimal value 00 through 1F or 7F.

/(?:[[:cntrl:]]+)/
APOSTROPHE_AND_S =
/(?:['’`́]s)/
ALSO_DECIMALS =
/(?:[[:alpha:]]*+[[:digit:]]+)/
ACUTE_ACCENT_S =
/(?:\s\u0301(?=s))/
CAPTURE_UNUSUAL_AND_EMOJI =

Regular expressions used to capture items

/(#{RANGE_UNUSUAL_AND_EMOJI.source})/
QUESTION_MARK_NOT_URL =
/#{NOT_URL.source}(\?)/
SLASH_NOT_URL =

Should we change the specs and also capture “/”, just like we capture “:” and “?”?

/#{NOT_URL.source}\//
SHIFT_BOUNDARY_CHARACTERS =
/([;^&|…«»„“¿¡≠]+)/
MULTIPLE_DOTS =

we keep all dots

/(\.{2,})/
MULTIPLE_DASHES =

we only keep first dash

/(-){2,}/
BRACKET =
/([{}()\[\]])/
EXCLAMATION_BETWEEN_ALPHA =
/(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
PERCENT_BEFORE_DIGIT =
/(%)\d+/
COMMA_BEFORE_NON_DIGIT =
/(,)(?=\D)/
COMMA_AFTER_NON_DIGIT =
/(?<=\D)(,)/
COLON_IN_URL =
/(?<=[(https?|ftp)]):(?=\/\/)/
QUOTE_BEFORE_PRINT =
/(('')|["“])(?=[[:print:]])/
QUOTE =
/('')|["”]/
HYPHEN_AFTER_NON_WORD =
/(?<=\W)(-)/
HYPHEN_BEFORE_NON_WORD =
/(-)(?=\W)/
STARTS_WITH_COMMAS =
/^#{COMMAS.source}/
STARTS_WITH_HTTP =
/^#{HTTP.source}/
STARTS_WITH_DOMAIN =
/^#{DOMAIN_PREFIX.source}/
STARTS_WITH_COLON1 =
/^#{COLON1.source}/
STARTS_WITH_UNDERSCORE =
/^#{UNDERSCORE.source}/
STARTS_WITH_PUNCTUATION3 =
/^#{PUNCTUATION3.source}/
ENDS_WITH_DOMAIN =
/#{DOMAIN_SUFFIX.source}$/
ENDS_WITH_PUNCTUATION1 =
/#{PUNCTUATION1.source}$/
ENDS_WITH_PUNCTUATION2 =
/#{PUNCTUATION2.source}$/
ENDS_WITH_COLON2 =
/#{COLON2.source}$/
ENDS_WITH_UNDERSCORE =
/#{UNDERSCORE.source}$/
ENDS_WITH_ONES_EXCLAMATIONS =
/#{ONES_EXCLAMATIONS.source}$/
ENDS_WITH_EXCITED_ONE =
/#{ONE_AS_EXCLAMATION.source}$/
ENDS_WITH_APOSTROPHE_AND_S =
/#{APOSTROPHE_AND_S.source}$/
ENDS_WITH_ALPHA =
/[[:alpha:]]$/
ENDS_WITH_DIGIT =
/[[:digit:]]$/
ONLY_DECIMALS =
/(?:^[[:digit:]]+$)/
NO_DECIMALS =
/(?:^\D+$)/
ONLY_PUNCTUATION =
/^[[[:punct:]]^|+]+$/
ONLY_ROMAN_NUMERALS =
/^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
ONLY_EMAIL =
/^#{EMAIL}$/
ONLY_HASHTAG_MENTION =
/^#{HASHTAG_OR_MENTION}$/
ONLY_HASHTAG =
/^#{HASHTAG}$/
ONLY_MENTION =
/^#{MENTION}$/
ONLY_DOMAIN1 =
/^#{DOMAIN1}$/
ONLY_DOMAIN2 =
/^#{DOMAIN2}$/
ONLY_DOMAIN3 =
Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
DOMAIN_OR_EMAIL =
Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
UNDERSCORES_ASTERISK =
Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
NO_DECIMALS_NO_NUMERALS =
Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
COMMAS_OR_PUNCTUATION =
Regexp.union(
    STARTS_WITH_COMMAS,
    ENDS_WITH_PUNCTUATION1,
    ENDS_WITH_PUNCTUATION2
)
VARIOUS =

Can this constant name be clarified?

Regexp.union(
    SLASH_NOT_URL,
    QUESTION_MARK_NOT_URL,
    ENCLOSED_PLUS,
    STARTS_WITH_COLON1,
    DINGBATS,
    HASHTAG_WITH_HYPHEN,
    CAPTURE_UNUSUAL_AND_EMOJI
)
IRRELEVANT_CHARACTERS =
Regexp.union(
    STARTS_WITH_PUNCTUATION3,
    ENDS_WITH_COLON2,
    ENDS_WITH_ONES_EXCLAMATIONS,
    CONTROL_CHARACTER,
    COPYRIGHT_TRADEMARK,
    RANGE_ALPHANUMERIC_SUPPLEMENT
)
PRE_PROCESS =
Regexp.union(
    SHIFT_BOUNDARY_CHARACTERS,
    MULTIPLE_DOTS,
    BRACKET,
    MULTIPLE_DASHES,
    EXCLAMATION_BETWEEN_ALPHA,
    PERCENT_BEFORE_DIGIT,
    COMMA_BEFORE_NON_DIGIT,
    COMMA_AFTER_NON_DIGIT
)