Class: ChatCorrect::Tokenize

Inherits:
Object
  • Object
show all
Defined in:
lib/chat_correct/tokenize.rb

Constant Summary collapse

# Known abbreviations, lowercase and written without their trailing period
# (entries such as 'e.g' and 'ph.d' keep only their interior periods).
# NOTE(review): presumably consulted when deciding whether a token's final
# period is a sentence-ending full stop — the code that reads this list is
# outside this excerpt; confirm there.
# Frozen so the shared list cannot be mutated by accident at runtime.
ABBREVIATIONS = %w[
  adj adm adv al ala alta apr arc ariz ark art assn asst attys aug ave
  bart bld bldg blvd brig bros
  cal calif capt cl cmdr co col colo comdr con conn corp cpl cres ct
  d.phil dak dec del dept det dist dr dr.phil dr.philos drs
  e.g ens esp esq etc exp expy ext
  feb fed fla ft fwy fy
  ga gen gov
  hon hosp hr hway hwy
  i.e ia id ida ill inc ind ing insp is
  jan jr jul jun
  kan kans ken ky
  la lt ltd
  maj man mar mass may md me messrs mex mfg mich min minn miss mlle mm mme
  mo mont mr mrs ms msgr mssrs mt mtn
  neb nebr nev no nos nov nr
  oct ok okla ont op ord ore
  p pa pd pde penn penna pfc ph ph.d pl plz pp prof pvt
  que
  rd ref rep reps res rev rt
  sask sen sens sep sept sfc sgt sr st supt surg
  tce tenn tex
  univ usafa u.s ut
  va v ver vs vt
  wash wis wisc wy wyo yuk
].freeze
# Single-character punctuation tokens. tokenize_no_punct subtracts this list
# from the token array, so any token exactly equal to an entry is dropped.
# NOTE(review): the '' entries (next to '.', '!' and '?', and after '¿') look
# like multi-byte punctuation characters that were lost in extraction or
# encoding — confirm against lib/chat_correct/tokenize.rb. As written, they
# make the subtraction remove empty-string tokens as well.
PUNCTUATION =
['', '', '.', '', '!', '?', '', '', '¡', '¿', '', '', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',' , ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-']

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text:) ⇒ Tokenize

Returns a new instance of Tokenize.



6
7
8
# File 'lib/chat_correct/tokenize.rb', line 6

# Stores the text that #tokenize will later operate on.
#
# @param text [String, nil] required keyword; kept as-is in @text.
#   nil is accepted here — #tokenize returns nil for it.
def initialize(text:)
  @text = text
end

Instance Attribute Details

#text ⇒ Object (readonly)

Returns the value of attribute text.



5
6
7
# File 'lib/chat_correct/tokenize.rb', line 5

# Reader for the text supplied to the constructor.
#
# @return [Object] the current value of @text (nil if nil was given)
def text
  @text
end

Instance Method Details

#tokenize ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/chat_correct/tokenize.rb', line 10

# Splits the stored text into an array of tokens.
#
# The text is passed through a series of normalisation helpers (quotes,
# punctuation, contractions, numbers containing commas or periods — all
# defined outside this excerpt), split on whitespace, run through the
# full-stop and ending-punctuation separators, and finally each token has
# embedded newlines / carriage returns removed and surrounding whitespace
# stripped.
#
# @return [Array<String>, nil] the tokens; nil when the text is nil;
#   a one-element array containing the text itself when the text consists
#   solely of word characters.
def tokenize
  return if text.nil?
  # A single run of word characters needs no further processing.
  return [text] if /\A\w+\z/ =~ text

  converted_text = convert_quotes(text)
  converted_text = shift_all_punct(converted_text)
  converted_text = convert_contractions(converted_text)
  converted_text = convert_numbers_with_commas(converted_text)
  converted_text = convert_numbers_with_periods(converted_text)

  words = converted_text.split(' ')
  # The mapped array is the method's return value; no intermediate local needed.
  separate_other_ending_punc(separate_full_stop(words)).map do |token|
    token.delete("\n\r").strip
  end
end

#tokenize_no_punct ⇒ Object



24
25
26
27
# File 'lib/chat_correct/tokenize.rb', line 24

# Tokenizes the text and drops every token that is exactly equal to an
# entry of PUNCTUATION.
#
# @return [Array<String>, nil] the remaining tokens, or nil when #tokenize
#   returns nil (which it does for nil text).
def tokenize_no_punct
  # Tokenize once: the result serves both the nil guard and the subtraction.
  tokens = tokenize
  return if tokens.nil?

  tokens - PUNCTUATION
end