Class: PragmaticTokenizer::FullStopSeparator

Inherits:

Object

Object
PragmaticTokenizer::FullStopSeparator

show all

Defined in: lib/pragmatic_tokenizer/full_stop_separator.rb

Overview

This class separates true (sentence-ending) full stops into their own tokens while leaving periods that are part of an abbreviation attached.

Instance Attribute Summary collapse

#abbreviations ⇒ Object readonly

Returns the value of attribute abbreviations.
#downcase ⇒ Object readonly

Returns the value of attribute downcase.
#tokens ⇒ Object readonly

Returns the value of attribute tokens.

Instance Method Summary collapse

#initialize(tokens:, abbreviations:, downcase:) ⇒ FullStopSeparator constructor

A new instance of FullStopSeparator.
#separate ⇒ Object

Constructor Details

#initialize(tokens:, abbreviations:, downcase:) ⇒ `FullStopSeparator`

Returns a new instance of FullStopSeparator.

# File 'lib/pragmatic_tokenizer/full_stop_separator.rb', line 8

# Builds a separator around the collaborators that #separate works with.
#
# @param tokens [Object] the token sequence; #separate iterates it and
#   indexes into it
# @param abbreviations [Object] the collection of abbreviations; #separate
#   iterates it and queries it for membership
# @param downcase [Object] flag read by #separate: when truthy, words are
#   looked up among the abbreviations as-is, otherwise they are lower-cased
#   first
def initialize(tokens:, abbreviations:, downcase:)
  @downcase = downcase
  @abbreviations = abbreviations
  @tokens = tokens
end

Instance Attribute Details

#abbreviations ⇒ `Object` (readonly)

Returns the value of attribute abbreviations.



7
8
9

# File 'lib/pragmatic_tokenizer/full_stop_separator.rb', line 7

# Read-only accessor for the abbreviation collection handed to the
# constructor.
#
# @return [Object] the current value of the @abbreviations instance variable
def abbreviations
  @abbreviations
end

#downcase ⇒ `Object` (readonly)

Returns the value of attribute downcase.



7
8
9

# File 'lib/pragmatic_tokenizer/full_stop_separator.rb', line 7

# Read-only accessor for the downcase flag handed to the constructor.
#
# @return [Object] the current value of the @downcase instance variable
def downcase
  @downcase
end

#tokens ⇒ `Object` (readonly)

Returns the value of attribute tokens.



7
8
9

# File 'lib/pragmatic_tokenizer/full_stop_separator.rb', line 7

# Read-only accessor for the token sequence handed to the constructor.
#
# @return [Object] the current value of the @tokens instance variable
def tokens
  @tokens
end

Instance Method Details

#separate ⇒ `Object`

# File 'lib/pragmatic_tokenizer/full_stop_separator.rb', line 14

# Splits sentence-ending full stops into their own "." token while leaving
# periods that belong to abbreviations attached.
#
# For every token except the last, a trailing period is split off unless the
# remaining word is a listed abbreviation, a single ASCII letter ("a."), or
# ends in a dotted ASCII letter sequence ("u.s.a."). The last token is split
# whenever it ends in a word character followed by a period and is not a
# listed abbreviation.
#
# Abbreviations are looked up as-is when +downcase+ is truthy, and after
# Unicode.downcase otherwise.
#
# @return [Array] a new array of tokens; +tokens+ itself is not modified
def separate
  known = abbreviations.each_with_object({}) { |entry, seen| seen[entry] = true }
  known_abbreviation = lambda do |word|
    known.key?(downcase ? word : Unicode.downcase(word))
  end

  cleaned_tokens = []
  tokens.each_with_index do |token, index|
    # Only tokens that are followed by another token are handled here; the
    # final token gets its own, stricter treatment below.
    word = tokens[index + 1] && token[/\A(.+)\.\z/, 1]
    keep_period = word.nil? ||
                  known_abbreviation.call(word) ||
                  word =~ /\A[a-z]\z/i ||
                  word =~ /[a-z](?:\.[a-z])+\z/i
    if keep_period
      cleaned_tokens << token
    else
      cleaned_tokens.push(word, '.')
    end
  end

  # [[:word:]] instead of \w: \w only matches ASCII in Ruby, which would leave
  # the period glued to a final word ending in a non-ASCII letter ("café.").
  last_token = cleaned_tokens.last
  final_word = last_token && last_token[/\A(.*[[:word:]])\.\z/, 1]
  if final_word && !known_abbreviation.call(final_word)
    cleaned_tokens[-1] = final_word
    cleaned_tokens << '.'
  end
  cleaned_tokens
end