Module: NlpPure::Segmenting::DefaultWord

Defined in:
lib/nlp_pure/segmenting/default_word.rb

Constant Summary collapse

# Default settings for word segmentation.
# NOTE: .freeze is shallow; it freezes this Hash only, and the nested :gsub
# array literal is not itself frozen here.
DEFAULT_OPTIONS =
{
  # split pattern; matches either
  #   - 3+ periods as pseudo-ellipsis (with one optional whitespace character
  #     on each side), OR
  #   - a run of whitespace, hyphen, en dash, em dash, or the ellipsis
  #     character (…)
  split: /\s?\.{3,}\s?|[\s\-–—…]+/,
  # array of arrays; [0] should be regexp, [1] should be replacement
  # NOTE: minor performance risk in letting this array grow long
  gsub:  [
    # ellipses at the start of a string are problematic; ref #12
    # NOTE: in Ruby regexps `^` matches at the start of every line, so a
    # leading ellipsis on any line of a multi-line input is replaced too
    [/^\s?(…|\.{3,})/, ' ']
  ],
  # single space
  # NOTE(review): not read by the methods visible alongside this constant;
  # presumably consumed elsewhere (TODO confirm)
  segment_boundary: ' '
}.freeze

Class Method Summary collapse

Class Method Details

.clean_input(text = nil) ⇒ Object



27
28
29
30
31
32
33
34
35
# File 'lib/nlp_pure/segmenting/default_word.rb', line 27

# Prepares raw input for splitting by applying every configured replacement
# and then trimming surrounding whitespace.
#
# The argument is never modified: String#to_s returns the receiver itself, so
# an in-place gsub! would alter the caller's string and would raise
# FrozenError for frozen input (including nil.to_s on Ruby >= 2.7).
#
# @param text [#to_s, nil] the raw input; converted with #to_s
# @return [String] a new String with replacements applied and
#   leading/trailing whitespace removed
def clean_input(text = nil)
  cleaned = text.to_s
  # perform replacements to work around the limitations of the splitting regexp;
  # each pair is [pattern, replacement], and gsub (non-bang) returns a new
  # String on every pass
  options.fetch(:gsub, []).each do |pattern, replacement|
    cleaned = cleaned.gsub(pattern, replacement)
  end
  # NOTE: leading whitespace is problematic; ref #12
  cleaned.strip
end

.options ⇒ Object

NOTE: exposed as a method for easy mock/stub



38
39
40
# File 'lib/nlp_pure/segmenting/default_word.rb', line 38

# Returns the option set used by this module.
#
# NOTE: exposed as a method for easy mock/stub
#
# @return [Hash] the DEFAULT_OPTIONS constant itself (not a copy)
def options
  DEFAULT_OPTIONS
end

.parse(*args) ⇒ Object



22
23
24
25
# File 'lib/nlp_pure/segmenting/default_word.rb', line 22

# Breaks the first argument into segments using the configured :split pattern.
#
# @param args [Array] only the first element is read; any others are ignored
# @return [Array<String>, nil] the segments, or nil when called with no
#   arguments
def parse(*args)
  # a splat always produces an Array, so emptiness is the only "no input" case
  return if args.empty?

  cleaned = clean_input(args.first)
  # when :split is absent, nil is passed through to String#split
  delimiter = options.fetch(:split, nil)
  cleaned.split(delimiter)
end