Module: Spellchecker::Tokenizer

Defined in:
lib/spellchecker/tokenizer.rb,
lib/spellchecker/tokenizer/list.rb,
lib/spellchecker/tokenizer/token.rb,
lib/spellchecker/tokenizer/null_token.rb

Defined Under Namespace

Classes: List, Token

Constant Summary collapse

# Matches a single blank character; .call tests each character against it
# to detect token separators.
BLANK_REGEXP =
/[[:blank:]]/.freeze
# Matches a single word character; .word_char? tests characters against it.
WORD_REGEXP =
/[[:word:]]/.freeze
# Newline character; .splitable? reports it as splittable in addition to
# whatever SPLITTABLES_REGEXP matches.
LINEBREAK =
"\n"
# Full stop; .call lets a dot open a token when it is followed by a word
# character.
DOT =
'.'
# Punctuation marks that are added to SPLITTABLES.
# NOTE(review): the name suggests marks that open a phrase -- confirm.
SIMPLE_PRE =
['¿', '¡'].freeze
# Punctuation marks that are added to SPLITTABLES.
# NOTE(review): the name suggests marks that close a phrase -- confirm.
SIMPLE_POST =
['!', '?', ',', ':', ';', '.'].freeze
PAIR_PRE =
['(', '{', '[', '<', '«', '', ''].freeze
PAIR_POST =
[')', '}', ']', '>', '»', '', ''].freeze
# Punctuation marks that are added to SPLITTABLES.
# NOTE(review): presumably marks that may appear on either side of a word
# (quotes, emphasis) -- confirm.
PRE_N_POST =
['"', "'", '`', '*'].freeze
# All punctuation lists concatenated into one array.
SPLITTABLES =
SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
# Character class matching any single character contained in SPLITTABLES;
# the joined characters are regexp-escaped before being placed in the class.
SPLITTABLES_REGEXP =
Regexp.new("[#{Regexp.escape(SPLITTABLES.join)}]")
# Position value passed to the NULL_TOKEN constructor.
NULL_POS =
-1
# Token built from an empty string and NULL_POS whose `next` and `prev`
# are both set to the token itself.
NULL_TOKEN =
Token.new('', NULL_POS).tap do |t|
  t.next = t
  t.prev = t
end

Class Method Summary collapse

Class Method Details

.call(str) ⇒ Spellchecker::Tokenizer::List

Splits the given string into a list of tokens.

(Source annotation: rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity)

Parameters:

  • str (String)

    string to be tokenized.

Returns:

  • (Spellchecker::Tokenizer::List)

30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/spellchecker/tokenizer.rb', line 30

# Splits +str+ into tokens and returns them as a Tokenizer::List.
#
# Rules applied to each character:
# * A blank character ends the current token and is dropped.
# * A splittable character (see splitable?) becomes a token of its own,
#   unless it sits between two word characters (as in "don't"), in which
#   case it stays inside the surrounding token.
# * A dot that opens a token and is followed by a word character (as in
#   ".rb") is kept as the first character of that token.
# * A line break always becomes a token of its own.
# * Any other character is appended to the current token.
#
# Every token is created with the index (within +str.chars+) of its first
# character.
#
# @param str [String] string to be tokenized.
# @return [Spellchecker::Tokenizer::List]
def call(str)
  chars = str.chars
  list = Tokenizer::List.new
  buffer = []
  start = 0

  chars.each_with_index do |char, i|
    if char.match?(BLANK_REGEXP)
      list << Token.new(buffer.join, start) unless buffer.empty?
      buffer.clear
    elsif splitable?(char)
      next_is_word = word_char?(chars[i + 1])
      # Index 0 has no predecessor. Without this guard chars[i - 1] would be
      # chars[-1], i.e. the LAST character of the string, and a leading quote
      # or bracket would be glued to the first word.
      prev_is_word = i.positive? && word_char?(chars[i - 1])

      if buffer.empty? && char == DOT && next_is_word
        start = i
        buffer << char
      elsif prev_is_word && next_is_word && char != LINEBREAK
        # Punctuation inside a word ("don't", "e.g") stays in the token.
        buffer << char
      else
        list << Token.new(buffer.join, start) unless buffer.empty?
        list << Token.new(char, i)
        buffer.clear
      end
    else
      start = i if buffer.empty?
      buffer << char
    end
  end

  # Flush whatever is still buffered once the input is exhausted.
  list << Token.new(buffer.join, start) unless buffer.empty?
  list
end

.splitable?(char) ⇒ Boolean

Parameters:

  • char (String)

Returns:

  • (Boolean)


73
74
75
# File 'lib/spellchecker/tokenizer.rb', line 73

# Tells whether +char+ has to be considered for splitting into its own token:
# either it is the line break character or it matches SPLITTABLES_REGEXP.
#
# @param char [String]
# @return [Boolean]
def splitable?(char)
  return true if char == LINEBREAK

  SPLITTABLES_REGEXP.match?(char)
end

.word_char?(char) ⇒ Boolean

Parameters:

  • char (String)

Returns:

  • (Boolean)


79
80
81
# File 'lib/spellchecker/tokenizer.rb', line 79

# Tells whether +char+ is a word character according to WORD_REGEXP.
#
# +nil+ is accepted (callers pass the neighbour of the first/last character,
# which may not exist) and yields +false+ rather than +nil+, so the method
# always returns a real Boolean as documented.
#
# @param char [String, nil]
# @return [Boolean]
def word_char?(char)
  !char.nil? && char.match?(WORD_REGEXP)
end