Module: Runestone::Corpus

Defined in:
lib/runestone/corpus.rb

Class Method Summary collapse

Class Method Details

.add(*words) ⇒ Object



3
4
5
6
7
8
9
10
11
12
# File 'lib/runestone/corpus.rb', line 3

# Inserts the given words, lowercased, into the +runestone_corpus+ table.
#
# Each word is escaped through the connection's +quote+ before being placed
# in the VALUES list, and the statement ends with ON CONFLICT DO NOTHING so
# conflicting rows are skipped rather than raising.
#
# @param words [Array<String>] words to store; nothing is executed when empty
# @return [Object, nil] the result of the connection's +execute+, or nil when
#   no words were given
def self.add(*words)
  return if words.empty?

  db = Runestone::Model.connection
  quoted_words = words.map { |word| db.quote(word.downcase) }
  # Joining with "),(" turns the list into one single-column row per word.
  value_rows = quoted_words.join('),(')

  db.execute(<<-SQL)
    INSERT INTO runestone_corpus ( word )
    VALUES (#{value_rows})
    ON CONFLICT DO NOTHING
  SQL
end

.similar_words(*words) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/runestone/corpus.rb', line 14

# Builds a lookup table of corpus words that are close to the given words.
#
# Only words whose +typo_tolerance+ is greater than zero are looked up. Each
# such word is sent to the database together with its lowercased form and
# its tolerance; corpus rows are joined with the +%+ operator and kept when
# they differ from the lowercased token and their +levenshtein+ distance is
# within the tolerance.
# NOTE(review): +%+ and +levenshtein+ presumably come from the PostgreSQL
# pg_trgm and fuzzystrmatch extensions - confirm against the schema setup.
#
# @param words [Array<String>] words to find near matches for
# @return [Hash{String => Array<String>}] original word => matching corpus
#   words; empty when no word has a positive tolerance or nothing matches
def self.similar_words(*words)
  with_tolerance = words.map { |word| [word, typo_tolerance(word)] }
  candidates = with_tolerance.select { |_word, tolerance| tolerance > 0 }
  return {} if candidates.empty?

  db = Runestone::Model.connection
  # One VALUES row per word: original token, lowercased token, tolerance.
  token_rows = candidates.map do |word, tolerance|
    "#{db.quote(word)}, #{db.quote(word.downcase)}, #{tolerance}"
  end

  result = db.execute(<<-SQL)
    WITH  tokens (token, token_downcased, typo_tolerance) AS (VALUES (#{token_rows.join('), (')}))
    SELECT token, word, levenshtein(runestone_corpus.word, tokens.token_downcased)
    FROM tokens
    JOIN runestone_corpus ON runestone_corpus.word % tokens.token_downcased
    WHERE
      runestone_corpus.word != tokens.token_downcased
      AND levenshtein(runestone_corpus.word, tokens.token_downcased) <= tokens.typo_tolerance
  SQL

  matches = {}
  result.each_row do |token, corpus_word, _distance|
    # Strip ( ) : | ! & * from the corpus word before using it
    # (these look like tsquery operator characters - TODO confirm intent).
    cleaned = corpus_word.gsub(/\(|\)|:|\||!|\&|\*/, '')
    # After stripping, the word may be identical to the token itself.
    next if cleaned == token

    (matches[token] ||= []) << cleaned
  end
  matches
end

.typo_tolerance(word) ⇒ Object



41
42
43
# File 'lib/runestone/corpus.rb', line 41

# Returns the typo tolerance configured for a word of this length.
#
# Walks the entries of +Runestone.typo_tolerances+ in order and returns the
# key of the first entry whose value reports the word's length as a member.
# NOTE(review): presumably the keys are integer tolerances and the values
# are ranges of word lengths - confirm against the Runestone configuration.
#
# @param word [#length] the word to look up
# @return [Object] the matching entry's key, or 0 when nothing matches (or
#   the matching key is nil/false)
def self.typo_tolerance(word)
  Runestone.typo_tolerances.each do |tolerance, lengths|
    return(tolerance || 0) if lengths.member?(word.length)
  end
  0
end