Class: String

Inherits:
Object
  • Object
show all
Defined in:
lib/similarity_string.rb

Instance Method Summary collapse

Instance Method Details

#levenshtein_distance(target_string) ⇒ Object

Examples (the distance is the same in both directions):
"frente.co.jp".levenshtein_distance("[email protected]") # => 10
"[email protected]".levenshtein_distance("frente.co.jp") # => 10



20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/similarity_string.rb', line 20

# Computes the Levenshtein (edit) distance between the receiver and
# +target_string+: the smallest number of single-character insertions,
# deletions and substitutions that turns one into the other.
#
# @param target_string [String] value to compare with; only +#size+ and
#   +#[]+ are called on it
# @return [Integer] the edit distance (0 when both values are equal)
def levenshtein_distance(target_string)
  source_len = size
  target_len = target_string.size

  # grid[t][s] holds the distance between the first s characters of the
  # receiver and the first t characters of target_string.
  grid = Array.new(target_len + 1) { Array.new(source_len + 1, 0) }
  0.upto(source_len) { |s| grid[0][s] = s }
  0.upto(target_len) { |t| grid[t][0] = t }

  1.upto(source_len) do |s|
    1.upto(target_len) do |t|
      substitution = self[s - 1] == target_string[t - 1] ? 0 : 1
      candidates = [
        grid[t][s - 1] + 1,
        grid[t - 1][s] + 1,
        grid[t - 1][s - 1] + substitution
      ]
      grid[t][s] = candidates.min
    end
  end

  grid[target_len][source_len]
end

#match_rate(target_string) ⇒ Object



8
9
10
11
12
# File 'lib/similarity_string.rb', line 8

# Similarity of the receiver to +target_string+, taken from #ngram and
# coerced to a Float.
#
# @param target_string [String] value handed to #ngram for comparison
# @return [Integer, Float] Integer 0 when the receiver is empty; otherwise
#   the #ngram result converted with +to_f+, so a nil result becomes 0.0
def match_rate(target_string)
  return 0 if empty?

  ngram(target_string).to_f
end

#match_rate_percent(target_string) ⇒ Object



14
15
16
# File 'lib/similarity_string.rb', line 14

# Expresses #match_rate as a percentage.
#
# @param target_string [String] value handed to #match_rate
# @return [Integer, Float] the #match_rate result multiplied by 100
def match_rate_percent(target_string)
  rate = match_rate(target_string)
  rate * 100
end

#ngram(string, part_len = 3) ⇒ Object

文字数の少ない方を基準に動かすとしたら、こっちの方がいいかも。
"frente.co.jp".ngram("[email protected]") # => 1.0
"[email protected]".ngram("frente.co.jp") # => 0.5



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/similarity_string.rb', line 38

# N-gram similarity of the receiver against +string+.
#
# Whitespace is stripped from both values first. Every +part_len+-character
# slice of the receiver is then counted (non-overlapping matches, via
# String#scan) in +string+ and in the receiver itself, and the first total
# is divided by the second.
#
# The result is not symmetric: only the receiver's slices are used. It can
# exceed 1.0 when +string+ contains a slice more often than the receiver
# does (e.g. "abc".ngram("abcabc") is 2.0).
#
# @param string [String] value to compare the receiver against
# @param part_len [Integer] length of each slice (defaults to 3)
# @return [Float, nil] hits in +string+ divided by hits in the receiver, or
#   nil when either value is shorter than +part_len+ after stripping
def ngram(string, part_len = 3)
  other = string.gsub(/[\s\n ]+/u, "")
  other_len = other.length
  own = gsub(/[\s\n ]+/u, "")
  own_chars = own.chars
  own_len = own_chars.length
  return nil if part_len > own_len || part_len > other_len

  # One literal (escaped) pattern per slice of the receiver; built once and
  # reused for both comparisons.
  patterns = (0..(own_len - part_len)).map do |offset|
    part = own_chars[offset, part_len].join("")
    /#{Regexp.quote(part)}/
  end

  hits_in_other, hits_in_self = [other, own].map do |text|
    patterns.inject(0.0) { |total, pattern| total + text.scan(pattern).size }
  end

  # Similarity against the compared string ÷ similarity against the receiver
  # itself (the baseline).
  hits_in_other / hits_in_self
end