Class: String
- Inherits:
-
Object
- Object
- String
- Defined in:
- lib/similarity_string.rb
Instance Method Summary collapse
-
#levenshtein_distance(target_string) ⇒ Object
"frente.co.jp".levenshtein_distance("[email protected]") => 10; "[email protected]".levenshtein_distance("frente.co.jp") => 10.
- #match_rate(target_string) ⇒ Object
- #match_rate_percent(target_string) ⇒ Object
-
#ngram(string, part_len = 3) ⇒ Object
文字数の少ない方を基準に動かすとしたら、こっちの方がいいかも。 "frente.co.jp".ngram("[email protected]") => 1.0; "[email protected]".ngram("frente.co.jp") => 0.5.
Instance Method Details
#levenshtein_distance(target_string) ⇒ Object
"frente.co.jp".levenshtein_distance("[email protected]") => 10; "[email protected]".levenshtein_distance("frente.co.jp") => 10
20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/similarity_string.rb', line 20

# Computes the Levenshtein (edit) distance between the receiver and
# +target_string+: the minimum number of single-character insertions,
# deletions and substitutions that turn one string into the other.
#
# The distance is symmetric, so swapping receiver and argument gives the
# same result.
#
# @param target_string [String] the string to compare against
# @return [Integer] the edit distance (0 when both strings are identical)
def levenshtein_distance(target_string)
  # Split once up front: indexing a multibyte String by character position
  # inside the nested loop would rescan the string on every lookup.
  source_chars = chars
  target_chars = target_string.chars

  # Only the previous row of the distance table is ever read, so two rolling
  # rows are kept instead of the whole matrix. Row 0 is the cost of building
  # each prefix of the receiver from an empty string.
  previous_row = (0..source_chars.size).to_a

  target_chars.each_with_index do |target_char, row|
    # Column 0: cost of deleting every target character seen so far.
    current_row = [row + 1]

    source_chars.each_with_index do |source_char, col|
      cost = source_char == target_char ? 0 : 1
      current_row << [
        current_row[col] + 1,      # step from the cell to the left
        previous_row[col + 1] + 1, # step from the cell above
        previous_row[col] + cost   # diagonal: match or substitution
      ].min
    end

    previous_row = current_row
  end

  previous_row.last
end
#match_rate(target_string) ⇒ Object
8 9 10 11 12 |
# File 'lib/similarity_string.rb', line 8

# Similarity of +target_string+ to the receiver, as scored by #ngram with
# its default n-gram length.
#
# Always returns a Float: an empty receiver short-circuits to 0.0, and a nil
# result from #ngram is turned into 0.0 by +to_f+.
#
# @param target_string [String] the string to compare against
# @return [Float] the similarity score
def match_rate(target_string)
  return 0.0 if empty?

  ngram(target_string).to_f
end
#match_rate_percent(target_string) ⇒ Object
14 15 16 |
# File 'lib/similarity_string.rb', line 14

# Expresses the #match_rate of +target_string+ against the receiver as a
# percentage, i.e. the rate multiplied by 100.
#
# @param target_string [String] the string to compare against
# @return [Numeric] #match_rate scaled by 100
def match_rate_percent(target_string)
  rate = match_rate(target_string)
  rate * 100
end
#ngram(string, part_len = 3) ⇒ Object
文字数の少ない方を基準に動かすとしたら、こっちの方がいいかも。 "frente.co.jp".ngram("[email protected]") => 1.0; "[email protected]".ngram("frente.co.jp") => 0.5
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/similarity_string.rb', line 38

# Scores how well the receiver's character n-grams are represented in
# +string+.
#
# Whitespace is stripped from both strings first. Every window of +part_len+
# consecutive characters of the receiver is then counted (non-overlapping
# literal matches) in +string+ and in the receiver itself; the result is the
# first total divided by the second, so the receiver's own count acts as the
# baseline.
#
# The score is not symmetric, and it is not capped at 1.0: n-grams that
# repeat more often in +string+ than in the receiver push it above 1.0.
#
# @param string [String] the string to compare against
# @param part_len [Integer] n-gram length in characters (default 3)
# @return [Float, nil] the ratio, or nil when either whitespace-stripped
#   string is shorter than +part_len+
# @raise [ArgumentError] if +part_len+ is less than 1
def ngram(string, part_len = 3)
  raise ArgumentError, "part_len must be at least 1" if part_len < 1

  # [[:space:]] is Unicode-aware, so full-width spaces are removed as well.
  whitespace = /[[:space:]]+/
  target = string.gsub(whitespace, "")
  source = gsub(whitespace, "")
  return nil if part_len > source.length || part_len > target.length

  # Build the receiver's n-grams once; both counts below reuse them.
  parts = source.chars.each_cons(part_len).map(&:join)

  # A String pattern makes scan match literally, so no Regexp has to be
  # built and escaped for every n-gram.
  count_hits = lambda do |text|
    parts.inject(0) { |total, part| total + text.scan(part).size }
  end

  # Similarity against the compared string / self-similarity (baseline).
  count_hits.call(target).to_f / count_hits.call(source)
end