Module: Levenshtein

Defined in:
lib/levenshtein.rb,
ext/levenshtein/levenshtein_c.c

Overview

The Levenshtein distance is a metric for measuring the amount of difference between two sequences (i.e., the so called edit distance). The Levenshtein distance between two strings is given by the minimum number of operations needed to transform one string into the other, where an operation is an insertion, deletion, or substitution of a single character.

More information about the Levenshtein distance algorithm: en.wikipedia.org/wiki/Levenshtein_distance .

Class Method Summary collapse

Class Method Details

.distance(s1, s2, threshold = nil) ⇒ Object

Returns the Levenshtein distance between two byte strings.



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/levenshtein.rb', line 44

def self.distance(s1, s2, threshold=nil)
  s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.

  # Handle some basic circumstances.

  return 0.0		if s1 == s2
  return s2.length	if s1.empty?
  return nil		if threshold and (s2.length-s1.length) >= threshold
  return nil		if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
  return nil		if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold

  # Do the expensive calculation on a subset of the strings only, if possible.

  b	= 0
  e1	= s1.length-1
  e2	= s2.length-1

  while s1[b, 1] == s2[b, 1]
    b += 1
  end

  while s1[e1, 1] == s2[e2, 1] and e1 > b and e2 > b
    e1 -= 1
    e2 -= 1
  end

  distance_part2(s1[b..e1], s2[b..e2], threshold)
end

.distance_part2(s1, s2, threshold) ⇒ Object

:nodoc:



73
74
75
76
77
78
79
# File 'lib/levenshtein.rb', line 73

def self.distance_part2(s1, s2, threshold)	# :nodoc:
  if respond_to?(:distance_part2_fast)
    distance_part2_fast(s1, s2, threshold)	# Implemented in C.
  else
    distance_part2_slow(s1, s2, threshold)	# Implemented in Ruby.
  end
end

.distance_part2_fast(rb_s1, rb_s2, rb_threshold) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'ext/levenshtein/levenshtein_c.c', line 3

static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VALUE rb_threshold) {
  VALUE	rb_s3;
  int	threshold;
  int	l1, l2, l3;
  char	*s1, *s2, *s3;
  int	*prev_row, *curr_row;
  int	col, row;
  int	curr_row_min, result;

  /* Convert Ruby's s1 to C's s1. */

  rb_s1	= StringValue(rb_s1);
  s1	= RSTRING(rb_s1)->ptr;
  l1	= RSTRING(rb_s1)->len;

  /* Convert Ruby's s2 to C's s2. */

  rb_s2	= StringValue(rb_s2);
  s2	= RSTRING(rb_s2)->ptr;
  l2	= RSTRING(rb_s2)->len;

  /* Convert Ruby's threshold to C's threshold. */

  if (!NIL_P(rb_threshold)) {
    threshold	= FIX2INT(rb_threshold);
  } else {
    threshold	= -1;
  }

  /* The Levenshtein Algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */
 
  /* Allocate memory for both rows */

  prev_row	= ALLOC_N(int, l1+1);
  curr_row	= ALLOC_N(int, l1+1);

  if ((prev_row == NULL) || (curr_row == NULL)) {
    rb_raise(rb_eNoMemError, "out of memory");
  }

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col]	= col;
  }

  for (row=1; row<=l2; row++) {
    /* Copy the current row to the previous row. */

    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));

    /* Calculate the values of the current row. */

    curr_row[0]		= row;
    curr_row_min	= row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or Substitution (cost=1). */

      curr_row[col]	= prev_row[col-1] + ((s1[col-1] == s2[row-1]) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min	= curr_row[col];
      }
    }

    /* Return nil as soon as we exceed the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      free(prev_row);
      free(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result	= curr_row[l1];

  free(prev_row);
  free(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}

.distance_part2_slow(s1, s2, threshold) ⇒ Object

:nodoc:



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/levenshtein.rb', line 81

def self.distance_part2_slow(s1, s2, threshold)	# :nodoc:
  row	= (0..s1.length).to_a

  1.upto(s2.length) do |y|
    prow	= row
    row	= [y]

    1.upto(s1.length) do |x|
      row[x]	= [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
    end

    # Stop analysing this string as soon as the best possible result for this string is bigger than the best result so far.
    # (The minimum value in the next row will be equal to or greater than the minimum value in this row.)

    return nil	if threshold and row.min >= threshold
  end

  row[-1]
end

.normalized_distance(s1, s2, threshold = nil) ⇒ Object

Returns the Levenshtein distance as a number between 0.0 and 1.0. It’s basically the Levenshtein distance divided by the length of the longest string.



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/levenshtein.rb', line 24

def self.normalized_distance(s1, s2, threshold=nil)
  s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.

  if s2.empty?
    0.0	# Since s1.length < s2.length, s1 must be empty as well.
  else
    if threshold
      if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
        d.to_f/s2.length
      else
        nil
      end
    else
      self.distance(s1, s2).to_f/s2.length
    end
  end
end