Module: Levenshtein

Defined in:
lib/levenshtein.rb,
lib/levenshtein/version.rb

Defined Under Namespace

Modules: Util

Constant Summary collapse

VERSION =
"0.2.2"

Class Method Summary collapse

Class Method Details

.distance(a1, a2, threshold = nil, options = {}) ⇒ Object

Returns the Levenshtein distance between two sequences.

The two sequences can be two strings, two arrays, or two other objects responding to :each. All sequences are by generic (fast) C code.

All objects in the sequences should respond to :hash and :eql?.



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/levenshtein.rb', line 40

def self.distance(a1, a2, threshold=nil, options={})
  a1, a2	= a1.scan(/./), a2.scan(/./)	if String === a1 and String === a2
  a1, a2	= Util.pool(a1, a2)

  # Handle some basic circumstances.

  return 0		if a1 == a2
  return a2.size	if a1.empty?
  return a1.size	if a2.empty?

  if threshold
    return nil	if (a1.size-a2.size) >= threshold
    return nil	if (a2.size-a1.size) >= threshold
    return nil	if (a1-a2).size >= threshold
    return nil	if (a2-a1).size >= threshold
  end

  # Remove the common prefix and the common postfix.

  l1	= a1.size
  l2	= a2.size

  offset			= 0
  no_more_optimizations	= true
 
  while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset])
    offset += 1

    no_more_optimizations	= false
  end
 
  while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1])
    l1 -= 1
    l2 -= 1

    no_more_optimizations	= false
  end
 
  if no_more_optimizations
    distance_fast_or_slow(a1, a2, threshold, options)
  else
    l1 -= offset
    l2 -= offset

    a1	= a1[offset, l1]
    a2	= a2[offset, l2]

    distance(a1, a2, threshold, options)
  end
end

.distance_fast(rb_o1, rb_o2, rb_threshold) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'ext/levenshtein/levenshtein_fast.c', line 4

VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  VALUE	*p1, *p2;
  long	l1, l2;
  long	col, row;
  int	threshold;
  int	*prev_row, *curr_row, *temp_row;
  int	curr_row_min, result;
  int	value1, value2;

  /* Be sure that all equivalent objects in rb_o1 and rb_o2 (a.eql?(b) == true) are taken from a pool (a.equal?(b) == true). */
  /* This is done in levenshtein.rb by means of Util.pool. */

  /* Get the sizes of both arrays. */

  l1	= RARRAY_LEN(rb_o1);
  l2	= RARRAY_LEN(rb_o2);

  /* Get the pointers of both arrays. */

  p1	= RARRAY_PTR(rb_o1);
  p2	= RARRAY_PTR(rb_o2);

  /* Convert Ruby's threshold to C's threshold. */

  if (!NIL_P(rb_threshold)) {
    threshold	= FIX2INT(rb_threshold);
  } else {
    threshold	= -1;
  }

  /* The Levenshtein algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate memory for both rows */

  prev_row	= (int*) ALLOC_N(int, (l1+1));
  curr_row	= (int*) ALLOC_N(int, (l1+1));

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col]	= col;
  }

  for (row=1; row<=l2; row++) {
    /* Copy the current row to the previous row. */

    temp_row	= prev_row;
    prev_row	= curr_row;
    curr_row	= temp_row;

    /* Calculate the values of the current row. */

    curr_row[0]		= row;
    curr_row_min	= row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      value1	= prev_row[col-1] + ((p1[col-1] == p2[row-1]) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      value2	= prev_row[col]+1;
      if (value2 < value1) {
        value1	= value2;
      }

      /* Deletion if it's cheaper than substitution. */

      value2	= curr_row[col-1]+1;
      if (value2 < value1) {
        value1	= value2;
      }

      /* Keep track of the minimum value on this row. */

      if (value1 < curr_row_min) {
        curr_row_min	= value1;
      }

      curr_row[col]	= value1;
    }

    /* Return nil as soon as we exceed the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      free(prev_row);
      free(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result	= curr_row[l1];

  free(prev_row);
  free(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}

.distance_fast_or_slow(a1, a2, threshold, options) ⇒ Object

:nodoc:



91
92
93
94
95
96
97
# File 'lib/levenshtein.rb', line 91

def self.distance_fast_or_slow(a1, a2, threshold, options)	# :nodoc:
  if respond_to?(:distance_fast) and options[:force_slow]
    distance_fast(a1, a2, threshold)	# Implemented in C.
  else
    distance_slow(a1, a2, threshold)	# Implemented in Ruby.
  end
end

.distance_slow(a1, a2, threshold) ⇒ Object

:nodoc:



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/levenshtein.rb', line 99

def self.distance_slow(a1, a2, threshold)	# :nodoc:
  crow	= (0..a1.size).to_a

  1.upto(a2.size) do |y|
    prow	= crow
    crow	= [y]

    1.upto(a1.size) do |x|
      crow[x]	= [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min
    end

    # Stop analysing this sequence as soon as the best possible
    # result for this sequence is bigger than the best result so far.
    # (The minimum value in the next row will be equal to or greater
    # than the minimum value in this row.)

    return nil	if threshold and crow.min >= threshold
  end

  crow[-1]
end

.normalized_distance(a1, a2, threshold = nil, options = {}) ⇒ Object

Returns the Levenshtein distance as a number between 0.0 and 1.0. It’s basically the Levenshtein distance divided by the size of the longest sequence.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/levenshtein.rb', line 10

def self.normalized_distance(a1, a2, threshold=nil, options={})
  size	= [a1.size, a2.size].max

  if a1.size == 0 and a2.size == 0
    0.0
  elsif a1.size == 0
    a2.size.to_f/size
  elsif a2.size == 0
    a1.size.to_f/size
  else
    if threshold
      if d = self.distance(a1, a2, (threshold*size).to_i+1)
        d.to_f/size
      else
        nil
      end
    else
      self.distance(a1, a2).to_f/size
    end
  end
end