Class: Ferret::Search::FuzzyQuery

Inherits:
MultiTermQuery show all
Defined in:
lib/ferret/search/fuzzy_query.rb

Overview

Implements the fuzzy search query. The similarity measurement is based on the Levenshtein (edit distance) algorithm.

Defined Under Namespace

Classes: ScoreTerm, ScoreTermQueue

Constant Summary collapse

@@default_min_similarity =
0.5
@@default_prefix_length =
0

Instance Attribute Summary collapse

Attributes inherited from MultiTermQuery

#term

Attributes inherited from Query

#boost

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Query

#combine, #create_weight, #extract_terms, #merge_boolean_queries, #similarity, #weight

Constructor Details

#initialize(term, minimum_similarity = @@default_min_similarity, prefix_length = @@default_prefix_length) ⇒ FuzzyQuery

Create a new FuzzyQuery that will match terms with a similarity of at least minimum_similarity to term. If a prefix_length > 0 is specified, a common prefix of that length is also required.

term

the term to search for

minimum_similarity

a value between 0 and 1 to set the required similarity between the query term and the matching terms. For example, for a minimum_similarity of 0.5 a term of the same length as the query term is considered similar to the query term if the edit distance between both terms is less than length(term)*0.5

prefix_length

length of the common (non-fuzzy) prefix. This is the number of characters at the start of a term that must be identical (not fuzzy) to the query term if the query is to match that term.

raises

ArgumentError if minimum_similarity is >= 1 or < 0 or if prefix_length < 0



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/ferret/search/fuzzy_query.rb', line 53

# Builds a fuzzy query for +term+.
#
# term::               the term to search for; handed to the superclass
#                      constructor
# minimum_similarity:: required similarity; rejected when >= 1 or < 0
# prefix_length::      length of the common (non-fuzzy) prefix; rejected
#                      when negative
#
# Raises ArgumentError for an out-of-range argument. The checks run after the
# superclass constructor has been called.
def initialize(term,
               minimum_similarity = @@default_min_similarity,
               prefix_length = @@default_prefix_length)
  super(term)

  # Guard clauses, checked in the same order as the documented contract.
  raise ArgumentError, "minimum_similarity >= 1" if minimum_similarity >= 1.0
  raise ArgumentError, "minimum_similarity < 0" if minimum_similarity < 0.0
  raise ArgumentError, "prefix_length < 0" if prefix_length < 0

  @minimum_similarity = minimum_similarity
  @prefix_length = prefix_length
end

Instance Attribute Details

#minimum_similarityObject (readonly)

Returns the value of attribute minimum_similarity.



33
34
35
# File 'lib/ferret/search/fuzzy_query.rb', line 33

# Reader for the minimum similarity stored on this query.
def minimum_similarity
  return @minimum_similarity
end

#prefix_lengthObject (readonly)

Returns the value of attribute prefix_length.



33
34
35
# File 'lib/ferret/search/fuzzy_query.rb', line 33

# Reader for the prefix length stored on this query.
def prefix_length
  return @prefix_length
end

Class Method Details

.default_min_similarityObject



8
9
10
# File 'lib/ferret/search/fuzzy_query.rb', line 8

# Returns the class-level default minimum similarity, i.e. the value the
# constructor falls back to when no minimum_similarity is passed.
def FuzzyQuery.default_min_similarity
  @@default_min_similarity
end

.default_min_similarity=(minimum_similarity) ⇒ Object



12
13
14
15
16
17
18
19
# File 'lib/ferret/search/fuzzy_query.rb', line 12

# Replaces the class-level default minimum similarity.
#
# minimum_similarity:: the new default
#
# Raises ArgumentError when the value is >= 1 or < 0; the stored default is
# left untouched in that case.
def FuzzyQuery.default_min_similarity=(minimum_similarity)
  too_high = minimum_similarity >= 1.0
  raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1" if too_high

  too_low = minimum_similarity < 0.0
  raise ArgumentError, "minimum_similarity cannot be less than 0" if too_low

  @@default_min_similarity = minimum_similarity
end

.default_prefix_lengthObject



21
22
23
# File 'lib/ferret/search/fuzzy_query.rb', line 21

# Returns the class-level default prefix length, i.e. the value the
# constructor falls back to when no prefix_length is passed.
def FuzzyQuery.default_prefix_length
  @@default_prefix_length
end

.default_prefix_length=(prefix_length) ⇒ Object



25
26
27
28
29
30
# File 'lib/ferret/search/fuzzy_query.rb', line 25

# Replaces the class-level default prefix length.
#
# prefix_length:: the new default
#
# Raises ArgumentError when the value is negative; the stored default is left
# untouched in that case.
def FuzzyQuery.default_prefix_length=(prefix_length)
  raise ArgumentError, "prefix_length cannot be less than 0" if prefix_length < 0

  @@default_prefix_length = prefix_length
end

Instance Method Details

#eql?(o) ⇒ Boolean Also known as: ==

Returns:

  • (Boolean)


142
143
144
145
146
# File 'lib/ferret/search/fuzzy_query.rb', line 142

# Equality check for fuzzy queries.
#
# o:: the object to compare against
#
# Returns true only when +o+ is exactly a FuzzyQuery, the superclass
# comparison succeeds, and both minimum_similarity and prefix_length match.
def eql?(o)
  # The prefix length must be read from +o+; the previous code referenced an
  # undefined name here and raised NameError whenever the first three checks
  # passed.
  return (o.instance_of?(FuzzyQuery) and super(o) and
          (@minimum_similarity == o.minimum_similarity) and
          (@prefix_length == o.prefix_length))
end

#get_term_enum(reader) ⇒ Object



72
73
74
# File 'lib/ferret/search/fuzzy_query.rb', line 72

# Creates the FuzzyTermEnum used to walk candidate terms.
#
# reader:: forwarded unchanged as the enumerator's first argument
#
# The enumerator also receives this query's term, minimum similarity and
# prefix length.
def get_term_enum(reader)
  FuzzyTermEnum.new(reader, @term, @minimum_similarity, @prefix_length)
end

#hashObject



149
150
151
# File 'lib/ferret/search/fuzzy_query.rb', line 149

# Hash code for this query: the superclass hash XOR-ed with the hashes of the
# minimum similarity and the prefix length.
def hash
  inherited = super
  inherited ^ @minimum_similarity.hash ^ @prefix_length.hash
end

#rewrite(reader) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/ferret/search/fuzzy_query.rb', line 76

# Expands this fuzzy query into a BooleanQuery of TermQuery clauses.
#
# Walks the enumerator returned by get_term_enum(reader), collects scored
# terms in a ScoreTermQueue sized by BooleanQuery.max_clause_count, and then
# adds one SHOULD clause per queued term. Each clause is boosted by this
# query's boost multiplied by the term's score. The enumerator is closed even
# if enumeration raises.
#
# reader:: passed straight through to get_term_enum
#
# Returns the assembled BooleanQuery.
def rewrite(reader)
  fuzzy_enum = get_term_enum(reader)
  max_clause_count = BooleanQuery.max_clause_count
  st_queue = ScoreTermQueue.new(max_clause_count)

  # Lowest score currently held in the queue. It must live outside the loop:
  # resetting it on every pass (as the code previously did) turns the
  # "queue is full" shortcut below into a no-op.
  min_score = 0.0

  begin
    # do-while: the enumerator is already positioned on its first term.
    begin
      t = fuzzy_enum.term
      if t
        score = fuzzy_enum.difference

        # terms come in alphabetical order, therefore if queue is full and
        # score not bigger than min_score, we can skip
        if st_queue.size < max_clause_count || score > min_score
          st_queue.insert(ScoreTerm.new(t, score))
          min_score = st_queue.top.score # maintain min_score
        end
      end
    end while fuzzy_enum.next?
  ensure
    fuzzy_enum.close
  end

  # NOTE(review): the meaning of the +true+ argument is not visible here;
  # presumably it disables coordination scoring - confirm against BooleanQuery.
  bq = BooleanQuery.new(true)
  st_queue.size.times do
    st = st_queue.pop
    tq = TermQuery.new(st.term)                     # found a match
    tq.boost = boost() * st.score                   # set the boost
    bq.add_query(tq, BooleanClause::Occur::SHOULD)  # add to query
  end

  bq
end

#to_s(field = nil) ⇒ Object



113
114
115
116
117
118
119
# File 'lib/ferret/search/fuzzy_query.rb', line 113

# Renders the query as a string.
#
# field:: when equal to the term's field, the "field:" prefix is left out
#
# The result is the term text, "~" and the minimum similarity, followed by
# "^boost" when the boost differs from 1.0.
def to_s(field = nil)
  pieces = []
  pieces << "#{@term.field}:" if @term.field != field
  pieces << "#{@term.text}~#{minimum_similarity}"
  pieces << "^#{boost()}" if boost() != 1.0
  pieces.join
end