Class: IMatch

Inherits:
Object
  • Object
show all
Defined in:
lib/imatch.rb,
lib/lexicon.rb

Defined Under Namespace

Classes: Lexicon

Constant Summary collapse

# Library version string.
VERSION = '0.1.0'.freeze
# Lexicon file used when #initialize is called without an explicit file:
# data/en.dat, resolved relative to the directory of this source file.
DEFAULT_LEXICON_FILE = File.join(File.dirname(__FILE__), 'data', 'en.dat').freeze
# Default for options[:lexicons]; with 0, #initialize builds no lexicon subsets.
DEFAULT_NUMBER_OF_LEXICONS = 0
# Default for options[:lexicon_fraction]; the value is handed to Lexicon#subset
# (presumably the share of the lexicon each subset keeps -- confirm in lexicon.rb).
DEFAULT_LEXICON_FRACTION = 0.66

Instance Method Summary collapse

Constructor Details

#initialize(file = DEFAULT_LEXICON_FILE, options = {}) ⇒ IMatch

Returns a new instance of IMatch.



16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/imatch.rb', line 16

# Builds a matcher around the lexicon stored in +file+.
#
# @param file [String] path handed to IMatch::Lexicon.new
# @param options [Hash] optional settings:
#   :stop_words       - collection converted with #to_set (default: empty)
#   :stemming         - truthy to stem tokens that respond to #stem
#   :lexicons         - number of lexicon subsets to build (via #to_i)
#   :lexicon_fraction - value passed to Lexicon#subset (via #to_f)
# @return [IMatch] a new instance of IMatch
def initialize(file = DEFAULT_LEXICON_FILE, options = {})
  @lexicon = IMatch::Lexicon.new(file).freeze

  stop_list = options[:stop_words] || []
  @stop_words = stop_list.to_set
  @should_stem = options[:stemming] ? true : false

  @number_of_lexicons = (options[:lexicons] || DEFAULT_NUMBER_OF_LEXICONS).to_i
  @lexicon_fraction = (options[:lexicon_fraction] || DEFAULT_LEXICON_FRACTION).to_f

  # The range is empty for counts of zero or below, so no subsets are built then.
  @subsets = (1..@number_of_lexicons).map { @lexicon.subset(@lexicon_fraction) }
end

Instance Method Details

#lexicon ⇒ Object



70
71
72
# File 'lib/imatch.rb', line 70

# Returns the lexicon held in @lexicon, which #initialize assigns once and
# freezes right after construction.
#
# @return [IMatch::Lexicon] the lexicon this matcher was built with
def lexicon
  @lexicon
end

#multiple_signatures(string, tokenize = /\s+/) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/imatch.rb', line 30

# Computes one signature against the full lexicon plus one per lexicon subset
# held in @subsets, and returns the distinct results.
#
# @param string [String, nil] text to fingerprint
# @param tokenize [Regexp, String] separator passed through to #signature
# @return [Set] the distinct signatures; falsy results from #signature are
#   left out, so the set may be empty
def multiple_signatures(string, tokenize = /\s+/)
  # Full-lexicon signature first, then the subsets in their stored order.
  candidates = [signature(string, tokenize)]
  @subsets.each { |subset| candidates << signature(string, tokenize, subset) }

  candidates.each_with_object(Set.new) { |sig, found| found << sig if sig }
end

#signature(string, tokenize = /\s+/, lexicon = nil) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/imatch.rb', line 46

# Fingerprints +string+ from the distinct tokens that survive filtering.
#
# Each token is downcased, stemmed when stemming is enabled and the token
# responds to #stem, then kept only if it is not a stop word and is present in
# the lexicon in use. The surviving tokens are de-duplicated, sorted and
# handed to #finger_print (defined elsewhere in this class).
#
# @param string [String, nil] text to fingerprint
# @param tokenize [Regexp, String] separator given to String#split
# @param lexicon [#include?, nil] lexicon to filter against; falls back to
#   @lexicon when nil
# @return [Object, nil] the #finger_print result, or nil when +string+ is
#   nil/false, yields no tokens, or has no usable tokens
def signature(string, tokenize = /\s+/, lexicon = nil)
  return nil unless string

  words = string.split(tokenize)
  return nil if words.empty?

  vocabulary = lexicon || @lexicon

  kept = words.each_with_object(Set.new) do |word, acc|
    term = word.downcase
    term = term.stem if @should_stem && term.respond_to?(:stem)

    # Stop words are rejected before the lexicon is consulted.
    acc << term if !@stop_words.include?(term) && vocabulary.include?(term)
  end

  return nil if kept.empty?

  finger_print(kept.to_a.sort)
end

#to_s ⇒ Object



74
75
76
# File 'lib/imatch.rb', line 74

# Renders the matcher as an XML-like string: an <IMatch> element carrying the
# stemming flag and the stop-word count as attributes, wrapping the lexicon's
# own string form.
#
# @return [String]
def to_s
  attributes = %(stemming="#{@should_stem}" stop_word_count="#{@stop_words.size}")
  "<IMatch #{attributes}>#{@lexicon.to_s}</IMatch>"
end