Class: TextProfileSignature

Inherits:
Object
  • Object
show all
Defined in:
lib/text_profile_signature.rb

Constant Summary collapse

VERSION =
"0.1.0"

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ TextProfileSignature



25
26
27
28
29
30
# File 'lib/text_profile_signature.rb', line 25

# Builds a signature generator.
#
# Defaults are applied to a copy of +options+, so the hash supplied by the
# caller is never modified (and may be frozen).
#
# @param options [Hash] tuning knobs, keyed by symbol
# @option options [Integer] :min_token_length (2) tokens must be strictly
#   longer than this to be counted by #generate_sign
# @option options [Float] :quant_rate (0.01) multiplied by the highest token
#   frequency to derive the quantization step used by #generate_sign
def initialize(options={})
  # dup first: `||=` on the caller's own hash would leak the defaults back
  # to the caller and raise on a frozen hash.
  @options = options.dup
  @options[:min_token_length] ||= 2
  @options[:quant_rate] ||= 0.01
end

Instance Method Details

#generate_sign(text) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/text_profile_signature.rb', line 32

# Computes a fuzzy signature for +text+ from its quantized token-frequency
# profile.
#
# Steps:
# 1. Split the text into runs of alphanumeric characters, lower-casing each
#    character; every other character acts as a separator.
# 2. Count the tokens that are strictly longer than
#    <tt>@options[:min_token_length]</tt>.
# 3. Derive the quantization step QUANT from the highest count and
#    <tt>@options[:quant_rate]</tt>.
# 4. Round every count down to a multiple of QUANT and drop the tokens whose
#    rounded count falls below QUANT.
# 5. Order the survivors by decreasing count (ties broken by term, ascending),
#    render them as "term count" lines and hash the result with MD5.
#
# @param text [String] the text to fingerprint
# @return [String] hexadecimal MD5 digest of the quantized profile
def generate_sign(text)
  min_length = @options[:min_token_length]
  counts = Hash.new(0)
  token = String.new

  # Closes the token currently being built: counts it when it is long enough
  # and starts a fresh one. Shared by the separator case and the end-of-text
  # case so both apply exactly the same rule (an empty token is never counted).
  flush = lambda do
    next if token.empty?

    counts[token] += 1 if token.length > min_length
    token = String.new
  end

  text.each_char do |character|
    if character =~ /[[:alnum:]]/
      token << Unicode::downcase(character)
    else
      flush.call
    end
  end
  # The text may end in the middle of a token.
  flush.call

  # calculate the QUANT value
  max_freq = counts.values.max || 0
  quant = (max_freq * @options[:quant_rate]).round
  # Never quantize below 2 once any token repeats; 1 keeps singletons when
  # nothing repeats (and avoids dividing by zero).
  quant = (max_freq > 1 ? 2 : 1) if quant < 2

  # round the counts down to the nearest multiple of QUANT and discard the
  # tokens whose quantized frequency falls below QUANT
  quantized = counts.map { |term, count| [term, (count / quant) * quant] }
  kept = quantized.select { |_term, count| count >= quant }

  # decreasing frequency first, then term in ascending order
  profile = kept.sort_by { |term, count| [-count, term] }

  # one "term count" entry per line
  quantized_frequency_str = profile.map { |term, count| "#{term} #{count}" }.join("\n")

  Digest::MD5.hexdigest(quantized_frequency_str)
end