32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
# File 'lib/text_profile_signature.rb', line 32
# Builds an MD5 hex digest from a quantized word-frequency profile of +text+.
#
# The text is split into runs of alphanumeric characters, each character
# lowercased through Unicode::downcase. Only runs strictly longer than
# @options[:min_token_length] are counted. Counts are then rounded down to a
# multiple of a quantization step derived from the highest count and
# @options[:quant_rate]; terms whose rounded count falls below the step are
# dropped. The surviving "term count" lines (highest count first, ties
# broken by term in ascending order) are joined with newlines and hashed.
#
# @param text [String] the text to profile
# @return [String] hexadecimal MD5 digest of the quantized profile
def generate_sign(text)
  counts = Hash.new(0)
  pending = String.new

  # Counts a finished term, provided it is longer than the configured minimum.
  record = lambda do |term|
    counts[term] += 1 if term.length > @options[:min_token_length]
  end

  text.each_char do |ch|
    if ch =~ /[[:alnum:]]/
      pending << Unicode::downcase(ch)
    elsif !pending.empty?
      record.call(pending)
      # Start a fresh buffer instead of clearing, so the recorded term is
      # never mutated afterwards.
      pending = String.new
    end
  end
  # The text may end in the middle of a term; account for that last one.
  record.call(pending)

  peak = counts.values.max || 0

  # The step is never allowed below 2 once any term repeats, and never
  # below 1 otherwise (which also keeps the division below safe).
  step = (peak * @options[:quant_rate]).round
  step = (peak > 1 ? 2 : 1) if step < 2

  kept = counts.each_with_object([]) do |(term, count), acc|
    rounded = (count / step) * step
    acc << { count: rounded, term: term } if rounded >= step
  end

  # Descending by count, then ascending by term.
  ordered = kept.sort_by { |entry| [-entry[:count], entry[:term]] }
  lines = ordered.map { |entry| "#{entry[:term]} #{entry[:count]}" }

  Digest::MD5.hexdigest(lines.join("\n"))
end
|