Class: TextProfileSignature
- Inherits:
-
Object
- Object
- TextProfileSignature
- Defined in:
- lib/text_profile_signature.rb
Constant Summary collapse
- VERSION =
"0.1.0"
Instance Method Summary collapse
- #generate_sign(text) ⇒ Object
-
#initialize(options = {}) ⇒ TextProfileSignature
constructor
A new instance of TextProfileSignature.
Constructor Details
#initialize(options = {}) ⇒ TextProfileSignature
Returns a new instance of TextProfileSignature.
25 26 27 28 29 30 |
# File 'lib/text_profile_signature.rb', line 25

# Creates a signature generator, filling in defaults for any option the
# caller leaves unset (missing, nil or false).
#
# @param options [Hash] optional settings
# @option options [Integer] :min_token_length (2) tokens must be strictly
#   longer than this to be counted by #generate_sign
# @option options [Numeric] :quant_rate (0.01) multiplied by the highest
#   token frequency in #generate_sign to obtain the quantization step
# @return [TextProfileSignature] a new instance of TextProfileSignature
def initialize(options = {})
  # Work on a shallow copy: the defaults must not leak into the caller's
  # hash, and a frozen hash must be accepted.
  settings = options.dup
  settings[:min_token_length] ||= 2
  settings[:quant_rate] ||= 0.01
  @options = settings
end
Instance Method Details
#generate_sign(text) ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/text_profile_signature.rb', line 32

# Computes a fuzzy signature of +text+ from its quantized token profile.
#
# Steps:
# 1. Split the text into tokens: maximal runs of alphanumeric characters,
#    lower-cased. Every other character acts as a separator.
# 2. Ignore tokens whose length is not greater than
#    @options[:min_token_length].
# 3. Derive QUANT from the highest token frequency and
#    @options[:quant_rate], round every count down to a multiple of QUANT,
#    and drop tokens that fall below QUANT.
# 4. Order the survivors by decreasing count (ties by term, ascending),
#    render them one "term count" entry per line, and hash the result.
#
# @param text [#each_char] the text to fingerprint
# @return [String] hexadecimal MD5 digest of the rendered profile
def generate_sign(text)
  min_length = @options[:min_token_length]
  frequencies = Hash.new(0)

  # Single place where a finished token is counted, used both when a
  # separator is met and for the token still pending at the end of the text.
  record = lambda do |token|
    frequencies[token] += 1 if token.length > min_length
  end

  pending = String.new
  text.each_char do |character|
    if character =~ /[[:alnum:]]/
      pending << Unicode::downcase(character)
    elsif !pending.empty?
      record.call(pending)
      pending = String.new
    end
  end
  # The text may end without a trailing separator.
  record.call(pending)

  # Calculate the QUANT value; it is never allowed to drop below 1, and is
  # at least 2 as soon as any token occurs more than once.
  max_freq = frequencies.values.max || 0
  quant = (max_freq * @options[:quant_rate]).round
  quant = (max_freq > 1 ? 2 : 1) if quant < 2

  # Round counts down to the nearest multiple of QUANT and discard the
  # tokens whose quantized frequency falls below QUANT.
  profile = frequencies.each_with_object([]) do |(term, count), kept|
    quantized = (count / quant) * quant
    kept << [term, quantized] if quantized >= quant
  end

  # Decreasing frequency first, then term in ascending order so the result
  # does not depend on insertion order.
  profile.sort! do |left, right|
    [right[1], left[0]] <=> [left[1], right[0]]
  end

  # One "term count" entry per line.
  rendered = profile.map { |term, count| "#{term} #{count}" }.join("\n")

  Digest::MD5.hexdigest(rendered)
end