Class: LanguageDetector::Profile
- Inherits: Object
  - Object
  - LanguageDetector::Profile
- Defined in:
- lib/language_detector.rb
Constant Summary
- PUNCTUATIONS =
[?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/, ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
- LIMIT =
2000
Instance Attribute Summary
-
#name ⇒ Object
readonly
Returns the value of attribute name.
-
#ngrams ⇒ Object
readonly
Returns the value of attribute ngrams.
Instance Method Summary
- #_init_with_string(str, ngram_count) ⇒ Object
- #compute_distance(other_profile) ⇒ Object
- #count_ngram(token, n, counts) ⇒ Object
- #init_with_file(filename) ⇒ Object
- #init_with_string(str) ⇒ Object
-
#initialize(name) ⇒ Profile
constructor
A new instance of Profile.
- #is_puctuation?(b) ⇒ Boolean
- #tokenize(str) ⇒ Object
Constructor Details
#initialize(name) ⇒ Profile
Returns a new instance of Profile.
140 141 142 143 144 145 |
# File 'lib/language_detector.rb', line 140

# Creates an empty profile: stores the name, builds the punctuation lookup
# table from PUNCTUATIONS and starts with no ranked n-grams.
#
# @param name [Object] identifier stored in @name and exposed by #name
def initialize(name)
  @name = name
  @puctuations = {}
  PUNCTUATIONS.each do |p|
    @puctuations[p] = 1
    # tokenize looks up the Integer values yielded by String#each_byte, but
    # on Ruby >= 1.9 a ?x literal is a one-character String, so the entry
    # above alone never matches a byte. #ord returns the byte value on both
    # 1.8.7 (Integer#ord) and later versions (String#ord).
    @puctuations[p.ord] = 1
  end
  @ngrams = {}
end
Instance Attribute Details
#name ⇒ Object (readonly)
Returns the value of attribute name.
138 139 140 |
# File 'lib/language_detector.rb', line 138

# Reader for the name attribute.
#
# @return [Object] the value held in @name
def name
  @name
end
#ngrams ⇒ Object (readonly)
Returns the value of attribute ngrams.
138 139 140 |
# File 'lib/language_detector.rb', line 138

# Reader for the ngrams attribute.
#
# @return [Object] the value held in @ngrams
def ngrams
  @ngrams
end
Instance Method Details
#_init_with_string(str, ngram_count) ⇒ Object
179 180 181 182 183 184 185 186 187 |
# File 'lib/language_detector.rb', line 179

# Tokenizes +str+ and tallies the 2-, 3-, 4- and 5-grams of every token
# into +ngram_count+ (via #count_ngram, in that order for each token).
#
# @param str [String] text to tokenize
# @param ngram_count [Hash] tally that is updated in place
# @return [Array] the tokens produced by #tokenize
def _init_with_string(str, ngram_count)
  tokenize(str).each do |token|
    (2..5).each { |size| count_ngram(token, size, ngram_count) }
  end
end
#compute_distance(other_profile) ⇒ Object
125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/language_detector.rb', line 125

# Sums, over every n-gram of +other_profile+, the absolute difference
# between its value there and its value in this profile's @ngrams.
# An n-gram that this profile does not contain adds
# LanguageDetector::Profile::LIMIT instead.
#
# @param other_profile [#ngrams] object whose ngrams hash is iterated
# @return [Integer] the accumulated distance (0 when other has no n-grams)
def compute_distance(other_profile)
  other_profile.ngrams.inject(0) do |total, (ngram, other_value)|
    own_value = @ngrams[ngram]
    step = own_value ? (other_value - own_value).abs : LanguageDetector::Profile::LIMIT
    total + step
  end
end
#count_ngram(token, n, counts) ⇒ Object
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
# File 'lib/language_detector.rb', line 208

# Adds every n-gram of +token+ to the +counts+ tally.
#
# When n > 1 and the token is at least n characters long, the token is
# first wrapped as "_token" followed by n-1 underscores, so that word
# boundaries take part in the n-grams. Tokens shorter than n are left
# unpadded and therefore contribute nothing.
#
# @param token [String] the word to slice
# @param n [Integer] n-gram length
# @param counts [Hash] tally of n-gram => occurrences, updated in place
# @return [Hash] the same +counts+ object
def count_ngram(token, n, counts)
  # String#jlength comes from the jcode library, which only exists on
  # Ruby 1.8; fall back to String#length (already character-based on 1.9+).
  char_count = token.respond_to?(:jlength) ? token.jlength : token.length
  token = "_#{token}#{'_' * (n - 1)}" if n > 1 && char_count >= n

  # The range is empty when the token is shorter than n.
  (0..token.length - n).each do |start|
    gram = token[start, n]
    counts[gram] = (counts[gram] || 0) + 1
  end
  counts
end
#init_with_file(filename) ⇒ Object
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
# File 'lib/language_detector.rb', line 147

# Trains this profile from a file in the training_data directory that sits
# next to this source file.
#
# Every line is fed to #_init_with_string; the resulting n-grams are then
# sorted by descending count and the first LIMIT of them are stored in
# @ngrams with their 1-based position as the value.
#
# @param filename [String] file name relative to the training_data directory
# @raise [SystemCallError] if the file cannot be read (e.g. Errno::ENOENT)
def init_with_file(filename)
  ngram_count = {}
  path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
  puts "training with " + path

  # File.foreach closes the file when iteration ends; a bare
  # File.open(path).each_line would leave the handle open.
  File.foreach(path) { |line| _init_with_string(line, ngram_count) }

  by_count_desc = ngram_count.sort { |x, y| y[1] <=> x[1] }
  by_count_desc.first(LIMIT).each_with_index do |(ngram, _count), index|
    @ngrams[ngram] = index + 1
  end
end
#init_with_string(str) ⇒ Object
165 166 167 168 169 170 171 172 173 174 175 176 177 |
# File 'lib/language_detector.rb', line 165

# Trains this profile from a single string.
#
# Tallies the string's n-grams through #_init_with_string, orders them by
# descending count and records them in @ngrams with their 1-based position
# as the value, stopping once LIMIT entries have been written.
#
# @param str [String] sample text
def init_with_string(str)
  ngram_count = {}
  _init_with_string(str, ngram_count)

  by_count_desc = ngram_count.sort { |x, y| y[1] <=> x[1] }
  rank = 0
  by_count_desc.each do |ngram, _count|
    rank += 1
    @ngrams[ngram] = rank
    break if rank >= LIMIT
  end
end
#is_puctuation?(b) ⇒ Boolean
204 205 206 |
# File 'lib/language_detector.rb', line 204

# Tells whether +b+ is present in the punctuation lookup table
# (@puctuations, filled by the constructor).
#
# @param b [Object] the key to look up (tokenize passes byte values)
# @return [Boolean] true when +b+ is a known punctuation key, else false
def is_puctuation?(b)
  # key? gives a strict true/false, as the documented Boolean return
  # promises, instead of leaking the table's 1/nil placeholder values.
  @puctuations.key?(b)
end
#tokenize(str) ⇒ Object
189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
# File 'lib/language_detector.rb', line 189

# Splits +str+ into tokens at every byte for which #is_puctuation? is
# truthy. Runs of punctuation produce no empty tokens.
#
# @param str [String] text to split
# @return [Array<String>] the non-empty tokens, in order of appearance
def tokenize(str)
  # Collect raw bytes per token. Appending an Integer with String#<< treats
  # it as a code point on Ruby >= 1.9, which would turn each byte of a
  # multi-byte character into a separate, wrong character.
  groups = [[]]
  str.each_byte do |b|
    if is_puctuation?(b)
      groups << [] unless groups.last.empty?
    else
      groups.last << b
    end
  end
  # Only the trailing group can be empty (input was empty or ended on
  # punctuation).
  groups.pop if groups.last.empty?

  groups.map do |bytes|
    token = bytes.pack('C*')
    # Restore the input's encoding where strings carry one (Ruby >= 1.9).
    token.respond_to?(:force_encoding) ? token.force_encoding(str.encoding) : token
  end
end