Class: TextUtils::Classifier
- Inherits: Object
- Inheritance chain: Object → TextUtils::Classifier
- Includes:
- LogUtils::Logging
- Defined in:
- lib/textutils/classifier.rb
Instance Method Summary
- #classify(text_with_comments) ⇒ Object
- #classify_file(path) ⇒ Object
- #dump ⇒ Object
- #initialize ⇒ Classifier (constructor) — returns a new instance of Classifier.
- #train(key, ary_or_hash_or_str) ⇒ Object
Constructor Details
#initialize ⇒ Classifier
Returns a new instance of Classifier.
9 10 11 |
# File 'lib/textutils/classifier.rb', line 9
#
# Creates the (initially empty) word table: a hash whose values are word
# lists and whose default value for an unknown key is an empty array.
#
# Hash.new(obj) hands out the *same* default object for every missing key,
# so that array is frozen here. Reads and `@h[key] += words` behave exactly
# as before (`+` builds a new array), while an accidental in-place
# `@h[key] << word` on an unknown key now raises instead of silently
# leaking the word into every other unknown key.
def initialize
  @h = Hash.new( [].freeze )   # hash w/ words - default value is empty ary (word_list)
end
Instance Method Details
#classify(text_with_comments) ⇒ Object
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/textutils/classifier.rb', line 35
#
# Picks the trained key whose word list scores highest against the text.
#
# Comments are removed with strip_comments first; then every key's word list
# is scored with count_words_in_text (both helpers are defined outside this
# block - presumably the score is the number of word occurrences; confirm
# against the helper). Keys are ranked by score, highest first.
#
# text_with_comments - String to classify (its encoding is logged).
#
# Returns the key (e.g. lang code) with the highest count, or nil when no
# keys have been trained yet (this case used to raise NoMethodError on
# `counts[0][0]`).
def classify( text_with_comments )
  ## check encoding
  logger.debug " classify - text.encoding: #{text_with_comments.encoding.name}"

  # nb: strip comments first
  text = strip_comments( text_with_comments )

  ## e.g. [[ 'en', 20],   # 20 words
  ##       [ 'de',  2]]   #  2 words
  counts = @h.each_with_index.map do |(key, words), i|
    logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words"
    [key, count_words_in_text( words, text )]
  end

  # nothing trained => nothing to rank; bail out instead of crashing below
  if counts.empty?
    logger.debug "classifier - no keys trained; returning nil"
    return nil
  end

  # sort by word count (reverse sort e.g. highest count goes first)
  counts = counts.sort { |l,r| r[1] <=> l[1] }

  # dump stats
  ## e.g. 1. en: 20 words
  ##      2. de: 2 words
  logger.debug "results:"
  counts.each_with_index do |(key, count), i|
    logger.debug " #{i+1}. #{key}: #{count}"
  end

  best_key = counts.first.first
  logger.debug "classifier - using key >>#{best_key}<<"

  ## return key/lang code w/ highest count
  best_key
end
#classify_file(path) ⇒ Object
31 32 33 |
# File 'lib/textutils/classifier.rb', line 31
#
# Reads the file at path via File.read_utf8 (a helper not defined in this
# block - presumably it returns the contents as a UTF-8 string; confirm)
# and hands the text to #classify.
#
# path - location of the file to classify.
#
# Returns whatever #classify returns for the file's text.
def classify_file( path )
  text_with_comments = File.read_utf8( path )
  classify( text_with_comments )
end
#dump ⇒ Object
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
# File 'lib/textutils/classifier.rb', line 71
#
# Debugging aid: logs the current setup, that is, every key with its word
# list, plus the encoding name of the words. The encoding is only logged
# when it differs from the preceding word's encoding, so a uniformly
# encoded list produces a single encoding line
# (trouble w/ windows cp850 argh!!!).
def dump
  total = @h.size

  @h.each_with_index do |(key, words), idx|
    logger.debug "key #{key} (#{idx+1}/#{total}) - #{words.size} words:"
    logger.debug words.inspect

    ## check encoding of words - report each change of encoding once
    previous_name = ''
    words.each do |word|
      current_name = word.encoding.name
      next if current_name == previous_name

      logger.debug " encoding: #{current_name}"
      previous_name = current_name
    end
  end
end
#train(key, ary_or_hash_or_str) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/textutils/classifier.rb', line 13
#
# Adds words to the word list of a lang/topic key.
#
# key                - the key (e.g. lang code or topic) to add words to.
# ary_or_hash_or_str - the words, given as one of:
#                      * Array  - used as is
#                      * String - stripped, then split on '|'
#                      * Hash   - keys are ignored; every value is treated
#                                 like a top-level Array or String
#                                 (Array values used to raise NoMethodError)
#
# Returns the key's complete word list after the addition.
def train( key, ary_or_hash_or_str )
  # one rule for turning a single value into words, shared by all branches
  to_words = lambda do |value|
    value.kind_of?( Array ) ? value : value.strip.split('|')
  end

  words = if ary_or_hash_or_str.kind_of?( Hash )
            # flat_map avoids re-allocating the growing list for every entry
            ary_or_hash_or_str.values.flat_map { |value| to_words.call( value ) }
          else
            # array as is, or assume string (allow list separated by |)
            to_words.call( ary_or_hash_or_str )
          end

  # += builds a new array, so neither the caller's array nor the hash's
  # default value is ever modified in place
  @h[ key ] += words
end