Class: TextUtils::Classifier

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging
Defined in:
lib/textutils/classifier.rb

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Classifier

Returns a new instance of Classifier.



9
10
11
# File 'lib/textutils/classifier.rb', line 9

# Sets up an empty classifier.
#
# @h maps a key (e.g. a language code such as 'en') to its list of words.
# The default value is produced by a block so that every unseen key gets its
# own fresh array. A single shared default array (Hash.new([])) would be
# handed out for every missing key, so one in-place append would leak words
# into all other keys.
def initialize
  @h = Hash.new { |hash, key| hash[key] = [] }
end

Instance Method Details

#classify(text_with_comments) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/textutils/classifier.rb', line 35

# Picks the trained key that scores highest for the given text.
#
# The text is first passed through strip_comments; then every key in @h is
# scored with count_words_in_text( words, text ) and the key with the highest
# score wins. All intermediate results are written to the debug log.
#
# @param text_with_comments [String] text to classify (must respond to #encoding)
# @return [Object, nil] the key with the highest count, or nil when no key
#   has been trained yet (that case used to crash with NoMethodError on nil)
def classify( text_with_comments )

  ## check encoding
  logger.debug "  classify - text.encoding: #{text_with_comments.encoding.name}"

  # nb: strip comments first
  text = strip_comments( text_with_comments )

  ## one [key, count] pair per trained key
  ##  e.g. [[ 'en', 20], # 20 words
  ##        [ 'de',  2]] # 2 words
  total = @h.size
  counts = @h.each_with_index.map do |(key, words), i|
    logger.debug "key #{key} (#{i+1}/#{total}) - #{words.size} words"
    [key, count_words_in_text( words, text )]
  end

  # nothing trained - there is no best key to report
  if counts.empty?
    logger.debug "classifier - no keys trained; nothing to classify"
    return nil
  end

  # sort by word count (reverse sort e.g. highest count goes first)
  counts = counts.sort { |l, r| r[1] <=> l[1] }

  # dump stats
  logger.debug "results:"
  counts.each_with_index do |(key, count), i|
    ## e.g. 1. en: 20 words
    ##      2. de: 2 words
    logger.debug " #{i+1}. #{key}: #{count}"
  end

  best_key = counts.first.first
  logger.debug "classifier - using key >>#{best_key}<<"

  ## return key/lang code w/ highest count
  best_key
end

#classify_file(path) ⇒ Object



31
32
33
# File 'lib/textutils/classifier.rb', line 31

# Classifies the contents of a file.
#
# Reads the file with File.read_utf8 and passes the resulting text on to
# #classify.
#
# @param path path of the file to read (handed to File.read_utf8 unchanged)
# @return [Object] whatever #classify returns for the file's text
def classify_file( path )
  text = File.read_utf8( path )
  classify( text )
end

#dump ⇒ Object



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/textutils/classifier.rb', line 71

# Debugging aid: writes the current setup (every key with its word list) to
# the debug log.
#
# For each key the word list is logged, followed by one "encoding:" line for
# every run of consecutive words that share the same encoding name
# (useful for spotting mixed encodings, e.g. windows cp850).
#
# @return [Hash] the internal key => words hash (result of each_with_index)
def dump
  @h.each_with_index do |(key, words), i|
    position = i + 1
    logger.debug "key #{key} (#{position}/#{@h.size}) - #{words.size} words:"
    logger.debug words.inspect

    # only report an encoding when it differs from the previous word's
    previous = ''
    words.each do |word|
      current = word.encoding.name
      next if current == previous

      logger.debug "  encoding: #{current}"
      previous = current
    end
  end
end

#train(key, ary_or_hash_or_str) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/textutils/classifier.rb', line 13

# Adds words to the word list of a key (lang/topic).
#
# Accepted inputs:
# - Array:  used as the word list as-is
# - Hash:   every value is stripped and split on '|'; the keys are ignored
# - other:  treated as a string, stripped and split on '|'
#
# @param key [Object] the key (e.g. lang code) the words belong to
# @param ary_or_hash_or_str [Array, Hash, String] words to add
# @return [Array] the complete word list now stored for key
def train( key, ary_or_hash_or_str )
  new_words =
    case ary_or_hash_or_str
    when Array
      ary_or_hash_or_str
    when Hash
      ary_or_hash_or_str.values.flat_map { |line| line.strip.split('|') }
    else
      # assume string (allow list separated by |)
      ary_or_hash_or_str.strip.split('|')
    end

  # += builds a new array, so neither the caller's array nor the hash's
  # default value is modified in place
  @h[ key ] += new_words
end