4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
|
# File 'lib/text_classifier.rb', line 4
# Scores +test_doc+ against every category with a multinomial Naive Bayes
# model using add-one (Laplace) smoothing, and returns the normalised
# posterior probability of each category.
#
# Training documents and the test document go through the same tokeniser:
# text is lowercased, every character other than a-z and the apostrophe is
# turned into a separator, and stop words are discarded.
#
# @param documents_by_category [Array<Array<String>>] training documents,
#   grouped by category; a category is identified by its position.
#   (Assumed shape: the method only relies on +size+, integer indexing and
#   each group yielding strings.)
# @param test_doc [String] the document to classify.
# @return [Array<Float>] one probability per category, in the same order as
#   +documents_by_category+, summing to 1.0. Returns +[]+ when there are no
#   categories, and a uniform distribution when there are categories but no
#   training documents at all (the priors are undefined in that case).
def self.classify(documents_by_category, test_doc)
  stop_words = Set.new(%w[
    a about above after again against all am an and any are aren't as at be
    because been before being below between both but by can't cannot could
    couldn't did didn't do does doesn't doing don't down during each few for
    from further had hadn't has hasn't have haven't having he he'd he'll he's
    her here here's hers herself him himself his how how's i i'd i'll i'm i've
    if in into is isn't it it's its itself let's me more most mustn't my myself
    no nor not of off on once only or other ought our ours ourselves out over
    own same shan't she she'd she'll she's should shouldn't so some such than
    that that's the their theirs them themselves then there there's these they
    they'd they'll they're they've this those through to too under until up
    very was wasn't we we'd we'll we're we've were weren't what what's when
    when's where where's which while who who's whom why why's with won't would
    wouldn't you you'd you'll you're you've your yours yourself yourselves
  ])

  # Single tokeniser shared by training and test text so both sides of the
  # model see words in the same form.
  tokenize = lambda do |text|
    text.downcase.gsub(/[^a-z']/, ' ').split.reject { |word| stop_words.include?(word) }
  end

  num_categories = documents_by_category.size
  return [] if num_categories.zero?

  # Categories are addressed by integer position, exactly as callers pass them.
  categories = Array.new(num_categories) { |i| documents_by_category[i] }
  num_docs = categories.sum(&:size)
  # Without any training document the priors would be 0/0; fall back to
  # "every category equally likely" instead of returning NaN.
  return Array.new(num_categories, 1.0 / num_categories) if num_docs.zero?

  tokens_by_category = categories.map do |documents|
    documents.flat_map { |document| tokenize.call(document) }
  end
  counts_by_category = tokens_by_category.map do |tokens|
    tokens.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
  end

  vocabulary = Set.new
  counts_by_category.each { |counts| vocabulary.merge(counts.keys) }

  # Words never seen in training carry no evidence for any category; skipping
  # them avoids multiplying every score by zero.
  test_words = tokenize.call(test_doc).select { |word| vocabulary.include?(word) }

  # Work with log-probabilities so long documents do not underflow to 0.0.
  log_scores = categories.each_with_index.map do |documents, i|
    counts = counts_by_category[i]
    denominator = tokens_by_category[i].size + vocabulary.size
    log_prior = Math.log(documents.size.to_f / num_docs) # -Infinity for an empty category
    test_words.sum(log_prior) { |word| Math.log((1.0 + counts[word]) / denominator) }
  end

  # Normalise via the log-sum-exp trick: shifting by the best score keeps the
  # largest weight at exactly 1.0, so the sum can never be zero.
  best = log_scores.max
  weights = log_scores.map { |score| Math.exp(score - best) }
  total = weights.sum
  weights.map { |weight| weight / total }
end
|