Module: DocumentCache

Defined in:
lib/document-cache.rb

Class Method Summary collapse

Class Method Details

.add(search) ⇒ Object



13
14
15
16
# File 'lib/document-cache.rb', line 13

# Writes +search+ to a newly created file inside CACHE_DIR.
#
# The file name is whatever UUID.new.generate returns (presumably a unique
# identifier per call -- confirm against the UUID library in use).
#
# @param search [Object] content handed to the file's +write+ call
# @return [Object] the value returned by +write+ for the written content
def self.add(search)
  target = "#{CACHE_DIR}/#{UUID.new.generate}"
  File.open(target, 'w') do |io|
    io.write(search)
  end
end

.clean(sentence) ⇒ Object



62
63
64
# File 'lib/document-cache.rb', line 62

# Normalises a sentence fragment: surrounding whitespace is removed and a
# period is appended.
#
# @param sentence [String] raw sentence text
# @return [String] the stripped text followed by "."
def self.clean(sentence)
  trimmed = sentence.strip
  trimmed + "."
end

.documents ⇒ Object



54
55
56
# File 'lib/document-cache.rb', line 54

# Lists the entries found directly inside CACHE_DIR.
#
# @return [Array<String>] paths matching the glob "CACHE_DIR/*"
def self.documents
  pattern = "#{CACHE_DIR}/*"
  Dir.glob(pattern)
end

.extract_matching_words(search, sentence) ⇒ Object



66
67
68
69
70
# File 'lib/document-cache.rb', line 66

# Returns the words of a single +sentence+ that match +search+.
#
# Stem-based matching is tried first; only when it finds nothing is the
# plain substring matcher consulted.
#
# @param search [String] term to look for
# @param sentence [String] sentence to inspect
# @return [Array, nil] the first value of the matcher's result hash, or nil
#   when the substring matcher's result is empty as well
def self.extract_matching_words(search, sentence)
  stemmed = find_matches_by_stemming(search, [sentence])
  return stemmed.values.first unless stemmed.empty?

  find_matches_by_grepping(search, [sentence]).values.first
end

.find_examples_for(search, count = 1) ⇒ Object



58
59
60
# File 'lib/document-cache.rb', line 58

# Looks for +search+ across every file reported by +documents+.
#
# @param search [String] term to look for
# @param count [Integer] forwarded to find_matches_in as its +count+ argument
# @return [Object] whatever find_matches_in returns
def self.find_examples_for(search, count = 1)
  find_matches_in(documents, search, count)
end

.find_matches_by_grepping(search, sentences) ⇒ Object



28
29
30
31
32
33
# File 'lib/document-cache.rb', line 28

# Finds sentences that contain +search+ as a literal substring.
#
# @param search [String] text to look for
# @param sentences [Enumerable<String>] candidate sentences
# @return [Hash] maps each matching sentence (after +clean+) to a
#   one-element array holding +search+; empty when nothing matches
def self.find_matches_by_grepping(search, sentences)
  hits = {}
  sentences.each do |sentence|
    next unless sentence.include?(search)

    hits[clean(sentence)] = [search]
  end
  hits
end

.find_matches_by_stemming(search, sentences) ⇒ Object



18
19
20
21
22
23
24
25
26
# File 'lib/document-cache.rb', line 18

# Finds sentences containing a word whose stem equals the stem of +search+.
#
# Each sentence is split on whitespace and every word is passed through
# VocabularyChest::stem for comparison.
#
# @param search [String] term whose stem is compared against each word
# @param sentences [Enumerable<String>] candidate sentences
# @return [Hash] maps each matching sentence (after +clean+) to the array of
#   its matching words, each passed through VocabularyChest::sanitize
def self.find_matches_by_stemming(search, sentences)
  target_stem = VocabularyChest::stem(search)
  results = {}
  sentences.each do |sentence|
    hits = sentence.split(" ").select { |word| VocabularyChest::stem(word) == target_stem }
    next if hits.empty?

    results[clean(sentence)] = hits.map { |word| VocabularyChest::sanitize(word) }
  end
  results
end

.find_matches_in(filenames, search, count) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/document-cache.rb', line 35

# Collects up to +count+ matching sentences from the given files.
#
# All files are scanned with the stemming matcher first, then again with the
# substring matcher. Each file's text is split into sentences on ".", "?",
# "!" and newlines. After every file the accumulated matches are trimmed to
# +count+ by dropping the oldest entries, and the search stops as soon as
# exactly +count+ matches are held.
#
# @param filenames [Enumerable<String>] paths of the files to scan
# @param search [String] term to look for
# @param count [Integer] number of matches wanted
# @return [Hash] sentence => matched words; may hold fewer than +count+
#   entries when the files run out
def self.find_matches_in(filenames, search, count)
  collected = {}

  matchers = [:find_matches_by_stemming, :find_matches_by_grepping]
  matchers.each do |matcher|
    filenames.each do |filename|
      sentences = File.read(filename).split(/[\.?!\n]/)
      collected.merge!(send(matcher, search, sentences))

      # Keep only the newest +count+ entries.
      collected.shift while collected.size > count
      return collected if collected.size == count
    end
  end

  collected
end

.frequency_list ⇒ Object



72
73
74
75
76
77
78
# File 'lib/document-cache.rb', line 72

# Builds a word-frequency table over every file reported by +documents+.
#
# Words are the whitespace-separated tokens of each file, compared exactly
# (no case folding or punctuation stripping). Each file is tokenised on its
# own so that the last word of one file can never fuse with the first word
# of the next, and File.read is used so no file handle is left open.
#
# @return [Array<Array(String, Integer)>] [word, count] pairs for words seen
#   at least twice, ordered from most to least frequent
def self.frequency_list
  counts = Hash.new(0)
  documents.each do |path|
    File.read(path).split(" ").each { |word| counts[word] += 1 }
  end

  # Words that occur only once are dropped.
  repeated = counts.reject { |_word, count| count < 2 }
  repeated.sort_by { |_word, count| count }.reverse
end

.stemmed_frequency_list ⇒ Object



80
81
82
83
84
85
86
87
# File 'lib/document-cache.rb', line 80

# Builds a stem-frequency table over every file reported by +documents+.
#
# Each whitespace-separated token is reduced with VocabularyChest::stem
# before counting. Each file is tokenised on its own so that the last word
# of one file can never fuse with the first word of the next, and File.read
# is used so no file handle is left open.
#
# @return [Array<Array>] [stem, count] pairs for stems seen at least twice,
#   ordered from most to least frequent
def self.stemmed_frequency_list
  counts = Hash.new(0)
  documents.each do |path|
    File.read(path).split(" ").each do |word|
      counts[VocabularyChest::stem(word)] += 1
    end
  end

  # Stems that occur only once are dropped.
  repeated = counts.reject { |_stem, count| count < 2 }
  repeated.sort_by { |_stem, count| count }.reverse
end