Class: DelimitedWordDataSource
- Inherits:
-
WordDataSource
- Object
- BaseDataSource
- WordDataSource
- DelimitedWordDataSource
- Defined in:
- lib/data/word_data_source.rb
Instance Attribute Summary collapse
-
#buckets ⇒ Object
readonly
Returns the value of attribute buckets.
-
#wordAsEncountered ⇒ Object
readonly
Returns the value of attribute wordAsEncountered.
-
#wordCounts ⇒ Object
readonly
Returns the value of attribute wordCounts.
-
#wordValueSequence ⇒ Object
readonly
Returns the value of attribute wordValueSequence.
Attributes inherited from WordDataSource
Attributes inherited from BaseDataSource
Instance Method Summary collapse
- #bucket ⇒ Object
- #has_terminator? ⇒ Boolean
-
#initialize(filePath, lineStateMachine, limit) ⇒ DelimitedWordDataSource
constructor
A new instance of DelimitedWordDataSource.
-
#metaDataFor(offset) ⇒ Object
TODO: fix this, linear metadata search, O(N) should be O(lg N).
- #process(line) ⇒ Object
- #processData(data, bucket) ⇒ Object
- #save ⇒ Object
- #terminator ⇒ Object
- #verify(word, count) ⇒ Object
- #wordCount(word) ⇒ Object
Methods inherited from WordDataSource
#numberValues, #preprocessLine, #toString, #valueAt
Methods inherited from BaseDataSource
#each_with_index, #extendWith, #nextDataSourceValueAt, #valueSequence
Constructor Details
#initialize(filePath, lineStateMachine, limit) ⇒ DelimitedWordDataSource
Returns a new instance of DelimitedWordDataSource.
123 124 125 126 127 128 129 130 131 132 133 134 |
# File 'lib/data/word_data_source.rb', line 123
# Sets up empty word-tracking state and then hands off to the parent constructor.
#
# All instance state is assigned before +super+ is called.
# NOTE(review): presumably the parent constructor starts reading the file and
# calls back into #process, which would need this state — confirm.
#
# @param filePath [String] path handed to the parent constructor
# @param lineStateMachine [Object] collaborator used by #process, #bucket,
#   #metaDataFor and #save (must respond to +process+, +bucket+ and +pages+)
# @param limit [Integer] maximum number of words to accept; values <= 0 disable the limit
def initialize(filePath, lineStateMachine, limit)
  @lineStateMachine = lineStateMachine
  @limit = limit
  @count = 0
  @buckets = {}
  @wordCounts = {}
  # list of words in file in terms of index into @wordAsEncountered
  @wordValueSequence = []
  # key is word, value is number as encountered
  @wordAsEncounteredIndex = {}
  # array entry added only when a new word is encountered
  @wordAsEncountered = []
  @nextWordEncounteredIndex = 0
  # NOTE(review): the second argument is a String that merely looks like a
  # regex literal, not a Regexp — confirm this is what the parent expects.
  super(filePath, "/[^[:print:]]/")
end
Instance Attribute Details
#buckets ⇒ Object (readonly)
Returns the value of attribute buckets.
121 122 123 |
# File 'lib/data/word_data_source.rb', line 121
# Per-bucket word tallies.
#
# @return [Hash] bucket => { word => occurrences within that bucket }
def buckets
  @buckets
end
#wordAsEncountered ⇒ Object (readonly)
Returns the value of attribute wordAsEncountered.
121 122 123 |
# File 'lib/data/word_data_source.rb', line 121
# Distinct words in the order each was first seen.
#
# @return [Array] one entry per distinct word; positions are the indexes
#   stored in #wordValueSequence
def wordAsEncountered
  @wordAsEncountered
end
#wordCounts ⇒ Object (readonly)
Returns the value of attribute wordCounts.
121 122 123 |
# File 'lib/data/word_data_source.rb', line 121
# Total occurrences of every word processed so far.
#
# @return [Hash] word => occurrence count
def wordCounts
  @wordCounts
end
#wordValueSequence ⇒ Object (readonly)
Returns the value of attribute wordValueSequence.
121 122 123 |
# File 'lib/data/word_data_source.rb', line 121
# The processed words as a sequence of indexes into #wordAsEncountered.
#
# @return [Array] one index per accepted word, in processing order
def wordValueSequence
  @wordValueSequence
end
Instance Method Details
#bucket ⇒ Object
136 137 138 |
# File 'lib/data/word_data_source.rb', line 136
# Current bucket, as reported by the line state machine.
#
# @return [Object] whatever @lineStateMachine.bucket returns
def bucket
  @lineStateMachine.bucket
end
#has_terminator? ⇒ Boolean
222 223 224 |
# File 'lib/data/word_data_source.rb', line 222
# Whether this data source uses a terminator value (see #terminator).
#
# @return [Boolean] always true
def has_terminator?
  true
end
#metaDataFor(offset) ⇒ Object
TODO: fix this, linear metadata search, O(N) should be O(lg N)
162 163 164 165 166 167 168 169 170 171 172 |
# File 'lib/data/word_data_source.rb', line 162
# Finds the metadata entry in effect at a given word offset.
#
# Reads @lineStateMachine.pages as (metadata, wordOffset) pairs and picks the
# pair with the greatest wordOffset strictly below +offset+. Ties on
# wordOffset are broken by the metadata value, i.e. the same ordering the
# previous sort_by(&:reverse) scan produced.
#
# TODO: still a linear scan per lookup; a pre-sorted index plus a binary
# search would make this O(lg N).
#
# @param offset [Object] word offset to look up (compared with +<+ against the stored offsets)
# @return [Object] the matching metadata, or "unknown" when no entry starts before +offset+
def metaDataFor(offset)
  preceding = @lineStateMachine.pages.select { |_metadata, wordOffset| wordOffset < offset }
  return "unknown" if preceding.empty?

  # Comparing on [wordOffset, metadata] mirrors the sort key of the old
  # implementation without sorting the whole collection.
  metadata, _wordOffset = preceding.max_by { |pair| pair.reverse }
  metadata
end
#process(line) ⇒ Object
207 208 209 210 211 212 213 214 215 216 |
# File 'lib/data/word_data_source.rb', line 207
# Processes one input line: preprocesses it, lets the line state machine
# extract its data, and tallies that data under the machine's current bucket.
#
# @param line [Object] raw line; passed through #preprocessLine first
# @return [Object] false when the state machine produced no data, otherwise
#   the result of #processData (true once the word limit has been reached)
def process(line)
  line = preprocessLine(line)
  # The second argument is the number of words accepted so far.
  data = @lineStateMachine.process(line, @wordValueSequence.length)
  return false if data.empty?

  bucket = @lineStateMachine.bucket
  @buckets[bucket] ||= {}
  processData(data, bucket)
end
#processData(data, bucket) ⇒ Object
179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
# File 'lib/data/word_data_source.rb', line 179
# Tallies a batch of words under the given bucket.
#
# Each word has one trailing "," and then one trailing "." removed; words that
# are empty afterwards are skipped. Every accepted word is appended to @words,
# counted globally and per bucket, and recorded in @wordValueSequence as its
# index into @wordAsEncountered.
#
# The bucket's tally hash is created on demand, so this method no longer
# raises on nil when called for a bucket that #process has not prepared.
#
# NOTE(review): @words is not assigned anywhere in this class — presumably a
# parent class initializes it; confirm.
#
# @param data [#each] words to tally (each must respond to +chomp+)
# @param bucket [Object] key into #buckets for the per-bucket tally
# @return [Boolean] true as soon as a positive @limit has been reached, false otherwise
def processData(data, bucket)
  data.each do |raw|
    word = raw.chomp(",").chomp(".")
    next if word.empty?

    @words << word
    unless @wordCounts.key?(word)
      # we have a new word
      @wordAsEncounteredIndex[word] = @nextWordEncounteredIndex
      @wordAsEncountered << word
      @nextWordEncounteredIndex += 1
      @wordCounts[word] = 0
    end
    @wordCounts[word] += 1

    bucket_counts = (@buckets[bucket] ||= {})
    bucket_counts[word] = bucket_counts.fetch(word, 0) + 1

    @wordValueSequence << @wordAsEncounteredIndex[word]
    @count += 1
    return true if @limit > 0 && @count >= @limit
  end
  false
end
#save ⇒ Object
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/data/word_data_source.rb', line 140
# Writes three files next to @filePath:
#
# * "<filePath>.words"   — each distinct word on its own line, in first-seen order
# * "<filePath>.values"  — @wordValueSequence packed as 32-bit big-endian unsigned integers
# * "<filePath>.summary" — word totals followed by one "<wordOffset> <page>" line per
#   entry of @lineStateMachine.pages, ordered by word offset
#
# NOTE(review): @numberWordsInFile is not assigned anywhere in this class —
# presumably a parent class sets it; confirm.
#
# @return [Array] the sorted (page, wordOffset) pairs written to the summary
def save
  File.open("#{@filePath}.words", 'w') do |file|
    @wordAsEncountered.each { |word| file.write("#{word}\n") }
  end

  File.open("#{@filePath}.values", 'wb') do |file|
    file << @wordValueSequence.pack("N*")
  end

  File.open("#{@filePath}.summary", "w") do |file|
    file << "#{@numberWordsInFile} words in file\n"
    file << "#{@nextWordEncounteredIndex} distinct words\n"
    file << "Metadata\n"
    # sort_by(&:reverse) orders the (page, wordOffset) pairs by
    # [wordOffset, page]; it returns a new array and leaves pages untouched.
    @lineStateMachine.pages.sort_by(&:reverse).each do |page, wordOffset|
      file << "#{wordOffset} #{page}\n"
    end
  end
end
#terminator ⇒ Object
226 227 228 |
# File 'lib/data/word_data_source.rb', line 226
# The terminator value for this data source (see #has_terminator?).
#
# @return [String] the literal "END_OF_DOCUMENT"
def terminator
  "END_OF_DOCUMENT"
end
#verify(word, count) ⇒ Object
218 219 220 |
# File 'lib/data/word_data_source.rb', line 218
# Checks whether a word has been counted exactly +count+ times.
#
# Reads @wordCounts directly, so a word that was never seen compares as nil
# and does not verify even against 0 (unlike #wordCount, which reports 0).
#
# @param word [Object] word to look up
# @param count [Object] expected number of occurrences
# @return [Boolean] true when the recorded count equals +count+
def verify(word, count)
  @wordCounts[word] == count
end
#wordCount(word) ⇒ Object
174 175 176 177 |
# File 'lib/data/word_data_source.rb', line 174
# Number of times a word has been counted so far.
#
# @param word [Object] word to look up
# @return [Object] the recorded count, or 0 when the word has never been seen
def wordCount(word)
  @wordCounts.fetch(word, 0)
end