Class: DelimitedWordDataSource

Inherits:
WordDataSource show all
Defined in:
lib/data/word_data_source.rb

Instance Attribute Summary collapse

Attributes inherited from WordDataSource

#numberWordsInFile, #words

Attributes inherited from BaseDataSource

#startOffset

Instance Method Summary collapse

Methods inherited from WordDataSource

#numberValues, #preprocessLine, #toString, #valueAt

Methods inherited from BaseDataSource

#each_with_index, #extendWith, #nextDataSourceValueAt, #valueSequence

Constructor Details

#initialize(filePath, lineStateMachine, limit) ⇒ DelimitedWordDataSource

Returns a new instance of DelimitedWordDataSource.



123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/data/word_data_source.rb', line 123

# Sets up empty word-tracking state and then hands the file path to the
# parent constructor.
#
# @param filePath [Object] forwarded unchanged to the superclass constructor
# @param lineStateMachine [Object] collaborator stored for later use; other
#   methods of this class call +process+, +bucket+ and +pages+ on it
# @param limit [Integer] when greater than zero, #processData reports +true+
#   once the number of accepted words reaches this value
def initialize(filePath, lineStateMachine, limit)
  # Caller-supplied collaborator and configuration.
  @lineStateMachine = lineStateMachine
  @limit = limit

  # Counters: words accepted so far, and the index the next new word gets.
  @count = @nextWordEncounteredIndex = 0

  # Tallies: per-bucket word counts and overall word counts.
  @buckets, @wordCounts = {}, {}

  # Encounter-order bookkeeping.
  @wordAsEncountered = []       # one entry per distinct word, appended on first sight
  @wordAsEncounteredIndex = {}  # word => its position in @wordAsEncountered
  @wordValueSequence = []       # every accepted word, stored as an index into @wordAsEncountered

  # NOTE(review): the second argument is a String that merely looks like a
  # regexp literal; what the superclass does with it is not visible here.
  super(filePath, "/[^[:print:]]/")
end

Instance Attribute Details

#buckets ⇒ Object (readonly)

Returns the value of attribute buckets.



121
122
123
# File 'lib/data/word_data_source.rb', line 121

# Read-only accessor for the per-bucket word tallies.
#
# @return [Hash] the @buckets hash; #process adds an empty inner hash for each
#   new bucket and #processData increments a per-word count inside it
def buckets
  @buckets
end

#wordAsEncountered ⇒ Object (readonly)

Returns the value of attribute wordAsEncountered.



121
122
123
# File 'lib/data/word_data_source.rb', line 121

# Read-only accessor for the list of distinct words.
#
# @return [Array] the @wordAsEncountered array; #processData appends a word to
#   it only the first time that word is seen
def wordAsEncountered
  @wordAsEncountered
end

#wordCounts ⇒ Object (readonly)

Returns the value of attribute wordCounts.



121
122
123
# File 'lib/data/word_data_source.rb', line 121

# Read-only accessor for the overall word tallies.
#
# @return [Hash] the @wordCounts hash; #processData increments the entry for
#   each accepted word
def wordCounts
  @wordCounts
end

#wordValueSequence ⇒ Object (readonly)

Returns the value of attribute wordValueSequence.



121
122
123
# File 'lib/data/word_data_source.rb', line 121

# Read-only accessor for the encoded word stream.
#
# @return [Array] the @wordValueSequence array; #processData appends one entry
#   per accepted word, each being that word's index in @wordAsEncountered
def wordValueSequence
  @wordValueSequence
end

Instance Method Details

#bucket ⇒ Object



136
137
138
# File 'lib/data/word_data_source.rb', line 136

# Delegates to the line state machine and returns whatever its +bucket+
# method reports.
#
# @return [Object] the value of @lineStateMachine.bucket
def bucket
  @lineStateMachine.bucket
end

#has_terminator? ⇒ Boolean

Returns:

  • (Boolean)


222
223
224
# File 'lib/data/word_data_source.rb', line 222

# Always answers +true+ for this data source.
# NOTE(review): presumably paired with #terminator; how callers use the
# answer is not visible here - confirm against the base class.
#
# @return [Boolean] always +true+
def has_terminator?
  true
end

#metaDataFor(offset) ⇒ Object

TODO: fix this, linear metadata search, O(N) should be O(lg N)



162
163
164
165
166
167
168
169
170
171
172
# File 'lib/data/word_data_source.rb', line 162

# Looks up the page label recorded for a given word offset.
#
# Walks the state machine's +pages+ pairs in ascending word-offset order and
# remembers the label of every pair whose offset is strictly below +offset+;
# the first pair at or beyond +offset+ ends the walk.
#
# TODO: linear scan over freshly sorted pairs on every call; should be a
# binary search over a pre-sorted list (O(lg N)).
#
# @param offset [Integer] word offset to look up
# @return [Object] the label of the last pair whose word offset is less than
#   +offset+, or the string "unknown" when there is none
def metaDataFor(offset)
  metaData = "unknown"
  # Reversing each [page, wordOffset] pair makes the sort key offset-first.
  # sort_by returns a new array, so the underlying collection is untouched.
  @lineStateMachine.pages.sort_by(&:reverse).each do |page, wordOffset|
    # NOTE(review): the comparison is strict, so an offset equal to a pair's
    # own word offset resolves to the previous label - confirm this is intended.
    return metaData unless wordOffset < offset

    metaData = page
  end
  metaData
end

#process(line) ⇒ Object



207
208
209
210
211
212
213
214
215
216
# File 'lib/data/word_data_source.rb', line 207

# Feeds one raw line through preprocessing and the line state machine, then
# tallies whatever words come back.
#
# @param line [Object] raw input line; passed to #preprocessLine first
# @return [Object] +false+ when the state machine yields nothing for the
#   line, otherwise the result of #processData
def process(line)
  cleaned = preprocessLine(line)
  # The current length of the value sequence is handed over as the second argument.
  tokens = @lineStateMachine.process(cleaned, @wordValueSequence.length)
  return false unless tokens.length > 0

  # Make sure a tally hash exists for the bucket the state machine reports.
  current = @lineStateMachine.bucket
  @buckets[current] = {} unless @buckets.key?(current)
  processData(tokens, current)
end

#processData(data, bucket) ⇒ Object



179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/data/word_data_source.rb', line 179

# Records each word of +data+ in the overall tallies, the per-bucket tallies
# and the encoded value sequence.
#
# One trailing "," and then one trailing "." are stripped from every entry;
# entries that end up empty are skipped.
#
# @param data [Enumerable<String>] candidate words
# @param bucket [Object] key into @buckets; its inner hash must already exist
# @return [Boolean] +true+ as soon as @limit is positive and the running word
#   count has reached it (remaining entries are left unprocessed), otherwise
#   +false+ after every entry has been handled
def processData(data, bucket)
  data.each do |token|
    cleaned = token.chomp(",").chomp(".")
    next if cleaned.empty?

    @words << cleaned

    unless @wordCounts.key?(cleaned)
      # First sighting: assign the next encounter index and start its tally.
      @wordAsEncounteredIndex[cleaned] = @nextWordEncounteredIndex
      @wordAsEncountered << cleaned
      @nextWordEncounteredIndex += 1
      @wordCounts[cleaned] = 0
    end
    @wordCounts[cleaned] += 1

    tally = @buckets[bucket]
    tally[cleaned] = tally.fetch(cleaned, 0) + 1

    @wordValueSequence << @wordAsEncounteredIndex[cleaned]
    @count += 1
    return true if @limit > 0 && @count >= @limit
  end
  false
end

#save ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/data/word_data_source.rb', line 140

# Writes three sibling files next to @filePath:
#
# * +.words+   - every distinct word, one per line, in first-encounter order
# * +.values+  - the value sequence packed as 32-bit big-endian unsigned integers
# * +.summary+ - word totals followed by the state machine's page metadata
#
# @return [Object] whatever the final File.open block evaluates to
def save
  File.open("#{@filePath}.words", 'w') do |out|
    @wordAsEncountered.each { |entry| out.write("#{entry}\n") }
  end

  File.open("#{@filePath}.values", 'wb') do |out|
    out << @wordValueSequence.pack("N*")
  end

  File.open("#{@filePath}.summary", "w") do |out|
    out << "#{@numberWordsInFile} words in file\n"
    out << "#{@nextWordEncounteredIndex} distinct words\n"
    out << "Metadata\n"

    # Reversing each pair makes the word offset the primary sort key.
    # sort_by builds a new array; the pages collection itself is not modified.
    ordered = @lineStateMachine.pages.sort_by(&:reverse)
    ordered.each do |page, wordOffset|
      out << "#{wordOffset} #{page}\n"
    end
  end
end

#terminator ⇒ Object



226
227
228
# File 'lib/data/word_data_source.rb', line 226

# Returns the fixed terminator token for this data source.
# NOTE(review): presumably a sentinel marking the end of the word stream;
# where it is consumed is not visible here - confirm against the base class.
#
# @return [String] the literal "END_OF_DOCUMENT"
def terminator
  "END_OF_DOCUMENT"
end

#verify(word, count) ⇒ Object



218
219
220
# File 'lib/data/word_data_source.rb', line 218

# Checks whether the recorded tally for +word+ equals +count+.
#
# A word that was never recorded has no entry in @wordCounts, so it is
# compared as +nil+ and only matches when +count+ is +nil+ as well.
#
# @param word [String] word to look up in @wordCounts
# @param count [Object] expected tally
# @return [Boolean] result of comparing the stored tally with +count+
def verify(word, count)
  @wordCounts[word] == count
end

#wordCount(word) ⇒ Object



174
175
176
177
# File 'lib/data/word_data_source.rb', line 174

# Reports how many times +word+ has been recorded.
#
# @param word [String] word to look up in @wordCounts
# @return [Object] the stored tally, or 0 when the word has no entry
def wordCount(word)
  @wordCounts.fetch(word, 0)
end