Class: DelimitedWordDataSource

Inherits:

WordDataSource

Object
BaseDataSource
WordDataSource
DelimitedWordDataSource

show all

Defined in: lib/data/word_data_source.rb

Instance Attribute Summary collapse

#buckets ⇒ Object readonly

Returns the value of attribute buckets.
#wordAsEncountered ⇒ Object readonly

Returns the value of attribute wordAsEncountered.
#wordCounts ⇒ Object readonly

Returns the value of attribute wordCounts.
#wordValueSequence ⇒ Object readonly

Returns the value of attribute wordValueSequence.

Attributes inherited from WordDataSource

#numberWordsInFile, #words

Attributes inherited from BaseDataSource

#startOffset

Instance Method Summary collapse

#bucket ⇒ Object
#has_terminator? ⇒ Boolean
#initialize(filePath, lineStateMachine, limit) ⇒ DelimitedWordDataSource constructor

A new instance of DelimitedWordDataSource.
#metaDataFor(offset) ⇒ Object

TODO: fix this — the metadata search is linear, O(N); it should be O(lg N).
#process(line) ⇒ Object
#processData(data, bucket) ⇒ Object
#save ⇒ Object
#terminator ⇒ Object
#verify(word, count) ⇒ Object
#wordCount(word) ⇒ Object

Methods inherited from WordDataSource

#numberValues, #preprocessLine, #toString, #valueAt

Methods inherited from BaseDataSource

#each_with_index, #extendWith, #nextDataSourceValueAt, #valueSequence

Constructor Details

#initialize(filePath, lineStateMachine, limit) ⇒ `DelimitedWordDataSource`

Returns a new instance of DelimitedWordDataSource.

# File 'lib/data/word_data_source.rb', line 123

# Prepares empty word-tracking state, then forwards the file path to the
# parent constructor.
#
# Every instance variable is assigned before +super+ is invoked, as in the
# original ordering.
# NOTE(review): presumably the parent constructor may already feed lines to
# #process, which would need this state — confirm against WordDataSource.
#
# @param filePath path forwarded unchanged to the parent constructor
# @param lineStateMachine collaborator stored as-is; other methods of this
#   class call #process, #bucket and #pages on it
# @param limit word limit; #processData only enforces it when it is > 0
def initialize(filePath, lineStateMachine, limit)
  # Collaborator and stop condition.
  @lineStateMachine = lineStateMachine
  @limit = limit
  @count = 0

  # Vocabulary: each distinct word gets the next index the first time it is seen.
  @wordAsEncountered = []        # index -> word, appended only for new words
  @wordAsEncounteredIndex = {}   # word -> index into @wordAsEncountered
  @nextWordEncounteredIndex = 0

  # Tallies and the encoded word stream.
  @wordCounts = {}               # word -> total occurrences
  @buckets = {}                  # bucket -> { word -> occurrences }
  @wordValueSequence = []        # one @wordAsEncountered index per processed word

  # NOTE(review): the second argument is a String that merely looks like a
  # regexp literal; confirm the parent really expects a String here.
  super(filePath, "/[^[:print:]]/")
end

Instance Attribute Details

#buckets ⇒ `Object` (readonly)

Returns the value of attribute buckets.



121
122
123

# File 'lib/data/word_data_source.rb', line 121

# Read-only access to the per-bucket tallies.
#
# @return [Hash] bucket => { word => number of occurrences in that bucket }
def buckets
  @buckets
end

#wordAsEncountered ⇒ `Object` (readonly)

Returns the value of attribute wordAsEncountered.



121
122
123

# File 'lib/data/word_data_source.rb', line 121

# Read-only access to the distinct words, in order of first appearance.
# A word's position in this array is the index stored in the word value sequence.
#
# @return [Array] one entry per distinct word
def wordAsEncountered
  @wordAsEncountered
end

#wordCounts ⇒ `Object` (readonly)

Returns the value of attribute wordCounts.



121
122
123

# File 'lib/data/word_data_source.rb', line 121

# Read-only access to the overall tallies.
#
# @return [Hash] word => total number of occurrences processed so far
def wordCounts
  @wordCounts
end

#wordValueSequence ⇒ `Object` (readonly)

Returns the value of attribute wordValueSequence.



121
122
123

# File 'lib/data/word_data_source.rb', line 121

# Read-only access to the encoded word stream.
#
# @return [Array] one entry per processed word, each an index into the
#   array of distinct words (first-appearance order)
def wordValueSequence
  @wordValueSequence
end

Instance Method Details

#bucket ⇒ `Object`



136
137
138

# File 'lib/data/word_data_source.rb', line 136

# Returns whatever the line state machine currently reports as its bucket.
# #process uses that same value as the key into @buckets.
def bucket
  @lineStateMachine.bucket
end

#has_terminator? ⇒ `Boolean`

Returns:

(Boolean)



222
223
224

# File 'lib/data/word_data_source.rb', line 222

# Always answers true for this data source.
# NOTE(review): presumably pairs with #terminator, which supplies the marker
# value — confirm against the base class.
#
# @return [Boolean] always true
def has_terminator?
  true
end

#metaDataFor(offset) ⇒ `Object`

TODO: fix this — the metadata search is linear, O(N); it should be O(lg N)

# File 'lib/data/word_data_source.rb', line 162

# Looks up the metadata in effect just before a given word offset.
#
# Each entry of +@lineStateMachine.pages+ is treated as a
# (metadata, wordOffset) pair. The answer is the metadata of the entry with
# the greatest wordOffset that is strictly less than +offset+.
#
# The earlier version sorted the whole page table on every call and then
# scanned it; this one makes a single filtered pass with the same ranking.
#
# TODO: still a linear scan, O(N); O(lg N) would need a pre-sorted index.
#
# @param offset word offset to look up
# @return the matching metadata, or "unknown" when no entry has a
#   wordOffset below +offset+
def metaDataFor(offset)
  earlier = @lineStateMachine.pages.select { |_metadata, wordOffset| wordOffset < offset }
  # Rank by [wordOffset, metadata], the same key the old sort used.
  latest = earlier.max_by(&:reverse)
  latest ? latest.first : "unknown"
end

#process(line) ⇒ `Object`

# File 'lib/data/word_data_source.rb', line 207

# Feeds one line through preprocessing and the line state machine, then
# tallies any words that come back.
#
# The state machine is given the preprocessed line together with the number
# of words recorded so far. Its bucket is read only after that call, and an
# empty tally for a not-yet-seen bucket is created before counting.
#
# @param line the raw input line
# @return false when the state machine yields no data; otherwise the result
#   of #processData
def process(line)
  cleaned = self.preprocessLine(line)
  tokens = @lineStateMachine.process(cleaned, @wordValueSequence.length)
  return false unless tokens.length > 0

  # Read the bucket after the state machine has consumed the line.
  current = @lineStateMachine.bucket
  @buckets[current] = {} unless @buckets.has_key?(current)
  self.processData(tokens, current)
end

#processData(data, bucket) ⇒ `Object`

# File 'lib/data/word_data_source.rb', line 179

# Records every non-empty word of +data+ in the overall and per-bucket tallies.
#
# For each word, one trailing "," is removed and then one trailing "." (in
# that order); words that end up empty are skipped. A word seen for the first
# time gets the next encounter index. Every kept word is appended to @words
# and its encounter index is appended to @wordValueSequence.
#
# @param data collection of raw words, iterated with #each
# @param bucket key into @buckets; its tally hash must already exist
# @return [Boolean] true as soon as a positive @limit has been reached,
#   false once all of +data+ has been consumed without reaching it
def processData(data, bucket)
  data.each do |raw|
    token = raw.chomp(",").chomp(".")
    next if token.empty?

    @words << token

    unless @wordCounts.has_key?(token)
      # First sighting: assign the next index and start its count.
      @wordAsEncounteredIndex[token] = @nextWordEncounteredIndex
      @wordAsEncountered << token
      @nextWordEncounteredIndex += 1
      @wordCounts[token] = 0
    end
    @wordCounts[token] += 1

    tally = @buckets[bucket]
    tally[token] = 0 unless tally.has_key?(token)
    tally[token] += 1

    @wordValueSequence << @wordAsEncounteredIndex[token]
    @count += 1
    return true if @limit > 0 && @count >= @limit
  end
  false
end

#save ⇒ `Object`

# File 'lib/data/word_data_source.rb', line 140

# Writes three companion files next to @filePath:
#
# * "<filePath>.words"   - each distinct word on its own line, in
#   first-appearance order
# * "<filePath>.values"  - @wordValueSequence packed with the "N*" template
#   (32-bit unsigned, big-endian), written in binary mode
# * "<filePath>.summary" - total and distinct word counts, followed by one
#   "<wordOffset> <page>" line per entry of the state machine's pages
def save
  File.open("#{@filePath}.words", 'w') do |out|
    @wordAsEncountered.each { |word| out.write("#{word}\n") }
  end

  File.open("#{@filePath}.values", 'wb') do |out|
    out << @wordValueSequence.pack("N*")
  end

  File.open("#{@filePath}.summary", "w") do |out|
    out << "#{@numberWordsInFile} words in file\n" <<
      "#{@nextWordEncounteredIndex} distinct words\n" <<
      "Metadata\n"

    # sort_by(&:reverse) builds a new array ordered by [wordOffset, page];
    # neither sort_by nor Array#reverse modifies the pages collection itself.
    ordered = @lineStateMachine.pages.sort_by(&:reverse)
    ordered.each { |page, wordOffset| out << "#{wordOffset} #{page}\n" }
  end
end

#terminator ⇒ `Object`



226
227
228

# File 'lib/data/word_data_source.rb', line 226

# NOTE(review): presumably the sentinel used because #has_terminator? answers
# true for this data source — confirm against the base class.
#
# @return [String] the literal "END_OF_DOCUMENT"
def terminator
  "END_OF_DOCUMENT"
end

#verify(word, count) ⇒ `Object`



218
219
220

# File 'lib/data/word_data_source.rb', line 218

# Checks a word's recorded total against an expected value.
#
# A word that was never recorded has no entry, so its lookup yields nil and
# only compares equal to a nil +count+.
#
# @param word the word to look up in @wordCounts
# @param count the expected total
# @return [Boolean] whether the recorded total equals +count+
def verify(word, count)
  recorded = @wordCounts[word]
  recorded == count
end

#wordCount(word) ⇒ `Object`

# File 'lib/data/word_data_source.rb', line 174

# Reports how many times a word has been recorded.
#
# @param word the word to look up in @wordCounts
# @return the stored total for +word+, or 0 when the word has no entry
def wordCount(word)
  @wordCounts.fetch(word, 0)
end

Class: DelimitedWordDataSource

Instance Attribute Summary collapse

Attributes inherited from WordDataSource

Attributes inherited from BaseDataSource

Instance Method Summary collapse

Methods inherited from WordDataSource

Methods inherited from BaseDataSource

Constructor Details

#initialize(filePath, lineStateMachine, limit) ⇒ DelimitedWordDataSource

Instance Attribute Details

#buckets ⇒ Object (readonly)

#wordAsEncountered ⇒ Object (readonly)

#wordCounts ⇒ Object (readonly)

#wordValueSequence ⇒ Object (readonly)

Instance Method Details

#bucket ⇒ Object

#has_terminator? ⇒ Boolean

#metaDataFor(offset) ⇒ Object

#process(line) ⇒ Object

#processData(data, bucket) ⇒ Object

#save ⇒ Object

#terminator ⇒ Object

#verify(word, count) ⇒ Object

#wordCount(word) ⇒ Object