Module: CorrectHorseBatteryStaple::Backend::Isam::InstanceMethods

Defined in:
lib/correct_horse_battery_staple/backend/isam.rb

Instance Method Summary collapse

Instance Method Details

#binwrite(*args) ⇒ Object



92
93
94
95
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 92

# Forward +args+ to the object returned by +io+, calling its +binwrite+
# method when it responds to one and falling back to +write+ otherwise.
#
# @param args arguments passed through unchanged to the chosen method
# @return [Object] whatever the chosen method returns
def binwrite(*args)
  if io.respond_to?(:binwrite)
    io.send(:binwrite, *args)
  else
    io.send(:write, *args)
  end
end

#each(&block) ⇒ Object

some core Enumerable building blocks



257
258
259
260
261
262
263
264
265
266
267
268
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 257

# Yield every record in the records section as a Word, in file order.
#
# Each yielded word has its +index+ set to the record position and its
# +percentile+ set to [(index - 0.5) / size, 0].max * 100.
#
# The loop runs over indexes 0...size so that the final record is included
# (the previous bound of size - 1 with a strict comparison skipped it).
#
# @yieldparam word the object returned by parse_record for that index
# @return [Enumerator] when called without a block
# @return [nil] when called with a block
def each(&block)
  return enum_for(:each) unless block_given?

  string = records_string
  total = size
  index = 0
  while index < total
    word = parse_record(string, index)
    word.index = index
    word.percentile = [(index-0.5)/size,0].max * 100
    yield word
    index += 1
  end
end

#file_range_read(file_range = nil) ⇒ Object



292
293
294
295
296
297
298
299
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 292

# Read the bytes covered by +file_range+ from @file and then restore the
# file position that was current before the call.
#
# @param file_range [Range, nil] byte range to read; defaults to
#   0...file_size(@file), i.e. the whole file
# @return [String, nil] the result of @file.read for that range
def file_range_read(file_range = nil)
  file_range ||= 0...file_size(@file)
  pos = @file.tell
  # Only restore the position once it has actually been captured; otherwise
  # a failure in file_size/tell would be masked by seeking to nil.
  begin
    @file.seek(file_range.first)
    @file.read(range_count(file_range))
  ensure
    @file.seek(pos)
  end
end

#file_size(file) ⇒ Object



140
141
142
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 140

# Size of +file+ as reported by the object itself.
#
# @param file an object responding to either +size+ or +stat+
# @return file.size when available, otherwise file.stat.size
def file_size(file)
  if file.respond_to?(:size)
    file.size
  else
    file.stat.size
  end
end

#file_stringObject



288
289
290
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 288

# Contents of @file as a string.
#
# A StringIO exposes its backing string directly; any other object is read
# through file_range_read with a nil range.
def file_string
  if @file.is_a?(StringIO)
    @file.string
  else
    file_range_read(nil)
  end
end

#fix_stats(stats) ⇒ Object



27
28
29
30
31
32
33
34
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 27

# Replace every NaN value in +stats+ with -1, modifying +stats+ in place.
#
# Values that do not respond to +nan?+ are left untouched.
# NOTE(review): presumably done so the stats can be JSON-encoded — confirm.
#
# @param stats a key/value collection supporting +each+ and +[]=+
# @return the same +stats+ object
def fix_stats(stats)
  stats.each do |key, value|
    next unless value.respond_to?(:nan?)
    stats[key] = -1 if value.nan?
  end
  stats
end

#get_word_by_idx(n) ⇒ Object



247
248
249
250
251
252
253
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 247

# Fetch record number +n+ from the records section and parse it into a word.
#
# The returned word has +index+ set to +n+ and +percentile+ set to
# [(n - 0.5) / size, 0].max * 100.
#
# @param n record index
# @return the object produced by parse_record
def get_word_by_idx(n)
  record = nth_chunk(n, records_string)
  word = parse_record(record)
  word.index      = n
  word.percentile = [(n-0.5)/size,0].max * 100
  word
end

#initialize_backend_variablesObject



22
23
24
25
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 22

# Set the backend defaults: @length_scaling_factor to 15 and @page_size
# to 4096.
#
# @return [Integer] the page size that was assigned
def initialize_backend_variables
  @length_scaling_factor, @page_size = 15, 4096
  @page_size
end

#inspectObject

Show some information about the ISAM file: its size, the record field lengths, and the prelude contents.



189
190
191
192
193
194
195
196
197
198
199
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 189

# Append a summary to the inherited inspect output: file size, word and
# frequency field lengths, total record bytes, and each prelude entry
# except "stats".
#
# @return [String]
def inspect
  base = super
  prelude_lines = @prelude.map do |key, value|
    key == "stats" ? "" : "  #{key}: #{value}\n"
  end
  summary = "File size: #{file_size(@file)}\n" \
            "Word length: #{@word_length}\n" \
            "Frequency bytes: #{@frequency_length}\n" \
            "Total record bytes: #{@records_length}\n" \
            "\n" \
            "Prelude:\n" \
            "#{prelude_lines.join("")}\n"
  base + "\n" + summary
end

#nth_chunk(n, string) ⇒ Object

return a string representing the nth_record



239
240
241
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 239

# Slice record number +n+ out of +string+.
#
# @param n record index
# @param string the string holding consecutive records of @entry_length each
# @return the @entry_length-long slice starting at @entry_length * n
#   (nil when +string+ has nothing at that offset)
def nth_chunk(n, string)
  start = @entry_length * n
  string[start, @entry_length]
end

#openmodeObject



97
98
99
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 97

# Mode string for opening the output file.
#
# @return [String] "wb:ASCII-8BIT" when the IO class responds to +binwrite+,
#   otherwise "w"
def openmode
  if IO.respond_to?(:binwrite)
    "wb:ASCII-8BIT"
  else
    "w"
  end
end

#pad(size, io) ⇒ Object



88
89
90
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 88

# Write +size+ NUL bytes to +io+.
#
# @param size number of padding bytes
# @param io an object responding to +write+
# @return whatever io.write returns
def pad(size, io)
  padding = [].pack("x#{size}")
  io.write(padding)
end

#page_sizeObject



36
37
38
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 36

# Page size used for alignment.
#
# @return @page_size when it is set, otherwise 4096
def page_size
  return @page_size if @page_size
  4096
end

#parse_preludeObject



148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 148

# Read and decode the prelude at the start of @file and cache its fields in
# instance variables.
#
# The file begins with two 32-bit network-order integers (offset of the
# first record, length of the JSON prelude) followed by the JSON text.
#
# @return [Hash] the decoded prelude
# @raise [ArgumentError] if "wlen", "entrylen" or "n" is missing
def parse_prelude
  @file.seek 0
  prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)

  # byte offset of first record from beginning of file
  # total length of JSON string (without padding)
  (@record_offset, @prelude_len)  = prelude_buf.unpack("NN")

  # read more if our initial read didn't slurp in the entire prelude;
  # the JSON starts after the 8-byte header, so the buffer has to hold
  # @prelude_len + 8 bytes before the JSON can be extracted in full
  missing = (@prelude_len + 8) - prelude_buf.length
  if missing > 0
    # read returns nil at EOF; treat that as "nothing more to append"
    prelude_buf += @file.read(missing).to_s
  end

  @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}

  # includes prefix length byte
  @word_length      = @prelude["wlen"]     || raise(ArgumentError, "Word length is not defined!")

  # as network byte order int
  @frequency_length = @prelude["flen"]     || 4

  # total length of record
  @entry_length     = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")

  @offset_index1    = @prelude["offset_index1"]
  @offset_index2    = @prelude["offset_index2"]

  @entry_count      = @prelude["n"] || raise(ArgumentError, "Number of records not included!")

  # fall back to the size implied by the record count when not stored
  @records_length   = @prelude["records_length"] || (@entry_length * @entry_count)

  @length_scaling_factor = @prelude["length_scaling_factor"] || 10

  load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]

  @prelude
end

#parse_record(string, index = 0, word = CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil) ⇒ Object

Parse a record into a Word object, which can be provided or will otherwise be constructed as needed. The fourth argument is a length range which can act as a filter; if it is not satisfied, nil will be returned.



224
225
226
227
228
229
230
231
232
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 224

# Parse record +index+ of +string+ into +word+.
#
# @param string the string holding the records
# @param index record index within +string+
# @param word object to populate; a fresh Word is built when omitted
# @param length_range optional filter passed on to parse_record_into_array
# @return +word+ with +word+ and +frequency+ assigned, or nil when
#   parse_record_into_array returns nothing
def parse_record(string, index=0,
                 word=CorrectHorseBatteryStaple::Word.new(:word => ""),
                 length_range = nil)
  fields = parse_record_into_array(string, index, length_range)
  if fields
    word.word = fields[0]
    word.frequency = fields[1]
    word
  end
end

#parse_record_into_array(string, index, length_range = nil) ⇒ Object

Parse a record into an array of [word, frequency] IFF the word fits into the length_range or length_range is nil



207
208
209
210
211
212
213
214
215
216
217
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 207

# Decode record +index+ of +string+ into [word, frequency].
#
# The first byte of the record is the actual word length; the frequency is
# read as a 32-bit network-order integer at offset @word_length.
#
# @param string the string holding the records
# @param index record index within +string+
# @param length_range optional filter on the actual word length
# @return [Array, nil] [word, frequency], or nil when +length_range+ is
#   given and does not include the word length
# @raise [RuntimeError] when nth_chunk returns no chunk for +index+
def parse_record_into_array(string, index, length_range = nil)
  record = nth_chunk(index, string)
  raise "No chunk for index #{index}" unless record

  actual_word_length = record.unpack("C").first
  return nil if length_range && !length_range.include?(actual_word_length)

  # skip the length byte, take the word, jump to the frequency field
  record.unpack("xa#{actual_word_length}@#{@word_length}N")
end

#percentile_index(percentile, round = true) ⇒ Object

rather than using a StatisticalArray, we do direct indexing into the file/string



321
322
323
324
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 321

# Convert a percentile into a record position: percentile / 100 * count + 0.5.
#
# @param percentile percentile value (converted with to_f)
# @param round when truthy the position is rounded, otherwise it is
#   returned as a Float
# @return [Integer, Float]
def percentile_index(percentile, round=true)
  position = percentile.to_f/100 * count + 0.5
  return position.round if round
  position
end

#pos_of_nth_word_in_file(n) ⇒ Object



243
244
245
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 243

# Absolute file offset of record number +n+.
#
# @param n record index
# @return @record_offset plus +n+ records of @entry_length each
def pos_of_nth_word_in_file(n)
  bytes_before = n * @entry_length
  @record_offset + bytes_before
end

#precache(max = -1)) ⇒ Object

Format of header:

- 0..3 - OB - offset of body start in bytes; network byte order
- 4..7 - LP - length of prelude in network byte order
- 8..OB-1 - P - JSON-encoded prelude hash and space padding
- OB..EOF - array of fixed size records as described in prelude

Contents of Prelude (after JSON decoding):

- P["wlen"] - length of word part of record
- P["flen"] - length of frequency part of record (always 4 bytes)
- P["entrylen"] - length of total part of record
- P["n"] - number of records
- P["sort"] - field name sorted by (word or frequency)
- P["stats"] - corpus statistics
- P["offset_index1"] - absolute file offset of KDTree index
- P["records_length"] - length in bytes of records section, excluding padding
- P["length_scaling_factor"] - what length was multiplied by in creating KDTree (usually 15)

Format of record:

- 1 byte - LW - actual length of word within field (P["wlen"] includes this prefix byte)
- P["wlen"]-1 bytes - LW bytes of word (W) + P["wlen"]-1-LW bytes of padding
- P["flen"] (4) bytes - frequency as network byte order long

After record section, there is padding up to the next page_size boundary, and then there is a dumped KDTree which extends to EOF.



134
135
136
137
138
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 134

# Load the whole of @file into memory, replacing @file with a read-only
# StringIO over its contents.
#
# @param max size limit; when it is greater than -1 and the file is larger
#   than +max+, nothing is done
# @return [StringIO, nil] the new @file, or nil when the limit was exceeded
def precache(max = -1)
  over_limit = max > -1 && file_size(@file) > max
  return if over_limit

  @file.seek 0
  contents = @file.read
  @file = StringIO.new contents, "r"
end

#preludeObject



144
145
146
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 144

# The decoded prelude, parsing it via parse_prelude when @prelude is not
# set yet.
def prelude
  return @prelude if @prelude
  parse_prelude
end

#record_percentile_range_read(percentile_range) ⇒ Object

memoize :record_range_read



314
315
316
317
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 314

# Read the records covered by +percentile_range+.
#
# The percentiles are converted with record_range_for_percentile and the
# result is handed to record_range_read.
#
# @param percentile_range a percentile range (or whatever
#   record_range_for_percentile accepts)
# @return the result of record_range_read
def record_percentile_range_read(percentile_range)
  record_range_read(record_range_for_percentile(percentile_range))
end

#record_range_for_percentile(range) ⇒ Object



326
327
328
329
330
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 326

# Convert a percentile range into an end-exclusive byte range.
#
# A bare Numeric is widened to (value - 0.5)..(value + 0.5) first. The lower
# bound is floored and the upper bound is ceiled to whole records before
# both are scaled by @entry_length.
#
# @param range [Range, Numeric] percentile range or single percentile
# @return [Range] end-exclusive range of byte offsets
def record_range_for_percentile(range)
  if range.is_a?(Numeric)
    range = Range.new(range - 0.5, range + 0.5)
  end
  first_record = percentile_index(range.begin, false).floor
  last_record  = percentile_index(range.end,   false).ceil
  (first_record * @entry_length)...(last_record * @entry_length)
end

#record_range_read(record_range = nil) ⇒ Object



308
309
310
311
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 308

# Read a byte range of the records section, given relative to the first
# record.
#
# The range is shifted by @record_offset and keeps its length, so a range
# that does not start at 0 still reads range_count(record_range) bytes
# (previously the end was computed without the range's start, which
# shortened every such read).
#
# @param record_range [Range, nil] byte range within the records section;
#   defaults to 0...records_size
# @return the result of file_range_read for the shifted range
def record_range_read(record_range = nil)
  record_range ||= 0...records_size
  start = record_range.first + @record_offset
  file_range_read(start...(start + range_count(record_range)))
end

#records_sizeObject

file I/O



284
285
286
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 284

# Length of the records section as stored in @records_length.
def records_size
  @records_length
end

#records_stringObject

returns a string representing the record-holding portion of the file



303
304
305
306
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 303

# The records section as a string, read once via record_range_read and
# memoized in @records_string.
def records_string
  return @records_string if @records_string
  @records_string = record_range_read(0 ... records_size)
end

#round_up(val, blocksize = page_size) ⇒ Object

many MMUs in default mode and modern highcap drives have 4k pages/blocks



41
42
43
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 41

# Round +val+ up to a multiple of +blocksize+, never returning less than
# one full block.
#
# @param val value to round (converted with to_f)
# @param blocksize block size; defaults to page_size
# @return the rounded value
def round_up(val, blocksize=page_size)
  blocks = (val.to_f/blocksize).ceil
  blocks = 1 if blocks < 1
  blocks * blocksize
end

#sizeObject



270
271
272
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 270

# Number of records.
#
# Uses @entry_count when it is set; otherwise derives it from
# records_size / @entry_length and stores the result in @entry_count.
def size
  return @entry_count if @entry_count
  @entry_count = records_size / @entry_length
end

#sorted_entriesObject

we presume that the ISAM file has been sorted



278
279
280
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 278

# The result of +entries+, memoized in @sorted_entries. No sorting is done
# here; the records are taken in the order +entries+ returns them.
def sorted_entries
  return @sorted_entries if @sorted_entries
  @sorted_entries = entries
end

#word_length(chunk_string) ⇒ Object



234
235
236
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 234

# Unpack the first byte of +chunk_string+ as an unsigned 8-bit integer.
#
# @param chunk_string [String] a record chunk
# @return [Array] a one-element array holding that byte value (the raw
#   result of unpack, not a bare Integer)
def word_length(chunk_string)
  length_prefix = chunk_string.unpack("C")
  length_prefix
end

#write_corpus_to_io(corpus, io = STDOUT) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 45

# Serialize +corpus+ to +io+: an 8-byte header, the JSON prelude padded
# with spaces up to the records offset, then one fixed-size record per word.
#
# All widths are measured in bytes (not characters), because pack and
# unpack operate on bytes; with character counts a multibyte word would be
# truncated and its length prefix would be wrong.
#
# Sets @word_length, @freq_length and @entry_length as a side effect.
#
# @param corpus an object responding to +reduce+, +stats+, +length+ and
#   +each_with_index+, whose elements respond to +word+ and +frequency+
# @param io a rewindable object responding to +write+ (defaults to STDOUT)
# @return whatever corpus.each_with_index returns
# @raise [ArgumentError] if a word is longer than 255 bytes and so cannot
#   be described by the one-byte length prefix
def write_corpus_to_io(corpus, io=STDOUT)
  io.rewind

  longest_word = corpus.reduce(0) { |m, e| m > e.word.bytesize ? m : e.word.bytesize }
  if longest_word > 255
    raise ArgumentError, "Word of #{longest_word} bytes does not fit a one-byte length prefix"
  end

  # includes prefix length byte
  @word_length = longest_word + 1
  @freq_length = 4
  @entry_length = @word_length + @freq_length

  stats = fix_stats(corpus.stats)
  corpus_word_count = corpus.length

  # The last four values start as fixed-width placeholders so the JSON
  # length measured below is an upper bound on the final prelude length.
  prelude = {
    "wlen"           => @word_length,
    "flen"           => @freq_length,
    "entrylen"       => @entry_length,
    "sort"           => "frequency",
    "n"              => corpus_word_count,
    "stats"          => stats,
    "flags"          => 0,
    "length_scaling_factor" => (@length_scaling_factor || 15),
    "records_length" => "0000000000",
    "offset_records" => "0000000000",
    "offset_index1"  => "0000000000",
    "offset_index2"  => "0000000000"
  }

  prelude_json_length = prelude.to_json.bytesize
  prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0)

  prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"]
  offset_index1 = prelude["offset_records"] +
    round_up(records_length, page_size)

  prelude["offset_index1"]  = offset_index1

  # header (two network-order longs) + JSON, space-padded to offset_records
  io.write([offset_records, prelude_json_length, prelude.to_json].
           pack("NNA#{offset_records-8}"))

  # record: length byte, NUL-padded word, network-order frequency
  corpus.each_with_index do |w, _index|
    io.write([w.word.bytesize, w.word, w.frequency].pack("Ca#{@word_length-1}N"))
  end
end