Class: PEROBS::FlatFile

Inherits:
Object
  • Object
show all
Defined in:
lib/perobs/FlatFile.rb

Overview

The FlatFile class manages the storage file of the FlatFileDB. It contains a sequence of blobs. Each blob consists of a header and the actual blob data bytes.

Constant Summary collapse

INDEX_BTREE_ORDER =

The number of entries in a single BTree node of the index file.

65

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(dir, progressmeter) ⇒ FlatFile

Create a new FlatFile object for a database in the given path.

Parameters:

  • dir (String)

    Directory path for the database file



48
49
50
51
52
53
54
55
# File 'lib/perobs/FlatFile.rb', line 48

# Create a new FlatFile object for a database in the given path. Only the
# in-memory state is prepared here: the blob file handle (@f) and the mark
# list (@marks) stay nil until #open respectively #clear_all_marks is called.
# @param dir [String] Directory path for the database file
# @param progressmeter [Object] Progress reporter; it is also handed to the
#        index BTree and the SpaceTree
def initialize(dir, progressmeter)
  @f = @marks = nil
  @db_dir = dir
  @progressmeter = progressmeter
  @index = BTree.new(dir, 'index', INDEX_BTREE_ORDER, progressmeter)
  @space_list = SpaceTree.new(dir, progressmeter)
end

Class Method Details

.insert_header_checksums(db_dir) ⇒ Object



638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
# File 'lib/perobs/FlatFile.rb', line 638

# Rewrite 'database.blobs' so that every blob header carries an additional
# CRC32 checksum of the header itself. The old headers are 21 bytes long
# (1 byte flags, 8 bytes length, 8 bytes ID, 4 bytes blob CRC). Only blobs
# that are valid (bit 0 set) and not outdated (bit 3 clear) are copied. The
# converted data is first written to 'database_v4.blobs'; on success the
# original file is kept as 'database_v3.blobs' and the new file takes over
# the name 'database.blobs'.
# @param db_dir [String] Directory that contains the blob file
def FlatFile::insert_header_checksums(db_dir)
  old_file_name = File.join(db_dir, 'database.blobs')
  new_file_name = File.join(db_dir, 'database_v4.blobs')
  bak_file_name = File.join(db_dir, 'database_v3.blobs')

  old_header_length = 21
  entries = 0
  # The block form of File.open guarantees that both handles are closed
  # again, even when PEROBS.log.fatal or an I/O error aborts the conversion.
  File.open(old_file_name, 'rb') do |old_file|
    File.open(new_file_name, 'wb') do |new_file|
      while (buf = old_file.read(old_header_length))
        # Remember where the header started. After the blob data has been
        # read the file position no longer points behind the header.
        header_pos = old_file.pos - buf.bytesize
        if buf.bytesize < old_header_length
          PEROBS.log.fatal "Blob file #{old_file_name} ends with a " +
            "truncated header at #{header_pos}"
        end

        flags, length, id, crc = *buf.unpack('CQQL')
        blob_data = old_file.read(length)

        # Some basic sanity checking to ensure all reserved bits are 0.
        # Older versions of PEROBS used to set bit 1 despite it being
        # reserved now.
        unless flags & 0xF0 == 0
          PEROBS.log.fatal "Blob file #{old_file_name} contains illegal " +
            "flag byte #{'%02x' % flags} at #{header_pos}"
        end

        # Check if the blob is valid and current.
        if flags & 0x1 == 1 && flags & 0x8 == 0
          # IO#read returns nil or a short String at the end of the file.
          # Never copy an incomplete blob into the new file.
          if blob_data.nil? || blob_data.bytesize != length
            PEROBS.log.fatal "Blob file #{old_file_name} contains a " +
              "truncated blob for ID #{id} at #{header_pos}"
          end

          # Make sure the bit 1 is not set anymore.
          flags = flags & 0x05
          header_str = [ flags, length, id, crc ].pack('CQQL')
          header_crc = Zlib.crc32(header_str, 0)
          header_str += [ header_crc ].pack('L')

          new_file.write(header_str + blob_data)
          entries += 1
        end
      end
    end
  end
  PEROBS.log.info "Header checksum added to #{entries} entries"

  # Swap the files only after both handles have been closed.
  File.rename(old_file_name, bak_file_name)
  File.rename(new_file_name, old_file_name)
end

Instance Method Details

#check(repair = false) ⇒ Integer

Check (and repair) the FlatFile.

Parameters:

  • repair (Boolean) (defaults to: false)

    True if errors should be fixed.

Returns:

  • (Integer)

    Number of errors found



447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
# File 'lib/perobs/FlatFile.rb', line 447

# Check (and repair) the FlatFile. The check runs in two stages. First
# every blob in the blob file is read back, decompressed if needed and
# compared against its checksum while a temporary shadow index detects
# duplicate IDs. Afterwards the index and the space list are validated
# against the blob file, unless corrupted blobs were found in repair mode,
# in which case the file is defragmentized and both are rebuilt instead.
# @param repair [Boolean] True if errors should be fixed.
# @return [Integer] Number of errors found
def check(repair = false)
  errors = 0
  # Nothing can be checked while the blob file is not open.
  return errors unless @f

  t = Time.now
  PEROBS.log.info "Checking FlatFile database" +
    "#{repair ? ' in repair mode' : ''}..."

  # First check the database blob file. Each entry should be readable and
  # correct and all IDs must be unique. We use a shadow index to keep
  # track of the already found IDs.
  new_index = BTree.new(@db_dir, 'new-index', INDEX_BTREE_ORDER,
                        @progressmeter)
  new_index.erase
  new_index.open

  corrupted_blobs = 0
  @progressmeter.start('Checking blobs file', @f.size) do |pm|
    corrupted_blobs = each_blob_header do |header|
      if header.is_valid?
        # We have a non-deleted entry.
        begin
          @f.seek(header.addr + FlatFileBlobHeader::LENGTH)
          buf = @f.read(header.length)
          # IO#read returns nil (not a short String) when the read starts
          # at or behind the end of the file. Treat that as a truncated
          # blob as well instead of failing with a NoMethodError.
          if buf.nil? || buf.bytesize != header.length
            PEROBS.log.error "Premature end of file in blob with ID " +
              "#{header.id}."
            discard_damaged_blob(header) if repair
            errors += 1
            next
          end

          # Uncompress the data if the compression bit is set in the mark
          # byte.
          if header.is_compressed?
            begin
              buf = Zlib.inflate(buf)
            rescue Zlib::BufError, Zlib::DataError
              PEROBS.log.error "Corrupted compressed block with ID " +
                "#{header.id} found."
              discard_damaged_blob(header) if repair
              errors += 1
              next
            end
          end

          if header.crc && checksum(buf) != header.crc
            PEROBS.log.error "Checksum failure while checking blob " +
              "with ID #{header.id}"
            discard_damaged_blob(header) if repair
            errors += 1
            next
          end
        rescue IOError => e
          PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
            e.message
        end

        # Check if the ID has already been found in the file.
        if (previous_address = new_index.get(header.id))
          PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
            "Addresses: #{previous_address}, #{header.addr}"
          errors += 1
          previous_header = FlatFileBlobHeader.read(@f, previous_address,
                                                    header.id)
          if repair
            # We have two blobs with the same ID and we must discard one of
            # them.
            if header.is_outdated?
              discard_damaged_blob(header)
            elsif previous_header.is_outdated?
              discard_damaged_blob(previous_header)
            else
              PEROBS.log.error "None of the blobs with same ID have " +
                "the outdated flag set. Deleting the smaller one."
              errors += 1
              discard_damaged_blob(header.length < previous_header.length ?
                                   header : previous_header)
            end
            next
          end
        else
          # ID is unique so far. Add it to the shadow index.
          new_index.insert(header.id, header.addr)
        end

      end

      pm.update(header.addr)
    end

    # each_blob_header hands back a count that is added to the error total.
    errors += corrupted_blobs
  end

  # We no longer need the new index.
  new_index.close
  new_index.erase

  if repair && corrupted_blobs > 0
    erase_index_files
    defragmentize
    regenerate_index_and_spaces
  else
    # Now we check the index data. It must be correct and the entries must
    # match the blob file. All entries in the index must be in the blob file
    # and vice versa.
    begin
      index_ok = @index.check do |id, address|
        has_id_at?(id, address)
      end
      x_check_errs = 0
      space_check_ok = true
      # The checks are chained with && so that a later check only runs
      # when all earlier ones have passed.
      unless index_ok && (space_check_ok = @space_list.check(self)) &&
        (x_check_errs = cross_check_entries) == 0
        errors += 1 unless index_ok && space_check_ok
        errors += x_check_errs
        regenerate_index_and_spaces if repair
      end
    rescue PEROBS::FatalError
      errors += 1
      regenerate_index_and_spaces if repair
    end
  end

  sync if repair
  PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
    "#{errors} errors found."

  errors
end

#clear_all_marks ⇒ Object

Clear all marks.



327
328
329
330
331
332
333
# File 'lib/perobs/FlatFile.rb', line 327

# Clear all marks. An already existing mark list is emptied; otherwise a
# new IDList named 'marks' is created in the database directory.
def clear_all_marks
  return @marks.clear if @marks

  @marks = IDList.new(@db_dir, 'marks', 8)
end

#close ⇒ Object

Close the flat file. This method must be called to ensure that all data is really written into the filesystem.



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/perobs/FlatFile.rb', line 84

# Close the flat file. This method must be called to ensure that all data
# is really written into the filesystem. The space list and the index are
# closed if they are open, the mark list is erased and finally the blob
# file is flushed, unlocked, synced and closed.
def close
  [ @space_list, @index ].each do |store|
    store.close if store.is_open?
  end

  if @marks
    @marks.erase
    @marks = nil
  end

  # Nothing more to do if the blob file was never opened.
  return unless @f

  @f.flush
  @f.flock(File::LOCK_UN)
  @f.fsync
  @f.close
  @f = nil
end

#defragmentize ⇒ Object

Eliminate all the holes in the file. This is an in-place implementation. No additional space will be needed on the file system.



337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
# File 'lib/perobs/FlatFile.rb', line 337

# Eliminate all the holes in the file. This is an in-place implementation.
# No additional space will be needed on the file system. Valid blobs are
# moved towards the start of the file, over the space of deleted or
# corrupted entries, and the file is truncated behind the last valid blob.
# The index and the space list are NOT updated here; the callers visible in
# this class rebuild them afterwards with regenerate_index_and_spaces.
def defragmentize
  # Number of reclaimable bytes found so far. Every valid blob that follows
  # is moved by this many bytes towards the start of the file.
  distance = 0
  # End address of the last valid blob after relocation.
  new_file_size = 0
  deleted_blobs = 0
  corrupted_blobs = 0
  valid_blobs = 0

  # Iterate over all entries.
  @progressmeter.start('Defragmentizing blobs file', @f.size) do |pm|
    each_blob_header do |header|
      # If we have stumbled over a corrupted blob we treat it similar to a
      # deleted blob and reuse the space.
      # NOTE(review): corruption_start is presumably the address where
      # each_blob_header started skipping unreadable data -- confirm.
      if header.corruption_start
        distance += header.addr - header.corruption_start
        corrupted_blobs += 1
      end

      # Total size of the current entry
      entry_bytes = FlatFileBlobHeader::LENGTH + header.length
      if header.is_valid?
        # We have found a valid entry.
        valid_blobs += 1
        if distance > 0
          begin
            # Read current entry into a buffer
            @f.seek(header.addr)
            buf = @f.read(entry_bytes)
            # Write the buffer right after the end of the previous entry.
            @f.seek(header.addr - distance)
            @f.write(buf)
            # Mark the space between the relocated current entry and the
            # next valid entry as deleted space.
            FlatFileBlobHeader.new(@f, @f.pos, 0,
                                   distance - FlatFileBlobHeader::LENGTH,
                                   0, 0).write
            @f.flush
          rescue IOError => e
            PEROBS.log.fatal "Error while moving blob for ID " +
              "#{header.id}: #{e.message}"
          end
        end
        # The file must at least reach to the end of this (possibly moved)
        # valid entry.
        new_file_size = header.addr - distance +
          FlatFileBlobHeader::LENGTH + header.length
      else
        # A deleted entry: its header and data can be reclaimed.
        deleted_blobs += 1
        distance += entry_bytes
      end

      pm.update(header.addr)
    end
  end

  # @f.size is still the size before truncation at this point.
  PEROBS.log.info "#{distance / 1000} KiB/#{deleted_blobs} blobs of " +
    "#{@f.size / 1000} KiB/#{valid_blobs} blobs or " +
    "#{'%.1f' % (distance.to_f / @f.size * 100.0)}% reclaimed"
  if corrupted_blobs > 0
    PEROBS.log.info "#{corrupted_blobs} corrupted blob(s) found. Space " +
      "was recycled."
  end

  # Cut off everything behind the last valid blob.
  @f.flush
  @f.truncate(new_file_size)
  @f.flush

  sync
end

#delete_obj_by_address(addr, id) ⇒ Object

Delete the blob that is stored at the specified address.

Parameters:

  • addr (Integer)

    Address of the blob to delete

  • id (Integer)

    ID of the blob to delete



129
130
131
132
133
134
# File 'lib/perobs/FlatFile.rb', line 129

# Delete the blob that is stored at the specified address. The ID is
# removed from the index and the freed space is registered with the space
# list, but only if the respective structure is currently open.
# @param addr [Integer] Address of the blob to delete
# @param id [Integer] ID of the blob to delete
def delete_obj_by_address(addr, id)
  @index.remove(id) if @index.is_open?

  blob_header = FlatFileBlobHeader.read(@f, addr, id)
  blob_header.clear_flags
  return unless @space_list.is_open?

  @space_list.add_space(addr, blob_header.length)
end

#delete_obj_by_id(id) ⇒ Boolean

Delete the blob for the specified ID.

Parameters:

  • id (Integer)

    ID of the object to be deleted

Returns:

  • (Boolean)

    True if object was deleted, false otherwise



117
118
119
120
121
122
123
124
# File 'lib/perobs/FlatFile.rb', line 117

# Delete the blob for the specified ID.
# @param id [Integer] ID of the object to be deleted
# @return [Boolean] True if object was deleted, false otherwise
def delete_obj_by_id(id)
  addr = find_obj_addr_by_id(id)
  # Unknown IDs have no address; there is nothing to delete then.
  return false unless addr

  delete_obj_by_address(addr, id)
  true
end

#delete_unmarked_objects ⇒ Object

Delete all unmarked objects.



137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/perobs/FlatFile.rb', line 137

# Delete all unmarked objects. Every valid blob whose ID is not contained
# in the mark list is deleted. Afterwards the blob file is defragmentized
# and the index and space list are regenerated.
# @return [Integer] Number of deleted objects
def delete_unmarked_objects
  # The index and the space list are not maintained while sweeping since
  # the blob file gets defragmentized at the end anyway. The operation
  # finishes with an empty space list.
  clear_index_files

  swept = 0
  @progressmeter.start('Sweeping unmarked objects', @f.size) do |pm|
    each_blob_header do |blob|
      garbage = blob.is_valid? && !@marks.include?(blob.id)
      if garbage
        delete_obj_by_address(blob.addr, blob.id)
        swept += 1
      end

      pm.update(blob.addr)
    end
  end

  defragmentize
  # Update the index file and create a new, empty space list.
  regenerate_index_and_spaces

  swept
end

#find_obj_addr_by_id(id) ⇒ Integer

Find the address of the object with the given ID.

Parameters:

  • id (Integer)

    ID of the object

Returns:

  • (Integer)

    Offset in the flat file or nil if not found



257
258
259
# File 'lib/perobs/FlatFile.rb', line 257

# Find the address of the object with the given ID by looking it up in the
# index.
# @param id [Integer] ID of the object
# @return [Integer, nil] Offset in the flat file or nil if not found
def find_obj_addr_by_id(id)
  @index.get(id)
end

#has_id_at?(id, address) ⇒ Boolean

Returns:

  • (Boolean)


619
620
621
622
# File 'lib/perobs/FlatFile.rb', line 619

# Check if the blob header at the given address belongs to a valid blob
# with the given ID.
# @param id [Integer] Expected ID of the blob
# @param address [Integer] Address of the blob header in the flat file
# @return [Boolean] True if a valid blob with that ID starts at the address
def has_id_at?(id, address)
  blob_header = FlatFileBlobHeader.read(@f, address)
  blob_header.is_valid? && blob_header.id == id
end

#has_space?(address, size) ⇒ Boolean

Returns:

  • (Boolean)


614
615
616
617
# File 'lib/perobs/FlatFile.rb', line 614

# Check if the blob header at the given address describes a non-valid
# (free) entry of exactly the given size.
# @param address [Integer] Address of the blob header in the flat file
# @param size [Integer] Expected length of the free space
# @return [Boolean] True if a free entry of that size starts at the address
def has_space?(address, size)
  blob_header = FlatFileBlobHeader.read(@f, address)
  # A valid blob occupies the space, so it cannot be free.
  return false if blob_header.is_valid?

  blob_header.length == size
end

#inspect ⇒ Object



624
625
626
627
628
629
630
631
632
633
634
635
636
# File 'lib/perobs/FlatFile.rb', line 624

# Generate a textual dump of all blob headers in the file. For valid blobs
# the raw blob data is read from the current position of the blob file and
# included as well.
# @return [String] One '{ ... }' entry per blob header, wrapped in brackets
def inspect
  entries = []
  each_blob_header do |blob|
    entry = "{ :pos => #{blob.addr}, :flags => #{blob.flags}, " \
            ":length => #{blob.length}, :id => #{blob.id}, " \
            ":crc => #{blob.crc}"
    entry << ", :value => #{@f.read(blob.length)}" if blob.is_valid?
    entries << "#{entry} }\n"
  end

  "[#{entries.join}]"
end

#is_marked_by_id?(id) ⇒ Boolean

Return true if the object with the given ID is marked, false otherwise.

Parameters:

  • id (Integer)

    ID of the object

Returns:

  • (Boolean)


322
323
324
# File 'lib/perobs/FlatFile.rb', line 322

# Return true if the object with the given ID is marked, false otherwise.
# The mark list (@marks) is nil until clear_all_marks has created it.
# @param id [Integer] ID of the object
# @return [Boolean]
def is_marked_by_id?(id)
  @marks.include?(id)
end

#item_counter ⇒ Integer

Returns Number of items stored in the DB.

Returns:

  • (Integer)

    Number of items stored in the DB.



273
274
275
# File 'lib/perobs/FlatFile.rb', line 273

# Report the number of entries held by the index.
# @return [Integer] Number of items stored in the DB.
def item_counter
  @index.entries_count
end

#mark_obj_by_id(id) ⇒ Object

Mark the object with the given ID.

Parameters:

  • id (Integer)

    ID of the object



316
317
318
# File 'lib/perobs/FlatFile.rb', line 316

# Mark the object with the given ID by adding the ID to the mark list.
# The mark list (@marks) is nil until clear_all_marks has created it.
# @param id [Integer] ID of the object
def mark_obj_by_id(id)
  @marks.insert(id)
end

#open ⇒ Object

Open the flat file for reading and writing.



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/perobs/FlatFile.rb', line 58

# Open the flat file for reading and writing. A missing 'database.blobs'
# file is created. The file is locked exclusively (non-blocking) and put
# into sync mode before the index files are opened.
def open
  file_name = File.join(@db_dir, 'database.blobs')
  db_existed = true
  begin
    creating = !File.exist?(file_name)
    PEROBS.log.info "New FlatFile database '#{file_name}' created" if creating
    @f = File.open(file_name, creating ? 'wb+' : 'rb+')
    db_existed = !creating
  rescue IOError => e
    PEROBS.log.fatal "Cannot open FlatFile database #{file_name}: " +
      e.message
  end

  # A non-blocking lock request returns false if somebody else holds it.
  locked = @f.flock(File::LOCK_NB | File::LOCK_EX)
  unless locked
    PEROBS.log.fatal "FlatFile database '#{file_name}' is locked by " +
      "another process"
  end
  @f.sync = true

  open_index_files(db_existed)
end

#read_obj_by_address(addr, id) ⇒ String

Read the object at the specified address.

Parameters:

  • addr (Integer)

    Offset in the flat file

  • id (Integer)

    ID of the data blob

Returns:

  • (String)

    Raw object data



281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# File 'lib/perobs/FlatFile.rb', line 281

# Read the object at the specified address. The blob data is read behind
# the blob header, uncompressed if the header says so and verified against
# the checksum stored in the header. All failures are reported via
# PEROBS.log.fatal.
# @param addr [Integer] Offset in the flat file
# @param id [Integer] ID of the data blob
# @return [String] Raw object data
def read_obj_by_address(addr, id)
  header = FlatFileBlobHeader.read(@f, addr, id)
  if header.id != id
    PEROBS.log.fatal "Database index corrupted: Index for object " +
      "#{id} points to object with ID #{header.id}"
  end

  buf = nil

  begin
    @f.seek(addr + FlatFileBlobHeader::LENGTH)
    buf = @f.read(header.length)
  rescue IOError => e
    PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
  end

  # IO#read returns nil or a short String when the file ends before the
  # blob does. Report this explicitly instead of passing incomplete data
  # (or nil) on to Zlib and the checksum test.
  if buf.nil? || buf.bytesize != header.length
    PEROBS.log.fatal "Premature end of file in blob with ID #{id}."
  end

  # Uncompress the data if the compression bit is set in the flags byte.
  if header.is_compressed?
    begin
      buf = Zlib.inflate(buf)
    rescue Zlib::BufError, Zlib::DataError
      PEROBS.log.fatal "Corrupted compressed block with ID " +
        "#{header.id} found."
    end
  end

  # The checksum always covers the uncompressed data.
  if checksum(buf) != header.crc
    PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
  end

  buf
end

#read_obj_by_id(id) ⇒ String or nil

Read the object with the given ID.

Parameters:

  • id (Integer)

    ID of the object

Returns:

  • (String or nil)

    Raw object data if found, otherwise nil



264
265
266
267
268
269
270
# File 'lib/perobs/FlatFile.rb', line 264

# Read the object with the given ID.
# @param id [Integer] ID of the object
# @return [String or nil] Raw object data if found, otherwise nil
def read_obj_by_id(id)
  addr = find_obj_addr_by_id(id)
  addr ? read_obj_by_address(addr, id) : nil
end

#refresh ⇒ Object

This method iterates over all entries in the FlatFile and removes the entry and inserts it again. This is useful to update all entries in case the storage format has changed.



407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
# File 'lib/perobs/FlatFile.rb', line 407

# This method iterates over all entries in the FlatFile and removes the
# entry and inserts it again. This is useful to update all entries in case
# the storage format has changed. Afterwards the file is defragmentized and
# the index and the space list are regenerated.
def refresh
  # This iteration might look scary as we iterate over the entries while
  # we are rearranging them. Re-inserted items may be inserted before or at
  # the current entry and this is fine. They also may be inserted after the
  # current entry and will be re-read again unless they are inserted after
  # the original file end.
  file_size = @f.size

  # We don't update the index and the space list during this operation as
  # we defragmentize the blob file at the end. We'll end the operation
  # with an empty space list.
  clear_index_files

  @progressmeter.start('Converting objects to new storage format',
                       @f.size) do |pm|
    each_blob_header do |header|
      # Some re-inserted blobs may be inserted after the original file end.
      # No need to process those blobs again. The test must come before
      # the blob is touched; otherwise the first of these blobs would be
      # converted a second time.
      break if header.addr >= file_size

      if header.is_valid?
        buf = read_obj_by_address(header.addr, header.id)
        delete_obj_by_address(header.addr, header.id)
        write_obj_by_id(header.id, buf)
      end

      pm.update(header.addr)
    end
  end

  # Reclaim the space saved by compressing entries.
  defragmentize

  # Recreate the index file and create an empty space list.
  regenerate_index_and_spaces
end

#regenerate_index_and_spaces ⇒ Object

This method clears the index tree and the free space list and regenerates them from the FlatFile.



580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
# File 'lib/perobs/FlatFile.rb', line 580

# This method clears the index tree and the free space list and
# regenerates them from the FlatFile. Valid blobs are entered into the
# index; the space of all other entries with a length larger than 0 is
# registered with the space list. If an ID shows up more than once only
# the first blob is kept; later ones are treated as free space and
# discarded.
def regenerate_index_and_spaces
  PEROBS.log.warn "Re-generating FlatFileDB index and space files"
  [ @index, @space_list ].each do |store|
    store.open unless store.is_open?
    store.clear
  end

  @progressmeter.start('Re-generating database index', @f.size) do |pm|
    each_blob_header do |blob|
      valid = blob.is_valid?
      # Address of an earlier blob with the same ID, if there is one.
      first_addr = @index.get(blob.id) if valid

      if valid && !first_addr
        @index.insert(blob.id, blob.addr)
      else
        if first_addr
          PEROBS.log.error "FlatFile contains multiple blobs for ID " +
            "#{blob.id}. First blob is at address #{first_addr}. " +
            "Other blob found at address #{blob.addr}."
        end
        # Deleted entries and duplicates both provide reusable space.
        @space_list.add_space(blob.addr, blob.length) if blob.length > 0
        discard_damaged_blob(blob) if first_addr
      end

      pm.update(blob.addr)
    end
  end

  sync
end

#sync ⇒ Object

Force outstanding data to be written to the filesystem.



103
104
105
106
107
108
109
110
111
112
# File 'lib/perobs/FlatFile.rb', line 103

# Force outstanding data to be written to the filesystem. The blob file is
# flushed and synced first, then the index and the space list are synced.
def sync
  begin
    @f.flush
    @f.fsync
  rescue IOError => io_error
    PEROBS.log.fatal "Cannot sync flat file database: #{io_error.message}"
  end

  @index.sync
  @space_list.sync
end

#write_obj_by_id(id, raw_obj) ⇒ Integer

Write the given object into the file. This method never uses in-place updates for existing objects. A new copy is inserted first and only when the insert was successful, the old copy is deleted and the index updated.

Parameters:

  • id (Integer)

    ID of the object

  • raw_obj (String)

    Raw object as String

Returns:

  • (Integer)

    position of the written blob in the blob file



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/perobs/FlatFile.rb', line 169

# Write the given object into the file. This method never uses in-place
# updates for existing objects. A new copy is inserted first and only when
# the insert was successful, the old copy is deleted and the index updated.
# @param id [Integer] ID of the object
# @param raw_obj [String] Raw object as String
# @return [Integer] position of the written blob in the blob file
def write_obj_by_id(id, raw_obj)
  # Check if we have already an object with the given ID. We'll mark it as
  # outdated and save the header for later deletion. In case this
  # operation is aborted or interrupted we ensure that we either have the
  # old or the new version available.
  if (old_addr = find_obj_addr_by_id(id))
    old_header = FlatFileBlobHeader.read(@f, old_addr)
    old_header.set_outdated_flag
  end

  # The checksum is computed before compression, i.e. it covers the
  # uncompressed data.
  crc = checksum(raw_obj)

  # If the raw_obj is larger than 256 bytes we will compress it to
  # save some space in the database file. For smaller strings the
  # performance impact of compression is not compensated by writing
  # less data to the storage.
  compressed = false
  if raw_obj.bytesize > 256
    raw_obj = Zlib.deflate(raw_obj)
    compressed = true
  end

  # A length of -1 signals that no free space entry is reused. The checks
  # and the remainder handling below only apply to reused space.
  addr, length = find_free_blob(raw_obj.bytesize)
  begin
    if length != -1
      # Just a safeguard so we don't overwrite current data.
      header = FlatFileBlobHeader.read(@f, addr)
      if header.length != length
        PEROBS.log.fatal "Length in free list (#{length}) and header " +
          "(#{header.length}) for address #{addr} don't match."
      end
      if raw_obj.bytesize > header.length
        PEROBS.log.fatal "Object (#{raw_obj.bytesize}) is longer than " +
          "blob space (#{header.length})."
      end
      if header.is_valid?
        PEROBS.log.fatal "Entry at address #{addr} with flags: " +
          "#{header.flags} is already used for ID #{header.id}."
      end
    end
    flags = 1 << FlatFileBlobHeader::VALID_FLAG_BIT
    flags |= (1 << FlatFileBlobHeader::COMPRESSED_FLAG_BIT) if compressed
    FlatFileBlobHeader.new(@f, addr, flags, raw_obj.bytesize, id, crc).write
    # NOTE(review): this relies on the header write leaving the file
    # position right behind the header -- confirm in FlatFileBlobHeader.
    @f.write(raw_obj)
    if length != -1 && raw_obj.bytesize < length
      # The new object was not appended and it did not completely fill the
      # free space. So we have to write a new header to mark the remaining
      # empty space.
      unless length - raw_obj.bytesize >= FlatFileBlobHeader::LENGTH
        PEROBS.log.fatal "Not enough space to append the empty space " +
          "header (space: #{length} bytes, object: #{raw_obj.bytesize} " +
          "bytes)."
      end
      space_address = @f.pos
      # The header of the remaining space consumes part of that space.
      space_length = length - FlatFileBlobHeader::LENGTH - raw_obj.bytesize
      FlatFileBlobHeader.new(@f, space_address, 0, space_length,
                             0, 0).write
      # Register the new space with the space list.
      if @space_list.is_open? && space_length > 0
        @space_list.add_space(space_address, space_length)
      end
    end

    # Once the blob has been written we can update the index as well.
    @index.insert(id, addr) if @index.is_open?

    if old_addr
      # If we had an existing object stored for the ID we have to mark
      # this entry as deleted now.
      old_header.clear_flags
      # And register the newly freed space with the space list.
      if @space_list.is_open?
        @space_list.add_space(old_addr, old_header.length)
      end
    else
      @f.flush
    end
  rescue IOError => e
    PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
      e.message
  end

  addr
end