Method: ContentData::ContentData#from_file

Defined in:
lib/content_data/content_data.rb

#from_file(filename) ⇒ Object

TODO: validation that the file indeed contains ContentData is missing. TODO: should this be a class-level method? Loads the db from a file, reading it in chunks for better memory performance.



383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
# File 'lib/content_data/content_data.rb', line 383

# Loads content data from +filename+ into this object, working through the
# file in chunks of at most CHUNK_SIZE lines with a GC pass after each chunk.
#
# File layout, as consumed here:
#   <number of contents>
#   ...content lines, handed to read_contents_chunk in chunks
#   <number of instances>
#   ...instance lines, handed to read_instances_chunk in chunks
#   <number of symlinks>
#   ...one line per symlink with exactly 3 '<'-separated fields; the first
#      two fields form the key and the third the value in @symlinks_info
#
# @param filename [String] path of the content data file
# @return [Integer, nil] the number of symlink lines read, or nil when a
#   chunk reader returned a falsy value and loading stopped early
# @raise [ArgumentError] if +filename+ does not exist
# @raise [RuntimeError] if a count line is not a number, a symlink line is
#   missing (EOF), or a symlink line does not have exactly 3 fields
def from_file(filename)
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  raise ArgumentError, "No such a file #{filename}" unless File.exist?(filename)

  File.open(filename, 'r') do |file|
    number_of_contents = read_count_line(filename, file, 'number of contents should be a number')
    contents_ok = read_in_chunks(number_of_contents) do |chunk_size|
      read_contents_chunk(filename, file, chunk_size)
    end
    return unless contents_ok

    number_of_instances = read_count_line(filename, file, 'number of instances should be a Number')
    instances_ok = read_in_chunks(number_of_instances) do |chunk_size|
      read_instances_chunk(filename, file, chunk_size)
    end
    return unless instances_ok

    number_of_symlinks = read_count_line(filename, file, 'number of symlinks should be a Number')
    number_of_symlinks.times do
      symlinks_line = file.gets
      unless symlinks_line
        raise("Parse error of content data file:#{filename}  line ##{file.lineno}\n" \
              'Expected to read symlink line but reached EOF')
      end

      # chomp so the stored value does not carry the line terminator;
      # limit -1 keeps an empty trailing field so the field count stays 3.
      parameters = symlinks_line.chomp.split('<', -1)
      unless parameters.length == 3
        raise("Parse error of content data file:#{filename}  line ##{file.lineno}\n" \
              "Expected to read 3 fields ('<' separated) but got #{parameters.length}.\nLine:#{symlinks_line}")
      end

      @symlinks_info[[parameters[0], parameters[1]]] = parameters[2]
    end
  end
end

# Reads the next line of +file+ and returns it as an Integer count.
# Raises RuntimeError (mentioning +expectation+) at EOF or when the line is
# not made of digits only.
def read_count_line(filename, file, expectation)
  line = file.gets # nil at EOF
  unless line && line =~ /^\d+$/
    raise("Parse error of content data file:#{filename}  line ##{file.lineno}\n" \
          "#{expectation}. We got:#{line}")
  end
  line.to_i
end

# Yields successive chunk sizes (CHUNK_SIZE each, the last one possibly
# smaller) that add up to +total+, running GC after every chunk. Returns
# false as soon as the block returns a falsy value, true otherwise.
def read_in_chunks(total)
  remaining = total
  while remaining > 0
    chunk_size = [remaining, CHUNK_SIZE].min
    return false unless yield(chunk_size)

    GC.start
    remaining -= chunk_size
  end
  true
end

private :read_count_line, :read_in_chunks