Class: Ferret::Index::TermVectorsReader

Inherits:
Object
  • Object
show all
Defined in:
lib/ferret/index/term_vectors_io.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(d, segment, field_infos) ⇒ TermVectorsReader

Returns a new instance of TermVectorsReader.



303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# File 'lib/ferret/index/term_vectors_io.rb', line 303

# Opens the term vector files of a segment, if the segment has any.
#
# d::           directory-like object; must respond to exists? and
#               open_input
# segment::     segment name; the TermVectorsWriter file extensions are
#               appended to it to build the file names
# field_infos:: field metadata, kept for later field lookups
#
# When the index (TVX) file does not exist, all three streams stay nil and
# @size stays nil. When opening or validating any of the files fails, the
# streams opened so far are closed before the error is re-raised, so a
# failed construction does not leak open inputs.
def initialize(d, segment, field_infos)
  @field_infos = field_infos
  @tvx = nil
  @tvd = nil
  @tvf = nil
  @size = nil

  return unless d.exists?(segment + TermVectorsWriter::TVX_EXTENSION)

  begin
    @tvx = d.open_input(segment + TermVectorsWriter::TVX_EXTENSION)
    check_valid_format(@tvx)
    @tvd = d.open_input(segment + TermVectorsWriter::TVD_EXTENSION)
    @tvd_format = check_valid_format(@tvd)
    @tvf = d.open_input(segment + TermVectorsWriter::TVF_EXTENSION)
    @tvf_format = check_valid_format(@tvf)
    # get_field_tv/get_tv seek in steps of 8 bytes per document, so the
    # index length divided by 8 is taken as the size.
    @size = @tvx.length / 8
  rescue StandardError => e
    # Release whatever was opened before the failure; a secondary IOError
    # from closing must not hide the original error.
    [@tvx, @tvd, @tvf].compact.each do |stream|
      begin
        stream.close
      rescue IOError
        # ignored on purpose, the original error is re-raised below
      end
    end
    raise e
  end
end

Instance Attribute Details

#size ⇒ Object (readonly)

Returns the value of attribute size.



296
297
298
# File 'lib/ferret/index/term_vectors_io.rb', line 296

# Reader for @size. The constructor derives that value from the length of
# the term vector index stream divided by 8; it is nil when the segment has
# no term vector files.
def size; @size; end

Instance Method Details

#clone ⇒ Object



435
436
437
438
439
440
441
442
443
444
445
446
447
# File 'lib/ferret/index/term_vectors_io.rb', line 435

# Creates an independent copy of this reader that owns its own clones of
# the three input streams, so the copy can be positioned without disturbing
# this reader.
#
# returns:: a new reader, or nil if any of the term vector streams is
#           missing
def clone()
  return nil if @tvx.nil? || @tvd.nil? || @tvf.nil?

  # Make a real shallow copy first. Assigning `self` would only alias this
  # reader, so the stream replacement below would modify the original and
  # hand the very same object back to the caller.
  copy = super
  # Instance variables are set directly so no public writers are required.
  copy.instance_variable_set(:@tvx, @tvx.clone())
  copy.instance_variable_set(:@tvd, @tvd.clone())
  copy.instance_variable_set(:@tvf, @tvf.clone())

  copy
end

#close ⇒ Object



322
323
324
325
326
327
328
329
330
331
332
333
334
# File 'lib/ferret/index/term_vectors_io.rb', line 322

# Closes every stream that was opened. An IOError from one stream does not
# stop the remaining streams from being closed; the most recent IOError is
# raised once all of them have been attempted.
#
# raises:: IOError if closing at least one stream failed
def close
  last_failure = nil

  [@tvx, @tvd, @tvf].each do |stream|
    next if stream.nil?

    begin
      stream.close
    rescue IOError => failure
      # remember it, but keep going with the other streams
      last_failure = failure
    end
  end

  raise last_failure unless last_failure.nil?
end

#get_field_tv(doc_num, field) ⇒ Object

Retrieve the term vector for the given document and field

doc_num

The document number to retrieve the vector for

field

The field within the document to retrieve

returns

The TermFreqVector for the document and field, or nil if there is no term vector for this field.
raises

IOException if there is an error reading the term vector files



342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
# File 'lib/ferret/index/term_vectors_io.rb', line 342

# Looks up the term vector of a single field of a single document.
#
# doc_num:: number of the document to read
# field::   the field whose vector is wanted
# returns:: whatever read_term_vector yields for the field, or nil when the
#           segment has no term vectors or the document has none for this
#           field
def get_field_tv(doc_num, field)
  wanted_number = @field_infos.field_number(field)

  # No term vectors are available for this segment at all
  return nil if @tvx.nil?

  # The index entry of a document sits 8 bytes per document past the format
  # header, hence the FORMAT_SIZE offset. The later seeks use pointers that
  # were read from the files themselves and need no such correction.
  @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
  @tvd.seek(@tvx.read_long)
  field_count = @tvd.read_vint

  # Every field number has to be consumed anyway to reach the tvf pointers
  # that follow them, so a full scan is done instead of relying on order.
  current_number = 0
  match_index = nil
  field_count.times do |index|
    value = @tvd.read_vint
    # In the FORMAT_VERSION layout the value is the field number itself,
    # otherwise it is added to the previous number.
    current_number =
      (@tvd_format == TermVectorsWriter::FORMAT_VERSION) ? value : current_number + value
    match_index = index if current_number == wanted_number
  end

  # The field is valid in the segment but was not stored for this document
  return nil if match_index.nil?

  # Sum the stored values up to and including the matching field to get
  # the position in the @tvf file
  tvf_position = 0
  (match_index + 1).times { tvf_position += @tvd.read_vlong }

  read_term_vector(field, tvf_position)
end

#get_tv(doc_num) ⇒ Object

Return all term vectors stored for this document or nil if it could not be read in.

doc_num

The document number to retrieve the vector for

returns

All term frequency vectors

raises

IOException if there is an error reading the term vector files



395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
# File 'lib/ferret/index/term_vectors_io.rb', line 395

# Reads all term vectors stored for one document.
#
# doc_num:: number of the document to read
# returns:: whatever read_term_vectors yields for the document's fields, or
#           nil when the segment has no term vectors or the document has no
#           vectorized fields
def get_tv(doc_num)
  # No term vectors are available for this segment at all
  return nil if @tvx.nil?

  # Index entries are 8 bytes per document and start after the format
  # header, so the seek is offset by FORMAT_SIZE
  @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
  @tvd.seek(@tvx.read_long)
  field_count = @tvd.read_vint

  # No fields are vectorized for this document
  return nil if field_count == 0

  field_num = 0
  field_names = Array.new(field_count) do
    value = @tvd.read_vint
    # In the FORMAT_VERSION layout the value is the field number itself,
    # otherwise it is added to the previous number.
    field_num =
      (@tvd_format == TermVectorsWriter::FORMAT_VERSION) ? value : field_num + value
    @field_infos[field_num].name
  end

  # Each stored value is added to a running total; the totals are the
  # positions in the @tvf file
  tvf_position = 0
  tvf_pointers = Array.new(field_count) { tvf_position += @tvd.read_vlong }

  read_term_vectors(field_names, tvf_pointers)
end