Class: Ferret::Index::IndexWriter

Inherits:
Object
  • Object
show all
Includes:
MonitorMixin, ObjectSpace
Defined in:
lib/ferret/index/index_writer.rb

Overview

An IndexWriter creates and maintains an index.

The :create option passed to new determines whether a new index is created, or whether an existing index is opened for the addition of new documents.

In either case, documents are added with the add_document method. When finished adding documents, close should be called.

If an index will not have more documents added for a while and optimal search performance is desired, then the optimize method should be called before the index is closed.

Opening an IndexWriter creates a lock file for the directory in use. Trying to open another IndexWriter on the same directory will lead to an IOError. The IOError is also raised if an IndexReader on the same directory is used to delete documents from the index.

Constant Summary collapse

WRITE_LOCK_TIMEOUT =
1
COMMIT_LOCK_TIMEOUT =
10
WRITE_LOCK_NAME =
"write.lock"
COMMIT_LOCK_NAME =
"commit.lock"
DEFAULT_MERGE_FACTOR =
10
DEFAULT_MIN_MERGE_DOCS =
10
DEFAULT_MAX_MERGE_DOCS =
0x7fffffff
DEFAULT_MAX_FIELD_LENGTH =
10000
DEFAULT_TERM_INDEX_INTERVAL =
128

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(dir = nil, options = {}) ⇒ IndexWriter

Constructs an IndexWriter for the index in dir. Text will be analyzed with analyzer. If create is true, then a new, empty index will be created in dir, replacing the index already there, if any.

NOTE

all options are passed in a hash.

dir

the index directory

Options

analyzer

the analyzer to use. Defaults to StandardAnalyzer.

create

true to create the index or overwrite the existing one; false to append to the existing index

create_if_missing

true to create the index if it’s missing; false to raise an IOError if it’s missing

close_dir

This specifies whether you would like this class to close the index directory when this class is closed. The default is false.

use_compound_file

Use a compound file to store the index. This is slower than using multiple files but it prevents the too many files open error. This defaults to true.



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/ferret/index/index_writer.rb', line 71

# Constructs an IndexWriter for the index in +dir+.
#
# dir::     nil for an in-memory RAMDirectory, a String path for an
#           FSDirectory, or an existing Directory object used as-is.
# options:: hash of options — :analyzer, :create, :create_if_missing,
#           :close_dir, :use_compound_file plus the merge tuning
#           parameters (see the class documentation).
#
# Obtains the directory's write lock (IOError on timeout) and either
# writes a fresh segments file (:create) or reads the existing one.
def initialize(dir = nil, options = {})
  super()
  create = options[:create] || false
  create_if_missing = options[:create_if_missing] || false

  if dir.nil?
    @directory = Ferret::Store::RAMDirectory.new
  elsif dir.is_a?(String)
    @directory = Ferret::Store::FSDirectory.new(dir, create)
  else
    @directory = dir
  end
  @close_dir = options[:close_dir] || false
  @use_compound_file = (options[:use_compound_file] != false) # ie default true
  @analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
  @merge_factor = options[:merge_factor] || DEFAULT_MERGE_FACTOR
  @min_merge_docs = options[:min_merge_docs] || DEFAULT_MIN_MERGE_DOCS
  @max_merge_docs = options[:max_merge_docs] || DEFAULT_MAX_MERGE_DOCS
  @max_field_length = options[:max_field_length] || DEFAULT_MAX_FIELD_LENGTH
  @term_index_interval = options[:term_index_interval] || DEFAULT_TERM_INDEX_INTERVAL

  @similarity = Search::Similarity.default
  @segment_infos = SegmentInfos.new()
  @ram_directory = Ferret::Store::RAMDirectory.new()

  @write_lock = @directory.make_lock(WRITE_LOCK_NAME)
  @write_lock.obtain(WRITE_LOCK_TIMEOUT) # obtain write lock

  # Make sure the lock is released when this object is garbage collected.
  # Capture the lock in a local first: a finalizer proc must not close
  # over self, or the proc keeps the whole IndexWriter (and everything it
  # references) reachable and the finalizer never fires before exit.
  write_lock = @write_lock
  define_finalizer(@write_lock, proc { |id| write_lock.release() if write_lock })

  @directory.synchronize() do # in- & inter-process sync
    @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
      if (create)
        @segment_infos.write(@directory) # start a fresh, empty index
      else
        begin
          @segment_infos.read(@directory)
        # NOTE(review): rescuing Exception also traps SignalException and
        # SystemExit; StandardError would normally suffice — confirm what
        # SegmentInfos#read can raise before narrowing.
        rescue Exception => e
          if create_if_missing
            @segment_infos.write(@directory)
          else
            @write_lock.release() # release write lock before re-raising
            raise e
          end
        end
      end 
    end
  end

  @info_stream = nil
end

Instance Attribute Details

#analyzerObject (readonly)

Returns the value of attribute analyzer.



41
42
43
# File 'lib/ferret/index/index_writer.rb', line 41

# The analyzer used to tokenize added documents; set from the :analyzer
# option to new (defaults to StandardAnalyzer). Read-only.
def analyzer
  @analyzer
end

#directoryObject (readonly)

Returns the value of attribute directory.



41
42
43
# File 'lib/ferret/index/index_writer.rb', line 41

# The Store directory holding the index files, derived from the +dir+
# argument to new. Read-only.
def directory
  @directory
end

#info_streamObject

Returns the value of attribute info_stream.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Stream for diagnostic output, passed on to each DocumentWriter.
# nil (the default) disables diagnostics.
def info_stream
  @info_stream
end

#max_field_lengthObject

Returns the value of attribute max_field_length.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Maximum number of terms indexed per field of a document; terms beyond
# this are discarded. Defaults to DEFAULT_MAX_FIELD_LENGTH.
def max_field_length
  @max_field_length
end

#max_merge_docsObject

Returns the value of attribute max_merge_docs.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Merge tuning parameter, defaults to DEFAULT_MAX_MERGE_DOCS.
# Presumably caps the document count of segments eligible for merging —
# confirm in maybe_merge_segments/merge_segments.
def max_merge_docs
  @max_merge_docs
end

#merge_factorObject

Returns the value of attribute merge_factor.



41
42
43
# File 'lib/ferret/index/index_writer.rb', line 41

# Number of segments merged at a time; used by add_indexes and optimize
# to size merge windows. Defaults to DEFAULT_MERGE_FACTOR.
def merge_factor
  @merge_factor
end

#min_merge_docsObject Also known as: max_buffered_docs

Returns the value of attribute min_merge_docs.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Merge tuning parameter, also exposed as max_buffered_docs; defaults to
# DEFAULT_MIN_MERGE_DOCS. Presumably the number of documents buffered in
# RAM before flushing to a disk segment — confirm in maybe_merge_segments.
def min_merge_docs
  @min_merge_docs
end

#segment_infosObject (readonly)

Returns the value of attribute segment_infos.



41
42
43
# File 'lib/ferret/index/index_writer.rb', line 41

# The SegmentInfos collection describing the segments that currently
# make up this index. Read-only.
def segment_infos
  @segment_infos
end

#similarityObject

Returns the value of attribute similarity.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# The Similarity implementation passed to each DocumentWriter; defaults
# to Search::Similarity.default.
def similarity
  @similarity
end

#term_index_intervalObject

Returns the value of attribute term_index_interval.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Term-index interval passed to DocumentWriter and SegmentMerger;
# defaults to DEFAULT_TERM_INDEX_INTERVAL.
def term_index_interval
  @term_index_interval
end

#use_compound_fileObject

Returns the value of attribute use_compound_file.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Whether merged segments are packed into a single compound (.cfs) file
# instead of many separate files. Defaults to true.
def use_compound_file
  @use_compound_file
end

Instance Method Details

#add_document(doc, analyzer = @analyzer) ⇒ Object Also known as: <<

Adds a document to this index, using the provided analyzer instead of the local analyzer if provided. If the document contains more than #max_field_length terms for a given field, the remainder are discarded.



150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/ferret/index/index_writer.rb', line 150

# Adds +doc+ to this index. A different +analyzer+ may be supplied to
# override the writer's default one for this document. Fields with more
# than #max_field_length terms have the remainder discarded.
def add_document(doc, analyzer=@analyzer)
  # Build the single-document segment in the RAM buffer directory first.
  writer = DocumentWriter.new(@ram_directory, analyzer, @similarity,
                              @max_field_length, @term_index_interval)
  writer.info_stream = @info_stream
  segment = new_segment_name()
  writer.add_document(segment, doc)
  # Registering the new segment and deciding whether to merge must not
  # race with other writer threads.
  synchronize() do
    @segment_infos << SegmentInfo.new(segment, 1, @ram_directory)
    maybe_merge_segments()
  end
end

#add_indexes(dirs) ⇒ Object

Merges all segments from an array of indexes into this index.

This may be used to parallelize batch indexing. A large document collection can be broken into sub-collections. Each sub-collection can be indexed in parallel, on a different thread, process or machine. The complete index can then be created by merging sub-collection indexes with this method.

After this completes, the index is optimized.



197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/ferret/index/index_writer.rb', line 197

# Merges all segments from the index directories in +dirs+ into this
# index, then optimizes the result. Useful for combining sub-indexes
# built in parallel.
def add_indexes(dirs)
  synchronize() do
    optimize()                        # start with zero or 1 seg

    start = @segment_infos.size

    # Append every segment listed in each source directory's segments
    # file to our own segment list.
    dirs.each do |dir|
      sis = SegmentInfos.new()        # read infos from dir
      sis.read(dir)
      sis.each do |si|
        @segment_infos << si
      end
    end

    # merge newly added segments in log(n) passes
    # Each pass merges runs of up to @merge_factor adjacent segments;
    # merge_segments shrinks @segment_infos, so the outer loop repeats
    # until the added segments fit within one merge window.
    # NOTE(review): merge_segments mutates @segment_infos while the inner
    # range is iterated; the range is recomputed each outer pass, so this
    # appears intentional — confirm against merge_segments' contract.
    while (@segment_infos.size > start + @merge_factor)
      (start+1 ... @segment_infos.size).each do |base|
        last = [@segment_infos.size(),  (base + @merge_factor)].min
        if (last - base > 1)
          merge_segments(base, last);
        end
      end
    end

    optimize() # final cleanup
  end
end

#add_indexes_readers(readers) ⇒ Object

Merges the provided indexes into this index. After this completes, the index is optimized. The provided IndexReaders are not closed.



228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# File 'lib/ferret/index/index_writer.rb', line 228

# Merges the indexes exposed by the given IndexReaders into this index,
# then optimizes it. The provided readers are not closed.
def add_indexes_readers(readers)
  synchronize() do
    segments_to_delete = []
    optimize() # start with zero or 1 seg

    merged_name = new_segment_name()
    merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)

    if (@segment_infos.size() == 1) # add existing index, if any
      s_reader = SegmentReader.get(@segment_infos[0])
      merger << s_reader
      # Remember the pre-merge segment's reader so its files can be
      # deleted once the merged segment is committed below.
      segments_to_delete << s_reader
    end

    readers.each do |reader|
      merger << reader
    end

    doc_count = merger.merge() # merge 'em

    @segment_infos.clear() # pop old infos & add new
    @segment_infos << SegmentInfo.new(merged_name, doc_count, @directory)

    # Commit the new segment list and delete the superseded segment under
    # the commit lock so concurrent readers never see a partial update.
    @directory.synchronize() do
      @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
        @segment_infos.write(@directory) # commit changes
        delete_segments(segments_to_delete)
      end
    end

    # Optionally repack the merged segment into a compound file; the
    # rename is done under the commit lock for the same reason as above.
    if @use_compound_file
      files_to_delete = merger.create_compound_file(merged_name + ".tmp")
      @directory.synchronize() do # in- & inter-process sync
        @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
          # make compound file visible for SegmentReaders
          @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
          # delete now unused files of segment
          delete_files_and_write_undeletable(files_to_delete)
        end
      end
    end

    optimize()
  end
end

#closeObject

Flushes all changes to an index and closes all associated files.



125
126
127
128
129
130
131
132
133
134
135
# File 'lib/ferret/index/index_writer.rb', line 125

# Flushes all buffered changes to the index and closes every associated
# resource: the RAM buffer, the write lock and (when :close_dir was set)
# the index directory itself.
def close()
  synchronize() do
    flush_ram_segments()
    @ram_directory.close()
    if @write_lock
      @write_lock.release() # give up the directory write lock
    end
    @write_lock = nil
    @directory.close() if @close_dir
  end
end

#doc_countObject

Returns the number of documents currently in this index.



138
139
140
141
142
143
144
# File 'lib/ferret/index/index_writer.rb', line 138

# Returns the total number of documents across every segment currently
# in the index.
def doc_count()
  synchronize() do
    total = 0
    @segment_infos.each { |info| total += info.doc_count() }
    return total
  end
end

#optimizeObject

Merges all segments together into a single segment, optimizing an index for search.



172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/ferret/index/index_writer.rb', line 172

# Merges all segments together into a single segment, optimizing the
# index for search.
def optimize()
  synchronize() do
    flush_ram_segments()
    # Keep merging while more than one segment remains, or while the one
    # remaining segment still needs rewriting because it has deletions,
    # lives in a different directory than the index, or does not match
    # the current compound-file configuration.
    while (@segment_infos.size() > 1 ||
            (@segment_infos.size() == 1 &&
              (SegmentReader.has_deletions?(@segment_infos[0]) ||
                (@segment_infos[0].directory != @directory) ||
                  (@use_compound_file &&
                    (!SegmentReader.uses_compound_file?(@segment_infos[0]) ||
                      SegmentReader.has_separate_norms?(@segment_infos[0]))))))
      # Merge at most @merge_factor trailing segments per iteration.
      min_segment = @segment_infos.size() - @merge_factor
      merge_segments(min_segment < 0 ? 0 : min_segment)
    end
  end
end

#segments_counterObject



166
167
168
# File 'lib/ferret/index/index_writer.rb', line 166

# The segment-name counter maintained by the SegmentInfos collection.
def segments_counter()
  segment_infos.counter
end