Class: Ferret::Index::IndexWriter

Inherits:
Object
  • Object
show all
Includes:
MonitorMixin, ObjectSpace
Defined in:
lib/ferret/index/index_writer.rb

Overview

An IndexWriter creates and maintains an index.

The :create option passed to new determines whether a new index is created, or whether an existing index is opened for the addition of new documents.

In either case, documents are added with the add_document method. When finished adding documents, close should be called.

If an index will not have more documents added for a while and optimal search performance is desired, then the optimize method should be called before the index is closed.

Opening an IndexWriter creates a lock file for the directory in use. Trying to open another IndexWriter on the same directory will lead to an IOError. The IOError is also raised if an IndexReader on the same directory is used to delete documents from the index.

Constant Summary collapse

WRITE_LOCK_TIMEOUT =
1
COMMIT_LOCK_TIMEOUT =
10
WRITE_LOCK_NAME =
"write.lock"
COMMIT_LOCK_NAME =
"commit.lock"
DEFAULT_MERGE_FACTOR =
10
DEFAULT_MIN_MERGE_DOCS =
10
DEFAULT_MAX_MERGE_DOCS =
0x7fffffff
DEFAULT_MAX_FIELD_LENGTH =
10000
DEFAULT_TERM_INDEX_INTERVAL =
128

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(dir = nil, options = {}) ⇒ IndexWriter

Constructs an IndexWriter for the index in dir. Text will be analyzed with analyzer. If create is true, then a new, empty index will be created in dir, replacing the index already there, if any.

NOTE

all options are passed in a hash.

dir

the index directory

Options

analyzer

the analyzer to use. Defaults to StandardAnalyzer.

create

true to create the index or overwrite the existing one; false to append to the existing index

create_if_missing

true to create the index if it’s missing; false to raise an IOError if it’s missing

close_dir

This specifies whether you would like this class to close the index directory when this class is closed. The default is false.

use_compound_file

Use a compound file to store the index. This is slower than using multiple files but it prevents the too many files open error. This defaults to true.



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/ferret/index/index_writer.rb', line 71

# Constructs an IndexWriter for the index in +dir+.
#
# dir::     nil for an in-memory RAMDirectory, a String path for an
#           FSDirectory, or an existing Directory object used as-is.
# options:: hash of options — :analyzer, :create, :create_if_missing,
#           :close_dir, :use_compound_file plus the merge tuning
#           parameters (see the class documentation).
#
# Obtains the directory's write lock (IOError on timeout) and either
# writes a fresh segments file (:create) or reads the existing one.
def initialize(dir = nil, options = {})
  super()
  create = options[:create] || false
  create_if_missing = options[:create_if_missing] || false

  if dir.nil?
    @directory = Ferret::Store::RAMDirectory.new
  elsif dir.is_a?(String)
    @directory = Ferret::Store::FSDirectory.new(dir, create)
  else
    @directory = dir
  end
  @close_dir = options[:close_dir] || false
  @use_compound_file = (options[:use_compound_file] != false) # ie default true
  @analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
  @merge_factor = options[:merge_factor] || DEFAULT_MERGE_FACTOR
  @min_merge_docs = options[:min_merge_docs] || DEFAULT_MIN_MERGE_DOCS
  @max_merge_docs = options[:max_merge_docs] || DEFAULT_MAX_MERGE_DOCS
  @max_field_length = options[:max_field_length] || DEFAULT_MAX_FIELD_LENGTH
  @term_index_interval = options[:term_index_interval] || DEFAULT_TERM_INDEX_INTERVAL

  @similarity = Search::Similarity.default
  @segment_infos = SegmentInfos.new()
  @ram_directory = Ferret::Store::RAMDirectory.new()

  @write_lock = @directory.make_lock(WRITE_LOCK_NAME)
  @write_lock.obtain(WRITE_LOCK_TIMEOUT) # obtain write lock

  # Make sure the lock is released when this object is garbage collected.
  # Capture the lock in a local first: a finalizer proc must not close
  # over self, or the proc keeps the whole IndexWriter (and everything it
  # references) reachable and the finalizer never fires before exit.
  write_lock = @write_lock
  define_finalizer(@write_lock, proc { |id| write_lock.release() if write_lock })

  @directory.synchronize() do # in- & inter-process sync
    @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
      if (create)
        @segment_infos.write(@directory) # start a fresh, empty index
      else
        begin
          @segment_infos.read(@directory)
        # NOTE(review): rescuing Exception also traps SignalException and
        # SystemExit; StandardError would normally suffice — confirm what
        # SegmentInfos#read can raise before narrowing.
        rescue Exception => e
          if create_if_missing
            @segment_infos.write(@directory)
          else
            @write_lock.release() # release write lock before re-raising
            raise e
          end
        end
      end 
    end
  end

  @info_stream = nil
end

Instance Attribute Details

#analyzerObject (readonly)

Returns the value of attribute analyzer.



41
42
43
# File 'lib/ferret/index/index_writer.rb', line 41

# The analyzer used to tokenize added documents; set from the :analyzer
# option to new (defaults to StandardAnalyzer). Read-only.
def analyzer
  @analyzer
end

#directoryObject (readonly)

Returns the value of attribute directory.



41
42
43
# File 'lib/ferret/index/index_writer.rb', line 41

# The Store directory holding the index files, derived from the +dir+
# argument to new. Read-only.
def directory
  @directory
end

#info_streamObject

Returns the value of attribute info_stream.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Stream for diagnostic output, passed on to each DocumentWriter.
# nil (the default) disables diagnostics.
def info_stream
  @info_stream
end

#max_field_lengthObject

Returns the value of attribute max_field_length.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Maximum number of terms indexed per field of a document; terms beyond
# this are discarded. Defaults to DEFAULT_MAX_FIELD_LENGTH.
def max_field_length
  @max_field_length
end

#max_merge_docsObject

Returns the value of attribute max_merge_docs.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Merge tuning parameter, defaults to DEFAULT_MAX_MERGE_DOCS.
# Presumably caps the document count of segments eligible for merging —
# confirm in maybe_merge_segments/merge_segments.
def max_merge_docs
  @max_merge_docs
end

#merge_factorObject

Returns the value of attribute merge_factor.



41
42
43
# File 'lib/ferret/index/index_writer.rb', line 41

# Number of segments merged at a time; used by add_indexes and optimize
# to size merge windows. Defaults to DEFAULT_MERGE_FACTOR.
def merge_factor
  @merge_factor
end

#min_merge_docsObject Also known as: max_buffered_docs

Returns the value of attribute min_merge_docs.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Merge tuning parameter, also exposed as max_buffered_docs; defaults to
# DEFAULT_MIN_MERGE_DOCS. Presumably the number of documents buffered in
# RAM before flushing to a disk segment — confirm in maybe_merge_segments.
def min_merge_docs
  @min_merge_docs
end

#segment_infosObject (readonly)

Returns the value of attribute segment_infos.



41
42
43
# File 'lib/ferret/index/index_writer.rb', line 41

# The SegmentInfos collection describing the segments that currently
# make up this index. Read-only.
def segment_infos
  @segment_infos
end

#similarityObject

Returns the value of attribute similarity.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# The Similarity implementation passed to each DocumentWriter; defaults
# to Search::Similarity.default.
def similarity
  @similarity
end

#term_index_intervalObject

Returns the value of attribute term_index_interval.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Term-index interval passed to DocumentWriter and SegmentMerger;
# defaults to DEFAULT_TERM_INDEX_INTERVAL.
def term_index_interval
  @term_index_interval
end

#use_compound_fileObject

Returns the value of attribute use_compound_file.



39
40
41
# File 'lib/ferret/index/index_writer.rb', line 39

# Whether merged segments are packed into a single compound (.cfs) file
# instead of many separate files. Defaults to true.
def use_compound_file
  @use_compound_file
end

Instance Method Details

#add_document(doc, analyzer = @analyzer) ⇒ Object Also known as: <<

Adds a document to this index, using the provided analyzer instead of the local analyzer if provided. If the document contains more than #max_field_length terms for a given field, the remainder are discarded.



150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/ferret/index/index_writer.rb', line 150

# Adds +doc+ to this index. A different +analyzer+ may be supplied to
# override the writer's default one for this document. Fields with more
# than #max_field_length terms have the remainder discarded.
def add_document(doc, analyzer=@analyzer)
  # Build the single-document segment in the RAM buffer directory first.
  writer = DocumentWriter.new(@ram_directory, analyzer, @similarity,
                              @max_field_length, @term_index_interval)
  writer.info_stream = @info_stream
  segment = new_segment_name()
  writer.add_document(segment, doc)
  # Registering the new segment and deciding whether to merge must not
  # race with other writer threads.
  synchronize() do
    @segment_infos << SegmentInfo.new(segment, 1, @ram_directory)
    maybe_merge_segments()
  end
end

#add_indexes(dirs) ⇒ Object

Merges all segments from an array of indexes into this index.

This may be used to parallelize batch indexing. A large document collection can be broken into sub-collections. Each sub-collection can be indexed in parallel, on a different thread, process or machine. The complete index can then be created by merging sub-collection indexes with this method.

After this completes, the index is optimized.



197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/ferret/index/index_writer.rb', line 197

# Merges all segments from the index directories in +dirs+ into this
# index, then optimizes the result. Useful for combining sub-indexes
# built in parallel.
def add_indexes(dirs)
  synchronize() do
    optimize()                        # start with zero or 1 seg

    start = @segment_infos.size

    # Append every segment listed in each source directory's segments
    # file to our own segment list.
    dirs.each do |dir|
      sis = SegmentInfos.new()        # read infos from dir
      sis.read(dir)
      sis.each do |si|
        @segment_infos << si
      end
    end

    # merge newly added segments in log(n) passes
    # Each pass merges runs of up to @merge_factor adjacent segments;
    # merge_segments shrinks @segment_infos, so the outer loop repeats
    # until the added segments fit within one merge window.
    # NOTE(review): merge_segments mutates @segment_infos while the inner
    # range is iterated; the range is recomputed each outer pass, so this
    # appears intentional — confirm against merge_segments' contract.
    while (@segment_infos.size > start + @merge_factor)
      (start+1 ... @segment_infos.size).each do |base|
        last = [@segment_infos.size(),  (base + @merge_factor)].min
        if (last - base > 1)
          merge_segments(base, last);
        end
      end
    end

    optimize() # final cleanup
  end
end

#add_indexes_readers(readers) ⇒ Object

Merges the provided indexes into this index. After this completes, the index is optimized. The provided IndexReaders are not closed.



228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# File 'lib/ferret/index/index_writer.rb', line 228

# Merges the indexes exposed by the given IndexReaders into this index,
# then optimizes it. The provided readers are not closed.
def add_indexes_readers(readers)
  synchronize() do
    segments_to_delete = []
    optimize() # start with zero or 1 seg

    merged_name = new_segment_name()
    merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)

    if (@segment_infos.size() == 1) # add existing index, if any
      s_reader = SegmentReader.get(@segment_infos[0])
      merger << s_reader
      # Remember the pre-merge segment's reader so its files can be
      # deleted once the merged segment is committed below.
      segments_to_delete << s_reader
    end

    readers.each do |reader|
      merger << reader
    end

    doc_count = merger.merge() # merge 'em

    @segment_infos.clear() # pop old infos & add new
    @segment_infos << SegmentInfo.new(merged_name, doc_count, @directory)

    # Commit the new segment list and delete the superseded segment under
    # the commit lock so concurrent readers never see a partial update.
    @directory.synchronize() do
      @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
        @segment_infos.write(@directory) # commit changes
        delete_segments(segments_to_delete)
      end
    end

    # Optionally repack the merged segment into a compound file; the
    # rename is done under the commit lock for the same reason as above.
    if @use_compound_file
      files_to_delete = merger.create_compound_file(merged_name + ".tmp")
      @directory.synchronize() do # in- & inter-process sync
        @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
          # make compound file visible for SegmentReaders
          @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
          # delete now unused files of segment
          delete_files_and_write_undeletable(files_to_delete)
        end
      end
    end

    optimize()
  end
end

#closeObject

Flushes all changes to an index and closes all associated files.



125
126
127
128
129
130
131
132
133
134
135
# File 'lib/ferret/index/index_writer.rb', line 125

# Flushes all buffered changes to the index and closes every associated
# resource: the RAM buffer, the write lock and (when :close_dir was set)
# the index directory itself.
def close()
  synchronize() do
    flush_ram_segments()
    @ram_directory.close()
    if @write_lock
      @write_lock.release() # give up the directory write lock
    end
    @write_lock = nil
    @directory.close() if @close_dir
  end
end

#doc_countObject

Returns the number of documents currently in this index.



138
139
140
141
142
143
144
# File 'lib/ferret/index/index_writer.rb', line 138

# Returns the total number of documents across every segment currently
# in the index.
def doc_count()
  synchronize() do
    total = 0
    @segment_infos.each { |info| total += info.doc_count() }
    return total
  end
end

#optimizeObject

Merges all segments together into a single segment, optimizing an index for search.



172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/ferret/index/index_writer.rb', line 172

# Merges all segments together into a single segment, optimizing the
# index for search.
def optimize()
  synchronize() do
    flush_ram_segments()
    # Keep merging while more than one segment remains, or while the one
    # remaining segment still needs rewriting because it has deletions,
    # lives in a different directory than the index, or does not match
    # the current compound-file configuration.
    while (@segment_infos.size() > 1 ||
            (@segment_infos.size() == 1 &&
              (SegmentReader.has_deletions?(@segment_infos[0]) ||
                (@segment_infos[0].directory != @directory) ||
                  (@use_compound_file &&
                    (!SegmentReader.uses_compound_file?(@segment_infos[0]) ||
                      SegmentReader.has_separate_norms?(@segment_infos[0]))))))
      # Merge at most @merge_factor trailing segments per iteration.
      min_segment = @segment_infos.size() - @merge_factor
      merge_segments(min_segment < 0 ? 0 : min_segment)
    end
  end
end

#segments_counterObject



166
167
168
# File 'lib/ferret/index/index_writer.rb', line 166

# The segment-name counter maintained by the SegmentInfos collection.
def segments_counter()
  segment_infos.counter
end