Class: Ferret::Index::SegmentReader

Inherits:
IndexReader show all
Defined in:
lib/ferret/index/segment_reader.rb

Overview

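An IndexReader over a single index segment. On construction it opens the segment's field infos, stored fields, term infos, frequency and proximity streams and norms (through the compound file when `<segment>.cfs` exists), loads the deleted-documents bit vector when `<segment>.del` exists, and opens a term vectors reader when any field stores term vectors.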

Defined Under Namespace

Classes: Norm

Constant Summary

Constants inherited from IndexReader

IndexReader::FILENAME_EXTENSIONS

Instance Attribute Summary collapse

Attributes inherited from IndexReader

#directory

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from IndexReader

#acquire_write_lock, #close, #commit, #delete, #delete_docs_with_term, get_current_version, #get_document_with_term, index_exists?, #latest?, open, #set_norm, #term_docs_for, #term_positions_for, #undelete_all

Constructor Details

#initialize(dir, info, seg_infos, close, owner) ⇒ SegmentReader

Returns a new instance of SegmentReader.



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/ferret/index/segment_reader.rb', line 14

# Opens the files of one segment and wires up the readers used by the rest
# of this class.
#
# dir, seg_infos, close and owner are passed straight to the superclass
# constructor. info supplies the segment name (info.name) and is also handed
# to SegmentReader.has_deletions?.
def initialize(dir, info, seg_infos, close, owner)
  super(dir, seg_infos, close, owner)
  @segment = info.name

  # Read through the compound file when "<segment>.cfs" exists; otherwise
  # read the individual files directly from the directory.
  @cfs_reader = nil
  cfs = directory
  if directory.exists?(@segment + '.cfs') then
    @cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
    cfs = @cfs_reader
  end

  @field_infos = FieldInfos.new(cfs, @segment + '.fnm')
  @fields_reader = FieldsReader.new(cfs, @segment, @field_infos)

  @term_infos = TermInfosReader.new(cfs, @segment, @field_infos)
  # @deleted_docs stays nil unless a ".del" file exists for the segment.
  # Note that the bit vector is read from the plain directory, not from cfs.
  @deleted_docs = nil
  @deleted_docs_dirty = false
  if SegmentReader.has_deletions?(info) then
    @deleted_docs =
      Ferret::Utils::BitVector.read(directory, @segment + '.del')
  end

  @freq_stream = cfs.open_input(@segment + '.frq')
  @prox_stream = cfs.open_input(@segment + '.prx')
  # The norms hash is given a monitor so it can be locked with
  # @norms.synchronize (see close_norms).
  @norms = {}
  @norms.extend(MonitorMixin)
  @norms_dirty = false
  open_norms(cfs)

  # Only opened when at least one field stores term vectors; nil otherwise.
  @tv_reader_orig = nil
  if @field_infos.has_vectors? then
    @tv_reader_orig = TermVectorsReader.new(cfs, @segment, @field_infos)
  end
end

Instance Attribute Details

#deleted_docsObject (readonly)

Returns the value of attribute deleted_docs.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @deleted_docs; nil when no deletions are recorded.
def deleted_docs()
  return @deleted_docs
end

#field_infosObject (readonly)

Returns the value of attribute field_infos.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @field_infos.
def field_infos()
  return @field_infos
end

#freq_streamObject (readonly)

Returns the value of attribute freq_stream.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @freq_stream.
def freq_stream()
  return @freq_stream
end

#prox_streamObject (readonly)

Returns the value of attribute prox_stream.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @prox_stream.
def prox_stream()
  return @prox_stream
end

#segmentObject (readonly)

Returns the value of attribute segment.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @segment, the segment name.
def segment()
  return @segment
end

#term_infosObject (readonly)

Returns the value of attribute term_infos.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @term_infos.
def term_infos()
  return @term_infos
end

Class Method Details

.get(info, infos = nil, close = false) ⇒ Object



10
11
12
# File 'lib/ferret/index/segment_reader.rb', line 10

# Builds a SegmentReader for +info+, reading from info.directory.
# The reader's owner flag is true exactly when +infos+ is not nil.
def SegmentReader.get(info, infos = nil, close = false)
  owner = (infos != nil)
  SegmentReader.new(info.directory, info, infos, close, owner)
end

.has_deletions?(si) ⇒ Boolean

Returns:

  • (Boolean)


85
86
87
# File 'lib/ferret/index/segment_reader.rb', line 85

# True when the segment's directory contains a "<name>.del" file.
def SegmentReader.has_deletions?(si)
  segment_dir = si.directory
  segment_dir.exists?(si.name + ".del")
end

.has_separate_norms?(si) ⇒ Boolean

Returns:

  • (Boolean)


98
99
100
101
# File 'lib/ferret/index/segment_reader.rb', line 98

# Reports whether any entry yielded by si.directory.each starts with
# "<si.name>.s".
#
# The segment name is escaped before it is interpolated, so regexp
# metacharacters in it are matched literally. \A anchors the match to the
# start of the file name rather than to the start of any line. The pattern
# is built once instead of once per directory entry.
def SegmentReader.has_separate_norms?(si)
  pattern = /\A#{Regexp.escape(si.name.to_s)}\.s/
  si.directory.each {|f| return true if f =~ pattern}
  return false
end

.uses_compound_file?(si) ⇒ Boolean

Returns:

  • (Boolean)


94
95
96
# File 'lib/ferret/index/segment_reader.rb', line 94

# True when the segment's directory contains a "<name>.cfs" file.
def SegmentReader.uses_compound_file?(si)
  segment_dir = si.directory
  segment_dir.exists?(si.name + ".cfs")
end

Instance Method Details

#close_normsObject



292
293
294
295
296
# File 'lib/ferret/index/segment_reader.rb', line 292

# Closes the input stream of every norm while holding the @norms monitor.
def close_norms
  @norms.synchronize do
    @norms.each_value do |entry|
      stream = entry.is
      stream.close
    end
  end
end

#deleted?(n) ⇒ Boolean

Returns:

  • (Boolean)


160
161
162
163
164
# File 'lib/ferret/index/segment_reader.rb', line 160

# Answers whether document +n+ is flagged in @deleted_docs. Returns false
# when no deletions are recorded, otherwise whatever @deleted_docs.get(n)
# returns. Runs inside synchronize.
def deleted?(n)
  synchronize do
    return false if @deleted_docs == nil
    return @deleted_docs.get(n)
  end
end

#dirObject



359
360
361
# File 'lib/ferret/index/segment_reader.rb', line 359

# Reader for @directory.
def dir
  @directory
end

#do_closeObject



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/ferret/index/segment_reader.rb', line 69

# Releases the resources held by this reader: drops the calling thread's
# cached term vectors reader, then closes the fields reader, term infos,
# frequency/proximity streams, norms, term vectors reader and compound file
# reader (the optional ones only when present).
def do_close()
  # Legacy cache slot. Nothing visible in this class writes this key any
  # more, but it is still cleared in case a value was stored under it.
  Thread.current["#{self.object_id}-#{@segment}-tv_reader"] = nil
  # get_term_vectors_reader caches its clone with set_local(self, ...), so
  # this is the slot that actually has to be cleared. Only the calling
  # thread's entry is reset.
  Thread.current.set_local(self, nil)

  @fields_reader.close()
  @term_infos.close()

  @freq_stream.close() if @freq_stream
  @prox_stream.close() if @prox_stream

  close_norms()

  @tv_reader_orig.close() if @tv_reader_orig
  # Closed last; the readers above may have been opened through it.
  @cfs_reader.close() if @cfs_reader
end

#do_commitObject



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/ferret/index/segment_reader.rb', line 49

# Writes pending changes to @directory and resets the dirty flags.
#
# * dirty deletions: written to "<segment>.tmp", then renamed to
#   "<segment>.del"
# * undelete-all requested: an existing "<segment>.del" is removed
# * dirty norms: each norm reporting dirty? is re-written
def do_commit
  if @deleted_docs_dirty
    tmp_name = @segment + '.tmp'
    @deleted_docs.write(@directory, tmp_name)
    @directory.rename(tmp_name, @segment + '.del')
  end

  if @undelete_all
    del_name = @segment + '.del'
    @directory.delete(del_name) if @directory.exists?(del_name)
  end

  if @norms_dirty
    @norms.each_value do |entry|
      next unless entry.dirty?
      entry.re_write(@directory, @segment, max_doc, @cfs_reader)
    end
  end

  @deleted_docs_dirty = @norms_dirty = @undelete_all = false
end

#do_delete(doc_num) ⇒ Object



103
104
105
106
107
108
109
110
# File 'lib/ferret/index/segment_reader.rb', line 103

# Flags +doc_num+ as deleted, creating the bit vector on first use. Marks
# the deletions dirty and cancels any pending undelete-all. Returns the
# result of the bit vector's set call.
def do_delete(doc_num)
  @deleted_docs = Ferret::Utils::BitVector.new if @deleted_docs == nil
  @undelete_all = false
  @deleted_docs_dirty = true
  @deleted_docs.set(doc_num)
end

#do_set_norm(doc, field, value) ⇒ Object



245
246
247
248
249
250
251
252
253
254
# File 'lib/ferret/index/segment_reader.rb', line 245

# Stores +value+ as the norm of document +doc+ for +field+ and marks both
# the norm and the reader dirty. Does nothing (returns nil) when there is no
# norm for the field.
def do_set_norm(doc, field, value)
  entry = @norms[field]
  return if entry.nil?              # no norm registered for this field

  entry.dirty = true
  @norms_dirty = true

  norm_bytes = get_norms(field)
  norm_bytes[doc] = value
end

#do_undelete_allObject



112
113
114
115
116
# File 'lib/ferret/index/segment_reader.rb', line 112

# Forgets all in-memory deletions and records that an undelete-all is
# pending (do_commit reads @undelete_all).
def do_undelete_all
  @deleted_docs_dirty = false
  @deleted_docs = nil
  @undelete_all = true
end

#doc_freq(t) ⇒ Object



174
175
176
177
178
179
180
181
# File 'lib/ferret/index/segment_reader.rb', line 174

# Document frequency of term +t+ as reported by its term info; 0 when the
# term infos reader has no entry for the term.
def doc_freq(t)
  info = @term_infos.get_term_info(t)
  info.nil? ? 0 : info.doc_freq
end

#file_namesObject



118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/ferret/index/segment_reader.rb', line 118

# Lists the files of this segment that exist in @directory: first one
# candidate per IndexFileNames::INDEX_EXTENSIONS entry, then one norm file
# per indexed field (".f<i>" without a compound file reader, ".s<i>" with
# one), where i is the field's position in @field_infos.
def file_names
  found = []

  IndexFileNames::INDEX_EXTENSIONS.each do |ext|
    candidate = @segment + "." + ext
    found << candidate if @directory.exists?(candidate)
  end

  norm_prefix = @cfs_reader.nil? ? ".f" : ".s"
  @field_infos.each_with_index do |fi, i|
    next unless fi.indexed?
    candidate = @segment + norm_prefix + i.to_s
    found << candidate if @directory.exists?(candidate)
  end

  found
end

#get_document(n) ⇒ Object



151
152
153
154
155
156
157
158
# File 'lib/ferret/index/segment_reader.rb', line 151

# Fetches document +n+ from the fields reader, inside synchronize.
# Raises ArgumentError when deleted?(n) is true.
def get_document(n)
  synchronize do
    raise ArgumentError, "attempt to access a deleted document" if deleted?(n)
    return @fields_reader.doc(n)
  end
end

#get_field_names(field_option = IndexReader::FieldOption::ALL) ⇒ Object

See IndexReader#get_field_names



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/ferret/index/segment_reader.rb', line 196

# Collects, into a Set, the names of the fields in @field_infos that match
# +field_option+ (one of the IndexReader::FieldOption constants).
#
# The rules are tried in the order written and stop at the first that
# holds; every rule leads to the same action, adding the field name.
def get_field_names(field_option = IndexReader::FieldOption::ALL)
  names = Set.new
  @field_infos.each do |fi|
    wanted =
      (field_option == IndexReader::FieldOption::ALL) ||
      (!fi.indexed? && field_option == IndexReader::FieldOption::UNINDEXED) ||
      (fi.indexed? && field_option == IndexReader::FieldOption::INDEXED) ||
      (fi.indexed? && fi.store_term_vector? == false &&
       field_option == IndexReader::FieldOption::INDEXED_NO_TERM_VECTOR) ||
      (fi.store_term_vector? == true &&
       fi.store_positions? == false &&
       fi.store_offsets? == false &&
       field_option == IndexReader::FieldOption::TERM_VECTOR) ||
      (fi.indexed? && fi.store_term_vector? &&
       field_option == IndexReader::FieldOption::INDEXED_WITH_TERM_VECTOR) ||
      (fi.store_positions? && fi.store_offsets? == false &&
       field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION) ||
      (fi.store_offsets? && fi.store_positions? == false &&
       field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET) ||
      (fi.store_offsets? && fi.store_positions? &&
       field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET)
    names.add(fi.name) if wanted
  end
  names
end

#get_norms(field) ⇒ Object



230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/ferret/index/segment_reader.rb', line 230

# Returns the norm bytes for +field+, or nil when there is no norm for it.
# On first access the bytes are read via get_norms_into into a buffer of
# max_doc characters and cached on the norm. Runs inside synchronize.
def get_norms(field)
  synchronize do
    entry = @norms[field]
    return nil if entry.nil?       # no norm registered for this field

    if entry.bytes.nil?            # nothing cached yet
      buffer = " " * max_doc
      get_norms_into(field, buffer, 0)
      entry.bytes = buffer
    end
    return entry.bytes
  end
end

#get_norms_into(field, bytes, offset) ⇒ Object

Read norms into a pre-allocated array.



257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# File 'lib/ferret/index/segment_reader.rb', line 257

# Copies the norms of +field+ into +bytes+, starting at +offset+.
#
# When there is no norm for the field, +bytes+ is left untouched. Cached
# norm bytes are copied directly. Otherwise max_doc bytes are read from the
# start of a clone of the norm's input stream, and the clone is always
# closed afterwards. Runs inside synchronize.
def get_norms_into(field, bytes, offset)
  synchronize do
    entry = @norms[field]
    return if entry.nil?

    unless entry.bytes.nil?
      count = max_doc
      bytes[offset, count] = entry.bytes[0, count]
      return
    end

    stream = entry.is.clone
    begin
      stream.seek(0)
      stream.read_bytes(bytes, offset, max_doc)
    ensure
      stream.close
    end
  end
end

#get_term_vector(doc_number, field) ⇒ Object

Return a term frequency vector for the specified document and field. The vector returned contains term numbers and frequencies for all terms in the specified field of this document, if the field had storeTermVector flag set. If the flag was not set, the method returns nil.

raises

IOException



327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/ferret/index/segment_reader.rb', line 327

# Term vector of +field+ in document +doc_number+. Returns nil when the
# field is unknown, does not store term vectors, or no term vectors reader
# is available.
def get_term_vector(doc_number, field)
  info = @field_infos[field]
  return nil if info.nil?
  return nil unless info.store_term_vector?
  return nil if @tv_reader_orig.nil?

  reader = get_term_vectors_reader
  return nil if reader == nil
  reader.get_field_tv(doc_number, field)
end

#get_term_vectors(doc_number) ⇒ Object

Return an array of term frequency vectors for the specified document. The array contains a vector for each vectorized field in the document. Each vector contains term numbers and frequencies for all terms in a given vectorized field. If no such fields existed, the method returns nil.

raises

IOException



348
349
350
351
352
353
354
355
356
357
# File 'lib/ferret/index/segment_reader.rb', line 348

# All term vectors of document +doc_number+, or nil when no term vectors
# reader is available.
def get_term_vectors(doc_number)
  return nil if @tv_reader_orig.nil?

  reader = get_term_vectors_reader
  return nil if reader == nil
  reader.get_tv(doc_number)
end

#get_term_vectors_readerObject

Create a clone from the initial TermVectorsReader and store it in the Thread

returns

TermVectorsReader



301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# File 'lib/ferret/index/segment_reader.rb', line 301

# Returns the calling thread's TermVectorsReader for this segment reader.
# The first call in a thread clones @tv_reader_orig and stores the clone
# with Thread#set_local, keyed by self; later calls in that thread return
# the stored clone.
def get_term_vectors_reader
  cached = Thread.current.get_local(self)
  return cached unless cached.nil?

  fresh = @tv_reader_orig.clone
  Thread.current.set_local(self, fresh)
  fresh
end

#has_deletions?Boolean

Returns:

  • (Boolean)


89
90
91
# File 'lib/ferret/index/segment_reader.rb', line 89

# True when a deleted-documents bit vector is present on this reader.
def has_deletions?
  !@deleted_docs.nil?
end

#max_docObject



191
192
193
# File 'lib/ferret/index/segment_reader.rb', line 191

# Size reported by the fields reader.
def max_doc
  @fields_reader.size
end

#num_docsObject



183
184
185
186
187
188
189
# File 'lib/ferret/index/segment_reader.rb', line 183

# max_doc minus the count of the deleted-documents bit vector (when one
# exists).
def num_docs
  total = max_doc
  return total if @deleted_docs.nil?
  total - @deleted_docs.count
end

#open_norms(cfs_dir) ⇒ Object



277
278
279
280
281
282
283
284
285
286
287
288
289
290
# File 'lib/ferret/index/segment_reader.rb', line 277

# Registers a Norm in @norms for every indexed field. A separate norms file
# "<segment>.s<number>" in @directory takes precedence; otherwise
# "<segment>.f<number>" is opened from +cfs_dir+.
def open_norms(cfs_dir)
  @field_infos.each do |fi|
    next unless fi.indexed?

    separate = @segment + ".s" + fi.number.to_s
    if @directory.exists?(separate)
      source, norm_file = @directory, separate
    else
      source, norm_file = cfs_dir, @segment + ".f" + fi.number.to_s
    end
    @norms[fi.name] = Norm.new(source.open_input(norm_file), fi.number)
  end
end

#term_docsObject



166
167
168
# File 'lib/ferret/index/segment_reader.rb', line 166

# Creates a SegmentTermDocEnum for this reader.
def term_docs
  SegmentTermDocEnum.new(self)
end

#term_positionsObject



170
171
172
# File 'lib/ferret/index/segment_reader.rb', line 170

# Creates a SegmentTermDocPosEnum for this reader.
def term_positions
  SegmentTermDocPosEnum.new(self)
end

#termsObject



143
144
145
# File 'lib/ferret/index/segment_reader.rb', line 143

# Delegates to the term infos reader's terms.
def terms
  @term_infos.terms
end

#terms_from(t) ⇒ Object



147
148
149
# File 'lib/ferret/index/segment_reader.rb', line 147

# Delegates to the term infos reader's terms_from with +t+.
def terms_from(t)
  @term_infos.terms_from(t)
end