Class: Ferret::Index::SegmentReader

Inherits:
IndexReader show all
Defined in:
lib/ferret/index/segment_reader.rb

Overview

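SegmentReader is the IndexReader implementation for a single index segment. On construction it opens the segment's field infos, stored fields, term infos, frequency and proximity streams, norms, deletions (when a .del file exists) and term vectors (when any field stores them), reading through the segment's compound (.cfs) file when one exists.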

Defined Under Namespace

Classes: Norm

Constant Summary

Constants inherited from IndexReader

IndexReader::FILENAME_EXTENSIONS

Instance Attribute Summary collapse

Attributes inherited from IndexReader

#directory

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from IndexReader

#acquire_write_lock, #close, #commit, #delete, #delete_docs_with_term, get_current_version, #get_document_with_term, index_exists?, #latest?, open, #set_norm, #term_docs_for, #term_positions_for, #undelete_all

Constructor Details

#initialize(dir, info, seg_infos, close, owner) ⇒ SegmentReader

Returns a new instance of SegmentReader.



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/ferret/index/segment_reader.rb', line 14

# Opens a reader over one index segment.
#
# dir::       directory forwarded to IndexReader#initialize
# info::      segment info; +info.name+ becomes the file-name prefix of
#             every file opened here
# seg_infos:: forwarded to IndexReader#initialize
# close::     forwarded to IndexReader#initialize
# owner::     forwarded to IndexReader#initialize
def initialize(dir, info, seg_infos, close, owner)
  super(dir, seg_infos, close, owner)
  @segment = info.name

  # Per-segment files are read through the compound file when the segment
  # has one, otherwise straight from the reader's directory.
  @cfs_reader = nil
  dir = directory
  if SegmentReader.uses_compound_file?(info)
    @cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
    dir = @cfs_reader
  end

  @field_infos = FieldInfos.new(dir, @segment + '.fnm')
  @fields_reader = FieldsReader.new(dir, @segment, @field_infos)

  @term_infos = TermInfosReader.new(dir, @segment, @field_infos)

  # Pending-change flags consulted by do_commit. @undelete_all is set here
  # as well so that all three flags start out defined.
  @deleted_docs = nil
  @deleted_docs_dirty = false
  @undelete_all = false
  if SegmentReader.has_deletions?(info) then
    # the .del file is read from the plain directory, not from +dir+
    @deleted_docs =
      Ferret::Utils::BitVector.read(directory, @segment + '.del')
  end

  @freq_stream = dir.open_input(@segment + '.frq')
  @prox_stream = dir.open_input(@segment + '.prx')

  # @norms maps field name => Norm; it doubles as its own monitor
  @norms = {}
  @norms.extend(MonitorMixin)
  @norms_dirty = false
  open_norms(dir)

  # Only opened when at least one field stores term vectors.
  @tv_reader_orig = nil
  if @field_infos.has_vectors? then
    @tv_reader_orig = TermVectorsReader.new(dir, @segment, @field_infos)
  end
end

Instance Attribute Details

#deleted_docs ⇒ Object (readonly)

Returns the value of attribute deleted_docs.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @deleted_docs: the deletions bit vector, or nil when there is
# none.
def deleted_docs
  return @deleted_docs
end

#field_infos ⇒ Object (readonly)

Returns the value of attribute field_infos.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @field_infos.
def field_infos
  return @field_infos
end

#freq_stream ⇒ Object (readonly)

Returns the value of attribute freq_stream.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @freq_stream.
def freq_stream
  return @freq_stream
end

#prox_stream ⇒ Object (readonly)

Returns the value of attribute prox_stream.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @prox_stream.
def prox_stream
  return @prox_stream
end

#segment ⇒ Object (readonly)

Returns the value of attribute segment.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @segment, the segment name used as the file-name prefix.
def segment
  return @segment
end

#term_infos ⇒ Object (readonly)

Returns the value of attribute term_infos.



7
8
9
# File 'lib/ferret/index/segment_reader.rb', line 7

# Reader for @term_infos.
def term_infos
  return @term_infos
end

Class Method Details

.create_fake_norms(size) ⇒ Object



235
236
237
# File 'lib/ferret/index/segment_reader.rb', line 235

# Builds a packed string of +size+ bytes, each holding the value 1.
#
# size:: number of bytes to produce
def SegmentReader.create_fake_norms(size)
  ones = Array.new(size) { 1 }
  return ones.pack("C*")
end

.get(info, infos = nil, close = false) ⇒ Object



10
11
12
# File 'lib/ferret/index/segment_reader.rb', line 10

# Builds a SegmentReader for the segment described by +info+, reading from
# +info.directory+.
#
# info::  segment info
# infos:: optional segment infos; the reader is created as owner exactly
#         when this is not nil
# close:: forwarded to the constructor
def SegmentReader.get(info, infos = nil, close = false)
  owner = !infos.nil?
  SegmentReader.new(info.directory, info, infos, close, owner)
end

.has_deletions?(si) ⇒ Boolean

Returns:

  • (Boolean)


86
87
88
# File 'lib/ferret/index/segment_reader.rb', line 86

# Reports whether a "<segment name>.del" file exists in +si.directory+.
#
# si:: segment info responding to +name+ and +directory+
def SegmentReader.has_deletions?(si)
  deletions_file = si.name + ".del"
  si.directory.exists?(deletions_file)
end

.has_separate_norms?(si) ⇒ Boolean

Returns:

  • (Boolean)


99
100
101
102
# File 'lib/ferret/index/segment_reader.rb', line 99

# Reports whether +si.directory+ lists a separate norms file for the segment
# +si+, i.e. a file named "<segment name>.s<digit>...".
#
# The segment name is escaped and anchored at the start of the file name, and
# a digit is required after ".s", so only files of exactly this segment that
# follow the "<segment>.s<field number>" naming match.
#
# si:: segment info responding to +name+ and +directory+
# Returns true or false.
def SegmentReader.has_separate_norms?(si)
  # build the pattern once rather than once per directory entry
  norms_file = /\A#{Regexp.escape(si.name)}\.s\d/
  si.directory.each {|f| return true if f =~ norms_file}
  return false
end

.uses_compound_file?(si) ⇒ Boolean

Returns:

  • (Boolean)


95
96
97
# File 'lib/ferret/index/segment_reader.rb', line 95

# Reports whether a "<segment name>.cfs" file exists in +si.directory+.
#
# si:: segment info responding to +name+ and +directory+
def SegmentReader.uses_compound_file?(si)
  compound_file = si.name + ".cfs"
  si.directory.exists?(compound_file)
end

Instance Method Details

#close_norms ⇒ Object



308
309
310
311
312
# File 'lib/ferret/index/segment_reader.rb', line 308

# Closes the input stream (+is+) of every entry in @norms while holding the
# @norms monitor.
def close_norms
  @norms.synchronize do
    @norms.each_value do |entry|
      stream = entry.is
      stream.close
    end
  end
end

#deleted?(n) ⇒ Boolean

Returns:

  • (Boolean)


161
162
163
164
165
# File 'lib/ferret/index/segment_reader.rb', line 161

# Reports whether document +n+ is marked in @deleted_docs.
# Returns false when there is no deletions vector at all.
def deleted?(n)
  synchronize do
    return false if @deleted_docs == nil
    return @deleted_docs.get(n)
  end
end

#dir ⇒ Object



375
376
377
# File 'lib/ferret/index/segment_reader.rb', line 375

# Returns @directory.
def dir
  @directory
end

#do_close ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/ferret/index/segment_reader.rb', line 70

# Releases everything this reader opened: clears the thread-local
# "<object_id>-<segment>-tv_reader" entry, then closes the fields reader,
# term infos, frequency and proximity streams, norms, term vectors reader
# and compound file reader, in that order. Streams and readers that are
# unset are skipped.
def do_close
  # clear the cache
  cache_key = "#{self.object_id}-#{@segment}-tv_reader"
  Thread.current[cache_key] = nil

  [@fields_reader, @term_infos].each { |reader| reader.close }
  [@freq_stream, @prox_stream].each { |stream| stream.close if stream }

  close_norms

  @tv_reader_orig.close if @tv_reader_orig
  @cfs_reader.close if @cfs_reader
end

#do_commit ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/ferret/index/segment_reader.rb', line 50

# Writes pending changes to @directory and resets the pending-change flags.
#
# * dirty deletions are written to "<segment>.tmp", which is then renamed
#   to "<segment>.del"
# * after an undelete-all, an existing "<segment>.del" is removed
# * every dirty norm is re-written
def do_commit
  if @deleted_docs_dirty
    # write under a temporary name first, then rename over the .del file
    tmp_name = @segment + '.tmp'
    @deleted_docs.write(@directory, tmp_name)
    @directory.rename(tmp_name, @segment + '.del')
  end

  if @undelete_all
    del_name = @segment + '.del'
    @directory.delete(del_name) if @directory.exists?(del_name)
  end

  if @norms_dirty
    @norms.each_value do |norm|
      next unless norm.dirty?
      norm.re_write(@directory, @segment, max_doc, @cfs_reader)
    end
  end

  @deleted_docs_dirty = @norms_dirty = @undelete_all = false
end

#do_delete(doc_num) ⇒ Object



104
105
106
107
108
109
110
111
# File 'lib/ferret/index/segment_reader.rb', line 104

# Marks document +doc_num+ as deleted, creating the deletions bit vector on
# first use. Flags the deletions as dirty and cancels a pending
# undelete-all.
def do_delete(doc_num)
  @deleted_docs = Ferret::Utils::BitVector.new if @deleted_docs == nil
  @undelete_all = false
  @deleted_docs_dirty = true
  @deleted_docs.set(doc_num)
end

#do_set_norm(doc, field, value) ⇒ Object



258
259
260
261
262
263
264
265
266
267
# File 'lib/ferret/index/segment_reader.rb', line 258

# Stores +value+ as the norm of document +doc+ for +field+ and marks both
# the field's norm entry and the reader as dirty. Does nothing when @norms
# has no entry for +field+.
def do_set_norm(doc, field, value)
  entry = @norms[field]
  return if entry == nil                       # not an indexed field

  @norms_dirty = true
  entry.dirty = true                           # mark it dirty
  get_norms(field)[doc] = value                # set the value
end

#do_undelete_all ⇒ Object



113
114
115
116
117
# File 'lib/ferret/index/segment_reader.rb', line 113

# Drops the in-memory deletions vector and records that an undelete-all is
# pending.
def do_undelete_all
  @deleted_docs_dirty = false
  @deleted_docs = nil
  @undelete_all = true
end

#doc_freq(t) ⇒ Object



175
176
177
178
179
180
181
182
# File 'lib/ferret/index/segment_reader.rb', line 175

# Returns the +doc_freq+ of the term info that @term_infos holds for term
# +t+, or 0 when it has none.
def doc_freq(t)
  info = @term_infos.get_term_info(t)
  return 0 if info == nil
  info.doc_freq
end

#fake_norms ⇒ Object



239
240
241
# File 'lib/ferret/index/segment_reader.rb', line 239

# Returns the placeholder norms for this segment, built on first use by
# SegmentReader.create_fake_norms(max_doc) and cached in @ones.
def fake_norms
  @ones = SegmentReader.create_fake_norms(max_doc) unless @ones
  @ones
end

#file_names ⇒ Object



119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/ferret/index/segment_reader.rb', line 119

# Lists the files of this segment that exist in @directory.
#
# First every "<segment>.<ext>" for the extensions in
# IndexFileNames::INDEX_EXTENSIONS, then one norms file per indexed field
# that does not omit norms: "<segment>.f<i>" without a compound file reader,
# "<segment>.s<i>" with one, where +i+ is the field's position in
# @field_infos.
def file_names
  found = []

  IndexFileNames::INDEX_EXTENSIONS.each do |ext|
    candidate = @segment + "." + ext
    found << candidate if @directory.exists?(candidate)
  end

  norm_marker = @cfs_reader.nil? ? "f" : "s"
  @field_infos.each_with_index do |fi, i|
    next unless fi.indexed? && !fi.omit_norms?
    candidate = "#{@segment}.#{norm_marker}#{i}"
    found << candidate if @directory.exists?(candidate)
  end

  found
end

#get_document(n) ⇒ Object



152
153
154
155
156
157
158
159
# File 'lib/ferret/index/segment_reader.rb', line 152

# Returns document +n+ from the fields reader.
#
# Raises ArgumentError when document +n+ is deleted.
def get_document(n)
  synchronize do
    raise ArgumentError, "attempt to access a deleted document" if deleted?(n)
    return @fields_reader.doc(n)
  end
end

#get_field_names(field_option = IndexReader::FieldOption::ALL) ⇒ Object

See IndexReader#get_field_names



197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/ferret/index/segment_reader.rb', line 197

# Collects into a Set the names of the fields in @field_infos that match
# +field_option+ (one of the IndexReader::FieldOption constants; ALL by
# default).
#
# The tests are tried in the order listed below and stop at the first one
# that holds for the field.
def get_field_names(field_option = IndexReader::FieldOption::ALL)
  options = IndexReader::FieldOption
  names = Set.new

  @field_infos.each do |fi|
    wanted =
      (field_option == options::ALL) ||
      (!fi.indexed? && field_option == options::UNINDEXED) ||
      (fi.indexed? && field_option == options::INDEXED) ||
      (fi.indexed? && fi.store_term_vector? == false &&
       field_option == options::INDEXED_NO_TERM_VECTOR) ||
      (fi.store_term_vector? == true &&
       fi.store_positions? == false &&
       fi.store_offsets? == false &&
       field_option == options::TERM_VECTOR) ||
      (fi.indexed? && fi.store_term_vector? &&
       field_option == options::INDEXED_WITH_TERM_VECTOR) ||
      (fi.store_positions? && fi.store_offsets? == false &&
       field_option == options::TERM_VECTOR_WITH_POSITION) ||
      (fi.store_offsets? && fi.store_positions? == false &&
       field_option == options::TERM_VECTOR_WITH_OFFSET) ||
      (fi.store_offsets? && fi.store_positions? &&
       field_option == options::TERM_VECTOR_WITH_POSITION_OFFSET)

    names.add(fi.name) if wanted
  end

  names
end

#get_norms(field) ⇒ Object



243
244
245
246
247
248
249
250
251
252
253
254
255
256
# File 'lib/ferret/index/segment_reader.rb', line 243

# Returns the norm bytes for +field+, or nil when @norms has no entry for
# it. The bytes are read via get_norms_into on first access and cached on
# the norm entry afterwards.
def get_norms(field)
  synchronize do
    entry = @norms[field]
    return nil if entry == nil       # not an indexed field or omit norms

    if entry.bytes == nil            # value not yet read
      buffer = " " * max_doc
      get_norms_into(field, buffer, 0)
      entry.bytes = buffer           # cache it
    end
    return entry.bytes
  end
end

#get_norms_into(field, bytes, offset) ⇒ Object

Read norms into a pre-allocated array.



270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# File 'lib/ferret/index/segment_reader.rb', line 270

# Copies max_doc norm bytes for +field+ into +bytes+, starting at +offset+.
#
# * no entry in @norms for the field: the bytes come from fake_norms
# * entry with cached bytes: the bytes are copied from that cache
# * otherwise: the bytes are read from a clone of the entry's input stream,
#   which is closed again afterwards
def get_norms_into(field, bytes, offset)
  synchronize do
    entry = @norms[field]

    if entry.nil?
      bytes[offset, max_doc] = fake_norms[0, max_doc]
      return
    end

    cached = entry.bytes
    unless cached == nil # can copy from cache
      bytes[offset, max_doc] = cached[0, max_doc]
      return
    end

    stream = entry.is.clone
    begin # read from disk
      stream.seek(0)
      stream.read_bytes(bytes, offset, max_doc)
    ensure
      stream.close
    end
  end
end

#get_term_vector(doc_number, field) ⇒ Object

Return a term frequency vector for the specified document and field. The vector returned contains term numbers and frequencies for all terms in the specified field of this document, if the field had storeTermVector flag set. If the flag was not set, the method returns nil.

raises

IOException



343
344
345
346
347
348
349
350
351
352
353
354
355
# File 'lib/ferret/index/segment_reader.rb', line 343

# Returns the term vector of +field+ in document +doc_number+.
#
# Returns nil when the field is unknown, does not store term vectors, no
# term vectors reader was opened, or no reader could be obtained.
def get_term_vector(doc_number, field)
  fi = @field_infos[field]
  return nil if fi.nil?
  return nil unless fi.store_term_vector?
  return nil if @tv_reader_orig.nil?

  reader = get_term_vectors_reader
  return nil if reader == nil
  reader.get_field_tv(doc_number, field)
end

#get_term_vectors(doc_number) ⇒ Object

Return an array of term frequency vectors for the specified document. The array contains a vector for each vectorized field in the document. Each vector contains term numbers and frequencies for all terms in a given vectorized field. If no such fields existed, the method returns nil.

raises

IOException



364
365
366
367
368
369
370
371
372
373
# File 'lib/ferret/index/segment_reader.rb', line 364

# Returns the term vectors of document +doc_number+, or nil when no term
# vectors reader was opened or none could be obtained.
def get_term_vectors(doc_number)
  return nil if @tv_reader_orig.nil?

  reader = get_term_vectors_reader
  return nil if reader == nil
  reader.get_tv(doc_number)
end

#get_term_vectors_reader ⇒ Object

Create a clone from the initial TermVectorsReader and store it in the Thread

returns

TermVectorsReader



317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# File 'lib/ferret/index/segment_reader.rb', line 317

# Returns the current thread's term vectors reader for this segment reader.
#
# The reader is looked up with Thread#get_local keyed by this object; on a
# miss, @tv_reader_orig is cloned and the clone is stored with
# Thread#set_local before being returned.
#
# (An earlier version kept the clones in a per-thread WeakKeyHash stored
# under Thread.current["tv_reader"].)
def get_term_vectors_reader
  thread = Thread.current
  cached = thread.get_local(self)
  return cached unless cached.nil?

  fresh = @tv_reader_orig.clone
  thread.set_local(self, fresh)
  return fresh
end

#has_deletions? ⇒ Boolean

Returns:

  • (Boolean)


90
91
92
# File 'lib/ferret/index/segment_reader.rb', line 90

# Reports whether this reader currently holds a deletions vector.
def has_deletions?
  !@deleted_docs.nil?
end

#has_norms?(field) ⇒ Boolean

Returns:

  • (Boolean)


231
232
233
# File 'lib/ferret/index/segment_reader.rb', line 231

# Reports whether @norms has an entry for +field+.
def has_norms?(field)
  @norms.key?(field)
end

#max_doc ⇒ Object



192
193
194
# File 'lib/ferret/index/segment_reader.rb', line 192

# Returns the size reported by the fields reader.
def max_doc
  @fields_reader.size
end

#num_docs ⇒ Object



184
185
186
187
188
189
190
# File 'lib/ferret/index/segment_reader.rb', line 184

# Returns max_doc minus the count of the deletions vector, or max_doc itself
# when there is no deletions vector.
def num_docs
  total = max_doc
  return total if @deleted_docs == nil
  total - @deleted_docs.count
end

#open_norms(cfs_dir) ⇒ Object



293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'lib/ferret/index/segment_reader.rb', line 293

# Opens a Norm for every indexed field that does not omit norms and stores
# it in @norms under the field name.
#
# A separate norms file "<segment>.s<number>" in @directory is preferred;
# when it does not exist, "<segment>.f<number>" is opened from +cfs_dir+.
#
# cfs_dir:: directory-like object used for the ".f<number>" files
def open_norms(cfs_dir)
  @field_infos.each do |fi|
    next unless fi.indexed? && !fi.omit_norms?

    # look first if there are separate norms in compound format
    separate = @segment + ".s" + fi.number.to_s
    if @directory.exists?(separate)
      source, norm_file = @directory, separate
    else
      source, norm_file = cfs_dir, @segment + ".f" + fi.number.to_s
    end
    @norms[fi.name] = Norm.new(source.open_input(norm_file), fi.number)
  end
end

#term_docs ⇒ Object



167
168
169
# File 'lib/ferret/index/segment_reader.rb', line 167

# Returns a new SegmentTermDocEnum built on this reader.
def term_docs
  SegmentTermDocEnum.new(self)
end

#term_positions ⇒ Object



171
172
173
# File 'lib/ferret/index/segment_reader.rb', line 171

# Returns a new SegmentTermDocPosEnum built on this reader.
def term_positions
  SegmentTermDocPosEnum.new(self)
end

#terms ⇒ Object



144
145
146
# File 'lib/ferret/index/segment_reader.rb', line 144

# Delegates to @term_infos.terms.
def terms
  @term_infos.terms
end

#terms_from(t) ⇒ Object



148
149
150
# File 'lib/ferret/index/segment_reader.rb', line 148

# Delegates to @term_infos.terms_from(t).
def terms_from(t)
  @term_infos.terms_from(t)
end