Class: FastRI::FullTextIndexer

Inherits:
Object
  • Object
show all
Defined in:
lib/fastri/full_text_indexer.rb

Constant Summary collapse

WORD_RE =
/[A-Za-z0-9_]+/
NONWORD_RE =
/[^A-Za-z0-9_]+/
MAGIC =
"FastRI full-text index #{FASTRI_FT_INDEX_FORMAT}\0"

Instance Method Summary collapse

Constructor Details

#initialize(max_querysize) ⇒ FullTextIndexer

Returns a new instance of FullTextIndexer.



13
14
15
16
17
# File 'lib/fastri/full_text_indexer.rb', line 13

# Creates an indexer with no documents registered.
#
# max_querysize:: kept as @max_wordsize; build_index uses it as the length
#                 of the text slice each suffix is sorted by.
def initialize(max_querysize)
  # @documents keeps names in insertion order; @doc_hash maps a name to its
  # [data, metadata] pair.
  @documents, @doc_hash = [], {}
  @max_wordsize = max_querysize
end

Instance Method Details

#add_document(name, data, metadata = {}) ⇒ Object



19
20
21
22
# File 'lib/fastri/full_text_indexer.rb', line 19

# Registers a document so that build_index will include it.
#
# name::     key the document is stored under; it is also appended to the
#            list of document names.
# data::     the document body; must respond to #size.
# metadata:: optional Hash of extra information. It is not modified: a
#            merged copy with an additional :size => data.size entry is
#            what gets stored.
#
# Adding a name a second time replaces the stored [data, metadata] pair;
# the name is appended again (the #documents reader removes duplicates).
#
# Returns the internal array of document names.
def add_document(name, data, metadata = {})
  @doc_hash[name] = [data, metadata.merge(:size => data.size)]
  @documents << name
end

#build_index(full_text_IO, suffix_array_IO) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/fastri/full_text_indexer.rb', line 62

# Writes the full-text file and the matching suffix array for every
# registered document.
#
# full_text_IO::    receives MAGIC followed by one record per document:
#                     data "\0" <len> name "\0" <marshalled metadata> "\0"
#                   where <len> is a 4-byte little-endian integer (pack "V")
#                   holding the number of bytes that follow it in the record.
# suffix_array_IO:: receives the byte offsets (into the full text) of every
#                   suffix reported by find_suffixes, packed as "V*" and
#                   ordered by the first @max_wordsize bytes at each offset.
#
# Document data is located by scanning up to the first "\0", so data is
# expected not to contain NUL bytes (presumably callers strip them with
# #preprocess first -- confirm against the callers).
#
# Returns nil.
def build_index(full_text_IO, suffix_array_IO)
  # An in-memory copy of everything written to full_text_IO. It is a binary
  # string so that lengths, scanner positions and slice offsets all count
  # bytes, whatever encoding the documents use.
  fulltext = String.new
  io = StringIO.new(fulltext)
  io.write MAGIC
  full_text_IO.write MAGIC
  documents.each do |doc|
    data, metadata = @doc_hash[doc]
    io.write(data)
    full_text_IO.write(data)
    # Assemble the trailer as raw bytes; mixing a non-ASCII name with the
    # binary Marshal output in one interpolated literal would raise.
    payload = "#{doc}\0".b << Marshal.dump(metadata) << "\0"
    footer = "\0".b << [payload.bytesize].pack("V") << payload
    io.write(footer)
    full_text_IO.write(footer)
  end

  scanner = StringScanner.new(fulltext)
  scanner.pos = MAGIC.bytesize # the buffer starts with MAGIC; step over it

  suffixes = []
  until scanner.eos?
    start = scanner.pos
    text = scanner.scan_until(/\0/)
    # text ends with the NUL terminator; index only the data before it.
    suffixes.concat find_suffixes(text[0..-2], start)
    # /m lets "." match "\n": a length such as 266 contains the byte 0x0A.
    len = scanner.scan(/..../m).unpack("V")[0]
    scanner.pos += len # skip name and metadata; they are not indexed
  end

  sorted = suffixes.sort_by { |pos| fulltext.byteslice(pos, @max_wordsize) }
  # Pack in chunks rather than building one huge string.
  sorted.each_slice(10000) { |chunk| suffix_array_IO.write chunk.pack("V*") }
  nil
end

#data(name) ⇒ Object



24
25
26
# File 'lib/fastri/full_text_indexer.rb', line 24

# Returns the data stored for the document +name+ (the first element of its
# [data, metadata] entry).
#
# Raises NoMethodError when no document was registered under +name+, since
# the lookup then yields nil.
def data(name)
  entry = @doc_hash[name]
  entry[0]
end

#documents ⇒ Object



28
29
30
# File 'lib/fastri/full_text_indexer.rb', line 28

# Returns the registered document names with duplicates removed, keeping
# the first occurrence of each.
#
# The de-duplicated list also replaces @documents, so the instance variable
# points at a new array after each call.
def documents
  unique_names = @documents.uniq
  @documents = unique_names
end

#find_suffixes(text, offset) ⇒ Object



37
38
39
# File 'lib/fastri/full_text_indexer.rb', line 37

# Returns the suffix offsets found in +text+, each shifted by +offset+.
#
# Delegates to find_suffixes_simple using the class-level word and
# non-word patterns (WORD_RE / NONWORD_RE).
def find_suffixes(text, offset)
  word_pattern, separator_pattern = WORD_RE, NONWORD_RE
  find_suffixes_simple(text, word_pattern, separator_pattern, offset)
end

#find_suffixes_simple(string, word_re, nonword_re, offset) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/fastri/full_text_indexer.rb', line 41

# Returns the offsets at which indexable suffixes of +string+ begin.
#
# string::     text to scan.
# word_re::    pattern matching one run of word characters.
# nonword_re:: pattern matching one run of separator characters.
# offset::     value added to every position, so the results can refer to
#              a larger buffer that +string+ was taken from.
#
# Positions come from StringScanner#pos and are therefore byte offsets.
#
# NOTE(review): the two patterns are expected to be complementary, as
# WORD_RE and NONWORD_RE are. A position that neither pattern matches is
# not handled and the scan would not advance past it.
def find_suffixes_simple(string, word_re, nonword_re, offset)
  suffixes = []
  sc = StringScanner.new(string)
  until sc.eos?
    sc.skip(nonword_re)
    # Test for the end with eos? rather than comparing the byte position
    # with string.size: size counts characters, so the two disagree as soon
    # as the text holds a multibyte character.
    until sc.eos?
      suffixes << offset + sc.pos
      break unless sc.skip(word_re)
      # Consume every separator run before the next word.
      loop { break unless sc.skip(nonword_re) }
    end
  end
  suffixes
end

#preprocess(str) ⇒ Object



32
33
34
# File 'lib/fastri/full_text_indexer.rb', line 32

# Returns a copy of +str+ with every NUL ("\0") character removed; +str+
# itself is left unchanged.
#
# build_index uses "\0" as its field terminator, so this is presumably
# meant to be applied to document text before indexing -- confirm against
# the callers.
def preprocess(str)
  nul_pattern = /\0/
  str.gsub(nul_pattern, "")
end