Class: Indexer101

Inherits:
Object
  • Object
show all
Defined in:
lib/indexer101.rb

Defined Under Namespace

Classes: Index

Instance Method Summary collapse

Constructor Details

#initialize(filename = 'indexer.dat', debug: false) ⇒ Indexer101

Returns a new instance of Indexer101.



71
72
73
74
75
76
77
78
79
80
81
# File 'lib/indexer101.rb', line 71

# Creates an indexer holding a new, empty Index and prints a start-up
# banner to stdout.
#
# @param filename [String] default file name used by #read and #save
# @param debug [Boolean] stored in @debug; enables extra debug output
def initialize(filename='indexer.dat', debug: false)

  @filename = filename
  @debug = debug

  # banner line, with a blank line printed before and after it
  banner = 'Indexer101'.highlight + " ready to index".green
  puts
  puts banner
  puts

  @indexer = Index.new

end

Instance Method Details

#build(a = @indexer.index.keys) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
# File 'lib/indexer101.rb', line 83

# Asks the underlying Index to build itself from the given keywords, then
# prints the number of words and the elapsed time to stdout.
#
# @param a [Array] keywords passed to the Index; defaults to every key
#   currently held in the index
# @return [self]
def build(a=@indexer.index.keys)

  started = Time.now
  @indexer.build(a)
  elapsed = Time.now - started

  word_count = a.length
  puts "%d words indexed".info % word_count

  timing = "index built in " + ("%.3f" % elapsed).brown + " seconds"
  puts timing.info

  self
end

#index ⇒ Object



95
96
97
# File 'lib/indexer101.rb', line 95

# Reader for the keyword index held by the underlying Index object.
#
# @return [Object] whatever @indexer.index returns
def index
  @indexer.index
end

#lookup(s, limit: 10) ⇒ Object

Enter a few starting characters and lookup will suggest matching keywords; useful for an auto-suggest feature.



208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/indexer101.rb', line 208

# Suggests keywords for the starting characters in +s+.
#
# The characters are turned into a key path (scan_path), the path is
# followed as far as scan_key reports, and the node found there in
# @indexer.h decides the result: an Array node is returned as it is,
# anything else has its leaves collected, shortest first, cut to +limit+.
#
# @param s [String] the characters typed so far
# @param limit [Integer] maximum number of suggestions taken from the leaves
# @return [Array] the suggestions
def lookup(s, limit: 10)

  started = Time.now

  path = scan_path s
  puts ('a: ' + path.inspect[0..100] + '...').debug if @debug

  # index of the last path element to descend through
  depth = scan_key @indexer.h, path

  node = @indexer.h.dig(*path[0..depth])
  puts ('r: ' + node.inspect[0..100] + '...').debug if @debug

  # an Array node is handed back untouched: no limit, no timing message
  return node if node.is_a? Array

  suggestions = scan_leaves(node).sort_by(&:length).take(limit)

  elapsed = Time.now - started
  puts ("lookup took " + ("%.3f" % elapsed).brown + " seconds").info

  suggestions

end

#read(filename = @filename) ⇒ Object



99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/indexer101.rb', line 99

# Loads a previously saved index from a file, replacing the current one,
# then prints the word count and the load time to stdout.
#
# NOTE: Marshal.load can instantiate arbitrary objects, so only read index
# files that come from a trusted source.
#
# @param filename [String] path of the Marshal file to load
def read(filename=@filename)

  t = Time.now

  # Marshal data is binary: 'rb' prevents newline/encoding translation
  File.open(filename, 'rb') do |f|
    @indexer = Marshal.load(f)
  end

  t2 = Time.now - t

  puts "index contains %d words".info % @indexer.index.length

  # parenthesised so .info applies to the whole message, not just " seconds"
  puts ("index read in " + ("%.2f" % t2).brown + " seconds").info

end

#save(filename = @filename) ⇒ Object



114
115
116
117
118
119
120
# File 'lib/indexer101.rb', line 114

# Serialises the current index to a file with Marshal.
#
# @param filename [String] path of the file to write; it is created, or
#   truncated if it already exists
def save(filename=@filename)

  # Marshal output is binary: 'wb' prevents newline/encoding translation
  File.open(filename, 'wb') do |f|
    Marshal.dump(@indexer, f)
  end

end

#scan_dxindex(*locations, level: 0) ⇒ Object

scan levels: 0 = tags only; 1 = all words in title (including tags)



124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/indexer101.rb', line 124

# Loads one or more Dynarex/DxLite documents and adds their records to
# the index and the uri_index.
#
# scan levels: 0 = tags only; 1 = all words in title (including tags).
# Any other level only fills the uri_index.
#
# Every record gets a numeric id, counted from 1 on each call, assigned in
# reverse document order. The uri_index maps that id to
# [created time, title, url]; the index maps each downcased keyword symbol
# to the ids of the records containing it.
#
# NOTE(review): because ids restart at 1 on every call, a second call
# appears to reuse the ids of an earlier one - confirm whether this method
# is only meant to be called once per index.
#
# @param locations [Array] Dynarex or DxLite objects, or String locations
#   ending in .xml (opened with Dynarex) or .json (opened with DxLite);
#   nested arrays are flattened. Any other location loads as nil and
#   fails when its records are read.
# @param level [Integer] scan level, see above
# @return [Array] the loaded documents
def scan_dxindex(*locations, level: 0)

  t = Time.now

  # load every document in its own thread; the thread's value is the
  # loaded document
  threads = locations.flatten.map do |location|

    Thread.new do

      if location.is_a?(Dynarex) || location.is_a?(DxLite) then

        location

      elsif location.is_a? String

        case File.extname(location)
        when '.xml'
          Dynarex.new location, debug: @debug
        when '.json'
          DxLite.new location, debug: @debug
        end

      end

    end

  end

  # Thread#value waits for the thread to finish and re-raises any error
  # raised while loading, so no separate wait helper is needed
  a = threads.map(&:value)
  puts '_a: ' + a.inspect if @debug
  t2 = Time.now - t
  puts ("dxindex documents loaded in " + ("%.2f" % t2).brown \
        + " seconds").info

  id = 1

  a.each do |dx|

    if @debug then
      puts 'dx: ' + dx.class.inspect
      puts 'dx.all: ' + dx.all.inspect
    end

    # fetch the records once; they are numbered in reverse document order
    records = dx.all.reverse

    entries = records.map.with_index(id) do |x, record_id|
      [record_id, [Time.parse(x.created), x.title, x.url]]
    end

    @indexer.uri_index.merge! entries.to_h

    records.each.with_index(id) do |x, record_id|

      keywords = case level
      when 0
        x.title.scan(/(\#\w+)/).flatten(1)
      when 1
        # \u{A3} = £ <- represented as Unicode to avoid ASCII to UTF-8 error
        x.title.split(/[\s:"!\?\(\)\u{A3}]+(?=[\w#_'-]+)/)
      else
        []
      end

      keywords.each do |keyword|
        key = keyword.downcase.to_sym
        (@indexer.index[key] ||= []) << record_id
      end

    end

    # the next document continues the numbering
    id += records.length

  end

end

#search(*keywords, minchars: 3) ⇒ Object

Enter the exact keywords to search for in the index.



231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# File 'lib/indexer101.rb', line 231

# Searches the index for the given keywords and returns the matching
# records from the uri_index.
#
# A keyword always matches the index key identical to it. A keyword of at
# least +minchars+ characters also matches, ignoring case, every key that
# starts with it and every key that contains it. Keywords are treated as
# literal text, so characters such as "(", "[" or "+" are matched verbatim
# rather than interpreted as regular expression syntax.
#
# @param keywords [Array<String>] keywords to look for; one level of
#   nested arrays is flattened
# @param minchars [Integer] minimum keyword length for partial matching
# @return [Array] one record per distinct last field (the url), ordered by
#   number of hits and then by record, both descending
def search(*keywords, minchars: 3)

  t = Time.now
  idx = @indexer.index

  r = keywords.flatten(1).map do |x|

    a = []
    a += idx[x.to_sym].reverse if idx.key? x.to_sym

    if x.length >= minchars then

      # escape the keyword so regexp metacharacters cannot raise a
      # RegexpError or match unrelated keys
      literal = Regexp.escape(x.to_s)

      # keys starting with the keyword first, then keys containing it
      [/^#{literal}/i, /#{literal}/i].each do |pattern|
        a += idx.keys.grep(pattern).flat_map {|y| idx[y].reverse}
      end

    end

    puts ('a: ' + a.inspect).debug if @debug

    a.uniq.map {|y| @indexer.uri_index[y]}

  end

  # group by number of results found, sort by count, then by date
  a3 = r.flatten(1).group_by(&:last).to_a.sort do |x, x2|
    -([x.last.length, x.last.first] <=> [x2.last.length, x2.last.first])
  end

  # fetch the 1st record from each group item
  results = a3.map {|x| x.last.first}

  t2 = Time.now - t
  puts ("found %s results" % results.length).info
  puts ("search took " + ("%.3f" % t2).brown + " seconds").info
  puts

  results

end

#uri_index ⇒ Object



201
202
203
# File 'lib/indexer101.rb', line 201

# Reader for the id-to-record table held by the underlying Index object.
#
# @return [Object] whatever @indexer.uri_index returns
def uri_index
  @indexer.uri_index
end