Class: Picky::Indexers::Serial

Inherits:
Base show all
Defined in:
lib/picky/indexers/serial.rb

Overview

Uses a category to index its data.

Note: It is called serial since it indexes each category separately.

Instance Attribute Summary

Attributes inherited from Base

#index_or_category

Instance Method Summary collapse

Methods inherited from Base

#check, #initialize, #notify_finished, #prepare, #reset

Constructor Details

This class inherits a constructor from Picky::Indexers::Base

Instance Method Details

#flush(prepared_file, cache) ⇒ Object



82
83
84
# File 'lib/picky/indexers/serial.rb', line 82

# Flushes the buffered index lines: writes the joined cache contents
# to the prepared index file, then empties the buffer so the next
# batch can refill it.
def flush prepared_file, cache
  prepared_file.write cache.join
  cache.clear
end

#index_flush(datas, file, cache, tokenizer) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/picky/indexers/serial.rb', line 54

# Serializes a batch of harvested data into the cache as
# "indexed_id,token_text\n" lines and flushes the cache to the file.
#
# If a tokenizer is given, each data entry is [indexed_id, text] and the
# text is tokenized first; otherwise each entry is already
# [indexed_id, tokens]. Nil tokens are skipped in both cases.
def index_flush datas, file, cache, tokenizer
  delim = ?,
  eol   = ?\n

  # The two branches duplicate their inner loop on purpose — this is a
  # hot path and was left optimized rather than factored out.
  #
  # TODO Deoptimize?
  #
  if tokenizer
    datas.each do |indexed_id, text|
      token_texts, _ = tokenizer.tokenize text # Originals are not needed here.
      token_texts.each do |token_text|
        cache << indexed_id << delim << token_text << eol if token_text
      end
    end
  else
    datas.each do |indexed_id, token_texts|
      token_texts.each do |token_text|
        cache << indexed_id << delim << token_text << eol if token_text
      end
    end
  end

  flush file, cache
end

#process(source_for_prepare, categories, scheduler = Scheduler.new) ⇒ Object

Harvest the data from the source, tokenize, and write to an intermediate “prepared index” file.

Parameters:

* categories: An enumerable of Category instances.


19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/picky/indexers/serial.rb', line 19

# Harvests data from the source for each category, tokenizes it, and
# writes it to that category's intermediate "prepared index" file.
#
# Data is accumulated in batches of 10_000 entries before being flushed,
# to bound memory usage during indexing.
#
# * source_for_prepare: the data source to harvest from (reset first).
# * categories: An enumerable of Category instances.
# * scheduler: batch scheduler (defaults to a fresh Scheduler).
#
# Yields each category's prepared index file after it has been written.
def process source_for_prepare, categories, scheduler = Scheduler.new
  categories.each do |category|

    # Opening the file via the category so it is automagically closed.
    #
    category.prepared_index_file do |file|

      datas = []
      result = []
      tokenizer = category.tokenizer

      reset source_for_prepare

      # NOTE(review): was `source.harvest` — `source` is not defined in
      # this method; the parameter is `source_for_prepare`, which the
      # `reset` call above also uses. Confirm no inherited `source`
      # accessor was intended.
      source_for_prepare.harvest(category) do |*data|

        # Accumulate data; only flush once a full batch is collected.
        #
        datas << data
        next if datas.size < 10_000

        index_flush datas, file, result, tokenizer

        datas.clear

      end

      # Flush whatever remains after the last full batch.
      #
      index_flush datas, file, result, tokenizer

      yield file
    end
  end

end