Class: Picky::Indexers::Serial

Inherits:
Base show all
Defined in:
lib/picky/indexers/serial.rb

Overview

Uses a category to index its data.

Note: It is called serial since it indexes each category separately.

Instance Attribute Summary

Attributes inherited from Base

#index_or_category

Instance Method Summary collapse

Methods inherited from Base

#check, #initialize, #notify_finished, #prepare, #reset

Constructor Details

This class inherits a constructor from Picky::Indexers::Base

Instance Method Details

#flush(prepared_file, cache) ⇒ Object



82
83
84
# File 'lib/picky/indexers/serial.rb', line 82

# Flushes the buffered index lines: writes the joined cache contents
# to the prepared index file, then empties the buffer so the next
# batch can refill it.
def flush prepared_file, cache
  prepared_file.write cache.join
  cache.clear
end

#index_flush(datas, file, cache, tokenizer) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/picky/indexers/serial.rb', line 54

# Serializes a batch of harvested data into the cache as
# "indexed_id,token_text\n" lines and flushes the cache to the file.
#
# If a tokenizer is given, each data entry is [indexed_id, text] and the
# text is tokenized first; otherwise each entry is already
# [indexed_id, tokens]. Nil tokens are skipped in both cases.
def index_flush datas, file, cache, tokenizer
  delim = ?,
  eol   = ?\n

  # The two branches duplicate their inner loop on purpose — this is a
  # hot path and was left optimized rather than factored out.
  #
  # TODO Deoptimize?
  #
  if tokenizer
    datas.each do |indexed_id, text|
      token_texts, _ = tokenizer.tokenize text # Originals are not needed here.
      token_texts.each do |token_text|
        cache << indexed_id << delim << token_text << eol if token_text
      end
    end
  else
    datas.each do |indexed_id, token_texts|
      token_texts.each do |token_text|
        cache << indexed_id << delim << token_text << eol if token_text
      end
    end
  end

  flush file, cache
end

#process(source_for_prepare, categories, scheduler = Scheduler.new) ⇒ Object

Harvest the data from the source, tokenize, and write to an intermediate “prepared index” file.

Parameters:

* categories: An enumerable of Category instances.


19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/picky/indexers/serial.rb', line 19

# Harvests data from the source for each category, tokenizes it, and
# writes it to that category's intermediate "prepared index" file.
#
# Data is accumulated in batches of 10_000 entries before being flushed,
# to bound memory usage during indexing.
#
# * source_for_prepare: the data source to harvest from (reset first).
# * categories: An enumerable of Category instances.
# * scheduler: batch scheduler (defaults to a fresh Scheduler).
#
# Yields each category's prepared index file after it has been written.
def process source_for_prepare, categories, scheduler = Scheduler.new
  categories.each do |category|

    # Opening the file via the category so it is automagically closed.
    #
    category.prepared_index_file do |file|

      datas = []
      result = []
      tokenizer = category.tokenizer

      reset source_for_prepare

      # NOTE(review): was `source.harvest` — `source` is not defined in
      # this method; the parameter is `source_for_prepare`, which the
      # `reset` call above also uses. Confirm no inherited `source`
      # accessor was intended.
      source_for_prepare.harvest(category) do |*data|

        # Accumulate data; only flush once a full batch is collected.
        #
        datas << data
        next if datas.size < 10_000

        index_flush datas, file, result, tokenizer

        datas.clear

      end

      # Flush whatever remains after the last full batch.
      #
      index_flush datas, file, result, tokenizer

      yield file
    end
  end

end