Class: PdfSearch::PdfIndex

Inherits:

Object

Object
PdfSearch::PdfIndex

show all

Defined in: lib/pdf_index.rb

Class Attribute Summary collapse

.properties ⇒ Object readonly

Returns the value of attribute properties.
.search_input_fields ⇒ Object readonly

Returns the value of attribute search_input_fields.

Instance Attribute Summary collapse

#daemon ⇒ Object

Returns the value of attribute daemon.

Class Method Summary collapse

Instance Method Summary collapse

#additional_document_data(page, reader, pdf_id) ⇒ Object

additional_document_data can be overridden by your custom Index.
#combined_pdf_page_id(pdf_id, text) ⇒ Object
#create_page_document(pdf_id, text, additional_data) ⇒ Object
#index_loop ⇒ Object
#initialize(pdf_dir) ⇒ PdfIndex constructor

A new instance of PdfIndex.
#pid ⇒ Object
#properties ⇒ Object
#reindex ⇒ Object
#search_input_fields ⇒ Object
#search_input_fields_by_type ⇒ Object
#update_page_document(pdf_id, text, additional_data) ⇒ Object

Constructor Details

#initialize(pdf_dir) ⇒ `PdfIndex`

Returns a new instance of PdfIndex.

# File 'lib/pdf_index.rb', line 65

# Builds an index bound to the given PDF directory.
#
# @param pdf_dir [Object] page source; #reindex iterates over +pdf_dir.pages+
#   (start_daemon passes a ::PdfSearch::PdfDir)
def initialize(pdf_dir)
  @pdf_dir = pdf_dir
  # The client object is stored as-is and used by #create_page_document.
  @els_client = ::PdfSearch::ElasticSearchClient
end

Class Attribute Details

.properties ⇒ `Object` (readonly)

Returns the value of attribute properties.



9
10
11

# File 'lib/pdf_index.rb', line 9

# Class-level reader for the mapping properties registered through .property.
# Holds a Hash of property name => { type: ... }; it is nil until .property
# or .create_index has initialised it.
def properties
  @properties
end

.search_input_fields ⇒ `Object` (readonly)

Returns the value of attribute search_input_fields.



9
10
11

# File 'lib/pdf_index.rb', line 9

# Class-level reader for the search inputs registered through .property.
# Holds a Hash of property name => value of the :search option; it is nil
# until a property has been declared with a :search option.
def search_input_fields
  @search_input_fields
end

Instance Attribute Details

#daemon ⇒ `Object`

Returns the value of attribute daemon.



6
7
8

# File 'lib/pdf_index.rb', line 6

# Returns the daemon handle assigned to this index. .start_daemon stores the
# result of Daemons.call here; it is nil when nothing has been assigned.
def daemon
  @daemon
end

Class Method Details

.create_index ⇒ `Object`

# File 'lib/pdf_index.rb', line 12

# Creates the 'pdf_pages' Elasticsearch index with a 'document' mapping that
# contains the built-in +text+ field plus every property declared via .property.
#
# @return [Boolean] true when the index was created, false when Elasticsearch
#   answered that the index already exists
# @raise [Elasticsearch::Transport::Transport::Errors::BadRequest] for any
#   other bad-request response
def self.create_index
  indices = ::PdfSearch::ElasticSearchClient.indices
  # Declared properties are merged last, so they win over the default +text+ field.
  mapping_properties = { text: { type: 'text' } }.merge(@properties ||= {})

  indices.create(
    index: 'pdf_pages',
    body: { mappings: { document: { properties: mapping_properties } } }
  )
  true
rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
  # Only the "already exists" flavours of a bad request are tolerated.
  raise unless e.message.match?(/"type":"(?:resource|index)_already_exists_exception"/)

  false
end

.property(property_name, type, options = {}) ⇒ `Object`

# File 'lib/pdf_index.rb', line 38

# Declares an additional mapping property for the index and, optionally,
# registers it as a search input.
#
# @param property_name [Symbol, String] name of the field in the mapping
# @param type [String] mapping type stored as +{ type: type }+
# @param options [Hash] only the :search key is read; when present its value
#   is recorded as the search input type for this property
# @return [Array, nil] the list of properties registered for the search input
#   type, or nil when no :search option was given
def self.property(property_name, type, options = {})
  @properties ||= {}
  @properties[property_name] = { type: type }

  # Read the option instead of deleting it: the caller's hash must not be
  # mutated (it may be shared between several declarations or frozen).
  search_input_type = options[:search]
  return unless search_input_type

  @search_input_fields ||= {}
  @search_input_fields[property_name] = search_input_type

  # Block-form default gives every search input type its own array.
  @search_input_fields_by_type ||= Hash.new { |h, k| h[k] = [] }
  @search_input_fields_by_type[search_input_type].push(property_name)
end

.start_daemon(dir) ⇒ `Object`

# File 'lib/pdf_index.rb', line 70

# Builds an index for +dir+ and starts indexing it.
#
# When the DEBUG_PDF_INDEXING environment variable is set (to any value) the
# index loop runs inline in the calling process; otherwise the loop is handed
# to Daemons.call and the returned handle is stored in #daemon.
#
# @param dir [Object] passed straight to ::PdfSearch::PdfDir.new
# @return [PdfIndex] the index instance that was created
def self.start_daemon(dir)
  new(::PdfSearch::PdfDir.new(dir)).tap do |index|
    if ENV['DEBUG_PDF_INDEXING']
      index.index_loop
    else
      index.daemon = Daemons.call(multiple: true, &index.method(:index_loop))
    end
  end
end

Instance Method Details

#additional_document_data(page, reader, pdf_id) ⇒ `Object`

additional_document_data can be overridden by your custom index to add extra fields to every indexed page document, for example:

class CustomIndex < PdfSearch::PdfIndex
  # Every attribute returned by additional_document_data (here
  # organisation_id) has to be declared with `property`, so that it is
  # included in the mapping when the index is created:
  #
  #        Name of property  Type
  property :organisation_id, 'string' # or 'text' etc.

  # Takes the same two arguments it is called with below.
  def get_organisation_id(pdf_id, page)
    # ...
  end

  def additional_document_data(page, reader, pdf_id)
    {
      organisation_id: get_organisation_id(pdf_id, page)
    }
  end
end



109
110
111

# File 'lib/pdf_index.rb', line 109

# Extension hook: extra fields to store alongside a page's text.
# #reindex calls this for every page and passes the result on to
# #create_page_document, which merges it into the document body.
# The base implementation contributes nothing.
#
# @param page [Object] the page being indexed (unused here)
# @param reader [Object] the reader yielded with the page (unused here)
# @param pdf_id [Object] identifier of the PDF (unused here)
# @return [Hash] an empty hash
def additional_document_data(page, reader, pdf_id)
  {}
end

#combined_pdf_page_id(pdf_id, text) ⇒ `Object`



127
128
129

# File 'lib/pdf_index.rb', line 127

# Builds the document id for a page: the PDF id, a dash, and the SHA-256
# hex digest of the page text. Identical text in the same PDF therefore maps
# to the same id.
#
# @param pdf_id [#to_s] identifier of the PDF
# @param text [String] page text to fingerprint
# @return [String] "<pdf_id>-<sha256 hex digest of text>"
def combined_pdf_page_id(pdf_id, text)
  text_digest = Digest::SHA256.hexdigest(text)
  "#{pdf_id}-#{text_digest}"
end

#create_page_document(pdf_id, text, additional_data) ⇒ `Object`

# File 'lib/pdf_index.rb', line 116

# Sends a create request for one page to the 'pdf_pages' index.
#
# The document id comes from #combined_pdf_page_id; the body is the page text
# plus +additional_data+ (whose keys win over +text+ on collision, because
# they are merged last).
#
# @param pdf_id [Object] identifier of the PDF the page belongs to
# @param text [String] extracted page text
# @param additional_data [Hash] extra fields merged into the document body
# @return [Object] whatever the client's +create+ call returns
def create_page_document(pdf_id, text, additional_data)
  document_id = combined_pdf_page_id(pdf_id, text)
  document_body = { text: text }.merge(additional_data)

  @els_client.create(
    index: 'pdf_pages',
    type: 'document',
    id: document_id,
    body: document_body
  )
end

#index_loop ⇒ `Object`

# File 'lib/pdf_index.rb', line 80

# Runs #reindex over and over, with no pause between passes.
# Does not return on its own: it only stops when #reindex raises
# (Kernel#loop ends quietly on StopIteration, any other error propagates).
def index_loop
  loop { reindex }
end

#pid ⇒ `Object`



86
87
88

# File 'lib/pdf_index.rb', line 86

# Process id of the running daemon.
#
# Asks the daemon handle for its +pid+ object and returns that object's own
# +pid+. NOTE(review): presumably the inner value is the numeric process id;
# confirm against the Daemons gem. Raises NoMethodError when no daemon has
# been assigned.
def pid
  daemon_pid = daemon.pid
  daemon_pid.pid
end

#properties ⇒ `Object`



61
62
63

# File 'lib/pdf_index.rb', line 61

# Instance-level shortcut for the class-level .properties reader, so an index
# instance sees the properties declared on its own class.
def properties
  self.class.properties
end

#reindex ⇒ `Object`

# File 'lib/pdf_index.rb', line 132

# Indexes every page exposed by the PDF directory.
#
# For each (page, reader, pdf_id) entry of +@pdf_dir.pages+ the page text is
# extracted once, combined with #additional_document_data and sent through
# #create_page_document. A Conflict response is reported on standard output
# and the page is skipped; every other error propagates.
#
# @return [Object] the collection returned by +@pdf_dir.pages+
def reindex
  @pdf_dir.pages.each do |page, reader, pdf_id|
    additional_data = additional_document_data(page, reader, pdf_id)
    # Extract the text a single time; it is needed both for the document and
    # for the skip message below.
    text = page.text

    begin
      create_page_document(pdf_id, text, additional_data)
    rescue Elasticsearch::Transport::Transport::Errors::Conflict
      puts "Skipping document #{combined_pdf_page_id(pdf_id, text)}, already indexed"
    end
  end
end

#search_input_fields ⇒ `Object`



57
58
59

# File 'lib/pdf_index.rb', line 57

# Instance-level shortcut for the class-level .search_input_fields reader
# (property name => value of the :search option given to .property).
def search_input_fields
  self.class.search_input_fields
end

#search_input_fields_by_type ⇒ `Object`



53
54
55

# File 'lib/pdf_index.rb', line 53

# Returns the class-level grouping of search inputs built by .property:
# search input type => list of property names. The class instance variable is
# read directly; it is nil when no property was declared with a :search option.
def search_input_fields_by_type
  self.class.instance_variable_get(:@search_input_fields_by_type)
end

#update_page_document(pdf_id, text, additional_data) ⇒ `Object`



113
114

# File 'lib/pdf_index.rb', line 113

# Placeholder with the same signature as #create_page_document. The body is
# empty, so calling it does nothing and returns nil.
# NOTE(review): no caller is visible in this file; confirm whether an
# implementation is still planned.
def update_page_document(pdf_id, text, additional_data)
end

Class: PdfSearch::PdfIndex

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(pdf_dir) ⇒ PdfIndex

Class Attribute Details

.properties ⇒ Object (readonly)

.search_input_fields ⇒ Object (readonly)

Instance Attribute Details

#daemon ⇒ Object

Class Method Details

.create_index ⇒ Object

.property(property_name, type, options = {}) ⇒ Object

.start_daemon(dir) ⇒ Object

Instance Method Details

#additional_document_data(page, reader, pdf_id) ⇒ Object

#combined_pdf_page_id(pdf_id, text) ⇒ Object

#create_page_document(pdf_id, text, additional_data) ⇒ Object

#index_loop ⇒ Object

#pid ⇒ Object

#properties ⇒ Object

#reindex ⇒ Object

#search_input_fields ⇒ Object

#search_input_fields_by_type ⇒ Object

#update_page_document(pdf_id, text, additional_data) ⇒ Object