Class: PdfSearch::PdfIndex

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf_index.rb

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(pdf_dir) ⇒ PdfIndex

Returns a new instance of PdfIndex.



65
66
67
68
# File 'lib/pdf_index.rb', line 65

# Builds an index bound to one PDF directory.
#
# @param pdf_dir [Object] source of the pages to index; #reindex calls
#   +pages+ on it (start_daemon passes a ::PdfSearch::PdfDir)
def initialize(pdf_dir)
  # The client constant itself is stored, not an instance of it.
  @els_client = ::PdfSearch::ElasticSearchClient
  @pdf_dir = pdf_dir
end

Class Attribute Details

.propertiesObject (readonly)

Returns the value of attribute properties.



9
10
11
# File 'lib/pdf_index.rb', line 9

# Class-level reader for the extra mapping properties registered through
# .property.
#
# @return [Hash, nil] the @properties hash, or nil when nothing has been
#   assigned to it yet
def properties
  @properties
end

.search_input_fieldsObject (readonly)

Returns the value of attribute search_input_fields.



9
10
11
# File 'lib/pdf_index.rb', line 9

# Class-level reader for the search inputs registered through .property
# (property name => search input type).
#
# @return [Hash, nil] the @search_input_fields hash, or nil when no property
#   was declared with a :search option
def search_input_fields
  @search_input_fields
end

Instance Attribute Details

#daemonObject

Returns the value of attribute daemon.



6
7
8
# File 'lib/pdf_index.rb', line 6

# Reader for the daemon handle that .start_daemon assigns from Daemons.call.
#
# @return [Object, nil] the stored @daemon, or nil when none was assigned
def daemon
  @daemon
end

Class Method Details

.create_indexObject



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/pdf_index.rb', line 12

# Creates the 'pdf_pages' index. Its 'document' mapping always contains a
# 'text' field of type 'text', plus every property registered via .property.
#
# @return [Boolean] true when the index was created, false when the client
#   reports that it already exists
# @raise [Elasticsearch::Transport::Transport::Errors::BadRequest] for any
#   other bad request
def self.create_index
  indices = ::PdfSearch::ElasticSearchClient.indices

  # Make sure the class-level registry exists even if .property was never called.
  @properties ||= {}
  mapped_properties = { text: { type: 'text' } }.merge(@properties)

  indices.create(
    index: 'pdf_pages',
    body: { mappings: { document: { properties: mapped_properties } } }
  )
  true
rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
  # Only an "already exists" response is treated as a non-error outcome.
  raise e unless /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message

  false
end

.property(property_name, type, options = {}) ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/pdf_index.rb', line 38

def self.property(property_name, type, options = {})
  @properties ||= {}
  @properties[property_name] = {type: type}

  search_input_type = options.delete(:search)

  if search_input_type
    @search_input_fields ||= {}
    @search_input_fields[property_name] = search_input_type

    @search_input_fields_by_type ||= Hash.new { |h,k| h[k] = [] }
    @search_input_fields_by_type[search_input_type].push(property_name)
  end
end

.start_daemon(dir) ⇒ Object



70
71
72
73
74
75
76
77
78
# File 'lib/pdf_index.rb', line 70

# Builds an index for +dir+ and starts its indexing loop.
#
# When the DEBUG_PDF_INDEXING environment variable is set, the loop runs in
# the calling process; otherwise it is handed to Daemons.call and the
# returned handle is stored on the index.
#
# @param dir [Object] passed to ::PdfSearch::PdfDir.new
# @return [PdfIndex] the index that was created
def self.start_daemon(dir)
  new(::PdfSearch::PdfDir.new(dir)).tap do |pdf_index|
    if ENV['DEBUG_PDF_INDEXING']
      pdf_index.index_loop
    else
      pdf_index.daemon = Daemons.call(multiple: true, &pdf_index.method(:index_loop))
    end
  end
end

Instance Method Details

#additional_document_data(page, reader, pdf_id) ⇒ Object

additional_document_data can be overridden by your custom Index

class CustomIndex < PdfSearch::PdfIndex

# Every attribute returned by additional_document_data (here organisation_id) has to be declared with `property`, so that it is part of the mapping when the index is created or updated:

#        Name of property   Type
property :organisation_id, 'string' # or 'text' etc.

def get_organisation_id(pdf_id, page)
  # ...
end

def additional_document_data(page, reader, pdf_id)
 return {
   organisation_id: get_organisation_id(pdf_id, page)
 }
end

end



109
110
111
# File 'lib/pdf_index.rb', line 109

# Hook for subclasses: extra fields to merge into the document indexed for a
# page. This base implementation contributes nothing.
#
# @param page [Object] the page being indexed (unused here)
# @param reader [Object] the reader the page came from (unused here)
# @param pdf_id [Object] identifier of the PDF (unused here)
# @return [Hash] always an empty hash
def additional_document_data(page, reader, pdf_id)
  {}
end

#combined_pdf_page_id(pdf_id, text) ⇒ Object



127
128
129
# File 'lib/pdf_index.rb', line 127

# Builds the document id for a page: the PDF id, a dash, and the SHA-256 hex
# digest of the page text.
#
# @param pdf_id [#to_s] identifier of the PDF
# @param text [String] page text to hash
# @return [String] "<pdf_id>-<sha256 hex digest of text>"
def combined_pdf_page_id(pdf_id, text)
  text_digest = Digest::SHA256.hexdigest(text)
  "#{pdf_id}-#{text_digest}"
end

#create_page_document(pdf_id, text, additional_data) ⇒ Object



116
117
118
119
120
121
122
123
124
125
# File 'lib/pdf_index.rb', line 116

# Sends one page to the client as a new 'document' in the 'pdf_pages' index.
#
# @param pdf_id [Object] identifier of the PDF, used to build the document id
# @param text [String] page text; stored under :text and hashed into the id
# @param additional_data [Hash] extra fields merged over the body (a :text
#   key here would replace the page text)
# @return [Object] whatever the client's create call returns
def create_page_document(pdf_id, text, additional_data)
  document_id = combined_pdf_page_id(pdf_id, text)
  document_body = { text: text }.merge(additional_data)

  @els_client.create(index: 'pdf_pages', type: 'document', id: document_id, body: document_body)
end

#index_loopObject



80
81
82
83
84
# File 'lib/pdf_index.rb', line 80

# Runs #reindex over and over, with no pause between passes. It only returns
# if something inside raises StopIteration (which Kernel#loop rescues).
def index_loop
  loop { reindex }
end

#pidObject



86
87
88
# File 'lib/pdf_index.rb', line 86

# Process id of the background daemon.
#
# NOTE(review): daemon.pid presumably returns the Daemons pid holder, whose
# own #pid is the numeric id — confirm against the Daemons gem.
#
# @return [Object] the value of daemon.pid.pid
def pid
  pid_holder = daemon.pid
  pid_holder.pid
end

#propertiesObject



61
62
63
# File 'lib/pdf_index.rb', line 61

# Instance-level shortcut to the mapping properties declared on the class.
#
# @return [Hash, nil] whatever the class-level properties reader returns
def properties
  self.class.properties
end

#reindexObject



132
133
134
135
136
137
138
139
140
141
142
# File 'lib/pdf_index.rb', line 132

# Indexes every page of the PDF directory once.
#
# For each (page, reader, pdf_id) entry of @pdf_dir.pages, the subclass hook
# #additional_document_data is consulted and the page is sent to
# #create_page_document. A page whose document id already exists (the client
# raises Conflict) is reported on stdout and skipped.
#
# @return [Object] whatever @pdf_dir.pages.each returns
def reindex
  @pdf_dir.pages.each do |page, reader, pdf_id|
    additional_data = additional_document_data(page, reader, pdf_id)
    # Read the text once so the conflict message does not call page.text again.
    text = page.text

    begin
      create_page_document(pdf_id, text, additional_data)
    rescue Elasticsearch::Transport::Transport::Errors::Conflict
      puts "Skipping document #{combined_pdf_page_id(pdf_id, text)}, already indexed"
    end
  end
end

#search_input_fieldsObject



57
58
59
# File 'lib/pdf_index.rb', line 57

# Instance-level shortcut to the search inputs declared on the class.
#
# @return [Hash, nil] whatever the class-level search_input_fields reader
#   returns
def search_input_fields
  self.class.search_input_fields
end

#search_input_fields_by_typeObject



53
54
55
# File 'lib/pdf_index.rb', line 53

# Search inputs grouped by input type (type => property names), as built by
# .property. The class-level instance variable is read directly.
#
# @return [Hash, nil] the class's @search_input_fields_by_type, or nil when
#   no property was declared with a :search option
def search_input_fields_by_type
  self.class.instance_variable_get(:@search_input_fields_by_type)
end

#update_page_document(pdf_id, text, additional_data) ⇒ Object



113
114
# File 'lib/pdf_index.rb', line 113

# Placeholder with the same signature as #create_page_document.
# The body is empty: nothing is sent anywhere and nil is returned.
#
# @param pdf_id [Object] unused
# @param text [Object] unused
# @param additional_data [Object] unused
# @return [nil]
def update_page_document(pdf_id, text, additional_data)
  # Intentionally empty.
end