Class: PdfSearch::PdfIndex
- Inherits:
-
Object
- Object
- PdfSearch::PdfIndex
- Defined in:
- lib/pdf_index.rb
Class Attribute Summary collapse
-
.properties ⇒ Object
readonly
Returns the value of attribute properties.
-
.search_input_fields ⇒ Object
readonly
Returns the value of attribute search_input_fields.
Instance Attribute Summary collapse
-
#daemon ⇒ Object
Returns the value of attribute daemon.
Class Method Summary collapse
- .create_index ⇒ Object
- .property(property_name, type, options = {}) ⇒ Object
- .start_daemon(dir) ⇒ Object
Instance Method Summary collapse
-
#additional_document_data(page, reader, pdf_id) ⇒ Object
additional_document_data can be overridden by your custom Index.
- #combined_pdf_page_id(pdf_id, text) ⇒ Object
- #create_page_document(pdf_id, text, additional_data) ⇒ Object
- #index_loop ⇒ Object
-
#initialize(pdf_dir) ⇒ PdfIndex
constructor
A new instance of PdfIndex.
- #pid ⇒ Object
- #properties ⇒ Object
- #reindex ⇒ Object
- #search_input_fields ⇒ Object
- #search_input_fields_by_type ⇒ Object
- #update_page_document(pdf_id, text, additional_data) ⇒ Object
Constructor Details
#initialize(pdf_dir) ⇒ PdfIndex
Returns a new instance of PdfIndex.
65 66 67 68 |
# File 'lib/pdf_index.rb', line 65 def initialize(pdf_dir) @pdf_dir = pdf_dir @els_client = ::PdfSearch::ElasticSearchClient end |
Class Attribute Details
.properties ⇒ Object (readonly)
Returns the value of attribute properties.
9 10 11 |
# File 'lib/pdf_index.rb', line 9 def properties @properties end |
.search_input_fields ⇒ Object (readonly)
Returns the value of attribute search_input_fields.
9 10 11 |
# File 'lib/pdf_index.rb', line 9 def search_input_fields @search_input_fields end |
Instance Attribute Details
#daemon ⇒ Object
Returns the value of attribute daemon.
6 7 8 |
# File 'lib/pdf_index.rb', line 6 def daemon @daemon end |
Class Method Details
.create_index ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/pdf_index.rb', line 12 def self.create_index ::PdfSearch::ElasticSearchClient.indices.create( index: 'pdf_pages', body: { mappings: { document: { properties: { text: { type: 'text' } }.merge(@properties ||= {}) } } } ) return true rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e if /"type":"(?:resource|index)_already_exists_exception"/ =~ e.message return false else raise e end end |
.property(property_name, type, options = {}) ⇒ Object
38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/pdf_index.rb', line 38 def self.property(property_name, type, options = {}) @properties ||= {} @properties[property_name] = {type: type} search_input_type = options.delete(:search) if search_input_type @search_input_fields ||= {} @search_input_fields[property_name] = search_input_type @search_input_fields_by_type ||= Hash.new { |h,k| h[k] = [] } @search_input_fields_by_type[search_input_type].push(property_name) end end |
.start_daemon(dir) ⇒ Object
70 71 72 73 74 75 76 77 78 |
# File 'lib/pdf_index.rb', line 70 def self.start_daemon(dir) pdf_index = self.new(::PdfSearch::PdfDir.new(dir)) if ENV['DEBUG_PDF_INDEXING'] pdf_index.index_loop else pdf_index.daemon = Daemons.call(multiple: true, &pdf_index.method(:index_loop)) end pdf_index end |
Instance Method Details
#additional_document_data(page, reader, pdf_id) ⇒ Object
additional_document_data can be overridden by your custom Index
class CustomIndex < PdfSearch::PdfIndex
# Every attribute returned by additional_document_data (here: organisation_id) has to be declared as a property, so that it is part of the mapping when the index is created or updated:
# property <name of property>, <type>
property :organisation_id, 'string' # or 'text' etc.
def get_organisation_id(pdf_id, page)
# ...
end
def additional_document_data(page, reader, pdf_id)
return {
organisation_id: get_organisation_id(pdf_id, page)
}
end
end
109 110 111 |
# File 'lib/pdf_index.rb', line 109 def additional_document_data(page, reader, pdf_id) return {} end |
#combined_pdf_page_id(pdf_id, text) ⇒ Object
127 128 129 |
# File 'lib/pdf_index.rb', line 127 def combined_pdf_page_id(pdf_id, text) "#{pdf_id}-#{Digest::SHA256.hexdigest(text)}" end |
#create_page_document(pdf_id, text, additional_data) ⇒ Object
116 117 118 119 120 121 122 123 124 125 |
# File 'lib/pdf_index.rb', line 116 def create_page_document(pdf_id, text, additional_data) @els_client.create( index: 'pdf_pages', type: 'document', id: combined_pdf_page_id(pdf_id, text), body: { text: text }.merge(additional_data) ) end |
#index_loop ⇒ Object
80 81 82 83 84 |
# File 'lib/pdf_index.rb', line 80 def index_loop loop do self.reindex end end |
#pid ⇒ Object
86 87 88 |
# File 'lib/pdf_index.rb', line 86 def pid daemon.pid.pid end |
#properties ⇒ Object
61 62 63 |
# File 'lib/pdf_index.rb', line 61 def properties self.class.properties end |
#reindex ⇒ Object
132 133 134 135 136 137 138 139 140 141 142 |
# File 'lib/pdf_index.rb', line 132 def reindex @pdf_dir.pages.each.with_index do |(page, reader, pdf_id), index| additional_data = additional_document_data(page, reader, pdf_id) begin create_page_document(pdf_id, page.text, additional_data) rescue Elasticsearch::Transport::Transport::Errors::Conflict => e puts "Skipping document #{combined_pdf_page_id(pdf_id, page.text) }, already indexed" end end end |
#search_input_fields ⇒ Object
57 58 59 |
# File 'lib/pdf_index.rb', line 57 def search_input_fields self.class.search_input_fields end |
#search_input_fields_by_type ⇒ Object
53 54 55 |
# File 'lib/pdf_index.rb', line 53 def search_input_fields_by_type self.class.instance_variable_get(:@search_input_fields_by_type) end |
#update_page_document(pdf_id, text, additional_data) ⇒ Object
113 114 |
# File 'lib/pdf_index.rb', line 113 def update_page_document(pdf_id, text, additional_data) end |