Class: PDFextract

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf_extract.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(schema) ⇒ PDFextract

Returns a new instance of PDFextract.



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/pdf_extract.rb', line 84

# Builds an extractor that works out of a scratch directory named after the
# current epoch second.
#
# @param schema [Hash] extraction request. Keys read here (after
#   symbolize_keys!): :file_url or :file_path, :options, :pages.
#   NOTE(review): symbolize_keys! is not defined in this file — presumably
#   ActiveSupport's Hash extension; confirm it is loaded.
def initialize(schema)
	schema.symbolize_keys!

	@base_dir = Time.now.to_i.to_s
	# The sub-directory paths must be assigned before setup_folders runs,
	# because setup_folders reads text_dir/output_dir to create them.
	@text_dir = @base_dir+'/text_files'
	@image_dir = @base_dir+'/image_files'
	@output_dir = @base_dir+'/output'
	setup_folders(@base_dir)

	if schema[:file_url]
		@file_path = get_file_from_url(schema[:file_url])
	else
		@file_path = get_file_from_path(schema[:file_path])
		puts @file_path
	end

	# Default to an empty hash so #process can index options without a nil check.
	@options = schema[:options] || {}
	# Pages do not depend on :options; they used to be dropped whenever
	# :options was absent.
	@pages = schema[:pages]
	@results = {}
end

Instance Attribute Details

#base_dir ⇒ Object

Returns the value of attribute base_dir.



81
82
83
# File 'lib/pdf_extract.rb', line 81

# Reader for @base_dir, the scratch directory name that #initialize derives
# from Time.now.to_i.to_s.
def base_dir
  @base_dir
end

#file_path ⇒ Object

Returns the value of attribute file_path.



80
81
82
# File 'lib/pdf_extract.rb', line 80

# Reader for @file_path, which #initialize sets to the value returned by
# get_file_from_url or get_file_from_path.
def file_path
  @file_path
end

#image_dir ⇒ Object

Returns the value of attribute image_dir.



82
83
84
# File 'lib/pdf_extract.rb', line 82

# Reader for @image_dir, which #initialize sets to "<base_dir>/image_files".
def image_dir
  @image_dir
end

#options ⇒ Object

Returns the value of attribute options.



81
82
83
# File 'lib/pdf_extract.rb', line 81

# Reader for @options, which #initialize takes from schema[:options].
def options
  @options
end

#output_dir ⇒ Object

Returns the value of attribute output_dir.



82
83
84
# File 'lib/pdf_extract.rb', line 82

# Reader for @output_dir, which #initialize sets to "<base_dir>/output".
def output_dir
  @output_dir
end

#pages ⇒ Object

Returns the value of attribute pages.



82
83
84
# File 'lib/pdf_extract.rb', line 82

# Reader for @pages, which #initialize takes from schema[:pages]; iterated by
# #process_pages.
def pages
  @pages
end

#results ⇒ Object

Returns the value of attribute results.



80
81
82
# File 'lib/pdf_extract.rb', line 80

# Reader for @results, the hash that starts empty in #initialize and is
# filled in by #process and #process_pages.
def results
  @results
end

#text_dir ⇒ Object

Returns the value of attribute text_dir.



81
82
83
# File 'lib/pdf_extract.rb', line 81

# Reader for @text_dir, which #initialize sets to "<base_dir>/text_files".
def text_dir
  @text_dir
end

Class Method Details

.example_schema ⇒ Object



203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/pdf_extract.rb', line 203

# Returns a freshly built sample schema showing the keys the extractor reads:
# a source file, global options, and one page with an OCR item and a table item.
#
# @return [Hash] symbol-keyed example schema
def self.example_schema
	title_item = {
		name: 'title',
		kind: 'ocr', # the other kind is 'table'
		dimensions: { x1: 10, x2: 282, y1: 50, y2: 100 }
	}
	units_table_item = {
		name: 'units_table',
		kind: 'table',
		dimensions: { x1: 0, x2: 265.73, y1: 184.94, y2: 233.84 }
	}
	extraction_options = {
		remove_protection: false,
		password: nil,
		extract_all_text: true,
		extract_text: []
	}
	first_page = {
		match: "page_num",
		page: 1,
		items: [title_item, units_table_item]
	}

	{
		file_path: "test_files/dream-may.pdf",
		options: extraction_options,
		pages: [first_page]
	}
end

.extract_ocr(image_path, coords) ⇒ Object



188
189
190
191
192
193
194
195
196
197
198
199
200
201
# File 'lib/pdf_extract.rb', line 188

# Runs Tesseract OCR over one rectangular region of an image.
#
# @param image_path [String] image handed to the Tesseract engine
# @param coords [Hash] corners of the region under the keys x1, y1, x2, y2;
#   string keys and symbol keys are both accepted (string keys win when both
#   are present)
# @return [String] recognised text with surrounding whitespace stripped
# @raise [ArgumentError] when one of the four coordinates is missing
def self.extract_ocr(image_path,coords)
	x1, y1, x2, y2 = %w[x1 y1 x2 y2].map do |key|
		# The schema in this file uses symbol keys, while this method used to
		# read string keys only; look up both.
		value = coords.key?(key) ? coords[key] : coords[key.to_sym]
		raise ArgumentError, "coords is missing #{key}" if value.nil?
		value
	end

	# The engine selection takes an origin plus a size, not two corners.
	x = x1
	y = y1
	width = x2 - x1
	height = y2 - y1
	puts image_path
	puts [x,y,width,height]

	engine = Tesseract::Engine.new(language: :eng)
	engine.image = image_path
	engine.select x,y,width,height
	engine.text.strip
end

Instance Method Details

#cleanup ⇒ Object



130
131
132
# File 'lib/pdf_extract.rb', line 130

# Deletes the scratch directory (base_dir) and everything under it.
#
# Uses FileUtils rather than a shell `rm -r`, so a directory name containing
# spaces or shell metacharacters cannot be misinterpreted. A directory that is
# already gone, or a blank base_dir, is a no-op.
def cleanup
	require 'fileutils'
	return if base_dir.nil? || base_dir.to_s.empty?

	FileUtils.rm_rf(base_dir)
end

#convert_to_image(pages = "all") ⇒ Object



165
166
167
168
169
170
171
# File 'lib/pdf_extract.rb', line 165

# Renders the PDF to image files, then lists the PNGs found in image_dir.
#
# @param pages [Object] passed straight to pdf_to_image_files (default "all")
# @return [Array<String>] paths matching "<image_dir>/*.png"
def convert_to_image(pages = "all")
	pdf_to_image_files(pages)
	Dir.glob(image_dir+"/*.png")
end

#convert_to_text(pages = "all") ⇒ Object



154
155
156
157
158
159
160
161
162
163
164
# File 'lib/pdf_extract.rb', line 154

# Extracts the PDF's text into per-page files, then loads those files.
#
# @param pages [Object] passed straight to pdf_to_text_files (default "all")
# @return [Hash{String => String}] file contents keyed by the page label taken
#   from each file name: the part after the last "_" and before the first "."
def convert_to_text(pages = "all")
	pdf_to_text_files(pages)

	text = Dir.glob(text_dir+"/*.txt").each_with_object({}) do |file, by_page|
		# Parse the label from the file name only, so underscores in the
		# directory path (e.g. "text_files") cannot leak into the key.
		page_num = File.basename(file).split("_")[-1].split(".")[0]
		# File.read closes the handle; File.open(file).read left it open.
		by_page[page_num] = File.read(file)
	end

	puts text
	text
end

#extract_with_ocr(page_path, dimensions) ⇒ Object



180
181
182
183
184
185
186
187
# File 'lib/pdf_extract.rb', line 180

# OCRs a fixed region of the given page image and records the text.
#
# NOTE(review): the selected region is hard-coded to (1, 34, 59, 281); the
# coordinates in `dimensions` are not consulted — confirm whether that is
# intended.
#
# @param page_path [String] image handed to the Tesseract engine
# @param dimensions [Hash] receives the recognised text under :result
# @return [String] recognised text with surrounding whitespace stripped
def extract_with_ocr(page_path,dimensions)
	ocr = Tesseract::Engine.new(language: :eng)
	ocr.image = page_path
	ocr.select(1, 34, 59, 281)
	# An index assignment evaluates to the assigned value, so this is also the
	# method's return value.
	dimensions[:result] = ocr.text.strip
end

#get_file_from_path(path) ⇒ Object



117
118
119
120
121
# File 'lib/pdf_extract.rb', line 117

# Copies a local PDF into the scratch directory as "temp-file.pdf".
#
# The copy goes through FileUtils instead of a shell `cp`, so paths containing
# spaces or shell metacharacters work, and a failed copy raises instead of
# being silently ignored.
#
# @param path [String] location of the source PDF
# @return [String] path of the copy, "<base_dir>/temp-file.pdf"
# @raise [ArgumentError] when path is nil or empty
# @raise [SystemCallError] when the source cannot be read or copied
def get_file_from_path(path)
	require 'fileutils'
	raise ArgumentError, "a source file path is required" if path.nil? || path.to_s.empty?

	new_path = @base_dir+"/temp-file.pdf"
	FileUtils.cp(path, new_path)
	new_path
end

#get_file_from_url(file_url) ⇒ Object



110
111
112
113
114
115
116
# File 'lib/pdf_extract.rb', line 110

# Downloads a PDF into the scratch directory as "temp-file.pdf".
#
# The URL is parsed and opened through open-uri explicitly. Bare Kernel#open
# no longer fetches URLs on Ruby 3+ and would run a command for input
# beginning with "|". The body is written in binary mode because PDF data is
# not text.
#
# @param file_url [String] URL of the PDF, in a scheme open-uri can open
# @return [String] path of the download, "<base_dir>/temp-file.pdf"
# @raise [ArgumentError] when the URL's scheme cannot be opened
# @raise [URI::InvalidURIError] when file_url is not a valid URI
def get_file_from_url(file_url)
	require 'open-uri'
	uri = URI.parse(file_url)
	raise ArgumentError, "unsupported URL: #{file_url}" unless uri.respond_to?(:open)

	# The block form closes the remote stream once it has been read.
	file_data = uri.open(&:read)
	new_path = @base_dir+"/temp-file.pdf"
	File.binwrite(new_path, file_data)
	new_path
end

#pdf_to_image_files(pages) ⇒ Object



173
174
175
# File 'lib/pdf_extract.rb', line 173

# Asks Docsplit to render the PDF at file_path as PNG images in image_dir.
#
# NOTE(review): `pages` is accepted but not forwarded to Docsplit, unlike
# pdf_to_text_files — confirm whether that is intended.
#
# @param pages [Object] currently unused
# @return [Object] whatever Docsplit.extract_images returns
def pdf_to_image_files(pages)
	Docsplit.extract_images(file_path, output: image_dir, format: [:png])
end

#pdf_to_text_files(pages) ⇒ Object



177
178
179
# File 'lib/pdf_extract.rb', line 177

# Asks Docsplit to write the text of the PDF at file_path into text_dir.
#
# @param pages [Object] forwarded to Docsplit as the :pages option
# @return [Object] whatever Docsplit.extract_text returns
def pdf_to_text_files(pages)
    Docsplit.extract_text(file_path, output: text_dir, pages: pages)
end

#process ⇒ Object



123
124
125
126
127
128
129
# File 'lib/pdf_extract.rb', line 123

# Runs the whole extraction: optional protection removal, image rendering,
# optional full-text extraction, then the per-page extraction.
#
# The scratch directory is removed afterwards even when a step raises, so a
# failed run does not leave temporary files behind.
#
# @return [Hash] the results hash (also available through #results)
def process
	# Tolerate a schema that carried no :options.
	opts = options || {}

	remove_protection if opts[:remove_protection] == true
	results[:images] = pdf_to_image_files("all")
	results[:text] = convert_to_text if opts[:extract_all_text] == true
	process_pages
	results
ensure
	cleanup
end

#process_pages ⇒ Object



138
139
140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/pdf_extract.rb', line 138

# Runs a PageExtractor over every page entry in the schema and stores each
# extractor's results in the results hash.
#
# Entries whose :match is "page_num" are annotated in place with :image_path
# (the rendered PNG for that page) and :pdf_path before extraction, and their
# results are keyed by the page number. Other entries are keyed by nil.
# A schema without pages is a no-op.
#
# @return [Array] the page entries that were processed
def process_pages
	(pages || []).each do |page|
		# Declared up front so the nil key for other entries is explicit
		# rather than a side effect of the assignment inside the `if`.
		page_num = nil
		if page[:match] == "page_num"
			page_num = page[:page]
			page[:image_path] = image_dir+"/temp-file_#{page_num}.png"
			page[:pdf_path] = file_path
		end

		page_extractor = PageExtractor.new(page)
		page_extractor.process
		results[page_num] = page_extractor.results
	end
end

#remove_protection ⇒ Object



133
134
135
136
# File 'lib/pdf_extract.rb', line 133

# Placeholder with an empty body: it currently does nothing and returns nil.
# #process calls it when options[:remove_protection] is true.
def remove_protection
	# TODO: not implemented yet

end

#setup_folders(folder_name) ⇒ Object



103
104
105
106
107
108
# File 'lib/pdf_extract.rb', line 103

# Recreates the scratch directory from scratch, with its text and output
# sub-directories.
#
# Uses FileUtils rather than shelling out to rm/mkdir, and Dir.exist? rather
# than Dir.exists?, which was removed in Ruby 3.2.
#
# @param folder_name [String] scratch directory to (re)create
def setup_folders(folder_name)
	require 'fileutils'

	# Start clean: drop anything left over from an earlier run.
	FileUtils.rm_rf(folder_name) if Dir.exist?(folder_name)
	FileUtils.mkdir_p(folder_name)

	# text_dir/output_dir may not be assigned yet when this runs during
	# construction; fall back to the same layout #initialize uses.
	text_folder = text_dir || File.join(folder_name, 'text_files')
	output_folder = output_dir || File.join(folder_name, 'output')
	FileUtils.mkdir_p(text_folder)
	FileUtils.mkdir_p(output_folder)
end