Class: PDFextract

Inherits:

Object

Object
PDFextract

show all

Defined in:: lib/pdf_extract.rb

Instance Attribute Summary collapse

#base_dir ⇒ Object

Returns the value of attribute base_dir.
#file_path ⇒ Object

Returns the value of attribute file_path.
#image_dir ⇒ Object

Returns the value of attribute image_dir.
#options ⇒ Object

Returns the value of attribute options.
#output_dir ⇒ Object

Returns the value of attribute output_dir.
#pages ⇒ Object

Returns the value of attribute pages.
#results ⇒ Object

Returns the value of attribute results.
#text_dir ⇒ Object

Returns the value of attribute text_dir.

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(schema) ⇒ `PDFextract`

Returns a new instance of PDFextract.

# File 'lib/pdf_extract.rb', line 84

def initialize(schema)
	schema.symbolize_keys!

	@base_dir = Time.now.to_i.to_s
	setup_folders(@base_dir)
	@text_dir = @base_dir+'/text_files'
	@image_dir = @base_dir+'/image_files'
	@output_dir = @base_dir+'/output'
	if schema[:file_url]
		@file_path = get_file_from_url(schema[:file_url])
	else
		@file_path = get_file_from_path(schema[:file_path])
		puts @file_path
	end
	@options = schema[:options] if schema[:options]
	@pages = schema[:pages] if schema[:options]
	@results = {}

end

Instance Attribute Details

#base_dir ⇒ `Object`

Returns the value of attribute base_dir.



81
82
83

# File 'lib/pdf_extract.rb', line 81

def base_dir
  @base_dir
end

#file_path ⇒ `Object`

Returns the value of attribute file_path.



80
81
82

# File 'lib/pdf_extract.rb', line 80

def file_path
  @file_path
end

#image_dir ⇒ `Object`

Returns the value of attribute image_dir.



82
83
84

# File 'lib/pdf_extract.rb', line 82

def image_dir
  @image_dir
end

#options ⇒ `Object`

Returns the value of attribute options.



81
82
83

# File 'lib/pdf_extract.rb', line 81

def options
  @options
end

#output_dir ⇒ `Object`

Returns the value of attribute output_dir.



82
83
84

# File 'lib/pdf_extract.rb', line 82

def output_dir
  @output_dir
end

#pages ⇒ `Object`

Returns the value of attribute pages.



82
83
84

# File 'lib/pdf_extract.rb', line 82

def pages
  @pages
end

#results ⇒ `Object`

Returns the value of attribute results.



80
81
82

# File 'lib/pdf_extract.rb', line 80

def results
  @results
end

#text_dir ⇒ `Object`

Returns the value of attribute text_dir.



81
82
83

# File 'lib/pdf_extract.rb', line 81

def text_dir
  @text_dir
end

Class Method Details

.example_schema ⇒ `Object`

# File 'lib/pdf_extract.rb', line 203

def self.example_schema 
	{
		file_path: "test_files/dream-may.pdf",
		options: {
			remove_protection: false,
			password: nil,
			extract_all_text: true,
			extract_text: []
		},
		pages: [{
			match: "page_num",
			page: 1,
			items: [
				{
					name: 'title',
					kind: 'ocr', #alternative is kind table
					dimensions:  {
						x1: 10,
						x2: 282,
						y1: 50,
						y2: 100
					}
				},
				{
					name: 'units_table',
					kind: 'table',
					dimensions: {
						x1: 0,
						x2: 265.73,
						y1: 184.94,
						y2: 233.84
					}
				}
			]
		}]
	}
end

.extract_ocr(image_path, coords) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 188

def self.extract_ocr(image_path,coords)
	
	x = coords["x1"]
	y = coords["y1"]
	width = coords["x2"] - x
	height = coords["y2"] - y
	puts image_path
	puts [x,y,width,height]
	engine = Tesseract::Engine.new(language: :eng)
	engine.image = image_path
	engine.select x,y,width,height
	text = engine.text.strip
	return text
end

Instance Method Details

#cleanup ⇒ `Object`



130
131
132

# File 'lib/pdf_extract.rb', line 130

def cleanup
	`rm -r #{base_dir}`
end

#convert_to_image(pages = "all") ⇒ `Object`

# File 'lib/pdf_extract.rb', line 165

def convert_to_image(pages = "all")
	pdf_to_image_files(pages)
	images = []
	Dir.glob(image_dir+"/*.png").each do |file|  
		images << file 
	end
end

#convert_to_text(pages = "all") ⇒ `Object`

# File 'lib/pdf_extract.rb', line 154

def convert_to_text(pages = "all")
	pdf_to_text_files(pages)
	text = {}
	#take the text from the pdf pages and load em into this shit
	Dir.glob(text_dir+"/*.txt").each do |file|  
		page_num = file.split("_")[-1].split(".")[0]
		text[page_num] = File.open(file).read 
	end
	puts text
	return text
end

#extract_with_ocr(page_path, dimensions) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 180

def extract_with_ocr(page_path,dimensions)
	engine = Tesseract::Engine.new(language: :eng)
	engine.image = page_path
	engine.select 1,34,59,281
	text = engine.text.strip
	dimensions[:result] = text 
	return text
end

#get_file_from_path(path) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 117

def get_file_from_path(path)
	new_path = @base_dir+"/temp-file.pdf"
	`cp #{path} #{new_path}` 
	return new_path
end

#get_file_from_url(file_url) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 110

def get_file_from_url(file_url)
	file_data = open(file_url).read
	temp_file = open(@base_dir+"/temp-file.pdf","w")
	temp_file.write file_data
	temp_file.close
	return temp_file.path
end

#pdf_to_image_files(pages) ⇒ `Object`



173
174
175

# File 'lib/pdf_extract.rb', line 173

def pdf_to_image_files(pages)
	Docsplit.extract_images(file_path,:output => image_dir, :format => [:png])
end

#pdf_to_text_files(pages) ⇒ `Object`



177
178
179

# File 'lib/pdf_extract.rb', line 177

def pdf_to_text_files(pages)
    Docsplit.extract_text(file_path, :output => text_dir,:pages => pages)
end

#process ⇒ `Object`

# File 'lib/pdf_extract.rb', line 123

def process
	remove_protection if options[:remove_protection] == true 
	results[:images] = pdf_to_image_files("all")
	results[:text] = convert_to_text if options[:extract_all_text] == true 
	process_pages
	cleanup
end

#process_pages ⇒ `Object`

# File 'lib/pdf_extract.rb', line 138

def process_pages
	pages.each do |page|
		if page[:match] == "page_num"
			page_num = page[:page]
			page[:image_path] = image_dir+"/temp-file_#{page_num}.png"
			page[:pdf_path] = file_path
							
		end
		page_extractor = PageExtractor.new(page)
		page_extractor.process
		results[page_num] = page_extractor.results
	end

end

#remove_protection ⇒ `Object`

# File 'lib/pdf_extract.rb', line 133

def remove_protection
	#todo

end

#setup_folders(folder_name) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 103

def setup_folders(folder_name)
		`rm -r #{folder_name}` if Dir.exists? folder_name
		`mkdir #{folder_name}`
		`mkdir #{text_dir}`
		`mkdir #{output_dir}`
end

Class: PDFextract

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(schema) ⇒ PDFextract

Instance Attribute Details

#base_dir ⇒ Object

#file_path ⇒ Object

#image_dir ⇒ Object

#options ⇒ Object

#output_dir ⇒ Object

#pages ⇒ Object

#results ⇒ Object

#text_dir ⇒ Object

Class Method Details

.example_schema ⇒ Object

.extract_ocr(image_path, coords) ⇒ Object

Instance Method Details

#cleanup ⇒ Object

#convert_to_image(pages = "all") ⇒ Object

#convert_to_text(pages = "all") ⇒ Object

#extract_with_ocr(page_path, dimensions) ⇒ Object

#get_file_from_path(path) ⇒ Object

#get_file_from_url(file_url) ⇒ Object

#pdf_to_image_files(pages) ⇒ Object

#pdf_to_text_files(pages) ⇒ Object

#process ⇒ Object

#process_pages ⇒ Object

#remove_protection ⇒ Object

#setup_folders(folder_name) ⇒ Object