Class: PDFextract
- Inherits:
-
Object
- Object
- PDFextract
- Defined in:
- lib/pdf_extract.rb
Instance Attribute Summary collapse
-
#base_dir ⇒ Object
Returns the value of attribute base_dir.
-
#file_path ⇒ Object
Returns the value of attribute file_path.
-
#image_dir ⇒ Object
Returns the value of attribute image_dir.
-
#options ⇒ Object
Returns the value of attribute options.
-
#output_dir ⇒ Object
Returns the value of attribute output_dir.
-
#pages ⇒ Object
Returns the value of attribute pages.
-
#results ⇒ Object
Returns the value of attribute results.
-
#text_dir ⇒ Object
Returns the value of attribute text_dir.
Class Method Summary collapse
Instance Method Summary collapse
- #cleanup ⇒ Object
- #convert_to_image(pages = "all") ⇒ Object
- #convert_to_text(pages = "all") ⇒ Object
- #extract_with_ocr(page_path, dimensions) ⇒ Object
- #get_file_from_path(path) ⇒ Object
- #get_file_from_url(file_url) ⇒ Object
-
#initialize(schema) ⇒ PDFextract
constructor
A new instance of PDFextract.
- #pdf_to_image_files(pages) ⇒ Object
- #pdf_to_text_files(pages) ⇒ Object
- #process ⇒ Object
- #process_pages ⇒ Object
- #remove_protection ⇒ Object
- #setup_folders(folder_name) ⇒ Object
Constructor Details
#initialize(schema) ⇒ PDFextract
Returns a new instance of PDFextract.
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/pdf_extract.rb', line 84
# Builds an extractor from a schema hash and stages the source PDF inside a
# working directory named after the current Unix timestamp.
#
# @param schema [Hash] read keys: :file_url or :file_path (source PDF),
#   :options and :pages. Keys are symbolized in place with #symbolize_keys!
#   (not core Ruby -- presumably ActiveSupport; confirm).
def initialize(schema)
  schema.symbolize_keys!

  @base_dir   = Time.now.to_i.to_s
  @text_dir   = "#{@base_dir}/text_files"
  @image_dir  = "#{@base_dir}/image_files"
  @output_dir = "#{@base_dir}/output"
  # The folder paths are assigned first because setup_folders reads
  # text_dir and output_dir; calling it earlier handed it nil paths.
  setup_folders(@base_dir)

  @file_path =
    if schema[:file_url]
      get_file_from_url(schema[:file_url])
    else
      get_file_from_path(schema[:file_path])
    end

  # Each setting is taken from its own key (pages used to be gated on
  # :options) and defaults to an empty container so later steps can iterate.
  @options = schema[:options] || {}
  @pages   = schema[:pages] || []
  @results = {}
end
Instance Attribute Details
#base_dir ⇒ Object
Returns the value of attribute base_dir.
81 82 83 |
# File 'lib/pdf_extract.rb', line 81 def base_dir @base_dir end |
#file_path ⇒ Object
Returns the value of attribute file_path.
80 81 82 |
# File 'lib/pdf_extract.rb', line 80 def file_path @file_path end |
#image_dir ⇒ Object
Returns the value of attribute image_dir.
82 83 84 |
# File 'lib/pdf_extract.rb', line 82 def image_dir @image_dir end |
#options ⇒ Object
Returns the value of attribute options.
81 82 83 |
# File 'lib/pdf_extract.rb', line 81 def options @options end |
#output_dir ⇒ Object
Returns the value of attribute output_dir.
82 83 84 |
# File 'lib/pdf_extract.rb', line 82 def output_dir @output_dir end |
#pages ⇒ Object
Returns the value of attribute pages.
82 83 84 |
# File 'lib/pdf_extract.rb', line 82 def pages @pages end |
#results ⇒ Object
Returns the value of attribute results.
80 81 82 |
# File 'lib/pdf_extract.rb', line 80 def results @results end |
#text_dir ⇒ Object
Returns the value of attribute text_dir.
81 82 83 |
# File 'lib/pdf_extract.rb', line 81 def text_dir @text_dir end |
Class Method Details
.example_schema ⇒ Object
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 |
# File 'lib/pdf_extract.rb', line 203
# Sample schema hash showing the keys the extractor reads.
#
# @return [Hash] a fresh hash with :file_path, :options and :pages entries
def self.example_schema
  {
    file_path: "test_files/dream-may.pdf",
    options: {
      remove_protection: false,
      password: nil,
      extract_all_text: true,
      extract_text: []
    },
    pages: [
      {
        match: "page_num",
        page: 1,
        items: [
          {
            name: 'title',
            # alternative is kind table
            kind: 'ocr',
            dimensions: { x1: 10, x2: 282, y1: 50, y2: 100 }
          },
          {
            name: 'units_table',
            kind: 'table',
            dimensions: { x1: 0, x2: 265.73, y1: 184.94, y2: 233.84 }
          }
        ]
      }
    ]
  }
end
.extract_ocr(image_path, coords) ⇒ Object
188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
# File 'lib/pdf_extract.rb', line 188
# Runs a Tesseract engine over one rectangular region of an image and
# returns the recognised text with surrounding whitespace stripped.
#
# @param image_path [String] image handed to the engine
# @param coords [Hash] corner coordinates under "x1", "y1", "x2", "y2";
#   symbol keys (:x1 ...) are accepted as well, matching the example schema
# @return [String] stripped text reported by the engine
# @raise [KeyError] when a coordinate is missing under both key styles
def self.extract_ocr(image_path, coords)
  x1, y1, x2, y2 = %w[x1 y1 x2 y2].map do |key|
    coords.fetch(key) { coords.fetch(key.to_sym) }
  end

  engine = Tesseract::Engine.new(language: :eng)
  engine.image = image_path
  # The engine is given origin plus width/height, not two corners.
  engine.select(x1, y1, x2 - x1, y2 - y1)
  engine.text.strip
end
Instance Method Details
#cleanup ⇒ Object
130 131 132 |
# File 'lib/pdf_extract.rb', line 130
# Recursively deletes the working directory named by base_dir.
#
# @return [Boolean, nil] result of Kernel#system for the rm invocation
def cleanup
  # Argument-vector form: no shell is spawned, so spaces or shell
  # metacharacters in base_dir reach rm as one literal argument.
  system("rm", "-r", base_dir.to_s)
end
#convert_to_image(pages = "all") ⇒ Object
165 166 167 168 169 170 171 |
# File 'lib/pdf_extract.rb', line 165
# Renders the PDF to image files, then lists the PNG files found in image_dir.
#
# @param pages [String, Object] forwarded unchanged to pdf_to_image_files
# @return [Array<String>] paths matching "<image_dir>/*.png"
def convert_to_image(pages = "all")
  pdf_to_image_files(pages)
  # The glob result is the return value; the former accumulator array was
  # filled but never returned.
  Dir.glob("#{image_dir}/*.png")
end
#convert_to_text(pages = "all") ⇒ Object
154 155 156 157 158 159 160 161 162 163 164 |
# File 'lib/pdf_extract.rb', line 154
# Extracts the PDF text into per-page files, then loads every .txt file in
# text_dir into a hash.
#
# @param pages [String, Object] forwarded unchanged to pdf_to_text_files
# @return [Hash{String => String}] file contents keyed by the token between
#   the last "_" of the path and the following "."
def convert_to_text(pages = "all")
  pdf_to_text_files(pages)

  Dir.glob("#{text_dir}/*.txt").each_with_object({}) do |file, text|
    page_num = file.split("_")[-1].split(".")[0]
    # File.read closes the handle; File.open(...).read left it open.
    text[page_num] = File.read(file)
  end
end
#extract_with_ocr(page_path, dimensions) ⇒ Object
180 181 182 183 184 185 186 187 |
# File 'lib/pdf_extract.rb', line 180
# Runs a Tesseract engine over a region of a page image, stores the stripped
# text under dimensions[:result] and returns it.
#
# @param page_path [String] image handed to the engine
# @param dimensions [Hash] mutated: receives :result. When it carries all of
#   :x1, :y1, :x2 and :y2 the region is derived from them; otherwise the
#   previously hard-coded region (1, 34, 59, 281) is used.
#   NOTE(review): assumes callers pass the coordinate hash itself -- confirm.
# @return [String] stripped text reported by the engine
def extract_with_ocr(page_path, dimensions)
  corners = %i[x1 y1 x2 y2].map { |key| dimensions[key] }
  region =
    if corners.all?
      x1, y1, x2, y2 = corners
      # Origin plus width/height, as in the class-level extract_ocr.
      [x1, y1, x2 - x1, y2 - y1]
    else
      [1, 34, 59, 281]
    end

  engine = Tesseract::Engine.new(language: :eng)
  engine.image = page_path
  engine.select(*region)
  text = engine.text.strip
  dimensions[:result] = text
  text
end
#get_file_from_path(path) ⇒ Object
117 118 119 120 121 |
# File 'lib/pdf_extract.rb', line 117
# Copies a local PDF into the working directory as "temp-file.pdf".
#
# @param path [String] location of the source file
# @return [String] "<@base_dir>/temp-file.pdf", returned whether or not the
#   copy succeeded (cp failures are not raised)
def get_file_from_path(path)
  staged_path = "#{@base_dir}/temp-file.pdf"
  # Argument-vector form keeps the caller-supplied path away from a shell;
  # "--" stops cp from reading a leading "-" as an option.
  system("cp", "--", path.to_s, staged_path)
  staged_path
end
#get_file_from_url(file_url) ⇒ Object
110 111 112 113 114 115 116 |
# File 'lib/pdf_extract.rb', line 110
# Downloads a PDF into the working directory as "temp-file.pdf".
#
# @param file_url [String] address of the remote file
# @return [String] "<@base_dir>/temp-file.pdf"
def get_file_from_url(file_url)
  staged_path = "#{@base_dir}/temp-file.pdf"
  # URI.open never treats a leading "|" as a command the way Kernel#open
  # does, and Kernel#open stopped opening URLs in Ruby 3.0. It comes from
  # open-uri, which the former open(url) call already depended on.
  URI.open(file_url) do |remote|
    # Binary mode keeps the PDF bytes intact; both handles close with their blocks.
    File.open(staged_path, "wb") { |local| IO.copy_stream(remote, local) }
  end
  staged_path
end
#pdf_to_image_files(pages) ⇒ Object
173 174 175 |
# File 'lib/pdf_extract.rb', line 173
# Asks Docsplit to render file_path as PNG images written under image_dir.
#
# @param pages [Object] accepted but not forwarded to Docsplit
# @return [Object] whatever Docsplit.extract_images returns
def pdf_to_image_files(pages)
  Docsplit.extract_images(
    file_path,
    output: image_dir,
    format: [:png]
  )
end
#pdf_to_text_files(pages) ⇒ Object
177 178 179 |
# File 'lib/pdf_extract.rb', line 177
# Asks Docsplit to extract the text of file_path into files under text_dir.
#
# @param pages [Object] forwarded to Docsplit as the :pages option
# @return [Object] whatever Docsplit.extract_text returns
def pdf_to_text_files(pages)
  Docsplit.extract_text(
    file_path,
    output: text_dir,
    pages: pages
  )
end
#process ⇒ Object
123 124 125 126 127 128 129 |
# File 'lib/pdf_extract.rb', line 123
# Runs the whole pipeline: optional protection removal, image rendering,
# optional full-text extraction, per-page processing, then cleanup.
#
# @return [Hash] the results hash, with :images always set and :text set
#   when the :extract_all_text option is true
def process
  # The flags live in the options hash; comparing the bare array literal
  # [:remove_protection] with true was always false.
  opts = options || {}

  remove_protection if opts[:remove_protection] == true
  results[:images] = pdf_to_image_files("all")
  results[:text] = convert_to_text if opts[:extract_all_text] == true
  process_pages
  results
ensure
  # The working directory is removed even when a step above raises.
  cleanup
end
#process_pages ⇒ Object
138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# File 'lib/pdf_extract.rb', line 138
# Runs a PageExtractor over every configured page and stores each
# extractor's results in the results hash.
#
# @return [Array] the page list that was iterated (empty when pages is nil)
def process_pages
  # pages is nil when the schema supplied none; iterate nothing in that case.
  (pages || []).each do |page|
    page_num = nil
    if page[:match] == "page_num"
      page_num = page[:page]
      page[:image_path] = "#{image_dir}/temp-file_#{page_num}.png"
      page[:pdf_path] = file_path
    end

    extractor = PageExtractor.new(page)
    extractor.process
    # NOTE(review): for any other :match value page_num stays nil, so those
    # pages all share the nil key -- confirm the intended key.
    results[page_num] = extractor.results
  end
end
#remove_protection ⇒ Object
133 134 135 136 |
# File 'lib/pdf_extract.rb', line 133
# Placeholder: does nothing yet.
#
# @return [nil]
def remove_protection
  # TODO: not implemented
end
#setup_folders(folder_name) ⇒ Object
103 104 105 106 107 108 |
# File 'lib/pdf_extract.rb', line 103
# Recreates the working directory from scratch and creates the text and
# output folders beneath it.
#
# @param folder_name [String] working directory; removed first if present
# @return [Array<String>] the directories that were ensured
def setup_folders(folder_name)
  # Dir.exists? was removed in Ruby 3.2. The argument-vector form of system
  # keeps folder_name away from a shell.
  system("rm", "-r", folder_name) if Dir.exist?(folder_name)

  # text_dir / output_dir may still be nil when they have not been assigned
  # yet; skip those rather than handing mkdir an empty argument.
  [folder_name, text_dir, output_dir].compact.each do |dir|
    Dir.mkdir(dir) unless Dir.exist?(dir)
  end
end