Class: Tabula::Extraction::PagesInfoExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/tabula/extraction.rb

Instance Method Summary

Constructor Details

#initialize(pdf_file_path, password = '') ⇒ PagesInfoExtractor

Returns a new instance of PagesInfoExtractor.



374
375
376
377
378
379
380
# File 'lib/tabula/extraction.rb', line 374

# Opens the PDF at +pdf_file_path+ and prepares the state used by #pages.
#
# @param pdf_file_path [String] path of the PDF document to inspect
# @param password [String] password handed to Extraction.openPDF
def initialize(pdf_file_path, password = '')
  @pdf_filename = pdf_file_path
  @pdf_file     = Extraction.openPDF(@pdf_filename, password)

  # Page list taken from the document catalog; #pages iterates over it.
  catalog    = @pdf_file.getDocumentCatalog
  @all_pages = catalog.getAllPages

  # NOTE(review): +password+ is not passed on to ObjectExtractor here --
  # confirm whether that constructor needs it for encrypted documents.
  @extractor = Tabula::Extraction::ObjectExtractor.new(@pdf_filename, :all)
end

Instance Method Details

#pages ⇒ Object



382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# File 'lib/tabula/extraction.rb', line 382

# Builds a lazy enumeration with one entry per page of the document.
#
# Until some page reports +has_text?+, each page is obtained from
# +@extractor.extract_page+. After that, the remaining pages are plain
# Tabula::Page objects built only from the crop box size, the rotation and
# the one-based page number (presumably to avoid the cost of a full
# extraction -- confirm against ObjectExtractor).
#
# When the enumeration ends, or is aborted by an exception or a +break+,
# both the PDF document and the extractor are closed.
#
# @return [Enumerator] yields the per-page objects in document order
def pages
  Enumerator.new do |yielder|
    # Kept inside the block so every run of the enumerator starts from a
    # clean state instead of inheriting the flag from a previous run.
    found_page_with_texts = false

    begin
      @all_pages.each_with_index do |raw_page, index|
        page_number = index + 1 # remember, these are one-indexed

        page_info =
          if found_page_with_texts
            crop_box = raw_page.findCropBox
            Tabula::Page.new(@pdf_filename,
                             crop_box.width,
                             crop_box.height,
                             raw_page.getRotation.to_i,
                             page_number)
          else
            extracted = @extractor.extract_page(page_number)
            found_page_with_texts = extracted.has_text?
            extracted
          end

        yielder.yield page_info
      end
    ensure
      # Nested ensure: the extractor must be released even when closing the
      # document itself raises.
      begin
        @pdf_file.close
      ensure
        @extractor.close!
      end
    end
  end
end