Class: PdfboxTextExtraction

Inherits:

Object

Object
PdfboxTextExtraction

show all

Defined in: lib/pdfbox_text_extraction.rb,
lib/pdfbox_text_extraction/version.rb

Constant Summary collapse

VERSION =

"1.2.0"

Class Method Summary collapse

.configure_text_extraction_params(text_stripper, options) ⇒ Object

Sets params on text_stripper.
.run(path_to_pdf, options = {}) ⇒ String

Runs text extraction and returns extracted text as string.

Class Method Details

.configure_text_extraction_params(text_stripper, options) ⇒ `Object`

Sets params on text_stripper.

Parameters:

text_stripper (PDFTextStripper)

# File 'lib/pdfbox_text_extraction.rb', line 81

# Applies the recognised entries of +options+ to +text_stripper+ by calling
# the matching setter for each one. Options that are absent leave the
# stripper's existing setting untouched.
#
# @param text_stripper [PDFTextStripper] object that receives the setter calls
# @param options [Hash] see the keys handled below
def self.configure_text_extraction_params(text_stripper, options)
  # -----------------------------------------------------
  # Extraction thresholds and tolerances

  # Character width-based tolerance used to estimate where spaces should be
  # inserted into the text.
  # Default: 0.30000001192092896
  char_tolerance = options[:average_char_tolerance]
  text_stripper.setAverageCharTolerance(char_tolerance) if char_tolerance

  # Minimum whitespace, as a multiple of the max height of the current
  # characters, beyond which the current line start is considered to be a
  # paragraph start.
  # Default: 2.5
  drop = options[:drop_threshold]
  text_stripper.setDropThreshold(drop) if drop

  # Multiple of whitespace character widths by which the current line start
  # may be indented from the previous line start before it is considered to
  # be a paragraph start.
  # Default: 2.0
  indent = options[:indent_threshold]
  text_stripper.setIndentThreshold(indent) if indent

  # Space width-based tolerance used to estimate where spaces should be
  # inserted into the text.
  # Default: 0.5
  spacing = options[:spacing_tolerance]
  text_stripper.setSpacingTolerance(spacing) if spacing

  # -----------------------------------------------------
  # Sort order

  # The order of the text tokens in a PDF file may not be the same as the
  # order in which they appear visually on the screen.
  # Only nil is skipped here, so an explicit `false` is still forwarded.
  # Default: false
  sort_flag = options[:sort_by_position]
  text_stripper.setSortByPosition(sort_flag) unless sort_flag.nil?

  # -----------------------------------------------------
  # Separator tokens

  # String used as the line separator in the output text.
  # Default: "\n"
  line_sep = options[:line_separator]
  text_stripper.setLineSeparator(line_sep) if line_sep

  # String emitted at the end of a page.
  # Default: ""
  page_end_token = options[:page_end]
  text_stripper.setPageEnd(page_end_token) if page_end_token

  # String emitted at the start of a page.
  # Default: ""
  page_start_token = options[:page_start]
  text_stripper.setPageStart(page_start_token) if page_start_token

  # String emitted at the end of a paragraph.
  # Default: ""
  paragraph_end_token = options[:paragraph_end]
  text_stripper.setParagraphEnd(paragraph_end_token) if paragraph_end_token

  # String emitted at the start of a paragraph.
  # Default: ""
  paragraph_start_token = options[:paragraph_start]
  text_stripper.setParagraphStart(paragraph_start_token) if paragraph_start_token
end

.run(path_to_pdf, options = {}) ⇒ `String`

Runs text extraction and returns extracted text as string. Optionally can extract text from crop area only if crop area dimensions are given. All crop area dimensions are in inches.

Parameters:

path_to_pdf (String)
options (Hash, optional) (defaults to: {})

Options Hash (options):

crop_x (Float) —

crop area top left corner x-coordinate
crop_y (Float) —

crop area top left corner y-coordinate
crop_width (Float) —

crop area width
crop_height (Float) —

crop area height
average_char_tolerance (Float)
drop_threshold (Float)
indent_threshold (Float)
spacing_tolerance (Float)
sort_by_position (Boolean)
line_separator (String)
page_end (String)
page_start (String)
paragraph_end (String)
paragraph_start (String)

Returns:

(String) —

the extracted text

# File 'lib/pdfbox_text_extraction.rb', line 44

# Runs text extraction on the PDF at +path_to_pdf+ and returns the extracted
# text as a single string.
#
# When any of the crop options is given, text is extracted from that
# rectangular region of every page only; otherwise all text is extracted.
# Crop dimensions are given in inches and are multiplied by 72 before being
# handed to the region stripper.
#
# The loaded document is always closed before this method returns or raises.
#
# @param path_to_pdf [String]
# @param options [Hash] crop options (:crop_x, :crop_y, :crop_width,
#   :crop_height) plus any option understood by
#   configure_text_extraction_params
# @return [String] the extracted text
# @raise [ArgumentError] if only some of the four crop options are given
def self.run(path_to_pdf, options={})
  # NOTE(review): whether `File` is Ruby's File or an imported java.io.File
  # depends on declarations outside this method - confirm before adding
  # handle cleanup for it.
  file = File.new(path_to_pdf)
  pd_doc = PDDocument.load(file)

  begin
    all_text = ''
    crop_keys = [:crop_x, :crop_y, :crop_width, :crop_height]

    if crop_keys.any? { |key| options[key] }
      # A crop rectangle needs all four values; fail with a clear message
      # instead of a NoMethodError on nil further down.
      missing = crop_keys.reject { |key| options[key] }
      unless missing.empty?
        raise ArgumentError,
              "Incomplete crop area, missing options: #{missing.join(', ')}"
      end

      # crop options given, extract from crop area only
      res = 72
      body_text_rect = Rectangle2D::Float.new(
        (options[:crop_x] * res),
        (options[:crop_y] * res),
        (options[:crop_width] * res),
        (options[:crop_height] * res)
      )
      text_stripper = PDFTextStripperByArea.new
      text_stripper.addRegion("bodyText", body_text_rect)
      configure_text_extraction_params(text_stripper, options)

      pd_doc.getPages.each do |page|
        text_stripper.extractRegions(page)
        # Append the body text of the current page
        all_text << text_stripper.getTextForRegion("bodyText")
      end
    else
      # No crop options given, extract all text
      text_stripper = PDFTextStripper.new
      configure_text_extraction_params(text_stripper, options)
      all_text << text_stripper.getText(pd_doc)
    end

    all_text
  ensure
    # Release the document even when extraction raised.
    pd_doc.close
  end
end