Class: PdfboxTextExtraction

Inherits:
Object
  • Object
show all
Defined in:
lib/pdfbox_text_extraction.rb,
lib/pdfbox_text_extraction/version.rb

Constant Summary collapse

VERSION =
"1.2.0"

Class Method Summary collapse

Class Method Details

.configure_text_extraction_params(text_stripper, options) ⇒ Object

Sets params on text_stripper.

Parameters:

  • text_stripper (PDFTextStripper)


81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/pdfbox_text_extraction.rb', line 81

# Copies the optional extraction settings found in +options+ onto
# +text_stripper+ by calling the matching setter for each one. A setting
# whose option is absent is left untouched on the stripper.
#
# @param text_stripper [PDFTextStripper] receiver of the setter calls
# @param options [Hash] symbol-keyed settings, described inline below
# @return [Object, nil] not meaningful; called for its side effects
def self.configure_text_extraction_params(text_stripper, options)
  # ---------------------------------------------------
  # Extraction thresholds and tolerances

  # Character-width-based tolerance used to estimate where spaces should be
  # inserted into the text.
  # Default: 0.30000001192092896
  char_tolerance = options[:average_char_tolerance]
  text_stripper.setAverageCharTolerance(char_tolerance) if char_tolerance

  # Minimum whitespace, as a multiple of the max height of the current
  # characters, beyond which the current line start is treated as a
  # paragraph start.
  # Default: 2.5
  drop_threshold = options[:drop_threshold]
  text_stripper.setDropThreshold(drop_threshold) if drop_threshold

  # Multiple of whitespace character widths by which the current line start
  # may be indented from the previous line start before it is treated as a
  # paragraph start.
  # Default: 2.0
  indent_threshold = options[:indent_threshold]
  text_stripper.setIndentThreshold(indent_threshold) if indent_threshold

  # Space-width-based tolerance used to estimate where spaces should be
  # inserted into the text.
  # Default: 0.5
  spacing_tolerance = options[:spacing_tolerance]
  text_stripper.setSpacingTolerance(spacing_tolerance) if spacing_tolerance

  # ---------------------------------------------------
  # Sort order

  # Text tokens in a PDF file are not necessarily stored in the order in
  # which they appear visually. Only nil counts as "not given" here, so an
  # explicit false is still passed on to the stripper.
  # Default: false
  sort_by_position = options[:sort_by_position]
  text_stripper.setSortByPosition(sort_by_position) unless sort_by_position.nil?

  # ---------------------------------------------------
  # Separator tokens

  # Line separator used in the output text.
  # Default: "\n"
  line_separator = options[:line_separator]
  text_stripper.setLineSeparator(line_separator) if line_separator

  # String emitted at the end of a page.
  # Default: ""
  page_end = options[:page_end]
  text_stripper.setPageEnd(page_end) if page_end

  # String emitted at the beginning of a page.
  # Default: ""
  page_start = options[:page_start]
  text_stripper.setPageStart(page_start) if page_start

  # String emitted at the end of a paragraph.
  # Default: ""
  paragraph_end = options[:paragraph_end]
  text_stripper.setParagraphEnd(paragraph_end) if paragraph_end

  # String emitted at the beginning of a paragraph.
  # Default: ""
  paragraph_start = options[:paragraph_start]
  text_stripper.setParagraphStart(paragraph_start) if paragraph_start
end

.run(path_to_pdf, options = {}) ⇒ String

Runs text extraction and returns the extracted text as a string. If crop area dimensions are given, text is extracted from the crop area only. All crop area dimensions are in inches.

Parameters:

  • path_to_pdf (String)
  • options (Hash, optional) (defaults to: {})

Options Hash (options):

  • crop_x (Float)

    crop area top left corner x-coordinate

  • crop_y (Float)

    crop area top left corner y-coordinate

  • crop_width (Float)

    crop area width

  • crop_height (Float)

    crop area height

  • average_char_tolerance (Float)
  • drop_threshold (Float)
  • indent_threshold (Float)
  • spacing_tolerance (Float)
  • sort_by_position (Boolean)
  • line_separator (String)
  • page_end (String)
  • page_start (String)
  • paragraph_end (String)
  • paragraph_start (String)

Returns:

  • (String)

    the extracted text



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/pdfbox_text_extraction.rb', line 44

# Runs text extraction on the PDF at +path_to_pdf+ and returns the extracted
# text. If crop options are given, only text inside the crop rectangle is
# extracted (page by page); otherwise the text of the whole document is
# returned. The loaded document is always closed, even when extraction fails.
#
# @param path_to_pdf [String] path of the PDF file to read
# @param options [Hash] optional settings
# @option options [Float] :crop_x crop area top left corner x-coordinate, in inches
# @option options [Float] :crop_y crop area top left corner y-coordinate, in inches
# @option options [Float] :crop_width crop area width, in inches
# @option options [Float] :crop_height crop area height, in inches
#   Remaining keys are handed to configure_text_extraction_params.
# @return [String] the extracted text
# @raise [ArgumentError] if some, but not all, crop options are given
def self.run(path_to_pdf, options={})
  crop_keys = [:crop_x, :crop_y, :crop_width, :crop_height]
  file = File.new(path_to_pdf)
  pd_doc = PDDocument.load(file)
  all_text = ''
  if crop_keys.any? { |key| options[key] }
    # Crop options given, extract from crop area only. All four values are
    # needed to build the rectangle, so reject a partial specification with
    # a clear message instead of failing on nil arithmetic below.
    missing = crop_keys.reject { |key| options[key] }
    unless missing.empty?
      raise ArgumentError,
            "incomplete crop area, missing options: #{missing.join(', ')}"
    end

    # Crop dimensions arrive in inches; scale them by 72 units per inch.
    res = 72
    body_text_rect = Rectangle2D::Float.new(
      (options[:crop_x] * res),
      (options[:crop_y] * res),
      (options[:crop_width] * res),
      (options[:crop_height] * res)
    )
    text_stripper = PDFTextStripperByArea.new
    text_stripper.addRegion("bodyText", body_text_rect)
    configure_text_extraction_params(text_stripper, options)

    pd_doc.getPages.each do |page|
      text_stripper.extractRegions(page)
      # Append the body text of the current page
      all_text << text_stripper.getTextForRegion("bodyText")
    end
  else
    # No crop options given, extract all text
    text_stripper = PDFTextStripper.new
    configure_text_extraction_params(text_stripper, options)
    all_text << text_stripper.getText(pd_doc)
  end

  all_text
ensure
  # pd_doc is nil when opening/loading itself failed; otherwise release the
  # document regardless of whether extraction succeeded.
  pd_doc.close if pd_doc
end