Class: Tabula::Extraction::ObjectExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/tabula/extraction.rb

Direct Known Subclasses

SpreadsheetExtractor

Constant Summary collapse

# Matches one printable character; processTextPosition only keeps text
# elements whose character matches it.
PRINTABLE_RE =
/[[:print:]]/
# Option defaults; #initialize merges the caller-supplied options over them.
DEFAULT_OPTIONS =
{
  # Optional callable: strokePath skips a path when the callable returns a
  # falsy value for the path's RGB colour components. nil disables the filter.
  :line_color_filter => nil,
  # When falsy, strokePath discards every path without recording rulings.
  :extract_ruling_lines => true
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(pdf_filename, pages = [1], password = '', options = {}) ⇒ ObjectExtractor

TODO: the pages constructor argument does not make sense now that we have extract_page and extract_pages

Raises:

  • (Errno::ENOENT)


36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/tabula/extraction.rb', line 36

# Opens +pdf_filename+ and sets up the extractor's per-page state.
#
# pdf_filename - path of the PDF document to open.
# pages        - page numbers used by #extract when it is called without an
#                argument, or :all for every page of the document.
# password     - handed to Extraction.openPDF together with the filename.
# options      - Hash merged over DEFAULT_OPTIONS.
#
# Raises Errno::ENOENT when +pdf_filename+ does not exist.
def initialize(pdf_filename, pages=[1], password='', options={})
  # File.exists? is deprecated and was removed in Ruby 3.2; File.exist? is
  # available on every Ruby version. The path is included in the error so the
  # caller can see which file was missing.
  raise Errno::ENOENT, pdf_filename.to_s unless File.exist?(pdf_filename)
  @pdf_filename = pdf_filename
  @pdf_file = Extraction.openPDF(pdf_filename, password)
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
  @pages = pages == :all ?  (1..@all_pages.size) : pages

  super()

  self.options = DEFAULT_OPTIONS.merge(options)
  self.characters = []
  @debug_clipping_paths = false
  # Cache used by currentClippingPath.
  @clipping_path = nil
  @transformed_clipping_path = nil
  self.clipping_paths = []
  @rulings = []
  # Start at the largest Float so the first character seen always lowers them.
  @min_char_width = @min_char_height = Float::MAX
end

Instance Attribute Details

#charactersObject

Returns the value of attribute characters.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @characters: the array that #initialize creates, #clear! empties
# and processTextPosition appends text elements to.
def characters
  @characters
end

#clipping_pathsObject

Returns the value of attribute clipping_paths.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @clipping_paths: the array that #initialize creates, #clear!
# empties and processTextPosition appends zone entities to while
# debug_clipping_paths is truthy.
def clipping_paths
  @clipping_paths
end

#debug_clipping_pathsObject

Returns the value of attribute debug_clipping_paths.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @debug_clipping_paths (false after #initialize). When truthy,
# processTextPosition records the bounds of each new clipping path in
# clipping_paths.
def debug_clipping_paths
  @debug_clipping_paths
end

#debug_textObject

Returns the value of attribute debug_text.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @debug_text.
# NOTE(review): none of the methods listed on this page assign or read this
# attribute; confirm whether it is still used.
def debug_text
  @debug_text
end

#optionsObject

Returns the value of attribute options.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @options: the Hash built in #initialize by merging the caller's
# options over DEFAULT_OPTIONS. strokePath reads :extract_ruling_lines and
# :line_color_filter from it.
def options
  @options
end

Instance Method Details

#clear!Object



109
110
111
112
113
114
115
# File 'lib/tabula/extraction.rb', line 109

# Resets the per-page state so the next page starts from scratch.
#
# Empties the collected characters, clipping paths and rulings, restores the
# minimum character width/height to Float::MAX and drops the cached page
# transform together with the clipping-path cache derived from it.
#
# Returns the (now empty) @rulings array.
def clear!
  self.characters.clear
  self.clipping_paths.clear
  @min_char_width = @min_char_height = Float::MAX
  @page_transform = nil
  # currentClippingPath caches bounds that were computed with the page
  # transform dropped above; invalidate them too so a later page can never be
  # served bounds that were transformed for a previous page.
  @clipping_path = nil
  @transformed_clipping_path = nil
  @transformed_clipping_path_bounds = nil
  @rulings.clear
end

#close!Object



55
56
57
58
59
# File 'lib/tabula/extraction.rb', line 55

# Closes the underlying PDF document and flags the extractor as closed.
# ensure_open! raises first when the document has been closed before.
#
# Returns true.
def close!
  ensure_open!
  @pdf_file.close
  @pdf_file_closed = true
end

#currentClippingPathObject



253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'lib/tabula/extraction.rb', line 253

# Returns the bounds of the graphics state's current clipping path after it
# has been run through transformPath.
#
# The last clipping path and its transformed bounds are cached, so the
# transformation only happens again when the graphics state reports a
# clipping path that differs from the cached one.
def currentClippingPath
  cp = getGraphicsState.getCurrentClippingPath

  unless cp == @clipping_path
    @clipping_path = cp
    @transformed_clipping_path = transformPath(cp)
    @transformed_clipping_path_bounds = @transformed_clipping_path.getBounds
  end

  @transformed_clipping_path_bounds
end

#drawImage(image, at) ⇒ Object



229
230
# File 'lib/tabula/extraction.rb', line 229

# Deliberately empty: both arguments are ignored and nil is returned.
# NOTE(review): presumably overrides an image-drawing callback of the
# superclass so that images are skipped during extraction - confirm.
def drawImage(image, at)
end

#drawPage(page) ⇒ Object



124
125
126
127
128
129
130
131
132
# File 'lib/tabula/extraction.rb', line 124

# Makes +page+ the current page and, when it has contents, processes its
# content stream with the page's resources.
#
# Returns nil for a page without contents, otherwise whatever processStream
# returns.
def drawPage(page)
  self.page = page
  return if self.page.getContents.nil?

  ensurePageSize!
  resources = self.page.findResources
  stream = self.page.getContents.getStream
  processStream(self.page, resources, stream)
end

#ensure_open!Object



61
62
63
# File 'lib/tabula/extraction.rb', line 61

# Guard used before any operation that needs the PDF document.
#
# Returns nil while the document is open.
# Raises RuntimeError ("Document is closed") once close! has flagged it closed.
def ensure_open!
  return unless @pdf_file_closed

  raise "Document is closed"
end

#ensurePageSize!Object



117
118
119
120
121
122
# File 'lib/tabula/extraction.rb', line 117

# Fills in pageSize from the current page's media box when no page size has
# been set yet.
#
# Does nothing (and returns nil) when a page size is already present or when
# there is no current page. A page without a media box yields a nil page size.
def ensurePageSize!
  return if !self.pageSize.nil? || self.page.nil?

  media_box = self.page.findMediaBox
  self.pageSize = (media_box.createDimension unless media_box == nil)
end

#extract(pages = nil) ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/tabula/extraction.rb', line 92

# Lazily extracts several pages.
#
# pages - :all for every page of the document, nil for the pages given to the
#         constructor, or any object responding to #each that yields 1-based
#         page numbers.
#
# Returns an Enumerator that calls extract_page for each page number as it is
# consumed. Raises (via ensure_open!) when the document is closed.
def extract(pages=nil)
  ensure_open!

  page_numbers = case pages
                 when :all then (1..@all_pages.size)
                 when nil then @pages
                 else pages
                 end

  Enumerator.new do |yielder|
    page_numbers.each { |number| yielder.yield extract_page(number) }
  end
end

#extract_page(page_number) ⇒ Object

Extracts the objects from a page and returns an instance of Tabula::Page, or nil when the page has no contents. page_number is 1-based (i.e., the first page is number 1).



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/tabula/extraction.rb', line 68

# Extracts the objects of a single page.
#
# page_number - 1-based page number (the first page is number 1).
#
# Returns a Tabula::Page, or nil when the page has no contents.
# Raises IndexError when page_number is outside 1..page count, and raises
# (via ensure_open!) when the document is closed.
def extract_page(page_number)
  self.ensure_open!

  # Page numbers are 1-based, so 0 is invalid as well. The lower bound used to
  # be "< 0", which let 0 through to @all_pages.get(-1) instead of raising the
  # documented IndexError.
  if page_number-1 >= @all_pages.size || page_number < 1
    raise IndexError, "Page #{page_number} doesn't exist. Skipping. Valid pages are 1..#{@all_pages.size}"
  end

  page = @all_pages.get(page_number-1)
  contents = page.getContents
  return nil if contents.nil?

  self.clear!
  self.drawPage(page)
  crop_box = page.findCropBox
  Tabula::Page.new(@pdf_filename,
                   crop_box.width,
                   crop_box.height,
                   page.getRotation.to_i,
                   page_number, #one-indexed, just like +page_number+ is.
                   self.characters,
                   self.rulings,
                   @min_char_width,
                   @min_char_height)
end

#fillPath(windingRule) ⇒ Object



225
226
227
# File 'lib/tabula/extraction.rb', line 225

# Handles a filled path by passing it to strokePath together with the RGB
# components of the current non-stroking colour. +windingRule+ is not used.
#
# Returns whatever strokePath returns.
def fillPath(windingRule)
  fill_color = getGraphicsState.getNonStrokingColor.getJavaColor
  strokePath(fill_color.getRGBColorComponents(nil))
end

#getStrokeObject



138
139
140
# File 'lib/tabula/extraction.rb', line 138

# Returns the stroke most recently stored by setStroke (nil if none was set).
def getStroke
  @basicStroke
end

#page_countObject



310
311
312
# File 'lib/tabula/extraction.rb', line 310

# Returns the number of pages in the document, i.e. the size of the page list
# read from the document catalog in #initialize.
def page_count
  @all_pages.size
end

#pageTransformObject



236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# File 'lib/tabula/extraction.rb', line 236

# Returns the AffineTransform used to transform paths of the current page,
# building and caching it on first use (clear! drops the cache).
#
# For rotations of +/-90 and +/-270 degrees the transform mirrors the x axis
# and rotates around the crop box's lower-left corner; for every other
# rotation it mirrors the y axis and translates by the crop box height.
def pageTransform
  return @page_transform unless @page_transform.nil?

  crop_box = page.findCropBox
  if [90, -270, -90, 270].include?(page.getRotation)
    @page_transform = AffineTransform.getScaleInstance(-1, 1)
    @page_transform.rotate(page.getRotation * (Math::PI/180.0),
                           crop_box.getLowerLeftX, crop_box.getLowerLeftY)
  else
    @page_transform = AffineTransform.getScaleInstance(1, -1)
    @page_transform.translate(0, -crop_box.getHeight)
  end

  @page_transform
end

#processTextPosition(text) ⇒ Object



267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# File 'lib/tabula/extraction.rb', line 267

# Callback invoked for every positioned glyph: wraps +text+ in a
# Tabula::TextElement, updates the minimum character width/height seen so far
# and keeps the element when its character is printable and the current
# clipping path bounds intersect it.
#
# text - a text position object answering getCharacter, getX, getY,
#        getWidthDirAdj, getHeightDir, getFont, getFontSize, getWidthOfSpace
#        and getDir.
def processTextPosition(text)
  c = text.getCharacter
  h = text.getHeightDir.round(2)

  # Replace a non-breaking space (U+00A0) with a plain space. The escape is
  # spelled out because a literal non-breaking space is indistinguishable from
  # a regular one in most editors and is easily lost when the file is copied,
  # which silently turns this replacement into a no-op.
  c = ' ' if c == "\u00A0"

  # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
  space_width = text.getWidthOfSpace
  space_width = self.currentSpaceWidth if space_width.nan? || space_width == 0

  # The top edge is the rounded baseline minus the rounded height; the height
  # itself is passed on unrounded, exactly as reported by +text+.
  te = Tabula::TextElement.new(text.getY.round(2) - h,
                               text.getX.round(2),
                               text.getWidthDirAdj,
                               text.getHeightDir,
                               text.getFont,
                               text.getFontSize,
                               c,
                               space_width,
                               text.getDir)

  ccp_bounds = self.currentClippingPath

  # Debug aid: remember every distinct clipping region that was encountered.
  if self.debug_clipping_paths && !self.clipping_paths.include?(ccp_bounds)
    self.clipping_paths << ::Tabula::ZoneEntity.new(ccp_bounds.getMinY,
                                                    ccp_bounds.getMinX,
                                                    ccp_bounds.getWidth,
                                                    ccp_bounds.getHeight)
  end

  # Minimums are tracked for every glyph, including ones dropped below.
  @min_char_width = te.width if te.width < @min_char_width
  @min_char_height = te.height if te.height < @min_char_height

  if c =~ PRINTABLE_RE && ccp_bounds.intersects(te)
    self.characters << te
  end
end

#rulingsObject



314
315
316
# File 'lib/tabula/extraction.rb', line 314

# Returns the collected rulings without the unusable ones: rulings that
# collapse to a single point (left == right and top == bottom) and rulings
# with any negative edge coordinate. @rulings itself is left untouched.
def rulings
  @rulings.reject do |line|
    single_point = line.left == line.right && line.top == line.bottom
    single_point || [line.top, line.left, line.bottom, line.right].any? { |edge| edge < 0 }
  end
end

#setStroke(stroke) ⇒ Object



134
135
136
# File 'lib/tabula/extraction.rb', line 134

# Stores +stroke+ so that getStroke can return it later.
def setStroke(stroke)
  @basicStroke = stroke
end

#strokePath(filter_by_color = nil) ⇒ Object



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/tabula/extraction.rb', line 143

# Turns the straight segments of the current line path into Tabula::Ruling
# objects (clipped to the current clipping path) and resets the line path.
#
# filter_by_color - RGB colour components to use instead of the graphics
#                   state's stroking colour (fillPath passes the fill colour).
#
# Nothing is recorded, and the path is still reset, when
#   * options[:extract_ruling_lines] is falsy,
#   * the path is empty, does not start with a MOVETO, or contains anything
#     other than MOVETO / LINETO / CLOSE operations,
#   * options[:line_color_filter] is set and returns a falsy value for the
#     colour components.
#
# Returns whatever resetting the line path returns.
def strokePath(filter_by_color=nil)
  unless self.options[:extract_ruling_lines]
    self.getLinePath.reset
    return
  end

  path = self.pathToList(self.getLinePath)
  segment_types = java.awt.geom.PathIterator
  straight = [segment_types::SEG_LINETO,
              segment_types::SEG_MOVETO,
              segment_types::SEG_CLOSE]

  # An empty path used to crash on path[0][0]; treat it like any other path
  # that cannot produce rulings.
  if path.empty? \
     || path[0][0] != segment_types::SEG_MOVETO \
     || path[1..-1].any? { |p| !straight.include?(p.first) }
    self.getLinePath.reset
    return
  end

  ccp_bounds = self.currentClippingPath

  strokeColorComps = filter_by_color || self.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
  color_filter = self.options[:line_color_filter]

  if !color_filter.nil? && !color_filter.call(strokeColorComps)
    self.getLinePath.reset
    return
  end

  point = java.awt.geom.Point2D::Float

  # The first operation is the initial MOVETO: it is both the pen position and
  # the point a CLOSE has to return to.
  first = path.shift
  last_move = point.new(first[1][0], first[1][1])
  current_pos = last_move

  # NOTE(review): rulings are tagged with filter_by_color.to_a, which is empty
  # for plain strokes (filter_by_color == nil) even though strokeColorComps is
  # known here - kept as is, confirm whether that is intended.
  path.each do |p|
    case p[0]
    when segment_types::SEG_LINETO
      end_pos = point.new(p[1][0], p[1][1])
      add_clipped_ruling(current_pos, end_pos, ccp_bounds, filter_by_color.to_a)
      current_pos = end_pos
    when segment_types::SEG_MOVETO
      # A MOVETO starts a new subpath, so the pen has to jump there. The pen
      # used to stay at the end of the previous subpath (or at nil), which
      # produced a bogus ruling joining two unrelated subpaths.
      last_move = point.new(p[1][0], p[1][1])
      current_pos = last_move
    when segment_types::SEG_CLOSE
      # according to PathIterator docs:
      # "the preceding subpath should be closed by appending a line segment
      # back to the point corresponding to the most recent SEG_MOVETO."
      add_clipped_ruling(current_pos, last_move, ccp_bounds, filter_by_color.to_a)
    end
  end
  self.getLinePath.reset
end

# Internal helper for strokePath: records the segment between +from+ and +to+
# as a ruling, clipped to +clip_bounds+, when it intersects those bounds.
def add_clipped_ruling(from, to, clip_bounds, color)
  # Order the end points consistently before building the line.
  line = (from <=> to) == -1 \
         ? java.awt.geom.Line2D::Float.new(from, to) \
         : java.awt.geom.Line2D::Float.new(to, from)
  return unless line.intersects(clip_bounds)

  # convert line to rectangle for clipping it to the current clippath
  # sucks, but awt doesn't have methods for this
  clipped = line.getBounds2D.createIntersection(clip_bounds).getBounds2D
  @rulings << ::Tabula::Ruling.new(clipped.getY,
                                   clipped.getX,
                                   clipped.getWidth,
                                   clipped.getHeight,
                                   color)
end
private :add_clipped_ruling

#transformPath(path) ⇒ Object



232
233
234
# File 'lib/tabula/extraction.rb', line 232

# Applies the current page transform to +path+ and returns the transformed
# shape.
def transformPath(path)
  page_transform = pageTransform
  page_transform.createTransformedShape(path)
end