Class: Tabula::Page

Inherits:
ZoneEntity
  • Object
show all
Includes:
HasCells
Defined in:
lib/tabula/entities/page.rb

Direct Known Subclasses

PageArea

Constant Summary

Constants included from HasCells

HasCells::ARBITRARY_MAGIC_HEURISTIC_NUMBER

Instance Attribute Summary collapse

Attributes inherited from ZoneEntity

#texts

Instance Method Summary collapse

Methods included from HasCells

#find_cells!, #find_spreadsheets_from_cells, #heuristic_ratio, #is_tabular?

Methods inherited from ZoneEntity

#<=>, #inspect, #merge!, #points, #tlbr, #tlwh

Constructor Details

#initialize(file_path, width, height, rotation, number, texts = [], ruling_lines = [], min_char_width = nil, min_char_height = nil, spatial_index = nil) ⇒ Page

Returns a new instance of Page.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/tabula/entities/page.rb', line 9

# Builds a page whose zone is the rectangle (0, 0, width, height).
#
# file_path, rotation and number are stored for the readers of the same
# purpose; texts is assigned through the inherited texts= writer.
# ruling_lines is extended with the finite edges of the minimal bounding box
# of the ruling lines. When spatial_index is nil a TextElementIndex is built
# from the page's texts; otherwise the given index is used as-is.
#
# Raises ArgumentError when number is smaller than 1.
def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil, spatial_index=nil)
  super(0, 0, width, height)
  @rotation = rotation
  raise ArgumentError, "Tabula::Page numbers are one-indexed; numbers < 1 are invalid." if number < 1

  @file_path = file_path
  @number_one_indexed = number
  @ruling_lines = ruling_lines
  @min_char_width = min_char_width
  @min_char_height = min_char_height
  @cells = []
  @spreadsheets = nil

  # texts must be in place before the bounding box is computed: that
  # computation goes through the ruling-line getters, which snap points
  # using the minimum character size derived from the texts.
  self.texts = texts

  bounding_box_rulings = minimal_bounding_box_of_ruling_lines.to_lines.map do |edge|
    Ruling.new(edge.getY1, edge.getX1, edge.getX2 - edge.getX1, edge.getY2 - edge.getY1)
  end
  @ruling_lines += bounding_box_rulings.select(&:finite?)

  @spatial_index =
    if spatial_index.nil?
      fresh_index = TextElementIndex.new
      self.texts.each { |text_element| fresh_index << text_element }
      fresh_index
    else
      spatial_index
    end
end

Instance Attribute Details

#cells ⇒ Object

Returns the value of attribute cells.



7
8
9
# File 'lib/tabula/entities/page.rb', line 7

# Reader for the @cells instance variable.
def cells
  @cells
end

#file_path ⇒ Object (readonly)

Returns the value of attribute file_path.



5
6
7
# File 'lib/tabula/entities/page.rb', line 5

# Reader for the @file_path instance variable.
def file_path
  @file_path
end

#min_char_height=(value) ⇒ Object (writeonly)

Sets the attribute min_char_height

Parameters:

  • value

    the value to set the attribute min_char_height to.



6
7
8
# File 'lib/tabula/entities/page.rb', line 6

# Writer for @min_char_height, the value memoized by #get_min_char_height.
def min_char_height=(value)
  @min_char_height = value
end

#min_char_width=(value) ⇒ Object (writeonly)

Sets the attribute min_char_width

Parameters:

  • value

    the value to set the attribute min_char_width to.



6
7
8
# File 'lib/tabula/entities/page.rb', line 6

# Writer for @min_char_width, the value memoized by #get_min_char_width.
def min_char_width=(value)
  @min_char_width = value
end

#number_one_indexed ⇒ Object (readonly)

Returns the value of attribute number_one_indexed.



5
6
7
# File 'lib/tabula/entities/page.rb', line 5

# Reader for @number_one_indexed, the page number counted from 1.
def number_one_indexed
  @number_one_indexed
end

#rotation ⇒ Object (readonly)

Returns the value of attribute rotation.



5
6
7
# File 'lib/tabula/entities/page.rb', line 5

# Reader for the @rotation instance variable.
def rotation
  @rotation
end

Instance Method Details

#fill_in_cell_texts!(areas) ⇒ Object



246
247
248
249
250
251
252
253
254
# File 'lib/tabula/entities/page.rb', line 246

# Distributes the page's texts over the given areas.
#
# Each text is appended to the text_elements of the first area that
# reports containing it; texts contained by no area are ignored. Afterwards
# every area's text_elements is replaced by TextElement.merge_words of what
# it collected. Returns areas.
def fill_in_cell_texts!(areas)
  texts.each do |text_element|
    owner = areas.find { |candidate| candidate.contains(text_element) }
    next if owner.nil?

    owner.text_elements << text_element
  end

  areas.each { |zone| zone.text_elements = TextElement.merge_words(zone.text_elements) }
end

#fill_in_cells!(options = {}) ⇒ Object



177
178
179
180
181
182
183
184
# File 'lib/tabula/entities/page.rb', line 177

# Resolves the text of every cell of every spreadsheet on this page.
#
# options is forwarded to #spreadsheets. Each cell's text_elements is set
# to this page's #get_cell_text for that cell, and each spreadsheet is then
# flagged with cells_resolved = true. Returns the spreadsheets.
def fill_in_cells!(options={})
  spreadsheets(options).each do |spreadsheet|
    spreadsheet.cells.each do |cell|
      # The page itself is the text source; there is no `page` accessor here.
      cell.text_elements = get_cell_text(cell)
    end
    spreadsheet.cells_resolved = true
  end
end

#get_area(area) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/tabula/entities/page.rb', line 77

# Builds a PageArea for a region of this page.
#
# area is either an Array of [top, left, bottom, right], which is converted
# to a Tabula::ZoneEntity, or an object that already answers width and
# height. The PageArea receives the texts found in the region, the page's
# ruling lines cropped to it, the smallest width and height among those
# texts, and this page's spatial index.
def get_area(area)
  zone = area
  if zone.is_a?(Array)
    top_edge, left_edge, bottom_edge, right_edge = zone
    zone = Tabula::ZoneEntity.new(top_edge, left_edge,
                                  right_edge - left_edge, bottom_edge - top_edge)
  end

  contained = self.get_text(zone)
  cropped_rulings = Ruling.crop_rulings_to_area(@ruling_lines, zone)
  narrowest = contained.map(&:width).min
  shortest = contained.map(&:height).min

  PageArea.new(file_path, zone.width, zone.height, rotation, number,
               contained, cropped_rulings, narrowest, shortest, @spatial_index)
end

#get_cell_text(area = nil) ⇒ Object



256
257
258
# File 'lib/tabula/entities/page.rb', line 256

# Returns the result of TextElement.merge_words applied to the texts that
# #get_text finds for area (nil by default).
def get_cell_text(area=nil)
  contained = self.get_text(area)
  TextElement.merge_words(contained)
end

#get_min_char_height ⇒ Object



73
74
75
# File 'lib/tabula/entities/page.rb', line 73

# Returns the memoized minimum character height, computing it on first use
# as the smallest height among the page's texts (nil when there are none).
def get_min_char_height
  return @min_char_height if @min_char_height

  @min_char_height = texts.map { |text_element| text_element.height }.min
end

#get_min_char_width ⇒ Object



69
70
71
# File 'lib/tabula/entities/page.rb', line 69

# Returns the memoized minimum character width, computing it on first use
# as the smallest width among the page's texts (nil when there are none).
def get_min_char_width
  return @min_char_width if @min_char_width

  @min_char_width = texts.map { |text_element| text_element.width }.min
end

#get_ruling_lines!(options = {}) ⇒ Object

Returns the ruling lines, memoizing them in @vertical_ruling_lines and @horizontal_ruling_lines.



214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/tabula/entities/page.rb', line 214

# Returns the collapsed vertical ruling lines followed by the collapsed
# horizontal ones, or [] when the page has no ruling lines.
#
# Every call on a page with ruling lines snaps their points and removes
# zero-sized rulings from @ruling_lines in place; the collapsed lists are
# computed once and memoized. options is accepted but not read here.
def get_ruling_lines!(options={})
  return [] if @ruling_lines.nil? || @ruling_lines.empty?

  self.snap_points!

  # A ruling with neither width nor height is a point, not a line.
  @ruling_lines.reject! { |ruling| ruling.width == 0 && ruling.height == 0 }

  unless @vertical_ruling_lines
    uprights = @ruling_lines.select { |ruling| ruling.vertical? }
    @vertical_ruling_lines = Ruling.collapse_oriented_rulings(uprights)
  end
  unless @horizontal_ruling_lines
    flats = @ruling_lines.select { |ruling| ruling.horizontal? }
    @horizontal_ruling_lines = Ruling.collapse_oriented_rulings(flats)
  end

  @vertical_ruling_lines + @horizontal_ruling_lines
end

#get_table(options = {}) ⇒ Object

returns a Table object



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/tabula/entities/page.rb', line 99

# Builds a Table from the page's texts.
#
# options[:vertical_rulings], when non-empty, supplies the column
# boundaries (their left positions, sorted); otherwise the boundaries come
# from TextChunk.column_positions. options is also passed to
# TextElement.merge_words.
#
# Returns an empty Tabula::Table when the page has no texts. Otherwise each
# non-blank text element of line i is added at column j, where j is the
# index of the first boundary at or to the right of the element's left
# edge, or columns.count when it lies beyond every boundary.
def get_table(options={})
  options = {:vertical_rulings => []}.merge(options)
  return Tabula::Table.new(0, []) if texts.empty?

  sorted_texts = self.texts.sort
  text_chunks = TextElement.merge_words(sorted_texts, options)

  # The listing read `TextChunk.(…)`, i.e. `TextChunk.call(…)`; the
  # line-grouping method name had been dropped.
  # NOTE(review): confirm `group_by_lines` against TextChunk.
  lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)

  columns = if options[:vertical_rulings].empty?
              TextChunk.column_positions(lines).sort
            else
              options[:vertical_rulings].map(&:left).sort # pixel locations, not entities
            end

  table = Tabula::Table.new(lines.count, columns)
  lines.each_with_index do |line, i|
    line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
      j = columns.find_index { |s| te.left <= s } || columns.count
      table.add_text_element(te, i, j)
    end
  end

  # fixes up the table a little bit, replacing nils with empty TextElements
  # and sorting the lines.
  # table.rows.each do |l|
  #   l.text_elements = l.text_elements.map do |te|
  #     te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
  #   end
  # end
  # table.rows.sort_by!(&:top)
  table
end

#get_text(area = nil) ⇒ Object

Gets the text inside an area. `area` can be an Array ([top, left, bottom, right]) or a Rectangle2D; when it is nil, all of the page's texts are returned.



233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/tabula/entities/page.rb', line 233

# Returns the texts inside area.
#
# area may be nil (all of the page's texts are returned), an Array of
# [top, left, bottom, right] (converted to a Tabula::ZoneEntity), or any
# other object, which is handed to the spatial index's contains as-is.
def get_text(area=nil)
  # is_a? rather than instance_of?, so Array subclasses are converted too,
  # matching how #get_area treats its argument.
  if area.is_a?(Array)
    top, left, bottom, right = area
    area = Tabula::ZoneEntity.new(top, left,
                                  right - left, bottom - top)
  end

  return texts if area.nil?

  @spatial_index.contains(area)
end

#has_text? ⇒ Boolean

Returns:

  • (Boolean)


194
195
196
# File 'lib/tabula/entities/page.rb', line 194

# True when the page has at least one text, false otherwise.
def has_text?
  return false if self.texts.empty?

  true
end

#horizontal_ruling_lines ⇒ Object



203
204
205
206
# File 'lib/tabula/entities/page.rb', line 203

# Returns the memoized horizontal ruling lines after refreshing them through
# #get_ruling_lines!, or [] when none have been computed.
def horizontal_ruling_lines
  get_ruling_lines!
  return [] if @horizontal_ruling_lines.nil?

  @horizontal_ruling_lines
end

#make_table(options = {}) ⇒ Object

for API backwards-compatibility reasons, this returns an array of arrays.



136
137
138
# File 'lib/tabula/entities/page.rb', line 136

# Returns the rows of the table built by #get_table with the same options.
def make_table(options={})
  table = get_table(options)
  table.rows
end

#minimal_bounding_box_of_ruling_lines ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/tabula/entities/page.rb', line 36

# Returns a Rectangle2D::Float spanning the ruling lines.
#
# The horizontal extent comes from the left/right of the horizontal ruling
# lines only, and the vertical extent from the top/bottom of the vertical
# ones only. Minimums start at infinity and maximums at 0, so with no lines
# in an orientation the corresponding origin and size are not finite.
def minimal_bounding_box_of_ruling_lines
  left_edge, right_edge = horizontal_ruling_lines.inject([::Float::INFINITY, 0]) do |(lowest, highest), ruling|
    [ruling.left < lowest ? ruling.left : lowest,
     ruling.right > highest ? ruling.right : highest]
  end

  top_edge, bottom_edge = vertical_ruling_lines.inject([::Float::INFINITY, 0]) do |(lowest, highest), ruling|
    [ruling.top < lowest ? ruling.top : lowest,
     ruling.bottom > highest ? ruling.bottom : highest]
  end

  java.awt.geom.Rectangle2D::Float.new(left_edge, top_edge,
                                       right_edge - left_edge, bottom_edge - top_edge)
end

#minimal_bounding_box_of_text_elements ⇒ Object

Is there a scenario under which we'd prefer to use this over `minimal_bounding_box_of_ruling_lines`? If so, what is it? If there are no ruling lines on the page _at all_, then adding this bounding box is useless.



55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/tabula/entities/page.rb', line 55

# Returns a Rectangle2D::Float enclosing every element of @texts.
#
# The box runs from the smallest x/y to the largest x + width / y + height,
# so the far edges of the outermost elements are inside it (the maximums
# used to track only the elements' origins). Minimums start at infinity and
# maximums at 0, so with no texts the result is not finite.
def minimal_bounding_box_of_text_elements
  max_x = 0
  max_y = 0
  min_x = ::Float::INFINITY
  min_y = ::Float::INFINITY
  @texts.each do |t|
    right_edge = t.x + t.width
    bottom_edge = t.y + t.height
    min_x = t.x if t.x < min_x
    min_y = t.y if t.y < min_y
    max_x = right_edge if right_edge > max_x
    max_y = bottom_edge if bottom_edge > max_y
  end
  java.awt.geom.Rectangle2D::Float.new(min_x, min_y, max_x - min_x, max_y - min_y)
end

#number(indexing_base = :one_indexed) ⇒ Object



186
187
188
189
190
191
192
# File 'lib/tabula/entities/page.rb', line 186

# Returns the page number: counted from 0 when indexing_base is
# :zero_indexed, counted from 1 for any other value.
def number(indexing_base=:one_indexed)
  indexing_base == :zero_indexed ? @number_one_indexed - 1 : @number_one_indexed
end

#ruling_lines ⇒ Object

TODO no need for this, let’s choose one name



199
200
201
# File 'lib/tabula/entities/page.rb', line 199

# Alias for #get_ruling_lines! called with its default options.
def ruling_lines
  get_ruling_lines!
end

#snap_points! ⇒ Object



269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/tabula/entities/page.rb', line 269

# Snaps nearly-coincident ruling-line endpoints onto shared coordinates.
#
# For x and then y, the endpoints of all ruling lines are sorted and split
# into runs in which every point lies closer to the run's first point than
# the minimum character width (x) or height (y). Each point in a run is
# moved to the mean of the run's distinct coordinates, and every line is
# then rewritten from its two adjusted endpoints.
#
# Returns the line => [p1, p2] hash. With no ruling lines there is nothing
# to snap and an empty hash is returned. An axis whose minimum character
# size is nil (a page without texts) is left untouched instead of raising.
def snap_points!
  return {} if @ruling_lines.nil? || @ruling_lines.empty?

  lines_to_points = {}
  points = []
  @ruling_lines.each do |line|
    # Each call to #p1 / #p2 creates a new Point2D::Float, so fetch each
    # endpoint once and share that object between the hash and the array:
    # moving a point through one is then visible through the other.
    endpoints = [line.p1, line.p2]
    lines_to_points[line] = endpoints
    points.concat(endpoints)
  end

  [[:x, :x=, self.get_min_char_width], [:y, :y=, self.get_min_char_height]].each do |getter, setter, cell_size|
    # No texts means no tolerance to snap with on this axis.
    next if cell_size.nil?

    sorted_points = points.sort_by(&getter)
    first_point = sorted_points.shift
    grouped_points = sorted_points.inject([[first_point]]) do |memo, next_point|
      # Distance is measured from the first point of the current run.
      if (next_point.send(getter) - memo.last.first.send(getter)).abs < cell_size
        memo[-1] << next_point
      else
        memo << [next_point]
      end
      memo
    end

    grouped_points.each do |group|
      uniq_locs = group.map(&getter).uniq
      avg_loc = uniq_locs.sum / uniq_locs.size
      group.each { |point| point.send(setter, avg_loc) }
    end
  end

  # Lines do not see changes to the detached points, so write them back.
  lines_to_points.each do |l, p1_p2|
    l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
  end
end

#spreadsheet_areas(options = {}) ⇒ Object



165
166
167
168
169
170
171
172
173
174
175
# File 'lib/tabula/entities/page.rb', line 165

# Returns the bounds of each spreadsheet area found on the page.
#
# Refreshes the ruling lines, finds cells from the horizontal and vertical
# ruling lines, and maps each area from find_spreadsheets_from_cells
# (java.awt.geom.Area polygons) to its getBounds rectangle. getBounds2D is
# theoretically better than getBounds, but it returns a Rectangle2D.Double,
# which doesn't have our Ruby sugar on it.
def spreadsheet_areas(options={})
  get_ruling_lines!(options)

  horizontals = self.horizontal_ruling_lines
  verticals = self.vertical_ruling_lines
  self.find_cells!(horizontals, verticals, options)

  polygons = find_spreadsheets_from_cells
  polygons.map { |polygon| polygon.getBounds }
end

#spreadsheets(options = {}) ⇒ Object

returns the Spreadsheets; creating them if they’re not memoized



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/tabula/entities/page.rb', line 141

# Returns the page's Spreadsheets, building and memoizing them on first use.
#
# One Spreadsheet is created per rectangle from #spreadsheet_areas, given
# the vertical and horizontal ruling lines that rectangle intersects. Its
# cells are the page cells it overlaps, after which spanning cells are
# added. When options[:fill_in_cells] is truthy the cells' texts are
# resolved before returning.
def spreadsheets(options={})
  return @spreadsheets unless @spreadsheets.nil?

  @spreadsheets = spreadsheet_areas(options).map do |bounds|
    crossing_verticals = vertical_ruling_lines.select { |ruling| bounds.intersectsLine(ruling) }
    crossing_horizontals = horizontal_ruling_lines.select { |ruling| bounds.intersectsLine(ruling) }

    # TODO: keep track of the cells, instead of getting them again inefficiently.
    sheet = Spreadsheet.new(bounds.y, bounds.x, bounds.width, bounds.height,
                            self, [], crossing_verticals, crossing_horizontals)
    sheet.cells = @cells.select { |cell| sheet.overlaps?(cell) }
    sheet.add_spanning_cells!
    sheet
  end

  fill_in_cells! if options[:fill_in_cells]
  spreadsheets
end

#to_json(options = {}) ⇒ Object



260
261
262
263
264
265
266
267
# File 'lib/tabula/entities/page.rb', line 260

# Serializes the page's width, height, number, rotation and hasText flag as
# JSON, passing options to Hash#to_json.
def to_json(options={})
  summary = {
    :width => self.width,
    :height => self.height,
    :number => self.number,
    :rotation => self.rotation,
    :hasText => self.has_text?
  }
  summary.to_json(options)
end

#vertical_ruling_lines ⇒ Object



208
209
210
211
# File 'lib/tabula/entities/page.rb', line 208

# Returns the memoized vertical ruling lines after refreshing them through
# #get_ruling_lines!, or [] when none have been computed.
def vertical_ruling_lines
  get_ruling_lines!
  return [] if @vertical_ruling_lines.nil?

  @vertical_ruling_lines
end