Class: Tabula::Spreadsheet

Inherits:
ZoneEntity
  • Object
show all
Includes:
Tabular
Defined in:
lib/tabula/entities/spreadsheet.rb

Overview

Both should implement `cells`, `rows`, `cols`, and `extraction_method`.

Instance Attribute Summary collapse

Attributes inherited from ZoneEntity

#texts

Class Method Summary collapse

Instance Method Summary collapse

Methods included from AbstractInterface

included

Methods inherited from ZoneEntity

#<=>, #inspect, #merge!, #points, #tlbr, #tlwh

Constructor Details

#initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) ⇒ Spreadsheet

, lines)



12
13
14
15
16
17
18
19
# File 'lib/tabula/entities/spreadsheet.rb', line 12

# Builds a spreadsheet for a rectangular zone of a page.
#
# `top`, `left`, `width` and `height` are forwarded unchanged to the
# superclass constructor; everything else is stored on the instance.
#
# @param top the zone's top coordinate (passed to super)
# @param left the zone's left coordinate (passed to super)
# @param width the zone's width (passed to super)
# @param height the zone's height (passed to super)
# @param page the page this spreadsheet belongs to
# @param cells the collection of cells making up the spreadsheet
# @param vertical_ruling_lines the vertical ruling lines of the spreadsheet
# @param horizontal_ruling_lines the horizontal ruling lines of the spreadsheet
def initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines)
  super(top, left, width, height)
  @page, @cells = page, cells
  @horizontal_ruling_lines = horizontal_ruling_lines
  @vertical_ruling_lines = vertical_ruling_lines
  # Fixed tag reported by #extraction_method and serialised by #to_json.
  @extraction_method = "spreadsheet"
end

Instance Attribute Details

#cells ⇒ Object

Returns the value of attribute cells.



9
10
11
# File 'lib/tabula/entities/spreadsheet.rb', line 9

# Reader for the spreadsheet's cells.
#
# @return the current value of @cells: the collection handed to the
#   constructor, including any placeholder cells that
#   #add_spanning_cells! has appended to it since.
def cells
  @cells
end

#cells_resolved ⇒ Object

Returns the value of attribute cells_resolved.



9
10
11
# File 'lib/tabula/entities/spreadsheet.rb', line 9

# Reader for the @cells_resolved attribute.
#
# The constructor shown in this file does not assign @cells_resolved, so
# this returns nil until something else sets it.
# NOTE(review): presumably a flag recording that cell text has already been
# filled in -- confirm against fill_in_cells!.
#
# @return the current value of @cells_resolved
def cells_resolved
  @cells_resolved
end

#extraction_method ⇒ Object (readonly)

Returns the value of attribute extraction_method.



10
11
12
# File 'lib/tabula/entities/spreadsheet.rb', line 10

# Reader for the extraction-method tag of this entity.
#
# @return the value of @extraction_method, which the constructor sets to
#   the string "spreadsheet"
def extraction_method
  @extraction_method
end

#horizontal_ruling_lines ⇒ Object

Returns the value of attribute horizontal_ruling_lines.



9
10
11
# File 'lib/tabula/entities/spreadsheet.rb', line 9

# Reader for the horizontal ruling lines of the spreadsheet.
#
# @return the value of @horizontal_ruling_lines, as given to the
#   constructor or last assigned through #ruling_lines=
def horizontal_ruling_lines
  @horizontal_ruling_lines
end

#page ⇒ Object (readonly)

Returns the value of attribute page.



10
11
12
# File 'lib/tabula/entities/spreadsheet.rb', line 10

# Reader for the page this spreadsheet was built for.
#
# @return the value of @page as given to the constructor; #+ compares it
#   to decide whether two spreadsheets may be combined
def page
  @page
end

#vertical_ruling_lines ⇒ Object

Returns the value of attribute vertical_ruling_lines.



9
10
11
# File 'lib/tabula/entities/spreadsheet.rb', line 9

# Reader for the vertical ruling lines of the spreadsheet.
#
# @return the value of @vertical_ruling_lines, as given to the
#   constructor or last assigned through #ruling_lines=
def vertical_ruling_lines
  @vertical_ruling_lines
end

Class Method Details

.empty(page) ⇒ Object



21
22
23
# File 'lib/tabula/entities/spreadsheet.rb', line 21

# Builds a zero-sized spreadsheet with no cells for the given page.
#
# The ruling-line collections are empty arrays rather than nil so that
# #ruling_lines and #add_spanning_cells!, which call `+` and `map` on them,
# work on an empty spreadsheet instead of raising NoMethodError.
#
# @param page the page the empty spreadsheet belongs to
# @return [Spreadsheet] a spreadsheet at (0, 0) with zero width and height
def self.empty(page)
  Spreadsheet.new(0, 0, 0, 0, page, [], [], [])
end

Instance Method Details

#+(other) ⇒ Object

Raises:

  • (ArgumentError)


157
158
159
160
# File 'lib/tabula/entities/spreadsheet.rb', line 157

# Combines this spreadsheet's cells with those of another spreadsheet from
# the same page.
#
# The result is a new Spreadsheet built with nil geometry and nil ruling
# lines; only the page and the concatenated cells are carried over.
#
# @param other an object responding to `page` and `cells`
# @return [Spreadsheet] a new spreadsheet holding both sets of cells
# @raise [ArgumentError] if `other.page` is not equal to this page
def +(other)
  unless other.page == @page
    raise ArgumentError, "Data can only be added if it's from the same PDF page"
  end

  combined_cells = @cells + other.cells
  Spreadsheet.new(nil, nil, nil, nil, @page, combined_cells, nil, nil)
end

#add_spanning_cells! ⇒ Object

Chapter 2 of Spreadsheet extraction: Spanning Cells

If c is a “spanning cell”, that is,

if there are N > 0 vertical lines strictly between this cell's left and right edges,

insert N placeholder cells after it with zero size (but the same top).



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/tabula/entities/spreadsheet.rb', line 78

# Marks cells that span several columns and/or rows and appends placeholder
# cells for the grid positions they cover.
#
# A cell counts as "spanning" when at least one ruling location lies
# strictly between its left and right edges (vertical rulings) or strictly
# between its top and bottom edges (horizontal rulings). For such a cell:
#
# * every spanned vertical ruling adds a zero-width placeholder located at
#   that ruling, with the cell's top and height;
# * every spanned horizontal ruling adds a zero-height placeholder located
#   at that ruling, with the cell's left and width;
# * every (vertical, horizontal) pair of spanned rulings adds a
#   "double placeholder" with zero width and zero height.
#
# e.g. -------------------
#      | C |  C |  C | C |
#      |-----------------|
#      | C |  C |  C | C |
#      |-----------------|
#      | C | SC    P | C |   where SC is the "spanning cell" that holds all the text within its bounds
#      |----    +    ----|         P is a "placeholder" cell with either zero width or zero height
#      | C | P    DP | C |         DP is a "double placeholder" cell with zero width and zero height
#      |----    +    ----|         C is an ordinary cell.
#      | C | P    DP | C |
#      |-----------------|
#
# Placeholders are gathered first and appended to `cells` only after every
# original cell has been examined. Appending inside `cells.each` (as this
# method used to do) makes a Ruby Array hand the new placeholders back to
# the loop: a zero-width placeholder that still has height would then be
# treated as spanning the horizontal rulings (and vice versa), producing
# duplicate double placeholders and placeholders flagged as spanning.
#
# @return the `cells` collection, with the placeholders appended
def add_spanning_cells!
  # Rounding: because Cell.new_from_points has a float precision error
  # where, for instance, a cell whose x2 coord is supposed to be
  # 160.137451171875 comes out as 160.13745498657227 because of a
  # subtraction. All comparisons below therefore use 5 decimal places.
  # NOTE(review): the original comment states these lists are already
  # sorted; nothing in this method sorts them -- confirm upstream.
  vertical_uniq_locs = vertical_ruling_lines.map { |l| l.left.round(5) }.uniq
  horizontal_uniq_locs = horizontal_ruling_lines.map { |l| l.top.round(5) }.uniq

  # Each entry is [top, left, width, height] for one placeholder cell.
  placeholder_specs = []

  cells.each do |c|
    cell_left = c.left.round(5)
    cell_right = c.right.round(5)
    cell_top = c.top.round(5)
    cell_bottom = c.bottom.round(5)

    vertical_rulings_spanned_over = vertical_uniq_locs.select { |l| l > cell_left && l < cell_right }
    horizontal_rulings_spanned_over = horizontal_uniq_locs.select { |t| t > cell_top && t < cell_bottom }
    next if vertical_rulings_spanned_over.empty? && horizontal_rulings_spanned_over.empty?

    c.spanning = true

    vertical_rulings_spanned_over.each do |spanned_over_line_loc|
      placeholder_specs << [c.top, spanned_over_line_loc, 0, c.height]
    end
    horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
      placeholder_specs << [spanned_over_line_loc, c.left, c.width, 0]
    end
    # A cell spanning both rows and columns also needs the zero-by-zero
    # cells at every crossing of the rulings it covers.
    vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over).each do |vert_spanned_over, horiz_spanned_over|
      placeholder_specs << [horiz_spanned_over, vert_spanned_over, 0, 0]
    end
  end

  placeholder_specs.each do |top, left, width, height|
    placeholder = Cell.new(top, left, width, height)
    placeholder.placeholder = true
    cells << placeholder
  end

  cells
end

#cols(evaluate_cells = true) ⇒ Object

Call `cols` with `evaluate_cells` as `false` to defer filling in the text in each cell, which can be computationally intensive.



64
65
66
67
68
69
70
# File 'lib/tabula/entities/spreadsheet.rb', line 64

# Returns the cells arranged as columns.
#
# Cells are grouped by their `left` coordinate rounded to 5 decimal
# places, the groups are ordered by that coordinate, and the cells inside
# each group are ordered by `top`.
#
# @param evaluate_cells [Boolean] when truthy, fill_in_cells! is called
#   first; pass false to skip that step
# @return [Array<Array>] one array of cells per column
def cols(evaluate_cells=true)
  fill_in_cells! if evaluate_cells

  cells_by_left = cells.group_by { |cell| cell.left.round(5) }
  ordered_groups = cells_by_left.sort_by { |left, _column| left }
  ordered_groups.map { |_left, column| column.sort_by(&:top) }
end

#rows(evaluate_cells = true) ⇒ Object

Call `rows` with `evaluate_cells` as `false` to defer filling in the text in each cell, which can be computationally intensive.



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/tabula/entities/spreadsheet.rb', line 36

# Returns the cells arranged as rows.
#
# Cells are grouped by their `top` coordinate rounded to 5 decimal places,
# the groups are ordered by that coordinate, and the cells inside each
# group are ordered by `left`.
#
# When there are more than two rows and the first row has fewer distinct
# `left` positions than the second, zero-sized placeholder cells are added
# to the first row for each position it lacks (empty corners, like in
# 01001523B_China.pdf). Positions are compared after rounding to 5 decimal
# places, the same tolerance used for grouping, so two cells whose `left`
# differs only by float noise are not mistaken for a missing spot.
#
# TODO: support placeholders for "empty" cells in rows other than row 1,
# and in #cols
# NOTE(review): the guard is `size > 2`, so a table with exactly two rows
# never gets corner placeholders -- confirm this is intended.
#
# @param evaluate_cells [Boolean] when truthy, fill_in_cells! is called
#   first; pass false to skip that step
# @return [Array<Array>] one array of cells per row
def rows(evaluate_cells=true)
  fill_in_cells! if evaluate_cells

  array_of_rows = cells.group_by { |cell| cell.top.round(5) }.sort_by(&:first).map { |x| x.last.sort_by(&:left) }

  if array_of_rows.size > 2
    first_row = array_of_rows[0]
    second_row = array_of_rows[1]

    first_row_lefts = first_row.map { |cell| cell.left.round(5) }.uniq
    # Keep the raw coordinate for the placeholder; dedupe on the rounded one.
    second_row_lefts = second_row.map(&:left).uniq { |left| left.round(5) }

    if first_row_lefts.size < second_row_lefts.size
      missing_spots = second_row_lefts.reject { |left| first_row_lefts.include?(left.round(5)) }

      missing_spots.each do |missing_spot|
        missing_spot_placeholder = Cell.new(first_row[0].top, missing_spot, 0, 0)
        missing_spot_placeholder.placeholder = true
        first_row << missing_spot_placeholder
      end
    end
    first_row.sort_by!(&:left)
  end
  array_of_rows
end

#ruling_lines ⇒ Object



25
26
27
# File 'lib/tabula/entities/spreadsheet.rb', line 25

# Returns all ruling lines of the spreadsheet: the vertical ones followed
# by the horizontal ones.
#
# Either collection may be nil (Spreadsheet.empty and #+ both construct
# instances with nil ruling lines); a nil collection is treated as empty
# instead of raising NoMethodError on `nil + ...`.
#
# @return the concatenation of the vertical and horizontal ruling lines
def ruling_lines
  (@vertical_ruling_lines || []) + (@horizontal_ruling_lines || [])
end

#ruling_lines=(lines) ⇒ Object



29
30
31
32
# File 'lib/tabula/entities/spreadsheet.rb', line 29

# Replaces the spreadsheet's ruling lines with those lines from `lines`
# that intersect the spreadsheet itself, split by orientation.
#
# The previous implementation tested intersection against `spr`, a name
# that is not defined anywhere in this class, so calling the setter raised
# NameError. The intersection test is now made against the spreadsheet.
# NOTE(review): assumes `intersectsLine` is available on the spreadsheet
# through its superclass (e.g. a java.awt.geom.Rectangle2D ancestor) --
# confirm against ZoneEntity.
#
# @param lines a collection of rulings responding to `vertical?` and
#   `horizontal?`
# @return the new collection of horizontal ruling lines (last assignment)
def ruling_lines=(lines)
  @vertical_ruling_lines = lines.select { |vl| vl.vertical? && intersectsLine(vl) }
  @horizontal_ruling_lines = lines.select { |hl| hl.horizontal? && intersectsLine(hl) }
end

#to_a ⇒ Object



130
131
132
133
# File 'lib/tabula/entities/spreadsheet.rb', line 130

# Returns the spreadsheet's contents as an array of rows, each row being
# an array of the `text` of its cells.
#
# `rows` is called with its default argument, which already runs
# fill_in_cells! before building the rows, so the separate explicit call
# that used to precede it (doing the same work twice) has been dropped.
#
# @return [Array<Array>] the text of every cell, row by row
def to_a
  rows.map { |row_cells| row_cells.map(&:text) }
end

#to_csv ⇒ Object



135
136
137
138
139
140
# File 'lib/tabula/entities/spreadsheet.rb', line 135

# Serialises the spreadsheet's rows as CSV.
#
# The rows are written by Tabula::Writers.CSV into an in-memory buffer
# whose encoding is set to UTF-8, and the buffer's contents are returned.
#
# @return [String] the text written by Tabula::Writers.CSV
def to_csv
  StringIO.new.tap do |buffer|
    buffer.set_encoding("utf-8")
    Tabula::Writers.CSV(rows, buffer)
  end.string
end

#to_json(*a) ⇒ Object



149
150
151
152
153
154
155
# File 'lib/tabula/entities/spreadsheet.rb', line 149

# Serialises the spreadsheet as JSON.
#
# The document has three keys: 'json_class' (this object's class name),
# 'extraction_method' (the value of @extraction_method) and 'data' (the
# result of `rows`).
#
# @param a arguments forwarded unchanged to Hash#to_json
# @return [String] the JSON text
def to_json(*a)
  payload = {
    'json_class' => self.class.name,
    'extraction_method' => @extraction_method,
    'data' => rows
  }
  payload.to_json(*a)
end

#to_tsv ⇒ Object



142
143
144
145
146
147
# File 'lib/tabula/entities/spreadsheet.rb', line 142

# Serialises the spreadsheet's rows as TSV.
#
# The rows are written by Tabula::Writers.TSV into an in-memory buffer
# whose encoding is set to UTF-8, and the buffer's contents are returned.
#
# @return [String] the text written by Tabula::Writers.TSV
def to_tsv
  StringIO.new.tap do |buffer|
    buffer.set_encoding("utf-8")
    Tabula::Writers.TSV(rows, buffer)
  end.string
end