Module: Tabula::HasCells

Included in:
Page
Defined in:
lib/tabula/entities/has_cells.rb

Overview

Classes that include this module must define `cells`, `vertical_ruling_lines`, and `horizontal_ruling_lines` accessors, plus a `ruling_lines` reader.

Constant Summary collapse

ARBITRARY_MAGIC_HEURISTIC_NUMBER =
0.65

Instance Method Summary collapse

Instance Method Details

#find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options = {}) ⇒ Object

Finds cells from the ruling lines on the page. Implements the algorithm from the Nurminen thesis; cf. github.com/jazzido/tabula-extractor/issues/16. Classes that include this module must define `cells`, `vertical_ruling_lines`, and `horizontal_ruling_lines` accessors.



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/tabula/entities/has_cells.rb', line 32

# Builds the list of cells enclosed by the given ruling lines, following the
# rectangle-detection algorithm from the Nurminen thesis
# (cf. github.com/jazzido/tabula-extractor/issues/16).
#
# Every intersection of a horizontal and a vertical ruling is tried as the
# top-left corner of a cell. For each one, the candidate corners directly
# below and directly to the right are scanned in sorted order, and the first
# combination whose four sides are all backed by rulings becomes a cell.
#
# @param horizontal_ruling_lines horizontal rulings, handed to
#   Ruling.find_intersections
# @param vertical_ruling_lines vertical rulings, handed to
#   Ruling.find_intersections
# @param options [Hash] forwarded unchanged to Cell.new_from_points
# @return [Array] the cells found; the same array is also stored via
#   `self.cells=`
def find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options={})
  # Maps each crossing point to the [horizontal, vertical] ruling pair that
  # produced it.
  crossings = Ruling.find_intersections(horizontal_ruling_lines, vertical_ruling_lines)

  # The algorithm relies on the crossing points being visited top-to-bottom,
  # then left-to-right; that ordering comes from the points' own sort order.
  ordered_crossings = crossings.keys.sort

  found_cells = []

  ordered_crossings.each_with_index do |top_left, index|
    across, down = crossings[top_left]

    # Only points at or after the current one in sort order can lie below it
    # or to its right.
    candidates = ordered_crossings[index..-1]
    # Crossing points directly below top_left (same x, larger y).
    points_below = candidates.select { |pt| pt.x == top_left.x && pt.y > top_left.y }
    # Crossing points directly to the right of top_left (same y, larger x).
    points_right = candidates.select { |pt| pt.y == top_left.y && pt.x > top_left.x }

    # A crossing point may be the top-left corner of at most one cell, so
    # both loops stop as soon as one is created. Without that, larger,
    # non-minimal rectangles would also be emitted.
    cell_created = false

    points_below.each do |bottom_left|
      # The left edge must exist: skip unless the vertical ruling through
      # top_left also passes through this lower point.
      next unless down.colinear?(bottom_left)

      points_right.each do |top_right|
        # The top edge must exist as well.
        next unless across.colinear?(top_right)

        # Hypothetical bottom-right corner of the rectangle.
        bottom_right = Point2D::Float.new(top_right.x, bottom_left.y)
        next unless crossings.include?(bottom_right)

        bottom_edge, right_edge = crossings[bottom_right]
        # The bottom and right edges must be backed by rulings too.
        next unless bottom_edge.colinear?(bottom_left) && right_edge.colinear?(top_right)

        # All four sides are confirmed.
        found_cells << Cell.new_from_points(top_left, bottom_right, options)
        cell_created = true
        break
      end

      break if cell_created
    end
  end

  self.cells = found_cells
  found_cells
end

#find_spreadsheets_from_cells ⇒ Object

TODO: returns an array of Spreadsheet objects constructed (or spreadsheet_areas => cells). Maybe placeholders should be added after cells is split into spreadsheets.



97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/tabula/entities/has_cells.rb', line 97

# Merges adjacent cells into outline polygons and wraps each outline in an
# Area.
#
# Approach (via
# http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon):
# keep only the cell corners that occur an odd number of times, pair them up
# into horizontal and vertical edges, then walk those edges to trace each
# closed outline.
#
# Side effect: sorts `cells` in place.
#
# @return [Array] one Area per traced outline, each built from a Polygon of
#   the outline's x/y coordinates converted with `to_java(Java::int)` (JRuby)
def find_spreadsheets_from_cells
  cells.sort!

  # Toggle every corner: a vertex contributed an even number of times is
  # shared between cells and drops out; the survivors are outline vertices.
  #TODO: keep track of cells for each point here for more efficiently keeping track of cells inside a polygon
  outline_vertices = Set.new
  cells.each do |cell|
    cell.points.each do |corner|
      # Set#delete? returns nil when the corner was absent, so it gets added.
      outline_vertices << corner unless outline_vertices.delete?(corner)
    end
  end
  vertices = outline_vertices.to_a

  # Two orderings of the same vertices: x-first, and the points' default sort.
  column_order = vertices.sort { |a, b| a.x_first_cmp(b) }
  row_order = vertices.sort

  # Consecutive vertices in the default ordering are joined pairwise into
  # horizontal edges, stored in both directions.
  edges_h = {}
  row_order.each_slice(2) do |first_pt, second_pt|
    edges_h[first_pt] = second_pt
    edges_h[second_pt] = first_pt
  end

  # Likewise, consecutive vertices in the x-first ordering form vertical edges.
  edges_v = {}
  column_order.each_slice(2) do |first_pt, second_pt|
    edges_v[first_pt] = second_pt
    edges_v[second_pt] = first_pt
  end

  # Trace every closed outline, alternating between vertical and horizontal
  # edges and consuming each edge as it is followed.
  #TODO: should the polygon be represented just by an ordered array of points?
  outlines = []
  until edges_h.empty?
    # Any vertex will do as a starting point; Hash#shift hands back the
    # oldest remaining [key, value] pair.
    origin = edges_h.shift.first
    outline = [origin]
    vertex = origin
    arrived_by = :horiz

    # NOTE(review): if an edge lookup ever comes back nil (malformed vertex
    # set) this walk does not terminate; behaviour kept as-is.
    loop do
      if arrived_by == :horiz
        vertex = edges_v.delete(vertex)
        arrived_by = :vert
      else
        vertex = edges_h.delete(vertex)
        arrived_by = :horiz
      end
      # The outline is closed once a horizontal edge lands back on the origin.
      break if arrived_by == :horiz && vertex == origin
      outline << vertex
    end

    # Discard the reverse-direction entries left behind for these vertices.
    outline.each do |visited|
      edges_h.delete(visited)
      edges_v.delete(visited)
    end
    outlines << outline
  end

  # for efficiency's sake, we maybe ought to use java Polygon objects internally
  # for flexibility, we don't.
  outlines.map do |outline|
    xs = outline.map { |pt| pt.x }
    ys = outline.map { |pt| pt.y }
    Area.new(Polygon.new(xs.to_java(Java::int), ys.to_java(Java::int), xs.size))
  end
end

#heuristic_ratio ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/tabula/entities/has_cells.rb', line 16

# Compares the grid found by spreadsheet (ruling-line based) extraction with
# the grid found by `get_table`.
#
# The result is the mean of two ratios: columns from the first spreadsheet
# over columns from `get_table`, and rows from the first spreadsheet over
# rows from `get_table`. A value near 1 means both extractions agree on the
# table's dimensions.
#
# @return [Float] the averaged ratio, or Float::NAN when there is no
#   spreadsheet to compare against. If `get_table` yields zero rows or
#   columns the float division produces Infinity or NaN rather than raising.
def heuristic_ratio
  # Spreadsheet extraction: only the first spreadsheet is considered.
  spreadsheet = spreadsheets.first
  return Float::NAN if spreadsheet.nil?

  rows_defined_by_lines = spreadsheet.rows.size # rows filled in automatically
  columns_defined_by_lines = spreadsheet.cols.size

  # Extraction that does not rely on ruling lines.
  table = get_table
  columns_defined_without_lines = table.cols.size
  rows_defined_without_lines = table.rows.size

  column_ratio = columns_defined_by_lines.to_f / columns_defined_without_lines
  row_ratio = rows_defined_by_lines.to_f / rows_defined_without_lines
  (column_ratio + row_ratio) / 2
end

#is_tabular? ⇒ Boolean

Returns:

  • (Boolean)


11
12
13
14
# File 'lib/tabula/entities/has_cells.rb', line 11

# Reports whether `heuristic_ratio` falls strictly between
# ARBITRARY_MAGIC_HEURISTIC_NUMBER and its reciprocal.
#
# A NaN ratio fails both comparisons, so the answer is false in that case.
#
# @return [Boolean]
def is_tabular?
  score = heuristic_ratio
  lower_bound = ARBITRARY_MAGIC_HEURISTIC_NUMBER
  upper_bound = 1 / ARBITRARY_MAGIC_HEURISTIC_NUMBER
  score > lower_bound && score < upper_bound
end