Class: Tabula::TextChunk
- Inherits:
-
ZoneEntity
- Object
- java.awt.geom.Rectangle2D::Float
- ZoneEntity
- Tabula::TextChunk
- Defined in:
- lib/tabula/entities/text_chunk.rb
Overview
a “collection” of TextElements
Instance Attribute Summary collapse
-
#font ⇒ Object
Returns the value of attribute font.
-
#font_size ⇒ Object
Returns the value of attribute font_size.
-
#text_elements ⇒ Object
Returns the value of attribute text_elements.
-
#width_of_space ⇒ Object
Returns the value of attribute width_of_space.
Attributes inherited from ZoneEntity
Class Method Summary collapse
-
.column_positions(lines) ⇒ Object
returns a list of column boundaries (x axis)
`lines` must be an array of lines sorted by their `top` attribute.
-
.create_from_text_element(text_element) ⇒ Object
initialize a new TextChunk from a TextElement.
- .group_by_lines(text_chunks) ⇒ Object
Instance Method Summary collapse
-
#<<(text_element) ⇒ Object
add a TextElement to this TextChunk.
- #inspect ⇒ Object
- #merge!(other) ⇒ Object
-
#split_vertically!(y) ⇒ Object
split this TextChunk vertically (in place, returns the remaining chunk).
- #text ⇒ Object
- #to_h ⇒ Object
Methods inherited from ZoneEntity
#<=>, #initialize, #points, #tlbr, #tlwh, #to_json
Constructor Details
This class inherits a constructor from Tabula::ZoneEntity
Instance Attribute Details
#font ⇒ Object
Returns the value of attribute font.
5 6 7 |
# File 'lib/tabula/entities/text_chunk.rb', line 5
#
# Reader for the +font+ attribute.
#
# @return [Object] the current value of @font
def font
  @font
end
#font_size ⇒ Object
Returns the value of attribute font_size.
5 6 7 |
# File 'lib/tabula/entities/text_chunk.rb', line 5
#
# Reader for the +font_size+ attribute.
#
# @return [Object] the current value of @font_size
def font_size
  @font_size
end
#text_elements ⇒ Object
Returns the value of attribute text_elements.
5 6 7 |
# File 'lib/tabula/entities/text_chunk.rb', line 5
#
# Reader for the +text_elements+ attribute.
#
# @return [Object] the current value of @text_elements
def text_elements
  @text_elements
end
#width_of_space ⇒ Object
Returns the value of attribute width_of_space.
5 6 7 |
# File 'lib/tabula/entities/text_chunk.rb', line 5
#
# Reader for the +width_of_space+ attribute.
#
# @return [Object] the current value of @width_of_space
def width_of_space
  @width_of_space
end
Class Method Details
.column_positions(lines) ⇒ Object
Returns a list of column boundaries (x axis). `lines` must be an array of
lines sorted by their `top` attribute.
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/tabula/entities/text_chunk.rb', line 48
#
# Returns a list of column boundaries (x axis).
#
# Every non-blank text element of the first line seeds one column region.
# For each following line, elements that horizontally overlap an existing
# region are merged into it; elements overlapping no region start new
# regions. Elements whose text matches ONLY_SPACES_RE are ignored.
#
# @param lines [Array] lines sorted by their +top+ attribute; each line must
#   respond to +text_elements+
# @return [Array] unique right edges of the column regions, rounded to two
#   decimals; empty when +lines+ is empty
def self.column_positions(lines)
  # Without this guard `lines.first` is nil and the method raises NoMethodError.
  return [] if lines.empty?

  # Seed one region per non-blank element of the first line.
  init = lines.first.text_elements
              .select { |te| te.text !~ ONLY_SPACES_RE }
              .map { |te| Tabula::ZoneEntity.new(*te.tlwh) }

  regions = lines[1..-1].inject(init) do |column_regions, line|
    unassigned = line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }

    column_regions.each do |cr|
      # Overlap is decided against the region as it was before merging
      # any element of this line into it.
      overlaps = unassigned.select { |te| cr.horizontally_overlaps?(te) }
      overlaps.each { |te| cr.merge!(te) }
      unassigned -= overlaps
    end

    # Whatever overlapped no existing region opens a new one.
    column_regions + unassigned.map { |te| Tabula::ZoneEntity.new(*te.tlwh) }
  end

  regions.map { |r| r.right.round(2) }.uniq
end
.create_from_text_element(text_element) ⇒ Object
initialize a new TextChunk from a TextElement
9 10 11 12 13 14 |
# File 'lib/tabula/entities/text_chunk.rb', line 9
#
# Builds a new TextChunk from a single TextElement: the chunk is
# constructed from the element's +tlwh+ values and holds that element as
# its only entry in +text_elements+.
#
# @param text_element [TextElement] must be an instance of exactly
#   TextElement (checked with +instance_of?+)
# @return [Object] the newly created chunk
# @raise [TypeError] if +text_element+ is not a TextElement
def self.create_from_text_element(text_element)
  unless text_element.instance_of?(TextElement)
    raise TypeError, "argument is not a TextElement"
  end

  new(*text_element.tlwh).tap do |chunk|
    chunk.text_elements = [text_element]
  end
end
.group_by_lines(text_chunks) ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/tabula/entities/text_chunk.rb', line 16
#
# Groups text chunks into Line objects.
#
# Chunks are appended to the current line until a chunk's
# +horizontal_overlap_ratio+ with that line drops below 0.01, at which
# point a new line is started. A finished line is discarded when it is
# wider than 90% of the chunks' bounding box AND every one of its text
# elements matches SAME_CHAR_RE (presumably a run of one repeated
# character, e.g. a row of dashes -- confirm against the constant).
#
# @param text_chunks [Array] chunks responding to +left+ and +right+
# @return [Array] the result of +remove_sequential_spaces!+ for each kept
#   line; empty when +text_chunks+ is empty
def self.group_by_lines(text_chunks)
  # Without this guard `max_by` returns nil and the method raises NoMethodError.
  return [] if text_chunks.empty?

  bbwidth = text_chunks.max_by(&:right).right - text_chunks.min_by(&:left).left

  # The repeated-character test must inspect the candidate line itself.
  # It previously always inspected the first line, so later lines were
  # dropped or kept based on the first line's content.
  filler = lambda do |line|
    line.width / bbwidth > 0.9 &&
      line.text_elements.all? { |el| el.text =~ SAME_CHAR_RE }
  end

  first_line = Line.new
  first_line << text_chunks.first

  grouped = text_chunks[1..-1].inject([first_line]) do |acc, chunk|
    if acc.last.horizontal_overlap_ratio(chunk) < 0.01
      # The current line is complete: drop it if it is filler, then open a new one.
      acc.pop if filler.call(acc.last)
      acc << Line.new
    end
    acc.last << chunk
    acc
  end

  # The last line is never closed inside the loop, so check it here.
  grouped.pop if filler.call(grouped.last)

  grouped.map!(&:remove_sequential_spaces!)
end
Instance Method Details
#<<(text_element) ⇒ Object
add a TextElement to this TextChunk
80 81 82 83 |
# File 'lib/tabula/entities/text_chunk.rb', line 80
#
# Adds a TextElement to this TextChunk: the element is appended to
# +text_elements+ and then merged into this chunk via +merge!+.
#
# @param text_element [Object] the element to add
# @return [Object] whatever +merge!+ returns
def <<(text_element)
  text_elements.push(text_element)
  merge!(text_element)
end
#inspect ⇒ Object
107 108 109 |
# File 'lib/tabula/entities/text_chunk.rb', line 107
#
# Debug representation: top, left, bottom and right (each rounded to two
# decimals, comma-separated) followed by the chunk's text in single quotes.
#
# @return [String]
def inspect
  edges = [top, left, bottom, right].map { |edge| edge.round(2) }.join(',')
  "#<TextChunk: #{edges} '#{text}'>"
end
#merge!(other) ⇒ Object
85 86 87 88 89 90 91 92 93 94 |
# File 'lib/tabula/entities/text_chunk.rb', line 85
#
# Merges +other+ into this chunk. When +other+ is exactly a TextChunk, its
# text elements are combined with this chunk's: this chunk's elements come
# first if +self <=> other+ is negative, otherwise +other+'s come first.
# The merge itself is then delegated to the superclass.
#
# @param other [Object] the entity to merge in
# @return [Object] the superclass +merge!+ result
def merge!(other)
  if other.instance_of?(TextChunk)
    leading, trailing = (self <=> other) < 0 ? [self, other] : [other, self]
    self.text_elements = leading.text_elements + trailing.text_elements
  end

  super(other)
end
#split_vertically!(y) ⇒ Object
split this TextChunk vertically (in place, returns the remaining chunk)
99 100 101 |
# File 'lib/tabula/entities/text_chunk.rb', line 99
#
# Split this TextChunk vertically (in place, returns the remaining chunk).
# Not implemented: every call raises.
#
# @param y [Object] unused
# @raise [RuntimeError] always, with the message "Not Implemented"
def split_vertically!(y)
  raise RuntimeError, "Not Implemented"
end
#text ⇒ Object
103 104 105 |
# File 'lib/tabula/entities/text_chunk.rb', line 103
#
# The chunk's text: the +text+ of every element in +text_elements+,
# joined in order.
#
# @return [String]
def text
  pieces = text_elements.map { |element| element.text }
  pieces.join
end
#to_h ⇒ Object
111 112 113 |
# File 'lib/tabula/entities/text_chunk.rb', line 111
#
# Hash representation: the superclass hash merged with a +:text+ entry
# holding this chunk's text.
#
# @return [Hash]
def to_h
  base = super
  base.merge(text: text)
end