Class: Tabula::TextElement
- Inherits:
-
ZoneEntity
- Object
- java.awt.geom.Rectangle2D::Float
- ZoneEntity
- Tabula::TextElement
- Defined in:
- lib/tabula/entities/text_element.rb
Overview
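A glyph: a single piece of text placed on the page, together with its font, font size, width of a space, and direction.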
Constant Summary collapse
- TOLERANCE_FACTOR =
0.25
- EMPTY =
TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
Instance Attribute Summary collapse
-
#direction ⇒ Object
Returns the value of attribute direction.
-
#font ⇒ Object
Returns the value of attribute font.
-
#font_size ⇒ Object
Returns the value of attribute font_size.
-
#text ⇒ Object
Returns the value of attribute text.
-
#width_of_space ⇒ Object
Returns the value of attribute width_of_space.
Attributes inherited from ZoneEntity
Class Method Summary collapse
-
.merge_words(text_elements, options = {}) ⇒ Object
Heuristically merges an iterable of TextElement into a list of TextChunk. Many of the ideas are taken from PDFBox’s PDFTextStripper.writePage. Here be dragons.
- .overlap(y1, height1, y2, height2, variance = 0.1) ⇒ Object
- .within(first, second, variance) ⇒ Object
Instance Method Summary collapse
- #==(other) ⇒ Object
-
#initialize(top, left, width, height, font, font_size, text, width_of_space, direction = 0) ⇒ TextElement
constructor
A new instance of TextElement.
- #inspect ⇒ Object
-
#merge!(other) ⇒ Object
merge this TextElement with another (adjust size and text content accordingly).
- #to_h ⇒ Object
Methods inherited from ZoneEntity
#<=>, #points, #tlbr, #tlwh, #to_json
Constructor Details
#initialize(top, left, width, height, font, font_size, text, width_of_space, direction = 0) ⇒ TextElement
Returns a new instance of TextElement.
11 12 13 14 15 16 17 18 |
# File 'lib/tabula/entities/text_element.rb', line 11 def initialize(top, left, width, height, font, font_size, text, width_of_space, direction=0) super(top, left, width, height) self.font = font self.font_size = font_size self.text = text self.width_of_space = width_of_space self.direction = direction end |
Instance Attribute Details
#direction ⇒ Object
Returns the value of attribute direction.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7 def direction @direction end |
#font ⇒ Object
Returns the value of attribute font.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7 def font @font end |
#font_size ⇒ Object
Returns the value of attribute font_size.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7 def font_size @font_size end |
#text ⇒ Object
Returns the value of attribute text.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7 def text @text end |
#width_of_space ⇒ Object
Returns the value of attribute width_of_space.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7 def width_of_space @width_of_space end |
Class Method Details
.merge_words(text_elements, options = {}) ⇒ Object
Heuristically merges an iterable of TextElement into a list of TextChunk. Many of the ideas are taken from PDFBox’s PDFTextStripper.writePage. Here be dragons.
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
# File 'lib/tabula/entities/text_element.rb', line 36
# Heuristically merges a sequence of TextElement into a list of TextChunk.
# Many of the ideas come from PDFBox's PDFTextStripper.writePage.
#
# The first element seeds the first chunk. Every following element is either
# skipped (duplicate glyph / overlapping space), appended to the current chunk
# (optionally preceded by a synthesized space element), or used to start a new
# chunk.
#
# @param text_elements [Array<TextElement>] elements to merge; the argument is
#   not modified
# @param options [Hash] recognised key:
#   :vertical_rulings - objects responding to #left; a chunk never spans one
#   of them. Defaults to []; nil is treated as "no rulings".
# @return [Array<TextChunk>] the merged chunks, or [] when text_elements is empty
def self.merge_words(text_elements, options={})
  default_options = {:vertical_rulings => []}
  options = default_options.merge(options)
  # An explicit `:vertical_rulings => nil` must behave like "no rulings"
  # rather than leaving this nil and crashing on `.any?` below.
  vertical_ruling_locations = (options[:vertical_rulings] || []).map(&:left)

  return [] if text_elements.empty?

  # Read the first element without `shift` so the caller's array is left intact.
  text_chunks = [TextChunk.create_from_text_element(text_elements.first)]

  previousAveCharWidth = text_chunks.first.width
  endOfLastTextX = text_chunks.first.right
  maxYForLine = text_chunks.first.bottom
  maxHeightForLine = text_chunks.first.height
  lastWordSpacing = -1
  sp = nil

  text_elements.drop(1).inject(text_chunks) do |chunks, char|
    current_chunk = chunks.last
    prev_char = current_chunk.text_elements.last

    # Resets the average character width when we see a change in font
    # or a change in the font size
    if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
      previousAveCharWidth = -1
    end

    # if same char AND overlapped, skip
    if (prev_char.text == char.text) && prev_char.overlaps_with_ratio?(char, 0.5)
      next chunks
    end

    # if char is a space that overlaps with the prev_char, skip
    if char.text == ' ' && prev_char.left == char.left && prev_char.top == char.top
      next chunks
    end

    # any vertical ruling goes across prev_char and char?
    across_vertical_ruling = vertical_ruling_locations.any? { |loc|
      prev_char.left < loc && char.left > loc
    }

    # Estimate the expected width of the space based on the
    # space character with some margin.
    wordSpacing = char.width_of_space
    deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
                   ::Float::MAX
                 elsif lastWordSpacing < 0
                   wordSpacing * 0.5 # 0.5 == spacingTolerance
                 else
                   ((wordSpacing + lastWordSpacing) / 2.0) * 0.5
                 end

    # Estimate the expected width of the space based on the
    # average character width with some margin. This calculation does not
    # make a true average (average of averages) but we found that it gave the
    # best results after numerous experiments. Based on experiments we also found that
    # .3 worked well.
    averageCharWidth = if previousAveCharWidth < 0
                         char.width / char.text.size
                       else
                         (previousAveCharWidth + (char.width / char.text.size)) / 2.0
                       end
    deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance

    # Compares the values obtained by the average method and the wordSpacing method and picks
    # the smaller number.
    expectedStartOfNextWordX = -::Float::MAX
    if endOfLastTextX != -1
      expectedStartOfNextWordX = endOfLastTextX + [deltaCharWidth, deltaSpace].min
    end

    # A char that does not vertically overlap the running line starts a new
    # line: forget the per-line extents gathered so far.
    sameLine = true
    if !overlap(char.bottom, char.height, maxYForLine, maxHeightForLine)
      endOfLastTextX = -1
      expectedStartOfNextWordX = -::Float::MAX
      maxYForLine = -::Float::MAX
      maxHeightForLine = -1
      sameLine = false
    end

    endOfLastTextX = char.right

    # should we add a space?
    if !across_vertical_ruling &&
       sameLine &&
       expectedStartOfNextWordX < char.left &&
       !prev_char.text.end_with?(' ')
      sp = self.new(prev_char.top,
                    prev_char.right,
                    expectedStartOfNextWordX - prev_char.right,
                    prev_char.height,
                    prev_char.font,
                    prev_char.font_size,
                    ' ',
                    prev_char.width_of_space)
      current_chunk << sp
    else
      sp = nil
    end

    maxYForLine = [char.bottom, maxYForLine].max
    maxHeightForLine = [maxHeightForLine, char.height].max

    # if sameLine
    #   puts "prev: #{prev_char.text} - char: #{char.text} - diff: #{char.left - prev_char.right} - space: #{[deltaCharWidth, deltaSpace].min} - spacing: #{wordSpacing} - sp: #{!sp.nil?}"
    # else
    #   puts
    # end

    # Horizontal gap between this char and whatever now ends the chunk
    # (the synthesized space, if one was just added).
    dist = (char.left - (sp ? sp.right : prev_char.right))

    if !across_vertical_ruling &&
       sameLine &&
       (dist < 0 ? current_chunk.vertically_overlaps?(char) : dist < wordSpacing)
      current_chunk << char
    else
      # create a new chunk
      chunks << TextChunk.create_from_text_element(char)
    end

    lastWordSpacing = wordSpacing
    previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth

    chunks
  end
end
.overlap(y1, height1, y2, height2, variance = 0.1) ⇒ Object
26 27 28 29 |
# File 'lib/tabula/entities/text_element.rb', line 26
# Tells whether two vertical extents overlap.
#
# The extents are given as a y coordinate plus a height measured back from it
# (merge_words passes bottom edges). They overlap when the two y values are
# within +variance+ of each other, or when either y value lies inside the
# other extent, i.e. between (y - height) and y inclusive.
#
# @param y1 [Numeric] y coordinate of the first extent
# @param height1 [Numeric] height of the first extent
# @param y2 [Numeric] y coordinate of the second extent
# @param height2 [Numeric] height of the second extent
# @param variance [Numeric] tolerance handed to within
# @return [Boolean]
def self.overlap(y1, height1, y2, height2, variance=0.1)
  return true if within(y1, y2, variance)
  return true if y2 <= y1 && y2 >= y1 - height1

  y1 <= y2 && y1 >= y2 - height2
end
.within(first, second, variance) ⇒ Object
22 23 24 |
# File 'lib/tabula/entities/text_element.rb', line 22
# Tells whether +second+ lies strictly inside the open interval
# (first - variance, first + variance).
#
# @param first [Numeric] centre of the interval
# @param second [Numeric] value being tested
# @param variance [Numeric] half-width of the interval; both ends are excluded
# @return [Boolean]
def self.within(first, second, variance)
  upper_bound = first + variance
  lower_bound = first - variance
  second < upper_bound && second > lower_bound
end
Instance Method Details
#==(other) ⇒ Object
192 193 194 |
# File 'lib/tabula/entities/text_element.rb', line 192
# Two elements are equal when their texts match after stripping leading and
# trailing whitespace; position, font and size are not compared.
#
# Comparing against an object that has no #text (nil, for instance) answers
# false instead of raising NoMethodError, and a nil text is treated as an
# empty string.
#
# @param other [Object] object to compare with
# @return [Boolean]
def ==(other)
  return false unless other.respond_to?(:text)

  self.text.to_s.strip == other.text.to_s.strip
end
#inspect ⇒ Object
188 189 190 |
# File 'lib/tabula/entities/text_element.rb', line 188
# Short debugging representation: the top, left, bottom and right edges, each
# rounded to two decimals and comma-separated, followed by the quoted text.
#
# @return [String]
def inspect
  edges = [top, left, bottom, right].map { |edge| edge.round(2) }
  "#<TextElement: #{edges.join(',')} '#{text}'>"
end
#merge!(other) ⇒ Object
Merges this TextElement with another, adjusting size and text content accordingly.
173 174 175 176 177 178 179 180 181 182 |
# File 'lib/tabula/entities/text_element.rb', line 173
# Merges another TextElement into this one: the texts are concatenated and the
# geometry is then merged by the ZoneEntity implementation (via super).
#
# The combined text is always assigned as a new String. Appending in place
# with << would also alter any other holder of the same String object and
# would raise FrozenError when the current text is frozen.
#
# @param other [TextElement] element to absorb; must be exactly a TextElement
# @raise [TypeError] if other is not an instance of TextElement
# @return [Object] whatever the ZoneEntity implementation of merge! returns
def merge!(other)
  raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)

  # NOTE(review): when self sorts before other, other's text is placed first.
  # That ordering is kept from the original; confirm it against ZoneEntity#<=>.
  self.text = if (self <=> other) < 0
                other.text + self.text
              else
                self.text + other.text
              end
  super(other)
end
#to_h ⇒ Object
184 185 186 |
# File 'lib/tabula/entities/text_element.rb', line 184
# Hash form of the element: the hash produced by the ZoneEntity implementation,
# extended with (or overridden by) the :font and :text entries.
#
# @return [Hash]
def to_h
  inherited = super
  inherited.merge(:font => font, :text => text)
end