Class: Tabula::TextChunk

Inherits:
ZoneEntity
  • Object
show all
Defined in:
lib/tabula/entities/text_chunk.rb

Overview

A “collection” of TextElements.

Instance Attribute Summary collapse

Attributes inherited from ZoneEntity

#texts

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from ZoneEntity

#<=>, #initialize, #points, #tlbr, #tlwh, #to_json

Constructor Details

This class inherits a constructor from Tabula::ZoneEntity

Instance Attribute Details

#font ⇒ Object

Returns the value of attribute font.



5
6
7
# File 'lib/tabula/entities/text_chunk.rb', line 5

# Reader for the @font instance variable.
#
# @return [Object] whatever is currently stored in @font
def font
  @font
end

#font_size ⇒ Object

Returns the value of attribute font_size.



5
6
7
# File 'lib/tabula/entities/text_chunk.rb', line 5

# Reader for the @font_size instance variable.
#
# @return [Object] whatever is currently stored in @font_size
def font_size
  @font_size
end

#text_elements ⇒ Object

Returns the value of attribute text_elements.



5
6
7
# File 'lib/tabula/entities/text_chunk.rb', line 5

# Reader for the @text_elements instance variable.
#
# @return [Object] whatever is currently stored in @text_elements
def text_elements
  @text_elements
end

#width_of_space ⇒ Object

Returns the value of attribute width_of_space.



5
6
7
# File 'lib/tabula/entities/text_chunk.rb', line 5

# Reader for the @width_of_space instance variable.
#
# @return [Object] whatever is currently stored in @width_of_space
def width_of_space
  @width_of_space
end

Class Method Details

.column_positions(lines) ⇒ Object

Returns a list of column boundaries (x axis). `lines` must be an array of lines sorted by their top attribute.



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/tabula/entities/text_chunk.rb', line 48

# Computes column boundaries along the x axis for a set of text lines.
#
# The non-blank chunks of the first line seed a list of column regions. For
# every following line, each non-blank chunk that horizontally overlaps an
# existing region is merged into that region (and is then no longer offered
# to later regions); chunks that overlap no region start a new region.
#
# @param lines [Array] lines sorted by their top attribute; every entry must
#   respond to #text_elements
# @return [Array<Numeric>] the distinct right edges of the column regions,
#   each rounded to two decimals; an empty array when +lines+ is empty
def self.column_positions(lines)
  # Without any line there is nothing to seed the regions from.
  return [] if lines.empty?

  init = lines.first.text_elements
    .reject { |text_chunk| text_chunk.text =~ ONLY_SPACES_RE }
    .map { |text_chunk| Tabula::ZoneEntity.new(*text_chunk.tlwh) }

  regions = lines[1..-1].inject(init) do |column_regions, line|
    # Blank chunks never contribute to a column.
    remaining = line.text_elements.reject { |te| te.text =~ ONLY_SPACES_RE }

    column_regions.each do |cr|
      # Collect the overlaps before merging: merging widens the region.
      overlaps = remaining.select { |te| cr.horizontally_overlaps?(te) }
      overlaps.each { |te| cr.merge!(te) }
      remaining -= overlaps
    end

    # Whatever matched no existing region opens a region of its own.
    column_regions + remaining.map { |te| Tabula::ZoneEntity.new(*te.tlwh) }
  end

  regions.map { |r| r.right.round(2) }.uniq
end

.create_from_text_element(text_element) ⇒ Object

initialize a new TextChunk from a TextElement

Raises:

  • (TypeError)


9
10
11
12
13
14
# File 'lib/tabula/entities/text_chunk.rb', line 9

# Builds a chunk that wraps exactly one TextElement.
#
# The new instance is constructed from the element's tlwh values and its
# text_elements list is set to a one-item array holding the element.
#
# @param text_element [TextElement] must be an instance of exactly TextElement
# @return [Object] the freshly created instance of the receiving class
# @raise [TypeError] when the argument is not an instance of TextElement
def self.create_from_text_element(text_element)
  unless text_element.instance_of?(TextElement)
    raise TypeError, "argument is not a TextElement"
  end

  new(*text_element.tlwh).tap do |chunk|
    chunk.text_elements = [text_element]
  end
end

.group_by_lines(text_chunks) ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/tabula/entities/text_chunk.rb', line 16

# Groups text chunks into Line objects.
#
# The first chunk opens the first line. Each following chunk is appended to
# the current (last) line, unless that line's horizontal_overlap_ratio with
# the chunk is below 0.01, in which case a new Line is started first.
#
# A finished line is dropped when both of these hold:
# - it is wider than 90% of the bounding-box width of all chunks, and
# - every chunk in it matches SAME_CHAR_RE (a single repeated character).
#
# The repeated-character check is applied to the line being finished, not to
# the very first line of the input.
#
# @param text_chunks [Array] chunks responding to #left, #right and #text
# @return [Array] the result of calling remove_sequential_spaces! on every
#   kept line; an empty array when +text_chunks+ is empty
def self.group_by_lines(text_chunks)
  return [] if text_chunks.empty?

  bbwidth = text_chunks.max_by(&:right).right - text_chunks.min_by(&:left).left

  # True for lines that should be skipped (see the method description).
  separator = lambda do |line|
    line.width / bbwidth > 0.9 &&
      line.text_elements.all? { |te| te.text =~ SAME_CHAR_RE }
  end

  first_line = Line.new
  first_line << text_chunks.first

  grouped = text_chunks[1..-1].inject([first_line]) do |acc, chunk|
    if acc.last.horizontal_overlap_ratio(chunk) < 0.01
      # The current line is complete: discard it if it is a separator.
      acc.pop if separator.call(acc.last)
      acc << Line.new
    end
    acc.last << chunk
    acc
  end

  # The final line never meets a "next chunk", so check it explicitly.
  grouped.pop if separator.call(grouped.last)

  grouped.map!(&:remove_sequential_spaces!)
end

Instance Method Details

#<<(text_element) ⇒ Object

add a TextElement to this TextChunk



80
81
82
83
# File 'lib/tabula/entities/text_chunk.rb', line 80

# Appends a TextElement to this chunk.
#
# The element is pushed onto text_elements and then merged into the
# receiver via merge!.
#
# @param text_element [Object] the element to add
# @return [Object] whatever merge! returns
def <<(text_element)
  text_elements << text_element
  merge!(text_element)
end

#inspect ⇒ Object



107
108
109
# File 'lib/tabula/entities/text_chunk.rb', line 107

# Short human-readable description of the chunk.
#
# @return [String] "#<TextChunk: top,left,bottom,right 'text'>" with each
#   coordinate rounded to two decimals
def inspect
  coords = [top, left, bottom, right].map { |value| value.round(2) }
  "#<TextChunk: #{coords.join(',')} '#{text}'>"
end

#merge!(other) ⇒ Object



85
86
87
88
89
90
91
92
93
94
# File 'lib/tabula/entities/text_chunk.rb', line 85

# Merges +other+ into this chunk.
#
# When +other+ is exactly a TextChunk, the two text_elements lists are
# concatenated: the receiver's elements come first if the receiver compares
# lower than +other+ via <=>, otherwise +other+'s elements come first.
# In every case the superclass merge! is then invoked with +other+.
#
# @param other [Object] the entity to merge in
# @return [Object] the return value of the superclass merge!
def merge!(other)
  if other.instance_of?(TextChunk)
    leading, trailing = (self <=> other) < 0 ? [self, other] : [other, self]
    self.text_elements = leading.text_elements + trailing.text_elements
  end
  super(other)
end

#split_vertically!(y) ⇒ Object

Split this TextChunk vertically (in place, returns the remaining chunk). Not implemented yet: the method body only raises "Not Implemented".



99
100
101
# File 'lib/tabula/entities/text_chunk.rb', line 99

# Placeholder for splitting the chunk vertically; not implemented.
#
# @param y [Object] ignored by the current body
# @raise [RuntimeError] always, with the message "Not Implemented"
def split_vertically!(y)
  raise RuntimeError, "Not Implemented"
end

#text ⇒ Object



103
104
105
# File 'lib/tabula/entities/text_chunk.rb', line 103

# Concatenated text of every element in the chunk.
#
# @return [String] the #text of each entry in text_elements, joined in
#   order with the default separator
def text
  pieces = text_elements.map { |element| element.text }
  pieces.join
end

#to_h ⇒ Object



111
112
113
# File 'lib/tabula/entities/text_chunk.rb', line 111

# Hash representation of the chunk.
#
# @return [Hash] the superclass #to_h result with a :text entry holding
#   this chunk's text merged in
def to_h
  base = super
  base.merge(:text => text)
end