Class: Llmsherpa::Block

Inherits:
Object
  • Object
show all
Defined in:
lib/llmsherpa/blocks.rb

Overview

A block is a node in the layout tree. It can be a paragraph, a list item, a table, or a section header. This is the base class for all blocks such as Paragraph, ListItem, Table, Section.

Direct Known Subclasses

ListItem, Paragraph, Section, Table, TableCell, TableHeader, TableRow

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(block_json = nil) ⇒ Block

Returns a new instance of Block.



9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/llmsherpa/blocks.rb', line 9

# Builds a block from its parsed layout description.
#
# Every attribute is assigned unconditionally, so each instance variable
# exists (holding nil) even when +block_json+ is nil or lacks the key. This
# keeps +inspect+ output uniform and avoids "instance variable not
# initialized" warnings on older Rubies run with -w.
#
# @param block_json [#key?, #[], nil] presumably a Hash with string keys
#   ("tag", "level", "page_idx", "block_idx", "top", "left", "bbox",
#   "sentences") -- TODO confirm against the caller; nil yields an empty block
# @return [Block] a new instance of Block
def initialize(block_json = nil)
  # Value stored under +key+, or nil when block_json is nil or has no such key.
  read = ->(key) { block_json[key] if block_json&.key?(key) }

  @tag = read.call("tag")
  # A missing (or nil/false) level falls back to 0.
  @level = read.call("level") || 0
  @page_idx = read.call("page_idx")
  @block_idx = read.call("block_idx")
  @top = read.call("top")
  @left = read.call("left")
  @bbox = read.call("bbox")
  @sentences = read.call("sentences")
  @children = []
  @parent = nil
  # Keep the raw input around unmodified.
  @block_json = block_json
end

Instance Attribute Details

#bbox ⇒ Object

Returns the value of attribute bbox.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +bbox+ attribute.
#
# @return [Object, nil] the value #initialize took from block_json["bbox"];
#   nil when it was never assigned
def bbox
  @bbox
end

#block_idx ⇒ Object

Returns the value of attribute block_idx.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +block_idx+ attribute.
#
# @return [Object, nil] the value #initialize took from block_json["block_idx"];
#   nil when it was never assigned
def block_idx
  @block_idx
end

#block_json ⇒ Object

Returns the value of attribute block_json.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +block_json+ attribute.
#
# @return [Object, nil] the argument that was handed to #initialize, stored as-is
def block_json
  @block_json
end

#children ⇒ Object

Returns the value of attribute children.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +children+ attribute.
#
# @return [Array] the child list; #initialize starts it empty and #add_child
#   appends to it
def children
  @children
end

#left ⇒ Object

Returns the value of attribute left.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +left+ attribute.
#
# @return [Object, nil] the value #initialize took from block_json["left"];
#   nil when it was never assigned
def left
  @left
end

#level ⇒ Object

Returns the value of attribute level.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +level+ attribute.
#
# @return [Object] the value #initialize took from block_json["level"], or 0
#   when that key was missing or held nil/false
def level
  @level
end

#page_idx ⇒ Object

Returns the value of attribute page_idx.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +page_idx+ attribute.
#
# @return [Object, nil] the value #initialize took from block_json["page_idx"];
#   nil when it was never assigned
def page_idx
  @page_idx
end

#parent ⇒ Object

Returns the value of attribute parent.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +parent+ attribute.
#
# @return [Object, nil] the parent block; #initialize sets it to nil, and
#   #add_child on another block assigns it through the child's parent= writer
def parent
  @parent
end

#sentences ⇒ Object

Returns the value of attribute sentences.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +sentences+ attribute.
#
# @return [Object, nil] the value #initialize took from block_json["sentences"];
#   nil when it was never assigned
def sentences
  @sentences
end

#tag ⇒ Object

Returns the value of attribute tag.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +tag+ attribute.
#
# @return [Object, nil] the value #initialize took from block_json["tag"];
#   elsewhere in this class it is compared against "para", "list_item",
#   "table" and "header"
def tag
  @tag
end

#top ⇒ Object

Returns the value of attribute top.



7
8
9
# File 'lib/llmsherpa/blocks.rb', line 7

# Reader for the +top+ attribute.
#
# @return [Object, nil] the value #initialize took from block_json["top"];
#   nil when it was never assigned
def top
  @top
end

Instance Method Details

#add_child(node) ⇒ Object

Adds a child to the block. Sets the parent of the child to self.



24
25
26
27
# File 'lib/llmsherpa/blocks.rb', line 24

# Appends +node+ to this block's children and points the node back at this
# block as its parent.
#
# @param node [#parent=] the block to attach
# @return [Object] this block (the value of the parent assignment)
def add_child(node)
  @children << node
  node.parent = self
end

#chunks ⇒ Object

Returns all the chunks in the block. Chunking automatically splits the document into paragraphs, lists, and tables without any prior knowledge of the document structure.



93
94
95
96
97
98
99
# File 'lib/llmsherpa/blocks.rb', line 93

# Collects every node reached by #iter_children whose tag is "para",
# "list_item" or "table", in visiting order.
#
# @return [Array] the matching nodes (empty when there are none)
def chunks
  chunk_tags = %w[para list_item table]
  found = []
  iter_children(self, 0) { |candidate| found << candidate if chunk_tags.include?(candidate.tag) }
  found
end

#iter_children(node, level, &node_visitor) ⇒ Object

Iterates over all the children of the node and calls the node_visitor function on each child.



76
77
78
79
80
81
# File 'lib/llmsherpa/blocks.rb', line 76

# Walks the subtree below +node+ depth-first, handing each child to the
# visitor before looking at that child's own children. Children tagged
# "list_item", "para" or "table" are visited but not descended into.
#
# @param node [#children] the node whose descendants are walked
# @param level [Integer] depth counter; only incremented and passed along
# @param node_visitor [Proc] called with each visited child
# @return [Object] whatever +node.children.each+ returns
def iter_children(node, level, &node_visitor)
  leaf_tags = %w[list_item para table]
  node.children.each do |child|
    node_visitor.call(child)
    next if leaf_tags.include?(child.tag)

    iter_children(child, level + 1, &node_visitor)
  end
end

#paragraphs ⇒ Object

Returns all the paragraphs in the block. This is useful for getting all the paragraphs in a section.



84
85
86
87
88
89
90
# File 'lib/llmsherpa/blocks.rb', line 84

# Collects every node reached by #iter_children whose tag is "para",
# in visiting order.
#
# @return [Array] the matching nodes (empty when there are none)
def paragraphs
  [].tap do |found|
    iter_children(self, 0) { |candidate| found << candidate if candidate.tag == "para" }
  end
end

#parent_chain ⇒ Object

Returns the parent chain of the block consisting of all the parents of the block until the root.



36
37
38
39
40
41
42
43
44
# File 'lib/llmsherpa/blocks.rb', line 36

# Lists this block's ancestors, outermost first and the direct parent last.
#
# @return [Array] the ancestors; empty when this block has no parent
def parent_chain
  ancestors = []
  current = parent
  while current
    # Prepending keeps the outermost ancestor at the front.
    ancestors.unshift(current)
    current = current.parent
  end
  ancestors
end

#parent_text ⇒ Object

Returns the text of the parent chain of the block. This is useful for adding section information to the text.



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/llmsherpa/blocks.rb', line 47

# Renders the ancestors of this block as context text: the texts of "header"
# ancestors joined with " > ", followed, when any exist, by a newline and
# the texts of "list_item"/"para" ancestors, one per line.
#
# @return [String] the combined ancestor text ("" when nothing matches)
def parent_text
  headers = []
  paras = []
  parent_chain.each do |ancestor|
    case ancestor.tag
    when "header"
      headers << ancestor.to_text
    when "list_item", "para"
      paras << ancestor.to_text
    end
  end

  breadcrumb = headers.join(" > ")
  return breadcrumb if paras.empty?

  breadcrumb + "\n#{paras.join("\n")}"
end

#sections ⇒ Object

Returns all the sections in the block. This is useful for getting all the sections in a document.



112
113
114
115
116
117
118
# File 'lib/llmsherpa/blocks.rb', line 112

# Collects every node reached by #iter_children whose tag is "header",
# in visiting order.
#
# @return [Array] the matching nodes (empty when there are none)
def sections
  headers = []
  iter_children(self, 0) do |candidate|
    next unless candidate.tag == "header"

    headers << candidate
  end
  headers
end

#tables ⇒ Object

Returns all the tables in the block. This is useful for getting all the tables in a section.



103
104
105
106
107
108
109
# File 'lib/llmsherpa/blocks.rb', line 103

# Collects every node reached by #iter_children whose tag is "table",
# in visiting order.
#
# @return [Array] the matching nodes (empty when there are none)
def tables
  found = []
  iter_children(self, 0) { |candidate| found << candidate if candidate.tag == "table" }
  found
end

#to_context_text(include_section_info = true) ⇒ Object

Returns the text of the block with section information. This provides context to the text.



64
65
66
67
68
69
70
71
72
73
# File 'lib/llmsherpa/blocks.rb', line 64

# Renders this block as text, optionally prefixed with the text of its
# ancestors so the result carries its section context.
#
# Blocks tagged "list_item", "para" or "table" are rendered with
# +to_text(true, true)+; every other block uses plain +to_text+.
#
# @param include_section_info [Boolean] when truthy, the result starts with
#   #parent_text followed by a newline
# @return [String] the (optionally prefixed) text of the block
def to_context_text(include_section_info = true)
  text = ""
  text += "#{parent_text}\n" if include_section_info
  text += if %w[list_item para table].include?(@tag)
            to_text(true, true)
          else
            to_text
          end
  text
end

#to_html(include_children = false, recurse = false) ⇒ Object

Converts the block to html. This is a virtual method and should be implemented by the derived classes.



30
# File 'lib/llmsherpa/blocks.rb', line 30

# HTML rendering hook. The base implementation does nothing and returns nil;
# both arguments are accepted and ignored here.
#
# @param include_children [Boolean] unused in this base implementation
# @param recurse [Boolean] unused in this base implementation
# @return [nil]
def to_html(include_children = false, recurse = false)
  nil
end

#to_text(include_children = false, recurse = false) ⇒ Object

Converts the block to text. This is a virtual method and should be implemented by the derived classes.



33
# File 'lib/llmsherpa/blocks.rb', line 33

# Text rendering hook. The base implementation does nothing and returns nil;
# both arguments are accepted and ignored here.
#
# @param include_children [Boolean] unused in this base implementation
# @param recurse [Boolean] unused in this base implementation
# @return [nil]
def to_text(include_children = false, recurse = false)
  nil
end