Class: Llmsherpa::LayoutReader
- Inherits:
-
Object
- Object
- Llmsherpa::LayoutReader
- Defined in:
- lib/llmsherpa/blocks.rb
Instance Method Summary collapse
-
#debug(pdf_root) ⇒ Object
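Prints an indented outline of a layout tree (each node's tag, child count and text) for debugging. Reading the layout tree from the JSON returned by the parser API is done by #read.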
- #read(blocks_json) ⇒ Object
Instance Method Details
#debug(pdf_root) ⇒ Object
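Prints an indented outline of a layout tree (each node's tag, child count and text) for debugging. Reading the layout tree from the JSON returned by the parser API is done by #read.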
377 378 379 380 381 382 383 384 385 |
# File 'lib/llmsherpa/blocks.rb', line 377
#
# Prints an outline of every descendant of +pdf_root+ to standard output,
# one line per node, visiting each node before its own children.
#
# A line consists of one "-" per nesting depth (none for direct children of
# +pdf_root+), a space, the node's +tag+, its child count in parentheses and
# the result of its +to_text+. +pdf_root+ itself is not printed.
#
# @param pdf_root [#children] node whose descendants are printed; every
#   descendant must respond to +tag+, +children+ and +to_text+
# @return [Object] whatever +pdf_root.children.each+ returns
def debug(pdf_root)
  print_subtree = ->(parent_node, depth) do
    prefix = "-" * depth
    parent_node.children.each do |kid|
      puts "#{prefix} #{kid.tag} (#{kid.children.length}) #{kid.to_text}"
      print_subtree.call(kid, depth + 1)
    end
  end

  print_subtree.call(pdf_root, 0)
end
#read(blocks_json) ⇒ Object
387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 |
# File 'lib/llmsherpa/blocks.rb', line 387
#
# Builds a tree of block objects from a flat, ordered list of block hashes
# (the JSON returned by the parser API) and returns the tree's root.
#
# Each hash's "tag" selects the node class and how the node is attached:
#
# * "para" (Paragraph) and "table" (Table): child of the most recent header,
#   or of the root when no header has been seen. A Table is additionally
#   given the node that preceded it.
# * "header" (Section): child of the nearest still-open header whose level
#   is lower than its own (the root if there is none); it then becomes the
#   parent for the blocks that follow.
# * "list_item" (ListItem): child of a directly preceding paragraph with the
#   same level, or of the enclosing list item when nested deeper; otherwise
#   child of the current header/root. Any non-list block ends the list run.
#
# @param blocks_json [#each] block hashes, each readable via +block["tag"]+
# @return [Block] the root node of the assembled tree
# @raise [RuntimeError] if a block carries an unsupported tag
def read(blocks_json)
  root = Block.new
  parent_stack = [root] # open headers, outermost first; the root is never popped
  parent = root         # most recent header (the root until one is seen)
  prev_node = root
  list_stack = []       # candidate parents for list items of the current run

  blocks_json.each do |block|
    tag = block["tag"]
    # Anything other than a list item terminates the current list run.
    list_stack = [] unless tag == "list_item"

    case tag
    when "para"
      node = Paragraph.new(block)
      parent.add_child(node)
    when "table"
      node = Table.new(block, prev_node)
      parent.add_child(node)
    when "list_item"
      node = ListItem.new(block)
      if prev_node.tag == "para" && prev_node.level == node.level
        # e.g. "The following items apply:" followed by its items.
        list_stack << prev_node
      elsif prev_node.tag == "list_item"
        if node.level > prev_node.level
          # Deeper than the previous item: the previous item is the parent.
          list_stack << prev_node
        elsif node.level < prev_node.level
          # Back out of the nested list. Deeper entries are dropped, and so
          # is a list item of the *same* level: it is this node's sibling,
          # not its parent. A same-level paragraph anchor is kept.
          while (top = list_stack.last) &&
                (top.level > node.level ||
                 (top.level == node.level && top.tag == "list_item"))
            list_stack.pop
          end
        end
      end
      (list_stack.last || parent).add_child(node)
    when "header"
      node = Section.new(block)
      if node.level > parent.level
        parent_stack << node
        parent.add_child(node)
      else
        # Close every open header at the same or a deeper level.
        parent_stack.pop while parent_stack.length > 1 && parent_stack.last.level >= node.level
        parent_stack.last.add_child(node)
        parent_stack << node
      end
      parent = node
    else
      raise "Unsupported block type: #{tag}"
    end

    prev_node = node
  end

  root
end