Class: Llmsherpa::LayoutReader

Inherits:
Object
  • Object
show all
Defined in:
lib/llmsherpa/blocks.rb

Instance Method Summary collapse

Instance Method Details

#debug(pdf_root) ⇒ Object

Prints an indented outline of the layout tree that #read builds from the JSON returned by the parser API: one line per node showing its tag, child count and text.



377
378
379
380
381
382
383
384
385
# File 'lib/llmsherpa/blocks.rb', line 377

# Prints an outline of every node below +pdf_root+ to standard output.
#
# Each descendant gets one line of the form
# "<dashes> <tag> (<number of children>) <text>", where the number of
# dashes equals the node's depth below the root (direct children get none).
# Nodes are visited depth-first, parents before their children.
#
# @param pdf_root [#children] root of the tree; every node must respond to
#   +children+, +tag+ and +to_text+
# @return [Object] the result of iterating the root's children; callers
#   should not rely on it
def debug(pdf_root)
  walk = lambda do |parent, depth|
    prefix = "-" * depth
    parent.children.each do |kid|
      puts "#{prefix} #{kid.tag} (#{kid.children.length}) #{kid.to_text}"
      # Descend immediately so a subtree is printed right under its parent.
      walk.call(kid, depth + 1)
    end
  end

  walk.call(pdf_root, 0)
end

#read(blocks_json) ⇒ Object



387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
# File 'lib/llmsherpa/blocks.rb', line 387

# Builds the layout tree from the flat, ordered list of blocks returned by
# the parser API.
#
# Blocks are consumed in order and attached as follows:
# * "header" blocks become Section nodes. A section is attached to the
#   nearest open section whose level is lower than its own (or to the root)
#   and then becomes the current section.
# * "para" and "table" blocks are attached to the current section.
# * "list_item" blocks are attached to the innermost open list container
#   (a lead-in paragraph or an enclosing list item) when there is one,
#   otherwise to the current section. Any non-list block closes all open
#   list containers.
#
# @param blocks_json [Enumerable<Hash>] parsed blocks; each block's "tag"
#   selects the node class and the block is handed to that class's constructor
# @return [Block] the root node (created with +Block.new+) holding the tree
# @raise [RuntimeError] if a block's "tag" is not "para", "table",
#   "list_item" or "header"
def read(blocks_json)
  root = Block.new
  # Open sections, innermost last. The last entry is always the section that
  # receives paragraphs and tables; the root is never popped.
  parent_stack = [root]
  # Open list containers, innermost last.
  list_stack = []
  prev_node = root

  blocks_json.each do |block|
    tag = block["tag"]
    # Anything other than a list item ends the current list nesting.
    list_stack = [] unless tag == "list_item"

    case tag
    when "para"
      node = Paragraph.new(block)
      parent_stack.last.add_child(node)
    when "table"
      node = Table.new(block, prev_node)
      parent_stack.last.add_child(node)
    when "list_item"
      node = ListItem.new(block)
      if prev_node.tag == "para" && prev_node.level == node.level
        # A paragraph immediately followed by a list item at its own level is
        # treated as the list's lead-in and becomes the list's container.
        list_stack << prev_node
      elsif prev_node.tag == "list_item"
        if node.level > prev_node.level
          # Indent: the previous item becomes the container of a nested list.
          list_stack << prev_node
        elsif node.level < prev_node.level
          # Dedent: the innermost container held the nested list that just
          # ended, so it is always dropped; keep unwinding while the container
          # just removed was deeper than this item. Without the unconditional
          # first pop the item would be attached to its own sibling.
          while (closed = list_stack.pop)
            break unless closed.level > node.level
          end
        end
      end
      (list_stack.last || parent_stack.last).add_child(node)
    when "header"
      node = Section.new(block)
      # Close every open section at the same or a deeper level. When the new
      # section is deeper than the current one nothing is popped and it simply
      # nests under the current section.
      parent_stack.pop while parent_stack.length > 1 && parent_stack.last.level >= node.level
      parent_stack.last.add_child(node)
      parent_stack << node
    else
      raise "Unsupported block type: #{tag}"
    end

    prev_node = node
  end

  root
end