Class: Llmsherpa::LayoutPDFReader

Inherits:
Object
  • Object
show all
Defined in:
lib/llmsherpa/layout_pdf_reader.rb

Instance Method Summary collapse

Constructor Details

#initialize(parser_api_url) ⇒ LayoutPDFReader

Reads PDF content and parses the hierarchical layout of the document's sections and structural components.



12
13
14
# File 'lib/llmsherpa/layout_pdf_reader.rb', line 12

# Builds a reader bound to a parser API endpoint.
#
# The URL is only stored here; nothing is validated and no request is made
# during construction.
#
# @param parser_api_url [String] endpoint of the parser service
#   (String is assumed; the value is stored as given)
def initialize(parser_api_url)
  @parser_api_url = parser_api_url
end

Instance Method Details

#read_pdf(path_or_url, contents = nil) ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/llmsherpa/layout_pdf_reader.rb', line 16

# Sends a PDF to the parser API and wraps the returned layout blocks in a
# Document.
#
# The PDF is chosen as follows:
# * when +contents+ is given, it is sent as-is under the name +path_or_url+;
# * when +path_or_url+ has an http/https scheme, it is fetched with
#   +_download_pdf+;
# * otherwise +path_or_url+ is treated as a local file name and passed on
#   with +nil+ data (the file is not read in this method).
#
# @param path_or_url [String] local path, http(s) URL, or the file name to
#   report when +contents+ is supplied
# @param contents [String, nil] raw PDF data, if already in memory
#   (String is assumed; the value is forwarded untouched)
# @return [Document] document built from the "blocks" array of the response
# @raise [JSON::ParserError] if the parser response body is not valid JSON
def read_pdf(path_or_url, contents = nil)
  pdf_file =
    if contents
      [path_or_url, contents, "application/pdf"]
    else
      # Local paths are not always valid URIs (e.g. "my report.pdf" contains
      # a space). Treat anything URI.parse rejects as a plain file path
      # instead of letting URI::InvalidURIError escape.
      scheme = begin
        URI.parse(path_or_url).scheme
      rescue URI::InvalidURIError
        nil
      end

      if %w[http https].include?(scheme)
        _download_pdf(path_or_url)
      else
        # No data is attached here; only the file name travels on.
        [path_or_url, nil, "application/pdf"]
      end
    end

  parser_response = _parse_pdf(pdf_file)
  response_json = JSON.parse(parser_response.body)
  blocks = response_json["return_dict"]["result"]["blocks"]
  Document.new(blocks)
end