Class: OnlyofficePdfParser::PdfStructure

Inherits:
Object
  • Object
show all
Includes:
PdfReaderHelper, PdfConvertToBmpHelper
Defined in:
lib/onlyoffice_pdf_parser/pdf_structure.rb

Overview

Class for working with and parsing PDF files

Constant Summary collapse

PAGE_SIZE_FOR_PDF =

Returns list of default page sizes and their names.

Returns:

  • (Hash)

    list of default page sizes and their names

# Maps a page size name to its pair of dimensions. #page_size looks names up
# by comparing these pairs with the result of #page_size_points (points), and
# retries with the pair reversed to recognise landscape pages.
{ 'US Letter' => [612, 792],
'US Legal' => [612, 1008],
'A4' => [595, 842],
'A5' => [420, 595],
'B5' => [499, 709],
'Envelope #10' => [297, 684],
'Envelope DL' => [312, 624],
'Tabloid' => [792, 1224],
'A3' => [842, 1191],
'Tabloid Oversize' => [864, 1296],
'ROC 16K' => [558, 774],
'Envelope Choukei 3' => [340, 666],
'Super B/A3' => [936, 1368] }.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from PdfReaderHelper

#parse_font

Methods included from PdfConvertToBmpHelper

#fetch_bmp_binary

Constructor Details

#initialize(pages: [], file_path: nil) ⇒ PdfStructure

Returns a new instance of PdfStructure.



21
22
23
24
25
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 21

# Set up the initial state of a PDF structure.
#
# @param pages [Array] collection that parsed pages are stored in
# @param file_path [String, nil] path to the PDF file
def initialize(pages: [], file_path: nil)
  # Single parallel assignment; @pages_in_bmp always starts as a fresh empty array
  @file_path, @pages, @pages_in_bmp = file_path, pages, []
end

Instance Attribute Details

#file_path ⇒ String

Returns full path to file.

Returns:

  • (String)

    full path to file



17
18
19
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 17

# Reader for @file_path, the value given to the constructor as `file_path:`.
#
# @return [String, nil] full path to file; nil when the constructor default was used
def file_path
  @file_path
end

#pages ⇒ Array, Pages

Returns array of pages.

Returns:

  • (Array, Pages)

    array of pages



15
16
17
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 15

# Reader for @pages. #pdf_reader_parse appends one hash per page to this
# collection, with the keys :text and :fonts.
#
# @return [Array] array of pages
def pages
  @pages
end

#pages_in_bmp ⇒ Array<String> (readonly)

Returns binary representation of BMPs.

Returns:

  • (Array<String>)

    binary representation of BMPs



19
20
21
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 19

# Reader for @pages_in_bmp, which the constructor initialises to an empty array.
# NOTE(review): presumably filled by #fetch_bmp_binary, which is not visible
# here - confirm against PdfConvertToBmpHelper.
#
# @return [Array<String>] binary representation of BMPs
def pages_in_bmp
  @pages_in_bmp
end

Class Method Details

.parse(filename) ⇒ PdfStructure

Parse file

Parameters:

  • filename (String)

    path to file

Returns:



94
95
96
97
98
99
100
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 94

# Build a PdfStructure for a file and run every parsing step on it.
#
# @param filename [String] path to file
# @return [PdfStructure] structure after text/font parsing, BMP fetching
#   and page size detection have been invoked, in that order
def self.parse(filename)
  PdfStructure.new(pages: [], file_path: filename).tap do |structure|
    structure.pdf_reader_parse
    structure.fetch_bmp_binary
    structure.page_size
  end
end

Instance Method Details

#[](parameter) ⇒ Object

Accessor of attributes like hash

Parameters:

  • parameter (Symbol)

    attribute name

Returns:

  • (Object)

    value of attribute



30
31
32
33
34
35
36
37
38
39
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 30

# Hash-like read access to a fixed set of attributes.
#
# @param parameter [Symbol] attribute name, either :pages or :page_size
# @return [Object] current value of the matching instance variable
# @raise [RuntimeError] if parameter is not one of the supported names
def [](parameter)
  supported = %i[pages page_size]
  raise "Unknown instance variable - #{parameter}." unless supported.include?(parameter)

  # Only the two whitelisted symbols reach this point, so the name is safe to build
  instance_variable_get(:"@#{parameter}")
end

#contain_pattern?(path_to_patter) ⇒ True, false

Returns whether the pdf file contains the graphic pattern.

Returns:

  • (True, false)

    whether the pdf file contains the graphic pattern



42
43
44
45
46
47
48
49
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 42

# Check whether any page bitmap contains the given graphic pattern.
#
# Each entry of #pages_in_bmp is wrapped in a BmpImage and searched with
# `get_sub_image_array`; a non-empty result counts as a match and stops
# the search.
#
# @param path_to_patter [String] path to the pattern image
# @return [Boolean] true if at least one page contains the pattern
def contain_pattern?(path_to_patter)
  pages_in_bmp.any? do |page_binary|
    matches = BmpImage.new(page_binary).get_sub_image_array(path_to_patter)
    !matches.empty?
  end
end

#page_size ⇒ String?

Returns name of page size.

Returns:

  • (String, nil)

    name of page size



71
72
73
74
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 71

# Detect the name of the page size and store it in @page_size.
#
# The dimensions from #page_size_points are looked up in PAGE_SIZE_FOR_PDF
# as they are; if that fails, the reversed pair is looked up and the found
# name is prefixed with "Landscape ".
#
# @return [String, nil] name of page size, nil if the dimensions match no
#   known size in either orientation
def page_size
  points = page_size_points
  @page_size = PAGE_SIZE_FOR_PDF.key(points)
  return @page_size if @page_size

  rotated_name = PAGE_SIZE_FOR_PDF.key(points.reverse)
  # Without this guard an unknown size would be reported as the string "Landscape "
  @page_size = rotated_name && "Landscape #{rotated_name}"
end

#page_size_points ⇒ Array<Integer>

Returns page size of pdf in points.

Returns:

  • (Array <Integer>)

    page size of pdf in points



62
63
64
65
66
67
68
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 62

# Read the page size from the output of the `pdfinfo` command-line tool.
# The result is memoized in @page_size_points, so `pdfinfo` runs at most once
# per successfully parsed object.
#
# @return [Array<Integer>] page size of pdf in points, each value rounded
# @raise [RuntimeError] if the `pdfinfo` output has no usable "Page size:" entry
def page_size_points
  return @page_size_points if @page_size_points

  # Argument-array form spawns pdfinfo directly, so the path is never parsed
  # by a shell (quotes, `$(...)` or backticks in a file name stay literal).
  pdfinfo = IO.popen(['pdfinfo', @file_path.to_s], &:read).to_s
  _, marker, tail = pdfinfo.partition('Page size:')
  # Take the text up to "pts"; only the part before a ", " separator is kept,
  # exactly as before (presumably to drop trailing details - confirm).
  page_size_fraction = tail.split('pts').first.to_s.strip.split(', ').first.to_s.split(' x ')
  if marker.empty? || page_size_fraction.size < 2
    raise "Unable to detect page size of '#{@file_path}' from pdfinfo output"
  end

  @page_size_points = page_size_fraction.map { |size| size.to_f.round }
end

#pdf_reader_parse ⇒ Object

Parse file using the `pdf-reader` gem



52
53
54
55
56
57
58
59
# File 'lib/onlyoffice_pdf_parser/pdf_structure.rb', line 52

# Read the file with PDF::Reader and append one entry per page to @pages.
# Every entry is a hash with the page text under :text and the result of
# #parse_font for that page under :fonts.
def pdf_reader_parse
  PDF::Reader.open(file_path.to_s) do |document|
    document.pages.each do |pdf_page|
      page_text = pdf_page.text
      page_fonts = parse_font(pdf_page)
      @pages.push({ text: page_text, fonts: page_fonts })
    end
  end
end