Class: Docsplit::PdfExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/docsplit/pdf_extractor.rb

Defined Under Namespace

Classes: OfficeNotFound

Constant Summary collapse

HOST_OS =

Provide a set of helper functions to determine the OS.

(defined?('RbConfig') ? RbConfig : Config)::CONFIG['host_os']
CLASSPATH =
"#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'".freeze
LOGGING =
"-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties".freeze
HEADLESS =
'-Djava.awt.headless=true'.freeze
@@executable =
nil
@@version_string =
nil

Instance Method Summary collapse

Instance Method Details

#extract(docs, opts) ⇒ Object

Convert documents to PDF.



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/docsplit/pdf_extractor.rb', line 121

# Convert documents to PDF.
#
# docs - a single document path or a (possibly nested) array of paths.
# opts - options hash; only opts[:output] is read here. It names the output
#        directory and defaults to '.'; the directory is created if missing.
#
# Files whose MIME type (as reported by `file`) is listed in GM_FORMATS are
# converted with GraphicsMagick; everything else goes through LibreOffice when
# libre_office? is true, or through JODConverter otherwise.
#
# Raises ExtractionFailed when the LibreOffice command does not exit
# successfully (the command output is used as the message).
def extract(docs, opts)
  out = opts[:output] || '.'
  FileUtils.mkdir_p out unless File.exist?(out)
  [docs].flatten.each do |doc|
    ext = File.extname(doc)
    basename = File.basename(doc, ext)
    # ESCAPE is presumably a shell-escaping callable (defined elsewhere in the
    # file); the escaped values are only meant for interpolation into commands.
    escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

    if GM_FORMATS.include?(`file -b --mime #{escaped_doc}`.strip.split(/[:;]\s+/)[0])
      # NOTE(review): the exit status of gm is not checked here.
      `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
    elsif libre_office?
      # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
      # The environment is not parsed by a shell, so the raw (unescaped)
      # output directory must be used to build the URL.
      ENV['SYSUSERCONFIG'] = "file://#{File.expand_path(out)}"

      options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
      cmd = "#{office_executable} #{options} 2>&1"
      result = `#{cmd}`.chomp
      # success? is nil when the child was killed by a signal, so that case
      # is reported as a failure too instead of crashing on a nil exitstatus.
      raise ExtractionFailed, result unless $?.success?
    else # open office presumably, rely on JODConverter to figure it out.
      options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
      run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
    end
  end
end

#libre_office? ⇒ Boolean

Returns:

  • (Boolean)


35
36
37
# File 'lib/docsplit/pdf_extractor.rb', line 35

# True when the version string reported by the office executable starts with
# "LibreOffice".
#
# version_string is nil when the executable printed nothing; that is treated
# as "not LibreOffice" rather than raising NoMethodError.
def libre_office?
  !!version_string.to_s.match(/^LibreOffice/)
end

#linux? ⇒ Boolean

Returns:

  • (Boolean)


18
19
20
# File 'lib/docsplit/pdf_extractor.rb', line 18

# True when HOST_OS contains "linux" (case-insensitive).
def linux?
  matched = HOST_OS.match(/linux/i)
  !matched.nil?
end

#office_executable ⇒ Object

Identify the path to a working office executable.

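Raises:

  • (ArgumentError) — if ENV['OFFICE_PATH'] is set but does not exist
  • (OfficeNotFound) — if no office executable is found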


78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/docsplit/pdf_extractor.rb', line 78

# Identify the path to a working office executable.
#
# The result is memoized in @@executable. Search order: ENV['OFFICE_PATH']
# (when set), then every entry of office_search_paths.
#
# Raises ArgumentError when ENV['OFFICE_PATH'] is set but does not exist.
# Raises OfficeNotFound when no candidate location yields an executable.
def office_executable
  candidates = office_search_paths

  # A user-supplied OFFICE_PATH must exist and is tried before the defaults.
  override = ENV['OFFICE_PATH']
  if override
    raise ArgumentError, "No such file or directory #{override}" unless File.exist? override
    candidates.unshift(override)
  end

  # Where the executable sits below a search path depends on the OS.
  os_specific =
    if windows?
      [['program', 'soffice.bin']]
    elsif osx?
      [%w(MacOS soffice), %w(Contents MacOS soffice)]
    else
      [%w(program soffice)]
    end
  relative_locations = ['soffice'] + os_specific

  # Walk the candidates in order and stop as soon as an executable is known.
  candidates.each do |root|
    break if @@executable
    next unless File.exist? root

    @@executable =
      if File.directory? root
        # A directory: take the first relative location that exists below it.
        relative_locations.map { |pieces| File.join(root, pieces) }.find { |candidate| File.exist? candidate }
      else
        # An existing non-directory is taken to be the executable itself.
        root
      end
  end

  raise OfficeNotFound, 'No office software found' unless @@executable
  @@executable
end

#office_path ⇒ Object

Used to specify the office location for JODConverter



116
117
118
# File 'lib/docsplit/pdf_extractor.rb', line 116

# Used to specify the office location for JODConverter: the directory two
# levels above the office executable.
def office_path
  program_dir = File.dirname(office_executable)
  File.dirname(program_dir)
end

#office_search_paths ⇒ Object

A set of default locations to search for office software. These have been extracted from JODConverter. Each listed path should contain a directory “program” which in turn contains the “soffice” executable. See: github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/docsplit/pdf_extractor.rb', line 48

# A set of default locations to search for office software.
#
# Returns a new, mutable Array of path strings on every call (callers such as
# office_executable prepend to it).
#
# On Windows the office directory names are looked up below the
# ProgramFiles and ProgramFiles(x86) directories, which is where applications
# are installed. CommonProgramFiles (the "Common Files" directory) is kept as
# a last resort for backward compatibility. Unset variables are skipped
# instead of raising TypeError in File.join.
def office_search_paths
  if windows?
    office_names = ['LibreOffice 3', 'LibreOffice 4', 'OpenOffice.org 3']
    program_dirs = ['ProgramFiles', 'ProgramFiles(x86)', 'CommonProgramFiles'].map { |var| ENV[var] }.compact.uniq
    search_paths = []
    program_dirs.each do |dir|
      office_names.each { |program| search_paths << File.join(dir, program) }
    end
  elsif osx?
    search_paths = %w(
      /Applications/LibreOffice.app/Contents
      /Applications/OpenOffice.org.app/Contents
    )
  else # probably linux/unix
    # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
    search_paths = %w(
      /usr/lib/libreoffice
      /usr/lib64/libreoffice
      /opt/libreoffice
      /usr/lib/openoffice
      /usr/lib64/openoffice
      /opt/openoffice.org3
      /app/vendor/libreoffice
      /usr/bin/libreoffice
      /usr/local/bin
      /usr/lib64/openoffice.org3
    )
  end
  search_paths
end

#open_office? ⇒ Boolean

Returns:

  • (Boolean)


39
40
41
# File 'lib/docsplit/pdf_extractor.rb', line 39

# True when the version string reported by the office executable starts with
# "OpenOffice.org".
#
# version_string is nil when the executable printed nothing; that is treated
# as "not OpenOffice" rather than raising NoMethodError. The dot is escaped
# so that only a literal "." is accepted between "OpenOffice" and "org".
def open_office?
  !!version_string.to_s.match(/^OpenOffice\.org/)
end

#osx? ⇒ Boolean

Returns:

  • (Boolean)


14
15
16
# File 'lib/docsplit/pdf_extractor.rb', line 14

# True when HOST_OS contains "darwin" (case-insensitive).
def osx?
  HOST_OS.match(/darwin/i) ? true : false
end

#version_string ⇒ Object

The first line of the help output holds the name and version number of the office software to be used for extraction.



24
25
26
27
28
29
30
31
32
33
# File 'lib/docsplit/pdf_extractor.rb', line 24

# Name and version line reported by the office executable, memoized in
# @@version_string once a non-nil value has been obtained.
#
# The previous implementation ran `-h` first and tested its first line with
# /[0-9]*/. That pattern matches every string (including the empty one), so
# the first line of `--version` always replaced the `-h` result, even with
# nil when `--version` printed nothing. The same precedence is kept here, but
# `--version` is asked first and `-h` is only run as a fallback when
# `--version` produced no output.
#
# Returns the first output line as a String, or nil when neither invocation
# printed anything.
def version_string
  unless @@version_string
    executable = office_executable
    @@version_string = `#{executable} --version`.split("\n").first
    unless @@version_string
      null = windows? ? 'NUL' : '/dev/null'
      @@version_string = `#{executable} -h 2>#{null}`.split("\n").first
    end
  end
  @@version_string
end

#windows? ⇒ Boolean

Returns:

  • (Boolean)


10
11
12
# File 'lib/docsplit/pdf_extractor.rb', line 10

# True when HOST_OS names a Windows environment (case-insensitive).
#
# Besides mswin, windows and cygwin this also accepts the mingw and msys
# host_os values, which the original pattern did not recognise.
def windows?
  !!HOST_OS.match(/mswin|mingw|msys|windows|cygwin/i)
end