Class: Docsplit::PdfExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/docsplit/pdf_extractor.rb

Defined Under Namespace

Classes: OfficeNotFound

Constant Summary collapse

HOST_OS =

Provide a set of helper functions to determine the OS.

(defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
CLASSPATH =
"#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
LOGGING =
"-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
HEADLESS =
"-Djava.awt.headless=true"
@@executable =
nil
@@version_string =
nil

Instance Method Summary collapse

Instance Method Details

#extract(docs, opts) ⇒ Object

Convert documents to PDF.



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/docsplit/pdf_extractor.rb', line 117

# Convert one or more documents to PDF, writing "<basename>.pdf" into the
# output directory.
#
# docs - a single document path or an (optionally nested) array of paths.
# opts - options hash; opts[:output] names the destination directory and
#        defaults to the current directory ('.').
#
# Documents whose MIME type (as reported by `file -b --mime`) is listed in
# GM_FORMATS are converted with `gm convert`. Everything else is converted
# with LibreOffice when libre_office? is true, and with JODConverter
# (via run_jod) otherwise.
#
# Raises ExtractionFailed, carrying the captured command output, when the
# LibreOffice command exits with a non-zero status.
def extract(docs, opts)
  out = opts[:output] || '.'
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  FileUtils.mkdir_p out unless File.exist?(out)
  [docs].flatten.each do |doc|
    ext = File.extname(doc)
    basename = File.basename(doc, ext)
    # ESCAPE is defined elsewhere; presumably it shell-quotes its argument.
    escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

    # `file` prints e.g. "image/png; charset=binary"; keep only the MIME type.
    mime_type = `file -b --mime #{escaped_doc}`.strip.split(/[:;]\s+/)[0]

    if GM_FORMATS.include?(mime_type)
      `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
    else
      if libre_office?
        # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
        ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"

        options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
        cmd = "#{office_executable} #{options} 2>&1"
        result = `#{cmd}`.chomp
        raise ExtractionFailed, result if $? != 0
        true
      else # open office presumably, rely on JODConverter to figure it out.
        options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
        run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
      end
    end
  end
end

#libre_office? ⇒ Boolean

Returns:

  • (Boolean)


32
33
34
# File 'lib/docsplit/pdf_extractor.rb', line 32

# True when some line of the office version string begins with
# "LibreOffice", false otherwise.
def libre_office?
  version_string.match(/^LibreOffice/) ? true : false
end

#linux? ⇒ Boolean

Returns:

  • (Boolean)


16
17
18
# File 'lib/docsplit/pdf_extractor.rb', line 16

# True when the host OS name contains "linux" (case-insensitive).
def linux?
  !(HOST_OS =~ /linux/i).nil?
end

#office_executable ⇒ Object

Identify the path to a working office executable.

Raises:



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/docsplit/pdf_extractor.rb', line 74

# Identify the path to a working office executable.
#
# The result is cached in @@executable, so the filesystem search only
# assigns a value once; later calls return the cached path.
#
# Returns the path of the first office executable found.
# Raises ArgumentError when ENV['OFFICE_PATH'] is set but does not exist.
# Raises OfficeNotFound when no candidate executable exists.
def office_executable
  paths = office_search_paths

  # If an OFFICE_PATH has been specified on the commandline
  # raise an error if that path isn't valid, otherwise, add
  # it to the front of our search paths.
  # (File.exist? is used because File.exists? was removed in Ruby 3.2.)
  if ENV['OFFICE_PATH']
    raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
    paths.unshift(ENV['OFFICE_PATH'])
  end

  # The location of the office executable is OS dependent.
  # Each entry is a path fragment (string or array of segments) that is
  # joined onto a search path below.
  path_pieces = ["soffice"]
  if windows?
    path_pieces += [["program", "soffice.bin"]]
  elsif osx?
    path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
  else
    path_pieces += [["program", "soffice"]]
  end

  # Search for the first suitable office executable
  # and short-circuit once an executable is found.
  paths.each do |path|
    if File.exist? path
      # A search path that is itself a file is taken to be the executable.
      @@executable ||= path unless File.directory? path
      path_pieces.each do |pieces|
        check_path = File.join(path, pieces)
        @@executable ||= check_path if File.exist? check_path
      end
    end
    break if @@executable
  end
  raise OfficeNotFound, "No office software found" unless @@executable
  @@executable
end

#office_path ⇒ Object

Used to specify the office location for JODConverter



112
113
114
# File 'lib/docsplit/pdf_extractor.rb', line 112

# Used to specify the office location for JODConverter.
# Returns the directory two levels above the office executable.
def office_path
  executable_dir = File.dirname(office_executable)
  File.dirname(executable_dir)
end

#office_search_paths ⇒ Object

A set of default locations to search for office software. These have been extracted from JODConverter. Each listed path should contain a directory “program”, which in turn contains the “soffice” executable. See: github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/docsplit/pdf_extractor.rb', line 44

# A set of default locations to search for office software, chosen by
# host OS (windows?, osx?, otherwise assumed linux/unix).
#
# Returns a freshly built Array of path strings on every call, so callers
# may mutate the result (office_executable unshifts onto it).
def office_search_paths
  if windows?
    office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
    # CommonProgramFiles is kept first for backward compatibility. The
    # ProgramFiles roots are added because that is where office suites are
    # normally installed. Unset or empty variables are skipped so that
    # File.join is never handed nil (which raises TypeError).
    root_keys = ["CommonProgramFiles", "ProgramFiles", "ProgramFiles(x86)"]
    roots = root_keys.map { |key| ENV[key] }.reject { |root| root.nil? || root.empty? }.uniq
    search_paths = roots.map { |root| office_names.map { |program| File.join(root, program) } }.flatten
  elsif osx?
    search_paths = %w(
      /Applications/LibreOffice.app/Contents
      /Applications/OpenOffice.org.app/Contents
    )
  else # probably linux/unix
    # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
    search_paths = %w(
      /usr/lib/libreoffice
      /usr/lib64/libreoffice
      /opt/libreoffice
      /usr/lib/openoffice
      /usr/lib64/openoffice
      /opt/openoffice.org3
      /app/vendor/libreoffice
      /usr/bin/libreoffice
      /usr/local/bin
      /usr/lib64/openoffice.org3
    )
  end
  search_paths
end

#open_office? ⇒ Boolean

Returns:

  • (Boolean)


35
36
37
# File 'lib/docsplit/pdf_extractor.rb', line 35

# True when some line of the office version string begins with a match
# for the OpenOffice.org pattern below, false otherwise.
def open_office?
  version_string.match(/^OpenOffice.org/) ? true : false
end

#osx? ⇒ Boolean

Returns:

  • (Boolean)


13
14
15
# File 'lib/docsplit/pdf_extractor.rb', line 13

# True when the host OS name contains "darwin" (case-insensitive).
def osx?
  HOST_OS.match(/darwin/i) ? true : false
end

#version_string ⇒ Object

The first line of the help output holds the name and version number of the office software to be used for extraction.



22
23
24
25
26
27
28
29
30
31
# File 'lib/docsplit/pdf_extractor.rb', line 22

# Return the name/version banner of the office software used for
# extraction. The value is computed on first use and cached in
# @@version_string.
#
# The result is the first line of the command output; it is nil when the
# command prints nothing, because "".split("\n").first is nil.
def version_string
  unless @@version_string
    # Null device for the current platform, used to discard stderr below.
    null = windows? ? "NUL" : "/dev/null"
    # First attempt: the first line of the help (-h) output.
    @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
    # NOTE(review): /[0-9]*/ also matches the empty string, so this condition
    # is always true and the `--version` output always replaces the `-h`
    # result. Presumably a stricter check was intended; confirm the intent
    # before changing it.
    if !!@@version_string.to_s.match(/[0-9]*/)
      @@version_string = `#{office_executable} --version`.split("\n").first
    end
  end
  @@version_string
end

#windows? ⇒ Boolean

Returns:

  • (Boolean)


10
11
12
# File 'lib/docsplit/pdf_extractor.rb', line 10

# True when the host OS name identifies a Windows environment
# (case-insensitive match on mswin, mingw, windows or cygwin).
#
# "mingw" is included because MinGW-built Rubies (e.g. RubyInstaller)
# report a host_os such as "mingw32"; without it they would be treated as
# linux/unix by the callers of this predicate.
def windows?
  !!HOST_OS.match(/mswin|mingw|windows|cygwin/i)
end