Class: Docsplit::PdfExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/docsplit/pdf_extractor.rb

Defined Under Namespace

Classes: OfficeNotFound

Constant Summary collapse

HOST_OS =

Provide a set of helper functions to determine the OS.

(defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
CLASSPATH =
"#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
LOGGING =
"-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
HEADLESS =
"-Djava.awt.headless=true"
@@executable =
nil

Instance Method Summary collapse

Instance Method Details

#extract(docs, opts) ⇒ Object

Convert documents to PDF.



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/docsplit/pdf_extractor.rb', line 108

# Convert documents to PDF.
#
# docs - a single document path, or an (optionally nested) array of paths.
# opts - options hash; opts[:output] names the destination directory and
#        defaults to '.'. The directory is created when it does not exist.
#
# Documents whose MIME type (as reported by `file -b --mime`) is listed in
# GM_FORMATS are converted with `gm convert`. Everything else is converted by
# LibreOffice when libre_office? is true, and by JODConverter otherwise.
#
# Raises ExtractionFailed, carrying the captured command output, when the
# LibreOffice command does not exit successfully.
#
# Returns the flattened list of documents that was iterated over.
def extract(docs, opts)
  out = opts[:output] || '.'
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  FileUtils.mkdir_p out unless File.exist?(out)
  [docs].flatten.each do |doc|
    ext = File.extname(doc)
    basename = File.basename(doc, ext)
    escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

    # Only the part of the `file` output before the first ":" or ";" separator
    # (the MIME type itself) is compared against GM_FORMATS.
    mime_type = `file -b --mime #{escaped_doc}`.strip.split(/[:;]\s+/)[0]

    if GM_FORMATS.include?(mime_type)
      `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
    elsif libre_office?
      # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
      # NOTE(review): the URL is built from the escaped output path rather than
      # the raw one; presumably that only matters for paths containing
      # characters ESCAPE rewrites -- confirm before changing.
      ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"

      options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
      cmd = "#{office_executable} #{options} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
    else # open office presumably, rely on JODConverter to figure it out.
      options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
      run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
    end
  end
end

#libre_office? ⇒ Boolean

Returns:

  • (Boolean)


29
30
31
# File 'lib/docsplit/pdf_extractor.rb', line 29

# Tell whether the detected office suite is LibreOffice.
#
# Returns true when a line of version_string begins with "LibreOffice",
# false otherwise.
def libre_office?
  version_string.match(/^LibreOffice/) ? true : false
end

#linux? ⇒ Boolean

Returns:

  • (Boolean)


15
16
17
# File 'lib/docsplit/pdf_extractor.rb', line 15

# Tell whether the host operating system is Linux.
#
# Returns true when HOST_OS contains "linux" (case-insensitive), false
# otherwise.
def linux?
  HOST_OS.match(/linux/i) ? true : false
end

#office_executable ⇒ Object

Identify the path to a working office executable.

Raises:



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/docsplit/pdf_extractor.rb', line 65

# Identify the path to a working office executable.
#
# Candidate locations come from office_search_paths. When the OFFICE_PATH
# environment variable is set it is validated and searched first. A candidate
# may be the executable itself (an existing non-directory path) or a directory
# that contains the executable at one of the OS-specific relative locations.
# The first hit is stored in @@executable and is never replaced afterwards.
#
# Raises ArgumentError when OFFICE_PATH names a path that does not exist.
# Raises OfficeNotFound when no candidate yields an executable.
#
# Returns the path stored in @@executable.
def office_executable
  paths = office_search_paths

  # If an OFFICE_PATH has been specified on the commandline
  # raise an error if that path isn't valid, otherwise, add
  # it to the front of our search paths.
  if ENV['OFFICE_PATH']
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
    paths.unshift(ENV['OFFICE_PATH'])
  end

  # The location of the office executable is OS dependent
  path_pieces = ["soffice"]
  if windows?
    path_pieces += [["program", "soffice.bin"]]
  elsif osx?
    path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
  else
    path_pieces += [["program", "soffice"]]
  end

  # Search for the first suitable office executable
  # and short circuit once an executable is found.
  paths.each do |path|
    if File.exist? path
      # A search path that is itself a file is taken to be the executable.
      @@executable ||= path unless File.directory? path
      path_pieces.each do |pieces|
        check_path = File.join(path, pieces)
        @@executable ||= check_path if File.exist? check_path
      end
    end
    break if @@executable
  end
  raise OfficeNotFound, "No office software found" unless @@executable
  @@executable
end

#office_path ⇒ Object

Used to specify the office location for JODConverter



103
104
105
# File 'lib/docsplit/pdf_extractor.rb', line 103

# Used to specify the office location for JODConverter.
#
# Returns the directory two levels above the office executable, i.e. the
# parent of the directory that holds the binary.
def office_path
  binary_dir = File.dirname(office_executable)
  File.dirname(binary_dir)
end

#office_search_paths ⇒ Object

A set of default locations to search for office software. These have been extracted from JODConverter. Each listed path should contain a directory “program” which in turn contains the “soffice” executable. See: github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/docsplit/pdf_extractor.rb', line 41

# A set of default locations to search for office software.
#
# On Windows the known office directory names are looked up beneath every
# program-files style environment variable that is actually set. The
# CommonProgramFiles entries stay first so existing lookups keep their
# precedence; ProgramFiles and ProgramFiles(x86) are searched after them.
# Unset variables are skipped rather than passed to File.join, which would
# raise TypeError on nil.
#
# On OS X and on other (presumably Linux/Unix) systems a fixed list of
# install locations is returned.
#
# Returns a new Array of path Strings on every call, so callers may modify it.
def office_search_paths
  if windows?
    office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
    root_variables = ["CommonProgramFiles", "ProgramFiles", "ProgramFiles(x86)"]
    roots = root_variables.map { |name| ENV[name] }.compact.uniq
    search_paths = roots.map { |root|
      office_names.map { |program| File.join(root, program) }
    }.flatten
  elsif osx?
    search_paths = %w(
      /Applications/LibreOffice.app/Contents
      /Applications/OpenOffice.org.app/Contents
    )
  else # probably linux/unix
    search_paths = %w(
      /usr/lib/libreoffice
      /usr/lib64/libreoffice
      /opt/libreoffice
      /usr/lib/openoffice
      /usr/lib64/openoffice
      /opt/openoffice.org3
    )
  end
  search_paths
end

#open_office? ⇒ Boolean

Returns:

  • (Boolean)


32
33
34
# File 'lib/docsplit/pdf_extractor.rb', line 32

# Tell whether the detected office suite is OpenOffice.org.
#
# The dot is escaped so that only the literal name "OpenOffice.org" at the
# start of a line matches; unescaped it would accept any character there.
#
# Returns true or false.
def open_office?
  !!version_string.match(/^OpenOffice\.org/)
end

#osx? ⇒ Boolean

Returns:

  • (Boolean)


12
13
14
# File 'lib/docsplit/pdf_extractor.rb', line 12

# Tell whether the host operating system is OS X.
#
# Returns true when HOST_OS contains "darwin" (case-insensitive), false
# otherwise.
def osx?
  HOST_OS.match(/darwin/i) ? true : false
end

#version_string ⇒ Object

The first line of the help output holds the name and version number of the office software to be used for extraction.



21
22
23
24
25
26
27
28
# File 'lib/docsplit/pdf_extractor.rb', line 21

# The first line of the office program's output holds the name and version
# number of the office software to be used for extraction.
#
# The value is computed on the first call and cached in @@help, so the office
# binary is not launched again once a non-nil value has been stored.
#
# Returns the cached first line of output, or nil when the program printed
# nothing.
def version_string
  @@help ||= begin
    banner = `#{office_executable} -h 2>&1`.split("\n").first
    # NOTE(review): /[0-9]*/ also matches the empty string, so this test
    # succeeds for every banner and the `--version` output always wins. It is
    # kept unchanged so callers keep seeing the same string. A nil banner (no
    # output at all) takes the same path instead of raising NoMethodError.
    if banner.nil? || banner.match(/[0-9]*/)
      banner = `#{office_executable} --version`.split("\n").first
    end
    banner
  end
end

#windows? ⇒ Boolean

Returns:

  • (Boolean)


9
10
11
# File 'lib/docsplit/pdf_extractor.rb', line 9

# Tell whether the host operating system is Windows.
#
# Besides mswin, windows and cygwin, this recognises the mingw and msys
# host_os values, so those Ruby builds are no longer treated as Unix. The
# pattern deliberately avoids a bare "win", which would also match "darwin".
#
# Returns true or false.
def windows?
  !!HOST_OS.match(/mswin|msys|mingw|cygwin|windows/i)
end