Module: Documentalist

Defined in:
lib/dependencies.rb,
lib/documentalist.rb,
lib/backends/net_pbm.rb,
lib/backends/odf_merge.rb,
lib/backends/pdf_tools.rb,
lib/backends/open_office.rb,
lib/backends/wkhtmltopdf.rb

Defined Under Namespace

Modules: Dependencies, NetPBM, ODFMerge, OpenOffice, PdfTools, WkHtmlToPdf
Classes: Error

Constant Summary collapse

# Conversion table: each key names a backend module under Documentalist, and
# its value maps the source format(s) that backend accepts to the target
# format(s) it can produce. Either side may be a single symbol or an array.
# Entries are listed in lookup order.
#
# TODO: find a better pattern to pick a backend, this one smells pretty bad.
BACKENDS = {
  :WkHtmlToPdf => { [:html, :htm] => :pdf },
  :OpenOffice  => {
    [:odt, :doc, :rtf, :docx, :txt, :wps] =>
      [:odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps]
  },
  :NetPBM      => { :ppm => [:jpg, :jpeg] },
  :PdfTools    => { :pdf => :txt }
}
# Module-wide configuration hash. It starts out empty; .config? treats an
# empty hash as "not configured yet".
@@config =
{}
# Memoized logger instance. It stays nil until .logger builds one.
@@logger =
nil

Class Method Summary collapse

Class Method Details

.backend_for_conversion(origin, destination) ⇒ Object

Finds the relevant backend to perform the conversion



44
45
46
47
48
49
50
51
# File 'lib/documentalist.rb', line 44

# Looks up the backend able to turn +origin+ into +destination+.
#
# origin, destination - a format (:pdf, "pdf") or a file name/path; anything
#                       up to and including the last dot is discarded and the
#                       remainder is converted to a symbol.
#
# Returns the constant named by the first BACKENDS entry whose source formats
# include the origin and whose target formats include the destination, or nil
# when no entry matches.
def self.backend_for_conversion(origin, destination)
  from_format, to_format = [origin, destination].map do |name|
    name.to_s.gsub(/.*\./, "").to_sym
  end

  # Every backend name is resolved to its constant before matching starts.
  candidates = BACKENDS.map do |backend_name, conversions|
    [const_get(backend_name), conversions]
  end

  match = candidates.find do |_backend, conversions|
    sources = conversions.keys.flatten
    targets = conversions.values.flatten
    sources.include?(from_format) && targets.include?(to_format)
  end

  match ? match.first : nil
end

.check_dependencies ⇒ Object

Checks the dependencies for backends



141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/documentalist.rb', line 141

# Runs the dependency check of every backend.
#
# Walks all constants defined under Documentalist and, for each one that
# responds to +check_dependencies+, prints a progress line and invokes it.
# Constants without that method are skipped silently.
def self.check_dependencies
  puts "Checking backends system dependencies"

  Documentalist.constants.each do |constant_name|
    candidate = Documentalist.const_get(constant_name.to_sym)
    next unless candidate.respond_to?(:check_dependencies)

    puts "Checking dependencies for #{candidate}"
    candidate.send(:check_dependencies)
  end
end

.config ⇒ Object



12
13
14
15
# File 'lib/documentalist.rb', line 12

# Returns the current configuration hash.
#
# When config? reports that nothing is configured yet, default_config! is
# invoked first and the resulting configuration is returned.
def self.config
  return @@config if config?

  default_config!
  @@config
end

.config=(hash) ⇒ Object



17
18
19
20
# File 'lib/documentalist.rb', line 17

# Replaces the current configuration.
#
# hash - the new settings; it is passed through symbolize before being stored.
#        Keys are symbolized here because the library does not depend on
#        Active Support.
def self.config=(hash)
  symbolized = symbolize(hash)
  @@config = symbolized
end

.config? ⇒ Boolean

Returns:

  • (Boolean)


22
23
24
# File 'lib/documentalist.rb', line 22

# Tells whether a configuration has been set.
#
# Returns false while the stored configuration equals the empty hash, true
# otherwise.
def self.config?
  !(@@config == {})
end

.config_from_yaml!(file, options = {}) ⇒ Object



30
31
32
33
# File 'lib/documentalist.rb', line 30

# Loads the configuration from a YAML file.
#
# file    - path of the YAML file to read.
# options - optional settings:
#           :section - when given, only that top-level section of the loaded
#                      file (looked up as a symbol) becomes the configuration.
#
# NOTE(review): YAML::load is used as in the original; only feed it trusted
# configuration files.
def self.config_from_yaml!(file, options = {})
  # Block form of File.open so the handle is closed as soon as parsing is
  # done instead of staying open until garbage collection.
  self.config = File.open(file) { |io| YAML::load(io) }
  self.config = config[options[:section].to_sym] if options[:section]
end

.convert(file, options = {}) {|| ... } ⇒ Object

Takes all conversion requests and dispatches them appropriately

Yields:

  • ()


54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/documentalist.rb', line 54

# Takes all conversion requests and dispatches them to the matching backend.
#
# file    - path of the existing source document.
# options - conversion settings (the hash is updated in place and handed to
#           the backend):
#           :to_format - target format; the destination path is then the
#                        source path with its extension swapped. Takes
#                        precedence over :to.
#           :to        - destination path; the target format is then taken
#                        from its extension.
#
# Yields the destination path when a block is given.
#
# Returns the destination path.
# Raises RuntimeError when +file+ does not exist.
# Raises Documentalist::Error when neither :to nor :to_format is given, or
# when no backend handles the requested conversion.
def self.convert(file, options={})
  raise "#{file} does not exist !" unless File.exist?(file)

  if options[:to_format]
    # Escape the extension and anchor at the very end of the string so that
    # extensions containing regex metacharacters (".c++") are still replaced
    # and nothing but the trailing extension is touched.
    source_extension = /#{Regexp.escape(File.extname(file))}\z/
    options[:to] = file.sub(source_extension, ".#{options[:to_format]}")
  elsif options[:to]
    options[:to_format] = File.extname(options[:to]).delete(".").to_sym
  else
    raise Documentalist::Error.new("No destination or format was given")
  end

  options[:from_format] = File.extname(file).delete(".").to_sym

  backend = backend_for_conversion(options[:from_format], options[:to_format])
  # Fail with the library's own error rather than a NoMethodError on nil.
  unless backend
    raise Documentalist::Error.new("No backend can convert #{options[:from_format]} to #{options[:to_format]}")
  end
  backend.convert(file, options)

  yield(options[:to]) if block_given?
  options[:to]
end

.default_config! ⇒ Object



26
27
28
# File 'lib/documentalist.rb', line 26

# Loads the default configuration file, config/default.yml, located one
# directory above this source file.
def self.default_config!
  library_dir = File.dirname(__FILE__)
  defaults_path = File.join(library_dir, "..", "config", "default.yml")
  config_from_yaml!(defaults_path)
end

.extract_images(file) {|image_files| ... } ⇒ Object

Yields:

  • (image_files)


85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/documentalist.rb', line 85

# Extracts the images embedded in a document.
#
# file - path of the source document.
#
# For a ".pdf" file the document is copied into a scratch directory under
# Dir.tmpdir, `pdfimages` is run on the copy and every resulting .ppm image
# is converted to JPEG. Any other file is converted to HTML.
#
# Yields the list of image paths when a block is given.
#
# Returns the paths of the image files (jpg, jpeg, bmp, tif, tiff, gif, png)
# found in the scratch directory. The directory is not removed here.
def self.extract_images(file)
  temp_dir = File.join(Dir.tmpdir, rand(10**9).to_s)

  if File.extname(file) == '.pdf'
    temp_file = File.join(temp_dir, File.basename(file))

    FileUtils.mkdir_p temp_dir
    FileUtils.cp file, temp_file

    # Pass the arguments as a list so no shell is involved: file names with
    # spaces or shell metacharacters reach pdfimages untouched.
    system "pdfimages", temp_file, File.join(temp_dir, "img")

    Dir.glob(File.join(temp_dir, "*.ppm")).each do |ppm_image|
      Documentalist.convert(ppm_image, :to_format => :jpeg)
    end
  else
    # NOTE(review): nothing in this method writes into temp_dir on this
    # branch, so the glob below presumably finds nothing unless the backend
    # puts images there - confirm the intended behaviour.
    Documentalist.convert file, :to_format => :html
  end

  image_files = Dir.glob(File.join(temp_dir, "*.{jpg,jpeg,bmp,tif,tiff,gif,png}"))

  yield(image_files) if block_given?
  image_files
end

.extract_text(file) ⇒ Object



74
75
76
77
78
79
80
81
82
83
# File 'lib/documentalist.rb', line 74

# Extracts the text of a document.
#
# file - path of the source document; it is converted to the :txt format.
#
# Yields the extracted text when a block is given.
#
# Returns the text converted to UTF-8 through Kconv, or nil when the
# conversion produced no file. The intermediate text file is deleted.
def self.extract_text(file)
  converted = convert(file, :to_format => :txt)
  return unless converted && File.exist?(converted)

  # File.read opens, reads and closes in one step, so no handle is left open
  # on the file that is deleted right after.
  text = Kconv.toutf8(File.read(converted))
  FileUtils.rm(converted)
  yield(text) if block_given?
  text
end

.logger ⇒ Object

Returns the logger object used to log documentalist operations



130
131
132
133
134
135
136
137
138
# File 'lib/documentalist.rb', line 130

# Returns the logger used to log documentalist operations, building it on
# first use.
#
# The log file comes from config[:log_file]; when that entry is unset it is
# filled in with documentalist.log one directory above this source file. The
# level is the Logger constant named by config[:log_level] (upcased), or WARN
# when no level is configured.
def self.logger
  return @@logger if @@logger

  fallback_path = File.join(File.dirname(File.expand_path(__FILE__)), "..", "documentalist.log")
  Documentalist.config[:log_file] ||= fallback_path
  @@logger = Logger.new(Documentalist.config[:log_file])

  requested_level = config[:log_level]
  level_name = requested_level ? requested_level.upcase : "WARN"
  @@logger.level = Logger.const_get(level_name)

  @@logger
end

.odf_merge(template, options = {}) ⇒ Object

Merge an ODF document with an arbitrary hash of data



7
8
9
# File 'lib/backends/odf_merge.rb', line 7

# Merges an ODF document with an arbitrary hash of data.
#
# template - the ODF template, handed over unchanged.
# options  - the merge data and settings, handed over unchanged.
#
# Returns whatever ODFMerge.merge_template returns.
def self.odf_merge(template, options = {})
  ODFMerge.merge_template template, options
end

.symbolize(hash) ⇒ Object

Returns a new hash with recursively symbolized keys



155
156
157
158
159
160
# File 'lib/documentalist.rb', line 155

# Returns a new hash with recursively symbolized keys.
#
# hash - the hash to convert; it is left untouched.
#
# Keys that respond to +to_sym+ are converted, other keys are kept as they
# are. Values that are hashes are symbolized the same way; all other values
# are copied over unchanged.
def self.symbolize(hash)
  # Build a fresh hash: inserting keys into the hash being iterated raises
  # on Ruby >= 1.9.
  symbolized = {}
  hash.each do |key, value|
    new_key = key.respond_to?(:to_sym) ? key.to_sym : key
    symbolized[new_key] = value.is_a?(Hash) ? symbolize(value) : value
  end
  symbolized
end

.timeout(time_limit = 0, options = {:attempts => 1, :sleep => nil}) ⇒ Object

Runs a block with a system-enforced timeout, optionally retrying it with an optional sleep between attempts. All times are in seconds.



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/documentalist.rb', line 113

# Runs a block under a timeout, optionally retrying it. All times are in
# seconds.
#
# time_limit - time allowed for each run of the block; handed to
#              SystemTimer.timeout.
# options    - retry settings:
#              :attempts - total number of runs allowed (default 1); values
#                          below 1 still run the block once.
#              :sleep    - pause between two attempts (default none).
#
# Returns the value of SystemTimer.timeout for the successful run, or nil
# when no block is given.
# Raises Timeout::Error when the last allowed attempt times out.
def self.timeout(time_limit = 0, options = {:attempts => 1, :sleep => nil})
  return unless block_given?

  attempts = options[:attempts] || 1
  begin
    SystemTimer.timeout(time_limit) { yield }
  rescue Timeout::Error
    attempts -= 1
    # "> 0" rather than "not zero": a zero or negative :attempts must not
    # retry forever.
    raise unless attempts > 0
    # Only pause when another attempt actually follows.
    sleep(options[:sleep]) if options[:sleep]
    retry
  end
end