Class: RTesseract

Inherits:
Object
  • Object
show all
Defined in:
lib/rtesseract.rb,
lib/rtesseract/mixed.rb,
lib/rtesseract/errors.rb

Overview

Ruby wrapper for Tesseract OCR

Defined Under Namespace

Classes: ConversionError, ErrorWithMemory, ImageNotSelectedError, Mixed, TempFilesNotRemovedError

Constant Summary collapse

# Names of the settings RTesseract consumes itself; command_line_options
# removes keys with these names from the options hash it is given.
OPTIONS = %w[
  command lang psm processor debug clear_console_output options
]
LANGUAGES =

Aliases for language names

{
  'eng' => %w(en en-us english),
  'ita' => %w(it),
  'por' => %w(pt pt-br portuguese),
  'spa' => %w(sp)
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(src = '', options = {}) ⇒ RTesseract

Returns a new instance of RTesseract.



31
32
33
34
35
36
# File 'lib/rtesseract.rb', line 31

# Builds a reader for +src+.
#
# The settings RTesseract understands are read from +options+ by
# command_line_options; whatever that call returns is kept in @options.
# The cached text (@value) starts empty and the crop box (@x, @y, @w, @h)
# starts unset.
#
# src     - an object the chosen processor recognises via image?, otherwise
#           something Pathname.new accepts (default: '').
# options - Hash of settings (default: {}).
def initialize(src = '', options = {})
  @options = command_line_options(options)
  @value = ''
  @x = @y = @w = @h = nil
  @processor = RTesseract.choose_processor!(@processor)
  @source = if @processor.image?(src)
              src
            else
              Pathname.new(src)
            end
end

Instance Attribute Details

#lang ⇒ Object

Select the language

Languages

  • eng - English

  • deu - German

  • deu-f - German fraktur

  • fra - French

  • ita - Italian

  • nld - Dutch

  • por - Portuguese

  • spa - Spanish

  • vie - Vietnamese

Note: Make sure the language is installed for Tesseract.



114
115
116
117
118
119
120
121
122
123
# File 'lib/rtesseract.rb', line 114

# Builds the " -l <code> " command-line fragment for the configured language.
#
# @lang is stringified, stripped and downcased. If it matches one of the
# aliases in LANGUAGES the canonical key is used; any other non-empty value
# is passed through unchanged.
#
# Returns the fragment as a String, or '' when no language is set or any
# StandardError is raised along the way.
def lang
  requested = @lang.to_s.strip.downcase
  canonical, _aliases = LANGUAGES.find { |_code, aliases| aliases.include?(requested) }
  return " -l #{canonical} " if canonical
  requested.empty? ? '' : " -l #{requested} "
rescue
  ''
end

#options ⇒ Object

Returns the value of attribute options.



16
17
18
# File 'lib/rtesseract.rb', line 16

# Reader for @options.
#
# In this file @options is assigned in #initialize from the return value of
# #command_line_options, and #config renders its key/value pairs as text.
def options
  @options
end

#options_cmd ⇒ Object

Returns the value of attribute options_cmd.



20
21
22
# File 'lib/rtesseract.rb', line 20

# Reader for @options_cmd.
#
# #command_line_options fills @options_cmd from the :options setting
# (wrapping a non-Array value in an Array), and #convert joins the entries
# with spaces onto the end of the command line.
def options_cmd
  @options_cmd
end

#processor ⇒ Object (readonly)

Returns the value of attribute processor.



19
20
21
# File 'lib/rtesseract.rb', line 19

# Reader for @processor.
#
# #initialize sets @processor to whatever RTesseract.choose_processor!
# returns for the configured processor name.
def processor
  @processor
end

#psm ⇒ Object

Page segmentation mode



126
127
128
129
130
# File 'lib/rtesseract.rb', line 126

# Builds the " -psm <mode> " command-line fragment.
#
# Returns '' when @psm is nil (or if a StandardError is raised); otherwise
# the fragment with @psm interpolated as-is.
def psm
  return '' if @psm.nil?
  " -psm #{@psm} "
rescue
  ''
end

Class Method Details

.choose_processor!(processor) ⇒ Object



198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/rtesseract.rb', line 198

# Picks the image processor matching +processor+ and prepares it.
#
# The name (converted with to_s) is offered to MiniMagickProcessor,
# QuickMagickProcessor and NoneProcessor in that order via a_name?; the first
# one that accepts it wins, and RMagickProcessor is the fallback. setup is
# called on the selected processor before it is returned.
#
# processor - processor name (anything responding to to_s).
#
# Returns the selected processor.
def self.choose_processor!(processor)
  name = processor.to_s
  selected = (MiniMagickProcessor if MiniMagickProcessor.a_name?(name)) ||
             (QuickMagickProcessor if QuickMagickProcessor.a_name?(name)) ||
             (NoneProcessor if NoneProcessor.a_name?(name)) ||
             RMagickProcessor
  selected.tap(&:setup)
end

.read(src = nil, options = {}) {|image| ... } ⇒ Object

Yields:



64
65
66
67
68
69
70
71
72
73
# File 'lib/rtesseract.rb', line 64

# Loads +src+ through an image processor, optionally lets the caller work on
# the loaded image in a block, then runs OCR on the resulting image blob.
#
# src     - image location; converted with to_s and handed to the processor's
#           read_with_processor.
# options - Hash of settings. The :processor (or 'processor') entry is removed
#           from this hash to choose the processor; the hash is then passed to
#           RTesseract.new.
#
# Yields the processor's image object when a block is given.
#
# Returns the RTesseract instance after from_blob has been called with the
# image's blob.
# Raises RTesseract::ImageNotSelectedError when +src+ is nil.
def self.read(src = nil, options = {}, &block)
  fail RTesseract::ImageNotSelectedError if src.nil?
  processor = RTesseract.choose_processor!(options.delete(:processor) || options.delete('processor'))
  image = processor.read_with_processor(src.to_s)

  # The block is optional: a bare `yield` raises LocalJumpError for callers
  # that only want the image loaded and converted.
  yield image if block_given?

  object = RTesseract.new('', options)
  object.from_blob(image.to_blob)
  object
end

Instance Method Details

#clear_console_output ⇒ Object

TODO: Clear console for MacOS or Windows



146
147
148
149
# File 'lib/rtesseract.rb', line 146

# Shell fragment that silences the command's stderr.
#
# Returns '2>/dev/null' when @clear_console_output is truthy and /dev/null
# exists, and '' in every other case. The method always returns a String;
# it used to fall through to an implicit nil when /dev/null was missing.
def clear_console_output
  return '' unless @clear_console_output
  # Linux console clear
  File.exist?('/dev/null') ? '2>/dev/null' : ''
end

#command_line_options(options) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/rtesseract.rb', line 42

# Reads RTesseract's own settings out of +options+ into instance variables
# and strips them from the hash.
#
# Each setting is looked up with fetch_option (String key first, then Symbol
# key, then the default). A non-Array :options value is wrapped in an Array.
# Console clearing is forced off when debug is truthy.
#
# options - Hash of settings; modified in place (keys named in OPTIONS are
#           deleted).
#
# Returns the same +options+ hash with only the remaining entries.
def command_line_options(options)
  @command = fetch_option(options, :command, default_command)
  @lang = fetch_option(options, :lang, '')
  @psm = fetch_option(options, :psm, nil)
  @processor = fetch_option(options, :processor, 'rmagick')
  @debug = fetch_option(options, :debug, false)

  extra_args = fetch_option(options, :options, [])
  @options_cmd = extra_args.is_a?(Array) ? extra_args : [extra_args]

  # Debug mode keeps console output visible.
  @clear_console_output = !@debug && fetch_option(options, :clear_console_output, true)

  options.delete_if { |key, _value| OPTIONS.include?(key.to_s) }
end

#config ⇒ Object



132
133
134
135
# File 'lib/rtesseract.rb', line 132

# Renders @options as config text: one "key value" pair per line, joined with
# newlines. @options is initialised to {} first if it is nil or false.
#
# Returns the text as a String ('' when there are no options).
def config
  @options ||= {}
  lines = @options.map { |key, value| "#{key} #{value}" }
  lines.join("\n")
end

#config_file ⇒ Object



137
138
139
140
141
142
143
# File 'lib/rtesseract.rb', line 137

# Writes the text produced by #config to a temporary file.
#
# Returns '' when @options is an empty hash, otherwise the path of the
# temporary file as a String.
def config_file
  return '' if @options == {}
  # Keep the Tempfile referenced from the instance. A Tempfile that is only
  # held in a local variable can be garbage-collected (and its file deleted)
  # as soon as this method returns, i.e. before the path is actually used.
  @config_tempfile = Tempfile.new('config')
  @config_tempfile.write(config)
  @config_tempfile.flush
  @config_tempfile.path
end

#convert ⇒ Object

Convert image to string



160
161
162
163
164
165
166
# File 'lib/rtesseract.rb', line 160

# Runs the OCR command on the current image and stores the recognised text
# in @value.
#
# The command line is assembled from @command, the image path, the output
# base name, and the lang / psm / config_file / clear_console_output /
# @options_cmd fragments. The output base is the text file path without its
# trailing ".txt"; the text is then read back from @text_file.
# NOTE(review): every piece is interpolated into a shell command without
# escaping - confirm inputs are trusted.
#
# Returns the result of remove_file for the image and text files.
# Raises RTesseract::ConversionError wrapping any StandardError.
def convert
  # Strip only the extension at the end of the path; removing every ".txt"
  # would corrupt the path when a directory name contains ".txt".
  `#{@command} "#{image}" "#{text_file.sub(/\.txt\z/, '')}" #{lang} #{psm} #{config_file} #{clear_console_output} #{@options_cmd.join(' ')}`
  @value = File.read(@text_file).to_s
  remove_file([@image, @text_file])
rescue => error
  raise RTesseract::ConversionError.new(error)
end

#crop!(x, y, width, height) ⇒ Object

Crop image to convert



81
82
83
84
85
# File 'lib/rtesseract.rb', line 81

# Selects the region of the image to convert and clears the cached text.
#
# x, y          - region origin; converted with to_i.
# width, height - region size; converted with to_i.
#
# Returns self so calls can be chained.
def crop!(x, y, width, height)
  @value = ''
  @x, @y, @w, @h = [x, y, width, height].map(&:to_i)
  self
end

#default_command ⇒ Object



58
59
60
61
62
# File 'lib/rtesseract.rb', line 58

# Name of the executable to run when no :command option is given.
#
# Returns TesseractBin::Executables[:tesseract] when that lookup yields a
# truthy value; otherwise - including when the lookup raises a StandardError,
# e.g. because TesseractBin is not defined - the plain name 'tesseract'.
def default_command
  bundled = TesseractBin::Executables[:tesseract]
  bundled || 'tesseract'
rescue
  'tesseract'
end

#fetch_option(options, name, default) ⇒ Object



38
39
40
# File 'lib/rtesseract.rb', line 38

# Looks up a setting under its String key first, then under +name+ itself,
# and falls back to +default+ when neither key is present.
#
# options - Hash of settings.
# name    - key to look up (its to_s form is tried first).
# default - value returned when the setting is absent.
#
# Returns the stored value (which may be nil or false) or +default+.
def fetch_option(options, name, default)
  string_key = name.to_s
  options.fetch(string_key) { options.fetch(name) { default } }
end

#from_blob(blob) ⇒ Object

Read image from memory blob



169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/rtesseract.rb', line 169

# Runs OCR on image data held in memory.
#
# The blob is written in binary mode to a temporary file, that file's path
# becomes the source, and convert is run. The temporary file is removed
# afterwards whether or not the conversion succeeded.
#
# blob - String of raw image bytes.
#
# Returns true.
# Raises RTesseract::ConversionError wrapping any StandardError.
def from_blob(blob)
  blob_file = Tempfile.new('blob', :encoding => 'ascii-8bit')
  begin
    blob_file.binmode
    blob_file.write(blob)
    blob_file.rewind
    blob_file.flush
    self.source = blob_file.path
    convert
  ensure
    # Clean up on failure too, so a failed conversion does not leave the
    # blob file behind.
    remove_file([blob_file])
  end
  true
rescue => error
  raise RTesseract::ConversionError.new(error)
end

#image ⇒ Object



151
152
153
# File 'lib/rtesseract.rb', line 151

# Asks the processor to turn the source (with the current crop box) into a
# TIF, remembers the result in @image and returns its path.
def image
  @image = @processor.image_to_tif(@source, @x, @y, @w, @h)
  @image.path
end

#remove_file(files = []) ⇒ Object

Remove files



88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/rtesseract.rb', line 88

# Deletes the given files.
#
# files - Array whose entries are either Tempfile objects (closed, then
#         unlinked) or anything File.unlink accepts (default: []).
#
# Returns true.
# Raises RTesseract::TempFilesNotRemovedError (carrying the original error and
# the file list) when any StandardError occurs.
def remove_file(files = [])
  files.each do |entry|
    case entry
    when Tempfile
      entry.close
      entry.unlink
    else
      File.unlink(entry)
    end
  end
  true
rescue => error
  raise RTesseract::TempFilesNotRemovedError.new(:error => error, :files => files)
end

#source=(src) ⇒ Object



75
76
77
78
# File 'lib/rtesseract.rb', line 75

# Replaces the image source and clears the cached text.
#
# src - kept as-is when the processor's image? accepts it; otherwise wrapped
#       in a Pathname.
def source=(src)
  @value = ''
  @source = if @processor.image?(src)
              src
            else
              Pathname.new(src)
            end
end

#text_file ⇒ Object



155
156
157
# File 'lib/rtesseract.rb', line 155

# Generates a fresh output path inside Dir.tmpdir, named from the current time
# and a random number below 1500 with a ".txt" extension, and stores it in
# @text_file.
#
# Returns the path as a String. A new path is produced on every call.
def text_file
  unique_name = "#{Time.now.to_f}#{rand(1500)}.txt"
  @text_file = Pathname.new(Dir.tmpdir).join(unique_name).to_s
end

#to_s ⇒ Object

Output value



183
184
185
186
187
188
189
190
191
# File 'lib/rtesseract.rb', line 183

# Recognised text for the current source.
#
# Returns the cached @value when it is not the empty string. Otherwise the
# source must either be accepted by the processor's image? or answer true to
# file?; convert is then run and the fresh @value returned.
#
# Raises RTesseract::ImageNotSelectedError when the source is neither.
def to_s
  return @value unless @value == ''
  convertible = @processor.image?(@source) || @source.file?
  fail RTesseract::ImageNotSelectedError.new(@source) unless convertible
  convert
  @value
end

#to_s_without_spaces ⇒ Object

Remove spaces and line breaks



194
195
196
# File 'lib/rtesseract.rb', line 194

# Result of to_s with every space, newline and carriage return removed.
# Other whitespace (e.g. tabs) is left in place.
def to_s_without_spaces
  to_s.delete(" \n\r")
end