Class: RTesseract

Inherits:
Object
  • Object
show all
Defined in:
lib/rtesseract.rb,
lib/rtesseract/box.rb,
lib/rtesseract/mixed.rb,
lib/rtesseract/errors.rb,
lib/rtesseract/box_char.rb,
lib/rtesseract/configuration.rb

Overview

Configuration

Direct Known Subclasses

Box

Defined Under Namespace

Classes: Box, BoxChar, Configuration, ConversionError, ErrorWithMemory, ImageNotSelectedError, Mixed, TempFilesNotRemovedError

Constant Summary collapse

LANGUAGES =

Aliases for language names (canonical tesseract code => accepted aliases)

# Canonical tesseract language code => aliases that #lang maps onto it.
{
  'eng' => %w[en en-us english],
  'ita' => %w[it],
  'por' => %w[pt pt-br portuguese],
  'spa' => %w[sp]
}

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(src = '', options = {}) ⇒ RTesseract

Returns a new instance of RTesseract.



23
24
25
26
27
28
29
30
# File 'lib/rtesseract.rb', line 23

# Builds a new RTesseract instance for +src+.
#
# @param src [Object] either something the selected processor reports as an
#   image (stored unchanged) or a value accepted by Pathname.new
# @param options [Hash, nil] per-instance options; nil is treated as {}
# @return [RTesseract]
def initialize(src = '', options = {})
  # Normalise first so that a nil argument never reaches local_config;
  # previously the nil guard was applied only after options had been used.
  options ||= {}
  self.configuration = RTesseract.local_config(options)
  @options = options
  @value = nil   # cached conversion result
  @points = {}   # crop area, see #crop!
  @processor = RTesseract.choose_processor!(self.configuration.processor)
  @source = @processor.image?(src) ? src : Pathname.new(src)
  initialize_hook
end

Class Attribute Details

.configuration ⇒ Object

Returns the value of attribute configuration.



33
34
35
# File 'lib/rtesseract/configuration.rb', line 33

# Reader listed under the class attributes: returns whatever is stored in
# @configuration. .configure assigns a Configuration to it on first use.
def configuration
  @configuration
end

Instance Attribute Details

#configuration ⇒ Object

Returns the value of attribute configuration.



19
20
21
# File 'lib/rtesseract.rb', line 19

# Reader for the per-instance configuration. #initialize assigns it from
# RTesseract.local_config(options).
def configuration
  @configuration
end

#processor ⇒ Object (readonly)

Returns the value of attribute processor.



20
21
22
# File 'lib/rtesseract.rb', line 20

# Read-only accessor for the image processor that #initialize obtained from
# RTesseract.choose_processor!.
def processor
  @processor
end

#source ⇒ Object

Returns the value of attribute source.



21
22
23
# File 'lib/rtesseract.rb', line 21

# Reader for the current image source. #initialize stores +src+ itself when
# the processor reports it as an image, otherwise Pathname.new(src);
# #from_blob reassigns it to the path of its temporary file.
def source
  @source
end

Class Method Details

.choose_processor!(processor) ⇒ Object



226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/rtesseract.rb', line 226

# Picks the processor module matching +processor+, runs its setup and
# returns it.
#
# MiniMagickProcessor is checked first, then NoneProcessor; anything else
# falls back to RMagickProcessor.
#
# @param processor [#to_s] requested processor name (may be nil)
# @return [Object] the selected processor, after +setup+ has been called
def self.choose_processor!(processor)
  requested = processor.to_s
  selected = MiniMagickProcessor if MiniMagickProcessor.a_name?(requested)
  # Right-hand side is only evaluated when MiniMagick did not match.
  selected ||= NoneProcessor.a_name?(requested) ? NoneProcessor : RMagickProcessor
  selected.setup
  selected
end

.configure {|configuration| ... } ⇒ Object

Yields:



36
37
38
39
# File 'lib/rtesseract/configuration.rb', line 36

# Yields the shared configuration, creating a Configuration the first time
# it is needed.
#
# @yieldparam configuration [Configuration] the shared configuration object
# @return [Object] whatever the block returns
def self.configure
  self.configuration = Configuration.new unless configuration
  yield configuration
end

.default_command ⇒ Object



41
42
43
44
45
# File 'lib/rtesseract/configuration.rb', line 41

# Name or path of the tesseract executable to run by default.
#
# Uses TesseractBin::Executables[:tesseract] when that lookup succeeds and
# yields a value; any StandardError raised by the lookup (for example the
# constant being undefined) results in the plain 'tesseract' command.
#
# @return [Object] the looked-up executable, or 'tesseract'
def self.default_command
  fallback = 'tesseract'
  begin
    TesseractBin::Executables[:tesseract] || fallback
  rescue StandardError
    fallback
  end
end

.local_config(options = {}) ⇒ Object

Builds the local configuration for a single instance



48
49
50
51
52
53
54
55
56
# File 'lib/rtesseract/configuration.rb', line 48

# Builds a fresh RTesseract::Configuration populated from +options+.
#
# command falls back to RTesseract.default_command, processor to 'rmagick'
# and debug to false. The :options entry is normalised to a flat array with
# nils removed.
#
# @param options [Hash] per-instance options (must respond to #option,
#   presumably a project extension of Hash -- confirm)
# @return [RTesseract::Configuration]
def self.local_config(options = {})
  config = RTesseract::Configuration.new
  config.command = config.option(options, :command, RTesseract.default_command)
  config.processor = config.option(options, :processor, 'rmagick')
  config.load_options(options, %i[lang psm tessdata_dir user_words user_patterns])
  config.debug = config.option(options, :debug, false)
  extra_arguments = options.option(:options, nil)
  config.options_cmd = [extra_arguments].flatten.compact
  config
end

.read(src = nil, options = {}) {|image| ... } ⇒ Object

Yields:



35
36
37
38
39
40
41
42
# File 'lib/rtesseract.rb', line 35

# Loads +src+ through the selected processor, yields the image to the
# caller, then converts the image's blob with a new RTesseract instance.
#
# The block's return value is not used; the blob is taken from the yielded
# image object after the block has run.
#
# @param src [#to_s, nil] image location
# @param options [Hash] options forwarded to RTesseract.new
# @yieldparam image [Object] the image returned by the processor
# @return [RTesseract] the instance returned by #from_blob
# @raise [RTesseract::ImageNotSelectedError] when +src+ is nil
def self.read(src = nil, options = {})
  raise RTesseract::ImageNotSelectedError if src.nil?

  chosen = RTesseract.choose_processor!(options.option(:processor, nil))
  loaded = chosen.read_with_processor(src.to_s)
  yield loaded
  RTesseract.new('', options).from_blob(loaded.to_blob)
end

Instance Method Details

#after_convert_hook ⇒ Object



182
183
# File 'lib/rtesseract.rb', line 182

# Intentionally empty extension point. #convert calls it after
# convert_command and before convert_text.
def after_convert_hook
end

#clear_console_output ⇒ Object

TODO: Clear console for MacOS or Windows



153
154
155
156
# File 'lib/rtesseract.rb', line 153

# Shell fragment appended to the tesseract command line to discard its
# stderr output.
#
# Uses File::NULL so the redirect targets the platform's null device rather
# than only '/dev/null'; a String is now always returned (the previous
# version returned nil when '/dev/null' did not exist).
#
# @return [String] '' when configuration.debug is truthy, otherwise a
#   "2>" redirect to the null device
def clear_console_output
  return '' if configuration.debug

  "2>#{File::NULL}"
end

#config ⇒ Object



137
138
139
140
141
# File 'lib/rtesseract.rb', line 137

# Renders @options as config text: one "key value" pair per line, joined
# with newlines and without a trailing newline.
#
# @options is initialised to {} when unset, and config_hook runs before the
# options are read.
#
# @return [String]
def config
  @options ||= {}
  config_hook
  rendered = @options.each_with_object([]) do |(key, value), lines|
    lines << "#{key} #{value}"
  end
  rendered.join("\n")
end

#config_file ⇒ Object



143
144
145
146
147
148
149
150
# File 'lib/rtesseract.rb', line 143

# Writes the rendered #config text to a temporary file and returns its path.
#
# @return [String] '' when @options is an empty hash, otherwise the path of
#   the temporary config file
def config_file
  config_hook
  return '' if @options == {}

  # Hold the Tempfile in an instance variable: a Tempfile deletes its file
  # when the object is garbage-collected, so a reference kept only in a local
  # could vanish before tesseract reads the returned path.
  @config_tempfile = Tempfile.new('config')
  @config_tempfile.write(config)
  @config_tempfile.flush
  @config_tempfile.path
end

#config_hook ⇒ Object



134
135
# File 'lib/rtesseract.rb', line 134

# Intentionally empty extension point. Both #config and #config_file call it
# before reading @options.
def config_hook
end

#convert ⇒ Object

Convert image to string



186
187
188
189
190
191
192
193
# File 'lib/rtesseract.rb', line 186

# Runs the full conversion: executes the command, calls the after-convert
# hook, reads the output text, then deletes the intermediate image and the
# output file.
#
# @return [Object] the result of #remove_file
# @raise [RTesseract::ConversionError] wrapping any StandardError raised by
#   one of the steps
def convert
  convert_command
  after_convert_hook
  convert_text
  leftovers = [@image, text_file_with_ext]
  remove_file(leftovers)
rescue StandardError => failure
  wrapped = RTesseract::ConversionError.new(failure)
  raise wrapped, failure, caller
end

#convert_command ⇒ Object



174
175
176
# File 'lib/rtesseract.rb', line 174

# Runs tesseract in a subshell (backticks) and returns the captured output.
#
# The command line is built, in this order, from: the configured command,
# the quoted path from #image, the quoted path from #text_file, then the
# fragments returned by #lang, #psm, #tessdata_dir, #user_words,
# #user_patterns, #config_file and #clear_console_output, followed by
# configuration.options_cmd joined with spaces.
#
# NOTE(review): every piece is interpolated into the shell string without
# escaping -- confirm that paths and options never come from untrusted input.
def convert_command
  `#{self.configuration.command} "#{image}" "#{text_file}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{self.configuration.options_cmd.join(' ')}`
end

#convert_text ⇒ Object



178
179
180
# File 'lib/rtesseract.rb', line 178

# Reads the output file named by #text_file_with_ext and caches its content
# in @value.
#
# @return [String] the text that was read
def convert_text
  output_path = text_file_with_ext
  @value = File.read(output_path).to_s
end

#crop!(_points = {}) ⇒ Object

Crop image to convert



57
58
59
60
61
# File 'lib/rtesseract.rb', line 57

# Stores the crop area to apply on the next conversion and clears the cached
# text so it is produced again.
#
# @param _points [Hash] crop description, passed on to the processor's
#   image_to_tif by #image
# @return [self]
def crop!(_points = {})
  @points = _points
  @value = nil # invalidate the cached result
  self
end

#file_ext ⇒ Object



162
163
164
# File 'lib/rtesseract.rb', line 162

# Default extension that #text_file_with_ext appends to the output file base
# name when no explicit extension is given.
#
# @return [String] '.txt'
def file_ext
  '.txt'
end

#from_blob(blob, ext = '') ⇒ Object

Read image from memory blob



196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/rtesseract.rb', line 196

# Converts an in-memory image: the blob is written to a binary temporary
# file, that file becomes the source, #convert runs, and the temporary file
# is removed afterwards.
#
# @param blob [String] raw image bytes
# @param ext [String] extension for the temporary file name
# @return [self]
# @raise [RTesseract::ConversionError] wrapping any StandardError raised
#   along the way
def from_blob(blob, ext = '')
  tempfile = Tempfile.new(['blob', ext], encoding: 'ascii-8bit')
  binary_io = tempfile.binmode
  binary_io.write(blob)
  tempfile.rewind
  tempfile.flush
  self.source = tempfile.path
  convert
  remove_file([tempfile])
  self
rescue StandardError => failure
  wrapped = RTesseract::ConversionError.new(failure)
  raise wrapped, failure, caller
end

#image ⇒ Object



158
159
160
# File 'lib/rtesseract.rb', line 158

# Asks the processor to turn the source (with the current crop points) into
# a TIF, remembers the result in @image and returns its path.
#
# @return [Object] the +path+ of the object returned by image_to_tif
def image
  @image = @processor.image_to_tif(@source, @points)
  @image.path
end

#initialize_hook ⇒ Object



32
33
# File 'lib/rtesseract.rb', line 32

# Intentionally empty extension point, called as the last step of
# #initialize.
def initialize_hook
end

#lang ⇒ Object

Select the language

Languages

  • eng - English

  • deu - German

  • deu-f - German fraktur

  • fra - French

  • ita - Italian

  • nld - Dutch

  • por - Portuguese

  • spa - Spanish

  • vie - Vietnamese

Note: Make sure you have installed the language to tesseract



90
91
92
93
94
95
96
97
98
99
# File 'lib/rtesseract.rb', line 90

# Builds the " -l <code> " command-line fragment for the configured
# language.
#
# The configured value is stripped and downcased. If it is one of the
# aliases in LANGUAGES the canonical code is used; otherwise the value
# itself is passed through. A blank value, or any StandardError raised
# while reading it, yields an empty string.
#
# @return [String]
def lang
  requested = configuration.lang.to_s.strip.downcase
  canonical, = LANGUAGES.find { |_code, aliases| aliases.include?(requested) }
  return " -l #{canonical} " if canonical

  requested.empty? ? '' : " -l #{requested} "
rescue StandardError
  ''
end

#options_cmd ⇒ Object

Extra options passed on the command line



130
131
132
# File 'lib/rtesseract.rb', line 130

# Convenience reader that returns configuration.options_cmd, the list
# .local_config builds from the :options entry.
def options_cmd
  self.configuration.options_cmd
end

#psm ⇒ Object

Page segmentation mode



102
103
104
105
106
# File 'lib/rtesseract.rb', line 102

# Builds the " -psm <mode> " command-line fragment.
#
# @return [String] '' when no mode is configured or when reading the
#   configuration raises a StandardError
def psm
  mode = configuration.psm
  return '' if mode.nil?

  " -psm #{mode} "
rescue StandardError
  ''
end

#read ⇒ Object



44
45
46
47
48
49
# File 'lib/rtesseract.rb', line 44

# Loads the source through the processor, yields the image so the caller can
# transform it, and converts the image returned by the block.
#
# @yieldparam image [Object] the image loaded by the processor
# @yieldreturn [#to_blob] the image to convert
# @return [self]
def read
  loaded = @processor.read_with_processor(@source.to_s)
  edited = yield(loaded)
  blob = edited.to_blob
  extension = File.extname(@source.to_s)
  from_blob(blob, extension)
  self
end

#remove_file(files = []) ⇒ Object

Remove files



64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/rtesseract.rb', line 64

# Deletes every entry in +files+. Tempfile objects are closed and unlinked;
# anything else is handed to File.unlink as a path.
#
# @param files [Array] Tempfile objects and/or paths
# @return [true]
# @raise [RTesseract::TempFilesNotRemovedError] when any removal raises a
#   StandardError (processing stops at the first failure)
def remove_file(files = [])
  files.each do |entry|
    unless entry.is_a?(Tempfile)
      File.unlink(entry)
      next
    end
    entry.close
    entry.unlink
  end
  true
rescue StandardError => failure
  raise RTesseract::TempFilesNotRemovedError.new(error: failure, files: files)
end

#tessdata_dir ⇒ Object

Tessdata Dir



109
110
111
112
113
# File 'lib/rtesseract.rb', line 109

# Builds the " --tessdata-dir <dir> " command-line fragment.
#
# @return [String] '' when no directory is configured or when reading the
#   configuration raises a StandardError
def tessdata_dir
  directory = configuration.tessdata_dir
  return '' if directory.nil?

  " --tessdata-dir #{directory} "
rescue StandardError
  ''
end

#text_file ⇒ Object



166
167
168
# File 'lib/rtesseract.rb', line 166

# Generates a new output base path inside Dir.tmpdir, named from the current
# time plus a random number, stores it in @text_file and returns it.
#
# @return [String] the generated path (no extension)
def text_file
  unique_name = "#{Time.now.to_f}#{rand(1500)}"
  tmp_root = Pathname.new(Dir.tmpdir)
  @text_file = tmp_root.join(unique_name).to_s
end

#text_file_with_ext(ext = nil) ⇒ Object



170
171
172
# File 'lib/rtesseract.rb', line 170

# Output file path: the stored @text_file base followed by an extension.
#
# @param ext [String, nil] extension to append; #file_ext is used when nil
# @return [String]
def text_file_with_ext(ext = nil)
  suffix = ext || file_ext
  "#{@text_file}#{suffix}"
end

#to_s ⇒ Object

Output value



210
211
212
213
214
215
216
217
218
219
# File 'lib/rtesseract.rb', line 210

# Returns the recognised text, converting on first use.
#
# A cached @value is returned as is. Otherwise the source must either be
# reported as an image by the processor or answer true to +file?+; the
# conversion then runs and its result is returned.
#
# @return [String]
# @raise [RTesseract::ImageNotSelectedError] when the source is neither an
#   image nor an existing file
def to_s
  return @value unless @value.nil?

  convertible = @processor.image?(@source) || @source.file?
  raise RTesseract::ImageNotSelectedError.new(@source) unless convertible

  convert
  @value
end

#to_s_without_spaces ⇒ Object

Removes spaces and line breaks



222
223
224
# File 'lib/rtesseract.rb', line 222

# Recognised text with every space, line feed and carriage return removed.
# Other whitespace, such as tabs, is left untouched.
#
# @return [String]
def to_s_without_spaces
  to_s.delete(" \n\r")
end

#user_patterns ⇒ Object

User Patterns



123
124
125
126
127
# File 'lib/rtesseract.rb', line 123

# Builds the " --user-patterns <file> " command-line fragment.
#
# @return [String] '' when no patterns file is configured or when reading
#   the configuration raises a StandardError
def user_patterns
  patterns = configuration.user_patterns
  return '' if patterns.nil?

  " --user-patterns #{patterns} "
rescue StandardError
  ''
end

#user_words ⇒ Object

User Words



116
117
118
119
120
# File 'lib/rtesseract.rb', line 116

# Builds the " --user-words <file> " command-line fragment.
#
# @return [String] '' when no words file is configured or when reading the
#   configuration raises a StandardError
def user_words
  words = configuration.user_words
  return '' if words.nil?

  " --user-words #{words} "
rescue StandardError
  ''
end