Class: RTesseract

Inherits:
Object
  • Object
show all
Defined in:
lib/rtesseract.rb,
lib/rtesseract/box.rb,
lib/rtesseract/uzn.rb,
lib/processors/none.rb,
lib/rtesseract/blob.rb,
lib/rtesseract/mixed.rb,
lib/rtesseract/utils.rb,
lib/rtesseract/errors.rb,
lib/processors/rmagick.rb,
lib/rtesseract/box_char.rb,
lib/rtesseract/processor.rb,
lib/processors/mini_magick.rb,
lib/rtesseract/configuration.rb

Overview

RTesseract

Direct Known Subclasses

Box, Uzn

Defined Under Namespace

Modules: Processor, Utils Classes: Box, BoxChar, Configuration, ConversionError, ErrorWithMemory, ImageNotSelectedError, Mixed, TempFilesNotRemovedError, TesseractVersionError, Uzn

Constant Summary collapse

LANGUAGES =

Aliases for language names

{
  # English aliases
  'en' => 'eng',
  'en-us' => 'eng',
  'english' => 'eng',
  # Portuguese aliases
  'pt' => 'por',
  'pt-br' => 'por',
  'portuguese' => 'por',
  # Italian alias
  'it' => 'ita',
  # NOTE(review): 'sp' is not the ISO 639-1 code for Spanish ('es') — confirm this key is intentional.
  'sp' => 'spa'
}.freeze

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(src = '', options = {}) ⇒ RTesseract

Returns a new instance of RTesseract.



15
16
17
18
19
20
21
22
# File 'lib/rtesseract.rb', line 15

# Builds a new instance for the given image source.
#
# @param src [Object] image source, assigned through +source=+
# @param options [Hash, nil] per-instance settings; +nil+ is treated as an empty hash
def initialize(src = '', options = {})
  # Normalise before anything else: local_config calls methods on the options
  # object, so a nil would fail there before a later `|| {}` could apply.
  options ||= {}
  self.configuration = RTesseract.local_config(options)
  @options = options
  @points = {}
  @processor = RTesseract::Processor.choose_processor!(configuration.processor)
  self.source = src
  initialize_hook
end

Class Attribute Details

.configurationObject

Returns the value of attribute configuration.



40
41
42
# File 'lib/rtesseract/configuration.rb', line 40

# Reader for the class-level +@configuration+ value.
#
# @return [Object, nil] the object currently stored in +@configuration+
def configuration
  @configuration
end

Instance Attribute Details

#configurationObject

Returns the value of attribute configuration.



11
12
13
# File 'lib/rtesseract.rb', line 11

# Reader for the instance-level +@configuration+ value.
#
# @return [Object, nil] the object currently stored in +@configuration+
def configuration
  @configuration
end

#processorObject (readonly)

Returns the value of attribute processor.



12
13
14
# File 'lib/rtesseract.rb', line 12

# Read-only accessor for +@processor+.
#
# @return [Object, nil] the object currently stored in +@processor+
def processor
  @processor
end

#sourceObject

Returns the value of attribute source.



13
14
15
# File 'lib/rtesseract.rb', line 13

# Reader for +@source+.
#
# @return [Object, nil] the object currently stored in +@source+
def source
  @source
end

Class Method Details

.clear_pdf_optionObject

Clear pdf option



50
51
52
53
54
55
# File 'lib/rtesseract/configuration.rb', line 50

# Removes both the string and the symbol form of the pdf flag from the
# configured command-line options. Does nothing when no options are set.
#
# @return [Object, nil] result of the last delete, or nil when there are no options
def self.clear_pdf_option
  cmd_options = configuration.options_cmd
  return unless cmd_options

  cmd_options.delete('pdf')
  cmd_options.delete(:pdf)
end

.configure {|configuration| ... } ⇒ Object

Yields:



43
44
45
46
47
# File 'lib/rtesseract/configuration.rb', line 43

# Yields the class-level configuration (creating it on first use) and then
# strips any pdf flag from its command-line options.
#
# @yield [configuration] the configuration object to modify
def self.configure
  self.configuration = Configuration.new unless configuration
  yield configuration
  clear_pdf_option
end

.default_commandObject

Default command



58
59
60
61
62
# File 'lib/rtesseract/configuration.rb', line 58

# Resolves the tesseract executable: the entry registered in
# TesseractBin::Executables when available, otherwise the plain command name.
# Any StandardError raised during the lookup also falls back to 'tesseract'.
#
# @return [String] command used to invoke tesseract
def self.default_command
  begin
    TesseractBin::Executables[:tesseract] || 'tesseract'
  rescue
    'tesseract'
  end
end

.local_config(options = {}) ⇒ Object

Build the local configuration for an instance



65
66
67
68
69
70
71
72
73
74
# File 'lib/rtesseract/configuration.rb', line 65

# Builds a per-instance configuration from an options hash.
#
# @param options [Hash] instance options (:command, :processor, :debug,
#   :options and the keys handed to load_options)
# @return [RTesseract::Configuration] the populated configuration
def self.local_config(options = {})
  RTesseract::Configuration.new.tap do |config|
    config.command = config.option(options, :command, RTesseract.default_command)
    config.processor = config.option(options, :processor, 'rmagick')
    config.load_options(options, [:lang, :psm, :oem, :tessdata_dir, :user_words, :user_patterns])
    config.debug = config.option(options, :debug, false)
    # NOTE(review): `options.option` is not defined in this view — presumably a
    # Hash extension provided elsewhere in the project.
    # Flatten BEFORE filtering so that a pdf flag passed inside an array
    # (e.g. options: [:pdf, 'hocr']) is removed as well.
    extra_options = [options.option(:options, nil)].flatten.compact
    config.options_cmd = extra_options.reject { |o| o == 'pdf' || o == :pdf }
  end
end

.read(src = nil, options = {}) {|image| ... } ⇒ Object

Read image from memory blob

Yields:



4
5
6
7
8
9
10
11
# File 'lib/rtesseract/blob.rb', line 4

# Loads +src+ with the selected processor, yields the image to the caller's
# block, then runs OCR on the image's blob.
#
# @param src [Object] image location; converted with +to_s+
# @param options [Hash] instance options; +:processor+ selects the image processor
# @yield [image] the loaded image (the block's return value is not used here)
# @return [Object] the value returned by +from_blob+
# @raise [RTesseract::ImageNotSelectedError] when +src+ is nil
def self.read(src = nil, options = {})
  raise RTesseract::ImageNotSelectedError if src.nil?

  reader = RTesseract::Processor.choose_processor!(options[:processor])
  image = reader.read_with_processor(src.to_s)
  yield image
  RTesseract.new('', options).from_blob(image.to_blob)
end

Instance Method Details

#after_convert_hookObject

Hook called after the conversion command has run



182
183
# File 'lib/rtesseract.rb', line 182

# Extension point invoked by +convert+ between running the command and
# reading the result. Intentionally empty here.
def after_convert_hook
end

#cleanObject

Destroy pdf file



228
229
230
# File 'lib/rtesseract.rb', line 228

# Removes the generated pdf file and forgets its path, so that a later
# +to_pdf+ call converts again instead of returning a path to a deleted file.
#
# @return [Object] whatever RTesseract::Utils.remove_files returns
def clean
  removed = RTesseract::Utils.remove_files([@pdf_path])
  # Only reached when removal did not raise; the cached path is now stale.
  @pdf_path = nil
  removed
end

#clear_console_outputObject

TODO: Clear console for MacOS or Windows



120
121
122
123
# File 'lib/rtesseract.rb', line 120

# Shell fragment used to silence tesseract's stderr.
#
# @return [String, nil] '' in debug mode, a redirect to /dev/null when that
#   device exists, otherwise nil (TODO in the original: MacOS/Windows handling)
def clear_console_output
  if configuration.debug
    ''
  elsif File.exist?('/dev/null') # Linux console clear
    '2>/dev/null'
  end
end

#configObject

Convert configurations



103
104
105
106
107
# File 'lib/rtesseract.rb', line 103

# Serialises the instance options as tesseract config text: one
# "key value" pair per line.
#
# @return [String] newline-separated settings ('' when there are none)
def config
  @options ||= {}
  config_hook
  lines = @options.map { |name, setting| [name.to_s, setting.to_s].join(' ') }
  lines.join("\n")
end

#config_fileObject

Write config to file



110
111
112
113
114
115
116
117
# File 'lib/rtesseract.rb', line 110

# Writes the serialised options to a temporary config file.
#
# @return [String] path of the config file, or '' when there are no options
def config_file
  config_hook
  return '' if @options.nil? || @options == {}

  # Keep the Tempfile referenced from the instance: a Tempfile is unlinked
  # when it is garbage-collected, which could otherwise happen before the
  # tesseract command gets to read the file.
  @config_tempfile = Tempfile.new('config')
  @config_tempfile.write(config)
  @config_tempfile.flush
  @config_tempfile.path
end

#config_hookObject

Hook called before the configuration is built



99
100
# File 'lib/rtesseract.rb', line 99

# Extension point invoked by +config+ and +config_file+ before the options
# are read. Intentionally empty here.
def config_hook
end

#convertObject

Convert image to string



186
187
188
189
190
191
192
# File 'lib/rtesseract.rb', line 186

# Runs the full OCR pipeline: execute the command, call the
# after-convert hook, then collect the result.
#
# @return [Object] whatever +convert_result+ returns
# @raise [RTesseract::ConversionError] wrapping any StandardError raised on the way
def convert
  convert_command
  after_convert_hook
  convert_result
rescue => failure
  raise RTesseract::ConversionError.new(failure), failure, caller
end

#convert_commandObject

Run command



152
153
154
# File 'lib/rtesseract.rb', line 152

# Assembles the tesseract command line and executes it in a subshell.
# The interpolations run left to right; +image+ and +file_dest+ also store
# the paths they produce (@image, @file_dest) for later steps.
#
# NOTE(review): values are interpolated into the shell string without
# escaping — confirm none of them can carry untrusted input.
#
# @return [String] standard output of the command
def convert_command
  command_line = "#{configuration.command} \"#{image}\" \"#{file_dest}\" #{lang} #{oem} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}"
  `#{command_line}`
end

#convert_pdfObject

Store pdf result path



167
168
169
# File 'lib/rtesseract.rb', line 167

# Records where the pdf output was written.
#
# @return [String] destination path with the '.pdf' extension
def convert_pdf
  pdf_file = file_with_ext('.pdf')
  @pdf_path = pdf_file
end

#convert_resultObject

Convert result to proper type



172
173
174
175
176
177
178
179
# File 'lib/rtesseract.rb', line 172

# Collects the command's output. For pdf output only the path is stored;
# otherwise the text is read and the intermediate image and result file
# are removed.
#
# @return [Object] the pdf path, or the result of RTesseract::Utils.remove_files
def convert_result
  return convert_pdf if pdf?

  convert_text
  RTesseract::Utils.remove_files([@image, file_with_ext])
end

#convert_textObject

Read result file



162
163
164
# File 'lib/rtesseract.rb', line 162

# Loads the recognised text from the result file into @value.
#
# @return [String] contents of the result file
def convert_text
  result_path = file_with_ext
  @value = File.read(result_path).to_s
end

#crop!(points = {}) ⇒ Object

Crop image to convert



36
37
38
39
40
# File 'lib/rtesseract.rb', line 36

# Sets the crop region used for the next conversion and discards any
# previously recognised text.
#
# @param points [Hash] crop description handed to the processor on conversion
# @return [self]
def crop!(points = {})
  @points = points
  @value = nil
  self
end

#file_destObject

Random destination file path



142
143
144
# File 'lib/rtesseract.rb', line 142

# Generates a fresh output path (without extension) inside the system temp
# directory and remembers it in @file_dest. Every call produces a new path.
#
# @return [String] the generated path
def file_dest
  tmp_root = Pathname.new(Dir.tmpdir)
  unique_name = "#{Time.now.to_f}#{rand(1500)}"
  @file_dest = tmp_root.join(unique_name).to_s
end

#file_extObject

Extension of file



131
132
133
# File 'lib/rtesseract.rb', line 131

# Default extension of the result file; used by +file_with_ext+ when no
# explicit extension is given.
#
# @return [String] always '.txt' in this class
def file_ext
  '.txt'
end

#file_with_ext(ext = nil) ⇒ Object

Full path of the destination file with the given extension (defaults to the value of file_ext)



147
148
149
# File 'lib/rtesseract.rb', line 147

# Appends an extension to the stored destination path.
#
# @param ext [String, nil] extension to append; +file_ext+ is used when nil
# @return [String] @file_dest followed by the extension
def file_with_ext(ext = nil)
  suffix = ext || file_ext
  "#{@file_dest}#{suffix}"
end

#from_blob(blob, ext = '') ⇒ Object

Read image from memory blob



22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/rtesseract/blob.rb', line 22

# Runs OCR on image data held in memory: the blob is written to a binary
# temp file, that file becomes the source, and it is removed after conversion.
#
# @param blob [String] raw image bytes
# @param ext [String] extension given to the temp file
# @return [self]
# @raise [RTesseract::ConversionError] wrapping any StandardError raised on the way
def from_blob(blob, ext = '')
  tmp = Tempfile.new(['blob', ext], encoding: 'ascii-8bit')
  tmp.binmode
  tmp.write(blob)
  tmp.rewind
  tmp.flush
  self.source = tmp.path
  convert
  RTesseract::Utils.remove_files([tmp])
  self
rescue => failure
  raise RTesseract::ConversionError.new(failure), failure, caller
end

#imageObject

Get image



126
127
128
# File 'lib/rtesseract.rb', line 126

# Asks the processor for a TIF version of the source (using the current
# crop points), keeps it in @image and returns its path.
#
# @return [Object] value of +path+ on the object returned by the processor
def image
  @image = @processor.image_to_tif(@source, @points)
  @image.path
end

#initialize_hookObject

Hook called at the end of the initialize method



25
26
# File 'lib/rtesseract.rb', line 25

# Extension point invoked as the last step of +initialize+.
# Intentionally empty here.
def initialize_hook
end

#langObject

Select the language

Languages

  • eng - English

  • deu - German

  • deu-f - German fraktur

  • fra - French

  • ita - Italian

  • nld - Dutch

  • por - Portuguese

  • spa - Spanish

  • vie - Vietnamese

Note: Make sure you have installed the language to tesseract



54
55
56
57
58
59
# File 'lib/rtesseract.rb', line 54

# Builds the language flag for the command line. The configured language
# (default 'eng') is stripped and lower-cased, then mapped through the
# LANGUAGES aliases; unknown names are passed through unchanged.
#
# @return [String] " -l <code> ", or '' if anything raises
def lang
  requested = configuration.lang || 'eng'
  code = requested.to_s.strip.downcase
  resolved = LANGUAGES[code] || code
  " -l #{resolved} "
rescue
  ''
end

#oemObject

Engine Mode



74
75
76
# File 'lib/rtesseract.rb', line 74

# Command-line fragment for the OCR engine mode.
#
# @return [String] result of +option_to_string+ for '--oem'
def oem
  engine_mode = configuration.oem
  option_to_string('--oem', engine_mode)
end

#option_to_string(prefix, value = nil) ⇒ Object

Convert option to command



62
63
64
65
66
# File 'lib/rtesseract.rb', line 62

# Formats a single command-line option.
#
# @param prefix [String] the flag, e.g. '--oem'
# @param value [Object, nil] the flag's value
# @return [String] " <prefix> <value> ", or '' when value is nil or formatting raises
def option_to_string(prefix, value = nil)
  return '' if value.nil?

  " #{prefix} #{value} "
rescue
  ''
end

#options_cmdObject

Extra options appended to the command line



94
95
96
# File 'lib/rtesseract.rb', line 94

# Shortcut to the instance configuration's +options_cmd+ value.
#
# @return [Object] whatever +configuration.options_cmd+ returns
def options_cmd
  configuration.options_cmd
end

#pdf?Boolean

Is pdf output?

Returns:

  • (Boolean)


157
158
159
# File 'lib/rtesseract.rb', line 157

# Whether the string 'pdf' is currently among the command-line options.
#
# @return [Boolean]
def pdf?
  options_cmd.include?('pdf')
end

#psmObject

Page Segmentation Mode



69
70
71
# File 'lib/rtesseract.rb', line 69

# Command-line fragment for the page segmentation mode.
#
# NOTE(review): this uses the single-dash '-psm' spelling while the other
# flags here use a double dash — confirm against the tesseract versions supported.
#
# @return [String] result of +option_to_string+ for '-psm'
def psm
  segmentation_mode = configuration.psm
  option_to_string('-psm', segmentation_mode)
end

#readObject

Read image from memory blob



14
15
16
17
18
19
# File 'lib/rtesseract/blob.rb', line 14

# Loads the current source with the processor, lets the caller's block
# transform it, and runs OCR on the blob of the image the block returns.
#
# @yield [image] the loaded image
# @yieldreturn [Object] the image to convert; must respond to +to_blob+
# @return [self]
def read
  loaded = @processor.read_with_processor(@source.to_s)
  edited = yield(loaded)
  from_blob(edited.to_blob, File.extname(@source.to_s))
  self
end

#tessdata_dirObject

Tessdata Dir



79
80
81
# File 'lib/rtesseract.rb', line 79

# Command-line fragment for the tessdata directory.
#
# @return [String] result of +option_to_string+ for '--tessdata-dir'
def tessdata_dir
  data_dir = configuration.tessdata_dir
  option_to_string('--tessdata-dir', data_dir)
end

#tesseract_versionObject

Detect version number



136
137
138
# File 'lib/rtesseract.rb', line 136

# Shortcut to RTesseract::Utils.version_number.
#
# @return [Object] whatever +version_number+ returns; +to_pdf+ treats it as
#   nil-or-numeric
def tesseract_version
  RTesseract::Utils.version_number
end

#to_pdfObject

Output pdf path



212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/rtesseract.rb', line 212

# Converts the source to a searchable pdf and returns its path. The path is
# cached in @pdf_path, so repeated calls do not convert again.
#
# @return [String] path of the generated pdf
# @raise [TesseractVersionError] when the version is unknown or older than 3.03
# @raise [RTesseract::ImageNotSelectedError] when the source is neither an
#   image for the processor nor a file
def to_pdf
  return @pdf_path if @pdf_path

  version = tesseract_version
  fail TesseractVersionError.new if version.nil? || version < 3.03

  unless @processor.image?(@source) || @source.file?
    fail RTesseract::ImageNotSelectedError.new(@source)
  end

  options_cmd << 'pdf'
  begin
    convert
  ensure
    # Always drop the temporary flag: if it survived a failed conversion,
    # every later text conversion would be run in pdf mode.
    options_cmd.delete('pdf')
  end
  @pdf_path
end

#to_sObject

Output value



195
196
197
198
199
200
201
202
203
204
# File 'lib/rtesseract.rb', line 195

# Returns the recognised text, converting on first use and caching the
# result in @value.
#
# @return [String] recognised text
# @raise [RTesseract::ImageNotSelectedError] when the source is neither an
#   image for the processor nor a file
def to_s
  return @value if @value

  unless @processor.image?(@source) || @source.file?
    fail RTesseract::ImageNotSelectedError.new(@source)
  end

  convert
  @value
end

#to_s_without_spacesObject

Remove spaces and line breaks



207
208
209
# File 'lib/rtesseract.rb', line 207

# Recognised text with every whitespace character (spaces, tabs, line
# breaks) stripped out.
#
# @return [String]
def to_s_without_spaces
  text = to_s
  text.gsub(/\s+/, '')
end

#user_patternsObject

User Patterns



89
90
91
# File 'lib/rtesseract.rb', line 89

# Command-line fragment for the user patterns file.
#
# @return [String] result of +option_to_string+ for '--user-patterns'
def user_patterns
  patterns_file = configuration.user_patterns
  option_to_string('--user-patterns', patterns_file)
end

#user_wordsObject

User Words



84
85
86
# File 'lib/rtesseract.rb', line 84

# Command-line fragment for the user words file.
#
# @return [String] result of +option_to_string+ for '--user-words'
def user_words
  words_file = configuration.user_words
  option_to_string('--user-words', words_file)
end