Class: RTesseract
- Inherits:
-
Object
- Object
- RTesseract
- Defined in:
- lib/rtesseract.rb,
lib/rtesseract/mixed.rb,
lib/rtesseract/errors.rb
Overview
Ruby wrapper for Tesseract OCR
Defined Under Namespace
Classes: ConversionError, ErrorWithMemory, ImageNotSelectedError, Mixed, TempFilesNotRemovedError
Constant Summary collapse
- OPTIONS =
%w(command lang psm processor debug clear_console_output options)
- LANGUAGES =
Aliases for language names
{ 'eng' => %w(en en-us english), 'ita' => %w(it), 'por' => %w(pt pt-br portuguese), 'spa' => %w(sp) }
Instance Attribute Summary collapse
-
#lang ⇒ Object
Select the language. Languages: eng - English; deu - German; deu-f - German fraktur; fra - French; ita - Italian; nld - Dutch; por - Portuguese; spa - Spanish; vie - Vietnamese. Note: Make sure you have installed the language for Tesseract.
-
#options ⇒ Object
Returns the value of attribute options.
-
#options_cmd ⇒ Object
Returns the value of attribute options_cmd.
-
#processor ⇒ Object
readonly
Returns the value of attribute processor.
-
#psm ⇒ Object
Page Segment Mode.
Class Method Summary collapse
Instance Method Summary collapse
-
#clear_console_output ⇒ Object
TODO: Clear console for MacOS or Windows.
- #command_line_options(options) ⇒ Object
- #config ⇒ Object
- #config_file ⇒ Object
-
#convert ⇒ Object
Convert image to string.
-
#crop!(x, y, width, height) ⇒ Object
Crop image to convert.
- #default_command ⇒ Object
- #fetch_option(options, name, default) ⇒ Object
-
#from_blob(blob) ⇒ Object
Read image from memory blob.
- #image ⇒ Object
-
#initialize(src = '', options = {}) ⇒ RTesseract
constructor
A new instance of RTesseract.
-
#remove_file(files = []) ⇒ Object
Remove files.
- #source=(src) ⇒ Object
- #text_file ⇒ Object
-
#to_s ⇒ Object
Output value.
-
#to_s_without_spaces ⇒ Object
Remove spaces and line breaks.
Constructor Details
#initialize(src = '', options = {}) ⇒ RTesseract
Returns a new instance of RTesseract.
30 31 32 33 34 35 |
# File 'lib/rtesseract.rb', line 30 def initialize(src = '', options = {}) @options = command_line_options(options) @value, @x, @y, @w, @h = [''] @processor = RTesseract.choose_processor!(@processor) @source = @processor.image?(src) ? src : Pathname.new(src) end |
Instance Attribute Details
#lang ⇒ Object
Select the language
Languages
-
eng - English
-
deu - German
-
deu-f - German fraktur
-
fra - French
-
ita - Italian
-
nld - Dutch
-
por - Portuguese
-
spa - Spanish
-
vie - Vietnamese
Note: Make sure you have installed the language for Tesseract.
113 114 115 116 117 118 119 120 121 122 |
# File 'lib/rtesseract.rb', line 113 def lang language = "#{@lang}".strip.downcase LANGUAGES.each do |value, names| return " -l #{value} " if names.include? language end return " -l #{language} " if language.size > 0 '' rescue '' end |
#options ⇒ Object
Returns the value of attribute options.
15 16 17 |
# File 'lib/rtesseract.rb', line 15 def options @options end |
#options_cmd ⇒ Object
Returns the value of attribute options_cmd.
19 20 21 |
# File 'lib/rtesseract.rb', line 19 def options_cmd @options_cmd end |
#processor ⇒ Object (readonly)
Returns the value of attribute processor.
18 19 20 |
# File 'lib/rtesseract.rb', line 18 def processor @processor end |
#psm ⇒ Object
Page Segment Mode
125 126 127 128 129 |
# File 'lib/rtesseract.rb', line 125 def psm (@psm.nil? ? '' : " -psm #{@psm} ") rescue '' end |
Class Method Details
.choose_processor!(processor) ⇒ Object
197 198 199 200 201 202 203 204 205 206 207 |
# File 'lib/rtesseract.rb', line 197 def self.choose_processor!(processor) processor = if MiniMagickProcessor.a_name?(processor.to_s) MiniMagickProcessor elsif QuickMagickProcessor.a_name?(processor.to_s) QuickMagickProcessor else RMagickProcessor end processor.setup processor end |
.read(src = nil, options = {}) {|image| ... } ⇒ Object
63 64 65 66 67 68 69 70 71 72 |
# File 'lib/rtesseract.rb', line 63 def self.read(src = nil, options = {}, &block) fail RTesseract::ImageNotSelectedError if src.nil? processor = RTesseract.choose_processor!(options.delete(:processor) || options.delete('processor')) image = processor.read_with_processor(src.to_s) yield image object = RTesseract.new('', options) object.from_blob(image.to_blob) object end |
Instance Method Details
#clear_console_output ⇒ Object
TODO: Clear console for MacOS or Windows
145 146 147 148 |
# File 'lib/rtesseract.rb', line 145 def clear_console_output return '' unless @clear_console_output return '2>/dev/null' if File.exist?('/dev/null') # Linux console clear end |
#command_line_options(options) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/rtesseract.rb', line 41 def command_line_options(options) @command = fetch_option(options, :command, default_command) @lang = fetch_option(options, :lang, '') @psm = fetch_option(options, :psm, nil) @processor = fetch_option(options, :processor, 'rmagick') @debug = fetch_option(options, :debug, false) @options_cmd = fetch_option(options, :options, []) @options_cmd = [@options_cmd] unless @options_cmd.kind_of?(Array) # Disable clear console if debug mode @clear_console_output = @debug ? false : fetch_option(options, :clear_console_output, true) options.delete_if { |k, v| OPTIONS.include?(k.to_s) } end |
#config ⇒ Object
131 132 133 134 |
# File 'lib/rtesseract.rb', line 131 def config @options ||= {} @options.map { |k, v| "#{k} #{v}" }.join("\n") end |
#config_file ⇒ Object
136 137 138 139 140 141 142 |
# File 'lib/rtesseract.rb', line 136 def config_file return '' if @options == {} conf = Tempfile.new('config') conf.write(config) conf.flush conf.path end |
#convert ⇒ Object
Convert image to string
159 160 161 162 163 164 165 |
# File 'lib/rtesseract.rb', line 159 def convert `#{@command} "#{image}" "#{text_file.gsub('.txt', '')}" #{lang} #{psm} #{config_file} #{clear_console_output} #{@options_cmd.join(' ')}` @value = File.read(@text_file).to_s remove_file([@image, @text_file]) rescue => error raise RTesseract::ConversionError.new(error) end |
#crop!(x, y, width, height) ⇒ Object
Crop image to convert
80 81 82 83 84 |
# File 'lib/rtesseract.rb', line 80 def crop!(x, y, width, height) @value = '' @x, @y, @w, @h = x.to_i, y.to_i, width.to_i, height.to_i self end |
#default_command ⇒ Object
57 58 59 60 61 |
# File 'lib/rtesseract.rb', line 57 def default_command TesseractBin::Executables[:tesseract] || 'tesseract' rescue 'tesseract' end |
#fetch_option(options, name, default) ⇒ Object
37 38 39 |
# File 'lib/rtesseract.rb', line 37 def fetch_option(options, name, default) options.fetch(name.to_s, options.fetch(name, default)) end |
#from_blob(blob) ⇒ Object
Read image from memory blob
168 169 170 171 172 173 174 175 176 177 178 179 |
# File 'lib/rtesseract.rb', line 168 def from_blob(blob) blob_file = Tempfile.new('blob', :encoding => 'ascii-8bit') blob_file.binmode blob_file.write(blob) blob_file.rewind blob_file.flush self.source = blob_file.path convert remove_file([blob_file]) rescue => error raise RTesseract::ConversionError.new(error) end |
#image ⇒ Object
150 151 152 |
# File 'lib/rtesseract.rb', line 150 def image (@image = @processor.image_to_tif(@source, @x, @y, @w, @h)).path end |
#remove_file(files = []) ⇒ Object
Remove files
87 88 89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/rtesseract.rb', line 87 def remove_file(files = []) files.each do |file| if file.is_a?(Tempfile) file.close file.unlink else File.unlink(file) end end true rescue => error raise RTesseract::TempFilesNotRemovedError.new(:error => error, :files => files) end |
#source=(src) ⇒ Object
74 75 76 77 |
# File 'lib/rtesseract.rb', line 74 def source=(src) @value = '' @source = @processor.image?(src) ? src : Pathname.new(src) end |
#text_file ⇒ Object
154 155 156 |
# File 'lib/rtesseract.rb', line 154 def text_file @text_file = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}.txt").to_s end |
#to_s ⇒ Object
Output value
182 183 184 185 186 187 188 189 190 |
# File 'lib/rtesseract.rb', line 182 def to_s return @value if @value != '' if @processor.image?(@source) || @source.file? convert @value else fail RTesseract::ImageNotSelectedError.new(@source) end end |
#to_s_without_spaces ⇒ Object
Remove spaces and line breaks
193 194 195 |
# File 'lib/rtesseract.rb', line 193 def to_s_without_spaces to_s.gsub(' ', '').gsub("\n", '').gsub("\r", '') end |