Class: OCR::Scan

Inherits:
Object
  • Object
show all
Defined in:
lib/simple-ocr/scan.rb

Constant Summary collapse

# Frozen so the shared list cannot be mutated by accident.
# NOTE(review): presumably the input extensions that need conversion; no
# method visible here reads it (pdf? consults OCR::Path::EXTENS) -- confirm.
EXTENS = %w[pdf].freeze

Instance Method Summary collapse

Constructor Details

#initialize(input_file, output_file, options, type) ⇒ Scan

Sets up a scan from an input file, an output file, extra options and an output type. A PDF input is converted to an image first.



12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/simple-ocr/scan.rb', line 12

# Prepares a scan job.
#
# Stores the arguments, maps +type+ through handle_output_type, and picks the
# image to work on: for an input that pdf? accepts, the image path comes from
# OCR::Path#image_path and the PDF is converted immediately; otherwise the
# input file itself is used as the image.
#
# @param input_file  path of the file to scan
# @param output_file output target; also used to derive the cleaned-image path
# @param options     extra text interpolated into the tesseract command by scan_img
# @param type        output type selector, translated by handle_output_type
def initialize(input_file, output_file, options, type)
  @input_file = input_file
  @output_file = output_file
  @options = options
  @type = handle_output_type(type)

  needs_conversion = pdf?(input_file)
  @image = needs_conversion ? OCR::Path.new(input_file).image_path : input_file
  # convert_to_img reads @input_file and @image, so both are set before it runs.
  convert_to_img if needs_conversion

  @clean_image = OCR::Path.new(output_file).clean_image_path
end

Instance Method Details

#clean_img ⇒ Object

Shell Script for cleaning the Image.



54
55
56
57
58
# File 'lib/simple-ocr/scan.rb', line 54

# Runs the textcleaner script shipped in the simple-ocr gem on @image and
# writes the result to @clean_image.
#
# Every path is shell-escaped before it is interpolated: the previous form
# left the script path unquoted and wrapped the image paths in plain single
# quotes, so a space in the gem path or a quote in a file name broke (or
# injected into) the command line.
#
# @return [String] standard output of the shell command
# @raise whatever Gem::Specification.find_by_name raises when the
#   simple-ocr gem cannot be located
def clean_img
  # Idempotent; kept local so this method does not rely on file-level requires.
  require 'shellwords'

  gem_spec = Gem::Specification.find_by_name('simple-ocr')
  script = File.join(gem_spec.full_gem_path, 'lib/textcleaner')
  # Fixed textcleaner switches, unchanged from the original command line.
  switches = '-g -e stretch -f 25 -o 20 -t 30 -u -s 1 -T -p 20'
  source = Shellwords.escape(@image)
  target = Shellwords.escape(@clean_image)

  `sh #{Shellwords.escape(script)} #{switches} #{source} #{target}`
end

#convert_to_img ⇒ Object

Conversion of PDF to Image



37
38
39
# File 'lib/simple-ocr/scan.rb', line 37

# Converts @input_file to an image at @image by running Ghostscript (+gs+)
# with the png16m device and an -r value taken from OCR::MIN_DENSITY.
#
# Arguments are shell-escaped before interpolation; the previous single-quote
# wrapping broke (and allowed command injection) when a path contained a
# single quote.
#
# @return [String] standard output of the gs command
def convert_to_img
  # Idempotent; kept local so this method does not rely on file-level requires.
  require 'shellwords'

  density_flag = Shellwords.escape("-r#{OCR::MIN_DENSITY}")
  target = Shellwords.escape(@image)
  source = Shellwords.escape(@input_file)

  `gs -sDEVICE=png16m #{density_flag} -o #{target} #{source}`
end

#delete_files ⇒ Object

Deleting unnecessary files after processing.



61
62
63
64
# File 'lib/simple-ocr/scan.rb', line 61

# Removes the intermediate files left behind by a scan.
#
# The cleaned image is always removed. @image is removed only when pdf?
# is true, i.e. when it was generated from the input rather than being the
# input file itself.
#
# @return the result of the last FileUtils.rm_rf call, or nil when pdf? is false
def delete_files
  FileUtils.rm_rf(@clean_image)
  return unless pdf?

  FileUtils.rm_rf(@image)
end

#exec_command(command) ⇒ Object

Execute Command



49
50
51
# File 'lib/simple-ocr/scan.rb', line 49

# Starts +command+ through Open3.popen3 and hands back that call's result.
#
# No block is passed to popen3, so (per the Open3 documentation) the result
# is the child's stdin, stdout and stderr streams plus its wait thread, and
# the caller is responsible for closing those streams.
#
# @param command the command line handed to Open3.popen3 unchanged
# @return whatever Open3.popen3 returns
def exec_command(command)
  Open3.popen3(command)
end

#handle_output_type(type) ⇒ Object



26
27
28
29
30
31
32
33
34
# File 'lib/simple-ocr/scan.rb', line 26

# Translates an output type selector into the trailing word that scan_img
# appends to the tesseract command.
#
# @param type the requested output type; only :pdf and :hocr are recognised
# @return [String] 'pdf' or 'hocr' for the matching symbol, otherwise an
#   empty string
def handle_output_type(type)
  case type
  when :pdf then 'pdf'
  when :hocr then 'hocr'
  else ''
  end
end

#pdf?(input_file = @input_file) ⇒ Boolean

Returns:

  • (Boolean)


66
67
68
# File 'lib/simple-ocr/scan.rb', line 66

# Tells whether +input_file+ counts as a PDF.
#
# Compares the second element of OCR::Path#name_exten (presumably the file
# extension -- confirm against OCR::Path) with OCR::Path::EXTENS[:pdf].
#
# @param input_file the file to check; defaults to @input_file
# @return [Boolean]
def pdf?(input_file = @input_file)
  extension = OCR::Path.new(input_file).name_exten[1]
  extension == OCR::Path::EXTENS[:pdf]
end

#scan_img ⇒ Object

Cleans the image, runs tesseract OCR on it, then deletes the intermediate files.



42
43
44
45
46
# File 'lib/simple-ocr/scan.rb', line 42

# Runs the whole OCR pass: cleans the image, runs tesseract on the cleaned
# image, then removes the intermediate files.
#
# The image and output paths are shell-escaped before interpolation; the
# previous single-quote wrapping broke (and allowed command injection) when a
# path contained a single quote. @options is interpolated verbatim on purpose
# so that it can carry several command-line words, and @type is the string
# stored from handle_output_type.
#
# @return whatever delete_files returns
def scan_img
  # Idempotent; kept local so this method does not rely on file-level requires.
  require 'shellwords'

  clean_img
  source = Shellwords.escape(@clean_image)
  target = Shellwords.escape(@output_file)
  `tesseract #{source} #{@options} #{target} #{@type}`
  delete_files
end