Class: RTesseract::Box

Inherits:
RTesseract show all
Defined in:
lib/rtesseract/box.rb

Overview

Class to read char positions from an image

Direct Known Subclasses

BoxChar

Defined Under Namespace

Classes: BoxParser

Constant Summary

Constants inherited from RTesseract

LANGUAGES

Instance Attribute Summary

Attributes inherited from RTesseract

#configuration, #processor, #source

Instance Method Summary collapse

Methods inherited from RTesseract

#clean, #clear_console_output, clear_pdf_option, #config, #config_file, configure, #convert, #convert_command, #convert_pdf, #convert_result, #crop!, default_command, #file_dest, #file_with_ext, #from_blob, #image, #initialize, #lang, local_config, #option_to_string, #options_cmd, #pdf?, #psm, #read, read, #tessdata_dir, #tesseract_version, #to_pdf, #to_s_without_spaces, #user_patterns, #user_words

Constructor Details

This class inherits a constructor from RTesseract

Instance Method Details

#after_convert_hook ⇒ Object

Moves the generated .html result file to the result file path (best effort; errors are ignored)



44
45
46
# File 'lib/rtesseract/box.rb', line 44

# Renames the '.html' result file to the path returned by +file_with_ext+.
#
# This is a best-effort step: any StandardError raised while moving
# (for example, the '.html' file not existing) is swallowed.
#
# @return [Object, nil] whatever FileUtils.mv returns, or nil when the
#   move raised
def after_convert_hook
  FileUtils.mv(file_with_ext('.html'), file_with_ext)
rescue StandardError
  nil
end

#config_hook ⇒ Object

Additional options for the config file



15
16
17
# File 'lib/rtesseract/box.rb', line 15

# Adds the extra option this class needs to @options.
#
# Sets 'tessedit_create_hocr' to 1 (the split-words configuration).
#
# @return [Integer] the value that was assigned (1)
def config_hook
  hocr_option = 'tessedit_create_hocr'
  @options[hocr_option] = 1
end

#convert_text ⇒ Object

Parses the words from the result file and stores them in value



37
38
39
40
41
# File 'lib/rtesseract/box.rb', line 37

# Builds the list of recognised words and stores it in @value.
#
# Each element returned by +parse_file+ is wrapped in a BoxParser and
# converted with +to_h+; the resulting array replaces @value.
#
# @return [Array] the array that was assigned to @value
def convert_text
  # map builds the result directly instead of appending inside each.
  @value = parse_file.map { |word| BoxParser.new(word).to_h }
end

#file_ext ⇒ Object

Extension of file



26
27
28
# File 'lib/rtesseract/box.rb', line 26

# Extension of the file, including the leading dot.
#
# NOTE(review): presumably consumed by the inherited #file_with_ext when it
# is called without an argument — confirm against RTesseract.
#
# @return [String] always '.hocr'
def file_ext
  '.hocr'
end

#initialize_hook ⇒ Object

Setting value as blank array



10
11
12
# File 'lib/rtesseract/box.rb', line 10

# Sets @value to an empty array.
#
# #words and #to_s compare @value against [] to decide whether a conversion
# still has to be run, so the empty array acts as the "nothing converted yet"
# marker.
#
# NOTE(review): presumably invoked from the inherited RTesseract#initialize —
# confirm.
#
# @return [Array] the new empty array
def initialize_hook
  @value = []
end

#parse_file ⇒ Object

Read the result file



31
32
33
34
# File 'lib/rtesseract/box.rb', line 31

# Reads the result file and selects its word elements.
#
# The file at +file_with_ext+ is read in full, parsed as HTML with Nokogiri,
# and queried for span elements with class 'ocrx_word' or 'ocr_word'.
#
# @return [Object] the result of the CSS query on the parsed document
def parse_file
  markup = File.read(file_with_ext)
  Nokogiri::HTML(markup).css('span.ocrx_word, span.ocr_word')
end

#to_s ⇒ Object

Output value



49
50
51
52
53
54
55
56
57
# File 'lib/rtesseract/box.rb', line 49

# Returns the recognised words joined by single spaces.
#
# When @value is still the empty array, the source is checked and +convert+
# is run first. When @value already holds words they are reused.
#
# Previously the cached path returned the Array of words instead of a
# String; both paths now return the same space-joined String.
#
# @return [String] the :word entry of every element of @value, joined by ' '
# @raise [RTesseract::ImageNotSelectedError] when nothing has been converted
#   yet and @source is neither accepted by @processor.image? nor a file
def to_s
  if @value == []
    unless @processor.image?(@source) || @source.file?
      raise RTesseract::ImageNotSelectedError.new(@source)
    end

    convert
  end

  @value.map { |word| word[:word] }.join(' ')
end

#words ⇒ Object

Words converted



20
21
22
23
# File 'lib/rtesseract/box.rb', line 20

# Returns the converted words, running +convert+ first when @value is
# still the empty array.
#
# @return [Object] the current contents of @value
def words
  not_converted_yet = @value == []
  convert if not_converted_yet
  @value
end