Module: RTesseract::Box
- Extended by:
- Base
- Defined in:
- lib/rtesseract/box.rb
Class Method Summary
collapse
Methods included from Base
remove_tmp_file, temp_file_path
Class Method Details
.parse(content) ⇒ Object
19
20
21
|
# File 'lib/rtesseract/box.rb', line 19
# Converts hOCR markup into a list of word entries.
#
# Every line of +content+ is handed to +parse_line+; lines for which it
# returns nil are left out of the result.
#
# @param content [String] hOCR text, one element per line
# @return [Array] the non-nil results of +parse_line+, in input order
def parse(content)
  content.each_line.each_with_object([]) do |row, words|
    entry = parse_line(row)
    words << entry unless entry.nil?
  end
end
|
.parse_confidence(line) ⇒ Object
48
49
50
|
# File 'lib/rtesseract/box.rb', line 48
# Extracts the word-confidence property from an hOCR element's title.
#
# The text between the first ';' and the next single quote is treated as a
# ';'-separated property list. The property whose name is +x_wconf+ is
# returned as whitespace-split tokens, so the last token is the confidence
# value even when other properties (for example +x_fsize+) follow it.
# When no +x_wconf+ property is present, all tokens of that text are
# returned unchanged.
#
# @param line [String] a single line of hOCR markup
# @return [Array<String>] e.g. ["x_wconf", "95"]; empty when nothing matches
def parse_confidence(line)
  properties = line.match(/(?<=;)(.*?)(?=')/).to_s
  wconf = properties.split(';').map(&:split).find { |tokens| tokens.first == 'x_wconf' }
  # Fall back to the raw tokens so input without x_wconf is handled as before.
  wconf || properties.split
end
|
.parse_line(line) ⇒ Object
23
24
25
26
27
28
29
30
31
|
# File 'lib/rtesseract/box.rb', line 23
# Builds a word entry from one line of hOCR markup.
#
# Lines that do not mention an +ocr_word+ / +ocrx_word+ class are ignored,
# as are lines whose text between the first '>' and the last '<' is blank.
#
# @param line [String] a single line of hOCR markup
# @return [Object, nil] the result of +word_info+, or nil when the line is skipped
def parse_line(line)
  return nil unless line.match?(/oc(rx|r)_word/)

  # The capture is greedy: it spans from the first '>' to the last '<'.
  text = line.to_s[/>(.*)</, 1].to_s
  return nil if text.strip.empty?

  word_info(text, parse_position(line), parse_confidence(line))
end
|
.parse_position(line) ⇒ Object
44
45
46
|
# File 'lib/rtesseract/box.rb', line 44
# Extracts the tokens of the first title property from an hOCR line.
#
# Takes the text that follows the first occurrence of "title" up to (not
# including) the next ';' and splits it on whitespace. The first token is
# whatever directly follows "title" (e.g. "='bbox"); the remaining tokens
# are the values that come after it.
#
# @param line [String] a single line of hOCR markup
# @return [Array<String>] whitespace-separated tokens; empty when nothing matches
def parse_position(line)
  title_segment = line.match(/(?<=title)(.*?)(?=;)/)
  title_segment.to_s.split
end
|
.run(source, errors, options) ⇒ Object
8
9
10
11
12
13
14
15
16
17
|
# File 'lib/rtesseract/box.rb', line 8
# Runs the command with hOCR output enabled and parses the resulting file.
#
# Adds +tessedit_create_hocr: 1+ to +options+ (without mutating the caller's
# object), runs an RTesseract::Command, reads "<output_path>.hocr", removes
# that file and returns the parsed content.
#
# @param source [Object] passed through to RTesseract::Command
# @param errors [Object] passed through to RTesseract::Command
# @param options [#merge] base options; presumably a Hash - confirm with callers
# @return [Object] whatever RTesseract::Command#run returns for the block
def run(source, errors, options)
  hocr_options = options.merge(tessedit_create_hocr: 1)
  command = RTesseract::Command.new(source, temp_file_path, errors, hocr_options)

  command.run do |output_path|
    hocr_file = "#{output_path}.hocr"
    markup = File.read(hocr_file)
    # The file is removed before parsing, so the content is held in memory.
    remove_tmp_file(hocr_file)
    parse(markup)
  end
end
|
.word_info(word, positions, confidence) ⇒ Object
33
34
35
36
37
38
39
40
41
42
|
# File 'lib/rtesseract/box.rb', line 33
# Assembles the hash describing one recognised word.
#
# Elements 1..4 of +positions+ become x_start, y_start, x_end and y_end;
# the last element of +confidence+ becomes the confidence. All of them are
# converted with +to_i+, so missing or non-numeric entries become 0.
#
# @param word [String] the recognised text
# @param positions [Array<String>] tokens as returned by +parse_position+
# @param confidence [Array<String>] tokens as returned by +parse_confidence+
# @return [Hash] keys :word, :confidence, :x_start, :y_start, :x_end, :y_end
def word_info(word, positions, confidence)
  coordinates = positions.values_at(1, 2, 3, 4).map(&:to_i)
  bounding_box = %i[x_start y_start x_end y_end].zip(coordinates).to_h

  { word: word, confidence: confidence.last.to_i }.merge(bounding_box)
end
|