Class: OcrFile::TextEngines::ResultProcessor

Inherits:
Object
  • Object
show all
Defined in:
lib/ocr-file/text_engines/result_processor.rb

Constant Summary collapse

MINIMUM_WORD_LENGTH =
4
ACCEPTABLE_NUMBER_OF_ERRORS =

Arbitrary threshold; it was chosen by guesswork, not derived from measurement.

8
ACCEPTABLE_UNIDENTIFIED_WORDS =

Arbitrary threshold; it was chosen by guesswork, not derived from measurement.

8
ASCII_ONLY =

Regular expression matching any character outside the ASCII range (U+0000–U+007F).

/[^\u{0000}-\u{007f}]/
NOISE_CHARACTERS =
/[^\w\s\/-;:]/
DUPLICATE_WORDS =
/\b(\w+)\s+\1\b/
EVERYTHING_BUT_CHARACTERS =
/[^\w\s]|(\d)/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text) ⇒ ResultProcessor

Returns a new instance of ResultProcessor.



16
17
18
19
# File 'lib/ocr-file/text_engines/result_processor.rb', line 16

# Builds a processor around the raw OCR output.
#
# Stores +text+ untouched in @text, then derives @clear_text as the first
# truthy value among: the result of generate_clear_text, the raw text, and
# the empty string.
#
# @param text [String, nil] raw text to process (nil is tolerated here: the
#   fallback chain ends in '')
def initialize(text)
  @text = text
  cleaned = generate_clear_text
  @clear_text = cleaned || text || ''
end

Instance Attribute Details

#clear_text ⇒ Object (readonly)

Returns the value of attribute clear_text.



14
15
16
# File 'lib/ocr-file/text_engines/result_processor.rb', line 14

# Reader for @clear_text, which the constructor sets to the first truthy
# value among generate_clear_text, the raw text, and ''.
#
# @return [Object] the stored value (presumably a String — confirm against
#   generate_clear_text)
def clear_text
  @clear_text
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



14
15
16
# File 'lib/ocr-file/text_engines/result_processor.rb', line 14

# Reader for @text: the value handed to the constructor, stored unmodified.
#
# @return [Object] whatever was passed to #initialize (may be nil)
def text
  @text
end

Instance Method Details

#correct ⇒ Object



21
22
23
# File 'lib/ocr-file/text_engines/result_processor.rb', line 21

# Produces a spell-corrected version of the raw text.
#
# Steps: strip every character matched by NOISE_CHARACTERS, pass the result
# through Spellchecker.correct, remove the single space that follows a
# newline, and trim surrounding whitespace.
#
# The raw text is coerced with +to_s+ first, so a nil +text+ (which the
# constructor tolerates) is treated as an empty string instead of raising
# NoMethodError.
#
# @return [String] the corrected text (assumes Spellchecker.correct returns a
#   String — the chained gsub/strip rely on it)
def correct
  denoised = text.to_s.gsub(NOISE_CHARACTERS, '')
  Spellchecker.correct(denoised).gsub("\n ", "\n").strip
end

#count_of_issues ⇒ Object



56
57
58
# File 'lib/ocr-file/text_engines/result_processor.rb', line 56

# Total number of problems detected in the text: the spelling-error count
# plus the unidentified-word count.
#
# @return [Integer] sum of spelling_error_count and unidentified_word_count
def count_of_issues
  misspelled = spelling_error_count
  unrecognised = unidentified_word_count
  misspelled + unrecognised
end

#invalid_words? ⇒ Boolean

Returns:

  • (Boolean)


33
34
35
# File 'lib/ocr-file/text_engines/result_processor.rb', line 33

# Logical negation of valid_words?.
#
# @return [Boolean] true when valid_words? is false, false otherwise
def invalid_words?
  !valid_words?
end

#spelling_error_count ⇒ Object



52
53
54
# File 'lib/ocr-file/text_engines/result_processor.rb', line 52

# Number of entries Spellchecker.check reports for the cleaned text.
#
# @return [Integer] count of items returned by Spellchecker.check(clear_text)
def spelling_error_count
  findings = Spellchecker.check(clear_text)
  findings.count
end

#unidentified_word_count ⇒ Object

Assumes the text is English: words are looked up in the English dictionary only.



48
49
50
# File 'lib/ocr-file/text_engines/result_processor.rb', line 48

# Counts the words from clear_words that are absent from
# Spellchecker::Dictionaries::EnglishWords. Only the English dictionary is
# consulted.
#
# @return [Integer] number of words not found in the English word list
def unidentified_word_count
  clear_words.count do |candidate|
    !Spellchecker::Dictionaries::EnglishWords.include?(candidate)
  end
end

#valid_words? ⇒ Boolean

This is a very naive way of determining if we should re-do OCR with shifted options

Returns:

  • (Boolean)


27
28
29
30
31
# File 'lib/ocr-file/text_engines/result_processor.rb', line 27

# Naive quality heuristic for an OCR result. The text passes only when all
# three hold:
#   * the average word size is at least MINIMUM_WORD_LENGTH,
#   * the spelling-error count is at most ACCEPTABLE_NUMBER_OF_ERRORS,
#   * the unidentified-word count is at most ACCEPTABLE_UNIDENTIFIED_WORDS.
# Checks run in that order and stop at the first failure.
#
# @return [Boolean] true when every threshold is satisfied
def valid_words?
  return false if word_size_average < MINIMUM_WORD_LENGTH
  return false if spelling_error_count > ACCEPTABLE_NUMBER_OF_ERRORS

  unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
end

#word_count ⇒ Object



37
38
39
40
# File 'lib/ocr-file/text_engines/result_processor.rb', line 37

# Number of words in the cleaned text.
#
# Returns 0 without touching the cache when empty_text? is true; otherwise
# the size of clear_words is computed once and memoised in @_word_count.
#
# @return [Integer] word count, or 0 for empty text
def word_count
  if empty_text?
    0
  else
    @_word_count ||= clear_words.size
  end
end

#word_size_average ⇒ Object



42
43
44
45
# File 'lib/ocr-file/text_engines/result_processor.rb', line 42

# Average word length (integer division, so the result is truncated).
#
# Returns 0 when empty_text? is true. Also returns 0 when the text is not
# considered empty but word_count is 0 — without this guard the division
# below would raise ZeroDivisionError. The computed value is memoised in
# @_word_size_average.
#
# @return [Integer] total characters across clear_words divided by
#   word_count, or 0 when there are no words
def word_size_average
  return 0 if empty_text?

  @_word_size_average ||= begin
    total_words = word_count
    # No words to average over: report 0 rather than dividing by zero.
    total_words.zero? ? 0 : clear_words.sum(&:size) / total_words
  end
end