Class: OcrFile::TextEngines::ResultProcessor
- Inherits:
-
Object
- Object
- OcrFile::TextEngines::ResultProcessor
- Defined in:
- lib/ocr-file/text_engines/result_processor.rb
Constant Summary collapse
- MINIMUM_WORD_LENGTH =
4
- ACCEPTABLE_NUMBER_OF_ERRORS =
Maximum spelling-error count tolerated by #valid_words?; an arbitrary threshold that was not derived from measurements.
8
- ACCEPTABLE_UNIDENTIFIED_WORDS =
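Maximum number of words missing from the English dictionary tolerated by #valid_words?; an arbitrary threshold that was not derived from measurements.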
8
- ASCII_ONLY =
Regex matching any character outside the ASCII range (U+0000–U+007F).
/[^\u{0000}-\u{007f}]/
- NOISE_CHARACTERS =
/[^\w\s\/-;:]/
- DUPLICATE_WORDS =
/\b(\w+)\s+\1\b/
- EVERYTHING_BUT_CHARACTERS =
/[^\w\s]|(\d)/
Instance Attribute Summary collapse
-
#clear_text ⇒ Object
readonly
Returns the value of attribute clear_text.
-
#text ⇒ Object
readonly
Returns the value of attribute text.
Instance Method Summary collapse
- #correct ⇒ Object
- #count_of_issues ⇒ Object
-
#initialize(text) ⇒ ResultProcessor
constructor
A new instance of ResultProcessor.
- #invalid_words? ⇒ Boolean
- #spelling_error_count ⇒ Object
-
#unidentified_word_count ⇒ Object
Counts the words that are not found in the English dictionary (the text is assumed to be English).
-
#valid_words? ⇒ Boolean
This is a very naive way of determining if we should re-do OCR with shifted options.
- #word_count ⇒ Object
- #word_size_average ⇒ Object
Constructor Details
#initialize(text) ⇒ ResultProcessor
Returns a new instance of ResultProcessor.
16 17 18 19 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 16
#
# Wraps a piece of OCR output for later quality checks and correction.
#
# Stores the argument in @text, then sets @clear_text to the first truthy
# value among: the result of generate_clear_text (helper not shown in this
# listing), the raw text, and the empty string.
#
# @param text [Object] raw OCR output — presumably a String; the `|| ''`
#   fallback suggests nil may be passed, but whether generate_clear_text
#   tolerates nil cannot be seen here (TODO confirm)
def initialize(text)
  @text = text
  cleaned = generate_clear_text
  @clear_text = cleaned || text || ''
end
Instance Attribute Details
#clear_text ⇒ Object (readonly)
Returns the value of attribute clear_text.
14 15 16 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 14
#
# Read-only accessor for the @clear_text instance variable.
#
# @return [Object] the current value of @clear_text
def clear_text
  @clear_text
end
#text ⇒ Object (readonly)
Returns the value of attribute text.
14 15 16 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 14
#
# Read-only accessor for the @text instance variable.
#
# @return [Object] the current value of @text
def text
  @text
end
Instance Method Details
#correct ⇒ Object
21 22 23 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 21
#
# Produces a cleaned-up version of the raw text.
#
# Steps, in order:
#   1. delete every match of NOISE_CHARACTERS from +text+;
#   2. pass the result through Spellchecker.correct;
#   3. replace each "\n " (newline followed by a space) with "\n";
#   4. strip leading and trailing whitespace.
#
# NOTE(review): NOISE_CHARACTERS is /[^\w\s\/-;:]/. Inside the bracket the
# sequence `\/-;` reads as a range from "/" to ";", not as three literals,
# so a literal "-" appears to be treated as noise and removed in step 1 —
# confirm whether hyphens were meant to be kept.
#
# @return [Object] whatever Spellchecker.correct returns after the two
#   trailing String operations (presumably a String — TODO confirm)
def correct
  denoised = text.gsub(NOISE_CHARACTERS, '')
  corrected = Spellchecker.correct(denoised)
  corrected.gsub("\n ", "\n").strip
end
#count_of_issues ⇒ Object
56 57 58 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 56
#
# Total number of detected problems: the spelling-error count plus the
# unidentified-word count.
#
# @return [Integer] sum of spelling_error_count and unidentified_word_count
def count_of_issues
  [spelling_error_count, unidentified_word_count].sum
end
#invalid_words? ⇒ Boolean
33 34 35 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 33
#
# Logical negation of #valid_words?.
#
# @return [Boolean] false when valid_words? is truthy, true otherwise
def invalid_words?
  return false if valid_words?

  true
end
#spelling_error_count ⇒ Object
52 53 54 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 52
#
# Number of entries Spellchecker.check reports for +clear_text+.
#
# @return [Integer] `count` of the collection returned by Spellchecker.check
#   (presumably one entry per misspelling — confirm against Spellchecker)
def spelling_error_count
  findings = Spellchecker.check(clear_text)
  findings.count
end
#unidentified_word_count ⇒ Object
Counts the words that are not found in the English dictionary (the text is assumed to be English)
48 49 50 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 48
#
# Counts the entries of clear_words for which
# Spellchecker::Dictionaries::EnglishWords.include? is falsy.
# The text is assumed to be English.
#
# @return [Integer] number of words not found in the English word list
def unidentified_word_count
  clear_words.count do |word|
    !Spellchecker::Dictionaries::EnglishWords.include?(word)
  end
end
#valid_words? ⇒ Boolean
This is a very naive way of determining if we should re-do OCR with shifted options
27 28 29 30 31 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 27
#
# Very naive heuristic for deciding whether OCR should be re-run with
# shifted options. The text passes only when all three hold:
#   * word_size_average       >= MINIMUM_WORD_LENGTH
#   * spelling_error_count    <= ACCEPTABLE_NUMBER_OF_ERRORS
#   * unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
#
# The checks run in that order and stop at the first failure, so the later
# counts are not computed once an earlier check has failed.
#
# @return [Boolean] true when every check passes
def valid_words?
  return false if word_size_average < MINIMUM_WORD_LENGTH
  return false if spelling_error_count > ACCEPTABLE_NUMBER_OF_ERRORS

  unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
end
#word_count ⇒ Object
37 38 39 40 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 37
#
# Number of entries in clear_words, memoized in @_word_count.
# Returns 0 without touching the memo when empty_text? is truthy
# (empty_text? and clear_words are helpers not shown in this listing).
#
# @return [Integer] word count, or 0 for empty text
def word_count
  if empty_text?
    0
  else
    @_word_count ||= clear_words.size
  end
end
#word_size_average ⇒ Object
42 43 44 45 |
# File 'lib/ocr-file/text_engines/result_processor.rb', line 42
#
# Average size of the entries in clear_words, memoized in
# @_word_size_average.
#
# The sum of the sizes is divided by word_count; with Integer sizes this is
# integer division, so the result is truncated toward zero.
#
# Returns 0 when empty_text? is truthy, and also when word_count is zero.
# The second guard covers text that is not "empty" according to empty_text?
# but still yields no words (helper definitions are not shown in this
# listing); without it the division would raise ZeroDivisionError.
#
# @return [Integer] truncated average word size, or 0 when there are no words
def word_size_average
  return 0 if empty_text? || word_count.zero?

  @_word_size_average ||= clear_words.sum(&:size) / word_count
end