Class: IiifPrint::TextExtraction::PageOCR
- Inherits:
-
Object
- Object
- IiifPrint::TextExtraction::PageOCR
- Defined in:
- lib/iiif_print/text_extraction/page_ocr.rb
Instance Attribute Summary collapse
-
#html ⇒ Object
Returns the value of attribute html.
-
#path ⇒ Object
Returns the value of attribute path.
Instance Method Summary collapse
- #alto ⇒ Object
- #height ⇒ Object
- #identify ⇒ Object
-
#initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options) ⇒ PageOCR
constructor
A new instance of PageOCR.
- #load_words ⇒ Object
- #plain ⇒ Object
- #run_ocr ⇒ Object
- #width ⇒ Object
- #word_json ⇒ Object
- #words ⇒ Object
Constructor Details
#initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options) ⇒ PageOCR
Returns a new instance of PageOCR.
12 13 14 15 16 17 18 19 20 21 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 12 def initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options) @path = path # hOCR html: @html = nil @words = nil @source_meta = nil @box = nil @plain = nil @additional_tesseract_options = additional_tesseract_options end |
Instance Attribute Details
#html ⇒ Object
Returns the value of attribute html.
10 11 12 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 10 def html @html end |
#path ⇒ Object
Returns the value of attribute path.
10 11 12 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 10 def path @path end |
Instance Method Details
#alto ⇒ Object
71 72 73 74 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 71 def alto writer = IiifPrint::TextExtraction::RenderAlto.new(width, height) writer.to_alto(words) end |
#height ⇒ Object
67 68 69 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 67 def height identify[:height] end |
#identify ⇒ Object
58 59 60 61 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 58 def identify return @source_meta unless @source_meta.nil? @source_meta = IiifPrint::ImageTool.new(@path).metadata end |
#load_words ⇒ Object
32 33 34 35 36 37 38 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 32 def load_words preprocess_image html_path = run_ocr reader = IiifPrint::TextExtraction::HOCRReader.new(html_path) @words = reader.words @plain = reader.text end |
#plain ⇒ Object
53 54 55 56 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 53 def plain load_words if @plain.nil? @plain end |
#run_ocr ⇒ Object
23 24 25 26 27 28 29 30 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 23 def run_ocr outfile = File.join(Dir.mktmpdir, 'output_html') cmd = "OMP_THREAD_LIMIT=1 tesseract #{path} #{outfile}" cmd += " #{@additional_tesseract_options}" if @additional_tesseract_options.present? cmd += " hocr" `#{cmd}` outfile + '.hocr' end |
#width ⇒ Object
63 64 65 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 63 def width identify[:width] end |
#word_json ⇒ Object
45 46 47 48 49 50 51 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 45 def word_json IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for( words: words, width: width, height: height ) end |
#words ⇒ Object
40 41 42 43 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 40 def words load_words if @words.nil? @words end |