Class: IiifPrint::TextExtraction::PageOCR

Inherits:
Object
  • Object
show all
Defined in:
lib/iiif_print/text_extraction/page_ocr.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options) ⇒ PageOCR

Returns a new instance of PageOCR.



12
13
14
15
16
17
18
19
20
21
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 12

# Builds a PageOCR for a single source image.
#
# Only stores its arguments and resets the cached results to nil; no OCR or
# image inspection happens here.
#
# @param path the source image location, later interpolated into the
#   tesseract command line and handed to IiifPrint::ImageTool
# @param additional_tesseract_options extra command-line text appended to the
#   tesseract invocation; defaults to the value from IiifPrint.config
def initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options)
  @path = path
  # Cached results start out nil, meaning "not computed yet".
  # @html is the slot for the hOCR html.
  @html = @words = @source_meta = @box = @plain = nil
  @additional_tesseract_options = additional_tesseract_options
end

Instance Attribute Details

#html ⇒ Object

Returns the value of attribute html.



10
11
12
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 10

# Reader for the hOCR html slot.
#
# Returns @html exactly as stored. The constructor sets it to nil, and unlike
# #words and #plain this reader does not trigger #load_words.
#
# @return [Object, nil] the current value of @html
def html
  @html
end

#path ⇒ Object

Returns the value of attribute path.



10
11
12
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 10

# Reader for the source image path.
#
# Returns @path, which the constructor sets from its `path` argument.
#
# @return [Object] the current value of @path
def path
  @path
end

Instance Method Details

#alto ⇒ Object



71
72
73
74
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 71

# Renders the page's words through RenderAlto.
#
# A RenderAlto writer is created with this page's width and height, then
# asked to convert #words via #to_alto.
#
# @return [Object] whatever RenderAlto#to_alto returns for the page's words
def alto
  IiifPrint::TextExtraction::RenderAlto
    .new(width, height)
    .to_alto(words)
end

#height ⇒ Object



67
68
69
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 67

# Height of the source image.
#
# @return [Object] the :height entry of the metadata returned by #identify
def height
  source_metadata = identify
  source_metadata[:height]
end

#identify ⇒ Object



58
59
60
61
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 58

# Returns image metadata for the source file, computing it at most once.
#
# The first call builds an IiifPrint::ImageTool for @path, asks it for its
# metadata and caches the result in @source_meta; later calls return the
# cached value. #width and #height index the result with :width and :height.
#
# NOTE(review): the previous last expression ended in a dangling `.` with no
# method name and did not parse. `metadata` is assumed to be the intended
# ImageTool call -- confirm against IiifPrint::ImageTool.
#
# @return [Object] cached metadata, indexable with :width and :height
def identify
  return @source_meta unless @source_meta.nil?
  @source_meta = IiifPrint::ImageTool.new(@path).metadata
end

#load_words ⇒ Object



32
33
34
35
36
37
38
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 32

# Runs the OCR pipeline and caches its results on the instance.
#
# Steps, in order: preprocess the image, run OCR to get the hOCR file path,
# parse that file with HOCRReader, then store the reader's words in @words
# and its text in @plain.
#
# @return [Object] the value assigned to @plain (the reader's text)
def load_words
  preprocess_image
  hocr_reader = IiifPrint::TextExtraction::HOCRReader.new(run_ocr)
  @words = hocr_reader.words
  @plain = hocr_reader.text
end

#plain ⇒ Object



53
54
55
56
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 53

# Plain text of the page, loaded on first use.
#
# If @plain is still nil, #load_words is called to populate it; any non-nil
# cached value is returned without re-running OCR.
#
# @return [Object, nil] the value of @plain after any load
def plain
  return @plain unless @plain.nil?
  load_words
  @plain
end

#run_ocr ⇒ Object



23
24
25
26
27
28
29
30
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 23

# Runs tesseract on the source image and returns the hOCR output path.
#
# The command is executed through the shell with OMP_THREAD_LIMIT=1. The
# image path and the output base name are shell-escaped so that spaces or
# shell metacharacters in a file name cannot split or alter the command.
# @additional_tesseract_options is appended verbatim, because it may carry
# several space-separated flags.
#
# The output goes into a fresh directory from Dir.mktmpdir. That directory is
# not removed here, since the caller still needs the returned file.
# The command's exit status is not checked.
#
# @return [String] the output base path with '.hocr' appended
def run_ocr
  # Required here because the file-level requires are not visible from this block.
  require 'shellwords'

  outfile = File.join(Dir.mktmpdir, 'output_html')
  cmd = "OMP_THREAD_LIMIT=1 tesseract #{Shellwords.escape(path)} #{Shellwords.escape(outfile)}"
  cmd += " #{@additional_tesseract_options}" if @additional_tesseract_options.present?
  cmd += " hocr"
  `#{cmd}`
  outfile + '.hocr'
end

#width ⇒ Object



63
64
65
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 63

# Width of the source image.
#
# @return [Object] the :width entry of the metadata returned by #identify
def width
  source_metadata = identify
  source_metadata[:width]
end

#word_json ⇒ Object



45
46
47
48
49
50
51
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 45

# Builds word-coordinate JSON for the page.
#
# Passes the page's words, width and height to
# WordCoordsBuilder.json_coordinates_for.
#
# @return [Object] whatever WordCoordsBuilder.json_coordinates_for returns
def word_json
  page_words = words
  page_width = width
  page_height = height
  builder = IiifPrint::TextExtraction::WordCoordsBuilder
  builder.json_coordinates_for(words: page_words, width: page_width, height: page_height)
end

#words ⇒ Object



40
41
42
43
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 40

# Words recognised on the page, loaded on first use.
#
# If @words is still nil, #load_words is called to populate it; any non-nil
# cached value is returned without re-running OCR.
#
# @return [Object, nil] the value of @words after any load
def words
  return @words unless @words.nil?
  load_words
  @words
end