Class: NewspaperWorks::TextExtractionDerivativeService

Inherits:
NewspaperPageDerivativeService
Defined in:
app/services/newspaper_works/text_extraction_derivative_service.rb

Constant Summary

Constants inherited from NewspaperPageDerivativeService

NewspaperPageDerivativeService::TARGET_EXT

Instance Attribute Summary

Attributes inherited from NewspaperPageDerivativeService

#file_set, #master_format

Instance Method Summary

Methods inherited from NewspaperPageDerivativeService

#convert_cmd, #derivative_path_factory, #identify, #im_convert, #jp2_convert, #jp2_to_intermediate, #load_destpath, #mime_type, #one_bit?, #prepare_path, target_ext, #use_color?, #valid?

Constructor Details

#initialize(file_set) ⇒ TextExtractionDerivativeService

Returns a new instance of TextExtractionDerivativeService.



3
4
5
6
7
# File 'app/services/newspaper_works/text_extraction_derivative_service.rb', line 3

# Builds the service for a file set, with every derivative destination
# path starting out unset.
#
# @param file_set [Object] handed unchanged to the parent constructor
def initialize(file_set)
  super(file_set)
  # Destination paths are assigned later, in #create_derivatives_from_ocr.
  @alto_path = nil
  @txt_path = nil
  # Declared alongside its siblings: #write_json reads this variable, so
  # all three output paths are now initialized in one place.
  @json_path = nil
end

Instance Method Details

#cleanup_derivatives ⇒ Object



50
51
52
53
54
# File 'app/services/newspaper_works/text_extraction_derivative_service.rb', line 50

# Invokes the inherited cleanup once per derivative extension, in the
# order txt, xml, json.
#
# @return [Object] the result of the final ('json') parent call
def cleanup_derivatives
  # `map ... last` keeps the original return value: the last super result.
  %w[txt xml json].map { |extension| super(extension) }.last
end

#create_derivatives(src) ⇒ Object



9
10
11
12
13
14
15
# File 'app/services/newspaper_works/text_extraction_derivative_service.rb', line 9

# Creates derivatives for +src+, delegating to the ALTO-based service when
# that service reports a non-nil ALTO path and falling back to OCR
# otherwise.
#
# @param src [Object] source passed on to whichever path is taken
# @return [Object] result of the delegated call
def create_derivatives(src)
  alto_service = NewspaperWorks::TextFormatsFromALTOService.new(file_set)
  if alto_service.alto_path.nil?
    create_derivatives_from_ocr(src)
  else
    alto_service.create_derivatives(src)
  end
end

#create_derivatives_from_ocr(filename) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'app/services/newspaper_works/text_extraction_derivative_service.rb', line 17

# Runs OCR over +filename+ and writes the plain-text, ALTO and JSON
# outputs to the paths obtained from #prepare_path.
#
# @param filename [Object] source handed to PageOCR; also stored in
#   @source_path
# @return [Object] whatever #write_json returns
def create_derivatives_from_ocr(filename)
  @source_path = filename
  # Destinations are prepared one per format: ALTO as .xml, plain text as
  # .txt, flat JSON as .json.
  @alto_path = prepare_path('xml')
  @txt_path = prepare_path('txt')
  @json_path = prepare_path('json')
  # Per the original note, OCR runs once, on the first call to either
  # .alto or .plain.
  page_ocr = NewspaperWorks::TextExtraction::PageOCR.new(filename)
  write_plain_text(page_ocr.plain)
  write_alto(page_ocr.alto)
  write_json(page_ocr.word_json)
end

#write_alto(xml) ⇒ Object



32
33
34
35
36
# File 'app/services/newspaper_works/text_extraction_derivative_service.rb', line 32

# Writes +xml+ to the file at @alto_path, replacing any existing content.
#
# @param xml [String] content to write
# @return [Integer] number of bytes written
def write_alto(xml)
  File.write(@alto_path, xml)
end

#write_json(text) ⇒ Object



44
45
46
47
48
# File 'app/services/newspaper_works/text_extraction_derivative_service.rb', line 44

# Writes +text+ to the file at @json_path, replacing any existing content.
#
# @param text [String] content to write
# @return [Integer] number of bytes written
def write_json(text)
  File.write(@json_path, text)
end

#write_plain_text(text) ⇒ Object



38
39
40
41
42
# File 'app/services/newspaper_works/text_extraction_derivative_service.rb', line 38

# Writes +text+ to the file at @txt_path, replacing any existing content.
#
# @param text [String] content to write
# @return [Integer] number of bytes written
def write_plain_text(text)
  File.write(@txt_path, text)
end