Class: OCRFile

Inherits:

Object

Object
OCRFile

show all

Defined in: lib/ocrfile.rb

Instance Method Summary collapse

#give_me_text ⇒ Object

Sends the file to the Give Me Text web service (givemetext.okfnlabs.org) for text extraction.
#give_me_text_local ⇒ Object
#gotten_text_ok?(text) ⇒ Boolean

Checks if text was successfully extracted.
#initialize(file, input_dir, output_dir, rel_path, tika) ⇒ OCRFile constructor

A new instance of OCRFile.
#is_pdf? ⇒ Boolean

Check if file is pdf.
#load_extracted_text(file) ⇒ Object

Load text that is already extracted.
#ocr ⇒ Object

OCR file.
#ocr_pdf ⇒ Object

OCR with tesseract.

Constructor Details

#initialize(file, input_dir, output_dir, rel_path, tika) ⇒ `OCRFile`

Returns a new instance of OCRFile.

# File 'lib/ocrfile.rb', line 7

# Builds an OCRFile for a single input file.
#
# @param file       stored as @path; the file whose text will be extracted
# @param input_dir  stored as @input_dir
# @param output_dir stored as @output_dir; prefix of the cached ".json" path
# @param rel_path   stored as @rel_path; appended to output_dir for the cache path
# @param tika       stored as @tika; when truthy, #ocr uses it as the base URL
#                   of a Tika server instead of the hosted service
def initialize(file, input_dir, output_dir, rel_path, tika)
  # Extracted text accumulates here; it starts out empty.
  @text = ""

  # Extraction backend selection.
  @tika = tika

  # Where the file lives and where its cached result would be.
  @rel_path = rel_path
  @output_dir = output_dir
  @input_dir = input_dir
  @path = file
end

Instance Method Details

#give_me_text ⇒ `Object`

Sends the file to the Give Me Text web service (givemetext.okfnlabs.org) for text extraction.

# File 'lib/ocrfile.rb', line 49

# Uploads the file at @path to the hosted Give Me Text endpoint as a
# multipart form post and stores the response body in @text.
#
# @return whatever #gotten_text_ok? returns for the response body
def give_me_text
  upload = Curl::PostField.file('file', @path)

  request = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
  request.multipart_form_post = true
  request.http_post(upload)

  # Keep the raw response, then let the checker flag extraction failures.
  @text = request.body_str
  gotten_text_ok?(@text)
end

#give_me_text_local ⇒ `Object`

# File 'lib/ocrfile.rb', line 58

# Sends the file at @path to the Tika server whose base URL is @tika
# (PUT to "<@tika>/tika", asking for plain text) and stores the response
# body in @text.
#
# @return whatever #gotten_text_ok? returns for the response body
def give_me_text_local
  request = Curl::Easy.new(@tika + "/tika")

  # TODO: move this mime filtering to a higher global level
  mime_magic = MimeMagic.by_path(@path)
  # by_path yields nil when the extension is not recognised; fall back to the
  # generic binary type instead of crashing on nil.type.
  # NOTE(review): presumably the server then detects the type itself - confirm.
  content_type = mime_magic ? mime_magic.type : 'application/octet-stream'

  request.headers['Content-Type'] = content_type
  request.headers['Accept'] = "text/plain"

  # Read in binary mode so the uploaded bytes are exactly the bytes on disk.
  request.http_put(File.binread(@path))

  @text = request.body_str
  gotten_text_ok?(@text)
end

#gotten_text_ok?(text) ⇒ `Boolean`

Checks if text was successfully extracted

Returns:

(Boolean)



74
75
76

# File 'lib/ocrfile.rb', line 74

# Checks whether a service response looks like successfully extracted text.
#
# A response containing the "Stream Closed" Java IOException marker is
# treated as a failed extraction.
#
# @param text [String] response body returned by the extraction service
# @return [Boolean] true when the failure marker is absent
# @note On failure this throws :extraction_error rather than returning false,
#   so it only returns normally on success. With no enclosing
#   catch(:extraction_error) the throw surfaces as an UncaughtThrowError.
def gotten_text_ok?(text)
  throw :extraction_error if text.include?("java.io.IOException: Stream Closed")

  true
end

#is_pdf? ⇒ `Boolean`

Check if file is pdf

Returns:

(Boolean)

# File 'lib/ocrfile.rb', line 38

# Checks whether the file at @path starts with a PDF header ("%PDF-<version>").
#
# Only the first 8 bytes are inspected; the file extension is not used.
#
# @return [Boolean] true when a PDF header is found, false otherwise
#   (including for an empty file)
def is_pdf?
  # Binary mode: the header check must see the raw bytes.
  file_start = File.open(@path, 'rb') { |f| f.read(8) }

  # read(8) returns nil at EOF, i.e. for a zero-length file.
  return false if file_start.nil?

  !file_start.match(/\%PDF-\d+\.?\d+/).nil?
end

#load_extracted_text(file) ⇒ `Object`

Load text that is already extracted



44
45
46

# File 'lib/ocrfile.rb', line 44

# Loads previously extracted text from a JSON file.
#
# The file is parsed as JSON and the value under its "text" key is stored
# in @text.
#
# @param file path of the JSON file to read
# @return the value stored in @text (nil when there is no "text" key)
def load_extracted_text(file)
  raw_json = File.read(file)
  parsed = JSON.parse(raw_json)
  @text = parsed["text"]
end

#ocr ⇒ `Object`

OCR file

# File 'lib/ocrfile.rb', line 17

# Extracts the text of this file and returns it.
#
# Strategy, in order:
# 1. if "<@output_dir><@rel_path>.json" exists, load the cached text from it;
# 2. if @path contains ".pdf", run #ocr_pdf;
# 3. otherwise send the file to the Tika server at @tika when one is set,
#    or to the hosted service when it is not.
#
# Extraction is best-effort: any StandardError raised along the way is
# reported on stderr and whatever is in @text at that point is returned.
#
# @return the contents of @text
def ocr
  begin
    cached_json = @output_dir + @rel_path + ".json"

    if File.exist?(cached_json)
      load_extracted_text(cached_json)
    elsif @path.include?(".pdf")
      ocr_pdf
    elsif @tika
      give_me_text_local
    else
      give_me_text
    end
  rescue => e
    # Report the failure instead of dropping into a debugger session, which
    # would block (or fail outright when pry is not loaded).
    warn "OCRFile: text extraction failed for #{@path}: #{e.class}: #{e.message}"
  end

  @text
end

#ocr_pdf ⇒ `Object`

OCR with tesseract

# File 'lib/ocrfile.rb', line 79

# Extracts the text of the PDF at @path with Docsplit and appends it to @text.
#
# The PDF is split into pages under "<cwd>/pages", each page is converted to
# text under "<cwd>/text", and the page texts are appended to @text in page
# order. Both scratch directories are removed afterwards, including when a
# step raises.
#
# @return the updated @text
def ocr_pdf
  # Dir_paths
  base = Dir.pwd + "/"
  pages_dir = base + 'pages'
  text_dir = base + 'text'

  begin
    # Split pages to handle large PDFs
    Docsplit.extract_pages(@path, :output => pages_dir)

    # Strip only the trailing extension, not every ".pdf" inside the name.
    # NOTE(review): assumes Docsplit names page files after the basename
    # without its extension - confirm against the Docsplit version in use.
    filename = File.basename(@path, File.extname(@path))

    # Rename pages so that names with spaces or parentheses can be processed
    Dir[pages_dir + '/' + filename + '*'].each do |page|
      safe_name = File.basename(page).tr(" ", "_").delete("()")
      File.rename(page, pages_dir + '/' + safe_name)
    end
    filename = filename.tr(" ", "_").delete("()")
    docs_no_spaces = Dir[pages_dir + '/' + filename + '*']

    # Extract text and save
    Docsplit.extract_text(docs_no_spaces, :output => text_dir)
    text_files = Dir[text_dir + '/' + filename + '*']

    # Order numerically by the trailing page number so page 10 follows page 9.
    sorted_text = text_files.sort_by { |f| File.basename(f, ".txt")[/\d+\z/].to_i }
    @text += sorted_text.map { |f| File.read(f) }.join
  ensure
    # Clean up, even when splitting or extraction failed part-way.
    FileUtils.rm_rf([pages_dir, text_dir])
  end
end

Class: OCRFile

Instance Method Summary collapse

Constructor Details

#initialize(file, input_dir, output_dir, rel_path, tika) ⇒ OCRFile

Instance Method Details

#give_me_text ⇒ Object

#give_me_text_local ⇒ Object

#gotten_text_ok?(text) ⇒ Boolean

#is_pdf? ⇒ Boolean

#load_extracted_text(file) ⇒ Object

#ocr ⇒ Object

#ocr_pdf ⇒ Object