Class: OCRFile

Inherits:
Object
  • Object
show all
Defined in:
lib/ocrfile.rb

Instance Method Summary collapse

Constructor Details

#initialize(file, input_dir, output_dir, rel_path, tika) ⇒ OCRFile

Returns a new instance of OCRFile.



7
8
9
10
11
12
13
14
# File 'lib/ocrfile.rb', line 7

# Stores the settings for one input document and starts with no extracted text.
#
# @param file [String] path of the document to extract text from
# @param input_dir [Object] kept in @input_dir
# @param output_dir [Object] kept in @output_dir
# @param rel_path [Object] kept in @rel_path
# @param tika [Object, nil] kept in @tika
def initialize(file, input_dir, output_dir, rel_path, tika)
  @path, @input_dir, @output_dir = file, input_dir, output_dir
  @rel_path, @tika = rel_path, tika
  # Extracted text accumulates here; empty until an extraction method runs.
  @text = ''
end

Instance Method Details

#give_me_text ⇒ Object

Send the file to the hosted Give Me Text service for text extraction



49
50
51
52
53
54
55
56
# File 'lib/ocrfile.rb', line 49

# Posts the file at @path as a multipart form upload to the hosted
# Give Me Text endpoint, keeps the response body in @text, and then runs
# the extraction check on it.
#
# @return [Object] the result of gotten_text_ok? for the response body
def give_me_text
  request = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
  request.multipart_form_post = true

  upload_field = Curl::PostField.file('file', @path)
  request.http_post(upload_field)

  response_body = request.body_str
  @text = response_body
  gotten_text_ok?(@text)
end

#give_me_text_local ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/ocrfile.rb', line 58

# Sends the file at @path with an HTTP PUT to the "/tika" path of the server
# whose base URL is in @tika, asking for a plain-text response. The response
# body is stored in @text and then passed to the extraction check.
#
# @return [Object] the result of gotten_text_ok? for the response body
def give_me_text_local
  request = Curl::Easy.new(@tika + "/tika")
  # TODO: move this mime filtering to a higher global level
  detected = MimeMagic.by_path(@path)
  # The lookup is extension based and may find nothing (presumably nil for
  # unknown extensions -- confirm against MimeMagic); fall back to a generic
  # binary type instead of calling #type on nil.
  request.headers['Content-Type'] = detected ? detected.type : 'application/octet-stream'
  request.headers['Accept'] = "text/plain"
  # Read in binary mode so the uploaded bytes are exactly the file's bytes.
  request.http_put(File.binread(@path))

  @text = request.body_str
  gotten_text_ok?(@text)
end

#gotten_text_ok?(text) ⇒ Boolean

Checks if text was successfully extracted

Returns:

  • (Boolean)


74
75
76
# File 'lib/ocrfile.rb', line 74

# Checks whether a server response looks like successfully extracted text.
#
# A response containing the "java.io.IOException: Stream Closed" marker is
# treated as a failed extraction and signalled with `throw :extraction_error`,
# so callers that want to handle it need a matching `catch`.
#
# @param text [String] response body to inspect
# @return [Boolean] true when the failure marker is absent
def gotten_text_ok?(text)
  throw :extraction_error if text.include?("java.io.IOException: Stream Closed")

  true
end

#is_pdf? ⇒ Boolean

Check if file is pdf

Returns:

  • (Boolean)


38
39
40
41
# File 'lib/ocrfile.rb', line 38

# Checks whether the file at @path starts with a PDF header such as "%PDF-1.4".
#
# Only the first 8 bytes are read, in binary mode, so arbitrary binary files
# can be inspected without encoding errors. An empty file is reported as not
# being a PDF.
#
# @return [Boolean] true when the file begins with a PDF version header
def is_pdf?
  header = File.binread(@path, 8)
  # binread yields nil when there is nothing to read (empty file).
  return false if header.nil?

  header.match?(/\A%PDF-\d+\.?\d+/)
end

#load_extracted_text(file) ⇒ Object

Load text that is already extracted



44
45
46
# File 'lib/ocrfile.rb', line 44

# Reads a previously written JSON file and takes its "text" entry as the
# extracted text for this document.
#
# @param file [String] path of the JSON file to read
# @return [Object] the value stored under "text" (nil when the key is absent)
def load_extracted_text(file)
  raw_json = File.read(file)
  document = JSON.parse(raw_json)
  @text = document["text"]
end

#ocr ⇒ Object

OCR file



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/ocrfile.rb', line 17

# Produces the text for this file, choosing the extraction route in order:
#
# 1. reuse "<output_dir><rel_path>.json" when that file already exists,
# 2. run the PDF pipeline when the path contains ".pdf",
# 3. otherwise send the file to the Tika server in @tika when one is
#    configured, or to the hosted service when it is not.
#
# Extraction is best effort: an error raised along the way is reported on
# standard error and whatever text has been collected so far is returned.
#
# @return [String] the extracted text held in @text
def ocr
  begin
    cached_json = @output_dir + @rel_path + ".json"

    if File.exist?(cached_json)
      load_extracted_text(cached_json)
    elsif @path.include?(".pdf")
      ocr_pdf
    elsif @tika
      give_me_text_local
    else
      give_me_text
    end
  rescue StandardError => e
    # Report and carry on rather than dropping into an interactive debugger,
    # so unattended runs are not halted by one bad file.
    warn "OCRFile: text extraction failed for #{@path}: #{e.class}: #{e.message}"
  end

  @text
end

#ocr_pdf ⇒ Object

OCR with tesseract



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/ocrfile.rb', line 79

# Extracts text from the PDF at @path page by page and appends it to @text.
#
# Works in two scratch directories, "pages" and "text", under the current
# working directory: the PDF is split into per-page files, the page files are
# renamed so their names contain no spaces or parentheses, text is extracted
# from each page, and the per-page text files are appended in numeric order.
#
# Both scratch directories are removed when the method finishes, including
# when a step raises, so leftovers from a failed run cannot be picked up by
# the name-prefix globs of a later run.
def ocr_pdf
  base = Dir.pwd + "/"
  pages_dir = base + 'pages'
  text_dir = base + 'text'
  # Replaces spaces with underscores and drops parentheses.
  sanitize = ->(name) { name.tr(' ', '_').delete('()') }

  begin
    # Split pages to handle large PDFs
    Docsplit.extract_pages(@path, :output => pages_dir)
    filename = @path.split("/").last.gsub(".pdf", "")

    # Rename pages so that names with spaces or parentheses can be processed
    Dir[pages_dir + '/' + filename + '*'].each do |page|
      File.rename(page, pages_dir + '/' + sanitize.call(page.split("/").last))
    end
    filename = sanitize.call(filename)
    renamed_pages = Dir[pages_dir + '/' + filename + '*']

    # Extract text and save
    Docsplit.extract_text(renamed_pages, :output => text_dir)
    text_files = Dir[text_dir + '/' + filename + '*']
    # Order by the number left after the file name prefix, so that page 10
    # follows page 9 instead of page 1.
    ordered = text_files.sort_by do |f|
      f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i
    end
    ordered.each { |f| @text += File.read(f) }
  ensure
    # Clean up
    FileUtils.rm_rf(pages_dir)
    FileUtils.rm_rf(text_dir)
  end
end