Class: OCRFile

Inherits:
Object
  • Object
show all
Defined in:
lib/ocrfile.rb

Instance Method Summary collapse

Constructor Details

#initialize(file, input_dir, output_dir, rel_path, tika) ⇒ OCRFile

Returns a new instance of OCRFile.



7
8
9
10
11
12
13
14
# File 'lib/ocrfile.rb', line 7

# Sets up the state used by the extraction methods of this object.
#
# @param file [String] path of the document to read; stored in @path
# @param input_dir [Object] stored in @input_dir (not read by the methods shown here)
# @param output_dir [String] prefix joined with rel_path to locate a cached ".json" result
# @param rel_path [String] path appended to output_dir when looking for the cached result
# @param tika [String, nil] prefix to which "/tika" is appended for local extraction;
#   when falsy, #ocr reads the file directly instead
def initialize(file, input_dir, output_dir, rel_path, tika)
  @path, @input_dir, @output_dir = file, input_dir, output_dir
  @rel_path, @tika = rel_path, tika
  # Extracted text accumulates here; starts out empty.
  @text = ""
end

Instance Method Details

#give_me_text ⇒ Object

Send file to give me text



52
53
54
55
56
57
58
59
60
61
# File 'lib/ocrfile.rb', line 52

# Posts the file at @path as a multipart form upload to the hosted
# givemetext endpoint, stores the response body in @text, and then runs
# gotten_text_ok? on it to detect a failed extraction.
#
# @return [Object] whatever gotten_text_ok? returns for the response body
def give_me_text
  puts "using: give_me_text"

  request = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
  request.multipart_form_post = true
  upload = Curl::PostField.file('file', @path)
  request.http_post(upload)

  @text = request.body_str
  gotten_text_ok?(@text)
end

#give_me_text_local ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/ocrfile.rb', line 63

# Extracts text by PUT-ing the raw contents of @path to the "/tika" endpoint
# under the @tika prefix, asking for a plain-text response. The response body
# is stored in @text and then checked with gotten_text_ok?.
#
# @return [Object] whatever gotten_text_ok? returns for the response body
def give_me_text_local
  puts "using: give_me_text_local"
  c = Curl::Easy.new(@tika + "/tika")
  # TODO: move this mime filtering to a higher global level
  mime_magic = MimeMagic.by_path(@path)
  # by_path yields nil when the extension is not recognised; send a generic
  # binary type in that case instead of failing on nil.type.
  content_type = mime_magic ? mime_magic.type : "application/octet-stream"
  c.headers['Content-Type'] = content_type
  c.headers['Accept'] = "text/plain"
  # The upload is arbitrary document bytes, so read it in binary mode.
  c.http_put(File.binread(@path))

  @text = c.body_str
  gotten_text_ok?(@text)
end

#gotten_text_ok?(text) ⇒ Boolean

Checks if text was successfully extracted

Returns:

  • (Boolean)


79
80
81
# File 'lib/ocrfile.rb', line 79

# Looks for the stream-closed Java exception marker in the extracted text
# and signals a failed extraction by throwing :extraction_error.
#
# @param text [String] the text returned by the extraction service
# @return [nil] when the marker is absent
def gotten_text_ok?(text)
  failure_marker = "java.io.IOException: Stream Closed"
  return unless text.include?(failure_marker)

  throw :extraction_error
end

#is_pdf? ⇒ Boolean

Check if file is pdf

Returns:

  • (Boolean)


39
40
41
42
43
# File 'lib/ocrfile.rb', line 39

# Checks whether the file at @path looks like a PDF by matching a
# "%PDF-<version>" header within its first 8 bytes.
#
# @return [Boolean] true when the header pattern is found, false otherwise
#   (including for an empty file)
def is_pdf?
  puts "determined: is_pdf"
  file_start = File.open(@path, 'rb') { |f| f.read(8) }
  # read(8) returns nil at end of file (e.g. an empty file); to_s turns
  # that into "" so the match simply fails instead of raising.
  !!(file_start.to_s =~ /\%PDF-\d+\.?\d+/)
end

#load_extracted_text(file) ⇒ Object

Load text that is already extracted



46
47
48
49
# File 'lib/ocrfile.rb', line 46

# Reads a previously saved JSON result and takes its "text" entry as the
# extracted text, storing it in @text.
#
# @param file [String] path of the JSON file to read
# @return [Object] the value stored under "text" (nil when the key is absent)
def load_extracted_text(file)
  puts "file exists: load_extracted_text"
  raw_json = File.read(file)
  parsed = JSON.parse(raw_json)
  @text = parsed["text"]
end

#ocr ⇒ Object

OCR file



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/ocrfile.rb', line 17

# Produces the text for this file, choosing the first applicable source:
#   1. a cached "<output_dir><rel_path>.json" result, if that file exists;
#   2. extraction through give_me_text_local when @tika is set;
#   3. otherwise the raw contents of @path.
# ocr_pdf and give_me_text are not dispatched from here at present.
#
# Extraction is best-effort: any StandardError is reported on stderr and
# the method still returns whatever @text currently holds.
#
# @return [Object] the current value of @text
def ocr
  cached_json = @output_dir + @rel_path + ".json"

  begin
    if File.exist?(cached_json)
      load_extracted_text(cached_json)
    elsif @tika
      give_me_text_local
    else
      @text = File.read(@path)
    end
  rescue StandardError => e
    # Keep going on failure, but leave a trace instead of hiding the error.
    warn "OCRFile: text extraction failed for #{@path}: #{e.class}: #{e.message}"
  end

  @text
end

#ocr_pdf ⇒ Object

OCR with tesseract



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/ocrfile.rb', line 84

# Extracts text from the PDF at @path page by page with Docsplit and appends
# it, in page order, to @text.
#
# Pages are written to "<cwd>/pages" and their text to "<cwd>/text"; both
# scratch directories are emptied and removed afterwards, even when an
# extraction step raises.
#
# @return [String] the accumulated @text
def ocr_pdf
  puts "using: ocr_pdf"
  base = Dir.pwd + "/"
  pages_dir = base + 'pages'
  text_dir = base + 'text'
  # Spaces become underscores and parentheses are dropped so the per-page
  # files can be processed without quoting problems.
  sanitize = lambda { |name| name.gsub(" ", "_").gsub("(", "").gsub(")", "") }

  begin
    # Split pages to handle large PDFs
    Docsplit.extract_pages(@path, :output => pages_dir)
    # Strip only the trailing extension; removing every ".pdf" substring
    # would mangle names that contain ".pdf" elsewhere.
    # NOTE(review): assumes Docsplit names pages after the basename without
    # its extension -- confirm against the Docsplit version in use.
    filename = File.basename(@path, File.extname(@path))

    # Rename pages so that they can be processed with spaces
    Dir[pages_dir + '/' + filename + '*'].each do |page|
      File.rename(page, pages_dir + '/' + sanitize.call(File.basename(page)))
    end
    filename = sanitize.call(filename)
    docs_no_spaces = Dir[pages_dir + '/' + filename + '*']

    # Extract text and save
    Docsplit.extract_text(docs_no_spaces, :output => text_dir)
    text_files = Dir[text_dir + '/' + filename + '*']
    # Order by the numeric page suffix that follows the file name.
    sorted_text = text_files.sort_by do |f|
      f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i
    end
    @text += sorted_text.map { |f| File.read(f) }.join
  ensure
    # Clean up, whether or not extraction succeeded
    [pages_dir, text_dir].each do |dir|
      FileUtils.rm_f Dir.glob(dir + "/*")
      Dir.delete(dir) if File.directory?(dir)
    end
  end
end