Module: ScanDex
- Defined in:
- lib/scandex.rb,
lib/version.rb
Constant Summary collapse
- VERSION =
'0.1.1'
Class Method Summary collapse
- .convert ⇒ Object
- .convert_to_gray_scale(source, destination) ⇒ Object
- .db(store_path) ⇒ Object
- .doctor ⇒ Object
- .documents(store_path) ⇒ Object
- .gs ⇒ Object
- .has_document(store_path, name) ⇒ Object
-
.image_to_string(image, language = "eng") ⇒ Object
TODO: add orientation and language detection.
- .index(store_path, source, force = false) ⇒ Object
- .index_and_store(store_path, file, force = false) ⇒ Object
- .ocr(pages, language = "eng") ⇒ Object
- .search_documents(store_path, text) ⇒ Object
- .store_document(store_path, source, content) ⇒ Object
- .tesseract ⇒ Object
Class Method Details
.convert ⇒ Object
13 14 15 |
# File 'lib/scandex.rb', line 13
# Looks up the `convert` executable (reported as ImageMagick by `doctor`)
# by shelling out to `which`.
#
# @return [String] the output of `which convert` with surrounding whitespace
#   stripped; empty when `which` printed nothing
def self.convert
  located = %x(which convert)
  located.strip
end
.convert_to_gray_scale(source, destination) ⇒ Object
63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/scandex.rb', line 63
# Renders +source+ into 300 dpi, 8-bit grayscale JPEG pages named
# convert-0000.jpg, convert-0001.jpg, ... inside +destination+.
#
# The converter is invoked with an argument vector instead of a shell command
# line, so quotes, spaces, `$` or backticks in either path are passed through
# verbatim rather than being interpreted by a shell.
#
# @param source [String] path of the document or image to convert
# @param destination [String] directory that receives the page images
# @return [Array<String>] page image paths in page order, or an empty array
#   when the converter could not be run or exited with a failure status
def self.convert_to_gray_scale(source, destination)
  page_pattern = "#{destination}/convert-%04d.jpg"
  puts "Converting '#{File.basename(source)}'"

  converted = system(
    self.convert,
    '-density', '300',
    '-depth', '8',
    '-type', 'grayscale',
    source.to_s,
    page_pattern
  )

  unless converted
    puts "Failed to convert #{source}"
    return []
  end

  # Glob order is not guaranteed on every Ruby version; the zero-padded page
  # numbers make a plain sort equal to page order.
  Dir["#{destination}/convert-*.jpg"].sort
end
.db(store_path) ⇒ Object
91 92 93 94 95 96 97 98 99 100 101 |
# File 'lib/scandex.rb', line 91
# Opens the SQLite database stored as `.scandex.db` under +store_path+,
# creating the `documents` table when the database file did not exist yet.
#
# @param store_path [String, nil] directory holding the database; nil or an
#   empty string selects the home directory (`~/`)
# @return [SQLite3::Database] an open database handle
def self.db(store_path)
  store_path = '~/' if store_path.nil? || store_path.empty?
  filename = File.expand_path("#{store_path}/.scandex.db")

  # Must be checked before opening: SQLite3::Database.new creates the file.
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  migrate = !File.exist?(filename)

  database = SQLite3::Database.new(filename)
  if migrate
    puts "Creating DB"
    database.execute("CREATE TABLE documents (name VARCHAR(255), content TEXT, created TEXT, modified TEXT)")
  end
  database
end
.doctor ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/scandex.rb', line 17
# Checks that every external tool can be located, in the order
# convert, gs, tesseract. Stops at the first tool whose lookup returns an
# empty string and prints which package is missing.
#
# @return [Boolean] true when all three lookups returned a non-empty path,
#   false as soon as one of them is empty
def self.doctor
  required_tools = {
    convert: "ImageMagick is missing",
    gs: "GhostScript is missing",
    tesseract: "Tesseract is missing"
  }

  required_tools.each do |locator, complaint|
    next unless public_send(locator).empty?

    puts complaint
    return false
  end

  true
end
.documents(store_path) ⇒ Object
109 110 111 112 |
# File 'lib/scandex.rb', line 109
# Lists every row of the `documents` table.
#
# @param store_path [String, nil] directory holding the database (passed to `db`)
# @return the result of the SELECT: one row per document with its name,
#   created and modified columns
def self.documents(store_path)
  self.db(store_path).execute("SELECT name, created, modified FROM documents")
end
.gs ⇒ Object
9 10 11 |
# File 'lib/scandex.rb', line 9
# Looks up the `gs` executable (reported as GhostScript by `doctor`)
# by shelling out to `which`.
#
# @return [String] the output of `which gs` with surrounding whitespace
#   stripped; empty when `which` printed nothing
def self.gs
  %x(which gs).strip
end
.has_document(store_path, name) ⇒ Object
103 104 105 106 107 |
# File 'lib/scandex.rb', line 103
# Tells whether a document with exactly this name is already in the database.
#
# @param store_path [String, nil] directory holding the database (passed to `db`)
# @param name [String] document name to look up
# @return [Boolean] true when at least one row has that name
def self.has_document(store_path, name)
  matches = self.db(store_path).execute("SELECT name FROM documents WHERE name = ?", [name])
  !matches.empty?
end
.image_to_string(image, language = "eng") ⇒ Object
TODO: add orientation and language detection.
86 87 88 89 |
# File 'lib/scandex.rb', line 86
# Runs RTesseract on a single image and returns its string form.
#
# TODO orientation and language detection
#
# @param image path or image object accepted by RTesseract.new
# @param language [String] value passed to RTesseract as :lang
# @return [String] the result of RTesseract#to_s
def self.image_to_string(image, language = "eng")
  RTesseract.new(image, :lang => language).to_s
end
.index(store_path, source, force = false) ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/scandex.rb', line 45
# Converts +source+ to page images and runs OCR over them.
#
# A file is processed when it has one of the accepted extensions
# (.pdf, .png, .jpg, .jpeg, .tiff, case-insensitive) and is either not yet
# in the database or +force+ is set. Everything else is reported as ignored.
#
# The page images are written to a temporary directory that is removed
# again before this method returns, whether or not OCR succeeds.
#
# @param store_path [String, nil] directory holding the database
# @param source [String] path of the file to index
# @param force [Boolean] index even when the document is already stored
# @return [String, nil] the recognised text, or nil when the file was ignored
#   or the conversion produced no pages
def self.index(store_path, source, force = false)
  accepted_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.tiff']

  # NOTE(review): the lookup uses +source+ exactly as given, while
  # index_and_store stores the expanded path — confirm relative paths are
  # recognised as already indexed.
  wanted = (force || !self.has_document(store_path, source)) &&
           accepted_formats.include?(File.extname(source).downcase)

  unless wanted
    puts "Ignoring '#{source}'"
    return nil
  end

  puts "Indexing #{source}"
  # Block form: the directory and its page images are deleted on exit.
  Dir.mktmpdir('scandex_') do |tmp|
    pages = convert_to_gray_scale(source, tmp)
    puts "Found #{pages.size} page(s)"
    pages.empty? ? nil : ocr(pages)
  end
end
.index_and_store(store_path, file, force = false) ⇒ Object
37 38 39 40 41 42 43 |
# File 'lib/scandex.rb', line 37
# Indexes +file+ and, when text was produced, stores it in the database
# under the file's expanded (absolute) path.
#
# @param store_path [String, nil] directory holding the database
# @param file [String] path of the file to index
# @param force [Boolean] passed through to ScanDex.index
# @return the result of ScanDex.store_document, or nil when indexing
#   returned no content
def self.index_and_store(store_path, file, force = false)
  content = ScanDex.index(store_path, file, force)
  return if content.nil?

  ScanDex.store_document(store_path, File.expand_path(file), content)
end
.ocr(pages, language = "eng") ⇒ Object
76 77 78 79 80 81 82 83 |
# File 'lib/scandex.rb', line 76
# Runs `image_to_string` on each page in order and concatenates the results.
#
# @param pages [Array<String>] page image paths
# @param language [String] language passed to `image_to_string`
# @return [String] the concatenated text; empty when +pages+ is empty
def self.ocr(pages, language = "eng")
  pages.reduce('') do |collected, page|
    puts "OCR on '#{File.basename(page)}'"
    collected + image_to_string(page, language)
  end
end
.search_documents(store_path, text) ⇒ Object
114 115 116 117 118 |
# File 'lib/scandex.rb', line 114
# Finds documents whose lower-cased content or name contains the
# lower-cased +text+ (SQL LIKE with a leading and trailing `%`).
#
# @param store_path [String, nil] directory holding the database (passed to `db`)
# @param text [String] text to search for
# @return the result of the SELECT: name, created and modified of each match
def self.search_documents(store_path, text)
  store = self.db(store_path)
  needle = "%#{text.downcase}%"
  query = "SELECT name, created, modified FROM documents WHERE LOWER(content) LIKE ? OR LOWER(name) LIKE ?"
  store.execute(query, [needle] * 2)
end
.store_document(store_path, source, content) ⇒ Object
120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/scandex.rb', line 120
# Inserts or updates the database row for +source+.
#
# `modified` is the file's modification time (mtime). `created` is the
# file's birth time where the platform and filesystem report one, and the
# modification time otherwise. Both are stored as UTC ISO 8601 strings.
# An existing row keeps its `created` value; only content and `modified`
# are updated.
#
# @param store_path [String, nil] directory holding the database (passed to `db`)
# @param source [String] path of the indexed file; also used as the row name
# @param content [String] recognised text to store
# @return the result of the INSERT or UPDATE statement
# @raise [SystemCallError] when +source+ cannot be stat'ed
def self.store_document(store_path, source, content)
  modified = File.mtime(source).utc.iso8601
  created = begin
    File.birthtime(source).utc.iso8601
  rescue NotImplementedError
    # Birth time is unavailable here; mtime is the closest value we have.
    modified
  end

  db = self.db(store_path)
  # Only existence matters, so avoid fetching the (potentially large) content.
  existing = db.execute("SELECT name FROM documents WHERE name = ?", [source])

  if existing.empty?
    puts "Insert: #{source} #{created} #{modified}"
    db.execute("INSERT INTO documents (name, content, created, modified) VALUES (?, ?, ?, ?)",
               [source, content, created, modified])
  else
    puts "Update: #{source} #{created} #{modified}"
    db.execute("UPDATE documents SET content = ?, modified = ? WHERE name = ?",
               [content, modified, source])
  end
end
.tesseract ⇒ Object
5 6 7 |
# File 'lib/scandex.rb', line 5
# Looks up the `tesseract` executable by shelling out to `which`.
#
# @return [String] the output of `which tesseract` with surrounding
#   whitespace stripped; empty when `which` printed nothing
def self.tesseract
  lookup = %x(which tesseract)
  lookup.strip
end