Module: DocPdfToText
- Defined in:
- lib/docpdftotext.rb
Constant Summary
- VERSION =
"1.0.0"
- ANTIWORD_PATH =
"antiword"
- ODF_CONVERTER_PATH =
"OdfConverter"
- PYTHON_PATH =
"python"
- DOC_CONVERTER_PATH =
File.join(File.dirname(__FILE__), "DocumentConverter.py")
- PDFTOTEXT_PATH =
"pdftotext"
Instance Method Summary
- #doc_to_txt(file_path) ⇒ Object
- #docx_to_txt(file_path) ⇒ Object
- #file_to_txt(file_path) ⇒ Object
- #pdf_to_txt(file_path) ⇒ Object
- #read_txt_file(file_path) ⇒ Object
Instance Method Details
#doc_to_txt(file_path) ⇒ Object
61 62 63 64 65 66 67 |
# File 'lib/docpdftotext.rb', line 61

# Extracts the text of a legacy Word (.doc) file by running antiword on it.
#
# @param file_path [String] path to the .doc file; it is expanded to an
#   absolute path before use
# @return [String] everything antiword wrote to standard output
# @raise [ArgumentError] "Unknown file" if the path does not exist,
#   "Invalid file type" if the extension is not exactly ".doc"
def doc_to_txt(file_path)
  path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(path)
  raise ArgumentError, "Invalid file type" unless File.extname(path) == ".doc"

  # Argument-array form: no shell is spawned, so spaces or shell
  # metacharacters in the path can neither break nor hijack the command.
  IO.popen([ANTIWORD_PATH, path], &:read)
end
#docx_to_txt(file_path) ⇒ Object
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/docpdftotext.rb', line 28

# Extracts the text of a .docx file in two external steps: OdfConverter
# turns the .docx into an .odt, then the DocumentConverter.py script turns
# that .odt into a plain-text file, which is read back and returned.
#
# @param file_path [String] path to the .docx file; it is expanded to an
#   absolute path before use
# @return [String] contents of the generated text file, as returned by
#   read_txt_file
# @raise [ArgumentError] "Unknown file" if the path does not exist (or if
#   the converters produced no output file), "Invalid file type" if the
#   extension is not exactly ".docx"
def docx_to_txt(file_path)
  path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(path)
  raise ArgumentError, "Invalid file type" unless File.extname(path) == ".docx"

  # The Tempfiles only reserve unique base names; the converters write to
  # sibling paths carrying the extension each tool is given.
  odt_stub = Tempfile.new("docx")
  odt_path = odt_stub.path + ".odt"
  odt_stub.close

  txt_stub = Tempfile.new("txt")
  txt_path = txt_stub.path + ".txt"
  txt_stub.close

  # Argument-array form: no shell is spawned, so paths containing spaces or
  # shell metacharacters are passed through verbatim. Converter output on
  # stdout is read and discarded, as before.
  IO.popen([ODF_CONVERTER_PATH, "/LEVEL", "4", "/I", path, "/O", odt_path], &:read)
  IO.popen([PYTHON_PATH, DOC_CONVERTER_PATH, odt_path, txt_path], &:read)

  read_txt_file(txt_path)
ensure
  # Remove the intermediate files; previously the derived .odt/.txt files
  # were left behind in the temp directory.
  [odt_path, txt_path].compact.each { |tmp| File.delete(tmp) if File.exist?(tmp) }
  [odt_stub, txt_stub].compact.each(&:close!)
end
#file_to_txt(file_path) ⇒ Object
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/docpdftotext.rb', line 11

# Converts a file to text by dispatching on its extension to the matching
# converter: docx_to_txt, doc_to_txt, pdf_to_txt or read_txt_file.
#
# The extension comparison is case-sensitive, matching the checks the
# individual converters perform themselves.
#
# @param file_path [String] path to the input file; the original
#   (unexpanded) value is what gets forwarded to the converter
# @return [Object] whatever the selected converter returns
# @raise [ArgumentError] "Unknown file" if the path does not exist,
#   "Invalid file type" if the extension is not .docx, .doc, .pdf or .txt
def file_to_txt(file_path)
  path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(path)

  case File.extname(path)
  when ".docx" then docx_to_txt(file_path)
  when ".doc"  then doc_to_txt(file_path)
  when ".pdf"  then pdf_to_txt(file_path)
  when ".txt"  then read_txt_file(file_path)
  else raise ArgumentError, "Invalid file type"
  end
end
#pdf_to_txt(file_path) ⇒ Object
69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/docpdftotext.rb', line 69

# Extracts the text of a PDF by running pdftotext into a temporary file and
# reading that file back.
#
# @param file_path [String] path to the .pdf file; it is expanded to an
#   absolute path before use
# @return [String] contents of the generated text file, as returned by
#   read_txt_file
# @raise [ArgumentError] "Unknown file" if the path does not exist,
#   "Invalid file type" if the extension is not exactly ".pdf"
def pdf_to_txt(file_path)
  path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(path)
  raise ArgumentError, "Invalid file type" unless File.extname(path) == ".pdf"

  out = Tempfile.new("pdf")
  out_path = out.path
  out.close # release our handle so pdftotext can write to the path

  # Argument-array form: no shell is spawned, so paths containing spaces or
  # shell metacharacters are passed through verbatim.
  IO.popen([PDFTOTEXT_PATH, path, out_path], &:read)

  read_txt_file(out_path)
ensure
  # Delete the temp file now instead of waiting for garbage collection.
  out.close! if out
end
#read_txt_file(file_path) ⇒ Object
50 51 52 53 54 55 56 57 58 59 |
# File 'lib/docpdftotext.rb', line 50

# Reads a text file and returns its entire contents.
#
# The earlier implementation called +gets+ only once, so anything after the
# first line was silently dropped; the whole file is returned now. Results
# for empty and single-line files are unchanged.
#
# @param file_path [String] path to the file; it is expanded to an absolute
#   path before use
# @return [String] the file's contents ("" for an empty file)
# @raise [ArgumentError] "Unknown file" if the path does not exist
def read_txt_file(file_path)
  path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(path)

  File.read(path)
end