Module: DocPdfToText

Defined in:
lib/docpdftotext.rb

Constant Summary collapse

# Version string of this library.
VERSION =
"1.0.0"
# Command used by #doc_to_txt to convert .doc files. This is a bare command
# name, so the executable is presumably expected to be on the PATH.
ANTIWORD_PATH =
"antiword"
# Command used by #docx_to_txt for the first step, .docx to .odt.
ODF_CONVERTER_PATH =
"OdfConverter"
# Interpreter that #docx_to_txt uses to run DOC_CONVERTER_PATH.
PYTHON_PATH =
"python"
# Path to the DocumentConverter.py script, resolved relative to the directory
# of this source file. #docx_to_txt uses it for the second step, .odt to .txt.
DOC_CONVERTER_PATH =
File.join(File.dirname(__FILE__), "DocumentConverter.py")
# Command used by #pdf_to_txt to convert .pdf files.
PDFTOTEXT_PATH =
"pdftotext"

Instance Method Summary collapse

Instance Method Details

#doc_to_txt(file_path) ⇒ Object

Raises:

  • (ArgumentError)


61
62
63
64
65
66
67
# File 'lib/docpdftotext.rb', line 61

# Converts a legacy Word (.doc) file to text by running the ANTIWORD_PATH
# command on it and returning whatever the command writes to standard output.
#
# @param file_path [String] path to a ".doc" file; expanded with File.expand_path
# @return [String] standard output of the converter command
# @raise [ArgumentError] "Unknown file" if the path does not exist,
#   "Invalid file type" if the extension is not exactly ".doc"
def doc_to_txt(file_path)
  expanded_path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
  raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".doc"

  # Pass an argv array so no shell is involved: paths containing spaces or
  # shell metacharacters reach the converter as a single, literal argument.
  IO.popen([ANTIWORD_PATH, expanded_path], &:read)
end

#docx_to_txt(file_path) ⇒ Object

Raises:

  • (ArgumentError)


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/docpdftotext.rb', line 28

# Converts a Word (.docx) file to text in two steps: ODF_CONVERTER_PATH turns
# the .docx into an intermediate .odt file, then the DOC_CONVERTER_PATH script
# (run with PYTHON_PATH) turns that .odt into a .txt file, which is read back
# with #read_txt_file.
#
# @param file_path [String] path to a ".docx" file; expanded with File.expand_path
# @return [String] the text read from the generated .txt file
# @raise [ArgumentError] "Unknown file" if the path does not exist (also raised
#   by #read_txt_file when no .txt output was produced), "Invalid file type" if
#   the extension is not exactly ".docx"
def docx_to_txt(file_path)
  expanded_path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
  raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".docx"

  # The Tempfiles only reserve unique base names; the converters write to
  # sibling paths that carry the extension appended to that base name.
  odt_tmp = Tempfile.new("docx")
  txt_tmp = Tempfile.new("txt")
  odt_path = "#{odt_tmp.path}.odt"
  txt_path = "#{txt_tmp.path}.txt"

  begin
    odt_tmp.close
    txt_tmp.close

    # argv arrays bypass the shell, so paths with spaces or metacharacters
    # are passed through literally. Converter output is captured and dropped.
    IO.popen([ODF_CONVERTER_PATH, "/LEVEL", "4", "/I", expanded_path, "/O", odt_path], &:read)
    IO.popen([PYTHON_PATH, DOC_CONVERTER_PATH, odt_path, txt_path], &:read)

    read_txt_file(txt_path)
  ensure
    # Remove the converter outputs first, then release the reserved base names.
    [odt_path, txt_path].each { |path| File.delete(path) if File.exist?(path) }
    odt_tmp.close!
    txt_tmp.close!
  end
end

#file_to_txt(file_path) ⇒ Object

Raises:

  • (ArgumentError)


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/docpdftotext.rb', line 11

# Extracts text from a file, choosing the converter from the file extension:
# ".docx" -> #docx_to_txt, ".doc" -> #doc_to_txt, ".pdf" -> #pdf_to_txt,
# ".txt" -> #read_txt_file. The extension comparison is case-sensitive.
#
# @param file_path [String] path to the file; expanded with File.expand_path
# @return [Object] whatever the selected converter method returns
# @raise [ArgumentError] "Unknown file" if the path does not exist,
#   "Invalid file type" if the extension is not one of the four listed above
def file_to_txt(file_path)
  expanded_path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)

  case File.extname(expanded_path)
  when ".docx" then docx_to_txt(file_path)
  when ".doc"  then doc_to_txt(file_path)
  when ".pdf"  then pdf_to_txt(file_path)
  when ".txt"  then read_txt_file(file_path)
  else
    raise ArgumentError, "Invalid file type"
  end
end

#pdf_to_txt(file_path) ⇒ Object

Raises:

  • (ArgumentError)


69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/docpdftotext.rb', line 69

# Converts a PDF file to text by running the PDFTOTEXT_PATH command with the
# PDF as input and a temporary file as output, then reading that temporary
# file back with #read_txt_file.
#
# @param file_path [String] path to a ".pdf" file; expanded with File.expand_path
# @return [String] the text read from the converter's output file
# @raise [ArgumentError] "Unknown file" if the path does not exist,
#   "Invalid file type" if the extension is not exactly ".pdf"
def pdf_to_txt(file_path)
  expanded_path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)
  raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".pdf"

  output = Tempfile.new("pdf")
  begin
    # Close our handle so the external command can write to the path.
    output.close

    # argv array: no shell, so spaces/metacharacters in paths are literal.
    IO.popen([PDFTOTEXT_PATH, expanded_path, output.path], &:read)

    read_txt_file(output.path)
  ensure
    # Delete the temporary file now instead of waiting for garbage collection.
    output.close!
  end
end

#read_txt_file(file_path) ⇒ Object

Raises:

  • (ArgumentError)


50
51
52
53
54
55
56
57
58
59
# File 'lib/docpdftotext.rb', line 50

# Reads a text file and returns its entire contents.
#
# @param file_path [String] path to the file; expanded with File.expand_path
# @return [String] the complete file contents ("" for an empty file)
# @raise [ArgumentError] "Unknown file" if the path does not exist
def read_txt_file(file_path)
  expanded_path = File.expand_path(file_path)
  raise ArgumentError, "Unknown file" unless File.exist?(expanded_path)

  # Read the whole file; a single #gets would stop after the first line.
  File.read(expanded_path)
end