Class: UploadConvert

Inherits:
Object
  • Object
show all
Defined in:
lib/uploadconvert.rb

Instance Method Summary collapse

Constructor Details

#initialize(input) ⇒ UploadConvert

Returns a new instance of UploadConvert.



6
7
8
9
10
# File 'lib/uploadconvert.rb', line 6

# Sets up a converter for a single input.
#
# input - the value stored in @input; the other methods of this class hand it
#         to pdffonts and Docsplit as the PDF to process.
#
# @output and @text both start out as empty strings.
def initialize(input)
  @output, @text = "", ""
  @input = input
end

Instance Method Details

#cleanPDF(text) ⇒ Object

Removes numbers from edges of legal documents



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/uploadconvert.rb', line 68

# Removes the line numbers printed along the edges of legal documents.
#
# Line endings are normalised to "\n", then every line whose entire content is
# one of the integers 1 through 28 is dropped. All other lines are kept
# unchanged and in their original order.
#
# text - String of extracted PDF text. It is not modified, so frozen strings
#        are accepted.
#
# Stores the cleaned text in @text (replacing any previous value, so repeated
# calls do not accumulate) and returns it.
def cleanPDF(text)
  # Built once up front instead of once per input line.
  margin_numbers = (1..28).map { |i| i.to_s }

  kept = text.gsub(/\r?\n/, "\n").each_line.reject do |line|
    # Strip only a trailing "\n" so that a number sitting on an unterminated
    # final line is recognised as well.
    margin_numbers.include?(line.chomp("\n"))
  end

  @text = kept.join
end

#detectPDFType ⇒ Object

Use embedded fonts to detect the type of PDF



27
28
29
30
31
32
33
34
# File 'lib/uploadconvert.rb', line 27

# Use embedded fonts to detect the type of PDF.
#
# Runs the external `pdffonts` tool on @input and counts the lines it prints.
# More than four lines sends the file to embedPDF; anything else goes through
# ocrPDF.
#
# Returns whatever the selected extractor returns.
def detectPDFType
  # The command is passed as an argument list, so no shell is involved: paths
  # containing spaces or shell metacharacters reach pdffonts verbatim instead
  # of being interpreted.
  report = IO.popen(["pdffonts", @input]) { |io| io.read }

  # NOTE(review): the threshold of 4 presumably allows for pdffonts' header
  # lines plus a minimum number of fonts - confirm against real output.
  if report.split("\n").length > 4
    embedPDF
  else
    ocrPDF
  end
end

#embedPDF ⇒ Object

Extract text from embedded text PDFs



37
38
39
40
41
42
43
44
45
# File 'lib/uploadconvert.rb', line 37

# Extract text from embedded text PDFs.
#
# Has Docsplit extract the text of @input without OCR, reads the resulting
# .txt file, deletes it, and passes the text through cleanPDF.
#
# Returns the value of cleanPDF for the extracted text.
def embedPDF
  Docsplit.extract_text(@input, :ocr => false)

  # Derive the output name from the basename minus its real extension rather
  # than splitting on the literal ".pdf", which mishandles ".PDF" and names
  # that contain ".pdf" elsewhere.
  # NOTE(review): assumes Docsplit writes "<basename>.txt" into the current
  # directory when no :output is given - confirm against the Docsplit docs.
  txt_path = File.basename(@input, File.extname(@input)) + ".txt"

  begin
    text = File.read(txt_path)
  ensure
    # Remove the temporary file even if reading it failed.
    File.delete(txt_path) if File.exist?(txt_path)
  end

  cleanPDF(text)
end

#extractMetadataPDF ⇒ Object

Extract PDF metadata



87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/uploadconvert.rb', line 87

# Extract PDF metadata.
#
# Asks Docsplit for each metadata field of @input and collects the answers in
# a Hash keyed by :author, :creator, :producer, :title, :subject, :date,
# :keywords and :length (in that order).
#
# The hash is kept in @metadata and also returned.
# NOTE(review): the instance variable name was missing from the extracted
# source; @metadata is a reconstruction - confirm against lib/uploadconvert.rb.
def extractMetadataPDF
  @metadata = {
    :author   => Docsplit.extract_author(@input),
    :creator  => Docsplit.extract_creator(@input),
    :producer => Docsplit.extract_producer(@input),
    :title    => Docsplit.extract_title(@input),
    :subject  => Docsplit.extract_subject(@input),
    :date     => Docsplit.extract_date(@input),
    :keywords => Docsplit.extract_keywords(@input),
    :length   => Docsplit.extract_length(@input)
  }
end

#ocrPDF ⇒ Object

OCR PDFs and return the cleaned text



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/uploadconvert.rb', line 48

# OCR a PDF and return its cleaned text.
#
# Has Docsplit render page images of @input and extract its text with OCR into
# the "text" directory, reads the resulting .txt file, removes the temporary
# files, and passes the text through cleanPDF.
#
# Returns the value of cleanPDF for the recognised text.
def ocrPDF
  # Extract individual pages. Snapshot the PNGs that are already present so
  # that only images created by this call are deleted afterwards, never
  # unrelated files that happen to live in the working directory.
  preexisting = Dir["*.png"]
  Docsplit.extract_images(@input)
  page_images = Dir["*.png"] - preexisting

  begin
    # OCR
    Docsplit.extract_text(@input, :ocr => true, :output => 'text')

    # Use the basename minus its real extension; splitting on "." breaks for
    # names such as "my.report.pdf".
    txt_path = File.join("text", File.basename(@input, File.extname(@input)) + ".txt")
    text = File.read(txt_path)

    # Clean up text and files
    File.delete(txt_path)
    # Only remove the directory when nothing else is in it; deleting a
    # non-empty directory would raise after the text was already read.
    Dir.delete("text") if (Dir.entries("text") - %w[. ..]).empty?
  ensure
    # Page images are removed even when OCR or reading the text fails.
    page_images.each { |img| File.delete(img) if File.exist?(img) }
  end

  cleanPDF(text)
end

#pdfTojson ⇒ Object

Convert PDFs to JSON



13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/uploadconvert.rb', line 13

# Convert PDFs to JSON.
#
# Extracts and cleans the text of @input (via detectPDFType), gathers its
# metadata (via extractMetadataPDF), and combines both with the input value
# into one pretty-printed JSON document.
#
# The JSON object contains every metadata key followed by "text" and "input".
# The text is kept in @text and the JSON string in @output; the JSON string is
# also the return value.
def pdfTojson
  # Extract and clean text
  @text = detectPDFType

  # Extract metadata and generate output. The return value of
  # extractMetadataPDF is used directly; merge builds a new hash, so the
  # metadata hash itself is left untouched.
  metadata = extractMetadataPDF
  outhash = metadata.merge(:text => @text, :input => @input)
  @output = JSON.pretty_generate(outhash)
end