Class: UploadConvert

Inherits:
Object
  • Object
show all
Defined in:
lib/uploadconvert.rb

Instance Method Summary collapse

Constructor Details

#initialize(input) ⇒ UploadConvert

Returns a new instance of UploadConvert.



8
9
10
11
12
# File 'lib/uploadconvert.rb', line 8

# Sets up a converter for a single document.
#
# @param input [String] where the document lives; handleDoc later inspects
#   this string to decide between downloading, PDF and XML handling
def initialize(input)
  # Both result buffers start out empty until a conversion fills them.
  @output, @text = "", ""
  @input = input
end

Instance Method Details

#cleanPDF(text) ⇒ Object

Removes numbers from edges of legal documents



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/uploadconvert.rb', line 99

# Removes the margin line numbers (1 through 28) found on legal documents.
#
# A line is dropped only when its whole content is one of those numbers;
# every other line is kept verbatim. "\r\n" line endings are normalised to
# "\n" before the lines are examined.
#
# @param text [String] raw extracted text; the argument itself is not modified
# @return [String] the cleaned text, which is also stored in @text
def cleanPDF(text)
  margin_numbers = (1..28).map(&:to_s)

  # Build the result in a fresh buffer so a second call does not append to
  # the text left in @text by an earlier call.
  cleaned = "".dup

  text.gsub(/\r?\n/, "\n").each_line do |line|
    # Compare without the trailing newline so a number sitting on the final,
    # unterminated line is recognised as well.
    next if margin_numbers.include?(line.chomp("\n"))

    cleaned << line
  end

  @text = cleaned
end

#detectPDFType ⇒ Object

Use embedded fonts to detect the type of PDF



49
50
51
52
53
54
55
56
# File 'lib/uploadconvert.rb', line 49

# Chooses the text-extraction strategy from the PDF's font listing.
#
# Runs the external `pdffonts` tool on @input. More than four lines of output
# sends the file to embedPDF; anything else goes to ocrPDF.
# NOTE(review): pdffonts presumably prints a two-line header before the font
# rows, which would make the threshold "at least three fonts" — confirm.
#
# @return [Object] whatever the selected extractor returns
def detectPDFType
  # Hand the path over as its own argv entry so no shell ever parses it:
  # spaces or shell metacharacters in an uploaded file name can neither break
  # the command nor inject another one.
  listing = IO.popen(["pdffonts", @input], &:read)

  listing.split("\n").length > 4 ? embedPDF : ocrPDF
end

#embedPDF ⇒ Object

Extract text from embedded text PDFs



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/uploadconvert.rb', line 59

# Extracts the embedded text layer of @input with Docsplit (OCR disabled).
#
# Docsplit is asked to write into public/uploads; the resulting .txt file is
# read, deleted, and its contents are passed through cleanPDF.
#
# This is best effort: if any step raises a StandardError, a warning is
# printed to stderr and nil is returned instead of propagating the error.
#
# @return [String, nil] the value returned by cleanPDF, or nil on failure
def embedPDF
  output_dir = "public/uploads"
  Docsplit.extract_text(@input, :ocr => false, :output => output_dir)

  # Docsplit names the text file after the PDF's base name inside the output
  # directory, so build the path from the base name. Splitting the full input
  # path on ".pdf" pointed at the wrong file for inputs that live in another
  # directory or use a different-case extension.
  stem = File.basename(@input, File.extname(@input))
  txt_path = File.join(output_dir, "#{stem}.txt")

  text = File.read(txt_path)

  # The intermediate file is no longer needed once its contents are in memory.
  File.delete(txt_path)
  cleanPDF(text)
rescue StandardError => e
  warn "embedPDF: could not extract text from #{@input}: #{e.message}"
  nil
end

#extractMetadataPDF ⇒ Object

Extract PDF metadata



118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/uploadconvert.rb', line 118

# Collects the PDF's metadata fields through Docsplit.
#
# Each field is fetched with the matching Docsplit.extract_<field> call on
# @input, in the order listed below.
#
# @return [Hash] field name (Symbol) => extracted value; the same hash is
#   kept in @metadata
def extractMetadataPDF
  fields = [:author, :creator, :producer, :title,
            :subject, :date, :keywords, :length]

  @metadata = {}
  fields.each do |field|
    @metadata[field] = Docsplit.public_send("extract_#{field}", @input)
  end
  @metadata
end

#handleDoc ⇒ Object

Sends the document to the appropriate method



15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/uploadconvert.rb', line 15

# Routes @input to the converter that matches it.
#
# * An http:// or https:// URL is downloaded with wget; @input is then
#   replaced by the last path segment of the URL and dispatch runs again.
# * A name containing ".pdf" goes to pdfTojson.
# * A name containing ".xml" is read and passed to xmlTojson.
#
# @return [Object, nil] the converter's return value, or nil when @input
#   matches none of the cases
def handleDoc
  # Only a real URL scheme prefix counts as remote. Matching "http" anywhere
  # made local names such as "http-notes.pdf" trigger a download and recurse
  # without end.
  if @input.start_with?("http://", "https://")
    # Separate argv entries keep the URL away from a shell, so characters
    # like ";" or "&" in it are not interpreted as shell syntax.
    warn "handleDoc: wget failed for #{@input}" unless system("wget", @input)

    # NOTE(review): assumes wget saved the file under the URL's last path
    # segment in the working directory — confirm.
    @input = @input.split("/").last.strip
    handleDoc
  elsif @input.include?(".pdf")
    pdfTojson
  elsif @input.include?(".xml")
    xmlTojson(File.read(@input))
  end
end

#ocrPDF ⇒ Object

OCR PDFs and return the cleaned text



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/uploadconvert.rb', line 79

# Extracts text from @input by OCR and returns it cleaned.
#
# Docsplit renders the pages to PNG images in the working directory and
# writes the OCR text into a "text" directory. The text file is read, the
# intermediate files are removed, and the text is passed through cleanPDF.
#
# @return [String] the value returned by cleanPDF
def ocrPDF
  # Docsplit names its output after the PDF's base name, so strip both the
  # directory and the extension. Splitting on the first "." broke on names
  # such as "scan.v2.pdf" or "./scan.pdf".
  stem = File.basename(@input, File.extname(@input))
  text_dir = "text"
  text_path = File.join(text_dir, "#{stem}.txt")

  # Snapshot the PNGs that are already present so cleanup removes only the
  # page images produced by this run, never unrelated files.
  existing_images = Dir["*.png"]

  begin
    # Extract individual pages
    Docsplit.extract_images(@input)

    # OCR
    Docsplit.extract_text(@input, :ocr => true, :output => text_dir)
    text = File.read(text_path)

    # Clean up the text file; drop the directory only if nothing else is in it
    File.delete(text_path)
    Dir.delete(text_dir) if (Dir.entries(text_dir) - %w[. ..]).empty?
  ensure
    # Runs on failure too, so page images are not left behind.
    (Dir["*.png"] - existing_images).each { |image| File.delete(image) }
  end

  cleanPDF(text)
end

#pdfTojson ⇒ Object

Convert PDFs to JSON



35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/uploadconvert.rb', line 35

# Builds the JSON representation of the PDF in @input.
#
# The document text comes from detectPDFType and the metadata from
# extractMetadataPDF; both are combined with the input name into one hash.
#
# @return [String] pretty-printed JSON, also stored in @output
def pdfTojson
  # Text first: extraction also decides between embedded text and OCR.
  @text = detectPDFType

  # Populates @metadata.
  extractMetadataPDF

  # Metadata keys come first, followed by the text and the input name.
  payload = @metadata.merge(:text => @text, :input => @input)
  @output = JSON.pretty_generate(payload)
end

#xmlTojson(xmlin) ⇒ Object

Convert XML files to JSON



29
30
31
32
# File 'lib/uploadconvert.rb', line 29

# Converts XML source text to pretty-printed JSON.
#
# @param xmlin [String] XML text to parse (handleDoc passes a file's contents)
# @return [String] the JSON text; it is also stored in @output, the same way
#   pdfTojson records its result
def xmlTojson(xmlin)
  parsed = Crack::XML.parse(xmlin)

  # Keep @output in step with the PDF path, which records its result there.
  @output = JSON.pretty_generate(parsed)
end