Class: UploadConvert

Inherits:

Object

Object
UploadConvert

show all

Defined in: lib/uploadconvert.rb

Instance Method Summary collapse

#cleanPDF(text) ⇒ Object

Removes numbers from edges of legal documents.
#detectPDFType ⇒ Object

Use embedded fonts to detect the type of PDF.
#embedPDF ⇒ Object

Extract text from embedded text PDFs.
#extractMetadataPDF ⇒ Object

Extract PDF metadata.
#initialize(input) ⇒ UploadConvert constructor

A new instance of UploadConvert.
#ocrPDF ⇒ Object

OCRs the PDF and returns the cleaned text (the JSON itself is built by #pdfTojson).
#pdfTojson ⇒ Object

Convert PDFs to JSON.

Constructor Details

#initialize(input) ⇒ `UploadConvert`

Returns a new instance of UploadConvert.

# File 'lib/uploadconvert.rb', line 6

# Builds a converter for a single input file.
#
# The output and text buffers start out as empty strings; the other
# methods of this class fill them in later.
#
# @param input [Object] stored unchanged in @input
def initialize(input)
  @text = ""
  @output = ""
  @input = input
end

Instance Method Details

#cleanPDF(text) ⇒ `Object`

Removes numbers from edges of legal documents

# File 'lib/uploadconvert.rb', line 68

# Strips standalone line numbers (1 through 28) from extracted text and
# appends the remaining lines to @text.
#
# Line endings are first normalized from "\r\n" to "\n". A line is dropped
# only when its entire content is one of the numbers 1..28; numbers that
# appear inside a longer line are kept.
#
# @param text [String] raw extracted text; it is not modified
# @return [String] @text after the kept lines have been appended to it
def cleanPDF(text)
  margin_numbers = (1..28).map { |n| n.to_s }

  # Non-destructive gsub: the caller's string (which may be frozen) is
  # left untouched.
  kept = text.gsub(/\r?\n/, "\n").each_line.reject do |line|
    # Compare without the trailing newline so that a final, unterminated
    # line holding just a number is dropped as well.
    margin_numbers.include?(line.sub(/\n\z/, ""))
  end

  # Appends rather than replaces: whatever @text already held is preserved.
  @text += kept.join
  @text
end

#detectPDFType ⇒ `Object`

Use embedded fonts to detect the type of PDF

# File 'lib/uploadconvert.rb', line 27

# Chooses the text-extraction strategy from the output of the external
# `pdffonts` command run on @input.
#
# When the command prints more than four lines the PDF is handled by
# #embedPDF, otherwise by #ocrPDF.
# NOTE(review): the threshold of 4 presumably accounts for pdffonts' header
# lines plus a minimum number of font rows -- confirm against real output.
#
# @return [Object] whatever #embedPDF or #ocrPDF returns
def detectPDFType
  # The argument-array form of IO.popen executes pdffonts directly, without
  # a shell, so spaces or shell metacharacters in @input cannot split or
  # inject commands.
  font_report = IO.popen(["pdffonts", @input.to_s], &:read).split("\n")

  font_report.length > 4 ? embedPDF : ocrPDF
end

#embedPDF ⇒ `Object`

Extract text from embedded text PDFs

# File 'lib/uploadconvert.rb', line 37

# Extracts the embedded text of @input with Docsplit (OCR disabled), reads
# the generated .txt file, deletes it, and returns the text after #cleanPDF.
#
# @return [String] the value returned by #cleanPDF
def embedPDF
  Docsplit.extract_text(@input, :ocr => false)

  # Docsplit names its output "<basename>.txt" and, with no :output option,
  # writes it to the current directory (per the Docsplit documentation).
  # Deriving the name with File.basename/File.extname handles inputs that
  # include a directory, an upper-case extension, or ".pdf" elsewhere in
  # the path, which splitting on ".pdf" did not.
  txt_path = File.basename(@input, File.extname(@input)) + ".txt"
  text = File.read(txt_path)

  # The text file is only an intermediate artifact; remove it before cleaning.
  File.delete(txt_path)
  cleanPDF(text)
end

#extractMetadataPDF ⇒ `Object`

Extract PDF metadata

# File 'lib/uploadconvert.rb', line 87

# Collects the PDF metadata of @input into @metadata.
#
# One Docsplit.extract_<field> call is made per field, in the order listed
# below, and each result is stored under the matching symbol key.
#
# @return [Hash] the freshly populated @metadata hash
def extractMetadataPDF
  @metadata = {}

  fields = [:author, :creator, :producer, :title, :subject, :date, :keywords, :length]
  fields.each do |field|
    @metadata[field] = Docsplit.public_send("extract_#{field}", @input)
  end

  @metadata
end

#ocrPDF ⇒ `Object`

OCRs the PDF and returns the cleaned text (the JSON itself is built by #pdfTojson)

# File 'lib/uploadconvert.rb', line 48

# Runs Docsplit image extraction and OCR text extraction on @input, reads
# the OCR text from the "text" directory, removes the intermediate files,
# and returns the text after #cleanPDF.
#
# @return [String] the value returned by #cleanPDF
def ocrPDF
  # Record the PNGs that already exist so that cleanup only removes images
  # created by this run instead of every PNG in the working directory.
  # NOTE(review): a page image that overwrites a same-named pre-existing PNG
  # is therefore left in place.
  pngs_before = Dir["*.png"]

  # Extract individual pages
  Docsplit.extract_images(@input)
  page_images = Dir["*.png"] - pngs_before

  # OCR
  Docsplit.extract_text(@input, :ocr => true, :output => 'text')

  # Docsplit names the output after the input's basename (per the Docsplit
  # documentation); File.basename/File.extname copes with dotted file names
  # and directory components, which splitting on "." did not.
  txt_path = File.join("text", File.basename(@input, File.extname(@input)) + ".txt")
  text = File.read(txt_path)

  # Clean up text and files
  File.delete(txt_path)
  # Remove the directory only when nothing else lives in it; Dir.delete
  # raises on a non-empty directory.
  Dir.delete("text") if (Dir.entries("text") - %w[. ..]).empty?
  page_images.each { |png| File.delete(png) }

  cleanPDF(text)
end

#pdfTojson ⇒ `Object`

Convert PDFs to JSON

# File 'lib/uploadconvert.rb', line 13

# Produces the JSON document for @input.
#
# The text comes from #detectPDFType and the metadata from
# #extractMetadataPDF (called in that order). The metadata entries are
# combined with :text and :input keys and serialized with
# JSON.pretty_generate.
#
# @return [String] the pretty-printed JSON, also stored in @output
def pdfTojson
  # Extract and clean text
  @text = detectPDFType

  # Extract metadata and generate output
  extractMetadataPDF
  payload = @metadata.merge(:text => @text, :input => @input)
  @output = JSON.pretty_generate(payload)
end

Class: UploadConvert

Instance Method Summary collapse

Constructor Details

#initialize(input) ⇒ UploadConvert

Instance Method Details

#cleanPDF(text) ⇒ Object

#detectPDFType ⇒ Object

#embedPDF ⇒ Object

#extractMetadataPDF ⇒ Object

#ocrPDF ⇒ Object