Class: UploadConvert

Inherits:

Object

Object
UploadConvert

show all

Defined in: lib/uploadconvert.rb

Instance Method Summary collapse

#cleanPDF(text) ⇒ Object

Removes numbers from edges of legal documents.
#detectPDFType ⇒ Object

Use embedded fonts to detect the type of PDF.
#embedPDF ⇒ Object

Extract text from embedded text PDFs.
#extractMetadataPDF ⇒ Object

Extract PDF metadata.
#handleDoc ⇒ Object

Sends the document to the appropriate method.
#initialize(input) ⇒ UploadConvert constructor

A new instance of UploadConvert.
#ocrPDF ⇒ Object

OCR PDFs and turn the recognized text into JSON.
#pdfTojson ⇒ Object

Convert PDFs to JSON.
#xmlTojson(xmlin) ⇒ Object

Convert XML files to JSON.

Constructor Details

#initialize(input) ⇒ `UploadConvert`

Returns a new instance of UploadConvert.

# File 'lib/uploadconvert.rb', line 8

# Sets up a converter for one document.
#
# @param input [Object] stored unchanged in @input (handleDoc calls
#   String methods such as include? on it, so a String is expected)
#
# @output and @text both start out as separate empty strings.
def initialize(input)
  @input, @output, @text = input, "", ""
end

Instance Method Details

#cleanPDF(text) ⇒ `Object`

Removes numbers from edges of legal documents

# File 'lib/uploadconvert.rb', line 94

# Removes numbers from edges of legal documents.
#
# Every line whose entire content is one of the integers 1 through 28 is
# dropped (presumably the margin line numbers of pleading paper -- confirm
# with the document owners). All other lines are kept verbatim, with CRLF
# line endings normalized to LF.
#
# The cleaned text is APPENDED to @text, so repeated calls accumulate.
#
# @param text [String] raw extracted text; it is not modified
# @return [String] the updated value of @text
def cleanPDF(text)
  margin_numbers = (1..28).map { |n| n.to_s }

  # Non-destructive gsub: the caller's string is left untouched and frozen
  # strings are accepted.
  kept_lines = text.gsub(/\r?\n/, "\n").each_line.reject do |line|
    # Strip only the trailing "\n" so a bare number on the final,
    # unterminated line is recognized as well.
    margin_numbers.include?(line.chomp("\n"))
  end

  # to_s keeps this usable even if @text has not been initialized yet.
  @text = @text.to_s + kept_lines.join
end

#detectPDFType ⇒ `Object`

Use embedded fonts to detect the type of PDF

# File 'lib/uploadconvert.rb', line 49

# Use embedded fonts to detect the type of PDF.
#
# Runs the external `pdffonts` program on @input and counts the lines it
# prints. More than four lines is treated as "this PDF has embedded fonts",
# in which case the text is pulled out with embedPDF.
#
# @return [Object, nil] the result of embedPDF when fonts are reported;
#   nil otherwise (the OCR fallback is disabled)
def detectPDFType
  # Argument-vector form: no shell is started, so spaces or shell
  # metacharacters in @input cannot alter the command being run.
  font_report = IO.popen(["pdffonts", @input], &:read).split("\n")
  return embedPDF if font_report.length > 4

  # OCR fallback left disabled, exactly as before:
  # return ocrPDF
  nil
end

#embedPDF ⇒ `Object`

Extract text from embedded text PDFs

# File 'lib/uploadconvert.rb', line 59

# Extract text from embedded text PDFs.
#
# Asks Docsplit to write the text of @input into public/uploads, reads the
# resulting .txt file, deletes it, and passes the text through cleanPDF.
#
# @return [Object, nil] the result of cleanPDF, or nil if any step failed
#   (the failure is reported on stderr rather than raised)
def embedPDF
  output_dir = "public/uploads"
  Docsplit.extract_text(@input, :ocr => false, :output => output_dir)

  # Use only the file's base name: @input may carry directory components,
  # which must not be repeated underneath output_dir.
  # NOTE(review): assumes Docsplit names its output "<basename>.txt" -- confirm.
  base_name = File.basename(@input, File.extname(@input))
  txt_path = File.join(output_dir, base_name + ".txt")
  text = File.read(txt_path)

  # Clean up text and delete file
  File.delete(txt_path)
  cleanPDF(text)
rescue StandardError => e
  # Still best-effort (callers receive nil), but no longer silent.
  warn "embedPDF: could not extract text from #{@input.inspect}: #{e.class}: #{e.message}"
  nil
end

#extractMetadataPDF ⇒ `Object`

Extract PDF metadata

# File 'lib/uploadconvert.rb', line 113

# Extract PDF metadata.
#
# For each field below, calls the matching Docsplit.extract_<field> method
# with @input and stores the result in @metadata under the field's symbol.
#
# @return [Hash] @metadata, with keys in the order listed
def extractMetadataPDF
  @metadata = {}
  [:author, :creator, :producer, :title, :subject, :date, :keywords, :length].each do |field|
    @metadata[field] = Docsplit.public_send("extract_#{field}", @input)
  end
  @metadata
end

#handleDoc ⇒ `Object`

Sends the document to the appropriate method

# File 'lib/uploadconvert.rb', line 15

# Sends the document to the appropriate method.
#
# * An http:// or https:// URL is downloaded with `wget`; @input is then
#   replaced by the last path segment of the URL and dispatch runs again
#   on that local file name.
# * A name containing ".pdf" goes to pdfTojson.
# * A name containing ".xml" is read from disk and goes to xmlTojson.
#
# @return [Object, nil] the result of the chosen converter, or nil when
#   @input matches none of the cases
def handleDoc
  # Only a real URL scheme counts as remote. A plain substring check would
  # send local names such as "http_notes.pdf" to wget and recurse forever.
  if @input =~ %r{\Ahttps?://}i
    # Argument-vector form: the URL never passes through a shell.
    IO.popen(["wget", @input], &:read)
    @input = @input.split("/").last.chomp.strip
    handleDoc
  elsif @input.include? ".pdf"
    pdfTojson
  elsif @input.include? ".xml"
    xmlTojson(File.read(@input))
  end
end

#ocrPDF ⇒ `Object`

OCR PDFs and turn the recognized text into JSON

# File 'lib/uploadconvert.rb', line 74

# OCR PDFs and turn that text into a JSON.
#
# Has Docsplit render @input to page images in the working directory and
# OCR the document into the "text" directory, then reads the resulting
# .txt file and passes it through cleanPDF. Temporary files are removed
# afterwards, even when a step raises.
#
# @return [Object] the result of cleanPDF
def ocrPDF
  text_dir = "text"
  # Base name only, with just the final extension stripped, so dotted names
  # ("brief.v2.pdf") and inputs with directories resolve correctly.
  # NOTE(review): assumes Docsplit names its output "<basename>.txt" -- confirm.
  txt_path = File.join(text_dir, File.basename(@input, File.extname(@input)) + ".txt")

  # Snapshot the PNGs that are already here so cleanup removes only the
  # images created by this call, never unrelated files.
  preexisting_pngs = Dir["*.png"]

  begin
    # Extract individual pages
    Docsplit.extract_images(@input)

    # OCR
    Docsplit.extract_text(@input, :ocr => true, :output => text_dir)
    cleanPDF(File.read(txt_path))
  ensure
    # Clean up text and files
    (Dir["*.png"] - preexisting_pngs).each { |png| File.delete(png) }
    File.delete(txt_path) if File.exist?(txt_path)
    # Remove the output directory only when nothing else lives in it.
    if Dir.exist?(text_dir) && (Dir.entries(text_dir) - %w[. ..]).empty?
      Dir.delete(text_dir)
    end
  end
end

#pdfTojson ⇒ `Object`

Convert PDFs to JSON

# File 'lib/uploadconvert.rb', line 35

# Convert PDFs to JSON.
#
# Stores the result of detectPDFType in @text, refreshes @metadata via
# extractMetadataPDF, and serializes the metadata together with :text and
# :input as pretty-printed JSON.
#
# @return [String] the JSON document, also stored in @output
def pdfTojson
  # Extract and clean text
  @text = detectPDFType

  # Extract metadata and generate output
  extractMetadataPDF
  payload = @metadata.merge(:text => @text, :input => @input)
  @output = JSON.pretty_generate(payload)
end

#xmlTojson(xmlin) ⇒ `Object`

Convert XML files to JSON

# File 'lib/uploadconvert.rb', line 29

# Convert XML files to JSONs.
#
# @param xmlin [String] XML source text, handed to Crack::XML.parse
# @return [String] pretty-printed JSON rendering of the parsed structure
def xmlTojson(xmlin)
  JSON.pretty_generate(Crack::XML.parse(xmlin))
end

Class: UploadConvert

Instance Method Summary collapse

Constructor Details

#initialize(input) ⇒ UploadConvert

Instance Method Details

#cleanPDF(text) ⇒ Object

#detectPDFType ⇒ Object

#embedPDF ⇒ Object

#extractMetadataPDF ⇒ Object

#handleDoc ⇒ Object

#ocrPDF ⇒ Object

#pdfTojson ⇒ Object

#xmlTojson(xmlin) ⇒ Object