Module: ActAsPageExtractor

Extended by:: ActiveSupport::Concern

Defined in:: lib/act_as_page_extractor/version.rb,
lib/act_as_page_extractor.rb,
lib/act_as_page_extractor/modules/tools.rb,
lib/act_as_page_extractor/modules/saving.rb,
lib/act_as_page_extractor/modules/interface.rb,
lib/act_as_page_extractor/modules/unzipping.rb,
lib/act_as_page_extractor/modules/extracting.rb,
lib/act_as_page_extractor/modules/validating.rb,
lib/generators/act_as_page_extractor/migration_generator.rb

Overview

:nocov:

Defined Under Namespace

Constant Summary collapse

DEFAULT_ROOT_FOLDER =

Dir.pwd.to_s

ERRORS =

{
  unknown_docsplit_error: 'Unknown Docsplit error'
}.freeze

ERROR_BACKTRACE_LINES =

EXTRACTING_STATES =

{
  new: 'new',
  extracting: 'extracting',
  extracted: 'extracted',
  error_doctype: 'error_doctype',
  error_extraction: 'error_extraction',
  error_filesize: 'error_filesize'
}.freeze

EXTRACTION_TIMEOUT =

5 minutes, expressed in seconds.

60*5

VALIDATE_COMPRESS_TYPES =

['zip', 'rar', '7z', 'gzip'].freeze

VALIDATE_DOC_TYPES =

['txt', 'pdf', 'doc', 'docx',
'rtf', 'odt', 'htm', 'html'].freeze

VERSION =

"0.7.3"

Class Method Summary collapse

Instance Method Summary collapse

#add_error(e) ⇒ Object
#cleanup_pages ⇒ Object
#convert_to_pdf ⇒ Object
#convert_to_text ⇒ Object
#debug_info ⇒ Object

:nocov:.
#extract_pages ⇒ Object
#initialized ⇒ Object
#is_extracted ⇒ Object

:nocov:.
#origin_file_name ⇒ Object
#page_extract! ⇒ Object
#pdf_path ⇒ Object
#remove_files ⇒ Object
#remove_last_byte(file_name) ⇒ Object

Workaround for OpenOffice/JODConverter: removes the trailing form-feed byte ("\f") from a converted text page.
#save_pdf ⇒ Object
#save_to_db ⇒ Object
#timeout_wrapper ⇒ Object

:nocov:.
#unzip_document ⇒ Object
#update_state ⇒ Object
#valid_document ⇒ Object
#validate_compress_types ⇒ Object
#validate_doc_types ⇒ Object
#validate_size ⇒ Object

Class Method Details

.start_extraction ⇒ `Object`



16
17
18

# File 'lib/act_as_page_extractor/modules/interface.rb', line 16

# Runs page extraction for every document that has not been processed yet.
#
# Selects the records of +document_class+ whose +page_extraction_state+ equals
# EXTRACTING_STATES[:new] and calls +page_extract!+ on each of them in turn.
#
# @return [Object] whatever +each+ returns for the selected collection
def self.start_extraction
  pending_documents = document_class.where(page_extraction_state: EXTRACTING_STATES[:new])
  pending_documents.each { |document| document.page_extract! }
end

.statistics ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/interface.rb', line 20

# Collects counters describing the extraction status of all documents.
#
# A document counts as "supported" when its +page_extraction_doctype+ matches
# (case-insensitively, via ILIKE) any of the VALIDATE_DOC_TYPES entries.
# One additional COUNT query is issued per entry of EXTRACTING_STATES.
#
# @return [Hash] with the keys :total, :supported_documents,
#   :unsupported_documents and :states (state name => number of documents)
def self.statistics
  total = document_class.count

  # Builds the list of quoted LIKE patterns, e.g. '%txt%','%pdf%'.
  like_patterns = VALIDATE_DOC_TYPES.map { |doc_type| "'%#{doc_type}%'" }.join(',')
  supported = document_class.where("page_extraction_doctype ILIKE ANY (array[#{like_patterns}])").count

  per_state = EXTRACTING_STATES.each_with_object({}) do |(state, value), counts|
    counts[state] = document_class.where(page_extraction_state: value).count
  end

  {
    total: total,
    supported_documents: supported,
    unsupported_documents: total - supported,
    states: per_state
  }
end

Instance Method Details

#add_error(e) ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/tools.rb', line 44

# Appends a description of +e+ to the accumulated extraction error text
# (@pages_extraction_errors).
#
# Messages listed in ERRORS are recorded as-is. Any other exception is recorded
# as "Class, message" followed by the leading backtrace lines
# (indices 0..ERROR_BACKTRACE_LINES).
#
# @param e [Exception] the error to record
# @return [String] the accumulated error text
def add_error(e)
  if ERRORS.values.include?(e.message)
    @pages_extraction_errors << "#{e.message}\n\n"
  else
    # An exception that was instantiated but never raised has a nil backtrace;
    # Array() keeps the error handler itself from raising NoMethodError.
    backtrace = Array(e.backtrace)[0..ERROR_BACKTRACE_LINES].join("\n")
    @pages_extraction_errors << "#{e.class}, #{e.message}\n#{backtrace}\n"
  end
end

#cleanup_pages ⇒ `Object`



40
41
42

# File 'lib/act_as_page_extractor/modules/tools.rb', line 40

# Removes the pages stored by a previous extraction run by calling
# +destroy_all+ on +extracted_pages+.
#
# @return [Object] the result of +destroy_all+
def cleanup_pages
  extracted_pages.destroy_all
end

#convert_to_pdf ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/extracting.rb', line 14

# Determines the PDF to extract text from and stores its path in @pdf_path.
#
# * A document whose extension is already "pdf" (case-insensitive) is used
#   directly.
# * Any other document is converted with Docsplit (wrapped in
#   +timeout_wrapper+). The converted file is looked up under the document's
#   own path with the extension replaced by "pdf"; @pdf_path becomes nil when
#   the conversion reports failure or that file does not exist.
#
# Any StandardError is recorded through +add_error+ instead of propagating;
# @pdf_path is left unchanged in that case.
#
# @return [String, nil, Object] the new @pdf_path, or the result of
#   +add_error+ when an error was rescued
def convert_to_pdf
  extension = @document_path.split('.').last.downcase

  @pdf_path =
    if extension == 'pdf'
      @document_path
    elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
      converted = [*@document_path.split('.')[0..-2], 'pdf'].join('.')
      converted if File.exist?(converted)
    end
rescue StandardError => e
  add_error(e)
end

#convert_to_text ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/extracting.rb', line 27

# Reads the page count of @pdf_path into @pdf_pages and asks Docsplit to write
# the text of all pages (OCR disabled) into @tmp_dir.
#
# When the Docsplit call (run through +timeout_wrapper+) returns a falsy value,
# @pdf_pages is reset to nil and the "unknown Docsplit error" is raised.
# Every StandardError, including that one, is recorded via +add_error+ rather
# than propagated.
#
# @return [nil, Object] nil on the normal paths, or the result of +add_error+
#   when an error was rescued
def convert_to_text
  @pdf_pages = PdfUtils.info(@pdf_path).pages
  return unless @pdf_pages

  text_extracted = timeout_wrapper do
    Docsplit.extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
  end
  return if text_extracted

  # :nocov:
  @pdf_pages = nil
  raise ERRORS[:unknown_docsplit_error]
  # :nocov:
rescue StandardError => e
  add_error(e)
end

#debug_info ⇒ `Object`

:nocov:

# File 'lib/act_as_page_extractor/modules/tools.rb', line 53

# Debugging hook called from the +ensure+ section of +page_extract!+.
#
# Every statement is commented out, so the method currently does nothing and
# returns nil. Uncomment the lines below to print the working paths and the
# detected page count.
def debug_info
  # ap '@tmp_dir'
  # ap @tmp_dir
  # ap '@copy_document_path'
  # ap @copy_document_path
  # ap '@document_path'
  # ap @document_path
  # ap '@pdf_path'
  # ap @pdf_path
  # ap '@pdf_pages'
  # ap @pdf_pages
end

#extract_pages ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/extracting.rb', line 9

# Converts the working document to PDF and then splits that PDF into text
# pages, by calling +convert_to_pdf+ followed by +convert_to_text+.
#
# Both steps rescue StandardError themselves (recording it via +add_error+),
# so +convert_to_text+ is still invoked when the PDF conversion failed.
#
# @return [Object] the return value of +convert_to_text+
def extract_pages
  convert_to_pdf
  convert_to_text
end

#initialized ⇒ `Object`

# File 'lib/act_as_page_extractor.rb', line 62

# Resets the per-run extraction state and then calls +create_pdf_dir+.
#
# Clears the accumulated error text and the pending state override so that a
# new run does not inherit values from a previous one.
#
# @return [Object] the return value of +create_pdf_dir+
def initialized
  @pages_extraction_errors = ''
  @page_extraction_state = nil

  # TODO: register the required callbacks (e.g. remove the stored PDF when the
  #   record is destroyed).
  # TODO: document the setup in the README:
  #   rails g act_as_page_extractor:migration Document category_id user_id
  #   and, in the [Document] model:
  #   has_many :extracted_pages, dependent: :destroy
  create_pdf_dir
end

#is_extracted ⇒ `Object`

:nocov:



17
18
19

# File 'lib/act_as_page_extractor/modules/tools.rb', line 17

# Tells whether the extraction produced exactly one stored page per PDF page.
#
# @return [Boolean] false when @pdf_pages is nil or not positive; otherwise
#   whether +extracted_pages.count+ equals @pdf_pages
def is_extracted
  return false unless @pdf_pages.to_i.positive?

  extracted_pages.count == @pdf_pages
end

#origin_file_name ⇒ `Object`



2
3
4

# File 'lib/act_as_page_extractor/modules/interface.rb', line 2

# Returns the last path segment of the URL reported by +extracted_filename+.
#
# @return [String, nil] the text after the final "/" of the URL, or nil when
#   the URL is blank
def origin_file_name
  file_url = send(:extracted_filename).url.to_s
  file_url.split('/').last
end

#page_extract! ⇒ `Object`

# File 'lib/act_as_page_extractor.rb', line 75

# Runs the whole extraction pipeline for this document.
#
# Order of operations:
# 1. +initialized+ resets the per-run state, +cleanup_pages+ deletes pages from
#    an earlier run, then +create_tmp_dir+ is called (defined elsewhere;
#    presumably prepares @tmp_dir — confirm).
# 2. The document is copied (+copy_document+, defined elsewhere) and, if it is
#    an archive, unpacked.
# 3. Only when +valid_document+ is truthy are the pages extracted and saved.
#
# The +ensure+ section runs whether or not step 2 or 3 raised: it persists the
# resulting state, optionally stores the PDF, and calls +finish+ (defined
# elsewhere; presumably removes temporary files — confirm). Exceptions are not
# rescued here and still propagate to the caller. Errors raised in step 1
# happen before the +begin+ block, so the +ensure+ section does not run for
# them.
def page_extract!
  initialized
  cleanup_pages
  create_tmp_dir
  begin
    copy_document
    unzip_document
    if valid_document
      extract_pages
      save_to_db
    end
  ensure
    # update_state comes first so the saved state reflects this run before any
    # further step in this section gets a chance to fail.
    update_state
    save_pdf
    debug_info
    finish
  end
end

#pdf_path ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/interface.rb', line 6

# Path of the stored PDF copy of a converted document.
#
# A path is only returned for documents in the "extracted" state whose
# +page_extraction_doctype+ is not "pdf" (case-insensitive). The file name is
# the part of +origin_file_name+ before its first dot, with ".pdf" appended.
#
# @return [String, nil] "<pdf_storage>/<name>.pdf", or nil when the conditions
#   above are not met
def pdf_path
  return unless page_extraction_state == EXTRACTING_STATES[:extracted]
  return if page_extraction_doctype&.downcase == 'pdf'

  "#{pdf_storage}/#{origin_file_name.split('.').first}.pdf"
end

#remove_files ⇒ `Object`



12
13
14

# File 'lib/act_as_page_extractor/modules/interface.rb', line 12

# Deletes the stored PDF copy reported by +pdf_path+, if that file exists.
#
# @return [Object, nil] the result of FileUtils.rm_rf, or nil when there is
#   nothing to delete
def remove_files
  stored_pdf = pdf_path
  FileUtils.rm_rf(stored_pdf) if File.exist?(stored_pdf.to_s)
end

#remove_last_byte(file_name) ⇒ `Object`

Workaround for OpenOffice/JODConverter: removes the trailing form-feed byte ("\f") from a converted text page.

# File 'lib/act_as_page_extractor/modules/saving.rb', line 38

# Workaround for OpenOffice/JODConverter output: strips a trailing form feed
# ("\f") from the given text file, in place.
#
# The file is opened in 'a+' mode, so a missing file is created empty rather
# than raising. The block form of File.open guarantees the handle is closed
# even when seeking, reading or truncating raises.
#
# @param file_name [String] path of the text page to fix
# @return [nil]
def remove_last_byte(file_name)
  File.open(file_name, 'a+') do |file|
    size = file.size
    next if size.zero?

    # Look only at the final byte; drop it when it is a form feed.
    file.seek(size - 1)
    file.truncate(size - 1) if file.getc == "\f"
  end
  nil
end

#save_pdf ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/saving.rb', line 2

# Copies the converted PDF (@pdf_path) into +pdf_storage+.
#
# The copy happens only when all of the following hold: +save_as_pdf+ is
# truthy, the extraction succeeded (+is_extracted+), the source document is not
# itself a PDF, and a converted PDF path is available.
#
# @return [nil]
def save_pdf
  return unless save_as_pdf
  return unless is_extracted
  return if @document_path.split('.').last&.downcase == 'pdf'
  return unless @pdf_path

  FileUtils.cp(@pdf_path, pdf_storage)
end

#save_to_db ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/saving.rb', line 13

# Stores the text of every extracted page as an ExtractedPage record.
#
# Marks the document as "extracting", then, inside one ExtractedPage
# transaction, reads "<tmp_dir>/<name>_<n>.txt" for each page number n in
# 1..@pdf_pages (name = @document_filename up to its first dot). Each page file
# first has a trailing form feed removed, and the characters < > & U+0001
# U+25A0 and BEL are stripped from its text before saving. Nothing is created
# when @pdf_pages is nil.
#
# Each record receives the page text, its 1-based number, this document's id
# under the +extracted_document_id+ key, and the current value of every
# attribute named in +additional_fields+.
#
# @return [Object] the result of ExtractedPage.transaction
def save_to_db
  update(page_extraction_state: EXTRACTING_STATES[:extracting])

  ExtractedPage.transaction do
    @pdf_pages&.times&.each do |index|
      page_number = index + 1
      text_file = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{page_number}.txt"
      remove_last_byte(text_file)
      text = IO.read(text_file).delete("<>&\u0001\u25A0\a")

      attributes = {
        page: text,
        page_number: page_number,
        extracted_document_id => id
      }
      additional_fields.each { |field| attributes[field] = send(field.to_sym) }

      ExtractedPage.create(attributes)
    end
  end
end

#timeout_wrapper ⇒ `Object`

:nocov:

# File 'lib/act_as_page_extractor/modules/tools.rb', line 5

# Runs the given block under the EXTRACTION_TIMEOUT limit.
#
# Any StandardError raised by the block — including Timeout::Error when the
# limit is exceeded — is recorded through +add_error+ and swallowed.
#
# The method returns nil in that case so callers can test the result for
# success. (Returning the value from an +ensure+ clause does not work: Ruby
# discards it, so the truthy result of +add_error+ would be returned instead
# and a failed run would look successful.)
#
# @yield the operation to run
# @return [Object, nil] the block's value, or nil when an error was recorded
def timeout_wrapper
  Timeout.timeout(EXTRACTION_TIMEOUT) { yield }
rescue StandardError => e
  add_error(e)
  nil
end

#unzip_document ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/unzipping.rb', line 2

# Chooses the file to extract from and stores its path in @document_path.
#
# The copied upload is used as-is when its extension is one of
# VALIDATE_DOC_TYPES. Otherwise it must be a supported archive
# (+validate_compress_types+ records an error if not). When the archive
# unpacks successfully to exactly one file, that file is moved to
# "<tmp_dir>/<original base name>.<unpacked extension>" and becomes the
# document. In every other case @document_path keeps pointing at the copy.
#
# @return [Integer, nil] the result of File.rename when a file was unpacked,
#   otherwise nil
def unzip_document
  @document_path = @copy_document_path

  extension = @document_path.split('.').last&.downcase
  return if VALIDATE_DOC_TYPES.include?(extension)
  return unless validate_compress_types

  archive = TotalCompressor.decompress(@copy_document_path)
  return unless archive[:success] && archive[:files].length == 1

  base_name = @origin_document_path.split('/').last.split('.').first
  unpacked_file = archive[:files].first
  unpacked_extension = unpacked_file.split('/').last.split('.').last
  @document_path = "#{@tmp_dir}/#{base_name}.#{unpacked_extension}"
  File.rename(unpacked_file, @document_path)
end

#update_state ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/tools.rb', line 21

# Persists the outcome of the extraction run on the document record.
#
# On success (+is_extracted+) the state becomes "extracted" and the page count
# is stored. Otherwise the state is the override set during validation
# (@page_extraction_state) or, failing that, "error_extraction", with a page
# count of 0. The document type (extension), a human-readable file size and the
# accumulated error text are stored in both cases.
#
# This method is called from the +ensure+ section of +page_extract!+, possibly
# before @document_path was ever assigned or after the file has gone. The file
# size is therefore only computed when the file exists, and stored as nil
# otherwise, so that the state is still saved instead of raising here.
#
# @return [Object] the result of +update+
def update_state
  state_attributes =
    if is_extracted
      {
        page_extraction_state: EXTRACTING_STATES[:extracted],
        page_extraction_pages: @pdf_pages
      }
    else
      {
        page_extraction_state: @page_extraction_state || EXTRACTING_STATES[:error_extraction],
        page_extraction_pages: 0
      }
    end

  document_present = @document_path && File.exist?(@document_path)
  filesize = Filesize.from("#{File.size(@document_path)} B").pretty if document_present

  update(
    state_attributes.merge(
      page_extraction_doctype: @document_path&.split('.')&.last,
      page_extraction_filesize: filesize,
      pages_extraction_errors: @pages_extraction_errors.chomp
    )
  )
end

#valid_document ⇒ `Object`



2
3
4

# File 'lib/act_as_page_extractor/modules/validating.rb', line 2

# Checks that the document passes both the size and the document type
# validation. The type check is skipped when the size check already failed.
#
# @return [Boolean] the failed size check result, otherwise the result of
#   +validate_doc_types+
def valid_document
  size_ok = validate_size
  return size_ok unless size_ok

  validate_doc_types
end

#validate_compress_types ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/unzipping.rb', line 19

# Checks whether the copied upload has a supported archive extension
# (VALIDATE_COMPRESS_TYPES, compared case-insensitively).
#
# On failure the pending state is set to "error_doctype" and that state name,
# followed by a space, is appended to the error text.
#
# @return [Boolean] true when the extension is a supported archive type
def validate_compress_types
  extension = @copy_document_path.split('.').last&.downcase
  return true if VALIDATE_COMPRESS_TYPES.include?(extension)

  @page_extraction_state = EXTRACTING_STATES[:error_doctype]
  @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
  false
end

#validate_doc_types ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/validating.rb', line 18

# Checks whether the working document has a supported document extension
# (VALIDATE_DOC_TYPES, compared case-insensitively).
#
# On failure the pending state is set to "error_doctype" and that state name,
# followed by a space, is appended to the error text.
#
# @return [Boolean] true when the extension is a supported document type
def validate_doc_types
  extension = @document_path.split('.').last&.downcase
  return true if VALIDATE_DOC_TYPES.include?(extension)

  @page_extraction_state = EXTRACTING_STATES[:error_doctype]
  @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
  false
end

#validate_size ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/validating.rb', line 6

# Checks that the copied upload is no larger than 20 MiB.
#
# On failure the pending state is set to "error_filesize" and that state name,
# followed by a space, is appended to the error text.
#
# @return [Boolean] true when the file size is within the limit
def validate_size
  max_bytes = 20 * (2**20)
  return true if File.size(@copy_document_path) <= max_bytes

  @page_extraction_state = EXTRACTING_STATES[:error_filesize]
  @pages_extraction_errors << "#{EXTRACTING_STATES[:error_filesize]} "
  false
end

Module: ActAsPageExtractor

Overview

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.start_extraction ⇒ Object

.statistics ⇒ Object

Instance Method Details

#add_error(e) ⇒ Object

#cleanup_pages ⇒ Object

#convert_to_pdf ⇒ Object

#convert_to_text ⇒ Object

#debug_info ⇒ Object

#extract_pages ⇒ Object

#initialized ⇒ Object

#is_extracted ⇒ Object

#origin_file_name ⇒ Object

#page_extract! ⇒ Object

#pdf_path ⇒ Object

#remove_files ⇒ Object

#remove_last_byte(file_name) ⇒ Object

#save_pdf ⇒ Object

#save_to_db ⇒ Object

#timeout_wrapper ⇒ Object

#unzip_document ⇒ Object

#update_state ⇒ Object

#valid_document ⇒ Object

#validate_compress_types ⇒ Object

#validate_doc_types ⇒ Object

#validate_size ⇒ Object

.start_extraction ⇒ `Object`

.statistics ⇒ `Object`

#add_error(e) ⇒ `Object`

#cleanup_pages ⇒ `Object`

#convert_to_pdf ⇒ `Object`

#convert_to_text ⇒ `Object`

#debug_info ⇒ `Object`

#extract_pages ⇒ `Object`

#initialized ⇒ `Object`

#is_extracted ⇒ `Object`

#origin_file_name ⇒ `Object`

#page_extract! ⇒ `Object`

#pdf_path ⇒ `Object`

#remove_files ⇒ `Object`

#remove_last_byte(file_name) ⇒ `Object`

#save_pdf ⇒ `Object`

#save_to_db ⇒ `Object`

#timeout_wrapper ⇒ `Object`

#unzip_document ⇒ `Object`

#update_state ⇒ `Object`

#valid_document ⇒ `Object`

#validate_compress_types ⇒ `Object`

#validate_doc_types ⇒ `Object`

#validate_size ⇒ `Object`