Module: ActAsPageExtractor

Extended by:
ActiveSupport::Concern
Defined in:
lib/act_as_page_extractor/version.rb,
lib/act_as_page_extractor.rb,
lib/act_as_page_extractor/modules/tools.rb,
lib/act_as_page_extractor/modules/saving.rb,
lib/act_as_page_extractor/modules/interface.rb,
lib/act_as_page_extractor/modules/unzipping.rb,
lib/act_as_page_extractor/modules/extracting.rb,
lib/act_as_page_extractor/modules/validating.rb,
lib/generators/act_as_page_extractor/migration_generator.rb

Overview

:nocov:

Defined Under Namespace

Modules: ClassMethods, Generators

Constant Summary collapse

# Working directory of the process at the moment this constant is evaluated,
# as a String.
DEFAULT_ROOT_FOLDER =
Dir.pwd.to_s
# Messages of known failures. add_error records an exception whose message is
# listed here without its class name or backtrace.
ERRORS =
{
  unknown_docsplit_error: 'Unknown Docsplit error'
}.freeze
# Bounds the portion of an exception backtrace that add_error records.
ERROR_BACKTRACE_LINES =
15
# Values written to the page_extraction_state attribute of a document.
EXTRACTING_STATES =
{
  new: 'new',
  extracting: 'extracting',
  extracted: 'extracted',
  error_doctype: 'error_doctype',
  error_extraction: 'error_extraction',
  error_filesize: 'error_filesize'
}.freeze
EXTRACTION_TIMEOUT =

Timeout in seconds (5 minutes)

60*5
# File extensions that validate_compress_types accepts as archives.
VALIDATE_COMPRESS_TYPES =
['zip', 'rar', '7z', 'gzip'].freeze
# File extensions that validate_doc_types accepts as extractable documents;
# statistics also uses them to count supported documents.
VALIDATE_DOC_TYPES =
['txt', 'pdf', 'doc', 'docx',
'rtf', 'odt', 'htm', 'html'].freeze
# Version string of this module.
VERSION =
"0.7.3"

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.start_extractionObject



16
17
18
# File 'lib/act_as_page_extractor/modules/interface.rb', line 16

# Calls page_extract! on every document whose page_extraction_state equals
# EXTRACTING_STATES[:new].
def self.start_extraction
  pending_documents = document_class.where(page_extraction_state: EXTRACTING_STATES[:new])
  pending_documents.each { |document| document.page_extract! }
end

.statisticsObject



20
21
22
23
24
25
26
27
28
29
# File 'lib/act_as_page_extractor/modules/interface.rb', line 20

# Builds a summary Hash of document counts:
#   :total                 - all documents
#   :supported_documents   - documents whose page_extraction_doctype matches
#                            one of VALIDATE_DOC_TYPES (case-insensitive ILIKE)
#   :unsupported_documents - total minus supported
#   :states                - count per EXTRACTING_STATES key
def self.statistics
  total = document_class.count

  doctype_patterns = VALIDATE_DOC_TYPES.map { |doctype| "'%#{doctype}%'" }.join(',')
  supported = document_class.where("page_extraction_doctype ILIKE ANY (array[#{doctype_patterns}])").count

  per_state = EXTRACTING_STATES.each_with_object({}) do |(state, value), counts|
    counts[state] = document_class.where(page_extraction_state: value).count
  end

  {
    total: total,
    supported_documents: supported,
    unsupported_documents: total - supported,
    states: per_state
  }
end

Instance Method Details

#add_error(e) ⇒ Object



44
45
46
47
48
49
50
# File 'lib/act_as_page_extractor/modules/tools.rb', line 44

# Appends a description of exception +e+ to @pages_extraction_errors.
#
# A message listed in ERRORS is recorded on its own. Any other exception is
# recorded as "Class, message" followed by at most ERROR_BACKTRACE_LINES
# backtrace lines.
#
# e - the exception to record.
#
# Returns the updated @pages_extraction_errors String.
def add_error(e)
  if ERRORS.value?(e.message)
    @pages_extraction_errors << "#{e.message}\n\n"
  else
    # Exception#backtrace is nil for an exception that was never raised, and
    # first(n) keeps exactly n lines (the former 0..n slice kept n + 1).
    backtrace = Array(e.backtrace).first(ERROR_BACKTRACE_LINES)
    @pages_extraction_errors << "#{e.class}, #{e.message}\n#{backtrace.join("\n")}\n"
  end
end

#cleanup_pagesObject



40
41
42
# File 'lib/act_as_page_extractor/modules/tools.rb', line 40

# Destroys every record in this document's extracted_pages collection.
def cleanup_pages
  pages = extracted_pages
  pages.destroy_all
end

#convert_to_pdfObject



14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 14

# Sets @pdf_path to a PDF version of @document_path.
#
# A document whose extension is already "pdf" (any case) is used as is.
# Otherwise Docsplit.extract_pdf is run through timeout_wrapper and @pdf_path
# becomes the sibling "<name>.pdf" path if that file exists, else nil.
# Any StandardError is handed to add_error and @pdf_path is left untouched.
def convert_to_pdf
  extension = @document_path.split('.').last.downcase
  @pdf_path =
    if extension == 'pdf'
      @document_path
    elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
      converted_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
      File.exist?(converted_path) ? converted_path : nil
    end
rescue StandardError => e
  add_error(e)
end

#convert_to_textObject



27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 27

# Reads the page count of @pdf_path into @pdf_pages and, when there is one,
# runs Docsplit.extract_text (no OCR, all pages, output to @tmp_dir) through
# timeout_wrapper.
#
# A falsy result from the wrapper resets @pdf_pages to nil and raises
# ERRORS[:unknown_docsplit_error]; that, like any other StandardError, is
# handed to add_error.
def convert_to_text
  @pdf_pages = PdfUtils.info(@pdf_path).pages
  return unless @pdf_pages

  text_extracted = timeout_wrapper do
    Docsplit.extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
  end
  return if text_extracted

  # :nocov:
  @pdf_pages = nil
  raise ERRORS[:unknown_docsplit_error]
  # :nocov:
rescue StandardError => e
  add_error(e)
end

#debug_infoObject

:nocov:



53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/act_as_page_extractor/modules/tools.rb', line 53

# Debugging hook called at the end of page_extract!. It currently does
# nothing: every dump statement below is commented out.
def debug_info
  # ap "@tmp_dir"
  # ap @tmp_dir
  # ap "@copy_document_path"
  # ap @copy_document_path
  # ap "@document_path"
  # ap @document_path
  # ap "@pdf_path"
  # ap @pdf_path
  # ap "@pdf_pages"
  # ap @pdf_pages
end

#extract_pagesObject



9
10
11
12
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 9

# Runs the two conversion steps in order: convert_to_pdf sets @pdf_path, which
# convert_to_text then reads. Returns whatever convert_to_text returns.
def extract_pages
  convert_to_pdf
  convert_to_text
end

#initializedObject



62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/act_as_page_extractor.rb', line 62

# Resets the per-run state (@page_extraction_state and the accumulated
# @pages_extraction_errors text) and then calls create_pdf_dir.
def initialized
  @page_extraction_state = nil
  @pages_extraction_errors = ''
  # TODO: add all needed callbacks
  #   - on destroy, remove the pdf

  # TODO: add to the README:
  #   rails g act_as_page_extractor:migration Document category_id user_id
  #   and add to the [Document] model:
  #   has_many :extracted_pages, dependent: :destroy
  create_pdf_dir
end

#is_extractedObject

:nocov:



17
18
19
# File 'lib/act_as_page_extractor/modules/tools.rb', line 17

# True when @pdf_pages is a positive count and the number of stored
# extracted_pages equals it; false otherwise.
def is_extracted
  return false unless @pdf_pages.to_i.positive?

  extracted_pages.count == @pdf_pages
end

#origin_file_nameObject



2
3
4
# File 'lib/act_as_page_extractor/modules/interface.rb', line 2

# Last path segment of the URL exposed by the object that extracted_filename
# returns; nil when that URL is empty.
def origin_file_name
  uploader = send(:extracted_filename)
  uploader.url.to_s.split('/').last
end

#page_extract!Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/act_as_page_extractor.rb', line 75

# Runs the whole extraction pipeline for this record.
#
# Preparation (initialized, cleanup_pages, create_tmp_dir) happens outside the
# begin block, so an error there skips the ensure section entirely.
def page_extract!
  initialized
  cleanup_pages
  create_tmp_dir
  begin
    copy_document
    unzip_document
    # Pages are extracted and stored only for documents that pass validation.
    if valid_document
      extract_pages
      save_to_db
    end
  ensure
    # Always executed, even when a step above raises: the outcome is recorded
    # first, then the converted PDF is kept when applicable.
    # NOTE(review): finish is defined outside this view; presumably it removes
    # the temporary files, so an exception raised by the three calls before it
    # would skip that cleanup. Confirm.
    update_state
    save_pdf
    debug_info
    finish
  end
end

#pdf_pathObject



6
7
8
9
10
# File 'lib/act_as_page_extractor/modules/interface.rb', line 6

# Path of the stored PDF conversion of this document, or nil.
#
# A path is returned only when the document is in the "extracted" state and
# its recorded doctype is not "pdf" (any case). The file name is the part of
# origin_file_name before its first dot, with a ".pdf" extension.
def pdf_path
  return unless page_extraction_state == EXTRACTING_STATES[:extracted]
  return if page_extraction_doctype&.downcase == 'pdf'

  "#{pdf_storage}/#{origin_file_name.split('.').first}.pdf"
end

#remove_filesObject



12
13
14
# File 'lib/act_as_page_extractor/modules/interface.rb', line 12

# Deletes the stored PDF conversion, if pdf_path points at an existing file.
def remove_files
  stored_pdf = pdf_path
  FileUtils.rm_rf(stored_pdf) if File.exist?(stored_pdf.to_s)
end

#remove_last_byte(file_name) ⇒ Object

Workaround for OpenOffice/JODConverter: removes the trailing form-feed byte ("\f") that the converter leaves at the end of a converted text page.



38
39
40
41
42
43
44
45
46
# File 'lib/act_as_page_extractor/modules/saving.rb', line 38

# Strips a trailing form-feed byte ("\f") from the file at +file_name+.
#
# The file is opened in 'a+' mode, so a missing file is created empty rather
# than raising. Files that are empty or do not end in a form feed are left
# unchanged.
#
# file_name - path of the text page to fix.
#
# Returns nil.
def remove_last_byte(file_name)
  # Block form closes the handle even when seek/getbyte/truncate raises.
  File.open(file_name, 'a+') do |file|
    size = file.size
    next if size.zero?

    file.seek(size - 1)
    # Compare the raw byte so the check does not depend on the file encoding.
    file.truncate(size - 1) if file.getbyte == "\f".ord
  end
  nil
end

#save_pdfObject



2
3
4
5
6
7
8
9
10
11
# File 'lib/act_as_page_extractor/modules/saving.rb', line 2

# Copies the converted PDF (@pdf_path) into pdf_storage.
#
# Nothing is copied unless save_as_pdf is enabled, the extraction succeeded
# (is_extracted), the source document is not itself a PDF and a converted PDF
# path is known. Returns nil.
def save_pdf
  return unless save_as_pdf
  return unless is_extracted
  return if @document_path.split('.').last&.downcase == 'pdf'
  return unless @pdf_path

  FileUtils.cp(@pdf_path, pdf_storage)
end

#save_to_dbObject



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/act_as_page_extractor/modules/saving.rb', line 13

# Marks the document as "extracting" and stores one ExtractedPage per PDF
# page inside a single ExtractedPage transaction.
#
# For page N the text is read from
# "<@tmp_dir>/<@document_filename up to its first dot>_N.txt" after
# remove_last_byte has been applied to that file; the characters
# < > & U+0001 U+25A0 and BEL are removed from the text. Each record also gets
# this document's id under the extracted_document_id key and a copy of every
# attribute named in additional_fields.
def save_to_db
  update(page_extraction_state: EXTRACTING_STATES[:extracting])

  ExtractedPage.transaction do
    @pdf_pages&.times do |index|
      page_number = index + 1
      text_file = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{page_number}.txt"
      remove_last_byte(text_file)
      text = IO.read(text_file).delete("<" ">" "&" "\u0001" "\u25A0" "\a")

      attributes = { page: text, page_number: page_number }
      attributes[extracted_document_id] = id
      additional_fields.each { |field| attributes[field] = send(field.to_sym) }

      ExtractedPage.create(attributes)
    end
  end
end

#timeout_wrapperObject

:nocov:



5
6
7
8
9
10
11
12
13
14
# File 'lib/act_as_page_extractor/modules/tools.rb', line 5

# Runs the given block under a time limit of EXTRACTION_TIMEOUT seconds.
#
# Returns the block's value on success. When the block raises a StandardError
# (including Timeout::Error) the error is recorded through add_error and nil
# is returned, so callers can test the result for truthiness.
def timeout_wrapper
  Timeout.timeout(EXTRACTION_TIMEOUT) { yield }
rescue StandardError => e
  add_error(e)
  # add_error returns the (truthy) error log; without this explicit nil a
  # failed or timed-out conversion would look like a success to the caller.
  nil
end

#unzip_documentObject



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/act_as_page_extractor/modules/unzipping.rb', line 2

# Points @document_path at the file to extract from.
#
# A copy whose extension is in VALIDATE_DOC_TYPES is used directly. Otherwise,
# if validate_compress_types accepts it, the copy is decompressed with
# TotalCompressor; when that yields exactly one file, the file is renamed to
# "<@tmp_dir>/<original name up to its first dot>.<unpacked extension>" and
# becomes @document_path. In every other case @document_path stays the copy.
def unzip_document
  @document_path = @copy_document_path

  extension = @document_path.split('.').last&.downcase
  return if VALIDATE_DOC_TYPES.include?(extension)
  return unless validate_compress_types

  result = TotalCompressor.decompress(@copy_document_path)
  return unless result[:success] && result[:files].length == 1

  base_name = @origin_document_path.split("/").last.split('.').first
  unpacked_file = result[:files].first
  unpacked_format = unpacked_file.split('/').last.split('.').last
  @document_path = "#{@tmp_dir}/#{base_name}.#{unpacked_format}"
  File.rename(unpacked_file, @document_path)
end

#update_stateObject



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/act_as_page_extractor/modules/tools.rb', line 21

# Persists the outcome of the current extraction run on the record.
#
# On success the state becomes EXTRACTING_STATES[:extracted] with the page
# count; otherwise the state recorded during validation is used (falling back
# to EXTRACTING_STATES[:error_extraction]) with a page count of 0. The
# document extension, a pretty-printed file size and the collected error text
# are stored in both cases.
#
# Returns the result of update.
def update_state
  outcome =
    if is_extracted
      {
        page_extraction_state: EXTRACTING_STATES[:extracted],
        page_extraction_pages: @pdf_pages
      }
    else
      {
        page_extraction_state: @page_extraction_state || EXTRACTING_STATES[:error_extraction],
        page_extraction_pages: 0
      }
    end

  # This method runs from page_extract!'s ensure section, where
  # @document_path may still be nil or point at a file that no longer exists.
  # Report 0 bytes in that case instead of raising and masking the real error.
  document_present = @document_path && File.exist?(@document_path)
  size_in_bytes = document_present ? File.size(@document_path) : 0

  update(
    outcome.merge(
      page_extraction_doctype: @document_path&.split('.')&.last,
      page_extraction_filesize: Filesize.from("#{size_in_bytes} B").pretty,
      pages_extraction_errors: @pages_extraction_errors.chomp
    )
  )
end

#valid_documentObject



2
3
4
# File 'lib/act_as_page_extractor/modules/validating.rb', line 2

# True when the document passes both the size check and the document-type
# check. The type check is skipped once the size check fails.
def valid_document
  size_ok = validate_size
  size_ok && validate_doc_types
end

#validate_compress_typesObject



19
20
21
22
23
24
25
26
27
28
# File 'lib/act_as_page_extractor/modules/unzipping.rb', line 19

# Checks that the extension of @copy_document_path (any case) is listed in
# VALIDATE_COMPRESS_TYPES.
#
# On failure the run is flagged with EXTRACTING_STATES[:error_doctype] and the
# same text is appended to @pages_extraction_errors. Returns true or false.
def validate_compress_types
  extension = @copy_document_path.split('.').last&.downcase
  return true if VALIDATE_COMPRESS_TYPES.include?(extension)

  @page_extraction_state = EXTRACTING_STATES[:error_doctype]
  @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
  false
end

#validate_doc_typesObject



18
19
20
21
22
23
24
25
26
27
# File 'lib/act_as_page_extractor/modules/validating.rb', line 18

# Checks that the extension of @document_path (any case) is listed in
# VALIDATE_DOC_TYPES.
#
# On failure the run is flagged with EXTRACTING_STATES[:error_doctype] and the
# same text is appended to @pages_extraction_errors. Returns true or false.
def validate_doc_types
  extension = @document_path.split('.').last&.downcase
  return true if VALIDATE_DOC_TYPES.include?(extension)

  @page_extraction_state = EXTRACTING_STATES[:error_doctype]
  @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
  false
end

#validate_sizeObject



6
7
8
9
10
11
12
13
14
15
16
# File 'lib/act_as_page_extractor/modules/validating.rb', line 6

# Checks that the file at @copy_document_path is at most 20 MiB.
#
# On failure the run is flagged with EXTRACTING_STATES[:error_filesize] and
# the same text is appended to @pages_extraction_errors. Returns true or false.
def validate_size
  max_bytes = 20 * (2**20)
  return true if File.size(@copy_document_path) <= max_bytes

  @page_extraction_state = EXTRACTING_STATES[:error_filesize]
  @pages_extraction_errors << "#{EXTRACTING_STATES[:error_filesize]} "
  false
end