Module: ActAsPageExtractor
- Extended by:
- ActiveSupport::Concern
- Defined in:
- lib/act_as_page_extractor/version.rb,
lib/act_as_page_extractor.rb,
lib/act_as_page_extractor/modules/tools.rb,
lib/act_as_page_extractor/modules/saving.rb,
lib/act_as_page_extractor/modules/interface.rb,
lib/act_as_page_extractor/modules/unzipping.rb,
lib/act_as_page_extractor/modules/extracting.rb,
lib/act_as_page_extractor/modules/validating.rb,
lib/generators/act_as_page_extractor/migration_generator.rb
Overview
Defined Under Namespace
Modules: ClassMethods, Generators
Constant Summary
collapse
- DEFAULT_ROOT_FOLDER =
Dir.pwd.to_s
- ERRORS =
{
unknown_docsplit_error: 'Unknown Docsplit error'
}.freeze
- ERROR_BACKTRACE_LINES =
15
- EXTRACTING_STATES =
{
new: 'new',
extracting: 'extracting',
extracted: 'extracted',
error_doctype: 'error_doctype',
error_extraction: 'error_extraction',
error_filesize: 'error_filesize'
}.freeze
- EXTRACTION_TIMEOUT =
60*5
- VALIDATE_COMPRESS_TYPES =
['zip', 'rar', '7z', 'gzip'].freeze
- VALIDATE_DOC_TYPES =
['txt', 'pdf', 'doc', 'docx',
'rtf', 'odt', 'htm', 'html'].freeze
- VERSION =
"0.7.3"
Class Method Summary
collapse
Instance Method Summary
collapse
Class Method Details
16
17
18
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 16
# Class-level entry point: selects every record of document_class whose
# page_extraction_state equals EXTRACTING_STATES[:new] and calls
# page_extract! on each of them in turn.
# NOTE(review): the method name after `self.` is missing from this listing;
# restore it from lib/act_as_page_extractor/modules/interface.rb, line 16.
def self.
document_class.where(page_extraction_state: EXTRACTING_STATES[:new]).each(&:page_extract!)
end
|
.statistics ⇒ Object
20
21
22
23
24
25
26
27
28
29
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 20
# Builds a summary of extraction progress for document_class.
#
# Returns a Hash with:
#   :total                 - count of all documents
#   :supported_documents   - documents whose page_extraction_doctype matches
#                            (ILIKE) any entry of VALIDATE_DOC_TYPES
#   :unsupported_documents - total minus supported
#   :states                - Hash of state name => number of documents whose
#                            page_extraction_state equals that state's value
def self.statistics
  all_count = document_class.count

  # One quoted '%ext%' pattern per supported type, comma separated.
  like_patterns = VALIDATE_DOC_TYPES.map { |ext| "'%#{ext}%'" }.join(',')
  supported_count = document_class
                    .where("page_extraction_doctype ILIKE ANY (array[#{like_patterns}])")
                    .count

  per_state = EXTRACTING_STATES.each_with_object({}) do |(state, value), counts|
    counts[state] = document_class.where(page_extraction_state: value).count
  end

  {
    total: all_count,
    supported_documents: supported_count,
    unsupported_documents: all_count - supported_count,
    states: per_state
  }
end
|
Instance Method Details
#add_error(e) ⇒ Object
44
45
46
47
48
49
50
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 44
# Appends a textual description of the exception +e+ to
# @pages_extraction_errors and returns that accumulator string.
#
# * If the message is one of the known ERRORS values, only the message is
#   recorded (followed by a blank line).
# * Otherwise the class, the message and at most ERROR_BACKTRACE_LINES lines
#   of the backtrace are recorded.
#
# e - the exception to record.
def add_error(e)
  entry =
    if ERRORS.values.include?(e.message)
      "#{e.message}\n\n"
    else
      # Array() tolerates exceptions that were never raised (backtrace is nil);
      # first(N) keeps exactly N lines, whereas the range 0..N kept N + 1.
      trace = Array(e.backtrace).first(ERROR_BACKTRACE_LINES)
      "#{e.class}, #{e.message}\n#{trace.join("\n")}\n"
    end

  @pages_extraction_errors << entry
end
|
#cleanup_pages ⇒ Object
40
41
42
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 40
# Calls destroy_all on a collection reached from self.
# NOTE(review): the identifier between `self.` and `.destroy_all` is missing
# from this listing; restore it from
# lib/act_as_page_extractor/modules/tools.rb, line 40.
def cleanup_pages
self..destroy_all
end
|
#convert_to_pdf ⇒ Object
14
15
16
17
18
19
20
21
22
23
24
25
|
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 14
# Assigns @pdf_path.
#
# * When the last dot-separated segment of @document_path is "pdf"
#   (case-insensitive), @document_path itself is used.
# * Otherwise a Docsplit call is run through timeout_wrapper with
#   output: @tmp_dir; when it returns a truthy value, the path obtained by
#   swapping the last dot-separated segment for "pdf" is used, but only if
#   that file exists. In every other case @pdf_path becomes nil.
#
# Any StandardError raised here is handed to add_error instead of propagating.
# NOTE(review): the method name after `Docsplit.` is missing from this listing.
def convert_to_pdf
@pdf_path = if 'pdf' == @document_path.split('.').last.downcase
@document_path
else
if timeout_wrapper{ Docsplit.(@document_path, output: @tmp_dir)}
pdf_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
pdf_path if File.exist?(pdf_path)
end
end
rescue StandardError => e
add_error(e)
end
|
#convert_to_text ⇒ Object
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 27
# Stores the page count reported by PdfUtils.info(@pdf_path) in @pdf_pages and,
# when that count is truthy, runs a Docsplit call through timeout_wrapper
# (ocr: false, pages: 'all', output: @tmp_dir).
#
# A falsy result from timeout_wrapper resets @pdf_pages to nil and raises
# ERRORS[:unknown_docsplit_error]; that error, like any other StandardError
# raised here, is handed to add_error by the method-level rescue.
# NOTE(review): the method name after `Docsplit::` is missing from this listing.
def convert_to_text
@pdf_pages = PdfUtils.info(@pdf_path).pages
if @pdf_pages
if timeout_wrapper{ Docsplit::(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir) }
else
@pdf_pages = nil
raise ERRORS[:unknown_docsplit_error]
end
end
rescue StandardError => e
add_error(e)
end
|
#debug_info ⇒ Object
53
54
55
56
57
58
59
60
61
62
63
64
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 53
# Debugging hook. The body shown in this listing is empty, so calling it does
# nothing and returns nil.
def debug_info; end
|
9
10
11
12
|
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 9
# Runs the two conversion steps in order: convert_to_pdf, then convert_to_text.
# NOTE(review): the method name after `def` is missing from this listing;
# restore it from lib/act_as_page_extractor/modules/extracting.rb, line 9.
def
convert_to_pdf
convert_to_text
end
|
#initialized ⇒ Object
62
63
64
65
66
67
68
69
70
71
72
73
|
# File 'lib/act_as_page_extractor.rb', line 62
# Resets the per-run extraction bookkeeping (error accumulator and state) and
# then calls create_pdf_dir, whose result is returned.
def initialized
  @pages_extraction_errors = ''
  @page_extraction_state = nil
  create_pdf_dir
end
|
17
18
19
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 17
# Predicate: true when @pdf_pages (via to_i) is positive and a count reached
# from self equals @pdf_pages.
# NOTE(review): both the method name after `def` and the identifier between
# `self.` and `.count` are missing from this listing; restore them from
# lib/act_as_page_extractor/modules/tools.rb, line 17.
def
@pdf_pages.to_i > 0 && self..count == @pdf_pages
end
|
#origin_file_name ⇒ Object
2
3
4
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 2
# Returns the last "/"-separated segment of the URL exposed by the object that
# extracted_filename returns, or nil when that URL is empty.
def origin_file_name
  uploader = send(:extracted_filename)
  url_segments = uploader.url.to_s.split('/')
  url_segments.last
end
|
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
# File 'lib/act_as_page_extractor.rb', line 75
# Orchestrates one extraction run: initialized, cleanup_pages and
# create_tmp_dir first; then copy_document and unzip_document, followed by
# save_to_db when valid_document is truthy.
#
# The ensure section always runs update_state, save_pdf, debug_info and
# finish, even when an earlier step raises.
# NOTE(review): the method name after `def` is missing from this listing
# (presumably page_extract!, which the class-level code calls — confirm).
# The listing is also one line shorter than its stated range (75-92), so a
# call may have been dropped; check lib/act_as_page_extractor.rb.
def
initialized
cleanup_pages
create_tmp_dir
begin
copy_document
unzip_document
if valid_document
save_to_db
end
ensure
update_state
save_pdf
debug_info
finish
end
end
|
#pdf_path ⇒ Object
6
7
8
9
10
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 6
# Returns "#{pdf_storage}/<origin_file_name up to its first dot>.pdf" when the
# condition below holds, and nil otherwise.
# NOTE(review): the left-hand operands of `==` and of `&.downcase` are missing
# from this listing; restore them from
# lib/act_as_page_extractor/modules/interface.rb, line 6.
def pdf_path
if == EXTRACTING_STATES[:extracted] && &.downcase != 'pdf'
"#{pdf_storage}/#{origin_file_name.split('.').first}.pdf"
end
end
|
#remove_files ⇒ Object
12
13
14
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 12
# Deletes the file (or directory tree) at pdf_path when something exists at
# that path; does nothing and returns nil otherwise.
def remove_files
  return unless File.exist?(pdf_path.to_s)

  FileUtils.rm_rf(pdf_path)
end
|
#remove_last_byte(file_name) ⇒ Object
fix for openoffice/jodconverter: delete last ugly byte in converted text page
38
39
40
41
42
43
44
45
46
|
# File 'lib/act_as_page_extractor/modules/saving.rb', line 38
# Removes a single trailing form-feed character ("\f") from the file at
# +file_name+, in place. Files that are empty or end with anything else are
# left untouched.
#
# The file is opened in 'a+' mode, so a missing file is created empty. The
# block form of File.open guarantees the handle is closed even when seeking,
# reading or truncating raises.
#
# file_name - path of the text page to fix.
#
# Returns nil.
def remove_last_byte(file_name)
  File.open(file_name, 'a+') do |file|
    size = file.size
    next if size.zero?

    # Look only at the final byte position of the file.
    file.seek(size - 1)
    file.truncate(size - 1) if file.getc == "\f"
  end
  nil
end
|
#save_pdf ⇒ Object
2
3
4
5
6
7
8
9
10
11
|
# File 'lib/act_as_page_extractor/modules/saving.rb', line 2
# Copies the file at @pdf_path into pdf_storage when save_as_pdf is truthy,
# the last dot-separated segment of @document_path is not "pdf"
# (case-insensitive) and @pdf_path is set.
# NOTE(review): the middle operand of the `&&` chain is missing from this
# listing; restore it from lib/act_as_page_extractor/modules/saving.rb, line 2.
def save_pdf
if save_as_pdf &&
&&
@document_path.split('.').last&.downcase != 'pdf'
if @pdf_path
FileUtils.cp(@pdf_path, pdf_storage)
end
end
end
|
#save_to_db ⇒ Object
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
# File 'lib/act_as_page_extractor/modules/saving.rb', line 13
# Persists the extracted text pages as ExtractedPage records.
#
# First marks this record as EXTRACTING_STATES[:extracting]. Then, inside one
# ExtractedPage transaction, for each page index below @pdf_pages (nothing is
# iterated when @pdf_pages is nil):
#   * builds the page file path
#     "#{@tmp_dir}/<@document_filename up to its first dot>_<page number>.txt",
#   * strips a trailing form feed via remove_last_byte,
#   * reads the file and deletes the characters <, >, &, U+0001, U+25A0 and
#     "\a" (the adjacent string literals form a single character set),
#   * creates an ExtractedPage with the content, the 1-based page number, this
#     record's id and the value of every entry in additional_fields.
# NOTE(review): the key inside `page_attributes[]` is missing from this
# listing; restore it from lib/act_as_page_extractor/modules/saving.rb, line 13.
def save_to_db
self.update(page_extraction_state: EXTRACTING_STATES[:extracting])
ExtractedPage.transaction do
@pdf_pages&.times&.each do |pdf_page|
page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{(pdf_page + 1).to_s}.txt"
remove_last_byte(page_filename)
content = IO.read(page_filename).delete("<" ">" "&" "\u0001" "\u25A0" "\a")
page_attributes = {
page: content,
page_number: pdf_page + 1
}
page_attributes[] = self.id
additional_fields.each do |additional_field|
page_attributes[additional_field] = self.send(additional_field.to_sym)
end
ExtractedPage.create(page_attributes)
end
end
end
|
#timeout_wrapper ⇒ Object
5
6
7
8
9
10
11
12
13
14
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 5
# Runs the given block under a time limit of EXTRACTION_TIMEOUT seconds.
#
# Returns the block's value on success. When the block raises a StandardError
# (including Timeout::Error), the error is recorded through add_error and nil
# is returned, so callers that test the result for truthiness see the failure.
#
# The previous version placed `result` inside an `ensure` clause; the value of
# an ensure clause is discarded, so on failure the method returned whatever
# add_error returned (a truthy value) and callers treated failures as success.
def timeout_wrapper
  Timeout.timeout(EXTRACTION_TIMEOUT) { yield }
rescue StandardError => e
  add_error(e)
  nil
end
|
#unzip_document ⇒ Object
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
# File 'lib/act_as_page_extractor/modules/unzipping.rb', line 2
# Decides which file the rest of the pipeline works on and stores its path in
# @document_path.
#
# * @document_path starts out as @copy_document_path; when its extension is
#   already one of VALIDATE_DOC_TYPES nothing else happens.
# * Otherwise, if validate_compress_types accepts the file, it is decompressed
#   with TotalCompressor. When that succeeds with exactly one file, the file is
#   renamed to "#{@tmp_dir}/<origin base name>.<unpacked extension>" and
#   @document_path points at the renamed file.
def unzip_document
  @document_path = @copy_document_path
  extension = @document_path.split('.').last&.downcase
  return if VALIDATE_DOC_TYPES.include?(extension)
  return unless validate_compress_types

  archive = TotalCompressor.decompress(@copy_document_path)
  return unless archive[:success] && archive[:files].length == 1

  # Keep the uploaded document's base name, but take the unpacked extension.
  base_name = @origin_document_path.split("/").last.split('.').first
  extracted_file = archive[:files].first
  extracted_ext = extracted_file.split('/').last.split('.').last
  @document_path = "#{@tmp_dir}/#{base_name}.#{extracted_ext}"
  File.rename(extracted_file, @document_path)
end
|
#update_state ⇒ Object
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 21
# Writes the outcome of the extraction run back to this record via update.
#
# One branch records EXTRACTING_STATES[:extracted] together with @pdf_pages;
# the other records @page_extraction_state (falling back to
# EXTRACTING_STATES[:error_extraction]) with a page count of 0. Both are merged
# with the document type (last dot-separated segment of @document_path), a
# human-readable file size and the accumulated error text without its trailing
# newline.
# NOTE(review): the condition after `if` is missing from this listing; restore
# it from lib/act_as_page_extractor/modules/tools.rb, line 21.
# NOTE(review): the doctype line is nil-safe (`&.`) but File.size(@document_path)
# is not — confirm @document_path can never be nil when this runs.
def update_state
updated_attributes = if
{
page_extraction_state: EXTRACTING_STATES[:extracted],
page_extraction_pages: @pdf_pages
}
else
{
page_extraction_state: @page_extraction_state || EXTRACTING_STATES[:error_extraction],
page_extraction_pages: 0
}
end.merge({
page_extraction_doctype: @document_path&.split('.')&.last,
page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty,
pages_extraction_errors: @pages_extraction_errors.chomp
})
self.update(updated_attributes)
end
|
#valid_document ⇒ Object
2
3
4
|
# File 'lib/act_as_page_extractor/modules/validating.rb', line 2
# Runs the size check and, only when it passes, the document-type check.
# Returns the falsy result of validate_size as is, otherwise whatever
# validate_doc_types returns.
def valid_document
  size_check = validate_size
  return size_check unless size_check

  validate_doc_types
end
|
#validate_compress_types ⇒ Object
19
20
21
22
23
24
25
26
27
28
|
# File 'lib/act_as_page_extractor/modules/unzipping.rb', line 19
# Checks whether the extension of @copy_document_path (case-insensitive) is one
# of VALIDATE_COMPRESS_TYPES.
#
# Returns true when it is. Otherwise sets @page_extraction_state to the
# error_doctype state, appends that state (plus a space) to
# @pages_extraction_errors and returns false.
def validate_compress_types
  extension = @copy_document_path.split('.').last&.downcase
  return true if VALIDATE_COMPRESS_TYPES.include?(extension)

  @page_extraction_state = EXTRACTING_STATES[:error_doctype]
  @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
  false
end
|
#validate_doc_types ⇒ Object
18
19
20
21
22
23
24
25
26
27
|
# File 'lib/act_as_page_extractor/modules/validating.rb', line 18
# Checks whether the extension of @document_path (case-insensitive) is one of
# VALIDATE_DOC_TYPES.
#
# Returns true when it is. Otherwise sets @page_extraction_state to the
# error_doctype state, appends that state (plus a space) to
# @pages_extraction_errors and returns false.
def validate_doc_types
  extension = @document_path.split('.').last&.downcase
  return true if VALIDATE_DOC_TYPES.include?(extension)

  @page_extraction_state = EXTRACTING_STATES[:error_doctype]
  @pages_extraction_errors << "#{EXTRACTING_STATES[:error_doctype]} "
  false
end
|
#validate_size ⇒ Object
6
7
8
9
10
11
12
13
14
15
16
|
# File 'lib/act_as_page_extractor/modules/validating.rb', line 6
# Checks that the file at @copy_document_path is no larger than 20 MiB.
#
# Returns true when it is within the limit. Otherwise sets
# @page_extraction_state to the error_filesize state, appends that state (plus
# a space) to @pages_extraction_errors and returns false.
def validate_size
  max_bytes = 20 * (2**20)
  return true if File.size(@copy_document_path) <= max_bytes

  @page_extraction_state = EXTRACTING_STATES[:error_filesize]
  @pages_extraction_errors << "#{EXTRACTING_STATES[:error_filesize]} "
  false
end
|