Class: FormatParser::ZIPParser

Inherits:
Object
  • Object
show all
Includes:
IOUtils, OfficeFormats
Defined in:
lib/parsers/zip_parser.rb

Defined Under Namespace

Modules: OfficeFormats Classes: FileReader

Constant Summary collapse

ZIP_MIME_TYPE =
'application/zip'

Constants included from IOUtils

IOUtils::INTEGER_DIRECTIVES

Constants included from OfficeFormats

OfficeFormats::OFFICE_MARKER_FILES

Instance Method Summary collapse

Methods included from IOUtils

#read_bytes, #read_fixed_point, #read_int, #safe_read, #safe_skip, #skip_bytes

Methods included from OfficeFormats

#office_document?, #office_file_format_and_mime_type_from_entry_set

Instance Method Details

#call(io) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/parsers/zip_parser.rb', line 14

# Parses +io+ as a ZIP archive and describes its contents.
#
# Every entry reported by FileReader#read_zip_structure becomes a
# FormatParser::Archive::Entry. If the set of decoded filenames is
# recognised by office_document?, the archive is reported as a document
# with the office-specific format and MIME type; otherwise it is
# reported as a plain ZIP archive.
#
# @param io the input to parse; it is wrapped in FormatParser::IOConstraint
# @return [FormatParser::Archive, nil] nil when FileReader::Error is raised
def call(io)
  constrained_io = FormatParser::IOConstraint.new(io)
  safe_read(constrained_io, 1) # Ensure the file is not empty

  zip_entries = FileReader.new.read_zip_structure(io: constrained_io)

  seen_names = Set.new
  archive_entries = zip_entries.map do |zip_entry|
    # The directory check runs on the filename before it is decoded.
    entry_type = directory?(zip_entry) ? :directory : :file
    name = decode_filename_of(zip_entry)
    seen_names << name
    FormatParser::Archive::Entry.new(type: entry_type, size: zip_entry.uncompressed_size, filename: name)
  end

  unless office_document?(seen_names)
    return FormatParser::Archive.new(nature: :archive, format: :zip, entries: archive_entries, content_type: ZIP_MIME_TYPE)
  end

  office_format, mime_type = office_file_format_and_mime_type_from_entry_set(seen_names)
  FormatParser::Archive.new(nature: :document, format: office_format, entries: archive_entries, content_type: mime_type)
rescue FileReader::Error
  # This is not a ZIP, or a broken ZIP.
  nil
end

#decode_filename(filename, likely_unicode:) ⇒ Object



48
49
50
51
# File 'lib/parsers/zip_parser.rb', line 48

# Converts a raw entry filename into lossy UTF-8.
#
# When +likely_unicode+ is truthy, the bytes are tagged as UTF-8 before being
# handed to FormatParser.string_to_lossy_utf8. The tagging is done on a copy,
# so the caller's string keeps its original encoding and frozen strings are
# accepted.
#
# @param filename [String] the filename bytes as read from the archive
# @param likely_unicode [Boolean] whether the bytes should be treated as UTF-8
# @return [Object] the result of FormatParser.string_to_lossy_utf8
def decode_filename(filename, likely_unicode:)
  # force_encoding mutates its receiver, so work on a duplicate.
  filename = filename.dup.force_encoding(Encoding::UTF_8) if likely_unicode
  FormatParser.string_to_lossy_utf8(filename)
end

#decode_filename_of(zip_entry) ⇒ Object



53
54
55
56
57
58
59
60
61
# File 'lib/parsers/zip_parser.rb', line 53

# Decodes the filename of +zip_entry+, using the entry's general-purpose
# flags to decide whether the name can be treated as UTF-8.
#
# @param zip_entry an object responding to +gp_flags+ and +filename+
# @return [Object] the result of decode_filename for the entry's filename
def decode_filename_of(zip_entry)
  # The EFS bit (bit 11) of the general-purpose flags marks the entry
  # filename as UTF-8.
  efs_mask = 1 << 11
  utf8_flagged = (zip_entry.gp_flags & efs_mask) == efs_mask
  decode_filename(zip_entry.filename, likely_unicode: utf8_flagged)
end

#directory?(zip_entry) ⇒ Boolean

Returns:

  • (Boolean)


40
41
42
43
44
45
46
# File 'lib/parsers/zip_parser.rb', line 40

# Tells whether +zip_entry+ denotes a directory.
#
# The external attributes and the originating OS type of the entry are
# deliberately not inspected; the only signal used is a trailing slash
# on the entry filename.
#
# @param zip_entry an object responding to +filename+
# @return [Boolean]
def directory?(zip_entry)
  entry_name = zip_entry.filename
  entry_name.end_with?('/')
end

#likely_match?(filename) ⇒ Boolean

Returns:

  • (Boolean)


10
11
12
# File 'lib/parsers/zip_parser.rb', line 10

# Tells whether +filename+ carries an extension this parser is likely to
# handle (.zip, .docx, .keynote, .numbers, .pptx or .xlsx, case-insensitive).
#
# The extension must be at the very end of the string (\z), so a name with
# an embedded newline such as "a.zip\n.exe" is not accepted. The result is a
# strict true/false rather than a match offset.
#
# @param filename [String, nil] the filename to test; nil yields false
# @return [Boolean]
def likely_match?(filename)
  /\.(zip|docx|keynote|numbers|pptx|xlsx)\z/i.match?(filename)
end