"""
Read and write ZIP files.

XXX references to utf-8 need further investigation.
"""
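# A minimal usage sketch (illustrative only; 'archive.zip' and 'member.txt'
# are hypothetical names, not part of this module):
#
#     from zipfile import ZipFile, ZIP_DEFLATED
#     with ZipFile('archive.zip', 'w', compression=ZIP_DEFLATED) as zf:
#         zf.writestr('member.txt', 'hello world')
#     with ZipFile('archive.zip') as zf:
#         print(zf.namelist())          # ['member.txt']
#         print(zf.read('member.txt'))  # b'hello world'
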
import binascii
import importlib.util
import io
import itertools
import os
import posixpath
import shutil
import stat
import struct
import sys
import threading
import time
import contextlib
import pathlib

try:
    import zlib  # We may need its compression method
    crc32 = zlib.crc32
except ImportError:
    zlib = None
    crc32 = binascii.crc32

try:
    import bz2  # We may need its compression method
except ImportError:
    bz2 = None

try:
    import lzma  # We may need its compression method
except ImportError:
    lzma = None

__all__ = ["BadZipFile", "BadZipfile", "error",
           "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA",
           "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile",
           "Path"]

class BadZipFile(Exception):
    pass


class LargeZipFile(Exception):
    """
    Raised when writing a zipfile that would require ZIP64 extensions
    while those extensions are disabled.
    """

error = BadZipfile = BadZipFile  # Pre-3.2 compatibility names


ZIP64_LIMIT = (1 << 31) - 1
ZIP_FILECOUNT_LIMIT = (1 << 16) - 1
ZIP_MAX_COMMENT = (1 << 16) - 1

# constants for Zip file compression methods
ZIP_STORED = 0
ZIP_DEFLATED = 8
ZIP_BZIP2 = 12
ZIP_LZMA = 14
# Other ZIP compression methods not supported

DEFAULT_VERSION = 20
ZIP64_VERSION = 45
BZIP2_VERSION = 46
LZMA_VERSION = 63
# we recognize (but not necessarily support) all features up to that version
MAX_EXTRACT_VERSION = 63

# Below are some formats and associated data for reading/writing headers using
# the struct module. The names and structures of headers/records are those used
# in the PKWARE description of the ZIP file format:
# http://www.pkware.com/documents/casestudies/APPNOTE.TXT
# (URL valid as of January 2008)

# The "end of central directory" structure, magic number, size, and indices
# (section V.I in the format document)
structEndArchive = b"<4s4H2LH"
stringEndArchive = b"PK\005\006"
sizeEndCentDir = struct.calcsize(structEndArchive)

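# For illustration (an editorial note, not part of the original module): an
# archive with no members and no comment consists solely of this fixed-size
# record, so
#
#     struct.pack(structEndArchive, stringEndArchive, 0, 0, 0, 0, 0, 0, 0)
#
# produces the canonical 22-byte "empty zip" (sizeEndCentDir == 22).
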
_ECD_SIGNATURE = 0
_ECD_DISK_NUMBER = 1
_ECD_DISK_START = 2
_ECD_ENTRIES_THIS_DISK = 3
_ECD_ENTRIES_TOTAL = 4
_ECD_SIZE = 5
_ECD_OFFSET = 6
_ECD_COMMENT_SIZE = 7
# These last two indices are not part of the structure as defined in the
# spec, but they are used internally by this module as a convenience
_ECD_COMMENT = 8
_ECD_LOCATION = 9

# The "central directory" structure, magic number, size, and indices
# of entries in the structure (section V.F in the format document)
structCentralDir = "<4s4B4HL2L5H2L"
stringCentralDir = b"PK\001\002"
sizeCentralDir = struct.calcsize(structCentralDir)

# indexes of entries in the central directory structure
_CD_SIGNATURE = 0
_CD_CREATE_VERSION = 1
_CD_CREATE_SYSTEM = 2
_CD_EXTRACT_VERSION = 3
_CD_EXTRACT_SYSTEM = 4
_CD_FLAG_BITS = 5
_CD_COMPRESS_TYPE = 6
_CD_TIME = 7
_CD_DATE = 8
_CD_CRC = 9
_CD_COMPRESSED_SIZE = 10
_CD_UNCOMPRESSED_SIZE = 11
_CD_FILENAME_LENGTH = 12
_CD_EXTRA_FIELD_LENGTH = 13
_CD_COMMENT_LENGTH = 14
_CD_DISK_NUMBER_START = 15
_CD_INTERNAL_FILE_ATTRIBUTES = 16
_CD_EXTERNAL_FILE_ATTRIBUTES = 17
_CD_LOCAL_HEADER_OFFSET = 18

# The "local file header" structure, magic number, size, and indices
# (section V.A in the format document)
structFileHeader = "<4s2B4HL2L2H"
stringFileHeader = b"PK\003\004"
sizeFileHeader = struct.calcsize(structFileHeader)

_FH_SIGNATURE = 0
_FH_EXTRACT_VERSION = 1
_FH_EXTRACT_SYSTEM = 2
_FH_GENERAL_PURPOSE_FLAG_BITS = 3
_FH_COMPRESSION_METHOD = 4
_FH_LAST_MOD_TIME = 5
_FH_LAST_MOD_DATE = 6
_FH_CRC = 7
_FH_COMPRESSED_SIZE = 8
_FH_UNCOMPRESSED_SIZE = 9
_FH_FILENAME_LENGTH = 10
_FH_EXTRA_FIELD_LENGTH = 11

# The "Zip64 end of central directory locator" structure, magic number, and size
structEndArchive64Locator = "<4sLQL"
stringEndArchive64Locator = b"PK\x06\x07"
sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator)

# The "Zip64 end of central directory" record, magic number, size, and indices
# (section V.G in the format document)
structEndArchive64 = "<4sQ2H2L4Q"
stringEndArchive64 = b"PK\x06\x06"
sizeEndCentDir64 = struct.calcsize(structEndArchive64)

_CD64_SIGNATURE = 0
_CD64_DIRECTORY_RECSIZE = 1
_CD64_CREATE_VERSION = 2
_CD64_EXTRACT_VERSION = 3
_CD64_DISK_NUMBER = 4
_CD64_DISK_NUMBER_START = 5
_CD64_NUMBER_ENTRIES_THIS_DISK = 6
_CD64_NUMBER_ENTRIES_TOTAL = 7
_CD64_DIRECTORY_SIZE = 8
_CD64_OFFSET_START_CENTDIR = 9

_DD_SIGNATURE = 0x08074b50

_EXTRA_FIELD_STRUCT = struct.Struct('<HH')

def _strip_extra(extra, xids):
    # Remove Extra Fields with specified IDs.
    unpack = _EXTRA_FIELD_STRUCT.unpack
    modified = False
    buffer = []
    start = i = 0
    while i + 4 <= len(extra):
        xid, xlen = unpack(extra[i : i + 4])
        j = i + 4 + xlen
        if xid in xids:
            if i != start:
                buffer.append(extra[start : i])
            start = j
            modified = True
        i = j
    if not modified:
        return extra
    # Keep whatever follows the last stripped field; without this, a
    # non-stripped field that comes after a stripped one would be dropped.
    buffer.append(extra[start:])
    return b''.join(buffer)

def _check_zipfile(fp):
    try:
        if _EndRecData(fp):
            return True         # file has correct magic number
    except OSError:
        pass
    return False

def is_zipfile(filename):
    """Quickly see if a file is a ZIP file by checking the magic number.

    The filename argument may be a file or file-like object too.
    """
    result = False
    try:
        if hasattr(filename, "read"):
            result = _check_zipfile(fp=filename)
        else:
            with open(filename, "rb") as fp:
                result = _check_zipfile(fp)
    except OSError:
        pass
    return result

def _EndRecData64(fpin, offset, endrec):
    """
    Read the ZIP64 end-of-archive records and use that to update endrec
    """
    try:
        fpin.seek(offset - sizeEndCentDir64Locator, 2)
    except OSError:
        # If the seek fails, the file is not large enough to contain a ZIP64
        # end-of-archive record, so just return the end record we were given.
        return endrec

    data = fpin.read(sizeEndCentDir64Locator)
    if len(data) != sizeEndCentDir64Locator:
        return endrec
    sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)
    if sig != stringEndArchive64Locator:
        return endrec

    if diskno != 0 or disks > 1:
        raise BadZipFile("zipfiles that span multiple disks are not supported")

    # Assume no 'zip64 extensible data'
    fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2)
    data = fpin.read(sizeEndCentDir64)
    if len(data) != sizeEndCentDir64:
        return endrec
    sig, sz, create_version, read_version, disk_num, disk_dir, \
        dircount, dircount2, dirsize, diroffset = \
        struct.unpack(structEndArchive64, data)
    if sig != stringEndArchive64:
        return endrec

    # Update the original endrec using data from the ZIP64 record
    endrec[_ECD_SIGNATURE] = sig
    endrec[_ECD_DISK_NUMBER] = disk_num
    endrec[_ECD_DISK_START] = disk_dir
    endrec[_ECD_ENTRIES_THIS_DISK] = dircount
    endrec[_ECD_ENTRIES_TOTAL] = dircount2
    endrec[_ECD_SIZE] = dirsize
    endrec[_ECD_OFFSET] = diroffset
    return endrec


def _EndRecData(fpin):
    """Return data from the "End of Central Directory" record, or None.

    The data is a list of the nine items in the ZIP "End of central dir"
    record followed by a tenth item, the file seek offset of this record."""

    # Determine file size
    fpin.seek(0, 2)
    filesize = fpin.tell()

    # Check to see if this is ZIP file with no archive comment (the
    # "end of central directory" structure should be the last item in the
    # file if this is the case).
    try:
        fpin.seek(-sizeEndCentDir, 2)
    except OSError:
        return None
    data = fpin.read()
    if (len(data) == sizeEndCentDir and
        data[0:4] == stringEndArchive and
        data[-2:] == b"\000\000"):
        # the signature is correct and there's no comment, unpack structure
        endrec = struct.unpack(structEndArchive, data)
        endrec=list(endrec)

        # Append a blank comment and record start offset
        endrec.append(b"")
        endrec.append(filesize - sizeEndCentDir)

        # Try to read the "Zip64 end of central directory" structure
        return _EndRecData64(fpin, -sizeEndCentDir, endrec)

    # Either this is not a ZIP file, or it is a ZIP file with an archive
    # comment. Search the end of the file for the "end of central directory"
    # record signature. The comment is the last item in the ZIP file and may be
    # up to 64K long. It is assumed that the "end of central directory" magic
    # number does not appear in the comment.
    maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0)
    fpin.seek(maxCommentStart, 0)
    data = fpin.read()
    start = data.rfind(stringEndArchive)
    if start >= 0:
        # found the magic number; attempt to unpack and interpret
        recData = data[start:start+sizeEndCentDir]
        if len(recData) != sizeEndCentDir:
            # Zip file is corrupted.
            return None
        endrec = list(struct.unpack(structEndArchive, recData))
        commentSize = endrec[_ECD_COMMENT_SIZE]  #as claimed by the zip file
        comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize]
        endrec.append(comment)
        endrec.append(maxCommentStart + start)

        # Try to read the "Zip64 end of central directory" structure
        return _EndRecData64(fpin, maxCommentStart + start - filesize,
                             endrec)

    # Unable to find a valid end of central directory structure
    return None


class ZipInfo (object):
    """Class with attributes describing each file in the ZIP archive."""

    __slots__ = (
        'orig_filename',
        'filename',
        'date_time',
        'compress_type',
        '_compresslevel',
        'comment',
        'extra',
        'create_system',
        'create_version',
        'extract_version',
        'reserved',
        'flag_bits',
        'volume',
        'internal_attr',
        'external_attr',
        'header_offset',
        'CRC',
        'compress_size',
        'file_size',
        '_raw_time',
    )

    def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
        self.orig_filename = filename   # Original file name in archive

        # Terminate the file name at the first null byte. Null bytes in file
        # names are used as tricks by viruses in archives.
        null_byte = filename.find(chr(0))
        if null_byte >= 0:
            filename = filename[0:null_byte]
        # This is used to ensure paths in generated ZIP files always use
        # forward slashes as the directory separator, as required by the
        # ZIP format specification.
        if os.sep != "/" and os.sep in filename:
            filename = filename.replace(os.sep, "/")

        self.filename = filename        # Normalized file name
        self.date_time = date_time      # year, month, day, hour, min, sec

        if date_time[0] < 1980:
            raise ValueError('ZIP does not support timestamps before 1980')

        # Standard values:
        self.compress_type = ZIP_STORED # Type of compression for the file
        self._compresslevel = None      # Level for the compressor
        self.comment = b""              # Comment for each file
        self.extra = b""                # ZIP extra data
        if sys.platform == 'win32':
            self.create_system = 0      # System which created ZIP archive
        else:
            # Assume everything else is unix-y
            self.create_system = 3      # System which created ZIP archive
        self.create_version = DEFAULT_VERSION   # Version which created ZIP archive
        self.extract_version = DEFAULT_VERSION  # Version needed to extract archive
        self.reserved = 0               # Must be zero
        self.flag_bits = 0              # ZIP flag bits
        self.volume = 0                 # Volume number of file header
        self.internal_attr = 0          # Internal attributes
        self.external_attr = 0          # External file attributes
        self.compress_size = 0          # Size of the compressed file
        self.file_size = 0              # Size of the uncompressed file
        # Other attributes are set by class ZipFile:
        # header_offset         Byte offset to the file header
        # CRC                   CRC-32 of the uncompressed file

    def __repr__(self):
        result = ['<%s filename=%r' % (self.__class__.__name__, self.filename)]
        if self.compress_type != ZIP_STORED:
            result.append(' compress_type=%s' %
                          compressor_names.get(self.compress_type,
                                               self.compress_type))
        hi = self.external_attr >> 16
        lo = self.external_attr & 0xFFFF
        if hi:
            result.append(' filemode=%r' % stat.filemode(hi))
        if lo:
            result.append(' external_attr=%#x' % lo)
        isdir = self.is_dir()
        if not isdir or self.file_size:
            result.append(' file_size=%r' % self.file_size)
        if ((not isdir or self.compress_size) and
            (self.compress_type != ZIP_STORED or
             self.file_size != self.compress_size)):
            result.append(' compress_size=%r' % self.compress_size)
        result.append('>')
        return ''.join(result)

    def FileHeader(self, zip64=None):
        """Return the per-file header as a bytes object."""
        dt = self.date_time
        dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
        dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
        if self.flag_bits & 0x08:
            # Set these to zero because we write them after the file data
            CRC = compress_size = file_size = 0
        else:
            CRC = self.CRC
            compress_size = self.compress_size
            file_size = self.file_size

        extra = self.extra

        min_version = 0
        if zip64 is None:
            zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT
        if zip64:
            fmt = '<HHQQ'
            extra = extra + struct.pack(fmt,
                                        1, struct.calcsize(fmt)-4, file_size, compress_size)
        if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT:
            if not zip64:
                raise LargeZipFile("Filesize would require ZIP64 extensions")
            # File is larger than what fits into a 4 byte integer,
            # fall back to the ZIP64 extension
            file_size = 0xffffffff
            compress_size = 0xffffffff
            min_version = ZIP64_VERSION

        if self.compress_type == ZIP_BZIP2:
            min_version = max(BZIP2_VERSION, min_version)
        elif self.compress_type == ZIP_LZMA:
            min_version = max(LZMA_VERSION, min_version)

        self.extract_version = max(min_version, self.extract_version)
        self.create_version = max(min_version, self.create_version)
        filename, flag_bits = self._encodeFilenameFlags()
        header = struct.pack(structFileHeader, stringFileHeader,
                             self.extract_version, self.reserved, flag_bits,
                             self.compress_type, dostime, dosdate, CRC,
                             compress_size, file_size,
                             len(filename), len(extra))
        return header + filename + extra

    def _encodeFilenameFlags(self):
        try:
            return self.filename.encode('ascii'), self.flag_bits
        except UnicodeEncodeError:
            return self.filename.encode('utf-8'), self.flag_bits | 0x800

    def _decodeExtra(self):
        # Try to decode the extra field.
        extra = self.extra
        unpack = struct.unpack
        while len(extra) >= 4:
            tp, ln = unpack('<HH', extra[:4])
            if ln+4 > len(extra):
                raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln))
            if tp == 0x0001:
                data = extra[4:ln+4]
                # ZIP64 extension (large files and/or large archives)
                try:
                    if self.file_size in (0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF):
                        field = "File size"
                        self.file_size, = unpack('<Q', data[:8])
                        data = data[8:]
                    if self.compress_size == 0xFFFF_FFFF:
                        field = "Compress size"
                        self.compress_size, = unpack('<Q', data[:8])
                        data = data[8:]
                    if self.header_offset == 0xFFFF_FFFF:
                        field = "Header offset"
                        self.header_offset, = unpack('<Q', data[:8])
                except struct.error:
                    raise BadZipFile(f"Corrupt zip64 extra field. "
                                     f"{field} not found.") from None

            extra = extra[ln+4:]

    @classmethod
    def from_file(cls, filename, arcname=None, *, strict_timestamps=True):
        """Construct an appropriate ZipInfo for a file on the filesystem.

        filename should be the path to a file or directory on the filesystem.

        arcname is the name which it will have within the archive (by default,
        this will be the same as filename, but without a drive letter and with
        leading path separators removed).
        """
        if isinstance(filename, os.PathLike):
            filename = os.fspath(filename)
        st = os.stat(filename)
        isdir = stat.S_ISDIR(st.st_mode)
        mtime = time.localtime(st.st_mtime)
        date_time = mtime[0:6]
        if not strict_timestamps and date_time[0] < 1980:
            date_time = (1980, 1, 1, 0, 0, 0)
        elif not strict_timestamps and date_time[0] > 2107:
            date_time = (2107, 12, 31, 23, 59, 59)
        # Create ZipInfo instance to store file information
        if arcname is None:
            arcname = filename
        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
        while arcname[0] in (os.sep, os.altsep):
            arcname = arcname[1:]
        if isdir:
            arcname += '/'
        zinfo = cls(arcname, date_time)
        zinfo.external_attr = (st.st_mode & 0xFFFF) << 16  # Unix attributes
        if isdir:
            zinfo.file_size = 0
            zinfo.external_attr |= 0x10  # MS-DOS directory flag
        else:
            zinfo.file_size = st.st_size

        return zinfo

    def is_dir(self):
        """Return True if this archive member is a directory."""
        return self.filename[-1] == '/'

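# An illustrative sketch of ZipInfo.from_file() (hypothetical paths, not part
# of the original module):
#
#     info = ZipInfo.from_file('data/report.txt', arcname='report.txt')
#     info.compress_type = ZIP_DEFLATED
#     with ZipFile('out.zip', 'w') as zf, open('data/report.txt', 'rb') as f:
#         zf.writestr(info, f.read())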

# ZIP encryption uses the CRC32 one-byte primitive for scrambling some
# internal keys. We noticed that a direct implementation is faster than
# relying on binascii.crc32().

_crctable = None
def _gen_crc(crc):
    for j in range(8):
        if crc & 1:
            crc = (crc >> 1) ^ 0xEDB88320
        else:
            crc >>= 1
    return crc

# ZIP supports a password-based form of encryption. Even though known
# plaintext attacks have been found against it, it is still useful
# to be able to get data out of such a file.
#
# Usage:
#     zd = _ZipDecrypter(mypwd)
#     plain_bytes = zd(cypher_bytes)

def _ZipDecrypter(pwd):
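    # The three initial key values below are those prescribed by the
    # traditional PKWARE ("ZipCrypto") encryption: 0x12345678, 0x23456789
    # and 0x34567890 in hexadecimal.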
    key0 = 305419896
    key1 = 591751049
    key2 = 878082192

    global _crctable
    if _crctable is None:
        _crctable = list(map(_gen_crc, range(256)))
    crctable = _crctable

    def crc32(ch, crc):
        """Compute the CRC32 primitive on one byte."""
        return (crc >> 8) ^ crctable[(crc ^ ch) & 0xFF]

    def update_keys(c):
        nonlocal key0, key1, key2
        key0 = crc32(c, key0)
        key1 = (key1 + (key0 & 0xFF)) & 0xFFFFFFFF
        key1 = (key1 * 134775813 + 1) & 0xFFFFFFFF
        key2 = crc32(key1 >> 24, key2)

    for p in pwd:
        update_keys(p)

    def decrypter(data):
        """Decrypt a bytes object."""
        result = bytearray()
        append = result.append
        for c in data:
            k = key2 | 2
            c ^= ((k * (k^1)) >> 8) & 0xFF
            update_keys(c)
            append(c)
        return bytes(result)

    return decrypter


class LZMACompressor:

    def __init__(self):
        self._comp = None

    def _init(self):
        props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1})
        self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[
            lzma._decode_filter_properties(lzma.FILTER_LZMA1, props)
        ])
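        # Editorial note (assumption): the leading 9, 4 bytes presumably
        # encode the LZMA SDK version (9.04) expected for method 14 in the
        # APPNOTE; the following <H> field is the size of the raw filter
        # properties appended below.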
        return struct.pack('<BBH', 9, 4, len(props)) + props

    def compress(self, data):
        if self._comp is None:
            return self._init() + self._comp.compress(data)
        return self._comp.compress(data)

    def flush(self):
        if self._comp is None:
            return self._init() + self._comp.flush()
        return self._comp.flush()


class LZMADecompressor:

    def __init__(self):
        self._decomp = None
        self._unconsumed = b''
        self.eof = False

    def decompress(self, data):
        if self._decomp is None:
            self._unconsumed += data
            if len(self._unconsumed) <= 4:
                return b''
            psize, = struct.unpack('<H', self._unconsumed[2:4])
            if len(self._unconsumed) <= 4 + psize:
                return b''

            self._decomp = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=[
                lzma._decode_filter_properties(lzma.FILTER_LZMA1,
                                               self._unconsumed[4:4 + psize])
            ])
            data = self._unconsumed[4 + psize:]
            del self._unconsumed

        result = self._decomp.decompress(data)
        self.eof = self._decomp.eof
        return result


compressor_names = {
    0: 'store',
    1: 'shrink',
    2: 'reduce',
    3: 'reduce',
    4: 'reduce',
    5: 'reduce',
    6: 'implode',
    7: 'tokenize',
    8: 'deflate',
    9: 'deflate64',
    10: 'implode',
    12: 'bzip2',
    14: 'lzma',
    18: 'terse',
    19: 'lz77',
    97: 'wavpack',
    98: 'ppmd',
}

def _check_compression(compression):
    if compression == ZIP_STORED:
        pass
    elif compression == ZIP_DEFLATED:
        if not zlib:
            raise RuntimeError(
                "Compression requires the (missing) zlib module")
    elif compression == ZIP_BZIP2:
        if not bz2:
            raise RuntimeError(
                "Compression requires the (missing) bz2 module")
    elif compression == ZIP_LZMA:
        if not lzma:
            raise RuntimeError(
                "Compression requires the (missing) lzma module")
    else:
        raise NotImplementedError("That compression method is not supported")


def _get_compressor(compress_type, compresslevel=None):
    if compress_type == ZIP_DEFLATED:
        if compresslevel is not None:
            return zlib.compressobj(compresslevel, zlib.DEFLATED, -15)
        return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15)
    elif compress_type == ZIP_BZIP2:
        if compresslevel is not None:
            return bz2.BZ2Compressor(compresslevel)
        return bz2.BZ2Compressor()
    # compresslevel is ignored for ZIP_LZMA
    elif compress_type == ZIP_LZMA:
        return LZMACompressor()
    else:
        return None


def _get_decompressor(compress_type):
    _check_compression(compress_type)
    if compress_type == ZIP_STORED:
        return None
    elif compress_type == ZIP_DEFLATED:
        return zlib.decompressobj(-15)
    elif compress_type == ZIP_BZIP2:
        return bz2.BZ2Decompressor()
    elif compress_type == ZIP_LZMA:
        return LZMADecompressor()
    else:
        descr = compressor_names.get(compress_type)
        if descr:
            raise NotImplementedError("compression type %d (%s)" % (compress_type, descr))
        else:
            raise NotImplementedError("compression type %d" % (compress_type,))


class _SharedFile:
    def __init__(self, file, pos, close, lock, writing):
        self._file = file
        self._pos = pos
        self._close = close
        self._lock = lock
        self._writing = writing
        self.seekable = file.seekable
        self.tell = file.tell

    def seek(self, offset, whence=0):
        with self._lock:
            if self._writing():
                raise ValueError("Can't reposition in the ZIP file while "
                        "there is an open writing handle on it. "
                        "Close the writing handle before trying to read.")
            self._file.seek(offset, whence)
            self._pos = self._file.tell()
            return self._pos

    def read(self, n=-1):
        with self._lock:
            if self._writing():
                raise ValueError("Can't read from the ZIP file while there "
                        "is an open writing handle on it. "
                        "Close the writing handle before trying to read.")
            self._file.seek(self._pos)
            data = self._file.read(n)
            self._pos = self._file.tell()
            return data

    def close(self):
        if self._file is not None:
            fileobj = self._file
            self._file = None
            self._close(fileobj)

# Provide the tell method for unseekable stream
class _Tellable:
    def __init__(self, fp):
        self.fp = fp
        self.offset = 0

    def write(self, data):
        n = self.fp.write(data)
        self.offset += n
        return n

    def tell(self):
        return self.offset

    def flush(self):
        self.fp.flush()

    def close(self):
        self.fp.close()


class ZipExtFile(io.BufferedIOBase):
    """File-like object for reading an archive member.
       Is returned by ZipFile.open().
    """

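    # Illustrative sketch only (hypothetical archive and member names; not
    # part of the original module): a ZipExtFile is obtained from
    # ZipFile.open() and supports buffered, optionally seekable reads:
    #
    #     with ZipFile('archive.zip') as zf, zf.open('member.txt') as f:
    #         head = f.read(5)
    #         f.seek(0)          # works when the underlying file is seekable
    #         assert f.read(5) == head
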
    # Max size supported by decompressor.
    MAX_N = 1 << 31 - 1

    # Read from compressed files in 4k blocks.
    MIN_READ_SIZE = 4096

    # Chunk size to read during seek
    MAX_SEEK_READ = 1 << 24

    def __init__(self, fileobj, mode, zipinfo, pwd=None,
                 close_fileobj=False):
        self._fileobj = fileobj
        self._pwd = pwd
        self._close_fileobj = close_fileobj

        self._compress_type = zipinfo.compress_type
        self._compress_left = zipinfo.compress_size
        self._left = zipinfo.file_size

        self._decompressor = _get_decompressor(self._compress_type)

        self._eof = False
        self._readbuffer = b''
        self._offset = 0

        self.newlines = None

        self.mode = mode
        self.name = zipinfo.filename

        if hasattr(zipinfo, 'CRC'):
            self._expected_crc = zipinfo.CRC
            self._running_crc = crc32(b'')
        else:
            self._expected_crc = None

        self._seekable = False
        try:
            if fileobj.seekable():
                self._orig_compress_start = fileobj.tell()
                self._orig_compress_size = zipinfo.compress_size
                self._orig_file_size = zipinfo.file_size
                self._orig_start_crc = self._running_crc
                self._seekable = True
        except AttributeError:
            pass

        self._decrypter = None
        if pwd:
            if zipinfo.flag_bits & 0x8:
                # compare against the file type from extended local headers
                check_byte = (zipinfo._raw_time >> 8) & 0xff
            else:
                # compare against the CRC otherwise
                check_byte = (zipinfo.CRC >> 24) & 0xff
            h = self._init_decrypter()
            if h != check_byte:
                raise RuntimeError("Bad password for file %r" % zipinfo.orig_filename)


    def _init_decrypter(self):
        self._decrypter = _ZipDecrypter(self._pwd)
        # The first 12 bytes in the cypher stream is an encryption header
        # used to strengthen the algorithm. The first 11 bytes are
        # completely random, while the 12th contains the MSB of the CRC,
        # or the MSB of the file time depending on the header type
        # and is used to check the correctness of the password.
        header = self._fileobj.read(12)
        self._compress_left -= 12
        return self._decrypter(header)[11]

    def __repr__(self):
        result = ['<%s.%s' % (self.__class__.__module__,
                              self.__class__.__qualname__)]
        if not self.closed:
            result.append(' name=%r mode=%r' % (self.name, self.mode))
            if self._compress_type != ZIP_STORED:
                result.append(' compress_type=%s' %
                              compressor_names.get(self._compress_type,
                                                   self._compress_type))
        else:
            result.append(' [closed]')
        result.append('>')
        return ''.join(result)

    def readline(self, limit=-1):
        """Read and return a line from the stream.

        If limit is specified, at most limit bytes will be read.
        """

        if limit < 0:
            # Shortcut common case - newline found in buffer.
            i = self._readbuffer.find(b'\n', self._offset) + 1
            if i > 0:
                line = self._readbuffer[self._offset: i]
                self._offset = i
                return line

        return io.BufferedIOBase.readline(self, limit)

    def peek(self, n=1):
        """Returns buffered bytes without advancing the position."""
        if n > len(self._readbuffer) - self._offset:
            chunk = self.read(n)
            if len(chunk) > self._offset:
                self._readbuffer = chunk + self._readbuffer[self._offset:]
                self._offset = 0
            else:
                self._offset -= len(chunk)

        # Return up to 512 bytes to reduce allocation overhead for tight loops.
        return self._readbuffer[self._offset: self._offset + 512]

    def readable(self):
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        return True

    def read(self, n=-1):
        """Read and return up to n bytes.
        If the argument is omitted, None, or negative, data is read and returned until EOF is reached.
        """
        if self.closed:
            raise ValueError("read from closed file.")
        if n is None or n < 0:
            buf = self._readbuffer[self._offset:]
            self._readbuffer = b''
            self._offset = 0
            while not self._eof:
                buf += self._read1(self.MAX_N)
            return buf

        end = n + self._offset
        if end < len(self._readbuffer):
            buf = self._readbuffer[self._offset:end]
            self._offset = end
            return buf

        n = end - len(self._readbuffer)
        buf = self._readbuffer[self._offset:]
        self._readbuffer = b''
        self._offset = 0
        while n > 0 and not self._eof:
            data = self._read1(n)
            if n < len(data):
                self._readbuffer = data
                self._offset = n
                buf += data[:n]
                break
            buf += data
            n -= len(data)
        return buf

    def _update_crc(self, newdata):
        # Update the CRC using the given data.
        if self._expected_crc is None:
            # No need to compute the CRC if we don't have a reference value
            return
        self._running_crc = crc32(newdata, self._running_crc)
        # Check the CRC if we're at the end of the file
        if self._eof and self._running_crc != self._expected_crc:
            raise BadZipFile("Bad CRC-32 for file %r" % self.name)

    def read1(self, n):
        """Read up to n bytes with at most one read() system call."""

        if n is None or n < 0:
            buf = self._readbuffer[self._offset:]
            self._readbuffer = b''
            self._offset = 0
            while not self._eof:
                data = self._read1(self.MAX_N)
                if data:
                    buf += data
                    break
            return buf

        end = n + self._offset
        if end < len(self._readbuffer):
            buf = self._readbuffer[self._offset:end]
            self._offset = end
            return buf

        n = end - len(self._readbuffer)
        buf = self._readbuffer[self._offset:]
        self._readbuffer = b''
        self._offset = 0
        if n > 0:
            while not self._eof:
                data = self._read1(n)
                if n < len(data):
                    self._readbuffer = data
                    self._offset = n
                    buf += data[:n]
                    break
                if data:
                    buf += data
                    break
        return buf

    def _read1(self, n):
        # Read up to n compressed bytes with at most one read() system call,
        # decrypt and decompress them.
        if self._eof or n <= 0:
            return b''

        # Read from file.
        if self._compress_type == ZIP_DEFLATED:
            ## Handle unconsumed data.
            data = self._decompressor.unconsumed_tail
            if n > len(data):
                data += self._read2(n - len(data))
        else:
            data = self._read2(n)

        if self._compress_type == ZIP_STORED:
            self._eof = self._compress_left <= 0
        elif self._compress_type == ZIP_DEFLATED:
            n = max(n, self.MIN_READ_SIZE)
            data = self._decompressor.decompress(data, n)
            self._eof = (self._decompressor.eof or
                         self._compress_left <= 0 and
                         not self._decompressor.unconsumed_tail)
            if self._eof:
                data += self._decompressor.flush()
        else:
            data = self._decompressor.decompress(data)
            self._eof = self._decompressor.eof or self._compress_left <= 0

        data = data[:self._left]
        self._left -= len(data)
        if self._left <= 0:
            self._eof = True
        self._update_crc(data)
        return data

    def _read2(self, n):
        if self._compress_left <= 0:
            return b''

        n = max(n, self.MIN_READ_SIZE)
        n = min(n, self._compress_left)

        data = self._fileobj.read(n)
        self._compress_left -= len(data)
        if not data:
            raise EOFError

        if self._decrypter is not None:
            data = self._decrypter(data)
        return data

    def close(self):
        try:
            if self._close_fileobj:
                self._fileobj.close()
        finally:
            super().close()

    def seekable(self):
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        return self._seekable

    def seek(self, offset, whence=0):
        if self.closed:
            raise ValueError("seek on closed file.")
        if not self._seekable:
            raise io.UnsupportedOperation("underlying stream is not seekable")
        curr_pos = self.tell()
        if whence == 0:  # Seek from start of file
            new_pos = offset
        elif whence == 1:  # Seek from current position
            new_pos = curr_pos + offset
        elif whence == 2:  # Seek from EOF
            new_pos = self._orig_file_size + offset
        else:
            raise ValueError("whence must be os.SEEK_SET (0), "
                             "os.SEEK_CUR (1), or os.SEEK_END (2)")

        if new_pos > self._orig_file_size:
            new_pos = self._orig_file_size

        if new_pos < 0:
            new_pos = 0

        read_offset = new_pos - curr_pos
        buff_offset = read_offset + self._offset

        if buff_offset >= 0 and buff_offset < len(self._readbuffer):
            # Just move the _offset index if the new position is in the _readbuffer
            self._offset = buff_offset
            read_offset = 0
        elif read_offset < 0:
            # Position is before the current position. Reset the ZipExtFile
            self._fileobj.seek(self._orig_compress_start)
            self._running_crc = self._orig_start_crc
            self._compress_left = self._orig_compress_size
            self._left = self._orig_file_size
            self._readbuffer = b''
            self._offset = 0
            self._decompressor = _get_decompressor(self._compress_type)
            self._eof = False
            read_offset = new_pos
            if self._decrypter is not None:
                self._init_decrypter()

        while read_offset > 0:
            read_len = min(self.MAX_SEEK_READ, read_offset)
            self.read(read_len)
            read_offset -= read_len

        return self.tell()

    def tell(self):
        if self.closed:
            raise ValueError("tell on closed file.")
        if not self._seekable:
            raise io.UnsupportedOperation("underlying stream is not seekable")
        filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset
        return filepos


class _ZipWriteFile(io.BufferedIOBase):
    def __init__(self, zf, zinfo, zip64):
        self._zinfo = zinfo
        self._zip64 = zip64
        self._zipfile = zf
        self._compressor = _get_compressor(zinfo.compress_type,
                                           zinfo._compresslevel)
        self._file_size = 0
        self._compress_size = 0
        self._crc = 0

    @property
    def _fileobj(self):
        return self._zipfile.fp

    def writable(self):
        return True

    def write(self, data):
        if self.closed:
            raise ValueError('I/O operation on closed file.')

        # Accept any data that supports the buffer protocol
        if isinstance(data, (bytes, bytearray)):
            nbytes = len(data)
        else:
            data = memoryview(data)
            nbytes = data.nbytes
        self._file_size += nbytes

        self._crc = crc32(data, self._crc)
        if self._compressor:
            data = self._compressor.compress(data)
            self._compress_size += len(data)
        self._fileobj.write(data)
        return nbytes

    def close(self):
        if self.closed:
            return
        try:
            super().close()
            # Flush any data from the compressor, and update header info
            if self._compressor:
                buf = self._compressor.flush()
                self._compress_size += len(buf)
                self._fileobj.write(buf)
                self._zinfo.compress_size = self._compress_size
            else:
                self._zinfo.compress_size = self._file_size
            self._zinfo.CRC = self._crc
            self._zinfo.file_size = self._file_size

            # Write updated header info
            if self._zinfo.flag_bits & 0x08:
                # Write CRC and file sizes after the file data
                fmt = '<LLQQ' if self._zip64 else '<LLLL'
                self._fileobj.write(struct.pack(fmt, _DD_SIGNATURE, self._zinfo.CRC,
                    self._zinfo.compress_size, self._zinfo.file_size))
                self._zipfile.start_dir = self._fileobj.tell()
            else:
                if not self._zip64:
                    if self._file_size > ZIP64_LIMIT:
                        raise RuntimeError(
                            'File size unexpectedly exceeded ZIP64 limit')
                    if self._compress_size > ZIP64_LIMIT:
                        raise RuntimeError(
                            'Compressed size unexpectedly exceeded ZIP64 limit')
                # Seek backwards and write file header (which will now include
                # correct CRC and file sizes)

                # Preserve current position in file
                self._zipfile.start_dir = self._fileobj.tell()
                self._fileobj.seek(self._zinfo.header_offset)
                self._fileobj.write(self._zinfo.FileHeader(self._zip64))
                self._fileobj.seek(self._zipfile.start_dir)

            # Successfully written: Add file to our caches
            self._zipfile.filelist.append(self._zinfo)
            self._zipfile.NameToInfo[self._zinfo.filename] = self._zinfo
        finally:
            self._zipfile._writing = False


class ZipFile:
    """ Class with methods to open, read, write, close, list zip files.

    z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True,
                compresslevel=None)

    file: Either the path to the file, or a file-like object.
          If it is a path, the file will be opened and closed by ZipFile.
    mode: The mode can be either read 'r', write 'w', exclusive create 'x',
          or append 'a'.
    compression: ZIP_STORED (no compression), ZIP_DEFLATED (requires zlib),
                 ZIP_BZIP2 (requires bz2) or ZIP_LZMA (requires lzma).
    allowZip64: if True ZipFile will create files with ZIP64 extensions when
                needed, otherwise it will raise an exception when this would
                be necessary.
    compresslevel: None (default for the given compression type) or an integer
                   specifying the level to pass to the compressor.
                   When using ZIP_STORED or ZIP_LZMA this keyword has no effect.
                   When using ZIP_DEFLATED integers 0 through 9 are accepted.
                   When using ZIP_BZIP2 integers 1 through 9 are accepted.

    """

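    # A minimal sketch of the options documented above (illustrative only;
    # 'backup.zip' and 'notes.txt' are hypothetical names):
    #
    #     with ZipFile('backup.zip', mode='w', compression=ZIP_DEFLATED,
    #                  compresslevel=9) as zf:
    #         zf.write('notes.txt')                 # add a file from disk
    #         zf.writestr('generated.txt', 'data')  # add in-memory data
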
1211 fp = None # Set here since __del__ checks it
1212 _windows_illegal_name_trans_table = None
1213
1214 def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
1215 compresslevel=None, *, strict_timestamps=True):
1216 """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
1217 or append 'a'."""
1218 if mode not in ('r', 'w', 'x', 'a'):
1219 raise ValueError("ZipFile requires mode 'r', 'w', 'x', or 'a'")
1220
1221 _check_compression(compression)
1222
1223 self._allowZip64 = allowZip64
1224 self._didModify = False
1225 self.debug = 0 # Level of printing: 0 through 3
1226 self.NameToInfo = {} # Find file info given name
1227 self.filelist = [] # List of ZipInfo instances for archive
1228 self.compression = compression # Method of compression
1229 self.compresslevel = compresslevel
1230 self.mode = mode
1231 self.pwd = None
1232 self._comment = b''
1233 self._strict_timestamps = strict_timestamps
1234
1235 # Check if we were passed a file-like object
1236 if isinstance(file, os.PathLike):
1237 file = os.fspath(file)
1238 if isinstance(file, str):
1239 # No, it's a filename
1240 self._filePassed = 0
1241 self.filename = file
1242 modeDict = {'r' : 'rb', 'w': 'w+b', 'x': 'x+b', 'a' : 'r+b',
1243 'r+b': 'w+b', 'w+b': 'wb', 'x+b': 'xb'}
1244 filemode = modeDict[mode]
1245 while True:
1246 try:
1247 self.fp = io.open(file, filemode)
1248 except OSError:
1249 if filemode in modeDict:
1250 filemode = modeDict[filemode]
1251 continue
1252 raise
1253 break
1254 else:
1255 self._filePassed = 1
1256 self.fp = file
1257 self.filename = getattr(file, 'name', None)
1258 self._fileRefCnt = 1
1259 self._lock = threading.RLock()
1260 self._seekable = True
1261 self._writing = False
1262
1263 try:
1264 if mode == 'r':
1265 self._RealGetContents()
1266 elif mode in ('w', 'x'):
1267 # set the modified flag so central directory gets written
1268 # even if no files are added to the archive
1269 self._didModify = True
1270 try:
1271 self.start_dir = self.fp.tell()
1272 except (AttributeError, OSError):
1273 self.fp = _Tellable(self.fp)
1274 self.start_dir = 0
1275 self._seekable = False
1276 else:
1277 # Some file-like objects can provide tell() but not seek()
1278 try:
1279 self.fp.seek(self.start_dir)
1280 except (AttributeError, OSError):
1281 self._seekable = False
1282 elif mode == 'a':
1283 try:
1284 # See if file is a zip file
1285 self._RealGetContents()
1286 # seek to start of directory and overwrite
1287 self.fp.seek(self.start_dir)
1288 except BadZipFile:
1289 # file is not a zip file, just append
1290 self.fp.seek(0, 2)
1291
1292 # set the modified flag so central directory gets written
1293 # even if no files are added to the archive
1294 self._didModify = True
1295 self.start_dir = self.fp.tell()
1296 else:
1297 raise ValueError("Mode must be 'r', 'w', 'x', or 'a'")
1298 except:
1299 fp = self.fp
1300 self.fp = None
1301 self._fpclose(fp)
1302 raise
1303
1304 def __enter__(self):
1305 return self
1306
1307 def __exit__(self, type, value, traceback):
1308 self.close()
1309
1310 def __repr__(self):
1311 result = ['<%s.%s' % (self.__class__.__module__,
1312 self.__class__.__qualname__)]
1313 if self.fp is not None:
1314 if self._filePassed:
1315 result.append(' file=%r' % self.fp)
1316 elif self.filename is not None:
1317 result.append(' filename=%r' % self.filename)
1318 result.append(' mode=%r' % self.mode)
1319 else:
1320 result.append(' [closed]')
1321 result.append('>')
1322 return ''.join(result)
1323
1324 def _RealGetContents(self):
1325 """Read in the table of contents for the ZIP file."""
1326 fp = self.fp
1327 try:
1328 endrec = _EndRecData(fp)
1329 except OSError:
1330 raise BadZipFile("File is not a zip file")
1331 if not endrec:
1332 raise BadZipFile("File is not a zip file")
1333 if self.debug > 1:
1334 print(endrec)
1335 size_cd = endrec[_ECD_SIZE] # bytes in central directory
1336 offset_cd = endrec[_ECD_OFFSET] # offset of central directory
1337 self._comment = endrec[_ECD_COMMENT] # archive comment
1338
1339 # "concat" is zero, unless zip was concatenated to another file
1340 concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
1341 if endrec[_ECD_SIGNATURE] == stringEndArchive64:
1342 # If Zip64 extension structures are present, account for them
1343 concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
1344
1345 if self.debug > 2:
1346 inferred = concat + offset_cd
1347 print("given, inferred, offset", offset_cd, inferred, concat)
1348 # self.start_dir: Position of start of central directory
1349 self.start_dir = offset_cd + concat
1350 fp.seek(self.start_dir, 0)
1351 data = fp.read(size_cd)
1352 fp = io.BytesIO(data)
1353 total = 0
1354 while total < size_cd:
1355 centdir = fp.read(sizeCentralDir)
1356 if len(centdir) != sizeCentralDir:
1357 raise BadZipFile("Truncated central directory")
1358 centdir = struct.unpack(structCentralDir, centdir)
1359 if centdir[_CD_SIGNATURE] != stringCentralDir:
1360 raise BadZipFile("Bad magic number for central directory")
1361 if self.debug > 2:
1362 print(centdir)
1363 filename = fp.read(centdir[_CD_FILENAME_LENGTH])
1364 flags = centdir[5]
1365 if flags & 0x800:
1366 # UTF-8 file names extension
1367 filename = filename.decode('utf-8')
1368 else:
1369 # Historical ZIP filename encoding
1370 filename = filename.decode('cp437')
1371 # Create ZipInfo instance to store file information
1372 x = ZipInfo(filename)
1373 x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
1374 x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
1375 x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
1376 (x.create_version, x.create_system, x.extract_version, x.reserved,
1377 x.flag_bits, x.compress_type, t, d,
1378 x.CRC, x.compress_size, x.file_size) = centdir[1:12]
1379 if x.extract_version > MAX_EXTRACT_VERSION:
1380 raise NotImplementedError("zip file version %.1f" %
1381 (x.extract_version / 10))
1382 x.volume, x.internal_attr, x.external_attr = centdir[15:18]
1383 # Convert date/time code to (year, month, day, hour, min, sec)
1384 x._raw_time = t
1385 x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
1386 t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
1387
1388 x._decodeExtra()
1389 x.header_offset = x.header_offset + concat
1390 self.filelist.append(x)
1391 self.NameToInfo[x.filename] = x
1392
1393 # update total bytes read from central directory
1394 total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
1395 + centdir[_CD_EXTRA_FIELD_LENGTH]
1396 + centdir[_CD_COMMENT_LENGTH])
1397
1398 if self.debug > 2:
1399 print("total", total)
1400
1401
1402 def namelist(self):
1403 """Return a list of file names in the archive."""
1404 return [data.filename for data in self.filelist]
1405
1406 def infolist(self):
1407 """Return a list of class ZipInfo instances for files in the
1408 archive."""
1409 return self.filelist
1410
1411 def printdir(self, file=None):
1412 """Print a table of contents for the zip file."""
1413 print("%-46s %19s %12s" % ("File Name", "Modified ", "Size"),
1414 file=file)
1415 for zinfo in self.filelist:
1416 date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6]
1417 print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size),
1418 file=file)
1419
1420 def testzip(self):
1421 """Read all the files and check the CRC."""
1422 chunk_size = 2 ** 20
1423 for zinfo in self.filelist:
1424 try:
1425 # Read by chunks, to avoid an OverflowError or a
1426 # MemoryError with very large embedded files.
1427 with self.open(zinfo.filename, "r") as f:
1428 while f.read(chunk_size): # Check CRC-32
1429 pass
1430 except BadZipFile:
1431 return zinfo.filename
1432
1433 def getinfo(self, name):
1434 """Return the instance of ZipInfo given 'name'."""
1435 info = self.NameToInfo.get(name)
1436 if info is None:
1437 raise KeyError(
1438 'There is no item named %r in the archive' % name)
1439
1440 return info
1441
1442 def setpassword(self, pwd):
1443 """Set default password for encrypted files."""
1444 if pwd and not isinstance(pwd, bytes):
1445 raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__)
1446 if pwd:
1447 self.pwd = pwd
1448 else:
1449 self.pwd = None
1450
1451 @property
1452 def comment(self):
1453 """The comment text associated with the ZIP file."""
1454 return self._comment
1455
1456 @comment.setter
1457 def comment(self, comment):
1458 if not isinstance(comment, bytes):
1459 raise TypeError("comment: expected bytes, got %s" % type(comment).__name__)
1460 # check for valid comment length
1461 if len(comment) > ZIP_MAX_COMMENT:
1462 import warnings
1463 warnings.warn('Archive comment is too long; truncating to %d bytes'
1464 % ZIP_MAX_COMMENT, stacklevel=2)
1465 comment = comment[:ZIP_MAX_COMMENT]
1466 self._comment = comment
1467 self._didModify = True
1468
1469 def read(self, name, pwd=None):
1470 """Return file bytes for name."""
1471 with self.open(name, "r", pwd) as fp:
1472 return fp.read()
1473
1474 def open(self, name, mode="r", pwd=None, *, force_zip64=False):
1475 """Return file-like object for 'name'.
1476
1477 name is a string for the file name within the ZIP file, or a ZipInfo
1478 object.
1479
1480 mode should be 'r' to read a file already in the ZIP file, or 'w' to
1481 write to a file newly added to the archive.
1482
1483 pwd is the password to decrypt files (only used for reading).
1484
1485 When writing, if the file size is not known in advance but may exceed
1486 2 GiB, pass force_zip64 to use the ZIP64 format, which can handle large
1487 files. If the size is known in advance, it is best to pass a ZipInfo
1488 instance for name, with zinfo.file_size set.
1489 """
1490 if mode not in {"r", "w"}:
1491 raise ValueError('open() requires mode "r" or "w"')
1492 if pwd and not isinstance(pwd, bytes):
1493 raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__)
1494 if pwd and (mode == "w"):
1495 raise ValueError("pwd is only supported for reading files")
1496 if not self.fp:
1497 raise ValueError(
1498 "Attempt to use ZIP archive that was already closed")
1499
1500 # Make sure we have an info object
1501 if isinstance(name, ZipInfo):
1502 # 'name' is already an info object
1503 zinfo = name
1504 elif mode == 'w':
1505 zinfo = ZipInfo(name)
1506 zinfo.compress_type = self.compression
1507 zinfo._compresslevel = self.compresslevel
1508 else:
1509 # Get info object for name
1510 zinfo = self.getinfo(name)
1511
1512 if mode == 'w':
1513 return self._open_to_write(zinfo, force_zip64=force_zip64)
1514
1515 if self._writing:
1516 raise ValueError("Can't read from the ZIP file while there "
1517 "is an open writing handle on it. "
1518 "Close the writing handle before trying to read.")
1519
1520 # Open for reading:
1521 self._fileRefCnt += 1
1522 zef_file = _SharedFile(self.fp, zinfo.header_offset,
1523 self._fpclose, self._lock, lambda: self._writing)
1524 try:
1525 # Skip the file header:
1526 fheader = zef_file.read(sizeFileHeader)
1527 if len(fheader) != sizeFileHeader:
1528 raise BadZipFile("Truncated file header")
1529 fheader = struct.unpack(structFileHeader, fheader)
1530 if fheader[_FH_SIGNATURE] != stringFileHeader:
1531 raise BadZipFile("Bad magic number for file header")
1532
1533 fname = zef_file.read(fheader[_FH_FILENAME_LENGTH])
1534 if fheader[_FH_EXTRA_FIELD_LENGTH]:
1535 zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])
1536
1537 if zinfo.flag_bits & 0x20:
1538 # Zip 2.7: compressed patched data
1539 raise NotImplementedError("compressed patched data (flag bit 5)")
1540
1541 if zinfo.flag_bits & 0x40:
1542 # strong encryption
1543 raise NotImplementedError("strong encryption (flag bit 6)")
1544
Haibo Huangf5f93a72020-10-19 15:43:42 -07001545 if fheader[_FH_GENERAL_PURPOSE_FLAG_BITS] & 0x800:
Haibo Huangd8830302020-03-03 10:09:46 -08001546 # UTF-8 filename
1547 fname_str = fname.decode("utf-8")
1548 else:
1549 fname_str = fname.decode("cp437")
1550
1551 if fname_str != zinfo.orig_filename:
1552 raise BadZipFile(
1553 'File name in directory %r and header %r differ.'
1554 % (zinfo.orig_filename, fname))
1555
1556 # check for encrypted flag & handle password
1557 is_encrypted = zinfo.flag_bits & 0x1
1558 if is_encrypted:
1559 if not pwd:
1560 pwd = self.pwd
1561 if not pwd:
1562 raise RuntimeError("File %r is encrypted, password "
1563 "required for extraction" % name)
1564 else:
1565 pwd = None
1566
1567 return ZipExtFile(zef_file, mode, zinfo, pwd, True)
1568 except:
1569 zef_file.close()
1570 raise
1571
1572 def _open_to_write(self, zinfo, force_zip64=False):
1573 if force_zip64 and not self._allowZip64:
1574 raise ValueError(
1575 "force_zip64 is True, but allowZip64 was False when opening "
1576 "the ZIP file."
1577 )
1578 if self._writing:
1579 raise ValueError("Can't write to the ZIP file while there is "
1580 "another write handle open on it. "
1581 "Close the first handle before opening another.")
1582
Haibo Huang5eba2b42021-01-22 11:22:02 -08001583 # Size and CRC are overwritten with correct data after processing the file
Haibo Huangd8830302020-03-03 10:09:46 -08001584 zinfo.compress_size = 0
1585 zinfo.CRC = 0
1586
1587 zinfo.flag_bits = 0x00
1588 if zinfo.compress_type == ZIP_LZMA:
1589 # Compressed data includes an end-of-stream (EOS) marker
1590 zinfo.flag_bits |= 0x02
1591 if not self._seekable:
1592 zinfo.flag_bits |= 0x08
1593
1594 if not zinfo.external_attr:
1595 zinfo.external_attr = 0o600 << 16 # permissions: ?rw-------
1596
1597 # Compressed size can be larger than uncompressed size
1598 zip64 = self._allowZip64 and \
1599 (force_zip64 or zinfo.file_size * 1.05 > ZIP64_LIMIT)
1600
1601 if self._seekable:
1602 self.fp.seek(self.start_dir)
1603 zinfo.header_offset = self.fp.tell()
1604
1605 self._writecheck(zinfo)
1606 self._didModify = True
1607
1608 self.fp.write(zinfo.FileHeader(zip64))
1609
1610 self._writing = True
1611 return _ZipWriteFile(self, zinfo, zip64)
1612
1613 def extract(self, member, path=None, pwd=None):
1614 """Extract a member from the archive to the current working directory,
1615 using its full name. Its file information is extracted as accurately
1616 as possible. `member' may be a filename or a ZipInfo object. You can
1617 specify a different directory using `path'.
1618 """
1619 if path is None:
1620 path = os.getcwd()
1621 else:
1622 path = os.fspath(path)
1623
1624 return self._extract_member(member, path, pwd)
1625
1626 def extractall(self, path=None, members=None, pwd=None):
1627 """Extract all members from the archive to the current working
1628 directory. `path' specifies a different directory to extract to.
1629 `members' is optional and must be a subset of the list returned
1630 by namelist().
1631 """
1632 if members is None:
1633 members = self.namelist()
1634
1635 if path is None:
1636 path = os.getcwd()
1637 else:
1638 path = os.fspath(path)
1639
1640 for zipinfo in members:
1641 self._extract_member(zipinfo, path, pwd)
1642
1643 @classmethod
1644 def _sanitize_windows_name(cls, arcname, pathsep):
1645 """Replace bad characters and remove trailing dots from parts."""
1646 table = cls._windows_illegal_name_trans_table
1647 if not table:
1648 illegal = ':<>|"?*'
1649 table = str.maketrans(illegal, '_' * len(illegal))
1650 cls._windows_illegal_name_trans_table = table
1651 arcname = arcname.translate(table)
1652 # remove trailing dots
1653 arcname = (x.rstrip('.') for x in arcname.split(pathsep))
1654 # rejoin, removing empty parts.
1655 arcname = pathsep.join(x for x in arcname if x)
1656 return arcname
1657
1658 def _extract_member(self, member, targetpath, pwd):
1659 """Extract the ZipInfo object 'member' to a physical
1660 file on the path targetpath.
1661 """
1662 if not isinstance(member, ZipInfo):
1663 member = self.getinfo(member)
1664
1665 # build the destination pathname, replacing
1666 # forward slashes to platform specific separators.
1667 arcname = member.filename.replace('/', os.path.sep)
1668
1669 if os.path.altsep:
1670 arcname = arcname.replace(os.path.altsep, os.path.sep)
1671 # interpret absolute pathname as relative, remove drive letter or
1672 # UNC path, redundant separators, "." and ".." components.
1673 arcname = os.path.splitdrive(arcname)[1]
1674 invalid_path_parts = ('', os.path.curdir, os.path.pardir)
1675 arcname = os.path.sep.join(x for x in arcname.split(os.path.sep)
1676 if x not in invalid_path_parts)
1677 if os.path.sep == '\\':
1678 # filter illegal characters on Windows
1679 arcname = self._sanitize_windows_name(arcname, os.path.sep)
1680
1681 targetpath = os.path.join(targetpath, arcname)
1682 targetpath = os.path.normpath(targetpath)
1683
1684 # Create all upper directories if necessary.
1685 upperdirs = os.path.dirname(targetpath)
1686 if upperdirs and not os.path.exists(upperdirs):
1687 os.makedirs(upperdirs)
1688
1689 if member.is_dir():
1690 if not os.path.isdir(targetpath):
1691 os.mkdir(targetpath)
1692 return targetpath
1693
1694 with self.open(member, pwd=pwd) as source, \
1695 open(targetpath, "wb") as target:
1696 shutil.copyfileobj(source, target)
1697
1698 return targetpath
1699
1700 def _writecheck(self, zinfo):
1701 """Check for errors before writing a file to the archive."""
1702 if zinfo.filename in self.NameToInfo:
1703 import warnings
1704 warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3)
1705 if self.mode not in ('w', 'x', 'a'):
1706 raise ValueError("write() requires mode 'w', 'x', or 'a'")
1707 if not self.fp:
1708 raise ValueError(
1709 "Attempt to write ZIP archive that was already closed")
1710 _check_compression(zinfo.compress_type)
1711 if not self._allowZip64:
1712 requires_zip64 = None
1713 if len(self.filelist) >= ZIP_FILECOUNT_LIMIT:
1714 requires_zip64 = "Files count"
1715 elif zinfo.file_size > ZIP64_LIMIT:
1716 requires_zip64 = "Filesize"
1717 elif zinfo.header_offset > ZIP64_LIMIT:
1718 requires_zip64 = "Zipfile size"
1719 if requires_zip64:
1720 raise LargeZipFile(requires_zip64 +
1721 " would require ZIP64 extensions")
1722
1723 def write(self, filename, arcname=None,
1724 compress_type=None, compresslevel=None):
1725 """Put the bytes from filename into the archive under the name
1726 arcname."""
1727 if not self.fp:
1728 raise ValueError(
1729 "Attempt to write to ZIP archive that was already closed")
1730 if self._writing:
1731 raise ValueError(
1732 "Can't write to ZIP archive while an open writing handle exists"
1733 )
1734
1735 zinfo = ZipInfo.from_file(filename, arcname,
1736 strict_timestamps=self._strict_timestamps)
1737
1738 if zinfo.is_dir():
1739 zinfo.compress_size = 0
1740 zinfo.CRC = 0
1741 else:
1742 if compress_type is not None:
1743 zinfo.compress_type = compress_type
1744 else:
1745 zinfo.compress_type = self.compression
1746
1747 if compresslevel is not None:
1748 zinfo._compresslevel = compresslevel
1749 else:
1750 zinfo._compresslevel = self.compresslevel
1751
1752 if zinfo.is_dir():
1753 with self._lock:
1754 if self._seekable:
1755 self.fp.seek(self.start_dir)
1756 zinfo.header_offset = self.fp.tell() # Start of header bytes
1757 if zinfo.compress_type == ZIP_LZMA:
1758 # Compressed data includes an end-of-stream (EOS) marker
1759 zinfo.flag_bits |= 0x02
1760
1761 self._writecheck(zinfo)
1762 self._didModify = True
1763
1764 self.filelist.append(zinfo)
1765 self.NameToInfo[zinfo.filename] = zinfo
1766 self.fp.write(zinfo.FileHeader(False))
1767 self.start_dir = self.fp.tell()
1768 else:
1769 with open(filename, "rb") as src, self.open(zinfo, 'w') as dest:
1770 shutil.copyfileobj(src, dest, 1024*8)
1771
1772 def writestr(self, zinfo_or_arcname, data,
1773 compress_type=None, compresslevel=None):
1774        """Write a file into the archive.  The content is 'data', which
1775 may be either a 'str' or a 'bytes' instance; if it is a 'str',
1776 it is encoded as UTF-8 first.
1777 'zinfo_or_arcname' is either a ZipInfo instance or
1778 the name of the file in the archive."""
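        # Usage sketch (hypothetical archive and member names):
        #
        #   with ZipFile('notes.zip', 'w') as zf:
        #       zf.writestr('readme.txt', 'hello')   # str data is UTF-8 encoded
        #       zf.writestr('logs/', b'')            # trailing '/' marks a directory entry
        #
        # Passing a ZipInfo instance instead of a name gives explicit control
        # over the timestamp and attributes.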
1779 if isinstance(data, str):
1780 data = data.encode("utf-8")
1781 if not isinstance(zinfo_or_arcname, ZipInfo):
1782 zinfo = ZipInfo(filename=zinfo_or_arcname,
1783 date_time=time.localtime(time.time())[:6])
1784 zinfo.compress_type = self.compression
1785 zinfo._compresslevel = self.compresslevel
1786 if zinfo.filename[-1] == '/':
1787 zinfo.external_attr = 0o40775 << 16 # drwxrwxr-x
1788 zinfo.external_attr |= 0x10 # MS-DOS directory flag
1789 else:
1790 zinfo.external_attr = 0o600 << 16 # ?rw-------
1791 else:
1792 zinfo = zinfo_or_arcname
1793
1794 if not self.fp:
1795 raise ValueError(
1796 "Attempt to write to ZIP archive that was already closed")
1797 if self._writing:
1798 raise ValueError(
1799 "Can't write to ZIP archive while an open writing handle exists."
1800 )
1801
1802 if compress_type is not None:
1803 zinfo.compress_type = compress_type
1804
1805 if compresslevel is not None:
1806 zinfo._compresslevel = compresslevel
1807
1808 zinfo.file_size = len(data) # Uncompressed size
1809 with self._lock:
1810 with self.open(zinfo, mode='w') as dest:
1811 dest.write(data)
1812
1813 def __del__(self):
1814 """Call the "close()" method in case the user forgot."""
1815 self.close()
1816
1817 def close(self):
1818 """Close the file, and for mode 'w', 'x' and 'a' write the ending
1819 records."""
1820 if self.fp is None:
1821 return
1822
1823 if self._writing:
1824 raise ValueError("Can't close the ZIP file while there is "
1825 "an open writing handle on it. "
1826 "Close the writing handle before closing the zip.")
1827
1828 try:
1829 if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records
1830 with self._lock:
1831 if self._seekable:
1832 self.fp.seek(self.start_dir)
1833 self._write_end_record()
1834 finally:
1835 fp = self.fp
1836 self.fp = None
1837 self._fpclose(fp)
1838
1839 def _write_end_record(self):
1840 for zinfo in self.filelist: # write central directory
1841 dt = zinfo.date_time
1842 dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
1843 dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
1844 extra = []
1845 if zinfo.file_size > ZIP64_LIMIT \
1846 or zinfo.compress_size > ZIP64_LIMIT:
1847 extra.append(zinfo.file_size)
1848 extra.append(zinfo.compress_size)
1849 file_size = 0xffffffff
1850 compress_size = 0xffffffff
1851 else:
1852 file_size = zinfo.file_size
1853 compress_size = zinfo.compress_size
1854
1855 if zinfo.header_offset > ZIP64_LIMIT:
1856 extra.append(zinfo.header_offset)
1857 header_offset = 0xffffffff
1858 else:
1859 header_offset = zinfo.header_offset
1860
1861 extra_data = zinfo.extra
1862 min_version = 0
1863 if extra:
1864                # Append a ZIP64 field to the extra data
1865 extra_data = _strip_extra(extra_data, (1,))
1866 extra_data = struct.pack(
1867 '<HH' + 'Q'*len(extra),
1868 1, 8*len(extra), *extra) + extra_data
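                # Field layout: header ID 0x0001, a 2-byte data length, then
                # each overflowed value (uncompressed size, compressed size,
                # header offset, whichever apply) as an 8-byte little-endian
                # integer, prepended to any existing extra data.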
1869
1870 min_version = ZIP64_VERSION
1871
1872 if zinfo.compress_type == ZIP_BZIP2:
1873 min_version = max(BZIP2_VERSION, min_version)
1874 elif zinfo.compress_type == ZIP_LZMA:
1875 min_version = max(LZMA_VERSION, min_version)
1876
1877 extract_version = max(min_version, zinfo.extract_version)
1878 create_version = max(min_version, zinfo.create_version)
Haibo Huang5eba2b42021-01-22 11:22:02 -08001879 filename, flag_bits = zinfo._encodeFilenameFlags()
1880 centdir = struct.pack(structCentralDir,
1881 stringCentralDir, create_version,
1882 zinfo.create_system, extract_version, zinfo.reserved,
1883 flag_bits, zinfo.compress_type, dostime, dosdate,
1884 zinfo.CRC, compress_size, file_size,
1885 len(filename), len(extra_data), len(zinfo.comment),
1886 0, zinfo.internal_attr, zinfo.external_attr,
1887 header_offset)
Haibo Huangd8830302020-03-03 10:09:46 -08001888 self.fp.write(centdir)
1889 self.fp.write(filename)
1890 self.fp.write(extra_data)
1891 self.fp.write(zinfo.comment)
1892
1893 pos2 = self.fp.tell()
1894 # Write end-of-zip-archive record
1895 centDirCount = len(self.filelist)
1896 centDirSize = pos2 - self.start_dir
1897 centDirOffset = self.start_dir
1898 requires_zip64 = None
1899 if centDirCount > ZIP_FILECOUNT_LIMIT:
1900 requires_zip64 = "Files count"
1901 elif centDirOffset > ZIP64_LIMIT:
1902 requires_zip64 = "Central directory offset"
1903 elif centDirSize > ZIP64_LIMIT:
1904 requires_zip64 = "Central directory size"
1905 if requires_zip64:
1906 # Need to write the ZIP64 end-of-archive records
1907 if not self._allowZip64:
1908 raise LargeZipFile(requires_zip64 +
1909 " would require ZIP64 extensions")
1910 zip64endrec = struct.pack(
1911 structEndArchive64, stringEndArchive64,
1912 44, 45, 45, 0, 0, centDirCount, centDirCount,
1913 centDirSize, centDirOffset)
1914 self.fp.write(zip64endrec)
1915
1916 zip64locrec = struct.pack(
1917 structEndArchive64Locator,
1918 stringEndArchive64Locator, 0, pos2, 1)
1919 self.fp.write(zip64locrec)
1920 centDirCount = min(centDirCount, 0xFFFF)
1921 centDirSize = min(centDirSize, 0xFFFFFFFF)
1922 centDirOffset = min(centDirOffset, 0xFFFFFFFF)
1923
1924 endrec = struct.pack(structEndArchive, stringEndArchive,
1925 0, 0, centDirCount, centDirCount,
1926 centDirSize, centDirOffset, len(self._comment))
1927 self.fp.write(endrec)
1928 self.fp.write(self._comment)
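        # In append mode the rewritten central directory can end before the
        # previous end of the file did, so any stale trailing bytes are
        # truncated away.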
Haibo Huang5eba2b42021-01-22 11:22:02 -08001929 if self.mode == "a":
1930 self.fp.truncate()
Haibo Huangd8830302020-03-03 10:09:46 -08001931 self.fp.flush()
1932
1933 def _fpclose(self, fp):
1934 assert self._fileRefCnt > 0
1935 self._fileRefCnt -= 1
1936 if not self._fileRefCnt and not self._filePassed:
1937 fp.close()
1938
1939
1940class PyZipFile(ZipFile):
1941 """Class to create ZIP archives with Python library files and packages."""
1942
1943 def __init__(self, file, mode="r", compression=ZIP_STORED,
1944 allowZip64=True, optimize=-1):
1945 ZipFile.__init__(self, file, mode=mode, compression=compression,
1946 allowZip64=allowZip64)
1947 self._optimize = optimize
1948
1949 def writepy(self, pathname, basename="", filterfunc=None):
1950 """Add all files from "pathname" to the ZIP archive.
1951
1952        If pathname is a package directory, search the directory and
1953        all package subdirectories recursively for all *.py files and
1954        add the corresponding modules to the archive.  If pathname is a
1955        plain directory, only the *.py files directly in it are added.
1956        Otherwise, pathname must be a single Python *.py file and that
1957        module is added to the archive.  Added modules are always
1958        stored as module.pyc, compiled from module.py if necessary.
1959        If filterfunc(pathname) is given, it is called once for every
1960        path (directory or file) considered; when it returns a false
1961        value, that file or directory is skipped.
1962 """
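        # Usage sketch (hypothetical package and script paths):
        #
        #   with PyZipFile('dist.zip', mode='w', optimize=2) as pzf:
        #       pzf.writepy('mypkg')        # package dir -> mypkg/*.pyc, recursively
        #       pzf.writepy('tool.py')      # single module -> tool.pyc at top level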
1963 pathname = os.fspath(pathname)
1964 if filterfunc and not filterfunc(pathname):
1965 if self.debug:
1966 label = 'path' if os.path.isdir(pathname) else 'file'
1967 print('%s %r skipped by filterfunc' % (label, pathname))
1968 return
1969 dir, name = os.path.split(pathname)
1970 if os.path.isdir(pathname):
1971 initname = os.path.join(pathname, "__init__.py")
1972 if os.path.isfile(initname):
1973 # This is a package directory, add it
1974 if basename:
1975 basename = "%s/%s" % (basename, name)
1976 else:
1977 basename = name
1978 if self.debug:
1979 print("Adding package in", pathname, "as", basename)
1980 fname, arcname = self._get_codename(initname[0:-3], basename)
1981 if self.debug:
1982 print("Adding", arcname)
1983 self.write(fname, arcname)
1984 dirlist = sorted(os.listdir(pathname))
1985 dirlist.remove("__init__.py")
1986 # Add all *.py files and package subdirectories
1987 for filename in dirlist:
1988 path = os.path.join(pathname, filename)
1989 root, ext = os.path.splitext(filename)
1990 if os.path.isdir(path):
1991 if os.path.isfile(os.path.join(path, "__init__.py")):
1992 # This is a package directory, add it
1993 self.writepy(path, basename,
1994 filterfunc=filterfunc) # Recursive call
1995 elif ext == ".py":
1996 if filterfunc and not filterfunc(path):
1997 if self.debug:
1998 print('file %r skipped by filterfunc' % path)
1999 continue
2000 fname, arcname = self._get_codename(path[0:-3],
2001 basename)
2002 if self.debug:
2003 print("Adding", arcname)
2004 self.write(fname, arcname)
2005 else:
2006 # This is NOT a package directory, add its files at top level
2007 if self.debug:
2008 print("Adding files from directory", pathname)
2009 for filename in sorted(os.listdir(pathname)):
2010 path = os.path.join(pathname, filename)
2011 root, ext = os.path.splitext(filename)
2012 if ext == ".py":
2013 if filterfunc and not filterfunc(path):
2014 if self.debug:
2015 print('file %r skipped by filterfunc' % path)
2016 continue
2017 fname, arcname = self._get_codename(path[0:-3],
2018 basename)
2019 if self.debug:
2020 print("Adding", arcname)
2021 self.write(fname, arcname)
2022 else:
2023 if pathname[-3:] != ".py":
2024 raise RuntimeError(
2025 'Files added with writepy() must end with ".py"')
2026 fname, arcname = self._get_codename(pathname[0:-3], basename)
2027 if self.debug:
2028 print("Adding file", arcname)
2029 self.write(fname, arcname)
2030
2031 def _get_codename(self, pathname, basename):
2032 """Return (filename, archivename) for the path.
2033
2034 Given a module name path, return the correct file path and
2035 archive name, compiling if necessary. For example, given
2036 /python/lib/string, return (/python/lib/string.pyc, string).
2037 """
2038 def _compile(file, optimize=-1):
2039 import py_compile
2040 if self.debug:
2041 print("Compiling", file)
2042 try:
2043 py_compile.compile(file, doraise=True, optimize=optimize)
2044 except py_compile.PyCompileError as err:
2045 print(err.msg)
2046 return False
2047 return True
2048
2049 file_py = pathname + ".py"
2050 file_pyc = pathname + ".pyc"
2051 pycache_opt0 = importlib.util.cache_from_source(file_py, optimization='')
2052 pycache_opt1 = importlib.util.cache_from_source(file_py, optimization=1)
2053 pycache_opt2 = importlib.util.cache_from_source(file_py, optimization=2)
2054 if self._optimize == -1:
2055 # legacy mode: use whatever file is present
2056 if (os.path.isfile(file_pyc) and
2057 os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime):
2058 # Use .pyc file.
2059 arcname = fname = file_pyc
2060 elif (os.path.isfile(pycache_opt0) and
2061 os.stat(pycache_opt0).st_mtime >= os.stat(file_py).st_mtime):
2062 # Use the __pycache__/*.pyc file, but write it to the legacy pyc
2063 # file name in the archive.
2064 fname = pycache_opt0
2065 arcname = file_pyc
2066 elif (os.path.isfile(pycache_opt1) and
2067 os.stat(pycache_opt1).st_mtime >= os.stat(file_py).st_mtime):
2068 # Use the __pycache__/*.pyc file, but write it to the legacy pyc
2069 # file name in the archive.
2070 fname = pycache_opt1
2071 arcname = file_pyc
2072 elif (os.path.isfile(pycache_opt2) and
2073 os.stat(pycache_opt2).st_mtime >= os.stat(file_py).st_mtime):
2074 # Use the __pycache__/*.pyc file, but write it to the legacy pyc
2075 # file name in the archive.
2076 fname = pycache_opt2
2077 arcname = file_pyc
2078 else:
2079 # Compile py into PEP 3147 pyc file.
2080 if _compile(file_py):
2081 if sys.flags.optimize == 0:
2082 fname = pycache_opt0
2083 elif sys.flags.optimize == 1:
2084 fname = pycache_opt1
2085 else:
2086 fname = pycache_opt2
2087 arcname = file_pyc
2088 else:
2089 fname = arcname = file_py
2090 else:
2091 # new mode: use given optimization level
2092 if self._optimize == 0:
2093 fname = pycache_opt0
2094 arcname = file_pyc
2095 else:
2096 arcname = file_pyc
2097 if self._optimize == 1:
2098 fname = pycache_opt1
2099 elif self._optimize == 2:
2100 fname = pycache_opt2
2101 else:
2102 msg = "invalid value for 'optimize': {!r}".format(self._optimize)
2103 raise ValueError(msg)
2104 if not (os.path.isfile(fname) and
2105 os.stat(fname).st_mtime >= os.stat(file_py).st_mtime):
2106 if not _compile(file_py, optimize=self._optimize):
2107 fname = arcname = file_py
2108 archivename = os.path.split(arcname)[1]
2109 if basename:
2110 archivename = "%s/%s" % (basename, archivename)
2111 return (fname, archivename)
2112
2113
Haibo Huangd8830302020-03-03 10:09:46 -08002114def _parents(path):
2115 """
2116 Given a path with elements separated by
2117 posixpath.sep, generate all parents of that path.
2118
2119 >>> list(_parents('b/d'))
2120 ['b']
2121 >>> list(_parents('/b/d/'))
2122 ['/b']
2123 >>> list(_parents('b/d/f/'))
2124 ['b/d', 'b']
2125 >>> list(_parents('b'))
2126 []
2127 >>> list(_parents(''))
2128 []
2129 """
2130 return itertools.islice(_ancestry(path), 1, None)
2131
2132
2133def _ancestry(path):
2134 """
2135 Given a path with elements separated by
2136 posixpath.sep, generate all elements of that path
2137
2138 >>> list(_ancestry('b/d'))
2139 ['b/d', 'b']
2140 >>> list(_ancestry('/b/d/'))
2141 ['/b/d', '/b']
2142 >>> list(_ancestry('b/d/f/'))
2143 ['b/d/f', 'b/d', 'b']
2144 >>> list(_ancestry('b'))
2145 ['b']
2146 >>> list(_ancestry(''))
2147 []
2148 """
2149 path = path.rstrip(posixpath.sep)
2150 while path and path != posixpath.sep:
2151 yield path
2152 path, tail = posixpath.split(path)
2153
2154
Haibo Huangf5f93a72020-10-19 15:43:42 -07002155_dedupe = dict.fromkeys
2156"""Deduplicate an iterable in original order"""
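# For example, list(_dedupe('banana')) == ['b', 'a', 'n']; duplicates are
# dropped while first-seen order is preserved.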
2157
2158
2159def _difference(minuend, subtrahend):
2160 """
2161 Return items in minuend not in subtrahend, retaining order
2162 with O(1) lookup.
2163 """
2164 return itertools.filterfalse(set(subtrahend).__contains__, minuend)
2165
2166
Haibo Huang5980f852020-03-05 12:22:08 -08002167class CompleteDirs(ZipFile):
2168 """
2169 A ZipFile subclass that ensures that implied directories
2170 are always included in the namelist.
2171 """
2172
2173 @staticmethod
2174 def _implied_dirs(names):
2175 parents = itertools.chain.from_iterable(map(_parents, names))
Haibo Huangf5f93a72020-10-19 15:43:42 -07002176 as_dirs = (p + posixpath.sep for p in parents)
2177 return _dedupe(_difference(as_dirs, names))
Haibo Huang5980f852020-03-05 12:22:08 -08002178
2179 def namelist(self):
2180 names = super(CompleteDirs, self).namelist()
2181 return names + list(self._implied_dirs(names))
2182
2183 def _name_set(self):
2184 return set(self.namelist())
2185
2186 def resolve_dir(self, name):
2187 """
2188 If the name represents a directory, return that name
2189 as a directory (with the trailing slash).
2190 """
2191 names = self._name_set()
2192 dirname = name + '/'
2193 dir_match = name not in names and dirname in names
2194 return dirname if dir_match else name
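        # For example, if the archive contains 'b/' but no entry named 'b',
        # resolve_dir('b') returns 'b/'; any other name is returned unchanged.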
2195
2196 @classmethod
2197 def make(cls, source):
2198 """
2199 Given a source (filename or zipfile), return an
2200 appropriate CompleteDirs subclass.
2201 """
2202 if isinstance(source, CompleteDirs):
2203 return source
2204
2205 if not isinstance(source, ZipFile):
2206 return cls(source)
2207
Yi Kong71199322022-08-30 15:53:45 +08002208 # Only allow for FastLookup when supplied zipfile is read-only
Haibo Huang5980f852020-03-05 12:22:08 -08002209 if 'r' not in source.mode:
2210 cls = CompleteDirs
2211
Yi Kong71199322022-08-30 15:53:45 +08002212 source.__class__ = cls
2213 return source
Haibo Huang5980f852020-03-05 12:22:08 -08002214
2215
2216class FastLookup(CompleteDirs):
2217 """
2218 ZipFile subclass to ensure implicit
2219 dirs exist and are resolved rapidly.
2220 """
Yi Kong71199322022-08-30 15:53:45 +08002221
Haibo Huang5980f852020-03-05 12:22:08 -08002222 def namelist(self):
2223 with contextlib.suppress(AttributeError):
2224 return self.__names
2225 self.__names = super(FastLookup, self).namelist()
2226 return self.__names
2227
2228 def _name_set(self):
2229 with contextlib.suppress(AttributeError):
2230 return self.__lookup
2231 self.__lookup = super(FastLookup, self)._name_set()
2232 return self.__lookup
2233
2234
Haibo Huangd8830302020-03-03 10:09:46 -08002235class Path:
2236 """
2237 A pathlib-compatible interface for zip files.
2238
2239 Consider a zip file with this structure::
2240
2241 .
2242 ├── a.txt
2243 └── b
2244 ├── c.txt
2245 └── d
2246 └── e.txt
2247
2248 >>> data = io.BytesIO()
2249 >>> zf = ZipFile(data, 'w')
2250 >>> zf.writestr('a.txt', 'content of a')
2251 >>> zf.writestr('b/c.txt', 'content of c')
2252 >>> zf.writestr('b/d/e.txt', 'content of e')
Yi Kong71199322022-08-30 15:53:45 +08002253 >>> zf.filename = 'mem/abcde.zip'
Haibo Huangd8830302020-03-03 10:09:46 -08002254
2255 Path accepts the zipfile object itself or a filename
2256
2257 >>> root = Path(zf)
2258
2259 From there, several path operations are available.
2260
2261 Directory iteration (including the zip file itself):
2262
2263 >>> a, b = root.iterdir()
2264 >>> a
Yi Kong71199322022-08-30 15:53:45 +08002265 Path('mem/abcde.zip', 'a.txt')
Haibo Huangd8830302020-03-03 10:09:46 -08002266 >>> b
Yi Kong71199322022-08-30 15:53:45 +08002267 Path('mem/abcde.zip', 'b/')
Haibo Huangd8830302020-03-03 10:09:46 -08002268
2269 name property:
2270
2271 >>> b.name
2272 'b'
2273
2274 join with divide operator:
2275
2276 >>> c = b / 'c.txt'
2277 >>> c
Yi Kong71199322022-08-30 15:53:45 +08002278 Path('mem/abcde.zip', 'b/c.txt')
Haibo Huangd8830302020-03-03 10:09:46 -08002279 >>> c.name
2280 'c.txt'
2281
2282 Read text:
2283
2284 >>> c.read_text()
2285 'content of c'
2286
2287 existence:
2288
2289 >>> c.exists()
2290 True
2291 >>> (b / 'missing.txt').exists()
2292 False
2293
2294 Coercion to string:
2295
Yi Kong71199322022-08-30 15:53:45 +08002296 >>> import os
2297 >>> str(c).replace(os.sep, posixpath.sep)
2298 'mem/abcde.zip/b/c.txt'
2299
2300 At the root, ``name``, ``filename``, and ``parent``
2301 resolve to the zipfile. Note these attributes are not
2302 valid and will raise a ``ValueError`` if the zipfile
2303 has no filename.
2304
2305 >>> root.name
2306 'abcde.zip'
2307 >>> str(root.filename).replace(os.sep, posixpath.sep)
2308 'mem/abcde.zip'
2309 >>> str(root.parent)
2310 'mem'
Haibo Huangd8830302020-03-03 10:09:46 -08002311 """
2312
2313 __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})"
2314
2315 def __init__(self, root, at=""):
Yi Kong71199322022-08-30 15:53:45 +08002316 """
2317 Construct a Path from a ZipFile or filename.
2318
2319 Note: When the source is an existing ZipFile object,
2320 its type (__class__) will be mutated to a
2321 specialized type. If the caller wishes to retain the
2322 original type, the caller should either create a
2323 separate ZipFile object or pass a filename.
2324 """
Haibo Huang5980f852020-03-05 12:22:08 -08002325 self.root = FastLookup.make(root)
Haibo Huangd8830302020-03-03 10:09:46 -08002326 self.at = at
2327
Yi Kong71199322022-08-30 15:53:45 +08002328 def open(self, mode='r', *args, pwd=None, **kwargs):
Haibo Huang5eba2b42021-01-22 11:22:02 -08002329 """
2330 Open this entry as text or binary following the semantics
2331 of ``pathlib.Path.open()`` by passing arguments through
2332 to io.TextIOWrapper().
2333 """
Yi Kong71199322022-08-30 15:53:45 +08002334 if self.is_dir():
2335 raise IsADirectoryError(self)
Haibo Huang5eba2b42021-01-22 11:22:02 -08002336 zip_mode = mode[0]
Yi Kong71199322022-08-30 15:53:45 +08002337 if not self.exists() and zip_mode == 'r':
2338 raise FileNotFoundError(self)
Haibo Huang5eba2b42021-01-22 11:22:02 -08002339 stream = self.root.open(self.at, zip_mode, pwd=pwd)
2340 if 'b' in mode:
2341 if args or kwargs:
2342 raise ValueError("encoding args invalid for binary operation")
2343 return stream
Yi Kong71199322022-08-30 15:53:45 +08002344 else:
2345 kwargs["encoding"] = io.text_encoding(kwargs.get("encoding"))
Haibo Huang5eba2b42021-01-22 11:22:02 -08002346 return io.TextIOWrapper(stream, *args, **kwargs)
Haibo Huangd8830302020-03-03 10:09:46 -08002347
2348 @property
2349 def name(self):
Yi Kong71199322022-08-30 15:53:45 +08002350 return pathlib.Path(self.at).name or self.filename.name
2351
2352 @property
2353 def filename(self):
2354 return pathlib.Path(self.root.filename).joinpath(self.at)
Haibo Huangd8830302020-03-03 10:09:46 -08002355
2356 def read_text(self, *args, **kwargs):
Yi Kong71199322022-08-30 15:53:45 +08002357 kwargs["encoding"] = io.text_encoding(kwargs.get("encoding"))
Haibo Huang5eba2b42021-01-22 11:22:02 -08002358 with self.open('r', *args, **kwargs) as strm:
2359 return strm.read()
Haibo Huangd8830302020-03-03 10:09:46 -08002360
2361 def read_bytes(self):
Haibo Huang5eba2b42021-01-22 11:22:02 -08002362 with self.open('rb') as strm:
Haibo Huangd8830302020-03-03 10:09:46 -08002363 return strm.read()
2364
2365 def _is_child(self, path):
2366 return posixpath.dirname(path.at.rstrip("/")) == self.at.rstrip("/")
2367
2368 def _next(self, at):
Yi Kong71199322022-08-30 15:53:45 +08002369 return self.__class__(self.root, at)
Haibo Huangd8830302020-03-03 10:09:46 -08002370
2371 def is_dir(self):
2372 return not self.at or self.at.endswith("/")
2373
2374 def is_file(self):
Yi Kong71199322022-08-30 15:53:45 +08002375 return self.exists() and not self.is_dir()
Haibo Huangd8830302020-03-03 10:09:46 -08002376
2377 def exists(self):
Haibo Huang5980f852020-03-05 12:22:08 -08002378 return self.at in self.root._name_set()
Haibo Huangd8830302020-03-03 10:09:46 -08002379
2380 def iterdir(self):
2381 if not self.is_dir():
2382 raise ValueError("Can't listdir a file")
Haibo Huang5980f852020-03-05 12:22:08 -08002383 subs = map(self._next, self.root.namelist())
Haibo Huangd8830302020-03-03 10:09:46 -08002384 return filter(self._is_child, subs)
2385
2386 def __str__(self):
2387 return posixpath.join(self.root.filename, self.at)
2388
2389 def __repr__(self):
2390 return self.__repr.format(self=self)
2391
Yi Kong71199322022-08-30 15:53:45 +08002392 def joinpath(self, *other):
2393 next = posixpath.join(self.at, *other)
Haibo Huang5980f852020-03-05 12:22:08 -08002394 return self._next(self.root.resolve_dir(next))
Haibo Huangd8830302020-03-03 10:09:46 -08002395
2396 __truediv__ = joinpath
2397
Haibo Huangd8830302020-03-03 10:09:46 -08002398 @property
2399 def parent(self):
Yi Kong71199322022-08-30 15:53:45 +08002400 if not self.at:
2401 return self.filename.parent
Haibo Huangd8830302020-03-03 10:09:46 -08002402 parent_at = posixpath.dirname(self.at.rstrip('/'))
2403 if parent_at:
2404 parent_at += '/'
2405 return self._next(parent_at)
2406
Haibo Huangd8830302020-03-03 10:09:46 -08002407
2408def main(args=None):
2409 import argparse
2410
2411 description = 'A simple command-line interface for zipfile module.'
2412 parser = argparse.ArgumentParser(description=description)
2413 group = parser.add_mutually_exclusive_group(required=True)
2414 group.add_argument('-l', '--list', metavar='<zipfile>',
2415 help='Show listing of a zipfile')
2416 group.add_argument('-e', '--extract', nargs=2,
2417 metavar=('<zipfile>', '<output_dir>'),
2418 help='Extract zipfile into target dir')
2419 group.add_argument('-c', '--create', nargs='+',
2420 metavar=('<name>', '<file>'),
2421 help='Create zipfile from sources')
2422 group.add_argument('-t', '--test', metavar='<zipfile>',
2423 help='Test if a zipfile is valid')
2424 args = parser.parse_args(args)
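    # Command-line sketch (hypothetical archive and paths):
    #
    #   python -m zipfile -c archive.zip src/ notes.txt   # create
    #   python -m zipfile -l archive.zip                  # list contents
    #   python -m zipfile -e archive.zip out/             # extract into out/
    #   python -m zipfile -t archive.zip                  # test integrity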
2425
2426 if args.test is not None:
2427 src = args.test
2428 with ZipFile(src, 'r') as zf:
2429 badfile = zf.testzip()
2430 if badfile:
2431 print("The following enclosed file is corrupted: {!r}".format(badfile))
2432 print("Done testing")
2433
2434 elif args.list is not None:
2435 src = args.list
2436 with ZipFile(src, 'r') as zf:
2437 zf.printdir()
2438
2439 elif args.extract is not None:
2440 src, curdir = args.extract
2441 with ZipFile(src, 'r') as zf:
2442 zf.extractall(curdir)
2443
2444 elif args.create is not None:
2445 zip_name = args.create.pop(0)
2446 files = args.create
2447
2448 def addToZip(zf, path, zippath):
2449 if os.path.isfile(path):
2450 zf.write(path, zippath, ZIP_DEFLATED)
2451 elif os.path.isdir(path):
2452 if zippath:
2453 zf.write(path, zippath)
2454 for nm in sorted(os.listdir(path)):
2455 addToZip(zf,
2456 os.path.join(path, nm), os.path.join(zippath, nm))
2457 # else: ignore
2458
2459 with ZipFile(zip_name, 'w') as zf:
2460 for path in files:
2461 zippath = os.path.basename(path)
2462 if not zippath:
2463 zippath = os.path.basename(os.path.dirname(path))
2464 if zippath in ('', os.curdir, os.pardir):
2465 zippath = ''
2466 addToZip(zf, path, zippath)
2467
Haibo Huang5980f852020-03-05 12:22:08 -08002468
Haibo Huangd8830302020-03-03 10:09:46 -08002469if __name__ == "__main__":
2470 main()