Blame - Lib/gzip.py - platform/prebuilts/python/windows-x86

blob: e422773b3edfb7082062b0b97f5a0833a04ade5e [file] [log] [blame]

Haibo Huang	d883030	2020-03-03 10:09:46 -0800	[diff] [blame]	1	"""Functions that read and write gzipped files.
				2
				3	The user of the file doesn't have to worry about the compression,
				4	but random access is not allowed."""
				5
				6	# based on Andrew Kuchling's minigzip.py distributed with the zlib module
				7
				8	import struct, sys, time, os
				9	import zlib
				10	import builtins
				11	import io
				12	import _compression
				13
				14	__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]
				15
				16	FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
				17
				18	READ, WRITE = 1, 2
				19
				20	_COMPRESS_LEVEL_FAST = 1
				21	_COMPRESS_LEVEL_TRADEOFF = 6
				22	_COMPRESS_LEVEL_BEST = 9
				23
				24
				25	def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
				26	encoding=None, errors=None, newline=None):
				27	"""Open a gzip-compressed file in binary or text mode.
				28
				29	The filename argument can be an actual filename (a str or bytes object), or
				30	an existing file object to read from or write to.
				31
				32	The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
				33	binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
				34	"rb", and the default compresslevel is 9.
				35
				36	For binary mode, this function is equivalent to the GzipFile constructor:
				37	GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
				38	and newline arguments must not be provided.
				39
				40	For text mode, a GzipFile object is created, and wrapped in an
				41	io.TextIOWrapper instance with the specified encoding, error handling
				42	behavior, and line ending(s).
				43
				44	"""
				45	if "t" in mode:
				46	if "b" in mode:
				47	raise ValueError("Invalid mode: %r" % (mode,))
				48	else:
				49	if encoding is not None:
				50	raise ValueError("Argument 'encoding' not supported in binary mode")
				51	if errors is not None:
				52	raise ValueError("Argument 'errors' not supported in binary mode")
				53	if newline is not None:
				54	raise ValueError("Argument 'newline' not supported in binary mode")
				55
				56	gz_mode = mode.replace("t", "")
				57	if isinstance(filename, (str, bytes, os.PathLike)):
				58	binary_file = GzipFile(filename, gz_mode, compresslevel)
				59	elif hasattr(filename, "read") or hasattr(filename, "write"):
				60	binary_file = GzipFile(None, gz_mode, compresslevel, filename)
				61	else:
				62	raise TypeError("filename must be a str or bytes object, or a file")
				63
				64	if "t" in mode:
				65	return io.TextIOWrapper(binary_file, encoding, errors, newline)
				66	else:
				67	return binary_file
				68
				69	def write32u(output, value):
				70	# The L format writes the bit pattern correctly whether signed
				71	# or unsigned.
				72	output.write(struct.pack("<L", value))
				73
				74	class _PaddedFile:
				75	"""Minimal read-only file object that prepends a string to the contents
				76	of an actual file. Shouldn't be used outside of gzip.py, as it lacks
				77	essential functionality."""
				78
				79	def __init__(self, f, prepend=b''):
				80	self._buffer = prepend
				81	self._length = len(prepend)
				82	self.file = f
				83	self._read = 0
				84
				85	def read(self, size):
				86	if self._read is None:
				87	return self.file.read(size)
				88	if self._read + size <= self._length:
				89	read = self._read
				90	self._read += size
				91	return self._buffer[read:self._read]
				92	else:
				93	read = self._read
				94	self._read = None
				95	return self._buffer[read:] + \
				96	self.file.read(size-self._length+read)
				97
				98	def prepend(self, prepend=b''):
				99	if self._read is None:
				100	self._buffer = prepend
				101	else: # Assume data was read since the last prepend() call
				102	self._read -= len(prepend)
				103	return
				104	self._length = len(self._buffer)
				105	self._read = 0
				106
				107	def seek(self, off):
				108	self._read = None
				109	self._buffer = None
				110	return self.file.seek(off)
				111
				112	def seekable(self):
				113	return True # Allows fast-forwarding even in unseekable streams
				114
				115
				116	class BadGzipFile(OSError):
				117	"""Exception raised in some cases for invalid gzip files."""
				118
				119
				120	class GzipFile(_compression.BaseStream):
				121	"""The GzipFile class simulates most of the methods of a file object with
				122	the exception of the truncate() method.
				123
				124	This class only supports opening files in binary mode. If you need to open a
				125	compressed file in text mode, use the gzip.open() function.
				126
				127	"""
				128
				129	# Overridden with internal file object to be closed, if only a filename
				130	# is passed in
				131	myfileobj = None
				132
				133	def __init__(self, filename=None, mode=None,
				134	compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
				135	"""Constructor for the GzipFile class.
				136
				137	At least one of fileobj and filename must be given a
				138	non-trivial value.
				139
				140	The new class instance is based on fileobj, which can be a regular
				141	file, an io.BytesIO object, or any other object which simulates a file.
				142	It defaults to None, in which case filename is opened to provide
				143	a file object.
				144
				145	When fileobj is not None, the filename argument is only used to be
				146	included in the gzip file header, which may include the original
				147	filename of the uncompressed file. It defaults to the filename of
				148	fileobj, if discernible; otherwise, it defaults to the empty string,
				149	and in this case the original filename is not included in the header.
				150
				151	The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
				152	'xb' depending on whether the file will be read or written. The default
				153	is the mode of fileobj if discernible; otherwise, the default is 'rb'.
				154	A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
				155	'wb', 'a' and 'ab', and 'x' and 'xb'.
				156
				157	The compresslevel argument is an integer from 0 to 9 controlling the
				158	level of compression; 1 is fastest and produces the least compression,
				159	and 9 is slowest and produces the most compression. 0 is no compression
				160	at all. The default is 9.
				161
				162	The mtime argument is an optional numeric timestamp to be written
				163	to the last modification time field in the stream when compressing.
				164	If omitted or None, the current time is used.
				165
				166	"""
				167
				168	if mode and ('t' in mode or 'U' in mode):
				169	raise ValueError("Invalid mode: {!r}".format(mode))
				170	if mode and 'b' not in mode:
				171	mode += 'b'
				172	if fileobj is None:
				173	fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
				174	if filename is None:
				175	filename = getattr(fileobj, 'name', '')
				176	if not isinstance(filename, (str, bytes)):
				177	filename = ''
				178	else:
				179	filename = os.fspath(filename)
Haibo Huang	5eba2b4	2021-01-22 11:22:02 -0800	[diff] [blame]	180	origmode = mode
Haibo Huang	d883030	2020-03-03 10:09:46 -0800	[diff] [blame]	181	if mode is None:
				182	mode = getattr(fileobj, 'mode', 'rb')
				183
				184	if mode.startswith('r'):
				185	self.mode = READ
				186	raw = _GzipReader(fileobj)
				187	self._buffer = io.BufferedReader(raw)
				188	self.name = filename
				189
				190	elif mode.startswith(('w', 'a', 'x')):
Haibo Huang	5eba2b4	2021-01-22 11:22:02 -0800	[diff] [blame]	191	if origmode is None:
				192	import warnings
				193	warnings.warn(
				194	"GzipFile was opened for writing, but this will "
				195	"change in future Python releases. "
				196	"Specify the mode argument for opening it for writing.",
				197	FutureWarning, 2)
Haibo Huang	d883030	2020-03-03 10:09:46 -0800	[diff] [blame]	198	self.mode = WRITE
				199	self._init_write(filename)
				200	self.compress = zlib.compressobj(compresslevel,
				201	zlib.DEFLATED,
				202	-zlib.MAX_WBITS,
				203	zlib.DEF_MEM_LEVEL,
				204	0)
				205	self._write_mtime = mtime
				206	else:
				207	raise ValueError("Invalid mode: {!r}".format(mode))
				208
				209	self.fileobj = fileobj
				210
				211	if self.mode == WRITE:
Haibo Huang	5980f85	2020-03-05 12:22:08 -0800	[diff] [blame]	212	self._write_gzip_header(compresslevel)
Haibo Huang	d883030	2020-03-03 10:09:46 -0800	[diff] [blame]	213
				214	@property
				215	def filename(self):
				216	import warnings
				217	warnings.warn("use the name attribute", DeprecationWarning, 2)
				218	if self.mode == WRITE and self.name[-3:] != ".gz":
				219	return self.name + ".gz"
				220	return self.name
				221
				222	@property
				223	def mtime(self):
				224	"""Last modification time read from stream, or None"""
				225	return self._buffer.raw._last_mtime
				226
				227	def __repr__(self):
				228	s = repr(self.fileobj)
				229	return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
				230
				231	def _init_write(self, filename):
				232	self.name = filename
				233	self.crc = zlib.crc32(b"")
				234	self.size = 0
				235	self.writebuf = []
				236	self.bufsize = 0
				237	self.offset = 0 # Current file offset for seek(), tell(), etc
				238
Haibo Huang	5980f85	2020-03-05 12:22:08 -0800	[diff] [blame]	239	def _write_gzip_header(self, compresslevel):
Haibo Huang	d883030	2020-03-03 10:09:46 -0800	[diff] [blame]	240	self.fileobj.write(b'\037\213') # magic header
				241	self.fileobj.write(b'\010') # compression method
				242	try:
				243	# RFC 1952 requires the FNAME field to be Latin-1. Do not
				244	# include filenames that cannot be represented that way.
				245	fname = os.path.basename(self.name)
				246	if not isinstance(fname, bytes):
				247	fname = fname.encode('latin-1')
				248	if fname.endswith(b'.gz'):
				249	fname = fname[:-3]
				250	except UnicodeEncodeError:
				251	fname = b''
				252	flags = 0
				253	if fname:
				254	flags = FNAME
				255	self.fileobj.write(chr(flags).encode('latin-1'))
				256	mtime = self._write_mtime
				257	if mtime is None:
				258	mtime = time.time()
				259	write32u(self.fileobj, int(mtime))
Haibo Huang	5980f85	2020-03-05 12:22:08 -0800	[diff] [blame]	260	if compresslevel == _COMPRESS_LEVEL_BEST:
				261	xfl = b'\002'
				262	elif compresslevel == _COMPRESS_LEVEL_FAST:
				263	xfl = b'\004'
				264	else:
				265	xfl = b'\000'
				266	self.fileobj.write(xfl)
Haibo Huang	d883030	2020-03-03 10:09:46 -0800	[diff] [blame]	267	self.fileobj.write(b'\377')
				268	if fname:
				269	self.fileobj.write(fname + b'\000')
				270
				271	def write(self,data):
				272	self._check_not_closed()
				273	if self.mode != WRITE:
				274	import errno
				275	raise OSError(errno.EBADF, "write() on read-only GzipFile object")
				276
				277	if self.fileobj is None:
				278	raise ValueError("write() on closed GzipFile object")
				279
				280	if isinstance(data, bytes):
				281	length = len(data)
				282	else:
				283	# accept any data that supports the buffer protocol
				284	data = memoryview(data)
				285	length = data.nbytes
				286
				287	if length > 0:
				288	self.fileobj.write(self.compress.compress(data))
				289	self.size += length
				290	self.crc = zlib.crc32(data, self.crc)
				291	self.offset += length
				292
				293	return length
				294
				295	def read(self, size=-1):
				296	self._check_not_closed()
				297	if self.mode != READ:
				298	import errno
				299	raise OSError(errno.EBADF, "read() on write-only GzipFile object")
				300	return self._buffer.read(size)
				301
				302	def read1(self, size=-1):
				303	"""Implements BufferedIOBase.read1()
				304
				305	Reads up to a buffer's worth of data if size is negative."""
				306	self._check_not_closed()
				307	if self.mode != READ:
				308	import errno
				309	raise OSError(errno.EBADF, "read1() on write-only GzipFile object")
				310
				311	if size < 0:
				312	size = io.DEFAULT_BUFFER_SIZE
				313	return self._buffer.read1(size)
				314
				315	def peek(self, n):
				316	self._check_not_closed()
				317	if self.mode != READ:
				318	import errno
				319	raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
				320	return self._buffer.peek(n)
				321
				322	@property
				323	def closed(self):
				324	return self.fileobj is None
				325
				326	def close(self):
				327	fileobj = self.fileobj
				328	if fileobj is None:
				329	return
				330	self.fileobj = None
				331	try:
				332	if self.mode == WRITE:
				333	fileobj.write(self.compress.flush())
				334	write32u(fileobj, self.crc)
				335	# self.size may exceed 2 GiB, or even 4 GiB
				336	write32u(fileobj, self.size & 0xffffffff)
				337	elif self.mode == READ:
				338	self._buffer.close()
				339	finally:
				340	myfileobj = self.myfileobj
				341	if myfileobj:
				342	self.myfileobj = None
				343	myfileobj.close()
				344
				345	def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
				346	self._check_not_closed()
				347	if self.mode == WRITE:
				348	# Ensure the compressor's buffer is flushed
				349	self.fileobj.write(self.compress.flush(zlib_mode))
				350	self.fileobj.flush()
				351
				352	def fileno(self):
				353	"""Invoke the underlying file object's fileno() method.
				354
				355	This will raise AttributeError if the underlying file object
				356	doesn't support fileno().
				357	"""
				358	return self.fileobj.fileno()
				359
				360	def rewind(self):
				361	'''Return the uncompressed stream file position indicator to the
				362	beginning of the file'''
				363	if self.mode != READ:
				364	raise OSError("Can't rewind in write mode")
				365	self._buffer.seek(0)
				366
				367	def readable(self):
				368	return self.mode == READ
				369
				370	def writable(self):
				371	return self.mode == WRITE
				372
				373	def seekable(self):
				374	return True
				375
				376	def seek(self, offset, whence=io.SEEK_SET):
				377	if self.mode == WRITE:
				378	if whence != io.SEEK_SET:
				379	if whence == io.SEEK_CUR:
				380	offset = self.offset + offset
				381	else:
				382	raise ValueError('Seek from end not supported')
				383	if offset < self.offset:
				384	raise OSError('Negative seek in write mode')
				385	count = offset - self.offset
				386	chunk = b'\0' * 1024
				387	for i in range(count // 1024):
				388	self.write(chunk)
				389	self.write(b'\0' * (count % 1024))
				390	elif self.mode == READ:
				391	self._check_not_closed()
				392	return self._buffer.seek(offset, whence)
				393
				394	return self.offset
				395
				396	def readline(self, size=-1):
				397	self._check_not_closed()
				398	return self._buffer.readline(size)
				399
				400
				401	class _GzipReader(_compression.DecompressReader):
				402	def __init__(self, fp):
				403	super().__init__(_PaddedFile(fp), zlib.decompressobj,
				404	wbits=-zlib.MAX_WBITS)
				405	# Set flag indicating start of a new member
				406	self._new_member = True
				407	self._last_mtime = None
				408
				409	def _init_read(self):
				410	self._crc = zlib.crc32(b"")
				411	self._stream_size = 0 # Decompressed size of unconcatenated stream
				412
				413	def _read_exact(self, n):
				414	'''Read exactly n bytes from `self._fp`
				415
				416	This method is required because self._fp may be unbuffered,
				417	i.e. return short reads.
				418	'''
				419
				420	data = self._fp.read(n)
				421	while len(data) < n:
				422	b = self._fp.read(n - len(data))
				423	if not b:
				424	raise EOFError("Compressed file ended before the "
				425	"end-of-stream marker was reached")
				426	data += b
				427	return data
				428
				429	def _read_gzip_header(self):
				430	magic = self._fp.read(2)
				431	if magic == b'':
				432	return False
				433
				434	if magic != b'\037\213':
				435	raise BadGzipFile('Not a gzipped file (%r)' % magic)
				436
				437	(method, flag,
				438	self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
				439	if method != 8:
				440	raise BadGzipFile('Unknown compression method')
				441
				442	if flag & FEXTRA:
				443	# Read & discard the extra field, if present
				444	extra_len, = struct.unpack("<H", self._read_exact(2))
				445	self._read_exact(extra_len)
				446	if flag & FNAME:
				447	# Read and discard a null-terminated string containing the filename
				448	while True:
				449	s = self._fp.read(1)
				450	if not s or s==b'\000':
				451	break
				452	if flag & FCOMMENT:
				453	# Read and discard a null-terminated string containing a comment
				454	while True:
				455	s = self._fp.read(1)
				456	if not s or s==b'\000':
				457	break
				458	if flag & FHCRC:
				459	self._read_exact(2) # Read & discard the 16-bit header CRC
				460	return True
				461
				462	def read(self, size=-1):
				463	if size < 0:
				464	return self.readall()
				465	# size=0 is special because decompress(max_length=0) is not supported
				466	if not size:
				467	return b""
				468
				469	# For certain input data, a single
				470	# call to decompress() may not return
				471	# any data. In this case, retry until we get some data or reach EOF.
				472	while True:
				473	if self._decompressor.eof:
				474	# Ending case: we've come to the end of a member in the file,
				475	# so finish up this member, and read a new gzip header.
				476	# Check the CRC and file size, and set the flag so we read
				477	# a new member
				478	self._read_eof()
				479	self._new_member = True
				480	self._decompressor = self._decomp_factory(
				481	**self._decomp_args)
				482
				483	if self._new_member:
				484	# If the _new_member flag is set, we have to
				485	# jump to the next member, if there is one.
				486	self._init_read()
				487	if not self._read_gzip_header():
				488	self._size = self._pos
				489	return b""
				490	self._new_member = False
				491
				492	# Read a chunk of data from the file
				493	buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
				494
				495	uncompress = self._decompressor.decompress(buf, size)
				496	if self._decompressor.unconsumed_tail != b"":
				497	self._fp.prepend(self._decompressor.unconsumed_tail)
				498	elif self._decompressor.unused_data != b"":
				499	# Prepend the already read bytes to the fileobj so they can
				500	# be seen by _read_eof() and _read_gzip_header()
				501	self._fp.prepend(self._decompressor.unused_data)
				502
				503	if uncompress != b"":
				504	break
				505	if buf == b"":
				506	raise EOFError("Compressed file ended before the "
				507	"end-of-stream marker was reached")
				508
				509	self._add_read_data( uncompress )
				510	self._pos += len(uncompress)
				511	return uncompress
				512
				513	def _add_read_data(self, data):
				514	self._crc = zlib.crc32(data, self._crc)
				515	self._stream_size = self._stream_size + len(data)
				516
				517	def _read_eof(self):
				518	# We've read to the end of the file
				519	# We check the that the computed CRC and size of the
				520	# uncompressed data matches the stored values. Note that the size
				521	# stored is the true file size mod 2**32.
				522	crc32, isize = struct.unpack("<II", self._read_exact(8))
				523	if crc32 != self._crc:
				524	raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
				525	hex(self._crc)))
				526	elif isize != (self._stream_size & 0xffffffff):
				527	raise BadGzipFile("Incorrect length of data produced")
				528
				529	# Gzip files can be padded with zeroes and still have archives.
				530	# Consume all zero bytes and set the file position to the first
				531	# non-zero byte. See http://www.gzip.org/#faq8
				532	c = b"\x00"
				533	while c == b"\x00":
				534	c = self._fp.read(1)
				535	if c:
				536	self._fp.prepend(c)
				537
				538	def _rewind(self):
				539	super()._rewind()
				540	self._new_member = True
				541
				542	def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
				543	"""Compress data in one shot and return the compressed string.
				544	Optional argument is the compression level, in range of 0-9.
				545	"""
				546	buf = io.BytesIO()
				547	with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel, mtime=mtime) as f:
				548	f.write(data)
				549	return buf.getvalue()
				550
				551	def decompress(data):
				552	"""Decompress a gzip compressed string in one shot.
				553	Return the decompressed string.
				554	"""
				555	with GzipFile(fileobj=io.BytesIO(data)) as f:
				556	return f.read()
				557
				558
				559	def main():
				560	from argparse import ArgumentParser
				561	parser = ArgumentParser(description=
				562	"A simple command line interface for the gzip module: act like gzip, "
				563	"but do not delete the input file.")
				564	group = parser.add_mutually_exclusive_group()
				565	group.add_argument('--fast', action='store_true', help='compress faster')
				566	group.add_argument('--best', action='store_true', help='compress better')
				567	group.add_argument("-d", "--decompress", action="store_true",
				568	help="act like gunzip instead of gzip")
				569
				570	parser.add_argument("args", nargs="*", default=["-"], metavar='file')
				571	args = parser.parse_args()
				572
				573	compresslevel = _COMPRESS_LEVEL_TRADEOFF
				574	if args.fast:
				575	compresslevel = _COMPRESS_LEVEL_FAST
				576	elif args.best:
				577	compresslevel = _COMPRESS_LEVEL_BEST
				578
				579	for arg in args.args:
				580	if args.decompress:
				581	if arg == "-":
				582	f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
				583	g = sys.stdout.buffer
				584	else:
				585	if arg[-3:] != ".gz":
				586	print("filename doesn't end in .gz:", repr(arg))
				587	continue
				588	f = open(arg, "rb")
				589	g = builtins.open(arg[:-3], "wb")
				590	else:
				591	if arg == "-":
				592	f = sys.stdin.buffer
				593	g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer,
				594	compresslevel=compresslevel)
				595	else:
				596	f = builtins.open(arg, "rb")
				597	g = open(arg + ".gz", "wb")
				598	while True:
				599	chunk = f.read(1024)
				600	if not chunk:
				601	break
				602	g.write(chunk)
				603	if g is not sys.stdout.buffer:
				604	g.close()
				605	if f is not sys.stdin.buffer:
				606	f.close()
				607
				608	if __name__ == '__main__':
				609	main()