blob: e422773b3edfb7082062b0b97f5a0833a04ade5e [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time, os
9import zlib
10import builtins
11import io
12import _compression
13
# Names exported by "from <this module> import *".
__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]

# Bit masks tested against the flag byte of a gzip member header
# (see _GzipReader._read_gzip_header and GzipFile._write_gzip_header).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ, WRITE = 1, 2

# Named compression levels, used as argument defaults and by main().
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
23
24
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    *filename* is either a path (str, bytes or os.PathLike) or an already
    open file object that supports read() or write().

    *mode* is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    access, or "rt", "wt", "xt" or "at" for text access.  It defaults to
    "rb"; *compresslevel* defaults to 9.

    Binary mode returns a GzipFile built as
    GzipFile(filename, mode, compresslevel); *encoding*, *errors* and
    *newline* must be left as None.

    Text mode builds the same GzipFile and wraps it in an io.TextIOWrapper
    configured with *encoding*, *errors* and *newline*.

    Raises ValueError for a mode combining "t" and "b", or for text-only
    arguments given in binary mode, and TypeError when *filename* is neither
    a path nor a file-like object.
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # The text-layer options are meaningless without a TextIOWrapper.
        text_only = (("encoding", encoding),
                     ("errors", errors),
                     ("newline", newline))
        for arg_name, arg_value in text_only:
            if arg_value is not None:
                raise ValueError(
                    "Argument %r not supported in binary mode" % (arg_name,))

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        gz_args = (filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        # An open file object is handed over as the fileobj argument.
        gz_args = (None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")
    binary_file = GzipFile(*gz_args)

    if not text_mode:
        return binary_file
    return io.TextIOWrapper(binary_file, encoding, errors, newline)
68
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer."""
    packed = struct.pack("<L", value)
    output.write(packed)
73
74class _PaddedFile:
75 """Minimal read-only file object that prepends a string to the contents
76 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
77 essential functionality."""
78
79 def __init__(self, f, prepend=b''):
80 self._buffer = prepend
81 self._length = len(prepend)
82 self.file = f
83 self._read = 0
84
85 def read(self, size):
86 if self._read is None:
87 return self.file.read(size)
88 if self._read + size <= self._length:
89 read = self._read
90 self._read += size
91 return self._buffer[read:self._read]
92 else:
93 read = self._read
94 self._read = None
95 return self._buffer[read:] + \
96 self.file.read(size-self._length+read)
97
98 def prepend(self, prepend=b''):
99 if self._read is None:
100 self._buffer = prepend
101 else: # Assume data was read since the last prepend() call
102 self._read -= len(prepend)
103 return
104 self._length = len(self._buffer)
105 self._read = 0
106
107 def seek(self, off):
108 self._read = None
109 self._buffer = None
110 return self.file.seek(off)
111
112 def seekable(self):
113 return True # Allows fast-forwarding even in unseekable streams
114
115
class BadGzipFile(OSError):
    """Raised by this module when gzip data is found to be invalid, e.g. a
    wrong magic number, an unknown compression method or a failed CRC or
    length check."""
118
119
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError if mode requests text or universal-newline access,
        or does not start with 'r', 'w', 'a' or 'x'.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            # We own this file object and are responsible for closing it.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        # Remember whether the caller chose the mode explicitly; inferring a
        # write mode from fileobj triggers the FutureWarning below.
        origmode = mode
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            if origmode is None:
                import warnings
                warnings.warn(
                    "GzipFile was opened for writing, but this will "
                    "change in future Python releases.  "
                    "Specify the mode argument for opening it for writing.",
                    FutureWarning, 2)
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: emit a raw deflate stream; the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            # builtins.open() accepts spellings such as 'br' that are not
            # recognised here.  Close the file we opened ourselves rather
            # than leaking the handle before reporting the error.
            myfileobj = self.myfileobj
            if myfileobj is not None:
                self.myfileobj = None
                myfileobj.close()
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header(compresslevel)

    @property
    def filename(self):
        """Deprecated alias of ``name`` (with ".gz" appended in write mode).

        Works for both str and bytes names.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        # The name is bytes when a bytes path was passed to the constructor;
        # the suffix must have the same type to be compared and appended.
        suffix = b".gz" if isinstance(self.name, bytes) else ".gz"
        if self.mode == WRITE and not self.name.endswith(suffix):
            return self.name + suffix
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to self.fileobj.

        compresslevel only selects the value of the extra-flags byte.
        """
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it to the underlying file.

        Accepts bytes or any object supporting the buffer protocol and
        returns the number of uncompressed bytes consumed.  Raises OSError
        (EBADF) if the object was not opened for writing.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read up to *size* uncompressed bytes via the internal buffer.

        Raises OSError (EBADF) if the object was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffer's peek(n).

        Raises OSError (EBADF) if the object was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has dropped the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release resources; safe to call twice.

        In write mode the remaining compressed data and the CRC/size trailer
        are written.  A file object opened by the constructor is closed; one
        supplied by the caller is left open.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            # Close our own file even if writing the trailer failed.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file (write mode).

        In read mode this only checks that the object is not closed.
        """
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes.  In read mode the request is delegated to the
        internal buffer.  Returns the new position.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for i in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line of uncompressed data via the internal buffer.

        Raises OSError (EBADF) if the object was not opened for reading,
        matching read(), read1() and peek().
        """
        self._check_not_closed()
        if self.mode != READ:
            # Without this guard a write-mode object would fail with an
            # AttributeError, because _buffer only exists in read mode.
            import errno
            raise OSError(errno.EBADF,
                          "readline() on write-only GzipFile object")
        return self._buffer.readline(size)
399
400
class _GzipReader(_compression.DecompressReader):
    """Raw reader that decodes a stream of one or more gzip members.

    The gzip header and trailer of every member are parsed here; only the
    deflate payload is handed to zlib.  Several attributes used below
    (_fp, _decompressor, _decomp_factory, _decomp_args, _pos, _size) are
    not defined in this class -- presumably they come from
    _compression.DecompressReader; confirm against that module.
    """

    def __init__(self, fp):
        # Negative wbits selects a raw deflate stream, since the gzip
        # framing is handled by _read_gzip_header() and _read_eof().
        # _PaddedFile lets already-read bytes be pushed back.
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        # mtime field of the most recently parsed member header.
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size before decoding a new member."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.

        Raises EOFError if the file ends before *n* bytes were read.
        '''

        data = self._fp.read(n)
        while len(data) < n:
            b = self._fp.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Parse one member header from self._fp.

        Returns False if the file is already at EOF and True once a header
        has been consumed.  Raises BadGzipFile for a wrong magic number or
        an unknown compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise BadGzipFile('Not a gzipped file (%r)' % magic)

        # One byte method, one byte flags, a 4-byte little-endian mtime;
        # the two trailing bytes are read but ignored ("xx").
        (method, flag,
         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise BadGzipFile('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            # (the loop also stops silently at end of file)
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            # (the loop also stops silently at end of file)
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, or b"" at end of file.

        A negative *size* delegates to readall().  Raises EOFError if the
        compressed data stops before the end of a member.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # A single call to decompress() may not return any data for
        # certain input, so keep going until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                # A fresh decompressor object is created for the next member.
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    # No further member: record the current position as the
                    # total size (NOTE(review): _size/_pos presumably belong
                    # to DecompressReader -- confirm).
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                # Input the decompressor did not consume because of the
                # size limit goes back to the file for the next call.
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data( uncompress )
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        """Validate the member trailer and skip any zero padding after it.

        Raises BadGzipFile if the stored CRC or length does not match the
        data that was decompressed.
        """
        # We've read to the end of the member.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            # Push the first non-zero byte back for the next header parse.
            self._fp.prepend(c)

    def _rewind(self):
        """Rewind via the base class and expect a member header next."""
        super()._rewind()
        self._new_member = True
540 self._new_member = True
541
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Return *data* compressed as a complete gzip stream, in one shot.

    *compresslevel* is the compression level, in the range 0-9.  *mtime* is
    passed on to GzipFile for the header's modification time field.
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel,
                      mtime=mtime)
    # Leaving the with-block closes the GzipFile, which writes the trailer.
    with writer:
        writer.write(data)
    return sink.getvalue()
550
def decompress(data):
    """Return the decompressed contents of the gzip data *data*, in one
    shot.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload
557
558
def main():
    """Command line interface: compress or decompress the named files.

    Each argument is processed in turn; "-" (the default) means standard
    input to standard output.  Compressing FILE writes FILE.gz and
    decompressing FILE.gz writes FILE; the input file is never deleted.
    --fast and --best select the compression level for every output,
    whether it goes to a file or to standard output.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                       help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        if args.decompress and arg != "-" and arg[-3:] != ".gz":
            print("filename doesn't end in .gz:", repr(arg))
            continue

        f = g = None
        try:
            if args.decompress:
                if arg == "-":
                    f = GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer)
                    g = sys.stdout.buffer
                else:
                    f = open(arg, "rb")
                    g = builtins.open(arg[:-3], "wb")
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer,
                                 compresslevel=compresslevel)
                else:
                    f = builtins.open(arg, "rb")
                    # Pass the selected level explicitly; otherwise open()
                    # falls back to its own default and --fast/--best would
                    # be ignored for file arguments.
                    g = open(arg + ".gz", "wb", compresslevel=compresslevel)
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Release both ends even if opening or copying failed, but never
            # close the process-wide standard streams.
            try:
                if g is not None and g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not None and f is not sys.stdin.buffer:
                    f.close()
607
# Run the command line interface when the module is executed as a script.
if __name__ == '__main__':
    main()