summaryrefslogtreecommitdiff
path: root/Lib/lzma.py
diff options
context:
space:
mode:
authorAntoine Pitrou <solipsis@pitrou.net>2015-04-11 00:31:01 +0200
committerAntoine Pitrou <solipsis@pitrou.net>2015-04-11 00:31:01 +0200
commit2dbc6e6bce0a29757acddd8000d55f7c844295a2 (patch)
treef1510e3a93b2527308dd6400a8b0544607e072db /Lib/lzma.py
parent2ce11d296cee8d71d2bf2451c7dba4ffa119d9d3 (diff)
downloadcpython-git-2dbc6e6bce0a29757acddd8000d55f7c844295a2.tar.gz
Issue #23529: Limit the size of decompressed data when reading from
GzipFile, BZ2File or LZMAFile. This defeats denial of service attacks using compressed bombs (i.e. compressed payloads which decompress to a huge size). Patch by Martin Panter and Nikolaus Rath.
Diffstat (limited to 'Lib/lzma.py')
-rw-r--r--Lib/lzma.py224
1 files changed, 29 insertions, 195 deletions
diff --git a/Lib/lzma.py b/Lib/lzma.py
index f1d3958179..89528b60ce 100644
--- a/Lib/lzma.py
+++ b/Lib/lzma.py
@@ -25,17 +25,16 @@ import builtins
import io
from _lzma import *
from _lzma import _encode_filter_properties, _decode_filter_properties
+import _compression
_MODE_CLOSED = 0
_MODE_READ = 1
-_MODE_READ_EOF = 2
+# Value 2 no longer used
_MODE_WRITE = 3
-_BUFFER_SIZE = 8192
-
-class LZMAFile(io.BufferedIOBase):
+class LZMAFile(_compression.BaseStream):
"""A file object providing transparent LZMA (de)compression.
@@ -92,8 +91,6 @@ class LZMAFile(io.BufferedIOBase):
self._fp = None
self._closefp = False
self._mode = _MODE_CLOSED
- self._pos = 0
- self._size = -1
if mode in ("r", "rb"):
if check != -1:
@@ -105,19 +102,13 @@ class LZMAFile(io.BufferedIOBase):
if format is None:
format = FORMAT_AUTO
mode_code = _MODE_READ
- # Save the args to pass to the LZMADecompressor initializer.
- # If the file contains multiple compressed streams, each
- # stream will need a separate decompressor object.
- self._init_args = {"format":format, "filters":filters}
- self._decompressor = LZMADecompressor(**self._init_args)
- self._buffer = b""
- self._buffer_offset = 0
elif mode in ("w", "wb", "a", "ab", "x", "xb"):
if format is None:
format = FORMAT_XZ
mode_code = _MODE_WRITE
self._compressor = LZMACompressor(format=format, check=check,
preset=preset, filters=filters)
+ self._pos = 0
else:
raise ValueError("Invalid mode: {!r}".format(mode))
@@ -133,6 +124,11 @@ class LZMAFile(io.BufferedIOBase):
else:
raise TypeError("filename must be a str or bytes object, or a file")
+ if self._mode == _MODE_READ:
+ raw = _compression.DecompressReader(self._fp, LZMADecompressor,
+ trailing_error=LZMAError, format=format, filters=filters)
+ self._buffer = io.BufferedReader(raw)
+
def close(self):
"""Flush and close the file.
@@ -142,9 +138,9 @@ class LZMAFile(io.BufferedIOBase):
if self._mode == _MODE_CLOSED:
return
try:
- if self._mode in (_MODE_READ, _MODE_READ_EOF):
- self._decompressor = None
- self._buffer = b""
+ if self._mode == _MODE_READ:
+ self._buffer.close()
+ self._buffer = None
elif self._mode == _MODE_WRITE:
self._fp.write(self._compressor.flush())
self._compressor = None
@@ -169,123 +165,18 @@ class LZMAFile(io.BufferedIOBase):
def seekable(self):
"""Return whether the file supports seeking."""
- return self.readable() and self._fp.seekable()
+ return self.readable() and self._buffer.seekable()
def readable(self):
"""Return whether the file was opened for reading."""
self._check_not_closed()
- return self._mode in (_MODE_READ, _MODE_READ_EOF)
+ return self._mode == _MODE_READ
def writable(self):
"""Return whether the file was opened for writing."""
self._check_not_closed()
return self._mode == _MODE_WRITE
- # Mode-checking helper functions.
-
- def _check_not_closed(self):
- if self.closed:
- raise ValueError("I/O operation on closed file")
-
- def _check_can_read(self):
- if self._mode not in (_MODE_READ, _MODE_READ_EOF):
- self._check_not_closed()
- raise io.UnsupportedOperation("File not open for reading")
-
- def _check_can_write(self):
- if self._mode != _MODE_WRITE:
- self._check_not_closed()
- raise io.UnsupportedOperation("File not open for writing")
-
- def _check_can_seek(self):
- if self._mode not in (_MODE_READ, _MODE_READ_EOF):
- self._check_not_closed()
- raise io.UnsupportedOperation("Seeking is only supported "
- "on files open for reading")
- if not self._fp.seekable():
- raise io.UnsupportedOperation("The underlying file object "
- "does not support seeking")
-
- # Fill the readahead buffer if it is empty. Returns False on EOF.
- def _fill_buffer(self):
- if self._mode == _MODE_READ_EOF:
- return False
- # Depending on the input data, our call to the decompressor may not
- # return any data. In this case, try again after reading another block.
- while self._buffer_offset == len(self._buffer):
- rawblock = (self._decompressor.unused_data or
- self._fp.read(_BUFFER_SIZE))
-
- if not rawblock:
- if self._decompressor.eof:
- self._mode = _MODE_READ_EOF
- self._size = self._pos
- return False
- else:
- raise EOFError("Compressed file ended before the "
- "end-of-stream marker was reached")
-
- if self._decompressor.eof:
- # Continue to next stream.
- self._decompressor = LZMADecompressor(**self._init_args)
- try:
- self._buffer = self._decompressor.decompress(rawblock)
- except LZMAError:
- # Trailing data isn't a valid compressed stream; ignore it.
- self._mode = _MODE_READ_EOF
- self._size = self._pos
- return False
- else:
- self._buffer = self._decompressor.decompress(rawblock)
- self._buffer_offset = 0
- return True
-
- # Read data until EOF.
- # If return_data is false, consume the data without returning it.
- def _read_all(self, return_data=True):
- # The loop assumes that _buffer_offset is 0. Ensure that this is true.
- self._buffer = self._buffer[self._buffer_offset:]
- self._buffer_offset = 0
-
- blocks = []
- while self._fill_buffer():
- if return_data:
- blocks.append(self._buffer)
- self._pos += len(self._buffer)
- self._buffer = b""
- if return_data:
- return b"".join(blocks)
-
- # Read a block of up to n bytes.
- # If return_data is false, consume the data without returning it.
- def _read_block(self, n, return_data=True):
- # If we have enough data buffered, return immediately.
- end = self._buffer_offset + n
- if end <= len(self._buffer):
- data = self._buffer[self._buffer_offset : end]
- self._buffer_offset = end
- self._pos += len(data)
- return data if return_data else None
-
- # The loop assumes that _buffer_offset is 0. Ensure that this is true.
- self._buffer = self._buffer[self._buffer_offset:]
- self._buffer_offset = 0
-
- blocks = []
- while n > 0 and self._fill_buffer():
- if n < len(self._buffer):
- data = self._buffer[:n]
- self._buffer_offset = n
- else:
- data = self._buffer
- self._buffer = b""
- if return_data:
- blocks.append(data)
- self._pos += len(data)
- n -= len(data)
- if return_data:
- return b"".join(blocks)
-
def peek(self, size=-1):
"""Return buffered data without advancing the file position.
@@ -293,9 +184,9 @@ class LZMAFile(io.BufferedIOBase):
The exact number of bytes returned is unspecified.
"""
self._check_can_read()
- if not self._fill_buffer():
- return b""
- return self._buffer[self._buffer_offset:]
+ # Relies on the undocumented fact that BufferedReader.peek() always
+ # returns at least one byte (except at EOF)
+ return self._buffer.peek(size)
def read(self, size=-1):
"""Read up to size uncompressed bytes from the file.
@@ -304,38 +195,19 @@ class LZMAFile(io.BufferedIOBase):
Returns b"" if the file is already at EOF.
"""
self._check_can_read()
- if size == 0:
- return b""
- elif size < 0:
- return self._read_all()
- else:
- return self._read_block(size)
+ return self._buffer.read(size)
def read1(self, size=-1):
"""Read up to size uncompressed bytes, while trying to avoid
- making multiple reads from the underlying stream.
+ making multiple reads from the underlying stream. Reads up to a
+ buffer's worth of data if size is negative.
Returns b"" if the file is at EOF.
"""
- # Usually, read1() calls _fp.read() at most once. However, sometimes
- # this does not give enough data for the decompressor to make progress.
- # In this case we make multiple reads, to avoid returning b"".
self._check_can_read()
- if (size == 0 or
- # Only call _fill_buffer() if the buffer is actually empty.
- # This gives a significant speedup if *size* is small.
- (self._buffer_offset == len(self._buffer) and not self._fill_buffer())):
- return b""
- if size > 0:
- data = self._buffer[self._buffer_offset :
- self._buffer_offset + size]
- self._buffer_offset += len(data)
- else:
- data = self._buffer[self._buffer_offset:]
- self._buffer = b""
- self._buffer_offset = 0
- self._pos += len(data)
- return data
+ if size < 0:
+ size = io.DEFAULT_BUFFER_SIZE
+ return self._buffer.read1(size)
def readline(self, size=-1):
"""Read a line of uncompressed bytes from the file.
@@ -345,15 +217,7 @@ class LZMAFile(io.BufferedIOBase):
case the line may be incomplete). Returns b'' if already at EOF.
"""
self._check_can_read()
- # Shortcut for the common case - the whole line is in the buffer.
- if size < 0:
- end = self._buffer.find(b"\n", self._buffer_offset) + 1
- if end > 0:
- line = self._buffer[self._buffer_offset : end]
- self._buffer_offset = end
- self._pos += len(line)
- return line
- return io.BufferedIOBase.readline(self, size)
+ return self._buffer.readline(size)
def write(self, data):
"""Write a bytes object to the file.
@@ -368,16 +232,7 @@ class LZMAFile(io.BufferedIOBase):
self._pos += len(data)
return len(data)
- # Rewind the file to the beginning of the data stream.
- def _rewind(self):
- self._fp.seek(0, 0)
- self._mode = _MODE_READ
- self._pos = 0
- self._decompressor = LZMADecompressor(**self._init_args)
- self._buffer = b""
- self._buffer_offset = 0
-
- def seek(self, offset, whence=0):
+ def seek(self, offset, whence=io.SEEK_SET):
"""Change the file position.
The new position is specified by offset, relative to the
@@ -389,38 +244,17 @@ class LZMAFile(io.BufferedIOBase):
Returns the new file position.
- Note that seeking is emulated, sp depending on the parameters,
+ Note that seeking is emulated, so depending on the parameters,
this operation may be extremely slow.
"""
self._check_can_seek()
-
- # Recalculate offset as an absolute file position.
- if whence == 0:
- pass
- elif whence == 1:
- offset = self._pos + offset
- elif whence == 2:
- # Seeking relative to EOF - we need to know the file's size.
- if self._size < 0:
- self._read_all(return_data=False)
- offset = self._size + offset
- else:
- raise ValueError("Invalid value for whence: {}".format(whence))
-
- # Make it so that offset is the number of bytes to skip forward.
- if offset < self._pos:
- self._rewind()
- else:
- offset -= self._pos
-
- # Read and discard data until we reach the desired position.
- self._read_block(offset, return_data=False)
-
- return self._pos
+ return self._buffer.seek(offset, whence)
def tell(self):
"""Return the current file position."""
self._check_not_closed()
+ if self._mode == _MODE_READ:
+ return self._buffer.tell()
return self._pos