diff options
author | Sebastian Berg <sebastianb@nvidia.com> | 2022-10-04 13:36:08 +0200 |
---|---|---|
committer | Sebastian Berg <sebastianb@nvidia.com> | 2022-10-06 17:16:00 +0200 |
commit | 81bc4565b50c6cebb21c95c685285e32e1fb9b65 (patch) | |
tree | c790e56bb10ffc9a9bd98112182b9ce9410ba683 /numpy/lib/format.py | |
parent | f062589346b52406144ad2d73b7bc969974cba90 (diff) | |
download | numpy-81bc4565b50c6cebb21c95c685285e32e1fb9b65.tar.gz |
MAINT: Ensure graceful handling of large header sizes
This ensures graceful handling of files with very large headers.
Unfortunately, it may be a bit inconvenient for users, hence the new
`max_header_size` kwarg and the work-around of also accepting `allow_pickle=True`.
See also the documentation here:
https://docs.python.org/3.10/library/ast.html#ast.literal_eval
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r-- | numpy/lib/format.py | 59 |
1 files changed, 50 insertions, 9 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py index 625768b62..19fec48ed 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -186,6 +186,10 @@ _header_size_info = { (3, 0): ('<I', 'utf8'), } +# Python's literal_eval is not actually safe for large inputs, since parsing +# may become slow or even cause interpreter crashes. +# This is an arbitrary, low limit which should make it safe in practice. +_MAX_HEADER_SIZE = 10000 def _check_version(version): if version not in [(1, 0), (2, 0), (3, 0), None]: @@ -465,7 +469,7 @@ def write_array_header_2_0(fp, d): """ _write_array_header(fp, d, (2, 0)) -def read_array_header_1_0(fp): +def read_array_header_1_0(fp, max_header_size=_MAX_HEADER_SIZE): """ Read an array header from a filelike object using the 1.0 file format version. @@ -487,6 +491,10 @@ def read_array_header_1_0(fp): contiguous before writing it out. dtype : dtype The dtype of the file's data. + max_header_size : int, optional + Maximum allowed size of the header. Large headers may not be safe + to load securely and thus require explicitly passing a larger value. + See :py:meth:`ast.literal_eval()` for details. Raises ------ @@ -494,9 +502,10 @@ def read_array_header_1_0(fp): If the data is invalid. """ - return _read_array_header(fp, version=(1, 0)) + return _read_array_header( + fp, version=(1, 0), max_header_size=max_header_size) -def read_array_header_2_0(fp): +def read_array_header_2_0(fp, max_header_size=_MAX_HEADER_SIZE): """ Read an array header from a filelike object using the 2.0 file format version. @@ -509,6 +518,10 @@ def read_array_header_2_0(fp): ---------- fp : filelike object A file object or something with a `.read()` method like a file. + max_header_size : int, optional + Maximum allowed size of the header. Large headers may not be safe + to load securely and thus require explicitly passing a larger value. + See :py:meth:`ast.literal_eval()` for details. 
Returns ------- @@ -527,7 +540,8 @@ def read_array_header_2_0(fp): If the data is invalid. """ - return _read_array_header(fp, version=(2, 0)) + return _read_array_header( + fp, version=(2, 0), max_header_size=max_header_size) def _filter_header(s): @@ -565,7 +579,7 @@ def _filter_header(s): return tokenize.untokenize(tokens) -def _read_array_header(fp, version): +def _read_array_header(fp, version, max_header_size=_MAX_HEADER_SIZE): """ see read_array_header_1_0 """ @@ -581,6 +595,14 @@ def _read_array_header(fp, version): header_length = struct.unpack(hlength_type, hlength_str)[0] header = _read_bytes(fp, header_length, "array header") header = header.decode(encoding) + if len(header) > max_header_size: + raise ValueError( + f"Header info length ({len(header)}) is large and may not be safe " + "to load securely.\n" + "To allow loading, adjust `max_header_size` or fully trust " + "the `.npy` file using `allow_pickle=True`.\n" + "For safety against large resource use or crashes, sandboxing " + "may be necessary.") # The header is a pretty-printed string representation of a literal # Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte @@ -694,7 +716,8 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None): fp.write(chunk.tobytes('C')) -def read_array(fp, allow_pickle=False, pickle_kwargs=None): +def read_array(fp, allow_pickle=False, pickle_kwargs=None, *, + max_header_size=_MAX_HEADER_SIZE): """ Read an array from an NPY file. @@ -713,6 +736,12 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None): Additional keyword arguments to pass to pickle.load. These are only useful when loading object arrays saved on Python 2 when using Python 3. + max_header_size : int, optional + Maximum allowed size of the header. Large headers may not be safe + to load securely and thus require explicitly passing a larger value. + See :py:meth:`ast.literal_eval()` for details. + This option is ignored when `allow_pickle` is passed. 
In that case + the file is by definition trusted and the limit is unnecessary. Returns ------- @@ -726,9 +755,15 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None): an object array. """ + if allow_pickle: + # Effectively ignore max_header_size, since `allow_pickle` indicates + # that the input is fully trusted. + max_header_size = 2**64 + version = read_magic(fp) _check_version(version) - shape, fortran_order, dtype = _read_array_header(fp, version) + shape, fortran_order, dtype = _read_array_header( + fp, version, max_header_size=max_header_size) if len(shape) == 0: count = 1 else: @@ -788,7 +823,8 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None): def open_memmap(filename, mode='r+', dtype=None, shape=None, - fortran_order=False, version=None): + fortran_order=False, version=None, *, + max_header_size=_MAX_HEADER_SIZE): """ Open a .npy file as a memory-mapped array. @@ -819,6 +855,10 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, If the mode is a "write" mode, then this is the version of the file format used to create the file. None means use the oldest supported version that is able to store the data. Default: None + max_header_size : int, optional + Maximum allowed size of the header. Large headers may not be safe + to load securely and thus require explicitly passing a larger value. + See :py:meth:`ast.literal_eval()` for details. Returns ------- @@ -866,7 +906,8 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, version = read_magic(fp) _check_version(version) - shape, fortran_order, dtype = _read_array_header(fp, version) + shape, fortran_order, dtype = _read_array_header( + fp, version, max_header_size=max_header_size) if dtype.hasobject: msg = "Array can't be memory-mapped: Python objects in dtype." raise ValueError(msg) |