summary | refs | log | tree | commit | diff
path: root/Modules/_bz2module.c
diff options
context:
space:
mode:
author: Antoine Pitrou <solipsis@pitrou.net> 2015-02-26 13:08:07 +0100
committer: Antoine Pitrou <solipsis@pitrou.net> 2015-02-26 13:08:07 +0100
commit: e71258a0e67cf744f5f2c0bca15f1d66871ce050 (patch)
tree: df8d2e81cd048157d5f8a8fe7c7f32e268352c9f /Modules/_bz2module.c
parent: 87f50158ee72bb2ff29c5f44f0b0efbb83845d46 (diff)
download: cpython-git-e71258a0e67cf744f5f2c0bca15f1d66871ce050.tar.gz
Issue #15955: Add an option to limit the output size in bz2.decompress().
Patch by Nikolaus Rath.
Diffstat (limited to 'Modules/_bz2module.c')
-rw-r--r-- Modules/_bz2module.c 259
1 files changed, 201 insertions, 58 deletions
diff --git a/Modules/_bz2module.c b/Modules/_bz2module.c
index 4f2afda097..f284cd611d 100644
--- a/Modules/_bz2module.c
+++ b/Modules/_bz2module.c
@@ -51,6 +51,14 @@ typedef struct {
bz_stream bzs;
char eof; /* T_BOOL expects a char */
PyObject *unused_data;
+ char needs_input;
+ char *input_buffer;
+ size_t input_buffer_size;
+
+ /* bzs->avail_in is only 32 bit, so we store the true length
+ separately. Conversion and looping is encapsulated in
+ decompress_buf() */
+ size_t bzs_avail_in_real;
#ifdef WITH_THREAD
PyThread_type_lock lock;
#endif
@@ -111,19 +119,23 @@ catch_bz2_error(int bzerror)
}
#if BUFSIZ < 8192
-#define SMALLCHUNK 8192
+#define INITIAL_BUFFER_SIZE 8192
#else
-#define SMALLCHUNK BUFSIZ
+#define INITIAL_BUFFER_SIZE BUFSIZ
#endif
static int
-grow_buffer(PyObject **buf)
+grow_buffer(PyObject **buf, Py_ssize_t max_length)
{
/* Expand the buffer by an amount proportional to the current size,
giving us amortized linear-time behavior. Use a less-than-double
growth factor to avoid excessive allocation. */
size_t size = PyBytes_GET_SIZE(*buf);
size_t new_size = size + (size >> 3) + 6;
+
+ if (max_length > 0 && new_size > (size_t) max_length)
+ new_size = (size_t) max_length;
+
if (new_size > size) {
return _PyBytes_Resize(buf, new_size);
} else { /* overflow */
@@ -142,14 +154,14 @@ compress(BZ2Compressor *c, char *data, size_t len, int action)
size_t data_size = 0;
PyObject *result;
- result = PyBytes_FromStringAndSize(NULL, SMALLCHUNK);
+ result = PyBytes_FromStringAndSize(NULL, INITIAL_BUFFER_SIZE);
if (result == NULL)
return NULL;
c->bzs.next_in = data;
c->bzs.avail_in = 0;
c->bzs.next_out = PyBytes_AS_STRING(result);
- c->bzs.avail_out = SMALLCHUNK;
+ c->bzs.avail_out = INITIAL_BUFFER_SIZE;
for (;;) {
char *this_out;
int bzerror;
@@ -168,7 +180,7 @@ compress(BZ2Compressor *c, char *data, size_t len, int action)
if (c->bzs.avail_out == 0) {
size_t buffer_left = PyBytes_GET_SIZE(result) - data_size;
if (buffer_left == 0) {
- if (grow_buffer(&result) < 0)
+ if (grow_buffer(&result, -1) < 0)
goto error;
c->bzs.next_out = PyBytes_AS_STRING(result) + data_size;
buffer_left = PyBytes_GET_SIZE(result) - data_size;
@@ -402,64 +414,176 @@ static PyTypeObject BZ2Compressor_Type = {
/* BZ2Decompressor class. */
-static PyObject *
-decompress(BZ2Decompressor *d, char *data, size_t len)
+/* Decompress data of length d->bzs_avail_in_real in d->bzs.next_in. The output
+ buffer is allocated dynamically and returned. At most max_length bytes are
+ returned, so some of the input may not be consumed. d->bzs.next_in and
+ d->bzs_avail_in_real are updated to reflect the consumed input. */
+static PyObject*
+decompress_buf(BZ2Decompressor *d, Py_ssize_t max_length)
{
- size_t data_size = 0;
+ /* data_size is strictly positive, but because we repeatedly have to
+ compare against max_length and PyBytes_GET_SIZE we declare it as
+ signed */
+ Py_ssize_t data_size = 0;
PyObject *result;
+ bz_stream *bzs = &d->bzs;
- result = PyBytes_FromStringAndSize(NULL, SMALLCHUNK);
+ if (max_length < 0 || max_length >= INITIAL_BUFFER_SIZE)
+ result = PyBytes_FromStringAndSize(NULL, INITIAL_BUFFER_SIZE);
+ else
+ result = PyBytes_FromStringAndSize(NULL, max_length);
if (result == NULL)
- return result;
- d->bzs.next_in = data;
- /* On a 64-bit system, len might not fit in avail_in (an unsigned int).
- Do decompression in chunks of no more than UINT_MAX bytes each. */
- d->bzs.avail_in = (unsigned int)Py_MIN(len, UINT_MAX);
- len -= d->bzs.avail_in;
- d->bzs.next_out = PyBytes_AS_STRING(result);
- d->bzs.avail_out = SMALLCHUNK;
+ return NULL;
+
+ bzs->next_out = PyBytes_AS_STRING(result);
for (;;) {
- char *this_out;
- int bzerror;
+ int bzret;
+ size_t avail;
+
+ /* On a 64-bit system, buffer length might not fit in avail_out, so we
+ do decompression in chunks of no more than UINT_MAX bytes
+ each. Note that the expression for `avail` is guaranteed to be
+ positive, so the cast is safe. */
+ avail = (size_t) (PyBytes_GET_SIZE(result) - data_size);
+ bzs->avail_out = (unsigned int)Py_MIN(avail, UINT_MAX);
+ bzs->avail_in = (unsigned int)Py_MIN(d->bzs_avail_in_real, UINT_MAX);
+ d->bzs_avail_in_real -= bzs->avail_in;
Py_BEGIN_ALLOW_THREADS
- this_out = d->bzs.next_out;
- bzerror = BZ2_bzDecompress(&d->bzs);
- data_size += d->bzs.next_out - this_out;
+ bzret = BZ2_bzDecompress(bzs);
+ data_size = bzs->next_out - PyBytes_AS_STRING(result);
+ d->bzs_avail_in_real += bzs->avail_in;
Py_END_ALLOW_THREADS
- if (catch_bz2_error(bzerror))
+ if (catch_bz2_error(bzret))
goto error;
- if (bzerror == BZ_STREAM_END) {
+ if (bzret == BZ_STREAM_END) {
d->eof = 1;
- len += d->bzs.avail_in;
- if (len > 0) { /* Save leftover input to unused_data */
- Py_CLEAR(d->unused_data);
- d->unused_data = PyBytes_FromStringAndSize(d->bzs.next_in, len);
- if (d->unused_data == NULL)
- goto error;
- }
break;
- }
- if (d->bzs.avail_in == 0) {
- if (len == 0)
+ } else if (d->bzs_avail_in_real == 0) {
+ break;
+ } else if (bzs->avail_out == 0) {
+ if (data_size == max_length)
break;
- d->bzs.avail_in = (unsigned int)Py_MIN(len, UINT_MAX);
- len -= d->bzs.avail_in;
+ if (data_size == PyBytes_GET_SIZE(result) &&
+ grow_buffer(&result, max_length) == -1)
+ goto error;
+ bzs->next_out = PyBytes_AS_STRING(result) + data_size;
}
- if (d->bzs.avail_out == 0) {
- size_t buffer_left = PyBytes_GET_SIZE(result) - data_size;
- if (buffer_left == 0) {
- if (grow_buffer(&result) < 0)
+ }
+ if (data_size != PyBytes_GET_SIZE(result))
+ if (_PyBytes_Resize(&result, data_size) == -1)
+ goto error;
+
+ return result;
+
+error:
+ Py_XDECREF(result);
+ return NULL;
+}
+
+
+static PyObject *
+decompress(BZ2Decompressor *d, char *data, size_t len, Py_ssize_t max_length)
+{
+ char input_buffer_in_use;
+ PyObject *result;
+ bz_stream *bzs = &d->bzs;
+
+ /* Prepend unconsumed input if necessary */
+ if (bzs->next_in != NULL) {
+ size_t avail_now, avail_total;
+
+ /* Number of bytes we can append to input buffer */
+ avail_now = (d->input_buffer + d->input_buffer_size)
+ - (bzs->next_in + d->bzs_avail_in_real);
+
+ /* Number of bytes we can append if we move existing
+ contents to beginning of buffer (overwriting
+ consumed input) */
+ avail_total = d->input_buffer_size - d->bzs_avail_in_real;
+
+ if (avail_total < len) {
+ size_t offset = bzs->next_in - d->input_buffer;
+ char *tmp;
+ size_t new_size = d->input_buffer_size + len - avail_now;
+
+ /* Assign to temporary variable first, so we don't
+ lose address of allocated buffer if realloc fails */
+ tmp = PyMem_Realloc(d->input_buffer, new_size);
+ if (tmp == NULL) {
+ PyErr_SetNone(PyExc_MemoryError);
+ return NULL;
+ }
+ d->input_buffer = tmp;
+ d->input_buffer_size = new_size;
+
+ bzs->next_in = d->input_buffer + offset;
+ }
+ else if (avail_now < len) {
+ memmove(d->input_buffer, bzs->next_in,
+ d->bzs_avail_in_real);
+ bzs->next_in = d->input_buffer;
+ }
+ memcpy((void*)(bzs->next_in + d->bzs_avail_in_real), data, len);
+ d->bzs_avail_in_real += len;
+ input_buffer_in_use = 1;
+ }
+ else {
+ bzs->next_in = data;
+ d->bzs_avail_in_real = len;
+ input_buffer_in_use = 0;
+ }
+
+ result = decompress_buf(d, max_length);
+ if(result == NULL)
+ return NULL;
+
+ if (d->eof) {
+ d->needs_input = 0;
+ if (d->bzs_avail_in_real > 0) {
+ Py_CLEAR(d->unused_data);
+ d->unused_data = PyBytes_FromStringAndSize(
+ bzs->next_in, d->bzs_avail_in_real);
+ if (d->unused_data == NULL)
+ goto error;
+ }
+ }
+ else if (d->bzs_avail_in_real == 0) {
+ bzs->next_in = NULL;
+ d->needs_input = 1;
+ }
+ else {
+ d->needs_input = 0;
+
+ /* If we did not use the input buffer, we now have
+ to copy the tail from the caller's buffer into the
+ input buffer */
+ if (!input_buffer_in_use) {
+
+ /* Discard buffer if it's too small
+ (resizing it may needlessly copy the current contents) */
+ if (d->input_buffer != NULL &&
+ d->input_buffer_size < d->bzs_avail_in_real) {
+ PyMem_Free(d->input_buffer);
+ d->input_buffer = NULL;
+ }
+
+ /* Allocate if necessary */
+ if (d->input_buffer == NULL) {
+ d->input_buffer = PyMem_Malloc(d->bzs_avail_in_real);
+ if (d->input_buffer == NULL) {
+ PyErr_SetNone(PyExc_MemoryError);
goto error;
- d->bzs.next_out = PyBytes_AS_STRING(result) + data_size;
- buffer_left = PyBytes_GET_SIZE(result) - data_size;
+ }
+ d->input_buffer_size = d->bzs_avail_in_real;
}
- d->bzs.avail_out = (unsigned int)Py_MIN(buffer_left, UINT_MAX);
+
+ /* Copy tail */
+ memcpy(d->input_buffer, bzs->next_in, d->bzs_avail_in_real);
+ bzs->next_in = d->input_buffer;
}
}
- if (data_size != (size_t)PyBytes_GET_SIZE(result))
- if (_PyBytes_Resize(&result, data_size) < 0)
- goto error;
+
return result;
error:
@@ -470,21 +594,29 @@ error:
/*[clinic input]
_bz2.BZ2Decompressor.decompress
+ self: self(type="BZ2Decompressor *")
data: Py_buffer
- /
+ max_length: Py_ssize_t=-1
-Provide data to the decompressor object.
+Decompress *data*, returning uncompressed data as bytes.
-Returns a chunk of decompressed data if possible, or b'' otherwise.
+If *max_length* is nonnegative, returns at most *max_length* bytes of
+decompressed data. If this limit is reached and further output can be
+produced, *self.needs_input* will be set to ``False``. In this case, the next
+call to *decompress()* may provide *data* as b'' to obtain more of the output.
-Attempting to decompress data after the end of stream is reached
-raises an EOFError. Any data found after the end of the stream
-is ignored and saved in the unused_data attribute.
+If all of the input data was decompressed and returned (either because this
+was less than *max_length* bytes, or because *max_length* was negative),
+*self.needs_input* will be set to True.
+
+Attempting to decompress data after the end of stream is reached raises an
+EOFError. Any data found after the end of the stream is ignored and saved in
+the unused_data attribute.
[clinic start generated code]*/
static PyObject *
-_bz2_BZ2Decompressor_decompress_impl(BZ2Decompressor *self, Py_buffer *data)
-/*[clinic end generated code: output=086e4b99e60cb3f6 input=616c2a6db5269961]*/
+_bz2_BZ2Decompressor_decompress_impl(BZ2Decompressor *self, Py_buffer *data, Py_ssize_t max_length)
+/*[clinic end generated code: output=7eeb5794035a2ca3 input=9558b424c8b00516]*/
{
PyObject *result = NULL;
@@ -492,7 +624,7 @@ _bz2_BZ2Decompressor_decompress_impl(BZ2Decompressor *self, Py_buffer *data)
if (self->eof)
PyErr_SetString(PyExc_EOFError, "End of stream already reached");
else
- result = decompress(self, data->buf, data->len);
+ result = decompress(self, data->buf, data->len, max_length);
RELEASE_LOCK(self);
return result;
}
@@ -527,10 +659,14 @@ _bz2_BZ2Decompressor___init___impl(BZ2Decompressor *self)
}
#endif
- self->unused_data = PyBytes_FromStringAndSize("", 0);
+ self->needs_input = 1;
+ self->bzs_avail_in_real = 0;
+ self->input_buffer = NULL;
+ self->input_buffer_size = 0;
+ self->unused_data = PyBytes_FromStringAndSize(NULL, 0);
if (self->unused_data == NULL)
goto error;
-
+
bzerror = BZ2_bzDecompressInit(&self->bzs, 0, 0);
if (catch_bz2_error(bzerror))
goto error;
@@ -549,6 +685,8 @@ error:
static void
BZ2Decompressor_dealloc(BZ2Decompressor *self)
{
+ if(self->input_buffer != NULL)
+ PyMem_Free(self->input_buffer);
BZ2_bzDecompressEnd(&self->bzs);
Py_CLEAR(self->unused_data);
#ifdef WITH_THREAD
@@ -570,11 +708,16 @@ PyDoc_STRVAR(BZ2Decompressor_eof__doc__,
PyDoc_STRVAR(BZ2Decompressor_unused_data__doc__,
"Data found after the end of the compressed stream.");
+PyDoc_STRVAR(BZ2Decompressor_needs_input_doc,
+"True if more input is needed before more decompressed data can be produced.");
+
static PyMemberDef BZ2Decompressor_members[] = {
{"eof", T_BOOL, offsetof(BZ2Decompressor, eof),
READONLY, BZ2Decompressor_eof__doc__},
{"unused_data", T_OBJECT_EX, offsetof(BZ2Decompressor, unused_data),
READONLY, BZ2Decompressor_unused_data__doc__},
+ {"needs_input", T_BOOL, offsetof(BZ2Decompressor, needs_input), READONLY,
+ BZ2Decompressor_needs_input_doc},
{NULL}
};