diff options
author | pgjones <philip.graham.jones@googlemail.com> | 2023-05-01 13:53:20 +0100 |
---|---|---|
committer | David Lord <davidism@gmail.com> | 2023-05-01 07:16:23 -0700 |
commit | f0a1733f52a7241f51cc519593233e8be6aeaa0e (patch) | |
tree | 185ce78f2cbdbcaa25733b7e17a1df39f9969372 | |
parent | 53f571cf3a6aafc1f46a18d7f893652997dc9cb9 (diff) | |
download | werkzeug-f0a1733f52a7241f51cc519593233e8be6aeaa0e.tar.gz |
Fix the parsing of large multipart bodies
There were two issues to fix. Firstly, if a boundary couldn't be found,
the parser should have parsed up to the end of the buffer or the last
newline (whichever is earlier). However, since commit
082e0e5b9c01fa3178ac0153413f082616f10914 the last newline would be the
first character, as the DATA_START state would have a buffer that starts
with a newline. This was fixed by changing the last_newline method to
take the data to search as an argument.
Secondly, the parsing was slow because the shortcut search for the
boundary had been removed, resulting in full regex matches on each
iteration. Restoring the shortcut restores the previous performance.
-rw-r--r-- | CHANGES.rst | 2 | ||||
-rw-r--r-- | src/werkzeug/sansio/multipart.py | 38 |
2 files changed, 25 insertions, 15 deletions
diff --git a/CHANGES.rst b/CHANGES.rst index 075ca2bc..091aa553 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,6 +5,8 @@ Version 2.3.3 Unreleased +- Fix parsing of large multipart bodies. Remove invalid leading newline, and restore + parsing speed. :issue:`2658, 2675` - The cookie ``Path`` attribute is set to ``/`` by default again, to prevent clients from falling back to RFC 6265's ``default-path`` behavior. :issue:`2672, 2679` diff --git a/src/werkzeug/sansio/multipart.py b/src/werkzeug/sansio/multipart.py index ae633b81..11e65ed0 100644 --- a/src/werkzeug/sansio/multipart.py +++ b/src/werkzeug/sansio/multipart.py @@ -121,15 +121,15 @@ class MultipartDecoder: self._search_position = 0 self._parts_decoded = 0 - def last_newline(self) -> int: + def last_newline(self, data: bytes) -> int: try: - last_nl = self.buffer.rindex(b"\n") + last_nl = data.rindex(b"\n") except ValueError: - last_nl = len(self.buffer) + last_nl = len(data) try: - last_cr = self.buffer.rindex(b"\r") + last_cr = data.rindex(b"\r") except ValueError: - last_cr = len(self.buffer) + last_cr = len(data) return min(last_nl, last_cr) @@ -251,17 +251,25 @@ class MultipartDecoder: else: data_start = 0 - match = self.boundary_re.search(data) - if match is not None: - if match.group(1).startswith(b"--"): - self.state = State.EPILOGUE - else: - self.state = State.PART - data_end = match.start() - del_index = match.end() + if self.buffer.find(b"--" + self.boundary) == -1: + # No complete boundary in the buffer, but there may be + # a partial boundary at the end. As the boundary + # starts with either a nl or cr find the earliest and + # return up to that as data. 
+ data_end = del_index = self.last_newline(data[data_start:]) + more_data = True else: - data_end = del_index = self.last_newline() - more_data = match is None + match = self.boundary_re.search(data) + if match is not None: + if match.group(1).startswith(b"--"): + self.state = State.EPILOGUE + else: + self.state = State.PART + data_end = match.start() + del_index = match.end() + else: + data_end = del_index = self.last_newline(data[data_start:]) + more_data = match is None return bytes(data[data_start:data_end]), del_index, more_data |