From f0a1733f52a7241f51cc519593233e8be6aeaa0e Mon Sep 17 00:00:00 2001 From: pgjones Date: Mon, 1 May 2023 13:53:20 +0100 Subject: Fix the parsing of large multipart bodies There were two issues to fix. Firstly if a boundary couldn't be found the parser should have parsed up to the end of the buffer or last newline (whichever is earlier). However the last newline would be the first character since 082e0e5b9c01fa3178ac0153413f082616f10914 as the DATA_START state would have a buffer that starts with newline. This was fixed by changing the last_newline method to take the data to search as an argument. Secondly the parsing was slow as the shortcut search for the boundary was removed resulting in full regex matches on each iteration. Restoring the shortcut restores the previous performance. --- CHANGES.rst | 2 ++ src/werkzeug/sansio/multipart.py | 38 +++++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 075ca2bc..091aa553 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,6 +5,8 @@ Version 2.3.3 Unreleased +- Fix parsing of large multipart bodies. Remove invalid leading newline, and restore + parsing speed. :issue:`2658, 2675` - The cookie ``Path`` attribute is set to ``/`` by default again, to prevent clients from falling back to RFC 6265's ``default-path`` behavior. :issue:`2672, 2679` diff --git a/src/werkzeug/sansio/multipart.py b/src/werkzeug/sansio/multipart.py index ae633b81..11e65ed0 100644 --- a/src/werkzeug/sansio/multipart.py +++ b/src/werkzeug/sansio/multipart.py @@ -121,15 +121,15 @@ class MultipartDecoder: self._search_position = 0 self._parts_decoded = 0 - def last_newline(self) -> int: + def last_newline(self, data: bytes) -> int: try: - last_nl = self.buffer.rindex(b"\n") + last_nl = data.rindex(b"\n") except ValueError: - last_nl = len(self.buffer) + last_nl = len(data) try: - last_cr = self.buffer.rindex(b"\r") + last_cr = data.rindex(b"\r") except ValueError: - last_cr = len(self.buffer) + last_cr = len(data) return min(last_nl, last_cr) @@ -251,17 +251,25 @@ class MultipartDecoder: else: data_start = 0 - match = self.boundary_re.search(data) - if match is not None: - if match.group(1).startswith(b"--"): - self.state = State.EPILOGUE - else: - self.state = State.PART - data_end = match.start() - del_index = match.end() + if self.buffer.find(b"--" + self.boundary) == -1: + # No complete boundary in the buffer, but there may be + # a partial boundary at the end. As the boundary + # starts with either a nl or cr find the earliest and + # return up to that as data. + data_end = del_index = self.last_newline(data[data_start:]) + more_data = True else: - data_end = del_index = self.last_newline() - more_data = match is None + match = self.boundary_re.search(data) + if match is not None: + if match.group(1).startswith(b"--"): + self.state = State.EPILOGUE + else: + self.state = State.PART + data_end = match.start() + del_index = match.end() + else: + data_end = del_index = self.last_newline(data[data_start:]) + more_data = match is None return bytes(data[data_start:data_end]), del_index, more_data -- cgit v1.2.1