From f0a1733f52a7241f51cc519593233e8be6aeaa0e Mon Sep 17 00:00:00 2001
From: pgjones <philip.graham.jones@googlemail.com>
Date: Mon, 1 May 2023 13:53:20 +0100
Subject: Fix the parsing of large multipart bodies

There were two issues to fix. Firstly if a boundary couldn't be found
the parser should have parsed up to the end of the buffer or last
newline (whichever is earlier). However the last newline would be the
first character since 082e0e5b9c01fa3178ac0153413f082616f10914 as the
DATA_START state would have a buffer that starts with newline. This
was fixed by changing the last_newline method to take the data to
search as an argument.

Secondly the parsing was slow as the shortcut search for the boundary
was removed resulting in full regex matches on each
iteration. Restoring the shortcut restores the previous performance.
---
 CHANGES.rst                      |  2 ++
 src/werkzeug/sansio/multipart.py | 38 +++++++++++++++++++++++---------------
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 075ca2bc..091aa553 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -5,6 +5,8 @@ Version 2.3.3
 
 Unreleased
 
+-   Fix parsing of large multipart bodies. Remove invalid leading newline, and restore
+    parsing speed. :issue:`2658, 2675`
 -   The cookie ``Path`` attribute is set to ``/`` by default again, to prevent clients
     from falling back to RFC 6265's ``default-path`` behavior. :issue:`2672, 2679`
 
diff --git a/src/werkzeug/sansio/multipart.py b/src/werkzeug/sansio/multipart.py
index ae633b81..11e65ed0 100644
--- a/src/werkzeug/sansio/multipart.py
+++ b/src/werkzeug/sansio/multipart.py
@@ -121,15 +121,15 @@ class MultipartDecoder:
         self._search_position = 0
         self._parts_decoded = 0
 
-    def last_newline(self) -> int:
+    def last_newline(self, data: bytes) -> int:
         try:
-            last_nl = self.buffer.rindex(b"\n")
+            last_nl = data.rindex(b"\n")
         except ValueError:
-            last_nl = len(self.buffer)
+            last_nl = len(data)
         try:
-            last_cr = self.buffer.rindex(b"\r")
+            last_cr = data.rindex(b"\r")
         except ValueError:
-            last_cr = len(self.buffer)
+            last_cr = len(data)
 
         return min(last_nl, last_cr)
 
@@ -251,17 +251,25 @@ class MultipartDecoder:
         else:
             data_start = 0
 
-        match = self.boundary_re.search(data)
-        if match is not None:
-            if match.group(1).startswith(b"--"):
-                self.state = State.EPILOGUE
-            else:
-                self.state = State.PART
-            data_end = match.start()
-            del_index = match.end()
+        if self.buffer.find(b"--" + self.boundary) == -1:
+            # No complete boundary in the buffer, but there may be
+            # a partial boundary at the end. As the boundary
+            # starts with either a nl or cr find the earliest and
+            # return up to that as data.
+            data_end = del_index = self.last_newline(data[data_start:])
+            more_data = True
         else:
-            data_end = del_index = self.last_newline()
-        more_data = match is None
+            match = self.boundary_re.search(data)
+            if match is not None:
+                if match.group(1).startswith(b"--"):
+                    self.state = State.EPILOGUE
+                else:
+                    self.state = State.PART
+                data_end = match.start()
+                del_index = match.end()
+            else:
+                data_end = del_index = self.last_newline(data[data_start:])
+            more_data = match is None
 
         return bytes(data[data_start:data_end]), del_index, more_data
 
-- 
cgit v1.2.1