summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nick Wellnhofer <wellnhofer@aevum.de> 2023-04-13 15:11:47 +0200
committer: Nick Wellnhofer <wellnhofer@aevum.de> 2023-04-13 15:20:56 +0200
commit: a19fa11e1d6c2b824f873b5be3786fc92380dd8f (patch)
tree: fd51f93743b3ad1d368527a6b12a3e6f9caf8771
parent: b4d46cee80b577f74e218158d03db0c9f202ec06 (diff)
download: libxml2-a19fa11e1d6c2b824f873b5be3786fc92380dd8f.tar.gz
parser: Fix regression when switching input encodings
Revert some changes from commit 98840d40. WebKit/Chromium can actually switch from ISO-8859-1 to UTF-16 in the middle of parsing. This is a bad idea, but we have to keep supporting this use case.
-rw-r--r-- parserInternals.c 16
1 files changed, 12 insertions, 4 deletions
diff --git a/parserInternals.c b/parserInternals.c
index 084de4a0..b8f6648b 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1177,12 +1177,20 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
}
if (in->encoder != NULL) {
+ if (in->encoder == handler)
+ return (0);
+
/*
- * TODO: Detect encoding mismatch. We should start by comparing
- * in->encoder->name and handler->name, but there are a few
- * compatible encodings like UTF-16 and UCS-2 or UTF-32 and UCS-4.
+ * Switching encodings during parsing is a really bad idea,
+ * but WebKit/Chromium switches from ISO-8859-1 to UTF-16 as soon as
+ * it finds Unicode characters with code points larger than 255.
+ *
+ * TODO: We should check whether the "raw" input buffer is empty and
+ * convert the old content using the old encoder.
*/
- xmlCharEncCloseFunc(handler);
+
+ xmlCharEncCloseFunc(in->encoder);
+ in->encoder = handler;
return (0);
}