summaryrefslogtreecommitdiff
path: root/lisp/xml.el
diff options
context:
space:
mode:
authorPhilipp Stephani <phst@google.com>2020-05-23 13:56:09 +0200
committerPhilipp Stephani <phst@google.com>2020-05-23 14:01:17 +0200
commitf8581bcf6a1942ebd331cae20e32945a3a86a3d1 (patch)
tree3f1a946cba9cdac5b03f09ec1b01be962256295f /lisp/xml.el
parent232bb691c1095574b85b358c7f33a46d2ea79f29 (diff)
downloademacs-f8581bcf6a1942ebd331cae20e32945a3a86a3d1.tar.gz
Reject invalid characters in XML strings (Bug#41094).
* lisp/xml.el (xml-escape-string): Search for invalid characters. (xml-invalid-character): New error symbol. * test/lisp/xml-tests.el (xml-print-invalid-cdata): New unit test. * etc/NEWS: Document new behavior.
Diffstat (limited to 'lisp/xml.el')
-rw-r--r--lisp/xml.el13
1 files changed, 12 insertions, 1 deletions
diff --git a/lisp/xml.el b/lisp/xml.el
index dc774a202cf..767cf042846 100644
--- a/lisp/xml.el
+++ b/lisp/xml.el
@@ -1023,9 +1023,17 @@ entity references (e.g., replace each & with &amp;).
XML character data must not contain & or < characters, nor the >
character under some circumstances. The XML spec does not impose
restriction on \" or \\=', but we just substitute for these too
-\(as is permitted by the spec)."
+\(as is permitted by the spec).
+
+If STRING contains characters that are invalid in XML (as defined
+by https://www.w3.org/TR/xml/#charsets), signal an error of type
+`xml-invalid-character'."
(with-temp-buffer
(insert string)
+ (goto-char (point-min))
+ (when (re-search-forward ; BOUND=nil, NOERROR=t: a string with no invalid character must yield nil here, not a `search-failed' error
+ "[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]" nil t)
+ (signal 'xml-invalid-character (list (char-before) (match-beginning 0))))
(dolist (substitution '(("&" . "&amp;")
("<" . "&lt;")
(">" . "&gt;")
@@ -1036,6 +1044,9 @@ restriction on \" or \\=', but we just substitute for these too
(replace-match (cdr substitution) t t nil)))
(buffer-string)))
+(define-error 'xml-invalid-character "Invalid XML character" ; condition signaled when a string contains a character outside the XML Char ranges
+ 'wrong-type-argument) ; parent condition, so handlers for `wrong-type-argument' also catch it
+
(defun xml-debug-print-internal (xml indent-string)
"Outputs the XML tree in the current buffer.
The first line is indented with INDENT-STRING."