summaryrefslogtreecommitdiff
path: root/lisp/international
diff options
context:
space:
mode:
author Eli Zaretskii <eliz@gnu.org> 2019-05-18 10:15:19 +0300
committer Eli Zaretskii <eliz@gnu.org> 2019-05-18 10:15:19 +0300
commit 6c9a1deb41f0a9d3dd7e8615a0015a74a851b426 (patch)
tree 2c56fa0ace06dc174b55aad93c88a6e9a48e871a /lisp/international
parent d0b72dbba0bef8bc8ec0e1c2d55d179820d5e452 (diff)
download emacs-6c9a1deb41f0a9d3dd7e8615a0015a74a851b426.tar.gz
Fix visiting and saving UTF-16 encoded XML files
* lisp/international/mule.el (sgml-xml-auto-coding-function): When the 'encoding' tag specifies a UTF-16 encoding, enforce saving the buffer with BOM, per the XML spec. (xml-find-file-coding-system): Recognize UTF-16 encodings with BOM. (Bug#35766) (Bug#8282) * lisp/international/mule-cmds.el (select-safe-coding-system): Don't consider UTF-16 encodings with and without BOM as "different", so as not to annoy users with redundant questions about mismatch between the XML/SGML header and the selected explicit encoding.
Diffstat (limited to 'lisp/international')
-rw-r--r-- lisp/international/mule-cmds.el | 8
-rw-r--r-- lisp/international/mule.el | 22
2 files changed, 26 insertions, 4 deletions
diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el
index dfa9e4e6c8c..27296ecfb2c 100644
--- a/lisp/international/mule-cmds.el
+++ b/lisp/international/mule-cmds.el
@@ -1029,7 +1029,13 @@ It is highly recommended to fix it before writing to a file."
;; This check perhaps isn't ideal, but is probably
;; the best thing to do.
(not (auto-coding-alist-lookup (or file buffer-file-name "")))
- (not (coding-system-equal coding-system auto-cs)))
+ (not (coding-system-equal coding-system auto-cs))
+ ;; coding-system-equal barfs on 'charset'.
+ (or (equal (coding-system-type auto-cs) 'charset)
+ (equal (coding-system-type coding-system) 'charset)
+ (not (coding-system-equal (coding-system-type auto-cs)
+ (coding-system-type
+ coding-system)))))
(unless (yes-or-no-p
(format "Selected encoding %s disagrees with \
%s specified by file contents. Really save (else edit coding cookies \
diff --git a/lisp/international/mule.el b/lisp/international/mule.el
index b5414de0dba..21f3118a98e 100644
--- a/lisp/international/mule.el
+++ b/lisp/international/mule.el
@@ -2498,7 +2498,18 @@ This function is intended to be added to `auto-coding-functions'."
(when end
(if (re-search-forward "encoding=[\"']\\(.+?\\)[\"']" end t)
(let* ((match (match-string 1))
- (sym (intern (downcase match))))
+ (sym-name (downcase match))
+ (sym-name
+ ;; https://www.w3.org/TR/xml/#charencoding says:
+ ;; "Entities encoded in UTF-16 MUST [...] begin
+ ;; with the Byte Order Mark." The trick below is
+ ;; based on the fact that utf-16be/le don't
+ ;; specify BOM, while utf-16-be/le do.
+ (cond
+ ((equal sym-name "utf-16le") "utf-16-le")
+ ((equal sym-name "utf-16be") "utf-16-be")
+ (t sym-name)))
+ (sym (intern sym-name)))
(if (coding-system-p sym)
;; If the encoding tag is UTF-8 and the buffer's
;; encoding is one of the variants of UTF-8, use the
@@ -2587,9 +2598,14 @@ added by processing software."
(let ((detected
(with-coding-priority '(utf-8)
(coding-system-base
- (detect-coding-region (point-min) (point-max) t)))))
- ;; Pure ASCII always comes back as undecided.
+ (detect-coding-region (point-min) (point-max) t))))
+ (bom (list (char-after 1) (char-after 2))))
(cond
+ ((equal bom '(#xFE #xFF))
+ 'utf-16be-with-signature)
+ ((equal bom '(#xFF #xFE))
+ 'utf-16le-with-signature)
+ ;; Pure ASCII always comes back as undecided.
((memq detected '(utf-8 undecided))
'utf-8)
((eq detected 'utf-16le-with-signature) 'utf-16le-with-signature)