diff options
| author | Eli Zaretskii <eliz@gnu.org> | 2019-05-18 10:15:19 +0300 | 
|---|---|---|
| committer | Eli Zaretskii <eliz@gnu.org> | 2019-05-18 10:15:19 +0300 | 
| commit | 6c9a1deb41f0a9d3dd7e8615a0015a74a851b426 (patch) | |
| tree | 2c56fa0ace06dc174b55aad93c88a6e9a48e871a /lisp | |
| parent | d0b72dbba0bef8bc8ec0e1c2d55d179820d5e452 (diff) | |
| download | emacs-6c9a1deb41f0a9d3dd7e8615a0015a74a851b426.tar.gz | |
Fix visiting and saving UTF-16 encoded XML files
* lisp/international/mule.el (sgml-xml-auto-coding-function):
When the 'encoding' tag specifies a UTF-16 encoding, enforce
saving the buffer with BOM, per the XML spec.
(xml-find-file-coding-system): Recognize UTF-16 encodings with
BOM.  (Bug#35766)  (Bug#8282)
* lisp/international/mule-cmds.el (select-safe-coding-system):
Don't consider UTF-16 encodings with and without BOM as
"different", so as not to annoy users with redundant questions
about mismatch between the XML/SGML header and the selected
explicit encoding.
Diffstat (limited to 'lisp')
| -rw-r--r-- | lisp/international/mule-cmds.el | 8 | ||||
| -rw-r--r-- | lisp/international/mule.el | 22 | 
2 files changed, 26 insertions, 4 deletions
| diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el index dfa9e4e6c8c..27296ecfb2c 100644 --- a/lisp/international/mule-cmds.el +++ b/lisp/international/mule-cmds.el @@ -1029,7 +1029,13 @@ It is highly recommended to fix it before writing to a file."  		 ;; This check perhaps isn't ideal, but is probably  		 ;; the best thing to do.  		 (not (auto-coding-alist-lookup (or file buffer-file-name ""))) -		 (not (coding-system-equal coding-system auto-cs))) +		 (not (coding-system-equal coding-system auto-cs)) +                 ;; coding-system-equal barfs on 'charset'. +                 (or (equal (coding-system-type auto-cs) 'charset) +                     (equal (coding-system-type coding-system) 'charset) +                     (not (coding-system-equal (coding-system-type auto-cs) +                                               (coding-system-type +                                                coding-system)))))  	    (unless (yes-or-no-p  		     (format "Selected encoding %s disagrees with \  %s specified by file contents.  Really save (else edit coding cookies \ diff --git a/lisp/international/mule.el b/lisp/international/mule.el index b5414de0dba..21f3118a98e 100644 --- a/lisp/international/mule.el +++ b/lisp/international/mule.el @@ -2498,7 +2498,18 @@ This function is intended to be added to `auto-coding-functions'."        (when end  	(if (re-search-forward "encoding=[\"']\\(.+?\\)[\"']" end t)  	    (let* ((match (match-string 1)) -		   (sym (intern (downcase match)))) +                   (sym-name (downcase match)) +                   (sym-name +                    ;; https://www.w3.org/TR/xml/#charencoding says: +                    ;; "Entities encoded in UTF-16 MUST [...] begin +                    ;; with the Byte Order Mark."  The trick below is +                    ;; based on the fact that utf-16be/le don't +                    ;; specify BOM, while utf-16-be/le do. 
+                    (cond +                     ((equal sym-name "utf-16le") "utf-16-le") +                     ((equal sym-name "utf-16be") "utf-16-be") +                     (t sym-name))) +		   (sym (intern sym-name)))  	      (if (coding-system-p sym)                    ;; If the encoding tag is UTF-8 and the buffer's                    ;; encoding is one of the variants of UTF-8, use the @@ -2587,9 +2598,14 @@ added by processing software."        (let ((detected               (with-coding-priority '(utf-8)                 (coding-system-base -                (detect-coding-region (point-min) (point-max) t))))) -        ;; Pure ASCII always comes back as undecided. +                (detect-coding-region (point-min) (point-max) t)))) +            (bom (list (char-after 1) (char-after 2))))          (cond +         ((equal bom '(#xFE #xFF)) +          'utf-16be-with-signature) +         ((equal bom '(#xFF #xFE)) +          'utf-16le-with-signature) +         ;; Pure ASCII always comes back as undecided.           ((memq detected '(utf-8 undecided))            'utf-8)           ((eq detected 'utf-16le-with-signature) 'utf-16le-with-signature) | 
