diff options
author | Dave Love <fx@gnu.org> | 2002-07-17 19:21:41 +0000 |
---|---|---|
committer | Dave Love <fx@gnu.org> | 2002-07-17 19:21:41 +0000 |
commit | a1d62c7293506a0c0a24785d846a5e9aafe55815 (patch) | |
tree | f10cbe2c0cc8cc6ba7adc91a92612fc793e4d4a5 | |
parent | bf45cc3f8df7e7a1de82b3e024f21871f18bf8af (diff) | |
download | emacs-a1d62c7293506a0c0a24785d846a5e9aafe55815.tar.gz |
Optimize tables. Deal with some
non-8859 charsets.
(ucs-mule-to-mule-unicode): New.
(ucs-unify-8859): Use utf-8-fragment-on-decoding, set up Quail
translation.
(ucs-fragment-8859): Modified consistent with ucs-unify-8859.
(unify-8859-on-encoding-mode): Doc mod. Fix custom version.
(unify-8859-on-decoding-mode): Doc mod. Change code. Fix custom
version. Add custom dependencies.
(ucs-insert): Check for null from decode-char.
(translation-table-for-input, ucs-quail-activate)
(ucs-minibuffer-setup, ccl-encode-unicode-font)
(ucs-tables-unload-hook): New.
-rw-r--r-- | lisp/international/ucs-tables.el | 310 |
1 files changed, 191 insertions, 119 deletions
diff --git a/lisp/international/ucs-tables.el b/lisp/international/ucs-tables.el index 076772e3e10..e59e7b3872a 100644 --- a/lisp/international/ucs-tables.el +++ b/lisp/international/ucs-tables.el @@ -25,12 +25,12 @@ ;;; Commentary: ;; This file provides tables mapping between Unicode numbers and -;; emacs-mule characters from the iso8859 charsets (and others). It +;; emacs-mule characters from the iso-8859 charsets (and others). It ;; also provides some auxiliary functions. ;; These tables are used to construct other mappings between the Mule ;; iso8859 charsets and the emacs-unicode charsets and a table that -;; unifies iso8859 characters using a single charset as far as +;; unifies iso-8859 characters using a single charset as far as ;; possible. These tables are used by latin1-disp.el to display some ;; Unicode characters without a Unicode font and by utf-8.el to unify ;; Latin-N as far as possible on encoding. @@ -44,14 +44,46 @@ ;; not idempotent. ;; Global minor modes are provided to unify on encoding and decoding. - -;; The translation table `ucs-mule-to-mule-unicode' is populated. -;; This is used by the `mule-utf-8' coding system to encode extra -;; characters. +;; These could be extended to non-iso-8859 charsets. However 8859 is +;; all that users normally care about unifying although, for instance, +;; Greek occurs in as many as nine Emacs charsets. + +;; The translation table `ucs-mule-to-mule-unicode' is populated, +;; which could be used for more general unification on decoding. This +;; is used by the `mule-utf-8' coding system to encode extra +;; characters, and also by the coding systems set up by code-pages.el. +;; The decoding tables here take account of +;; `utf-8-fragment-on-decoding' which may specify decoding Greek and +;; Cyrillic into 8859 charsets. + +;; Unification also puts a `translation-table-for-input' property on +;; relevant coding coding systems and arranges for the +;; `translation-table-for-input' variable to be set either globally or +;; locally. This is used by Quail input methods to translate input +;; characters appropriately for the buffer's coding system (if +;; possible). Unification on decoding sets it globally to translate +;; to Unicode. Unification on encoding uses hooks to set it up +;; locally to buffers. Thus in the latter case, typing `"a' into a +;; Latin-1 buffer using the `latin-2-prefix' method translates the +;; generated latin-iso8859-2 `,Bd(B' into latin-iso8859-1 `,Ad(B'. + +;; NB, this code depends on the default value of +;; `enable-character-translation'. (Making it nil would anyway lead +;; to inconsistent behaviour between CCL-based coding systems which +;; use explicit translation tables and the rest.) ;; Command `ucs-insert' is convenient for inserting a given Unicode. ;; (See also the `ucs' input method.) +;; A replacement CCL program is provided which allows characters in +;; the `ucs-mule-to-mule-unicode' table to be displayed with an +;; iso-10646-encoded font. E.g. to use a `Unicode' font for Cyrillic: +;; +;; (set-fontset-font "fontset-startup" +;; (cons (make-char 'cyrillic-iso8859-5 160) +;; (make-char 'cyrillic-iso8859-5 255)) +;; '(nil . "ISO10646-1")) + ;;; Code: ;;; Define tables, to be populated later. @@ -1067,11 +1099,7 @@ Translates from the iso8859 charsets and `mule-unicode-0100-24ff'.") (push (cons (make-char 'latin-iso8859-1 (- i 128)) i) l) (setq i (1+ i))) - (nreverse l))) - -;; (case-table (standard-case-table)) -;; (syntax-table (standard-syntax-table)) - ) + (nreverse l)))) ;; Convert the lists to the basic char tables. (dolist (n (list 15 14 9 8 7 5 4 3 2 1)) @@ -1084,41 +1112,11 @@ Translates from the iso8859 charsets and `mule-unicode-0100-24ff'.") ;; (aset ucs-ucs-to-mule-8859-table uc mule) ;; (aset ucs-mule-unicode-to-mule-8859 mu mule) (aset ucs-mule-8859-to-mule-unicode mule mu) - (aset ucs-mule-to-mule-unicode mule mu))) -;; I think this is actually done OK in characters.el. -;; Probably things like accents shouldn't have word syntax, but the -;; Latin-N syntax tables currently aren't consistent for such -;; characters anyhow. -;; ;; Make the mule-unicode characters inherit syntax and case info -;; ;; if they don't already have it. -;; (dolist (pair alist) -;; (let ((mule (car pair)) -;; (uc (cdr pair)) -;; (mu (decode-char 'ucs (cdr pair)))) -;; (let ((syntax (aref syntax-table mule))) -;; (if (eq mule (downcase mule)) -;; (if (eq mule (upcase mule)) ; non-letter or uncased letter -;; (progn -;; (if (= 4 (car syntax)) ; left delim -;; (progn -;; (aset syntax-table -;; mu -;; (cons 4 (aref ucs-mule-8859-to-mule-unicode -;; (cdr syntax)))) -;; (aset syntax-table -;; (aref ucs-mule-8859-to-mule-unicode -;; (cdr syntax)) -;; (cons 5 mu))) -;; (aset syntax-table mu syntax)) -;; (aset case-table mu mu))) -;; ;; Upper case letter -;; (let ((lower (aref ucs-mule-8859-to-mule-unicode -;; (aref case-table mule)))) -;; (aset case-table mu lower) -;; (aset case-table lower lower) -;; (modify-syntax-entry lower "w " syntax-table) -;; (modify-syntax-entry mu "w " syntax-table)))))) - )) + (aset ucs-mule-to-mule-unicode mule mu))))) + ;; The table optimizing here and elsewhere probably isn't very + ;; useful, but seems good practice. + (optimize-char-table ucs-mule-to-mule-unicode) + (optimize-char-table ucs-mule-8859-to-mule-unicode) ;; Derive tables that can be used as per-coding-system ;; `translation-table-for-encode's. (dolist (n (list 15 14 9 8 7 5 4 3 2 1)) @@ -1138,15 +1136,15 @@ Translates from the iso8859 charsets and `mule-unicode-0100-24ff'.") (if (and (setq elt (rassq v alist)) (not (assq k alist))) (aset encode-translator k (car elt)))) - ucs-mule-8859-to-ucs-table)))) + ucs-mule-8859-to-ucs-table) + (optimize-char-table encode-translator)))) ;; Register for use in CCL. (define-translation-table 'ucs-mule-8859-to-mule-unicode ucs-mule-8859-to-mule-unicode) +(define-translation-table 'ucs-mule-to-mule-unicode + ucs-mule-to-mule-unicode) -;; Fixme: Make this reversible, which means frobbing -;; `char-coding-system-table' directly to remove what we added -- see -;; codepages.el. Also make it a user option. (defun ucs-unify-8859 (&optional encode-only) "Set up translation tables for unifying characters from ISO 8859. @@ -1159,7 +1157,24 @@ everything on input operations." (interactive "P") (unless encode-only ;; Unify 8859 on decoding. (Non-CCL coding systems only.) - (unify-8859-on-decoding-mode 1)) + (if utf-8-fragment-on-decoding + (map-char-table + (lambda (k v) + (if v (aset ucs-mule-to-mule-unicode v nil))) + utf-8-translation-table-for-decode) + ;; Reset in case it was changed. + (map-char-table + (lambda (k v) + (if v (aset ucs-mule-to-mule-unicode v k))) + utf-8-translation-table-for-decode)) + (set-char-table-parent standard-translation-table-for-decode + ucs-mule-8859-to-mule-unicode) + ;; Translate Quail input globally. + (setq-default translation-table-for-input ucs-mule-to-mule-unicode) + ;; In case these are set up, but we should use the global + ;; translation table. + (remove-hook 'quail-activate-hook 'ucs-quail-activate) + (remove-hook 'minibuffer-setup-hook 'ucs-minibuffer-setup)) ;; Adjust the 8859 coding systems to fragment the unified characters ;; on encoding. (dolist (n '(1 2 3 4 5 7 8 9 14 15)) @@ -1174,58 +1189,11 @@ everything on input operations." (set-char-table-parent safe table) ;; Update the table of what encodes to what. (register-char-codings coding-system table) - (coding-system-put coding-system 'translation-table-for-encode table))) - -;;; The following works for the bundled coding systems, but it's -;;; better to use the Unicode-based ones and make it irrelevant. - -;;; ;; Update the Cyrillic special cases. -;;; ;; `translation-table-for-encode' doesn't work for CCL coding -;;; ;; systems, and `standard-translation-table-for-decode' isn't -;;; ;; applied. -;;; (let ((table (get 'cyrillic-koi8-r-encode-table 'translation-table))) -;;; (map-char-table -;;; (lambda (k v) -;;; (aset table -;;; (or (aref ucs-8859-5-encode-table k) -;;; k) -;;; v)) -;;; table) -;;; (register-char-codings 'cyrillic-koi8 table)) -;;; (let ((table (get 'cyrillic-koi8-r-nonascii-translation-table -;;; 'translation-table))) -;;; (map-char-table -;;; (lambda (k v) -;;; (if v (aset table k (or (aref ucs-mule-8859-to-mule-unicode v) -;;; v)))) -;;; table)) -;;; ;; Redefine this, since the orginal only translated 8859-5. -;;; (define-ccl-program ccl-encode-koi8 -;;; `(1 -;;; ((loop -;;; (read-multibyte-character r0 r1) -;;; (translate-character cyrillic-koi8-r-encode-table r0 r1) -;;; (write-repeat r1)))) -;;; "CCL program to encode KOI8.") -;;; (let ((table (get 'cyrillic-alternativnyj-encode-table 'translation-table))) -;;; (map-char-table -;;; (lambda (k v) -;;; (aset table -;;; (or (aref ucs-8859-5-encode-table k) -;;; k) -;;; v)) -;;; table) -;;; (register-char-codings 'cyrillic-alternativnyj table)) -;;; (let ((table (get 'cyrillic-alternativnyj-nonascii-translation-table -;;; 'translation-table))) -;;; (map-char-table -;;; (lambda (k v) -;;; (if v (aset table -;;; k -;;; (or (aref ucs-mule-8859-to-mule-unicode v) -;;; v)))) -;;; table)) - ) + (coding-system-put coding-system 'translation-table-for-encode table) + (coding-system-put coding-system 'translation-table-for-input table))) + ;; Arrange local translation tables for Quail input. + (add-hook 'quail-activate-hook 'ucs-quail-activate) + (add-hook 'minibuffer-setup-hook 'ucs-minibuffer-setup)) (defun ucs-fragment-8859 (&optional encode-only) "Undo the unification done by `ucs-unify-8859'. @@ -1235,7 +1203,8 @@ unification on input operations." ;; Maybe fix decoding. (unless encode-only ;; Unify 8859 on decoding. (Non-CCL coding systems only.) - (unify-8859-on-decoding-mode -1)) + (set-char-table-parent standard-translation-table-for-decode nil) + (setq-default translation-table-for-input nil)) ;; Fix encoding. For each charset, remove the entries in ;; `char-coding-system-table' added to its safe-chars table (as its ;; parent). @@ -1253,7 +1222,11 @@ unification on input operations." (delq coding-system codings))))) (char-table-parent safe)) (set-char-table-parent safe nil) - (coding-system-put coding-system 'translation-table-for-encode nil)))) + (coding-system-put coding-system 'translation-table-for-encode nil) + (coding-system-put coding-system 'translation-table-for-input nil))) + (optimize-char-table char-coding-system-table) + (remove-hook 'quail-activate-hook 'ucs-quail-activate) + (remove-hook 'minibuffer-setup-hook 'ucs-minibuffer-setup)) (define-minor-mode unify-8859-on-encoding-mode "Set up translation tables for unifying ISO 8859 characters on encoding. @@ -1276,42 +1249,54 @@ directly to a byte value 233. By default, in contrast, you would be prompted for a general coding system to use for saving the file, which can cope with separate Latin-1 and Latin-9 representations of e-acute. +Also sets hooks that arrange `translation-table-for-input' to be set +up locally when Quail input methods are activated. This will often +allow input generated by Quail input methods to conform with what the +buffer's file coding system can encode. Thus you could use a Latin-2 +input method to search for e-acute in a Latin-1 buffer. + See also command `unify-8859-on-decoding-mode'." :group 'mule :global t - :version "21.3" ; who knows...? :init-value nil (if unify-8859-on-encoding-mode (ucs-unify-8859 t) (ucs-fragment-8859 t))) +(custom-add-version 'unify-8859-on-encoding-mode "21.4") + (define-minor-mode unify-8859-on-decoding-mode - "Set up translation table for unifying ISO 8859 characters on decoding. -On decoding -- i.e. input operations -- non-ASCII characters from the + "Set up translation tables for unifying ISO 8859 characters on decoding. +On decoding, i.e. input operations, non-ASCII characters from the built-in ISO 8859 charsets are unified by mapping them into the `iso-latin-1' and `mule-unicode-0100-24ff' charsets. -This sets the parent of `standard-translation-table-for-decode'. Also sets `translation-table-for-input' globally, so that Quail input methods produce unified characters. -See also command `unify-8859-on-encoding-mode'." +See also command `unify-8859-on-encoding-mode' and the user option +`utf-8-fragment-on-decoding'." :group 'mule :global t - :version "21.3" ; who knows...? :init-value nil - (let ((table (if unify-8859-on-decoding-mode ucs-mule-8859-to-mule-unicode))) - (set-char-table-parent standard-translation-table-for-decode table) - (setq-default translation-table-for-input table))) + (if unify-8859-on-decoding-mode + (ucs-unify-8859) + (ucs-fragment-8859))) + +(custom-add-dependencies 'unify-8859-on-decoding-mode + '(utf-8-fragment-on-decoding)) +(custom-add-version 'unify-8859-on-decoding-mode "21.4") (defun ucs-insert (arg) "Insert the Emacs character representation of the given Unicode. Interactively, prompts for a hex string giving the code." (interactive "sUnicode (hex): ") - (insert (or (decode-char 'ucs (if (integerp arg) - arg - (string-to-number arg 16))) - (error "Unknown Unicode character")))) + (let ((c (decode-char 'ucs (if (integerp arg) + arg + (string-to-number arg 16))))) + (if c + (insert c) + (error "Character can't be decoded to UCS")))) ;;; Dealing with non-8859 character sets. @@ -2458,11 +2443,23 @@ Interactively, prompts for a hex string giving the code." (aset ucs-mule-to-mule-unicode (car pair) (cdr pair)) (if encode-translator (aset encode-translator (cdr pair) (car pair)))) + (if encode-translator + (optimize-char-table encode-translator)) (if (charsetp cs) (push cs safe-charsets) (setq safe-charsets (append (delq 'ascii (coding-system-get cs 'safe-charsets)) - safe-charsets))))) + safe-charsets))) + (cond ((eq cs 'vietnamese-viscii) + (coding-system-put 'vietnamese-viscii + 'translation-table-for-input + encode-translator) + (coding-system-put 'vietnamese-viqr + 'translation-table-for-input + encode-translator)) + ((memq cs '(lao thai-tis620 tibetan-iso-8bit)) + (coding-system-put cs 'translation-table-for-input cs))))) + (optimize-char-table ucs-mule-to-mule-unicode) (dolist (c safe-charsets) (aset table (make-char c) t)) (coding-system-put 'mule-utf-8 'safe-charsets @@ -2470,6 +2467,81 @@ Interactively, prompts for a hex string giving the code." safe-charsets)) (register-char-codings 'mule-utf-8 table))) +(defvar translation-table-for-input (make-translation-table)) + +;; Arrange to set up the translation table for Quail. This probably +;; isn't foolproof. +(defun ucs-quail-activate () + "Set up an appropriate `translation-table-for-input' for current buffer. +Intended to be added to `quail-activate-hook'." + (let ((cs (coding-system-base buffer-file-coding-system))) + (if (eq cs 'undecided) + (setq cs (coding-system-base default-buffer-file-coding-system))) + (if (coding-system-get cs 'translation-table-for-input) + (set (make-variable-buffer-local 'translation-table-for-input) + (coding-system-get cs 'translation-table-for-input))))) + +;; The minibuffer needs to acquire a `buffer-file-coding-system' for +;; the above to work in it. +(defun ucs-minibuffer-setup () + "Set up an appropriate `buffer-file-coding-system' for current buffer. +Does so by inheriting it from the cadr of the current buffer list. +Intended to be added to `minibuffer-setup-hook'." + (set (make-local-variable 'buffer-file-coding-system) + (with-current-buffer (cadr (buffer-list)) + buffer-file-coding-system))) + +;; Modified to allow display of arbitrary characters with an +;; iso-10646-encoded (`Unicode') font. +(define-ccl-program ccl-encode-unicode-font + `(0 + ((if (r0 == ,(charset-id 'ascii)) + ((r2 = r1) + (r1 = 0)) + ( + ;; Look for a translation for non-ASCII chars. For a 2D + ;; charset, produce a single code for the translation. + ;; Official 2D sets are in the charset id range [#x90,#x99], + ;; private ones in the range [#xf0,#xfe] (with #xff not used). + ;; Fixme: Is there a better way to do this? + (r3 = (r0 >= #x90)) + (r3 &= (r0 <= #x99)) + (r3 |= (r0 >= #xf0)) + (if r3 ; 2D input + (r1 = ((r1 << 7) | r2))) + (translate-character ucs-mule-to-mule-unicode r0 r1) + (r3 = (r0 >= #x90)) + (r3 &= (r0 <= #x99)) + (r3 |= (r0 >= #xf0)) + (if r3 ; 2D translation + ((r2 = (r1 & 127)) + (r1 = (r1 >> 7)))) + (if (r0 == ,(charset-id 'latin-iso8859-1)) + ((r2 = (r1 + 128)) + (r1 = 0)) + (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) + ((r1 *= 96) + (r1 += r2) + (r1 += ,(- #x100 (* 32 96) 32)) + (r1 >8= 0) + (r2 = r7)) + (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) + ((r1 *= 96) + (r1 += r2) + (r1 += ,(- #x2500 (* 32 96) 32)) + (r1 >8= 0) + (r2 = r7)) + (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) + ((r1 *= 96) + (r1 += r2) + (r1 += ,(- #xe000 (* 32 96) 32)) + (r1 >8= 0) + (r2 = r7)))))))))) + "Encode characters for display with iso10646 font. +Translate through table `ucs-mule-to-mule-unicode' initially.") + +(defalias 'ucs-tables-unload-hook 'ucs-fragment-8859) + (provide 'ucs-tables) ;;; ucs-tables.el ends here |