(ucs-mule-cjk-to-unicode)

(utf-subst-table-for-encode, ucs-unicode-to-mule-cjk)
(utf-subst-table-for-decode): Specify :size, :rehash-size.
(utf-translate-cjk): :set rewritten to load subst-... files.  Add :set-after.
(ccl-decode-mule-utf-8): Consider CJK translation for r3 < #x3400.
author: Dave Love <fx@gnu.org> 2002-12-15 16:46:00 +0000
committer: Dave Love <fx@gnu.org> 2002-12-15 16:46:00 +0000
commit: ccdd5c61872c10cce32e500bc15aa3efe05c7d75 (patch)
tree: 9671ae37571353edd71be29942405581649a9357 /lisp/international/utf-8.el
parent: 241094692158692abf6c958873d98e4738ad72ef (diff)
download: emacs-ccdd5c61872c10cce32e500bc15aa3efe05c7d75.tar.gz
1 files changed, 58 insertions, 26 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 9bbb9d2c756..018691b1168 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -47,7 +47,7 @@
 ;; idempotent -- to represent the bytes to fix that needs a new charset.
 ;;
 ;; Characters from other character sets can be encoded with mule-utf-8
-;; by populating the translation-table
+;; by populating the translation table
 ;; `utf-translation-table-for-encode' and registering the translation
 ;; with `register-char-codings'.  Hash tables
 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
@@ -95,23 +95,25 @@ translation-table named `utf-translation-table-for-encode'")
 (define-translation-table 'utf-translation-table-for-decode)
 
 
-(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
+(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq :size 43000
+						 :rehash-size 1000)
   "Hash table mapping Emacs CJK character sets to Unicode code points.
 
 If `utf-translate-cjk' is non-nil, this table populates the
 translation-hash-table named `utf-subst-table-for-encode'.")
 
 (define-translation-hash-table 'utf-subst-table-for-encode 
-  (make-hash-table :test 'eq))
+  (make-hash-table :test 'eq :size 43000 :rehash-size 1000))
 
-(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
+(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq :size 43000
+						 :rehash-size 1000)
   "Hash table mapping Unicode code points to Emacs CJK character sets.
 
 If `utf-translate-cjk' is non-nil, this table populates the
 translation-hash-table named `utf-subst-table-for-decode'.")
 
 (define-translation-hash-table 'utf-subst-table-for-decode
-  (make-hash-table :test 'eq))
+  (make-hash-table :test 'eq :size 21500 :rehash-size 200))
 
 (mapc
  (lambda (pair)
@@ -205,19 +207,46 @@ Setting this variable outside customize has no effect."
 
 (defcustom utf-translate-cjk nil
   "Whether the UTF based coding systems should decode/encode CJK characters.
-
-Enabling this loads tables which enable the coding systems:
-    mule-utf-8, mule-utf-16-le, mule-utf-16-be
-to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and
-`japanese-jisx0208', and to decode the corresponding unicodes into
-such characters.  This works by loading the library `utf-8-subst'; see
-its commentary.  The tables are fairly large (about 33000 entries), so this
-option is not the default."
-  :link '(emacs-commentary-link "utf-8-subst")
+Enabling this loads tables which allow the coding systems mule-utf-8,
+mule-utf-16-le and mule-utf-16-be to encode characters in the charsets
+`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
+`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
+decode the corresponding unicodes into such characters.
+
+Where the charsets overlap, the one preferred for decoding is chosen
+according to the language environment in effect when this option is
+turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
+Chinese-Big5 and jisx for other environments.
+
+The tables are large (over 40000 entries), so this option is not the
+default.  Also, installing them may be rather slow."
   :set (lambda (s v)
 	 (if v
 	     (progn
-	       (require 'utf-8-subst)
+	       ;; Load the files explicitly, to avoid having to keep
+	       ;; around the large tables they contain (as well as the
+	       ;; ones which get built).
+	       (cond
+		((string= "Korean" current-language-environment)
+		 (load "subst-jis")
+		 (load "subst-big5")
+		 (load "subst-gb2312")
+		 (load "subst-ksc"))
+		((string= "Chinese-BIG5" current-language-environment)
+		 (load "subst-jis")
+		 (load "subst-ksc")
+		 (load "subst-gb2312")
+		 (load "subst-big5"))
+		((string= "Chinese-GB" current-language-environment)
+		 (load "subst-jis")
+		 (load "subst-ksc")
+		 (load "subst-big5")
+		 (load "subst-gb2312"))
+		(t
+		 (load "subst-ksc")
+		 (load "subst-gb2312")
+		 (load "subst-big5")
+		 (load "subst-jis")))   ; jis covers as much as big5, gb2312
 	       (let ((table (make-char-table 'translation-table)))
 		 (maphash (lambda (k v)
 			    (aset table k t))
@@ -244,6 +273,7 @@ option is not the default."
 	 (set-default s v))
   :version "21.4"
   :type 'boolean
+  :set-after '(current-language-environment)
   :group 'mule)
 
 (define-ccl-program ccl-decode-mule-utf-8
@@ -378,18 +408,20 @@ option is not the default."
 			 (write-multibyte-character r0 r1))
 		    
 		      ;; mule-unicode-2500-33ff
-		      ;; Fixme: Perhaps allow translation via
-		      ;; utf-subst-table-for-decode for #x2e80 up, so
-		      ;; that we use consistent charsets for all of
-		      ;; CJK.  Would need corresponding change to
-		      ;; encoding tables.
 		      (if (r3 < #x3400)
-			  ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
-			   (r3 -= #x2500)
-			   (r3 //= 96)
-			   (r1 = (r7 + 32))
-			   (r1 += ((r3 + 32) << 7))
-			   (write-multibyte-character r0 r1))
+			  ((r4 = r3)	; don't zap r3
+			   (lookup-integer utf-subst-table-for-decode r4 r5)
+			   (if r7
+			       ;; got a translation
+			       ((write-multibyte-character r4 r5)
+				;; Zapped through register starvation.
+				(r5 = ,(charset-id 'eight-bit-control)))
+			     ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
+			      (r3 -= #x2500)
+			      (r3 //= 96)
+			      (r1 = (r7 + 32))
+			      (r1 += ((r3 + 32) << 7))
+			      (write-multibyte-character r0 r1))))
 
 			;; U+3400 .. U+D7FF
 			;; Try to convert to CJK chars, else keep
author	Dave Love <fx@gnu.org>	2002-12-15 16:46:00 +0000
committer	Dave Love <fx@gnu.org>	2002-12-15 16:46:00 +0000
commit	ccdd5c61872c10cce32e500bc15aa3efe05c7d75 (patch)
tree	9671ae37571353edd71be29942405581649a9357 /lisp/international/utf-8.el
parent	241094692158692abf6c958873d98e4738ad72ef (diff)
download	emacs-ccdd5c61872c10cce32e500bc15aa3efe05c7d75.tar.gz