admin/charsets/mapconv


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

#!/bin/sh

# Copyright (C) 2015-2023 Free Software Foundation, Inc.

# Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
#   National Institute of Advanced Industrial Science and Technology (AIST)
#   Registration Number H13PRO009

# This file is part of GNU Emacs.

# GNU Emacs is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# GNU Emacs is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.

# Commentary:

# Convert charset map of various format into this:
#	0xXX 0xYYYY
# where,
#   XX is a code point of the charset in hexa-decimal,
#   YYYY is the corresponding Unicode character code in hexa-decimal.
# Arguments are:
#   $1: source map file
#   $2: address pattern for sed (optionally with substitution command)
#   $3: format of source map file
#	GLIBC-1 GLIBC-2 GLIBC-2-7 CZYBORRA IANA UNICODE UNICODE2 YASUOKA
#   $4: awk script

## So that eg [A-F] as used by KANJI-DATABASE branch below works as expected.
## Otherwise with LANG=en_US.utf8, CNS-6.map was generated with a
## bogus entry.  By experiment, LC_COLLATE=C was not enough.
LC_ALL=C
export LC_ALL

BASE=`expr "$1" : '.*/\(.*\)' '|' "$1"` # basename
FILE="admin/charsets/mapfiles/$BASE"
BASE=`expr "$BASE" : '\(.*\)\.gz$' '|' "$BASE"` # remove any .gz suffix
AWK=${AWK:-awk}

case "$3" in
    GLIBC*)
	FILE="$BASE in localedata/charmaps of glibc";
	SOURCE="";;
    CZYBORRA)
	BASE="$BASE.gz";
	SOURCE="https://czyborra.com/charsets/${BASE}";;
    IANA)
	SOURCE="https://www.iana.org/assignments/charset-reg/${BASE}";;
    UNICODE)
	SOURCE="https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/${BASE}";;
    UNICODE2)
	SOURCE="https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/${BASE}";;
    YASUOKA)
	BASE="$BASE.Z";
	SOURCE="http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/ftp/CJKtable/${BASE}";;
    KANJI-DATABASE)
	# FIXME: This URL no longer works.
	SOURCE="http://kanji-database.cvs.sourceforge.net/viewvc/*checkout*/kanji-database/kanji-database/data/cns2ucsdkw.txt?revision=1.4";;
    *)
	printf 'Unknown file type: %s\n' "$3"
	exit 1;;
esac

if [ -n "$SOURCE" ] ; then
    echo "# Generated from $FILE which is a copy of";
    echo "# $SOURCE"
else
    echo "# Generated from $FILE"
fi


if [ -n "$4" ] ; then
    if [ -f "$4" ] ; then
	AWKPROG="$AWK -f $4"
    else
	echo "Awk program does not exist: $4"
	exit 1
    fi
else
    AWKPROG=cat
fi

if [ "$3" = "GLIBC-1" ] ; then
    # Source format is:
    #   <UYYYY>	/xXX
    gunzip -c $1 | sed -n -e "${2}p" \
	| sed -e 's,<U\([^>]*\)>[ 	]*/x\(..\).*,0x\2 0x\1,' \
	| sort | ${AWKPROG}
elif [ "$3" = "GLIBC-2" ] ; then
    # Source format is:
    #   <UYYYY>	/xXX/xZZ
    gunzip -c $1 | sed -n -e "${2}p" \
	| sed -e 's,<U\([^>]*\)>[ 	]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
	| sort | ${AWKPROG}
elif [ "$3" = "GLIBC-2-7" ] ; then
    # Source format is:
    #   <UYYYY>	/xXX/xZZ
    # We must drop MSBs of XX and ZZ
    gunzip -c $1 | sed -n -e "${2}p" \
	| sed -e 's/xa/x2/g' -e 's/xb/x3/g' -e 's/xc/x4/g' \
	      -e 's/xd/x5/g' -e 's/xe/x6/g' -e 's/xf/x7/g' \
	      -e 's,<U\([^>]*\)>[ 	]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
	| sort | ${AWKPROG}
elif [ "$3" = "CZYBORRA" ] ; then
    # Source format is:
    #   =XX	U+YYYY
    sed -n -e "${2}p" < $1 \
	| sed -e 's/=\(..\)[^U]*U+\([0-9A-F]*\).*/0x\1 0x\2/' \
	| sort | ${AWKPROG}
elif [ "$3" = "IANA" ] ; then
    # Source format is:
    #   0xXX	0xYYYY
    sed -n -e "${2}p" < $1 \
	| sed -e 's/\(0x[0-9A-Fa-f]*\)[^0]*\(0x[0-9A-Fa-f]*\).*/\1 \2/' \
	| sort | ${AWKPROG}
elif [ "$3" = "UNICODE" ] ; then
    # Source format is:
    #   YYYY	XX
    # We perform reverse sort to prefer the first one in the
    # duplicated mappings (e.g. 0x20->U+0020, 0x20->U+00A0).
    sed -n -e "${2}p" < $1 \
	| sed -e 's/\([0-9A-F]*\)[^0-9A-F]*\([0-9A-F]*\).*/0x\2 0x\1/' \
	| sort -r
elif [ "$3" = "UNICODE2" ] ; then
    # Source format is:
    #   0xXXXX	0xYYYY	# ...
    sed -n -e "${2}p" < $1 \
	| sed -e 's/\([0-9A-Fx]*\)[^0]*\([0-9A-Fx]*\).*/\1 \2/' \
	| ${AWKPROG} | sort -n -k 4,4
elif [ "$3" = "YASUOKA" ] ; then
    # Source format is:
    # YYYY	0-XXXX (XXXX is a Kuten code)
    sed -n -e "${2}p" < $1 \
	| sed -e 's/\([0-9A-F]*\)[^0]*0-\([0-9]*\).*/0x\2 0x\1/' \
	| sort | ${AWKPROG}
elif [ "$3" = "KANJI-DATABASE" ] ; then
    # Source format is:
    # C?-XXXX U+YYYYY .....
    sed -n -e "${2}p" < $1 \
	| sed -e 's/...\(....\) U+\([0-9A-F]*\).*/0x\1 0x\2/' \
	| sort | ${AWKPROG}
else
    printf 'Invalid arguments: %s\n' "$3"
    exit 1
fi