summaryrefslogtreecommitdiff
path: root/unicode/convmap.pl
blob: a473491702a989bc29ac79fe421522e22f2d4ae2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#!/usr/bin/perl -w
# $XTermId: convmap.pl,v 1.13 2007/06/11 23:30:44 tom Exp $
#
# Generate keysym2ucs.c file
#
# See also:
# http://mail.nl.linux.org/linux-utf8/2001-04/msg00248.html
#
# $XFree86: xc/programs/xterm/unicode/convmap.pl,v 1.5 2000/01/24 22:22:05 dawes Exp $

use strict;

our $keysym;
our %name;
our %keysym_to_ucs;
our %keysym_to_keysymname;

sub utf8 ($);

# Encode a single UCS code point as a UTF-8 byte sequence.
#
# Parameter:
#   $c - code point as a non-negative integer.
#
# Returns a string of 1 to 6 characters, each holding one byte of the
# encoding.  The 5- and 6-byte forms belong to the original 31-bit
# UTF-8 definition and are kept so that every value below 0x80000000
# is representable.  Values of 0x80000000 and above cannot be encoded
# and yield the encoding of U+FFFD (REPLACEMENT CHARACTER) instead.
sub utf8 ($) {
    my ($c) = @_;

    if ($c < 0x80) {
        # 0xxxxxxx
        return sprintf("%c", $c);
    } elsif ($c < 0x800) {
        # 110xxxxx 10xxxxxx
        return sprintf("%c%c", 0xc0 | ($c >> 6), 0x80 | ($c & 0x3f));
    } elsif ($c < 0x10000) {
        # 1110xxxx 10xxxxxx 10xxxxxx
        return sprintf("%c%c%c",
                       0xe0 |  ($c >> 12),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x200000) {
        # 11110xxx followed by three continuation bytes
        return sprintf("%c%c%c%c",
                       0xf0 |  ($c >> 18),
                       0x80 | (($c >> 12) & 0x3f),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x4000000) {
        # 111110xx followed by four continuation bytes
        return sprintf("%c%c%c%c%c",
                       0xf8 |  ($c >> 24),
                       0x80 | (($c >> 18) & 0x3f),
                       0x80 | (($c >> 12) & 0x3f),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x80000000) {
        # 1111110x followed by five continuation bytes.  The lead-byte
        # marker is 0xfc; 0xfe would also set the bit that has to carry
        # bit 30 of the code point.
        return sprintf("%c%c%c%c%c%c",
                       0xfc |  ($c >> 30),
                       0x80 | (($c >> 24) & 0x3f),
                       0x80 | (($c >> 18) & 0x3f),
                       0x80 | (($c >> 12) & 0x3f),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } else {
        # not representable: substitute U+FFFD
        return utf8(0xfffd);
    }
}

my $unicodedata = "UnicodeData-Latest.txt";

# read list of all Unicode names
#
# Every line must consist of 15 semicolon-separated fields: the code
# point as 4 to 6 upper-case hex digits, the character name, and 13
# further fields that are not used here.  The name is stored in %name
# under the numeric code point.  Any line of a different shape aborts
# the run.
#
# The file is opened read-only through a lexical handle with an
# explicit mode, so the file name can never be taken for a mode or a
# pipe specification.
open(my $udata, '<', $unicodedata)
    or die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
            "Please make sure that you have downloaded the file\n" .
            "ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt\n");
while (my $line = <$udata>) {
    # Only hex digits are accepted in the code-point field; a stray
    # comma would otherwise be passed on to hex().
    if ($line =~ /^([0-9A-F]{4,6});([^;]*);(?:[^;]*;){12}[^;]*$/) {
        $name{hex($1)} = $2;
    } else {
        die("Syntax error in line '$line' in file '$unicodedata'");
    }
}
close($udata);

# read mapping (from http://wsinwp07.win.tue.nl:1234/unicode/keysym.map)
#
# A mapping line has the form
#     0xKKKK UXXXX [# comment]
# with four lower-case hex digits each for the keysym and the UCS
# value.  The UCS value is stored in %keysym_to_ucs and the comment
# text (without the leading '#') in %keysym_to_keysymname, both keyed
# by the numeric keysym.  Comment-only and blank lines are skipped;
# anything else aborts the run.
open(my $map, '<', 'keysym.map') or die ("Can't open map file:\n$!\n");
while (my $line = <$map>) {
    # comment and blank lines carry no mapping
    next if $line =~ /^\s*\#/ || $line =~ /^\s*$/;

    if ($line =~ /^0x([0-9a-f]{4})\s+U([0-9a-f]{4})\s*(\#.*)?$/) {
        my $keysym = hex($1);
        my $ucs = hex($2);
        # The trailing comment is optional; fall back to an empty name
        # rather than running s/// on undef and storing undef.
        my $comment = defined($3) ? $3 : '';
        $comment =~ s/^#\s*//;
        $keysym_to_ucs{$keysym} = $ucs;
        $keysym_to_keysymname{$keysym} = $comment;
    } else {
        die("Syntax error in 'list' in line\n$line\n");
    }
}
close($map);

# read entries in keysymdef.h
#
# Every "#define XK_<name> 0x<hex>" line that is not tagged with a
# "/* deprecated */" comment sets the name recorded for that keysym in
# %keysym_to_keysymname, replacing any name stored for it earlier.
# Lines of any other shape are ignored.
open(my $defs, '<', '/usr/include/X11/keysymdef.h')
    or die ("Can't open keysymdef.h:\n$!\n");
while (my $line = <$defs>) {
    if ($line =~ /^\#define\s+XK_([A-Za-z_0-9]+)\s+0x([0-9a-fA-F]+)\s*(\/.*)?$/) {
        # Save the captures before running another match, so they do
        # not depend on that match leaving $1 and $2 untouched.
        my $keysymname = $1;
        my $keysym = hex($2);
        next if $line =~ /\/\* deprecated \*\//;
        $keysym_to_keysymname{$keysym} = $keysymname;
    }
}
close($defs);

# Emit the fixed preamble of the generated C source on standard output:
# the banner comment, the preprocessor section that defines VISIBLE
# (empty, after including keysym2ucs.h, when KEYSYM2UCS_INCLUDED is not
# defined; "static" otherwise), and the opening of the keysymtab[]
# initializer.  The heredoc interpolates, which is why the literal '$'
# and '@' characters in the text are backslash-escaped.
print <<EOT;
/* \$XTermId\$
 * This module converts keysym values into the corresponding ISO 10646
 * (UCS, Unicode) values.
 *
 * The array keysymtab[] contains pairs of X11 keysym values for graphical
 * characters and the corresponding Unicode value. The function
 * keysym2ucs() maps a keysym onto a Unicode value using a binary search,
 * therefore keysymtab[] must remain SORTED by keysym value.
 *
 * The keysym -> UTF-8 conversion will hopefully one day be provided
 * by Xlib via XmbLookupString() and should ideally not have to be
 * done in X applications. But we are not there yet.
 *
 * We allow to represent any UCS character in the range U-00000000 to
 * U-00FFFFFF by a keysym value in the range 0x01000000 to 0x01ffffff.
 * This admittedly does not cover the entire 31-bit space of UCS, but
 * it does cover all of the characters up to U-10FFFF, which can be
 * represented by UTF-16, and more, and it is very unlikely that higher
 * UCS codes will ever be assigned by ISO. So to get Unicode character
 * U+ABCD you can directly use keysym 0x0100abcd.
 *
 * NOTE: The comments in the table below contain the actual character
 * encoded in UTF-8, so for viewing and editing best use an editor in
 * UTF-8 mode.
 *
 * Author: Markus G. Kuhn <mkuhn\@acm.org>, University of Cambridge, April 2001
 *
 * Special thanks to Richard Verhoeven <river\@win.tue.nl> for preparing
 * an initial draft of the mapping table.
 *
 * This software is in the public domain. Share and enjoy!
 *
 * AUTOMATICALLY GENERATED FILE, DO NOT EDIT !!! (unicode/convmap.pl)
 */

#ifndef KEYSYM2UCS_INCLUDED
  
#include "keysym2ucs.h"
#define VISIBLE /* */

#else

#define VISIBLE static

#endif

static struct codepair {
  unsigned short keysym;
  unsigned short ucs;
} keysymtab[] = {
EOT

# Emit the body of keysymtab[]: one line per known keysym in ascending
# numeric order, restricted to keysyms in the range 0x0100 .. 0xefff.
# A keysym with a non-zero UCS mapping becomes a table entry whose
# trailing comment shows the keysym name, the character itself in
# UTF-8 and its Unicode name ("???" if the name is unknown).  A keysym
# without such a mapping is written as a comment line only, so it does
# not become part of the C table.
my @table_keysyms = grep { $_ >= 0x100 && $_ < 0xf000 }
                    sort { $a <=> $b } keys(%keysym_to_keysymname);

foreach my $sym (@table_keysyms) {
    my $symname = $keysym_to_keysymname{$sym};
    my $code    = $keysym_to_ucs{$sym};

    unless ($code) {
        printf("/*  0x%04x   %39s ? ??? */\n", $sym, $symname);
        next;
    }

    my $ucsname = defined($name{$code}) ? $name{$code} : "???";
    printf("  { 0x%04x, 0x%04x }, /*%28s %s %s */\n",
           $sym, $code, $symname, utf8($code), $ucsname);
}

# Emit the fixed tail of the generated C source: the closing brace of
# the keysymtab[] initializer followed by the keysym2ucs() function.
# As written in the text below, that function returns Latin-1 keysyms
# (0x20..0x7e and 0xa0..0xff) unchanged, returns the low 24 bits of
# keysyms whose top byte is 0x01, otherwise looks the keysym up in
# keysymtab[] by binary search, and returns -1 when nothing matches.
print <<EOT;
};

VISIBLE
long keysym2ucs(KeySym keysym)
{
    int min = 0;
    int max = sizeof(keysymtab) / sizeof(struct codepair) - 1;
    int mid;

    /* first check for Latin-1 characters (1:1 mapping) */
    if ((keysym >= 0x0020 && keysym <= 0x007e) ||
        (keysym >= 0x00a0 && keysym <= 0x00ff))
        return keysym;

    /* also check for directly encoded 24-bit UCS characters */
    if ((keysym & 0xff000000) == 0x01000000)
	return keysym & 0x00ffffff;

    /* binary search in table */
    while (max >= min) {
	mid = (min + max) / 2;
	if (keysymtab[mid].keysym < keysym)
	    min = mid + 1;
	else if (keysymtab[mid].keysym > keysym)
	    max = mid - 1;
	else {
	    /* found it */
	    return keysymtab[mid].ucs;
	}
    }

    /* no matching Unicode value found */
    return -1;
}
EOT