diff options
Diffstat (limited to 'unicode/convmap.pl')
-rwxr-xr-x | unicode/convmap.pl | 206 |
1 files changed, 206 insertions, 0 deletions
#!/usr/bin/perl -w
# $XTermId: convmap.pl,v 1.13 2007/06/11 23:30:44 tom Exp $
#
# Generate keysym2ucs.c file
#
# See also:
# http://mail.nl.linux.org/linux-utf8/2001-04/msg00248.html
#
# $XFree86: xc/programs/xterm/unicode/convmap.pl,v 1.5 2000/01/24 22:22:05 dawes Exp $
#
# Provenance: recovered from the diff
#   diff --git a/unicode/convmap.pl b/unicode/convmap.pl
#   new file mode 100755, index 0000000..a473491, @@ -0,0 +1,206 @@
#
# Inputs (fixed paths, read in this order):
#   UnicodeData-Latest.txt         character names, keyed by code point
#   keysym.map                     lines of the form "0xKKKK UXXXX # name"
#   /usr/include/X11/keysymdef.h   "#define XK_name 0x..." entries
#
# Output: the C source of keysym2ucs.c, written to standard output.

use strict;
use warnings;    # "-w" above only takes effect when it is the file's first line

our %name;                     # UCS code point -> Unicode character name
our %keysym_to_ucs;            # keysym value   -> UCS code point
our %keysym_to_keysymname;     # keysym value   -> keysym name

sub utf8 ($);

# Encode the UCS code point $c as a UTF-8 byte string.
#
# Uses the original 1..6 byte UTF-8 scheme, so values up to 0x7fffffff are
# encodable.  Anything at or above 0x80000000 is replaced by the encoding
# of U+FFFD (REPLACEMENT CHARACTER).
sub utf8 ($) {
    my $c = shift(@_);

    if ($c < 0x80) {
        return sprintf("%c", $c);
    } elsif ($c < 0x800) {
        return sprintf("%c%c",
                       0xc0 |  ($c >>  6),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x10000) {
        return sprintf("%c%c%c",
                       0xe0 |  ($c >> 12),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x200000) {
        return sprintf("%c%c%c%c",
                       0xf0 |  ($c >> 18),
                       0x80 | (($c >> 12) & 0x3f),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x4000000) {
        return sprintf("%c%c%c%c%c",
                       0xf8 |  ($c >> 24),
                       0x80 | (($c >> 18) & 0x3f),
                       0x80 | (($c >> 12) & 0x3f),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x80000000) {
        # The six-byte lead byte is 1111110x, i.e. 0xfc; 0xfe is never a
        # valid UTF-8 byte.
        return sprintf("%c%c%c%c%c%c",
                       0xfc |  ($c >> 30),
                       0x80 | (($c >> 24) & 0x3f),
                       0x80 | (($c >> 18) & 0x3f),
                       0x80 | (($c >> 12) & 0x3f),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } else {
        return utf8(0xfffd);
    }
}

# Fill %name from the UnicodeData file at $unicodedata.
# Dies if the file cannot be opened or a line does not have 15 fields.
sub read_unicode_names {
    my ($unicodedata) = @_;

    open(my $udata, '<', $unicodedata)
        or die("Can't open Unicode database '$unicodedata':\n$!\n\n" .
               "Please make sure that you have downloaded the file\n" .
               "ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt\n");
    while (my $line = <$udata>) {
        # Field 1 is the code point, field 2 the name; the remaining 13
        # fields are only checked for presence.
        if ($line =~ /^([0-9A-F]{4,6});([^;]*)(?:;[^;]*){13}$/) {
            $name{hex($1)} = $2;
        } else {
            die("Syntax error in line '$line' in file '$unicodedata'");
        }
    }
    close($udata);
    return;
}

# Fill %keysym_to_ucs (and provisional names in %keysym_to_keysymname)
# from the mapping file at $mapfile.
# (mapping originally from http://wsinwp07.win.tue.nl:1234/unicode/keysym.map)
sub read_keysym_map {
    my ($mapfile) = @_;

    open(my $list, '<', $mapfile) or die("Can't open map file:\n$!\n");
    while (my $line = <$list>) {
        if ($line =~ /^0x([0-9a-f]{4})\s+U([0-9a-f]{4})\s*(\#.*)?$/) {
            my $keysym = hex($1);
            my $ucs    = hex($2);
            # The trailing "# name" part is optional; default to an empty
            # name so nothing undefined reaches the substitution or printf.
            my $comment = defined($3) ? $3 : '';
            $comment =~ s/^#\s*//;
            $keysym_to_ucs{$keysym}        = $ucs;
            $keysym_to_keysymname{$keysym} = $comment;
        } elsif ($line =~ /^\s*\#/ || $line =~ /^\s*$/) {
            # comment-only or blank line: nothing to record
        } else {
            die("Syntax error in 'list' in line\n$line\n");
        }
    }
    close($list);
    return;
}

# Record the XK_ name of every non-deprecated #define in the header at
# $header, overriding any name taken from the mapping file.
sub read_keysymdef {
    my ($header) = @_;

    open(my $list, '<', $header) or die("Can't open keysymdef.h:\n$!\n");
    while (my $line = <$list>) {
        if ($line =~ /^\#define\s+XK_([A-Za-z_0-9]+)\s+0x([0-9a-fA-F]+)\s*(?:\/.*)?$/) {
            # Copy the captures before running another match.
            my ($keysymname, $value) = ($1, $2);
            next if $line =~ /\/\* deprecated \*\//;
            $keysym_to_keysymname{hex($value)} = $keysymname;
        }
    }
    close($list);
    return;
}

# The header must be read after the map so that its names take precedence.
read_unicode_names("UnicodeData-Latest.txt");
read_keysym_map("keysym.map");
read_keysymdef("/usr/include/X11/keysymdef.h");

# "\$" and "\@" below keep Perl from interpolating inside the heredoc.
print <<EOT;
/* \$XTermId\$
 * This module converts keysym values into the corresponding ISO 10646
 * (UCS, Unicode) values.
 *
 * The array keysymtab[] contains pairs of X11 keysym values for graphical
 * characters and the corresponding Unicode value. The function
 * keysym2ucs() maps a keysym onto a Unicode value using a binary search,
 * therefore keysymtab[] must remain SORTED by keysym value.
 *
 * The keysym -> UTF-8 conversion will hopefully one day be provided
 * by Xlib via XmbLookupString() and should ideally not have to be
 * done in X applications. But we are not there yet.
 *
 * We allow to represent any UCS character in the range U-00000000 to
 * U-00FFFFFF by a keysym value in the range 0x01000000 to 0x01ffffff.
 * This admittedly does not cover the entire 31-bit space of UCS, but
 * it does cover all of the characters up to U-10FFFF, which can be
 * represented by UTF-16, and more, and it is very unlikely that higher
 * UCS codes will ever be assigned by ISO. So to get Unicode character
 * U+ABCD you can directly use keysym 0x0100abcd.
 *
 * NOTE: The comments in the table below contain the actual character
 * encoded in UTF-8, so for viewing and editing best use an editor in
 * UTF-8 mode.
 *
 * Author: Markus G. Kuhn <mkuhn\@acm.org>, University of Cambridge, April 2001
 *
 * Special thanks to Richard Verhoeven <river\@win.tue.nl> for preparing
 * an initial draft of the mapping table.
 *
 * This software is in the public domain. Share and enjoy!
 *
 * AUTOMATICALLY GENERATED FILE, DO NOT EDIT !!! (unicode/convmap.pl)
 */

#ifndef KEYSYM2UCS_INCLUDED

#include "keysym2ucs.h"
#define VISIBLE /* */

#else

#define VISIBLE static

#endif

static struct codepair {
    unsigned short keysym;
    unsigned short ucs;
} keysymtab[] = {
EOT

# Emit the table in ascending keysym order; the generated binary search
# depends on it.
for my $keysym (sort { $a <=> $b } keys(%keysym_to_keysymname)) {
    # Keysyms below 0x100 and from 0xf000 upwards are left out of the table.
    next if $keysym >= 0xf000 || $keysym < 0x100;

    my $ucs = $keysym_to_ucs{$keysym};
    if ($ucs) {
        printf(" { 0x%04x, 0x%04x }, /*%28s %s %s */\n",
               $keysym, $ucs, $keysym_to_keysymname{$keysym}, utf8($ucs),
               defined($name{$ucs}) ? $name{$ucs} : "???");
    } else {
        # No mapping (or a mapping to U0000): emit the entry as a comment.
        printf("/* 0x%04x %39s ? ??? */\n",
               $keysym, $keysym_to_keysymname{$keysym});
    }
}

print <<EOT;
};

VISIBLE
long keysym2ucs(KeySym keysym)
{
    int min = 0;
    int max = sizeof(keysymtab) / sizeof(struct codepair) - 1;
    int mid;

    /* first check for Latin-1 characters (1:1 mapping) */
    if ((keysym >= 0x0020 && keysym <= 0x007e) ||
        (keysym >= 0x00a0 && keysym <= 0x00ff))
        return keysym;

    /* also check for directly encoded 24-bit UCS characters */
    if ((keysym & 0xff000000) == 0x01000000)
        return keysym & 0x00ffffff;

    /* binary search in table */
    while (max >= min) {
        mid = (min + max) / 2;
        if (keysymtab[mid].keysym < keysym)
            min = mid + 1;
        else if (keysymtab[mid].keysym > keysym)
            max = mid - 1;
        else {
            /* found it */
            return keysymtab[mid].ucs;
        }
    }

    /* no matching Unicode value found */
    return -1;
}
EOT