diff options
Diffstat (limited to 'charclass.c')
-rw-r--r-- | charclass.c | 161 |
1 files changed, 161 insertions, 0 deletions
diff --git a/charclass.c b/charclass.c new file mode 100644 index 0000000..675ef17 --- /dev/null +++ b/charclass.c @@ -0,0 +1,161 @@ +/* $XTermId: charclass.c,v 1.22 2009/11/05 23:46:15 tom Exp $ */ + +/* + * Compact and efficient reimplementation of the + * xterm character class mechanism for large character sets + * + * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03 + * + * Xterm allows users to select entire words with a double-click on the left + * mouse button. Opinions might differ on what type of characters are part of + * separate words, therefore xterm allows users to configure a class code for + * each 8-bit character. Words are maximum length sequences of neighboring + * characters with identical class code. Extending this mechanism to Unicode + * naively would create an at least 2^16 entries (128 kB) long class code + * table. + * + * Instead, we transform the character class table into a list of intervals, + * that will be accessed via a linear search. Changes made to the table by the + * user will be appended. A special class code IDENT (default) marks + * characters who have their code number as the class code. + * + * We could alternatively use a sorted table of non-overlapping intervals that + * can be accessed via binary search, but merging in new intervals is + * significantly more hassle and not worth the effort here. + */ + +#include <xterm.h> +#include <charclass.h> + +#if OPT_WIDE_CHARS + +static struct classentry { + int cclass; + int first; + int last; +} *classtab; + +/* + * Special convention for classtab[0]: + * - classtab[0].cclass is the allocated number of entries in classtab + * - classtab[0].first = 1 (first used entry in classtab) + * - classtab[0].last is the last used entry in classtab + */ + +int +SetCharacterClassRange(int low, int high, int value) +{ + if (high < low) + return -1; /* nothing to do */ + + /* make sure we have at least one free entry left at table end */ + if (classtab[0].last > classtab[0].cclass - 2) { + classtab[0].cclass += 5 + classtab[0].cclass / 4; + classtab = TypeRealloc(struct classentry, + (unsigned) classtab[0].cclass, classtab); + if (!classtab) + abort(); + } + + /* simply append new interval to end of interval array */ + classtab[0].last++; + classtab[classtab[0].last].first = low; + classtab[classtab[0].last].last = high; + classtab[classtab[0].last].cclass = value; + + return 0; +} + +typedef enum { + IDENT = -1, + ALNUM = 48, + CNTRL = 1, + BLANK = 32 +} Classes; + +void +init_classtab(void) +{ + const int size = 50; + + classtab = TypeMallocN(struct classentry, (unsigned) size); + if (!classtab) + abort(); + classtab[0].cclass = size; + classtab[0].first = 1; + classtab[0].last = 0; + + /* old xterm default classes */ + SetCharacterClassRange(0, 0, BLANK); + SetCharacterClassRange(1, 31, CNTRL); + SetCharacterClassRange('\t', '\t', BLANK); + SetCharacterClassRange('0', '9', ALNUM); + SetCharacterClassRange('A', 'Z', ALNUM); + SetCharacterClassRange('_', '_', ALNUM); + SetCharacterClassRange('a', 'z', ALNUM); + SetCharacterClassRange(127, 159, CNTRL); + SetCharacterClassRange(160, 191, IDENT); + SetCharacterClassRange(192, 255, ALNUM); + SetCharacterClassRange(215, 215, IDENT); + SetCharacterClassRange(247, 247, IDENT); + + /* added Unicode classes */ + SetCharacterClassRange(0x0100, 0xffdf, ALNUM); /* mostly characters */ + SetCharacterClassRange(0x037e, 0x037e, IDENT); /* Greek question mark */ + SetCharacterClassRange(0x0387, 0x0387, IDENT); /* Greek ano teleia */ + SetCharacterClassRange(0x055a, 0x055f, IDENT); /* Armenian punctuation */ + SetCharacterClassRange(0x0589, 0x0589, IDENT); /* Armenian full stop */ + SetCharacterClassRange(0x0700, 0x070d, IDENT); /* Syriac punctuation */ + SetCharacterClassRange(0x104a, 0x104f, IDENT); /* Myanmar punctuation */ + SetCharacterClassRange(0x10fb, 0x10fb, IDENT); /* Georgian punctuation */ + SetCharacterClassRange(0x1361, 0x1368, IDENT); /* Ethiopic punctuation */ + SetCharacterClassRange(0x166d, 0x166e, IDENT); /* Canadian Syl. punctuation */ + SetCharacterClassRange(0x17d4, 0x17dc, IDENT); /* Khmer punctuation */ + SetCharacterClassRange(0x1800, 0x180a, IDENT); /* Mongolian punctuation */ + SetCharacterClassRange(0x2000, 0x200a, BLANK); /* spaces */ + SetCharacterClassRange(0x200b, 0x27ff, IDENT); /* punctuation and symbols */ + SetCharacterClassRange(0x2070, 0x207f, 0x2070); /* superscript */ + SetCharacterClassRange(0x2080, 0x208f, 0x2080); /* subscript */ + SetCharacterClassRange(0x3000, 0x3000, BLANK); /* ideographic space */ + SetCharacterClassRange(0x3001, 0x3020, IDENT); /* ideographic punctuation */ + SetCharacterClassRange(0x3040, 0x309f, 0x3040); /* Hiragana */ + SetCharacterClassRange(0x30a0, 0x30ff, 0x30a0); /* Katakana */ + SetCharacterClassRange(0x3300, 0x9fff, 0x4e00); /* CJK Ideographs */ + SetCharacterClassRange(0xac00, 0xd7a3, 0xac00); /* Hangul Syllables */ + SetCharacterClassRange(0xf900, 0xfaff, 0x4e00); /* CJK Ideographs */ + SetCharacterClassRange(0xfe30, 0xfe6b, IDENT); /* punctuation forms */ + SetCharacterClassRange(0xff00, 0xff0f, IDENT); /* half/fullwidth ASCII */ + SetCharacterClassRange(0xff1a, 0xff20, IDENT); /* half/fullwidth ASCII */ + SetCharacterClassRange(0xff3b, 0xff40, IDENT); /* half/fullwidth ASCII */ + SetCharacterClassRange(0xff5b, 0xff64, IDENT); /* half/fullwidth ASCII */ + + return; +} + +int +CharacterClass(int c) +{ + int i, cclass = IDENT; + + for (i = classtab[0].first; i <= classtab[0].last; i++) + if (classtab[i].first <= c && classtab[i].last >= c) + cclass = classtab[i].cclass; + + if (cclass < 0) + cclass = c; + + return cclass; +} + +#ifdef NO_LEAKS +void +noleaks_CharacterClass(void) +{ + if (classtab != 0) { + free(classtab); + classtab = 0; + } +} +#endif + +#endif /* OPT_WIDE_CHARS */ |