summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2014-06-06 11:29:38 -0700
committerJunio C Hamano <gitster@pobox.com>2014-06-06 11:29:38 -0700
commit334d40e951fa3b3961135b3183633706d976c4bd (patch)
tree445e33f7e58e9e7e9b30be0952b6bf493ac0931c
parenta0460132a740d8ff0c08dcbd54520f1b795298b9 (diff)
parent9c94389c3ee02df891100b894c1790a524268d91 (diff)
downloadgit-334d40e951fa3b3961135b3183633706d976c4bd.tar.gz
Merge branch 'tb/unicode-6.3-zero-width'
Update the logic to compute the display width needed for utf8 strings and allow us to more easily maintain the tables used in that logic. We may want to let the users choose if codepoints with ambiguous widths are treated as a double or single width in a follow-up patch. * tb/unicode-6.3-zero-width: utf8: make it easier to auto-update git_wcwidth() utf8.c: use a table for double_width
-rw-r--r--.gitignore1
-rw-r--r--Makefile1
-rw-r--r--unicode_width.h288
-rwxr-xr-xupdate_unicode.sh37
-rw-r--r--utf8.c76
5 files changed, 334 insertions, 69 deletions
diff --git a/.gitignore b/.gitignore
index dc600f9b36..42294e59a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -226,6 +226,7 @@
/config.mak.autogen
/config.mak.append
/configure
+/unicode
/tags
/TAGS
/cscope*
diff --git a/Makefile b/Makefile
index 08fc9ca430..07ea105837 100644
--- a/Makefile
+++ b/Makefile
@@ -729,6 +729,7 @@ LIB_H += transport.h
LIB_H += tree-walk.h
LIB_H += tree.h
LIB_H += unpack-trees.h
+LIB_H += unicode_width.h
LIB_H += url.h
LIB_H += urlmatch.h
LIB_H += userdiff.h
diff --git a/unicode_width.h b/unicode_width.h
new file mode 100644
index 0000000000..4db78038e4
--- /dev/null
+++ b/unicode_width.h
@@ -0,0 +1,288 @@
+static const struct interval zero_width[] = { /* sorted, non-overlapping code point ranges (ends inclusive, see the single-code-point rows) for which git_wcwidth() returns 0 */
+{ 0x0300, 0x036F },
+{ 0x0483, 0x0489 },
+{ 0x0591, 0x05BD },
+{ 0x05BF, 0x05BF },
+{ 0x05C1, 0x05C2 },
+{ 0x05C4, 0x05C5 },
+{ 0x05C7, 0x05C7 },
+{ 0x0600, 0x0604 },
+{ 0x0610, 0x061A },
+{ 0x061C, 0x061C },
+{ 0x064B, 0x065F },
+{ 0x0670, 0x0670 },
+{ 0x06D6, 0x06DD },
+{ 0x06DF, 0x06E4 },
+{ 0x06E7, 0x06E8 },
+{ 0x06EA, 0x06ED },
+{ 0x070F, 0x070F },
+{ 0x0711, 0x0711 },
+{ 0x0730, 0x074A },
+{ 0x07A6, 0x07B0 },
+{ 0x07EB, 0x07F3 },
+{ 0x0816, 0x0819 },
+{ 0x081B, 0x0823 },
+{ 0x0825, 0x0827 },
+{ 0x0829, 0x082D },
+{ 0x0859, 0x085B },
+{ 0x08E4, 0x08FE },
+{ 0x0900, 0x0902 },
+{ 0x093A, 0x093A },
+{ 0x093C, 0x093C },
+{ 0x0941, 0x0948 },
+{ 0x094D, 0x094D },
+{ 0x0951, 0x0957 },
+{ 0x0962, 0x0963 },
+{ 0x0981, 0x0981 },
+{ 0x09BC, 0x09BC },
+{ 0x09C1, 0x09C4 },
+{ 0x09CD, 0x09CD },
+{ 0x09E2, 0x09E3 },
+{ 0x0A01, 0x0A02 },
+{ 0x0A3C, 0x0A3C },
+{ 0x0A41, 0x0A42 },
+{ 0x0A47, 0x0A48 },
+{ 0x0A4B, 0x0A4D },
+{ 0x0A51, 0x0A51 },
+{ 0x0A70, 0x0A71 },
+{ 0x0A75, 0x0A75 },
+{ 0x0A81, 0x0A82 },
+{ 0x0ABC, 0x0ABC },
+{ 0x0AC1, 0x0AC5 },
+{ 0x0AC7, 0x0AC8 },
+{ 0x0ACD, 0x0ACD },
+{ 0x0AE2, 0x0AE3 },
+{ 0x0B01, 0x0B01 },
+{ 0x0B3C, 0x0B3C },
+{ 0x0B3F, 0x0B3F },
+{ 0x0B41, 0x0B44 },
+{ 0x0B4D, 0x0B4D },
+{ 0x0B56, 0x0B56 },
+{ 0x0B62, 0x0B63 },
+{ 0x0B82, 0x0B82 },
+{ 0x0BC0, 0x0BC0 },
+{ 0x0BCD, 0x0BCD },
+{ 0x0C3E, 0x0C40 },
+{ 0x0C46, 0x0C48 },
+{ 0x0C4A, 0x0C4D },
+{ 0x0C55, 0x0C56 },
+{ 0x0C62, 0x0C63 },
+{ 0x0CBC, 0x0CBC },
+{ 0x0CBF, 0x0CBF },
+{ 0x0CC6, 0x0CC6 },
+{ 0x0CCC, 0x0CCD },
+{ 0x0CE2, 0x0CE3 },
+{ 0x0D41, 0x0D44 },
+{ 0x0D4D, 0x0D4D },
+{ 0x0D62, 0x0D63 },
+{ 0x0DCA, 0x0DCA },
+{ 0x0DD2, 0x0DD4 },
+{ 0x0DD6, 0x0DD6 },
+{ 0x0E31, 0x0E31 },
+{ 0x0E34, 0x0E3A },
+{ 0x0E47, 0x0E4E },
+{ 0x0EB1, 0x0EB1 },
+{ 0x0EB4, 0x0EB9 },
+{ 0x0EBB, 0x0EBC },
+{ 0x0EC8, 0x0ECD },
+{ 0x0F18, 0x0F19 },
+{ 0x0F35, 0x0F35 },
+{ 0x0F37, 0x0F37 },
+{ 0x0F39, 0x0F39 },
+{ 0x0F71, 0x0F7E },
+{ 0x0F80, 0x0F84 },
+{ 0x0F86, 0x0F87 },
+{ 0x0F8D, 0x0F97 },
+{ 0x0F99, 0x0FBC },
+{ 0x0FC6, 0x0FC6 },
+{ 0x102D, 0x1030 },
+{ 0x1032, 0x1037 },
+{ 0x1039, 0x103A },
+{ 0x103D, 0x103E },
+{ 0x1058, 0x1059 },
+{ 0x105E, 0x1060 },
+{ 0x1071, 0x1074 },
+{ 0x1082, 0x1082 },
+{ 0x1085, 0x1086 },
+{ 0x108D, 0x108D },
+{ 0x109D, 0x109D },
+{ 0x1160, 0x11FF },
+{ 0x135D, 0x135F },
+{ 0x1712, 0x1714 },
+{ 0x1732, 0x1734 },
+{ 0x1752, 0x1753 },
+{ 0x1772, 0x1773 },
+{ 0x17B4, 0x17B5 },
+{ 0x17B7, 0x17BD },
+{ 0x17C6, 0x17C6 },
+{ 0x17C9, 0x17D3 },
+{ 0x17DD, 0x17DD },
+{ 0x180B, 0x180E },
+{ 0x18A9, 0x18A9 },
+{ 0x1920, 0x1922 },
+{ 0x1927, 0x1928 },
+{ 0x1932, 0x1932 },
+{ 0x1939, 0x193B },
+{ 0x1A17, 0x1A18 },
+{ 0x1A1B, 0x1A1B },
+{ 0x1A56, 0x1A56 },
+{ 0x1A58, 0x1A5E },
+{ 0x1A60, 0x1A60 },
+{ 0x1A62, 0x1A62 },
+{ 0x1A65, 0x1A6C },
+{ 0x1A73, 0x1A7C },
+{ 0x1A7F, 0x1A7F },
+{ 0x1B00, 0x1B03 },
+{ 0x1B34, 0x1B34 },
+{ 0x1B36, 0x1B3A },
+{ 0x1B3C, 0x1B3C },
+{ 0x1B42, 0x1B42 },
+{ 0x1B6B, 0x1B73 },
+{ 0x1B80, 0x1B81 },
+{ 0x1BA2, 0x1BA5 },
+{ 0x1BA8, 0x1BA9 },
+{ 0x1BAB, 0x1BAB },
+{ 0x1BE6, 0x1BE6 },
+{ 0x1BE8, 0x1BE9 },
+{ 0x1BED, 0x1BED },
+{ 0x1BEF, 0x1BF1 },
+{ 0x1C2C, 0x1C33 },
+{ 0x1C36, 0x1C37 },
+{ 0x1CD0, 0x1CD2 },
+{ 0x1CD4, 0x1CE0 },
+{ 0x1CE2, 0x1CE8 },
+{ 0x1CED, 0x1CED },
+{ 0x1CF4, 0x1CF4 },
+{ 0x1DC0, 0x1DE6 },
+{ 0x1DFC, 0x1DFF },
+{ 0x200B, 0x200F },
+{ 0x202A, 0x202E },
+{ 0x2060, 0x2064 },
+{ 0x2066, 0x206F },
+{ 0x20D0, 0x20F0 },
+{ 0x2CEF, 0x2CF1 },
+{ 0x2D7F, 0x2D7F },
+{ 0x2DE0, 0x2DFF },
+{ 0x302A, 0x302D },
+{ 0x3099, 0x309A },
+{ 0xA66F, 0xA672 },
+{ 0xA674, 0xA67D },
+{ 0xA69F, 0xA69F },
+{ 0xA6F0, 0xA6F1 },
+{ 0xA802, 0xA802 },
+{ 0xA806, 0xA806 },
+{ 0xA80B, 0xA80B },
+{ 0xA825, 0xA826 },
+{ 0xA8C4, 0xA8C4 },
+{ 0xA8E0, 0xA8F1 },
+{ 0xA926, 0xA92D },
+{ 0xA947, 0xA951 },
+{ 0xA980, 0xA982 },
+{ 0xA9B3, 0xA9B3 },
+{ 0xA9B6, 0xA9B9 },
+{ 0xA9BC, 0xA9BC },
+{ 0xAA29, 0xAA2E },
+{ 0xAA31, 0xAA32 },
+{ 0xAA35, 0xAA36 },
+{ 0xAA43, 0xAA43 },
+{ 0xAA4C, 0xAA4C },
+{ 0xAAB0, 0xAAB0 },
+{ 0xAAB2, 0xAAB4 },
+{ 0xAAB7, 0xAAB8 },
+{ 0xAABE, 0xAABF },
+{ 0xAAC1, 0xAAC1 },
+{ 0xAAEC, 0xAAED },
+{ 0xAAF6, 0xAAF6 },
+{ 0xABE5, 0xABE5 },
+{ 0xABE8, 0xABE8 },
+{ 0xABED, 0xABED },
+{ 0xFB1E, 0xFB1E },
+{ 0xFE00, 0xFE0F },
+{ 0xFE20, 0xFE26 },
+{ 0xFEFF, 0xFEFF },
+{ 0xFFF9, 0xFFFB },
+{ 0x101FD, 0x101FD },
+{ 0x10A01, 0x10A03 },
+{ 0x10A05, 0x10A06 },
+{ 0x10A0C, 0x10A0F },
+{ 0x10A38, 0x10A3A },
+{ 0x10A3F, 0x10A3F },
+{ 0x11001, 0x11001 },
+{ 0x11038, 0x11046 },
+{ 0x11080, 0x11081 },
+{ 0x110B3, 0x110B6 },
+{ 0x110B9, 0x110BA },
+{ 0x110BD, 0x110BD },
+{ 0x11100, 0x11102 },
+{ 0x11127, 0x1112B },
+{ 0x1112D, 0x11134 },
+{ 0x11180, 0x11181 },
+{ 0x111B6, 0x111BE },
+{ 0x116AB, 0x116AB },
+{ 0x116AD, 0x116AD },
+{ 0x116B0, 0x116B5 },
+{ 0x116B7, 0x116B7 },
+{ 0x16F8F, 0x16F92 },
+{ 0x1D167, 0x1D169 },
+{ 0x1D173, 0x1D182 },
+{ 0x1D185, 0x1D18B },
+{ 0x1D1AA, 0x1D1AD },
+{ 0x1D242, 0x1D244 },
+{ 0xE0001, 0xE0001 },
+{ 0xE0020, 0xE007F },
+{ 0xE0100, 0xE01EF }
+}; /* generated by update_unicode.sh: uniset cat:Me,Mn,Cf + U+1160..U+11FF - U+00AD, with uniset's "plane" rows filtered out */
+/*
+ * Sorted, non-overlapping code point ranges (both ends inclusive) for which
+ * git_wcwidth() returns 2; generated by update_unicode.sh from the East Asian
+ * Width classes F and W ("uniset --32 eaw:F,W").
+ *
+ * The table is looked up with bisearch(), a binary search, so it must hold
+ * nothing but code point ranges in ascending order.  The "plane" offset rows
+ * that uniset prints ahead of the ranges are not code point ranges (they made
+ * the table unsorted and added overlapping { 0x0, 0x0 } rows), so they are
+ * intentionally not part of this table.
+ */
+static const struct interval double_width[] = {
+{ 0x1100, 0x115F },
+{ 0x2329, 0x232A },
+{ 0x2E80, 0x2E99 },
+{ 0x2E9B, 0x2EF3 },
+{ 0x2F00, 0x2FD5 },
+{ 0x2FF0, 0x2FFB },
+{ 0x3000, 0x303E },
+{ 0x3041, 0x3096 },
+{ 0x3099, 0x30FF },
+{ 0x3105, 0x312D },
+{ 0x3131, 0x318E },
+{ 0x3190, 0x31BA },
+{ 0x31C0, 0x31E3 },
+{ 0x31F0, 0x321E },
+{ 0x3220, 0x3247 },
+{ 0x3250, 0x32FE },
+{ 0x3300, 0x4DBF },
+{ 0x4E00, 0xA48C },
+{ 0xA490, 0xA4C6 },
+{ 0xA960, 0xA97C },
+{ 0xAC00, 0xD7A3 },
+{ 0xF900, 0xFAFF },
+{ 0xFE10, 0xFE19 },
+{ 0xFE30, 0xFE52 },
+{ 0xFE54, 0xFE66 },
+{ 0xFE68, 0xFE6B },
+{ 0xFF01, 0xFF60 },
+{ 0xFFE0, 0xFFE6 },
+{ 0x1B000, 0x1B001 },
+{ 0x1F200, 0x1F202 },
+{ 0x1F210, 0x1F23A },
+{ 0x1F240, 0x1F248 },
+{ 0x1F250, 0x1F251 },
+{ 0x20000, 0x2FFFD },
+{ 0x30000, 0x3FFFD }
+};
diff --git a/update_unicode.sh b/update_unicode.sh
new file mode 100755
index 0000000000..000b937e68
--- /dev/null
+++ b/update_unicode.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+#
+# Regenerate unicode_width.h (the zero_width[] and double_width[] tables)
+# from the Unicode data files, using the "uniset" tool.  The downloaded data
+# files and the uniset checkout are kept in the ./unicode directory.
+#
+#See http://www.unicode.org/reports/tr44/
+#
+#Me Enclosing_Mark an enclosing combining mark
+#Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
+#Cf Format a format control character
+#
+UNICODEWIDTH_H=../unicode_width.h
+if ! test -d unicode; then
+ mkdir unicode
+fi &&
+( cd unicode &&
+ # Fetch the Unicode data files only when they are not already present.
+ if ! test -f UnicodeData.txt; then
+ wget http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
+ fi &&
+ if ! test -f EastAsianWidth.txt; then
+ wget http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+ fi &&
+ # Fetch and build uniset on first use.
+ if ! test -d uniset; then
+ git clone https://github.com/depp/uniset.git
+ fi &&
+ (
+ cd uniset &&
+ if ! test -x uniset; then
+ autoreconf -i &&
+ ./configure --enable-warnings=-Werror CFLAGS='-O0 -ggdb'
+ fi &&
+ make
+ ) &&
+ # uniset prints per-plane offset rows (marked "plane") ahead of the code
+ # point ranges.  They are not code point ranges, so strip them from both
+ # tables; the tables are binary-searched and must stay sorted.
+ echo "static const struct interval zero_width[] = {" >$UNICODEWIDTH_H &&
+ UNICODE_DIR=. ./uniset/uniset --32 cat:Me,Mn,Cf + U+1160..U+11FF - U+00AD |
+ grep -v plane >>$UNICODEWIDTH_H &&
+ echo "};" >>$UNICODEWIDTH_H &&
+ echo "static const struct interval double_width[] = {" >>$UNICODEWIDTH_H &&
+ UNICODE_DIR=. ./uniset/uniset --32 eaw:F,W |
+ grep -v plane >>$UNICODEWIDTH_H &&
+ echo "};" >>$UNICODEWIDTH_H
+)
diff --git a/utf8.c b/utf8.c
index 77c28d492c..b30790d043 100644
--- a/utf8.c
+++ b/utf8.c
@@ -80,52 +80,8 @@ static int git_wcwidth(ucs_char_t ch)
{
/*
* Sorted list of non-overlapping intervals of non-spacing characters,
- * generated by
- * "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c".
*/
- static const struct interval combining[] = {
- { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD },
- { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 },
- { 0x05C7, 0x05C7 }, { 0x0600, 0x0604 }, { 0x0610, 0x061A },
- { 0x064B, 0x065F }, { 0x0670, 0x0670 }, { 0x06D6, 0x06E4 },
- { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x070F, 0x070F },
- { 0x0711, 0x0711 }, { 0x0730, 0x074A }, { 0x07A6, 0x07B0 },
- { 0x0901, 0x0902 }, { 0x093C, 0x093C }, { 0x0941, 0x0948 },
- { 0x094D, 0x094D }, { 0x0951, 0x0954 }, { 0x0962, 0x0963 },
- { 0x0981, 0x0981 }, { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 },
- { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 },
- { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 },
- { 0x0A4B, 0x0A4D }, { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 },
- { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
- { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 },
- { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 },
- { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 },
- { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 },
- { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 },
- { 0x0CBC, 0x0CBC }, { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 },
- { 0x0CCC, 0x0CCD }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D },
- { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 },
- { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E },
- { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC },
- { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 },
- { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E },
- { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 },
- { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 },
- { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 },
- { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x1712, 0x1714 },
- { 0x1732, 0x1734 }, { 0x1752, 0x1753 }, { 0x1772, 0x1773 },
- { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD }, { 0x17C6, 0x17C6 },
- { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD }, { 0x180B, 0x180D },
- { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 }, { 0x1927, 0x1928 },
- { 0x1932, 0x1932 }, { 0x1939, 0x193B }, { 0x200B, 0x200F },
- { 0x202A, 0x202E }, { 0x2060, 0x2063 }, { 0x206A, 0x206F },
- { 0x20D0, 0x20EA }, { 0x302A, 0x302F }, { 0x3099, 0x309A },
- { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE23 },
- { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB }, { 0x1D167, 0x1D169 },
- { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B },
- { 0x1D1AA, 0x1D1AD }, { 0xE0001, 0xE0001 },
- { 0xE0020, 0xE007F }, { 0xE0100, 0xE01EF }
- };
+#include "unicode_width.h"
/* test for 8-bit control characters */
if (ch == 0)
@@ -134,34 +90,16 @@ static int git_wcwidth(ucs_char_t ch)
return -1;
/* binary search in table of non-spacing characters */
- if (bisearch(ch, combining, sizeof(combining)
+ if (bisearch(ch, zero_width, sizeof(zero_width)
/ sizeof(struct interval) - 1))
return 0;
- /*
- * If we arrive here, ch is neither a combining nor a C0/C1
- * control character.
- */
+ /* binary search in table of double width characters */
+ if (bisearch(ch, double_width, sizeof(double_width)
+ / sizeof(struct interval) - 1))
+ return 2;
- return 1 +
- (ch >= 0x1100 &&
- /* Hangul Jamo init. consonants */
- (ch <= 0x115f ||
- ch == 0x2329 || ch == 0x232a ||
- /* CJK ... Yi */
- (ch >= 0x2e80 && ch <= 0xa4cf &&
- ch != 0x303f) ||
- /* Hangul Syllables */
- (ch >= 0xac00 && ch <= 0xd7a3) ||
- /* CJK Compatibility Ideographs */
- (ch >= 0xf900 && ch <= 0xfaff) ||
- /* CJK Compatibility Forms */
- (ch >= 0xfe30 && ch <= 0xfe6f) ||
- /* Fullwidth Forms */
- (ch >= 0xff00 && ch <= 0xff60) ||
- (ch >= 0xffe0 && ch <= 0xffe6) ||
- (ch >= 0x20000 && ch <= 0x2fffd) ||
- (ch >= 0x30000 && ch <= 0x3fffd)));
+ return 1;
}
/*