summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaiki Ueno <ueno@gnu.org>2016-10-12 17:40:37 +0200
committerDaiki Ueno <dueno@redhat.com>2017-10-30 11:13:20 +0100
commited0bf5d5284b787af71b1b95fca1fdb6389fee3c (patch)
tree2825a6954b97ec150986e07b8b2ae32c45ea52b0
parentf466816e06ec3516567c3edcd0219bd1f9b736eb (diff)
downloadgnulib-ueno/unicode-9.0.0.tar.gz
libunistring: update to Unicode 9.0.0ueno/unicode-9.0.0
* lib/gen-uni-tables.c (fill_properties): Recognize Sentence_Terminal and Prepended_Concatenation_Mark. (is_property_default_ignorable_code_point): Exclude U+08E2. (fill_arabicshaping): Allow missing whitespace when parsing; recognize "AFRICAN FEH", "AFRICAN QAF", and "AFRICAN NOON". (output_blocks): Increase the element size of the level1 table to accommodate more blocks. (get_lbp): Recognize ZWJ, E_Base, and E_Modifier characters; Update each class according to the standard. (get_wbp): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and E_Base_GAZ characters. (output_gbp_table): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and E_Base_GAZ characters. * lib/unictype.in.h (UC_JOINING_GROUP_AFRICAN_FEH, UC_JOINING_GROUP_AFRICAN_QAF, UC_JOINING_GROUP_AFRICAN_NOON): New enum values. * lib/unilbrk/lbrktables.h (LBP_ZWJ, LBP_EB, LBP_EM): New enum values. * lib/unilbrk/lbrktables.c (unilbrk_table): Extend the table with LBP_ZWJ, LBP_EB, and LBP_EM. * lib/uniwbrk.in.h (WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, WBP_EBG): New enum values. * lib/uniwbrk/u-wordbreaks.h: Implement WB3c, WB15, and WB16. * lib/uniwbrk/wbrktable.h (uniwbrk_prop_index): New variable declaration. * lib/uniwbrk/wbrktable.c (uniwbrk_prop_index): New variable. (uniwbrk_table): Implement WB14. * tests/uniwbrk/test-uc-wordbreaks.c (wordbreakproperty_to_string): Check WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, and WBP_EBG. * modules/unigbrk/u{32,16,8}-grapheme-breaks: No longer depend on uc-is-grapheme-break. * modules/unigbrk/uc-grapheme-breaks: New module. * modules/unigbrk/uc-grapheme-breaks-tests: New module. * lib/unigbrk.in.h (GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, GBP_EBG): New enum values. (uc_grapheme_breaks): New function, replacing uc_is_grapheme_break. * lib/unigbrk/u-grapheme-breaks.h: New file. * lib/unigbrk/u{32,16,8}-grapheme-breaks.c: Rewrite using u-grapheme-breaks.h instead of uc_is_grapheme_break. * lib/unigbrk/uc-grapheme-breaks.c: New file. 
* lib/unigbrk/uc-is-grapheme-break.c: Partially update to TR29 rev 29. * tests/unigbrk/test-uc-gbrk-prop.c (graphemebreakproperty_to_string): Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG. * tests/unigbrk/test-uc-grapheme-breaks.c: New test. * tests/unigbrk/test-uc-is-grapheme-break.c (graphemebreakproperty_to_string): Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG. (main): Skip unsupported rules involving 3 or more characters, namely GB10, GB12, and GB13. * lib/uniwidth/width.c (nonspacing_table_data): Update.
-rw-r--r--lib/gen-uni-tables.c316
-rw-r--r--lib/unictype.in.h5
-rw-r--r--lib/unigbrk.in.h9
-rw-r--r--lib/unigbrk/u-grapheme-breaks.h122
-rw-r--r--lib/unigbrk/u16-grapheme-breaks.c26
-rw-r--r--lib/unigbrk/u32-grapheme-breaks.c24
-rw-r--r--lib/unigbrk/u8-grapheme-breaks.c27
-rw-r--r--lib/unigbrk/uc-gbrk-prop.c3
-rw-r--r--lib/unigbrk/uc-grapheme-breaks.c39
-rw-r--r--lib/unigbrk/uc-is-grapheme-break.c29
-rw-r--r--lib/unilbrk/lbrktables.c61
-rw-r--r--lib/unilbrk/lbrktables.h23
-rwxr-xr-xlib/uniname/gen-uninames.lisp2
-rw-r--r--lib/uniwbrk.in.h7
-rw-r--r--lib/uniwbrk/u-wordbreaks.h66
-rw-r--r--lib/uniwbrk/wbrktable.c66
-rw-r--r--lib/uniwbrk/wbrktable.h3
-rw-r--r--lib/uniwidth/width.c46
-rw-r--r--modules/unigbrk/u16-grapheme-breaks5
-rw-r--r--modules/unigbrk/u32-grapheme-breaks5
-rw-r--r--modules/unigbrk/u8-grapheme-breaks5
-rw-r--r--modules/unigbrk/uc-grapheme-breaks28
-rw-r--r--modules/unigbrk/uc-grapheme-breaks-tests14
-rw-r--r--tests/unigbrk/test-uc-gbrk-prop.c5
-rw-r--r--tests/unigbrk/test-uc-grapheme-breaks.c191
-rwxr-xr-xtests/unigbrk/test-uc-grapheme-breaks.sh3
-rw-r--r--tests/unigbrk/test-uc-is-grapheme-break.c44
-rw-r--r--tests/uniwbrk/test-uc-wordbreaks.c5
-rwxr-xr-xtests/uniwidth/test-uc_width2.sh58
29 files changed, 989 insertions, 248 deletions
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index acdafe25b9..14c71ee28e 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -32,7 +32,7 @@
/usr/local/share/Unidata/CompositionExclusions.txt \
/usr/local/share/Unidata/SpecialCasing.txt \
/usr/local/share/Unidata/CaseFolding.txt \
- 8.0.0
+ 9.0.0
*/
#include <assert.h>
@@ -2591,6 +2591,7 @@ enum
PROP_VARIATION_SELECTOR,
PROP_PATTERN_WHITE_SPACE,
PROP_PATTERN_SYNTAX,
+ PROP_PREPENDED_CONCATENATION_MARK,
/* DerivedCoreProperties.txt */
PROP_MATH,
PROP_ALPHABETIC,
@@ -2692,10 +2693,11 @@ fill_properties (const char *proplist_filename)
PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
PROP ("Other_ID_Start", PROP_OTHER_ID_START)
PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
- PROP ("STerm", PROP_STERM)
+ PROP ("Sentence_Terminal", PROP_STERM)
PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
+ PROP ("Prepended_Concatenation_Mark", PROP_PREPENDED_CONCATENATION_MARK)
/* DerivedCoreProperties.txt */
PROP ("Math", PROP_MATH)
PROP ("Alphabetic", PROP_ALPHABETIC)
@@ -2890,7 +2892,8 @@ is_property_default_ignorable_code_point (unsigned int ch)
&& !((ch >= 0x0600 && ch <= 0x0605) || ch == 0x06DD || ch == 0x070F)
/* For some reason, the following are not listed as having property
Default_Ignorable_Code_Point. */
- && !(ch == 0x110BD))
+ && !(ch == 0x110BD)
+ && !(ch == 0x8E2))
|| ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
|| ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
bool result2 =
@@ -3787,7 +3790,10 @@ enum
UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
- UC_JOINING_GROUP_MANICHAEAN_HUNDRED /* Manichaean_Hundred */
+ UC_JOINING_GROUP_MANICHAEAN_HUNDRED, /* Manichaean_Hundred */
+ UC_JOINING_GROUP_AFRICAN_FEH, /* African_Feh */
+ UC_JOINING_GROUP_AFRICAN_QAF, /* African_Qaf */
+ UC_JOINING_GROUP_AFRICAN_NOON /* African_Noon */
};
static uint8_t unicode_joining_group[0x110000];
@@ -3815,30 +3821,26 @@ fill_arabicshaping (const char *arabicshaping_filename)
lineno = 0;
for (;;)
{
- char buf[100+1];
- char separator1[100+1];
- char padding1[100+1];
- char schematic_name[100+1];
- char separator2[100+1];
- char padding2[100+1];
- char joining_type_name[100+1];
- char separator3[100+1];
- char padding3[100+1];
- char joining_group_name[100+1];
+ char buf[200+1];
+ char separator1[200+1];
+ char schematic_name[200+1];
+ char separator2[200+1];
+ char joining_type_name[200+1];
+ char separator3[200+1];
+ char joining_group_name[200+1];
int joining_type;
int joining_group;
lineno++;
- if (fscanf (stream, "%100[^\n]\n", buf) < 1)
+ if (fscanf (stream, "%200[^\n]\n", buf) < 1)
break;
if (buf[0] == '\0' || buf[0] == '#')
continue;
- if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]",
- &i, separator1, padding1, schematic_name, separator2,
- padding2, joining_type_name, separator3, padding3,
- joining_group_name) != 10)
+ if (sscanf (buf, "%X%[; ]%[^;]%[; ]%[^;]%[; ]%100[^\n]",
+ &i, separator1, schematic_name, separator2, joining_type_name,
+ separator3, joining_group_name) != 7)
{
fprintf (stderr, "parse error in '%s':%d\n",
arabicshaping_filename, lineno);
@@ -3955,6 +3957,9 @@ fill_arabicshaping (const char *arabicshaping_filename)
TRY(UC_JOINING_GROUP_MANICHAEAN_TEN, "MANICHAEAN TEN")
TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY, "MANICHAEAN TWENTY")
TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED, "MANICHAEAN HUNDRED")
+ TRY(UC_JOINING_GROUP_AFRICAN_FEH, "AFRICAN FEH")
+ TRY(UC_JOINING_GROUP_AFRICAN_QAF, "AFRICAN QAF")
+ TRY(UC_JOINING_GROUP_AFRICAN_NOON, "AFRICAN NOON")
#undef TRY
else
{
@@ -4264,6 +4269,9 @@ joining_group_as_c_identifier (int joining_group)
TRY(UC_JOINING_GROUP_MANICHAEAN_TEN)
TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY)
TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED)
+ TRY(UC_JOINING_GROUP_AFRICAN_FEH)
+ TRY(UC_JOINING_GROUP_AFRICAN_QAF)
+ TRY(UC_JOINING_GROUP_AFRICAN_NOON)
#undef TRY
abort ();
}
@@ -4901,7 +4909,7 @@ output_blocks (const char *version)
fprintf (stream, "};\n");
fprintf (stream, "#define blocks_level1_shift %d\n", shift);
fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
- fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
+ fprintf (stream, "static const uint16_t blocks_level1[%d * 2] =\n",
threshold >> shift);
fprintf (stream, "{\n");
for (i1 = 0; i1 < (threshold >> shift); i1++)
@@ -6292,22 +6300,22 @@ output_width_property_test (const char *filename)
enum
{
- /* Values >= 27 are resolved at run time. */
- LBP_BK = 27, /* mandatory break */
+ /* Values >= 30 are resolved at run time. */
+ LBP_BK = 30, /* mandatory break */
/*LBP_CR, carriage return - not used here because it's a DOSism */
/*LBP_LF, line feed - not used here because it's a DOSism */
- LBP_CM = 28, /* attached characters and combining marks */
+ LBP_CM = 31, /* attached characters and combining marks */
/*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG, surrogates - not used here because they are not characters */
LBP_WJ = 0, /* word joiner */
- LBP_ZW = 29, /* zero width space */
+ LBP_ZW = 32, /* zero width space */
LBP_GL = 1, /* non-breaking (glue) */
- LBP_SP = 30, /* space */
+ LBP_SP = 33, /* space */
LBP_B2 = 2, /* break opportunity before and after */
LBP_BA = 3, /* break opportunity after */
LBP_BB = 4, /* break opportunity before */
LBP_HY = 5, /* hyphen */
- LBP_CB = 31, /* contingent break opportunity */
+ LBP_CB = 34, /* contingent break opportunity */
LBP_CL = 6, /* closing punctuation */
LBP_CP = 7, /* closing parenthesis */
LBP_EX = 8, /* exclamation/interrogation */
@@ -6320,7 +6328,7 @@ enum
LBP_PO = 15, /* postfix (numeric) */
LBP_PR = 16, /* prefix (numeric) */
LBP_SY = 17, /* symbols allowing breaks */
- LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */
+ LBP_AI = 35, /* ambiguous (alphabetic or ideograph) */
LBP_AL = 18, /* ordinary alphabetic and symbol characters */
/*LBP_CJ, conditional Japanese starter, resolved to NS */
LBP_H2 = 19, /* Hangul LV syllable */
@@ -6331,8 +6339,11 @@ enum
LBP_JV = 23, /* Hangul V Jamo */
LBP_JT = 24, /* Hangul T Jamo */
LBP_RI = 26, /* regional indicator */
- LBP_SA = 33, /* complex context (South East Asian) */
- LBP_XX = 34 /* unknown */
+ LBP_SA = 36, /* complex context (South East Asian) */
+ LBP_ZWJ = 27, /* zero width joiner */
+ LBP_EB = 28, /* emoji base */
+ LBP_EM = 29, /* emoji modifier */
+ LBP_XX = 37 /* unknown */
};
/* Returns the line breaking classification for ch, as a bit mask. */
@@ -6363,6 +6374,45 @@ get_lbp (unsigned int ch)
if (ch == 0x200B /* ZERO WIDTH SPACE */)
attr |= (int64_t) 1 << LBP_ZW;
+ /* zero width joiner */
+ if (ch == 0x200D /* ZERO WIDTH JOINER */)
+ attr |= (int64_t) 1 << LBP_ZWJ;
+
+ /* emoji base */
+ if (ch == 0x261D /* WHITE UP POINTING INDEX */
+ || ch == 0x26F9 /* PERSON WITH BALL */
+ || (ch >= 0x270A && ch <= 0x270D) /* RAISED FIST..WRITING HAND */
+ || ch == 0x1F385 /* FATHER CHRISTMAS */
+ || (ch >= 0x1F3C3 && ch <= 0x1F3C4) /* RUNNER..SURFER */
+ || (ch >= 0x1F3CA && ch <= 0x1F3CB) /* SWIMMER..WEIGHT LIFTER */
+ || (ch >= 0x1F442 && ch <= 0x1F443) /* EAR..NOSE */
+ || (ch >= 0x1F446 && ch <= 0x1F450) /* WHITE UP POINTING BACKHAND INDEX..OPEN HANDS SIGN */
+ || (ch >= 0x1F466 && ch <= 0x1F469) /* BOY..WOMAN */
+ || ch == 0x1F46E /* POLICE OFFICER */
+ || (ch >= 0x1F470 && ch <= 0x1F478) /* BRIDE WITH VEIL..PRINCESS */
+ || ch == 0x1F47C /* BABY ANGEL */
+ || (ch >= 0x1F481 && ch <= 0x1F483) /* INFORMATION DESK PERSON..DANCER */
+ || (ch >= 0x1F485 && ch <= 0x1F487) /* NAIL POLISH..HAIRCUT */
+ || ch == 0x1F4AA /* FLEXED BICEPS */
+ || ch == 0x1F575 /* SLEUTH OR SPY */
+ || ch == 0x1F57A /* MAN DANCING */
+ || ch == 0x1F590 /* RAISED HAND WITH FINGERS SPLAYED */
+ || (ch >= 0x1F595 && ch <= 0x1F596) /* REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS */
+ || (ch >= 0x1F645 && ch <= 0x1F647) /* FACE WITH NO GOOD GESTURE..PERSON BOWING DEEPLY */
+ || (ch >= 0x1F64B && ch <= 0x1F64F) /* HAPPY PERSON RAISING ONE HAND..PERSON WITH FOLDED HANDS */
+ || ch == 0x1F6A3 /* ROWBOAT */
+ || (ch >= 0x1F6B4 && ch <= 0x1F6B6) /* BICYCLIST..PEDESTRIAN */
+ || ch == 0x1F6C0 /* BATH */
+ || (ch >= 0x1F918 && ch <= 0x1F91E) /* SIGN OF THE HORNS..HAND WITH INDEX AND MIDDLE FINGERS CROSSED */
+ || ch == 0x1F926 /* FACE PALM */
+ || ch == 0x1F930 /* PREGNANT WOMAN */
+ || (ch >= 0x1F933 && ch <= 0x1F939) /* SELFIE..JUGGLING */
+ || (ch >= 0x1F93C && ch <= 0x1F93E) /* WRESTLERS..HANDBALL */)
+ attr |= (int64_t) 1 << LBP_EB;
+
+ if ((ch >= 0x1F3FB && ch <= 0x1F3FF) /* EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 */)
+ attr |= (int64_t) 1 << LBP_EM;
+
/* non-breaking (glue) */
if (ch == 0x00A0 /* NO-BREAK SPACE */
|| ch == 0x202F /* NARROW NO-BREAK SPACE */
@@ -6496,6 +6546,8 @@ get_lbp (unsigned int ch)
|| ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
|| (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
|| ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
+ || ch == 0x2E43 /* DASH WITH LEFT UPTURN */
+ || ch == 0x2E44 /* DOUBLE SUSPENSION MARK */
|| ch == 0x2E3C /* STENOGRAPHIC FULL STOP */
|| ch == 0x2E3D /* VERTICAL SIX DOTS */
|| ch == 0x2E3E /* WIGGLY VERTICAL LINE */
@@ -6554,12 +6606,15 @@ get_lbp (unsigned int ch)
|| ch == 0x1123B /* KHOJKI SECTION MARK */
|| ch == 0x1123C /* KHOJKI DOUBLE SECTION MARK */
|| ch == 0x112A9 /* MULTANI SECTION MARK */
+ || (ch >= 0x1144B && ch <= 0x1144E) /* NEWA DANDA..NEWA GAP FILLER */
+ || ch == 0x1145B /* NEWA PLACEHOLDER MARK */
|| ch == 0x115C2 /* SIDDHAM DANDA */
|| ch == 0x115C3 /* SIDDHAM DOUBLE DANDA */
|| (ch >= 0x115C9 && ch <= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
|| ch == 0x11641 /* MODI DANDA */
|| ch == 0x11642 /* MODI DOUBLE DANDA */
|| (ch >= 0x1173C && ch <= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
+ || (ch >= 0x11C41 && ch <= 0x11C45) /* BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 */
|| ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
|| ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
|| ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
@@ -6598,7 +6653,9 @@ get_lbp (unsigned int ch)
|| ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
|| ch == 0x11175 /* MAHAJANI SECTION MARK */
|| ch == 0x111DB /* SHARADA SIGN SIDDHAM */
- || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */)
+ || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */
+ || (ch >= 0x11660 && ch <= 0x1166C) /* MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT */
+ || ch == 0x11C70 /* MARCHEN HEAD MARK */)
attr |= (int64_t) 1 << LBP_BB;
/* hyphen */
@@ -6676,7 +6733,8 @@ get_lbp (unsigned int ch)
|| ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
|| ch == 0xFF1F /* FULLWIDTH QUESTION MARK */
|| ch == 0x115C4 /* SIDDHAM SEPARATOR DOT */
- || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */)
+ || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */
+ || ch == 0x11C71 /* MARCHEN MARK SHAD */)
attr |= (int64_t) 1 << LBP_EX;
/* inseparable */
@@ -6717,6 +6775,7 @@ get_lbp (unsigned int ch)
|| ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
|| ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
|| ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
+ || ch == 0x16FE0 /* TANGUT ITERATION MARK */
|| ch == 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
|| ch == 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
|| ch == 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */
@@ -6737,7 +6796,8 @@ get_lbp (unsigned int ch)
|| ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
|| ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
|| ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
- || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */)
+ || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */
+ || (ch >= 0x1E95E && ch <= 0x1E95F) /* ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK */)
attr |= (int64_t) 1 << LBP_OP;
/* ambiguous quotation */
@@ -6905,9 +6965,10 @@ get_lbp (unsigned int ch)
|| (unicode_attributes[ch].category[0] == 'C'
&& (unicode_attributes[ch].category[1] == 'c'
|| unicode_attributes[ch].category[1] == 'f')
- && ch != 0x110BD /* KAITHI NUMBER SIGN */)
+ && ch != 0x110BD /* KAITHI NUMBER SIGN */
+ && ch != 0x08E2 /* ARABIC DISPUTED END OF AYAH */)
|| ch == 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
- if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
+ if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW) | ((int64_t) 1 << LBP_ZWJ))))
attr |= (int64_t) 1 << LBP_CM;
/* ideographic */
@@ -6983,6 +7044,7 @@ get_lbp (unsigned int ch)
|| ch == 0x270B /* RAISED HAND */
|| ch == 0x270C /* VICTORY HAND */
|| ch == 0x270D /* WRITING HAND */
+ || ch == 0x2764 /* HEAVY BLACK HEART */
|| (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
|| (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
|| (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
@@ -7046,6 +7108,15 @@ get_lbp (unsigned int ch)
|| ch == 0xFFE3 /* FULLWIDTH MACRON */
|| ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
+ || ch == 0xFF66 /* Halfwidth Katakana */
+ || (ch >= 0xFF71 && ch <= 0xFF9D) /* Halfwidth Katakana */
+ || (ch >= 0xFFA0 && ch <= 0xFFBE) /* Halfwidth Hangul */
+ || (ch >= 0xFFC2 && ch <= 0xFFC7) /* Halfwidth Hangul */
+ || (ch >= 0xFFCA && ch <= 0xFFCF) /* Halfwidth Hangul */
+ || (ch >= 0xFFD2 && ch <= 0xFFD7) /* Halfwidth Hangul */
+ || (ch >= 0xFFDA && ch <= 0xFFDC) /* Halfwidth Hangul */
+ || (ch >= 0x17000 && ch <= 0x187EC) /* Tangut Ideograph */
+ || (ch >= 0x18800 && ch <= 0x18AF2) /* Tangut Ideograph */
|| (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
|| (ch >= 0x1F000 && ch <= 0x1F02B) /* Mahjong Tiles */
|| (ch >= 0x1F030 && ch <= 0x1F093) /* Domino Tiles */
@@ -7064,14 +7135,14 @@ get_lbp (unsigned int ch)
&& !(ch >= 0x1F5D4 && ch <= 0x1F5DB)
&& !(ch >= 0x1F5F4 && ch <= 0x1F5F9))
|| (ch >= 0x1F600 && ch <= 0x1F64F) /* Emoticons */
- || (ch >= 0x1F680 && ch <= 0x1F6D0) /* Transport and Map Symbols */
+ || (ch >= 0x1F680 && ch <= 0x1F6DF) /* Transport and Map Symbols */
|| (ch >= 0x1F6E0 && ch <= 0x1F6EC) /* Transport and Map Symbols */
- || (ch >= 0x1F6F0 && ch <= 0x1F6F3) /* Transport and Map Symbols */
+ || (ch >= 0x1F6F0 && ch <= 0x1F6F6) /* Transport and Map Symbols */
|| (ch >= 0x1F900 && ch <= 0x1F9FF) /* Supplemental Symbols and Pictographs */
|| (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
|| (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */
|| (ch >= 0x2B820 && ch <= 0x2CEAF) /* CJK Ideograph Extension E */)
- if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
+ if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_EB))))
{
/* ambiguous (ideograph) ? */
if ((unicode_width[ch] != NULL
@@ -7134,13 +7205,14 @@ get_lbp (unsigned int ch)
|| ch == 0x0605 /* ARABIC NUMBER MARK ABOVE */
|| ch == 0x06DD /* ARABIC END OF AYAH */
|| ch == 0x070F /* SYRIAC ABBREVIATION MARK */
+ || ch == 0x08E2 /* ARABIC DISPUTED END OF AYAH */
|| ch == 0x2061 /* FUNCTION APPLICATION */
|| ch == 0x2062 /* INVISIBLE TIMES */
|| ch == 0x2063 /* INVISIBLE SEPARATOR */
|| ch == 0x2064 /* INVISIBLE PLUS */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
|| ch == 0x110BD /* KAITHI NUMBER SIGN */)
- if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))
+ if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID) | ((int64_t) 1 << LBP_EB) | ((int64_t) 1 << LBP_EM)))
&& ch != 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
{
/* ambiguous (alphabetic) ? */
@@ -7192,7 +7264,11 @@ get_lbp (unsigned int ch)
|| ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
|| ch == 0x2616 /* WHITE SHOGI PIECE */
|| ch == 0x2617 /* BLACK SHOGI PIECE */
+ || ch == 0x2757 /* HEAVY EXCLAMATION MARK SYMBOL */
+ || ch == 0x2B55 /* HEAVY LARGE CIRCLE */
|| ch == 0x1F10B /* DINGBAT CIRCLED SANS-SERIF DIGIT ZERO */
+ || ch == 0x1F18E /* NEGATIVE SQUARED AB */
+ || (ch >= 0x1F191 && ch <= 0x1F19A) /* SQUARED CL..SQUARED VS */
|| ch == 0x1F10C /* DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */)
attr |= (int64_t) 1 << LBP_AI;
else
@@ -7206,6 +7282,38 @@ get_lbp (unsigned int ch)
if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
|| (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
|| (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
+ || (ch >= 0x1F02C && ch <= 0x1F02F) /* reserved */
+ || (ch >= 0x1F094 && ch <= 0x1F09F) /* reserved */
+ || (ch >= 0x1F0AF && ch <= 0x1F0B0) /* reserved */
+ || ch == 0x1F0C0 /* reserved */
+ || ch == 0x1F0D0 /* reserved */
+ || (ch >= 0x1F0F6 && ch <= 0x1F0FF) /* reserved */
+ || (ch >= 0x1F10D && ch <= 0x1F10F) /* reserved */
+ || ch == 0x1F12F /* reserved */
+ || (ch >= 0x1F16C && ch <= 0x1F16F) /* reserved */
+ || (ch >= 0x1F1AD && ch <= 0x1F1E5) /* reserved */
+ || (ch >= 0x1F203 && ch <= 0x1F20F) /* reserved */
+ || (ch >= 0x1F23C && ch <= 0x1F23F) /* reserved */
+ || (ch >= 0x1F249 && ch <= 0x1F24F) /* reserved */
+ || (ch >= 0x1F252 && ch <= 0x1F2FF) /* reserved */
+ || (ch >= 0x1F6D3 && ch <= 0x1F6DF) /* reserved */
+ || (ch >= 0x1F6ED && ch <= 0x1F6EF) /* reserved */
+ || (ch >= 0x1F6F7 && ch <= 0x1F6FF) /* reserved */
+ || (ch >= 0x1F774 && ch <= 0x1F77F) /* reserved */
+ || (ch >= 0x1F7D5 && ch <= 0x1F7FF) /* reserved */
+ || (ch >= 0x1F80C && ch <= 0x1F80F) /* reserved */
+ || (ch >= 0x1F848 && ch <= 0x1F84F) /* reserved */
+ || (ch >= 0x1F85A && ch <= 0x1F85F) /* reserved */
+ || (ch >= 0x1F888 && ch <= 0x1F88F) /* reserved */
+ || (ch >= 0x1F8AE && ch <= 0x1F90F) /* reserved */
+ || ch == 0x1F91F /* reserved */
+ || ch == 0x1F93F /* reserved */
+ || (ch >= 0x1F928 && ch <= 0x1F92F) /* reserved */
+ || (ch >= 0x1F931 && ch <= 0x1F932) /* reserved */
+ || (ch >= 0x1F94C && ch <= 0x1F94F) /* reserved */
+ || (ch >= 0x1F95F && ch <= 0x1F97F) /* reserved */
+ || (ch >= 0x1F992 && ch <= 0x1F9BF) /* reserved */
+ || (ch >= 0x1F9C1 && ch <= 0x1FFFD) /* reserved */
|| (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
|| (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
Supplementary Ideographic Plane (Plane 2) outside of blocks */
@@ -7270,6 +7378,9 @@ debug_output_lbp (FILE *stream)
PRINT_BIT(attr,LBP_JT);
PRINT_BIT(attr,LBP_RI);
PRINT_BIT(attr,LBP_SA);
+ PRINT_BIT(attr,LBP_ZWJ);
+ PRINT_BIT(attr,LBP_EB);
+ PRINT_BIT(attr,LBP_EM);
PRINT_BIT(attr,LBP_XX);
#undef PRINT_BIT
fprintf (stream, "\n");
@@ -7386,6 +7497,9 @@ fill_org_lbp (const char *linebreak_filename)
TRY(LBP_JT)
TRY(LBP_RI)
TRY(LBP_SA)
+ TRY(LBP_ZWJ)
+ TRY(LBP_EB)
+ TRY(LBP_EM)
TRY(LBP_XX)
#undef TRY
else if (strcmp (field1, "LF") == 0) value = LBP_BK;
@@ -7469,6 +7583,9 @@ debug_output_org_lbp (FILE *stream)
PRINT_BIT(attr,LBP_JT);
PRINT_BIT(attr,LBP_RI);
PRINT_BIT(attr,LBP_SA);
+ PRINT_BIT(attr,LBP_ZWJ);
+ PRINT_BIT(attr,LBP_EB);
+ PRINT_BIT(attr,LBP_EM);
PRINT_BIT(attr,LBP_XX);
#undef PRINT_BIT
fprintf (stream, "\n");
@@ -7643,6 +7760,9 @@ output_lbp (FILE *stream1, FILE *stream2)
CASE(LBP_JT);
CASE(LBP_RI);
CASE(LBP_SA);
+ CASE(LBP_ZWJ);
+ CASE(LBP_EB);
+ CASE(LBP_EM);
CASE(LBP_XX);
#undef CASE
default:
@@ -7745,7 +7865,12 @@ enum
WBP_RI = 13,
WBP_DQ = 14,
WBP_SQ = 15,
- WBP_HL = 16
+ WBP_HL = 16,
+ WBP_ZWJ = 17,
+ WBP_EB = 18,
+ WBP_EM = 19,
+ WBP_GAZ = 20,
+ WBP_EBG = 21
};
/* Returns the word breaking property for ch, as a bit mask. */
@@ -7768,13 +7893,15 @@ get_wbp (unsigned int ch)
attr |= 1 << WBP_NEWLINE;
if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
+ || ((unicode_properties[ch] >> PROP_OTHER_GRAPHEME_EXTEND) & 1) != 0
|| (unicode_attributes[ch].category != NULL
&& strcmp (unicode_attributes[ch].category, "Mc") == 0))
attr |= 1 << WBP_EXTEND;
if (unicode_attributes[ch].category != NULL
&& strcmp (unicode_attributes[ch].category, "Cf") == 0
- && ch != 0x200B && ch != 0x200C && ch != 0x200D)
+ && ch != 0x200B && ch != 0x200C && ch != 0x200D
+ && !(ch >= 0xe0020 && ch <= 0xe007f))
attr |= 1 << WBP_FORMAT;
if ((unicode_scripts[ch] < numscripts
@@ -7816,8 +7943,9 @@ get_wbp (unsigned int ch)
&& ch != 0x066C)
attr |= 1 << WBP_NUMERIC;
- if (unicode_attributes[ch].category != NULL
- && strcmp (unicode_attributes[ch].category, "Pc") == 0)
+ if ((unicode_attributes[ch].category != NULL
+ && strcmp (unicode_attributes[ch].category, "Pc") == 0)
+ || ch == 0x202F /* NARROW NO-BREAK SPACE */)
attr |= 1 << WBP_EXTENDNUMLET;
if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
@@ -7828,6 +7956,20 @@ get_wbp (unsigned int ch)
if (ch == 0x0027)
attr |= 1 << WBP_SQ;
+
+ if (ch == 0x200D)
+ attr |= 1 << WBP_ZWJ;
+
+ if (ch >= 0x1F466 && ch <= 0x1F469)
+ attr |= 1 << WBP_EBG;
+ else if (((get_lbp (ch) >> LBP_EB) & 1) != 0)
+ attr |= 1 << WBP_EB;
+
+ if (((get_lbp (ch) >> LBP_EM) & 1) != 0)
+ attr |= 1 << WBP_EM;
+
+ if (ch == 0x2764 || ch == 0x1F48B || ch == 0x1F5E8)
+ attr |= 1 << WBP_GAZ;
}
if (attr == 0)
@@ -7881,6 +8023,16 @@ debug_output_wbp (FILE *stream)
fprintf (stream, " Single_Quote");
if (attr & (1 << WBP_HL))
fprintf (stream, " Hebrew_Letter");
+ if (attr & (1 << WBP_ZWJ))
+ fprintf (stream, " ZWJ");
+ if (attr & (1 << WBP_EB))
+ fprintf (stream, " E_Base");
+ if (attr & (1 << WBP_EM))
+ fprintf (stream, " E_Modifier");
+ if (attr & (1 << WBP_GAZ))
+ fprintf (stream, " Glue_After_Zwj");
+ if (attr & (1 << WBP_EBG))
+ fprintf (stream, " E_Base_GAZ");
fprintf (stream, "\n");
}
}
@@ -7970,6 +8122,11 @@ fill_org_wbp (const char *wordbreakproperty_filename)
PROP ("Double_Quote", WBP_DQ)
PROP ("Single_Quote", WBP_SQ)
PROP ("Hebrew_Letter", WBP_HL)
+ PROP ("ZWJ", WBP_ZWJ)
+ PROP ("E_Base", WBP_EB)
+ PROP ("E_Modifier", WBP_EM)
+ PROP ("Glue_After_Zwj", WBP_GAZ)
+ PROP ("E_Base_GAZ", WBP_EBG)
#undef PROP
{
fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
@@ -8019,6 +8176,11 @@ debug_output_org_wbp (FILE *stream)
PROP ("Double_Quote", WBP_DQ)
PROP ("Single_Quote", WBP_SQ)
PROP ("Hebrew_Letter", WBP_HL)
+ PROP ("ZWJ", WBP_ZWJ)
+ PROP ("E_Base", WBP_EB)
+ PROP ("E_Modifier", WBP_EM)
+ PROP ("Glue_After_Zwj", WBP_GAZ)
+ PROP ("E_Base_GAZ", WBP_EBG)
#undef PROP
fprintf (stream, " ??");
fprintf (stream, "\n");
@@ -8174,6 +8336,11 @@ output_wbp (FILE *stream)
CASE(WBP_DQ);
CASE(WBP_SQ);
CASE(WBP_HL);
+ CASE(WBP_ZWJ);
+ CASE(WBP_EB);
+ CASE(WBP_EM);
+ CASE(WBP_GAZ);
+ CASE(WBP_EBG);
#undef CASE
default:
abort ();
@@ -8238,7 +8405,7 @@ output_wbrk_tables (const char *filename, const char *version)
/* ========================================================================= */
/* Grapheme break property.
- Updated for Unicode TR #29 revision 17. */
+ Updated for Unicode TR #29 revision 29. */
/* Possible values of the Grapheme_Cluster_Break property. */
enum
@@ -8255,7 +8422,12 @@ enum
GBP_T = 9,
GBP_LV = 10,
GBP_LVT = 11,
- GBP_RI = 12
+ GBP_RI = 12,
+ GBP_ZWJ = 13,
+ GBP_EB = 14,
+ GBP_EM = 15,
+ GBP_GAZ = 16,
+ GBP_EBG = 17
};
/* Construction of sparse 3-level tables. */
@@ -8327,6 +8499,11 @@ output_gbp_test (const char *filename)
CASE (GBP_LV)
CASE (GBP_LVT)
CASE (GBP_RI)
+ CASE (GBP_ZWJ)
+ CASE (GBP_EB)
+ CASE (GBP_EM)
+ CASE (GBP_GAZ)
+ CASE (GBP_EBG)
#undef CASE
default:
abort ();
@@ -8396,7 +8573,7 @@ output_gbp_table (const char *filename, const char *version)
fprintf (stream, " {\n");
fprintf (stream, " int level1[%zu];\n", t.level1_size);
fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
- fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
+ fprintf (stream, " unsigned char level3[%zu << %d];\n",
t.level3_size, t.p);
fprintf (stream, " }\n");
fprintf (stream, "unigbrkprop =\n");
@@ -8434,7 +8611,7 @@ output_gbp_table (const char *filename, const char *version)
fprintf (stream, " %5d", -1);
else
fprintf (stream, " %5zu",
- (offset - level3_offset) / sizeof (uint8_t) / 2);
+ (offset - level3_offset) / sizeof (uint8_t));
if (i+1 < t.level2_size << t.q)
fprintf (stream, ",");
}
@@ -8442,19 +8619,43 @@ output_gbp_table (const char *filename, const char *version)
fprintf (stream, "\n ");
fprintf (stream, " },\n");
fprintf (stream, " {");
- if (t.level3_size << t.p > 8)
+ if (t.level3_size << t.p > 4)
fprintf (stream, "\n ");
- for (i = 0; i < (t.level3_size << t.p) / 2; i++)
+ for (i = 0; i < t.level3_size << t.p; i++)
{
- unsigned char *p = (unsigned char *) (t.result + level3_offset);
- unsigned char value0 = p[i * 2];
- unsigned char value1 = p[i * 2 + 1];
- if (i > 0 && (i % 8) == 0)
+ unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
+ const char *value_string;
+ switch (value)
+ {
+#define CASE(x) case x: value_string = #x; break;
+ CASE (GBP_OTHER)
+ CASE (GBP_CR)
+ CASE (GBP_LF)
+ CASE (GBP_CONTROL)
+ CASE (GBP_EXTEND)
+ CASE (GBP_PREPEND)
+ CASE (GBP_SPACINGMARK)
+ CASE (GBP_L)
+ CASE (GBP_V)
+ CASE (GBP_T)
+ CASE (GBP_LV)
+ CASE (GBP_LVT)
+ CASE (GBP_RI)
+ CASE (GBP_ZWJ)
+ CASE (GBP_EB)
+ CASE (GBP_EM)
+ CASE (GBP_GAZ)
+ CASE (GBP_EBG)
+#undef CASE
+ default:
+ abort ();
+ }
+ if (i > 0 && (i % 4) == 0)
fprintf (stream, "\n ");
- fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
- (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
+ fprintf (stream, " %s%s", value_string,
+ (i+1 < t.level3_size << t.p ? "," : ""));
}
- if (t.level3_size << t.p > 8)
+ if (t.level3_size << t.p > 4)
fprintf (stream, "\n ");
fprintf (stream, " }\n");
fprintf (stream, "};\n");
@@ -8525,6 +8726,11 @@ fill_org_gbp (const char *graphemebreakproperty_filename)
PROP ("LV", GBP_LV)
PROP ("LVT", GBP_LVT)
PROP ("Regional_Indicator", GBP_RI)
+ PROP ("ZWJ", GBP_ZWJ)
+ PROP ("E_Base", GBP_EB)
+ PROP ("E_Modifier", GBP_EM)
+ PROP ("Glue_After_Zwj", GBP_GAZ)
+ PROP ("E_Base_GAZ", GBP_EBG)
#undef PROP
{
fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
diff --git a/lib/unictype.in.h b/lib/unictype.in.h
index dde4871a6e..e746ee08b9 100644
--- a/lib/unictype.in.h
+++ b/lib/unictype.in.h
@@ -551,7 +551,10 @@ enum
UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
- UC_JOINING_GROUP_MANICHAEAN_HUNDRED /* Manichaean_Hundred */
+ UC_JOINING_GROUP_MANICHAEAN_HUNDRED, /* Manichaean_Hundred */
+ UC_JOINING_GROUP_AFRICAN_FEH, /* African_Feh */
+ UC_JOINING_GROUP_AFRICAN_QAF, /* African_Qaf */
+ UC_JOINING_GROUP_AFRICAN_NOON /* African_Noon */
};
/* Return the name of a joining group. */
diff --git a/lib/unigbrk.in.h b/lib/unigbrk.in.h
index 87b298b9b5..7cfc2df624 100644
--- a/lib/unigbrk.in.h
+++ b/lib/unigbrk.in.h
@@ -52,7 +52,12 @@ enum
GBP_T = 9,
GBP_LV = 10,
GBP_LVT = 11,
- GBP_RI = 12
+ GBP_RI = 12,
+ GBP_ZWJ = 13,
+ GBP_EB = 14,
+ GBP_EM = 15,
+ GBP_GAZ = 16,
+ GBP_EBG = 17
};
/* Return the Grapheme_Cluster_Break property of a Unicode character. */
@@ -118,6 +123,8 @@ extern void
u32_grapheme_breaks (const uint32_t *s, size_t n, char *p);
extern void
ulc_grapheme_breaks (const char *s, size_t n, char *p);
+extern void
+ uc_grapheme_breaks (const ucs4_t *s, size_t n, char *p);
/* ========================================================================= */
diff --git a/lib/unigbrk/u-grapheme-breaks.h b/lib/unigbrk/u-grapheme-breaks.h
new file mode 100644
index 0000000000..358c5c6cc4
--- /dev/null
+++ b/lib/unigbrk/u-grapheme-breaks.h
@@ -0,0 +1,122 @@
+/* Grapheme cluster break function.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Written by Ben Pfaff <blp@cs.stanford.edu>, 2010.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+void
+FUNC (const UNIT *s, size_t n, char *p)
+{
+ if (n > 0)
+ {
+ const UNIT *s_end = s + n;
+
+ /* Grapheme Cluster break property of the last character.
+ -1 at the very beginning of the string. */
+ int last_char_prop = -1;
+
+ /* Grapheme Cluster break property of the last complex character.
+ -1 at the very beginning of the string. */
+ int last_compchar_prop = -1;
+
+ size_t ri_count = 0;
+
+ /* Don't break inside multibyte characters. */
+ memset (p, 0, n);
+
+ while (s < s_end)
+ {
+ ucs4_t uc;
+ int count = U_MBTOUC (&uc, s, s_end - s);
+ int prop = uc_graphemeclusterbreak_property (uc);
+
+ /* Break at the start of the string (GB1). */
+ if (last_char_prop < 0)
+ *p = 1;
+ else
+ {
+ /* No break between CR and LF (GB3). */
+ if (last_char_prop == GBP_CR && prop == GBP_LF)
+ /* *p = 0 */;
+ /* Break before and after newlines (GB4, GB5). */
+ else if ((last_char_prop == GBP_CR
+ || last_char_prop == GBP_LF
+ || last_char_prop == GBP_CONTROL)
+ || (prop == GBP_CR
+ || prop == GBP_LF
+ || prop == GBP_CONTROL))
+ *p = 1;
+ /* No break between Hangul syllable sequences (GB6, GB7, GB8). */
+ else if ((last_char_prop == GBP_L
+ && (prop == GBP_L
+ || prop == GBP_V
+ || prop == GBP_LV
+ || prop == GBP_LVT))
+ || ((last_char_prop == GBP_LV
+ || last_char_prop == GBP_V)
+ && (prop == GBP_V
+ || prop == GBP_T))
+ || ((last_char_prop == GBP_LVT
+ || last_char_prop == GBP_T)
+ && prop == GBP_T))
+ /* *p = 0 */;
+ /* No break before extending characters or ZWJ (GB9). */
+ else if (prop == GBP_EXTEND || prop == GBP_ZWJ)
+ /* *p = 0 */;
+ /* No break before SpacingMarks (GB9a). */
+ else if (prop == GBP_SPACINGMARK)
+ /* *p = 0 */;
+ /* No break after Prepend characters (GB9b). */
+ else if (last_char_prop == GBP_PREPEND)
+ /* *p = 0 */;
+ /* No break within emoji modifier sequences (GB10). */
+ else if ((last_compchar_prop == GBP_EB
+ || last_compchar_prop == GBP_EBG)
+ && prop == GBP_EM)
+ /* *p = 0 */;
+ /* No break within emoji zwj sequences (GB11). */
+ else if (last_char_prop == GBP_ZWJ
+ && (prop == GBP_GAZ
+ || prop == GBP_EBG))
+ /* *p = 0 */;
+ /* No break between RI if there is an odd number of RI
+ characters before (GB12, GB13). */
+ else if (prop == GBP_RI)
+ {
+ if (ri_count % 2 == 0)
+ *p = 1;
+ /* else *p = 0; */
+ }
+ /* Break everywhere (GB999). */
+ else
+ *p = 1;
+ }
+
+ last_char_prop = prop;
+
+ if (!(prop == GBP_EXTEND
+ && (last_compchar_prop == GBP_EB
+ || last_compchar_prop == GBP_EBG)))
+ last_compchar_prop = prop;
+
+ if (prop == GBP_RI)
+ ri_count++;
+ else
+ ri_count = 0;
+
+ s += count;
+ p += count;
+ }
+ }
+}
diff --git a/lib/unigbrk/u16-grapheme-breaks.c b/lib/unigbrk/u16-grapheme-breaks.c
index 9da87a36dd..be7cdeff6f 100644
--- a/lib/unigbrk/u16-grapheme-breaks.c
+++ b/lib/unigbrk/u16-grapheme-breaks.c
@@ -20,25 +20,11 @@
/* Specification. */
#include "unigbrk.h"
-#include "unistr.h"
-
-void
-u16_grapheme_breaks (const uint16_t *s, size_t n, char *p)
-{
- ucs4_t prev;
- int mblen;
-
- prev = 0;
- for (; n > 0; s += mblen, p += mblen, n -= mblen)
- {
- ucs4_t next;
+#include <string.h>
- mblen = u16_mbtouc (&next, s, n);
-
- p[0] = uc_is_grapheme_break (prev, next);
- if (mblen > 1)
- p[1] = 0;
+#include "unistr.h"
- prev = next;
- }
-}
+#define FUNC u16_grapheme_breaks
+#define UNIT uint16_t
+#define U_MBTOUC u16_mbtouc
+#include "u-grapheme-breaks.h"
diff --git a/lib/unigbrk/u32-grapheme-breaks.c b/lib/unigbrk/u32-grapheme-breaks.c
index 6220ec6c65..d85f03152f 100644
--- a/lib/unigbrk/u32-grapheme-breaks.c
+++ b/lib/unigbrk/u32-grapheme-breaks.c
@@ -20,23 +20,11 @@
/* Specification. */
#include "unigbrk.h"
-#include "unistr.h"
-
-void
-u32_grapheme_breaks (const uint32_t *s, size_t n, char *p)
-{
- ucs4_t prev;
- size_t i;
-
- prev = 0;
- for (i = 0; i < n; i++)
- {
- ucs4_t next;
+#include <string.h>
- u32_mbtouc (&next, &s[i], 1);
-
- p[i] = uc_is_grapheme_break (prev, next);
+#include "unistr.h"
- prev = next;
- }
-}
+#define FUNC u32_grapheme_breaks
+#define UNIT uint32_t
+#define U_MBTOUC u32_mbtouc
+#include "u-grapheme-breaks.h"
diff --git a/lib/unigbrk/u8-grapheme-breaks.c b/lib/unigbrk/u8-grapheme-breaks.c
index 83383f9375..7b655f717b 100644
--- a/lib/unigbrk/u8-grapheme-breaks.c
+++ b/lib/unigbrk/u8-grapheme-breaks.c
@@ -21,26 +21,11 @@
/* Specification. */
#include "unigbrk.h"
-#include "unistr.h"
-
-void
-u8_grapheme_breaks (const uint8_t *s, size_t n, char *p)
-{
- ucs4_t prev;
- int mblen;
-
- prev = 0;
- for (; n > 0; s += mblen, p += mblen, n -= mblen)
- {
- ucs4_t next;
- int i;
+#include <string.h>
- mblen = u8_mbtouc (&next, s, n);
-
- p[0] = uc_is_grapheme_break (prev, next);
- for (i = 1; i < mblen; i++)
- p[i] = 0;
+#include "unistr.h"
- prev = next;
- }
-}
+#define FUNC u8_grapheme_breaks
+#define UNIT uint8_t
+#define U_MBTOUC u8_mbtouc
+#include "u-grapheme-breaks.h"
diff --git a/lib/unigbrk/uc-gbrk-prop.c b/lib/unigbrk/uc-gbrk-prop.c
index c224032508..194b19c047 100644
--- a/lib/unigbrk/uc-gbrk-prop.c
+++ b/lib/unigbrk/uc-gbrk-prop.c
@@ -36,8 +36,7 @@ uc_graphemeclusterbreak_property (ucs4_t uc)
if (lookup2 >= 0)
{
unsigned int index3 = uc & gbrkprop_header_4;
- unsigned char lookup3 = unigbrkprop.level3[lookup2 + index3 / 2];
- return (lookup3 >> ((uc & 1) << 2)) & 0x0f;
+ return unigbrkprop.level3[lookup2 + index3];
}
}
}
diff --git a/lib/unigbrk/uc-grapheme-breaks.c b/lib/unigbrk/uc-grapheme-breaks.c
new file mode 100644
index 0000000000..a5e81ba31d
--- /dev/null
+++ b/lib/unigbrk/uc-grapheme-breaks.c
@@ -0,0 +1,39 @@
+/* Grapheme cluster breaks function.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Written by Ben Pfaff <blp@cs.stanford.edu>, 2010.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* Specification. */
+#include "unigbrk.h"
+
+#include <string.h>
+
+#include "unistr.h"
+
+/* This is similar to u32_mbtouc_unsafe(), but doesn't check invalid
+ characters. */
+static int
+uc_grapheme_breaks_mbtouc (ucs4_t *puc, const ucs4_t *s, size_t n)
+{
+ *puc = *s;
+ return 1;
+}
+
+#define FUNC uc_grapheme_breaks
+#define UNIT ucs4_t
+#define U_MBTOUC uc_grapheme_breaks_mbtouc
+#include "u-grapheme-breaks.h"
diff --git a/lib/unigbrk/uc-is-grapheme-break.c b/lib/unigbrk/uc-is-grapheme-break.c
index f14a01c51a..8ac975b336 100644
--- a/lib/unigbrk/uc-is-grapheme-break.c
+++ b/lib/unigbrk/uc-is-grapheme-break.c
@@ -47,19 +47,22 @@
/* GB8 */ \
((A) == GBP_LVT || (A) == GBP_T) && (B) == GBP_T ? false : \
\
- /* GB8a */ \
- (A) == GBP_RI && (B) == GBP_RI ? false : \
- \
/* GB9 */ \
- (B) == GBP_EXTEND ? false : \
+ (B) == GBP_EXTEND || (B) == GBP_ZWJ ? false : \
\
/* GB9a */ \
(B) == GBP_SPACINGMARK ? false : \
\
/* GB9b */ \
- (A) == GBP_PREPEND ? false \
+ (A) == GBP_PREPEND ? false : \
\
- /* GB10 */ \
+ /* GB10 -- incomplete */ \
+ ((A) == GBP_EB || (A) == GBP_EBG) && (B) == GBP_EM ? false : \
+ \
+ /* GB11 */ \
+ (A) == GBP_ZWJ && ((B) == GBP_GAZ || (B) == GBP_EBG) ? false \
+ \
+ /* GB999 */ \
: true)
#define UC_GRAPHEME_BREAKS_FOR(A) \
@@ -75,9 +78,14 @@
| (UC_IS_GRAPHEME_BREAK(A, GBP_T) << GBP_T) \
| (UC_IS_GRAPHEME_BREAK(A, GBP_LV) << GBP_LV) \
| (UC_IS_GRAPHEME_BREAK(A, GBP_LVT) << GBP_LVT) \
- | (UC_IS_GRAPHEME_BREAK(A, GBP_RI) << GBP_RI))
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_RI) << GBP_RI) \
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_ZWJ) << GBP_ZWJ) \
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_EB) << GBP_EB) \
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_EM) << GBP_EM) \
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_GAZ) << GBP_GAZ) \
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_EBG) << GBP_EBG))
-static const unsigned short int gb_table[13] =
+static const unsigned long int gb_table[18] =
{
UC_GRAPHEME_BREAKS_FOR(0), /* GBP_OTHER */
UC_GRAPHEME_BREAKS_FOR(1), /* GBP_CR */
@@ -92,6 +100,11 @@ static const unsigned short int gb_table[13] =
UC_GRAPHEME_BREAKS_FOR(10), /* GBP_LV */
UC_GRAPHEME_BREAKS_FOR(11), /* GBP_LVT */
UC_GRAPHEME_BREAKS_FOR(12), /* GBP_RI */
+ UC_GRAPHEME_BREAKS_FOR(13), /* GBP_ZWJ */
+ UC_GRAPHEME_BREAKS_FOR(14), /* GBP_EB */
+ UC_GRAPHEME_BREAKS_FOR(15), /* GBP_EM */
+ UC_GRAPHEME_BREAKS_FOR(16), /* GBP_GAZ */
+ UC_GRAPHEME_BREAKS_FOR(17), /* GBP_EBG */
};
bool
diff --git a/lib/unilbrk/lbrktables.c b/lib/unilbrk/lbrktables.c
index 90ade32096..c5254e69b2 100644
--- a/lib/unilbrk/lbrktables.c
+++ b/lib/unilbrk/lbrktables.c
@@ -23,37 +23,40 @@
/* Define unilbrkprop, table of line breaking properties. */
#include "unilbrk/lbrkprop2.h"
-const unsigned char unilbrk_table[27][27] =
+const unsigned char unilbrk_table[30][30] =
{
/* after */
- /* WJ GL B2 BA BB HY CL CP EX IN NS OP QU IS NU PO PR SY AL H2 H3 ID JL JV JT HL RI */
-/* WJ */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, },
-/* GL */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, },
-/* B2 */ { P, I, P, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, },
-/* BA */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, },
-/* BB */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, },
-/* HY */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, D, },
-/* CL */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, D, I, I, P, D, D, D, D, D, D, D, D, D, },
-/* CP */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, I, I, I, P, I, D, D, D, D, D, D, I, D, },
-/* EX */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, },
-/* IN */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, },
-/* NS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, },
-/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
-/* QU */ { P, I, I, I, I, I, P, P, P, I, I, P, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, },
-/* IS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, D, },
-/* NU */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, I, I, P, I, D, D, D, D, D, D, I, D, },
-/* PO */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, D, },
-/* PR */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, I, I, I, I, I, I, I, D, },
-/* SY */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, I, D, },
-/* AL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, D, },
-/* H2 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, I, I, D, D, },
-/* H3 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, D, },
-/* ID */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, D, D, D, },
-/* JL */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, I, I, D, I, I, D, D, D, },
-/* JV */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, I, I, D, D, },
-/* JT */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, D, },
-/* HL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, D, },
-/* RI */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, I, D, I, },
+ /* WJ GL B2 BA BB HY CL CP EX IN NS OP QU IS NU PO PR SY AL H2 H3 ID JL JV JT HL RI ZWJ EB EM */
+/* WJ */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, I, I, I, },
+/* GL */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, I, I, I, },
+/* B2 */ { P, I, P, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, I, D, D, },
+/* BA */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, I, D, D, },
+/* BB */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, I, I, I, },
+/* HY */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, D, I, D, D, },
+/* CL */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, D, I, I, P, D, D, D, D, D, D, D, D, D, I, D, D, },
+/* CP */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, I, I, I, P, I, D, D, D, D, D, D, I, D, I, D, D, },
+/* EX */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, I, D, D, },
+/* IN */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, I, D, D, },
+/* NS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, D, D, D, I, D, D, },
+/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
+/* QU */ { P, I, I, I, I, I, P, P, P, I, I, P, I, P, I, I, I, P, I, I, I, I, I, I, I, I, I, I, I, I, },
+/* IS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, D, D, I, D, D, },
+/* NU */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, I, I, P, I, D, D, D, D, D, D, I, D, I, D, D, },
+/* PO */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, D, I, D, D, },
+/* PR */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, I, I, I, I, I, I, I, D, I, I, I, },
+/* SY */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D, D, D, D, I, D, I, D, D, },
+/* AL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, D, I, D, D, },
+/* H2 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, I, I, D, D, I, D, D, },
+/* H3 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, D, I, D, D, },
+/* ID */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, D, D, D, I, D, D, },
+/* JL */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, I, I, D, I, I, D, D, D, I, D, D, },
+/* JV */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, I, I, D, D, I, D, D, },
+/* JT */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, D, I, D, D, },
+/* HL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D, D, D, D, I, D, I, D, D, },
+/* RI */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D, D, D, I, D, I, I, D, D, },
+/* ZWJ */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, I, D, D, I, D, I, I, I, I, },
+/* EB */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, I, I, D, I, },
+/* EM */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D, D, D, I, D, I, I, D, D, },
/* "" */
/* before */
};
diff --git a/lib/unilbrk/lbrktables.h b/lib/unilbrk/lbrktables.h
index 8111dc8fc7..e3671136d5 100644
--- a/lib/unilbrk/lbrktables.h
+++ b/lib/unilbrk/lbrktables.h
@@ -21,22 +21,22 @@
enum
{
- /* Values >= 27 are resolved at run time. */
- LBP_BK = 27, /* mandatory break */
+ /* Values >= 30 are resolved at run time. */
+ LBP_BK = 30, /* mandatory break */
/*LBP_CR, carriage return - not used here because it's a DOSism */
/*LBP_LF, line feed - not used here because it's a DOSism */
- LBP_CM = 28, /* attached characters and combining marks */
+ LBP_CM = 31, /* attached characters and combining marks */
/*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG, surrogates - not used here because they are not characters */
LBP_WJ = 0, /* word joiner */
- LBP_ZW = 29, /* zero width space */
+ LBP_ZW = 32, /* zero width space */
LBP_GL = 1, /* non-breaking (glue) */
- LBP_SP = 30, /* space */
+ LBP_SP = 33, /* space */
LBP_B2 = 2, /* break opportunity before and after */
LBP_BA = 3, /* break opportunity after */
LBP_BB = 4, /* break opportunity before */
LBP_HY = 5, /* hyphen */
- LBP_CB = 31, /* contingent break opportunity */
+ LBP_CB = 34, /* contingent break opportunity */
LBP_CL = 6, /* closing punctuation */
LBP_CP = 7, /* closing parenthesis */
LBP_EX = 8, /* exclamation/interrogation */
@@ -49,7 +49,7 @@ enum
LBP_PO = 15, /* postfix (numeric) */
LBP_PR = 16, /* prefix (numeric) */
LBP_SY = 17, /* symbols allowing breaks */
- LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */
+ LBP_AI = 35, /* ambiguous (alphabetic or ideograph) */
LBP_AL = 18, /* ordinary alphabetic and symbol characters */
/*LBP_CJ, conditional Japanese starters, resolved to NS */
LBP_H2 = 19, /* Hangul LV syllable */
@@ -60,8 +60,11 @@ enum
LBP_JV = 23, /* Hangul V Jamo */
LBP_JT = 24, /* Hangul T Jamo */
LBP_RI = 26, /* regional indicator */
- LBP_SA = 33, /* complex context (South East Asian) */
- LBP_XX = 34 /* unknown */
+ LBP_SA = 36, /* complex context (South East Asian) */
+ LBP_ZWJ = 27, /* zero width joiner */
+ LBP_EB = 28, /* emoji base */
+ LBP_EM = 29, /* emoji modifier */
+ LBP_XX = 37 /* unknown */
};
#include "lbrkprop1.h"
@@ -92,7 +95,7 @@ unilbrkprop_lookup (ucs4_t uc)
#define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */
-extern const unsigned char unilbrk_table[27][27];
+extern const unsigned char unilbrk_table[30][30];
/* We don't support line breaking of complex-context dependent characters
(Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp
index 9f795621bd..937b50ef51 100755
--- a/lib/uniname/gen-uninames.lisp
+++ b/lib/uniname/gen-uninames.lisp
@@ -196,7 +196,7 @@
) ) )
(format ostream "};~%")
|#
- (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
+ (format ostream "static const struct { uint32_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
(1+ (length words-by-length))
)
(let ((extra-offset 0)
diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h
index ec5ee5cd48..a639237e50 100644
--- a/lib/uniwbrk.in.h
+++ b/lib/uniwbrk.in.h
@@ -53,7 +53,12 @@ enum
WBP_RI = 13,
WBP_DQ = 14,
WBP_SQ = 15,
- WBP_HL = 16
+ WBP_HL = 16,
+ WBP_ZWJ = 17,
+ WBP_EB = 18,
+ WBP_EM = 19,
+ WBP_GAZ = 20,
+ WBP_EBG = 21
};
/* Return the Word_Break property of a Unicode character. */
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 55314714a2..becf89e533 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -39,6 +39,8 @@ FUNC (const UNIT *s, size_t n, char *p)
-1 at the very beginning of the string. */
int secondlast_compchar_prop = -1;
+ size_t ri_count = 0;
+
/* Don't break inside multibyte characters. */
memset (p, 0, n);
@@ -51,10 +53,10 @@ FUNC (const UNIT *s, size_t n, char *p)
/* No break at the start of the string. */
if (last_char_prop >= 0)
{
- /* No break between CR and LF. */
+ /* No break between CR and LF (WB3). */
if (last_char_prop == WBP_CR && prop == WBP_LF)
/* *p = 0 */;
- /* Break before and after newlines. */
+ /* Break before and after newlines (WB3a, WB3b). */
else if ((last_char_prop == WBP_CR
|| last_char_prop == WBP_LF
|| last_char_prop == WBP_NEWLINE)
@@ -62,8 +64,12 @@ FUNC (const UNIT *s, size_t n, char *p)
|| prop == WBP_LF
|| prop == WBP_NEWLINE))
*p = 1;
+ /* No break within emoji zwj sequence (WB3c). */
+ else if (last_char_prop == WBP_ZWJ &&
+ (prop == WBP_GAZ || prop == WBP_EBG))
+ /* *p = 0 */;
/* Ignore Format and Extend characters. */
- else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
+ else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
{
/* No break in these situations (see UAX #29):
@@ -75,16 +81,8 @@ FUNC (const UNIT *s, size_t n, char *p)
Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12)
HL × DQ HL (WB7b)
HL DQ × HL (WB7c)
- (ALetter | HL) × (ALetter | HL) (WB5)
- (ALetter | HL) × Numeric (WB9)
- Numeric × (ALetter | HL) (WB10)
- Numeric × Numeric (WB8)
- HL × SQ (WB7a)
- Katakana × Katakana (WB13)
- (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
- ExtendNumLet × ExtendNumLet (WB13a)
- ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
- Regional_Indicator × Regional_Indicator (WB13c)
+ ^ (RI RI)* RI × RI (WB15)
+ [^RI] (RI RI)* RI × RI (WB16)
*/
/* No break across certain punctuation. Also, disable word
breaks that were recognized earlier (due to lookahead of
@@ -108,27 +106,29 @@ FUNC (const UNIT *s, size_t n, char *p)
*last_compchar_ptr = 0;
/* *p = 0; */
}
- /* Break after Format and Extend characters. */
+ /* Break before RI only if an even number of RIs
+ precede it (WB15, WB16). */
+ else if (last_compchar_prop == WBP_RI && prop == WBP_RI)
+ {
+ if (ri_count % 2 == 0)
+ *p = 1;
+ /* else *p = 0 */
+ }
+ /* Break after Format and Extend character. */
else if (last_compchar_prop == WBP_EXTEND
|| last_compchar_prop == WBP_FORMAT)
*p = 1;
else
{
- /* Normalize property value to table index,
- skipping 5 properties: WBP_EXTEND,
- WBP_FORMAT, WBP_NEWLINE, WBP_CR, and
- WBP_LF. */
- int last_compchar_prop_index = last_compchar_prop;
- int prop_index = prop;
-
- if (last_compchar_prop_index >= WBP_EXTEND)
- last_compchar_prop_index -= 5;
-
- if (prop_index >= WBP_EXTEND)
- prop_index -= 5;
+ int last_compchar_index =
+ uniwbrk_prop_index[last_compchar_prop];
+ int index = uniwbrk_prop_index[prop];
+ /* Break between unknown pair (WB999). */
+ if (last_compchar_index < 0 || index < 0)
+ *p = 1;
/* Perform a single table lookup. */
- if (uniwbrk_table[last_compchar_prop_index][prop_index])
+ else if (uniwbrk_table[last_compchar_index][index])
*p = 1;
/* else *p = 0; */
}
@@ -136,17 +136,23 @@ FUNC (const UNIT *s, size_t n, char *p)
}
last_char_prop = prop;
- /* Ignore Format and Extend characters, except at the start
- of the line. */
+
+ /* Ignore Format, Extend and ZWJ characters, except at the
+ start of the line. */
if (last_compchar_prop < 0
|| last_compchar_prop == WBP_CR
|| last_compchar_prop == WBP_LF
|| last_compchar_prop == WBP_NEWLINE
- || !(prop == WBP_EXTEND || prop == WBP_FORMAT))
+ || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
{
secondlast_compchar_prop = last_compchar_prop;
last_compchar_prop = prop;
last_compchar_ptr = p;
+
+ if (prop == WBP_RI)
+ ri_count++;
+ else
+ ri_count = 0;
}
s += count;
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 566d576d19..8140bf7c55 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -20,6 +20,32 @@
/* Specification. */
#include "wbrktable.h"
+const int uniwbrk_prop_index[22] =
+{
+ 0, /* WBP_OTHER */
+ 1, /* WBP_KATAKANA */
+ 2, /* WBP_ALETTER */
+ 3, /* WBP_MIDNUMLET */
+ 4, /* WBP_MIDLETTER */
+ 5, /* WBP_MIDNUM */
+ 6, /* WBP_NUMERIC */
+ 7, /* WBP_EXTENDNUMLET */
+ -1, /* WBP_EXTEND */
+ -1, /* WBP_FORMAT */
+ -1, /* WBP_NEWLINE */
+ -1, /* WBP_CR */
+ -1, /* WBP_LF */
+ -1, /* WBP_RI */
+ 8, /* WBP_DQ */
+ 9, /* WBP_SQ */
+ 10, /* WBP_HL */
+ -1, /* WBP_ZWJ */
+ 11, /* WBP_EB */
+ 12, /* WBP_EM */
+ -1, /* WBP_GAZ */
+ 13 /* WBP_EBG */
+};
+
/* This table contains the following rules (see UAX #29):
last current
@@ -33,24 +59,30 @@
(ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
ExtendNumLet × ExtendNumLet (WB13a)
ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
- Regional_Indicator × Regional_Indicator (WB13c)
+ (E_Base | EBG) × E_Modifier (WB14)
+
+ Note that the following rules are not handled here but in the loop in u-wordbreaks.h:
+ - Rules that need to look back at, or ahead to, the second character (WB6, WB7, WB7b, WB7c, WB11, WB12)
+ - Rules with a higher precedence than the "ignore" rule (WB4), such as WB3c
*/
-const unsigned char uniwbrk_table[12][12] =
-{ /* current: OTHER MIDNUMLET NUMERIC DQ */
- /* KATAKANA MIDLETTER EXTENDNUMLET SQ */
- /* ALETTER MIDNUM RI HL */
+const unsigned char uniwbrk_table[14][14] =
+{ /* current: OTHER MIDNUMLET NUMERIC SQ EM */
+ /* KATAKANA MIDLETTER EXNUMLET HL EBG */
+ /* ALETTER MIDNUM DQ EB */
/* last */
- /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 },
- /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0 },
- /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0 },
- /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0 },
- /* WBP_RI */ { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1 },
- /* WBP_DQ */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_SQ */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_HL */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0 }
+ /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1 },
+ /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 },
+ /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 },
+ /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 },
+ /* WBP_DQ */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_SQ */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_HL */ { 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1 },
+ /* WBP_EB */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1 },
+ /* WBP_EM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_EBG */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1 }
};
diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h
index 66b3457028..4a1f1ed821 100644
--- a/lib/uniwbrk/wbrktable.h
+++ b/lib/uniwbrk/wbrktable.h
@@ -15,4 +15,5 @@
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
-extern const unsigned char uniwbrk_table[12][12];
+extern const int uniwbrk_prop_index[22];
+extern const unsigned char uniwbrk_table[14][14];
diff --git a/lib/uniwidth/width.c b/lib/uniwidth/width.c
index d1993d677f..07168f24cf 100644
--- a/lib/uniwidth/width.c
+++ b/lib/uniwidth/width.c
@@ -32,7 +32,7 @@
* - Zero width characters; generated from
* "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
*/
-static const unsigned char nonspacing_table_data[36*64] = {
+static const unsigned char nonspacing_table_data[38*64] = {
/* 0x0000-0x01ff */
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
@@ -73,7 +73,7 @@ static const unsigned char nonspacing_table_data[36*64] = {
0x00, 0x00, 0xc0, 0xfb, 0xef, 0x3e, 0x00, 0x00, /* 0x0800-0x083f */
0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
- 0x00, 0x00, 0x00, 0x00, 0xf8, 0xff, 0xff, 0xff, /* 0x08c0-0x08ff */
+ 0x00, 0x00, 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08c0-0x08ff */
0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, /* 0x0900-0x093f */
0xfe, 0x21, 0xfe, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
@@ -135,7 +135,7 @@ static const unsigned char nonspacing_table_data[36*64] = {
/* 0x1800-0x19ff */
0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
+ 0x60, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
0x00, 0x00, 0x00, 0x00, 0x87, 0x01, 0x04, 0x0e, /* 0x1900-0x193f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
@@ -158,7 +158,7 @@ static const unsigned char nonspacing_table_data[36*64] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d00-0x1d3f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d40-0x1d7f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d80-0x1dbf */
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f, 0xf0, /* 0x1dc0-0x1dff */
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f, 0xf8, /* 0x1dc0-0x1dff */
/* 0x2000-0x21ff */
0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
0x00, 0x00, 0x00, 0x00, 0xdf, 0xff, 0x00, 0x00, /* 0x2040-0x207f */
@@ -199,7 +199,7 @@ static const unsigned char nonspacing_table_data[36*64] = {
0x44, 0x08, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, /* 0xa800-0xa83f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa840-0xa87f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa880-0xa8bf */
- 0x10, 0x00, 0x00, 0x00, 0xff, 0xff, 0x03, 0x00, /* 0xa8c0-0xa8ff */
+ 0x30, 0x00, 0x00, 0x00, 0xff, 0xff, 0x03, 0x00, /* 0xa8c0-0xa8ff */
0x00, 0x00, 0x00, 0x00, 0xc0, 0x3f, 0x00, 0x00, /* 0xa900-0xa93f */
0x80, 0xff, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa940-0xa97f */
0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc8, 0x13, /* 0xa980-0xa9bf */
@@ -268,7 +268,7 @@ static const unsigned char nonspacing_table_data[36*64] = {
0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x7f, /* 0x11180-0x111bf */
0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x111c0-0x111ff */
/* 0x11200-0x113ff */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0xd3, 0x00, /* 0x11200-0x1123f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0xd3, 0x40, /* 0x11200-0x1123f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11240-0x1127f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11280-0x112bf */
0x00, 0x00, 0x00, 0x80, 0xf8, 0x07, 0x00, 0x00, /* 0x112c0-0x112ff */
@@ -277,8 +277,8 @@ static const unsigned char nonspacing_table_data[36*64] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11380-0x113bf */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x113c0-0x113ff */
/* 0x11400-0x115ff */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11400-0x1143f */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11440-0x1147f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, /* 0x11400-0x1143f */
+ 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11440-0x1147f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x85, /* 0x11480-0x114bf */
0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x114c0-0x114ff */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11500-0x1153f */
@@ -294,6 +294,15 @@ static const unsigned char nonspacing_table_data[36*64] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11740-0x1177f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11780-0x117bf */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x117c0-0x117ff */
+ /* 0x11c00-0x11dff */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x3f, /* 0x11c00-0x11c3f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c40-0x11c7f */
+ 0x00, 0x00, 0xfc, 0xff, 0xff, 0xfc, 0x6d, 0x00, /* 0x11c80-0x11cbf */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11cc0-0x11cff */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11d00-0x11d3f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11d40-0x11d7f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11d80-0x11dbf */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11dc0-0x11dff */
/* 0x16a00-0x16bff */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16a00-0x16a3f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16a40-0x16a7f */
@@ -348,13 +357,22 @@ static const unsigned char nonspacing_table_data[36*64] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1db40-0x1db7f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1db80-0x1dbbf */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1dbc0-0x1dbff */
+ /* 0x1e000-0x1e1ff */
+ 0x7f, 0xff, 0xff, 0xf9, 0xdb, 0x07, 0x00, 0x00, /* 0x1e000-0x1e03f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e040-0x1e07f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e080-0x1e0bf */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e0c0-0x1e0ff */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e100-0x1e13f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e140-0x1e17f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e180-0x1e1bf */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e1c0-0x1e1ff */
/* 0x1e800-0x1e9ff */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e800-0x1e83f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e840-0x1e87f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e880-0x1e8bf */
0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e8c0-0x1e8ff */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e900-0x1e93f */
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e940-0x1e97f */
+ 0xf0, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e940-0x1e97f */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1e980-0x1e9bf */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* 0x1e9c0-0x1e9ff */
};
@@ -376,20 +394,20 @@ static const signed char nonspacing_table_ind[248] = {
-1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
-1, -1, -1, -1, -1, 20, -1, 21, /* 0xf000-0xffff */
22, 23, -1, -1, -1, 24, -1, -1, /* 0x10000-0x10fff */
- 25, 26, 27, 28, -1, -1, -1, -1, /* 0x11000-0x11fff */
+ 25, 26, 27, 28, -1, -1, 29, -1, /* 0x11000-0x11fff */
-1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
-1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
-1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
-1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
- -1, -1, -1, -1, -1, 29, -1, 30, /* 0x16000-0x16fff */
+ -1, -1, -1, -1, -1, 30, -1, 31, /* 0x16000-0x16fff */
-1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
-1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
-1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
-1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
- -1, -1, -1, -1, -1, -1, 31, -1, /* 0x1b000-0x1bfff */
+ -1, -1, -1, -1, -1, -1, 32, -1, /* 0x1b000-0x1bfff */
-1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
- 32, 33, -1, -1, -1, 34, -1, -1, /* 0x1d000-0x1dfff */
- -1, -1, -1, -1, 35, -1, -1, -1 /* 0x1e000-0x1efff */
+ 33, 34, -1, -1, -1, 35, -1, -1, /* 0x1d000-0x1dfff */
+ 36, -1, -1, -1, 37, -1, -1, -1 /* 0x1e000-0x1efff */
};
/* Determine number of column positions required for UC. */
diff --git a/modules/unigbrk/u16-grapheme-breaks b/modules/unigbrk/u16-grapheme-breaks
index aed6720afa..d5f8acf964 100644
--- a/modules/unigbrk/u16-grapheme-breaks
+++ b/modules/unigbrk/u16-grapheme-breaks
@@ -3,10 +3,11 @@ Find grapheme cluster breaks in UTF-16 string.
Files:
lib/unigbrk/u16-grapheme-breaks.c
-tests/macros.h
+lib/unigbrk/u-grapheme-breaks.h
Depends-on:
-unigbrk/uc-is-grapheme-break
+unigbrk/base
+unigbrk/uc-gbrk-prop
unistr/u16-mbtouc
configure.ac:
diff --git a/modules/unigbrk/u32-grapheme-breaks b/modules/unigbrk/u32-grapheme-breaks
index fec52f5b99..55d5a9272a 100644
--- a/modules/unigbrk/u32-grapheme-breaks
+++ b/modules/unigbrk/u32-grapheme-breaks
@@ -3,10 +3,11 @@ Find grapheme cluster breaks in UTF-32 string.
Files:
lib/unigbrk/u32-grapheme-breaks.c
-tests/macros.h
+lib/unigbrk/u-grapheme-breaks.h
Depends-on:
-unigbrk/uc-is-grapheme-break
+unigbrk/base
+unigbrk/uc-gbrk-prop
unistr/u32-mbtouc
configure.ac:
diff --git a/modules/unigbrk/u8-grapheme-breaks b/modules/unigbrk/u8-grapheme-breaks
index 56ca72c4da..5d76bc841c 100644
--- a/modules/unigbrk/u8-grapheme-breaks
+++ b/modules/unigbrk/u8-grapheme-breaks
@@ -3,10 +3,11 @@ Find grapheme cluster breaks in UTF-8 string.
Files:
lib/unigbrk/u8-grapheme-breaks.c
-tests/macros.h
+lib/unigbrk/u-grapheme-breaks.h
Depends-on:
-unigbrk/uc-is-grapheme-break
+unigbrk/base
+unigbrk/uc-gbrk-prop
unistr/u8-mbtouc
configure.ac:
diff --git a/modules/unigbrk/uc-grapheme-breaks b/modules/unigbrk/uc-grapheme-breaks
new file mode 100644
index 0000000000..438b0fdc02
--- /dev/null
+++ b/modules/unigbrk/uc-grapheme-breaks
@@ -0,0 +1,28 @@
+Description:
+Find grapheme cluster breaks.
+
+Files:
+lib/unigbrk/uc-grapheme-breaks.c
+lib/unigbrk/u-grapheme-breaks.h
+
+Depends-on:
+unigbrk/base
+unigbrk/uc-gbrk-prop
+
+configure.ac:
+gl_MODULE_INDICATOR([unigbrk/uc-grapheme-breaks])
+gl_LIBUNISTRING_MODULE([0.9.6], [unigbrk/uc-grapheme-breaks])
+
+Makefile.am:
+# The conditional must be the one gl_LIBUNISTRING_MODULE defines for this
+# module (unigbrk/uc-grapheme-breaks), not the one of u32-grapheme-breaks.
+if LIBUNISTRING_COMPILE_UNIGBRK_UC_GRAPHEME_BREAKS
+lib_SOURCES += unigbrk/uc-grapheme-breaks.c
+endif
+
+Include:
+"unigbrk.h"
+
+License:
+LGPLv3+ or GPLv2
+
+Maintainer:
+Ben Pfaff, Daiki Ueno
diff --git a/modules/unigbrk/uc-grapheme-breaks-tests b/modules/unigbrk/uc-grapheme-breaks-tests
new file mode 100644
index 0000000000..3c211ff5cb
--- /dev/null
+++ b/modules/unigbrk/uc-grapheme-breaks-tests
@@ -0,0 +1,14 @@
+Files:
+tests/unigbrk/test-uc-grapheme-breaks.c
+tests/unigbrk/test-uc-grapheme-breaks.sh
+tests/unigbrk/GraphemeBreakTest.txt
+tests/macros.h
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += unigbrk/test-uc-grapheme-breaks.sh
+check_PROGRAMS += test-uc-grapheme-breaks
+test_uc_grapheme_breaks_SOURCES = unigbrk/test-uc-grapheme-breaks.c
+test_uc_grapheme_breaks_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/tests/unigbrk/test-uc-gbrk-prop.c b/tests/unigbrk/test-uc-gbrk-prop.c
index 7e3cb4699f..726c3e0ce1 100644
--- a/tests/unigbrk/test-uc-gbrk-prop.c
+++ b/tests/unigbrk/test-uc-gbrk-prop.c
@@ -51,6 +51,11 @@ graphemebreakproperty_to_string (int gbp)
CASE(LV)
CASE(LVT)
CASE(RI)
+ CASE(ZWJ)
+ CASE(EB)
+ CASE(EM)
+ CASE(GAZ)
+ CASE(EBG)
}
abort ();
}
diff --git a/tests/unigbrk/test-uc-grapheme-breaks.c b/tests/unigbrk/test-uc-grapheme-breaks.c
new file mode 100644
index 0000000000..7ee02b9c45
--- /dev/null
+++ b/tests/unigbrk/test-uc-grapheme-breaks.c
@@ -0,0 +1,191 @@
+/* Grapheme cluster break function test.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. */
+
+#include <config.h>
+
+/* Specification. */
+#include <unigbrk.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "macros.h"
+
+/* Return the name of the grapheme cluster break property value GBP, that is,
+   the identifier of the matching GBP_* constant without its "GBP_" prefix.
+   Aborts if GBP is none of the values handled below.  */
+static const char *
+graphemebreakproperty_to_string (int gbp)
+{
+  switch (gbp)
+    {
+      /* Each CASE maps GBP_<VALUE> to the string "<VALUE>".  */
+#define CASE(VALUE) case GBP_##VALUE: return #VALUE;
+      CASE(OTHER)
+      CASE(CR)
+      CASE(LF)
+      CASE(CONTROL)
+      CASE(EXTEND)
+      CASE(PREPEND)
+      CASE(SPACINGMARK)
+      CASE(L)
+      CASE(V)
+      CASE(T)
+      CASE(LV)
+      CASE(LVT)
+      CASE(RI)
+      CASE(ZWJ)
+      CASE(EB)
+      CASE(EM)
+      CASE(GAZ)
+      CASE(EBG)
+    }
+  /* Unknown property value.  */
+  abort ();
+}
+
+/* Run uc_grapheme_breaks on the N characters at S and compare its output
+   with EXPECTED.
+   EXPECTED has N elements; EXPECTED[I] is '#' when the break flag for S[I]
+   should be set, and any other character when it should be clear.
+   N must not exceed 16, the size of the local result buffer.
+   FILENAME and LINENO name the test data line being checked; they are only
+   used in the failure report.
+   On the first mismatch, prints the input, the expected flags and the actual
+   flags to stderr and aborts.  */
+static void
+test_uc_grapheme_breaks (const char *expected, ucs4_t *s, size_t n,
+                         const char *filename, int lineno)
+{
+  char breaks[16];
+  size_t i;
+
+  ASSERT (n <= sizeof breaks);
+
+  uc_grapheme_breaks (s, n, breaks);
+  for (i = 0; i < n; i++)
+    if (breaks[i] != (expected[i] == '#'))
+      {
+        size_t j;
+
+        /* Say which data line failed, so the case can be looked up.  */
+        fprintf (stderr, "%s:%d: wrong grapheme breaks:\n", filename, lineno);
+
+        fprintf (stderr, " input:");
+        for (j = 0; j < n; j++)
+          fprintf (stderr, " %02x", (unsigned int) s[j]);
+        putc ('\n', stderr);
+
+        fprintf (stderr, "expected:");
+        for (j = 0; j < n; j++)
+          fprintf (stderr, " %d", expected[j] == '#');
+        putc ('\n', stderr);
+
+        fprintf (stderr, " actual:");
+        for (j = 0; j < n; j++)
+          fprintf (stderr, " %d", breaks[j]);
+        putc ('\n', stderr);
+
+        abort ();
+      }
+}
+
+/* Test driver.  argv[1] names a test data file (GraphemeBreakTest.txt).
+
+   Each line is cut at the first '#'; lines that are then blank are skipped.
+   A remaining line is a sequence of break markers, '÷' (break) or '×'
+   (no break), each followed by a hexadecimal code point, except that the
+   last marker may be followed by the end of the line.  In that case a
+   trailing U+0000 entry is recorded for it.  The markers and code points are
+   collected in parallel arrays and handed to test_uc_grapheme_breaks.
+
+   Exits with status 1 on a usage error, an unreadable file, a syntax error,
+   or a line with more entries than the arrays can hold.  */
+int
+main (int argc, char *argv[])
+{
+  const char *filename;
+  char line[1024];
+  int exit_code;
+  FILE *stream;
+  int lineno;
+
+  if (argc != 2)
+    {
+      fprintf (stderr, "usage: %s FILENAME\n"
+               "where FILENAME is the location of the GraphemeBreakTest.txt\n"
+               "test file.\n", argv[0]);
+      exit (1);
+    }
+
+  filename = argv[1];
+  stream = fopen (filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", filename);
+      exit (1);
+    }
+
+  exit_code = 0;
+  lineno = 0;
+  while (fgets (line, sizeof line, stream))
+    {
+      char *comment;
+      const char *p;
+      ucs4_t s[16];
+      char breaks[16];
+      size_t i = 0;
+
+      lineno++;
+
+      /* Strip the trailing comment, then skip lines with no content.  */
+      comment = strchr (line, '#');
+      if (comment != NULL)
+        *comment = '\0';
+      if (line[strspn (line, " \t\r\n")] == '\0')
+        continue;
+
+      s[0] = 0;
+      p = line;
+      do
+        {
+          p += strspn (p, " \t\r\n");
+
+          /* Refuse to write past the end of s[] and breaks[].  */
+          if (i >= sizeof s / sizeof s[0])
+            {
+              fprintf (stderr, "%s:%d: too many entries on line (limit %d)\n",
+                       filename, lineno, (int) (sizeof s / sizeof s[0]));
+              exit (1);
+            }
+
+          /* The markers are matched as their two-byte UTF-8 encodings.  */
+          if (!strncmp (p, "\303\267" /* ÷ */, 2))
+            {
+              breaks[i] = '#';
+              p += 2;
+            }
+          else if (!strncmp (p, "\303\227" /* × */, 2))
+            {
+              breaks[i] = '_';
+              p += 2;
+            }
+          else
+            {
+              fprintf (stderr, "%s:%d.%d: syntax error expecting '÷' or '×'\n",
+                       filename, lineno, (int) (p - line + 1));
+              exit (1);
+            }
+
+          p += strspn (p, " \t\r\n");
+          if (*p == '\0')
+            /* Final marker of the line: pair it with U+0000.  */
+            s[i] = 0;
+          else
+            {
+              unsigned int next_int;
+              int n;
+
+              if (sscanf (p, "%x%n", &next_int, &n) != 1)
+                {
+                  fprintf (stderr, "%s:%d.%d: syntax error at '%s' "
+                           "expecting hexadecimal Unicode code point number\n",
+                           filename, lineno, (int) (p - line + 1), p);
+                  exit (1);
+                }
+              p += n;
+
+              s[i] = next_int;
+            }
+          p += strspn (p, " \t\r\n");
+          i++;
+        }
+      while (*p != '\0');
+
+      if (i > 0)
+        test_uc_grapheme_breaks (breaks, s, i, filename, lineno);
+    }
+
+  fclose (stream);
+
+  return exit_code;
+}
diff --git a/tests/unigbrk/test-uc-grapheme-breaks.sh b/tests/unigbrk/test-uc-grapheme-breaks.sh
new file mode 100755
index 0000000000..021c9e4e55
--- /dev/null
+++ b/tests/unigbrk/test-uc-grapheme-breaks.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+# Run the test-uc-grapheme-breaks program on the GraphemeBreakTest.txt data
+# file under ${srcdir}/unigbrk.  EXEEXT and srcdir are not set in this script;
+# they are taken from the environment.
+./test-uc-grapheme-breaks${EXEEXT} "${srcdir}/unigbrk/GraphemeBreakTest.txt"
diff --git a/tests/unigbrk/test-uc-is-grapheme-break.c b/tests/unigbrk/test-uc-is-grapheme-break.c
index 6795db16aa..3bf92eeead 100644
--- a/tests/unigbrk/test-uc-is-grapheme-break.c
+++ b/tests/unigbrk/test-uc-is-grapheme-break.c
@@ -45,6 +45,11 @@ graphemebreakproperty_to_string (int gbp)
CASE(LV)
CASE(LVT)
CASE(RI)
+ CASE(ZWJ)
+ CASE(EB)
+ CASE(EM)
+ CASE(GAZ)
+ CASE(EBG)
}
abort ();
}
@@ -81,6 +86,8 @@ main (int argc, char *argv[])
char *comment;
const char *p;
ucs4_t prev;
+ int last_compchar_prop;
+ size_t ri_count;
lineno++;
@@ -90,6 +97,8 @@ main (int argc, char *argv[])
if (line[strspn (line, " \t\r\n")] == '\0')
continue;
+ last_compchar_prop = -1;
+ ri_count = 0;
prev = 0;
p = line;
do
@@ -135,7 +144,30 @@ main (int argc, char *argv[])
next = next_int;
}
- if (uc_is_grapheme_break (prev, next) != should_break)
+ if ((last_compchar_prop == GBP_EB
+ || last_compchar_prop == GBP_EBG)
+ && uc_graphemeclusterbreak_property (next) == GBP_EM)
+ {
+ int prev_gbp = uc_graphemeclusterbreak_property (prev);
+ int next_gbp = uc_graphemeclusterbreak_property (next);
+ fprintf (stderr, "%s:%d: skipping GB10: should join U+%04X (%s) "
+ "and U+%04X (%s)\n",
+ filename, lineno,
+ prev, graphemebreakproperty_to_string (prev_gbp),
+ next, graphemebreakproperty_to_string (next_gbp));
+ }
+ else if (uc_graphemeclusterbreak_property (next) == GBP_RI
+ && ri_count % 2 != 0)
+ {
+ int prev_gbp = uc_graphemeclusterbreak_property (prev);
+ int next_gbp = uc_graphemeclusterbreak_property (next);
+ fprintf (stderr, "%s:%d: skipping GB12: should join U+%04X (%s) "
+ "and U+%04X (%s)\n",
+ filename, lineno,
+ prev, graphemebreakproperty_to_string (prev_gbp),
+ next, graphemebreakproperty_to_string (next_gbp));
+ }
+ else if (uc_is_grapheme_break (prev, next) != should_break)
{
int prev_gbp = uc_graphemeclusterbreak_property (prev);
int next_gbp = uc_graphemeclusterbreak_property (next);
@@ -150,6 +182,16 @@ main (int argc, char *argv[])
p += strspn (p, " \t\r\n");
prev = next;
+
+ if (!(uc_graphemeclusterbreak_property (next) == GBP_EXTEND
+ && (last_compchar_prop == GBP_EB
+ || last_compchar_prop == GBP_EBG)))
+ last_compchar_prop = uc_graphemeclusterbreak_property (next);
+
+ if (uc_graphemeclusterbreak_property (next) == GBP_RI)
+ ri_count++;
+ else
+ ri_count = 0;
}
while (*p != '\0');
}
diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c
index 48888105fb..ce452db49e 100644
--- a/tests/uniwbrk/test-uc-wordbreaks.c
+++ b/tests/uniwbrk/test-uc-wordbreaks.c
@@ -51,6 +51,11 @@ wordbreakproperty_to_string (int wbp)
CASE(DQ)
CASE(SQ)
CASE(HL)
+ CASE(ZWJ)
+ CASE(EB)
+ CASE(EM)
+ CASE(GAZ)
+ CASE(EBG)
}
abort ();
}
diff --git a/tests/uniwidth/test-uc_width2.sh b/tests/uniwidth/test-uc_width2.sh
index 1463d3454b..2ade585056 100755
--- a/tests/uniwidth/test-uc_width2.sh
+++ b/tests/uniwidth/test-uc_width2.sh
@@ -65,8 +65,8 @@ cat > uc_width.ok <<\EOF
0829..082D 0
082E..0858 A
0859..085B 0
-085C..08E2 A
-08E3..0902 0
+085C..08D3 A
+08D4..0902 0
0903..0939 A
093A 0
093B A
@@ -251,7 +251,9 @@ cat > uc_width.ok <<\EOF
17DD 0
17DE..180A A
180B..180E 0
-180F..18A8 A
+180F..1884 A
+1885..1886 0
+1887..18A8 A
18A9 0
18AA..191F A
1920..1922 0
@@ -327,8 +329,8 @@ cat > uc_width.ok <<\EOF
1CF8..1CF9 0
1CFA..1DBF A
1DC0..1DF5 0
-1DF6..1DFB A
-1DFC..1DFF 0
+1DF6..1DFA A
+1DFB..1DFF 0
1E00..200A A
200B..200F 0
2010..2029 A
@@ -376,8 +378,8 @@ A80B 0
A80C..A824 A
A825..A826 0
A827..A8C3 A
-A8C4 0
-A8C5..A8DF A
+A8C4..A8C5 0
+A8C6..A8DF A
A8E0..A8F1 0
A8F2..A925 A
A926..A92D 0
@@ -493,7 +495,9 @@ FFFC..101FC 1
11234 0
11235 1
11236..11237 0
-11238..112DE 1
+11238..1123D 1
+1123E 0
+1123F..112DE 1
112DF 0
112E0..112E2 1
112E3..112EA 0
@@ -507,7 +511,13 @@ FFFC..101FC 1
11366..1136C 0
1136D..1136F 1
11370..11374 0
-11375..114B2 1
+11375..11437 1
+11438..1143F 0
+11440..11441 1
+11442..11444 0
+11445 1
+11446 0
+11447..114B2 1
114B3..114B8 0
114B9 1
114BA 0
@@ -543,7 +553,19 @@ FFFC..101FC 1
11722..11725 0
11726 1
11727..1172B 0
-1172C..16AEF 1
+1172C..11C2F 1
+11C30..11C36 0
+11C37 1
+11C38..11C3D 0
+11C3E..11C91 1
+11C92..11CA7 0
+11CA8..11CA9 1
+11CAA..11CB0 0
+11CB1 1
+11CB2..11CB3 0
+11CB4 1
+11CB5..11CB6 0
+11CB7..16AEF 1
16AF0..16AF4 0
16AF5..16B2F 1
16B30..16B36 0
@@ -575,9 +597,21 @@ FFFC..101FC 1
1DA9B..1DA9F 0
1DAA0 1
1DAA1..1DAAF 0
-1DAB0..1E8CF 1
+1DAB0..1DFFF 1
+1E000..1E006 0
+1E007 1
+1E008..1E018 0
+1E019..1E01A 1
+1E01B..1E021 0
+1E022 1
+1E023..1E024 0
+1E025 1
+1E026..1E02A 0
+1E02B..1E8CF 1
1E8D0..1E8D6 0
-1E8D7..1FFFF 1
+1E8D7..1E943 1
+1E944..1E94A 0
+1E94B..1FFFF 1
20000..3FFFF 2
40000..E0000 1
E0001 0