summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-12-03 13:12:51 -0700
committerKarl Williamson <khw@cpan.org>2015-12-09 23:43:21 -0700
commit36eaa8111efe6b0ebe974f6b26ed667c1206dc9f (patch)
tree111d349b4deb4ee6cd3cb2d8b030107a4fd616e7 /utf8.c
parent5af9bc9750ba392c2a4adfdc3ced4b0b301f656a (diff)
downloadperl-36eaa8111efe6b0ebe974f6b26ed667c1206dc9f.tar.gz
Skip casing for some non-cased scripts
Characters whose upper, lower, title, or fold case differ from the character itself amount to just 1.5% of the assigned Unicode characters, and this percentage falls with each new Unicode release, as almost all cased scripts have already been encoded. But a lot of code is written assuming a cased language, such as calling uc() or lcfirst(), or doing qr//i. When such code is run on a non-cased language, the work expended in doing the casing is wasted. And casing is expensive. But finding out if a character is cased or not is nearly as expensive, so one might as well just do the casing. However, the Unicode code space is organized so that there are some long stretches of contiguous code points that aren't cased. By adding tests to see if the input code point is in just a few of these ranges, we can quickly rule casing out for most of the non-cased scripts that are of commercial use today, at essentially no expense to handling the more common cased scripts. Testing for just 3 ranges in Plane 0 of Unicode (where most of the code points in common use today reside) allows us to skip doing casing for more than 82% of code points in the plane, including the following languages: Arabic, Chinese, Hebrew, Japanese, Korean, Thai, and the major scripts of India. No longer is a swash generated when trying to case one of these, so runtime memory usage is decreased. (It should be noted that some of these languages have characters scattered in other areas, because the original allocation for them turned out to be not large enough. When changing the case of these other characters, the lookups won't be skipped. But that original allocation included all or nearly all the characters in current common use, so these other characters are comparatively rare.) The comments in the code indicate some candidate non-cased ranges that I chose not to treat specially at this time. The next commit will address planes above Plane 0.
When this command is run on a perl compiled with -O2, no DEBUGGING:

    blead Porting/bench.pl --perlargs="-Ilib -X" --benchfile=plane0_casing_perf /path_to_prior_perl=before_this_commit /path_to_new_perl=after

and file 'plane0_casing_perf' contains

    [
        'string::casing::greek' => {
            desc => 'should be no change',
            setup => 'my $a = "\x{3B1}"', # GREEK SMALL LETTER ALPHA
            code => 'uc($a)'
        },
        'string::casing::hebrew' => {
            desc => 'yes swash vs no swash',
            setup => 'my $a = "\x{5D0}"', # HEBREW LETTER ALEF
            code => 'uc($a)'
        },
        'string::casing::cjk' => {
            desc => 'yes swash vs no swash',
            setup => 'my $a = "\x{4E01}"',
            code => 'uc($a)'
        },
        'string::casing::korean' => {
            desc => 'yes swash vs no swash',
            setup => 'my $a = "\x{AC00}"',
            code => 'uc($a)'
        },
    ];

These are the results:

The numbers represent raw counts per loop iteration.

string::casing::cjk
yes swash vs no swash

           before_this_commit    after
           ------------------ --------
    Ir                  931.0    300.0
    Dr                  217.0     93.0
    Dw                   94.0     45.0
    COND                129.0     48.0
    IND                   7.0      4.0
    COND_m                1.5      0.0
    IND_m                 4.0      2.0
    Ir_m1                 0.1      0.0
    Dr_m1                 0.0      0.0
    Dw_m1                 0.0      0.0
    Ir_mm                 0.0      0.0
    Dr_mm                 0.0      0.0
    Dw_mm                 0.0      0.0

string::casing::greek
should be no change

           before_this_commit    after
           ------------------ --------
    Ir                  946.0    920.0
    Dr                  218.0    220.0
    Dw                  100.0    100.0
    COND                127.0    121.0
    IND                   6.0      8.0
    COND_m                0.5      1.3
    IND_m                 2.0      2.0
    Ir_m1                 0.1      0.0
    Dr_m1                 0.0      0.0
    Dw_m1                 0.0      0.0
    Ir_mm                 0.0      0.0
    Dr_mm                 0.0      0.0
    Dw_mm                 0.0      0.0

string::casing::hebrew
yes swash vs no swash

           before_this_commit    after
           ------------------ --------
    Ir                  928.0    290.0
    Dr                  224.0     92.0
    Dw                  100.0     45.0
    COND                129.0     46.0
    IND                   6.0      4.0
    COND_m                0.5      0.0
    IND_m                 2.0      2.0
    Ir_m1                 0.1      0.0
    Dr_m1                 0.0      0.0
    Dw_m1                 0.0      0.0
    Ir_mm                 0.0      0.0
    Dr_mm                 0.0      0.0
    Dw_mm                 0.0      0.0

string::casing::korean
yes swash vs no swash

           before_this_commit    after
           ------------------ --------
    Ir                  953.0    307.6
    Dr                  224.0     93.0
    Dw                  100.0     45.0
    COND                131.0     50.9
    IND                   7.0      4.0
    COND_m                1.5      0.0
    IND_m                 4.0      2.0
    Ir_m1                 0.1      0.0
    Dr_m1                 0.0      0.0
    Dw_m1                 0.0      0.0
    Ir_mm                 0.0      0.0
    Dr_mm                 0.0      0.0
    Dw_mm                 0.0      0.0
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c68
1 files changed, 62 insertions, 6 deletions
diff --git a/utf8.c b/utf8.c
index c009f3b67c..4c43bdec2f 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1919,10 +1919,55 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
PERL_ARGS_ASSERT__TO_UTF8_CASE;
- /* Note that swash_fetch() doesn't output warnings for these because it
- * assumes we will */
- if (uv1 >= UNICODE_SURROGATE_FIRST) {
- if (UNLIKELY(uv1 <= UNICODE_SURROGATE_LAST)) {
+ /* For code points that don't change case, we already know that the output
+ * of this function is the unchanged input, so we can skip doing look-ups
+ * for them. Unfortunately the case-changing code points are scattered
+ * around. But there are some long consecutive ranges where there are no
+ * case changing code points. By adding tests, we can eliminate the lookup
+ * for all the ones in such ranges. This is currently done here only for
+ * just a few cases where the scripts are in common use in modern commerce
+ * (and scripts adjacent to those which can be included without additional
+ * tests). */
+
+ if (uv1 >= 0x0590) {
+ /* This keeps from needing further processing the code points most
+ * likely to be used in the following non-cased scripts: Hebrew,
+ * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
+ * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
+ * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
+ if (uv1 < 0x10A0) {
+ goto cases_to_self;
+ }
+
+ /* The following largish code point ranges also don't have case
+ * changes, but khw didn't think they warranted extra tests to speed
+ * them up (which would slightly slow down everything else above them):
+ * 1100..139F Hangul Jamo, Ethiopic
+ * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
+ * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
+ * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
+ * Combining Diacritical Marks Extended, Balinese,
+ * Sundanese, Batak, Lepcha, Ol Chiki
+ * 2000..206F General Punctuation
+ */
+
+ if (uv1 >= 0x2D30) {
+
+ /* This keeps the from needing further processing the code points
+ * most likely to be used in the following non-cased major scripts:
+ * CJK, Katakana, Hiragana, plus some less-likely scripts.
+ *
+ * (0x2D30 above might have to be changed to 2F00 in the unlikely
+ * event that Unicode eventually allocates the unused block as of
+ * v8.0 2FE0..2FEF to code points that are cased. khw has verified
+ * that the test suite will start having failures to alert you
+ * should that happen) */
+ if (uv1 < 0xA640) {
+ goto cases_to_self;
+ }
+
+ if (uv1 >= 0xAC00) {
+ if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
if (ckWARN_d(WARN_SURROGATE)) {
const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
@@ -1930,6 +1975,14 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
}
goto cases_to_self;
}
+
+ /* AC00..FAFF Catches Hangul syllables and private use, plus
+ * some others */
+ if (uv1 < 0xFB00) {
+ goto cases_to_self;
+
+ }
+
if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
&& ckWARN_d(WARN_DEPRECATED))
@@ -1944,9 +1997,12 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
}
goto cases_to_self;
}
+ }
+ }
- /* Note that non-characters are perfectly legal, so no warning should
- * be given */
+ /* Note that non-characters are perfectly legal, so no warning should
+ * be given. There are so few of them, that it isn't worth the extra
+ * tests to avoid swash creation */
}
if (!*swashp) /* load on-demand */