summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2018-08-16 16:14:01 -0600
committerTony Cook <tony@develop-help.com>2018-08-30 11:36:12 +1000
commita553774b3d2cea21c792aecbc83e9d349337fe1f (patch)
tree0507181a9e071826302655a085ffd43303f5a5b4
parent9fcae9f15ece3ecae8ea65355fc7b66fad0b8e31 (diff)
downloadperl-a553774b3d2cea21c792aecbc83e9d349337fe1f.tar.gz
Fix script run bug '1' followed by Thai digit
This does not have a ticket, but was pointed out in http://nntp.perl.org/group/perl.perl5.porters/251870 The logic for deciding if it was needed to check if a character is a digit was flawed. (cherry picked from commit 7da8e27b9d7d2be4e770d074405ddb9941e6c8b7)
-rw-r--r--regexec.c46
-rw-r--r--t/re/script_run.t5
2 files changed, 36 insertions, 15 deletions
diff --git a/regexec.c b/regexec.c
index 56d5b10fd8..7c83cbe4ab 100644
--- a/regexec.c
+++ b/regexec.c
@@ -10599,23 +10599,39 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
scripts_match:
/* Here, the script of the character is compatible with that of the
- * run. Either they match exactly, or one or both can be any of
- * several scripts, and the intersection is not empty. If the
- * character is not a decimal digit, we are done with it. Otherwise,
- * it could still fail if it is from a different set of 10 than seen
- * already (or we may not have seen any, and we need to set the
- * sequence). If we have determined a single script and that script
- * only has one set of digits (almost all scripts are like that), then
- * this isn't a problem, as any digit must come from the same sequence.
- * The only scripts that have multiple sequences have been constructed
- * to be 0 in 'script_zeros[]'.
+ * run. That means that in most cases, it continues the script run.
+ * Either it and the run match exactly, or one or both can be in any of
+ * several scripts, and the intersection is not empty. But if the
+ * character is a decimal digit, we need further handling. If we
+ * haven't seen a digit before, it would establish what set of 10 all
+ * must come from; and if we have established a set, we need to check
+ * that this is in it.
*
- * Here we check if it is a digit. */
+ * But there are cases we can rule out without having to look up if
+ * this is a digit:
+ * a. All instances of [0-9] have been dealt with earlier.
+ * b. The next digit encoded by Unicode is 1600 code points further
+ * on, so if the code point in this loop iteration is less than
+ * that, it isn't a digit.
+ * c. Most scripts that have digits have a single set of 10. If
+ * we've encountered a digit in such a script, 'zero_of_run' is
+ * set to the code point (call it z) whose numeric value is 0.
+ * If the code point in this loop iteration is in the range
+ * z..z+9, it is in the script's set of 10, and we've actually
+ * handled it earlier in this function and won't reach this
+ * point. But, code points in that script that aren't in that
+ * range can't be digits, so we don't have to look any such up.
+ * We can tell if this script is such a one by looking at
+ * 'script_zeros[]' for it. It is non-zero iff it has a single
+ * set of digits. This rule doesn't apply if we haven't narrowed
+ * down the possible scripts to a single one yet. Nor if the
+ * zero of the run is '0', as that also hasn't narrowed things
+ * down completely */
if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
- && ( ( zero_of_run == 0
- || ( ( script_of_char >= 0
- && script_zeros[script_of_char] == 0)
- || intersection))))
+ && ( intersection
+ || script_of_char < 0 /* Also implies an intersection */
+ || zero_of_run == '0'
+ || script_zeros[script_of_char] == 0))
{
SSize_t range_zero_index;
range_zero_index = _invlist_search(decimals_invlist, cp);
diff --git a/t/re/script_run.t b/t/re/script_run.t
index ca234d9d4e..10c71034c4 100644
--- a/t/re/script_run.t
+++ b/t/re/script_run.t
@@ -84,6 +84,11 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
# From UTS 39
like("写真だけの結婚式", $script_run, "Mixed Hiragana and Han");
+
+ unlike "\N{THAI DIGIT FIVE}1", $script_run, "Thai digit followed by '1'";
+ unlike "1\N{THAI DIGIT FIVE}", $script_run, "'1' followed by Thai digit ";
+ unlike "\N{BENGALI DIGIT ZERO}\N{CHAKMA DIGIT SEVEN}", $script_run,
+ "Two digits in same extended script but from different sets of 10";
}
# Until fixed, this was skipping the '['