Add isFOO_utf8_safe() macros

The original API does not check that we aren't reading beyond the end of a buffer, apparently assuming that we could keep malformed UTF-8 out by use of gatekeepers, but that is currently impossible. This commit adds "safe" macros for determining whether a UTF-8 sequence represents an alphabetic character, a digit, etc. Each new macro takes an extra parameter pointing to the end of the sequence, so that reading beyond the input string can be avoided. The macros aren't yet completely safe: they don't test that there is at least a single valid byte in the input, except by an assertion in DEBUGGING builds. This is because they are typically called from code that already makes that assumption, and that frequently tests the current byte for one thing or another.
author: Karl Williamson <khw@cpan.org> 2016-12-15 16:30:27 -0700
committer: Karl Williamson <khw@cpan.org> 2016-12-23 16:48:34 -0700
commit: da8c1a98236a9f56df850c47705cb3046d6636aa (patch)
tree: 678fc6d876d96346a0beb5b19b9b9249c17df9ea /utf8.c
parent: 9dfb44ee59033dc1f1f858d46a05a3f3c8ce85d9 (diff)
download: perl-da8c1a98236a9f56df850c47705cb3046d6636aa.tar.gz
1 files changed, 80 insertions, 3 deletions
diff --git a/utf8.c b/utf8.c
index 6b2c12856d..44aada5ce6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2116,7 +2116,7 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(classnum, tmpbuf);
+    return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
 }
 
 /* Internal function so we can deprecate the external one, and call
@@ -2137,7 +2137,7 @@ Perl__is_uni_perl_idcont(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idcont(tmpbuf);
+    return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
 }
 
 bool
@@ -2145,7 +2145,7 @@ Perl__is_uni_perl_idstart(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idstart(tmpbuf);
+    return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
 }
 
 UV
@@ -2445,6 +2445,40 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
     return swash_fetch(*swash, p, TRUE) != 0;
 }
 
+PERL_STATIC_INLINE bool
+S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, SV **swash,
+		          const char *const swashname, SV* const invlist)
+{
+    /* Returns a boolean giving whether or not the UTF8-encoded character that
+     * starts at <p>, and extends no further than <e - 1>, is in the swash
+     * indicated by <swashname>.  Input failing isUTF8_CHAR(p, e) is passed to
+     * _force_out_malformed_utf8_message(), which is not expected to return.
+     * <swash> points to where the swash is cached: one is generated and
+     * stored there only if <*swash> is null.  <invlist> is NULL or an
+     * inversion list that defines the swash; if not NULL, it saves time
+     * during initialization of the swash. */
+
+    PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
+
+    if (! isUTF8_CHAR(p, e)) {
+        _force_out_malformed_utf8_message(p, e, 0, 1);
+        NOT_REACHED; /* NOTREACHED */
+    }
+
+    if (!*swash) {
+        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+        *swash = _core_swash_init("utf8",
+
+                                  /* Only use the name if there is no inversion
+                                   * list; otherwise will go out to disk */
+                                  (invlist) ? "" : swashname,
+
+                                  &PL_sv_undef, 1, 0, invlist, &flags);
+    }
+
+    return swash_fetch(*swash, p, TRUE) != 0;
+}
+
 bool
 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
 {
@@ -2459,6 +2493,21 @@ Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
 }
 
 bool
+Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
+                                                            const U8 * const e)
+{
+    PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
+    /* Is the character at <p>, ending before <e>, in class <classnum>? */
+    assert(classnum < _FIRST_NON_SWASH_CC); /* indexes the tables below */
+
+    return is_utf8_common_with_len(p,
+                                   e,
+                                   &PL_utf8_swash_ptrs[classnum],
+                                   swash_property_names[classnum],
+                                   PL_XPosix_ptrs[classnum]);
+}
+
+bool
 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
 {
     SV* invlist = NULL;
@@ -2472,6 +2521,20 @@ Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
 }
 
 bool
+Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
+{
+    SV* invlist = NULL;
+    /* Is the character at <p>, ending before <e>, in "_Perl_IDStart"? */
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
+
+    if (! PL_utf8_perl_idstart) { /* no swash stored yet: supply an invlist */
+        invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
+    }
+    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
+                                      "_Perl_IDStart", invlist);
+}
+
+bool
 Perl__is_utf8_xidstart(pTHX_ const U8 *p)
 {
     PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
@@ -2495,6 +2558,20 @@ Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
 }
 
 bool
+Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
+{
+    SV* invlist = NULL;
+    /* Is the character at <p>, ending before <e>, in "_Perl_IDCont"? */
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
+
+    if (! PL_utf8_perl_idcont) { /* no swash stored yet: supply an invlist */
+        invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
+    }
+    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
+                                   "_Perl_IDCont", invlist);
+}
+
+bool
 Perl__is_utf8_idcont(pTHX_ const U8 *p)
 {
     PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
author	Karl Williamson <khw@cpan.org>	2016-12-15 16:30:27 -0700
committer	Karl Williamson <khw@cpan.org>	2016-12-23 16:48:34 -0700
commit	da8c1a98236a9f56df850c47705cb3046d6636aa (patch)
tree	678fc6d876d96346a0beb5b19b9b9249c17df9ea /utf8.c
parent	9dfb44ee59033dc1f1f858d46a05a3f3c8ce85d9 (diff)
download	perl-da8c1a98236a9f56df850c47705cb3046d6636aa.tar.gz