diff options
author | Karl Williamson <khw@cpan.org> | 2016-12-15 16:30:27 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-12-23 16:48:34 -0700 |
commit | da8c1a98236a9f56df850c47705cb3046d6636aa (patch) | |
tree | 678fc6d876d96346a0beb5b19b9b9249c17df9ea /utf8.c | |
parent | 9dfb44ee59033dc1f1f858d46a05a3f3c8ce85d9 (diff) | |
download | perl-da8c1a98236a9f56df850c47705cb3046d6636aa.tar.gz |
Add isFOO_utf8_safe() macros
The original API does not check that we aren't reading beyond the end of
a buffer, apparently assuming that we could keep malformed UTF-8 out by
use of gatekeepers, but that is currently impossible. This commit adds
"safe" macros for determining if a UTF-8 sequence represents
an alphabetic, a digit, etc. Each new macro has an extra parameter
pointing to the end of the sequence, so that looking beyond the input
string can be avoided.
The macros aren't currently completely safe, as they don't test that
there is at least a single valid byte in the input, except by an
assertion in DEBUGGING builds. This is because typically they are
called in code that makes that assumption, and frequently tests the
current byte for one thing or another.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 83 |
1 files changed, 80 insertions, 3 deletions
@@ -2116,7 +2116,7 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(classnum, tmpbuf); + return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf)); } /* Internal function so we can deprecate the external one, and call @@ -2137,7 +2137,7 @@ Perl__is_uni_perl_idcont(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idcont(tmpbuf); + return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); } bool @@ -2145,7 +2145,7 @@ Perl__is_uni_perl_idstart(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idstart(tmpbuf); + return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); } UV @@ -2445,6 +2445,40 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, return swash_fetch(*swash, p, TRUE) != 0; } +PERL_STATIC_INLINE bool +S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, SV **swash, + const char *const swashname, SV* const invlist) +{ + /* returns a boolean giving whether or not the UTF8-encoded character that + * starts at <p>, and extending no further than <e - 1> is in the swash + * indicated by <swashname>. <swash> contains a pointer to where the swash + * indicated by <swashname> is to be stored; which this routine will do, so + * that future calls will look at <*swash> and only generate a swash if it + * is not null. <invlist> is NULL or an inversion list that defines the + * swash. If not null, it saves time during initialization of the swash. + */ + + PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN; + + if (! isUTF8_CHAR(p, e)) { + _force_out_malformed_utf8_message(p, e, 0, 1); + NOT_REACHED; /* NOTREACHED */ + } + + if (!*swash) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + *swash = _core_swash_init("utf8", + + /* Only use the name if there is no inversion + * list; otherwise will go out to disk */ + (invlist) ? "" : swashname, + + &PL_sv_undef, 1, 0, invlist, &flags); + } + + return swash_fetch(*swash, p, TRUE) != 0; +} + bool Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) { @@ -2459,6 +2493,21 @@ Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) } bool +Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p, + const U8 * const e) +{ + PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN; + + assert(classnum < _FIRST_NON_SWASH_CC); + + return is_utf8_common_with_len(p, + e, + &PL_utf8_swash_ptrs[classnum], + swash_property_names[classnum], + PL_XPosix_ptrs[classnum]); +} + +bool Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) { SV* invlist = NULL; @@ -2472,6 +2521,20 @@ Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) } bool +Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e) +{ + SV* invlist = NULL; + + PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN; + + if (! PL_utf8_perl_idstart) { + invlist = _new_invlist_C_array(_Perl_IDStart_invlist); + } + return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart, + "_Perl_IDStart", invlist); +} + +bool Perl__is_utf8_xidstart(pTHX_ const U8 *p) { PERL_ARGS_ASSERT__IS_UTF8_XIDSTART; @@ -2495,6 +2558,20 @@ Perl__is_utf8_perl_idcont(pTHX_ const U8 *p) } bool +Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e) +{ + SV* invlist = NULL; + + PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN; + + if (! PL_utf8_perl_idcont) { + invlist = _new_invlist_C_array(_Perl_IDCont_invlist); + } + return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont, + "_Perl_IDCont", invlist); +} + +bool Perl__is_utf8_idcont(pTHX_ const U8 *p) { PERL_ARGS_ASSERT__IS_UTF8_IDCONT; |