summary | refs | log | tree | commit | diff
path: root/ext/pcre/pcrelib/pcre_valid_utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/pcre/pcrelib/pcre_valid_utf8.c')
-rw-r--r-- ext/pcre/pcrelib/pcre_valid_utf8.c 17
1 files changed, 16 insertions, 1 deletions
diff --git a/ext/pcre/pcrelib/pcre_valid_utf8.c b/ext/pcre/pcrelib/pcre_valid_utf8.c
index 3c81dc9ecc..d54d3bd2d6 100644
--- a/ext/pcre/pcrelib/pcre_valid_utf8.c
+++ b/ext/pcre/pcrelib/pcre_valid_utf8.c
@@ -70,6 +70,20 @@ Arguments:
Returns: < 0 if the string is a valid UTF-8 string
>= 0 otherwise; the value is the offset of the bad byte
+
+Bad bytes can be:
+
+ . An isolated byte whose most significant bits are 0x80, because this
+ can only correctly appear within a UTF-8 character;
+
+ . A byte whose most significant bits are 0xc0, but whose other bits indicate
+ that there are more than 3 additional bytes (i.e. an RFC 2279 starting
+ byte, which is no longer valid under RFC 3629);
+
+ .
+
+The returned offset may also be equal to the length of the string; this means
+that one or more bytes are missing from the final UTF-8 character.
*/
int
@@ -91,7 +105,8 @@ for (p = string; length-- > 0; p++)
if (c < 128) continue;
if (c < 0xc0) return p - string;
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
- if (length < ab || ab > 3) return p - string;
+ if (ab > 3) return p - string; /* Too many for RFC 3629 */
+ if (length < ab) return p + 1 + length - string; /* Missing bytes */
length -= ab;
/* Check top bits in the second byte */