From 02eb7b47b8a6793752e5b001af6e62c374b2c440 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Sun, 26 Nov 2000 19:01:05 +0000 Subject: Make utf8_length() and utf8_distance() (the latter of which is unused at the moment) to be less forgiving about bad UTF-8. p4raw-id: //depot/perl@7869 --- utf8.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'utf8.c') diff --git a/utf8.c b/utf8.c index fc625dc464..d25b43bbe7 100644 --- a/utf8.c +++ b/utf8.c @@ -357,8 +357,8 @@ Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen) =for apidoc|utf8_length|U8 *s|U8 *e Return the length of the UTF-8 char encoded string C<s> in characters. -Stops at string C<e>. If C<e E<lt> s> or if the scan would end up -past C<e>, return -1. +Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end +up past C<e>, croaks. =cut */ @@ -369,12 +369,12 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) STRLEN len = 0; if (e < s) - return -1; + Perl_croak(aTHX_ "panic: utf8_length: unexpected end"); while (s < e) { - STRLEN t = UTF8SKIP(s); + U8 t = UTF8SKIP(s); if (e - s < t) - return -1; + Perl_croak(aTHX_ "panic: utf8_length: unaligned end"); s += t; len++; } @@ -385,22 +385,32 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) /* utf8_distance(a,b) returns the number of UTF8 characters between the pointers a and b */ -I32 +IV Perl_utf8_distance(pTHX_ U8 *a, U8 *b) { - I32 off = 0; + IV off = 0; + if (a < b) { while (a < b) { - a += UTF8SKIP(a); + U8 c = UTF8SKIP(a); + + if (b - a < c) + Perl_croak(aTHX_ "panic: utf8_distance: unaligned end"); + a += c; off--; } } else { while (b < a) { - b += UTF8SKIP(b); + U8 c = UTF8SKIP(b); + + if (a - b < c) + Perl_croak(aTHX_ "panic: utf8_distance: unaligned end"); + b += c; off++; } } + return off; } -- cgit v1.2.1