summary | refs | log | tree | commit | diff
path: root/utf8.c
diff options
context:
space:
mode:
author: Andreas König <a.koenig@mind.de> 2000-10-24 16:01:26 +0200
committer: Jarkko Hietaniemi <jhi@iki.fi> 2000-10-24 17:55:17 +0000
commit: 067a85ef854d787b90f02ceec6c3398c3103295b (patch)
tree: aef8b158234263f6ad54afd6c597bf4ee6243cc2 /utf8.c
parent: e2c57c3ea2e1fe3adabb752ab93e7f4b7746a103 (diff)
download: perl-067a85ef854d787b90f02ceec6c3398c3103295b.tar.gz
Fix the bug reported in
Subject: Encode bug?
Message-ID: <m3lmveqwh5.fsf@ak-71.mind.de>
Also make is_utf8_char() stricter.
p4raw-id: //depot/perl@7425
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 38
1 file changed, 28 insertions, 10 deletions
diff --git a/utf8.c b/utf8.c
index 98236ed170..2e4833983a 100644
--- a/utf8.c
+++ b/utf8.c
@@ -104,27 +104,41 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
/* Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
* The actual number of bytes in the UTF-8 character will be returned if it
* is valid, otherwise 0. */
-int
+STRLEN
Perl_is_utf8_char(pTHX_ U8 *s)
{
U8 u = *s;
- int slen, len;
+ STRLEN slen, len;
+ UV uv, ouv;
- if (!(u & 0x80))
+ if (u <= 0x7f)
return 1;
- if (!(u & 0x40))
+ if (u >= 0x80 && u <= 0xbf)
return 0;
len = UTF8SKIP(s);
+ if (len < 2 || (u >= 0xc0 && u <= 0xfd && s[1] < 0x80))
+ return 0;
+
slen = len - 1;
s++;
+ uv = u;
+ ouv = uv;
while (slen--) {
if ((*s & 0xc0) != 0x80)
return 0;
+ uv = (uv << 6) | (*s & 0x3f);
+ if (uv < ouv)
+ return 0;
+ ouv = uv;
s++;
}
+
+ if (UTF8LEN(uv) < len)
+ return 0;
+
return len;
}
@@ -140,16 +154,20 @@ string, false otherwise.
bool
Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
{
- U8* x=s;
- U8* send=s+len;
- int c;
+ U8* x = s;
+ U8* send = s + len;
+ STRLEN c;
+
while (x < send) {
c = is_utf8_char(x);
+ if (!c)
+ return FALSE;
x += c;
- if (!c || x > send)
- return 0;
+ if (x > send)
+ return FALSE;
}
- return 1;
+
+ return TRUE;
}
/*