summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2000-07-31 04:15:02 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2000-07-31 04:15:02 +0000
commitdea0fc0b9e5a61b92c4be2ecafe0a8d9396d4cc1 (patch)
tree349418eaad4586c0413b7deb1e36247b8098aa26 /utf8.c
parent24142eb29e13a3c1fffe9021ceab90a4be7b9da1 (diff)
downloadperl-dea0fc0b9e5a61b92c4be2ecafe0a8d9396d4cc1.tar.gz
The swallow_bom() saga continues. The #23 of require.t
(UTF16-LE) still fails (silently, no output) but the #22 (UTF16-BE) seems to be working now. The root of the failure may be in sv_gets(): is it UTF-16LE-aware, especially when it comes to line endings? p4raw-id: //depot/perl@6469
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c40
1 files changed, 18 insertions, 22 deletions
diff --git a/utf8.c b/utf8.c
index d00b9f3de8..6a99d9dff1 100644
--- a/utf8.c
+++ b/utf8.c
@@ -321,26 +321,25 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
}
/*
- * Convert native or reversed UTF-16 to UTF-8.
+ * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
*
* Destination must be pre-extended to 3/2 source. Do not use in-place.
* We optimize for native, for obvious reasons. */
-/* There are several problems with utf16_to_utf8().
- * (1) U16 is not necessarily *exactly* two bytes.
- * (2) Secondly, no check is made for odd length.
- * (3) Thirdly, the "Malformed UTF-16 surrogate" should probably be
- * a hard error (and it should be listed in perldiag).
- * (4) The tests (in comp/t/require.t) are a joke: the UTF16 BOM
- * really ought to be followed by valid UTF16 characters.
- * See swallow_bom() in toke.c.
- * --Mike Guy */
U8*
-Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen)
+Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
{
- U16* pend = p + bytelen / 2;
+ U8* pend;
+ U8* dstart = d;
+
+ if (bytelen & 1)
+ Perl_croak("panic: utf16_to_utf8: odd bytelen");
+
+ pend = p + bytelen;
+
while (p < pend) {
- UV uv = *p++;
+ UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
+ p += 2;
if (uv < 0x80) {
*d++ = uv;
continue;
@@ -352,13 +351,9 @@ Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen)
}
if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dTHR;
- int low = *p++;
- if (low < 0xdc00 || low >= 0xdfff) {
- if (ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-16 surrogate");
- p--;
- uv = 0xfffd;
- }
+ UV low = *p++;
+ if (low < 0xdc00 || low >= 0xdfff)
+ Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
}
if (uv < 0x10000) {
@@ -375,13 +370,14 @@ Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen)
continue;
}
}
+ *newlen = d - dstart;
return d;
}
/* Note: this one is slightly destructive of the source. */
U8*
-Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen)
+Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
{
U8* s = (U8*)p;
U8* send = s + bytelen;
@@ -391,7 +387,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen)
s[1] = tmp;
s += 2;
}
- return utf16_to_utf8(p, d, bytelen);
+ return utf16_to_utf8(p, d, bytelen, newlen);
}
/* for now these are all defined (inefficiently) in terms of the utf8 versions */