Add support for UTF-16BE and UTF-32BE

Add support for bigendian UTF-16 and UTF-32, and (for symmetry) add explicitly littleendian operators. Signed-off-by: H. Peter Anvin <hpa@zytor.com>
author: H. Peter Anvin <hpa@zytor.com> 2012-02-25 15:29:37 -0800
committer: H. Peter Anvin <hpa@zytor.com> 2012-02-25 15:29:37 -0800
commit: 9fa2e72997b698107b7c5a02e8f03e84e8d8fb74 (patch)
tree: 7de3095e072a8b4a88631fc31055b17ab82a375f
parent: 5a24fdd547f4c02fe46c37b84a020febfa41bfd2 (diff)
download: nasm-9fa2e72997b698107b7c5a02e8f03e84e8d8fb74.tar.gz
5 files changed, 226 insertions, 7 deletions
diff --git a/doc/nasmdoc.src b/doc/nasmdoc.src
index 217c12a8..3c912eef 100644
--- a/doc/nasmdoc.src
+++ b/doc/nasmdoc.src
@@ -1596,9 +1596,12 @@ operands to \c{DW}, and so forth.
 
 \S{unicode} \I{UTF-16}\I{UTF-32}\i{Unicode} Strings
 
-The special operators \i\c{__utf16__} and \i\c{__utf32__} allows
-definition of Unicode strings.  They take a string in UTF-8 format and
-converts it to (littleendian) UTF-16 or UTF-32, respectively.
+The special operators \i\c{__utf16__}, \i\c{__utf16le__},
+\i\c{__utf16be__}, \i\c{__utf32__}, \i\c{__utf32le__} and
+\i\c{__utf32be__} allow definition of Unicode strings.  They take a
+string in UTF-8 format and convert it to UTF-16 or UTF-32,
+respectively.  Unless the \c{be} forms are specified, the output is
+littleendian.
 
 For example:
 
@@ -1608,9 +1611,9 @@ For example:
 \c       dw u('C:\WINDOWS'), 0       ; Pathname in UTF-16
 \c       dd w(`A + B = \u206a`), 0   ; String in UTF-32
 
-\c{__utf16__} and \c{__utf32__} can be applied either to strings
-passed to the \c{DB} family instructions, or to character constants in
-an expression context.
+The UTF operators can be applied either to strings passed to the
+\c{DB} family instructions, or to character constants in an expression
+context.
 
 \S{fltconst} \I{floating-point, constants}Floating-Point Constants
 
diff --git a/nasm.h b/nasm.h
index 46e4c051..5b4b5ff3 100644
--- a/nasm.h
+++ b/nasm.h
@@ -224,7 +224,7 @@ enum token_type { /* token types, other than chars */
     TOKEN_SEG,          /* SEG */
     TOKEN_WRT,          /* WRT */
     TOKEN_FLOATIZE,     /* __floatX__ */
-    TOKEN_STRFUNC,      /* __utf16__, __utf32__ */
+    TOKEN_STRFUNC,      /* __utf16*__, __utf32*__ */
 };
 
 enum floatize {
@@ -241,7 +241,11 @@ enum floatize {
 /* Must match the list in string_transform(), in strfunc.c */
 enum strfunc {
     STRFUNC_UTF16,
+    STRFUNC_UTF16LE,    /* __utf16le__: UTF-16, explicitly little-endian */
+    STRFUNC_UTF16BE,    /* __utf16be__: UTF-16, big-endian */
     STRFUNC_UTF32,
+    STRFUNC_UTF32LE,    /* __utf32le__: UTF-32, explicitly little-endian */
+    STRFUNC_UTF32BE,    /* __utf32be__: UTF-32, big-endian */
 };
 
 size_t string_transform(char *, size_t, char **, enum strfunc);
diff --git a/strfunc.c b/strfunc.c
index a34f738a..4b5af40b 100644
--- a/strfunc.c
+++ b/strfunc.c
@@ -111,6 +111,84 @@ static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
 }
 
 /*
+ * Convert a string in UTF-8 format to UTF-16BE.  Returns the output size
+ * in bytes, or (size_t)-1 on invalid input; op == NULL only sizes it.
+ */
+static size_t utf8_to_16be(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x)                                 \
+    do {                                        \
+        uint16_t _y = (x);                      \
+        if (op) {                               \
+            WRITECHAR(op, _y >> 8);             \
+            WRITECHAR(op, _y);                  \
+        }                                       \
+        outlen++;                               \
+    } while (0)
+    size_t outlen = 0;		/* 16-bit units produced so far */
+    int expect = 0;		/* continuation bytes still owed */
+    uint8_t c;
+    uint32_t v = 0, vmin = 0;	/* value being built; overlong limit */
+
+    while (len--) {
+	c = *str++;
+
+	if (expect) {
+	    if ((c & 0xc0) != 0x80) {
+		/* Not a 10xxxxxx continuation byte */
+		return (size_t)-1;
+	    } else {
+		v = (v << 6) | (c & 0x3f);
+		if (!--expect) {
+		    if (v < vmin || v > 0x10ffff ||
+			(v >= 0xd800 && v <= 0xdfff)) {
+			return (size_t)-1;	/* overlong, too big, surrogate */
+		    } else if (v > 0xffff) {
+			v -= 0x10000;
+			EMIT(0xd800 | (v >> 10));	/* high surrogate first */
+			EMIT(0xdc00 | (v & 0x3ff));	/* then low surrogate */
+		    } else {
+			EMIT(v);
+		    }
+		}
+		continue;
+	    }
+	}
+
+	if (c < 0x80) {
+	    EMIT(c);
+	} else if (c < 0xc0 || c >= 0xfe) {
+	    /* Invalid UTF-8 */
+	    return (size_t)-1;
+	} else if (c < 0xe0) {
+	    v = c & 0x1f;
+	    expect = 1;
+	    vmin = 0x80;
+	} else if (c < 0xf0) {
+	    v = c & 0x0f;
+	    expect = 2;
+	    vmin = 0x800;
+	} else if (c < 0xf8) {
+	    v = c & 0x07;
+	    expect = 3;
+	    vmin = 0x10000;
+	} else if (c < 0xfc) {	/* 5-byte form: always fails the range test */
+	    v = c & 0x03;
+	    expect = 4;
+	    vmin = 0x200000;
+	} else {		/* 6-byte form: always fails the range test */
+	    v = c & 0x01;
+	    expect = 5;
+	    vmin = 0x4000000;
+	}
+    }
+
+    /* Input ending inside a multi-byte sequence is an error */
+    return expect ? (size_t)-1 : outlen << 1;
+#undef EMIT
+}
+
+/*
  * Convert a string in UTF-8 format to UTF-32LE
  */
 static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
@@ -174,6 +252,80 @@ static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
 #undef EMIT
 }
 
+/* Convert a UTF-8 string to UTF-32BE.  With op == NULL nothing is written.
+ * Returns the output size in bytes (4 per code point), or (size_t)-1 if the
+ * input is malformed, overlong, a surrogate, or ends mid-sequence. */
+static size_t utf8_to_32be(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x)                                         \
+    do {                                                \
+        uint32_t _y = (x);                              \
+        if (op) {                                       \
+            WRITECHAR(op,_y >> 24);                     \
+            WRITECHAR(op,_y >> 16);                     \
+            WRITECHAR(op,_y >> 8);                      \
+            WRITECHAR(op,_y);                           \
+        }                                               \
+        outlen++;                                       \
+    } while (0)
+
+    size_t outlen = 0;		/* 32-bit units emitted so far */
+    int expect = 0;		/* continuation bytes still expected */
+    uint8_t c;
+    uint32_t v = 0, vmin = 0;	/* value being built; overlong limit */
+
+    while (len--) {
+	c = *str++;
+
+	if (expect) {
+	    if ((c & 0xc0) != 0x80) {	/* not a 10xxxxxx continuation byte */
+		return -1;
+	    } else {
+		v = (v << 6) | (c & 0x3f);
+		if (!--expect) {
+		    /* NOTE(review): no v > 0x10ffff test here, unlike utf8_to_16be -- confirm intended */
+		    if (v < vmin || (v >= 0xd800 && v <= 0xdfff)) {
+			return -1;	/* overlong encoding or surrogate */
+		    } else {
+			EMIT(v);
+		    }
+		}
+		continue;
+	    }
+	}
+
+	if (c < 0x80) {
+	    EMIT(c);		/* single-byte character */
+	} else if (c < 0xc0 || c >= 0xfe) {
+	    /* Invalid UTF-8 */
+	    return -1;
+	} else if (c < 0xe0) {	/* lead byte, 1 continuation byte follows */
+	    v = c & 0x1f;
+	    expect = 1;
+	    vmin = 0x80;
+	} else if (c < 0xf0) {	/* lead byte, 2 continuation bytes follow */
+	    v = c & 0x0f;
+	    expect = 2;
+	    vmin = 0x800;
+	} else if (c < 0xf8) {	/* lead byte, 3 continuation bytes follow */
+	    v = c & 0x07;
+	    expect = 3;
+	    vmin = 0x10000;
+	} else if (c < 0xfc) {	/* lead byte, 4 continuation bytes follow */
+	    v = c & 0x03;
+	    expect = 4;
+	    vmin = 0x200000;
+	} else {		/* lead byte, 5 continuation bytes follow */
+	    v = c & 0x01;
+	    expect = 5;
+	    vmin = 0x4000000;
+	}
+    }
+
+    /* A sequence left unfinished at end of input is an error */
+    return expect ? (size_t)-1 : outlen << 2;
+#undef EMIT
+}
+
+
 typedef size_t (*transform_func)(uint8_t *, size_t, char *);
 
 /*
@@ -186,7 +338,11 @@ size_t string_transform(char *str, size_t len, char **out, enum strfunc func)
     /* This should match enum strfunc in nasm.h */
     static const transform_func str_transforms[] = {
 	utf8_to_16le,
+	utf8_to_16le,
+	utf8_to_16be,
+	utf8_to_32le,
 	utf8_to_32le,
+	utf8_to_32be,
     };
     transform_func transform = str_transforms[func];
     size_t outlen;
diff --git a/test/utf.asm b/test/utf.asm
index 4b894f87..00207dc6 100644
--- a/test/utf.asm
+++ b/test/utf.asm
@@ -2,6 +2,10 @@
 ;Testname=error; Arguments=-fbin -outf.bin -DERROR; Files=stdout stderr utf.bin
 %define u(x) __utf16__(x)
 %define w(x) __utf32__(x)
+%define ul(x) __utf16le__(x)
+%define wl(x) __utf32le__(x)
+%define ub(x) __utf16be__(x)
+%define wb(x) __utf32be__(x)
 
 	db `Test \u306a\U0001abcd\n`
 	dw u(`Test \u306a\U0001abcd\n`)
@@ -21,10 +25,58 @@
 	mov ebx,u(`\U0001abcd`)
 	mov ecx,w(`\U0001abcd`)
 
+	db `Test \u306a\U0001abcd\n`
+	dw ul(`Test \u306a\U0001abcd\n`)
+	dd wl(`Test \u306a\U0001abcd\n`)
+
+	db `\u306a`
+	db `\xe3\x81\xaa`
+
+	dw __utf16le__ "Hello, World!"
+
+	nop
+
+	mov ax,ul(`a`)
+	mov bx,ul(`\u306a`)
+	mov cx,ul(`\xe3\x81\xaa`)
+	mov eax,ul(`ab`)
+	mov ebx,ul(`\U0001abcd`)
+	mov ecx,wl(`\U0001abcd`)
+	
+	db `Test \u306a\U0001abcd\n`
+	dw ub(`Test \u306a\U0001abcd\n`)
+	dd wb(`Test \u306a\U0001abcd\n`)
+
+	db `\u306a`
+	db `\xe3\x81\xaa`
+
+	dw __utf16be__ "Hello, World!"
+
+	nop
+
+	mov ax,ub(`a`)
+	mov bx,ub(`\u306a`)
+	mov cx,ub(`\xe3\x81\xaa`)
+	mov eax,ub(`ab`)
+	mov ebx,ub(`\U0001abcd`)
+	mov ecx,wb(`\U0001abcd`)
+
 %ifdef ERROR
 	dw __utf16__ 33
 	dw __utf16__, 46
 	dw __utf16__("Hello, World!",16)
 	dw __utf16__("Hello, World!",16
 	dw u(`\xff`)
+
+	dw __utf16le__ 33
+	dw __utf16le__, 46
+	dw __utf16le__("Hello, World!",16)
+	dw __utf16le__("Hello, World!",16
+	dw ul(`\xff`)
+
+	dw __utf16be__ 33
+	dw __utf16be__, 46
+	dw __utf16be__("Hello, World!",16)
+	dw __utf16be__("Hello, World!",16
+	dw ub(`\xff`)
 %endif
diff --git a/tokens.dat b/tokens.dat
index 25179fad..bb5fccb0 100644
--- a/tokens.dat
+++ b/tokens.dat
@@ -91,7 +91,11 @@ __float128h__
 
 % TOKEN_STRFUNC, 0, STRFUNC_{__*__}
 __utf16__
+__utf16le__
+__utf16be__
 __utf32__
+__utf32le__
+__utf32be__
 
 % TOKEN_*, 0, 0
 seg
author	H. Peter Anvin <hpa@zytor.com>	2012-02-25 15:29:37 -0800
committer	H. Peter Anvin <hpa@zytor.com>	2012-02-25 15:29:37 -0800
commit	9fa2e72997b698107b7c5a02e8f03e84e8d8fb74 (patch)
tree	7de3095e072a8b4a88631fc31055b17ab82a375f
parent	5a24fdd547f4c02fe46c37b84a020febfa41bfd2 (diff)
download	nasm-9fa2e72997b698107b7c5a02e8f03e84e8d8fb74.tar.gz