summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@zytor.com>2012-02-25 15:29:37 -0800
committerH. Peter Anvin <hpa@zytor.com>2012-02-25 15:29:37 -0800
commit9fa2e72997b698107b7c5a02e8f03e84e8d8fb74 (patch)
tree7de3095e072a8b4a88631fc31055b17ab82a375f
parent5a24fdd547f4c02fe46c37b84a020febfa41bfd2 (diff)
downloadnasm-9fa2e72997b698107b7c5a02e8f03e84e8d8fb74.tar.gz
Add support for UTF-16BE and UTF-32BE
Add support for bigendian UTF-16 and UTF-32, and (for symmetry) add explicitly littleendian operators. Signed-off-by: H. Peter Anvin <hpa@zytor.com>
-rw-r--r--doc/nasmdoc.src15
-rw-r--r--nasm.h6
-rw-r--r--strfunc.c156
-rw-r--r--test/utf.asm52
-rw-r--r--tokens.dat4
5 files changed, 226 insertions, 7 deletions
diff --git a/doc/nasmdoc.src b/doc/nasmdoc.src
index 217c12a8..3c912eef 100644
--- a/doc/nasmdoc.src
+++ b/doc/nasmdoc.src
@@ -1596,9 +1596,12 @@ operands to \c{DW}, and so forth.
\S{unicode} \I{UTF-16}\I{UTF-32}\i{Unicode} Strings
-The special operators \i\c{__utf16__} and \i\c{__utf32__} allows
-definition of Unicode strings. They take a string in UTF-8 format and
-converts it to (littleendian) UTF-16 or UTF-32, respectively.
+The special operators \i\c{__utf16__}, \i\c{__utf16le__},
+\i\c{__utf16be__}, \i\c{__utf32__}, \i\c{__utf32le__} and
+\i\c{__utf32be__} allow definition of Unicode strings. They take a
+string in UTF-8 format and convert it to UTF-16 or UTF-32,
+respectively. Unless the \c{be} forms are specified, the output is
+littleendian.
For example:
@@ -1608,9 +1611,9 @@ For example:
\c dw u('C:\WINDOWS'), 0 ; Pathname in UTF-16
\c dd w(`A + B = \u206a`), 0 ; String in UTF-32
-\c{__utf16__} and \c{__utf32__} can be applied either to strings
-passed to the \c{DB} family instructions, or to character constants in
-an expression context.
+The UTF operators can be applied either to strings passed to the
+\c{DB} family instructions, or to character constants in an expression
+context.
\S{fltconst} \I{floating-point, constants}Floating-Point Constants
diff --git a/nasm.h b/nasm.h
index 46e4c051..5b4b5ff3 100644
--- a/nasm.h
+++ b/nasm.h
@@ -224,7 +224,7 @@ enum token_type { /* token types, other than chars */
TOKEN_SEG, /* SEG */
TOKEN_WRT, /* WRT */
TOKEN_FLOATIZE, /* __floatX__ */
- TOKEN_STRFUNC, /* __utf16__, __utf32__ */
+ TOKEN_STRFUNC, /* __utf16*__, __utf32*__ */
};
enum floatize {
@@ -241,7 +241,11 @@ enum floatize {
/* Must match the list in string_transform(), in strfunc.c */
enum strfunc {
STRFUNC_UTF16, /* __utf16__: table slot 0, handled by utf8_to_16le */
+ STRFUNC_UTF16LE, /* __utf16le__: explicit little-endian UTF-16 */
+ STRFUNC_UTF16BE, /* __utf16be__: big-endian UTF-16 */
STRFUNC_UTF32, /* __utf32__: handled by utf8_to_32le */
+ STRFUNC_UTF32LE, /* __utf32le__: explicit little-endian UTF-32 */
+ STRFUNC_UTF32BE, /* __utf32be__: big-endian UTF-32 */
};
size_t string_transform(char *, size_t, char **, enum strfunc);
diff --git a/strfunc.c b/strfunc.c
index a34f738a..4b5af40b 100644
--- a/strfunc.c
+++ b/strfunc.c
@@ -111,6 +111,84 @@ static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
}
/*
+ * Convert a string in UTF-8 format to UTF-16BE
+ */
+static size_t utf8_to_16be(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x) \
+ do { \
+ uint16_t _y = (x); \
+ if (op) { \
+ WRITECHAR(op, _y >> 8); \
+ WRITECHAR(op, _y); \
+ } \
+ outlen++; \
+ } while (0)
+
+ size_t outlen = 0; /* 16-bit units produced; counted even when op is NULL */
+ int expect = 0; /* continuation bytes still owed by the current sequence */
+ uint8_t c;
+ uint32_t v = 0, vmin = 0; /* code point being assembled; smallest value allowed for its length */
+
+ while (len--) {
+ c = *str++;
+
+ if (expect) {
+ if ((c & 0xc0) != 0x80) {
+ expect = 0;
+ return -1; /* continuation byte required but not found */
+ } else {
+ v = (v << 6) | (c & 0x3f);
+ if (!--expect) {
+ if (v < vmin || v > 0x10ffff ||
+ (v >= 0xd800 && v <= 0xdfff)) {
+ return -1; /* overlong, beyond U+10FFFF, or an encoded surrogate */
+ } else if (v > 0xffff) {
+ v -= 0x10000;
+ EMIT(0xd800 | (v >> 10)); /* high (lead) surrogate always comes first */
+ EMIT(0xdc00 | (v & 0x3ff)); /* low (trail) surrogate follows */
+ } else {
+ EMIT(v);
+ }
+ }
+ continue;
+ }
+ }
+
+ if (c < 0x80) {
+ EMIT(c); /* plain ASCII maps to a single unit */
+ } else if (c < 0xc0 || c >= 0xfe) {
+ /* Invalid UTF-8 */
+ return -1;
+ } else if (c < 0xe0) {
+ v = c & 0x1f;
+ expect = 1;
+ vmin = 0x80;
+ } else if (c < 0xf0) {
+ v = c & 0x0f;
+ expect = 2;
+ vmin = 0x800;
+ } else if (c < 0xf8) {
+ v = c & 0x07;
+ expect = 3;
+ vmin = 0x10000;
+ } else if (c < 0xfc) {
+ v = c & 0x03; /* 5-byte lead: parsed, then rejected by the 0x10ffff limit */
+ expect = 4;
+ vmin = 0x200000;
+ } else {
+ v = c & 0x01; /* 6-byte lead: parsed, then rejected by the 0x10ffff limit */
+ expect = 5;
+ vmin = 0x4000000;
+ }
+ }
+
+ return expect ? (size_t)-1 : outlen << 1; /* truncated input is an error; otherwise size in bytes */
+
+#undef EMIT
+}
+
+/*
* Convert a string in UTF-8 format to UTF-32LE
*/
static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
@@ -174,6 +252,80 @@ static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
#undef EMIT
}
+/*
+ * Convert a string in UTF-8 format to UTF-32BE
+ */
+static size_t utf8_to_32be(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x) \
+ do { \
+ uint32_t _y = (x); \
+ if (op) { \
+ WRITECHAR(op,_y >> 24); \
+ WRITECHAR(op,_y >> 16); \
+ WRITECHAR(op,_y >> 8); \
+ WRITECHAR(op,_y); \
+ } \
+ outlen++; \
+ } while (0)
+
+ size_t outlen = 0; /* code points produced; counted even when op is NULL */
+ int expect = 0; /* continuation bytes still owed by the current sequence */
+ uint8_t c;
+ uint32_t v = 0, vmin = 0; /* code point being assembled; smallest value allowed for its length */
+
+ while (len--) {
+ c = *str++;
+
+ if (expect) {
+ if ((c & 0xc0) != 0x80) {
+ return -1; /* continuation byte required but not found */
+ } else {
+ v = (v << 6) | (c & 0x3f);
+ if (!--expect) {
+ /* NOTE(review): no v > 0x10ffff check here, unlike utf8_to_16be - confirm this is intended */
+ if (v < vmin || (v >= 0xd800 && v <= 0xdfff)) {
+ return -1; /* overlong encoding or an encoded surrogate */
+ } else {
+ EMIT(v);
+ }
+ }
+ continue;
+ }
+ }
+
+ if (c < 0x80) {
+ EMIT(c); /* plain ASCII maps to a single code point */
+ } else if (c < 0xc0 || c >= 0xfe) {
+ /* Invalid UTF-8 */
+ return -1;
+ } else if (c < 0xe0) {
+ v = c & 0x1f; /* 2-byte sequence */
+ expect = 1;
+ vmin = 0x80;
+ } else if (c < 0xf0) {
+ v = c & 0x0f; /* 3-byte sequence */
+ expect = 2;
+ vmin = 0x800;
+ } else if (c < 0xf8) {
+ v = c & 0x07; /* 4-byte sequence */
+ expect = 3;
+ vmin = 0x10000;
+ } else if (c < 0xfc) {
+ v = c & 0x03; /* 5-byte sequence */
+ expect = 4;
+ vmin = 0x200000;
+ } else {
+ v = c & 0x01; /* 6-byte sequence */
+ expect = 5;
+ vmin = 0x4000000;
+ }
+ }
+
+ return expect ? (size_t)-1 : outlen << 2; /* truncated input is an error; otherwise size in bytes */
+
+#undef EMIT
+}
+
typedef size_t (*transform_func)(uint8_t *, size_t, char *);
/*
@@ -186,7 +338,11 @@ size_t string_transform(char *str, size_t len, char **out, enum strfunc func)
/* This should match enum strfunc in nasm.h */
static const transform_func str_transforms[] = {
utf8_to_16le,
+ utf8_to_16le,
+ utf8_to_16be,
+ utf8_to_32le,
utf8_to_32le,
+ utf8_to_32be,
};
transform_func transform = str_transforms[func];
size_t outlen;
diff --git a/test/utf.asm b/test/utf.asm
index 4b894f87..00207dc6 100644
--- a/test/utf.asm
+++ b/test/utf.asm
@@ -2,6 +2,10 @@
;Testname=error; Arguments=-fbin -outf.bin -DERROR; Files=stdout stderr utf.bin
%define u(x) __utf16__(x)
%define w(x) __utf32__(x)
+%define ul(x) __utf16le__(x)
+%define wl(x) __utf32le__(x)
+%define ub(x) __utf16be__(x)
+%define wb(x) __utf32be__(x)
db `Test \u306a\U0001abcd\n`
dw u(`Test \u306a\U0001abcd\n`)
@@ -21,10 +25,58 @@
mov ebx,u(`\U0001abcd`)
mov ecx,w(`\U0001abcd`)
+ db `Test \u306a\U0001abcd\n`
+ dw ul(`Test \u306a\U0001abcd\n`)
+ dd wl(`Test \u306a\U0001abcd\n`)
+
+ db `\u306a`
+ db `\xe3\x81\xaa`
+
+ dw __utf16le__ "Hello, World!"
+
+ nop
+
+ mov ax,ul(`a`)
+ mov bx,ul(`\u306a`)
+ mov cx,ul(`\xe3\x81\xaa`)
+ mov eax,ul(`ab`)
+ mov ebx,ul(`\U0001abcd`)
+ mov ecx,wl(`\U0001abcd`)
+
+ db `Test \u306a\U0001abcd\n`
+ dw ub(`Test \u306a\U0001abcd\n`)
+ dd wb(`Test \u306a\U0001abcd\n`)
+
+ db `\u306a`
+ db `\xe3\x81\xaa`
+
+ dw __utf16be__ "Hello, World!"
+
+ nop
+
+ mov ax,ub(`a`)
+ mov bx,ub(`\u306a`)
+ mov cx,ub(`\xe3\x81\xaa`)
+ mov eax,ub(`ab`)
+ mov ebx,ub(`\U0001abcd`)
+ mov ecx,wb(`\U0001abcd`)
+
%ifdef ERROR
dw __utf16__ 33
dw __utf16__, 46
dw __utf16__("Hello, World!",16)
dw __utf16__("Hello, World!",16
dw u(`\xff`)
+
+ dw __utf16le__ 33
+ dw __utf16le__, 46
+ dw __utf16le__("Hello, World!",16)
+ dw __utf16le__("Hello, World!",16
+ dw ul(`\xff`)
+
+ dw __utf16be__ 33
+ dw __utf16be__, 46
+ dw __utf16be__("Hello, World!",16)
+ dw __utf16be__("Hello, World!",16
+ dw ub(`\xff`)
%endif
diff --git a/tokens.dat b/tokens.dat
index 25179fad..bb5fccb0 100644
--- a/tokens.dat
+++ b/tokens.dat
@@ -91,7 +91,11 @@ __float128h__
% TOKEN_STRFUNC, 0, STRFUNC_{__*__}
__utf16__
+__utf16le__
+__utf16be__
__utf32__
+__utf32le__
+__utf32be__
% TOKEN_*, 0, 0
seg