10 files changed, 328 insertions, 113 deletions
diff --git a/Makefile.in b/Makefile.in
index d207acc2..7d32d06a 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -67,8 +67,8 @@ NASM =	nasm.$(O) nasmlib.$(O) raa.$(O) saa.$(O) \
 	output/outobj.$(O) output/outas86.$(O) output/outrdf2.$(O) \
 	output/outdbg.$(O) output/outieee.$(O) output/outmacho.$(O) \
 	preproc.$(O) quote.$(O) pptok.$(O) macros.$(O) \
-	listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) tokhash.$(O) \
-	regvals.$(O) regflags.$(O)
+	listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) strfunc.$(O) \
+	tokhash.$(O) regvals.$(O) regflags.$(O)
 
 NDISASM = ndisasm.$(O) disasm.$(O) sync.$(O) nasmlib.$(O) \
 	insnsd.$(O) insnsb.$(O) insnsn.$(O) regs.$(O) regdis.$(O)
@@ -234,7 +234,7 @@ alldeps: perlreq
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.$(O): assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
  nasm.h nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h config.h
+crc64.$(O): crc64.c compiler.h config.h nasmlib.h
 disasm.$(O): disasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
  nasmlib.h regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.$(O): eval.c compiler.h config.h eval.h float.h insnsi.h labels.h \
@@ -309,6 +309,8 @@ regvals.$(O): regvals.c compiler.h config.h insnsi.h tables.h
 saa.$(O): saa.c compiler.h config.h nasmlib.h saa.h
 stdscan.$(O): stdscan.c compiler.h config.h insns.h insnsi.h nasm.h \
  nasmlib.h quote.h regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 sync.$(O): sync.c compiler.h config.h nasmlib.h sync.h
 tokhash.$(O): tokhash.c compiler.h config.h hashtbl.h insns.h insnsi.h \
  nasm.h nasmlib.h regs.h tokens.h version.h
diff --git a/Mkfiles/msvc.mak b/Mkfiles/msvc.mak
index c2904d85..04188bff 100644
--- a/Mkfiles/msvc.mak
+++ b/Mkfiles/msvc.mak
@@ -180,7 +180,7 @@ everything: all doc rdf
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h
+crc64.$(O): crc64.c compiler.h nasmlib.h
 disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h \
  regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h \
@@ -253,6 +253,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
 saa.$(O): saa.c compiler.h nasmlib.h saa.h
 stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h \
  regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 sync.$(O): sync.c compiler.h nasmlib.h sync.h
 tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tokens.h version.h
diff --git a/Mkfiles/netware.mak b/Mkfiles/netware.mak
index f4ec46b8..c1a970fb 100644
--- a/Mkfiles/netware.mak
+++ b/Mkfiles/netware.mak
@@ -120,7 +120,7 @@ $(OBJDIR)/version.inc: $(PROOT)/version $(PROOT)/version.pl $(OBJDIR)
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.o: assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
  nasm.h nasmlib.h regs.h tables.h tokens.h version.h
-crc64.o: crc64.c compiler.h config.h
+crc64.o: crc64.c compiler.h config.h nasmlib.h
 disasm.o: disasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
  nasmlib.h regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.o: eval.c compiler.h config.h eval.h float.h insnsi.h labels.h nasm.h \
@@ -193,6 +193,8 @@ regvals.o: regvals.c compiler.h config.h insnsi.h tables.h
 saa.o: saa.c compiler.h config.h nasmlib.h saa.h
 stdscan.o: stdscan.c compiler.h config.h insns.h insnsi.h nasm.h nasmlib.h \
  quote.h regs.h stdscan.h tokens.h version.h
+strfunc.o: strfunc.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 sync.o: sync.c compiler.h config.h nasmlib.h sync.h
 tokhash.o: tokhash.c compiler.h config.h hashtbl.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tokens.h version.h
diff --git a/Mkfiles/openwcom.mak b/Mkfiles/openwcom.mak
index fb95f579..aeb42c57 100644
--- a/Mkfiles/openwcom.mak
+++ b/Mkfiles/openwcom.mak
@@ -209,7 +209,7 @@ everything: all doc rdf
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h &
  nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h
+crc64.$(O): crc64.c compiler.h nasmlib.h
 disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h &
  regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h &
@@ -282,6 +282,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
 saa.$(O): saa.c compiler.h nasmlib.h saa.h
 stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h &
  regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h &
+ version.h
 sync.$(O): sync.c compiler.h nasmlib.h sync.h
 tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h &
  nasmlib.h regs.h tokens.h version.h
diff --git a/Mkfiles/owlinux.mak b/Mkfiles/owlinux.mak
index bb10d9e7..ce4dc6d8 100644
--- a/Mkfiles/owlinux.mak
+++ b/Mkfiles/owlinux.mak
@@ -219,7 +219,7 @@ everything: all doc rdf
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h
+crc64.$(O): crc64.c compiler.h nasmlib.h
 disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h \
  regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h \
@@ -292,6 +292,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
 saa.$(O): saa.c compiler.h nasmlib.h saa.h
 stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h \
  regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 sync.$(O): sync.c compiler.h nasmlib.h sync.h
 tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tokens.h version.h
diff --git a/assemble.c b/assemble.c
index 442ed2a4..7ab53ad3 100644
--- a/assemble.c
+++ b/assemble.c
@@ -335,7 +335,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
                         out(offset, segment, &e->offset,
                             OUT_ADDRESS, wsize, e->segment, e->wrt);
                     offset += wsize;
-                } else if (e->type == EOT_DB_STRING) {
+                } else if (e->type == EOT_DB_STRING ||
+			   e->type == EOT_DB_STRING_FREE) {
                     int align;
 
                     out(offset, segment, e->stringval,
@@ -348,6 +349,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
                             OUT_RAWDATA, align, NO_SEG, NO_SEG);
                     }
                     offset += e->stringlen + align;
+		    if (e->type == EOT_DB_STRING_FREE)
+			nasm_free(e->stringval);
                 }
             }
             if (t > 0 && t == instruction->times - 1) {
@@ -365,15 +368,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
     }
 
     if (instruction->opcode == I_INCBIN) {
-        static char fname[FILENAME_MAX];
+        const char *fname = instruction->eops->stringval;
         FILE *fp;
-        int32_t len;
-
-        len = FILENAME_MAX - 1;
-        if (len > instruction->eops->stringlen)
-            len = instruction->eops->stringlen;
-        strncpy(fname, instruction->eops->stringval, len);
-        fname[len] = '\0';
 
 	fp = fopen(fname, "rb");
 	if (!fp) {
@@ -383,17 +379,18 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
             error(ERR_NONFATAL, "`incbin': unable to seek on file `%s'",
                   fname);
 	} else {
-            static char buf[2048];
-            int32_t t = instruction->times;
-            int32_t base = 0;
+            static char buf[4096];
+            size_t t = instruction->times;
+            size_t base = 0;
+	    size_t len;
 
             len = ftell(fp);
             if (instruction->eops->next) {
                 base = instruction->eops->next->offset;
                 len -= base;
                 if (instruction->eops->next->next &&
-                    len > instruction->eops->next->next->offset)
-                    len = instruction->eops->next->next->offset;
+                    len > (size_t)instruction->eops->next->next->offset)
+                    len = (size_t)instruction->eops->next->next->offset;
             }
             /*
              * Dummy call to list->output to give the offset to the
@@ -402,7 +399,7 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
             list->output(offset, NULL, OUT_RAWDATA, 0);
             list->uplevel(LIST_INCBIN);
             while (t--) {
-                int32_t l;
+                size_t l;
 
                 fseek(fp, base, SEEK_SET);
                 l = len;
@@ -660,7 +657,8 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
             osize = 0;
             if (e->type == EOT_DB_NUMBER)
                 osize = 1;
-            else if (e->type == EOT_DB_STRING)
+            else if (e->type == EOT_DB_STRING ||
+		     e->type == EOT_DB_STRING_FREE)
                 osize = e->stringlen;
 
             align = (-osize) % wsize;
@@ -672,16 +670,10 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
     }
 
     if (instruction->opcode == I_INCBIN) {
-        char fname[FILENAME_MAX];
+	const char *fname = instruction->eops->stringval;
         FILE *fp;
-        int32_t len;
-
-        len = FILENAME_MAX - 1;
-        if (len > instruction->eops->stringlen)
-            len = instruction->eops->stringlen;
-        strncpy(fname, instruction->eops->stringval, len);
-        fname[len] = '\0';
-	
+        size_t len;
+
 	fp = fopen(fname, "rb");
 	if (!fp)
             error(ERR_NONFATAL, "`incbin': unable to open file `%s'",
@@ -695,8 +687,8 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
             if (instruction->eops->next) {
                 len -= instruction->eops->next->offset;
                 if (instruction->eops->next->next &&
-                    len > instruction->eops->next->next->offset) {
-                    len = instruction->eops->next->next->offset;
+                    len > (size_t)instruction->eops->next->next->offset) {
+                    len = (size_t)instruction->eops->next->next->offset;
                 }
             }
             return instruction->times * len;
diff --git a/nasm.h b/nasm.h
index fedf8583..ec44f164 100644
--- a/nasm.h
+++ b/nasm.h
@@ -182,6 +182,7 @@ enum token_type {		/* token types, other than chars */
     TOKEN_DBL_AND, TOKEN_DBL_OR, TOKEN_DBL_XOR, /* &&, || and ^^ */
     TOKEN_SEG, TOKEN_WRT,       /* SEG and WRT */
     TOKEN_FLOATIZE,		/* __floatX__ */
+    TOKEN_STRFUNC,		/* __utf16__, __utf32__ */
 };
 
 enum floatize {
@@ -195,6 +196,14 @@ enum floatize {
     FLOAT_128H,
 };
 
+/* Must match the list in string_transform(), in strfunc.c */
+enum strfunc {
+    STRFUNC_UTF16,
+    STRFUNC_UTF32,
+};
+
+size_t string_transform(char *, size_t, char **, enum strfunc);
+
 /*
  * The expression evaluator must be passed a scanner function; a
  * standard scanner is provided as part of nasmlib.c. The
@@ -605,11 +614,14 @@ enum prefixes {			/* instruction prefixes */
     PREFIX_ENUM_LIMIT
 };
 
-enum {                          /* extended operand types */
-    EOT_NOTHING, EOT_DB_STRING, EOT_DB_NUMBER
+enum extop_type {		/* extended operand types */
+    EOT_NOTHING,
+    EOT_DB_STRING,		/* Byte string */
+    EOT_DB_STRING_FREE,		/* Byte string which should be nasm_free'd */
+    EOT_DB_NUMBER,		/* Integer */
 };
 
-enum {                          /* special EA flags */
+enum ea_flags {			/* special EA flags */
     EAF_BYTEOFFS =  1,          /* force offset part to byte size */
     EAF_WORDOFFS =  2,          /* force offset part to [d]word size */
     EAF_TIMESTWO =  4,          /* really do EAX*2 not EAX+EAX */
@@ -643,12 +655,12 @@ typedef struct operand {	/* operand to an instruction */
 
 typedef struct extop {          /* extended operand */
     struct extop *next;         /* linked list */
-    int32_t type;               /* defined above */
-    char *stringval;          /* if it's a string, then here it is */
-    int stringlen;              /* ... and here's how long it is */
-    int32_t segment;            /* if it's a number/address, then... */
+    char *stringval;	        /* if it's a string, then here it is */
+    size_t stringlen;           /* ... and here's how long it is */
     int64_t offset;             /* ... it's given here ... */
+    int32_t segment;            /* if it's a number/address, then... */
     int32_t wrt;                /* ... and here */
+    enum extop_type type;	/* defined above */
 } extop;
 
 /* Prefix positions: each type of prefix goes in a specific slot.
diff --git a/parser.c b/parser.c
index 6fb7e3c7..caff1b18 100644
--- a/parser.c
+++ b/parser.c
@@ -334,6 +334,7 @@ restart_parse:
 	result->opcode == I_DY || result->opcode == I_INCBIN) {
         extop *eop, **tail = &result->eops, **fixptr;
         int oper_num = 0;
+	int32_t sign;
 
         result->eops_float = false;
 
@@ -355,85 +356,114 @@ restart_parse:
             eop->next = NULL;
             eop->type = EOT_NOTHING;
             oper_num++;
+	    sign = +1;
 
+	    /* is_comma_next() here is to distinguish this from
+	       a string used as part of an expression... */
             if (i == TOKEN_STR && is_comma_next()) {
                 eop->type = EOT_DB_STRING;
                 eop->stringval = tokval.t_charptr;
                 eop->stringlen = tokval.t_inttwo;
                 i = stdscan(NULL, &tokval);     /* eat the comma */
-                continue;
-            }
-
-            if ((i == TOKEN_FLOAT && is_comma_next())
-		|| i == '-' || i == '+') {
-                int32_t sign = +1;
-
-                if (i == '+' || i == '-') {
-                    char *save = stdscan_bufptr;
-		    int token = i;
-		    sign = (i == '-') ? -1 : 1;
-                    i = stdscan(NULL, &tokval);
-                    if (i != TOKEN_FLOAT || !is_comma_next()) {
-                        stdscan_bufptr = save;
-                        i = tokval.t_type = token;
-                    }
-                }
-
-                if (i == TOKEN_FLOAT) {
-                    eop->type = EOT_DB_STRING;
-                    result->eops_float = true;
-		    switch (result->opcode) {
-		    case I_DB:
-			eop->stringlen = 1;
-			break;
-		    case I_DW:
-			eop->stringlen = 2;
-			break;
-		    case I_DD:
-                        eop->stringlen = 4;
-			break;
-		    case I_DQ:
-                        eop->stringlen = 8;
-			break;
-		    case I_DT:
-                        eop->stringlen = 10;
-			break;
-		    case I_DO:
-                        eop->stringlen = 16;
-			break;
-		    case I_DY:
-                        error(ERR_NONFATAL, "floating-point constant"
-                              " encountered in DY instruction");
-			eop->stringlen = 0;
-			break;
-		    default:
-                        error(ERR_NONFATAL, "floating-point constant"
-                              " encountered in unknown instruction");
-                        /*
-                         * fix suggested by Pedro Gimeno... original line
-                         * was:
-                         * eop->type = EOT_NOTHING;
-                         */
-                        eop->stringlen = 0;
-			break;
-                    }
-                    eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
-                    tail = &eop->next;
-                    *fixptr = eop;
-                    eop->stringval = (char *)eop + sizeof(extop);
-                    if (!eop->stringlen ||
-                        !float_const(tokval.t_charptr, sign,
-                                     (uint8_t *)eop->stringval,
-                                     eop->stringlen, error))
-                        eop->type = EOT_NOTHING;
-                    i = stdscan(NULL, &tokval); /* eat the comma */
-                    continue;
-                }
-            }
-
-            /* anything else */
-            {
+	    } else if (i == TOKEN_STRFUNC) {
+		bool parens = false;
+		const char *funcname = tokval.t_charptr;
+		enum strfunc func = tokval.t_integer;
+		i = stdscan(NULL, &tokval);
+		if (i == '(') {
+		    parens = true;
+		    i = stdscan(NULL, &tokval);
+		}
+		if (i != TOKEN_STR) {
+		    error(ERR_NONFATAL,
+			  "%s must be followed by a string constant",
+			  funcname);
+			eop->type = EOT_NOTHING;
+		} else {
+		    eop->type = EOT_DB_STRING_FREE;
+		    eop->stringlen =
+			string_transform(tokval.t_charptr, tokval.t_inttwo,
+					 &eop->stringval, func);
+		    if (eop->stringlen == (size_t)-1) {
+			error(ERR_NONFATAL, "invalid string for transform");
+			eop->type = EOT_NOTHING;
+		    }
+		}
+		if (parens && i && i != ')') {
+		    i = stdscan(NULL, &tokval);
+		    if (i != ')') {
+			error(ERR_NONFATAL, "unterminated %s function",
+			      funcname);
+		    }
+		}
+		if (i && i != ',')
+		    i = stdscan(NULL, &tokval);
+	    } else if (i == '-' || i == '+') {
+		char *save = stdscan_bufptr;
+		int token = i;
+		sign = (i == '-') ? -1 : 1;
+		i = stdscan(NULL, &tokval);
+		if (i != TOKEN_FLOAT) {
+		    stdscan_bufptr = save;
+		    i = tokval.t_type = token;
+		    goto is_expression;
+		} else {
+		    goto is_float;
+		}
+            } else if (i == TOKEN_FLOAT) {
+	    is_float:
+		eop->type = EOT_DB_STRING;
+		result->eops_float = true;
+		switch (result->opcode) {
+		case I_DB:
+		    eop->stringlen = 1;
+		    break;
+		case I_DW:
+		    eop->stringlen = 2;
+		    break;
+		case I_DD:
+		    eop->stringlen = 4;
+		    break;
+		case I_DQ:
+		    eop->stringlen = 8;
+		    break;
+		case I_DT:
+		    eop->stringlen = 10;
+		    break;
+		case I_DO:
+		    eop->stringlen = 16;
+		    break;
+		case I_DY:
+		    error(ERR_NONFATAL, "floating-point constant"
+			  " encountered in DY instruction");
+		    eop->stringlen = 0;
+		    break;
+		default:
+		    error(ERR_NONFATAL, "floating-point constant"
+			  " encountered in unknown instruction");
+		    /*
+		     * fix suggested by Pedro Gimeno... original line
+		     * was:
+		     * eop->type = EOT_NOTHING;
+		     */
+		    eop->stringlen = 0;
+		    break;
+		}
+		eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
+		tail = &eop->next;
+		*fixptr = eop;
+		eop->stringval = (char *)eop + sizeof(extop);
+		if (!eop->stringlen ||
+		    !float_const(tokval.t_charptr, sign,
+				 (uint8_t *)eop->stringval,
+				 eop->stringlen, error))
+		    eop->type = EOT_NOTHING;
+		i = stdscan(NULL, &tokval); /* eat the comma */
+	    } else {
+		/* anything else, assume it is an expression */
                 expr *value;
+
+	    is_expression:
                 value = evaluate(stdscan, NULL, &tokval, NULL,
                                  critical, error, NULL);
                 i = tokval.t_type;
diff --git a/strfunc.c b/strfunc.c
new file mode 100644
index 00000000..9fb72706
--- /dev/null
+++ b/strfunc.c
@@ -0,0 +1,167 @@
+/*
+ * strfunc.c
+ *
+ * String transformation functions
+ */
+
+#include "nasmlib.h"
+#include "nasm.h"
+
+/*
+ * Convert a string in UTF-8 format to UTF-16LE.  If op is NULL nothing
+ * is written and only the size is computed.  Returns the output length
+ * in bytes, or (size_t)-1 if the input is not valid UTF-8.
+ */
+static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x) do { if (op) { WRITESHORT(op,x); } outlen++; } while(0)
+
+    size_t outlen = 0;		/* 16-bit units emitted so far */
+    int expect = 0;		/* continuation bytes still expected */
+    uint8_t c;
+    uint32_t v = 0, vmin = 0;	/* value so far; minimum non-overlong value */
+
+    while (len--) {
+	c = *str++;
+
+	if (expect) {
+	    if ((c & 0xc0) != 0x80)
+		return -1;	/* not a continuation byte */
+	    v = (v << 6) | (c & 0x3f);
+	    if (!--expect) {
+		if (v < vmin || v > 0x10ffff ||
+		    (v >= 0xd800 && v <= 0xdfff)) {
+		    return -1;
+		} else if (v > 0xffff) {
+		    /* Above 0xffff: emit as a surrogate pair */
+		    v -= 0x10000;
+		    EMIT(0xd800 | (v >> 10));
+		    EMIT(0xdc00 | (v & 0x3ff));
+		} else {
+		    EMIT(v);
+		}
+	    }
+	    continue;
+	}
+
+	if (c < 0x80) {
+	    EMIT(c);
+	} else if (c < 0xc0 || c >= 0xfe) {
+	    /* Invalid UTF-8: stray continuation byte or 0xfe/0xff */
+	    return -1;
+	} else if (c < 0xe0) {
+	    v = c & 0x1f;
+	    expect = 1;
+	    vmin = 0x80;
+	} else if (c < 0xf0) {
+	    v = c & 0x0f;
+	    expect = 2;
+	    vmin = 0x800;
+	} else if (c < 0xf8) {
+	    v = c & 0x07;
+	    expect = 3;
+	    vmin = 0x10000;
+	} else if (c < 0xfc) {
+	    v = c & 0x03;
+	    expect = 4;
+	    vmin = 0x200000;
+	} else {
+	    v = c & 0x01;
+	    expect = 5;
+	    vmin = 0x4000000;
+	}
+    }
+
+    return expect ? (size_t)-1 : outlen << 1;	/* truncated input is an error */
+
+#undef EMIT
+}
+
+/*
+ * Convert a string in UTF-8 format to UTF-32LE.  If op is NULL nothing
+ * is written and only the size is computed.  Returns the output length
+ * in bytes, or (size_t)-1 if the input is not valid UTF-8.
+ */
+static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x) do { if (op) { WRITELONG(op,x); } outlen++; } while(0)
+
+    size_t outlen = 0;		/* 32-bit units emitted so far */
+    int expect = 0;		/* continuation bytes still expected */
+    uint8_t c;
+    uint32_t v = 0, vmin = 0;	/* value so far; minimum non-overlong value */
+
+    while (len--) {
+	c = *str++;
+
+	if (expect) {
+	    if ((c & 0xc0) != 0x80)
+		return -1;	/* not a continuation byte */
+	    v = (v << 6) | (c & 0x3f);
+	    if (!--expect) {
+		/* Reject overlong encodings and surrogate code points */
+		if (v < vmin || (v >= 0xd800 && v <= 0xdfff))
+		    return -1;
+		EMIT(v);
+	    }
+	    continue;
+	}
+
+	if (c < 0x80) {
+	    EMIT(c);
+	} else if (c < 0xc0 || c >= 0xfe) {
+	    /* Invalid UTF-8: stray continuation byte or 0xfe/0xff */
+	    return -1;
+	} else if (c < 0xe0) {
+	    v = c & 0x1f;
+	    expect = 1;
+	    vmin = 0x80;
+	} else if (c < 0xf0) {
+	    v = c & 0x0f;
+	    expect = 2;
+	    vmin = 0x800;
+	} else if (c < 0xf8) {
+	    v = c & 0x07;
+	    expect = 3;
+	    vmin = 0x10000;
+	} else if (c < 0xfc) {
+	    v = c & 0x03;
+	    expect = 4;
+	    vmin = 0x200000;
+	} else {
+	    v = c & 0x01;
+	    expect = 5;
+	    vmin = 0x4000000;
+	}
+    }
+
+    /* A sequence cut short by the end of the input is an error */
+    return expect ? (size_t)-1 : outlen << 2;
+
+#undef EMIT
+}
+
+typedef size_t (*transform_func)(uint8_t *, size_t, char *);
+
+/*
+ * Apply the string transform selected by func to the len bytes at str.
+ * On success the result is stored in a nasm_malloc'd buffer returned via
+ * *out (the caller must nasm_free it) and its length in bytes is returned.
+ * On error, returns (size_t)-1 and no buffer is allocated.
+ */
+size_t string_transform(char *str, size_t len, char **out, enum strfunc func)
+{
+    /* This should match enum strfunc in nasm.h */
+    static const transform_func str_transforms[] = {
+	utf8_to_16le,
+	utf8_to_32le,
+    };
+    transform_func transform = str_transforms[func];
+    uint8_t *s = (uint8_t *)str;
+    size_t outlen = transform(s, len, NULL);	/* sizing pass, no output */
+
+    if (outlen == (size_t)-1)
+	return -1;
+    /* One spare byte, so an empty result never requests a 0-byte block */
+    return transform(s, len, *out = nasm_malloc(outlen + 1));
+}
diff --git a/tokens.dat b/tokens.dat
index 6c3ad650..128bc670 100644
--- a/tokens.dat
+++ b/tokens.dat
@@ -53,6 +53,10 @@ __float80e__
 __float128l__
 __float128h__
 
+% TOKEN_STRFUNC, 0, STRFUNC_{__*__}
+__utf16__
+__utf32__
+
 % TOKEN_*, 0, 0
 seg
 wrt