summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.in8
-rw-r--r--Mkfiles/msvc.mak4
-rw-r--r--Mkfiles/netware.mak4
-rw-r--r--Mkfiles/openwcom.mak4
-rw-r--r--Mkfiles/owlinux.mak4
-rw-r--r--assemble.c46
-rw-r--r--nasm.h26
-rw-r--r--parser.c174
-rw-r--r--strfunc.c167
-rw-r--r--tokens.dat4
10 files changed, 328 insertions, 113 deletions
diff --git a/Makefile.in b/Makefile.in
index d207acc2..7d32d06a 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -67,8 +67,8 @@ NASM = nasm.$(O) nasmlib.$(O) raa.$(O) saa.$(O) \
output/outobj.$(O) output/outas86.$(O) output/outrdf2.$(O) \
output/outdbg.$(O) output/outieee.$(O) output/outmacho.$(O) \
preproc.$(O) quote.$(O) pptok.$(O) macros.$(O) \
- listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) tokhash.$(O) \
- regvals.$(O) regflags.$(O)
+ listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) strfunc.$(O) \
+ tokhash.$(O) regvals.$(O) regflags.$(O)
NDISASM = ndisasm.$(O) disasm.$(O) sync.$(O) nasmlib.$(O) \
insnsd.$(O) insnsb.$(O) insnsn.$(O) regs.$(O) regdis.$(O)
@@ -234,7 +234,7 @@ alldeps: perlreq
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.$(O): assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
nasm.h nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h config.h
+crc64.$(O): crc64.c compiler.h config.h nasmlib.h
disasm.$(O): disasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
nasmlib.h regdis.h regs.h sync.h tables.h tokens.h version.h
eval.$(O): eval.c compiler.h config.h eval.h float.h insnsi.h labels.h \
@@ -309,6 +309,8 @@ regvals.$(O): regvals.c compiler.h config.h insnsi.h tables.h
saa.$(O): saa.c compiler.h config.h nasmlib.h saa.h
stdscan.$(O): stdscan.c compiler.h config.h insns.h insnsi.h nasm.h \
nasmlib.h quote.h regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
sync.$(O): sync.c compiler.h config.h nasmlib.h sync.h
tokhash.$(O): tokhash.c compiler.h config.h hashtbl.h insns.h insnsi.h \
nasm.h nasmlib.h regs.h tokens.h version.h
diff --git a/Mkfiles/msvc.mak b/Mkfiles/msvc.mak
index c2904d85..04188bff 100644
--- a/Mkfiles/msvc.mak
+++ b/Mkfiles/msvc.mak
@@ -180,7 +180,7 @@ everything: all doc rdf
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h
+crc64.$(O): crc64.c compiler.h nasmlib.h
disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h \
regdis.h regs.h sync.h tables.h tokens.h version.h
eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h \
@@ -253,6 +253,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
saa.$(O): saa.c compiler.h nasmlib.h saa.h
stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h \
regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
sync.$(O): sync.c compiler.h nasmlib.h sync.h
tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tokens.h version.h
diff --git a/Mkfiles/netware.mak b/Mkfiles/netware.mak
index f4ec46b8..c1a970fb 100644
--- a/Mkfiles/netware.mak
+++ b/Mkfiles/netware.mak
@@ -120,7 +120,7 @@ $(OBJDIR)/version.inc: $(PROOT)/version $(PROOT)/version.pl $(OBJDIR)
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.o: assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
nasm.h nasmlib.h regs.h tables.h tokens.h version.h
-crc64.o: crc64.c compiler.h config.h
+crc64.o: crc64.c compiler.h config.h nasmlib.h
disasm.o: disasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
nasmlib.h regdis.h regs.h sync.h tables.h tokens.h version.h
eval.o: eval.c compiler.h config.h eval.h float.h insnsi.h labels.h nasm.h \
@@ -193,6 +193,8 @@ regvals.o: regvals.c compiler.h config.h insnsi.h tables.h
saa.o: saa.c compiler.h config.h nasmlib.h saa.h
stdscan.o: stdscan.c compiler.h config.h insns.h insnsi.h nasm.h nasmlib.h \
quote.h regs.h stdscan.h tokens.h version.h
+strfunc.o: strfunc.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
sync.o: sync.c compiler.h config.h nasmlib.h sync.h
tokhash.o: tokhash.c compiler.h config.h hashtbl.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tokens.h version.h
diff --git a/Mkfiles/openwcom.mak b/Mkfiles/openwcom.mak
index fb95f579..aeb42c57 100644
--- a/Mkfiles/openwcom.mak
+++ b/Mkfiles/openwcom.mak
@@ -209,7 +209,7 @@ everything: all doc rdf
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h &
nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h
+crc64.$(O): crc64.c compiler.h nasmlib.h
disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h &
regdis.h regs.h sync.h tables.h tokens.h version.h
eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h &
@@ -282,6 +282,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
saa.$(O): saa.c compiler.h nasmlib.h saa.h
stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h &
regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h &
+ version.h
sync.$(O): sync.c compiler.h nasmlib.h sync.h
tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h &
nasmlib.h regs.h tokens.h version.h
diff --git a/Mkfiles/owlinux.mak b/Mkfiles/owlinux.mak
index bb10d9e7..ce4dc6d8 100644
--- a/Mkfiles/owlinux.mak
+++ b/Mkfiles/owlinux.mak
@@ -219,7 +219,7 @@ everything: all doc rdf
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h
+crc64.$(O): crc64.c compiler.h nasmlib.h
disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h \
regdis.h regs.h sync.h tables.h tokens.h version.h
eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h \
@@ -292,6 +292,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
saa.$(O): saa.c compiler.h nasmlib.h saa.h
stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h \
regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
sync.$(O): sync.c compiler.h nasmlib.h sync.h
tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tokens.h version.h
diff --git a/assemble.c b/assemble.c
index 442ed2a4..7ab53ad3 100644
--- a/assemble.c
+++ b/assemble.c
@@ -335,7 +335,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
out(offset, segment, &e->offset,
OUT_ADDRESS, wsize, e->segment, e->wrt);
offset += wsize;
- } else if (e->type == EOT_DB_STRING) {
+ } else if (e->type == EOT_DB_STRING ||
+ e->type == EOT_DB_STRING_FREE) {
int align;
out(offset, segment, e->stringval,
@@ -348,6 +349,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
OUT_RAWDATA, align, NO_SEG, NO_SEG);
}
offset += e->stringlen + align;
+ if (e->type == EOT_DB_STRING_FREE)
+ nasm_free(e->stringval);
}
}
if (t > 0 && t == instruction->times - 1) {
@@ -365,15 +368,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
}
if (instruction->opcode == I_INCBIN) {
- static char fname[FILENAME_MAX];
+ const char *fname = instruction->eops->stringval;
FILE *fp;
- int32_t len;
-
- len = FILENAME_MAX - 1;
- if (len > instruction->eops->stringlen)
- len = instruction->eops->stringlen;
- strncpy(fname, instruction->eops->stringval, len);
- fname[len] = '\0';
fp = fopen(fname, "rb");
if (!fp) {
@@ -383,17 +379,18 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
error(ERR_NONFATAL, "`incbin': unable to seek on file `%s'",
fname);
} else {
- static char buf[2048];
- int32_t t = instruction->times;
- int32_t base = 0;
+ static char buf[4096];
+ size_t t = instruction->times;
+ size_t base = 0;
+ size_t len;
len = ftell(fp);
if (instruction->eops->next) {
base = instruction->eops->next->offset;
len -= base;
if (instruction->eops->next->next &&
- len > instruction->eops->next->next->offset)
- len = instruction->eops->next->next->offset;
+ len > (size_t)instruction->eops->next->next->offset)
+ len = (size_t)instruction->eops->next->next->offset;
}
/*
* Dummy call to list->output to give the offset to the
@@ -402,7 +399,7 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
list->output(offset, NULL, OUT_RAWDATA, 0);
list->uplevel(LIST_INCBIN);
while (t--) {
- int32_t l;
+ size_t l;
fseek(fp, base, SEEK_SET);
l = len;
@@ -660,7 +657,8 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
osize = 0;
if (e->type == EOT_DB_NUMBER)
osize = 1;
- else if (e->type == EOT_DB_STRING)
+ else if (e->type == EOT_DB_STRING ||
+ e->type == EOT_DB_STRING_FREE)
osize = e->stringlen;
align = (-osize) % wsize;
@@ -672,16 +670,10 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
}
if (instruction->opcode == I_INCBIN) {
- char fname[FILENAME_MAX];
+ const char *fname = instruction->eops->stringval;
FILE *fp;
- int32_t len;
-
- len = FILENAME_MAX - 1;
- if (len > instruction->eops->stringlen)
- len = instruction->eops->stringlen;
- strncpy(fname, instruction->eops->stringval, len);
- fname[len] = '\0';
-
+ size_t len;
+
fp = fopen(fname, "rb");
if (!fp)
error(ERR_NONFATAL, "`incbin': unable to open file `%s'",
@@ -695,8 +687,8 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
if (instruction->eops->next) {
len -= instruction->eops->next->offset;
if (instruction->eops->next->next &&
- len > instruction->eops->next->next->offset) {
- len = instruction->eops->next->next->offset;
+ len > (size_t)instruction->eops->next->next->offset) {
+ len = (size_t)instruction->eops->next->next->offset;
}
}
return instruction->times * len;
diff --git a/nasm.h b/nasm.h
index fedf8583..ec44f164 100644
--- a/nasm.h
+++ b/nasm.h
@@ -182,6 +182,7 @@ enum token_type { /* token types, other than chars */
TOKEN_DBL_AND, TOKEN_DBL_OR, TOKEN_DBL_XOR, /* &&, || and ^^ */
TOKEN_SEG, TOKEN_WRT, /* SEG and WRT */
TOKEN_FLOATIZE, /* __floatX__ */
+ TOKEN_STRFUNC, /* __utf16__, __utf32__ */
};
enum floatize {
@@ -195,6 +196,14 @@ enum floatize {
FLOAT_128H,
};
+/*
+ * String transforms selectable through a TOKEN_STRFUNC token.
+ *
+ * The enumerator value is used directly as an index into the
+ * str_transforms[] table inside string_transform(), so the two lists
+ * have to be kept in the same order.
+ */
+/* Must match the list in string_transform(), in strfunc.c */
+enum strfunc {
+ STRFUNC_UTF16, /* __utf16__: UTF-8 to UTF-16LE (utf8_to_16le) */
+ STRFUNC_UTF32, /* __utf32__: UTF-8 to UTF-32LE (utf8_to_32le) */
+};
+
+size_t string_transform(char *, size_t, char **, enum strfunc);
+
/*
* The expression evaluator must be passed a scanner function; a
* standard scanner is provided as part of nasmlib.c. The
@@ -605,11 +614,14 @@ enum prefixes { /* instruction prefixes */
PREFIX_ENUM_LIMIT
};
-enum { /* extended operand types */
- EOT_NOTHING, EOT_DB_STRING, EOT_DB_NUMBER
+enum extop_type { /* extended operand types */
+ EOT_NOTHING,
+ EOT_DB_STRING, /* Byte string */
+ EOT_DB_STRING_FREE, /* Byte string which should be nasm_free'd*/
+ EOT_DB_NUMBER, /* Integer */
};
-enum { /* special EA flags */
+enum ea_flags { /* special EA flags */
EAF_BYTEOFFS = 1, /* force offset part to byte size */
EAF_WORDOFFS = 2, /* force offset part to [d]word size */
EAF_TIMESTWO = 4, /* really do EAX*2 not EAX+EAX */
@@ -643,12 +655,12 @@ typedef struct operand { /* operand to an instruction */
typedef struct extop { /* extended operand */
struct extop *next; /* linked list */
- int32_t type; /* defined above */
- char *stringval; /* if it's a string, then here it is */
- int stringlen; /* ... and here's how long it is */
- int32_t segment; /* if it's a number/address, then... */
+ char *stringval; /* if it's a string, then here it is */
+ size_t stringlen; /* ... and here's how long it is */
int64_t offset; /* ... it's given here ... */
+ int32_t segment; /* if it's a number/address, then... */
int32_t wrt; /* ... and here */
+ enum extop_type type; /* defined above */
} extop;
/* Prefix positions: each type of prefix goes in a specific slot.
diff --git a/parser.c b/parser.c
index 6fb7e3c7..caff1b18 100644
--- a/parser.c
+++ b/parser.c
@@ -334,6 +334,7 @@ restart_parse:
result->opcode == I_DY || result->opcode == I_INCBIN) {
extop *eop, **tail = &result->eops, **fixptr;
int oper_num = 0;
+ int32_t sign;
result->eops_float = false;
@@ -355,85 +356,114 @@ restart_parse:
eop->next = NULL;
eop->type = EOT_NOTHING;
oper_num++;
+ sign = +1;
+ /* is_comma_next() here is to distinguish this from
+ a string used as part of an expression... */
if (i == TOKEN_STR && is_comma_next()) {
eop->type = EOT_DB_STRING;
eop->stringval = tokval.t_charptr;
eop->stringlen = tokval.t_inttwo;
i = stdscan(NULL, &tokval); /* eat the comma */
- continue;
- }
-
- if ((i == TOKEN_FLOAT && is_comma_next())
- || i == '-' || i == '+') {
- int32_t sign = +1;
-
- if (i == '+' || i == '-') {
- char *save = stdscan_bufptr;
- int token = i;
- sign = (i == '-') ? -1 : 1;
- i = stdscan(NULL, &tokval);
- if (i != TOKEN_FLOAT || !is_comma_next()) {
- stdscan_bufptr = save;
- i = tokval.t_type = token;
- }
- }
-
- if (i == TOKEN_FLOAT) {
- eop->type = EOT_DB_STRING;
- result->eops_float = true;
- switch (result->opcode) {
- case I_DB:
- eop->stringlen = 1;
- break;
- case I_DW:
- eop->stringlen = 2;
- break;
- case I_DD:
- eop->stringlen = 4;
- break;
- case I_DQ:
- eop->stringlen = 8;
- break;
- case I_DT:
- eop->stringlen = 10;
- break;
- case I_DO:
- eop->stringlen = 16;
- break;
- case I_DY:
- error(ERR_NONFATAL, "floating-point constant"
- " encountered in DY instruction");
- eop->stringlen = 0;
- break;
- default:
- error(ERR_NONFATAL, "floating-point constant"
- " encountered in unknown instruction");
- /*
- * fix suggested by Pedro Gimeno... original line
- * was:
- * eop->type = EOT_NOTHING;
- */
- eop->stringlen = 0;
- break;
- }
- eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
- tail = &eop->next;
- *fixptr = eop;
- eop->stringval = (char *)eop + sizeof(extop);
- if (!eop->stringlen ||
- !float_const(tokval.t_charptr, sign,
- (uint8_t *)eop->stringval,
- eop->stringlen, error))
- eop->type = EOT_NOTHING;
- i = stdscan(NULL, &tokval); /* eat the comma */
- continue;
- }
- }
-
- /* anything else */
- {
+ } else if (i == TOKEN_STRFUNC) {
+ bool parens = false;
+ const char *funcname = tokval.t_charptr;
+ enum strfunc func = tokval.t_integer;
+ i = stdscan(NULL, &tokval);
+ if (i == '(') {
+ parens = true;
+ i = stdscan(NULL, &tokval);
+ }
+ if (i != TOKEN_STR) {
+ error(ERR_NONFATAL,
+ "%s must be followed by a string constant",
+ funcname);
+ eop->type = EOT_NOTHING;
+ } else {
+ eop->type = EOT_DB_STRING_FREE;
+ eop->stringlen =
+ string_transform(tokval.t_charptr, tokval.t_inttwo,
+ &eop->stringval, func);
+ if (eop->stringlen == (size_t)-1) {
+ error(ERR_NONFATAL, "invalid string for transform");
+ eop->type = EOT_NOTHING;
+ }
+ }
+ if (parens && i && i != ')') {
+ i = stdscan(NULL, &tokval);
+ if (i != ')') {
+ error(ERR_NONFATAL, "unterminated %s function",
+ funcname);
+ }
+ }
+ if (i && i != ',')
+ i = stdscan(NULL, &tokval);
+ } else if (i == '-' || i == '+') {
+ char *save = stdscan_bufptr;
+ int token = i;
+ sign = (i == '-') ? -1 : 1;
+ i = stdscan(NULL, &tokval);
+ if (i != TOKEN_FLOAT) {
+ stdscan_bufptr = save;
+ i = tokval.t_type = token;
+ goto is_expression;
+ } else {
+ goto is_float;
+ }
+ } else if (i == TOKEN_FLOAT) {
+ is_float:
+ eop->type = EOT_DB_STRING;
+ result->eops_float = true;
+ switch (result->opcode) {
+ case I_DB:
+ eop->stringlen = 1;
+ break;
+ case I_DW:
+ eop->stringlen = 2;
+ break;
+ case I_DD:
+ eop->stringlen = 4;
+ break;
+ case I_DQ:
+ eop->stringlen = 8;
+ break;
+ case I_DT:
+ eop->stringlen = 10;
+ break;
+ case I_DO:
+ eop->stringlen = 16;
+ break;
+ case I_DY:
+ error(ERR_NONFATAL, "floating-point constant"
+ " encountered in DY instruction");
+ eop->stringlen = 0;
+ break;
+ default:
+ error(ERR_NONFATAL, "floating-point constant"
+ " encountered in unknown instruction");
+ /*
+ * fix suggested by Pedro Gimeno... original line
+ * was:
+ * eop->type = EOT_NOTHING;
+ */
+ eop->stringlen = 0;
+ break;
+ }
+ eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
+ tail = &eop->next;
+ *fixptr = eop;
+ eop->stringval = (char *)eop + sizeof(extop);
+ if (!eop->stringlen ||
+ !float_const(tokval.t_charptr, sign,
+ (uint8_t *)eop->stringval,
+ eop->stringlen, error))
+ eop->type = EOT_NOTHING;
+ i = stdscan(NULL, &tokval); /* eat the comma */
+ } else {
+ /* anything else, assume it is an expression */
expr *value;
+
+ is_expression:
value = evaluate(stdscan, NULL, &tokval, NULL,
critical, error, NULL);
i = tokval.t_type;
diff --git a/strfunc.c b/strfunc.c
new file mode 100644
index 00000000..9fb72706
--- /dev/null
+++ b/strfunc.c
@@ -0,0 +1,167 @@
+/*
+ * strfunc.c
+ *
+ * String transformation functions
+ */
+
+#include "nasmlib.h"
+#include "nasm.h"
+
+/*
+ * Convert a string in UTF-8 format to UTF-16LE
+ *
+ * str/len describe the input bytes.  When op is non-NULL every 16-bit
+ * code unit is written through WRITESHORT(op, ...); when op is NULL
+ * nothing is written and the input is only validated and measured.
+ *
+ * Returns the size of the output in bytes, or (size_t)-1 if the input
+ * is rejected: a byte that cannot start a sequence (0x80..0xbf, 0xfe,
+ * 0xff), a missing continuation byte, an overlong encoding, a
+ * surrogate code point (0xd800..0xdfff), a value above 0x10ffff, or a
+ * sequence cut short by the end of the input.
+ */
+static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x) do { if (op) { WRITESHORT(op,x); } outlen++; } while(0)
+
+    size_t outlen = 0;          /* 16-bit code units produced so far */
+    int expect = 0;             /* continuation bytes still required */
+    uint8_t c;
+    uint32_t v = 0;             /* code point being assembled */
+    uint32_t vmin = 0;          /* smallest value that is not overlong */
+
+    while (len--) {
+        c = *str++;
+
+        if (expect) {
+            /* Inside a multibyte sequence: only 10xxxxxx is allowed */
+            if ((c & 0xc0) != 0x80)
+                return (size_t)-1;
+
+            v = (v << 6) | (c & 0x3f);
+            if (!--expect) {
+                if (v < vmin || v > 0x10ffff ||
+                    (v >= 0xd800 && v <= 0xdfff)) {
+                    return (size_t)-1;
+                } else if (v > 0xffff) {
+                    /* Outside the BMP: emit a surrogate pair */
+                    v -= 0x10000;
+                    EMIT(0xd800 | (v >> 10));
+                    EMIT(0xdc00 | (v & 0x3ff));
+                } else {
+                    EMIT(v);
+                }
+            }
+            continue;
+        }
+
+        if (c < 0x80) {
+            EMIT(c);
+        } else if (c < 0xc0 || c >= 0xfe) {
+            /*
+             * Invalid UTF-8: 0x80..0xbf are continuation bytes and can
+             * never start a sequence; 0xfe and 0xff are never valid.
+             */
+            return (size_t)-1;
+        } else if (c < 0xe0) {
+            v = c & 0x1f;
+            expect = 1;
+            vmin = 0x80;
+        } else if (c < 0xf0) {
+            v = c & 0x0f;
+            expect = 2;
+            vmin = 0x800;
+        } else if (c < 0xf8) {
+            v = c & 0x07;
+            expect = 3;
+            vmin = 0x10000;
+        } else if (c < 0xfc) {
+            /* Always ends up rejected: vmin is already above 0x10ffff */
+            v = c & 0x03;
+            expect = 4;
+            vmin = 0x200000;
+        } else {
+            /* Likewise always rejected once the sequence completes */
+            v = c & 0x01;
+            expect = 5;
+            vmin = 0x4000000;
+        }
+    }
+
+    /* A sequence still open at the end of the input is an error */
+    return expect ? (size_t)-1 : outlen << 1;
+
+#undef EMIT
+}
+
+/*
+ * Convert a string in UTF-8 format to UTF-32LE
+ *
+ * str/len describe the input bytes.  When op is non-NULL every code
+ * point is written through WRITELONG(op, ...); when op is NULL nothing
+ * is written and the input is only validated and measured.
+ *
+ * Returns the size of the output in bytes, or (size_t)-1 if the input
+ * is rejected: a byte that cannot start a sequence (0x80..0xbf, 0xfe,
+ * 0xff), a missing continuation byte, an overlong encoding, a
+ * surrogate code point (0xd800..0xdfff), or a sequence cut short by
+ * the end of the input.
+ *
+ * NOTE(review): unlike utf8_to_16le(), values above 0x10ffff (reachable
+ * through 4-, 5- and 6-byte sequences) are passed through unchanged.
+ * Confirm whether that is intended before tightening it.
+ */
+static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x) do { if (op) { WRITELONG(op,x); } outlen++; } while(0)
+
+    size_t outlen = 0;          /* 32-bit code units produced so far */
+    int expect = 0;             /* continuation bytes still required */
+    uint8_t c;
+    uint32_t v = 0;             /* code point being assembled */
+    uint32_t vmin = 0;          /* smallest value that is not overlong */
+
+    while (len--) {
+        c = *str++;
+
+        if (expect) {
+            /* Inside a multibyte sequence: only 10xxxxxx is allowed */
+            if ((c & 0xc0) != 0x80)
+                return (size_t)-1;
+
+            v = (v << 6) | (c & 0x3f);
+            if (!--expect) {
+                if (v < vmin || (v >= 0xd800 && v <= 0xdfff))
+                    return (size_t)-1;
+                EMIT(v);
+            }
+            continue;
+        }
+
+        if (c < 0x80) {
+            EMIT(c);
+        } else if (c < 0xc0 || c >= 0xfe) {
+            /*
+             * Invalid UTF-8: 0x80..0xbf are continuation bytes and can
+             * never start a sequence; 0xfe and 0xff are never valid.
+             */
+            return (size_t)-1;
+        } else if (c < 0xe0) {
+            v = c & 0x1f;
+            expect = 1;
+            vmin = 0x80;
+        } else if (c < 0xf0) {
+            v = c & 0x0f;
+            expect = 2;
+            vmin = 0x800;
+        } else if (c < 0xf8) {
+            v = c & 0x07;
+            expect = 3;
+            vmin = 0x10000;
+        } else if (c < 0xfc) {
+            v = c & 0x03;
+            expect = 4;
+            vmin = 0x200000;
+        } else {
+            v = c & 0x01;
+            expect = 5;
+            vmin = 0x4000000;
+        }
+    }
+
+    /* A sequence still open at the end of the input is an error */
+    return expect ? (size_t)-1 : outlen << 2;
+
+#undef EMIT
+}
+
+/*
+ * Signature shared by the converters in this file: input bytes, input
+ * length, and an output pointer that may be NULL to request a
+ * measure-only pass.  The result is the output size in bytes, or
+ * (size_t)-1 when the input is rejected.
+ */
+typedef size_t (*transform_func)(uint8_t *, size_t, char *);
+
+/*
+ * Apply a specific string transform and return it in a nasm_malloc'd
+ * buffer, returning the length. On error, returns (size_t)-1 and no
+ * buffer is allocated.
+ *
+ * str/len: input bytes to transform.
+ * out:     receives the newly allocated result; the caller owns it and
+ *          is expected to release it with nasm_free().  Set to NULL
+ *          when the transform fails.
+ * func:    which transform to apply; used as an index into
+ *          str_transforms[], so it must be a valid enum strfunc value.
+ */
+size_t string_transform(char *str, size_t len, char **out, enum strfunc func)
+{
+    /* This should match enum strfunc in nasm.h */
+    static const transform_func str_transforms[] = {
+        utf8_to_16le,
+        utf8_to_32le,
+    };
+    transform_func transform = str_transforms[func];
+    uint8_t *s = (uint8_t *)str;
+    size_t outlen;
+
+    /* Never leave the caller's pointer indeterminate on failure */
+    *out = NULL;
+
+    /* First pass: validate the input and measure the output */
+    outlen = transform(s, len, NULL);
+    if (outlen == (size_t)-1)
+        return (size_t)-1;
+
+    /*
+     * Ask for at least one byte so an empty input does not depend on
+     * how a zero-sized allocation request is treated (nasm_malloc's
+     * handling of size 0 is not visible from here).
+     */
+    *out = nasm_malloc(outlen ? outlen : 1);
+
+    /* Second pass: the same conversion, this time writing the result */
+    return transform(s, len, *out);
+}
diff --git a/tokens.dat b/tokens.dat
index 6c3ad650..128bc670 100644
--- a/tokens.dat
+++ b/tokens.dat
@@ -53,6 +53,10 @@ __float80e__
__float128l__
__float128h__
+% TOKEN_STRFUNC, 0, STRFUNC_{__*__}
+__utf16__
+__utf32__
+
% TOKEN_*, 0, 0
seg
wrt