Merge commit 'origin/sse5'

author: H. Peter Anvin <hpa@zytor.com> 2007-09-19 16:22:03 -0700
committer: H. Peter Anvin <hpa@zytor.com> 2007-09-19 16:22:03 -0700
commit: eb49a4e1d402d5a1ce95e495787b900aa5303a47 (patch)
tree: 81fbe28b4d1faf6e8d68aa3d7af58b4443e948d2
parent: b4b43178783e963e95fb290e82f1a0c6d6725520 (diff)
parent: bf9a24f46471abad75fa3efba059646a6c4f5026 (diff)
download: nasm-eb49a4e1d402d5a1ce95e495787b900aa5303a47.tar.gz
17 files changed, 1547 insertions, 587 deletions
diff --git a/assemble.c b/assemble.c
index 54522712..efb02207 100644
--- a/assemble.c
+++ b/assemble.c
@@ -12,39 +12,43 @@
  *                 (POP is never used for CS) depending on operand 0
  * \5, \7        - the second byte of POP/PUSH codes for FS, GS, depending
  *                 on operand 0
- * \10, \11, \12 - a literal byte follows in the code stream, to be added
- *                 to the register value of operand 0, 1 or 2
- * \17           - encodes the literal byte 0. (Some compilers don't take
- *                 kindly to a zero byte in the _middle_ of a compile time
- *                 string constant, so I had to put this hack in.)
- * \14, \15, \16 - a signed byte immediate operand, from operand 0, 1 or 2
- * \20, \21, \22 - a byte immediate operand, from operand 0, 1 or 2
- * \24, \25, \26 - an unsigned byte immediate operand, from operand 0, 1 or 2
- * \30, \31, \32 - a word immediate operand, from operand 0, 1 or 2
- * \34, \35, \36 - select between \3[012] and \4[012] depending on 16/32 bit
+ * \10..\13      - a literal byte follows in the code stream, to be added
+ *                 to the register value of operand 0..3
+ * \14..\17      - a signed byte immediate operand, from operand 0..3
+ * \20..\23      - a byte immediate operand, from operand 0..3
+ * \24..\27      - an unsigned byte immediate operand, from operand 0..3
+ * \30..\33      - a word immediate operand, from operand 0..3
+ * \34..\37      - select between \3[0-3] and \4[0-3] depending on 16/32 bit
  *                 assembly mode or the operand-size override on the operand
- * \37           - a word constant, from the _segment_ part of operand 0
- * \40, \41, \42 - a long immediate operand, from operand 0, 1 or 2
- * \44, \45, \46 - select between \3[012], \4[012] and \5[456]
+ * \40..\43      - a long immediate operand, from operand 0..3
+ * \44..\47      - select between \3[0-3], \4[0-3] and \5[4-7]
  *		   depending on assembly mode or the address-size override
  *		   on the operand.
- * \50, \51, \52 - a byte relative operand, from operand 0, 1 or 2
- * \54, \55, \56 - a qword immediate operand, from operand 0, 1 or 2
- * \60, \61, \62 - a word relative operand, from operand 0, 1 or 2
- * \64, \65, \66 - select between \6[012] and \7[012] depending on 16/32 bit
+ * \50..\53      - a byte relative operand, from operand 0..3
+ * \54..\57      - a qword immediate operand, from operand 0..3
+ * \60..\63      - a word relative operand, from operand 0..3
+ * \64..\67      - select between \6[0-3] and \7[0-3] depending on 16/32 bit
  *                 assembly mode or the operand-size override on the operand
- * \70, \71, \72 - a long relative operand, from operand 0, 1 or 2
+ * \70..\73      - a long relative operand, from operand 0..3
+ * \74..\77       - a word constant, from the _segment_ part of operand 0..3
  * \1ab          - a ModRM, calculated on EA in operand a, with the spare
  *                 field the register value of operand b.
- * \130,\131,\132 - an immediate word or signed byte for operand 0, 1, or 2
- * \133,\134,\135 - or 2 (s-field) into next opcode byte if operand 0, 1, or 2
+ * \140..\143    - an immediate word or signed byte for operand 0..3
+ * \144..\147    - or 2 (s-field) into next opcode byte if operand 0..3
  *		    is a signed byte rather than a word.
- * \140,\141,\142 - an immediate dword or signed byte for operand 0, 1, or 2
- * \143,\144,\145 - or 2 (s-field) into next opcode byte if operand 0, 1, or 2
+ * \150..\153     - an immediate dword or signed byte for operand 0..3
+ * \154..\157     - or 2 (s-field) into next opcode byte if operand 0..3
  *		    is a signed byte rather than a dword.
- * \150,\151,\152 - an immediate qword or signed byte for operand 0, 1, or 2
- * \153,\154,\155 - or 2 (s-field) into next opcode byte if operand 0, 1, or 2
- *		    is a signed byte rather than a qword.
+ * \160..\163    - this instruction uses DREX rather than REX, with the
+ *		   OC0 field set to 0, and the dest field taken from
+ *                 operand 0..3.
+ * \164..\167    - this instruction uses DREX rather than REX, with the
+ *		   OC0 field set to 1, and the dest field taken from
+ *                 operand 0..3.
+ * \170          - encodes the literal byte 0. (Some compilers don't take
+ *                 kindly to a zero byte in the _middle_ of a compile time
+ *                 string constant, so I had to put this hack in.)
+ * \171		 - placement of DREX suffix in the absence of an EA
  * \2ab          - a ModRM, calculated on EA in operand a, with the spare
  *                 field equal to digit b.
  * \30x          - might be an 0x67 byte, depending on the address size of
@@ -246,6 +250,9 @@ int32_t assemble(int32_t segment, int32_t offset, int bits, uint32_t cp,
     case I_DT:
         wsize = 10;
         break;
+    case I_DO:
+	wsize = 16;
+	break;
     default:
 	break;
     }
@@ -560,10 +567,9 @@ int32_t insn_size(int32_t segment, int32_t offset, int bits, uint32_t cp,
     if (instruction->opcode == -1)
         return 0;
 
-    if (instruction->opcode == I_DB ||
-        instruction->opcode == I_DW ||
-        instruction->opcode == I_DD ||
-        instruction->opcode == I_DQ || instruction->opcode == I_DT) {
+    if (instruction->opcode == I_DB || instruction->opcode == I_DW ||
+        instruction->opcode == I_DD || instruction->opcode == I_DQ ||
+	instruction->opcode == I_DT || instruction->opcode == I_DO) {
         extop *e;
         int32_t isize, osize, wsize = 0;   /* placate gcc */
 
@@ -584,6 +590,9 @@ int32_t insn_size(int32_t segment, int32_t offset, int bits, uint32_t cp,
         case I_DT:
             wsize = 10;
             break;
+	case I_DO:
+	    wsize = 16;
+	    break;
 	default:
 	    break;
         }
@@ -730,73 +739,79 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
         case 010:
         case 011:
         case 012:
+	case 013:
 	    ins->rex |=
 		op_rexflags(&ins->oprs[c - 010], REX_B|REX_H|REX_P|REX_W);
             codes++, length++;
             break;
-        case 017:
-            length++;
-            break;
         case 014:
         case 015:
         case 016:
+	case 017:
             length++;
             break;
         case 020:
         case 021:
         case 022:
+	case 023:
             length++;
             break;
         case 024:
         case 025:
         case 026:
+	case 027:
             length++;
             break;
         case 030:
         case 031:
         case 032:
+	case 033:
             length += 2;
             break;
         case 034:
         case 035:
         case 036:
+	case 037:
             if (ins->oprs[c - 034].type & (BITS16 | BITS32 | BITS64))
                 length += (ins->oprs[c - 034].type & BITS16) ? 2 : 4;
             else
                 length += (bits == 16) ? 2 : 4;
             break;
-        case 037:
-            length += 2;
-            break;
         case 040:
         case 041:
         case 042:
+	case 043:
             length += 4;
             break;
         case 044:
         case 045:
         case 046:
+	case 047:
             length += ((ins->oprs[c - 044].addr_size ?
                         ins->oprs[c - 044].addr_size : bits) >> 3);
             break;
         case 050:
         case 051:
         case 052:
+	case 053:
             length++;
             break;
         case 054:
         case 055:
         case 056:
+	case 057:
             length += 8; /* MOV reg64/imm */
             break;
         case 060:
         case 061:
         case 062:
+	case 063:
             length += 2;
             break;
         case 064:
         case 065:
         case 066:
+	case 067:
             if (ins->oprs[c - 064].type & (BITS16 | BITS32 | BITS64))
                 length += (ins->oprs[c - 064].type & BITS16) ? 2 : 4;
             else
@@ -805,33 +820,66 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
         case 070:
         case 071:
         case 072:
+	case 073:
             length += 4;
             break;
-        case 0130:
-        case 0131:
-        case 0132:
-            length += is_sbyte(ins, c - 0130, 16) ? 1 : 2;
-            break;
-        case 0133:
-        case 0134:
-        case 0135:
-            codes += 2;
-            length++;
+        case 074:
+        case 075:
+        case 076:
+        case 077:
+            length += 2;
             break;
         case 0140:
         case 0141:
         case 0142:
-            length += is_sbyte(ins, c - 0140, 32) ? 1 : 4;
+	case 0143:
+            length += is_sbyte(ins, c - 0140, 16) ? 1 : 2;
             break;
-        case 0143:
         case 0144:
         case 0145:
+        case 0146:
+        case 0147:
+            codes += 2;
+            length++;
+            break;
+        case 0150:
+        case 0151:
+        case 0152:
+        case 0153:
+            length += is_sbyte(ins, c - 0150, 32) ? 1 : 4;
+            break;
+        case 0154:
+        case 0155:
+        case 0156:
+        case 0157:
             codes += 2;
             length++;
             break;
+	case 0160:
+	case 0161:
+	case 0162:
+	case 0163:
+	    length++;
+	    ins->rex |= REX_D;
+	    ins->drexdst = regval(&ins->oprs[c & 3]);
+	    break;
+	case 0164:
+	case 0165:
+	case 0166:
+	case 0167:
+	    length++;
+	    ins->rex |= REX_D|REX_OC;
+	    ins->drexdst = regval(&ins->oprs[c & 3]);
+	    break;
+        case 0170:
+            length++;
+            break;
+	case 0171:
+	    break;
         case 0300:
         case 0301:
         case 0302:         
+        case 0303:         
             length += chsize(&ins->oprs[c - 0300], bits);
             break;
         case 0310:
@@ -927,7 +975,19 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
         }
 
     ins->rex &= rex_mask;
-    if (ins->rex & REX_REAL) {
+    
+    if (ins->rex & REX_D) {
+	if (ins->rex & REX_H) {
+	    errfunc(ERR_NONFATAL, "cannot use high register in drex instruction");
+	    return -1;
+	}
+	if (bits != 64 && ((ins->rex & (REX_W|REX_X|REX_B)) ||
+			   ins->drexdst > 7)) {
+	    errfunc(ERR_NONFATAL, "invalid operands in non-64-bit mode");
+	    return -1;
+	}
+	length++;
+    } else if (ins->rex & REX_REAL) {
 	if (ins->rex & REX_H) {
 	    errfunc(ERR_NONFATAL, "cannot use high register in rex instruction");
 	    return -1;
@@ -937,8 +997,8 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
 		    cpu >= IF_X86_64)) {
 	    length++;
 	} else {
-	  errfunc(ERR_NONFATAL, "invalid operands in non-64-bit mode");
-	  return -1;
+	    errfunc(ERR_NONFATAL, "invalid operands in non-64-bit mode");
+	    return -1;
 	}
     }
 
@@ -946,7 +1006,7 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
 }
 
 #define EMIT_REX()							\
-    if((ins->rex & REX_REAL) && (bits == 64)) {				\
+    if (!(ins->rex & REX_D) && (ins->rex & REX_REAL) && (bits == 64)) {	\
 	ins->rex = (ins->rex & REX_REAL)|REX_P;				\
 	out(offset, segment, &ins->rex, OUT_RAWDATA+1, NO_SEG, NO_SEG); \
 	ins->rex = 0;							\
@@ -1020,21 +1080,17 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 010:
         case 011:
         case 012:
+	case 013:
 	    EMIT_REX();
             bytes[0] = *codes++ + ((regval(&ins->oprs[c - 010])) & 7);
             out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
             offset += 1;
             break;
 
-        case 017:
-            bytes[0] = 0;
-            out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
-            offset += 1;
-            break;
-
         case 014:
         case 015:
         case 016:
+	case 017:
             if (ins->oprs[c - 014].offset < -128
                 || ins->oprs[c - 014].offset > 127) {
                 errfunc(ERR_WARNING, "signed byte value exceeds bounds");
@@ -1055,6 +1111,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 020:
         case 021:
         case 022:
+	case 023:
             if (ins->oprs[c - 020].offset < -256
                 || ins->oprs[c - 020].offset > 255) {
                 errfunc(ERR_WARNING, "byte value exceeds bounds");
@@ -1074,6 +1131,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 024:
         case 025:
         case 026:
+	case 027:
             if (ins->oprs[c - 024].offset < 0
                 || ins->oprs[c - 024].offset > 255)
                 errfunc(ERR_WARNING, "unsigned byte value exceeds bounds");
@@ -1092,6 +1150,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 030:
         case 031:
         case 032:
+	case 033:
             if (ins->oprs[c - 030].segment == NO_SEG &&
                 ins->oprs[c - 030].wrt == NO_SEG &&
                 (ins->oprs[c - 030].offset < -65536L ||
@@ -1107,6 +1166,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 034:
         case 035:
         case 036:
+	case 037:
             if (ins->oprs[c - 034].type & (BITS16 | BITS32))
                 size = (ins->oprs[c - 034].type & BITS16) ? 2 : 4;
             else
@@ -1119,20 +1179,10 @@ static void gencode(int32_t segment, int32_t offset, int bits,
             offset += size;
             break;
 
-        case 037:
-            if (ins->oprs[0].segment == NO_SEG)
-                errfunc(ERR_NONFATAL, "value referenced by FAR is not"
-                        " relocatable");
-            data = 0L;
-            out(offset, segment, &data, OUT_ADDRESS + 2,
-                outfmt->segbase(1 + ins->oprs[0].segment),
-                ins->oprs[0].wrt);
-            offset += 2;
-            break;
-
         case 040:
         case 041:
         case 042:
+	case 043:
             data = ins->oprs[c - 040].offset;
             out(offset, segment, &data, OUT_ADDRESS + 4,
                 ins->oprs[c - 040].segment, ins->oprs[c - 040].wrt);
@@ -1142,6 +1192,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 044:
         case 045:
         case 046:
+	case 047:
             data = ins->oprs[c - 044].offset;
             size = ((ins->oprs[c - 044].addr_size ?
                      ins->oprs[c - 044].addr_size : bits) >> 3);
@@ -1155,6 +1206,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 050:
         case 051:
         case 052:
+	case 053:
             if (ins->oprs[c - 050].segment != segment)
                 errfunc(ERR_NONFATAL,
                         "short relative jump outside segment");
@@ -1169,6 +1221,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 054:
         case 055:
         case 056:
+	case 057:
             data = (int64_t)ins->oprs[c - 054].offset;
             out(offset, segment, &data, OUT_ADDRESS + 8,
                 ins->oprs[c - 054].segment, ins->oprs[c - 054].wrt);
@@ -1178,6 +1231,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 060:
         case 061:
         case 062:
+	case 063:
             if (ins->oprs[c - 060].segment != segment) {
                 data = ins->oprs[c - 060].offset;
                 out(offset, segment, &data,
@@ -1194,6 +1248,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 064:
         case 065:
         case 066:
+	case 067:
             if (ins->oprs[c - 064].type & (BITS16 | BITS32 | BITS64))
                 size = (ins->oprs[c - 064].type & BITS16) ? 2 : 4;
             else
@@ -1214,6 +1269,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 070:
         case 071:
         case 072:
+	case 073:
             if (ins->oprs[c - 070].segment != segment) {
                 data = ins->oprs[c - 070].offset;
                 out(offset, segment, &data,
@@ -1227,70 +1283,115 @@ static void gencode(int32_t segment, int32_t offset, int bits,
             offset += 4;
             break;
 
-        case 0130:
-        case 0131:
-        case 0132:
-            data = ins->oprs[c - 0130].offset;
-            if (is_sbyte(ins, c - 0130, 16)) {
+        case 074:
+        case 075:
+        case 076:
+        case 077:
+            if (ins->oprs[c - 074].segment == NO_SEG)
+                errfunc(ERR_NONFATAL, "value referenced by FAR is not"
+                        " relocatable");
+            data = 0L;
+            out(offset, segment, &data, OUT_ADDRESS + 2,
+                outfmt->segbase(1 + ins->oprs[c - 074].segment),
+                ins->oprs[c - 074].wrt);
+            offset += 2;
+            break;
+
+        case 0140:
+        case 0141:
+        case 0142:
+	case 0143:
+            data = ins->oprs[c - 0140].offset;
+            if (is_sbyte(ins, c - 0140, 16)) {
                 bytes[0] = data;
                 out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG,
                     NO_SEG);
                 offset++;
             } else {
-                if (ins->oprs[c - 0130].segment == NO_SEG &&
-                    ins->oprs[c - 0130].wrt == NO_SEG &&
+                if (ins->oprs[c - 0140].segment == NO_SEG &&
+                    ins->oprs[c - 0140].wrt == NO_SEG &&
                     (data < -65536L || data > 65535L)) {
                     errfunc(ERR_WARNING, "word value exceeds bounds");
                 }
                 out(offset, segment, &data, OUT_ADDRESS + 2,
-                    ins->oprs[c - 0130].segment, ins->oprs[c - 0130].wrt);
+                    ins->oprs[c - 0140].segment, ins->oprs[c - 0140].wrt);
                 offset += 2;
             }
             break;
 
-        case 0133:
-        case 0134:
-        case 0135:
+        case 0144:
+        case 0145:
+        case 0146:
+	case 0147:
 	    EMIT_REX();
             codes++;
             bytes[0] = *codes++;
-            if (is_sbyte(ins, c - 0133, 16))
+            if (is_sbyte(ins, c - 0144, 16))
                 bytes[0] |= 2;  /* s-bit */
             out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
             offset++;
             break;
 
-        case 0140:
-        case 0141:
-        case 0142:
-            data = ins->oprs[c - 0140].offset;
-            if (is_sbyte(ins, c - 0140, 32)) {
+        case 0150:
+        case 0151:
+        case 0152:
+	case 0153:
+            data = ins->oprs[c - 0150].offset;
+            if (is_sbyte(ins, c - 0150, 32)) {
                 bytes[0] = data;
                 out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG,
                     NO_SEG);
                 offset++;
             } else {
                 out(offset, segment, &data, OUT_ADDRESS + 4,
-                    ins->oprs[c - 0140].segment, ins->oprs[c - 0140].wrt);
+                    ins->oprs[c - 0150].segment, ins->oprs[c - 0150].wrt);
                 offset += 4;
             }
             break;
 
-        case 0143:
-        case 0144:
-        case 0145:
+        case 0154:
+        case 0155:
+        case 0156:
+	case 0157:
 	    EMIT_REX();
             codes++;
             bytes[0] = *codes++;
-            if (is_sbyte(ins, c - 0143, 32))
+            if (is_sbyte(ins, c - 0154, 32))
                 bytes[0] |= 2;  /* s-bit */
             out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
             offset++;
             break;
 
+	case 0160:
+	case 0161:
+	case 0162:
+	case 0163:
+	case 0164:
+	case 0165:
+	case 0166:
+	case 0167:
+	    break;
+
+        case 0170:
+            bytes[0] = 0;
+            out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
+            offset += 1;
+            break;
+
+	case 0171:
+	    bytes[0] =
+		(ins->drexdst << 4) |
+		(ins->rex & REX_OC ? 0x08 : 0) |
+		(ins->rex & (REX_R|REX_X|REX_B));
+	    ins->rex = 0;
+            out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
+	    offset++;
+	    break;
+
         case 0300:
         case 0301:
         case 0302:
+        case 0303:
             if (chsize(&ins->oprs[c - 0300], bits)) {
                 *bytes = 0x67;
                 out(offset, segment, bytes,
@@ -1448,6 +1549,15 @@ static void gencode(int32_t segment, int32_t offset, int bits,
                 if (ea_data.sib_present)
                     *p++ = ea_data.sib;
 
+		/* DREX suffixes come between the SIB and the displacement */
+		if (ins->rex & REX_D) {
+		    *p++ =
+			(ins->drexdst << 4) |
+			(ins->rex & REX_OC ? 0x08 : 0) |
+			(ins->rex & (REX_R|REX_X|REX_B));
+		    ins->rex = 0;
+		}
+
                 s = p - bytes;
                 out(offset, segment, bytes, OUT_RAWDATA + s,
                     NO_SEG, NO_SEG);
@@ -1537,7 +1647,7 @@ static int rexflags(int val, int32_t flags, int mask)
 
 static int matches(const struct itemplate *itemp, insn * instruction, int bits)
 {
-    int i, size[3], asize, oprs, ret;
+    int i, size[MAX_OPERANDS], asize, oprs, ret;
 
     ret = 100;
 
@@ -1564,7 +1674,12 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits)
      * Check that the operand flags all match up
      */
     for (i = 0; i < itemp->operands; i++) {
-        if (itemp->opd[i] & ~instruction->oprs[i].type ||
+	if (itemp->opd[i] & SAME_AS) {
+	    int j = itemp->opd[i] & ~SAME_AS;
+	    if (instruction->oprs[i].type != instruction->oprs[j].type ||
+		instruction->oprs[i].basereg != instruction->oprs[j].basereg)
+		return 0;
+	} else  if (itemp->opd[i] & ~instruction->oprs[i].type ||
             ((itemp->opd[i] & SIZE_MASK) &&
              ((itemp->opd[i] ^ instruction->oprs[i].type) & SIZE_MASK))) {
             if ((itemp->opd[i] & ~instruction->oprs[i].type & ~SIZE_MASK) ||
@@ -1579,7 +1694,7 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits)
      * Check operand sizes
      */
     if (itemp->flags & IF_ARMASK) {
-        size[0] = size[1] = size[2] = 0;
+	memset(size, 0, sizeof size);
 
         switch (itemp->flags & IF_ARMASK) {
         case IF_AR0:
@@ -1591,34 +1706,59 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits)
         case IF_AR2:
             i = 2;
             break;
+	case IF_AR3:
+	    i = 3;
+	    break;
         default:
             break;              /* Shouldn't happen */
         }
-        if (itemp->flags & IF_SB) {
+	switch (itemp->flags & IF_SMASK) {
+	case IF_SB:
             size[i] = BITS8;
-        } else if (itemp->flags & IF_SW) {
+	    break;
+	case IF_SW:
             size[i] = BITS16;
-        } else if (itemp->flags & IF_SD) {
+	    break;
+	case IF_SD:
             size[i] = BITS32;
-        } else if (itemp->flags & IF_SQ) {
+	    break;
+	case IF_SQ:
             size[i] = BITS64;
+	    break;
+	case IF_SO:
+	    size[i] = BITS128;
+	    break;
+	default:
+	    break;
         }
     } else {
         asize = 0;
-        if (itemp->flags & IF_SB) {
+	switch (itemp->flags & IF_SMASK) {
+	case IF_SB:
             asize = BITS8;
             oprs = itemp->operands;
-        } else if (itemp->flags & IF_SW) {
+	    break;
+	case IF_SW:
             asize = BITS16;
             oprs = itemp->operands;
-        } else if (itemp->flags & IF_SD) {
+	    break;
+	case IF_SD:
             asize = BITS32;
             oprs = itemp->operands;
-        } else if (itemp->flags & IF_SQ) {
+	    break;
+	case IF_SQ:
             asize = BITS64;
             oprs = itemp->operands;
+	    break;
+	case IF_SO:
+            asize = BITS128;
+            oprs = itemp->operands;
+	    break;
+	default:
+	    break;
         }
-        size[0] = size[1] = size[2] = asize;
+	for (i = 0; i < MAX_OPERANDS; i++)
+	    size[i] = asize;
     }
     
     if (itemp->flags & (IF_SM | IF_SM2)) {
diff --git a/disasm.c b/disasm.c
index 0452c295..a6c1c729 100644
--- a/disasm.c
+++ b/disasm.c
@@ -167,16 +167,46 @@ static const char *whichcond(int condval)
 }
 
 /*
+ * Process a DREX suffix: read the single byte at *data and fold its fields into ins
+ */
+static uint8_t *do_drex(uint8_t *data, insn *ins)	/* returns data advanced past the byte, or NULL */
+{
+    uint8_t drex = *data++;	/* the suffix byte; data now points just past it */
+    operand *dst = &ins->oprs[ins->drexdst];	/* operand slot indexed by ins->drexdst */
+
+    if ((drex & 8) != ((ins->rex & REX_OC) ? 8 : 0))	/* bit 3 must agree with the REX_OC flag in ins->rex */
+	return NULL;	/* OC0 mismatch */
+    ins->rex = (ins->rex & ~7) | (drex & 7);	/* low three bits of the suffix replace the low three bits of ins->rex */
+    
+    dst->segment = SEG_RMREG;	/* NOTE(review): plain assignment, any flags already in segment are dropped -- confirm intended */
+    dst->basereg = drex >> 4;	/* upper four bits of the suffix become basereg */
+    return data;
+}
+
+
+/*
  * Process an effective address (ModRM) specification.
  */
 static uint8_t *do_ea(uint8_t *data, int modrm, int asize,
-		      int segsize, operand * op, int rex)
+		      int segsize, operand * op, insn *ins)
 {
     int mod, rm, scale, index, base;
+    int rex;
+    uint8_t sib = 0;
 
     mod = (modrm >> 6) & 03;
     rm = modrm & 07;
 
+    if (mod != 3 && rm == 4 && asize != 16)
+	sib = *data++;
+
+    if (ins->rex & REX_D) {
+	data = do_drex(data, ins);
+	if (!data)
+	    return NULL;
+    }
+    rex = ins->rex;
+
     if (mod == 3) {             /* pure register version */
         op->basereg = rm+(rex & REX_B ? 8 : 0);
         op->segment |= SEG_RMREG;
@@ -282,10 +312,9 @@ static uint8_t *do_ea(uint8_t *data, int modrm, int asize,
         }
 
         if (rm == 4) {          /* process SIB */
-            scale = (*data >> 6) & 03;
-            index = (*data >> 3) & 07;
-            base = *data & 07;
-            data++;
+            scale = (sib >> 6) & 03;
+            index = (sib >> 3) & 07;
+            base = sib & 07;
 
             op->scale = 1 << scale;
 
@@ -341,12 +370,12 @@ static int matches(const struct itemplate *t, uint8_t *data,
     uint8_t lock = prefix->lock;
     int osize = prefix->osize;
     int asize = prefix->asize;
+    int i;
 
-    ins->oprs[0].segment = ins->oprs[1].segment =
-	ins->oprs[2].segment =
-	ins->oprs[0].addr_size = ins->oprs[1].addr_size =
-	ins->oprs[2].addr_size = (segsize == 64 ? SEG_64BIT :
-				  segsize == 32 ? SEG_32BIT : 0);
+    for (i = 0; i < MAX_OPERANDS; i++) {
+	ins->oprs[i].segment = ins->oprs[i].addr_size =
+	    (segsize == 64 ? SEG_64BIT : segsize == 32 ? SEG_32BIT : 0);
+    }
     ins->condition = -1;
     ins->rex = prefix->rex;
 
@@ -419,7 +448,7 @@ static int matches(const struct itemplate *t, uint8_t *data,
             default:
                 return FALSE;
             }
-	} else if (c >= 010 && c <= 012) {
+	} else if (c >= 010 && c <= 013) {
             int t = *r++, d = *data++;
             if (d < t || d > t + 7)
                 return FALSE;
@@ -428,20 +457,17 @@ static int matches(const struct itemplate *t, uint8_t *data,
 		    (ins->rex & REX_B ? 8 : 0);
                 ins->oprs[c - 010].segment |= SEG_RMREG;
             }
-        } else if (c == 017) {
-            if (*data++)
-                return FALSE;
-	} else if (c >= 014 && c <= 016) {
+	} else if (c >= 014 && c <= 017) {
             ins->oprs[c - 014].offset = (int8_t)*data++;
             ins->oprs[c - 014].segment |= SEG_SIGNED;
-        } else if (c >= 020 && c <= 022) {
+        } else if (c >= 020 && c <= 023) {
             ins->oprs[c - 020].offset = *data++;
-	} else if (c >= 024 && c <= 026) {
+	} else if (c >= 024 && c <= 027) {
             ins->oprs[c - 024].offset = *data++;
-	} else if (c >= 030 && c <= 032) {
+	} else if (c >= 030 && c <= 033) {
             ins->oprs[c - 030].offset = getu16(data);
 	    data += 2;
-        } else if (c >= 034 && c <= 036) {
+        } else if (c >= 034 && c <= 037) {
 	    if (osize == 32) {
 		ins->oprs[c - 034].offset = getu32(data);
 		data += 4;
@@ -451,10 +477,10 @@ static int matches(const struct itemplate *t, uint8_t *data,
 	    }
             if (segsize != asize)
                 ins->oprs[c - 034].addr_size = asize;
-        } else if (c >= 040 && c <= 042) {
+        } else if (c >= 040 && c <= 043) {
             ins->oprs[c - 040].offset = getu32(data);
 	    data += 4;
-        } else if (c >= 044 && c <= 046) {
+        } else if (c >= 044 && c <= 047) {
 	    switch (asize) {
 	    case 16:
 		ins->oprs[c - 044].offset = getu16(data);
@@ -471,18 +497,18 @@ static int matches(const struct itemplate *t, uint8_t *data,
 	    }
             if (segsize != asize)
                 ins->oprs[c - 044].addr_size = asize;
-        } else if (c >= 050 && c <= 052) {
+        } else if (c >= 050 && c <= 053) {
             ins->oprs[c - 050].offset = gets8(data++);
             ins->oprs[c - 050].segment |= SEG_RELATIVE;
-        } else if (c >= 054 && c <= 056) {
+        } else if (c >= 054 && c <= 057) {
 	    ins->oprs[c - 054].offset = getu64(data);
 	    data += 8;
-	} else if (c >= 060 && c <= 062) {
+	} else if (c >= 060 && c <= 063) {
             ins->oprs[c - 060].offset = gets16(data);
 	    data += 2;
             ins->oprs[c - 060].segment |= SEG_RELATIVE;
             ins->oprs[c - 060].segment &= ~SEG_32BIT;
-        } else if (c >= 064 && c <= 066) {
+        } else if (c >= 064 && c <= 067) {
 	    if (osize == 16) {
 		ins->oprs[c - 064].offset = getu16(data);
 		data += 2;
@@ -498,30 +524,44 @@ static int matches(const struct itemplate *t, uint8_t *data,
                     (ins->oprs[c - 064].type & ~SIZE_MASK)
                     | ((osize == 16) ? BITS16 : BITS32);
             }
-        } else if (c >= 070 && c <= 072) {
+        } else if (c >= 070 && c <= 073) {
             ins->oprs[c - 070].offset = getu32(data);
 	    data += 4;
             ins->oprs[c - 070].segment |= SEG_32BIT | SEG_RELATIVE;
-        } else if (c >= 0100 && c < 0130) {
+        } else if (c >= 0100 && c < 0140) {
             int modrm = *data++;
-            ins->oprs[c & 07].basereg = ((modrm >> 3)&7)+
-		(ins->rex & REX_R ? 8 : 0);
             ins->oprs[c & 07].segment |= SEG_RMREG;
             data = do_ea(data, modrm, asize, segsize,
-                         &ins->oprs[(c >> 3) & 07], ins->rex);
-        } else if (c >= 0130 && c <= 0132) {
-            ins->oprs[c - 0130].offset = getu16(data);
+			 &ins->oprs[(c >> 3) & 07], ins);
+	    if (!data)
+		return FALSE;
+            ins->oprs[c & 07].basereg = ((modrm >> 3)&7)+
+		(ins->rex & REX_R ? 8 : 0);
+        } else if (c >= 0140 && c <= 0143) {
+            ins->oprs[c - 0140].offset = getu16(data);
 	    data += 2;
-        } else if (c >= 0140 && c <= 0142) {
-	    ins->oprs[c - 0140].offset = getu32(data);
+        } else if (c >= 0150 && c <= 0153) {
+	    ins->oprs[c - 0150].offset = getu32(data);
 	    data += 4;
+	} else if (c >= 0160 && c <= 0167) {
+	    ins->rex |= (c & 4) ? REX_D|REX_OC : REX_D;
+	    ins->drexdst = c & 3;
+        } else if (c == 0170) {
+            if (*data++)
+                return FALSE;
+	} else if (c == 0171) {
+	    data = do_drex(data, ins);
+	    if (!data)
+		return FALSE;
         } else if (c >= 0200 && c <= 0277) {
             int modrm = *data++;
             if (((modrm >> 3) & 07) != (c & 07))
                 return FALSE;   /* spare field doesn't match up */
             data = do_ea(data, modrm, asize, segsize,
-                         &ins->oprs[(c >> 3) & 07], ins->rex);
-        } else if (c >= 0300 && c <= 0302) {
+                         &ins->oprs[(c >> 3) & 07], ins);
+	    if (!data)
+		return FALSE;
+        } else if (c >= 0300 && c <= 0303) {
             a_used = TRUE;
         } else if (c == 0310) {
             if (asize != 16)
@@ -605,6 +645,10 @@ static int matches(const struct itemplate *t, uint8_t *data,
 	}
     }
 
+    /* REX cannot be combined with DREX */
+    if ((ins->rex & REX_D) && (prefix->rex))
+	return FALSE;
+
     /*
      * Check for unused rep or a/o prefixes.
      */
@@ -627,9 +671,11 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
             int32_t offset, int autosync, uint32_t prefer)
 {
     const struct itemplate * const *p, * const *best_p;
+    const struct disasm_index *ix;
+    uint8_t *dp;
     int length, best_length = 0;
     char *segover;
-    int i, slen, colon;
+    int i, slen, colon, n;
     uint8_t *origdata;
     int works;
     insn tmp_ins, ins;
@@ -684,7 +730,14 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
     best_p = NULL;
     best_pref = INT_MAX;
 
-    for (p = itable[*data]; *p; p++) {
+    dp = data;
+    ix = itable + *dp++;
+    while (ix->n == (size_t)-1) {
+	ix = (const struct disasm_index *)ix->p + *dp++;
+    }
+
+    p = (const struct itemplate * const *)ix->p;
+    for (n = ix->n; n; n--, p++) {
         if ((length = matches(*p, data, &prefix, segsize, &tmp_ins))) {
             works = TRUE;
             /*
@@ -692,19 +745,21 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
 	     * XXX: Need to make sure this is actually correct.
              */
             for (i = 0; i < (*p)->operands; i++) {
-                if (
-                       /* If it's a mem-only EA but we have a register, die. */
-                       ((tmp_ins.oprs[i].segment & SEG_RMREG) &&
-                        !(MEMORY & ~(*p)->opd[i])) ||
-                       /* If it's a reg-only EA but we have a memory ref, die. */
-                       (!(tmp_ins.oprs[i].segment & SEG_RMREG) &&
-                        !(REG_EA & ~(*p)->opd[i]) &&
-                        !((*p)->opd[i] & REG_SMASK)) ||
-                       /* Register type mismatch (eg FS vs REG_DESS): die. */
-                       ((((*p)->opd[i] & (REGISTER | FPUREG)) ||
-                         (tmp_ins.oprs[i].segment & SEG_RMREG)) &&
-                        !whichreg((*p)->opd[i],
-                                  tmp_ins.oprs[i].basereg, tmp_ins.rex))) {
+                if (!((*p)->opd[i] & SAME_AS) &&
+		    (
+			/* If it's a mem-only EA but we have a register, die. */
+			((tmp_ins.oprs[i].segment & SEG_RMREG) &&
+			 !(MEMORY & ~(*p)->opd[i])) ||
+			/* If it's a reg-only EA but we have a memory ref, die. */
+			(!(tmp_ins.oprs[i].segment & SEG_RMREG) &&
+			 !(REG_EA & ~(*p)->opd[i]) &&
+			 !((*p)->opd[i] & REG_SMASK)) ||
+			/* Register type mismatch (eg FS vs REG_DESS): die. */
+			((((*p)->opd[i] & (REGISTER | FPUREG)) ||
+			  (tmp_ins.oprs[i].segment & SEG_RMREG)) &&
+			 !whichreg((*p)->opd[i],
+				   tmp_ins.oprs[i].basereg, tmp_ins.rex))
+			)) {
                     works = FALSE;
                     break;
                 }
@@ -793,107 +848,116 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
     colon = FALSE;
     length += data - origdata;  /* fix up for prefixes */
     for (i = 0; i < (*p)->operands; i++) {
+	opflags_t t = (*p)->opd[i];
+	const operand *o = &ins.oprs[i];
+	int64_t offs;
+
+	if (t & SAME_AS) {
+	    o = &ins.oprs[t & ~SAME_AS];
+	    t = (*p)->opd[t & ~SAME_AS];
+	}
+
         output[slen++] = (colon ? ':' : i == 0 ? ' ' : ',');
 
-        if (ins.oprs[i].segment & SEG_RELATIVE) {
-            ins.oprs[i].offset += offset + length;
+	offs = o->offset;
+        if (o->segment & SEG_RELATIVE) {
+            offs += offset + length;
             /*
              * sort out wraparound
              */
-            if (!(ins.oprs[i].segment & (SEG_32BIT|SEG_64BIT)))
-		ins.oprs[i].offset &= 0xffff;
+            if (!(o->segment & (SEG_32BIT|SEG_64BIT)))
+		offs &= 0xffff;
             /*
              * add sync marker, if autosync is on
              */
             if (autosync)
-                add_sync(ins.oprs[i].offset, 0L);
+                add_sync(offs, 0L);
         }
 
-        if ((*p)->opd[i] & COLON)
+        if (t & COLON)
             colon = TRUE;
         else
             colon = FALSE;
 
-        if (((*p)->opd[i] & (REGISTER | FPUREG)) ||
-            (ins.oprs[i].segment & SEG_RMREG)) {
-            ins.oprs[i].basereg = whichreg((*p)->opd[i],
-                                           ins.oprs[i].basereg, ins.rex);
-            if ((*p)->opd[i] & TO)
+        if ((t & (REGISTER | FPUREG)) ||
+            (o->segment & SEG_RMREG)) {
+	    enum reg_enum reg;
+            reg = whichreg(t, o->basereg, ins.rex);
+            if (t & TO)
                 slen += snprintf(output + slen, outbufsize - slen, "to ");
             slen += snprintf(output + slen, outbufsize - slen, "%s",
-                             reg_names[ins.oprs[i].basereg -
-                                       EXPR_REG_START]);
-        } else if (!(UNITY & ~(*p)->opd[i])) {
+                             reg_names[reg - EXPR_REG_START]);
+        } else if (!(UNITY & ~t)) {
             output[slen++] = '1';
-        } else if ((*p)->opd[i] & IMMEDIATE) {
-            if ((*p)->opd[i] & BITS8) {
+        } else if (t & IMMEDIATE) {
+            if (t & BITS8) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "byte ");
-                if (ins.oprs[i].segment & SEG_SIGNED) {
-                    if (ins.oprs[i].offset < 0) {
-                        ins.oprs[i].offset *= -1;
+                if (o->segment & SEG_SIGNED) {
+                    if (offs < 0) {
+                        offs *= -1;
                         output[slen++] = '-';
                     } else
                         output[slen++] = '+';
                 }
-            } else if ((*p)->opd[i] & BITS16) {
+            } else if (t & BITS16) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "word ");
-            } else if ((*p)->opd[i] & BITS32) {
+            } else if (t & BITS32) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "dword ");
-            } else if ((*p)->opd[i] & BITS64) {
+            } else if (t & BITS64) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "qword ");
-            } else if ((*p)->opd[i] & NEAR) {
+            } else if (t & NEAR) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "near ");
-            } else if ((*p)->opd[i] & SHORT) {
+            } else if (t & SHORT) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "short ");
             }
             slen +=
                 snprintf(output + slen, outbufsize - slen, "0x%"PRIx64"",
-                         ins.oprs[i].offset);
-        } else if (!(MEM_OFFS & ~(*p)->opd[i])) {
+                         offs);
+        } else if (!(MEM_OFFS & ~t)) {
             slen +=
                 snprintf(output + slen, outbufsize - slen, "[%s%s%s0x%"PRIx64"]",
                          (segover ? segover : ""),
                          (segover ? ":" : ""),
-                         (ins.oprs[i].addr_size ==
-                          32 ? "dword " : ins.oprs[i].addr_size ==
-                          16 ? "word " : ""), ins.oprs[i].offset);
+                         (o->addr_size ==
+                          32 ? "dword " : o->addr_size ==
+                          16 ? "word " : ""), offs);
             segover = NULL;
-        } else if (!(REGMEM & ~(*p)->opd[i])) {
+        } else if (!(REGMEM & ~t)) {
             int started = FALSE;
-            if ((*p)->opd[i] & BITS8)
+            if (t & BITS8)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "byte ");
-            if ((*p)->opd[i] & BITS16)
+            if (t & BITS16)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "word ");
-            if ((*p)->opd[i] & BITS32)
+            if (t & BITS32)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "dword ");
-            if ((*p)->opd[i] & BITS64)
+            if (t & BITS64)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "qword ");
-            if ((*p)->opd[i] & BITS80)
+            if (t & BITS80)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "tword ");
-            if ((*p)->opd[i] & FAR)
+            if (t & FAR)
                 slen += snprintf(output + slen, outbufsize - slen, "far ");
-            if ((*p)->opd[i] & NEAR)
+            if (t & NEAR)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "near ");
             output[slen++] = '[';
-            if (ins.oprs[i].addr_size)
+            if (o->addr_size)
                 slen += snprintf(output + slen, outbufsize - slen, "%s",
-                                 (ins.oprs[i].addr_size == 64 ? "qword " :
-				  ins.oprs[i].addr_size == 32 ? "dword " :
-                                  ins.oprs[i].addr_size == 16 ? "word " :
+                                 (o->addr_size == 64 ? "qword " :
+				  o->addr_size == 32 ? "dword " :
+                                  o->addr_size == 16 ? "word " :
 				  ""));
-	    if (ins.oprs[i].eaflags & EAF_REL)
+	    if (o->eaflags & EAF_REL)
 		slen += snprintf(output + slen, outbufsize - slen, "rel ");
             if (segover) {
                 slen +=
@@ -901,27 +965,27 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
                              segover);
                 segover = NULL;
             }
-            if (ins.oprs[i].basereg != -1) {
+            if (o->basereg != -1) {
                 slen += snprintf(output + slen, outbufsize - slen, "%s",
-                                 reg_names[(ins.oprs[i].basereg -
+                                 reg_names[(o->basereg -
                                             EXPR_REG_START)]);
                 started = TRUE;
             }
-            if (ins.oprs[i].indexreg != -1) {
+            if (o->indexreg != -1) {
                 if (started)
                     output[slen++] = '+';
                 slen += snprintf(output + slen, outbufsize - slen, "%s",
-                                 reg_names[(ins.oprs[i].indexreg -
+                                 reg_names[(o->indexreg -
                                             EXPR_REG_START)]);
-                if (ins.oprs[i].scale > 1)
+                if (o->scale > 1)
                     slen +=
                         snprintf(output + slen, outbufsize - slen, "*%d",
-                                 ins.oprs[i].scale);
+                                 o->scale);
                 started = TRUE;
             }
-            if (ins.oprs[i].segment & SEG_DISP8) {
+            if (o->segment & SEG_DISP8) {
 		int minus = 0;
-		int8_t offset = ins.oprs[i].offset;
+		int8_t offset = offs;
 		if (offset < 0) {
 		    minus = 1;
 		    offset = -offset;
@@ -929,9 +993,9 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "%s0x%"PRIx8"",
 			     minus ? "-" : "+", offset);
-            } else if (ins.oprs[i].segment & SEG_DISP16) {
+            } else if (o->segment & SEG_DISP16) {
 		int minus = 0;
-		int16_t offset = ins.oprs[i].offset;
+		int16_t offset = offs;
 		if (offset < 0) {
 		    minus = 1;
 		    offset = -offset;
@@ -939,9 +1003,9 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "%s0x%"PRIx16"",
 			     minus ? "-" : started ? "+" : "", offset);
-            } else if (ins.oprs[i].segment & SEG_DISP32) {
+            } else if (o->segment & SEG_DISP32) {
 		    char *prefix = "";
-		    int32_t offset = ins.oprs[i].offset;
+		    int32_t offset = offs;
 		    if (offset < 0) {
 			offset = -offset;
 			prefix = "-";
diff --git a/doc/nasmdoc.src b/doc/nasmdoc.src
index 13ae013d..c79cd39b 100644
--- a/doc/nasmdoc.src
+++ b/doc/nasmdoc.src
@@ -151,6 +151,7 @@ convention
 \IR{ms-dos} MS-DOS
 \IR{ms-dos device drivers} MS-DOS device drivers
 \IR{multipush} \c{multipush} macro
+\IR{nan} NaN
 \IR{nasm version} NASM version
 \IR{netbsd} NetBSD
 \IR{omf} OMF
@@ -1093,7 +1094,7 @@ syntax in which register names must be prefixed by a \c{%} sign), or
 they can be \i{effective addresses} (see \k{effaddr}), constants
 (\k{const}) or expressions (\k{expr}).
 
-For \i{floating-point} instructions, NASM accepts a wide range of
+For x87 \i{floating-point} instructions, NASM accepts a wide range of
 syntaxes: you can use two-operand forms like MASM supports, or you
 can use NASM's native single-operand forms in most cases.
 \# Details of
@@ -1107,7 +1108,7 @@ For example, you can code:
 \c         fadd    st1,st0         ; this sets st1 := st1 + st0
 \c         fadd    to st1          ; so does this
 
-Almost any floating-point instruction that references memory must
+Almost any x87 floating-point instruction that references memory must
 use one of the prefixes \i\c{DWORD}, \i\c{QWORD} or \i\c{TWORD} to
 indicate what size of \i{memory operand} it refers to.
 
@@ -1115,19 +1116,19 @@ indicate what size of \i{memory operand} it refers to.
 \H{pseudop} \i{Pseudo-Instructions}
 
 Pseudo-instructions are things which, though not real x86 machine
-instructions, are used in the instruction field anyway because
-that's the most convenient place to put them. The current
-pseudo-instructions are \i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ} and
-\i\c{DT}, their \i{uninitialized} counterparts \i\c{RESB},
-\i\c{RESW}, \i\c{RESD}, \i\c{RESQ} and \i\c{REST}, the \i\c{INCBIN}
+instructions, are used in the instruction field anyway because that's
+the most convenient place to put them. The current pseudo-instructions
+are \i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ}, \i\c{DT} and \i\c{DO};
+their \i{uninitialized} counterparts \i\c{RESB}, \i\c{RESW},
+\i\c{RESD}, \i\c{RESQ}, \i\c{REST} and \i\c{RESO}; the \i\c{INCBIN}
 command, the \i\c{EQU} command, and the \i\c{TIMES} prefix.
 
 
 \S{db} \c{DB} and friends: Declaring initialized Data
 
-\i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ} and \i\c{DT} are used, much
-as in MASM, to declare initialized data in the output file. They can
-be invoked in a wide range of ways:
+\i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ}, \i\c{DT} and \i\c{DO} are
+used, much as in MASM, to declare initialized data in the output
+file. They can be invoked in a wide range of ways:
 \I{floating-point}\I{character constant}\I{string constant}
 
 \c       db    0x55                ; just the byte 0x55
@@ -1144,20 +1145,21 @@ be invoked in a wide range of ways:
 \c       dq    1.234567e20         ; double-precision float
 \c       dt    1.234567e20         ; extended-precision float
 
-\c{DT} does not accept \i{numeric constants} as operands.
+\c{DT} and \c{DO} do not accept \i{numeric constants} as operands.
+\c{DB} does not accept \i{floating-point} numbers as operands.
 
 
 \S{resb} \c{RESB} and friends: Declaring \i{Uninitialized} Data
 
-\i\c{RESB}, \i\c{RESW}, \i\c{RESD}, \i\c{RESQ} and \i\c{REST} are
-designed to be used in the BSS section of a module: they declare
-\e{uninitialized} storage space. Each takes a single operand, which
-is the number of bytes, words, doublewords or whatever to reserve.
-As stated in \k{qsother}, NASM does not support the MASM/TASM syntax
-of reserving uninitialized space by writing \I\c{?}\c{DW ?} or
-similar things: this is what it does instead. The operand to a
-\c{RESB}-type pseudo-instruction is a \i\e{critical expression}: see
-\k{crit}.
+\i\c{RESB}, \i\c{RESW}, \i\c{RESD}, \i\c{RESQ}, \i\c{REST} and
+\i\c{RESO} are designed to be used in the BSS section of a module:
+they declare \e{uninitialized} storage space. Each takes a single
+operand, which is the number of bytes, words, doublewords or whatever
+to reserve.  As stated in \k{qsother}, NASM does not support the
+MASM/TASM syntax of reserving uninitialized space by writing
+\I\c{?}\c{DW ?} or similar things: this is what it does instead. The
+operand to a \c{RESB}-type pseudo-instruction is a \i\e{critical
+expression}: see \k{crit}.
 
 For example:
 
@@ -1390,20 +1392,28 @@ when they are operands to \c{dw}.
 \S{fltconst} \I{floating-point, constants}Floating-Point Constants
 
 \i{Floating-point} constants are acceptable only as arguments to
-\i\c{DD}, \i\c{DQ} and \i\c{DT}. They are expressed in the
-traditional form: digits, then a period, then optionally more
-digits, then optionally an \c{E} followed by an exponent. The period
-is mandatory, so that NASM can distinguish between \c{dd 1}, which
-declares an integer constant, and \c{dd 1.0} which declares a
-floating-point constant.
+\i\c{DW}, \i\c{DD}, \i\c{DQ}, \i\c{DT}, and \i\c{DO}. They are
+expressed in the traditional form: digits, then a period, then
+optionally more digits, then optionally an \c{E} followed by an
+exponent. The period is mandatory, so that NASM can distinguish
+between \c{dd 1}, which declares an integer constant, and \c{dd 1.0}
+which declares a floating-point constant.
+
+NASM also supports C99-style hexadecimal floating-point: \c{0x},
+hexadecimal digits, period, optionally more hexadecimal digits, then
+optionally a \c{P} followed by a \e{binary} (not hexadecimal) exponent
+in decimal notation.
 
 Some examples:
 
+\c       dw    -0.5                    ; IEEE half precision
 \c       dd    1.2                     ; an easy one
+\c	 dd    0x1p+2		       ; 1.0x2^2 = 4.0
 \c       dq    1.e10                   ; 10,000,000,000
 \c       dq    1.e+10                  ; synonymous with 1.e10
 \c       dq    1.e-10                  ; 0.000 000 000 1
 \c       dt    3.141592653589793238462 ; pi
+\c       do    1.e+4000		       ; IEEE quad precision
 
 NASM cannot do compile-time arithmetic on floating-point constants.
 This is because NASM is designed to be portable - although it always
@@ -1415,18 +1425,21 @@ do floating arithmetic it would have to include its own complete set
 of floating-point routines, which would significantly increase the
 size of the assembler for very little benefit.
 
+The special tokens \i\c{__Infinity__}, \i\c{__QNaN__} (or
+\i\c{__NaN__}) and \i\c{__SNaN__} can be used to generate
+\I{infinity}infinities, quiet \i{NaN}s, and signalling NaNs,
+respectively.  These are normally used as macros:
 
-\H{expr} \i{Expressions}
+\c %define Inf __Infinity__
+\c %define NaN __QNaN__
+\c
+\c       dq    +1.5, -Inf, NaN         ; Double-precision constants
 
-Expressions in NASM are similar in syntax to those in C.
+\H{expr} \i{Expressions}
 
-NASM does not guarantee the size of the integers used to evaluate
-expressions at compile time: since NASM can compile and run on
-64-bit systems quite happily, don't assume that expressions are
-evaluated in 32-bit registers and so try to make deliberate use of
-\i{integer overflow}. It might not always work. The only thing NASM
-will guarantee is what's guaranteed by ANSI C: you always have \e{at
-least} 32 bits to work in.
+Expressions in NASM are similar in syntax to those in C.  Expressions
+are evaluated as 64-bit integers which are then adjusted to the
+appropriate size.
 
 NASM supports two special tokens in expressions, allowing
 calculations to involve the current assembly position: the
@@ -1560,11 +1573,11 @@ invent one using the macro processor.
 
 When assembling with the optimizer set to level 2 or higher (see
 \k{opt-On}), NASM will use size specifiers (\c{BYTE}, \c{WORD},
-\c{DWORD}, \c{QWORD}, or \c{TWORD}), but will give them the smallest
-possible size. The keyword \c{STRICT} can be used to inhibit
+\c{DWORD}, \c{QWORD}, \c{TWORD} or \c{OWORD}), but will give them the
+smallest possible size. The keyword \c{STRICT} can be used to inhibit
 optimization and force a particular operand to be emitted in the
-specified size. For example, with the optimizer on, and in
-\c{BITS 16} mode,
+specified size. For example, with the optimizer on, and in \c{BITS 16}
+mode,
 
 \c         push dword 33
 
@@ -3425,15 +3438,21 @@ using 16-bit data need an 0x66 and those working on 16-bit addresses
 need an 0x67.
 
 When NASM is in \c{BITS 64} mode, most instructions operate the same
-as they do for \c{BITS 32} mode. However, 16-bit addresses are depreciated
-in the x86-64 architecture extension and the 0x67 prefix is used for 32-bit
-addressing. This is due to the default of 64-bit addressing. When the \c{REX}
-prefix is used, the processor does not know how to address the AH, BH, CH or
-DH (high 8-bit legacy) registers. This because the x86-64 has added a new
-set of registers and the capability to address the low 8-bits of the SP, BP
-SI and DI registers as SPL, BPL, SIL and DIL, respectively; but only when
-the REX prefix is used. In summary, the \c{REX} prefix causes the addressing
-of AH, BH, CH and DH to be replaced by SPL, BPL, SIL and DIL.
+as they do for \c{BITS 32} mode. However, there are 8 more general and
+SSE registers, and 16-bit addressing is no longer supported.
+
+The default address size is 64 bits; 32-bit addressing can be selected
+with the 0x67 prefix.  The default operand size is still 32 bits,
+however, and the 0x66 prefix selects 16-bit operand size.  The \c{REX}
+prefix is used both to select 64-bit operand size, and to access the
+new registers. NASM automatically inserts REX prefixes when
+necessary.
+
+When the \c{REX} prefix is used, the processor does not know how to
+address the AH, BH, CH or DH (high 8-bit legacy) registers. Instead,
+it is possible to access the low 8 bits of the SP, BP, SI and DI
+registers as SPL, BPL, SIL and DIL, respectively; but only when the
+REX prefix is used.
 
 The \c{BITS} directive has an exactly equivalent primitive form,
 \c{[BITS 16]}, \c{[BITS 32]} and \c{[BITS 64]}. The user-level form is
diff --git a/float.c b/float.c
index 099e23f2..d22aa19c 100644
--- a/float.c
+++ b/float.c
@@ -8,6 +8,7 @@
  * initial version 13/ix/96 by Simon Tatham
  */
 
+#include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -18,8 +19,8 @@
 #define TRUE 1
 #define FALSE 0
 
-#define MANT_WORDS 6            /* 64 bits + 32 for accuracy == 96 */
-#define MANT_DIGITS 28          /* 29 digits don't fit in 96 bits */
+#define MANT_WORDS  10          /* 112 bits + 48 for accuracy == 160 */
+#define MANT_DIGITS 49          /* 50 digits don't fit in 160 bits */
 
 /*
  * guaranteed top bit of from is set
@@ -47,9 +48,8 @@ static int ieee_multiply(uint16_t *to, uint16_t *from)
         temp[i] &= 0xFFFF;
     }
     if (temp[0] & 0x8000) {
-        for (i = 0; i < MANT_WORDS; i++)
-            to[i] = temp[i] & 0xFFFF;
-        return 0;
+	memcpy(to, temp, 2*MANT_WORDS);
+	return 0;
     } else {
         for (i = 0; i < MANT_WORDS; i++)
             to[i] = (temp[i] << 1) + !!(temp[i + 1] & 0x8000);
@@ -57,6 +57,91 @@ static int ieee_multiply(uint16_t *to, uint16_t *from)
     }
 }
 
+static int hexval(char c)
+{
+    if (c >= '0' && c <= '9')
+	return c-'0';
+    else if (c >= 'a' && c <= 'f')
+	return c-'a'+10;
+    else
+	return c-'A'+10;
+}
+
+static void ieee_flconvert_hex(char *string, uint16_t *mant,
+			       int32_t *exponent, efunc error)
+{
+    static const int log2tbl[16] =
+	{ -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3 };
+    uint16_t mult[MANT_WORDS+1], *mp;
+    int ms;
+    int32_t twopwr;
+    int seendot, seendigit;
+    unsigned char c;
+
+    twopwr = 0;
+    seendot = seendigit = 0;
+
+    memset(mult, 0, sizeof mult);
+
+    while ((c = *string++) != '\0') {
+	if (c == '.') {
+            if (!seendot)
+                seendot = TRUE;
+            else {
+                error(ERR_NONFATAL,
+                      "too many periods in floating-point constant");
+                return;
+            }
+	} else if (isxdigit(c)) {
+	    int v = hexval(c);
+
+	    if (!seendigit && v) {
+		int l = log2tbl[v];
+
+		seendigit = 1;
+		mp = mult;
+		ms = 15-l;
+
+		twopwr = seendot ? twopwr-4+l : l-3;
+	    }
+
+	    if (seendigit) {
+		if (ms <= 0) {
+		    *mp |= v >> -ms;
+		    mp++;
+		    if (mp > &mult[MANT_WORDS])
+			mp = &mult[MANT_WORDS]; /* Guard slot */
+		    ms += 16;
+		}
+		*mp |= v << ms;
+		ms -= 4;
+
+		if (!seendot)
+		    twopwr += 4;
+	    } else {
+		if (seendot)
+		    twopwr -= 4;
+	    }
+	} else if (c == 'p' || c == 'P') {
+	    twopwr += atoi(string);
+	    break;
+	} else {
+            error(ERR_NONFATAL,
+                  "floating-point constant: `%c' is invalid character",
+                  c);
+            return;
+        }
+    }
+
+    if (!seendigit) {
+	memset(mant, 0, 2*MANT_WORDS); /* Zero */
+	*exponent = 0;
+    } else {
+	memcpy(mant, mult, 2*MANT_WORDS);
+	*exponent = twopwr;
+    }
+}
+
 static void ieee_flconvert(char *string, uint16_t *mant,
                            int32_t *exponent, efunc error)
 {
@@ -67,6 +152,11 @@ static void ieee_flconvert(char *string, uint16_t *mant,
     int32_t tenpwr, twopwr;
     int extratwos, started, seendot;
 
+    if (string[0] == '0' && (string[1] == 'x' || string[1] == 'X')) {
+	ieee_flconvert_hex(string+2, mant, exponent, error);
+	return;
+    }
+
     p = digits;
     tenpwr = 0;
     started = seendot = FALSE;
@@ -213,123 +303,135 @@ static int ieee_round(uint16_t *mant, int i)
 
 #define put(a,b) ( (*(a)=(b)), ((a)[1]=(b)>>8) )
 
-static int to_double(char *str, int32_t sign, uint8_t *result,
-                     efunc error)
+/* Set a bit, using *bigendian* bit numbering (0 = MSB) */
+static void set_bit(uint16_t *mant, int bit)
 {
-    uint16_t mant[MANT_WORDS];
-    int32_t exponent;
+    mant[bit >> 4] |= 1 << (~bit & 15);
+}
 
-    sign = (sign < 0 ? 0x8000L : 0L);
+/* Produce standard IEEE formats, with implicit "1" bit; this makes
+   the following assumptions:
 
-    ieee_flconvert(str, mant, &exponent, error);
-    if (mant[0] & 0x8000) {
-        /*
-         * Non-zero.
-         */
-        exponent--;
-        if (exponent >= -1022 && exponent <= 1024) {
-            /*
-             * Normalised.
-             */
-            exponent += 1023;
-            ieee_shr(mant, 11);
-            ieee_round(mant, 4);
-            if (mant[0] & 0x20) /* did we scale up by one? */
-                ieee_shr(mant, 1), exponent++;
-            mant[0] &= 0xF;     /* remove leading one */
-            put(result + 6, (exponent << 4) | mant[0] | sign);
-            put(result + 4, mant[1]);
-            put(result + 2, mant[2]);
-            put(result + 0, mant[3]);
-        } else if (exponent < -1022 && exponent >= -1074) {
-            /*
-             * Denormal.
-             */
-            int shift = -(exponent + 1011);
-            int sh = shift % 16, wds = shift / 16;
-            ieee_shr(mant, sh);
-            if (ieee_round(mant, 4 - wds)
-                || (sh > 0 && (mant[0] & (0x8000 >> (sh - 1))))) {
-                ieee_shr(mant, 1);
-                if (sh == 0)
-                    mant[0] |= 0x8000;
-                exponent++;
-            }
-            put(result + 6, (wds == 0 ? mant[0] : 0) | sign);
-            put(result + 4, (wds <= 1 ? mant[1 - wds] : 0));
-            put(result + 2, (wds <= 2 ? mant[2 - wds] : 0));
-            put(result + 0, (wds <= 3 ? mant[3 - wds] : 0));
-        } else {
-            if (exponent > 0) {
-                error(ERR_NONFATAL, "overflow in floating-point constant");
-                return 0;
-            } else
-                memset(result, 0, 8);
-        }
-    } else {
-        /*
-         * Zero.
-         */
-        memset(result, 0, 8);
-    }
-    return 1;                   /* success */
-}
+   - the sign bit is the MSB, followed by the exponent.
+   - the sign bit plus exponent fit in 16 bits.
+   - the exponent bias is 2^(n-1)-1 for an n-bit exponent */
+
+struct ieee_format {
+    int words;
+    int mantissa;		/* Bits in the mantissa */
+    int exponent;		/* Bits in the exponent */
+};
 
+static const struct ieee_format ieee_16  = { 1,  10,  5 };
+static const struct ieee_format ieee_32  = { 2,  23,  8 };
+static const struct ieee_format ieee_64  = { 4,  52, 11 };
+static const struct ieee_format ieee_128 = { 8, 112, 15 };
+
+/* Produce all the standard IEEE formats: 16, 32, 64, and 128 bits */
 static int to_float(char *str, int32_t sign, uint8_t *result,
-                    efunc error)
+		    const struct ieee_format *fmt, efunc error)
 {
-    uint16_t mant[MANT_WORDS];
+    uint16_t mant[MANT_WORDS], *mp;
     int32_t exponent;
+    int32_t expmax = 1 << (fmt->exponent-1);
+    uint16_t implicit_one = 0x8000 >> fmt->exponent;
+    int i;
 
     sign = (sign < 0 ? 0x8000L : 0L);
 
-    ieee_flconvert(str, mant, &exponent, error);
-    if (mant[0] & 0x8000) {
-        /*
-         * Non-zero.
-         */
-        exponent--;
-        if (exponent >= -126 && exponent <= 128) {
-            /*
-             * Normalised.
-             */
-            exponent += 127;
-            ieee_shr(mant, 8);
-            ieee_round(mant, 2);
-            if (mant[0] & 0x100)        /* did we scale up by one? */
-                ieee_shr(mant, 1), exponent++;
-            mant[0] &= 0x7F;    /* remove leading one */
-            put(result + 2, (exponent << 7) | mant[0] | sign);
-            put(result + 0, mant[1]);
-        } else if (exponent < -126 && exponent >= -149) {
-            /*
-             * Denormal.
-             */
-            int shift = -(exponent + 118);
-            int sh = shift % 16, wds = shift / 16;
-            ieee_shr(mant, sh);
-            if (ieee_round(mant, 2 - wds)
-                || (sh > 0 && (mant[0] & (0x8000 >> (sh - 1))))) {
-                ieee_shr(mant, 1);
-                if (sh == 0)
-                    mant[0] |= 0x8000;
-                exponent++;
-            }
-            put(result + 2, (wds == 0 ? mant[0] : 0) | sign);
-            put(result + 0, (wds <= 1 ? mant[1 - wds] : 0));
-        } else {
-            if (exponent > 0) {
-                error(ERR_NONFATAL, "overflow in floating-point constant");
-                return 0;
-            } else
-                memset(result, 0, 4);
-        }
+    if (str[0] == '_') {
+	/* NaN or Infinity */
+	int32_t expmask = (1 << fmt->exponent)-1;
+
+	memset(mant, 0, sizeof mant);
+	mant[0] = expmask << (15-fmt->exponent); /* Exponent: all bits one */
+
+	switch (str[2]) {
+	case 'n':		/* __nan__ */
+	case 'N':
+	case 'q':		/* __qnan__ */
+	case 'Q':
+	    set_bit(mant, fmt->exponent+1); /* Highest bit in mantissa */
+	    break;
+	case 's':		/* __snan__ */
+	case 'S':
+	    set_bit(mant, fmt->exponent+fmt->mantissa);	/* Last bit */
+	    break;
+	case 'i':		/* __infinity__ */
+	case 'I':
+	    break;
+	}
     } else {
-        memset(result, 0, 4);
+	ieee_flconvert(str, mant, &exponent, error);
+	if (mant[0] & 0x8000) {
+	    /*
+	     * Non-zero.
+	     */
+	    exponent--;
+	    if (exponent >= 2-expmax && exponent <= expmax) {
+		/*
+		 * Normalised.
+		 */
+		exponent += expmax;
+		ieee_shr(mant, fmt->exponent);
+		ieee_round(mant, fmt->words);
+		/* did we scale up by one? */
+		if (mant[0] & (implicit_one << 1)) {
+		    ieee_shr(mant, 1);
+		    exponent++;
+		}
+		
+		mant[0] &= (implicit_one-1);     /* remove leading one */
+		mant[0] |= exponent << (15 - fmt->exponent);
+	    } else if (exponent < 2-expmax &&
+		       exponent >= 2-expmax-fmt->mantissa) {
+		/*
+		 * Denormal.
+		 */
+		int shift = -(exponent + expmax-2-fmt->exponent);
+		int sh = shift % 16, wds = shift / 16;
+		ieee_shr(mant, sh);
+		if (ieee_round(mant, fmt->words - wds)
+		    || (sh > 0 && (mant[0] & (0x8000 >> (sh - 1))))) {
+		    ieee_shr(mant, 1);
+		    if (sh == 0)
+			mant[0] |= 0x8000;
+		    exponent++;
+		}
+		
+		if (wds) {
+		    for (i = fmt->words-1; i >= wds; i--)
+			mant[i] = mant[i-wds];
+		    for (; i >= 0; i--)
+			mant[i] = 0;
+		}
+	    } else {
+		if (exponent > 0) {
+		    error(ERR_NONFATAL, "overflow in floating-point constant");
+		    return 0;
+		} else {
+		    memset(mant, 0, 2*fmt->words);
+		}
+	    }
+	} else {
+	    /* Zero */
+	    memset(mant, 0, 2*fmt->words);
+	}
     }
-    return 1;
+
+    mant[0] |= sign;
+
+    for (mp = &mant[fmt->words], i = 0; i < fmt->words; i++) {
+	uint16_t m = *--mp;
+	put(result, m);
+	result += 2;
+    }
+
+    return 1;                   /* success */
 }
 
+/* 80-bit format with 64-bit mantissa *including an explicit integer 1*
+   and 15-bit exponent. */
 static int to_ldoub(char *str, int32_t sign, uint8_t *result,
                     efunc error)
 {
@@ -338,6 +440,31 @@ static int to_ldoub(char *str, int32_t sign, uint8_t *result,
 
     sign = (sign < 0 ? 0x8000L : 0L);
 
+    if (str[0] == '_') {	/* leading '_' marks a special token (NaN/infinity) */
+	uint16_t is_snan = 0, is_qnan = 0x8000;	/* lowest / highest mantissa word; defaults encode infinity */
+	switch (str[2]) {	/* third character selects the kind of special value */
+	case 'n':		/* presumably __nan__ (treated as quiet) -- confirm against the tokenizer */
+	case 'N':
+	case 'q':		/* presumably __qnan__ -- confirm against the tokenizer */
+	case 'Q':
+	    is_qnan = 0xc000;	/* quiet NaN: explicit integer bit plus top fraction bit */
+	    break;
+	case 's':		/* __snan__ */
+	case 'S':
+	    is_snan = 1;	/* nonzero fraction with the top fraction bit left clear */
+	    break;
+	case 'i':		/* __infinity__ */
+	case 'I':
+	    break;		/* defaults already describe infinity */
+	}
+	put(result + 0, is_snan);
+	put(result + 2, 0);
+	put(result + 4, 0);
+	put(result + 6, is_qnan);
+	put(result + 8, 0x7fff|sign);	/* all-ones 15-bit exponent, sign in bit 15 */
+	return 1;
+    }
+
     ieee_flconvert(str, mant, &exponent, error);
     if (mant[0] & 0x8000) {
         /*
@@ -351,11 +478,11 @@ static int to_ldoub(char *str, int32_t sign, uint8_t *result,
             exponent += 16383;
             if (ieee_round(mant, 4))    /* did we scale up by one? */
                 ieee_shr(mant, 1), mant[0] |= 0x8000, exponent++;
-            put(result + 8, exponent | sign);
-            put(result + 6, mant[0]);
-            put(result + 4, mant[1]);
-            put(result + 2, mant[2]);
             put(result + 0, mant[3]);
+            put(result + 2, mant[2]);
+            put(result + 4, mant[1]);
+            put(result + 6, mant[0]);
+            put(result + 8, exponent | sign);	/* biased exponent and sign share the last word */
         } else if (exponent < -16383 && exponent >= -16446) {
             /*
              * Denormal.
@@ -370,23 +497,29 @@ static int to_ldoub(char *str, int32_t sign, uint8_t *result,
                     mant[0] |= 0x8000;
                 exponent++;
             }
-            put(result + 8, sign);
-            put(result + 6, (wds == 0 ? mant[0] : 0));
-            put(result + 4, (wds <= 1 ? mant[1 - wds] : 0));
-            put(result + 2, (wds <= 2 ? mant[2 - wds] : 0));
             put(result + 0, (wds <= 3 ? mant[3 - wds] : 0));
+            put(result + 2, (wds <= 2 ? mant[2 - wds] : 0));
+            put(result + 4, (wds <= 1 ? mant[1 - wds] : 0));
+            put(result + 6, (wds == 0 ? mant[0] : 0));
+            put(result + 8, sign);	/* exponent field is zero for a denormal */
         } else {
             if (exponent > 0) {
                 error(ERR_NONFATAL, "overflow in floating-point constant");
                 return 0;
-            } else
-                memset(result, 0, 10);
+            } else {
+		goto zero;	/* too small even for a denormal: emit a signed zero */
+	    }
         }
     } else {
         /*
          * Zero.
          */
-        memset(result, 0, 10);
+    zero:
+	put(result + 0, 0);
+	put(result + 2, 0);
+	put(result + 4, 0);
+	put(result + 6, 0);
+	put(result + 8, sign);	/* unlike the old memset, zero keeps its sign bit */
     }
     return 1;
 }
@@ -394,13 +527,18 @@ static int to_ldoub(char *str, int32_t sign, uint8_t *result,
 int float_const(char *number, int32_t sign, uint8_t *result, int bytes,
                 efunc error)
 {
-    if (bytes == 4)
-        return to_float(number, sign, result, error);
-    else if (bytes == 8)
-        return to_double(number, sign, result, error);
-    else if (bytes == 10)
+    switch (bytes) {	/* pick the converter from the requested size in bytes */
+    case 2:		/* 16 bits, described by ieee_16 */
+	return to_float(number, sign, result, &ieee_16, error);
+    case 4:		/* 32 bits, described by ieee_32 */
+        return to_float(number, sign, result, &ieee_32, error);
+    case 8:		/* 64 bits, described by ieee_64 (replaces the old to_double call) */
+        return to_float(number, sign, result, &ieee_64, error);
+    case 10:		/* 80 bits: has its own converter rather than a to_float format */
         return to_ldoub(number, sign, result, error);
-    else {
+    case 16:		/* 128 bits, described by ieee_128 */
+        return to_float(number, sign, result, &ieee_128, error);
+    default:		/* any other size is reported as an internal error */
         error(ERR_PANIC, "strange value %d passed to float_const", bytes);
         return 0;
     }
diff --git a/insns.dat b/insns.dat
index 5043b0b3..61af07f5 100644
--- a/insns.dat
+++ b/insns.dat
@@ -14,6 +14,22 @@
 ; see the comment at the top of assemble.c.  For a detailed description
 ; of the flags (fourth field), please see insns.h.
 ;
+
+; Special instructions...
+DB        ignore              ignore                        ignore
+DW        ignore              ignore                        ignore
+DD        ignore              ignore                        ignore
+DQ        ignore              ignore                        ignore
+DT        ignore              ignore                        ignore
+DO        ignore              ignore                        ignore
+RESB      imm                 \340                          8086
+RESW      ignore              ignore                        ignore
+RESD      ignore              ignore                        ignore
+RESQ      ignore              ignore                        ignore
+REST      ignore              ignore                        ignore
+RESO      ignore              ignore                        ignore
+
+; Conventional instructions
 AAA       void                \1\x37                        8086,NOLONG
 AAD       void                \2\xD5\x0A                    8086,NOLONG
 AAD       imm                 \1\xD5\24                     8086,SB,NOLONG
@@ -47,14 +63,14 @@ ADC       reg_eax,imm         \321\1\x15\41                 386,SM
 ADC       reg_rax,sbyte       \321\1\x83\202\15             X64,SM,ND
 ADC       reg_rax,imm         \321\1\x15\41                 X64,SM
 ADC       rm8,imm             \300\1\x80\202\21             8086,SM
-ADC       rm16,imm            \320\300\134\1\x81\202\131    8086,SM
-ADC       rm32,imm            \321\300\144\1\x81\202\141    386,SM
-ADC       rm64,imm            \324\300\144\1\x81\202\141    X64,SM
+ADC       rm16,imm            \320\300\145\1\x81\202\141    8086,SM
+ADC       rm32,imm            \321\300\155\1\x81\202\151    386,SM
+ADC       rm64,imm            \324\300\155\1\x81\202\151    X64,SM
 ADC       mem,imm8            \300\1\x80\202\21             8086,SM
-ADC       mem,imm16           \320\300\134\1\x81\202\131    8086,SM
-ADC       mem,imm32           \321\300\144\1\x81\202\141    386,SM
-ADD       mem,reg8            \300\17\101                   8086,SM
-ADD       reg8,reg8           \17\101                       8086
+ADC       mem,imm16           \320\300\145\1\x81\202\141    8086,SM
+ADC       mem,imm32           \321\300\155\1\x81\202\151    386,SM
+ADD       mem,reg8            \300\170\101                  8086,SM
+ADD       reg8,reg8           \170\101                      8086
 ADD       mem,reg16           \320\300\1\x01\101            8086,SM
 ADD       reg16,reg16         \320\1\x01\101                8086
 ADD       mem,reg32           \321\300\1\x01\101            386,SM
@@ -80,12 +96,12 @@ ADD       reg_eax,imm         \321\1\x05\41                 386,SM
 ADD       reg_rax,sbyte       \321\1\x83\200\15             X64,SM,ND
 ADD       reg_rax,imm         \323\1\x05\41                 X64,SM
 ADD       rm8,imm             \300\1\x80\200\21             8086,SM
-ADD       rm16,imm            \320\300\134\1\x81\200\131    8086,SM
-ADD       rm32,imm            \321\300\144\1\x81\200\141    386,SM
-ADD       rm64,imm            \324\300\144\1\x81\200\141    X64,SM
+ADD       rm16,imm            \320\300\145\1\x81\200\141    8086,SM
+ADD       rm32,imm            \321\300\155\1\x81\200\151    386,SM
+ADD       rm64,imm            \324\300\155\1\x81\200\151    X64,SM
 ADD       mem,imm8            \300\1\x80\200\21             8086,SM
-ADD       mem,imm16           \320\300\134\1\x81\200\131    8086,SM
-ADD       mem,imm32           \321\300\144\1\x81\200\141    386,SM
+ADD       mem,imm16           \320\300\145\1\x81\200\141    8086,SM
+ADD       mem,imm32           \321\300\155\1\x81\200\151    386,SM
 AND       mem,reg8            \300\1\x20\101                8086,SM
 AND       reg8,reg8           \1\x20\101                    8086
 AND       mem,reg16           \320\300\1\x21\101            8086,SM
@@ -113,12 +129,12 @@ AND       reg_eax,imm         \321\1\x25\41                 386,SM
 AND       reg_rax,sbyte       \321\1\x83\204\15             X64,SM,ND
 AND       reg_rax,imm         \324\1\x25\41                 X64,SM
 AND       rm8,imm             \300\1\x80\204\21             8086,SM
-AND       rm16,imm            \320\300\134\1\x81\204\131    8086,SM
-AND       rm32,imm            \321\300\144\1\x81\204\141    386,SM
-AND       rm64,imm            \324\300\144\1\x81\204\141    X64,SM
+AND       rm16,imm            \320\300\145\1\x81\204\141    8086,SM
+AND       rm32,imm            \321\300\155\1\x81\204\151    386,SM
+AND       rm64,imm            \324\300\155\1\x81\204\151    X64,SM
 AND       mem,imm8            \300\1\x80\204\21             8086,SM
-AND       mem,imm16           \320\300\134\1\x81\204\131    8086,SM
-AND       mem,imm32           \321\300\144\1\x81\204\141    386,SM
+AND       mem,imm16           \320\300\145\1\x81\204\141    8086,SM
+AND       mem,imm32           \321\300\155\1\x81\204\151    386,SM
 ARPL      mem,reg16           \300\1\x63\101                286,PROT,SM,NOLONG
 ARPL      reg16,reg16         \1\x63\101                    286,PROT,NOLONG
 BOUND     reg16,mem           \320\301\1\x62\110            186,NOLONG
@@ -175,13 +191,13 @@ BTS       rm32,imm            \321\300\2\x0F\xBA\205\25     386,SB
 BTS       rm64,imm            \324\300\2\x0F\xBA\205\25     X64,SB
 CALL      imm                 \322\1\xE8\64                 8086
 CALL      imm|near            \322\1\xE8\64                 8086
-CALL      imm|far             \322\1\x9A\34\37              8086,ND,NOLONG
+CALL      imm|far             \322\1\x9A\34\74              8086,ND,NOLONG
 CALL      imm16               \320\1\xE8\64                 8086
 CALL      imm16|near          \320\1\xE8\64                 8086
-CALL      imm16|far           \320\1\x9A\34\37              8086,ND,NOLONG
+CALL      imm16|far           \320\1\x9A\34\74              8086,ND,NOLONG
 CALL      imm32               \321\1\xE8\64                 386
 CALL      imm32|near          \321\1\xE8\64                 386
-CALL      imm32|far           \321\1\x9A\34\37              386,ND,NOLONG
+CALL      imm32|far           \321\1\x9A\34\74              386,ND,NOLONG
 CALL      imm:imm             \322\1\x9A\35\30              8086,NOLONG
 CALL      imm16:imm           \320\1\x9A\31\30              8086,NOLONG
 CALL      imm:imm16           \320\1\x9A\31\30              8086,NOLONG
@@ -238,12 +254,12 @@ CMP       reg_eax,imm         \321\1\x3D\41                 386,SM
 CMP       reg_rax,sbyte       \321\1\x83\207\15             X64,SM,ND
 CMP       reg_rax,imm         \321\1\x3D\41                 X64,SM
 CMP       rm8,imm             \300\1\x80\207\21             8086,SM
-CMP       rm16,imm            \320\300\134\1\x81\207\131    8086,SM
-CMP       rm32,imm            \321\300\144\1\x81\207\141    386,SM
-CMP       rm64,imm            \324\300\144\1\x81\207\141    X64,SM
+CMP       rm16,imm            \320\300\145\1\x81\207\141    8086,SM
+CMP       rm32,imm            \321\300\155\1\x81\207\151    386,SM
+CMP       rm64,imm            \324\300\155\1\x81\207\151    X64,SM
 CMP       mem,imm8            \300\1\x80\207\21             8086,SM
-CMP       mem,imm16           \320\300\134\1\x81\207\131    8086,SM
-CMP       mem,imm32           \321\300\144\1\x81\207\141    386,SM
+CMP       mem,imm16           \320\300\145\1\x81\207\141    8086,SM
+CMP       mem,imm32           \321\300\155\1\x81\207\151    386,SM
 CMPSB     void                \335\1\xA6                    8086
 CMPSD     void                \335\321\1\xA7                386
 CMPSQ     void                \335\324\1\xA7                X64
@@ -270,8 +286,6 @@ CWD       void                \320\1\x99                    8086
 CWDE      void                \321\1\x98                    386
 DAA       void                \1\x27                        8086,NOLONG
 DAS       void                \1\x2F                        8086,NOLONG
-DB        ignore              ignore                        ignore
-DD        ignore              ignore                        ignore
 DEC       reg16               \320\10\x48                   8086,NOLONG
 DEC       reg32               \321\10\x48                   386,NOLONG
 DEC       rm8                 \300\1\xFE\201                8086
@@ -282,9 +296,6 @@ DIV       rm8                 \300\1\xF6\206                8086
 DIV       rm16                \320\300\1\xF7\206            8086
 DIV       rm32                \321\300\1\xF7\206            386
 DIV       rm64                \324\300\1\xF7\206            X64
-DQ        ignore              ignore                        ignore
-DT        ignore              ignore                        ignore
-DW        ignore              ignore                        ignore
 EMMS      void                \2\x0F\x77                    PENT,MMX
 ENTER     imm,imm             \1\xC8\30\25                  186
 EQU       imm                 \0                            8086
@@ -497,38 +508,38 @@ IMUL      reg64,reg64         \324\2\x0F\xAF\110            X64
 IMUL      reg16,mem,imm8      \320\301\1\x6B\110\16         186,SM
 IMUL      reg16,mem,sbyte     \320\301\1\x6B\110\16         186,SM,ND
 IMUL      reg16,mem,imm16     \320\301\1\x69\110\32         186,SM
-IMUL      reg16,mem,imm       \320\301\135\1\x69\110\132    186,SM,ND
+IMUL      reg16,mem,imm       \320\301\146\1\x69\110\142    186,SM,ND
 IMUL      reg16,reg16,imm8    \320\1\x6B\110\16             186
 IMUL      reg16,reg16,sbyte   \320\1\x6B\110\16             186,SM,ND
 IMUL      reg16,reg16,imm16   \320\1\x69\110\32             186
-IMUL      reg16,reg16,imm     \320\135\1\x69\110\132        186,SM,ND
+IMUL      reg16,reg16,imm     \320\146\1\x69\110\142        186,SM,ND
 IMUL      reg32,mem,imm8      \321\301\1\x6B\110\16         386,SM
 IMUL      reg32,mem,sbyte     \321\301\1\x6B\110\16         386,SM,ND
 IMUL      reg32,mem,imm32     \321\301\1\x69\110\42         386,SM
-IMUL      reg32,mem,imm       \321\301\145\1\x69\110\142    386,SM,ND
+IMUL      reg32,mem,imm       \321\301\156\1\x69\110\152    386,SM,ND
 IMUL      reg32,reg32,imm8    \321\1\x6B\110\16             386
 IMUL      reg32,reg32,sbyte   \321\1\x6B\110\16             386,SM,ND
 IMUL      reg32,reg32,imm32   \321\1\x69\110\42             386
-IMUL      reg32,reg32,imm     \321\145\1\x69\110\142        386,SM,ND
+IMUL      reg32,reg32,imm     \321\156\1\x69\110\152        386,SM,ND
 IMUL      reg64,mem,imm8      \324\301\1\x6B\110\16         X64,SM
 IMUL      reg64,mem,sbyte     \324\301\1\x6B\110\16         X64,SM,ND
 IMUL      reg64,mem,imm32     \324\301\1\x69\110\42         X64,SM
-IMUL      reg64,mem,imm       \324\301\145\1\x69\110\142    X64,SM,ND
+IMUL      reg64,mem,imm       \324\301\156\1\x69\110\152    X64,SM,ND
 IMUL      reg64,reg64,imm8    \324\1\x6B\110\16             X64
 IMUL      reg64,reg64,sbyte   \324\1\x6B\110\16             X64,SM,ND
 IMUL      reg64,reg64,imm32   \324\1\x69\110\42             X64
-IMUL      reg64,reg64,imm     \324\145\1\x69\110\142        X64,SM,ND
+IMUL      reg64,reg64,imm     \324\156\1\x69\110\152        X64,SM,ND
 IMUL      reg16,imm8          \320\1\x6B\100\15             186
 IMUL      reg16,sbyte         \320\1\x6B\100\15             186,SM,ND
 IMUL      reg16,imm16         \320\1\x69\100\31             186
-IMUL      reg16,imm           \320\134\1\x69\100\131        186,SM,ND
+IMUL      reg16,imm           \320\145\1\x69\100\141        186,SM,ND
 IMUL      reg32,imm8          \321\1\x6B\100\15             386
 IMUL      reg32,sbyte         \321\1\x6B\100\15             386,SM,ND
 IMUL      reg32,imm32         \321\1\x69\100\41             386
-IMUL      reg32,imm           \321\144\1\x69\100\141        386,SM,ND
+IMUL      reg32,imm           \321\155\1\x69\100\151        386,SM,ND
 IMUL      reg64,sbyte         \324\1\x6B\100\15             X64,SM,ND
 IMUL      reg64,imm32         \324\1\x69\100\41             X64
-IMUL      reg64,imm           \324\144\1\x69\100\141        X64,SM,ND
+IMUL      reg64,imm           \324\155\1\x69\100\151        X64,SM,ND
 IN        reg_al,imm          \1\xE4\25                     8086,SB
 IN        reg_ax,imm          \320\1\xE5\25                 8086,SB
 IN        reg_eax,imm         \321\1\xE5\25                 386,SB
@@ -564,13 +575,13 @@ JMP       imm|short           \1\xEB\50                     8086
 JMP       imm                 \371\1\xEB\50                 8086,ND
 JMP       imm                 \322\1\xE9\64                 8086
 JMP       imm|near            \322\1\xE9\64                 8086,ND
-JMP       imm|far             \322\1\xEA\34\37              8086,ND,NOLONG
+JMP       imm|far             \322\1\xEA\34\74              8086,ND,NOLONG
 JMP       imm16               \320\1\xE9\64                 8086
 JMP       imm16|near          \320\1\xE9\64                 8086,ND
-JMP       imm16|far           \320\1\xEA\34\37              8086,ND,NOLONG
+JMP       imm16|far           \320\1\xEA\34\74              8086,ND,NOLONG
 JMP       imm32               \321\1\xE9\64                 386
 JMP       imm32|near          \321\1\xE9\64                 386,ND
-JMP       imm32|far           \321\1\xEA\34\37              386,ND,NOLONG
+JMP       imm32|far           \321\1\xEA\34\74              386,ND,NOLONG
 JMP       imm:imm             \322\1\xEA\35\30              8086,NOLONG
 JMP       imm16:imm           \320\1\xEA\31\30              8086,NOLONG
 JMP       imm:imm16           \320\1\xEA\31\30              8086,NOLONG
@@ -618,9 +629,9 @@ LGDT      mem                 \300\2\x0F\x01\202            286,PRIV
 LGS       reg16,mem           \320\301\2\x0F\xB5\110        386
 LGS       reg32,mem           \321\301\2\x0F\xB5\110        386
 LIDT      mem                 \300\2\x0F\x01\203            286,PRIV
-LLDT      mem                 \300\1\x0F\17\202             286,PROT,PRIV
-LLDT      mem16               \300\1\x0F\17\202             286,PROT,PRIV
-LLDT      reg16               \1\x0F\17\202                 286,PROT,PRIV
+LLDT      mem                 \300\1\x0F\170\202            286,PROT,PRIV
+LLDT      mem16               \300\1\x0F\170\202            286,PROT,PRIV
+LLDT      reg16               \1\x0F\170\202                286,PROT,PRIV
 LMSW      mem                 \300\2\x0F\x01\206            286,PRIV
 LMSW      mem16               \300\2\x0F\x01\206            286,PRIV
 LMSW      reg16               \2\x0F\x01\206                286,PRIV
@@ -658,9 +669,9 @@ LSL       reg64,mem           \324\301\2\x0F\x03\110        X64,SM
 LSL       reg64,reg64         \324\2\x0F\x03\110            X64,PROT
 LSS       reg16,mem           \320\301\2\x0F\xB2\110        386
 LSS       reg32,mem           \321\301\2\x0F\xB2\110        386
-LTR       mem                 \300\1\x0F\17\203             286,PROT,PRIV
-LTR       mem16               \300\1\x0F\17\203             286,PROT,PRIV,NOLONG
-LTR       reg16               \1\x0F\17\203                 286,PROT,PRIV,NOLONG
+LTR       mem                 \300\1\x0F\170\203            286,PROT,PRIV
+LTR       mem16               \300\1\x0F\170\203            286,PROT,PRIV,NOLONG
+LTR       reg16               \1\x0F\170\203                286,PROT,PRIV,NOLONG
 MFENCE    void                \3\x0F\xAE\xF0                X64,AMD
 MONITOR   void		      \3\x0F\x01\xC8		    PRESCOTT
 MONITOR	  reg_eax,reg_ecx,reg_edx      \3\x0F\x01\xC8	    PRESCOTT,ND
@@ -791,12 +802,12 @@ OR        reg_eax,imm         \321\1\x0D\41                 386,SM
 OR        reg_rax,sbyte       \321\1\x83\201\15             X64,SM,ND
 OR        reg_rax,imm         \321\1\x0D\41                 X64,SM
 OR        rm8,imm             \300\1\x80\201\21             8086,SM
-OR        rm16,imm            \320\300\134\1\x81\201\131    8086,SM
-OR        rm32,imm            \321\300\144\1\x81\201\141    386,SM
-OR        rm64,imm            \324\300\144\1\x81\201\141    X64,SM
+OR        rm16,imm            \320\300\145\1\x81\201\141    8086,SM
+OR        rm32,imm            \321\300\155\1\x81\201\151    386,SM
+OR        rm64,imm            \324\300\155\1\x81\201\151    X64,SM
 OR        mem,imm8            \300\1\x80\201\21             8086,SM
-OR        mem,imm16           \320\300\134\1\x81\201\131    8086,SM
-OR        mem,imm32           \321\300\144\1\x81\201\141    386,SM
+OR        mem,imm16           \320\300\145\1\x81\201\141    8086,SM
+OR        mem,imm32           \321\300\155\1\x81\201\151    386,SM
 OUT       imm,reg_al          \1\xE6\24                     8086,SB
 OUT       imm,reg_ax          \320\1\xE7\24                 8086,SB
 OUT       imm,reg_eax         \321\1\xE7\24                 386,SB
@@ -990,9 +1001,9 @@ PUSH      reg_dess            \6                            8086,NOLONG
 PUSH      reg_fsgs            \1\x0F\7                      386
 PUSH      imm8                \1\x6A\14                     186
 PUSH      sbyte               \1\x6A\14                     186,ND
-PUSH      imm16               \320\133\1\x68\130            186
-PUSH      imm32               \321\143\1\x68\140            386,NOLONG
-PUSH      imm64               \321\143\1\x68\140            X64
+PUSH      imm16               \320\144\1\x68\140            186
+PUSH      imm32               \321\154\1\x68\150            386,NOLONG
+PUSH      imm64               \321\154\1\x68\150            X64
 PUSH	  imm		      \1\x68\34			    186
 PUSHA     void                \322\1\x60                    186,NOLONG
 PUSHAD    void                \321\1\x60                    386,NOLONG
@@ -1032,11 +1043,6 @@ RDMSR     void                \2\x0F\x32                    PENT,PRIV
 RDPMC     void                \2\x0F\x33                    P6
 RDTSC     void                \2\x0F\x31                    PENT
 RDTSCP    void                \3\x0F\x01\xF9                X64
-RESB      imm                 \340                          8086
-RESD      ignore              ignore                        ignore
-RESQ      ignore              ignore                        ignore
-REST      ignore              ignore                        ignore
-RESW      ignore              ignore                        ignore
 RET       void                \1\xC3                        8086
 RET       imm                 \1\xC2\30                     8086,SW
 RETF      void                \1\xCB                        8086
@@ -1124,12 +1130,12 @@ SBB       reg_eax,imm         \321\1\x1D\41                 386,SM
 SBB       reg_rax,sbyte       \321\1\x83\203\15             X64,SM,ND
 SBB       reg_rax,imm         \321\1\x1D\41                 X64,SM
 SBB       rm8,imm             \300\1\x80\203\21             8086,SM
-SBB       rm16,imm            \320\300\134\1\x81\203\131    8086,SM
-SBB       rm32,imm            \321\300\144\1\x81\203\141    386,SM
-SBB       rm64,imm            \324\300\144\1\x81\203\141    X64,SM
+SBB       rm16,imm            \320\300\145\1\x81\203\141    8086,SM
+SBB       rm32,imm            \321\300\155\1\x81\203\151    386,SM
+SBB       rm64,imm            \324\300\155\1\x81\203\151    X64,SM
 SBB       mem,imm8            \300\1\x80\203\21             8086,SM
-SBB       mem,imm16           \320\300\134\1\x81\203\131    8086,SM
-SBB       mem,imm32           \321\300\144\1\x81\203\141    386,SM
+SBB       mem,imm16           \320\300\145\1\x81\203\141    8086,SM
+SBB       mem,imm32           \321\300\155\1\x81\203\151    386,SM
 SCASB     void                \335\1\xAE                    8086
 SCASD     void                \335\321\1\xAF                386
 SCASQ     void                \335\324\1\xAF                X64
@@ -1185,10 +1191,10 @@ SHRD      reg32,reg32,reg_cl  \321\2\x0F\xAD\101            386
 SHRD      mem,reg64,reg_cl    \300\324\2\x0F\xAD\101        X64,SM
 SHRD      reg64,reg64,reg_cl  \324\2\x0F\xAD\101            X64
 SIDT      mem                 \300\2\x0F\x01\201            286
-SLDT      mem                 \300\1\x0F\17\200             286
-SLDT      mem16               \300\1\x0F\17\200             286
-SLDT      reg16               \320\1\x0F\17\200             286
-SLDT      reg32               \321\1\x0F\17\200             386
+SLDT      mem                 \300\1\x0F\170\200            286
+SLDT      mem16               \300\1\x0F\170\200            286
+SLDT      reg16               \320\1\x0F\170\200            286
+SLDT      reg32               \321\1\x0F\170\200            386
 SKINIT    void                \3\x0F\x01\xDE                X64
 SMI       void                \1\xF1                        386,UNDOC
 SMINT     void                \2\x0F\x38                    P6,CYRIX
@@ -1206,11 +1212,11 @@ STOSB     void                \1\xAA                        8086
 STOSD     void                \321\1\xAB                    386
 STOSQ     void                \324\1\xAB                    X64
 STOSW     void                \320\1\xAB                    8086
-STR       mem                 \300\1\x0F\17\201             286,PROT
-STR       mem16               \300\1\x0F\17\201             286,PROT
-STR       reg16               \320\1\x0F\17\201             286,PROT
-STR       reg32               \321\1\x0F\17\201             386,PROT
-STR       reg64               \324\1\x0F\17\201             X64
+STR       mem                 \300\1\x0F\170\201            286,PROT
+STR       mem16               \300\1\x0F\170\201            286,PROT
+STR       reg16               \320\1\x0F\170\201            286,PROT
+STR       reg32               \321\1\x0F\170\201            386,PROT
+STR       reg64               \324\1\x0F\170\201            X64
 SUB       mem,reg8            \300\1\x28\101                8086,SM
 SUB       reg8,reg8           \1\x28\101                    8086
 SUB       mem,reg16           \320\300\1\x29\101            8086,SM
@@ -1238,12 +1244,12 @@ SUB       reg_eax,imm         \321\1\x2D\41                 386,SM
 SUB       reg_rax,sbyte       \321\1\x83\205\15             X64,SM,ND
 SUB       reg_rax,imm         \321\1\x2D\41                 X64,SM
 SUB       rm8,imm             \300\1\x80\205\21             8086,SM
-SUB       rm16,imm            \320\300\134\1\x81\205\131    8086,SM
-SUB       rm32,imm            \321\300\144\1\x81\205\141    386,SM
-SUB       rm64,imm            \324\300\144\1\x81\205\141    X64,SM
+SUB       rm16,imm            \320\300\145\1\x81\205\141    8086,SM
+SUB       rm32,imm            \321\300\155\1\x81\205\151    386,SM
+SUB       rm64,imm            \324\300\155\1\x81\205\151    X64,SM
 SUB       mem,imm8            \300\1\x80\205\21             8086,SM
-SUB       mem,imm16           \320\300\134\1\x81\205\131    8086,SM
-SUB       mem,imm32           \321\300\144\1\x81\205\141    386,SM
+SUB       mem,imm16           \320\300\145\1\x81\205\141    8086,SM
+SUB       mem,imm32           \321\300\155\1\x81\205\151    386,SM
 SVDC      mem80,reg_sreg      \300\2\x0F\x78\101            486,CYRIX,SMM
 SVLDT     mem80               \300\2\x0F\x7A\200            486,CYRIX,SMM
 SVTS      mem80               \300\2\x0F\x7C\200            486,CYRIX,SMM
@@ -1290,12 +1296,12 @@ UMOV      reg16,mem           \320\301\2\x0F\x13\110        386,UNDOC,SM
 UMOV      reg16,reg16         \320\2\x0F\x13\110            386,UNDOC
 UMOV      reg32,mem           \321\301\2\x0F\x13\110        386,UNDOC,SM
 UMOV      reg32,reg32         \321\2\x0F\x13\110            386,UNDOC
-VERR      mem                 \300\1\x0F\17\204             286,PROT
-VERR      mem16               \300\1\x0F\17\204             286,PROT
-VERR      reg16               \1\x0F\17\204                 286,PROT
-VERW      mem                 \300\1\x0F\17\205             286,PROT
-VERW      mem16               \300\1\x0F\17\205             286,PROT
-VERW      reg16               \1\x0F\17\205                 286,PROT
+VERR      mem                 \300\1\x0F\170\204            286,PROT
+VERR      mem16               \300\1\x0F\170\204            286,PROT
+VERR      reg16               \1\x0F\170\204                286,PROT
+VERW      mem                 \300\1\x0F\170\205            286,PROT
+VERW      mem16               \300\1\x0F\170\205            286,PROT
+VERW      reg16               \1\x0F\170\205                286,PROT
 WAIT      void                \1\x9B                        8086
 FWAIT     void                \1\x9B                        8086
 WBINVD    void                \2\x0F\x09                    486,PRIV
@@ -1363,12 +1369,12 @@ XOR       reg_eax,imm         \321\1\x35\41                 386,SM
 XOR       reg_rax,sbyte       \321\1\x83\206\15             X64,SM,ND
 XOR       reg_rax,imm         \321\1\x35\41                 X64,SM
 XOR       rm8,imm             \300\1\x80\206\21             8086,SM
-XOR       rm16,imm            \320\300\134\1\x81\206\131    8086,SM
-XOR       rm32,imm            \321\300\144\1\x81\206\141    386,SM
-XOR       rm64,imm            \324\300\144\1\x81\206\141    X64,SM
+XOR       rm16,imm            \320\300\145\1\x81\206\141    8086,SM
+XOR       rm32,imm            \321\300\155\1\x81\206\151    386,SM
+XOR       rm64,imm            \324\300\155\1\x81\206\151    X64,SM
 XOR       mem,imm8            \300\1\x80\206\21             8086,SM
-XOR       mem,imm16           \320\300\134\1\x81\206\131    8086,SM
-XOR       mem,imm32           \321\300\144\1\x81\206\141    386,SM
+XOR       mem,imm16           \320\300\145\1\x81\206\141    8086,SM
+XOR       mem,imm32           \321\300\155\1\x81\206\151    386,SM
 XSTORE    void                \3\x0F\xA7\xC0                P6,CYRIX
 CMOVcc    reg16,mem           \320\301\1\x0F\330\x40\110    P6,SM
 CMOVcc    reg16,reg16         \320\1\x0F\330\x40\110        P6
@@ -2023,3 +2029,169 @@ PCMPGTQ		xmmreg,xmmrm		\366\3\x0F\x38\x37\110		SSE42
 POPCNT		reg16,rm16		\320\333\2\x0F\xB8\110		NEHALEM
 POPCNT		reg32,rm32		\321\333\2\x0F\xB8\110		NEHALEM
 POPCNT		reg64,rm32		\324\333\2\x0F\xB8\110		NEHALEM,X64
+
+; AMD SSE5 instructions
+
+; Four operands with DREX
+FMADDPS		xmmreg,=0,xmmreg,xmmrm	\160\2\x0F\x24\170\132		SSE5,AMD
+FMADDPS		xmmreg,=0,xmmrm,xmmreg	\164\2\x0F\x24\170\123		SSE5,AMD
+FMADDPS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x04\121		SSE5,AMD
+FMADDPS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x04\112		SSE5,AMD
+FMADDPD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x01\132		SSE5,AMD
+FMADDPD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x01\123		SSE5,AMD
+FMADDPD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x05\121		SSE5,AMD
+FMADDPD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x05\112		SSE5,AMD
+FMADDSS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x02\132		SSE5,AMD
+FMADDSS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x02\123		SSE5,AMD
+FMADDSS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x06\121		SSE5,AMD
+FMADDSS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x06\112		SSE5,AMD
+FMADDSD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x03\132		SSE5,AMD
+FMADDSD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x03\123		SSE5,AMD
+FMADDSD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x07\121		SSE5,AMD
+FMADDSD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x07\112		SSE5,AMD
+FMSUBPS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x08\132		SSE5,AMD
+FMSUBPS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x08\123		SSE5,AMD
+FMSUBPS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0C\121		SSE5,AMD
+FMSUBPS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0C\112		SSE5,AMD
+FMSUBPD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x09\132		SSE5,AMD
+FMSUBPD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x09\123		SSE5,AMD
+FMSUBPD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0D\121		SSE5,AMD
+FMSUBPD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0D\112		SSE5,AMD
+FMSUBSS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x0A\132		SSE5,AMD
+FMSUBSS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x0A\123		SSE5,AMD
+FMSUBSS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0E\121		SSE5,AMD
+FMSUBSS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0E\112		SSE5,AMD
+FMSUBSD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x0B\132		SSE5,AMD
+FMSUBSD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x0B\123		SSE5,AMD
+FMSUBSD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0F\121		SSE5,AMD
+FMSUBSD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0F\112		SSE5,AMD
+FMNADDPS	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x10\132		SSE5,AMD
+FMNADDPS	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x10\123		SSE5,AMD
+FMNADDPS	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x14\121		SSE5,AMD
+FMNADDPS	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x14\112		SSE5,AMD
+FMNADDPD	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x11\132		SSE5,AMD
+FMNADDPD	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x11\123		SSE5,AMD
+FMNADDPD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x15\121		SSE5,AMD
+FMNADDPD	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x15\112		SSE5,AMD
+FMNADDSS	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x12\132		SSE5,AMD
+FMNADDSS	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x12\123		SSE5,AMD
+FMNADDSS	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x16\121		SSE5,AMD
+FMNADDSS	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x16\112		SSE5,AMD
+FMNADDSD	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x13\132		SSE5,AMD
+FMNADDSD	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x13\123		SSE5,AMD
+FMNADDSD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x17\121		SSE5,AMD
+FMNADDSD	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x17\112		SSE5,AMD
+FMNSUBPS	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x18\132		SSE5,AMD
+FMNSUBPS	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x18\123		SSE5,AMD
+FMNSUBPS	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x1C\121		SSE5,AMD
+FMNSUBPS	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x1C\112		SSE5,AMD
+FMNSUBPD	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x19\132		SSE5,AMD
+FMNSUBPD	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x19\123		SSE5,AMD
+FMNSUBPD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x1D\121		SSE5,AMD
+FMNSUBPD	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x1D\112		SSE5,AMD
+FMNSUBSS	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x1A\132		SSE5,AMD
+FMNSUBSS	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x1A\123		SSE5,AMD
+FMNSUBSS	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x1E\121		SSE5,AMD
+FMNSUBSS	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x1E\112		SSE5,AMD
+FMNSUBSD	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x1B\132		SSE5,AMD
+FMNSUBSD	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x1B\123		SSE5,AMD
+FMNSUBSD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x1F\121		SSE5,AMD
+FMNSUBSD	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x1F\112		SSE5,AMD
+COMPS		xmmreg,xmmreg,xmmrm,imm \160\3\x0F\x25\x2C\121\27	SSE5,AMD
+COMPD		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x2D\121\27	SSE5,AMD
+COMSS		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x2E\121\27	SSE5,AMD
+COMSD		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x2F\121\27	SSE5,AMD
+PCOMB		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x4C\121\27	SSE5,AMD
+PCOMW		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x4D\121\27	SSE5,AMD
+PCOMD		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x4E\121\27	SSE5,AMD
+PCOMQ		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x4F\121\27	SSE5,AMD
+PCOMUB		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x6C\121\27	SSE5,AMD
+PCOMUW		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x6D\121\27	SSE5,AMD
+PCOMUD		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x6E\121\27	SSE5,AMD
+PCOMUQ		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x6F\121\27	SSE5,AMD
+PERMPS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x20\132		SSE5,AMD
+PERMPS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x20\123		SSE5,AMD
+PERMPS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x24\121		SSE5,AMD
+PERMPS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x24\112		SSE5,AMD
+PERMPD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x21\132		SSE5,AMD
+PERMPD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x21\123		SSE5,AMD
+PERMPD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x25\121		SSE5,AMD
+PERMPD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x25\112		SSE5,AMD
+PCMOV		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x22\132		SSE5,AMD
+PCMOV		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x22\123		SSE5,AMD
+PCMOV		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x26\121		SSE5,AMD
+PCMOV		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x26\112		SSE5,AMD
+PPERM		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x23\132		SSE5,AMD
+PPERM		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x23\123		SSE5,AMD
+PPERM		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x27\121		SSE5,AMD
+PPERM		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x27\112		SSE5,AMD
+PMACSSWW	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x85\121		SSE5,AMD
+PMACSWW		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x95\121		SSE5,AMD
+PMACSSWD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x86\121		SSE5,AMD
+PMACSWD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x96\121		SSE5,AMD
+PMACSSDD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x8E\121		SSE5,AMD
+PMACSDD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x9E\121		SSE5,AMD
+PMACSSDQL	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x87\121		SSE5,AMD
+PMACSDQL	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x97\121		SSE5,AMD
+PMACSSDQH	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x8F\121		SSE5,AMD
+PMACSDQH	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x9F\121		SSE5,AMD
+PMADCSSWD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\xA6\121		SSE5,AMD
+PMADCSWD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\xB6\121		SSE5,AMD
+
+; Three operands with DREX
+PROTB		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x40\121		SSE5,AMD
+PROTB		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x40\112		SSE5,AMD
+PROTW		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x41\121		SSE5,AMD
+PROTW		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x41\112		SSE5,AMD
+PROTD		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x42\121		SSE5,AMD
+PROTD		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x42\112		SSE5,AMD
+PROTQ		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x43\121		SSE5,AMD
+PROTQ		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x43\112		SSE5,AMD
+PSHLB		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x44\121		SSE5,AMD
+PSHLB		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x44\112		SSE5,AMD
+PSHLW		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x45\121		SSE5,AMD
+PSHLW		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x45\112		SSE5,AMD
+PSHLD		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x46\121		SSE5,AMD
+PSHLD		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x46\112		SSE5,AMD
+PSHLQ		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x47\121		SSE5,AMD
+PSHLQ		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x47\112		SSE5,AMD
+PSHAB		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x48\121		SSE5,AMD
+PSHAB		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x48\112		SSE5,AMD
+PSHAW		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x49\121		SSE5,AMD
+PSHAW		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x49\112		SSE5,AMD
+PSHAD		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x4A\121		SSE5,AMD
+PSHAD		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x4A\112		SSE5,AMD
+PSHAQ		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x4B\121		SSE5,AMD
+PSHAQ		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x4B\112		SSE5,AMD
+
+; Non-DREX
+FRCZPS		xmmreg,xmmrm		\3\x0F\x7A\x10\110		SSE5,AMD
+FRCZPD		xmmreg,xmmrm		\3\x0F\x7A\x11\110		SSE5,AMD
+FRCZSS		xmmreg,xmmrm		\3\x0F\x7A\x12\110		SSE5,AMD
+FRCZSD		xmmreg,xmmrm		\3\x0F\x7A\x13\110		SSE5,AMD
+CVTPH2PS	xmmreg,xmmrm		\3\x0F\x7A\x30\110		SSE5,AMD,SQ
+CVTPS2PH	xmmrm,xmmreg		\3\x0F\x7A\x31\101		SSE5,AMD,SQ
+PHADDBW		xmmreg,xmmrm		\3\x0F\x7A\x41\110		SSE5,AMD
+PHADDBD		xmmreg,xmmrm		\3\x0F\x7A\x42\110		SSE5,AMD
+PHADDBQ		xmmreg,xmmrm		\3\x0F\x7A\x43\110		SSE5,AMD
+PHADDWD		xmmreg,xmmrm		\3\x0F\x7A\x46\110		SSE5,AMD
+PHADDWQ		xmmreg,xmmrm		\3\x0F\x7A\x47\110		SSE5,AMD
+PHADDDQ		xmmreg,xmmrm		\3\x0F\x7A\x4B\110		SSE5,AMD
+PHADDUBW	xmmreg,xmmrm		\3\x0F\x7A\x51\110		SSE5,AMD
+PHADDUBD	xmmreg,xmmrm		\3\x0F\x7A\x52\110		SSE5,AMD
+PHADDUBQ	xmmreg,xmmrm		\3\x0F\x7A\x53\110		SSE5,AMD
+PHADDUWD	xmmreg,xmmrm		\3\x0F\x7A\x56\110		SSE5,AMD
+PHADDUWQ	xmmreg,xmmrm		\3\x0F\x7A\x57\110		SSE5,AMD
+PHADDUDQ	xmmreg,xmmrm		\3\x0F\x7A\x5B\110		SSE5,AMD
+PHSUBBW		xmmreg,xmmrm		\3\x0F\x7A\x61\110		SSE5,AMD
+PHSUBWD		xmmreg,xmmrm		\3\x0F\x7A\x62\110		SSE5,AMD
+PHSUBDQ		xmmreg,xmmrm		\3\x0F\x7A\x63\110		SSE5,AMD
+PROTB		xmmreg,xmmrm,imm	\3\x0F\x7B\x40\110\26		SSE5,AMD
+PROTW		xmmreg,xmmrm,imm	\3\x0F\x7B\x41\110\26		SSE5,AMD
+PROTD		xmmreg,xmmrm,imm	\3\x0F\x7B\x42\110\26		SSE5,AMD
+PROTQ		xmmreg,xmmrm,imm	\3\x0F\x7B\x43\110\26		SSE5,AMD
+PTEST		xmmreg,xmmrm		\366\3\x0F\x38\x17\110		SSE5,AMD
+ROUNDPS		xmmreg,xmmrm,imm	\366\3\x0F\x3A\x08\110\26	SSE5,AMD
+ROUNDPD		xmmreg,xmmrm,imm	\366\3\x0F\x3A\x09\110\26	SSE5,AMD
+ROUNDSS		xmmreg,xmmrm,imm	\366\3\x0F\x3A\x0A\110\26	SSE5,AMD
+ROUNDSD		xmmreg,xmmrm,imm	\366\3\x0F\x3A\x0B\110\26	SSE5,AMD
diff --git a/insns.h b/insns.h
index 4deccf94..314737af 100644
--- a/insns.h
+++ b/insns.h
@@ -9,26 +9,35 @@
 #ifndef NASM_INSNS_H
 #define NASM_INSNS_H
 
-#include "insnsi.h"             /* instruction opcode enum */
+#include "nasm.h"
 
 /* max length of any instruction, register name etc. */
-#if MAX_INSLEN > 9              /* MAX_INSLEN defined in insnsi.h */
+#if MAX_INSLEN > 12              /* MAX_INSLEN defined in insnsi.h */
 #define MAX_KEYWORD MAX_INSLEN
 #else
-#define MAX_KEYWORD 9
+#define MAX_KEYWORD 12
 #endif
 
 struct itemplate {
     enum opcode opcode;		/* the token, passed from "parser.c" */
     int operands;		/* number of operands */
-    int32_t opd[3];		/* bit flags for operand types */
+    opflags_t opd[MAX_OPERANDS]; /* bit flags for operand types */
     const char *code;		/* the code it assembles to */
     uint32_t flags;		/* some flags */
 };
 
+/* Disassembler table structure; each table has one entry per byte value */
+/* If n == -1, then p points to another table of 256 struct disasm_index
+   (indexed by the following byte); otherwise p points to an array of n
+   pointers to struct itemplate to consider (p is NULL when n == 0). */
+struct disasm_index {
+    const void *p;		/* nested table, or template pointer array */
+    int n;			/* -1 for a nested table, else entry count */
+};
+
 /* Tables for the assembler and disassembler, respectively */
 extern const struct itemplate * const nasm_instructions[];
-extern const struct itemplate * const * const itable[];
+extern const struct disasm_index itable[256];
 
 /*
  * this define is used to signify the end of an itemplate
@@ -66,12 +75,15 @@ extern const struct itemplate * const * const itable[];
 #define IF_SM2    0x00000002UL  /* size match first two operands */
 #define IF_SB     0x00000004UL  /* unsized operands can't be non-byte */
 #define IF_SW     0x00000008UL  /* unsized operands can't be non-word */
-#define IF_SD     0x00000010UL  /* unsized operands can't be non-dword */
-#define IF_SQ     0x00000020UL  /* unsized operands can't be non-qword */
-#define IF_AR0	  0x00000040UL  /* SB, SW, SD applies to argument 0 */
-#define IF_AR1	  0x00000080UL  /* SB, SW, SD applies to argument 1 */
-#define IF_AR2	  0x000000C0UL  /* SB, SW, SD applies to argument 2 */
-#define IF_ARMASK 0x000000C0UL  /* mask for unsized argument spec */
+#define IF_SD     0x0000000CUL  /* unsized operands can't be non-dword */
+#define IF_SQ     0x00000010UL  /* unsized operands can't be non-qword */
+#define IF_SO     0x00000014UL  /* unsized operands can't be non-oword */
+#define IF_SMASK  0x0000001CUL  /* mask for unsized argument size; SB..SO are field values, not independent bits */
+#define IF_AR0	  0x00000020UL  /* SB, SW, SD applies to argument 0 */
+#define IF_AR1	  0x00000040UL  /* SB, SW, SD applies to argument 1 */
+#define IF_AR2	  0x00000060UL  /* SB, SW, SD applies to argument 2 */
+#define IF_AR3	  0x00000080UL  /* SB, SW, SD applies to argument 3 */
+#define IF_ARMASK 0x000000E0UL  /* mask for unsized argument spec */
 #define IF_PRIV   0x00000100UL  /* it's a privileged instruction */
 #define IF_SMM    0x00000200UL  /* it's only valid in SMM */
 #define IF_PROT   0x00000400UL  /* it's protected mode only */
@@ -88,6 +100,7 @@ extern const struct itemplate * const * const itable[];
 #define IF_SSSE3  0x00200000UL  /* it's an SSSE3 instruction */
 #define IF_SSE41  0x00400000UL  /* it's an SSE4.1 instruction */
 #define IF_SSE42  0x00800000UL  /* it's an SSE4.2 instruction */
+#define IF_SSE5   0x00800000UL  /* HACK NEED TO REORGANIZE THESE BITS */
 #define IF_PMASK  0xFF000000UL  /* the mask for processor types */
 #define IF_PLEVEL 0x0F000000UL  /* the mask for processor instr. level */
                                         /* also the highest possible processor */
diff --git a/insns.pl b/insns.pl
index 421f16aa..356c183d 100644
--- a/insns.pl
+++ b/insns.pl
@@ -7,6 +7,10 @@
 # redistributable under the licence given in the file "Licence"
 # distributed in the NASM archive.
 
+# Opcode prefixes which need their own opcode tables
+# LONGER PREFIXES FIRST!
+@disasm_prefixes = qw(0F0F 0F24 0F25 0F38 0F3A 0F7A 0F);
+
 print STDERR "Reading insns.dat...\n";
 
 @args   = ();
@@ -26,6 +30,8 @@ foreach $arg ( @ARGV ) {
 $fname = "insns.dat" unless $fname = $args[0];
 open (F, $fname) || die "unable to open $fname";
 
+%dinstables = ();
+
 $line = 0;
 $insns = 0;
 while (<F>) {
@@ -50,9 +56,11 @@ while (<F>) {
   }
   if ($formatted && !$nd) {
     push @big, $formatted;
-    foreach $i (&startbyte($_[2])) {
-      $aname = sprintf "dd_%02X",$i;
-      push @$aname, $#big;
+    foreach $i (startseq($_[2])) {
+	if (!defined($dinstables{$i})) {
+	    $dinstables{$i} = [];
+	}
+	push(@{$dinstables{$i}}, $#big);
     }
   }
 }
@@ -102,26 +110,42 @@ if ( !defined($output) || $output eq 'd' ) {
     print D "\n";
     
     print D "static const struct itemplate instrux[] = {\n";
+    $n = 0;
     foreach $j (@big) {
-	print D "    $j\n";
+	printf D "    /* %4d */ %s\n", $n++, $j;
     }
-	print D "    ITEMPLATE_END\n};\n\n";
-    
-    for ($c=0; $c<256; $c++) {
-	$h = sprintf "%02X", $c;
-	print D "static const struct itemplate * const itable_${h}[] = {\n";
-	$aname = "dd_$h";
-	foreach $j (@$aname) {
+    print D "};\n";
+
+    foreach $h (sort(keys(%dinstables))) {
+	print D "\nstatic const struct itemplate * const itable_${h}[] = {\n";
+	foreach $j (@{$dinstables{$h}}) {
 	    print D "    instrux + $j,\n";
 	}
-	print D "    NULL\n};\n\n";
-    }
-    
-    print D "const struct itemplate * const * const itable[] = {\n";
-    for ($c=0; $c<256; $c++) {
-	printf D "    itable_%02X,\n", $c;
+	print D "};\n";
     }
+
+    foreach $h (@disasm_prefixes, '') {
+	$is_prefix{$h} = 1;
+	print D "\n";
+	print D "static " unless ($h eq '');
+	print D "const struct disasm_index ";
+	print D ($h eq '') ? 'itable' : "itable_$h";
+	print D "[256] = {\n";
+	for ($c = 0; $c < 256; $c++) {
+	    $nn = sprintf("%s%02X", $h, $c);
+	    if ($is_prefix{$nn}) {
+		die "$0: ambiguous decoding of $nn\n"
+		    if (defined($dinstables{$nn}));
+		printf D "    { itable_%s, -1 },\n", $nn;
+	    } elsif (defined($dinstables{$nn})) {
+		printf D "    { itable_%s, %u },\n",
+	    	$nn, scalar(@{$dinstables{$nn}});
+	    } else {
+		printf D "    { NULL, 0 },\n";
+	    }
+	}
     print D "};\n";
+    }
     
     close D;
 }
@@ -203,60 +227,130 @@ if ( !defined($output) || $output eq 'n' ) {
 printf STDERR "Done: %d instructions\n", $insns;
 
 sub format {
-  local ($opcode, $operands, $codes, $flags) = @_;
-  local $num, $nd = 0;
-
-  return (undef, undef) if $operands eq "ignore";
+    my ($opcode, $operands, $codes, $flags) = @_;
+    my $num, $nd = 0;	# NOTE(review): without parentheses only $num is lexical; $nd is the package global -- confirm the caller before changing
 
-  # format the operands
-  $operands =~ s/:/|colon,/g;
-  $operands =~ s/mem(\d+)/mem|bits$1/g;
-  $operands =~ s/mem/memory/g;
-  $operands =~ s/memory_offs/mem_offs/g;
-  $operands =~ s/imm(\d+)/imm|bits$1/g;
-  $operands =~ s/imm/immediate/g;
-  $operands =~ s/rm(\d+)/rm_gpr|bits$1/g;
-  $operands =~ s/mmxrm/rm_mmx/g;
-  $operands =~ s/xmmrm/rm_xmm/g;
-  $num = 3;
-  $operands = '0,0,0', $num = 0 if $operands eq 'void';
-  $operands .= ',0', $num-- while $operands !~ /,.*,/;
-  $operands =~ tr/a-z/A-Z/;
+    return (undef, undef) if $operands eq "ignore";
+    
+    # format the operands
+    $operands =~ s/:/|colon,/g;
+    $operands =~ s/mem(\d+)/mem|bits$1/g;
+    $operands =~ s/mem/memory/g;
+    $operands =~ s/memory_offs/mem_offs/g;
+    $operands =~ s/imm(\d+)/imm|bits$1/g;
+    $operands =~ s/imm/immediate/g;
+    $operands =~ s/rm(\d+)/rm_gpr|bits$1/g;
+    $operands =~ s/mmxrm/rm_mmx/g;
+    $operands =~ s/xmmrm/rm_xmm/g;
+    $operands =~ s/\=([0-9]+)/same_as|$1/g;
+    if ($operands eq 'void') {
+	@ops = ();
+    } else {
+	@ops = split(/\,/, $operands);
+    }
+    $num = scalar(@ops);
+    while (scalar(@ops) < 4) {
+	push(@ops, '0');
+    }
+    $operands = join(',', @ops);
+    $operands =~ tr/a-z/A-Z/;
+    
+    # format the flags
+    $flags =~ s/,/|IF_/g;
+    $flags =~ s/(\|IF_ND|IF_ND\|)//, $nd = 1 if $flags =~ /IF_ND/;
+    $flags = "IF_" . $flags;
+    
+    ("{I_$opcode, $num, {$operands}, \"$codes\", $flags},", $nd);
+}
 
-  # format the flags
-  $flags =~ s/,/|IF_/g;
-  $flags =~ s/(\|IF_ND|IF_ND\|)//, $nd = 1 if $flags =~ /IF_ND/;
-  $flags = "IF_" . $flags;
+sub hexlist($$$) {
+    my($prefix, $start, $n) = @_;
+    my $i;
+    my @l = ();
 
-  ("{I_$opcode, $num, {$operands}, \"$codes\", $flags},", $nd);
+    for ($i = 0; $i < $n; $i++) {
+	push(@l, sprintf("%s%02X", $prefix, $start+$i));
+    }
+    return @l;
 }
 
 # Here we determine the range of possible starting bytes for a given
 # instruction. We need only consider the codes:
 # \1 \2 \3     mean literal bytes, of course
 # \4 \5 \6 \7  mean PUSH/POP of segment registers: special case
-# \10 \11 \12  mean byte plus register value
-# \17          means byte zero
+# \1[0123]     mean byte plus register value
+# \170         means byte zero
 # \330         means byte plus condition code
 # \0 or \340   mean give up and return empty set
-sub startbyte {
-  local ($codes) = @_;
-  local $word, @range;
+sub startseq($) {
+  my ($codestr) = @_;
+  my @codes = ();		# numeric value of each escape, in order
+  my $c = $codestr;		# part of the string not yet parsed
+  my ($c0, $c1);		# current code and a peek at the next one
+  # Opcode-table prefix matched so far (one of @disasm_prefixes, or '')
+  my $prefix = '';
+
+  # Although these are C-syntax strings, by convention they should have
+  # only octal escapes (for directives) and hexadecimal escapes
+  # (for verbatim bytes)
+  while ($c ne '') {
+      if ($c =~ /^\\x([0-9a-f]+)(.*)$/i) {
+	  push(@codes, hex $1);
+	  $c = $2;
+	  next;
+      } elsif ($c =~ /^\\([0-7]{1,3})(.*)$/) {
+	  push(@codes, oct $1);
+	  $c = $2;
+	  next;
+      } else {
+	  die "$0: unknown code format in \"$codestr\"\n";
+      }
+  }
+
+  while ($c0 = shift(@codes)) {
+      $c1 = $codes[0];
+      if ($c0 == 01 || $c0 == 02 || $c0 == 03 || $c0 == 0170) {
+	  # Fixed byte string
+	  my $fbs = $prefix;
+	  while (1) {
+	      if ($c0 == 01 || $c0 == 02 || $c0 == 03) {
+		  while ($c0--) {
+		      $fbs .= sprintf("%02X", shift(@codes));
+		  }
+	      } elsif ($c0 == 0170) {
+		  $fbs .= '00';
+	      } else {
+		  last;
+	      }
+	      $c0 = shift(@codes);
+	  }
+
+	  foreach $pfx (@disasm_prefixes) {
+	      if ($fbs =~ /^$pfx(.*)$/) {
+		  $prefix = $pfx;
+		  $fbs = $1;
+		  last;
+	      }
+	  }
 
-  while (1) {
-    die "couldn't get code in '$codes'" if $codes !~ /^(\\[^\\]+)(\\.*)?$/;
-    $word = $1, $codes = $2;
-    return (hex $1) if $word =~ /^\\[123]$/ && $codes =~ /^\\x(..)/;
-    return (0x07, 0x17, 0x1F) if $word eq "\\4";
-    return (0xA1, 0xA9) if $word eq "\\5";
-    return (0x06, 0x0E, 0x16, 0x1E) if $word eq "\\6";
-    return (0xA0, 0xA8) if $word eq "\\7";
-    $start=hex $1, $r=8, last if $word =~ /^\\1[012]$/ && $codes =~/^\\x(..)/;
-    return (0) if $word eq "\\17";
-    $start=hex $1, $r=16, last if $word =~ /^\\330$/ && $codes =~ /^\\x(..)/;
-    return () if $word eq "\\0" || $word eq "\\340";
+	  return ($prefix.substr($fbs,0,2)) if ($fbs ne '');
+	  # Only a table prefix was consumed: hand the lookahead code back
+	  unshift(@codes, $c0);
+      } elsif ($c0 == 04) {
+	  return ("07", "17", "1F");
+      } elsif ($c0 == 05) {
+	  return ("A1", "A9");
+      } elsif ($c0 == 06) {
+	  return ("06", "0E", "16", "1E");
+      } elsif ($c0 == 07) {
+	  return ("A0", "A8");
+      } elsif ($c0 >= 010 && $c0 <= 013) {
+	  return hexlist($prefix, $c1, 8);
+      } elsif ($c0 == 0330) {
+	  return hexlist($prefix, $c1, 16);
+      } elsif ($c0 == 0 || $c0 == 0340) {
+	  return ();
+      }
   }
-  @range = ();
-  push @range, $start++ while ($r-- > 0);
-  @range;
+  return ();
 }
diff --git a/nasm.h b/nasm.h
index 4ae93b61..f4afad36 100644
--- a/nasm.h
+++ b/nasm.h
@@ -375,7 +375,7 @@ enum {
  *
  * The bits are assigned as follows:
  *
- * Bits 0-7: sizes
+ * Bits 0-7, 29: sizes
  *  0:  8 bits (BYTE)
  *  1: 16 bits (WORD)
  *  2: 32 bits (DWORD)
@@ -384,6 +384,7 @@ enum {
  *  5: FAR
  *  6: NEAR
  *  7: SHORT
+ * 29: 128 bits (OWORD)
  *
  * Bits 8-11 modifiers
  *  8: TO
@@ -438,21 +439,29 @@ enum {
  * 25: RM_MMX (MMXREG)
  * 26: RM_XMM (XMMREG)
  *
- * Bits 27-31 are currently unallocated.
+ * Bits 27-28 & 31 are currently unallocated.
+ *
+ * 30: SAME_AS
+ * Special flag only used in instruction patterns; means this operand
+ * has to be identical to another operand.  Currently only supported
+ * for registers.
  */
 
+typedef uint32_t opflags_t;
+
 /* Size, and other attributes, of the operand */
 #define BITS8     	0x00000001L
 #define BITS16    	0x00000002L
 #define BITS32    	0x00000004L
 #define BITS64    	0x00000008L   /* x64 and FPU only */
 #define BITS80    	0x00000010L   /* FPU only */
+#define BITS128		0x20000000L
 #define FAR       	0x00000020L   /* grotty: this means 16:16 or */
                                        /* 16:32, like in CALL/JMP */
 #define NEAR      	0x00000040L
 #define SHORT     	0x00000080L   /* and this means what it says :) */
 
-#define SIZE_MASK 	0x000000FFL   /* all the size attributes */
+#define SIZE_MASK 	0x200000FFL   /* all the size attributes */
 
 /* Modifiers */
 #define MODIFIER_MASK	0x00000f00L
@@ -527,6 +536,9 @@ enum {
 #define UNITY		0x00012000L   /* for shift/rotate instructions */
 #define SBYTE		0x00022000L   /* for op r16/32,immediate instrs. */
 
+/* special flags */
+#define SAME_AS		0x40000000L
+
 /* Register names automatically generated from regs.dat */
 #include "regs.h"
 
@@ -540,6 +552,8 @@ enum ccode {			/* condition code names */
 /*
  * REX flags
  */
+#define REX_OC		0x0200	/* DREX suffix has the OC0 bit set */
+#define REX_D		0x0100	/* Instruction uses DREX instead of REX */
 #define REX_H		0x80	/* High register present, REX forbidden */
 #define REX_P		0x40	/* REX prefix present/required */
 #define REX_L		0x20	/* Use LOCK prefix instead of REX.R */
@@ -607,6 +621,7 @@ typedef struct extop {          /* extended operand */
 } extop;
 
 #define MAXPREFIX 4
+#define MAX_OPERANDS 4
 
 typedef struct {                /* an instruction itself */
     char *label;              /* the label defined, or NULL */
@@ -616,12 +631,13 @@ typedef struct {                /* an instruction itself */
     enum ccode condition;       /* the condition code, if Jcc/SETcc */
     int operands;               /* how many operands? 0-3 
                                  * (more if db et al) */
-    operand oprs[3];            /* the operands, defined as above */
+    operand oprs[MAX_OPERANDS]; /* the operands, defined as above */
     extop *eops;                /* extended operands */
     int eops_float;             /* true if DD and floating */
     int32_t times;              /* repeat count (TIMES prefix) */
     int forw_ref;               /* is there a forward reference? */
-    uint8_t rex;                /* Special REX Prefix */
+    int rex;			/* Special REX Prefix */
+    int drexdst;		/* Destination register for DREX suffix */
 } insn;
 
 enum geninfo { GI_SWITCH };
@@ -945,8 +961,8 @@ struct dfmt {
  */
 
 enum special_tokens {
-    S_ABS, S_BYTE, S_DWORD, S_FAR, S_LONG, S_NEAR, S_NOSPLIT, S_QWORD, S_REL,
-    S_SHORT, S_STRICT, S_TO, S_TWORD, S_WORD
+    S_ABS, S_BYTE, S_DWORD, S_FAR, S_LONG, S_NEAR, S_NOSPLIT,
+    S_OWORD, S_QWORD, S_REL, S_SHORT, S_STRICT, S_TO, S_TWORD, S_WORD
 };
 
 /*
diff --git a/parser.c b/parser.c
index 1c7b8d9b..31c3612a 100644
--- a/parser.c
+++ b/parser.c
@@ -175,23 +175,25 @@ insn *parse_line(int pass, char *buffer, insn * result,
      * For the moment, EQU has the same difficulty, so we'll
      * include that.
      */
-    if (result->opcode == I_RESB || result->opcode == I_RESW || result->opcode == I_RESD || result->opcode == I_RESQ || result->opcode == I_REST || result->opcode == I_EQU || result->opcode == I_INCBIN) {    /* fbk */
+    if (result->opcode == I_RESB || result->opcode == I_RESW ||
+	result->opcode == I_RESD || result->opcode == I_RESQ ||
+	result->opcode == I_REST || result->opcode == I_RESO ||
+	result->opcode == I_EQU || result->opcode == I_INCBIN) {
         critical = pass0;
     } else
         critical = (pass == 2 ? 2 : 0);
 
-    if (result->opcode == I_DB ||
-        result->opcode == I_DW ||
-        result->opcode == I_DD ||
-        result->opcode == I_DQ ||
-        result->opcode == I_DT || result->opcode == I_INCBIN) {
+    if (result->opcode == I_DB || result->opcode == I_DW ||
+        result->opcode == I_DD || result->opcode == I_DQ ||
+        result->opcode == I_DT || result->opcode == I_DO ||
+	result->opcode == I_INCBIN) {
         extop *eop, **tail = &result->eops, **fixptr;
         int oper_num = 0;
 
         result->eops_float = FALSE;
 
         /*
-         * Begin to read the DB/DW/DD/DQ/DT/INCBIN operands.
+         * Begin to read the DB/DW/DD/DQ/DT/DO/INCBIN operands.
          */
         while (1) {
             i = stdscan(NULL, &tokval);
@@ -212,45 +214,56 @@ insn *parse_line(int pass, char *buffer, insn * result,
                 continue;
             }
 
-            if ((i == TOKEN_FLOAT && is_comma_next()) || i == '-') {
-                int32_t sign = +1L;
+            if ((i == TOKEN_FLOAT && is_comma_next())
+		|| i == '-' || i == '+') {
+                int32_t sign = +1;
 
-                if (i == '-') {
+                if (i == '+' || i == '-') {
                     char *save = stdscan_bufptr;
+		    int token = i;
+		    sign = (i == '-') ? -1 : 1;
                     i = stdscan(NULL, &tokval);
-                    sign = -1L;
                     if (i != TOKEN_FLOAT || !is_comma_next()) {
                         stdscan_bufptr = save;
-                        i = tokval.t_type = '-';
+                        i = tokval.t_type = token;
                     }
                 }
 
                 if (i == TOKEN_FLOAT) {
                     eop->type = EOT_DB_STRING;
                     result->eops_float = TRUE;
-                    if (result->opcode == I_DD)
+		    switch (result->opcode) {
+		    case I_DW:
+			eop->stringlen = 2;
+			break;
+		    case I_DD:
                         eop->stringlen = 4;
-                    else if (result->opcode == I_DQ)
+			break;
+		    case I_DQ:
                         eop->stringlen = 8;
-                    else if (result->opcode == I_DT)
+			break;
+		    case I_DT:
                         eop->stringlen = 10;
-                    else {
+			break;
+		    case I_DO:
+                        eop->stringlen = 16;
+			break;
+		    default:
                         error(ERR_NONFATAL, "floating-point constant"
-                              " encountered in `D%c' instruction",
-                              result->opcode == I_DW ? 'W' : 'B');
+                              " encountered in `db' instruction");
                         /*
                          * fix suggested by Pedro Gimeno... original line
                          * was:
                          * eop->type = EOT_NOTHING;
                          */
                         eop->stringlen = 0;
+			break;
                     }
-                    eop =
-                        nasm_realloc(eop, sizeof(extop) + eop->stringlen);
+                    eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
                     tail = &eop->next;
                     *fixptr = eop;
                     eop->stringval = (char *)eop + sizeof(extop);
-                    if (eop->stringlen < 4 ||
+                    if (!eop->stringlen ||
                         !float_const(tokval.t_charptr, sign,
                                      (uint8_t *)eop->stringval,
                                      eop->stringlen, error))
@@ -339,10 +352,10 @@ insn *parse_line(int pass, char *buffer, insn * result,
         return result;
     }
 
-    /* right. Now we begin to parse the operands. There may be up to three
+    /* right. Now we begin to parse the operands. There may be up to four
      * of these, separated by commas, and terminated by a zero token. */
 
-    for (operand = 0; operand < 3; operand++) {
+    for (operand = 0; operand < MAX_OPERANDS; operand++) {
         expr *value;            /* used most of the time */
         int mref;               /* is this going to be a memory ref? */
         int bracket;            /* is it a [] mref, or a & mref? */
@@ -384,6 +397,11 @@ insn *parse_line(int pass, char *buffer, insn * result,
                     result->oprs[operand].type |= BITS80;
                 setsize = 1;
                 break;
+            case S_OWORD:
+                if (!setsize)
+                    result->oprs[operand].type |= BITS128;
+                setsize = 1;
+                break;
             case S_TO:
                 result->oprs[operand].type |= TO;
                 break;
@@ -440,6 +458,9 @@ insn *parse_line(int pass, char *buffer, insn * result,
                     case S_TWORD:
                         result->oprs[operand].type |= BITS80;
                         break;
+                    case S_OWORD:
+                        result->oprs[operand].type |= BITS128;
+                        break;
                     default:
                         error(ERR_NONFATAL,
                               "invalid operand size specification");
@@ -751,7 +772,7 @@ insn *parse_line(int pass, char *buffer, insn * result,
         result->oprs[operand++].type = 0;
 
     /*
-     * Transform RESW, RESD, RESQ, REST into RESB.
+     * Transform RESW, RESD, RESQ, REST, RESO into RESB.
      */
     switch (result->opcode) {
     case I_RESW:
@@ -770,6 +791,10 @@ insn *parse_line(int pass, char *buffer, insn * result,
         result->opcode = I_RESB;
         result->oprs[0].offset *= 10;
         break;
+    case I_RESO:
+        result->opcode = I_RESB;
+        result->oprs[0].offset *= 16;
+        break;
     default:
 	break;
     }
diff --git a/perllib/phash.ph b/perllib/phash.ph
index 60334272..3bb3a05b 100644
--- a/perllib/phash.ph
+++ b/perllib/phash.ph
@@ -42,8 +42,8 @@ sub prehash($$$) {
 
     foreach $c (unpack("C*", $key)) {
 	$ko1 = $k1;  $ko2 = $k2;
-	$k1 = int32(rot($ko1,$s0)-rot($ko2, $s1)+$c);
-	$k2 = int32(rot($ko2,$s2)-rot($ko1, $s3)+$c);
+	$k1 = int32(rot($ko1,$s0)^int32(rot($ko2, $s1)+$c));
+	$k2 = int32(rot($ko2,$s2)^int32(rot($ko1, $s3)+$c));
     }
 
     # Create a bipartite graph...
diff --git a/pptok.pl b/pptok.pl
index a0425b7c..a835bf3e 100755
--- a/pptok.pl
+++ b/pptok.pl
@@ -191,8 +191,8 @@ if ($what eq 'c') {
     print OUT  "    while ((c = *p++) != 0) {\n";
     print OUT  "        uint32_t kn1, kn2;\n";
     print OUT  "        c |= 0x20; /* convert to lower case */\n";
-    printf OUT "        kn1 = rot(k1,%2d) - rot(k2,%2d) + c;\n", ${$sv}[0], ${$sv}[1];
-    printf OUT "        kn2 = rot(k2,%2d) - rot(k1,%2d) + c;\n", ${$sv}[2], ${$sv}[3];
+    printf OUT "        kn1 = rot(k1,%2d)^(rot(k2,%2d) + c);\n", ${$sv}[0], ${$sv}[1];
+    printf OUT "        kn2 = rot(k2,%2d)^(rot(k1,%2d) + c);\n", ${$sv}[2], ${$sv}[3];
     print OUT  "        k1 = kn1; k2 = kn2;\n";
     print OUT  "    }\n";
     print OUT  "\n";
diff --git a/stdscan.c b/stdscan.c
index d4ad696d..aecbd4a7 100644
--- a/stdscan.c
+++ b/stdscan.c
@@ -75,7 +75,6 @@ int stdscan(void *private_data, struct tokenval *tv)
         (*stdscan_bufptr == '$' && isidstart(stdscan_bufptr[1]))) {
         /* now we've got an identifier */
         int is_sym = FALSE;
-	int t;
 
         if (*stdscan_bufptr == '$') {
             is_sym = TRUE;
@@ -99,10 +98,7 @@ int stdscan(void *private_data, struct tokenval *tv)
         *r = '\0';
         /* right, so we have an identifier sitting in temp storage. now,
          * is it actually a register or instruction name, or what? */
-	if ((t = nasm_token_hash(ourcopy, tv)) != -1)
-	    return t;
-	else
-	    return tv->t_type = TOKEN_ID;
+	return nasm_token_hash(ourcopy, tv);
     } else if (*stdscan_bufptr == '$' && !isnumchar(stdscan_bufptr[1])) {
         /*
          * It's a $ sign with no following hex number; this must
@@ -130,7 +126,9 @@ int stdscan(void *private_data, struct tokenval *tv)
             stdscan_bufptr++;
             while (isnumchar(*stdscan_bufptr) ||
                    ((stdscan_bufptr[-1] == 'e'
-                     || stdscan_bufptr[-1] == 'E')
+                     || stdscan_bufptr[-1] == 'E'
+		     || stdscan_bufptr[-1] == 'p'
+		     || stdscan_bufptr[-1] == 'P')
                     && (*stdscan_bufptr == '-' || *stdscan_bufptr == '+'))) {
                 stdscan_bufptr++;
             }
diff --git a/test/float.asm b/test/float.asm
new file mode 100644
index 00000000..bcb2ec28
--- /dev/null
+++ b/test/float.asm
@@ -0,0 +1,133 @@
+;
+; Test of floating-point formats
+;
+
+; 16-bit
+	dw 1.0
+	dw +1.0
+	dw -1.0
+	dw 0.0
+	dw +0.0
+	dw -0.0
+	dw 1.83203125
+	dw +1.83203125
+	dw -1.83203125
+	dw 1.83203125e3
+	dw +1.83203125e3
+	dw -1.83203125e3
+	dw 1.83203125e-3
+	dw +1.83203125e-3
+	dw -1.83203125e-3
+	dw 1.83203125e-6		; Denormal!
+	dw +1.83203125e-6		; Denormal!
+	dw -1.83203125e-6		; Denormal!
+	dw __Infinity__
+	dw +__Infinity__
+	dw -__Infinity__
+	dw __NaN__
+	dw __QNaN__
+	dw __SNaN__
+
+; 32-bit
+	dd 1.0
+	dd +1.0
+	dd -1.0
+	dd 0.0
+	dd +0.0
+	dd -0.0
+	dd 1.83203125
+	dd +1.83203125
+	dd -1.83203125
+	dd 1.83203125e15
+	dd +1.83203125e15
+	dd -1.83203125e15
+	dd 1.83203125e-15
+	dd +1.83203125e-15
+	dd -1.83203125e-15
+	dd 1.83203125e-40		; Denormal!
+	dd +1.83203125e-40		; Denormal!
+	dd -1.83203125e-40		; Denormal!
+	dd __Infinity__
+	dd +__Infinity__
+	dd -__Infinity__
+	dd __NaN__
+	dd __QNaN__
+	dd __SNaN__
+
+; 64-bit
+	dq 1.0
+	dq +1.0
+	dq -1.0
+	dq 0.0
+	dq +0.0
+	dq -0.0
+	dq 1.83203125
+	dq +1.83203125
+	dq -1.83203125
+	dq 1.83203125e300
+	dq +1.83203125e300
+	dq -1.83203125e300
+	dq 1.83203125e-300
+	dq +1.83203125e-300
+	dq -1.83203125e-300
+	dq 1.83203125e-320		; Denormal!
+	dq +1.83203125e-320		; Denormal!
+	dq -1.83203125e-320		; Denormal!
+	dq __Infinity__
+	dq +__Infinity__
+	dq -__Infinity__
+	dq __NaN__
+	dq __QNaN__
+	dq __SNaN__
+
+; 80-bit
+	dt 1.0
+	dt +1.0
+	dt -1.0
+	dt 0.0
+	dt +0.0
+	dt -0.0
+	dt 1.83203125
+	dt +1.83203125
+	dt -1.83203125
+	dt 1.83203125e+4000
+	dt +1.83203125e+4000
+	dt -1.83203125e+4000
+	dt 1.83203125e-4000
+	dt +1.83203125e-4000
+	dt -1.83203125e-4000
+	dt 1.83203125e-4940		; Denormal!
+	dt +1.83203125e-4940		; Denormal!
+	dt -1.83203125e-4940		; Denormal!
+	dt __Infinity__
+	dt +__Infinity__
+	dt -__Infinity__
+	dt __NaN__
+	dt __QNaN__
+	dt __SNaN__
+
+; 128-bit
+	do 1.0
+	do +1.0
+	do -1.0
+	do 0.0
+	do +0.0
+	do -0.0
+	do 1.83203125
+	do +1.83203125
+	do -1.83203125
+	do 1.83203125e+4000
+	do +1.83203125e+4000
+	do -1.83203125e+4000
+	do 1.83203125e-4000
+	do +1.83203125e-4000
+	do -1.83203125e-4000
+	do 1.83203125e-4940		; Denormal!
+	do +1.83203125e-4940		; Denormal!
+	do -1.83203125e-4940		; Denormal!
+	do __Infinity__
+	do +__Infinity__
+	do -__Infinity__
+	do __NaN__
+	do __QNaN__
+	do __SNaN__
diff --git a/test/floatx.asm b/test/floatx.asm
new file mode 100644
index 00000000..f513ec83
--- /dev/null
+++ b/test/floatx.asm
@@ -0,0 +1,125 @@
+;
+; floatx.asm
+;
+; Test hexadecimal floating-point numbers
+
+; 16-bit
+	dw 1.0
+	dw 0x1.0
+	dw 2.0
+	dw 0x2.0
+	dw 0x1.0p+1
+	dw 0x1.0p-1
+	dw 0x0.0
+	dw 0x1.23456789
+	dw 0x0.123456789
+	dw 0x0.0000123456789
+	dw 0x1.23456789p10
+	dw 0x1.23456789p+10
+	dw 0x1.23456789p-10
+	dw 0x0.123456789p10
+	dw 0x0.123456789p+10
+	dw 0x0.123456789abcdef0123456789abcdef012345p-10
+	dw 0x0.0000123456789
+	dw 0x0.0000123456789p+10
+	dw 0x0.0000123456789p-10
+
+; 32-bit
+	dd 1.0
+	dd 0x1.0
+	dd 2.0
+	dd 0x2.0
+	dd 0x1.0p+1
+	dd 0x1.0p-1
+	dd 0x0.0
+	dd 0x1.23456789
+	dd 0x0.123456789
+	dd 0x0.0000123456789
+	dd 0x1.23456789p10
+	dd 0x1.23456789p+10
+	dd 0x1.23456789p-10
+	dd 0x0.123456789p10
+	dd 0x0.123456789p+10
+	dd 0x0.123456789abcdef0123456789abcdef012345p-10
+	dd 0x0.0000123456789
+	dd 0x0.0000123456789p+10
+	dd 0x0.0000123456789p-10
+	dd 0x123456789.0
+	dd 0x0000123456789.0
+	dd 0x123456789.0p+0
+	dd 0x123456789.0p+64
+
+; 64-bit
+	dq 1.0
+	dq 0x1.0
+	dq 2.0
+	dq 0x2.0
+	dq 0x1.0p+1
+	dq 0x1.0p-1
+	dq 0x0.0
+	dq 0x1.23456789
+	dq 0x0.123456789
+	dq 0x0.0000123456789
+	dq 0x1.23456789p10
+	dq 0x1.23456789p+10
+	dq 0x1.23456789p-10
+	dq 0x0.123456789p10
+	dq 0x0.123456789p+10
+	dq 0x0.123456789abcdef0123456789abcdef012345p-10
+	dq 0x0.0000123456789
+	dq 0x0.0000123456789p+10
+	dq 0x0.0000123456789p-10
+	dq 0x123456789.0
+	dq 0x0000123456789.0
+	dq 0x123456789.0p+0
+	dq 0x123456789.0p+300
+	
+; 80-bit
+	dt 1.0
+	dt 0x1.0
+	dt 2.0
+	dt 0x2.0
+	dt 0x1.0p+1
+	dt 0x1.0p-1
+	dt 0x0.0
+	dt 0x1.23456789
+	dt 0x0.123456789
+	dt 0x0.0000123456789
+	dt 0x1.23456789p10
+	dt 0x1.23456789p+10
+	dt 0x1.23456789p-10
+	dt 0x0.123456789p10
+	dt 0x0.123456789p+10
+	dt 0x0.123456789abcdef0123456789abcdef012345p-10
+	dt 0x0.0000123456789
+	dt 0x0.0000123456789p+10
+	dt 0x0.0000123456789p-10
+	dt 0x123456789.0
+	dt 0x0000123456789.0
+	dt 0x123456789.0p+0
+	dt 0x123456789.0p+1024
+
+; 128-bit
+	do 1.0
+	do 0x1.0
+	do 2.0
+	do 0x2.0
+	do 0x1.0p+1
+	do 0x1.0p-1
+	do 0x0.0
+	do 0x1.23456789
+	do 0x0.123456789
+	do 0x0.0000123456789
+	do 0x1.23456789p10
+	do 0x1.23456789p+10
+	do 0x1.23456789p-10
+	do 0x0.123456789p10
+	do 0x0.123456789p+10
+	do 0x0.123456789abcdef0123456789abcdef012345p-10
+	do 0x0.0000123456789
+	do 0x0.0000123456789p+10
+	do 0x0.0000123456789p-10
+	do 0x123456789.0
+	do 0x0000123456789.0
+	do 0x123456789.0p+0
+	do 0x123456789.0p+1024
diff --git a/test/fmsub.asm b/test/fmsub.asm
new file mode 100644
index 00000000..7f087cd7
--- /dev/null
+++ b/test/fmsub.asm
@@ -0,0 +1,16 @@
+	bits 64
+
+	fmsubps xmm0,xmm0,xmm1,xmm2
+	fmsubps xmm0,xmm0,xmm1,[rax]
+	fmsubps xmm0,xmm0,xmm1,[rax+0x77]
+	fmsubps xmm0,xmm0,xmm1,[rax+0x7777]
+	fmsubps xmm1,xmm2,xmm3,xmm1
+	fmsubps xmm1,xmm2,[rax],xmm1
+	fmsubps xmm1,xmm2,[rax+0x77],xmm1
+	fmsubps xmm1,xmm2,[rax+0x7777],xmm1
+	fmsubps xmm0,[rax],xmm2,xmm0
+	fmsubps xmm0,[rax+0x77],xmm2,xmm0
+	fmsubps xmm0,[rax+0x7777],xmm2,xmm0
+	fmsubps xmm14,[rax],xmm2,xmm14
+	fmsubps xmm14,[rax+0x77],xmm2,xmm14
+	fmsubps xmm14,[rax+0x7777],xmm2,xmm14
diff --git a/tokens.dat b/tokens.dat
index 6acaba49..e7c1cb29 100644
--- a/tokens.dat
+++ b/tokens.dat
@@ -23,6 +23,7 @@ far
 long
 near
 nosplit
+oword
 qword
 rel
 short
@@ -31,6 +32,12 @@ to
 tword
 word
 
+% TOKEN_FLOAT, 0, 0
+__infinity__
+__nan__
+__qnan__
+__snan__
+
 % TOKEN_*, 0, 0
 seg
 wrt
diff --git a/tokhash.pl b/tokhash.pl
index 5f1a9f4c..a63e55f3 100755
--- a/tokhash.pl
+++ b/tokhash.pl
@@ -187,21 +187,21 @@ print  "    const char *p = token;\n";
 print  "\n";
 
 print  "    while ((c = *p++) != 0) {\n";
-printf "        uint32_t kn1 = rot(k1,%2d) - rot(k2,%2d) + c;\n", ${$sv}[0], ${$sv}[1];
-printf "        uint32_t kn2 = rot(k2,%2d) - rot(k1,%2d) + c;\n", ${$sv}[2], ${$sv}[3];
+printf "        uint32_t kn1 = rot(k1,%2d)^(rot(k2,%2d) + c);\n", ${$sv}[0], ${$sv}[1];
+printf "        uint32_t kn2 = rot(k2,%2d)^(rot(k1,%2d) + c);\n", ${$sv}[2], ${$sv}[3];
 print  "        k1 = kn1; k2 = kn2;\n";
 print  "    }\n";
 print  "\n";
 printf "    ix = hash1[k1 & 0x%x] + hash2[k2 & 0x%x];\n", $n-1, $n-1;
 printf "    if (ix >= %d)\n", scalar(@tokendata);
-print  "        return -1;\n";
+print  "        return tv->t_type = TOKEN_ID;\n";
 print  "\n";
 print  "    data = &tokendata[ix];\n";
 
 # print  "    fprintf(stderr, \"Looked for: %s found: %s\\n\", token, data->string);\n\n";
 
 print  "    if (strcmp(data->string, token))\n";
-print  "        return -1;\n";
+print  "        return tv->t_type = TOKEN_ID;\n";
 print  "\n";
 print  "    tv->t_integer = data->num;\n";
 print  "    tv->t_inttwo  = data->aux;\n";
author	H. Peter Anvin <hpa@zytor.com>	2007-09-19 16:22:03 -0700
committer	H. Peter Anvin <hpa@zytor.com>	2007-09-19 16:22:03 -0700
commit	eb49a4e1d402d5a1ce95e495787b900aa5303a47 (patch)
tree	81fbe28b4d1faf6e8d68aa3d7af58b4443e948d2
parent	b4b43178783e963e95fb290e82f1a0c6d6725520 (diff)
parent	bf9a24f46471abad75fa3efba059646a6c4f5026 (diff)
download	nasm-eb49a4e1d402d5a1ce95e495787b900aa5303a47.tar.gz