74 files changed, 18243 insertions, 3858 deletions
diff --git a/Changes b/Changes
index 4a01227e..242a96d7 100644
--- a/Changes
+++ b/Changes
@@ -129,8 +129,8 @@ be output when absolute labels were made global.
 
 Updates to RDOFF subdirectory, and changes to outrdf.c.
 
-0.95 released July 1997
------------------------
+0.95 not released yet
+---------------------
 
 Fixed yet another ELF bug. This one manifested if the user relied on
 the default segment, and attempted to define global symbols without
@@ -241,3 +241,165 @@ can be implemented.
 Fixed the implementation of WRT, which was too restrictive in that
 you couldn't do `mov ax,[di+abc wrt dgroup]' because (di+abc) wasn't
 a relocatable reference.
+
+0.96 not released yet
+---------------------
+
+Fixed a bug whereby, if `nasm sourcefile' would cause a filename
+collision warning and put output into `nasm.out', then `nasm
+sourcefile -o outputfile' still gave the warning even though the
+`-o' was honoured.
+
+Fixed name pollution under Digital UNIX: one of its header files
+defined R_SP, which broke the enum in nasm.h.
+
+Fixed minor instruction table problems: FUCOM and FUCOMP didn't have
+two-operand forms; NDISASM didn't recognise the longer register
+forms of PUSH and POP (eg FF F3 for PUSH BX); TEST mem,imm32 was
+flagged as undocumented; the 32-bit forms of CMOV had 16-bit operand
+size prefixes; `AAD imm' and `AAM imm' are no longer flagged as
+undocumented because the Intel Architecture reference documents
+them.
+
+Fixed a problem with the local-label mechanism, whereby strange
+types of symbol (EQUs, auto-defined OBJ segment base symbols)
+interfered with the `previous global label' value and screwed up
+local labels.
+
+Fixed a bug whereby the stub preprocessor didn't communicate with
+the listing file generator, so that the -a and -l options in
+conjunction would produce a useless listing file.
+
+Merged `os2' object file format back into `obj', after discovering
+that `obj' _also_ shouldn't have a link pass separator in a module
+containing a non-trivial MODEND. Flat segments are now declared
+using the FLAT attribute. `os2' is no longer a valid object format
+name: use `obj'.
+
+Removed the fixed-size temporary storage in the evaluator. Very very
+long expressions (like `mov ax,1+1+1+1+...' for two hundred 1s or
+so) should now no longer crash NASM.
+
+Fixed a bug involving segfaults on disassembly of MMX instructions,
+by changing the meaning of one of the operand-type flags in nasm.h.
+This may cause other apparently unrelated MMX problems; it needs to
+be tested thoroughly.
+
+Fixed some buffer overrun problems with large OBJ output files.
+Thanks to DJ Delorie for the bug report and fix.
+
+Made preprocess-only mode actually listen to the %line markers as it
+prints them, so that it can report errors more sanely.
+
+Re-designed the evaluator to keep more sensible track of expressions
+involving forward references: can now cope with previously-nightmare
+situations such as
+   mov ax,foo | bar
+   foo equ 1
+   bar equ 2
+
+Added the ALIGN and ALIGNB standard macros.
+
+Added PIC support in ELF: use of WRT to obtain the four extra
+relocation types needed.
+
+Added the ability for output file formats to define their own
+extensions to the GLOBAL, COMMON and EXTERN directives.
+
+Implemented common-variable alignment, and global-symbol type and
+size declarations, in ELF.
+
+Implemented NEAR and FAR keywords for common variables, plus
+far-common element size specification, in OBJ.
+
+Added a feature whereby EXTERNs and COMMONs in OBJ can be given a
+default WRT specification (either a segment or a group).
+
+Transformed the Unix NASM archive into an auto-configuring package.
+
+Added a sanity-check for people applying SEG to things which are
+already segment bases: this previously went unnoticed by the SEG
+processing and caused OBJ-driver panics later.
+
+Added the ability, in OBJ format, to deal with `MOV EAX,<segment>'
+type references: OBJ doesn't directly support dword-size segment
+base fixups, but as long as the low two bytes of the constant term
+are zero, a word-size fixup can be generated instead and it will
+work.
+
+Added the ability to specify sections' alignment requirements in
+Win32 object files and pure binary files.
+
+Added preprocess-time expression evaluation: the %assign (and
+%iassign) directive and the bare %if (and %elif) conditional. Added
+relational operators to the evaluator, for use only in %if
+constructs: the standard relationals = < > <= >= <> (and C-like
+synonyms == and !=) plus low-precedence logical operators &&, ^^ and
+||.
+
+Added a preprocessor repeat construct: %rep / %exitrep / %endrep.
+
+Added the __FILE__ and __LINE__ standard macros.
+
+Added a sanity check for number constants being greater than
+0xFFFFFFFF. The warning can be disabled.
+
+Added the %0 token whereby a variadic multi-line macro can tell how
+many parameters it's been given in a specific invocation.
+
+Added %rotate, allowing multi-line macro parameters to be cycled.
+
+Added the `*' option for the maximum parameter count on multi-line
+macros, allowing them to take arbitrarily many parameters.
+
+Added the ability for the user-level forms of EXTERN, GLOBAL and
+COMMON to take more than one argument.
+
+Added the IMPORT and EXPORT directives in OBJ format, to deal with
+Windows DLLs.
+
+Added some more preprocessor %if constructs: %ifidn / %ifidni (exact
+textual identity), and %ifid / %ifnum / %ifstr (token type testing).
+
+Added the ability to distinguish SHL AX,1 (the 8086 version) from
+SHL AX,BYTE 1 (the 286-and-upwards version whose constant happens to
+be 1).
+
+Added NetBSD/FreeBSD/OpenBSD's variant of a.out format, complete
+with PIC shared library features.
+
+Changed NASM's idiosyncratic handling of FCLEX, FDISI, FENI, FINIT,
+FSAVE, FSTCW, FSTENV, and FSTSW to bring it into line with the
+otherwise accepted standard. The previous behaviour, though it was a
+deliberate feature, was a deliberate feature based on a
+misunderstanding. Apologies for the inconvenience.
+
+Improved the flexibility of ABSOLUTE: you can now give it an
+expression rather than being restricted to a constant, and it can
+take relocatable arguments as well.
+
+Added the ability for a variable to be declared as EXTERN multiple
+times, and the subsequent declarations are just ignored.
+
+We now allow instruction prefixes (CS, DS, LOCK, REPZ etc) to be
+alone on a line (without a following instruction).
+
+Improved sanity checks on whether the arguments to EXTERN, GLOBAL
+and COMMON are valid identifiers.
+
+Added misc/exebin.mac to allow direct generation of .EXE files by
+hacking up an EXE header using DB and DW; also added test/binexe.asm
+to demonstrate the use of this. Thanks to Yann Guidon for
+contributing the EXE header code.
+
+ndisasm forgot to check whether the input file had been successfully
+opened. Now it does. Doh!
+
+Added the Cyrix extensions to the MMX instruction set.
+
+Added a hinting mechanism to allow [EAX+EBX] and [EBX+EAX] to be
+assembled differently. This is important since [ESI+EBP] and
+[EBP+ESI] have different default base segment registers.
+
+Added support for the PharLap OMF extension for 4096-byte segment
+alignment.
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 7a9c54cc..00000000
--- a/Makefile
+++ /dev/null
@@ -1,109 +0,0 @@
-# Makefile for the Netwide Assembler
-#
-# The Netwide Assembler is copyright (C) 1996 Simon Tatham and
-# Julian Hall. All rights reserved. The software is
-# redistributable under the licence given in the file "Licence"
-# distributed in the NASM archive.
-#
-# This Makefile is designed for use under Unix (probably fairly
-# portably). It can also be used without change to build NASM using
-# DJGPP. The makefile "Makefile.dos" can be used to build NASM using
-# a 16-bit DOS C compiler such as Microsoft C.
-#
-# The `make dist' section at the end of the Makefile is not
-# guaranteed to work anywhere except Linux. Come to think of it,
-# I'm not sure I want to guarantee it to work anywhere except on
-# _my_ computer. :-)
-
-CC = gcc
-CCFLAGS = -c -g -O -Wall -ansi -pedantic
-LINK = gcc
-LINKFLAGS = -o nasm
-DLINKFLAGS = -o ndisasm
-LIBRARIES =
-STRIP = strip
-EXE =#
-OBJ = o#
-
-.c.$(OBJ):
-	$(CC) $(CCFLAGS) $*.c
-
-NASMOBJS = nasm.$(OBJ) nasmlib.$(OBJ) float.$(OBJ) insnsa.$(OBJ) \
-           assemble.$(OBJ) labels.$(OBJ) parser.$(OBJ) outform.$(OBJ) \
-	   outbin.$(OBJ) outaout.$(OBJ) outcoff.$(OBJ) outelf.$(OBJ) \
-	   outobj.$(OBJ) outas86.$(OBJ) outrdf.$(OBJ) outdbg.$(OBJ) \
-	   preproc.$(OBJ) listing.$(OBJ)
-
-NDISASMOBJS = ndisasm.$(OBJ) disasm.$(OBJ) sync.$(OBJ) nasmlib.$(OBJ) \
-	      insnsd.$(OBJ)
-
-all : nasm$(EXE) ndisasm$(EXE)
-
-nasm$(EXE): $(NASMOBJS)
-	$(LINK) $(LINKFLAGS) $(NASMOBJS) $(LIBRARIES)
-
-ndisasm$(EXE): $(NDISASMOBJS)
-	$(LINK) $(DLINKFLAGS) $(NDISASMOBJS) $(LIBRARIES)
-
-assemble.$(OBJ): assemble.c nasm.h nasmlib.h assemble.h insns.h
-disasm.$(OBJ): disasm.c nasm.h disasm.h sync.h insns.h names.c
-float.$(OBJ): float.c nasm.h
-insnsa.$(OBJ): insnsa.c nasm.h insns.h
-insnsd.$(OBJ): insnsd.c nasm.h insns.h
-labels.$(OBJ): labels.c nasm.h nasmlib.h
-listing.$(OBJ): listing.c nasm.h nasmlib.h listing.h
-macros.$(OBJ): macros.c
-names.$(OBJ): names.c
-nasm.$(OBJ): nasm.c nasm.h nasmlib.h preproc.h parser.h assemble.h labels.h \
- outform.h listing.h
-nasmlib.$(OBJ): nasmlib.c nasm.h nasmlib.h
-ndisasm.$(OBJ): ndisasm.c nasm.h nasmlib.h sync.h disasm.h
-outaout.$(OBJ): outaout.c nasm.h nasmlib.h outform.h
-outas86.$(OBJ): outas86.c nasm.h nasmlib.h outform.h
-outbin.$(OBJ): outbin.c nasm.h nasmlib.h outform.h
-outcoff.$(OBJ): outcoff.c nasm.h nasmlib.h outform.h
-outdbg.$(OBJ): outdbg.c nasm.h nasmlib.h outform.h
-outelf.$(OBJ): outelf.c nasm.h nasmlib.h outform.h
-outform.$(OBJ): outform.c outform.h nasm.h
-outobj.$(OBJ): outobj.c nasm.h nasmlib.h outform.h
-outrdf.$(OBJ): outrdf.c nasm.h nasmlib.h outform.h
-parser.$(OBJ): parser.c nasm.h nasmlib.h parser.h float.h names.c
-preproc.$(OBJ): preproc.c nasm.h nasmlib.h macros.c
-sync.$(OBJ): sync.c sync.h
-
-# These two source files are automagically generated from a single
-# instruction-table file by a Perl script. They're distributed,
-# though, so it isn't necessary to have Perl just to recompile NASM
-# from the distribution.
-
-AUTOSRCS = insnsa.c insnsd.c
-$(AUTOSRCS): insns.dat insns.pl
-	perl insns.pl
-
-# This source file is generated from the standard macros file
-# `standard.mac' by another Perl script. Again, it's part of the
-# standard distribution.
-
-macros.c: standard.mac
-	perl macros.pl
-
-# Clean the whole thing up after compilation.
-
-clean :
-	rm -f $(NASMOBJS) $(NDISASMOBJS) nasm$(EXE) ndisasm$(EXE)
-	make -C rdoff clean
-	make -C test clean
-
-# Here the `make dist' section begins. Nothing is guaranteed hereafter
-# unless you're using the Makefile under Linux, running bash, with
-# gzip, GNU tar and a sensible version of zip readily available.
-
-MANPAGES = nasm.man ndisasm.man
-
-.SUFFIXES: .man .1
-
-.1.man:
-	-man ./$< | ul > $@
-
-dist: $(AUTOSRCS) $(MANPAGES) clean
-	makedist.sh
diff --git a/Makefile.bc2 b/Makefile.bc2
index 7daf4a43..92ec9d2e 100644
--- a/Makefile.bc2
+++ b/Makefile.bc2
@@ -69,7 +69,7 @@ DCCFLAGS = /d /c /O /A /mh /n$(OBJD) #compiler flags for NDISASM
   #/A=ANSI standard C
   #/mh=Model huge
   #/n$(OBJD)= put the OBJ files in the diectory given.
-  #NOTE: Huge modle is used, and the array in insnsd.c is large enough to
+  #NOTE: Huge model is used, and the array in insnsd.c is large enough to
   #over size the d-group in large mode.
 
 LINKFLAGS = /c /x       #linker flags
@@ -95,7 +95,7 @@ DASM_ASM=$(CC) $(DCCFLAGS) $&.c        #command line for NDISASM
 NASMOBJS = $(OBJD)nasm.$(OBJ)   $(OBJD)nasmlib.$(OBJ)  $(OBJD)float.$(OBJ)  \
            $(OBJD)insnsa.$(OBJ) $(OBJD)assemble.$(OBJ) $(OBJD)labels.$(OBJ) \
            $(OBJD)parser.$(OBJ) $(OBJD)outform.$(OBJ)  $(OBJD)preproc.$(OBJ) \
-	   $(OBJD)listing.$(OBJ)
+	   $(OBJD)listing.$(OBJ) $(OBJD)eval.$(OBJ)
 
 ################################################################
 #The OBJ files that NDISASM is dependent on
@@ -150,6 +150,9 @@ $(OBJD)labels.$(OBJ): labels.c nasm.h nasmlib.h
 $(OBJD)listing.$(OBJ): listing.c nasm.h nasmlib.h listing.h
         $(NASM_ASM)
 
+$(OBJD)eval.$(OBJ): eval.c nasm.h nasmlib.h eval.h
+        $(NASM_ASM)
+
 $(OBJD)nasm.$(OBJ): nasm.c nasm.h nasmlib.h parser.h assemble.h labels.h \
 		listing.h outform.h
         $(NASM_ASM)
diff --git a/Makefile.bor b/Makefile.bor
index c415de00..90e96fba 100644
--- a/Makefile.bor
+++ b/Makefile.bor
@@ -27,7 +27,7 @@ NASMOBJS1 = nasm.$(OBJ) nasmlib.$(OBJ) float.$(OBJ) insnsa.$(OBJ)
 NASMOBJS2 = assemble.$(OBJ) labels.$(OBJ) parser.$(OBJ) outform.$(OBJ)
 NASMOBJS3 = outbin.$(OBJ) outaout.$(OBJ) outcoff.$(OBJ) outelf.$(OBJ)
 NASMOBJS4 = outobj.$(OBJ) outas86.$(OBJ) outdbg.$(OBJ) outrdf.$(OBJ)
-NASMOBJS5 = preproc.$(OBJ) listing.$(OBJ)
+NASMOBJS5 = preproc.$(OBJ) listing.$(OBJ) eval.$(OBJ)
 
 NASMOBJS = $(NASMOBJS1) $(NASMOBJS2) $(NASMOBJS3) $(NASMOBJS4) $(NASMOBJS5)
 
@@ -53,6 +53,7 @@ ndisasm$(EXE): $(NDISASMOBJS)
 
 assemble.$(OBJ): assemble.c nasm.h assemble.h insns.h
 disasm.$(OBJ): disasm.c nasm.h disasm.h sync.h insns.h names.c
+eval.$(OBJ): eval.c nasm.h nasmlib.h eval.h
 float.$(OBJ): float.c nasm.h
 insnsa.$(OBJ): insnsa.c nasm.h insns.h
 insnsd.$(OBJ): insnsd.c nasm.h insns.h
diff --git a/Makefile.dos b/Makefile.dos
index 18a3b363..94403fc1 100644
--- a/Makefile.dos
+++ b/Makefile.dos
@@ -10,8 +10,8 @@
 # It's been tested with Microsoft C 5.x plus Borland Make. (Yes, I
 # know it's silly, but...)
 
-CC = cl /c /O /AL
-QCL = qcl /c /AL
+CC = cl /c /O /AL /Gt
+QCL = qcl /c /AL /Gt
 LINK = cl
 LINKFLAGS =
 LIBRARIES =
@@ -25,7 +25,7 @@ NASMOBJS = nasm.$(OBJ) nasmlib.$(OBJ) float.$(OBJ) insnsa.$(OBJ) \
            assemble.$(OBJ) labels.$(OBJ) parser.$(OBJ) outform.$(OBJ) \
 	   outbin.$(OBJ) outaout.$(OBJ) outcoff.$(OBJ) outelf.$(OBJ) \
 	   outobj.$(OBJ) outas86.$(OBJ) outrdf.$(OBJ) outdbg.$(OBJ) \
-	   preproc.$(OBJ) listing.$(OBJ)
+	   preproc.$(OBJ) listing.$(OBJ) eval.$(OBJ)
 
 NDISASMOBJS = ndisasm.$(OBJ) disasm.$(OBJ) sync.$(OBJ) nasmlib.$(OBJ) \
               insnsd.$(OBJ)
@@ -34,7 +34,7 @@ all : nasm$(EXE) ndisasm$(EXE)
 
 # We have to have a horrible kludge here to get round the 128 character
 # limit, as usual...
-LINKOBJS = a*.obj f*.obj insnsa.obj l*.obj na*.obj o*.obj p*.obj
+LINKOBJS = a*.obj e*.obj f*.obj insnsa.obj l*.obj na*.obj o*.obj p*.obj
 nasm$(EXE): $(NASMOBJS)
 	cl /Fenasm.exe /F 4000 $(LINKOBJS)
 
@@ -43,6 +43,7 @@ ndisasm$(EXE): $(NDISASMOBJS)
 
 assemble.$(OBJ): assemble.c nasm.h assemble.h insns.h
 disasm.$(OBJ): disasm.c nasm.h disasm.h sync.h insns.h names.c
+eval.$(OBJ): eval.c eval.h nasm.h nasmlib.h
 float.$(OBJ): float.c nasm.h
 labels.$(OBJ): labels.c nasm.h nasmlib.h
 listing.$(OBJ): listing.c nasm.h nasmlib.h listing.h
diff --git a/Makefile.in b/Makefile.in
new file mode 100644
index 00000000..5b8acebc
--- /dev/null
+++ b/Makefile.in
@@ -0,0 +1,99 @@
+#
+# Auto-configuring Makefile for the Netwide Assembler.
+#
+# The Netwide Assembler is copyright (C) 1996 Simon Tatham and
+# Julian Hall. All rights reserved. The software is
+# redistributable under the licence given in the file "Licence"
+# distributed in the NASM archive.
+
+srcdir = @srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+bindir = @bindir@
+mandir = @mandir@
+
+CC = @CC@
+CFLAGS = @CFLAGS@ @GCCFLAGS@ -I$(srcdir) -I.
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+
+.c.o:
+	$(CC) -c $(CFLAGS) $<
+
+NASM = nasm.o nasmlib.o float.o insnsa.o assemble.o labels.o \
+       parser.o outform.o outbin.o outaout.o outcoff.o outelf.o \
+       outobj.o outas86.o outrdf.o outdbg.o preproc.o listing.o \
+       eval.o
+
+NDISASM = ndisasm.o disasm.o sync.o nasmlib.o insnsd.o
+
+all: nasm ndisasm
+
+nasm: $(NASM)
+	$(CC) -o nasm $(NASM)
+
+ndisasm: $(NDISASM)
+	$(CC) -o ndisasm $(NDISASM)
+
+assemble.o: assemble.c nasm.h nasmlib.h assemble.h insns.h
+disasm.o: disasm.c nasm.h disasm.h sync.h insns.h names.c
+eval.o: eval.c eval.h nasm.h nasmlib.h
+float.o: float.c nasm.h
+insnsa.o: insnsa.c nasm.h insns.h
+insnsd.o: insnsd.c nasm.h insns.h
+labels.o: labels.c nasm.h nasmlib.h
+listing.o: listing.c nasm.h nasmlib.h listing.h
+nasm.o: nasm.c nasm.h nasmlib.h preproc.h parser.h assemble.h labels.h \
+ outform.h listing.h
+nasmlib.o: nasmlib.c nasm.h nasmlib.h
+ndisasm.o: ndisasm.c nasm.h nasmlib.h sync.h disasm.h
+outaout.o: outaout.c nasm.h nasmlib.h outform.h
+outas86.o: outas86.c nasm.h nasmlib.h outform.h
+outbin.o: outbin.c nasm.h nasmlib.h outform.h
+outcoff.o: outcoff.c nasm.h nasmlib.h outform.h
+outdbg.o: outdbg.c nasm.h nasmlib.h outform.h
+outelf.o: outelf.c nasm.h nasmlib.h outform.h
+outform.o: outform.c outform.h nasm.h
+outobj.o: outobj.c nasm.h nasmlib.h outform.h
+outrdf.o: outrdf.c nasm.h nasmlib.h outform.h
+parser.o: parser.c nasm.h nasmlib.h parser.h float.h names.c
+preproc.o: preproc.c nasm.h nasmlib.h macros.c
+sync.o: sync.c sync.h
+
+# These two source files are automagically generated from a single
+# instruction-table file by a Perl script. They're distributed,
+# though, so it isn't necessary to have Perl just to recompile NASM
+# from the distribution.
+
+insnsa.c insnsd.c: insns.dat insns.pl
+	perl $(srcdir)/insns.pl $(srcdir)/insns.dat
+
+# This source file is generated from the standard macros file
+# `standard.mac' by another Perl script. Again, it's part of the
+# standard distribution.
+
+macros.c: standard.mac macros.pl
+	perl $(srcdir)/macros.pl $(srcdir)/standard.mac
+
+install: nasm ndisasm
+	$(INSTALL_PROGRAM) nasm $(bindir)/nasm
+	$(INSTALL_PROGRAM) ndisasm $(bindir)/ndisasm
+	$(INSTALL_DATA) $(srcdir)/nasm.1 $(mandir)/man1/nasm.1
+	$(INSTALL_DATA) $(srcdir)/ndisasm.1 $(mandir)/man1/ndisasm.1
+# Sub-makes use `&&': if `cd rdoff' fails, $(MAKE) must not run here.
+clean:
+	rm -f *.o nasm ndisasm
+	cd rdoff && $(MAKE) clean
+# spotless: everything `clean' does, plus delete config.* and Makefile.
+spotless: clean
+	rm -f config.* Makefile
+	cd rdoff && $(MAKE) spotless
+# rdf: run the default target of the Makefile in rdoff/.
+rdf:
+	cd rdoff && $(MAKE)
+# rdf_install, install_rdf: run the `install' target in rdoff/.
+rdf_install install_rdf:
+	cd rdoff && $(MAKE) install
diff --git a/Makefile.sc b/Makefile.sc
index e8386bfd..b5d0e35c 100644
--- a/Makefile.sc
+++ b/Makefile.sc
@@ -1,222 +1,114 @@
-# Makefile for the Netwide Assembler under 32-bit Windows(tm)
-
+# Makefile for the Netwide Assembler under 32-bit DOS(tm)
 #
-
 # The Netwide Assembler is copyright (C) 1996 Simon Tatham and
-
 # Julian Hall. All rights reserved. The software is
-
 # redistributable under the licence given in the file "Licence"
-
 # distributed in the NASM archive.
-
 #
-
 # This Makefile is designed to build NASM using the 32-bit WIN32 C
-
 # compiler Symantec(tm) C++ 7.5, provided you have a MAKE-utility
-
 # that's compatible to SMAKE.
 
-
-
 CC = sc
-
-CCFLAGS = -c -a1 -mn -Nc -w2 -w7 -o+time -5
-
+CCFLAGS = -c -a1 -mx -Nc -w2 -w7 -o+time -5
 # -5            optimize for pentium (tm)
-
 # -c            compile only
-
 # -o-all        no optimizations (to avoid problems in disasm.c)
-
 # -o+time       optimize for speed
-
 # -o+space      optimize for size
-
 # -A1           byte alignment for structures
-
 # -mn           compile for Win32 executable
-
+# -mx           compile for DOS386 (DOSX) executable
 # -Nc           create COMDAT records
-
 # -w2           possible unattended assignment: off
-
 # -w7           for loops with empty instruction-body
 
-
-
 LINK = link
-
-LINKFLAGS = /noi /exet:NT /su:console
-
+LINKFLAGS = /noi /exet:DOSX
 # /noignorecase all symbols are case-sensitive
-
 # /exet:NT      Exetype: NT (Win32)
-
+# /exet:DOSX    Exetype: DOSX (DOS32)
 # /su:console   Subsystem: Console (Console-App)
 
-
-
 LIBRARIES =
-
 EXE = .exe
-
 OBJ = obj
 
-
-
 .c.$(OBJ):
-
         $(CC) $(CCFLAGS) $*.c
 
 
-
-
-
 #
-
 # modules needed for different programs
-
 #
 
-
-
 NASMOBJS = nasm.$(OBJ) nasmlib.$(OBJ) float.$(OBJ) insnsa.$(OBJ) \
-
            assemble.$(OBJ) labels.$(OBJ) parser.$(OBJ) outform.$(OBJ) \
-
 	   outbin.$(OBJ) outaout.$(OBJ) outcoff.$(OBJ) outelf.$(OBJ) \
-
 	   outobj.$(OBJ) outas86.$(OBJ) outrdf.$(OBJ) outdbg.$(OBJ) \
-
-	   preproc.$(OBJ) listing.$(OBJ)
-
-
+	   preproc.$(OBJ) listing.$(OBJ) eval.$(OBJ)
 
 NDISASMOBJS = ndisasm.$(OBJ) disasm.$(OBJ) sync.$(OBJ) nasmlib.$(OBJ) \
-
               insnsd.$(OBJ)
 
 
-
-
-
 #
-
 # programs to create
-
 #
 
-
-
 all : nasm$(EXE) ndisasm$(EXE)
 
 
-
-
-
 #
-
 # We have to have a horrible kludge here to get round the 128 character
-
 # limit, as usual... we'll simply use LNK-files :)
-
 #
-
 nasm$(EXE): $(NASMOBJS)
-
         $(LINK) $(LINKFLAGS) @<<
-
-$(NASMOBJS)
-
-nasm.exe;
-
+cx.obj $(NASMOBJS)
+nasm.exe
 <<
 
-
-
 ndisasm$(EXE): $(NDISASMOBJS)
-
         $(LINK) $(LINKFLAGS) @<<
-
-$(NDISASMOBJS)
-
-ndisasm.exe;
-
+cx.obj $(NDISASMOBJS)
+ndisasm.exe
 <<
 
 
 
-
-
-
-
 #
-
 # modules for programs
-
 #
 
-
-
 disasm.$(OBJ): disasm.c nasm.h disasm.h sync.h insns.h names.c
-
 assemble.$(OBJ): assemble.c nasm.h assemble.h insns.h
-
+eval.$(OBJ): eval.c nasm.h nasmlib.h eval.h
 float.$(OBJ): float.c nasm.h
-
 labels.$(OBJ): labels.c nasm.h nasmlib.h
-
 listing.$(OBJ): listing.c nasm.h nasmlib.h listing.h
-
 nasm.$(OBJ): nasm.c nasm.h nasmlib.h parser.h assemble.h labels.h \
-
 	listing.h outform.h
-
 nasmlib.$(OBJ): nasmlib.c nasm.h nasmlib.h
-
 ndisasm.$(OBJ): ndisasm.c nasm.h sync.h disasm.h
-
 outas86.$(OBJ): outas86.c nasm.h nasmlib.h
-
 outaout.$(OBJ): outaout.c nasm.h nasmlib.h
-
 outbin.$(OBJ): outbin.c nasm.h nasmlib.h
-
 outcoff.$(OBJ): outcoff.c nasm.h nasmlib.h
-
 outdbg.$(OBJ): outdbg.c nasm.h nasmlib.h
-
 outelf.$(OBJ): outelf.c nasm.h nasmlib.h
-
 outobj.$(OBJ): outobj.c nasm.h nasmlib.h
-
 outrdf.$(OBJ): outrdf.c nasm.h nasmlib.h
-
 outform.$(OBJ): outform.c outform.h nasm.h
-
 parser.$(OBJ): parser.c nasm.h nasmlib.h parser.h float.h names.c
-
 preproc.$(OBJ): preproc.c macros.c preproc.h nasm.h nasmlib.h
-
 sync.$(OBJ): sync.c sync.h
-
 insnsa.$(OBJ): insnsa.c nasm.h insns.h
-
 insnsd.$(OBJ): insnsd.c nasm.h insns.h
 
 
 
-
-
-
-
 clean :
-
 	del *.obj
-
 	del nasm$(EXE)
-
 	del ndisasm$(EXE)
-
diff --git a/Makefile.scw b/Makefile.scw
new file mode 100644
index 00000000..6953b464
--- /dev/null
+++ b/Makefile.scw
@@ -0,0 +1,114 @@
+# Makefile for the Netwide Assembler under 32-bit Windows(tm)
+#
+# The Netwide Assembler is copyright (C) 1996 Simon Tatham and
+# Julian Hall. All rights reserved. The software is
+# redistributable under the licence given in the file "Licence"
+# distributed in the NASM archive.
+#
+# This Makefile is designed to build NASM using the 32-bit WIN32 C
+# compiler Symantec(tm) C++ 7.5, provided you have a MAKE-utility
+# that's compatible with SMAKE.
+
+CC = sc
+CCFLAGS = -c -a1 -mn -Nc -w2 -w7 -o+time -5
+# -5            optimize for pentium (tm)
+# -c            compile only
+# -o-all        no optimizations (to avoid problems in disasm.c)
+# -o+time       optimize for speed
+# -o+space      optimize for size
+# -a1           byte alignment for structures
+# -mn           compile for Win32 executable
+# -mx           compile for DOS386 (DOSX) executable
+# -Nc           create COMDAT records
+# -w2           possible unintended assignment: off
+# -w7           for loops with empty instruction-body
+
+LINK = link
+LINKFLAGS = /noi /exet:NT /su:console
+# /noignorecase all symbols are case-sensitive
+# /exet:NT      Exetype: NT (Win32)
+# /exet:DOSX    Exetype: DOSX (DOS32)
+# /su:console   Subsystem: Console (Console-App)
+
+LIBRARIES =
+EXE = .exe
+OBJ = obj
+
+.c.$(OBJ):
+        $(CC) $(CCFLAGS) $*.c
+
+
+#
+# modules needed for different programs
+#
+
+NASMOBJS = nasm.$(OBJ) nasmlib.$(OBJ) float.$(OBJ) insnsa.$(OBJ) \
+           assemble.$(OBJ) labels.$(OBJ) parser.$(OBJ) outform.$(OBJ) \
+	   outbin.$(OBJ) outaout.$(OBJ) outcoff.$(OBJ) outelf.$(OBJ) \
+	   outobj.$(OBJ) outas86.$(OBJ) outrdf.$(OBJ) outdbg.$(OBJ) \
+	   preproc.$(OBJ) listing.$(OBJ) eval.$(OBJ)
+
+NDISASMOBJS = ndisasm.$(OBJ) disasm.$(OBJ) sync.$(OBJ) nasmlib.$(OBJ) \
+              insnsd.$(OBJ)
+
+
+#
+# programs to create
+#
+
+all : nasmw$(EXE) ndisasmw$(EXE)
+
+
+#
+# We have to have a horrible kludge here to get round the 128 character
+# limit, as usual... we'll simply use LNK-files :)
+#
+nasmw$(EXE): $(NASMOBJS)
+        $(LINK) $(LINKFLAGS) @<<
+$(NASMOBJS)
+nasmw.exe;
+<<
+
+ndisasmw$(EXE): $(NDISASMOBJS)
+        $(LINK) $(LINKFLAGS) @<<
+$(NDISASMOBJS)
+ndisasmw.exe;
+<<
+
+
+
+#
+# modules for programs
+#
+
+disasm.$(OBJ): disasm.c nasm.h disasm.h sync.h insns.h names.c
+assemble.$(OBJ): assemble.c nasm.h assemble.h insns.h
+eval.$(OBJ): eval.c nasm.h nasmlib.h eval.h
+float.$(OBJ): float.c nasm.h
+labels.$(OBJ): labels.c nasm.h nasmlib.h
+listing.$(OBJ): listing.c nasm.h nasmlib.h listing.h
+nasm.$(OBJ): nasm.c nasm.h nasmlib.h parser.h assemble.h labels.h \
+	listing.h outform.h
+nasmlib.$(OBJ): nasmlib.c nasm.h nasmlib.h
+ndisasm.$(OBJ): ndisasm.c nasm.h sync.h disasm.h
+outas86.$(OBJ): outas86.c nasm.h nasmlib.h
+outaout.$(OBJ): outaout.c nasm.h nasmlib.h
+outbin.$(OBJ): outbin.c nasm.h nasmlib.h
+outcoff.$(OBJ): outcoff.c nasm.h nasmlib.h
+outdbg.$(OBJ): outdbg.c nasm.h nasmlib.h
+outelf.$(OBJ): outelf.c nasm.h nasmlib.h
+outobj.$(OBJ): outobj.c nasm.h nasmlib.h
+outrdf.$(OBJ): outrdf.c nasm.h nasmlib.h
+outform.$(OBJ): outform.c outform.h nasm.h
+parser.$(OBJ): parser.c nasm.h nasmlib.h parser.h float.h names.c
+preproc.$(OBJ): preproc.c macros.c preproc.h nasm.h nasmlib.h
+sync.$(OBJ): sync.c sync.h
+insnsa.$(OBJ): insnsa.c nasm.h insns.h
+insnsd.$(OBJ): insnsd.c nasm.h insns.h
+
+
+
+clean :
+	del *.obj
+	del nasmw$(EXE)
+	del ndisasmw$(EXE)
diff --git a/Makefile.unx b/Makefile.unx
new file mode 100644
index 00000000..f74d5440
--- /dev/null
+++ b/Makefile.unx
@@ -0,0 +1,97 @@
+# Unix fall-back makefile for the Netwide Assembler. For use if
+# `configure' fails to generate a workable Makefile.
+#
+# The Netwide Assembler is copyright (C) 1996 Simon Tatham and
+# Julian Hall. All rights reserved. The software is
+# redistributable under the licence given in the file "Licence"
+# distributed in the NASM archive.
+
+# You may need to adjust these values.
+
+prefix = /usr/local
+CC = cc
+CFLAGS = -O -I.
+
+# You _shouldn't_ need to adjust anything below this line.
+
+exec_prefix = ${prefix}
+bindir = ${exec_prefix}/bin
+mandir = ${prefix}/man
+
+INSTALL = install -c
+INSTALL_PROGRAM = ${INSTALL}
+INSTALL_DATA = ${INSTALL} -m 644
+
+.c.o:
+	$(CC) -c $(CFLAGS) $*.c
+
+NASM = nasm.o nasmlib.o float.o insnsa.o assemble.o labels.o \
+       parser.o outform.o outbin.o outaout.o outcoff.o outelf.o \
+       outobj.o outas86.o outrdf.o outdbg.o preproc.o listing.o \
+       eval.o
+
+NDISASM = ndisasm.o disasm.o sync.o nasmlib.o insnsd.o
+
+all: nasm ndisasm
+
+nasm: $(NASM)
+	$(CC) -o nasm $(NASM)
+
+ndisasm: $(NDISASM)
+	$(CC) -o ndisasm $(NDISASM)
+
+assemble.o: assemble.c nasm.h nasmlib.h assemble.h insns.h
+disasm.o: disasm.c nasm.h disasm.h sync.h insns.h names.c
+eval.o: eval.c eval.h nasm.h nasmlib.h
+float.o: float.c nasm.h
+insnsa.o: insnsa.c nasm.h insns.h
+insnsd.o: insnsd.c nasm.h insns.h
+labels.o: labels.c nasm.h nasmlib.h
+listing.o: listing.c nasm.h nasmlib.h listing.h
+nasm.o: nasm.c nasm.h nasmlib.h preproc.h parser.h assemble.h labels.h \
+ outform.h listing.h
+nasmlib.o: nasmlib.c nasm.h nasmlib.h
+ndisasm.o: ndisasm.c nasm.h nasmlib.h sync.h disasm.h
+outaout.o: outaout.c nasm.h nasmlib.h outform.h
+outas86.o: outas86.c nasm.h nasmlib.h outform.h
+outbin.o: outbin.c nasm.h nasmlib.h outform.h
+outcoff.o: outcoff.c nasm.h nasmlib.h outform.h
+outdbg.o: outdbg.c nasm.h nasmlib.h outform.h
+outelf.o: outelf.c nasm.h nasmlib.h outform.h
+outform.o: outform.c outform.h nasm.h
+outobj.o: outobj.c nasm.h nasmlib.h outform.h
+outrdf.o: outrdf.c nasm.h nasmlib.h outform.h
+parser.o: parser.c nasm.h nasmlib.h parser.h float.h names.c
+preproc.o: preproc.c nasm.h nasmlib.h macros.c
+sync.o: sync.c sync.h
+
+# These two source files are automagically generated from a single
+# instruction-table file by a Perl script. They're distributed,
+# though, so it isn't necessary to have Perl just to recompile NASM
+# from the distribution.
+
+insnsa.c insnsd.c: insns.dat insns.pl
+	perl insns.pl insns.dat
+
+# This source file is generated from the standard macros file
+# `standard.mac' by another Perl script. Again, it's part of the
+# standard distribution.
+
+macros.c: standard.mac macros.pl
+	perl macros.pl standard.mac
+
+install: nasm ndisasm
+	$(INSTALL_PROGRAM) nasm $(bindir)/nasm
+	$(INSTALL_PROGRAM) ndisasm $(bindir)/ndisasm
+	$(INSTALL_DATA) nasm.1 $(mandir)/man1/nasm.1
+	$(INSTALL_DATA) ndisasm.1 $(mandir)/man1/ndisasm.1
+# Sub-makes use `cd rdoff && $(MAKE)': not every make understands `-C'.
+clean:
+	rm -f *.o nasm ndisasm
+	cd rdoff && $(MAKE) clean
+# rdf: run the default target of the Makefile in rdoff/.
+rdf:
+	cd rdoff && $(MAKE)
+# rdf_install, install_rdf: run the `install' target in rdoff/.
+rdf_install install_rdf:
+	cd rdoff && $(MAKE) install
diff --git a/Makefile.vc b/Makefile.vc
index 0140a008..80beba4f 100644
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -24,7 +24,7 @@ NASMOBJS = nasm.$(OBJ) nasmlib.$(OBJ) float.$(OBJ) insnsa.$(OBJ) \
            assemble.$(OBJ) labels.$(OBJ) parser.$(OBJ) outform.$(OBJ) \
 	   outbin.$(OBJ) outaout.$(OBJ) outcoff.$(OBJ) outelf.$(OBJ) \
 	   outobj.$(OBJ) outas86.$(OBJ) outrdf.$(OBJ) outdbg.$(OBJ) \
-	   preproc.$(OBJ) listing.$(OBJ)
+	   preproc.$(OBJ) listing.$(OBJ) eval.$(OBJ)
 
 NDISASMOBJS = ndisasm.$(OBJ) disasm.$(OBJ) sync.$(OBJ) nasmlib.$(OBJ) \
               insnsd.$(OBJ)
@@ -33,7 +33,7 @@ all : nasm$(SUFFIX)$(EXE) ndisasm$(SUFFIX)$(EXE)
 
 # We have to have a horrible kludge here to get round the 128 character
 # limit, as usual...
-LINKOBJS = a*.obj f*.obj insnsa.obj l*.obj na*.obj o*.obj p*.obj
+LINKOBJS = a*.obj e*.obj f*.obj insnsa.obj l*.obj na*.obj o*.obj p*.obj
 nasm$(SUFFIX)$(EXE): $(NASMOBJS)
 	cl /Fenasm$(SUFFIX).exe $(LINKOBJS)
 
@@ -42,6 +42,7 @@ ndisasm$(SUFFIX)$(EXE): $(NDISASMOBJS)
 
 assemble.$(OBJ): assemble.c nasm.h assemble.h insns.h
 disasm.$(OBJ): disasm.c nasm.h disasm.h sync.h insns.h names.c
+eval.$(OBJ): eval.c nasm.h nasmlib.h eval.h
 float.$(OBJ): float.c nasm.h
 labels.$(OBJ): labels.c nasm.h nasmlib.h
 listing.$(OBJ): listing.c nasm.h nasmlib.h listing.h
diff --git a/Makefile.wc b/Makefile.wc
index ab08b049..6f0d48a5 100644
--- a/Makefile.wc
+++ b/Makefile.wc
@@ -42,7 +42,7 @@ NASMOBJS = nasm.$(OBJ) nasmlib.$(OBJ) float.$(OBJ) insnsa.$(OBJ) \
            assemble.$(OBJ) labels.$(OBJ) parser.$(OBJ) outform.$(OBJ) \
 	   outbin.$(OBJ) outaout.$(OBJ) outcoff.$(OBJ) outelf.$(OBJ) \
 	   outobj.$(OBJ) outas86.$(OBJ) outrdf.$(OBJ) outdbg.$(OBJ) \
-	   preproc.$(OBJ) listing.$(OBJ)
+	   preproc.$(OBJ) listing.$(OBJ) eval.$(OBJ)
 
 NDISASMOBJS = ndisasm.$(OBJ) disasm.$(OBJ) sync.$(OBJ) nasmlib.$(OBJ) \
 	      insnsd.$(OBJ)
@@ -62,10 +62,12 @@ NASM.LNK: makefile.wc
         echo N nasm.exe > NASM.LNK
         echo F nasm.$(OBJ) >> NASM.LNK
         echo F nasmlib.$(OBJ) >> NASM.LNK
+        echo F eval.$(OBJ) >> NASM.LNK
         echo F float.$(OBJ) >> NASM.LNK
         echo F insnsa.$(OBJ) >> NASM.LNK
         echo F assemble.$(OBJ) >> NASM.LNK
         echo F labels.$(OBJ) >> NASM.LNK
+        echo F listing.$(OBJ) >> NASM.LNK
         echo F parser.$(OBJ) >> NASM.LNK
 	echo F preproc.$(OBJ) >> NASM.LNK
         echo F outform.$(OBJ) >> NASM.LNK
@@ -88,6 +90,7 @@ NDISASM.LNK: makefile.wc
 
 assemble.$(OBJ): assemble.c nasm.h assemble.h insns.h
 disasm.$(OBJ): disasm.c nasm.h disasm.h sync.h insns.h names.c
+eval.$(OBJ): eval.c nasm.h nasmlib.h eval.h
 float.$(OBJ): float.c nasm.h
 insnsa.$(OBJ): insnsa.c nasm.h insns.h
 insnsd.$(OBJ): insnsd.c nasm.h insns.h
diff --git a/Makefile.wcw b/Makefile.wcw
index 25705ab7..d592c695 100644
--- a/Makefile.wcw
+++ b/Makefile.wcw
@@ -42,7 +42,7 @@ NASMOBJS = nasm.$(OBJ) nasmlib.$(OBJ) float.$(OBJ) insnsa.$(OBJ) \
            assemble.$(OBJ) labels.$(OBJ) parser.$(OBJ) outform.$(OBJ) \
 	   outbin.$(OBJ) outaout.$(OBJ) outcoff.$(OBJ) outelf.$(OBJ) \
 	   outobj.$(OBJ) outas86.$(OBJ) outrdf.$(OBJ) outdbg.$(OBJ) \
-	   preproc.$(OBJ) listing.$(OBJ)
+	   preproc.$(OBJ) listing.$(OBJ) eval.$(OBJ)
 
 NDISASMOBJS = ndisasm.$(OBJ) disasm.$(OBJ) sync.$(OBJ) nasmlib.$(OBJ) \
 	      insnsd.$(OBJ)
@@ -62,10 +62,12 @@ NASM.LNK: makefile.wcw
         echo N nasm.exe > NASM.LNK
         echo F nasm.$(OBJ) >> NASM.LNK
         echo F nasmlib.$(OBJ) >> NASM.LNK
+        echo F eval.$(OBJ) >> NASM.LNK
         echo F float.$(OBJ) >> NASM.LNK
         echo F insnsa.$(OBJ) >> NASM.LNK
         echo F assemble.$(OBJ) >> NASM.LNK
         echo F labels.$(OBJ) >> NASM.LNK
+        echo F listing.$(OBJ) >> NASM.LNK
         echo F parser.$(OBJ) >> NASM.LNK
 	echo F preproc.$(OBJ) >> NASM.LNK
         echo F outform.$(OBJ) >> NASM.LNK
@@ -88,6 +90,7 @@ NDISASM.LNK: makefile.wcw
 
 assemble.$(OBJ): assemble.c nasm.h assemble.h insns.h
 disasm.$(OBJ): disasm.c nasm.h disasm.h sync.h insns.h names.c
+eval.$(OBJ): eval.c nasm.h nasmlib.h eval.h
 float.$(OBJ): float.c nasm.h
 insnsa.$(OBJ): insnsa.c nasm.h insns.h
 insnsd.$(OBJ): insnsd.c nasm.h insns.h
diff --git a/Readme b/Readme
index 97e8beb3..95ce9f13 100644
--- a/Readme
+++ b/Readme
@@ -13,6 +13,11 @@ search path (maybe /usr/local/bin, or ~/bin if you don't have root
 access). You may also want to copy the man page `nasm.1' (and maybe
 `ndisasm.1') to somewhere sensible.
 
+To install under DOS, if you don't need to rebuild from the sources,
+you can just copy nasm.exe and ndisasm.exe (16-bit DOS executables),
+or nasmw.exe and ndisasmw.exe (Win32 console applications - less
+likely to run out of memory), to somewhere on your PATH.
+
 To rebuild the DOS sources, various makefiles are provided:
 
 - Makefile.dos, the one I build the standard 16-bit releases from,
@@ -26,7 +31,11 @@ To rebuild the DOS sources, various makefiles are provided:
 - Makefile.bc2, also for Borland C, contributed by Fox Cutter.
   Reported to work better than Makefile.bor on some systems.
 
-- Makefile.sc, for Symantec C++. Contributed by Mark Junker.
+- Makefile.sc, for Symantec C++, compiling to a 32-bit extended DOS
+  executable. Contributed by Mark Junker.
+- Makefile.scw, also for Symantec C++, compiling to a Win32 command-
+  line application. Also contributed by Mark Junker.
+
 - Makefile.wc, for Watcom C, compiling to a 32-bit extended DOS
   executable. Contributed by Dominik Behr.
 - Makefile.wcw, also for Watcom C, compiling to a Win32 command-
@@ -53,6 +62,9 @@ NDISASM.EXE) into standalone executables incorporating Tran's
 PMODE/W DOS extender, rather than depending on an external extender
 program.
 
+Some of the Windows makefiles produce executables called nasmw.exe
+and ndisasmw.exe, and some don't. Be prepared for either...
+
 If you're trying to unpack the DOS (.ZIP format) archive under Unix
 instead of using the .tar.gz version, you can save some time by
 doing `unzip -aL', which will convert the DOS-format text files to
@@ -65,9 +77,9 @@ equivalently by adding compiler command line options in the
 Makefile.
 
 There is a machine description file for the `LCC' retargetable C
-compiler, in the directory `lcc', along with instructions for its
-use. This means that NASM can now be used as the code-generator back
-end for a useful C compiler.
+compiler (version 3.6), in the directory `lcc', along with
+instructions for its use. This means that NASM can now be used as
+the code-generator back end for a useful C compiler.
 
 Michael `Wuschel' Tippach has ported his DOS extender `WDOSX' to
 enable it to work with the 32-bit binary files NASM can output: the
@@ -83,7 +95,11 @@ JED programmers' editor (see http://space.mit.edu/~davis/jed.html
 for details about JED). The comment at the start of the file gives
 instructions on how to install the mode. This directory also
 contains a file (`magic') containing lines to add to /etc/magic on
-Unix systems to allow the `file' command to recognise RDF files.
+Unix systems to allow the `file' command to recognise RDF files, and
+a zip file (`exasm.zip') containing the necessary files for syntax
+highlighting in the Aurora DOS editor. (The Aurora files were
+contributed by <U993847220@aol.com>; I haven't tested them as I
+don't have Aurora.)
 
 The `rdoff' directory contains sources for a linker and loader for
 the RDF object file format, to run under Linux, and also
@@ -93,7 +109,9 @@ For information about how you can distribute and use NASM, see the
 file Licence. We were tempted to put NASM under the GPL, but decided
 that in many ways it was too restrictive for developers.
 
-For information about how to use NASM, see `nasm.doc'. For
+For information about how to use NASM, see the various forms of
+documentation in the `doc' directory: documentation is provided in
+HTML, PostScript, plain text, Texinfo, and Windows Help formats. For
 information about how to use NDISASM, see `ndisasm.doc'. For
 information about the internal structure of NASM, see
 `internal.doc'. (In particular, _please_ read `internal.doc' before
diff --git a/Wishlist b/Wishlist
new file mode 100644
index 00000000..17ef6998
--- /dev/null
+++ b/Wishlist
@@ -0,0 +1,97 @@
+NASM Wishlist
+=============
+
+- PUSH WORD EAX silently becomes PUSH EAX. Should warn.
+
+- ndisasm hangs at eof.
+
+- missing heading in documentation - some subsect in chapter 4.
+
+- Add support for lcc 4.0.
+  * If-when this happens, remember to bump the `supported lcc
+    version' number in Readme.
+
+- Re-work the evaluator, again, with a per-object-format fixup
+  routine, so as to be able to cope with section offsets "really"
+  being pure numbers; should be able to allow at _least_ the two
+  common idioms
+     TIMES 510-$ DB 0            ; bootsector
+     MOV AX,(PROG_END-100H)/16   ; .COM TSR
+  Would need to call the fixup throughout the evaluator, and the
+  fixup would have to be allowed to return UNKNOWN on pass one if it
+  had to. (_Always_ returning UNKNOWN on pass one, though a lovely
+  clean design, breaks the first of the above examples.)
+
+- Preprocessor identifier concatenation?
+
+- Arbitrary section names in `bin'.
+
+- Ability to read from a pipe. Obviously not useful under dos, so
+  memory problems with storing entire input file aren't a problem
+  either.
+
+- Subsection support?
+
+- A good ALIGN mechanism, similar to GAS's. GAS pads out space by
+  means of the following (32-bit) instructions:
+          8DB42600000000    lea esi,[esi+0x0]
+          8DB600000000      lea esi,[esi+0x0]
+          8D742600          lea esi,[esi+0x0]
+          8D7600            lea esi,[esi+0x0]
+          8D36              lea esi,[esi]
+          90                nop
+  It uses up to two of these instructions to do up to 14-byte pads;
+  when more than 14 bytes are needed, it issues a (short) jump to
+  the end of the padded section and then NOPs the rest. Come up with
+  a similar scheme for 16 bit mode, and also come up with a way to
+  use it - internal to the assembler, so that programs using ALIGN
+  don't knock over preprocess-only mode.
+    Also re-work the macro form so that when given one argument in a
+  code section it calls this feature.
+
+- Possibly a means whereby FP constants can be specified as
+  immediate operands to non-FP instructions.
+  * Possible syntax: MOV EAX,FLOAT 1.2 to get a single-precision FP
+    constant. Then maybe MOV EAX,HI_FLOAT 1.2 and MOV EAX,LO_FLOAT
+    1.2 to get the two halves of a double-precision one. Best to
+    ignore extended-precision in case it bites.
+  * Alternatively, maybe MOV EAX,FLOAT(4,0-4,1.2) to get bytes 0-4
+    (ie 0-3) of a 4-byte constant. Then HI_FLOAT is FLOAT(8,4-8,x)
+    and LO_FLOAT is FLOAT(8,0-4,x). But this version allows two-byte
+    chunks, one-byte chunks, even stranger chunks, and pieces of
+    ten-byte reals to be bandied around as well.
+
+- A UNION macro might be quite cool, now that ABSOLUTE is sane
+  enough to be able to handle it.
+
+- An equivalent to gcc's # stringify operator, plus string
+  concatenation, somehow implemented without undue ugliness, so as
+  to be able to do `%include "/my/path/%1"' in a macro, or something
+  similar...
+
+- Actually _do_ something with the processor, privileged and
+  undocumented flags in the instruction table.
+
+- Maybe NEC V20/V30 instructions?
+
+- Yet more object formats.
+  * Possibly direct support for .EXE files?
+
+- Debug information, in all formats it can be usefully done in.
+  * including line-number record support.
+
+- Symbol map in binary format. Format-specific options...
+
+- REDESIGN: Think about EQU dependency, and about start-point
+  specification in OBJ. Possibly re-think directive support.
+
+- Think about a wrapper program like gcc? Possibly invent a _patch_
+  for gcc so that it can take .asm files on the command line?
+
+- If a wrapper happens, think about adding an option to cause the
+  resulting executable file to be executed immediately, thus
+  allowing NASM source files to have #!... (probably silly)
+
+- Multi-platform support? If so: definitely Alpha; possibly Java
+  byte code; probably ARM/StrongARM; maybe Sparc; maybe Mips; maybe
+  Vax. Perhaps Z80 and 6502, just for a laugh?
diff --git a/assemble.c b/assemble.c
index c6cc00a3..8d144121 100644
--- a/assemble.c
+++ b/assemble.c
@@ -592,22 +592,40 @@ static void gencode (long segment, long offset, int bits,
       case 014: case 015: case 016:
 	if (ins->oprs[c-014].offset < -128 || ins->oprs[c-014].offset > 127)
 	    errfunc (ERR_WARNING, "signed byte value exceeds bounds");
-	bytes[0] = ins->oprs[c-014].offset;
-	out (offset, segment, bytes, OUT_RAWDATA+1, NO_SEG, NO_SEG);
+	if (ins->oprs[c-014].segment != NO_SEG) {
+	    data = ins->oprs[c-014].offset;
+	    out (offset, segment, &data, OUT_ADDRESS+1,
+		 ins->oprs[c-014].segment, ins->oprs[c-014].wrt);
+	} else {
+	    bytes[0] = ins->oprs[c-014].offset;
+	    out (offset, segment, bytes, OUT_RAWDATA+1, NO_SEG, NO_SEG);
+	}
 	offset += 1;
 	break;
       case 020: case 021: case 022:
 	if (ins->oprs[c-020].offset < -256 || ins->oprs[c-020].offset > 255)
 	    errfunc (ERR_WARNING, "byte value exceeds bounds");
-	bytes[0] = ins->oprs[c-020].offset;
-	out (offset, segment, bytes, OUT_RAWDATA+1, NO_SEG, NO_SEG);
+	if (ins->oprs[c-020].segment != NO_SEG) {
+	    data = ins->oprs[c-020].offset;
+	    out (offset, segment, &data, OUT_ADDRESS+1,
+		 ins->oprs[c-020].segment, ins->oprs[c-020].wrt);
+	} else {
+	    bytes[0] = ins->oprs[c-020].offset;
+	    out (offset, segment, bytes, OUT_RAWDATA+1, NO_SEG, NO_SEG);
+	}
 	offset += 1;
 	break;
       case 024: case 025: case 026:
 	if (ins->oprs[c-024].offset < 0 || ins->oprs[c-024].offset > 255)
 	    errfunc (ERR_WARNING, "unsigned byte value exceeds bounds");
-	bytes[0] = ins->oprs[c-024].offset;
-	out (offset, segment, bytes, OUT_RAWDATA+1, NO_SEG, NO_SEG);
+	if (ins->oprs[c-024].segment != NO_SEG) {
+	    data = ins->oprs[c-024].offset;
+	    out (offset, segment, &data, OUT_ADDRESS+1,
+		 ins->oprs[c-024].segment, ins->oprs[c-024].wrt);
+	} else {
+	    bytes[0] = ins->oprs[c-024].offset;
+	    out (offset, segment, bytes, OUT_RAWDATA+1, NO_SEG, NO_SEG);
+	}
 	offset += 1;
 	break;
       case 030: case 031: case 032:
@@ -757,8 +775,9 @@ static void gencode (long segment, long offset, int bits,
 	    errfunc (ERR_PANIC, "non-constant BSS size in pass two");
 	else {
 	    long size = ins->oprs[0].offset << (c-0340);
-	    out (offset, segment, NULL,
-		 OUT_RESERVE+size, NO_SEG, NO_SEG);
+	    if (size > 0)
+		out (offset, segment, NULL,
+		     OUT_RESERVE+size, NO_SEG, NO_SEG);
 	    offset += size;
 	}
 	break;
@@ -792,9 +811,16 @@ static void gencode (long segment, long offset, int bits,
 	      case 0:
 		break;
 	      case 1:
-	        *bytes = ins->oprs[(c>>3)&7].offset;
-		out (offset, segment, bytes, OUT_RAWDATA+1,
-		     NO_SEG, NO_SEG);
+		if (ins->oprs[(c>>3)&7].segment != NO_SEG) {
+		    data = ins->oprs[(c>>3)&7].offset;
+		    out (offset, segment, &data, OUT_ADDRESS+1,
+			 ins->oprs[(c>>3)&7].segment,
+			 ins->oprs[(c>>3)&7].wrt);
+		} else {
+		    *bytes = ins->oprs[(c>>3)&7].offset;
+		    out (offset, segment, bytes, OUT_RAWDATA+1,
+			 NO_SEG, NO_SEG);
+		}
 		s++;
 		break;
 	      case 2:
@@ -887,6 +913,9 @@ static int matches (struct itemplate *itemp, insn *instruction) {
     if (itemp->flags & IF_SB) {
 	size = BITS8;
 	oprs = itemp->operands;
+    } else if (itemp->flags & IF_SW) {
+	size = BITS16;
+	oprs = itemp->operands;
     } else if (itemp->flags & IF_SD) {
 	size = BITS32;
 	oprs = itemp->operands;
@@ -939,6 +968,8 @@ static ea *process_ea (operand *input, ea *output, int addrbits, int rfield,
 	} else {		       /* it's an indirection */
 	    int i=input->indexreg, b=input->basereg, s=input->scale;
 	    long o=input->offset, seg=input->segment;
+	    int hb=input->hintbase, ht=input->hinttype;
+	    int t;
 
 	    if (s==0) i = -1;	       /* make this easy, at least */
 
@@ -960,11 +991,15 @@ static ea *process_ea (operand *input, ea *output, int addrbits, int rfield,
 		    return NULL;
 
 		/* now reorganise base/index */
+		if (s == 1 && b != i && b != -1 && i != -1 &&
+		    ((hb==b&&ht==EAH_NOTBASE) || (hb==i&&ht==EAH_MAKEBASE)))
+		    t = b, b = i, i = t;   /* swap if hints say so */
 		if (b==i)	       /* convert EAX+2*EAX to 3*EAX */
 		    b = -1, s++;
-		if (b==-1 && s==1)     /* single register should be base */
-		    b = i, i = -1;
-		if (((s==2 && i!=R_ESP) || s==3 || s==5 || s==9) && b==-1)
+		if (b==-1 && s==1 && !(hb == i && ht == EAH_NOTBASE))
+		    b = i, i = -1;     /* make single reg base, unless hint */
+		if (((s==2 && i!=R_ESP && !(input->eaflags & EAF_TIMESTWO)) ||
+		     s==3 || s==5 || s==9) && b==-1)
 		    b = i, s--;       /* convert 3*EAX to EAX+2*EAX */
 		if (s==1 && i==R_ESP)  /* swap ESP into base if scale is 1 */
 		    i = b, b = R_ESP;
@@ -986,11 +1021,15 @@ static ea *process_ea (operand *input, ea *output, int addrbits, int rfield,
 			return NULL;
 		    }
 		    if (b==-1 || (b!=R_EBP && o==0 &&
-				  seg==NO_SEG && !forw_ref))
+				  seg==NO_SEG && !forw_ref &&
+				  !(input->eaflags &
+				    (EAF_BYTEOFFS|EAF_WORDOFFS))))
 		    	mod = 0;
-		    else if (o>=-128 && o<=127 && seg==NO_SEG && !forw_ref)
+		    else if (input->eaflags & EAF_BYTEOFFS ||
+			     (o>=-128 && o<=127 && seg==NO_SEG && !forw_ref &&
+			      !(input->eaflags & EAF_WORDOFFS))) {
 		    	mod = 1;
-		    else
+		    } else
 		    	mod = 2;
 
 		    output->sib_present = FALSE;
@@ -1036,9 +1075,13 @@ static ea *process_ea (operand *input, ea *output, int addrbits, int rfield,
 		    }
 
 		    if (b==-1 || (b!=R_EBP && o==0 &&
-				  seg==NO_SEG && !forw_ref))
+				  seg==NO_SEG && !forw_ref &&
+				  !(input->eaflags &
+				    (EAF_BYTEOFFS|EAF_WORDOFFS))))
 		    	mod = 0;
-		    else if (o>=-128 && o<=127 && seg==NO_SEG && !forw_ref)
+		    else if (input->eaflags & EAF_BYTEOFFS ||
+			     (o>=-128 && o<=127 && seg==NO_SEG && !forw_ref &&
+			      !(input->eaflags & EAF_WORDOFFS)))
 		    	mod = 1;
 		    else
 		    	mod = 2;
@@ -1089,9 +1132,12 @@ static ea *process_ea (operand *input, ea *output, int addrbits, int rfield,
 		if (rm==-1)	       /* can't happen, in theory */
 		    return NULL;      /* so panic if it does */
 
-		if (o==0 && seg==NO_SEG && !forw_ref && rm!=6)
+		if (o==0 && seg==NO_SEG && !forw_ref && rm!=6 &&
+		    !(input->eaflags & (EAF_BYTEOFFS|EAF_WORDOFFS)))
 		    mod = 0;
-		else if (o>=-128 && o<=127 && seg==NO_SEG && !forw_ref)
+		else if (input->eaflags & EAF_BYTEOFFS ||
+			 (o>=-128 && o<=127 && seg==NO_SEG && !forw_ref &&
+			  !(input->eaflags & EAF_WORDOFFS)))
 		    mod = 1;
 		else
 		    mod = 2;
diff --git a/disasm.c b/disasm.c
index c4c0dc12..3dded0d9 100644
--- a/disasm.c
+++ b/disasm.c
@@ -73,6 +73,10 @@ static int whichreg(long regflags, int regval) {
 	return R_ST0;
     if (!(REG_CS & ~regflags))
 	return R_CS;
+    if (!(REG_DESS & ~regflags))
+	return (regval == 0 || regval == 2 || regval == 3 ? sreg[regval] : 0);
+    if (!(REG_FSGS & ~regflags))
+	return (regval == 4 || regval == 5 ? sreg[regval] : 0);
     if (!((REGMEM|BITS8) & ~regflags))
 	return reg8[regval];
     if (!((REGMEM|BITS16) & ~regflags))
@@ -488,11 +492,22 @@ long disasm (unsigned char *data, char *output, int segsize, long offset,
 	     * Final check to make sure the types of r/m match up.
 	     */
 	    for (i = 0; i < (*p)->operands; i++)
-		if (((ins.oprs[i].segment & SEG_RMREG) &&
+		if (
+		    
+		    /* If it's a mem-only EA but we have a register, die. */
+		    ((ins.oprs[i].segment & SEG_RMREG) &&
 		     !(MEMORY & ~(*p)->opd[i])) ||
+		    
+		    /* If it's a reg-only EA but we have a memory ref, die. */
 		    (!(ins.oprs[i].segment & SEG_RMREG) &&
 		     !(REGNORM & ~(*p)->opd[i]) &&
-		     !((*p)->opd[i] & REG_SMASK)))
+		     !((*p)->opd[i] & REG_SMASK)) ||
+
+		    /* Register type mismatch (eg FS vs REG_DESS): die. */
+		    ((((*p)->opd[i] & (REGISTER | FPUREG)) ||
+		      (ins.oprs[i].segment & SEG_RMREG)) &&
+		     !whichreg ((*p)->opd[i], ins.oprs[i].basereg)))
+
 		    works = FALSE;
 	    if (works)
 		break;
@@ -559,7 +574,7 @@ long disasm (unsigned char *data, char *output, int segsize, long offset,
 	    ins.oprs[i].basereg = whichreg ((*p)->opd[i],
 					    ins.oprs[i].basereg);
 	    slen += sprintf(output+slen, "%s",
-			    reg_names[ins.oprs[i].basereg]);
+			    reg_names[ins.oprs[i].basereg-EXPR_REG_START]);
 	} else if (!(UNITY & ~(*p)->opd[i])) {
 	    output[slen++] = '1';
 	} else if ( (*p)->opd[i] & IMMEDIATE ) {
@@ -617,14 +632,16 @@ long disasm (unsigned char *data, char *output, int segsize, long offset,
 	    }
 	    if (ins.oprs[i].basereg != -1) {
 		slen += sprintf(output+slen, "%s",
-				reg_names[ins.oprs[i].basereg]);
+				reg_names[(ins.oprs[i].basereg -
+					   EXPR_REG_START)]);
 		started = TRUE;
 	    }
 	    if (ins.oprs[i].indexreg != -1) {
 		if (started)
 		    output[slen++] = '+';
 		slen += sprintf(output+slen, "%s",
-				reg_names[ins.oprs[i].indexreg]);
+				reg_names[(ins.oprs[i].indexreg -
+					   EXPR_REG_START)]);
 		if (ins.oprs[i].scale > 1)
 		    slen += sprintf(output+slen, "*%d", ins.oprs[i].scale);
 		started = TRUE;
diff --git a/doc/nasmdoc.src b/doc/nasmdoc.src
new file mode 100644
index 00000000..0f802663
--- /dev/null
+++ b/doc/nasmdoc.src
@@ -0,0 +1,8530 @@
+\IR{-o} \c{-o} option
+\IR{-f} \c{-f} option
+\IR{-l} \c{-l} option
+\IR{-s} \c{-s} option
+\IR{-i} \c{-i} option
+\IR{-p} \c{-p} option
+\IR{-d} \c{-d} option
+\IR{-e} \c{-e} option
+\IR{-a} \c{-a} option
+\IR{-w} \c{-w} option
+\IR{!=} \c{!=} operator
+\IR{$ here} \c{$} Here token
+\IR{$$} \c{$$} token
+\IR{%} \c{%} operator
+\IR{%%} \c{%%} operator
+\IR{%+1} \c{%+1} and \c{%-1} syntax
+\IA{%-1}{%+1}
+\IR{%0} \c{%0} parameter count
+\IR{&} \c{&} operator
+\IR{&&} \c{&&} operator
+\IR{*} \c{*} operator
+\IR{..@} \c{..@} symbol prefix
+\IR{/} \c{/} operator
+\IR{//} \c{//} operator
+\IR{<} \c{<} operator
+\IR{<<} \c{<<} operator
+\IR{<=} \c{<=} operator
+\IR{<>} \c{<>} operator
+\IR{=} \c{=} operator
+\IR{==} \c{==} operator
+\IR{>} \c{>} operator
+\IR{>=} \c{>=} operator
+\IR{>>} \c{>>} operator
+\IR{?} \c{?} MASM syntax
+\IR{^} \c{^} operator
+\IR{^^} \c{^^} operator
+\IR{|} \c{|} operator
+\IR{||} \c{||} operator
+\IR{~} \c{~} operator
+\IR{%$} \c{%$} and \c{%$$} prefixes
+\IA{%$$}{%$}
+\IR{+ opaddition} \c{+} operator, binary
+\IR{+ opunary} \c{+} operator, unary
+\IR{+ modifier} \c{+} modifier
+\IR{- opsubtraction} \c{-} operator, binary
+\IR{- opunary} \c{-} operator, unary
+\IR{alignment, in bin sections} alignment, in \c{bin} sections
+\IR{alignment, in elf sections} alignment, in \c{elf} sections
+\IR{alignment, in win32 sections} alignment, in \c{win32} sections
+\IR{alignment, of elf common variables} alignment, of \c{elf} common
+variables
+\IR{alignment, in obj sections} alignment, in \c{obj} sections
+\IR{a.out, bsd version} \c{a.out}, BSD version
+\IR{a.out, linux version} \c{a.out}, Linux version
+\IR{autoconf} Autoconf
+\IR{bitwise and} bitwise AND
+\IR{bitwise or} bitwise OR
+\IR{bitwise xor} bitwise XOR
+\IR{block ifs} block IFs
+\IR{borland pascal} Borland, Pascal
+\IR{borland's win32 compilers} Borland, Win32 compilers
+\IR{braces, after % sign} braces, after \c{%} sign
+\IR{bsd} BSD
+\IR{c calling convention} C calling convention
+\IR{c symbol names} C symbol names
+\IA{critical expressions}{critical expression}
+\IA{command line}{command-line}
+\IA{case sensitivity}{case sensitive}
+\IA{case-sensitive}{case sensitive}
+\IA{case-insensitive}{case sensitive}
+\IA{character constants}{character constant}
+\IR{common object file format} Common Object File Format
+\IR{common variables, alignment in elf} common variables, alignment
+in \c{elf}
+\IR{common, elf extensions to} \c{COMMON}, \c{elf} extensions to
+\IR{common, obj extensions to} \c{COMMON}, \c{obj} extensions to
+\IR{declaring structure} declaring structures
+\IR{default-wrt mechanism} default-\c{WRT} mechanism
+\IR{devpac} DevPac
+\IR{djgpp} DJGPP
+\IR{dll symbols, exporting} DLL symbols, exporting
+\IR{dll symbols, importing} DLL symbols, importing
+\IR{dos} DOS
+\IR{dos archive} DOS archive
+\IR{dos source archive} DOS source archive
+\IA{effective address}{effective addresses}
+\IA{effective-address}{effective addresses}
+\IR{elf shared libraries} \c{elf} shared libraries
+\IR{freebsd} FreeBSD
+\IR{freelink} FreeLink
+\IR{functions, c calling convention} functions, C calling convention
+\IR{functions, pascal calling convention} functions, Pascal calling
+convention
+\IR{global, aoutb extensions to} \c{GLOBAL}, \c{aoutb} extensions to
+\IR{global, elf extensions to} \c{GLOBAL}, \c{elf} extensions to
+\IR{got} GOT
+\IR{got relocations} \c{GOT} relocations
+\IR{gotoff relocation} \c{GOTOFF} relocations
+\IR{gotpc relocation} \c{GOTPC} relocations
+\IR{linux elf} Linux ELF
+\IR{logical and} logical AND
+\IR{logical or} logical OR
+\IR{logical xor} logical XOR
+\IR{masm} MASM
+\IA{memory reference}{memory references}
+\IA{misc directory}{misc subdirectory}
+\IR{misc subdirectory} \c{misc} subdirectory
+\IR{microsoft omf} Microsoft OMF
+\IR{mmx registers} MMX registers
+\IA{modr/m}{modr/m byte}
+\IR{modr/m byte} ModR/M byte
+\IR{ms-dos} MS-DOS
+\IR{ms-dos device drivers} MS-DOS device drivers
+\IR{multipush} \c{multipush} macro
+\IR{nasm version} NASM version
+\IR{netbsd} NetBSD
+\IR{omf} OMF
+\IR{openbsd} OpenBSD
+\IR{operating-system} operating system
+\IR{os/2} OS/2
+\IR{pascal calling convention} Pascal calling convention
+\IR{passes} passes, assembly
+\IR{perl} Perl
+\IR{pic} PIC
+\IR{pharlap} PharLap
+\IR{plt} PLT
+\IR{plt relocations} \c{PLT} relocations
+\IA{pre-defining macros}{pre-define}
+\IR{qbasic} QBasic
+\IA{rdoff subdirectory}{rdoff}
+\IR{rdoff} \c{rdoff} subdirectory
+\IR{relocatable dynamic object file format} Relocatable Dynamic
+Object File Format
+\IR{relocations, pic-specific} relocations, PIC-specific
+\IA{repeating}{repeating code}
+\IR{section alignment, in elf} section alignment, in \c{elf}
+\IR{section alignment, in bin} section alignment, in \c{bin}
+\IR{section alignment, in obj} section alignment, in \c{obj}
+\IR{section alignment, in win32} section alignment, in \c{win32}
+\IR{section, elf extensions to} \c{SECTION}, \c{elf} extensions to
+\IR{section, win32 extensions to} \c{SECTION}, \c{win32} extensions to
+\IR{segment alignment, in bin} segment alignment, in \c{bin}
+\IR{segment alignment, in obj} segment alignment, in \c{obj}
+\IR{segment, obj extensions to} \c{SEGMENT}, \c{obj} extensions to
+\IR{segment names, borland pascal} segment names, Borland Pascal
+\IR{shift command} \c{shift} command
+\IA{sib}{sib byte}
+\IR{sib byte} SIB byte
+\IA{standard section names}{standardised section names}
+\IR{symbols, exporting from dlls} symbols, exporting from DLLs
+\IR{symbols, importing from dlls} symbols, importing from DLLs
+\IR{tasm} TASM
+\IR{test subdirectory} \c{test} subdirectory
+\IR{tlink} TLINK
+\IR{underscore, in c symbols} underscore, in C symbols
+\IR{unix} Unix
+\IR{unix source archive} Unix source archive
+\IR{val} VAL
+\IR{version number of nasm} version number of NASM
+\IR{visual c++} Visual C++
+\IR{www page} WWW page
+\IR{win32} Win32
+\IR{windows} Windows
+\IR{windows 95} Windows 95
+\IR{windows nt} Windows NT
+\# \IC{program entry point}{entry point, program}
+\# \IC{program entry point}{start point, program}
+\# \IC{MS-DOS device drivers}{device drivers, MS-DOS}
+\# \IC{16-bit mode, versus 32-bit mode}{32-bit mode, versus 16-bit mode}
+\# \IC{c symbol names}{symbol names, in C}
+
+\C{intro} Introduction
+
+\H{whatsnasm} What Is NASM?
+
+The Netwide Assembler, NASM, is an 80x86 assembler designed for
+portability and modularity. It supports a range of object file
+formats, including Linux \c{a.out} and ELF, NetBSD/FreeBSD, COFF,
+Microsoft 16-bit OBJ and Win32. It will also output plain binary
+files. Its syntax is designed to be simple and easy to understand,
+similar to Intel's but less complex. It supports Pentium, P6 and MMX
+opcodes, and has macro capability.
+
+\S{yaasm} Why Yet Another Assembler?
+
+The Netwide Assembler grew out of an idea on \i\c{comp.lang.asm.x86}
+(or possibly \i\c{alt.lang.asm} - I forget which), which was
+essentially that there didn't seem to be a good free x86-series
+assembler around, and that maybe someone ought to write one.
+
+\b \i\c{a86} is good, but not free, and in particular you don't get any
+32-bit capability until you pay. It's DOS only, too.
+
+\b \i\c{gas} is free, and ports over DOS and Unix, but it's not very good,
+since it's designed to be a back end to \i\c{gcc}, which always feeds
+it correct code. So its error checking is minimal. Also, its syntax
+is horrible, from the point of view of anyone trying to actually
+\e{write} anything in it. Plus you can't write 16-bit code in it
+(properly).
+
+\b \i\c{as86} is Linux-specific, and (my version at least) doesn't seem to
+have much (or any) documentation.
+
+\b \i{MASM} isn't very good, and it's expensive, and it runs only under
+DOS.
+
+\b \i{TASM} is better, but still strives for \i{MASM} compatibility, which
+means millions of directives and tons of red tape. And its syntax is
+essentially \i{MASM}'s, with the contradictions and quirks that entails
+(although it sorts out some of those by means of Ideal mode). It's
+expensive too. And it's DOS-only.
+
+So here, for your coding pleasure, is NASM. At present it's
+still in prototype stage - we don't promise that it can outperform
+any of these assemblers. But please, \e{please} send us bug reports,
+fixes, helpful information, and anything else you can get your hands
+on (and thanks to the many people who've done this already! You all
+know who you are), and we'll improve it out of all recognition.
+Again.
+
+\S{legal} Licence Conditions
+
+Please see the file \c{Licence}, supplied as part of any NASM
+distribution archive, for the \i{licence} conditions under which you
+may use NASM.
+
+\H{contact} Contact Information
+
+NASM has a \i{WWW page} at
+\W{http://www.cryogen.com/Nasm}\c{http://www.cryogen.com/Nasm}. The
+authors are \i{e\-mail}able as
+\W{mailto:jules@earthcorp.com}\c{jules@earthcorp.com} and
+\W{mailto:anakin@pobox.com}\c{anakin@pobox.com}. If you want to
+report a bug to us, please read \k{bugs} first.
+
+\i{New releases} of NASM are uploaded to
+\W{ftp://sunsite.unc.edu/pub/Linux/devel/lang/assemblers/}\i\c{sunsite.unc.edu},
+\W{ftp://ftp.simtel.net/pub/simtelnet/msdos/asmutl/}\i\c{ftp.simtel.net}
+and
+\W{ftp://ftp.coast.net/coast/msdos/asmutil/}\i\c{ftp.coast.net}.
+Announcements are posted to
+\W{news:comp.lang.asm.x86}\i\c{comp.lang.asm.x86},
+\W{news:alt.lang.asm}\i\c{alt.lang.asm},
+\W{news:comp.os.linux.announce}\i\c{comp.os.linux.announce} and
+\W{news:comp.archives.msdos.announce}\i\c{comp.archives.msdos.announce}
+(the last one is done automagically by uploading to
+\W{ftp://ftp.simtel.net/pub/simtelnet/msdos/asmutl/}\c{ftp.simtel.net}).
+
+If you don't have Usenet access, or would rather be informed by
+\i{e\-mail} when new releases come out, e\-mail
+\W{mailto:anakin@pobox.com}\c{anakin@pobox.com}
+and ask.
+
+\H{install} Installation
+
+\S{instdos} \i{Installing} NASM under MS-\i{DOS} or Windows
+
+Once you've obtained the \i{DOS archive} for NASM, \i\c{nasmXXX.zip}
+(where \c{XXX} denotes the version number of NASM contained in the
+archive), unpack it into its own directory (for example
+\c{c:\\nasm}).
+
+The archive will contain four executable files: the NASM executable
+files \i\c{nasm.exe} and \i\c{nasmw.exe}, and the NDISASM executable
+files \i\c{ndisasm.exe} and \i\c{ndisasmw.exe}. In each case, the
+file whose name ends in \c{w} is a \i{Win32} executable, designed to
+run under \i{Windows 95} or \i{Windows NT} Intel, and the other one
+is a 16-bit \i{DOS} executable.
+
+The only file NASM needs to run is its own executable, so copy
+(at least) one of \c{nasm.exe} and \c{nasmw.exe} to a directory on
+your PATH, or alternatively edit \i\c{autoexec.bat} to add the
+\c{nasm} directory to your \i\c{PATH}. (If you're only installing the
+Win32 version, you may wish to rename it to \c{nasm.exe}.)
+
+That's it - NASM is installed. You don't need the \c{nasm} directory
+to be present to run NASM (unless you've added it to your \c{PATH}),
+so you can delete it if you need to save space; however, you may
+want to keep the documentation or test programs.
+
+If you've downloaded the \i{DOS source archive}, \i\c{nasmXXXs.zip},
+the \c{nasm} directory will also contain the full NASM \i{source
+code}, and a selection of \i{Makefiles} you can (hopefully) use to
+rebuild your copy of NASM from scratch. The file \c{Readme} lists
+the various Makefiles and which compilers they work with. Note that
+the source files \c{insnsa.c} and \c{insnsd.c} are automatically
+generated from the master instruction table \c{insns.dat} by a Perl
+script; a \i{QBasic} version of the program is provided, but it is
+recommended that you use the Perl version. A DOS port of \i{Perl} is
+available from
+\W{http://www.perl.org/CPAN/ports/msdos/}\i{www.perl.org}.
+
+\S{instunix} Installing NASM under \i{Unix}
+
+Once you've obtained the \i{Unix source archive} for NASM,
+\i\c{nasm-X.XX.tar.gz} (where \c{X.XX} denotes the version number of
+NASM contained in the archive), unpack it into a directory such
+as \c{/usr/local/src}. The archive, when unpacked, will create its
+own subdirectory \c{nasm-X.XX}.
+
+NASM is an \I{Autoconf}\I\c{configure}auto-configuring package: once
+you've unpacked it, \c{cd} to the directory it's been unpacked into
+and type \c{./configure}. This shell script will find the best C
+compiler to use for building NASM and set up \i{Makefiles}
+accordingly.
+
+Once NASM has auto-configured, you can type \i\c{make} to build the
+\c{nasm} and \c{ndisasm} binaries, and then \c{make install} to
+install them in \c{/usr/local/bin} and install the \i{man pages}
+\i\c{nasm.1} and \i\c{ndisasm.1} in \c{/usr/local/man/man1}.
+Alternatively, you can give options such as \c{--prefix} to the
+\c{configure} script (see the file \i\c{INSTALL} for more details), or
+install the programs yourself.
+
+NASM also comes with a set of utilities for handling the RDOFF
+custom object-file format, which are in the \i\c{rdoff} subdirectory
+of the NASM archive. You can build these with \c{make rdf} and
+install them with \c{make rdf_install}, if you want them.
+
+If NASM fails to auto-configure, you may still be able to make it
+compile by using the fall-back Unix makefile \i\c{Makefile.unx}.
+Copy or rename that file to \c{Makefile} and try typing \c{make}.
+There is also a \c{Makefile.unx} file in the \c{rdoff} subdirectory.
+
+\C{running} Running NASM
+
+\H{syntax} NASM \i{Command-Line} Syntax
+
+To assemble a file, you issue a command of the form
+
+\c nasm -f <format> <filename> [-o <output>]
+
+For example,
+
+\c nasm -f elf myfile.asm
+
+will assemble \c{myfile.asm} into an ELF object file \c{myfile.o}. And
+
+\c nasm -f bin myfile.asm -o myfile.com
+
+will assemble \c{myfile.asm} into a raw binary file \c{myfile.com}.
+
+To produce a listing file, with the hex codes output from NASM
+displayed on the left of the original sources, use the \c{-l} option
+to give a listing file name, for example:
+
+\c nasm -f coff myfile.asm -l myfile.lst
+
+To get further usage instructions from NASM, try typing
+
+\c nasm -h
+
+This will also list the available output file formats, and what they
+are.
+
+If you use Linux but aren't sure whether your system is \c{a.out} or
+ELF, type
+
+\c file nasm
+
+(in the directory in which you put the NASM binary when you
+installed it). If it says something like
+
+\c nasm: ELF 32-bit LSB executable i386 (386 and up) Version 1
+
+then your system is ELF, and you should use the option \c{-f elf}
+when you want NASM to produce Linux object files. If it says
+
+\c nasm: Linux/i386 demand-paged executable (QMAGIC)
+
+or something similar, your system is \c{a.out}, and you should use
+\c{-f aout} instead.
+
+Like Unix compilers and assemblers, NASM is silent unless it
+goes wrong: you won't see any output at all, unless it gives error
+messages.
+
+\S{opt-o} The \i\c{-o} Option: Specifying the Output File Name
+
+NASM will normally choose the name of your output file for you;
+precisely how it does this is dependent on the object file format.
+For Microsoft object file formats (\i\c{obj} and \i\c{win32}), it
+will remove the \c{.asm} \i{extension} (or whatever extension you
+like to use - NASM doesn't care) from your source file name and
+substitute \c{.obj}. For Unix object file formats (\i\c{aout},
+\i\c{coff}, \i\c{elf} and \i\c{as86}) it will substitute \c{.o}. For
+\i\c{rdf}, it will use \c{.rdf}, and for the \i\c{bin} format it
+will simply remove the extension, so that \c{myfile.asm} produces
+the output file \c{myfile}.
+
+If the output file already exists, NASM will overwrite it, unless it
+has the same name as the input file, in which case it will give a
+warning and use \i\c{nasm.out} as the output file name instead.
+
+For situations in which this behaviour is unacceptable, NASM
+provides the \c{-o} command-line option, which allows you to specify
+your desired output file name. You invoke \c{-o} by following it
+with the name you wish for the output file, either with or without
+an intervening space. For example:
+
+\c nasm -f bin program.asm -o program.com
+\c nasm -f bin driver.asm -odriver.sys
+
+\S{opt-f} The \i\c{-f} Option: Specifying the \i{Output File Format}
+
+If you do not supply the \c{-f} option to NASM, it will choose an
+output file format for you itself. In the distribution versions of
+NASM, the default is always \i\c{bin}; if you've compiled your own
+copy of NASM, you can redefine \i\c{OF_DEFAULT} at compile time and
+choose what you want the default to be.
+
+Like \c{-o}, the intervening space between \c{-f} and the output
+file format is optional; so \c{-f elf} and \c{-felf} are both valid.
+
+A complete list of the available output file formats can be given by
+issuing the command \i\c{nasm -h}.
+
+\S{opt-l} The \i\c{-l} Option: Generating a \i{Listing File}
+
+If you supply the \c{-l} option to NASM, followed (with the usual
+optional space) by a file name, NASM will generate a
+\i{source-listing file} for you, in which addresses and generated
+code are listed on the left, and the actual source code, with
+expansions of multi-line macros (except those which specifically
+request no expansion in source listings: see \k{nolist}) on the
+right. For example:
+
+\c nasm -f elf myfile.asm -l myfile.lst
+
+\S{opt-s} The \i\c{-s} Option: Send Errors to \i\c{stdout}
+
+Under MS-\i{DOS} it can be difficult (though there are ways) to
+redirect the standard-error output of a program to a file. Since
+NASM usually produces its warning and \i{error messages} on
+\i\c{stderr}, this can make it hard to capture the errors if (for
+example) you want to load them into an editor.
+
+NASM therefore provides the \c{-s} option, requiring no argument,
+which causes errors to be sent to standard output rather than
+standard error. Therefore you can \I{redirecting errors}redirect
+the errors into a file by typing
+
+\c nasm -s -f obj myfile.asm > myfile.err
+
+\S{opt-i} The \i\c{-i} Option: Include File Search Directories
+
+When NASM sees the \i\c{%include} directive in a source file (see
+\k{include}), it will search for the given file not only in the
+current directory, but also in any directories specified on the
+command line by the use of the \c{-i} option. Therefore you can
+include files from a \i{macro library}, for example, by typing
+
+\c nasm -ic:\\macrolib\\ -f obj myfile.asm
+
+(As usual, a space between \c{-i} and the path name is allowed, and
+optional.)
+
+NASM, in the interests of complete source-code portability, does not
+understand the file naming conventions of the OS it is running on;
+the string you provide as an argument to the \c{-i} option will be
+prepended exactly as written to the name of the include file.
+Therefore the trailing backslash in the above example is necessary.
+Under Unix, a trailing forward slash is similarly necessary.
+
+(You can use this to your advantage, if you're really \i{perverse},
+by noting that the option \c{-ifoo} will cause \c{%include "bar.i"}
+to search for the file \c{foobar.i}...)
+
+If you want to define a \e{standard} \i{include search path},
+similar to \c{/usr/include} on Unix systems, you should place one or
+more \c{-i} directives in the \c{NASM} environment variable (see
+\k{nasmenv}).
+
+\S{opt-p} The \i\c{-p} Option: \I{pre-including files}Pre-Include a File
+
+\I\c{%include}NASM allows you to specify files to be
+\e{pre-included} into your source file, by the use of the \c{-p}
+option. So running
+
+\c nasm myfile.asm -p myinc.inc
+
+is equivalent to running \c{nasm myfile.asm} and placing the
+directive \c{%include "myinc.inc"} at the start of the file.
+
+\S{opt-d} The \i\c{-d} Option: \I{pre-defining macros}Pre-Define a Macro
+
+\I\c{%define}Just as the \c{-p} option gives an alternative to placing
+\c{%include} directives at the start of a source file, the \c{-d}
+option gives an alternative to placing a \c{%define} directive. You
+could code
+
+\c nasm myfile.asm -dFOO=100
+
+as an alternative to placing the directive
+
+\c %define FOO 100
+
+at the start of the file. You can miss off the macro value, as well:
+the option \c{-dFOO} is equivalent to coding \c{%define FOO}. This
+form of the option may be useful for selecting \i{assembly-time
+options} which are then tested using \c{%ifdef}, for example
+\c{-dDEBUG}.
+
+\S{opt-e} The \i\c{-e} Option: Preprocess Only
+
+NASM allows the \i{preprocessor} to be run on its own, up to a
+point. Using the \c{-e} option (which requires no arguments) will
+cause NASM to preprocess its input file, expand all the macro
+references, remove all the comments and preprocessor directives, and
+print the resulting file on standard output (or save it to a file,
+if the \c{-o} option is also used).
+
+This option cannot be applied to programs which require the
+preprocessor to evaluate \I{preprocessor expressions}\i{expressions}
+which depend on the values of symbols: so code such as
+
+\c %assign tablesize ($-tablestart)
+
+will cause an error in \i{preprocess-only mode}.
+
+\S{opt-a} The \i\c{-a} Option: Don't Preprocess At All
+
+If NASM is being used as the back end to a compiler, it might be
+desirable to \I{suppressing preprocessing}suppress preprocessing
+completely and assume the compiler has already done it, to save time
+and increase compilation speeds. The \c{-a} option, requiring no
+argument, instructs NASM to replace its powerful \i{preprocessor}
+with a \i{stub preprocessor} which does nothing.
+
+\S{opt-w} The \i\c{-w} Option: Enable or Disable Assembly \i{Warnings}
+
+NASM can observe many conditions during the course of assembly which
+are worth mentioning to the user, but are not sufficiently severe
+errors to justify NASM refusing to generate an output file. These
+conditions are reported like errors, but come up with the word
+`warning' before the message. Warnings do not prevent NASM from
+generating an output file and returning a success status to the
+operating system.
+
+Some conditions are even less severe than that: they are only
+sometimes worth mentioning to the user. Therefore NASM supports the
+\c{-w} command-line option, which enables or disables certain
+classes of assembly warning. Such warning classes are described by a
+name, for example \c{orphan-labels}; you can enable warnings of
+this class by the command-line option \c{-w+orphan-labels} and
+disable them by \c{-w-orphan-labels}.
+
+The \i{suppressible warning} classes are:
+
+\b \i\c{macro-params} covers warnings about \i{multi-line macros}
+being invoked with the wrong number of parameters. This warning
+class is enabled by default; see \k{mlmacover} for an example of why
+you might want to disable it.
+
+\b \i\c{orphan-labels} covers warnings about source lines which
+contain no instruction but define a label without a trailing colon.
+NASM does not warn about this somewhat obscure condition by default;
+see \k{syntax} for an example of why you might want it to.
+
+\b \i\c{number-overflow} covers warnings about numeric constants which
+don't fit in 32 bits (for example, it's easy to type one too many Fs
+and produce \c{0x7ffffffff} by mistake). This warning class is
+enabled by default.
+
+\S{nasmenv} The \c{NASM} \i{Environment} Variable
+
+If you define an environment variable called \c{NASM}, the program
+will interpret it as a list of extra command-line options, which are
+processed before the real command line. You can use this to define
+standard search directories for include files, by putting \c{-i}
+options in the \c{NASM} variable.
+
+The value of the variable is split up at white space, so that the
+value \c{-s -ic:\\nasmlib} will be treated as two separate options.
+However, that means that the value \c{-dNAME="my name"} won't do
+what you might want, because it will be split at the space and the
+NASM command-line processing will get confused by the two
+nonsensical words \c{-dNAME="my} and \c{name"}.
+
+To get round this, NASM provides a feature whereby, if you begin the
+\c{NASM} environment variable with some character that isn't a minus
+sign, then NASM will treat this character as the \i{separator
+character} for options. So setting the \c{NASM} variable to the
+value \c{!-s!-ic:\\nasmlib} is equivalent to setting it to \c{-s
+-ic:\\nasmlib}, but \c{!-dNAME="my name"} will work.
+
+\H{qstart} \i{Quick Start} for \i{MASM} Users
+
+If you're used to writing programs with MASM, or with \i{TASM} in
+MASM-compatible (non-Ideal) mode, or with \i\c{a86}, this section
+attempts to outline the major differences between MASM's syntax and
+NASM's. If you're not already used to MASM, it's probably worth
+skipping this section.
+
+\S{qscs} NASM Is \I{case sensitivity}Case-Sensitive
+
+One simple difference is that NASM is case-sensitive. It makes a
+difference whether you call your label \c{foo}, \c{Foo} or \c{FOO}.
+If you're assembling to DOS or OS/2 \c{.OBJ} files, you can invoke
+the \i\c{UPPERCASE} directive (documented in \k{objfmt}) to ensure
+that all symbols exported to other code modules are forced to be
+upper case; but even then, \e{within} a single module, NASM will
+distinguish between labels differing only in case.
+
+\S{qsbrackets} NASM Requires \i{Square Brackets} For \i{Memory References}
+
+NASM was designed with simplicity of syntax in mind. One of the
+\i{design goals} of NASM is that it should be possible, as far as is
+practical, for the user to look at a single line of NASM code
+and tell what opcode is generated by it. You can't do this in MASM:
+if you declare, for example,
+
+\c foo       equ 1
+\c bar       dw 2
+
+then the two lines of code
+
+\c           mov ax,foo
+\c           mov ax,bar
+
+generate completely different opcodes, despite having
+identical-looking syntaxes.
+
+NASM avoids this undesirable situation by having a much simpler
+syntax for memory references. The rule is simply that any access to
+the \e{contents} of a memory location requires square brackets
+around the address, and any access to the \e{address} of a variable
+doesn't. So an instruction of the form \c{mov ax,foo} will
+\e{always} refer to a compile-time constant, whether it's an \c{EQU}
+or the address of a variable; and to access the \e{contents} of the
+variable \c{bar}, you must code \c{mov ax,[bar]}.
+
+This also means that NASM has no need for MASM's \i\c{OFFSET}
+keyword, since the MASM code \c{mov ax,offset bar} means exactly the
+same thing as NASM's \c{mov ax,bar}. If you're trying to get
+large amounts of MASM code to assemble sensibly under NASM, you
+can always code \c{%idefine offset} to make the preprocessor treat
+the \c{OFFSET} keyword as a no-op.
+
+This issue is even more confusing in \i\c{a86}, where declaring a
+label with a trailing colon defines it to be a `label' as opposed to
+a `variable' and causes \c{a86} to adopt NASM-style semantics; so in
+\c{a86}, \c{mov ax,var} has different behaviour depending on whether
+\c{var} was declared as \c{var: dw 0} (a label) or \c{var dw 0} (a
+word-size variable). NASM is very simple by comparison:
+\e{everything} is a label.
+
+NASM, in the interests of simplicity, also does not support the
+\i{hybrid syntaxes} supported by MASM and its clones, such as
+\c{mov ax,table[bx]}, where a memory reference is denoted by one
+portion outside square brackets and another portion inside. The
+correct syntax for the above is \c{mov ax,[table+bx]}. Likewise,
+\c{mov ax,es:[di]} is wrong and \c{mov ax,[es:di]} is right.
+
+\S{qstypes} NASM Doesn't Store \i{Variable Types}
+
+NASM, by design, chooses not to remember the types of variables you
+declare. Whereas MASM will remember, on seeing \c{var dw 0}, that
+you declared \c{var} as a word-size variable, and will then be able
+to fill in the \i{ambiguity} in the size of the instruction \c{mov
+var,2}, NASM will deliberately remember nothing about the symbol
+\c{var} except where it begins, and so you must explicitly code
+\c{mov word [var],2}.
+
+For this reason, NASM doesn't support the \c{LODS}, \c{MOVS},
+\c{STOS}, \c{SCAS}, \c{CMPS}, \c{INS}, or \c{OUTS} instructions,
+but only supports the forms such as \c{LODSB}, \c{MOVSW}, and
+\c{SCASD}, which explicitly specify the size of the components of
+the strings being manipulated.
+
+\S{qsassume} NASM Doesn't \i\c{ASSUME}
+
+As part of NASM's drive for simplicity, it also does not support the
+\c{ASSUME} directive. NASM will not keep track of what values you
+choose to put in your segment registers, and will never
+\e{automatically} generate a \i{segment override} prefix.
+
+\S{qsmodel} NASM Doesn't Support \i{Memory Models}
+
+NASM also does not have any directives to support different 16-bit
+memory models. The programmer has to keep track of which functions
+are supposed to be called with a \i{far call} and which with a
+\i{near call}, and is responsible for putting the correct form of
+\c{RET} instruction (\c{RETN} or \c{RETF}; NASM accepts \c{RET}
+itself as an alternate form for \c{RETN}); in addition, the
+programmer is responsible for coding \c{CALL FAR} instructions where
+necessary when calling \e{external} functions, and must also keep
+track of which external variable definitions are far and which are
+near.
+
+\S{qsfpu} \i{Floating-Point} Differences
+
+NASM uses different names to refer to floating-point registers from
+MASM: where MASM would call them \c{ST(0)}, \c{ST(1)} and so on, and
+\i\c{a86} would call them simply \c{0}, \c{1} and so on, NASM
+chooses to call them \c{st0}, \c{st1} etc.
+
+As of version 0.96, NASM now treats the instructions with
+\i{`nowait'} forms in the same way as MASM-compatible assemblers.
+The idiosyncratic treatment employed by 0.95 and earlier was based
+on a misunderstanding by the authors.
+
+\S{qsother} Other Differences
+
+For historical reasons, NASM uses the keyword \i\c{TWORD} where MASM
+and compatible assemblers use \i\c{TBYTE}.
+
+NASM does not declare \i{uninitialised storage} in the same way as
+MASM: where a MASM programmer might use \c{stack db 64 dup (?)},
+NASM requires \c{stack resb 64}, intended to be read as `reserve 64
+bytes'. For a limited amount of compatibility, since NASM treats
+\c{?} as a valid character in symbol names, you can code \c{? equ 0}
+and then writing \c{dw ?} will at least do something vaguely useful.
+\I\c{RESB}\i\c{DUP} is still not a supported syntax, however.
+
+In addition to all of this, macros and directives work completely
+differently to MASM. See \k{preproc} and \k{directive} for further
+details.
+
+\C{lang} The NASM Language
+
+\H{syntax} Layout of a NASM Source Line
+
+As in most assemblers, each NASM source line contains (unless it
+is a macro, a preprocessor directive or an assembler directive: see
+\k{preproc} and \k{directive}) some combination of the four fields
+
+\c label:    instruction operands        ; comment
+
+As usual, most of these fields are optional; the presence or absence
+of any combination of a label, an instruction and a comment is allowed.
+Of course, the operand field is either required or forbidden by the
+presence and nature of the instruction field.
+
+NASM places no restrictions on white space within a line: labels may
+have white space before them, or instructions may have no space
+before them, or anything. The \i{colon} after a label is also
+optional. (Note that this means that if you intend to code \c{lodsb}
+alone on a line, and type \c{lodab} by accident, then that's still a
+valid source line which does nothing but define a label. Running
+NASM with the command-line option
+\I{orphan-labels}\c{-w+orphan-labels} will cause it to warn you if
+you define a label alone on a line without a \i{trailing colon}.)
+
+\i{Valid characters} in labels are letters, numbers, \c{_}, \c{$},
+\c{#}, \c{@}, \c{~}, \c{.}, and \c{?}. The only characters which may
+be used as the \e{first} character of an identifier are letters,
+\c{.} (with special meaning: see \k{locallab}), \c{_} and \c{?}.
+An identifier may also be prefixed with a \I{$prefix}\c{$} to
+indicate that it is intended to be read as an identifier and not a
+reserved word; thus, if some other module you are linking with
+defines a symbol called \c{eax}, you can refer to \c{$eax} in NASM
+code to distinguish the symbol from the register.
+
+The instruction field may contain any machine instruction: Pentium
+and P6 instructions, FPU instructions, MMX instructions and even
+undocumented instructions are all supported. The instruction may be
+prefixed by \c{LOCK}, \c{REP}, \c{REPE}/\c{REPZ} or
+\c{REPNE}/\c{REPNZ}, in the usual way. Explicit \I{address-size
+prefixes}address-size and \i{operand-size prefixes} \c{A16},
+\c{A32}, \c{O16} and \c{O32} are provided - one example of their use
+is given in \k{mixsize}. You can also use the name of a \I{segment
+override}segment register as an instruction prefix: coding
+\c{es mov [bx],ax} is equivalent to coding \c{mov [es:bx],ax}. We
+recommend the latter syntax, since it is consistent with other
+syntactic features of the language, but for instructions such as
+\c{LODSB}, which has no operands and yet can require a segment
+override, there is no clean syntactic way to proceed apart from
+\c{es lodsb}.
+
+A prefix does not have to be attached to an instruction: prefixes
+such as \c{CS}, \c{A32}, \c{LOCK} or \c{REPE} can appear on a line by
+themselves, and NASM will just generate the prefix bytes.
+
+In addition to actual machine instructions, NASM also supports a
+number of pseudo-instructions, described in \k{pseudop}.
+
+Instruction \i{operands} may take a number of forms: they can be
+registers, described simply by the register name (e.g. \c{ax},
+\c{bp}, \c{ebx}, \c{cr0}: NASM does not use the \c{gas}-style
+syntax in which register names must be prefixed by a \c{%} sign), or
+they can be \i{effective addresses} (see \k{effaddr}), constants
+(\k{const}) or expressions (\k{expr}).
+
+For \i{floating-point} instructions, NASM accepts a wide range of
+syntaxes: you can use two-operand forms like MASM supports, or you
+can use NASM's native single-operand forms in most cases. Details of
+all forms of each supported instruction are given in
+\k{iref}. For example, you can code:
+
+\c           fadd st1               ; this sets st0 := st0 + st1
+\c           fadd st0,st1           ; so does this
+\c
+\c           fadd st1,st0           ; this sets st1 := st1 + st0
+\c           fadd to st1            ; so does this
+
+Almost any floating-point instruction that references memory must
+use one of the prefixes \i\c{DWORD}, \i\c{QWORD} or \i\c{TWORD} to
+indicate what size of \i{memory operand} it refers to.
+
+\H{pseudop} \i{Pseudo-Instructions}
+
+Pseudo-instructions are things which, though not real x86 machine
+instructions, are used in the instruction field anyway because
+that's the most convenient place to put them. The current
+pseudo-instructions are \i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ} and
+\i\c{DT}, their \i{uninitialised} counterparts \i\c{RESB},
+\i\c{RESW}, \i\c{RESD}, \i\c{RESQ} and \i\c{REST}, the \i\c{INCBIN}
+command, the \i\c{EQU} command, and the \i\c{TIMES} prefix.
+
+\S{db} \c{DB} and friends: Declaring Initialised Data
+
+\i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ} and \i\c{DT} are used, much
+as in MASM, to declare initialised data in the output file. They can
+be invoked in a wide range of ways:
+\I{floating-point}\I{character constant}\I{string constant}
+
+\c           db 0x55                ; just the byte 0x55
+\c           db 0x55,0x56,0x57      ; three bytes in succession
+\c           db 'a',0x55            ; character constants are OK
+\c           db 'hello',13,10,'$'   ; so are string constants
+\c           dw 0x1234              ; 0x34 0x12
+\c           dw 'a'                 ; 0x61 0x00 (it's just a number)
+\c           dw 'ab'                ; 0x61 0x62 (character constant)
+\c           dw 'abc'               ; 0x61 0x62 0x63 0x00 (string)
+\c           dd 0x12345678          ; 0x78 0x56 0x34 0x12
+\c           dd 1.234567e20         ; floating-point constant
+\c           dq 1.234567e20         ; double-precision float
+\c           dt 1.234567e20         ; extended-precision float
+
+\c{DQ} and \c{DT} do not accept \i{numeric constants} or string
+constants as operands.
+
+\S{resb} \c{RESB} and friends: Declaring \i{Uninitialised} Data
+
+\i\c{RESB}, \i\c{RESW}, \i\c{RESD}, \i\c{RESQ} and \i\c{REST} are
+designed to be used in the BSS section of a module: they declare
+\e{uninitialised} storage space. Each takes a single operand, which
+is the number of bytes, words, doublewords or whatever to reserve.
+As stated in \k{qsother}, NASM does not support the MASM/TASM syntax
+of reserving uninitialised space by writing \I\c{?}\c{DW ?} or
+similar things: this is what it does instead. The operand to a
+\c{RESB}-type pseudo-instruction is a \i\e{critical expression}: see
+\k{crit}.
+
+For example:
+
+\c buffer:   resb 64                ; reserve 64 bytes
+\c wordvar:  resw 1                 ; reserve a word
+\c realarray resq 10                ; array of ten reals
+
+\S{incbin} \i\c{INCBIN}: Including External \i{Binary Files}
+
+\c{INCBIN} is borrowed from the old Amiga assembler \i{DevPac}: it
+includes a binary file verbatim into the output file. This can be
+handy for (for example) including \i{graphics} and \i{sound} data
+directly into a game executable file. It can be called in one of
+these three ways:
+
+\c           incbin "file.dat"      ; include the whole file
+\c           incbin "file.dat",1024 ; skip the first 1024 bytes
+\c           incbin "file.dat",1024,512 ; skip the first 1024, and
+\c                                  ; actually include at most 512
+
+\S{equ} \i\c{EQU}: Defining Constants
+
+\c{EQU} defines a symbol to a given constant value: when \c{EQU} is
+used, the source line must contain a label. The action of \c{EQU} is
+to define the given label name to the value of its (only) operand.
+This definition is absolute, and cannot change later. So, for
+example,
+
+\c message   db 'hello, world'
+\c msglen    equ $-message
+
+defines \c{msglen} to be the constant 12. \c{msglen} may not then be
+redefined later. This is not a \i{preprocessor} definition either:
+the value of \c{msglen} is evaluated \e{once}, using the value of
+\c{$} (see \k{expr} for an explanation of \c{$}) at the point of
+definition, rather than being evaluated wherever it is referenced
+and using the value of \c{$} at the point of reference. Note that
+the operand to an \c{EQU} is also a \i{critical expression}
+(\k{crit}).
+
+\S{times} \i\c{TIMES}: \i{Repeating} Instructions or Data
+
+The \c{TIMES} prefix causes the instruction to be assembled multiple
+times. This is partly present as NASM's equivalent of the \i\c{DUP}
+syntax supported by \i{MASM}-compatible assemblers, in that you can
+code
+
+\c zerobuf:  times 64 db 0
+
+or similar things; but \c{TIMES} is more versatile than that. The
+argument to \c{TIMES} is not just a numeric constant, but a numeric
+\e{expression}, so you can do things like
+
+\c buffer:   db 'hello, world'
+\c           times 64-$+buffer db ' '
+
+which will store exactly enough spaces to make the total length of
+\c{buffer} up to 64. Finally, \c{TIMES} can be applied to ordinary
+instructions, so you can code trivial \i{unrolled loops} in it:
+
+\c           times 100 movsb
+
+Note that there is no effective difference between \c{times 100 resb
+1} and \c{resb 100}, except that the latter will be assembled about
+100 times faster due to the internal structure of the assembler.
+
+The operand to \c{TIMES}, like that of \c{EQU} and those of \c{RESB}
+and friends, is a critical expression (\k{crit}).
+
+Note also that \c{TIMES} can't be applied to \i{macros}: the reason
+for this is that \c{TIMES} is processed after the macro phase, which
+allows the argument to \c{TIMES} to contain expressions such as
+\c{64-$+buffer} as above. To repeat more than one line of code, or a
+complex macro, use the preprocessor \i\c{%rep} directive.
+
+\H{effaddr} Effective Addresses
+
+An \i{effective address} is any operand to an instruction which
+\I{memory reference}references memory. Effective addresses, in NASM,
+have a very simple syntax: they consist of an expression evaluating
+to the desired address, enclosed in \i{square brackets}. For
+example:
+
+\c wordvar   dw 123
+\c           mov ax,[wordvar]
+\c           mov ax,[wordvar+1]
+\c           mov ax,[es:wordvar+bx]
+
+Anything not conforming to this simple system is not a valid memory
+reference in NASM, for example \c{es:wordvar[bx]}.
+
+More complicated effective addresses, such as those involving more
+than one register, work in exactly the same way:
+
+\c           mov eax,[ebx*2+ecx+offset]
+\c           mov ax,[bp+di+8]
+
+NASM is capable of doing \i{algebra} on these effective addresses,
+so that things which don't necessarily \e{look} legal are perfectly
+all right:
+
+\c           mov eax,[ebx*5]        ; assembles as [ebx*4+ebx]
+\c           mov eax,[label1*2-label2] ; ie [label1+(label1-label2)]
+
+Some forms of effective address have more than one assembled form;
+in most such cases NASM will generate the smallest form it can. For
+example, there are distinct assembled forms for the 32-bit effective
+addresses \c{[eax*2+0]} and \c{[eax+eax]}, and NASM will generally
+generate the latter on the grounds that the former requires four
+bytes to store a zero offset.
+
+NASM has a hinting mechanism which will cause \c{[eax+ebx]} and
+\c{[ebx+eax]} to generate different opcodes; this is occasionally
+useful because \c{[esi+ebp]} and \c{[ebp+esi]} have different
+default segment registers.
+
+However, you can force NASM to generate an effective address in a
+particular form by the use of the keywords \c{BYTE}, \c{WORD},
+\c{DWORD} and \c{NOSPLIT}. If you need \c{[eax+3]} to be assembled
+using a double-word offset field instead of the one byte NASM will
+normally generate, you can code \c{[dword eax+3]}. Similarly, you
+can force NASM to use a byte offset for a small value which it
+hasn't seen on the first pass (see \k{crit} for an example of such a
+code fragment) by using \c{[byte eax+offset]}. As special cases,
+\c{[byte eax]} will code \c{[eax+0]} with a byte offset of zero, and
+\c{[dword eax]} will code it with a double-word offset of zero. The
+normal form, \c{[eax]}, will be coded with no offset field.
+
+Similarly, NASM will split \c{[eax*2]} into \c{[eax+eax]} because
+that allows the offset field to be absent and space to be saved; in
+fact, it will also split \c{[eax*2+offset]} into
+\c{[eax+eax+offset]}. You can combat this behaviour by the use of
+the \c{NOSPLIT} keyword: \c{[nosplit eax*2]} will force
+\c{[eax*2+0]} to be generated literally.
+
+\H{const} \i{Constants}
+
+NASM understands four different types of constant: numeric,
+character, string and floating-point.
+
+\S{numconst} \i{Numeric Constants}
+
+A numeric constant is simply a number. NASM allows you to specify
+numbers in a variety of number bases, in a variety of ways: you can
+suffix \c{H}, \c{Q} and \c{B} for \i{hex}, \i{octal} and \i{binary},
+or you can prefix \c{0x} for hex in the style of C, or you can
+prefix \c{$} for hex in the style of Borland Pascal. Note, though,
+that the \I{$prefix}\c{$} prefix does double duty as a prefix on
+identifiers (see \k{syntax}), so a hex number prefixed with a \c{$}
+sign must have a digit after the \c{$} rather than a letter.
+
+Some examples:
+
+\c           mov ax,100             ; decimal
+\c           mov ax,0a2h            ; hex
+\c           mov ax,$0a2            ; hex again: the 0 is required
+\c           mov ax,0xa2            ; hex yet again
+\c           mov ax,777q            ; octal
+\c           mov ax,10010011b       ; binary
+
+\S{chrconst} \i{Character Constants}
+
+A character constant consists of up to four characters enclosed in
+either single or double quotes. The type of quote makes no
+difference to NASM, except of course that surrounding the constant
+with single quotes allows double quotes to appear within it and vice
+versa.
+
+A character constant with more than one character will be arranged
+with \i{little-endian} order in mind: if you code
+
+\c           mov eax,'abcd'
+
+then the constant generated is not \c{0x61626364}, but
+\c{0x64636261}, so that if you were then to store the value into
+memory, it would read \c{abcd} rather than \c{dcba}. This is also
+the sense of character constants understood by the Pentium's
+\i\c{CPUID} instruction (see \k{insCPUID}).
+
+\S{strconst} String Constants
+
+String constants are only acceptable to some pseudo-instructions,
+namely the \I\c{DW}\I\c{DD}\I\c{DQ}\I\c{DT}\i\c{DB} family and
+\i\c{INCBIN}.
+
+A string constant looks like a character constant, only longer. It
+is treated as a concatenation of maximum-size character constants
+for the conditions. So the following are equivalent:
+
+\c           db 'hello'             ; string constant
+\c           db 'h','e','l','l','o' ; equivalent character constants
+
+And the following are also equivalent:
+
+\c           dd 'ninechars'         ; doubleword string constant
+\c           dd 'nine','char','s'   ; becomes three doublewords
+\c           db 'ninechars',0,0,0   ; and really looks like this
+
+Note that when used as an operand to \c{db}, a constant like
+\c{'ab'} is treated as a string constant despite being short enough
+to be a character constant, because otherwise \c{db 'ab'} would have
+the same effect as \c{db 'a'}, which would be silly. Similarly,
+three-character or four-character constants are treated as strings
+when they are operands to \c{dw}.
+
+\S{fltconst} \I{floating-point, constants}Floating-Point Constants
+
+\i{Floating-point} constants are acceptable only as arguments to
+\i\c{DD}, \i\c{DQ} and \i\c{DT}. They are expressed in the
+traditional form: digits, then a period, then optionally more
+digits, then optionally an \c{E} followed by an exponent. The period
+is mandatory, so that NASM can distinguish between \c{dd 1}, which
+declares an integer constant, and \c{dd 1.0} which declares a
+floating-point constant.
+
+Some examples:
+
+\c           dd 1.2                 ; an easy one
+\c           dq 1.e10               ; 10,000,000,000
+\c           dq 1.e+10              ; synonymous with 1.e10
+\c           dq 1.e-10              ; 0.000 000 000 1
+\c           dt 3.141592653589793238462 ; pi
+
+NASM cannot do compile-time arithmetic on floating-point constants.
+This is because NASM is designed to be portable - although it always
+generates code to run on x86 processors, the assembler itself can
+run on any system with an ANSI C compiler. Therefore, the assembler
+cannot guarantee the presence of a floating-point unit capable of
+handling the \i{Intel number formats}, and so for NASM to be able to
+do floating arithmetic it would have to include its own complete set
+of floating-point routines, which would significantly increase the
+size of the assembler for very little benefit.
+
+\H{expr} \i{Expressions}
+
+Expressions in NASM are similar in syntax to those in C.
+
+NASM does not guarantee the size of the integers used to evaluate
+expressions at compile time: since NASM can compile and run on
+64-bit systems quite happily, don't assume that expressions are
+evaluated in 32-bit registers and so try to make deliberate use of
+\i{integer overflow}. It might not always work. The only thing NASM
+will guarantee is what's guaranteed by ANSI C: you always have \e{at
+least} 32 bits to work in.
+
+NASM supports two special tokens in expressions, allowing
+calculations to involve the current assembly position: the
+\I{$ here}\c{$} and \i\c{$$} tokens. \c{$} evaluates to the assembly
+position at the beginning of the line containing the expression; so
+you can code an \i{infinite loop} using \c{JMP $}. \c{$$} evaluates
+to the beginning of the current section; so you can tell how far
+into the section you are by using \c{($-$$)}.
+
+The arithmetic \i{operators} provided by NASM are listed here, in
+increasing order of \i{precedence}.
+
+\S{expor} \i\c{|}: \i{Bitwise OR} Operator
+
+The \c{|} operator gives a bitwise OR, exactly as performed by the
+\c{OR} machine instruction. Bitwise OR is the lowest-priority
+arithmetic operator supported by NASM.
+
+\S{expxor} \i\c{^}: \i{Bitwise XOR} Operator
+
+\c{^} provides the bitwise XOR operation.
+
+\S{expand} \i\c{&}: \i{Bitwise AND} Operator
+
+\c{&} provides the bitwise AND operation.
+
+\S{expshift} \i\c{<<} and \i\c{>>}: \i{Bit Shift} Operators
+
+\c{<<} gives a bit-shift to the left, just as it does in C. So \c{5<<3}
+evaluates to 5 times 8, or 40. \c{>>} gives a bit-shift to the
+right; in NASM, such a shift is \e{always} unsigned, so that
+the bits shifted in from the left-hand end are filled with zero
+rather than a sign-extension of the previous highest bit.
+
+\S{expplmi} \I{+ opaddition}\c{+} and \I{- opsubtraction}\c{-}:
+\i{Addition} and \i{Subtraction} Operators
+
+The \c{+} and \c{-} operators do perfectly ordinary addition and
+subtraction.
+
+\S{expmul} \i\c{*}, \i\c{/}, \i\c{//}, \i\c{%} and \i\c{%%}:
+\i{Multiplication} and \i{Division}
+
+\c{*} is the multiplication operator. \c{/} and \c{//} are both
+division operators: \c{/} is \i{unsigned division} and \c{//} is
+\i{signed division}. Similarly, \c{%} and \c{%%} provide \I{unsigned
+modulo}\I{modulo operators}unsigned and
+\i{signed modulo} operators respectively.
+
+NASM, like ANSI C, provides no guarantees about the sensible
+operation of the signed modulo operator.
+
+Since the \c{%} character is used extensively by the macro
+\i{preprocessor}, you should ensure that both the signed and unsigned
+modulo operators are followed by white space wherever they appear.
+
+\S{expunary} \i{Unary Operators}: \I{+ opunary}\c{+}, \I{- opunary}\c{-},
+\i\c{~} and \i\c{SEG}
+
+The highest-priority operators in NASM's expression grammar are
+those which only apply to one argument. \c{-} negates its operand,
+\c{+} does nothing (it's provided for symmetry with \c{-}), \c{~}
+computes the \i{one's complement} of its operand, and \c{SEG}
+provides the \i{segment address} of its operand (explained in more
+detail in \k{segwrt}).
+
+\H{segwrt} \i\c{SEG} and \i\c{WRT}
+
+When writing large 16-bit programs, which must be split into
+multiple \i{segments}, it is often necessary to be able to refer to
+the \I{segment address}segment part of the address of a symbol. NASM
+supports the \c{SEG} operator to perform this function.
+
+The \c{SEG} operator returns the \i\e{preferred} segment base of a
+symbol, defined as the segment base relative to which the offset of
+the symbol makes sense. So the code
+
+\c           mov ax,seg symbol
+\c           mov es,ax
+\c           mov bx,symbol
+
+will load \c{ES:BX} with a valid pointer to the symbol \c{symbol}.
+
+Things can be more complex than this: since 16-bit segments and
+\i{groups} may \I{overlapping segments}overlap, you might occasionally
+want to refer to some symbol using a different segment base from the
+preferred one. NASM lets you do this, by the use of the \c{WRT}
+(With Reference To) keyword. So you can do things like
+
+\c           mov ax,weird_seg       ; weird_seg is a segment base
+\c           mov es,ax
+\c           mov bx,symbol wrt weird_seg
+
+to load \c{ES:BX} with a different, but functionally equivalent,
+pointer to the symbol \c{symbol}.
+
+NASM supports far (inter-segment) calls and jumps by means of the
+syntax \c{call segment:offset}, where \c{segment} and \c{offset}
+both represent immediate values. So to call a far procedure, you
+could code either of
+
+\c           call (seg procedure):procedure
+\c           call weird_seg:(procedure wrt weird_seg)
+
+(The parentheses are included for clarity, to show the intended
+parsing of the above instructions. They are not necessary in
+practice.)
+
+NASM supports the syntax \I\c{CALL FAR}\c{call far procedure} as a
+synonym for the first of the above usages. \c{JMP} works identically
+to \c{CALL} in these examples.
+
+To declare a \i{far pointer} to a data item in a data segment, you
+must code
+
+\c           dw symbol, seg symbol
+
+NASM supports no convenient synonym for this, though you can always
+invent one using the macro processor.
+
+\H{crit} \i{Critical Expressions}
+
+A limitation of NASM is that it is a \i{two-pass assembler}; unlike
+TASM and others, it will always do exactly two \I{passes}\i{assembly
+passes}. Therefore it is unable to cope with source files that are
+complex enough to require three or more passes.
+
+The first pass is used to determine the size of all the assembled
+code and data, so that the second pass, when generating all the
+code, knows all the symbol addresses the code refers to. So one
+thing NASM can't handle is code whose size depends on the value of a
+symbol declared after the code in question. For example,
+
+\c           times (label-$) db 0
+\c label:    db 'Where am I?'
+
+The argument to \i\c{TIMES} in this case could equally legally
+evaluate to anything at all; NASM will reject this example because
+it cannot tell the size of the \c{TIMES} line when it first sees it.
+It will just as firmly reject the slightly \I{paradox}paradoxical
+code
+
+\c           times (label-$+1) db 0
+\c label:    db 'NOW where am I?'
+
+in which \e{any} value for the \c{TIMES} argument is by definition
+wrong!
+
+NASM rejects these examples by means of a concept called a
+\e{critical expression}, which is defined to be an expression whose
+value is required to be computable in the first pass, and which must
+therefore depend only on symbols defined before it. The argument to
+the \c{TIMES} prefix is a critical expression; for the same reason,
+the arguments to the \i\c{RESB} family of pseudo-instructions are
+also critical expressions.
+
+Critical expressions can crop up in other contexts as well: consider
+the following code.
+
+\c           mov ax,symbol1
+\c symbol1   equ symbol2
+\c symbol2:
+
+On the first pass, NASM cannot determine the value of \c{symbol1},
+because \c{symbol1} is defined to be equal to \c{symbol2} which NASM
+hasn't seen yet. On the second pass, therefore, when it encounters
+the line \c{mov ax,symbol1}, it is unable to generate the code for
+it because it still doesn't know the value of \c{symbol1}. On the
+next line, it would see the \i\c{EQU} again and be able to determine
+the value of \c{symbol1}, but by then it would be too late.
+
+NASM avoids this problem by defining the right-hand side of an
+\c{EQU} statement to be a critical expression, so the definition of
+\c{symbol1} would be rejected in the first pass.
+
+There is a related issue involving \i{forward references}: consider
+this code fragment.
+
+\c           mov eax,[ebx+offset]
+\c offset    equ 10
+
+NASM, on pass one, must calculate the size of the instruction \c{mov
+eax,[ebx+offset]} without knowing the value of \c{offset}. It has no
+way of knowing that \c{offset} is small enough to fit into a
+one-byte offset field and that it could therefore get away with
+generating a shorter form of the \i{effective-address} encoding; for
+all it knows, in pass one, \c{offset} could be a symbol in the code
+segment, and it might need the full four-byte form. So it is forced
+to compute the size of the instruction to accommodate a four-byte
+address part. In pass two, having made this decision, it is now
+forced to honour it and keep the instruction large, so the code
+generated in this case is not as small as it could have been. This
+problem can be solved by defining \c{offset} before using it, or by
+forcing byte size in the effective address by coding \c{[byte
+ebx+offset]}.
+
+\H{locallab} \i{Local Labels}
+
+NASM gives special treatment to symbols beginning with a \i{period}.
+A label beginning with a single period is treated as a \e{local}
+label, which means that it is associated with the previous non-local
+label. So, for example:
+
+\c label1    ; some code
+\c .loop     ; some more code
+\c           jne .loop
+\c           ret
+\c label2    ; some code
+\c .loop     ; some more code
+\c           jne .loop
+\c           ret
+
+In the above code fragment, each \c{JNE} instruction jumps to the
+line immediately before it, because the two definitions of \c{.loop}
+are kept separate by virtue of each being associated with the
+previous non-local label.
+
+This form of local label handling is borrowed from the old Amiga
+assembler \i{DevPac}; however, NASM goes one step further, in
+allowing access to local labels from other parts of the code. This
+is achieved by means of \e{defining} a local label in terms of the
+previous non-local label: the first definition of \c{.loop} above is
+really defining a symbol called \c{label1.loop}, and the second
+defines a symbol called \c{label2.loop}. So, if you really needed
+to, you could write
+
+\c label3    ; some more code
+\c           ; and some more
+\c           jmp label1.loop
+
+Sometimes it is useful - in a macro, for instance - to be able to
+define a label which can be referenced from anywhere but which
+doesn't interfere with the normal local-label mechanism. Such a
+label can't be non-local because it would interfere with subsequent
+definitions of, and references to, local labels; and it can't be
+local because the macro that defined it wouldn't know the label's
+full name. NASM therefore introduces a third type of label, which is
+probably only useful in macro definitions: if a label begins with
+the \I{label prefix}special prefix \i\c{..@}, then it does nothing
+to the local label mechanism. So you could code
+
+\c label1:   ; a non-local label
+\c .local:   ; this is really label1.local
+\c ..@foo:   ; this is a special symbol
+\c label2:   ; another non-local label
+\c .local:   ; this is really label2.local
+\c           jmp ..@foo             ; this will jump three lines up
+
+NASM has the capacity to define other special symbols beginning with
+a double period: for example, \c{..start} is used to specify the
+entry point in the \c{obj} output format (see \k{dotdotstart}).
+
+\C{preproc} The NASM \i{Preprocessor}
+
+NASM contains a powerful \i{macro processor}, which supports
+conditional assembly, multi-level file inclusion, two forms of macro
+(single-line and multi-line), and a `context stack' mechanism for
+extra macro power. Preprocessor directives all begin with a \c{%}
+sign.
+
+\H{slmacro} \i{Single-Line Macros}
+
+\S{define} The Normal Way: \I\c{%idefine}\i\c{%define}
+
+Single-line macros are defined using the \c{%define} preprocessor
+directive. The definitions work in a similar way to C; so you can do
+things like
+
+\c %define ctrl 0x1F &
+\c %define param(a,b) ((a)+(a)*(b))
+\c           mov byte [param(2,ebx)], ctrl 'D'
+
+which will expand to
+
+\c           mov byte [(2)+(2)*(ebx)], 0x1F & 'D'
+
+When the expansion of a single-line macro contains tokens which
+invoke another macro, the expansion is performed at invocation time,
+not at definition time. Thus the code
+
+\c %define a(x) 1+b(x)
+\c %define b(x) 2*x
+\c           mov ax,a(8)
+
+will evaluate in the expected way to \c{mov ax,1+2*8}, even though
+the macro \c{b} wasn't defined at the time of definition of \c{a}.
+
+Macros defined with \c{%define} are \i{case sensitive}: after
+\c{%define foo bar}, only \c{foo} will expand to \c{bar}: \c{Foo} or
+\c{FOO} will not. By using \c{%idefine} instead of \c{%define} (the
+`i' stands for `insensitive') you can define all the case variants
+of a macro at once, so that \c{%idefine foo bar} would cause
+\c{foo}, \c{Foo}, \c{FOO}, \c{fOO} and so on all to expand to
+\c{bar}.
+
+There is a mechanism which detects when a macro call has occurred as
+a result of a previous expansion of the same macro, to guard against
+\i{circular references} and infinite loops. If this happens, the
+preprocessor will only expand the first occurrence of the macro.
+Hence, if you code
+
+\c %define a(x) 1+a(x)
+\c           mov ax,a(3)
+
+the macro \c{a(3)} will expand once, becoming \c{1+a(3)}, and will
+then expand no further. This behaviour can be useful: see \k{32c}
+for an example of its use.
+
+You can \I{overloading, single-line macros}overload single-line
+macros: if you write
+
+\c %define foo(x) 1+x
+\c %define foo(x,y) 1+x*y
+
+the preprocessor will be able to handle both types of macro call,
+by counting the parameters you pass; so \c{foo(3)} will become
+\c{1+3} whereas \c{foo(ebx,2)} will become \c{1+ebx*2}. However, if
+you define
+
+\c %define foo bar
+
+then no other definition of \c{foo} will be accepted: a macro with
+no parameters prohibits the definition of the same name as a macro
+\e{with} parameters, and vice versa.
+
+You can \i{pre-define} single-line macros using the `-d' option on
+the NASM command line: see \k{opt-d}.
+
+\S{assign} \i{Preprocessor Variables}: \i\c{%assign}
+
+An alternative way to define single-line macros is by means of the
+\c{%assign} command (and its \I{case sensitive}case-insensitive
+counterpart \i\c{%iassign}, which differs from \c{%assign} in
+exactly the same way that \c{%idefine} differs from \c{%define}).
+
+\c{%assign} is used to define single-line macros which take no
+parameters and have a numeric value. This value can be specified in
+the form of an expression, and it will be evaluated once, when the
+\c{%assign} directive is processed.
+
+\c{%assign} is useful for controlling the termination of \c{%rep}
+preprocessor loops: see \k{rep} for an example of this. Another
+use for \c{%assign} is given in \k{16c} and \k{32c}.
+
+The expression passed to \c{%assign} is a \i{critical expression}
+(see \k{crit}), and must also evaluate to a pure number (rather than
+a relocatable reference such as a code or data address, or anything
+involving a register).
+
+\H{mlmacro} \i{Multi-Line Macros}: \I\c{%imacro}\i\c{%macro}
+
+Multi-line macros are much more like the type of macro seen in MASM
+and TASM: a multi-line macro definition in NASM looks something like
+this.
+
+\c %macro prologue 1
+\c           push ebp
+\c           mov ebp,esp
+\c           sub esp,%1
+\c %endmacro
+
+This defines a C-like function prologue as a macro: so you would
+invoke the macro with a call such as
+
+\c myfunc:   prologue 12
+
+which would expand to the three lines of code
+
+\c myfunc:   push ebp
+\c           mov ebp,esp
+\c           sub esp,12
+
+The number \c{1} after the macro name in the \c{%macro} line defines
+the number of parameters the macro \c{prologue} expects to receive.
+The use of \c{%1} inside the macro definition refers to the first
+parameter to the macro call. With a macro taking more than one
+parameter, subsequent parameters would be referred to as \c{%2},
+\c{%3} and so on.
+
+Multi-line macros, like single-line macros, are \i{case-sensitive},
+unless you define them using the alternative directive \c{%imacro}.
+
+If you need to pass a comma as \e{part} of a parameter to a
+multi-line macro, you can do that by enclosing the entire parameter
+in \I{braces, around macro parameters}braces. So you could code
+things like
+
+\c %macro silly 2
+\c %2:       db %1
+\c %endmacro
+\c           silly 'a', letter_a    ; letter_a:  db 'a'
+\c           silly 'ab', string_ab  ; string_ab: db 'ab'
+\c           silly {13,10}, crlf    ; crlf:      db 13,10
+
+\S{mlmacover} Overloading Multi-Line Macros\I{overloading, multi-line macros}
+
+As with single-line macros, multi-line macros can be overloaded by
+defining the same macro name several times with different numbers of
+parameters. This time, no exception is made for macros with no
+parameters at all. So you could define
+
+\c %macro prologue 0
+\c           push ebp
+\c           mov ebp,esp
+\c %endmacro
+
+to define an alternative form of the function prologue which
+allocates no local stack space.
+
+Sometimes, however, you might want to `overload' a machine
+instruction; for example, you might want to define
+
+\c %macro push 2
+\c           push %1
+\c           push %2
+\c %endmacro
+
+so that you could code
+
+\c           push ebx               ; this line is not a macro call
+\c           push eax,ecx           ; but this one is
+
+Ordinarily, NASM will give a warning for the first of the above two
+lines, since \c{push} is now defined to be a macro, and is being
+invoked with a number of parameters for which no definition has been
+given. The correct code will still be generated, but the assembler
+will give a warning. This warning can be disabled by the use of the
+\c{-w-macro-params} command-line option (see \k{opt-w}).
+
+\S{maclocal} \i{Macro-Local Labels}
+
+NASM allows you to define labels within a multi-line macro
+definition in such a way as to make them local to the macro call: so
+calling the same macro multiple times will use a different label
+each time. You do this by prefixing \i\c{%%} to the label name. So
+you can invent an instruction which executes a \c{RET} if the \c{Z}
+flag is set by doing this:
+
+\c %macro retz 0
+\c           jnz %%skip
+\c           ret
+\c %%skip:
+\c %endmacro
+
+You can call this macro as many times as you want, and every time
+you call it NASM will make up a different `real' name to substitute
+for the label \c{%%skip}. The names NASM invents are of the form
+\c{..@2345.skip}, where the number 2345 changes with every macro
+call. The \i\c{..@} prefix prevents macro-local labels from
+interfering with the local label mechanism, as described in
+\k{locallab}. You should avoid defining your own labels in this form
+(the \c{..@} prefix, then a number, then another period) in case
+they interfere with macro-local labels.
+
+\S{mlmacgre} \i{Greedy Macro Parameters}
+
+Occasionally it is useful to define a macro which lumps its entire
+command line into one parameter definition, possibly after
+extracting one or two smaller parameters from the front. An example
+might be a macro to write a text string to a file in MS-DOS, where
+you might want to be able to write
+
+\c           writefile [filehandle],"hello, world",13,10
+
+NASM allows you to define the last parameter of a macro to be
+\e{greedy}, meaning that if you invoke the macro with more
+parameters than it expects, all the spare parameters get lumped into
+the last defined one along with the separating commas. So if you
+code:
+
+\c %macro writefile 2+
+\c           jmp %%endstr
+\c %%str:    db %2
+\c %%endstr: mov dx,%%str
+\c           mov cx,%%endstr-%%str
+\c           mov bx,%1
+\c           mov ah,0x40
+\c           int 0x21
+\c %endmacro
+
+then the example call to \c{writefile} above will work as expected:
+the text before the first comma, \c{[filehandle]}, is used as the
+first macro parameter and expanded when \c{%1} is referred to, and
+all the subsequent text is lumped into \c{%2} and placed after the
+\c{db}.
+
+The greedy nature of the macro is indicated to NASM by the use of
+the \I{+ modifier}\c{+} sign after the parameter count on the
+\c{%macro} line.
+
+If you define a greedy macro, you are effectively telling NASM how
+it should expand the macro given \e{any} number of parameters from
+the actual number specified up to infinity; in this case, for
+example, NASM now knows what to do when it sees a call to
+\c{writefile} with 2, 3, 4 or more parameters. NASM will take this
+into account when overloading macros, and will not allow you to
+define another form of \c{writefile} taking 4 parameters (for
+example).
+
+Of course, the above macro could have been implemented as a
+non-greedy macro, in which case the call to it would have had to
+look like
+
+\c           writefile [filehandle], {"hello, world",13,10}
+
+NASM provides both mechanisms for putting \i{commas in macro
+parameters}, and you choose which one you prefer for each macro
+definition.
+
+See \k{sectmac} for a better way to write the above macro.
+
+\S{mlmacdef} \i{Default Macro Parameters}
+
+NASM also allows you to define a multi-line macro with a \e{range}
+of allowable parameter counts. If you do this, you can specify
+defaults for \i{omitted parameters}. So, for example:
+
+\c %macro die 0-1 "Painful program death has occurred."
+\c           writefile 2,%1
+\c           mov ax,0x4c01
+\c           int 0x21
+\c %endmacro
+
+This macro (which makes use of the \c{writefile} macro defined in
+\k{mlmacgre}) can be called with an explicit error message, which it
+will display on the error output stream before exiting, or it can be
+called with no parameters, in which case it will use the default
+error message supplied in the macro definition.
+
+In general, you supply a minimum and maximum number of parameters
+for a macro of this type; the minimum number of parameters is then
+required in the macro call, and then you provide defaults for the
+optional ones. So if a macro definition began with the line
+
+\c %macro foobar 1-3 eax,[ebx+2]
+
+then it could be called with between one and three parameters, and
+\c{%1} would always be taken from the macro call. \c{%2}, if not
+specified by the macro call, would default to \c{eax}, and \c{%3} if
+not specified would default to \c{[ebx+2]}.
+
+You may omit parameter defaults from the macro definition, in which
+case the parameter default is taken to be blank. This can be useful
+for macros which can take a variable number of parameters, since the
+\i\c{%0} token (see \k{percent0}) allows you to determine how many
+parameters were really passed to the macro call.
+
+This defaulting mechanism can be combined with the greedy-parameter
+mechanism; so the \c{die} macro above could be made more powerful,
+and more useful, by changing the first line of the definition to
+
+\c %macro die 0-1+ "Painful program death has occurred.",13,10
+
+The maximum parameter count can be infinite, denoted by \c{*}. In
+this case, of course, it is impossible to provide a \e{full} set of
+default parameters. Examples of this usage are shown in \k{rotate}.
+
+\S{percent0} \i\c{%0}: \I{counting macro parameters}Macro Parameter Counter
+
+For a macro which can take a variable number of parameters, the
+parameter reference \c{%0} will return a numeric constant giving the
+number of parameters passed to the macro. This can be used as an
+argument to \c{%rep} (see \k{rep}) in order to iterate through all
+the parameters of a macro. Examples are given in \k{rotate}.
+
+\S{rotate} \i\c{%rotate}: \i{Rotating Macro Parameters}
+
+Unix shell programmers will be familiar with the \I{shift
+command}\c{shift} shell command, which allows the arguments passed
+to a shell script (referenced as \c{$1}, \c{$2} and so on) to be
+moved left by one place, so that the argument previously referenced
+as \c{$2} becomes available as \c{$1}, and the argument previously
+referenced as \c{$1} is no longer available at all.
+
+NASM provides a similar mechanism, in the form of \c{%rotate}. As
+its name suggests, it differs from the Unix \c{shift} in that no
+parameters are lost: parameters rotated off the left end of the
+argument list reappear on the right, and vice versa.
+
+\c{%rotate} is invoked with a single numeric argument (which may be
+an expression). The macro parameters are rotated to the left by that
+many places. If the argument to \c{%rotate} is negative, the macro
+parameters are rotated to the right.
+
+\I{iterating over macro parameters}So a pair of macros to save and
+restore a set of registers might work as follows:
+
+\c %macro multipush 1-*
+\c %rep %0
+\c           push %1
+\c %rotate 1
+\c %endrep
+\c %endmacro
+
+This macro invokes the \c{PUSH} instruction on each of its arguments
+in turn, from left to right. It begins by pushing its first
+argument, \c{%1}, then invokes \c{%rotate} to move all the arguments
+one place to the left, so that the original second argument is now
+available as \c{%1}. Repeating this procedure as many times as there
+were arguments (achieved by supplying \c{%0} as the argument to
+\c{%rep}) causes each argument in turn to be pushed.
+
+Note also the use of \c{*} as the maximum parameter count,
+indicating that there is no upper limit on the number of parameters
+you may supply to the \i\c{multipush} macro.
+
+It would be convenient, when using this macro, to have a \c{POP}
+equivalent, which \e{didn't} require the arguments to be given in
+reverse order. Ideally, you would write the \c{multipush} macro
+call, then cut-and-paste the line to where the pop needed to be
+done, and change the name of the called macro to \c{multipop}, and
+the macro would take care of popping the registers in the opposite
+order from the one in which they were pushed.
+
+This can be done by the following definition:
+
+\c %macro multipop 1-*
+\c %rep %0
+\c %rotate -1
+\c           pop %1
+\c %endrep
+\c %endmacro
+
+This macro begins by rotating its arguments one place to the
+\e{right}, so that the original \e{last} argument appears as \c{%1}.
+This is then popped, and the arguments are rotated right again, so
+the second-to-last argument becomes \c{%1}. Thus the arguments are
+iterated through in reverse order.
+
+\S{concat} \i{Concatenating Macro Parameters}
+
+NASM can concatenate macro parameters on to other text surrounding
+them. This allows you to declare a family of symbols, for example,
+in a macro definition. If, for example, you wanted to generate a
+table of key codes along with offsets into the table, you could code
+something like
+
+\c %macro keytab_entry 2
+\c keypos%1 equ $-keytab
+\c           db %2
+\c %endmacro
+\c keytab:
+\c           keytab_entry F1,128+1
+\c           keytab_entry F2,128+2
+\c           keytab_entry Return,13
+
+which would expand to
+
+\c keytab:
+\c keyposF1 equ $-keytab
+\c           db 128+1
+\c keyposF2 equ $-keytab
+\c           db 128+2
+\c keyposReturn equ $-keytab
+\c           db 13
+
+You can just as easily concatenate text on to the other end of a
+macro parameter, by writing \c{%1foo}.
+
+If you need to append a \e{digit} to a macro parameter, for example
+defining labels \c{foo1} and \c{foo2} when passed the parameter
+\c{foo}, you can't code \c{%11} because that would be taken as the
+eleventh macro parameter. Instead, you must code
+\I{braces, after % sign}\c{%\{1\}1}, which will separate the first
+\c{1} (giving the number of the macro parameter) from the second
+(literal text to be concatenated to the parameter).
+
+This concatenation can also be applied to other preprocessor in-line
+objects, such as macro-local labels (\k{maclocal}) and context-local
+labels (\k{ctxlocal}). In all cases, ambiguities in syntax can be
+resolved by enclosing everything after the \c{%} sign and before the
+literal text in braces: so \c{%\{%foo\}bar} concatenates the text
+\c{bar} to the end of the real name of the macro-local label
+\c{%%foo}. (This is unnecessary, since the form NASM uses for the
+real names of macro-local labels means that the two usages
+\c{%\{%foo\}bar} and \c{%%foobar} would both expand to the same
+thing anyway; nevertheless, the capability is there.)
+
+\S{mlmaccc} \i{Condition Codes as Macro Parameters}
+
+NASM can give special treatment to a macro parameter which contains
+a condition code. For a start, you can refer to the macro parameter
+\c{%1} by means of the alternative syntax \i\c{%+1}, which informs
+NASM that this macro parameter is supposed to contain a condition
+code, and will cause the preprocessor to report an error message if
+the macro is called with a parameter which is \e{not} a valid
+condition code.
+
+Far more usefully, though, you can refer to the macro parameter by
+means of \i\c{%-1}, which NASM will expand as the \e{inverse}
+condition code. So the \c{retz} macro defined in \k{maclocal} can be
+replaced by a general \i{conditional-return macro} like this:
+
+\c %macro retc 1
+\c           j%-1 %%skip
+\c           ret
+\c %%skip:
+\c %endmacro
+
+This macro can now be invoked using calls like \c{retc ne}, which
+will cause the conditional-jump instruction in the macro expansion
+to come out as \c{JE}, or \c{retc po} which will make the jump a
+\c{JPE}.
+
+The \c{%+1} macro-parameter reference is quite happy to interpret
+the arguments \c{CXZ} and \c{ECXZ} as valid condition codes;
+however, \c{%-1} will report an error if passed either of these,
+because no inverse condition code exists.
+
+\S{nolist} \i{Disabling Listing Expansion}\I\c{.nolist}
+
+When NASM is generating a listing file from your program, it will
+generally expand multi-line macros by means of writing the macro
+call and then listing each line of the expansion. This allows you to
+see which instructions in the macro expansion are generating what
+code; however, for some macros this clutters the listing up
+unnecessarily.
+
+NASM therefore provides the \c{.nolist} qualifier, which you can
+include in a macro definition to inhibit the expansion of the macro
+in the listing file. The \c{.nolist} qualifier comes directly after
+the number of parameters, like this:
+
+\c %macro foo 1.nolist
+
+Or like this:
+
+\c %macro bar 1-5+.nolist a,b,c,d,e,f,g,h
+
+\H{condasm} \i{Conditional Assembly}\I\c{%if}
+
+Similarly to the C preprocessor, NASM allows sections of a source
+file to be assembled only if certain conditions are met. The general
+syntax of this feature looks like this:
+
+\c %if<condition>
+\c ; some code which only appears if <condition> is met
+\c %elif<condition2>
+\c ; only appears if <condition> is not met but <condition2> is
+\c %else
+\c ; this appears if neither <condition> nor <condition2> was met
+\c %endif
+
+The \i\c{%else} clause is optional, as is the \i\c{%elif} clause.
+You can have more than one \c{%elif} clause as well.
+
+\S{ifdef} \i\c{%ifdef}: \i{Testing Single-Line Macro Existence}
+
+Beginning a conditional-assembly block with the line \c{%ifdef
+MACRO} will assemble the subsequent code if, and only if, a
+single-line macro called \c{MACRO} is defined. If not, then the
+\c{%elif} and \c{%else} blocks (if any) will be processed instead.
+
+For example, when debugging a program, you might want to write code
+such as
+
+\c           ; perform some function
+\c %ifdef DEBUG
+\c           writefile 2,"Function performed successfully",13,10
+\c %endif
+\c           ; go and do something else
+
+Then you could use the command-line option \c{-dDEBUG} to create a
+version of the program which produced debugging messages, and remove
+the option to generate the final release version of the program.
+
+You can test for a macro \e{not} being defined by using
+\i\c{%ifndef} instead of \c{%ifdef}. You can also test for macro
+definitions in \c{%elif} blocks by using \i\c{%elifdef} and
+\i\c{%elifndef}.
+
+\S{ifctx} \i\c{%ifctx}: \i{Testing the Context Stack}
+
+The conditional-assembly construct \c{%ifctx ctxname} will cause the
+subsequent code to be assembled if and only if the top context on
+the preprocessor's context stack has the name \c{ctxname}. As with
+\c{%ifdef}, the inverse and \c{%elif} forms \i\c{%ifnctx},
+\i\c{%elifctx} and \i\c{%elifnctx} are also supported.
+
+For more details of the context stack, see \k{ctxstack}. For a
+sample use of \c{%ifctx}, see \k{blockif}.
+
+\S{if} \i\c{%if}: \i{Testing Arbitrary Numeric Expressions}
+
+The conditional-assembly construct \c{%if expr} will cause the
+subsequent code to be assembled if and only if the value of the
+numeric expression \c{expr} is non-zero. An example of the use of
+this feature is in deciding when to break out of a \c{%rep}
+preprocessor loop: see \k{rep} for a detailed example.
+
+The expression given to \c{%if}, and its counterpart \i\c{%elif}, is
+a critical expression (see \k{crit}).
+
+\c{%if} extends the normal NASM expression syntax, by providing a
+set of \i{relational operators} which are not normally available in
+expressions. The operators \i\c{=}, \i\c{<}, \i\c{>}, \i\c{<=},
+\i\c{>=} and \i\c{<>} test equality, less-than, greater-than,
+less-or-equal, greater-or-equal and not-equal respectively. The
+C-like forms \i\c{==} and \i\c{!=} are supported as alternative
+forms of \c{=} and \c{<>}. In addition, low-priority logical
+operators \i\c{&&}, \i\c{^^} and \i\c{||} are provided, supplying
+\i{logical AND}, \i{logical XOR} and \i{logical OR}. These work like
+the C logical operators (although C has no logical XOR), in that
+they always return either 0 or 1, and treat any non-zero input as 1
+(so that \c{^^}, for example, returns 1 if exactly one of its inputs
+is zero, and 0 otherwise). The relational operators also return 1
+for true and 0 for false.
+
+\S{ifidn} \i\c{%ifidn} and \i\c{%ifidni}: \i{Testing Exact Text
+Identity}
+
+The construct \c{%ifidn text1,text2} will cause the subsequent code
+to be assembled if and only if \c{text1} and \c{text2}, after
+expanding single-line macros, are identical pieces of text.
+Differences in white space are not counted.
+
+\c{%ifidni} is similar to \c{%ifidn}, but is \i{case-insensitive}.
+
+For example, the following macro pushes a register or number on the
+stack, and allows you to treat \c{IP} as a real register:
+
+\c %macro pushparam 1
+\c %ifidni %1,ip
+\c           call %%label
+\c %%label:
+\c %else
+\c           push %1
+\c %endif
+\c %endmacro
+
+Like most other \c{%if} constructs, \c{%ifidn} has a counterpart
+\i\c{%elifidn}, and negative forms \i\c{%ifnidn} and \i\c{%elifnidn}.
+Similarly, \c{%ifidni} has counterparts \i\c{%elifidni},
+\i\c{%ifnidni} and \i\c{%elifnidni}.
+
+\S{iftyp} \i\c{%ifid}, \i\c{%ifnum}, \i\c{%ifstr}: \i{Testing Token
+Types}
+
+Some macros will want to perform different tasks depending on
+whether they are passed a number, a string, or an identifier. For
+example, a string output macro might want to be able to cope with
+being passed either a string constant or a pointer to an existing
+string.
+
+The conditional assembly construct \c{%ifid}, taking one parameter
+(which may be blank), assembles the subsequent code if and only if
+the first token in the parameter exists and is an identifier.
+\c{%ifnum} works similarly, but tests for the token being a numeric
+constant; \c{%ifstr} tests for it being a string.
+
+For example, the \c{writefile} macro defined in \k{mlmacgre} can be
+extended to take advantage of \c{%ifstr} in the following fashion:
+
+\c %macro writefile 2-3+
+\c %ifstr %2
+\c           jmp %%endstr
+\c %if %0 = 3
+\c %%str:	  db %2,%3
+\c %else
+\c %%str:	  db %2
+\c %endif
+\c %%endstr: mov dx,%%str
+\c           mov cx,%%endstr-%%str
+\c %else
+\c 	  mov dx,%2
+\c 	  mov cx,%3
+\c %endif
+\c           mov bx,%1
+\c           mov ah,0x40
+\c           int 0x21
+\c %endmacro
+
+Then the \c{writefile} macro can cope with being called in either of
+the following two ways:
+
+\c           writefile [file], strpointer, length
+\c           writefile [file], "hello", 13, 10
+
+In the first, \c{strpointer} is used as the address of an
+already-declared string, and \c{length} is used as its length; in
+the second, a string is given to the macro, which therefore declares
+it itself and works out the address and length for itself.
+
+Note the use of \c{%if} inside the \c{%ifstr}: this is to detect
+whether the macro was passed two arguments (so the string would be a
+single string constant, and \c{db %2} would be adequate) or more (in
+which case, all but the first two would be lumped together into
+\c{%3}, and \c{db %2,%3} would be required).
+
+\I\c{%ifnid}\I\c{%elifid}\I\c{%elifnid}\I\c{%ifnnum}\I\c{%elifnum}\I\c{%elifnnum}\I\c{%ifnstr}\I\c{%elifstr}\I\c{%elifnstr}
+The usual \c{%elifXXX}, \c{%ifnXXX} and \c{%elifnXXX} versions exist
+for each of \c{%ifid}, \c{%ifnum} and \c{%ifstr}.
+
+\S{pperror} \i\c{%error}: Reporting \i{User-Defined Errors}
+
+The preprocessor directive \c{%error} will cause NASM to report an
+error if it occurs in assembled code. So if other users are going to
+try to assemble your source files, you can ensure that they define
+the right macros by means of code like this:
+
+\c %ifdef SOME_MACRO
+\c ; do some setup
+\c %elifdef SOME_OTHER_MACRO
+\c ; do some different setup
+\c %else
+\c %error Neither SOME_MACRO nor SOME_OTHER_MACRO was defined.
+\c %endif
+
+Then any user who fails to understand the way your code is supposed
+to be assembled will be quickly warned of their mistake, rather than
+having to wait until the program crashes on being run and then not
+knowing what went wrong.
+
+\H{rep} \i{Preprocessor Loops}\I{repeating code}: \i\c{%rep}
+
+NASM's \c{TIMES} prefix, though useful, cannot be used to invoke a
+multi-line macro multiple times, because it is processed by NASM
+after macros have already been expanded. Therefore NASM provides
+another form of loop, this time at the preprocessor level: \c{%rep}.
+
+The directives \c{%rep} and \i\c{%endrep} (\c{%rep} takes a numeric
+argument, which can be an expression; \c{%endrep} takes no
+arguments) can be used to enclose a chunk of code, which is then
+replicated as many times as specified by the preprocessor:
+
+\c %assign i 0
+\c %rep 64
+\c           inc word [table+2*i]
+\c %assign i i+1
+\c %endrep
+
+This will generate a sequence of 64 \c{INC} instructions,
+incrementing every word of memory from \c{[table]} to
+\c{[table+126]}.
+
+For more complex termination conditions, or to break out of a repeat
+loop part way along, you can use the \i\c{%exitrep} directive to
+terminate the loop, like this:
+
+\c fibonacci:
+\c %assign i 0
+\c %assign j 1
+\c %rep 100
+\c %if j > 65535
+\c %exitrep
+\c %endif
+\c           dw j
+\c %assign k j+i
+\c %assign i j
+\c %assign j k
+\c %endrep
+\c fib_number equ ($-fibonacci)/2
+
+This produces a list of all the Fibonacci numbers that will fit in
+16 bits. Note that a maximum repeat count must still be given to
+\c{%rep}. This is to prevent the possibility of NASM getting into an
+infinite loop in the preprocessor, which (on multitasking or
+multi-user systems) would typically cause all the system memory to
+be gradually used up and other applications to start crashing.
+
+\H{include} \i{Including Other Files}
+
+Using, once again, a very similar syntax to the C preprocessor,
+NASM's preprocessor lets you include other source files into your
+code. This is done by the use of the \i\c{%include} directive:
+
+\c %include "macros.mac"
+
+will include the contents of the file \c{macros.mac} into the source
+file containing the \c{%include} directive.
+
+Include files are \I{searching for include files}searched for in the
+current directory (the directory you're in when you run NASM, as
+opposed to the location of the NASM executable or the location of
+the source file), plus any directories specified on the NASM command
+line using the \c{-i} option.
+
+The standard C idiom for preventing a file being included more than
+once is just as applicable in NASM: if the file \c{macros.mac} has
+the form
+
+\c %ifndef MACROS_MAC
+\c %define MACROS_MAC
+\c ; now define some macros
+\c %endif
+
+then including the file more than once will not cause errors: the
+second time the file is included, nothing will happen, because the
+macro \c{MACROS_MAC} will already be defined.
+
+You can force a file to be included even if there is no \c{%include}
+directive that explicitly includes it, by using the \i\c{-p} option
+on the NASM command line (see \k{opt-p}).
+
+\H{ctxstack} The \i{Context Stack}
+
+Having labels that are local to a macro definition is sometimes not
+quite powerful enough: sometimes you want to be able to share labels
+between several macro calls. An example might be a \c{REPEAT} ...
+\c{UNTIL} loop, in which the expansion of the \c{REPEAT} macro
+would need to be able to refer to a label which the \c{UNTIL} macro
+had defined. However, for such a macro you would also want to be
+able to nest these loops.
+
+NASM provides this level of power by means of a \e{context stack}.
+The preprocessor maintains a stack of \e{contexts}, each of which is
+characterised by a name. You add a new context to the stack using
+the \i\c{%push} directive, and remove one using \i\c{%pop}. You can
+define labels that are local to a particular context on the stack.
+
+\S{pushpop} \i\c{%push} and \i\c{%pop}: \I{creating
+contexts}\I{removing contexts}Creating and Removing Contexts
+
+The \c{%push} directive is used to create a new context and place it
+on the top of the context stack. \c{%push} requires one argument,
+which is the name of the context. For example:
+
+\c %push foobar
+
+This pushes a new context called \c{foobar} on the stack. You can
+have several contexts on the stack with the same name: they can
+still be distinguished.
+
+The directive \c{%pop}, requiring no arguments, removes the top
+context from the context stack and destroys it, along with any
+labels associated with it.
+
+\S{ctxlocal} \i{Context-Local Labels}
+
+Just as the usage \c{%%foo} defines a label which is local to the
+particular macro call in which it is used, the usage \I{%$}\c{%$foo}
+is used to define a label which is local to the context on the top
+of the context stack. So the \c{REPEAT} and \c{UNTIL} example given
+above could be implemented by means of:
+
+\c %macro repeat 0
+\c %push repeat
+\c %$begin:
+\c %endmacro
+
+\c %macro until 1
+\c           j%-1 %$begin
+\c %pop
+\c %endmacro
+
+and invoked by means of, for example,
+
+\c           mov di,string
+\c           repeat
+\c           add di,3
+\c           scasb
+\c           until e
+
+which would scan every fourth byte of a string in search of the byte
+in \c{AL}.
+
+If you need to define, or access, labels local to the context
+\e{below} the top one on the stack, you can use \I{%$$}\c{%$$foo}, or
+\c{%$$$foo} for the context below that, and so on.
+
+\S{ctxdefine} \i{Context-Local Single-Line Macros}
+
+NASM also allows you to define single-line macros which are local to
+a particular context, in just the same way:
+
+\c %define %$localmac 3
+
+will define the single-line macro \c{%$localmac} to be local to the
+top context on the stack. Of course, after a subsequent \c{%push},
+it can then still be accessed by the name \c{%$$localmac}.
+
+\S{ctxrepl} \i\c{%repl}: \I{renaming contexts}Renaming a Context
+
+If you need to change the name of the top context on the stack (in
+order, for example, to have it respond differently to \c{%ifctx}),
+you can execute a \c{%pop} followed by a \c{%push}; but this will
+have the side effect of destroying all context-local labels and
+macros associated with the context that was just popped.
+
+NASM provides the directive \c{%repl}, which \e{replaces} a context
+with a different name, without touching the associated macros and
+labels. So you could replace the destructive code
+
+\c %pop
+\c %push newname
+
+with the non-destructive version \c{%repl newname}.
+
+\S{blockif} Example Use of the \i{Context Stack}: \i{Block IFs}
+
+This example makes use of almost all the context-stack features,
+including the conditional-assembly construct \i\c{%ifctx}, to
+implement a block IF statement as a set of macros.
+
+\c %macro if 1
+\c     %push if
+\c     j%-1 %$ifnot
+\c %endmacro
+
+\c %macro else 0
+\c     %ifctx if
+\c         %repl else
+\c         jmp %$ifend
+\c         %$ifnot:
+\c     %else
+\c         %error "expected `if' before `else'"
+\c     %endif
+\c %endmacro
+
+\c %macro endif 0
+\c     %ifctx if
+\c         %$ifnot:
+\c         %pop
+\c     %elifctx else
+\c         %$ifend:
+\c         %pop
+\c     %else
+\c         %error "expected `if' or `else' before `endif'"
+\c     %endif
+\c %endmacro
+
+This code is more robust than the \c{REPEAT} and \c{UNTIL} macros
+given in \k{ctxlocal}, because it uses conditional assembly to check
+that the macros are issued in the right order (for example, not
+calling \c{endif} before \c{if}) and issues a \c{%error} if they're
+not.
+
+In addition, the \c{endif} macro has to be able to cope with the two
+distinct cases of either directly following an \c{if}, or following
+an \c{else}. It achieves this, again, by using conditional assembly
+to do different things depending on whether the context on top of
+the stack is \c{if} or \c{else}.
+
+The \c{else} macro has to preserve the context on the stack, in
+order to have the \c{%$ifnot} referred to by the \c{if} macro be the
+same as the one defined by the \c{else} macro itself, but has to
+change the context's name so that \c{endif} will know there was an
+intervening \c{else}. It does this by the use of \c{%repl}.
+
+A sample usage of these macros might look like:
+
+\c           cmp ax,bx
+\c           if ae
+\c             cmp bx,cx
+\c             if ae
+\c               mov ax,cx
+\c             else
+\c               mov ax,bx
+\c             endif
+\c           else
+\c             cmp ax,cx
+\c             if ae
+\c               mov ax,cx
+\c             endif
+\c           endif
+
+The block-\c{IF} macros handle nesting quite happily, by means of
+pushing another context, describing the inner \c{if}, on top of the
+one describing the outer \c{if}; thus \c{else} and \c{endif} always
+refer to the last unmatched \c{if} or \c{else}.
+
+\H{stdmac} \i{Standard Macros}
+
+NASM defines a set of standard macros, which are already defined
+when it starts to process any source file. If you really need a
+program to be assembled with no pre-defined macros, you can use the
+\i\c{%clear} directive to empty the preprocessor of everything.
+
+Most \i{user-level assembler directives} (see \k{directive}) are
+implemented as macros which invoke primitive directives; these are
+described in \k{directive}. The rest of the standard macro set is
+described here.
+
+\S{stdmacver} \i\c{__NASM_MAJOR__} and \i\c{__NASM_MINOR__}: \i{NASM
+Version}
+
+The single-line macros \c{__NASM_MAJOR__} and \c{__NASM_MINOR__}
+expand to the major and minor parts of the \i{version number of
+NASM} being used. So, under NASM 0.96 for example,
+\c{__NASM_MAJOR__} would be defined to be 0 and \c{__NASM_MINOR__}
+would be defined as 96.
+
+\S{fileline} \i\c{__FILE__} and \i\c{__LINE__}: File Name and Line Number
+
+Like the C preprocessor, NASM allows the user to find out the file
+name and line number containing the current instruction. The macro
+\c{__FILE__} expands to a string constant giving the name of the
+current input file (which may change through the course of assembly
+if \c{%include} directives are used), and \c{__LINE__} expands to a
+numeric constant giving the current line number in the input file.
+
+These macros could be used, for example, to communicate debugging
+information to a macro, since invoking \c{__LINE__} inside a macro
+definition (either single-line or multi-line) will return the line
+number of the macro \e{call}, rather than \e{definition}. So to
+determine where in a piece of code a crash is occurring, for
+example, one could write a routine \c{stillhere}, which is passed a
+line number in \c{EAX} and outputs something like `line 155: still
+here'. You could then write a macro
+
+\c %macro notdeadyet 0
+\c           push eax
+\c           mov eax,__LINE__
+\c           call stillhere
+\c           pop eax
+\c %endmacro
+
+and then pepper your code with calls to \c{notdeadyet} until you
+find the crash point.
+
+\S{struc} \i\c{STRUC} and \i\c{ENDSTRUC}: \i{Declaring Structure} Data Types
+
+The core of NASM contains no intrinsic means of defining data
+structures; instead, the preprocessor is sufficiently powerful that
+data structures can be implemented as a set of macros. The macros
+\c{STRUC} and \c{ENDSTRUC} are used to define a structure data type.
+
+\c{STRUC} takes one parameter, which is the name of the data type.
+This name is defined as a symbol with the value zero, and also has
+the suffix \c{_size} appended to it and is then defined as an
+\c{EQU} giving the size of the structure. Once \c{STRUC} has been
+issued, you are defining the structure, and should define fields
+using the \c{RESB} family of pseudo-instructions, and then invoke
+\c{ENDSTRUC} to finish the definition.
+
+For example, to define a structure called \c{mytype} containing a
+longword, a word, a byte and a string of bytes, you might code
+
+\c           struc mytype
+\c mt_long:  resd 1
+\c mt_word:  resw 1
+\c mt_byte:  resb 1
+\c mt_str:   resb 32
+\c           endstruc
+
+The above code defines six symbols: \c{mt_long} as 0 (the offset
+from the beginning of a \c{mytype} structure to the longword field),
+\c{mt_word} as 4, \c{mt_byte} as 6, \c{mt_str} as 7, \c{mytype_size}
+as 39, and \c{mytype} itself as zero.
+
+The reason why the structure type name is defined at zero is a side
+effect of allowing structures to work with the local label
+mechanism: if your structure members tend to have the same names in
+more than one structure, you can define the above structure like this:
+
+\c           struc mytype
+\c .long:    resd 1
+\c .word:    resw 1
+\c .byte:    resb 1
+\c .str:     resb 32
+\c           endstruc
+
+This defines the offsets to the structure fields as \c{mytype.long},
+\c{mytype.word}, \c{mytype.byte} and \c{mytype.str}.
+
+NASM, since it has no \e{intrinsic} structure support, does not
+support any form of period notation to refer to the elements of a
+structure once you have one (except the above local-label notation),
+so code such as \c{mov ax,[mystruc.mt_word]} is not valid.
+\c{mt_word} is a constant just like any other constant, so the
+correct syntax is \c{mov ax,[mystruc+mt_word]} or \c{mov
+ax,[mystruc+mytype.word]}.
+
+\S{istruc} \i\c{ISTRUC}, \i\c{AT} and \i\c{IEND}: Declaring
+\i{Instances of Structures}
+
+Having defined a structure type, the next thing you typically want
+to do is to declare instances of that structure in your data
+segment. NASM provides an easy way to do this in the \c{ISTRUC}
+mechanism. To declare a structure of type \c{mytype} in a program,
+you code something like this:
+
+\c mystruc:  istruc mytype
+\c           at mt_long, dd 123456
+\c           at mt_word, dw 1024
+\c           at mt_byte, db 'x'
+\c           at mt_str, db 'hello, world', 13, 10, 0
+\c           iend
+
+The function of the \c{AT} macro is to make use of the \c{TIMES}
+prefix to advance the assembly position to the correct point for the
+specified structure field, and then to declare the specified data.
+Therefore the structure fields must be declared in the same order as
+they were specified in the structure definition.
+
+If the data to go in a structure field requires more than one source
+line to specify, the remaining source lines can easily come after
+the \c{AT} line. For example:
+
+\c           at mt_str, db 123,134,145,156,167,178,189
+\c           db 190,100,0
+
+Depending on personal taste, you can also omit the code part of the
+\c{AT} line completely, and start the structure field on the next
+line:
+
+\c           at mt_str
+\c           db 'hello, world'
+\c           db 13,10,0
+
+\S{align} \i\c{ALIGN} and \i\c{ALIGNB}: Data Alignment
+
+The \c{ALIGN} and \c{ALIGNB} macros provide a convenient way to
+align code or data on a word, longword, paragraph or other boundary.
+(Some assemblers call this directive \i\c{EVEN}.) The syntax of the
+\c{ALIGN} and \c{ALIGNB} macros is
+
+\c           align 4                ; align on 4-byte boundary
+\c           align 16               ; align on 16-byte boundary
+\c           align 8,db 0           ; pad with 0s rather than NOPs
+\c           align 4,resb 1         ; align to 4 in the BSS
+\c           alignb 4               ; equivalent to previous line
+
+Both macros require their first argument to be a power of two; they
+both compute the number of additional bytes required to bring the
+length of the current section up to a multiple of that power of two,
+and then apply the \c{TIMES} prefix to their second argument to
+perform the alignment.
+
+If the second argument is not specified, the default for \c{ALIGN}
+is \c{NOP}, and the default for \c{ALIGNB} is \c{RESB 1}. So if the
+second argument is specified, the two macros are equivalent.
+Normally, you can just use \c{ALIGN} in code and data sections and
+\c{ALIGNB} in BSS sections, and never need the second argument
+except for special purposes.
+
+\c{ALIGN} and \c{ALIGNB}, being simple macros, perform no error
+checking: they cannot warn you if their first argument fails to be a
+power of two, or if their second argument generates more than one
+byte of code. In each of these cases they will silently do the wrong
+thing.
+
+\c{ALIGNB} (or \c{ALIGN} with a second argument of \c{RESB 1}) can
+be used within structure definitions:
+
+\c           struc mytype2
+\c mt_byte:  resb 1
+\c           alignb 2
+\c mt_word:  resw 1
+\c           alignb 4
+\c mt_long:  resd 1
+\c mt_str:   resb 32
+\c           endstruc
+
+This will ensure that the structure members are sensibly aligned
+relative to the base of the structure.
+
+A final caveat: \c{ALIGN} and \c{ALIGNB} work relative to the
+beginning of the \e{section}, not the beginning of the address space
+in the final executable. Aligning to a 16-byte boundary when the
+section you're in is only guaranteed to be aligned to a 4-byte
+boundary, for example, is a waste of effort. Again, NASM does not
+check that the section's alignment characteristics are sensible for
+the use of \c{ALIGN} or \c{ALIGNB}.
+
+\C{directive} \i{Assembler Directives}
+
+NASM, though it attempts to avoid the bureaucracy of assemblers like
+MASM and TASM, is nevertheless forced to support a \e{few}
+directives. These are described in this chapter.
+
+NASM's directives come in two types: \I{user-level
+directives}\e{user-level} directives and \I{primitive
+directives}\e{primitive} directives. Typically, each directive has a
+user-level form and a primitive form. In almost all cases, we
+recommend that users use the user-level forms of the directives,
+which are implemented as macros which call the primitive forms.
+
+Primitive directives are enclosed in square brackets; user-level
+directives are not.
+
+In addition to the universal directives described in this chapter,
+each object file format can optionally supply extra directives in
+order to control particular features of that file format. These
+\I{format-specific directives}\e{format-specific} directives are
+documented along with the formats that implement them, in \k{outfmt}.
+
+\H{bits} \i\c{BITS}: Specifying Target \i{Processor Mode}
+
+The \c{BITS} directive specifies whether NASM should generate code
+\I{16-bit mode, versus 32-bit mode}designed to run on a processor
+operating in 16-bit mode, or code designed to run on a processor
+operating in 32-bit mode. The syntax is \c{BITS 16} or \c{BITS 32}.
+
+In most cases, you should not need to use \c{BITS} explicitly. The
+\c{aout}, \c{coff}, \c{elf} and \c{win32} object formats, which are
+designed for use in 32-bit operating systems, all cause NASM to
+select 32-bit mode by default. The \c{obj} object format allows you
+to specify each segment you define as either \c{USE16} or \c{USE32},
+and NASM will set its operating mode accordingly, so the use of the
+\c{BITS} directive is once again unnecessary.
+
+The most likely reason for using the \c{BITS} directive is to write
+32-bit code in a flat binary file; this is because the \c{bin}
+output format defaults to 16-bit mode in anticipation of it being
+used most frequently to write DOS \c{.COM} programs, DOS \c{.SYS}
+device drivers and boot loader software.
+
+You do \e{not} need to specify \c{BITS 32} merely in order to use
+32-bit instructions in a 16-bit DOS program; if you do, the
+assembler will generate incorrect code because it will be writing
+code targeted at a 32-bit platform, to be run on a 16-bit one.
+
+When NASM is in \c{BITS 16} state, instructions which use 32-bit
+data are prefixed with an 0x66 byte, and those referring to 32-bit
+addresses have an 0x67 prefix. In \c{BITS 32} state, the reverse is
+true: 32-bit instructions require no prefixes, whereas instructions
+using 16-bit data need an 0x66 and those working in 16-bit addresses
+need an 0x67.
+
+The \c{BITS} directive has an exactly equivalent primitive form,
+\c{[BITS 16]} and \c{[BITS 32]}. The user-level form is a macro
+which has no function other than to call the primitive form.
+
+\H{section} \i\c{SECTION} or \i\c{SEGMENT}: Changing and \i{Defining
+Sections}
+
+\I{changing sections}\I{switching between sections}The \c{SECTION}
+directive (\c{SEGMENT} is an exactly equivalent synonym) changes
+which section of the output file the code you write will be
+assembled into. In some object file formats, the number and names of
+sections are fixed; in others, the user may make up as many as they
+wish. Hence \c{SECTION} may sometimes give an error message, or may
+define a new section, if you try to switch to a section that does
+not (yet) exist.
+
+The Unix object formats, and the \c{bin} object format, all support
+the \i{standardised section names} \c{.text}, \c{.data} and \c{.bss}
+for the code, data and uninitialised-data sections. The \c{obj}
+format, by contrast, does not recognise these section names as being
+special, and indeed will strip off the leading period of any section
+name that has one.
+
+\S{sectmac} The \i\c{__SECT__} Macro
+
+The \c{SECTION} directive is unusual in that its user-level form
+functions differently from its primitive form. The primitive form,
+\c{[SECTION xyz]}, simply switches the current target section to the
+one given. The user-level form, \c{SECTION xyz}, however, first
+defines the single-line macro \c{__SECT__} to be the primitive
+\c{[SECTION]} directive which it is about to issue, and then issues
+it. So the user-level directive
+
+\c           SECTION .text
+
+expands to the two lines
+
+\c %define __SECT__ [SECTION .text]
+\c           [SECTION .text]
+
+Users may find it useful to make use of this in their own macros.
+For example, the \c{writefile} macro defined in \k{mlmacgre} can be
+usefully rewritten in the following more sophisticated form:
+
+\c %macro writefile 2+
+\c           [section .data]
+\c %%str:    db %2
+\c %%endstr:
+\c           __SECT__
+\c           mov dx,%%str
+\c           mov cx,%%endstr-%%str
+\c           mov bx,%1
+\c           mov ah,0x40
+\c           int 0x21
+\c %endmacro
+
+This form of the macro, once passed a string to output, first
+switches temporarily to the data section of the file, using the
+primitive form of the \c{SECTION} directive so as not to modify
+\c{__SECT__}. It then declares its string in the data section, and
+then invokes \c{__SECT__} to switch back to \e{whichever} section
+the user was previously working in. It thus avoids the need, in the
+previous version of the macro, to include a \c{JMP} instruction to
+jump over the data, and also does not fail if, in a complicated
+\c{OBJ} format module, the user could potentially be assembling the
+code in any of several separate code sections.
+
+\H{absolute} \i\c{ABSOLUTE}: Defining Absolute Labels
+
+The \c{ABSOLUTE} directive can be thought of as an alternative form
+of \c{SECTION}: it causes the subsequent code to be directed at no
+physical section, but at the hypothetical section starting at the
+given absolute address. The only instructions you can use in this
+mode are the \c{RESB} family.
+
+\c{ABSOLUTE} is used as follows:
+
+\c           absolute 0x1A
+\c kbuf_chr  resw 1
+\c kbuf_free resw 1
+\c kbuf      resw 16
+
+This example describes a section of the PC BIOS data area, at
+segment address 0x40: the above code defines \c{kbuf_chr} to be
+0x1A, \c{kbuf_free} to be 0x1C, and \c{kbuf} to be 0x1E.
+
+The user-level form of \c{ABSOLUTE}, like that of \c{SECTION},
+redefines the \i\c{__SECT__} macro when it is invoked.
+
+\i\c{STRUC} and \i\c{ENDSTRUC} are defined as macros which use
+\c{ABSOLUTE} (and also \c{__SECT__}).
+
+\c{ABSOLUTE} doesn't have to take an absolute constant as an
+argument: it can take an expression (actually, a \i{critical
+expression}: see \k{crit}) and it can be a value in a segment. For
+example, a TSR can re-use its setup code as run-time BSS like this:
+
+\c           org 100h               ; it's a .COM program
+\c           jmp setup              ; setup code comes last
+\c           ; the resident part of the TSR goes here
+\c setup:    ; now write the code that installs the TSR here
+\c           absolute setup
+\c runtimevar1 resw 1
+\c runtimevar2 resd 20
+\c tsr_end:
+
+This defines some variables `on top of' the setup code, so that
+after the setup has finished running, the space it took up can be
+re-used as data storage for the running TSR. The symbol `tsr_end'
+can be used to calculate the total size of the part of the TSR that
+needs to be made resident.
+
+\H{extern} \i\c{EXTERN}: \i{Importing Symbols} from Other Modules
+
+\c{EXTERN} is similar to the MASM directive \c{EXTRN} and the C
+keyword \c{extern}: it is used to declare a symbol which is not
+defined anywhere in the module being assembled, but is assumed to be
+defined in some other module and needs to be referred to by this
+one. Not every object-file format can support external variables:
+the \c{bin} format cannot.
+
+The \c{EXTERN} directive takes as many arguments as you like. Each
+argument is the name of a symbol:
+
+\c           extern _printf
+\c           extern _sscanf,_fscanf
+
+Some object-file formats provide extra features to the \c{EXTERN}
+directive. In all cases, the extra features are used by suffixing a
+colon to the symbol name followed by object-format specific text.
+For example, the \c{obj} format allows you to declare that the
+default segment base of an external should be the group \c{dgroup}
+by means of the directive
+
+\c           extern _variable:wrt dgroup
+
+The primitive form of \c{EXTERN} differs from the user-level form
+only in that it can take only one argument at a time: the support
+for multiple arguments is implemented at the preprocessor level.
+
+You can declare the same variable as \c{EXTERN} more than once: NASM
+will quietly ignore the second and later redeclarations. You can't
+declare a variable as \c{EXTERN} as well as something else, though.
+
+\H{global} \i\c{GLOBAL}: \i{Exporting Symbols} to Other Modules
+
+\c{GLOBAL} is the other end of \c{EXTERN}: if one module declares a
+symbol as \c{EXTERN} and refers to it, then in order to prevent
+linker errors, some other module must actually \e{define} the
+symbol and declare it as \c{GLOBAL}. Some assemblers use the name
+\i\c{PUBLIC} for this purpose.
+
+The \c{GLOBAL} directive applying to a symbol must appear \e{before}
+the definition of the symbol.
+
+\c{GLOBAL} uses the same syntax as \c{EXTERN}, except that it must
+refer to symbols which \e{are} defined in the same module as the
+\c{GLOBAL} directive. For example:
+
+\c           global _main
+\c _main:    ; some code
+
+\c{GLOBAL}, like \c{EXTERN}, allows object formats to define private
+extensions by means of a colon. The \c{elf} object format, for
+example, lets you specify whether global data items are functions or
+data:
+
+\c           global hashlookup:function, hashtable:data
+
+Like \c{EXTERN}, the primitive form of \c{GLOBAL} differs from the
+user-level form only in that it can take only one argument at a
+time.
+
+\H{common} \i\c{COMMON}: Defining Common Data Areas
+
+The \c{COMMON} directive is used to declare \i\e{common variables}.
+A common variable is much like a global variable declared in the
+uninitialised data section, so that
+
+\c           common intvar 4
+
+is similar in function to
+
+\c           global intvar
+\c           section .bss
+\c intvar    resd 1
+
+The difference is that if more than one module defines the same
+common variable, then at link time those variables will be
+\e{merged}, and references to \c{intvar} in all modules will point
+at the same piece of memory.
+
+Like \c{GLOBAL} and \c{EXTERN}, \c{COMMON} supports object-format
+specific extensions. For example, the \c{obj} format allows common
+variables to be NEAR or FAR, and the \c{elf} format allows you to
+specify the alignment requirements of a common variable:
+
+\c           common commvar 4:near  ; works in OBJ
+\c           common intarray 100:4  ; works in ELF: 4 byte aligned
+
+Once again, like \c{EXTERN} and \c{GLOBAL}, the primitive form of
+\c{COMMON} differs from the user-level form only in that it can take
+only one argument at a time.
+
+\C{outfmt} \i{Output Formats}
+
+NASM is a portable assembler, designed to be able to compile on any
+ANSI C-supporting platform and produce output to run on a variety of
+Intel x86 operating systems. For this reason, it has a large number
+of available output formats, selected using the \i\c{-f} option on
+the NASM \i{command line}. Each of these formats, along with its
+extensions to the base NASM syntax, is detailed in this chapter.
+
+As stated in \k{opt-o}, NASM chooses a \i{default name} for your
+output file based on the input file name and the chosen output
+format. This will be generated by removing the \i{extension}
+(\c{.asm}, \c{.s}, or whatever you like to use) from the input file
+name, and substituting an extension defined by the output format.
+The extensions are given with each format below.
+
+\H{binfmt} \i\c{bin}: \i{Flat-Form Binary}\I{pure binary} Output
+
+The \c{bin} format does not produce object files: it generates
+nothing in the output file except the code you wrote. Such `pure
+binary' files are used by \i{MS-DOS}: \i\c{.COM} executables and
+\i\c{.SYS} device drivers are pure binary files. Pure binary output
+is also useful for \i{operating-system} and \i{boot loader}
+development.
+
+\c{bin} supports the three \i{standardised section names} \i\c{.text},
+\i\c{.data} and \i\c{.bss} only. The file NASM outputs will contain the
+contents of the \c{.text} section first, followed by the contents of
+the \c{.data} section, aligned on a four-byte boundary. The \c{.bss}
+section is not stored in the output file at all, but is assumed to
+appear directly after the end of the \c{.data} section, again
+aligned on a four-byte boundary.
+
+If you specify no explicit \c{SECTION} directive, the code you write
+will be directed by default into the \c{.text} section.
+
+Using the \c{bin} format puts NASM by default into 16-bit mode (see
+\k{bits}). In order to use \c{bin} to write 32-bit code such as an
+OS kernel, you need to explicitly issue the \I\c{BITS}\c{BITS 32}
+directive.
+
+\c{bin} has no default output file name extension: instead, it
+leaves your file name as it is once the original extension has been
+removed. Thus, the default is for NASM to assemble \c{binprog.asm}
+into a binary file called \c{binprog}.
+
+\S{org} \i\c{ORG}: Binary File \i{Program Origin}
+
+The \c{bin} format provides an additional directive to the list
+given in \k{directive}: \c{ORG}. The function of the \c{ORG}
+directive is to specify the origin address which NASM will assume
+the program begins at when it is loaded into memory.
+
+For example, the following code will generate the longword
+\c{0x00000104}:
+
+\c           org 0x100
+\c           dd label
+\c label:
+
+Unlike the \c{ORG} directive provided by MASM-compatible assemblers,
+which allows you to jump around in the object file and overwrite
+code you have already generated, NASM's \c{ORG} does exactly what
+the directive says: \e{origin}. Its sole function is to specify one
+offset which is added to all internal address references within the
+file; it does not permit any of the trickery that MASM's version
+does. See \k{proborg} for further comments.
+
+\S{binseg} \c{bin} Extensions to the \c{SECTION}
+Directive\I{SECTION, bin extensions to}
+
+The \c{bin} output format extends the \c{SECTION} (or \c{SEGMENT})
+directive to allow you to specify the alignment requirements of
+segments. This is done by appending the \i\c{ALIGN} qualifier to the
+end of the section-definition line. For example,
+
+\c           section .data align=16
+
+switches to the section \c{.data} and also specifies that it must be
+aligned on a 16-byte boundary.
+
+The parameter to \c{ALIGN} is the alignment boundary in bytes: the
+section start address is forced to be a multiple of this value. The
+alignment value given may be any power of two.\I{section alignment, in
+bin}\I{segment alignment, in bin}\I{alignment, in bin sections}
+
+\H{objfmt} \i\c{obj}: \i{Microsoft OMF}\I{OMF} Object Files
+
+The \c{obj} file format (NASM calls it \c{obj} rather than \c{omf}
+for historical reasons) is the one produced by \i{MASM} and
+\i{TASM}, which is typically fed to 16-bit DOS linkers to produce
+\i\c{.EXE} files. It is also the format used by \i{OS/2}.
+
+\c{obj} provides a default output file-name extension of \c{.obj}.
+
+\c{obj} is not exclusively a 16-bit format, though: NASM has full
+support for the 32-bit extensions to the format. In particular,
+32-bit \c{obj} format files are used by \i{Borland's Win32
+compilers}, instead of using Microsoft's newer \i\c{win32} object
+file format.
+
+The \c{obj} format does not define any special segment names: you
+can call your segments anything you like. Typical names for segments
+in \c{obj} format files are \c{CODE}, \c{DATA} and \c{BSS}.
+
+If your source file contains code before specifying an explicit
+\c{SEGMENT} directive, then NASM will invent its own segment called
+\i\c{__NASMDEFSEG} for you.
+
+When you define a segment in an \c{obj} file, NASM defines the
+segment name as a symbol as well, so that you can access the segment
+address of the segment. So, for example:
+
+\c           segment data
+\c dvar:     dw 1234
+\c           segment code
+\c function: mov ax,data            ; get segment address of data
+\c           mov ds,ax              ; and move it into DS
+\c           inc word [dvar]        ; now this reference will work
+\c           ret
+
+The \c{obj} format also enables the use of the \i\c{SEG} and
+\i\c{WRT} operators, so that you can write code which does things
+like
+
+\c           extern foo
+\c           mov ax,seg foo         ; get preferred segment of foo
+\c           mov ds,ax
+\c           mov ax,data            ; a different segment
+\c           mov es,ax
+\c           mov ax,[ds:foo]        ; this accesses `foo'
+\c           mov [es:foo wrt data],bx  ; so does this
+
+\S{objseg} \c{obj} Extensions to the \c{SEGMENT}
+Directive\I{SEGMENT, obj extensions to}
+
+The \c{obj} output format extends the \c{SEGMENT} (or \c{SECTION})
+directive to allow you to specify various properties of the segment
+you are defining. This is done by appending extra qualifiers to the
+end of the segment-definition line. For example,
+
+\c           segment code private align=16
+
+defines the segment \c{code}, but also declares it to be a private
+segment, and requires that the portion of it described in this code
+module must be aligned on a 16-byte boundary.
+
+The available qualifiers are:
+
+\b \i\c{PRIVATE}, \i\c{PUBLIC}, \i\c{COMMON} and \i\c{STACK} specify
+the combination characteristics of the segment. \c{PRIVATE} segments
+do not get combined with any others by the linker; \c{PUBLIC} and
+\c{STACK} segments get concatenated together at link time; and
+\c{COMMON} segments all get overlaid on top of each other rather
+than stuck end-to-end.
+
+\b \i\c{ALIGN} is used, as shown above, to specify the boundary (in
+bytes) to which the segment start address is aligned. The alignment
+value given may be any power of two from 1 to 4096; in reality, the
+only values supported are 1, 2, 4, 16, 256 and 4096, so if 8 is
+specified it will be rounded up to 16, and 32, 64 and 128 will all
+be rounded up to 256, and so on. Note that alignment to 4096-byte
+boundaries is a \i{PharLap} extension to the format and may not be
+supported by all linkers.\I{section alignment, in OBJ}\I{segment
+alignment, in OBJ}\I{alignment, in OBJ sections}
+
+\b \i\c{CLASS} can be used to specify the segment class; this feature
+indicates to the linker that segments of the same class should be
+placed near each other in the output file. The class name can be any
+word, e.g. \c{CLASS=CODE}.
+
+\b \i\c{OVERLAY}, like \c{CLASS}, is specified with an arbitrary word
+as an argument, and provides overlay information to an
+overlay-capable linker.
+
+\b Segments can be declared as \i\c{USE16} or \i\c{USE32}, which has
+the effect of recording the choice in the object file and also
+ensuring that NASM's default assembly mode when assembling in that
+segment is 16-bit or 32-bit respectively.
+
+\b When writing \i{OS/2} object files, you should declare 32-bit
+segments as \i\c{FLAT}, which causes the default segment base for
+anything in the segment to be the special group \c{FLAT}, and also
+defines the group if it is not already defined.
+
+\b The \c{obj} file format also allows segments to be declared as
+having a pre-defined absolute segment address, although no linkers
+are currently known to make sensible use of this feature;
+nevertheless, NASM allows you to declare a segment such as
+\c{SEGMENT SCREEN ABSOLUTE=0xB800} if you need to. The \i\c{ABSOLUTE}
+and \c{ALIGN} keywords are mutually exclusive.
+
+NASM's default segment attributes are \c{PUBLIC}, \c{ALIGN=1}, no
+class, no overlay, and \c{USE16}.
+
+\S{group} \i\c{GROUP}: Defining Groups of Segments\I{segments, groups of}
+
+The \c{obj} format also allows segments to be grouped, so that a
+single segment register can be used to refer to all the segments in
+a group. NASM therefore supplies the \c{GROUP} directive, whereby
+you can code
+
+\c           segment data
+\c           ; some data
+\c           segment bss
+\c           ; some uninitialised data
+\c           group dgroup data bss
+
+which will define a group called \c{dgroup} to contain the segments
+\c{data} and \c{bss}. Like \c{SEGMENT}, \c{GROUP} causes the group
+name to be defined as a symbol, so that you can refer to a variable
+\c{var} in the \c{data} segment as \c{var wrt data} or as \c{var wrt
+dgroup}, depending on which segment value is currently in your
+segment register.
+
+If you just refer to \c{var}, however, and \c{var} is declared in a
+segment which is part of a group, then NASM will default to giving
+you the offset of \c{var} from the beginning of the \e{group}, not
+the \e{segment}. Therefore \c{SEG var}, also, will return the group
+base rather than the segment base.
+
+NASM will allow a segment to be part of more than one group, but
+will generate a warning if you do this. Variables declared in a
+segment which is part of more than one group will default to being
+relative to the first group that was defined to contain the segment.
+
+A group does not have to contain any segments; you can still make
+\c{WRT} references to a group which does not contain the variable
+you are referring to. OS/2, for example, defines the special group
+\c{FLAT} with no segments in it.
+
+\S{uppercase} \i\c{UPPERCASE}: Disabling Case Sensitivity in Output
+
+Although NASM itself is \i{case sensitive}, some OMF linkers are
+not; therefore it can be useful for NASM to output single-case
+object files. The \c{UPPERCASE} format-specific directive causes all
+segment, group and symbol names that are written to the object file
+to be forced to upper case just before being written. Within a
+source file, NASM is still case-sensitive; but the object file can
+be written entirely in upper case if desired.
+
+\c{UPPERCASE} is used alone on a line; it requires no parameters.
+
+\S{import} \i\c{IMPORT}: Importing DLL Symbols\I{DLL symbols,
+importing}\I{symbols, importing from DLLs}
+
+The \c{IMPORT} format-specific directive defines a symbol to be
+imported from a DLL, for use if you are writing a DLL's \i{import
+library} in NASM. You still need to declare the symbol as \c{EXTERN}
+as well as using the \c{IMPORT} directive.
+
+The \c{IMPORT} directive takes two required parameters, separated by
+white space, which are (respectively) the name of the symbol you
+wish to import and the name of the library you wish to import it
+from. For example:
+
+\c           import WSAStartup wsock32.dll
+
+A third optional parameter gives the name by which the symbol is
+known in the library you are importing it from, in case this is not
+the same as the name you wish the symbol to be known by to your code
+once you have imported it. For example:
+
+\c           import asyncsel wsock32.dll WSAAsyncSelect
+
+\S{export} \i\c{EXPORT}: Exporting DLL Symbols\I{DLL symbols,
+exporting}\I{symbols, exporting from DLLs}
+
+The \c{EXPORT} format-specific directive defines a global symbol to
+be exported as a DLL symbol, for use if you are writing a DLL in
+NASM. You still need to declare the symbol as \c{GLOBAL} as well as
+using the \c{EXPORT} directive.
+
+\c{EXPORT} takes one required parameter, which is the name of the
+symbol you wish to export, as it was defined in your source file. An
+optional second parameter (separated by white space from the first)
+gives the \e{external} name of the symbol: the name by which you
+wish the symbol to be known to programs using the DLL. If this name
+is the same as the internal name, you may leave the second parameter
+off.
+
+Further parameters can be given to define attributes of the exported
+symbol. These parameters, like the second, are separated by white
+space. If further parameters are given, the external name must also
+be specified, even if it is the same as the internal name. The
+available attributes are:
+
+\b \c{resident} indicates that the exported name is to be kept
+resident by the system loader. This is an optimisation for
+frequently used symbols imported by name.
+
+\b \c{nodata} indicates that the exported symbol is a function which
+does not make use of any initialised data.
+
+\b \c{parm=NNN}, where \c{NNN} is an integer, sets the number of
+parameter words for the case in which the symbol is a call gate
+between 32-bit and 16-bit segments.
+
+\b An attribute which is just a number indicates that the symbol
+should be exported with an identifying number (ordinal), and gives
+the desired number.
+
+For example:
+
+\c           export myfunc
+\c           export myfunc TheRealMoreFormalLookingFunctionName
+\c           export myfunc myfunc 1234  ; export by ordinal
+\c           export myfunc myfunc resident parm=23 nodata
+
+\S{dotdotstart} \i\c{..start}: Defining the \i{Program Entry
+Point}
+
+OMF linkers require exactly one of the object files being linked to
+define the program entry point, where execution will begin when the
+program is run. If the object file that defines the entry point is
+assembled using NASM, you specify the entry point by declaring the
+special symbol \c{..start} at the point where you wish execution to
+begin.
+
+\S{objextern} \c{obj} Extensions to the \c{EXTERN}
+Directive\I{EXTERN, obj extensions to}
+
+If you declare an external symbol with the directive
+
+\c           extern foo
+
+then references such as \c{mov ax,foo} will give you the offset of
+\c{foo} from its preferred segment base (as specified in whichever
+module \c{foo} is actually defined in). So to access the contents of
+\c{foo} you will usually need to do something like
+
+\c           mov ax,seg foo         ; get preferred segment base
+\c           mov es,ax              ; move it into ES
+\c           mov ax,[es:foo]        ; and use offset `foo' from it
+
+This is a little unwieldy, particularly if you know that an external
+is going to be accessible from a given segment or group, say
+\c{dgroup}. So if \c{DS} already contained \c{dgroup}, you could
+simply code
+
+\c           mov ax,[foo wrt dgroup]
+
+However, having to type this every time you want to access \c{foo}
+can be a pain; so NASM allows you to declare \c{foo} in the
+alternative form
+
+\c           extern foo:wrt dgroup
+
+This form causes NASM to pretend that the preferred segment base of
+\c{foo} is in fact \c{dgroup}; so the expression \c{seg foo} will
+now return \c{dgroup}, and the expression \c{foo} is equivalent to
+\c{foo wrt dgroup}.
+
+This \I{default-WRT mechanism}default-\c{WRT} mechanism can be used
+to make externals appear to be relative to any group or segment in
+your program. It can also be applied to common variables: see
+\k{objcommon}.
+
+\S{objcommon} \c{obj} Extensions to the \c{COMMON}
+Directive\I{COMMON, obj extensions to}
+
+The \c{obj} format allows common variables to be either near\I{near
+common variables} or far\I{far common variables}; NASM allows you to
+specify which your variables should be by the use of the syntax
+
+\c           common nearvar 2:near  ; `nearvar' is a near common
+\c           common farvar 10:far   ; and `farvar' is far
+
+Far common variables may be greater in size than 64Kb, and so the
+OMF specification says that they are declared as a number of
+\e{elements} of a given size. So a 10-byte far common variable could
+be declared as ten one-byte elements, five two-byte elements, two
+five-byte elements or one ten-byte element.
+
+Some OMF linkers require the \I{element size, in common
+variables}\I{common variables, element size}element size, as well as
+the variable size, to match when resolving common variables declared
+in more than one module. Therefore NASM must allow you to specify
+the element size on your far common variables. This is done by the
+following syntax:
+
+\c           common c_5by2 10:far 5 ; two five-byte elements
+\c           common c_2by5 10:far 2 ; five two-byte elements
+
+If no element size is specified, the default is 1. Also, the \c{FAR}
+keyword is not required when an element size is specified, since
+only far commons may have element sizes at all. So the above
+declarations could equivalently be
+
+\c           common c_5by2 10:5     ; two five-byte elements
+\c           common c_2by5 10:2     ; five two-byte elements
+
+In addition to these extensions, the \c{COMMON} directive in \c{obj}
+also supports default-\c{WRT} specification like \c{EXTERN} does
+(explained in \k{objextern}). So you can also declare things like
+
+\c           common foo 10:wrt dgroup
+\c           common bar 16:far 2:wrt data
+\c           common baz 24:wrt data:6
+
+\H{win32fmt} \i\c{win32}: Microsoft Win32 Object Files
+
+The \c{win32} output format generates Microsoft Win32 object files,
+suitable for passing to Microsoft linkers such as \i{Visual C++}.
+Note that Borland Win32 compilers do not use this format, but use
+\c{obj} instead (see \k{objfmt}).
+
+\c{win32} provides a default output file-name extension of \c{.obj}.
+
+Note that although Microsoft say that Win32 object files follow the
+COFF (Common Object File Format) standard, the object files produced
+by Microsoft Win32 compilers are not compatible with COFF linkers
+such as DJGPP's, and vice versa. This is due to a difference of
+opinion over the precise semantics of PC-relative relocations. To
+produce COFF files suitable for DJGPP, use NASM's \c{coff} output
+format; conversely, the \c{coff} format does not produce object
+files that Win32 linkers can generate correct output from.
+
+\S{win32sect} \c{win32} Extensions to the \c{SECTION}
+Directive\I{SECTION, win32 extensions to}
+
+Like the \c{obj} format, \c{win32} allows you to specify additional
+information on the \c{SECTION} directive line, to control the type
+and properties of sections you declare. Section types and properties
+are generated automatically by NASM for the \i{standard section names}
+\c{.text}, \c{.data} and \c{.bss}, but may still be overridden by
+these qualifiers.
+
+The available qualifiers are:
+
+\b \c{code}, or equivalently \c{text}, defines the section to be a
+code section. This marks the section as readable and executable, but
+not writable, and also indicates to the linker that the type of the
+section is code.
+
+\b \c{data} and \c{bss} define the section to be a data section,
+analogously to \c{code}. Data sections are marked as readable and
+writable, but not executable. \c{data} declares an initialised data
+section, whereas \c{bss} declares an uninitialised data section.
+
+\b \c{info} defines the section to be an \i{informational section},
+which is not included in the executable file by the linker, but may
+(for example) pass information \e{to} the linker. For example,
+declaring an \c{info}-type section called \i\c{.drectve} causes the
+linker to interpret the contents of the section as command-line
+options.
+
+\b \c{align=}, used with a trailing number as in \c{obj}, gives the
+\I{section alignment, in win32}\I{alignment, in win32
+sections}alignment requirements of the section. The maximum you may
+specify is 64: the Win32 object file format contains no means to
+request a greater section alignment than this. If alignment is not
+explicitly specified, the defaults are 16-byte alignment for code
+sections, and 4-byte alignment for data (and BSS) sections.
+Informational sections get a default alignment of 1 byte (no
+alignment), though the value does not matter.
+
+The defaults assumed by NASM if you do not specify the above
+qualifiers are:
+
+\c           section .text code align=16
+\c           section .data data align=4
+\c           section .bss bss align=4
+
+Any other section name is treated by default like \c{.text}.
+
+\H{cofffmt} \i\c{coff}: \i{Common Object File Format}
+
+The \c{coff} output type produces COFF object files suitable for
+linking with the \i{DJGPP} linker.
+
+\c{coff} provides a default output file-name extension of \c{.o}.
+
+The \c{coff} format supports the same extensions to the \c{SECTION}
+directive as \c{win32} does, except that the \c{align} qualifier and
+the \c{info} section type are not supported.
+
+\H{elffmt} \i\c{elf}: \i{Linux ELF}\I{Executable and Linkable
+Format} Object Files
+
+The \c{elf} output format generates ELF32 (Executable and Linkable
+Format) object files, as used by Linux. \c{elf} provides a default
+output file-name extension of \c{.o}.
+
+\S{elfsect} \c{elf} Extensions to the \c{SECTION}
+Directive\I{SECTION, elf extensions to}
+
+Like the \c{obj} format, \c{elf} allows you to specify additional
+information on the \c{SECTION} directive line, to control the type
+and properties of sections you declare. Section types and properties
+are generated automatically by NASM for the \i{standard section
+names} \i\c{.text}, \i\c{.data} and \i\c{.bss}, but may still be
+overridden by these qualifiers.
+
+The available qualifiers are:
+
+\b \i\c{alloc} defines the section to be one which is loaded into
+memory when the program is run. \i\c{noalloc} defines it to be one
+which is not, such as an informational or comment section.
+
+\b \i\c{exec} defines the section to be one which should have execute
+permission when the program is run. \i\c{noexec} defines it as one
+which should not.
+
+\b \i\c{write} defines the section to be one which should be writable
+when the program is run. \i\c{nowrite} defines it as one which should
+not.
+
+\b \i\c{progbits} defines the section to be one with explicit contents
+stored in the object file: an ordinary code or data section, for
+example. \i\c{nobits} defines the section to be one with no explicit
+contents given, such as a BSS section.
+
+\b \c{align=}, used with a trailing number as in \c{obj}, gives the
+\I{section alignment, in elf}\I{alignment, in elf sections}alignment
+requirements of the section.
+
+The defaults assumed by NASM if you do not specify the above
+qualifiers are:
+
+\c           section .text progbits alloc   exec nowrite align=16
+\c           section .data progbits alloc noexec   write align=4
+\c           section .bss    nobits alloc noexec   write align=4
+\c           section other progbits alloc noexec nowrite align=1
+
+(Any section name other than \c{.text}, \c{.data} and \c{.bss} is
+treated by default like \c{other} in the above code.)
+
+\S{elfwrt} \i{Position-Independent Code}\I{PIC}: \c{elf} Special
+Symbols and \i\c{WRT}
+
+The ELF specification contains enough features to allow
+position-independent code (PIC) to be written, which makes \i{ELF
+shared libraries} very flexible. However, it also means NASM has to
+be able to generate a variety of strange relocation types in ELF
+object files, if it is to be an assembler which can write PIC.
+
+Since ELF does not support segment-base references, the \c{WRT}
+operator is not used for its normal purpose; therefore NASM's
+\c{elf} output format makes use of \c{WRT} for a different purpose,
+namely the PIC-specific \I{relocations, PIC-specific}relocation
+types.
+
+\c{elf} defines five special symbols which you can use as the
+right-hand side of the \c{WRT} operator to obtain PIC relocation
+types. They are \i\c{..gotpc}, \i\c{..gotoff}, \i\c{..got},
+\i\c{..plt} and \i\c{..sym}. Their functions are summarised here:
+
+\b Referring to the symbol marking the global offset table base
+using \c{wrt ..gotpc} will end up giving the distance from the
+beginning of the current section to the global offset table.
+(\i\c{_GLOBAL_OFFSET_TABLE_} is the standard symbol name used to
+refer to the \i{GOT}.) So you would then need to add \i\c{$$} to the
+result to get the real address of the GOT.
+
+\b Referring to a location in one of your own sections using \c{wrt
+..gotoff} will give the distance from the beginning of the GOT to
+the specified location, so that adding on the address of the GOT
+would give the real address of the location you wanted.
+
+\b Referring to an external or global symbol using \c{wrt ..got}
+causes the linker to build an entry \e{in} the GOT containing the
+address of the symbol, and the reference gives the distance from the
+beginning of the GOT to the entry; so you can add on the address of
+the GOT, load from the resulting address, and end up with the
+address of the symbol.
+
+\b Referring to a procedure name using \c{wrt ..plt} causes the
+linker to build a \i{procedure linkage table} entry for the symbol,
+and the reference gives the address of the \i{PLT} entry. You can
+only use this in contexts which would generate a PC-relative
+relocation normally (i.e. as the destination for \c{CALL} or
+\c{JMP}), since ELF contains no relocation type to refer to PLT
+entries absolutely.
+
+\b Referring to a symbol name using \c{wrt ..sym} causes NASM to
+write an ordinary relocation, but instead of making the relocation
+relative to the start of the section and then adding on the offset
+to the symbol, it will write a relocation record aimed directly at
+the symbol in question. The distinction is a necessary one due to a
+peculiarity of the dynamic linker.
+
+A fuller explanation of how to use these relocation types to write
+shared libraries entirely in NASM is given in \k{picdll}.
+
+\S{elfglob} \c{elf} Extensions to the \c{GLOBAL} Directive\I{GLOBAL,
+elf extensions to}\I{GLOBAL, aoutb extensions to}
+
+ELF object files can contain more information about a global symbol
+than just its address: they can contain the \I{symbol sizes,
+specifying}\I{size, of symbols}size of the symbol and its \I{symbol
+types, specifying}\I{type, of symbols}type as well. These are not
+merely debugger conveniences, but are actually necessary when the
+program being written is a \i{shared library}. NASM therefore
+supports some extensions to the \c{GLOBAL} directive, allowing you
+to specify these features.
+
+You can specify whether a global variable is a function or a data
+object by suffixing the name with a colon and the word
+\i\c{function} or \i\c{data}. (\i\c{object} is a synonym for
+\c{data}.) For example:
+
+\c           global hashlookup:function, hashtable:data
+
+exports the global symbol \c{hashlookup} as a function and
+\c{hashtable} as a data object.
+
+You can also specify the size of the data associated with the
+symbol, as a numeric expression (which may involve labels, and even
+forward references) after the type specifier. Like this:
+
+\c           global hashtable:data (hashtable.end - hashtable)
+\c hashtable:
+\c           db this,that,theother  ; some data here
+\c .end:
+
+This makes NASM automatically calculate the length of the table and
+place that information into the ELF symbol table.
+
+Declaring the type and size of global symbols is necessary when
+writing shared library code. For more information, see
+\k{picglobal}.
+
+\S{elfcomm} \c{elf} Extensions to the \c{COMMON} Directive\I{COMMON,
+elf extensions to}
+
+ELF also allows you to specify alignment requirements \I{common
+variables, alignment in elf}\I{alignment, of elf common variables}on
+common variables. This is done by putting a number (which must be a
+power of two) after the name and size of the common variable,
+separated (as usual) by a colon. For example, an array of
+doublewords would benefit from 4-byte alignment:
+
+\c           common dwordarray 128:4
+
+This declares the total size of the array to be 128 bytes, and
+requires that it be aligned on a 4-byte boundary.
+
+\H{aoutfmt} \i\c{aout}: Linux \I{a.out, Linux version}\c{a.out} Object Files
+
+The \c{aout} format generates \c{a.out} object files, in the form
+used by early Linux systems. (These differ from other \c{a.out}
+object files in that the magic number in the first four bytes of the
+file is different. Also, some implementations of \c{a.out}, for
+example NetBSD's, support position-independent code, which Linux's
+implementation doesn't.)
+
+\c{aout} provides a default output file-name extension of \c{.o}.
+
+\c{aout} is a very simple object format. It supports no special
+directives, no special symbols, no use of \c{SEG} or \c{WRT}, and no
+extensions to any standard directives. It supports only the three
+\i{standard section names} \i\c{.text}, \i\c{.data} and \i\c{.bss}.
+
+\H{aoutbfmt} \i\c{aoutb}: \i{NetBSD}/\i{FreeBSD}/\i{OpenBSD}
+\I{a.out, BSD version}\c{a.out} Object Files
+
+The \c{aoutb} format generates \c{a.out} object files, in the form
+used by the various free BSD Unix clones, NetBSD, FreeBSD and
+OpenBSD. For simple object files, this object format is exactly the
+same as \c{aout} except for the magic number in the first four bytes
+of the file. However, the \c{aoutb} format supports
+\I{PIC}\i{position-independent code} in the same way as the \c{elf}
+format, so you can use it to write BSD \i{shared libraries}.
+
+\c{aoutb} provides a default output file-name extension of \c{.o}.
+
+\c{aoutb} supports no special directives, no special symbols, and
+only the three \i{standard section names} \i\c{.text}, \i\c{.data}
+and \i\c{.bss}. However, it also supports the same use of \i\c{WRT} as
+\c{elf} does, to provide position-independent code relocation types.
+See \k{elfwrt} for full documentation of this feature.
+
+\c{aoutb} also supports the same extensions to the \c{GLOBAL}
+directive as \c{elf} does: see \k{elfglob} for documentation of
+this.
+
+\H{as86fmt} \c{as86}: Linux \i\c{as86} Object Files
+
+The Linux 16-bit assembler \c{as86} has its own non-standard object
+file format. Although its companion linker \i\c{ld86} produces
+something close to ordinary \c{a.out} binaries as output, the object
+file format used to communicate between \c{as86} and \c{ld86} is not
+itself \c{a.out}.
+
+NASM supports this format, just in case it is useful, as \c{as86}.
+\c{as86} provides a default output file-name extension of \c{.o}.
+
+\c{as86} is a very simple object format (from the NASM user's point
+of view). It supports no special directives, no special symbols, no
+use of \c{SEG} or \c{WRT}, and no extensions to any standard
+directives. It supports only the three \i{standard section names}
+\i\c{.text}, \i\c{.data} and \i\c{.bss}.
+
+\H{rdffmt} \I{RDOFF}\i\c{rdf}: \i{Relocatable Dynamic Object File
+Format}
+
+The \c{rdf} output format produces RDOFF object files. RDOFF
+(Relocatable Dynamic Object File Format) is a home-grown object-file
+format, designed alongside NASM itself and reflecting in its file
+format the internal structure of the assembler.
+
+RDOFF is not used by any well-known operating systems. Those writing
+their own systems, however, may well wish to use RDOFF as their
+object format, on the grounds that it is designed primarily for
+simplicity and contains very little file-header bureaucracy.
+
+The Unix NASM archive, and the DOS archive which includes sources,
+both contain an \I{rdoff subdirectory}\c{rdoff} subdirectory holding
+a set of RDOFF utilities: an RDF linker, an RDF static-library
+manager, an RDF file dump utility, and a program which will load and
+execute an RDF executable under Linux.
+
+\c{rdf} supports only the \i{standard section names} \i\c{.text},
+\i\c{.data} and \i\c{.bss}.
+
+\S{rdflib} Requiring a Library: The \i\c{LIBRARY} Directive
+
+RDOFF contains a mechanism for an object file to demand a given
+library to be linked to the module, either at load time or run time.
+This is done by the \c{LIBRARY} directive, which takes one argument
+which is the name of the module:
+
+\c           library mylib.rdl
+
+\H{dbgfmt} \i\c{dbg}: Debugging Format
+
+The \c{dbg} output format is not built into NASM in the default
+configuration. If you are building your own NASM executable from the
+sources, you can define \i\c{OF_DBG} in \c{outform.h} or on the
+compiler command line, and obtain the \c{dbg} output format.
+
+The \c{dbg} format does not output an object file as such; instead,
+it outputs a text file which contains a complete list of all the
+transactions between the main body of NASM and the output-format
+back end module. It is primarily intended to aid people who want to
+write their own output drivers, so that they can get a clearer idea
+of the various requests the main program makes of the output driver,
+and in what order they happen.
+
+For simple files, one can easily use the \c{dbg} format like this:
+
+\c nasm -f dbg filename.asm
+
+which will generate a diagnostic file called \c{filename.dbg}.
+However, this will not work well on files which were designed for a
+different object format, because each object format defines its own
+macros (usually user-level forms of directives), and those macros
+will not be defined in the \c{dbg} format. Therefore it can be
+useful to run NASM twice, in order to do the preprocessing with the
+native object format selected:
+
+\c nasm -e -f rdf -o rdfprog.i rdfprog.asm
+\c nasm -a -f dbg rdfprog.i
+
+This preprocesses \c{rdfprog.asm} into \c{rdfprog.i}, keeping the
+\c{rdf} object format selected in order to make sure RDF special
+directives are converted into primitive form correctly. Then the
+preprocessed source is fed through the \c{dbg} format to generate
+the final diagnostic output.
+
+This workaround will still typically not work for programs intended
+for \c{obj} format, because the \c{obj} \c{SEGMENT} and \c{GROUP}
+directives have side effects of defining the segment and group names
+as symbols; \c{dbg} will not do this, so the program will not
+assemble. You will have to work around that by defining the symbols
+yourself (using \c{EXTERN}, for example) if you really need to get a
+\c{dbg} trace of an \c{obj}-specific source file.
+
+\c{dbg} accepts any section name and any directives at all, and logs
+them all to its output file.
+
+\C{16bit} Writing 16-bit Code (DOS, Windows 3/3.1)
+
+This chapter attempts to cover some of the common issues encountered
+when writing 16-bit code to run under MS-DOS or Windows 3.x. It
+covers how to link programs to produce \c{.EXE} or \c{.COM} files,
+how to write \c{.SYS} device drivers, and how to interface assembly
+language code with 16-bit C compilers and with Borland Pascal.
+
+\H{exefiles} Producing \i\c{.EXE} Files
+
+Any large program written under DOS needs to be built as a \c{.EXE}
+file: only \c{.EXE} files have the internal structure required to
+span more than one 64K segment. \i{Windows} programs,
+also, have to be built as \c{.EXE} files, since Windows does not
+support the \c{.COM} format.
+
+In general, you generate \c{.EXE} files by using the \c{obj} output
+format to produce one or more \i\c{.OBJ} files, and then linking
+them together using a linker. However, NASM also supports the direct
+generation of simple DOS \c{.EXE} files using the \c{bin} output
+format (by using \c{DB} and \c{DW} to construct the \c{.EXE} file
+header), and a macro package is supplied to do this. Thanks to
+Yann Guidon for contributing the code for this.
+
+NASM may also support \c{.EXE} natively as another output format in
+future releases.
+
+\S{objexe} Using the \c{obj} Format To Generate \c{.EXE} Files
+
+This section describes the usual method of generating \c{.EXE} files
+by linking \c{.OBJ} files together.
+
+Most 16-bit programming language packages come with a suitable
+linker; if you have none of these, there is a free linker called
+\i{VAL}\I{linker, free}, available in \c{LZH} archive format from
+\W{ftp://x2ftp.oulu.fi/pub/msdos/programming/lang/}\i\c{x2ftp.oulu.fi}.
+An LZH archiver can be found at
+\W{ftp://ftp.simtel.net/pub/simtelnet/msdos/arcers}\i\c{ftp.simtel.net}.
+There is another `free' linker (though this one doesn't come with
+sources) called \i{FREELINK}, available from
+\W{http://www.pcorner.com/tpc/old/3-101.html}\i\c{www.pcorner.com}.
+A third, \i\c{djlink}, written by DJ Delorie, is available at
+\W{http://www.delorie.com/djgpp/16bit/djlink/}\i\c{www.delorie.com}.
+
+When linking several \c{.OBJ} files into a \c{.EXE} file, you should
+ensure that exactly one of them has a start point defined (using the
+\I{program entry point}\i\c{..start} special symbol defined by the
+\c{obj} format: see \k{dotdotstart}). If no module defines a start
+point, the linker will not know what value to give the entry-point
+field in the output file header; if more than one defines a start
+point, the linker will not know \e{which} value to use.
+
+An example of a NASM source file which can be assembled to a
+\c{.OBJ} file and linked on its own to a \c{.EXE} is given here. It
+demonstrates the basic principles of defining a stack, initialising
+the segment registers, and declaring a start point. This file is
+also provided in the \I{test subdirectory}\c{test} subdirectory of
+the NASM archives, under the name \c{objexe.asm}.
+
+\c           segment code
+\c 
+\c ..start:  mov ax,data
+\c           mov ds,ax
+\c           mov ax,stack
+\c           mov ss,ax
+\c           mov sp,stacktop
+
+This initial piece of code sets up \c{DS} to point to the data
+segment, and initialises \c{SS} and \c{SP} to point to the top of
+the provided stack. Notice that interrupts are implicitly disabled
+for one instruction after a move into \c{SS}, precisely for this
+situation, so that there's no chance of an interrupt occurring
+between the loads of \c{SS} and \c{SP} and not having a stack to
+execute on.
+
+Note also that the special symbol \c{..start} is defined at the
+beginning of this code, which means that this will be the entry point
+into the resulting executable file.
+
+\c           mov dx,hello
+\c           mov ah,9
+\c           int 0x21
+
+The above is the main program: load \c{DS:DX} with a pointer to the
+greeting message (\c{hello} is implicitly relative to the segment
+\c{data}, which was loaded into \c{DS} in the setup code, so the
+full pointer is valid), and call the DOS print-string function.
+
+\c           mov ax,0x4c00
+\c           int 0x21
+
+This terminates the program using another DOS system call.
+
+\c           segment data
+\c hello:    db 'hello, world', 13, 10, '$'
+
+The data segment contains the string we want to display.
+
+\c           segment stack stack
+\c           resb 64
+\c stacktop:
+
+The above code declares a stack segment containing 64 bytes of
+uninitialised stack space, and points \c{stacktop} at the top of it.
+The directive \c{segment stack stack} defines a segment \e{called}
+\c{stack}, and also of \e{type} \c{STACK}. The latter is not
+necessary to the correct running of the program, but linkers are
+likely to issue warnings or errors if your program has no segment of
+type \c{STACK}.
+
+The above file, when assembled into a \c{.OBJ} file, will link on
+its own to a valid \c{.EXE} file, which when run will print `hello,
+world' and then exit.
+
+\S{binexe} Using the \c{bin} Format To Generate \c{.EXE} Files
+
+The \c{.EXE} file format is simple enough that it's possible to
+build a \c{.EXE} file by writing a pure-binary program and sticking
+a 32-byte header on the front. This header is simple enough that it
+can be generated using \c{DB} and \c{DW} commands by NASM itself, so
+that you can use the \c{bin} output format to directly generate
+\c{.EXE} files.
+
+Included in the NASM archives, in the \I{misc subdirectory}\c{misc}
+subdirectory, is a file \i\c{exebin.mac} of macros. It defines three
+macros: \i\c{EXE_begin}, \i\c{EXE_stack} and \i\c{EXE_end}.
+
+To produce a \c{.EXE} file using this method, you should start by
+using \c{%include} to load the \c{exebin.mac} macro package into
+your source file. You should then issue the \c{EXE_begin} macro call
+(which takes no arguments) to generate the file header data. Then
+write code as normal for the \c{bin} format - you can use all three
+standard sections \c{.text}, \c{.data} and \c{.bss}. At the end of
+the file you should call the \c{EXE_end} macro (again, no arguments),
+which defines some symbols to mark section sizes, and these symbols
+are referred to in the header code generated by \c{EXE_begin}.
+
+In this model, the code you end up writing starts at \c{0x100}, just
+like a \c{.COM} file - in fact, if you strip off the 32-byte header
+from the resulting \c{.EXE} file, you will have a valid \c{.COM}
+program. All the segment bases are the same, so you are limited to a
+64K program, again just like a \c{.COM} file. Note that an \c{ORG}
+directive is issued by the \c{EXE_begin} macro, so you should not
+explicitly issue one of your own.
+
+You can't directly refer to your segment base value, unfortunately,
+since this would require a relocation in the header, and things
+would get a lot more complicated. So you should get your segment
+base by copying it out of \c{CS} instead.
+
+On entry to your \c{.EXE} file, \c{SS:SP} are already set up to
+point to the top of a 2Kb stack. You can adjust the default stack
+size of 2Kb by calling the \c{EXE_stack} macro. For example, to
+change the stack size of your program to 64 bytes, you would call
+\c{EXE_stack 64}.
+
+A sample program which generates a \c{.EXE} file in this way is
+given in the \c{test} subdirectory of the NASM archive, as
+\c{binexe.asm}.
+
+\H{comfiles} Producing \i\c{.COM} Files
+
+While large DOS programs must be written as \c{.EXE} files, small
+ones are often better written as \c{.COM} files. \c{.COM} files are
+pure binary, and therefore most easily produced using the \c{bin}
+output format.
+
+\S{combinfmt} Using the \c{bin} Format To Generate \c{.COM} Files
+
+\c{.COM} files expect to be loaded at offset \c{100h} into their
+segment (though the segment may change). Execution then begins at
+\I\c{ORG}\c{100h}, i.e. right at the start of the program. So to
+write a \c{.COM} program, you would create a source file looking
+like
+
+\c           org 100h
+\c           section .text
+\c start:    ; put your code here
+\c           section .data
+\c           ; put data items here
+\c           section .bss
+\c           ; put uninitialised data here
+
+The \c{bin} format puts the \c{.text} section first in the file, so
+you can declare data or BSS items before beginning to write code if
+you want to and the code will still end up at the front of the file
+where it belongs.
+
+The BSS (uninitialised data) section does not take up space in the
+\c{.COM} file itself: instead, addresses of BSS items are resolved
+to point at space beyond the end of the file, on the grounds that
+this will be free memory when the program is run. Therefore you
+should not rely on your BSS being initialised to all zeros when you
+run.
+
+To assemble the above program, you should use a command line like
+
+\c nasm myprog.asm -fbin -o myprog.com
+
+The \c{bin} format would produce a file called \c{myprog} if no
+explicit output file name were specified, so you have to override it
+and give the desired file name.
+
+\S{comobjfmt} Using the \c{obj} Format To Generate \c{.COM} Files
+
+If you are writing a \c{.COM} program as more than one module, you
+may wish to assemble several \c{.OBJ} files and link them together
+into a \c{.COM} program. You can do this, provided you have a linker
+capable of outputting \c{.COM} files directly (\i{TLINK} does this),
+or alternatively a converter program such as \i\c{EXE2BIN} to
+transform the \c{.EXE} file output from the linker into a \c{.COM}
+file.
+
+If you do this, you need to take care of several things:
+
+\b The first object file containing code should start its code
+segment with a line like \c{RESB 100h}. This is to ensure that the
+code begins at offset \c{100h} relative to the beginning of the code
+segment, so that the linker or converter program does not have to
+adjust address references within the file when generating the
+\c{.COM} file. Other assemblers use an \i\c{ORG} directive for this
+purpose, but \c{ORG} in NASM is a format-specific directive to the
+\c{bin} output format, and does not mean the same thing as it does
+in MASM-compatible assemblers.
+
+\b You don't need to define a stack segment.
+
+\b All your segments should be in the same group, so that every time
+your code or data references a symbol offset, all offsets are
+relative to the same segment base. This is because, when a \c{.COM}
+file is loaded, all the segment registers contain the same value.
+
+\H{sysfiles} Producing \i\c{.SYS} Files
+
+\i{MS-DOS device drivers} - \c{.SYS} files - are pure binary files,
+similar to \c{.COM} files, except that they start at origin zero
+rather than \c{100h}. Therefore, if you are writing a device driver
+using the \c{bin} format, you do not need the \c{ORG} directive,
+since the default origin for \c{bin} is zero. Similarly, if you are
+using \c{obj}, you do not need the \c{RESB 100h} at the start of
+your code segment.
+
+\c{.SYS} files start with a header structure, containing pointers to
+the various routines inside the driver which do the work. This
+structure should be defined at the start of the code segment, even
+though it is not actually code.
+
+For more information on the format of \c{.SYS} files, and the data
+which has to go in the header structure, a list of books is given in
+the Frequently Asked Questions list for the newsgroup
+\W{news:comp.os.msdos.programmer}\i\c{comp.os.msdos.programmer}.
+
+\H{16c} Interfacing to 16-bit C Programs
+
+This section covers the basics of writing assembly routines that
+call, or are called from, C programs. To do this, you would
+typically write an assembly module as a \c{.OBJ} file, and link it
+with your C modules to produce a \i{mixed-language program}.
+
+\S{16cunder} External Symbol Names
+
+\I{C symbol names}\I{underscore, in C symbols}C compilers have the
+convention that the names of all global symbols (functions or data)
+they define are formed by prefixing an underscore to the name as it
+appears in the C program. So, for example, the function a C
+programmer thinks of as \c{printf} appears to an assembly language
+programmer as \c{_printf}. This means that in your assembly
+programs, you can define symbols without a leading underscore, and
+not have to worry about name clashes with C symbols.
+
+If you find the underscores inconvenient, you can define macros to
+replace the \c{GLOBAL} and \c{EXTERN} directives as follows:
+
+\c %macro cglobal 1
+\c           global _%1
+\c %define %1 _%1
+\c %endmacro
+
+\c %macro cextern 1
+\c           extern _%1
+\c %define %1 _%1
+\c %endmacro
+
+(These forms of the macros only take one argument at a time; a
+\c{%rep} construct could solve this.)
+
+If you then declare an external like this:
+
+\c           cextern printf
+
+then the macro will expand it as
+
+\c           extern _printf
+\c %define printf _printf
+
+Thereafter, you can reference \c{printf} as if it was a symbol, and
+the preprocessor will put the leading underscore on where necessary.
+
+The \c{cglobal} macro works similarly. You must use \c{cglobal}
+before defining the symbol in question, but you would have had to do
+that anyway if you used \c{GLOBAL}.
+
+\S{16cmodels} \i{Memory Models}
+
+NASM contains no mechanism to support the various C memory models
+directly; you have to keep track yourself of which one you are
+writing for. This means you have to keep track of the following
+things:
+
+\b In models using a single code segment (tiny, small and compact),
+functions are near. This means that function pointers, when stored
+in data segments or pushed on the stack as function arguments, are
+16 bits long and contain only an offset field (the \c{CS} register
+never changes its value, and always gives the segment part of the
+full function address), and that functions are called using ordinary
+near \c{CALL} instructions and return using \c{RETN} (which, in
+NASM, is synonymous with \c{RET} anyway). This means both that you
+should write your own routines to return with \c{RETN}, and that you
+should call external C routines with near \c{CALL} instructions.
+
+\b In models using more than one code segment (medium, large and
+huge), functions are far. This means that function pointers are 32
+bits long (consisting of a 16-bit offset followed by a 16-bit
+segment), and that functions are called using \c{CALL FAR} (or
+\c{CALL seg:offset}) and return using \c{RETF}. Again, you should
+therefore write your own routines to return with \c{RETF} and use
+\c{CALL FAR} to call external routines.
+
+\b In models using a single data segment (tiny, small and medium),
+data pointers are 16 bits long, containing only an offset field (the
+\c{DS} register doesn't change its value, and always gives the
+segment part of the full data item address).
+
+\b In models using more than one data segment (compact, large and
+huge), data pointers are 32 bits long, consisting of a 16-bit offset
+followed by a 16-bit segment. You should still be careful not to
+modify \c{DS} in your routines without restoring it afterwards, but
+\c{ES} is free for you to use to access the contents of 32-bit data
+pointers you are passed.
+
+\b The huge memory model allows single data items to exceed 64K in
+size. In all other memory models, you can access the whole of a data
+item just by doing arithmetic on the offset field of the pointer you
+are given, whether a segment field is present or not; in huge model,
+you have to be more careful of your pointer arithmetic.
+
+\b In most memory models, there is a \e{default} data segment, whose
+segment address is kept in \c{DS} throughout the program. This data
+segment is typically the same segment as the stack, kept in \c{SS},
+so that functions' local variables (which are stored on the stack)
+and global data items can both be accessed easily without changing
+\c{DS}. Particularly large data items are typically stored in other
+segments. However, some memory models (though not the standard
+ones, usually) allow the assumption that \c{SS} and \c{DS} hold the
+same value to be removed. Be careful about functions' local
+variables in this latter case.
+
+In models with a single code segment, the segment is called
+\i\c{_TEXT}, so your code segment must also go by this name in order
+to be linked into the same place as the main code segment. In models
+with a single data segment, or with a default data segment, it is
+called \i\c{_DATA}.
+
+\S{16cfunc} Function Definitions and Function Calls
+
+\I{functions, C calling convention}The \i{C calling convention} in
+16-bit programs is as follows. In the following description, the
+words \e{caller} and \e{callee} are used to denote the function
+doing the calling and the function which gets called.
+
+\b The caller pushes the function's parameters on the stack, one
+after another, in reverse order (right to left, so that the first
+argument specified to the function is pushed last).
+
+\b The caller then executes a \c{CALL} instruction to pass control
+to the callee. This \c{CALL} is either near or far depending on the
+memory model.
+
+\b The callee receives control, and typically (although this is not
+actually necessary, in functions which do not need to access their
+parameters) starts by saving the value of \c{SP} in \c{BP} so as to
+be able to use \c{BP} as a base pointer to find its parameters on
+the stack. However, the caller was probably doing this too, so part
+of the calling convention states that \c{BP} must be preserved by
+any C function. Hence the callee, if it is going to set up \c{BP} as
+a \i\e{frame pointer}, must push the previous value first.
+
+\b The callee may then access its parameters relative to \c{BP}.
+The word at \c{[BP]} holds the previous value of \c{BP} as it was
+pushed; the next word, at \c{[BP+2]}, holds the offset part of the
+return address, pushed implicitly by \c{CALL}. In a small-model
+(near) function, the parameters start after that, at \c{[BP+4]}; in
+a large-model (far) function, the segment part of the return address
+lives at \c{[BP+4]}, and the parameters begin at \c{[BP+6]}. The
+leftmost parameter of the function, since it was pushed last, is
+accessible at this offset from \c{BP}; the others follow, at
+successively greater offsets. Thus, in a function such as \c{printf}
+which takes a variable number of parameters, the pushing of the
+parameters in reverse order means that the function knows where to
+find its first parameter, which tells it the number and type of the
+remaining ones.
+
+\b The callee may also wish to decrease \c{SP} further, so as to
+allocate space on the stack for local variables, which will then be
+accessible at negative offsets from \c{BP}.
+
+\b The callee, if it wishes to return a value to the caller, should
+leave the value in \c{AL}, \c{AX} or \c{DX:AX} depending on the size
+of the value. Floating-point results are sometimes (depending on the
+compiler) returned in \c{ST0}.
+
+\b Once the callee has finished processing, it restores \c{SP} from
+\c{BP} if it had allocated local stack space, then pops the previous
+value of \c{BP}, and returns via \c{RETN} or \c{RETF} depending on
+memory model.
+
+\b When the caller regains control from the callee, the function
+parameters are still on the stack, so it typically adds an immediate
+constant to \c{SP} to remove them (instead of executing a number of
+slow \c{POP} instructions). Thus, if a function is accidentally
+called with the wrong number of parameters due to a prototype
+mismatch, the stack will still be returned to a sensible state since
+the caller, which \e{knows} how many parameters it pushed, does the
+removing.
+
+It is instructive to compare this calling convention with that for
+Pascal programs (described in \k{16bpfunc}). Pascal has a simpler
+convention, since no functions have variable numbers of parameters.
+Therefore the callee knows how many parameters it should have been
+passed, and is able to deallocate them from the stack itself by
+passing an immediate argument to the \c{RET} or \c{RETF}
+instruction, so the caller does not have to do it. Also, the
+parameters are pushed in left-to-right order, not right-to-left,
+which means that a compiler can give better guarantees about
+sequence points without performance suffering.
+
+Thus, you would define a function in C style in the following way.
+The following example is for small model:
+
+\c           global _myfunc
+\c _myfunc:  push bp
+\c           mov bp,sp
+\c           sub sp,0x40            ; 64 bytes of local stack space
+\c           mov bx,[bp+4]          ; first parameter to function
+\c           ; some more code
+\c           mov sp,bp              ; undo "sub sp,0x40" above
+\c           pop bp
+\c           ret
+
+For a large-model function, you would replace \c{RET} by \c{RETF},
+and look for the first parameter at \c{[BP+6]} instead of
+\c{[BP+4]}. Of course, if one of the parameters is a pointer, then
+the offsets of \e{subsequent} parameters will change depending on
+the memory model as well: far pointers take up four bytes on the
+stack when passed as a parameter, whereas near pointers take up two.
+
+At the other end of the process, to call a C function from your
+assembly code, you would do something like this:
+
+\c           extern _printf
+\c           ; and then, further down...
+\c           push word [myint]      ; one of my integer variables
+\c           push word mystring     ; pointer into my data segment
+\c           call _printf
+\c           add sp,byte 4          ; `byte' saves space
+\c           ; then those data items...
+\c           segment _DATA
+\c myint     dw 1234
+\c mystring  db 'This number -> %d <- should be 1234',10,0
+
+This piece of code is the small-model assembly equivalent of the C
+code
+
+\c     int myint = 1234;
+\c     printf("This number -> %d <- should be 1234\n", myint);
+
+In large model, the function-call code might look more like this. In
+this example, it is assumed that \c{DS} already holds the segment
+base of the segment \c{_DATA}. If not, you would have to initialise
+it first.
+
+\c           push word [myint]
+\c           push word seg mystring ; Now push the segment, and...
+\c           push word mystring     ; ... offset of "mystring"
+\c           call far _printf
+\c           add sp,byte 6
+
+The integer value still takes up one word on the stack, since large
+model does not affect the size of the \c{int} data type. The first
+argument (pushed last) to \c{printf}, however, is a data pointer,
+and therefore has to contain a segment and offset part. The segment
+should be stored second in memory, and therefore must be pushed
+first. (Of course, \c{PUSH DS} would have been a shorter instruction
+than \c{PUSH WORD SEG mystring}, if \c{DS} was set up as the above
+example assumed.) Then the actual call becomes a far call, since
+functions expect far calls in large model; and \c{SP} has to be
+increased by 6 rather than 4 afterwards to make up for the extra
+word of parameters.
+
+\S{16cdata} Accessing Data Items
+
+To get at the contents of C variables, or to declare variables which
+C can access, you need only declare the names as \c{GLOBAL} or
+\c{EXTERN}. (Again, the names require leading underscores, as stated
+in \k{16cunder}.) Thus, a C variable declared as \c{int i} can be
+accessed from assembler as
+
+\c           extern _i
+\c           mov ax,[_i]
+
+And to declare your own integer variable which C programs can access
+as \c{extern int j}, you do this (making sure you are assembling in
+the \c{_DATA} segment, if necessary):
+
+\c           global _j
+\c _j        dw 0
+
+To access a C array, you need to know the size of the components of
+the array. For example, \c{int} variables are two bytes long, so if
+a C program declares an array as \c{int a[10]}, you can access
+\c{a[3]} by coding \c{mov ax,[_a+6]}. (The byte offset 6 is obtained
+by multiplying the desired array index, 3, by the size of the array
+element, 2.) The sizes of the C base types in 16-bit compilers are:
+1 for \c{char}, 2 for \c{short} and \c{int}, 4 for \c{long} and
+\c{float}, and 8 for \c{double}.
+
+To access a C \i{data structure}, you need to know the offset from
+the base of the structure to the field you are interested in. You
+can either do this by converting the C structure definition into a
+NASM structure definition (using \i\c{STRUC}), or by calculating the
+one offset and using just that.
+
+To do either of these, you should read your C compiler's manual to
+find out how it organises data structures. NASM gives no special
+alignment to structure members in its own \c{STRUC} macro, so you
+have to specify alignment yourself if the C compiler generates it.
+Typically, you might find that a structure like
+
+\c struct {
+\c     char c;
+\c     int i;
+\c } foo;
+
+might be four bytes long rather than three, since the \c{int} field
+would be aligned to a two-byte boundary. However, this sort of
+feature tends to be a configurable option in the C compiler, either
+using command-line options or \c{#pragma} lines, so you have to find
+out how your own compiler does it.
+
+\S{16cmacro} \i\c{c16.mac}: Helper Macros for the 16-bit C Interface
+
+Included in the NASM archives, in the \I{misc subdirectory}\c{misc}
+directory, is a file \c{c16.mac} of macros. It defines three macros:
+\i\c{proc}, \i\c{arg} and \i\c{endproc}. These are intended to be
+used for C-style procedure definitions, and they automate a lot of
+the work involved in keeping track of the calling convention.
+
+An example of an assembly function using the macro set is given
+here:
+
+\c           proc _nearproc
+\c %$i       arg
+\c %$j       arg
+\c           mov ax,[bp + %$i]
+\c           mov bx,[bp + %$j]
+\c           add ax,[bx]
+\c           endproc
+
+This defines \c{_nearproc} to be a procedure taking two arguments,
+the first (\c{i}) an integer and the second (\c{j}) a pointer to an
+integer. It returns \c{i + *j}.
+
+Note that the \c{arg} macro has an \c{EQU} as the first line of its
+expansion, and since the label before the macro call gets prepended
+to the first line of the expanded macro, the \c{EQU} works, defining
+\c{%$i} to be an offset from \c{BP}. A context-local variable is
+used, local to the context pushed by the \c{proc} macro and popped
+by the \c{endproc} macro, so that the same argument name can be used
+in later procedures. Of course, you don't \e{have} to do that.
+
+The macro set produces code for near functions (tiny, small and
+compact-model code) by default. You can have it generate far
+functions (medium, large and huge-model code) by means of coding
+\I\c{FARCODE}\c{%define FARCODE}. This changes the kind of return
+instruction generated by \c{endproc}, and also changes the starting
+point for the argument offsets. The macro set contains no intrinsic
+dependency on whether data pointers are far or not.
+
+\c{arg} can take an optional parameter, giving the size of the
+argument. If no size is given, 2 is assumed, since it is likely that
+many function parameters will be of type \c{int}.
+
+The large-model equivalent of the above function would look like this:
+
+\c %define FARCODE
+\c           proc _farproc
+\c %$i       arg
+\c %$j       arg 4
+\c           mov ax,[bp + %$i]
+\c           mov bx,[bp + %$j]
+\c           mov es,[bp + %$j + 2]
+\c           add ax,[es:bx]
+\c           endproc
+
+This makes use of the argument to the \c{arg} macro to define a
+parameter of size 4, because \c{j} is now a far pointer. When we
+load from \c{j}, we must load a segment and an offset.
+
+\H{16bp} Interfacing to \i{Borland Pascal} Programs
+
+Interfacing to Borland Pascal programs is similar in concept to
+interfacing to 16-bit C programs. The differences are:
+
+\b The leading underscore required for interfacing to C programs is
+not required for Pascal.
+
+\b The memory model is always large: functions are far, data
+pointers are far, and no data item can be more than 64K long.
+(Actually, some functions are near, but only those functions that
+are local to a Pascal unit and never called from outside it. All
+assembly functions that Pascal calls, and all Pascal functions that
+assembly routines are able to call, are far.) However, all static
+data declared in a Pascal program goes into the default data
+segment, which is the one whose segment address will be in \c{DS}
+when control is passed to your assembly code. The only things that
+do not live in the default data segment are local variables (they
+live in the stack segment) and dynamically allocated variables. All
+data \e{pointers}, however, are far.
+
+\b The function calling convention is different - described below.
+
+\b Some data types, such as strings, are stored differently.
+
+\b There are restrictions on the segment names you are allowed to
+use - Borland Pascal will ignore code or data declared in a segment
+it doesn't like the name of. The restrictions are described below.
+
+\S{16bpfunc} The Pascal Calling Convention
+
+\I{functions, Pascal calling convention}\I{Pascal calling
+convention}The 16-bit Pascal calling convention is as follows. In
+the following description, the words \e{caller} and \e{callee} are
+used to denote the function doing the calling and the function which
+gets called.
+
+\b The caller pushes the function's parameters on the stack, one
+after another, in normal order (left to right, so that the first
+argument specified to the function is pushed first).
+
+\b The caller then executes a far \c{CALL} instruction to pass
+control to the callee.
+
+\b The callee receives control, and typically (although this is not
+actually necessary, in functions which do not need to access their
+parameters) starts by saving the value of \c{SP} in \c{BP} so as to
+be able to use \c{BP} as a base pointer to find its parameters on
+the stack. However, the caller was probably doing this too, so part
+of the calling convention states that \c{BP} must be preserved by
+any function. Hence the callee, if it is going to set up \c{BP} as a
+\i{frame pointer}, must push the previous value first.
+
+\b The callee may then access its parameters relative to \c{BP}.
+The word at \c{[BP]} holds the previous value of \c{BP} as it was
+pushed. The next word, at \c{[BP+2]}, holds the offset part of the
+return address, and the next one at \c{[BP+4]} the segment part. The
+parameters begin at \c{[BP+6]}. The rightmost parameter of the
+function, since it was pushed last, is accessible at this offset
+from \c{BP}; the others follow, at successively greater offsets.
+
+\b The callee may also wish to decrease \c{SP} further, so as to
+allocate space on the stack for local variables, which will then be
+accessible at negative offsets from \c{BP}.
+
+\b The callee, if it wishes to return a value to the caller, should
+leave the value in \c{AL}, \c{AX} or \c{DX:AX} depending on the size
+of the value. Floating-point results are returned in \c{ST0}.
+Results of type \c{Real} (Borland's own custom floating-point data
+type, not handled directly by the FPU) are returned in \c{DX:BX:AX}.
+To return a result of type \c{String}, the caller pushes a pointer
+to a temporary string before pushing the parameters, and the callee
+places the returned string value at that location. The pointer is
+not a parameter, and should not be removed from the stack by the
+\c{RETF} instruction.
+
+\b Once the callee has finished processing, it restores \c{SP} from
+\c{BP} if it had allocated local stack space, then pops the previous
+value of \c{BP}, and returns via \c{RETF}. It uses the form of
+\c{RETF} with an immediate parameter, giving the number of bytes
+taken up by the parameters on the stack. This causes the parameters
+to be removed from the stack as a side effect of the return
+instruction.
+
+\b When the caller regains control from the callee, the function
+parameters have already been removed from the stack, so it needs to
+do nothing further.
+
+Thus, you would define a function in Pascal style, taking two
+\c{Integer}-type parameters, in the following way:
+
+\c           global myfunc
+\c myfunc:   push bp
+\c           mov bp,sp
+\c           sub sp,0x40            ; 64 bytes of local stack space
+\c           mov bx,[bp+8]          ; first parameter to function
+\c           mov bx,[bp+6]          ; second parameter to function
+\c           ; some more code
+\c           mov sp,bp              ; undo "sub sp,0x40" above
+\c           pop bp
+\c           retf 4                 ; total size of params is 4
+
+At the other end of the process, to call a Pascal function from your
+assembly code, you would do something like this:
+
+\c           extern SomeFunc
+\c           ; and then, further down...
+\c           push word seg mystring ; Now push the segment, and...
+\c           push word mystring     ; ... offset of "mystring"
+\c           push word [myint]      ; one of my variables
+\c           call far SomeFunc
+
+This is equivalent to the Pascal code
+
+\c procedure SomeFunc(String: PChar; Int: Integer);
+\c     SomeFunc(@mystring, myint);
+
+\S{16bpseg} Borland Pascal \I{segment names, Borland Pascal}Segment
+Name Restrictions
+
+Since Borland Pascal's internal unit file format is completely
+different from \c{OBJ}, it only makes a very sketchy job of actually
+reading and understanding the various information contained in a
+real \c{OBJ} file when it links that in. Therefore an object file
+intended to be linked to a Pascal program must obey a number of
+restrictions:
+
+\b Procedures and functions must be in a segment whose name is
+either \c{CODE}, \c{CSEG}, or something ending in \c{_TEXT}.
+
+\b Initialised data must be in a segment whose name is either
+\c{CONST} or something ending in \c{_DATA}.
+
+\b Uninitialised data must be in a segment whose name is either
+\c{DATA}, \c{DSEG}, or something ending in \c{_BSS}.
+
+\b Any other segments in the object file are completely ignored.
+\c{GROUP} directives and segment attributes are also ignored.
+
+\S{16bpmacro} Using \i\c{c16.mac} With Pascal Programs
+
+The \c{c16.mac} macro package, described in \k{16cmacro}, can also
+be used to simplify writing functions to be called from Pascal
+programs, if you code \I\c{PASCAL}\c{%define PASCAL}. This
+definition ensures that functions are far (it implies
+\i\c{FARCODE}), and also causes procedure return instructions to be
+generated with an operand.
+
+Defining \c{PASCAL} does not change the code which calculates the
+argument offsets; you must declare your function's arguments in
+reverse order. For example:
+
+\c %define PASCAL
+\c           proc _pascalproc
+\c %$j       arg 4
+\c %$i       arg
+\c           mov ax,[bp + %$i]
+\c           mov bx,[bp + %$j]
+\c           mov es,[bp + %$j + 2]
+\c           add ax,[es:bx]
+\c           endproc
+
+This defines the same routine, conceptually, as the example in
+\k{16cmacro}: it defines a function taking two arguments, an integer
+and a pointer to an integer, which returns the sum of the integer
+and the contents of the pointer. The only difference between this
+code and the large-model C version is that \c{PASCAL} is defined
+instead of \c{FARCODE}, and that the arguments are declared in
+reverse order.
+
+\C{32bit} Writing 32-bit Code (Unix, Win32, DJGPP)
+
+This chapter attempts to cover some of the common issues involved
+when writing 32-bit code, to run under \i{Win32} or Unix, or to be
+linked with C code generated by a Unix-style C compiler such as
+\i{DJGPP}. It covers how to write assembly code to interface with
+32-bit C routines, and how to write position-independent code for
+shared libraries.
+
+Almost all 32-bit code, and in particular all code running under
+Win32, DJGPP or any of the PC Unix variants, runs in \I{flat memory
+model}\e{flat} memory model. This means that the segment registers
+and paging have already been set up to give you the same 32-bit 4Gb
+address space no matter what segment you work relative to, and that
+you should ignore all segment registers completely. When writing
+flat-model application code, you never need to use a segment
+override or modify any segment register, and the code-section
+addresses you pass to \c{CALL} and \c{JMP} live in the same address
+space as the data-section addresses you access your variables by and
+the stack-section addresses you access local variables and procedure
+parameters by. Every address is 32 bits long and contains only an
+offset part.
+
+\H{32c} Interfacing to 32-bit C Programs
+
+A lot of the discussion in \k{16c}, about interfacing to 16-bit C
+programs, still applies when working in 32 bits. The absence of
+memory models or segmentation worries simplifies things a lot.
+
+\S{32cunder} External Symbol Names
+
+Most 32-bit C compilers share the convention used by 16-bit
+compilers, that the names of all global symbols (functions or data)
+they define are formed by prefixing an underscore to the name as it
+appears in the C program. However, not all of them do: the ELF
+specification states that C symbols do \e{not} have a leading
+underscore on their assembly-language names.
+
+The older Linux \c{a.out} C compiler, all Win32 compilers, DJGPP,
+and NetBSD and FreeBSD, all use the leading underscore; for these
+compilers, the macros \c{cextern} and \c{cglobal}, as given in
+\k{16cunder}, will still work. For ELF, though, the leading
+underscore should not be used.
+
+\S{32cfunc} Function Definitions and Function Calls
+
+\I{functions, C calling convention}The \i{C calling convention} in
+32-bit programs is as follows. In the
+following description, the words \e{caller} and \e{callee} are used
+to denote the function doing the calling and the function which gets
+called.
+
+\b The caller pushes the function's parameters on the stack, one
+after another, in reverse order (right to left, so that the first
+argument specified to the function is pushed last).
+
+\b The caller then executes a near \c{CALL} instruction to pass
+control to the callee.
+
+\b The callee receives control, and typically (although this is not
+actually necessary, in functions which do not need to access their
+parameters) starts by saving the value of \c{ESP} in \c{EBP} so as
+to be able to use \c{EBP} as a base pointer to find its parameters
+on the stack. However, the caller was probably doing this too, so
+part of the calling convention states that \c{EBP} must be preserved
+by any C function. Hence the callee, if it is going to set up
+\c{EBP} as a \i{frame pointer}, must push the previous value first.
+
+\b The callee may then access its parameters relative to \c{EBP}.
+The doubleword at \c{[EBP]} holds the previous value of \c{EBP} as
+it was pushed; the next doubleword, at \c{[EBP+4]}, holds the return
+address, pushed implicitly by \c{CALL}. The parameters start after
+that, at \c{[EBP+8]}. The leftmost parameter of the function, since
+it was pushed last, is accessible at this offset from \c{EBP}; the
+others follow, at successively greater offsets. Thus, in a function
+such as \c{printf} which takes a variable number of parameters, the
+pushing of the parameters in reverse order means that the function
+knows where to find its first parameter, which tells it the number
+and type of the remaining ones.
+
+\b The callee may also wish to decrease \c{ESP} further, so as to
+allocate space on the stack for local variables, which will then be
+accessible at negative offsets from \c{EBP}.
+
+\b The callee, if it wishes to return a value to the caller, should
+leave the value in \c{AL}, \c{AX} or \c{EAX} depending on the size
+of the value. Floating-point results are typically returned in
+\c{ST0}.
+
+\b Once the callee has finished processing, it restores \c{ESP} from
+\c{EBP} if it had allocated local stack space, then pops the previous
+value of \c{EBP}, and returns via \c{RET} (equivalently, \c{RETN}).
+
+\b When the caller regains control from the callee, the function
+parameters are still on the stack, so it typically adds an immediate
+constant to \c{ESP} to remove them (instead of executing a number of
+slow \c{POP} instructions). Thus, if a function is accidentally
+called with the wrong number of parameters due to a prototype
+mismatch, the stack will still be returned to a sensible state since
+the caller, which \e{knows} how many parameters it pushed, does the
+removing.
+
+There is an alternative calling convention used by Win32 programs
+for Windows API calls, and also for functions called \e{by} the
+Windows API such as window procedures: they follow what Microsoft
+calls the \c{__stdcall} convention. This is slightly closer to the
+Pascal convention, in that the callee clears the stack by passing a
+parameter to the \c{RET} instruction. However, the parameters are
+still pushed in right-to-left order.
+
+Thus, you would define a function in C style in the following way:
+
+\c           global _myfunc
+\c _myfunc:  push ebp
+\c           mov ebp,esp
+\c           sub esp,0x40           ; 64 bytes of local stack space
+\c           mov ebx,[ebp+8]        ; first parameter to function
+\c           ; some more code
+\c           leave                  ; mov esp,ebp / pop ebp
+\c           ret
+
+At the other end of the process, to call a C function from your
+assembly code, you would do something like this:
+
+\c           extern _printf
+\c           ; and then, further down...
+\c           push dword [myint]     ; one of my integer variables
+\c           push dword mystring    ; pointer into my data segment
+\c           call _printf
+\c           add esp,byte 8         ; `byte' saves space
+\c           ; then those data items...
+\c           segment _DATA
+\c myint     dd 1234
+\c mystring  db 'This number -> %d <- should be 1234',10,0
+
+This piece of code is the assembly equivalent of the C code
+
+\c     int myint = 1234;
+\c     printf("This number -> %d <- should be 1234\n", myint);
+
+\S{32cdata} Accessing Data Items
+
+To get at the contents of C variables, or to declare variables which
+C can access, you need only declare the names as \c{GLOBAL} or
+\c{EXTERN}. (Again, the names require leading underscores, as stated
+in \k{32cunder}.) Thus, a C variable declared as \c{int i} can be
+accessed from assembler as
+
+\c           extern _i
+\c           mov eax,[_i]
+
+And to declare your own integer variable which C programs can access
+as \c{extern int j}, you do this (making sure you are assembling in
+the \c{_DATA} segment, if necessary):
+
+\c           global _j
+\c _j        dd 0
+
+To access a C array, you need to know the size of the components of
+the array. For example, \c{int} variables are four bytes long, so if
+a C program declares an array as \c{int a[10]}, you can access
+\c{a[3]} by coding \c{mov eax,[_a+12]}. (The byte offset 12 is obtained
+by multiplying the desired array index, 3, by the size of the array
+element, 4.) The sizes of the C base types in 32-bit compilers are:
+1 for \c{char}, 2 for \c{short}, 4 for \c{int}, \c{long} and
+\c{float}, and 8 for \c{double}. Pointers, being 32-bit addresses,
+are also 4 bytes long.
+
+To access a C \i{data structure}, you need to know the offset from
+the base of the structure to the field you are interested in. You
+can either do this by converting the C structure definition into a
+NASM structure definition (using \c{STRUC}), or by calculating the
+one offset and using just that.
+
+To do either of these, you should read your C compiler's manual to
+find out how it organises data structures. NASM gives no special
+alignment to structure members in its own \i\c{STRUC} macro, so you
+have to specify alignment yourself if the C compiler generates it.
+Typically, you might find that a structure like
+
+\c struct {
+\c     char c;
+\c     int i;
+\c } foo;
+
+might be eight bytes long rather than five, since the \c{int} field
+would be aligned to a four-byte boundary. However, this sort of
+feature is sometimes a configurable option in the C compiler, either
+using command-line options or \c{#pragma} lines, so you have to find
+out how your own compiler does it.
+
+\S{32cmacro} \i\c{c32.mac}: Helper Macros for the 32-bit C Interface
+
+Included in the NASM archives, in the \I{misc subdirectory}\c{misc}
+directory, is a file \c{c32.mac} of macros. It defines three macros:
+\i\c{proc}, \i\c{arg} and \i\c{endproc}. These are intended to be
+used for C-style procedure definitions, and they automate a lot of
+the work involved in keeping track of the calling convention.
+
+An example of an assembly function using the macro set is given
+here:
+
+\c           proc _proc32
+\c %$i       arg
+\c %$j       arg
+\c           mov eax,[ebp + %$i]
+\c           mov ebx,[ebp + %$j]
+\c           add eax,[ebx]
+\c           endproc
+
+This defines \c{_proc32} to be a procedure taking two arguments, the
+first (\c{i}) an integer and the second (\c{j}) a pointer to an
+integer. It returns \c{i + *j}.
+
+Note that the \c{arg} macro has an \c{EQU} as the first line of its
+expansion, and since the label before the macro call gets prepended
+to the first line of the expanded macro, the \c{EQU} works, defining
+\c{%$i} to be an offset from \c{EBP}. A context-local variable is
+used, local to the context pushed by the \c{proc} macro and popped
+by the \c{endproc} macro, so that the same argument name can be used
+in later procedures. Of course, you don't \e{have} to do that.
+
+\c{arg} can take an optional parameter, giving the size of the
+argument. If no size is given, 4 is assumed, since it is likely that
+many function parameters will be of type \c{int} or pointers.
+
+\H{picdll} Writing NetBSD/FreeBSD/OpenBSD and Linux/ELF \i{Shared
+Libraries}
+
+ELF replaced the older \c{a.out} object file format under Linux
+because it contains support for \i{position-independent code}
+(\i{PIC}), which makes writing shared libraries much easier. NASM
+supports the ELF position-independent code features, so you can
+write Linux ELF shared libraries in NASM.
+
+\i{NetBSD}, and its close cousins \i{FreeBSD} and \i{OpenBSD}, take
+a different approach by hacking PIC support into the \c{a.out}
+format. NASM supports this as the \i\c{aoutb} output format, so you
+can write \i{BSD} shared libraries in NASM too.
+
+The operating system loads a PIC shared library by memory-mapping
+the library file at an arbitrarily chosen point in the address space
+of the running process. The contents of the library's code section
+must therefore not depend on where it is loaded in memory.
+
+Therefore, you cannot get at your variables by writing code like
+this:
+
+\c           mov eax,[myvar]        ; WRONG
+
+Instead, the linker provides an area of memory called the
+\i\e{global offset table}, or \i{GOT}; the GOT is situated at a
+constant distance from your library's code, so if you can find out
+where your library is loaded (which is typically done using a
+\c{CALL} and \c{POP} combination), you can obtain the address of the
+GOT, and you can then load the addresses of your variables out of
+linker-generated entries in the GOT.
+
+The \e{data} section of a PIC shared library does not have these
+restrictions: since the data section is writable, it has to be
+copied into memory anyway rather than just paged in from the library
+file, so as long as it's being copied it can be relocated too. So
+you can put ordinary types of relocation in the data section without
+too much worry (but see \k{picglobal} for a caveat).
+
+\S{picgot} Obtaining the Address of the GOT
+
+Each code module in your shared library should define the GOT as an
+external symbol:
+
+\c           extern _GLOBAL_OFFSET_TABLE_   ; in ELF
+\c           extern __GLOBAL_OFFSET_TABLE_  ; in BSD a.out
+
+At the beginning of any function in your shared library which plans
+to access your data or BSS sections, you must first calculate the
+address of the GOT. This is typically done by writing the function
+in this form:
+
+\c func:     push ebp
+\c           mov ebp,esp
+\c           push ebx
+\c           call .get_GOT
+\c .get_GOT: pop ebx
+\c           add ebx,_GLOBAL_OFFSET_TABLE_+$$-.get_GOT wrt ..gotpc
+\c           ; the function body comes here
+\c           mov ebx,[ebp-4]
+\c           mov esp,ebp
+\c           pop ebp
+\c           ret
+
+(For BSD, again, the symbol \c{_GLOBAL_OFFSET_TABLE_} requires a
+second leading underscore.)
+
+The first two lines of this function are simply the standard C
+prologue to set up a stack frame, and the last three lines are
+standard C function epilogue. The third line, and the fourth to last
+line, save and restore the \c{EBX} register, because PIC shared
+libraries use this register to store the address of the GOT.
+
+The interesting bit is the \c{CALL} instruction and the following
+two lines. The \c{CALL} and \c{POP} combination obtains the address
+of the label \c{.get_GOT}, without having to know in advance where
+the program was loaded (since the \c{CALL} instruction is encoded
+relative to the current position). The \c{ADD} instruction makes use
+of one of the special PIC relocation types: \i{GOTPC relocation}.
+With the \i\c{WRT ..gotpc} qualifier specified, the symbol
+referenced (here \c{_GLOBAL_OFFSET_TABLE_}, the special symbol
+assigned to the GOT) is given as an offset from the beginning of the
+section. (Actually, ELF encodes it as the offset from the operand
+field of the \c{ADD} instruction, but NASM simplifies this
+deliberately, so you do things the same way for both ELF and BSD.)
+So the instruction then \e{adds} the beginning of the section, to
+get the real address of the GOT, and subtracts the value of
+\c{.get_GOT} which it knows is in \c{EBX}. Therefore, by the time
+that instruction has finished,
+\c{EBX} contains the address of the GOT.
+
+If you didn't follow that, don't worry: it's never necessary to
+obtain the address of the GOT by any other means, so you can put
+those three instructions into a macro and safely ignore them:
+
+\c %macro get_GOT 0
+\c           call %%getgot
+\c %%getgot: pop ebx
+\c           add ebx,_GLOBAL_OFFSET_TABLE_+$$-%%getgot wrt ..gotpc
+\c %endmacro
+
+\S{piclocal} Finding Your Local Data Items
+
+Having got the GOT, you can then use it to obtain the addresses of
+your data items. Most variables will reside in the sections you have
+declared; they can be accessed using the \I{GOTOFF
+relocation}\c{..gotoff} special \I\c{WRT ..gotoff}\c{WRT} type. The
+way this works is like this:
+
+\c           lea eax,[ebx+myvar wrt ..gotoff]
+
+The expression \c{myvar wrt ..gotoff} is calculated, when the shared
+library is linked, to be the offset to the local variable \c{myvar}
+from the beginning of the GOT. Therefore, adding it to \c{EBX} as
+above will place the real address of \c{myvar} in \c{EAX}.
+
+If you declare variables as \c{GLOBAL} without specifying a size for
+them, they are shared between code modules in the library, but do
+not get exported from the library to the program that loaded it.
+They will still be in your ordinary data and BSS sections, so you
+can access them in the same way as local variables, using the above
+\c{..gotoff} mechanism.
+
+Note that due to a peculiarity of the way BSD \c{a.out} format
+handles this relocation type, there must be at least one non-local
+symbol in the same section as the address you're trying to access.
+
+\S{picextern} Finding External and Common Data Items
+
+If your library needs to get at an external variable (external to
+the \e{library}, not just to one of the modules within it), you must
+use the \I{GOT relocations}\I\c{WRT ..got}\c{..got} type to get at
+it. The \c{..got} type, instead of giving you the offset from the
+GOT base to the variable, gives you the offset from the GOT base to
+a GOT \e{entry} containing the address of the variable. The linker
+will set up this GOT entry when it builds the library, and the
+dynamic linker will place the correct address in it at load time. So
+to obtain the address of an external variable \c{extvar} in \c{EAX},
+you would code
+
+\c           mov eax,[ebx+extvar wrt ..got]
+
+This loads the address of \c{extvar} out of an entry in the GOT. The
+linker, when it builds the shared library, collects together every
+relocation of type \c{..got}, and builds the GOT so as to ensure it
+has every necessary entry present.
+
+Common variables must also be accessed in this way.
+
+\S{picglobal} Exporting Symbols to the Library User
+
+If you want to export symbols to the user of the library, you have
+to declare whether they are functions or data, and if they are data,
+you have to give the size of the data item. This is because the
+dynamic linker has to build \I{PLT}\i{procedure linkage table}
+entries for any exported functions, and also moves exported data
+items away from the library's data section in which they were
+declared.
+
+So to export a function to users of the library, you must use
+
+\c           global func:function   ; declare it as a function
+\c func:     push ebp
+\c           ; etc.
+
+And to export a data item such as an array, you would have to code
+
+\c           global array:data array.end-array ; give the size too
+\c array:    resd 128
+\c .end:
+
+Be careful: If you export a variable to the library user, by
+declaring it as \c{GLOBAL} and supplying a size, the variable will
+end up living in the data section of the main program, rather than
+in your library's data section, where you declared it. So you will
+have to access your own global variable with the \c{..got} mechanism
+rather than \c{..gotoff}, as if it were external (which,
+effectively, it has become).
+
+Equally, if you need to store the address of an exported global in
+one of your data sections, you can't do it by means of the standard
+sort of code:
+
+\c dataptr:  dd global_data_item    ; WRONG
+
+NASM will interpret this code as an ordinary relocation, in which
+\c{global_data_item} is merely an offset from the beginning of the
+\c{.data} section (or whatever); so this reference will end up
+pointing at your data section instead of at the exported global
+which resides elsewhere.
+
+Instead of the above code, then, you must write
+
+\c dataptr:  dd global_data_item wrt ..sym
+
+which makes use of the special \c{WRT} type \I\c{WRT ..sym}\c{..sym}
+to instruct NASM to search the symbol table for a particular symbol
+at that address, rather than just relocating by section base.
+
+Either method will work for functions: referring to one of your
+functions by means of
+
+\c funcptr:  dd my_function
+
+will give the user the address of the code you wrote, whereas
+
+\c funcptr:  dd my_function wrt ..sym
+
+will give the address of the procedure linkage table for the
+function, which is where the calling program will \e{believe} the
+function lives. Either address is a valid way to call the function.
+
+\S{picproc} Calling Procedures Outside the Library
+
+Calling procedures outside your shared library has to be done by
+means of a \i\e{procedure linkage table}, or \i{PLT}. The PLT is
+placed at a known offset from where the library is loaded, so the
+library code can make calls to the PLT in a position-independent
+way. Within the PLT there is code to jump to offsets contained in
+the GOT, so function calls to other shared libraries or to routines
+in the main program can be transparently passed off to their real
+destinations.
+
+To call an external routine, you must use another special PIC
+relocation type, \I{PLT relocations}\i\c{WRT ..plt}. This is much
+easier than the GOT-based ones: you simply replace calls such as
+\c{CALL printf} with the PLT-relative version \c{CALL printf WRT
+..plt}.
+
+\S{link} Generating the Library File
+
+Having written some code modules and assembled them to \c{.o} files,
+you then generate your shared library with a command such as
+
+\c ld -shared -o library.so module1.o module2.o       # for ELF
+\c ld -Bshareable -o library.so module1.o module2.o   # for BSD
+
+For ELF, if your shared library is going to reside in system
+directories such as \c{/usr/lib} or \c{/lib}, it is usually worth
+using the \i\c{-soname} flag to the linker, to store the final
+library file name, with a version number, into the library:
+
+\c ld -shared -soname library.so.1 -o library.so.1.2 *.o
+
+You would then copy \c{library.so.1.2} into the library directory,
+and create \c{library.so.1} as a symbolic link to it.
+
+\C{mixsize} Mixing 16 and 32 Bit Code
+
+This chapter tries to cover some of the issues, largely related to
+unusual forms of addressing and jump instructions, encountered when
+writing operating system code such as protected-mode initialisation
+routines, which require code that operates in mixed segment sizes,
+such as code in a 16-bit segment trying to modify data in a 32-bit
+one, or jumps between different-size segments.
+
+\H{mixjump} Mixed-Size Jumps\I{jumps, mixed-size}
+
+\I{operating system, writing}\I{writing operating systems}The most
+common form of \i{mixed-size instruction} is the one used when
+writing a 32-bit OS: having done your setup in 16-bit mode, such as
+loading the kernel, you then have to boot it by switching into
+protected mode and jumping to the 32-bit kernel start address. In a
+fully 32-bit OS, this tends to be the \e{only} mixed-size
+instruction you need, since everything before it can be done in pure
+16-bit code, and everything after it can be pure 32-bit.
+
+This jump must specify a 48-bit far address, since the target
+segment is a 32-bit one. However, it must be assembled in a 16-bit
+segment, so just coding, for example,
+
+\c           jmp 0x1234:0x56789ABC  ; wrong!
+
+will not work, since the offset part of the address will be
+truncated to \c{0x9ABC} and the jump will be an ordinary 16-bit far
+one.
+
+The Linux kernel setup code gets round the inability of \c{as86} to
+generate the required instruction by coding it manually, using
+\c{DB} instructions. NASM can go one better than that, by actually
+generating the right instruction itself. Here's how to do it right:
+
+\c           jmp dword 0x1234:0x56789ABC  ; right
+
+\I\c{JMP DWORD}The \c{DWORD} prefix (strictly speaking, it should
+come \e{after} the colon, since it is declaring the \e{offset} field
+to be a doubleword; but NASM will accept either form, since both are
+unambiguous) forces the offset part to be treated as 32 bits long,
+on the assumption that you are deliberately writing a jump from a
+16-bit segment to a 32-bit one.
+
+You can do the reverse operation, jumping from a 32-bit segment to a
+16-bit one, by means of the \c{WORD} prefix:
+
+\c           jmp word 0x8765:0x4321 ; 32 to 16 bit
+
+If the \c{WORD} prefix is specified in 16-bit mode, or the \c{DWORD}
+prefix in 32-bit mode, they will be ignored, since each is
+explicitly forcing NASM into a mode it was in anyway.
+
+\H{mixaddr} Addressing Between Different-Size Segments\I{addressing,
+mixed-size}\I{mixed-size addressing}
+
+If your OS is mixed 16 and 32-bit, or if you are writing a DOS
+extender, you are likely to have to deal with some 16-bit segments
+and some 32-bit ones. At some point, you will probably end up
+writing code in a 16-bit segment which has to access data in a
+32-bit segment, or vice versa.
+
+If the data you are trying to access in a 32-bit segment lies within
+the first 64K of the segment, you may be able to get away with using
+an ordinary 16-bit addressing operation for the purpose; but sooner
+or later, you will want to do 32-bit addressing from 16-bit mode.
+
+The easiest way to do this is to make sure you use a register for
+the address, since any effective address containing a 32-bit
+register is forced to be a 32-bit address. So you can do
+
+\c           mov eax,offset_into_32_bit_segment_specified_by_fs
+\c           mov dword [fs:eax],0x11223344
+
+This is fine, but slightly cumbersome (since it wastes an
+instruction and a register) if you already know the precise offset
+you are aiming at. The x86 architecture does allow 32-bit effective
+addresses to specify nothing but a 4-byte offset, so why shouldn't
+NASM be able to generate the best instruction for the purpose?
+
+It can. As in \k{mixjump}, you need only prefix the address with the
+\c{DWORD} keyword, and it will be forced to be a 32-bit address:
+
+\c           mov dword [fs:dword my_offset],0x11223344
+
+Also as in \k{mixjump}, NASM is not fussy about whether the
+\c{DWORD} prefix comes before or after the segment override, so
+arguably a nicer-looking way to code the above instruction is
+
+\c           mov dword [dword fs:my_offset],0x11223344
+
+Don't confuse the \c{DWORD} prefix \e{outside} the square brackets,
+which controls the size of the data stored at the address, with the
+one \e{inside} the square brackets which controls the length of the
+address itself. The two can quite easily be different:
+
+\c           mov word [dword 0x12345678],0x9ABC
+
+This moves 16 bits of data to an address specified by a 32-bit
+offset.
+
+You can also specify \c{WORD} or \c{DWORD} prefixes along with the
+\c{FAR} prefix to indirect far jumps or calls. For example:
+
+\c           call dword far [fs:word 0x4321]
+
+This instruction contains an address specified by a 16-bit offset;
+it loads a 48-bit far pointer from that (16-bit segment and 32-bit
+offset), and calls that address.
+
+\H{mixother} Other Mixed-Size Instructions
+
+The other way you might want to access data might be using the
+string instructions (\c{LODSx}, \c{STOSx} and so on) or the
+\c{XLATB} instruction. These instructions, since they take no
+parameters, might seem to have no easy way to make them perform
+32-bit addressing when assembled in a 16-bit segment.
+
+This is the purpose of NASM's \i\c{a16} and \i\c{a32} prefixes. If
+you are coding \c{LODSB} in a 16-bit segment but it is supposed to
+be accessing a string in a 32-bit segment, you should load the
+desired address into \c{ESI} and then code
+
+\c           a32 lodsb
+
+The prefix forces the addressing size to 32 bits, meaning that
+\c{LODSB} loads from \c{[DS:ESI]} instead of \c{[DS:SI]}. To access
+a string in a 16-bit segment when coding in a 32-bit one, the
+corresponding \c{a16} prefix can be used.
+
+The \c{a16} and \c{a32} prefixes can be applied to any instruction
+in NASM's instruction table, but most of them can generate all the
+useful forms without them. The prefixes are necessary only for
+instructions with implicit addressing: \c{CMPSx} (\k{insCMPSB}),
+\c{SCASx} (\k{insSCASB}), \c{LODSx} (\k{insLODSB}), \c{STOSx}
+(\k{insSTOSB}), \c{MOVSx} (\k{insMOVSB}), \c{INSx} (\k{insINSB}),
+\c{OUTSx} (\k{insOUTSB}), and \c{XLATB} (\k{insXLATB}). Also, the
+various push and pop instructions (\c{PUSHA} and \c{POPF} as well as
+the more usual \c{PUSH} and \c{POP}) can accept \c{a16} or \c{a32}
+prefixes to force a particular one of \c{SP} or \c{ESP} to be used
+as a stack pointer, in case the stack segment in use is a different
+size from the code segment.
+
+\c{PUSH} and \c{POP}, when applied to segment registers in 32-bit
+mode, also have the slightly odd behaviour that they push and pop 4
+bytes at a time, of which the top two are ignored and the bottom two
+give the value of the segment register being manipulated. To force
+the 16-bit behaviour of segment-register push and pop instructions,
+you can use the operand-size prefix \i\c{o16}:
+
+\c           o16 push ss
+\c           o16 push ds
+
+This code saves a doubleword of stack space by fitting two segment
+registers into the space which would normally be consumed by pushing
+one.
+
+(You can also use the \i\c{o32} prefix to force the 32-bit behaviour
+when in 16-bit mode, but this seems less useful.)
+
+\C{trouble} Troubleshooting
+
+This chapter describes some of the common problems that users have
+been known to encounter with NASM, and answers them. It also gives
+instructions for reporting bugs in NASM if you find a difficulty
+that isn't listed here.
+
+\H{problems} Common Problems
+
+\S{inefficient} NASM Generates \i{Inefficient Code}
+
+I get a lot of `bug' reports about NASM generating inefficient, or
+even `wrong', code on instructions such as \c{ADD ESP,8}. This is a
+deliberate design feature, connected to predictability of output:
+NASM, on seeing \c{ADD ESP,8}, will generate the form of the
+instruction which leaves room for a 32-bit immediate operand. You
+need to code \I\c{BYTE}\c{ADD ESP,BYTE 8} if you want the
+space-efficient form of the instruction. This isn't a bug: at worst
+it's a misfeature, and that's a matter of opinion only.
+
+\S{jmprange} My Jumps are Out of Range\I{out of range, jumps}
+
+Similarly, people complain that when they issue \i{conditional
+jumps} (which are \c{SHORT} by default) that try to jump too far,
+NASM reports `short jump out of range' instead of making the jumps
+longer.
+
+This, again, is partly a predictability issue, but in fact has a
+more practical reason as well. NASM has no means of being told what
+type of processor the code it is generating will be run on; so it
+cannot decide for itself that it should generate \i\c{Jcc NEAR} type
+instructions, because it doesn't know that it's working for a 386 or
+above. Alternatively, it could replace the out-of-range short
+\c{JNE} instruction with a very short \c{JE} instruction that jumps
+over a \c{JMP NEAR}; this is a sensible solution for processors
+below a 386, but hardly efficient on processors which have good
+branch prediction \e{and} could have used \c{JNE NEAR} instead. So,
+once again, it's up to the user, not the assembler, to decide what
+instructions should be generated.
+
+\S{proborg} \i\c{ORG} Doesn't Work
+
+People writing \i{boot sector} programs in the \c{bin} format often
+complain that \c{ORG} doesn't work the way they'd like: in order to
+place the \c{0xAA55} signature word at the end of a 512-byte boot
+sector, people who are used to MASM tend to code
+
+\c           ORG 0
+\c           ; some boot sector code
+\c           ORG 510
+\c           DW 0xAA55
+
+This is not the intended use of the \c{ORG} directive in NASM, and
+will not work. The correct way to solve this problem in NASM is to
+use the \i\c{TIMES} directive, like this:
+
+\c           ORG 0
+\c           ; some boot sector code
+\c           TIMES 510-($-$$) DB 0
+\c           DW 0xAA55
+
+The \c{TIMES} directive will insert exactly enough zero bytes into
+the output to move the assembly point up to 510. This method also
+has the advantage that if you accidentally fill your boot sector too
+full, NASM will catch the problem at assembly time and report it, so
+you won't end up with a boot sector that you have to disassemble to
+find out what's wrong with it.
+
+\S{probtimes} \i\c{TIMES} Doesn't Work
+
+The other common problem with the above code is people who write the
+\c{TIMES} line as
+
+\c           TIMES 510-$ DB 0
+
+by reasoning that \c{$} should be a pure number, just like 510, so
+the difference between them is also a pure number and can happily be
+fed to \c{TIMES}.
+
+NASM is a \e{modular} assembler: the various component parts are
+designed to be easily separable for re-use, so they don't exchange
+information unnecessarily. In consequence, the \c{bin} output
+format, even though it has been told by the \c{ORG} directive that
+the \c{.text} section should start at 0, does not pass that
+information back to the expression evaluator. So from the
+evaluator's point of view, \c{$} isn't a pure number: it's an offset
+from a section base. Therefore the difference between \c{$} and 510
+is also not a pure number, but involves a section base. Values
+involving section bases cannot be passed as arguments to \c{TIMES}.
+
+The solution, as in the previous section, is to code the \c{TIMES}
+line in the form
+
+\c           TIMES 510-($-$$) DB 0
+
+in which \c{$} and \c{$$} are offsets from the same section base,
+and so their difference is a pure number. This will solve the
+problem and generate sensible code.
+
+\H{bugs} \i{Bugs}\I{reporting bugs}
+
+We have never yet released a version of NASM with any \e{known}
+bugs. That doesn't usually stop there being plenty we didn't know
+about, though. Any that you find should be reported to
+\W{mailto:anakin@pobox.com}\c{anakin@pobox.com}.
+
+Please read \k{qstart} first, and don't report the bug if it's
+listed in there as a deliberate feature. (If you think the feature
+is badly thought out, feel free to send us reasons why you think it
+should be changed, but don't just send us mail saying `This is a
+bug' if the documentation says we did it on purpose.) Then read
+\k{problems}, and don't bother reporting the bug if it's listed
+there.
+
+If you do report a bug, \e{please} give us all of the following
+information:
+
+\b What operating system you're running NASM under. DOS, Linux,
+NetBSD, Win16, Win32, VMS (I'd be impressed), whatever.
+
+\b If you're running NASM under DOS or Win32, tell us whether you've
+compiled your own executable from the DOS source archive, or whether
+you were using the standard distribution binaries out of the
+archive. If you were using a locally built executable, try to
+reproduce the problem using one of the standard binaries, as this
+will make it easier for us to reproduce your problem prior to fixing
+it.
+
+\b Which version of NASM you're using, and exactly how you invoked
+it. Give us the precise command line, and the contents of the
+\c{NASM} environment variable if any.
+
+\b Which versions of any supplementary programs you're using, and
+how you invoked them. If the problem only becomes visible at link
+time, tell us what linker you're using, what version of it you've
+got, and the exact linker command line. If the problem involves
+linking against object files generated by a compiler, tell us what
+compiler, what version, and what command line or options you used.
+(If you're compiling in an IDE, please try to reproduce the problem
+with the command-line version of the compiler.)
+
+\b If at all possible, send us a NASM source file which exhibits the
+problem. If this causes copyright problems (e.g. you can only
+reproduce the bug in restricted-distribution code) then bear in mind
+the following two points: firstly, we guarantee that any source code
+sent to us for the purposes of debugging NASM will be used \e{only}
+for the purposes of debugging NASM, and that we will delete all our
+copies of it as soon as we have found and fixed the bug or bugs in
+question; and secondly, we would prefer \e{not} to be mailed large
+chunks of code anyway. The smaller the file, the better. A
+three-line sample file that does nothing useful \e{except}
+demonstrate the problem is much easier to work with than a
+fully fledged ten-thousand-line program. (Of course, some errors
+\e{do} only crop up in large files, so this may not be possible.)
+
+\b A description of what the problem actually \e{is}. `It doesn't
+work' is \e{not} a helpful description! Please describe exactly what
+is happening that shouldn't be, or what isn't happening that should.
+Examples might be: `NASM generates an error message saying Line 3
+for an error that's actually on Line 5'; `NASM generates an error
+message that I believe it shouldn't be generating at all'; `NASM
+fails to generate an error message that I believe it \e{should} be
+generating'; `the object file produced from this source code crashes
+my linker'; `the ninth byte of the output file is 66 and I think it
+should be 77 instead'.
+
+\b If you believe the output file from NASM to be faulty, send it to
+us. That allows us to determine whether our own copy of NASM
+generates the same file, or whether the problem is related to
+portability issues between our development platforms and yours. We
+can handle binary files mailed to us as MIME attachments, uuencoded,
+and even BinHex. Alternatively, we may be able to provide an FTP
+site you can upload the suspect files to; but mailing them is easier
+for us.
+
+\b Any other information or data files that might be helpful. If,
+for example, the problem involves NASM failing to generate an object
+file while TASM can generate an equivalent file without trouble,
+then send us \e{both} object files, so we can see what TASM is doing
+differently from us.
+
+\A{iref} Intel x86 Instruction Reference
+
+This appendix provides a complete list of the machine instructions
+which NASM will assemble, and a short description of the function of
+each one.
+
+It is not intended to be exhaustive documentation on the fine
+details of the instructions' function, such as which exceptions they
+can trigger: for such documentation, you should go to Intel's Web
+site, \W{http://www.intel.com}\c{http://www.intel.com}.
+
+Instead, this appendix is intended primarily to provide
+documentation on the way the instructions may be used within NASM.
+For example, looking up \c{LOOP} will tell you that NASM allows
+\c{CX} or \c{ECX} to be specified as an optional second argument to
+the \c{LOOP} instruction, to enforce which of the two possible
+counter registers should be used if the default is not the one
+desired.
+
+The instructions are not quite listed in alphabetical order, since
+groups of instructions with similar functions are lumped together in
+the same entry. Even so, most of them do not end up very far from
+their alphabetical position.
+
+\H{iref-opr} Key to Operand Specifications
+
+The instruction descriptions in this appendix specify their operands
+using the following notation:
+
+\b Registers: \c{reg8} denotes an 8-bit \i{general purpose
+register}, \c{reg16} denotes a 16-bit general purpose register, and
+\c{reg32} a 32-bit one. \c{fpureg} denotes one of the eight FPU
+stack registers, \c{mmxreg} denotes one of the eight 64-bit MMX
+registers, and \c{segreg} denotes a segment register. In addition,
+some registers (such as \c{AL}, \c{DX} or
+\c{ECX}) may be specified explicitly.
+
+\b Immediate operands: \c{imm} denotes a generic \i{immediate operand}.
+\c{imm8}, \c{imm16} and \c{imm32} are used when the operand is
+intended to be a specific size. For some of these instructions, NASM
+needs an explicit specifier: for example, \c{ADD ESP,16} could be
+interpreted as either \c{ADD r/m32,imm32} or \c{ADD r/m32,imm8}.
+NASM chooses the former by default, and so you must specify \c{ADD
+ESP,BYTE 16} for the latter.
+
+\b Memory references: \c{mem} denotes a generic \i{memory reference};
+\c{mem8}, \c{mem16}, \c{mem32}, \c{mem64} and \c{mem80} are used
+when the operand needs to be a specific size. Again, a specifier is
+needed in some cases: \c{DEC [address]} is ambiguous and will be
+rejected by NASM. You must specify \c{DEC BYTE [address]}, \c{DEC
+WORD [address]} or \c{DEC DWORD [address]} instead.
+
+\b \i{Restricted memory references}: one form of the \c{MOV}
+instruction allows a memory address to be specified \e{without}
+allowing the normal range of register combinations and effective
+address processing. This is denoted by \c{memoffs8}, \c{memoffs16}
+and \c{memoffs32}.
+
+\b Register or memory choices: many instructions can accept either a
+register \e{or} a memory reference as an operand. \c{r/m8} is a
+shorthand for \c{reg8/mem8}; similarly \c{r/m16} and \c{r/m32}.
+\c{r/m64} is MMX-related, and is a shorthand for \c{mmxreg/mem64}.
+
+\H{iref-opc} Key to Opcode Descriptions
+
+This appendix also provides the opcodes which NASM will generate for
+each form of each instruction. The opcodes are listed in the
+following way:
+
+\b A hex number, such as \c{3F}, indicates a fixed byte containing
+that number.
+
+\b A hex number followed by \c{+r}, such as \c{C8+r}, indicates that
+one of the operands to the instruction is a register, and the
+`register value' of that register should be added to the hex number
+to produce the generated byte. For example, EDX has register value
+2, so the code \c{C8+r}, when the register operand is EDX, generates
+the hex byte \c{CA}. Register values for specific registers are
+given in \k{iref-rv}.
+
+\b A hex number followed by \c{+cc}, such as \c{40+cc}, indicates
+that the instruction name has a condition code suffix, and the
+numeric representation of the condition code should be added to the
+hex number to produce the generated byte. For example, the code
+\c{40+cc}, when the instruction contains the \c{NE} condition,
+generates the hex byte \c{45}. Condition codes and their numeric
+representations are given in \k{iref-cc}.
+
+\b A slash followed by a digit, such as \c{/2}, indicates that one
+of the operands to the instruction is a memory address or register
+(denoted \c{mem} or \c{r/m}, with an optional size). This is to be
+encoded as an effective address, with a \i{ModR/M byte}, an optional
+\i{SIB byte}, and an optional displacement, and the spare (register)
+field of the ModR/M byte should be the digit given (which will be
+from 0 to 7, so it fits in three bits). The encoding of effective
+addresses is given in \k{iref-ea}.
+
+\b The code \c{/r} combines the above two: it indicates that one of
+the operands is a memory address or \c{r/m}, and another is a
+register, and that an effective address should be generated with the
+spare (register) field in the ModR/M byte being equal to the
+`register value' of the register operand. The encoding of effective
+addresses is given in \k{iref-ea}; register values are given in
+\k{iref-rv}.
+
+\b The codes \c{ib}, \c{iw} and \c{id} indicate that one of the
+operands to the instruction is an immediate value, and that this is
+to be encoded as a byte, little-endian word or little-endian
+doubleword respectively.
+
+\b The codes \c{rb}, \c{rw} and \c{rd} indicate that one of the
+operands to the instruction is an immediate value, and that the
+\e{difference} between this value and the address of the end of the
+instruction is to be encoded as a byte, word or doubleword
+respectively. Where the form \c{rw/rd} appears, it indicates that
+either \c{rw} or \c{rd} should be used according to whether assembly
+is being performed in \c{BITS 16} or \c{BITS 32} state respectively.
+
+\b The codes \c{ow} and \c{od} indicate that one of the operands to
+the instruction is a reference to the contents of a memory address
+specified as an immediate value: this encoding is used in some forms
+of the \c{MOV} instruction in place of the standard
+effective-address mechanism. The displacement is encoded as a word
+or doubleword. Again, \c{ow/od} denotes that \c{ow} or \c{od} should
+be chosen according to the \c{BITS} setting.
+
+\b The codes \c{o16} and \c{o32} indicate that the given form of the
+instruction should be assembled with operand size 16 or 32 bits. In
+other words, \c{o16} indicates a \c{66} prefix in \c{BITS 32} state,
+but generates no code in \c{BITS 16} state; and \c{o32} indicates a
+\c{66} prefix in \c{BITS 16} state but generates nothing in \c{BITS
+32}.
+
+\b The codes \c{a16} and \c{a32}, similarly to \c{o16} and \c{o32},
+indicate the address size of the given form of the instruction.
+Where this does not match the \c{BITS} setting, a \c{67} prefix is
+required.
+
+\S{iref-rv} Register Values
+
+Where an instruction requires a register value, it is already
+implicit in the encoding of the rest of the instruction what type of
+register is intended: an 8-bit general-purpose register, a segment
+register, a debug register, an MMX register, or whatever. Therefore
+there is no problem with registers of different types sharing an
+encoding value.
+
+The encodings for the various classes of register are:
+
+\b 8-bit general registers: \c{AL} is 0, \c{CL} is 1, \c{DL} is 2,
+\c{BL} is 3, \c{AH} is 4, \c{CH} is 5, \c{DH} is 6, and \c{BH} is
+7.
+
+\b 16-bit general registers: \c{AX} is 0, \c{CX} is 1, \c{DX} is 2,
+\c{BX} is 3, \c{SP} is 4, \c{BP} is 5, \c{SI} is 6, and \c{DI} is 7.
+
+\b 32-bit general registers: \c{EAX} is 0, \c{ECX} is 1, \c{EDX} is
+2, \c{EBX} is 3, \c{ESP} is 4, \c{EBP} is 5, \c{ESI} is 6, and
+\c{EDI} is 7.
+
+\b \i{Segment registers}: \c{ES} is 0, \c{CS} is 1, \c{SS} is 2, \c{DS}
+is 3, \c{FS} is 4, and \c{GS} is 5.
+
+\b \I{floating-point, registers}{Floating-point registers}: \c{ST0}
+is 0, \c{ST1} is 1, \c{ST2} is 2, \c{ST3} is 3, \c{ST4} is 4,
+\c{ST5} is 5, \c{ST6} is 6, and \c{ST7} is 7.
+
+\b 64-bit \i{MMX registers}: \c{MM0} is 0, \c{MM1} is 1, \c{MM2} is 2,
+\c{MM3} is 3, \c{MM4} is 4, \c{MM5} is 5, \c{MM6} is 6, and \c{MM7}
+is 7.
+
+\b \i{Control registers}: \c{CR0} is 0, \c{CR2} is 2, \c{CR3} is 3,
+and \c{CR4} is 4.
+
+\b \i{Debug registers}: \c{DR0} is 0, \c{DR1} is 1, \c{DR2} is 2,
+\c{DR3} is 3, \c{DR6} is 6, and \c{DR7} is 7.
+
+\b \i{Test registers}: \c{TR3} is 3, \c{TR4} is 4, \c{TR5} is 5,
+\c{TR6} is 6, and \c{TR7} is 7.
+
+(Note that wherever a register name contains a number, that number
+is also the register value for that register.)
+
+\S{iref-cc} \i{Condition Codes}
+
+The available condition codes are given here, along with their
+numeric representations as part of opcodes. Many of these condition
+codes have synonyms, so several will be listed at a time.
+
+In the following descriptions, the word `either', when applied to two
+possible trigger conditions, is used to mean `either or both'. If
+`either but not both' is meant, the phrase `exactly one of' is used.
+
+\b \c{O} is 0 (trigger if the overflow flag is set); \c{NO} is 1.
+
+\b \c{B}, \c{C} and \c{NAE} are 2 (trigger if the carry flag is
+set); \c{AE}, \c{NB} and \c{NC} are 3.
+
+\b \c{E} and \c{Z} are 4 (trigger if the zero flag is set); \c{NE}
+and \c{NZ} are 5.
+
+\b \c{BE} and \c{NA} are 6 (trigger if either of the carry or zero
+flags is set); \c{A} and \c{NBE} are 7.
+
+\b \c{S} is 8 (trigger if the sign flag is set); \c{NS} is 9.
+
+\b \c{P} and \c{PE} are 10 (trigger if the parity flag is set);
+\c{NP} and \c{PO} are 11.
+
+\b \c{L} and \c{NGE} are 12 (trigger if exactly one of the sign and
+overflow flags is set); \c{GE} and \c{NL} are 13.
+
+\b \c{LE} and \c{NG} are 14 (trigger if either the zero flag is set,
+or exactly one of the sign and overflow flags is set); \c{G} and
+\c{NLE} are 15.
+
+Note that in all cases, the sense of a condition code may be
+reversed by changing the low bit of the numeric representation.
+
+\S{iref-ea} Effective Address Encoding: \i{ModR/M} and \i{SIB}
+
+An \i{effective address} is encoded in up to three parts: a ModR/M
+byte, an optional SIB byte, and an optional byte, word or doubleword
+displacement field.
+
+The ModR/M byte consists of three fields: the \c{mod} field, ranging
+from 0 to 3, in the upper two bits of the byte, the \c{r/m} field,
+ranging from 0 to 7, in the lower three bits, and the spare
+(register) field in the middle (bit 3 to bit 5). The spare field is
+not relevant to the effective address being encoded, and either
+contains an extension to the instruction opcode or the register
+value of another operand.
+
+The ModR/M system can be used to encode a direct register reference
+rather than a memory access. This is always done by setting the
+\c{mod} field to 3 and the \c{r/m} field to the register value of
+the register in question (it must be a general-purpose register, and
+the size of the register must already be implicit in the encoding of
+the rest of the instruction). In this case, the SIB byte and
+displacement field are both absent.
+
+In 16-bit addressing mode (either \c{BITS 16} with no \c{67} prefix,
+or \c{BITS 32} with a \c{67} prefix), the SIB byte is never used.
+The general rules for \c{mod} and \c{r/m} (there is an exception,
+given below) are:
+
+\b The \c{mod} field gives the length of the displacement field: 0
+means no displacement, 1 means one byte, and 2 means two bytes.
+
+\b The \c{r/m} field encodes the combination of registers to be
+added to the displacement to give the accessed address: 0 means
+\c{BX+SI}, 1 means \c{BX+DI}, 2 means \c{BP+SI}, 3 means \c{BP+DI},
+4 means \c{SI} only, 5 means \c{DI} only, 6 means \c{BP} only, and 7
+means \c{BX} only.
+
+However, there is a special case:
+
+\b If \c{mod} is 0 and \c{r/m} is 6, the effective address encoded
+is not \c{[BP]} as the above rules would suggest, but instead
+\c{[disp16]}: the displacement field is present and is two bytes
+long, and no registers are added to the displacement.
+
+Therefore the effective address \c{[BP]} cannot be encoded as
+efficiently as \c{[BX]}; so if you code \c{[BP]} in a program, NASM
+adds a notional 8-bit zero displacement, and sets \c{mod} to 1,
+\c{r/m} to 6, and the one-byte displacement field to 0.
+
+In 32-bit addressing mode (either \c{BITS 16} with a \c{67} prefix,
+or \c{BITS 32} with no \c{67} prefix) the general rules (again,
+there are exceptions) for \c{mod} and \c{r/m} are:
+
+\b The \c{mod} field gives the length of the displacement field: 0
+means no displacement, 1 means one byte, and 2 means four bytes.
+
+\b If only one register is to be added to the displacement, and it
+is not \c{ESP}, the \c{r/m} field gives its register value, and the
+SIB byte is absent. If the \c{r/m} field is 4 (which would encode
+\c{ESP}), the SIB byte is present and gives the combination and
+scaling of registers to be added to the displacement.
+
+If the SIB byte is present, it describes the combination of
+registers (an optional base register, and an optional index register
+scaled by multiplication by 1, 2, 4 or 8) to be added to the
+displacement. The SIB byte is divided into the \c{scale} field, in
+the top two bits, the \c{index} field in the next three, and the
+\c{base} field in the bottom three. The general rules are:
+
+\b The \c{base} field encodes the register value of the base
+register.
+
+\b The \c{index} field encodes the register value of the index
+register, unless it is 4, in which case no index register is used
+(so \c{ESP} cannot be used as an index register).
+
+\b The \c{scale} field encodes the multiplier by which the index
+register is scaled before adding it to the base and displacement: 0
+encodes a multiplier of 1, 1 encodes 2, 2 encodes 4 and 3 encodes 8.
+
+The exceptions to the 32-bit encoding rules are:
+
+\b If \c{mod} is 0 and \c{r/m} is 5, the effective address encoded
+is not \c{[EBP]} as the above rules would suggest, but instead
+\c{[disp32]}: the displacement field is present and is four bytes
+long, and no registers are added to the displacement.
+
+\b If \c{mod} is 0, \c{r/m} is 4 (meaning the SIB byte is present)
+and \c{base} is 5, the effective address encoded is not
+\c{[EBP+index]} as the above rules would suggest, but instead
+\c{[disp32+index]}: the displacement field is present and is four
+bytes long, and there is no base register (but the index register is
+still processed in the normal way).
+
+\H{iref-flg} Key to Instruction Flags
+
+Given along with each instruction in this appendix is a set of
+flags, denoting the type of the instruction. The types are as follows:
+
+\b \c{8086}, \c{186}, \c{286}, \c{386}, \c{486}, \c{PENT} and \c{P6}
+denote the lowest processor type that supports the instruction. Most
+instructions run on all processors above the given type; those that
+do not are documented. The Pentium II contains no additional
+instructions beyond the P6 (Pentium Pro); from the point of view of
+its instruction set, it can be thought of as a P6 with MMX
+capability.
+
+\b \c{CYRIX} indicates that the instruction is specific to Cyrix
+processors, for example the extra MMX instructions in the Cyrix
+extended MMX instruction set.
+
+\b \c{FPU} indicates that the instruction is a floating-point one,
+and will only run on machines with a coprocessor (automatically
+including 486DX, Pentium and above).
+
+\b \c{MMX} indicates that the instruction is an MMX one, and will
+run on MMX-capable Pentium processors and the Pentium II.
+
+\b \c{PRIV} indicates that the instruction is a protected-mode
+management instruction. Many of these may only be used in protected
+mode, or only at privilege level zero.
+
+\b \c{UNDOC} indicates that the instruction is an undocumented one,
+and not part of the official Intel Architecture; it may or may not
+be supported on any given machine.
+
+\H{insAAA} \i\c{AAA}, \i\c{AAS}, \i\c{AAM}, \i\c{AAD}: ASCII
+Adjustments
+
+\c AAA                           ; 37                   [8086]
+
+\c AAS                           ; 3F                   [8086]
+
+\c AAD                           ; D5 0A                [8086]
+\c AAD imm                       ; D5 ib                [8086]
+
+\c AAM                           ; D4 0A                [8086]
+\c AAM imm                       ; D4 ib                [8086]
+
+These instructions are used in conjunction with the add, subtract,
+multiply and divide instructions to perform binary-coded decimal
+arithmetic in \e{unpacked} (one BCD digit per byte - easy to
+translate to and from ASCII, hence the instruction names) form.
+There are also packed BCD instructions \c{DAA} and \c{DAS}: see
+\k{insDAA}.
+
+\c{AAA} should be used after a one-byte \c{ADD} instruction whose
+destination was the \c{AL} register: by means of examining the value
+in the low nibble of \c{AL} and also the auxiliary carry flag
+\c{AF}, it determines whether the addition has overflowed, and
+adjusts it (and sets the carry flag) if so. You can add long BCD
+strings together by doing \c{ADD}/\c{AAA} on the low digits, then
+doing \c{ADC}/\c{AAA} on each subsequent digit.
+
+\c{AAS} works similarly to \c{AAA}, but is for use after \c{SUB}
+instructions rather than \c{ADD}.
+
+\c{AAM} is for use after you have multiplied two decimal digits
+together and left the result in \c{AL}: it divides \c{AL} by ten and
+stores the quotient in \c{AH}, leaving the remainder in \c{AL}. The
+divisor 10 can be changed by specifying an operand to the
+instruction: a particularly handy use of this is \c{AAM 16}, causing
+the two nibbles in \c{AL} to be separated into \c{AH} and \c{AL}.
+
+\c{AAD} performs the inverse operation to \c{AAM}: it multiplies
+\c{AH} by ten, adds it to \c{AL}, and sets \c{AH} to zero. Again,
+the multiplier 10 can be changed.
+
+\H{insADC} \i\c{ADC}: Add with Carry
+
+\c ADC r/m8,reg8                 ; 10 /r                [8086]
+\c ADC r/m16,reg16               ; o16 11 /r            [8086]
+\c ADC r/m32,reg32               ; o32 11 /r            [386]
+
+\c ADC reg8,r/m8                 ; 12 /r                [8086]
+\c ADC reg16,r/m16               ; o16 13 /r            [8086]
+\c ADC reg32,r/m32               ; o32 13 /r            [386]
+
+\c ADC r/m8,imm8                 ; 80 /2 ib             [8086]
+\c ADC r/m16,imm16               ; o16 81 /2 iw         [8086]
+\c ADC r/m32,imm32               ; o32 81 /2 id         [386]
+
+\c ADC r/m16,imm8                ; o16 83 /2 ib         [8086]
+\c ADC r/m32,imm8                ; o32 83 /2 ib         [386]
+
+\c ADC AL,imm8                   ; 14 ib                [8086]
+\c ADC AX,imm16                  ; o16 15 iw            [8086]
+\c ADC EAX,imm32                 ; o32 15 id            [386]
+
+\c{ADC} performs integer addition: it adds its two operands
+together, plus the value of the carry flag, and leaves the result in
+its destination (first) operand. The flags are set according to the
+result of the operation: in particular, the carry flag is affected
+and can be used by a subsequent \c{ADC} instruction.
+
+In the forms with an 8-bit immediate second operand and a longer
+first operand, the second operand is considered to be signed, and is
+sign-extended to the length of the first operand. In these cases,
+the \c{BYTE} qualifier is necessary to force NASM to generate this
+form of the instruction.
+
+To add two numbers without also adding the contents of the carry
+flag, use \c{ADD} (\k{insADD}).
+
+\H{insADD} \i\c{ADD}: Add Integers
+
+\c ADD r/m8,reg8                 ; 00 /r                [8086]
+\c ADD r/m16,reg16               ; o16 01 /r            [8086]
+\c ADD r/m32,reg32               ; o32 01 /r            [386]
+
+\c ADD reg8,r/m8                 ; 02 /r                [8086]
+\c ADD reg16,r/m16               ; o16 03 /r            [8086]
+\c ADD reg32,r/m32               ; o32 03 /r            [386]
+
+\c ADD r/m8,imm8                 ; 80 /0 ib             [8086]
+\c ADD r/m16,imm16               ; o16 81 /0 iw         [8086]
+\c ADD r/m32,imm32               ; o32 81 /0 id         [386]
+
+\c ADD r/m16,imm8                ; o16 83 /0 ib         [8086]
+\c ADD r/m32,imm8                ; o32 83 /0 ib         [386]
+
+\c ADD AL,imm8                   ; 04 ib                [8086]
+\c ADD AX,imm16                  ; o16 05 iw            [8086]
+\c ADD EAX,imm32                 ; o32 05 id            [386]
+
+\c{ADD} performs integer addition: it adds its two operands
+together, and leaves the result in its destination (first) operand.
+The flags are set according to the result of the operation: in
+particular, the carry flag is affected and can be used by a
+subsequent \c{ADC} instruction (\k{insADC}).
+
+In the forms with an 8-bit immediate second operand and a longer
+first operand, the second operand is considered to be signed, and is
+sign-extended to the length of the first operand. In these cases,
+the \c{BYTE} qualifier is necessary to force NASM to generate this
+form of the instruction.
+
+\H{insAND} \i\c{AND}: Bitwise AND
+
+\c AND r/m8,reg8                 ; 20 /r                [8086]
+\c AND r/m16,reg16               ; o16 21 /r            [8086]
+\c AND r/m32,reg32               ; o32 21 /r            [386]
+
+\c AND reg8,r/m8                 ; 22 /r                [8086]
+\c AND reg16,r/m16               ; o16 23 /r            [8086]
+\c AND reg32,r/m32               ; o32 23 /r            [386]
+
+\c AND r/m8,imm8                 ; 80 /4 ib             [8086]
+\c AND r/m16,imm16               ; o16 81 /4 iw         [8086]
+\c AND r/m32,imm32               ; o32 81 /4 id         [386]
+
+\c AND r/m16,imm8                ; o16 83 /4 ib         [8086]
+\c AND r/m32,imm8                ; o32 83 /4 ib         [386]
+
+\c AND AL,imm8                   ; 24 ib                [8086]
+\c AND AX,imm16                  ; o16 25 iw            [8086]
+\c AND EAX,imm32                 ; o32 25 id            [386]
+
+\c{AND} performs a bitwise AND operation between its two operands
+(i.e. each bit of the result is 1 if and only if the corresponding
+bits of the two inputs were both 1), and stores the result in the
+destination (first) operand.
+
+In the forms with an 8-bit immediate second operand and a longer
+first operand, the second operand is considered to be signed, and is
+sign-extended to the length of the first operand. In these cases,
+the \c{BYTE} qualifier is necessary to force NASM to generate this
+form of the instruction.
+
+The MMX instruction \c{PAND} (see \k{insPAND}) performs the same
+operation on the 64-bit MMX registers.
+
+\H{insARPL} \i\c{ARPL}: Adjust RPL Field of Selector
+
+\c ARPL r/m16,reg16              ; 63 /r                [286,PRIV]
+
+\c{ARPL} expects its two word operands to be segment selectors. It
+adjusts the RPL (requested privilege level - stored in the bottom
+two bits of the selector) field of the destination (first) operand
+to ensure that it is no less than (i.e. no more privileged than) the RPL
+field of the source operand. The zero flag is set if and only if a
+change had to be made.
+
+\H{insBOUND} \i\c{BOUND}: Check Array Index against Bounds
+
+\c BOUND reg16,mem               ; o16 62 /r            [186]
+\c BOUND reg32,mem               ; o32 62 /r            [386]
+
+\c{BOUND} expects its second operand to point to an area of memory
+containing two signed values of the same size as its first operand
+(i.e. two words for the 16-bit form; two doublewords for the 32-bit
+form). It performs two signed comparisons: if the value in the
+register passed as its first operand is less than the first of the
+in-memory values, or is greater than the second, it
+throws a BR exception. Otherwise, it does nothing.
+
+\H{insBSF} \i\c{BSF}, \i\c{BSR}: Bit Scan
+
+\c BSF reg16,r/m16               ; o16 0F BC /r         [386]
+\c BSF reg32,r/m32               ; o32 0F BC /r         [386]
+
+\c BSR reg16,r/m16               ; o16 0F BD /r         [386]
+\c BSR reg32,r/m32               ; o32 0F BD /r         [386]
+
+\c{BSF} searches for a set bit in its source (second) operand,
+starting from the bottom, and if it finds one, stores the index in
+its destination (first) operand. If no set bit is found, the
+contents of the destination operand are undefined.
+
+\c{BSR} performs the same function, but searches from the top
+instead, so it finds the most significant set bit.
+
+Bit indices are from 0 (least significant) to 15 or 31 (most
+significant).
+
+\H{insBSWAP} \i\c{BSWAP}: Byte Swap
+
+\c BSWAP reg32                   ; o32 0F C8+r          [486]
+
+\c{BSWAP} swaps the order of the four bytes of a 32-bit register:
+bits 0-7 exchange places with bits 24-31, and bits 8-15 swap with
+bits 16-23. There is no explicit 16-bit equivalent: to byte-swap
+\c{AX}, \c{BX}, \c{CX} or \c{DX}, \c{XCHG} can be used.
+
+\H{insBT} \i\c{BT}, \i\c{BTC}, \i\c{BTR}, \i\c{BTS}: Bit Test
+
+\c BT r/m16,reg16                ; o16 0F A3 /r         [386]
+\c BT r/m32,reg32                ; o32 0F A3 /r         [386]
+\c BT r/m16,imm8                 ; o16 0F BA /4 ib      [386]
+\c BT r/m32,imm8                 ; o32 0F BA /4 ib      [386]
+
+\c BTC r/m16,reg16               ; o16 0F BB /r         [386]
+\c BTC r/m32,reg32               ; o32 0F BB /r         [386]
+\c BTC r/m16,imm8                ; o16 0F BA /7 ib      [386]
+\c BTC r/m32,imm8                ; o32 0F BA /7 ib      [386]
+
+\c BTR r/m16,reg16               ; o16 0F B3 /r         [386]
+\c BTR r/m32,reg32               ; o32 0F B3 /r         [386]
+\c BTR r/m16,imm8                ; o16 0F BA /6 ib      [386]
+\c BTR r/m32,imm8                ; o32 0F BA /6 ib      [386]
+
+\c BTS r/m16,reg16               ; o16 0F AB /r         [386]
+\c BTS r/m32,reg32               ; o32 0F AB /r         [386]
+\c BTS r/m16,imm8                ; o16 0F BA /5 ib      [386]
+\c BTS r/m32,imm8                ; o32 0F BA /5 ib      [386]
+
+These instructions all test one bit of their first operand, whose
+index is given by the second operand, and store the value of that
+bit into the carry flag. Bit indices are from 0 (least significant)
+to 15 or 31 (most significant).
+
+In addition to storing the original value of the bit into the carry
+flag, \c{BTR} also resets (clears) the bit in the operand itself.
+\c{BTS} sets the bit, and \c{BTC} complements the bit. \c{BT} does
+not modify its operands.
+
+The bit offset should be less than the size of the operand in bits.
+
+\H{insCALL} \i\c{CALL}: Call Subroutine
+
+\c CALL imm                      ; E8 rw/rd             [8086]
+\c CALL imm:imm16                ; o16 9A iw iw         [8086]
+\c CALL imm:imm32                ; o32 9A id iw         [386]
+\c CALL FAR mem16                ; o16 FF /3            [8086]
+\c CALL FAR mem32                ; o32 FF /3            [386]
+\c CALL r/m16                    ; o16 FF /2            [8086]
+\c CALL r/m32                    ; o32 FF /2            [386]
+
+\c{CALL} calls a subroutine, by means of pushing the current
+instruction pointer (\c{IP}) and optionally \c{CS} as well on the
+stack, and then jumping to a given address.
+
+\c{CS} is pushed as well as \c{IP} if and only if the call is a far
+call, i.e. a destination segment address is specified in the
+instruction. The forms involving two colon-separated arguments are
+far calls; so are the \c{CALL FAR mem} forms.
+
+You can choose between the two immediate \i{far call} forms (\c{CALL
+imm:imm}) by the use of the \c{WORD} and \c{DWORD} keywords: \c{CALL
+WORD 0x1234:0x5678} or \c{CALL DWORD 0x1234:0x56789abc}.
+
+The \c{CALL FAR mem} forms execute a far call by loading the
+destination address out of memory. The address loaded consists of 16
+or 32 bits of offset (depending on the operand size), and 16 bits of
+segment. The operand size may be overridden using \c{CALL WORD FAR
+mem} or \c{CALL DWORD FAR mem}.
+
+The \c{CALL r/m} forms execute a \i{near call} (within the same
+segment), loading the destination address out of memory or out of a
+register. The keyword \c{NEAR} may be specified, for clarity, in
+these forms, but is not necessary. Again, operand size can be
+overridden using \c{CALL WORD mem} or \c{CALL DWORD mem}.
+
+As a convenience, NASM does not require you to call a far procedure
+symbol by coding the cumbersome \c{CALL SEG routine:routine}, but
+instead allows the easier synonym \c{CALL FAR routine}.
+
+The \c{CALL r/m} forms given above are near calls; NASM will accept
+the \c{NEAR} keyword (e.g. \c{CALL NEAR [address]}), even though it
+is not strictly necessary.
+
+\H{insCBW} \i\c{CBW}, \i\c{CWD}, \i\c{CDQ}, \i\c{CWDE}: Sign Extensions
+
+\c CBW                           ; o16 98               [8086]
+\c CWD                           ; o16 99               [8086]
+\c CDQ                           ; o32 99               [386]
+\c CWDE                          ; o32 98               [386]
+
+All these instructions sign-extend a short value into a longer one,
+by replicating the top bit of the original value to fill the
+extended one.
+
+\c{CBW} extends \c{AL} into \c{AX} by repeating the top bit of
+\c{AL} in every bit of \c{AH}. \c{CWD} extends \c{AX} into \c{DX:AX}
+by repeating the top bit of \c{AX} throughout \c{DX}. \c{CWDE}
+extends \c{AX} into \c{EAX}, and \c{CDQ} extends \c{EAX} into
+\c{EDX:EAX}.
+
+\H{insCLC} \i\c{CLC}, \i\c{CLD}, \i\c{CLI}, \i\c{CLTS}: Clear Flags
+
+\c CLC                           ; F8                   [8086]
+\c CLD                           ; FC                   [8086]
+\c CLI                           ; FA                   [8086]
+\c CLTS                          ; 0F 06                [286,PRIV]
+
+These instructions clear various flags. \c{CLC} clears the carry
+flag; \c{CLD} clears the direction flag; \c{CLI} clears the
+interrupt flag (thus disabling interrupts); and \c{CLTS} clears the
+task-switched (\c{TS}) flag in \c{CR0}.
+
+To set the carry, direction, or interrupt flags, use the \c{STC},
+\c{STD} and \c{STI} instructions (\k{insSTC}). To invert the carry
+flag, use \c{CMC} (\k{insCMC}).
+
+\H{insCMC} \i\c{CMC}: Complement Carry Flag
+
+\c CMC                           ; F5                   [8086]
+
+\c{CMC} changes the value of the carry flag: if it was 0, it sets it
+to 1, and vice versa.
+
+\H{insCMOVcc} \i\c{CMOVcc}: Conditional Move
+
+\c CMOVcc reg16,r/m16            ; o16 0F 40+cc /r      [P6]
+\c CMOVcc reg32,r/m32            ; o32 0F 40+cc /r      [P6]
+
+\c{CMOV} moves its source (second) operand into its destination
+(first) operand if the given condition code is satisfied; otherwise
+it does nothing.
+
+For a list of condition codes, see \k{iref-cc}.
+
+Although the \c{CMOV} instructions are flagged \c{P6} above, they
+may not be supported by all Pentium Pro processors; the \c{CPUID}
+instruction (\k{insCPUID}) will return a bit which indicates whether
+conditional moves are supported.
+
+\H{insCMP} \i\c{CMP}: Compare Integers
+
+\c CMP r/m8,reg8                 ; 38 /r                [8086]
+\c CMP r/m16,reg16               ; o16 39 /r            [8086]
+\c CMP r/m32,reg32               ; o32 39 /r            [386]
+
+\c CMP reg8,r/m8                 ; 3A /r                [8086]
+\c CMP reg16,r/m16               ; o16 3B /r            [8086]
+\c CMP reg32,r/m32               ; o32 3B /r            [386]
+
+\c CMP r/m8,imm8                 ; 80 /7 ib             [8086]
+\c CMP r/m16,imm16               ; o16 81 /7 iw         [8086]
+\c CMP r/m32,imm32               ; o32 81 /7 id         [386]
+
+\c CMP r/m16,imm8                ; o16 83 /7 ib         [8086]
+\c CMP r/m32,imm8                ; o32 83 /7 ib         [386]
+
+\c CMP AL,imm8                   ; 3C ib                [8086]
+\c CMP AX,imm16                  ; o16 3D iw            [8086]
+\c CMP EAX,imm32                 ; o32 3D id            [386]
+
+\c{CMP} performs a `mental' subtraction of its second operand from
+its first operand, and affects the flags as if the subtraction had
+taken place, but does not store the result of the subtraction
+anywhere.
+
+In the forms with an 8-bit immediate second operand and a longer
+first operand, the second operand is considered to be signed, and is
+sign-extended to the length of the first operand. In these cases,
+the \c{BYTE} qualifier is necessary to force NASM to generate this
+form of the instruction.
+
+\H{insCMPSB} \i\c{CMPSB}, \i\c{CMPSW}, \i\c{CMPSD}: Compare Strings
+
+\c CMPSB                         ; A6                   [8086]
+\c CMPSW                         ; o16 A7               [8086]
+\c CMPSD                         ; o32 A7               [386]
+
+\c{CMPSB} compares the byte at \c{[DS:SI]} or \c{[DS:ESI]} with the
+byte at \c{[ES:DI]} or \c{[ES:EDI]}, and sets the flags accordingly.
+It then increments or decrements (depending on the direction flag:
+increments if the flag is clear, decrements if it is set) \c{SI} and
+\c{DI} (or \c{ESI} and \c{EDI}).
+
+The registers used are \c{SI} and \c{DI} if the address size is 16
+bits, and \c{ESI} and \c{EDI} if it is 32 bits. If you need to use
+an address size not equal to the current \c{BITS} setting, you can
+use an explicit \i\c{a16} or \i\c{a32} prefix.
+
+The segment register used to load from \c{[SI]} or \c{[ESI]} can be
+overridden by using a segment register name as a prefix (for
+example, \c{es cmpsb}). The use of \c{ES} for the load from \c{[DI]}
+or \c{[EDI]} cannot be overridden.
+
+\c{CMPSW} and \c{CMPSD} work in the same way, but they compare a
+word or a doubleword instead of a byte, and increment or decrement
+the addressing registers by 2 or 4 instead of 1.
+
+The \c{REPE} and \c{REPNE} prefixes (equivalently, \c{REPZ} and
+\c{REPNZ}) may be used to repeat the instruction up to \c{CX} (or
+\c{ECX} - again, the address size chooses which) times until the
+first unequal or equal byte is found.
+
+\H{insCMPXCHG} \i\c{CMPXCHG}, \i\c{CMPXCHG486}: Compare and Exchange
+
+\c CMPXCHG r/m8,reg8             ; 0F B0 /r             [PENT]
+\c CMPXCHG r/m16,reg16           ; o16 0F B1 /r         [PENT]
+\c CMPXCHG r/m32,reg32           ; o32 0F B1 /r         [PENT]
+
+\c CMPXCHG486 r/m8,reg8          ; 0F A6 /r             [486,UNDOC]
+\c CMPXCHG486 r/m16,reg16        ; o16 0F A7 /r         [486,UNDOC]
+\c CMPXCHG486 r/m32,reg32        ; o32 0F A7 /r         [486,UNDOC]
+
+These two instructions perform exactly the same operation; however,
+apparently some (not all) 486 processors support it under a
+non-standard opcode, so NASM provides the undocumented
+\c{CMPXCHG486} form to generate the non-standard opcode.
+
+\c{CMPXCHG} compares its destination (first) operand to the value in
+\c{AL}, \c{AX} or \c{EAX} (depending on the size of the
+instruction). If they are equal, it copies its source (second)
+operand into the destination and sets the zero flag. Otherwise, it
+clears the zero flag and leaves the destination alone.
+
+\c{CMPXCHG} is intended to be used for atomic operations in
+multitasking or multiprocessor environments. To safely update a
+value in shared memory, for example, you might load the value into
+\c{EAX}, load the updated value into \c{EBX}, and then execute the
+instruction \c{lock cmpxchg [value],ebx}. If \c{value} has not
+changed since being loaded, it is updated with your desired new
+value, and the zero flag is set to let you know it has worked. (The
+\c{LOCK} prefix prevents another processor doing anything in the
+middle of this operation: it guarantees atomicity.) However, if
+another processor has modified the value in between your load and
+your attempted store, the store does not happen, and you are
+notified of the failure by a cleared zero flag, so you can go round
+and try again.
+
+\H{insCMPXCHG8B} \i\c{CMPXCHG8B}: Compare and Exchange Eight Bytes
+
+\c CMPXCHG8B mem                 ; 0F C7 /1             [PENT]
+
+This is a larger and more unwieldy version of \c{CMPXCHG}: it
+compares the 64-bit (eight-byte) value stored at \c{[mem]} with the
+value in \c{EDX:EAX}. If they are equal, it sets the zero flag and
+stores \c{ECX:EBX} into the memory area. If they are unequal, it
+clears the zero flag and leaves the memory area untouched.
+
+\H{insCPUID} \i\c{CPUID}: Get CPU Identification Code
+
+\c CPUID                         ; 0F A2                [PENT]
+
+\c{CPUID} returns various information about the processor it is
+being executed on. It fills the four registers \c{EAX}, \c{EBX},
+\c{ECX} and \c{EDX} with information, which varies depending on the
+input contents of \c{EAX}.
+
+\c{CPUID} also acts as a barrier to serialise instruction execution:
+executing the \c{CPUID} instruction guarantees that all the effects
+(memory modification, flag modification, register modification) of
+previous instructions have been completed before the next
+instruction gets fetched.
+
+The information returned is as follows:
+
+\b If \c{EAX} is zero on input, \c{EAX} on output holds the maximum
+acceptable input value of \c{EAX}, and \c{EBX:EDX:ECX} contain the
+string \c{"GenuineIntel"} (or not, if you have a clone processor).
+That is to say, \c{EBX} contains \c{"Genu"} (in NASM's own sense of
+character constants, described in \k{chrconst}), \c{EDX} contains
+\c{"ineI"} and \c{ECX} contains \c{"ntel"}.
+
+\b If \c{EAX} is one on input, \c{EAX} on output contains version
+information about the processor, and \c{EDX} contains a set of
+feature flags, showing the presence and absence of various features.
+For example, bit 8 is set if the \c{CMPXCHG8B} instruction
+(\k{insCMPXCHG8B}) is supported, bit 15 is set if the conditional
+move instructions (\k{insCMOVcc} and \k{insFCMOVB}) are supported,
+and bit 23 is set if MMX instructions are supported.
+
+\b If \c{EAX} is two on input, \c{EAX}, \c{EBX}, \c{ECX} and \c{EDX}
+all contain information about caches and TLBs (Translation Lookaside
+Buffers).
+
+For more information on the data returned from \c{CPUID}, see the
+documentation on Intel's web site.
+
+\H{insDAA} \i\c{DAA}, \i\c{DAS}: Decimal Adjustments
+
+\c DAA                           ; 27                   [8086]
+\c DAS                           ; 2F                   [8086]
+
+These instructions are used in conjunction with the add and subtract
+instructions to perform binary-coded decimal arithmetic in
+\e{packed} (one BCD digit per nibble) form. For the unpacked
+equivalents, see \k{insAAA}.
+
+\c{DAA} should be used after a one-byte \c{ADD} instruction whose
+destination was the \c{AL} register: by means of examining the value
+in \c{AL} and also the auxiliary carry flag \c{AF}, it
+determines whether either digit of the addition has overflowed, and
+adjusts it (and sets the carry and auxiliary-carry flags) if so. You
+can add long BCD strings together by doing \c{ADD}/\c{DAA} on the
+low two digits, then doing \c{ADC}/\c{DAA} on each subsequent pair
+of digits.
+
+\c{DAS} works similarly to \c{DAA}, but is for use after \c{SUB}
+instructions rather than \c{ADD}.
+
+\H{insDEC} \i\c{DEC}: Decrement Integer
+
+\c DEC reg16                     ; o16 48+r             [8086]
+\c DEC reg32                     ; o32 48+r             [386]
+\c DEC r/m8                      ; FE /1                [8086]
+\c DEC r/m16                     ; o16 FF /1            [8086]
+\c DEC r/m32                     ; o32 FF /1            [386]
+
+\c{DEC} subtracts 1 from its operand. It does \e{not} affect the
+carry flag: to affect the carry flag, use \c{SUB something,1} (see
+\k{insSUB}). See also \c{INC} (\k{insINC}).
+
+\H{insDIV} \i\c{DIV}: Unsigned Integer Divide
+
+\c DIV r/m8                      ; F6 /6                [8086]
+\c DIV r/m16                     ; o16 F7 /6            [8086]
+\c DIV r/m32                     ; o32 F7 /6            [386]
+
+\c{DIV} performs unsigned integer division. The explicit operand
+provided is the divisor; the dividend and destination operands are
+implicit, in the following way:
+
+\b For \c{DIV r/m8}, \c{AX} is divided by the given operand; the
+quotient is stored in \c{AL} and the remainder in \c{AH}.
+
+\b For \c{DIV r/m16}, \c{DX:AX} is divided by the given operand; the
+quotient is stored in \c{AX} and the remainder in \c{DX}.
+
+\b For \c{DIV r/m32}, \c{EDX:EAX} is divided by the given operand;
+the quotient is stored in \c{EAX} and the remainder in \c{EDX}.
+
+Signed integer division is performed by the \c{IDIV} instruction:
+see \k{insIDIV}.
+
+\H{insEMMS} \i\c{EMMS}: Empty MMX State
+
+\c EMMS                          ; 0F 77                [PENT,MMX]
+
+\c{EMMS} sets the FPU tag word (marking which floating-point
+registers are available) to all ones, meaning all registers are
+available for the FPU to use. It should be used after executing MMX
+instructions and before executing any subsequent floating-point
+operations.
+
+\H{insENTER} \i\c{ENTER}: Create Stack Frame
+
+\c ENTER imm,imm                 ; C8 iw ib             [186]
+
+\c{ENTER} constructs a stack frame for a high-level language
+procedure call. The first operand (the \c{iw} in the opcode
+definition above refers to the first operand) gives the amount of
+stack space to allocate for local variables; the second (the \c{ib}
+above) gives the nesting level of the procedure (for languages like
+Pascal, with nested procedures).
+
+The function of \c{ENTER}, with a nesting level of zero, is
+equivalent to
+
+\c           PUSH EBP            ; or PUSH BP         in 16 bits
+\c           MOV EBP,ESP         ; or MOV BP,SP       in 16 bits
+\c           SUB ESP,operand1    ; or SUB SP,operand1 in 16 bits
+
+This creates a stack frame with the procedure parameters accessible
+upwards from \c{EBP}, and local variables accessible downwards from
+\c{EBP}.
+
+With a nesting level of one, the stack frame created is 4 (or 2)
+bytes bigger, and the value of the final frame pointer \c{EBP} is
+accessible in memory at \c{[EBP-4]}.
+
+This allows \c{ENTER}, when called with a nesting level of two, to
+look at the stack frame described by the \e{previous} value of
+\c{EBP}, find the frame pointer at offset -4 from that, and push it
+along with its new frame pointer, so that when a level-two procedure
+is called from within a level-one procedure, \c{[EBP-4]} holds the
+frame pointer of the most recent level-one procedure call and
+\c{[EBP-8]} holds that of the most recent level-two call. And so on,
+for nesting levels up to 31.
+
+Stack frames created by \c{ENTER} can be destroyed by the \c{LEAVE}
+instruction: see \k{insLEAVE}.
+
+\H{insF2XM1} \i\c{F2XM1}: Calculate 2**X-1
+
+\c F2XM1                         ; D9 F0                [8086,FPU]
+
+\c{F2XM1} raises 2 to the power of \c{ST0}, subtracts one, and
+stores the result back into \c{ST0}. The initial contents of \c{ST0}
+must be a number in the range -1 to +1.
+
+\H{insFABS} \i\c{FABS}: Floating-Point Absolute Value
+
+\c FABS                          ; D9 E1                [8086,FPU]
+
+\c{FABS} computes the absolute value of \c{ST0}, storing the result
+back in \c{ST0}.
+
+\H{insFADD} \i\c{FADD}, \i\c{FADDP}: Floating-Point Addition
+
+\c FADD mem32                    ; D8 /0                [8086,FPU]
+\c FADD mem64                    ; DC /0                [8086,FPU]
+
+\c FADD fpureg                   ; D8 C0+r              [8086,FPU]
+\c FADD ST0,fpureg               ; D8 C0+r              [8086,FPU]
+
+\c FADD TO fpureg                ; DC C0+r              [8086,FPU]
+\c FADD fpureg,ST0               ; DC C0+r              [8086,FPU]
+
+\c FADDP fpureg                  ; DE C0+r              [8086,FPU]
+\c FADDP fpureg,ST0              ; DE C0+r              [8086,FPU]
+
+\c{FADD}, given one operand, adds the operand to \c{ST0} and stores
+the result back in \c{ST0}. If the operand has the \c{TO} modifier,
+the result is stored in the register given rather than in \c{ST0}.
+
+\c{FADDP} performs the same function as \c{FADD TO}, but pops the
+register stack after storing the result.
+
+The given two-operand forms are synonyms for the one-operand forms.
+
+\H{insFBLD} \i\c{FBLD}, \i\c{FBSTP}: BCD Floating-Point Load and Store
+
+\c FBLD mem80                    ; DF /4                [8086,FPU]
+\c FBSTP mem80                   ; DF /6                [8086,FPU]
+
+\c{FBLD} loads an 80-bit (ten-byte) packed binary-coded decimal
+number from the given memory address, converts it to a real, and
+pushes it on the register stack. \c{FBSTP} stores the value of
+\c{ST0}, in packed BCD, at the given address and then pops the
+register stack.
+
+\H{insFCHS} \i\c{FCHS}: Floating-Point Change Sign
+
+\c FCHS                          ; D9 E0                [8086,FPU]
+
+\c{FCHS} negates the number in \c{ST0}: negative numbers become
+positive, and vice versa.
+
+\H{insFCLEX} \i\c{FCLEX}, \i\c{FNCLEX}: Clear Floating-Point Exceptions
+
+\c FCLEX                         ; 9B DB E2             [8086,FPU]
+\c FNCLEX                        ; DB E2                [8086,FPU]
+
+\c{FCLEX} clears any floating-point exceptions which may be pending.
+\c{FNCLEX} does the same thing but doesn't wait for previous
+floating-point operations (including the \e{handling} of pending
+exceptions) to finish first.
+
+\H{insFCMOVB} \i\c{FCMOVcc}: Floating-Point Conditional Move
+
+\c FCMOVB fpureg                 ; DA C0+r              [P6,FPU]
+\c FCMOVB ST0,fpureg             ; DA C0+r              [P6,FPU]
+
+\c FCMOVBE fpureg                ; DA D0+r              [P6,FPU]
+\c FCMOVBE ST0,fpureg            ; DA D0+r              [P6,FPU]
+
+\c FCMOVE fpureg                 ; DA C8+r              [P6,FPU]
+\c FCMOVE ST0,fpureg             ; DA C8+r              [P6,FPU]
+
+\c FCMOVNB fpureg                ; DB C0+r              [P6,FPU]
+\c FCMOVNB ST0,fpureg            ; DB C0+r              [P6,FPU]
+
+\c FCMOVNBE fpureg               ; DB D0+r              [P6,FPU]
+\c FCMOVNBE ST0,fpureg           ; DB D0+r              [P6,FPU]
+
+\c FCMOVNE fpureg                ; DB C8+r              [P6,FPU]
+\c FCMOVNE ST0,fpureg            ; DB C8+r              [P6,FPU]
+
+\c FCMOVNU fpureg                ; DB D8+r              [P6,FPU]
+\c FCMOVNU ST0,fpureg            ; DB D8+r              [P6,FPU]
+
+\c FCMOVU fpureg                 ; DA D8+r              [P6,FPU]
+\c FCMOVU ST0,fpureg             ; DA D8+r              [P6,FPU]
+
+The \c{FCMOV} instructions perform conditional move operations: each
+of them moves the contents of the given register into \c{ST0} if its
+condition is satisfied, and does nothing if not.
+
+The conditions are not the same as the standard condition codes used
+with conditional jump instructions. The conditions \c{B}, \c{BE},
+\c{NB}, \c{NBE}, \c{E} and \c{NE} are exactly as normal, but none of
+the other standard ones are supported. Instead, the condition \c{U}
+and its counterpart \c{NU} are provided; the \c{U} condition is
+satisfied if the last two floating-point numbers compared were
+\e{unordered}, i.e. they were not equal but neither one could be
+said to be greater than the other, for example if they were NaNs.
+(The flag state which signals this is the setting of the parity
+flag: so the \c{U} condition is notionally equivalent to \c{PE}, and
+\c{NU} is equivalent to \c{PO}.)
+
+The \c{FCMOV} conditions test the main processor's status flags, not
+the FPU status flags, so using \c{FCMOV} directly after \c{FCOM}
+will not work. Instead, you should either use \c{FCOMI} which writes
+directly to the main CPU flags word, or use \c{FSTSW} to extract the
+FPU flags.
+
+Although the \c{FCMOV} instructions are flagged \c{P6} above, they
+may not be supported by all Pentium Pro processors; the \c{CPUID}
+instruction (\k{insCPUID}) will return a bit which indicates whether
+conditional moves are supported.
+
+\H{insFCOM} \i\c{FCOM}, \i\c{FCOMP}, \i\c{FCOMPP}, \i\c{FCOMI}, \i\c{FCOMIP}: Floating-Point Compare
+
+\c FCOM mem32                    ; D8 /2                [8086,FPU]
+\c FCOM mem64                    ; DC /2                [8086,FPU]
+\c FCOM fpureg                   ; D8 D0+r              [8086,FPU]
+\c FCOM ST0,fpureg               ; D8 D0+r              [8086,FPU]
+
+\c FCOMP mem32                   ; D8 /3                [8086,FPU]
+\c FCOMP mem64                   ; DC /3                [8086,FPU]
+\c FCOMP fpureg                  ; D8 D8+r              [8086,FPU]
+\c FCOMP ST0,fpureg              ; D8 D8+r              [8086,FPU]
+
+\c FCOMPP                        ; DE D9                [8086,FPU]
+
+\c FCOMI fpureg                  ; DB F0+r              [P6,FPU]
+\c FCOMI ST0,fpureg              ; DB F0+r              [P6,FPU]
+
+\c FCOMIP fpureg                 ; DF F0+r              [P6,FPU]
+\c FCOMIP ST0,fpureg             ; DF F0+r              [P6,FPU]
+
+\c{FCOM} compares \c{ST0} with the given operand, and sets the FPU
+flags accordingly. \c{ST0} is treated as the left-hand side of the
+comparison, so that the carry flag is set (for a `less-than' result)
+if \c{ST0} is less than the given operand.
+
+\c{FCOMP} does the same as \c{FCOM}, but pops the register stack
+afterwards. \c{FCOMPP} compares \c{ST0} with \c{ST1} and then pops
+the register stack twice.
+
+\c{FCOMI} and \c{FCOMIP} work like the corresponding forms of
+\c{FCOM} and \c{FCOMP}, but write their results directly to the CPU
+flags register rather than the FPU status word, so they can be
+immediately followed by conditional jump or conditional move
+instructions.
+
+The \c{FCOM} instructions differ from the \c{FUCOM} instructions
+(\k{insFUCOM}) only in the way they handle quiet NaNs: \c{FUCOM}
+will handle them silently and set the condition code flags to an
+`unordered' result, whereas \c{FCOM} will generate an exception.
+
+\H{insFCOS} \i\c{FCOS}: Cosine
+
+\c FCOS                          ; D9 FF                [386,FPU]
+
+\c{FCOS} computes the cosine of \c{ST0} (in radians), and stores the
+result in \c{ST0}. See also \c{FSINCOS} (\k{insFSIN}).
+
+\H{insFDECSTP} \i\c{FDECSTP}: Decrement Floating-Point Stack Pointer
+
+\c FDECSTP                       ; D9 F6                [8086,FPU]
+
+\c{FDECSTP} decrements the `top' field in the floating-point status
+word. This has the effect of rotating the FPU register stack by one,
+as if the contents of \c{ST7} had been pushed on the stack. See also
+\c{FINCSTP} (\k{insFINCSTP}).
+
+\H{insFDISI} \i\c{FxDISI}, \i\c{FxENI}: Disable and Enable Floating-Point Interrupts
+
+\c FDISI                         ; 9B DB E1             [8086,FPU]
+\c FNDISI                        ; DB E1                [8086,FPU]
+
+\c FENI                          ; 9B DB E0             [8086,FPU]
+\c FNENI                         ; DB E0                [8086,FPU]
+
+\c{FDISI} and \c{FENI} disable and enable floating-point interrupts.
+These instructions are only meaningful on original 8087 processors:
+the 287 and above treat them as no-operation instructions.
+
+\c{FNDISI} and \c{FNENI} do the same thing as \c{FDISI} and \c{FENI}
+respectively, but without waiting for the floating-point processor
+to finish what it was doing first.
+
+\H{insFDIV} \i\c{FDIV}, \i\c{FDIVP}, \i\c{FDIVR}, \i\c{FDIVRP}: Floating-Point Division
+
+\c FDIV mem32                    ; D8 /6                [8086,FPU]
+\c FDIV mem64                    ; DC /6                [8086,FPU]
+
+\c FDIV fpureg                   ; D8 F0+r              [8086,FPU]
+\c FDIV ST0,fpureg               ; D8 F0+r              [8086,FPU]
+
+\c FDIV TO fpureg                ; DC F8+r              [8086,FPU]
+\c FDIV fpureg,ST0               ; DC F8+r              [8086,FPU]
+
+\c FDIVR mem32                   ; D8 /7                [8086,FPU]
+\c FDIVR mem64                   ; DC /7                [8086,FPU]
+
+\c FDIVR fpureg                  ; D8 F8+r              [8086,FPU]
+\c FDIVR ST0,fpureg              ; D8 F8+r              [8086,FPU]
+
+\c FDIVR TO fpureg               ; DC F0+r              [8086,FPU]
+\c FDIVR fpureg,ST0              ; DC F0+r              [8086,FPU]
+
+\c FDIVP fpureg                  ; DE F8+r              [8086,FPU]
+\c FDIVP fpureg,ST0              ; DE F8+r              [8086,FPU]
+
+\c FDIVRP fpureg                 ; DE F0+r              [8086,FPU]
+\c FDIVRP fpureg,ST0             ; DE F0+r              [8086,FPU]
+
+\c{FDIV} divides \c{ST0} by the given operand and stores the result
+back in \c{ST0}, unless the \c{TO} qualifier is given, in which case
+it divides the given operand by \c{ST0} and stores the result in the
+operand.
+
+\c{FDIVR} does the same thing, but does the division the other way
+up: so if \c{TO} is not given, it divides the given operand by
+\c{ST0} and stores the result in \c{ST0}, whereas if \c{TO} is given
+it divides \c{ST0} by its operand and stores the result in the
+operand.
+
+\c{FDIVP} operates like \c{FDIV TO}, but pops the register stack
+once it has finished. \c{FDIVRP} operates like \c{FDIVR TO}, but
+pops the register stack once it has finished.
+
+\H{insFFREE} \i\c{FFREE}: Flag Floating-Point Register as Unused
+
+\c FFREE fpureg                  ; DD C0+r              [8086,FPU]
+
+\c{FFREE} marks the given register as being empty.
+
+\H{insFIADD} \i\c{FIADD}: Floating-Point/Integer Addition
+
+\c FIADD mem16                   ; DE /0                [8086,FPU]
+\c FIADD mem32                   ; DA /0                [8086,FPU]
+
+\c{FIADD} adds the 16-bit or 32-bit integer stored in the given
+memory location to \c{ST0}, storing the result in \c{ST0}.
+
+\H{insFICOM} \i\c{FICOM}, \i\c{FICOMP}: Floating-Point/Integer Compare
+
+\c FICOM mem16                   ; DE /2                [8086,FPU]
+\c FICOM mem32                   ; DA /2                [8086,FPU]
+
+\c FICOMP mem16                  ; DE /3                [8086,FPU]
+\c FICOMP mem32                  ; DA /3                [8086,FPU]
+
+\c{FICOM} compares \c{ST0} with the 16-bit or 32-bit integer stored
+in the given memory location, and sets the FPU flags accordingly.
+\c{FICOMP} does the same, but pops the register stack afterwards.
+
+\H{insFIDIV} \i\c{FIDIV}, \i\c{FIDIVR}: Floating-Point/Integer Division
+
+\c FIDIV mem16                   ; DE /6                [8086,FPU]
+\c FIDIV mem32                   ; DA /6                [8086,FPU]
+
+\c FIDIVR mem16                  ; DE /7                [8086,FPU]
+\c FIDIVR mem32                  ; DA /7                [8086,FPU]
+
+\c{FIDIV} divides \c{ST0} by the 16-bit or 32-bit integer stored in
+the given memory location, and stores the result in \c{ST0}.
+\c{FIDIVR} does the division the other way up: it divides the
+integer by \c{ST0}, but still stores the result in \c{ST0}.
+
+\H{insFILD} \i\c{FILD}, \i\c{FIST}, \i\c{FISTP}: Floating-Point/Integer Conversion
+
+\c FILD mem16                    ; DF /0                [8086,FPU]
+\c FILD mem32                    ; DB /0                [8086,FPU]
+\c FILD mem64                    ; DF /5                [8086,FPU]
+
+\c FIST mem16                    ; DF /2                [8086,FPU]
+\c FIST mem32                    ; DB /2                [8086,FPU]
+
+\c FISTP mem16                   ; DF /3                [8086,FPU]
+\c FISTP mem32                   ; DB /3                [8086,FPU]
+\c FISTP mem64                   ; DF /7                [8086,FPU]
+
+\c{FILD} loads an integer out of a memory location, converts it to a
+real, and pushes it on the FPU register stack. \c{FIST} converts
+\c{ST0} to an integer and stores that in memory; \c{FISTP} does the
+same as \c{FIST}, but pops the register stack afterwards.
+
+\H{insFIMUL} \i\c{FIMUL}: Floating-Point/Integer Multiplication
+
+\c FIMUL mem16                   ; DE /1                [8086,FPU]
+\c FIMUL mem32                   ; DA /1                [8086,FPU]
+
+\c{FIMUL} multiplies \c{ST0} by the 16-bit or 32-bit integer stored
+in the given memory location, and stores the result in \c{ST0}.
+
+\H{insFINCSTP} \i\c{FINCSTP}: Increment Floating-Point Stack Pointer
+
+\c FINCSTP                       ; D9 F7                [8086,FPU]
+
+\c{FINCSTP} increments the `top' field in the floating-point status
+word. This has the effect of rotating the FPU register stack by one,
+as if the register stack had been popped; however, unlike the
+popping of the stack performed by many FPU instructions, it does not
+flag the new \c{ST7} (previously \c{ST0}) as empty. See also
+\c{FDECSTP} (\k{insFDECSTP}).
+
+\H{insFINIT} \i\c{FINIT}, \i\c{FNINIT}: Initialise Floating-Point Unit
+
+\c FINIT                         ; 9B DB E3             [8086,FPU]
+\c FNINIT                        ; DB E3                [8086,FPU]
+
+\c{FINIT} initialises the FPU to its default state. It flags all
+registers as empty, though it does not actually change their values.
+\c{FNINIT} does the same, without first waiting for pending
+exceptions to clear.
+
+\H{insFISUB} \i\c{FISUB}, \i\c{FISUBR}: Floating-Point/Integer Subtraction
+
+\c FISUB mem16                   ; DE /4                [8086,FPU]
+\c FISUB mem32                   ; DA /4                [8086,FPU]
+
+\c FISUBR mem16                  ; DE /5                [8086,FPU]
+\c FISUBR mem32                  ; DA /5                [8086,FPU]
+
+\c{FISUB} subtracts the 16-bit or 32-bit integer stored in the given
+memory location from \c{ST0}, and stores the result in \c{ST0}.
+\c{FISUBR} does the subtraction the other way round, i.e. it
+subtracts \c{ST0} from the given integer, but still stores the
+result in \c{ST0}.
+
+\H{insFLD} \i\c{FLD}: Floating-Point Load
+
+\c FLD mem32                     ; D9 /0                [8086,FPU]
+\c FLD mem64                     ; DD /0                [8086,FPU]
+\c FLD mem80                     ; DB /5                [8086,FPU]
+\c FLD fpureg                    ; D9 C0+r              [8086,FPU]
+
+\c{FLD} loads a floating-point value out of the given register or
+memory location, and pushes it on the FPU register stack.
+
+\H{insFLD1} \i\c{FLDxx}: Floating-Point Load Constants
+
+\c FLD1                          ; D9 E8                [8086,FPU]
+\c FLDL2E                        ; D9 EA                [8086,FPU]
+\c FLDL2T                        ; D9 E9                [8086,FPU]
+\c FLDLG2                        ; D9 EC                [8086,FPU]
+\c FLDLN2                        ; D9 ED                [8086,FPU]
+\c FLDPI                         ; D9 EB                [8086,FPU]
+\c FLDZ                          ; D9 EE                [8086,FPU]
+
+These instructions push specific standard constants on the FPU
+register stack. \c{FLD1} pushes the value 1; \c{FLDL2E} pushes the
+base-2 logarithm of e; \c{FLDL2T} pushes the base-2 log of 10;
+\c{FLDLG2} pushes the base-10 log of 2; \c{FLDLN2} pushes the base-e
+log of 2; \c{FLDPI} pushes pi; and \c{FLDZ} pushes zero.
+
+\H{insFLDCW} \i\c{FLDCW}: Load Floating-Point Control Word
+
+\c FLDCW mem16                   ; D9 /5                [8086,FPU]
+
+\c{FLDCW} loads a 16-bit value out of memory and stores it into the
+FPU control word (governing things like the rounding mode, the
+precision, and the exception masks). See also \c{FSTCW}
+(\k{insFSTCW}).
+
+\H{insFLDENV} \i\c{FLDENV}: Load Floating-Point Environment
+
+\c FLDENV mem                    ; D9 /4                [8086,FPU]
+
+\c{FLDENV} loads the FPU operating environment (control word, status
+word, tag word, instruction pointer, data pointer and last opcode)
+from memory. The memory area is 14 or 28 bytes long, depending on
+the CPU mode at the time. See also \c{FSTENV} (\k{insFSTENV}).
+
+\H{insFMUL} \i\c{FMUL}, \i\c{FMULP}: Floating-Point Multiply
+
+\c FMUL mem32                    ; D8 /1                [8086,FPU]
+\c FMUL mem64                    ; DC /1                [8086,FPU]
+
+\c FMUL fpureg                   ; D8 C8+r              [8086,FPU]
+\c FMUL ST0,fpureg               ; D8 C8+r              [8086,FPU]
+
+\c FMUL TO fpureg                ; DC C8+r              [8086,FPU]
+\c FMUL fpureg,ST0               ; DC C8+r              [8086,FPU]
+
+\c FMULP fpureg                  ; DE C8+r              [8086,FPU]
+\c FMULP fpureg,ST0              ; DE C8+r              [8086,FPU]
+
+\c{FMUL} multiplies \c{ST0} by the given operand, and stores the
+result in \c{ST0}, unless the \c{TO} qualifier is used in which case
+it stores the result in the operand. \c{FMULP} performs the same
+operation as \c{FMUL TO}, and then pops the register stack.
+
+\H{insFNOP} \i\c{FNOP}: Floating-Point No Operation
+
+\c FNOP                          ; D9 D0                [8086,FPU]
+
+\c{FNOP} does nothing.
+
+\H{insFPATAN} \i\c{FPATAN}, \i\c{FPTAN}: Arctangent and Tangent
+
+\c FPATAN                        ; D9 F3                [8086,FPU]
+\c FPTAN                         ; D9 F2                [8086,FPU]
+
+\c{FPATAN} computes the arctangent, in radians, of the result of
+dividing \c{ST1} by \c{ST0}, stores the result in \c{ST1}, and pops
+the register stack. It works like the C \c{atan2} function, in that
+changing the sign of both \c{ST0} and \c{ST1} changes the output
+value by pi (so it performs true rectangular-to-polar coordinate
+conversion, with \c{ST1} being the Y coordinate and \c{ST0} being
+the X coordinate, not merely an arctangent).
+
+\c{FPTAN} computes the tangent of the value in \c{ST0} (in radians), stores
+the result back into \c{ST0}, and then pushes 1.0 (leaving the tangent in \c{ST1}).
+
+\H{insFPREM} \i\c{FPREM}, \i\c{FPREM1}: Floating-Point Partial Remainder
+
+\c FPREM                         ; D9 F8                [8086,FPU]
+\c FPREM1                        ; D9 F5                [386,FPU]
+
+These instructions both produce the remainder obtained by dividing
+\c{ST0} by \c{ST1}. This is calculated, notionally, by dividing
+\c{ST0} by \c{ST1}, rounding the result to an integer, multiplying
+by \c{ST1} again, and computing the value which would need to be
+added back on to the result to get back to the original value in
+\c{ST0}.
+
+The two instructions differ in the way the notional round-to-integer
+operation is performed. \c{FPREM} does it by rounding towards zero,
+so that the remainder it returns always has the same sign as the
+original value in \c{ST0}; \c{FPREM1} does it by rounding to the
+nearest integer, so that the remainder always has at most half the
+magnitude of \c{ST1}.
+
+Both instructions calculate \e{partial} remainders, meaning that
+they may not manage to provide the final result, but might leave
+intermediate results in \c{ST0} instead. If this happens, they will
+set the C2 flag in the FPU status word; therefore, to calculate a
+remainder, you should repeatedly execute \c{FPREM} or \c{FPREM1}
+until C2 becomes clear.
+
+\H{insFRNDINT} \i\c{FRNDINT}: Floating-Point Round to Integer
+
+\c FRNDINT                       ; D9 FC                [8086,FPU]
+
+\c{FRNDINT} rounds the contents of \c{ST0} to an integer, according
+to the current rounding mode set in the FPU control word, and stores
+the result back in \c{ST0}.
+
+\H{insFRSTOR} \i\c{FSAVE}, \i\c{FRSTOR}: Save/Restore Floating-Point State
+
+\c FSAVE mem                     ; 9B DD /6             [8086,FPU]
+\c FNSAVE mem                    ; DD /6                [8086,FPU]
+
+\c FRSTOR mem                    ; DD /4                [8086,FPU]
+
+\c{FSAVE} saves the entire floating-point unit state, including all
+the information saved by \c{FSTENV} (\k{insFSTENV}) plus the
+contents of all the registers, to a 94 or 108 byte area of memory
+(depending on the CPU mode). \c{FRSTOR} restores the floating-point
+state from the same area of memory.
+
+\c{FNSAVE} does the same as \c{FSAVE}, without first waiting for
+pending floating-point exceptions to clear.
+
+\H{insFSCALE} \i\c{FSCALE}: Scale Floating-Point Value by Power of Two
+
+\c FSCALE                        ; D9 FD                [8086,FPU]
+
+\c{FSCALE} scales a number by a power of two: it rounds \c{ST1}
+towards zero to obtain an integer, then multiplies \c{ST0} by two to
+the power of that integer, and stores the result in \c{ST0}.
+
+\H{insFSETPM} \i\c{FSETPM}: Set Protected Mode
+
+\c FSETPM                        ; DB E4                [286,FPU]
+
+This instruction initialises protected mode on the 287 floating-point
+coprocessor. It is only meaningful on that processor: the 387 and
+above treat the instruction as a no-operation.
+
+\H{insFSIN} \i\c{FSIN}, \i\c{FSINCOS}: Sine and Cosine
+
+\c FSIN                          ; D9 FE                [386,FPU]
+\c FSINCOS                       ; D9 FB                [386,FPU]
+
+\c{FSIN} calculates the sine of \c{ST0} (in radians) and stores the
+result in \c{ST0}. \c{FSINCOS} does the same, but then pushes the
+cosine of the same value on the register stack, so that the sine
+ends up in \c{ST1} and the cosine in \c{ST0}. \c{FSINCOS} is faster
+than executing \c{FSIN} and \c{FCOS} (see \k{insFCOS}) in
+succession.
+
+\H{insFSQRT} \i\c{FSQRT}: Floating-Point Square Root
+
+\c FSQRT                         ; D9 FA                [8086,FPU]
+
+\c{FSQRT} calculates the square root of \c{ST0} and stores the
+result in \c{ST0}.
+
+\H{insFST} \i\c{FST}, \i\c{FSTP}: Floating-Point Store
+
+\c FST mem32                     ; D9 /2                [8086,FPU]
+\c FST mem64                     ; DD /2                [8086,FPU]
+\c FST fpureg                    ; DD D0+r              [8086,FPU]
+
+\c FSTP mem32                    ; D9 /3                [8086,FPU]
+\c FSTP mem64                    ; DD /3                [8086,FPU]
+\c FSTP mem80                    ; DB /7                [8086,FPU]
+\c FSTP fpureg                   ; DD D8+r              [8086,FPU]
+
+\c{FST} stores the value in \c{ST0} into the given memory location
+or other FPU register. \c{FSTP} does the same, but then pops the
+register stack.
+
+\H{insFSTCW} \i\c{FSTCW}: Store Floating-Point Control Word
+
+\c FSTCW mem16                   ; 9B D9 /7             [8086,FPU]
+\c FNSTCW mem16                  ; D9 /7                [8086,FPU]
+
+\c{FSTCW} stores the FPU control word (governing things like the
+rounding mode, the precision, and the exception masks) into a 2-byte
+memory area. See also \c{FLDCW} (\k{insFLDCW}).
+
+\c{FNSTCW} does the same thing as \c{FSTCW}, without first waiting
+for pending floating-point exceptions to clear.
+
+\H{insFSTENV} \i\c{FSTENV}: Store Floating-Point Environment
+
+\c FSTENV mem                    ; 9B D9 /6             [8086,FPU]
+\c FNSTENV mem                   ; D9 /6                [8086,FPU]
+
+\c{FSTENV} stores the FPU operating environment (control word,
+status word, tag word, instruction pointer, data pointer and last
+opcode) into memory. The memory area is 14 or 28 bytes long,
+depending on the CPU mode at the time. See also \c{FLDENV}
+(\k{insFLDENV}).
+
+\c{FNSTENV} does the same thing as \c{FSTENV}, without first waiting
+for pending floating-point exceptions to clear.
+
+\H{insFSTSW} \i\c{FSTSW}: Store Floating-Point Status Word
+
+\c FSTSW mem16                   ; 9B DD /7             [8086,FPU]
+\c FSTSW AX                      ; 9B DF E0             [286,FPU]
+
+\c FNSTSW mem16                  ; DD /7                [8086,FPU]
+\c FNSTSW AX                     ; DF E0                [286,FPU]
+
+\c{FSTSW} stores the FPU status word into \c{AX} or into a 2-byte
+memory area.
+
+\c{FNSTSW} does the same thing as \c{FSTSW}, without first waiting
+for pending floating-point exceptions to clear.
+
+\H{insFSUB} \i\c{FSUB}, \i\c{FSUBP}, \i\c{FSUBR}, \i\c{FSUBRP}: Floating-Point Subtract
+
+\c FSUB mem32                    ; D8 /4                [8086,FPU]
+\c FSUB mem64                    ; DC /4                [8086,FPU]
+
+\c FSUB fpureg                   ; D8 E0+r              [8086,FPU]
+\c FSUB ST0,fpureg               ; D8 E0+r              [8086,FPU]
+
+\c FSUB TO fpureg                ; DC E8+r              [8086,FPU]
+\c FSUB fpureg,ST0               ; DC E8+r              [8086,FPU]
+
+\c FSUBR mem32                   ; D8 /5                [8086,FPU]
+\c FSUBR mem64                   ; DC /5                [8086,FPU]
+
+\c FSUBR fpureg                  ; D8 E8+r              [8086,FPU]
+\c FSUBR ST0,fpureg              ; D8 E8+r              [8086,FPU]
+
+\c FSUBR TO fpureg               ; DC E0+r              [8086,FPU]
+\c FSUBR fpureg,ST0              ; DC E0+r              [8086,FPU]
+
+\c FSUBP fpureg                  ; DE E8+r              [8086,FPU]
+\c FSUBP fpureg,ST0              ; DE E8+r              [8086,FPU]
+
+\c FSUBRP fpureg                 ; DE E0+r              [8086,FPU]
+\c FSUBRP fpureg,ST0             ; DE E0+r              [8086,FPU]
+
+\c{FSUB} subtracts the given operand from \c{ST0} and stores the
+result back in \c{ST0}, unless the \c{TO} qualifier is given, in
+which case it subtracts \c{ST0} from the given operand and stores
+the result in the operand.
+
+\c{FSUBR} does the same thing, but does the subtraction the other way
+up: so if \c{TO} is not given, it subtracts \c{ST0} from the given
+operand and stores the result in \c{ST0}, whereas if \c{TO} is given
+it subtracts its operand from \c{ST0} and stores the result in the
+operand.
+
+\c{FSUBP} operates like \c{FSUB TO}, but pops the register stack
+once it has finished. \c{FSUBRP} operates like \c{FSUBR TO}, but
+pops the register stack once it has finished.
+
+\H{insFTST} \i\c{FTST}: Test \c{ST0} Against Zero
+
+\c FTST                          ; D9 E4                [8086,FPU]
+
+\c{FTST} compares \c{ST0} with zero and sets the FPU flags
+accordingly. \c{ST0} is treated as the left-hand side of the
+comparison, so that a `less-than' result is generated if \c{ST0} is
+negative.
+
+\H{insFUCOM} \i\c{FUCOMxx}: Floating-Point Unordered Compare
+
+\c FUCOM fpureg                  ; DD E0+r              [386,FPU]
+\c FUCOM ST0,fpureg              ; DD E0+r              [386,FPU]
+
+\c FUCOMP fpureg                 ; DD E8+r              [386,FPU]
+\c FUCOMP ST0,fpureg             ; DD E8+r              [386,FPU]
+
+\c FUCOMPP                       ; DA E9                [386,FPU]
+
+\c FUCOMI fpureg                 ; DB E8+r              [P6,FPU]
+\c FUCOMI ST0,fpureg             ; DB E8+r              [P6,FPU]
+
+\c FUCOMIP fpureg                ; DF E8+r              [P6,FPU]
+\c FUCOMIP ST0,fpureg            ; DF E8+r              [P6,FPU]
+
+\c{FUCOM} compares \c{ST0} with the given operand, and sets the FPU
+flags accordingly. \c{ST0} is treated as the left-hand side of the
+comparison, so that the carry flag is set (for a `less-than' result)
+if \c{ST0} is less than the given operand.
+
+\c{FUCOMP} does the same as \c{FUCOM}, but pops the register stack
+afterwards. \c{FUCOMPP} compares \c{ST0} with \c{ST1} and then pops
+the register stack twice.
+
+\c{FUCOMI} and \c{FUCOMIP} work like the corresponding forms of
+\c{FUCOM} and \c{FUCOMP}, but write their results directly to the CPU
+flags register rather than the FPU status word, so they can be
+immediately followed by conditional jump or conditional move
+instructions.
+
+The \c{FUCOM} instructions differ from the \c{FCOM} instructions
+(\k{insFCOM}) only in the way they handle quiet NaNs: \c{FUCOM} will
+handle them silently and set the condition code flags to an
+`unordered' result, whereas \c{FCOM} will generate an exception.
+
+\H{insFXAM} \i\c{FXAM}: Examine Class of Value in \c{ST0}
+
+\c FXAM                          ; D9 E5                [8086,FPU]
+
+\c{FXAM} sets the FPU flags C3, C2 and C0 depending on the type of
+value stored in \c{ST0}: 000 (respectively) for an unsupported
+format, 001 for a NaN, 010 for a normal finite number, 011 for an
+infinity, 100 for a zero, 101 for an empty register, and 110 for a
+denormal. It also sets the C1 flag to the sign of the number.
+
+\H{insFXCH} \i\c{FXCH}: Floating-Point Exchange
+
+\c FXCH                          ; D9 C9                [8086,FPU]
+\c FXCH fpureg                   ; D9 C8+r              [8086,FPU]
+\c FXCH fpureg,ST0               ; D9 C8+r              [8086,FPU]
+\c FXCH ST0,fpureg               ; D9 C8+r              [8086,FPU]
+
+\c{FXCH} exchanges \c{ST0} with a given FPU register. The no-operand
+form exchanges \c{ST0} with \c{ST1}.
+
+\H{insFXTRACT} \i\c{FXTRACT}: Extract Exponent and Significand
+
+\c FXTRACT                       ; D9 F4                [8086,FPU]
+
+\c{FXTRACT} separates the number in \c{ST0} into its exponent and
+significand (mantissa), stores the exponent back into \c{ST0}, and
+then pushes the significand on the register stack (so that the
+significand ends up in \c{ST0}, and the exponent in \c{ST1}).
+
+\H{insFYL2X} \i\c{FYL2X}, \i\c{FYL2XP1}: Compute Y times Log2(X) or Log2(X+1)
+
+\c FYL2X                         ; D9 F1                [8086,FPU]
+\c FYL2XP1                       ; D9 F9                [8086,FPU]
+
+\c{FYL2X} multiplies \c{ST1} by the base-2 logarithm of \c{ST0},
+stores the result in \c{ST1}, and pops the register stack (so that
+the result ends up in \c{ST0}). \c{ST0} must be non-zero and
+positive.
+
+\c{FYL2XP1} works the same way, but replacing the base-2 log of
+\c{ST0} with that of \c{ST0} plus one. This time, \c{ST0} must have
+magnitude no greater than 1 minus half the square root of two.
+
+\H{insHLT} \i\c{HLT}: Halt Processor
+
+\c HLT                           ; F4                   [8086]
+
+\c{HLT} puts the processor into a halted state, where it will
+perform no more operations until restarted by an interrupt or a
+reset.
+
+\H{insIBTS} \i\c{IBTS}: Insert Bit String
+
+\c IBTS r/m16,reg16              ; o16 0F A7 /r         [386,UNDOC]
+\c IBTS r/m32,reg32              ; o32 0F A7 /r         [386,UNDOC]
+
+No clear documentation seems to be available for this instruction:
+the best I've been able to find reads `Takes a string of bits from
+the second operand and puts them in the first operand'. It is
+present only in early 386 processors, and conflicts with the opcodes
+for \c{CMPXCHG486}. NASM supports it only for completeness. Its
+counterpart is \c{XBTS} (see \k{insXBTS}).
+
+\H{insIDIV} \i\c{IDIV}: Signed Integer Divide
+
+\c IDIV r/m8                     ; F6 /7                [8086]
+\c IDIV r/m16                    ; o16 F7 /7            [8086]
+\c IDIV r/m32                    ; o32 F7 /7            [386]
+
+\c{IDIV} performs signed integer division. The explicit operand
+provided is the divisor; the dividend and destination operands are
+implicit, in the following way:
+
+\b For \c{IDIV r/m8}, \c{AX} is divided by the given operand; the
+quotient is stored in \c{AL} and the remainder in \c{AH}.
+
+\b For \c{IDIV r/m16}, \c{DX:AX} is divided by the given operand; the
+quotient is stored in \c{AX} and the remainder in \c{DX}.
+
+\b For \c{IDIV r/m32}, \c{EDX:EAX} is divided by the given operand;
+the quotient is stored in \c{EAX} and the remainder in \c{EDX}.
+
+Unsigned integer division is performed by the \c{DIV} instruction:
+see \k{insDIV}.
+
+\H{insIMUL} \i\c{IMUL}: Signed Integer Multiply
+
+\c IMUL r/m8                     ; F6 /5                [8086]
+\c IMUL r/m16                    ; o16 F7 /5            [8086]
+\c IMUL r/m32                    ; o32 F7 /5            [386]
+
+\c IMUL reg16,r/m16              ; o16 0F AF /r         [386]
+\c IMUL reg32,r/m32              ; o32 0F AF /r         [386]
+
+\c IMUL reg16,imm8               ; o16 6B /r ib         [286]
+\c IMUL reg16,imm16              ; o16 69 /r iw         [286]
+\c IMUL reg32,imm8               ; o32 6B /r ib         [386]
+\c IMUL reg32,imm32              ; o32 69 /r id         [386]
+
+\c IMUL reg16,r/m16,imm8         ; o16 6B /r ib         [286]
+\c IMUL reg16,r/m16,imm16        ; o16 69 /r iw         [286]
+\c IMUL reg32,r/m32,imm8         ; o32 6B /r ib         [386]
+\c IMUL reg32,r/m32,imm32        ; o32 69 /r id         [386]
+
+\c{IMUL} performs signed integer multiplication. For the
+single-operand form, the other operand and destination are implicit,
+in the following way:
+
+\b For \c{IMUL r/m8}, \c{AL} is multiplied by the given operand; the
+product is stored in \c{AX}.
+
+\b For \c{IMUL r/m16}, \c{AX} is multiplied by the given operand;
+the product is stored in \c{DX:AX}.
+
+\b For \c{IMUL r/m32}, \c{EAX} is multiplied by the given operand;
+the product is stored in \c{EDX:EAX}.
+
+The two-operand form multiplies its two operands and stores the
+result in the destination (first) operand. The three-operand form
+multiplies its last two operands and stores the result in the first
+operand.
+
+The two-operand form is in fact a shorthand for the three-operand
+form, as can be seen by examining the opcode descriptions: in the
+two-operand form, the code \c{/r} takes both its register and
+\c{r/m} parts from the same operand (the first one).
+
+In the forms with an 8-bit immediate operand and another longer
+source operand, the immediate operand is considered to be signed,
+and is sign-extended to the length of the other source operand. In
+these cases, the \c{BYTE} qualifier is necessary to force NASM to
+generate this form of the instruction.
+
+Unsigned integer multiplication is performed by the \c{MUL}
+instruction: see \k{insMUL}.
+
+\H{insIN} \i\c{IN}: Input from I/O Port
+
+\c IN AL,imm8                    ; E4 ib                [8086]
+\c IN AX,imm8                    ; o16 E5 ib            [8086]
+\c IN EAX,imm8                   ; o32 E5 ib            [386]
+\c IN AL,DX                      ; EC                   [8086]
+\c IN AX,DX                      ; o16 ED               [8086]
+\c IN EAX,DX                     ; o32 ED               [386]
+
+\c{IN} reads a byte, word or doubleword from the specified I/O port,
+and stores it in the given destination register. The port number may
+be specified as an immediate value if it is between 0 and 255, and
+otherwise must be stored in \c{DX}. See also \c{OUT} (\k{insOUT}).
+
+\H{insINC} \i\c{INC}: Increment Integer
+
+\c INC reg16                     ; o16 40+r             [8086]
+\c INC reg32                     ; o32 40+r             [386]
+\c INC r/m8                      ; FE /0                [8086]
+\c INC r/m16                     ; o16 FF /0            [8086]
+\c INC r/m32                     ; o32 FF /0            [386]
+
+\c{INC} adds 1 to its operand. It does \e{not} affect the carry
+flag: to affect the carry flag, use \c{ADD something,1} (see
+\k{insADD}). See also \c{DEC} (\k{insDEC}).
+
+\H{insINSB} \i\c{INSB}, \i\c{INSW}, \i\c{INSD}: Input String from I/O Port
+
+\c INSB                          ; 6C                   [186]
+\c INSW                          ; o16 6D               [186]
+\c INSD                          ; o32 6D               [386]
+
+\c{INSB} inputs a byte from the I/O port specified in \c{DX} and
+stores it at \c{[ES:DI]} or \c{[ES:EDI]}. It then increments or
+decrements (depending on the direction flag: increments if the flag
+is clear, decrements if it is set) \c{DI} or \c{EDI}.
+
+The register used is \c{DI} if the address size is 16 bits, and
+\c{EDI} if it is 32 bits. If you need to use an address size not
+equal to the current \c{BITS} setting, you can use an explicit
+\i\c{a16} or \i\c{a32} prefix.
+
+Segment override prefixes have no effect for this instruction: the
+use of \c{ES} for the store to \c{[DI]} or \c{[EDI]} cannot be
+overridden.
+
+\c{INSW} and \c{INSD} work in the same way, but they input a word or
+a doubleword instead of a byte, and increment or decrement the
+addressing register by 2 or 4 instead of 1.
+
+The \c{REP} prefix may be used to repeat the instruction \c{CX} (or
+\c{ECX} - again, the address size chooses which) times.
+
+See also \c{OUTSB}, \c{OUTSW} and \c{OUTSD} (\k{insOUTSB}).
+
+\H{insINT} \i\c{INT}: Software Interrupt
+
+\c INT imm8                      ; CD ib                [8086]
+
+\c{INT} causes a software interrupt through a specified vector
+number from 0 to 255.
+
+The code generated by the \c{INT} instruction is always two bytes
+long: although there are short forms for some \c{INT} instructions,
+NASM does not generate them when it sees the \c{INT} mnemonic. In
+order to generate single-byte breakpoint instructions, use the
+\c{INT3} or \c{INT1} instructions (see \k{insINT1}) instead.
+
+\H{insINT1} \i\c{INT3}, \i\c{INT1}, \i\c{ICEBP}, \i\c{INT01}: Breakpoints
+
+\c INT1                          ; F1                   [P6]
+\c ICEBP                         ; F1                   [P6]
+\c INT01                         ; F1                   [P6]
+
+\c INT3                          ; CC                   [8086]
+
+\c{INT1} and \c{INT3} are short one-byte forms of the instructions
+\c{INT 1} and \c{INT 3} (see \k{insINT}). They perform a similar
+function to their longer counterparts, but take up less code space.
+They are used as breakpoints by debuggers.
+
+\c{INT1}, and its alternative synonyms \c{INT01} and \c{ICEBP}, is
+an instruction used by in-circuit emulators (ICEs). It is present,
+though not documented, on some processors down to the 286, but is
+only documented for the Pentium Pro. \c{INT3} is the instruction
+normally used as a breakpoint by debuggers.
+
+\c{INT3} is not precisely equivalent to \c{INT 3}: the short form,
+since it is designed to be used as a breakpoint, bypasses the normal
+IOPL checks in virtual-8086 mode, and also does not go through
+interrupt redirection.
+
+\H{insINTO} \i\c{INTO}: Interrupt if Overflow
+
+\c INTO                          ; CE                   [8086]
+
+\c{INTO} performs an \c{INT 4} software interrupt (see \k{insINT})
+if and only if the overflow flag is set.
+
+\H{insINVD} \i\c{INVD}: Invalidate Internal Caches
+
+\c INVD                          ; 0F 08                [486]
+
+\c{INVD} invalidates and empties the processor's internal caches,
+and causes the processor to instruct external caches to do the same.
+It does not write the contents of the caches back to memory first:
+any modified data held in the caches will be lost. To write the data
+back first, use \c{WBINVD} (\k{insWBINVD}).
+
+\H{insINVLPG} \i\c{INVLPG}: Invalidate TLB Entry
+
+\c INVLPG mem                    ; 0F 01 /7             [486]
+
+\c{INVLPG} invalidates the translation lookaside buffer (TLB) entry
+associated with the supplied memory address.
+
+\H{insIRET} \i\c{IRET}, \i\c{IRETW}, \i\c{IRETD}: Return from Interrupt
+
+\c IRET                          ; CF                   [8086]
+\c IRETW                         ; o16 CF               [8086]
+\c IRETD                         ; o32 CF               [386]
+
+\c{IRET} returns from an interrupt (hardware or software) by means
+of popping \c{IP} (or \c{EIP}), \c{CS} and the flags off the stack
+and then continuing execution from the new \c{CS:IP}.
+
+\c{IRETW} pops \c{IP}, \c{CS} and the flags as 2 bytes each, taking
+6 bytes off the stack in total. \c{IRETD} pops \c{EIP} as 4 bytes,
+pops a further 4 bytes of which the top two are discarded and the
+bottom two go into \c{CS}, and pops the flags as 4 bytes as well,
+taking 12 bytes off the stack.
+
+\c{IRET} is a shorthand for either \c{IRETW} or \c{IRETD}, depending
+on the default \c{BITS} setting at the time.
+
+\H{insJCXZ} \i\c{JCXZ}, \i\c{JECXZ}: Jump if CX/ECX Zero
+
+\c JCXZ imm                      ; a16 E3 rb            [8086]
+\c JECXZ imm                     ; a32 E3 rb            [386]
+
+\c{JCXZ} performs a short jump (with maximum range 128 bytes) if and
+only if the contents of the \c{CX} register is 0. \c{JECXZ} does the
+same thing, but with \c{ECX}.
+
+\H{insJMP} \i\c{JMP}: Jump
+
+\c JMP imm                       ; E9 rw/rd             [8086]
+\c JMP SHORT imm                 ; EB rb                [8086]
+\c JMP imm:imm16                 ; o16 EA iw iw         [8086]
+\c JMP imm:imm32                 ; o32 EA id iw         [386]
+\c JMP FAR mem                   ; o16 FF /5            [8086]
+\c JMP FAR mem                   ; o32 FF /5            [386]
+\c JMP r/m16                     ; o16 FF /4            [8086]
+\c JMP r/m32                     ; o32 FF /4            [386]
+
+\c{JMP} jumps to a given address. The address may be specified as an
+absolute segment and offset, or as a relative jump within the
+current segment.
+
+\c{JMP SHORT imm} has a maximum range of 128 bytes, since the
+displacement is specified as only 8 bits, but takes up less code
+space. NASM does not choose when to generate \c{JMP SHORT} for you:
+you must explicitly code \c{SHORT} every time you want a short jump.
+
+You can choose between the two immediate \i{far jump} forms (\c{JMP
+imm:imm}) by the use of the \c{WORD} and \c{DWORD} keywords: \c{JMP
+WORD 0x1234:0x5678} or \c{JMP DWORD 0x1234:0x56789abc}.
+
+The \c{JMP FAR mem} forms execute a far jump by loading the
+destination address out of memory. The address loaded consists of 16
+or 32 bits of offset (depending on the operand size), and 16 bits of
+segment. The operand size may be overridden using \c{JMP WORD FAR
+mem} or \c{JMP DWORD FAR mem}.
+
+The \c{JMP r/m} forms execute a \i{near jump} (within the same
+segment), loading the destination address out of memory or out of a
+register. The keyword \c{NEAR} may be specified, for clarity, in
+these forms, but is not necessary. Again, operand size can be
+overridden using \c{JMP WORD mem} or \c{JMP DWORD mem}.
+
+As a convenience, NASM does not require you to jump to a far symbol
+by coding the cumbersome \c{JMP SEG routine:routine}, but instead
+allows the easier synonym \c{JMP FAR routine}.
+
+The \c{JMP r/m} forms given above are near jumps; NASM will accept
+the \c{NEAR} keyword (e.g. \c{JMP NEAR [address]}), even though it
+is not strictly necessary.
+
+\H{insJcc} \i\c{Jcc}: Conditional Branch
+
+\c Jcc imm                       ; 70+cc rb             [8086]
+\c Jcc NEAR imm                  ; 0F 80+cc rw/rd       [386]
+
+The \i{conditional jump} instructions execute a near (same segment)
+jump if and only if their conditions are satisfied. For example,
+\c{JNZ} jumps only if the zero flag is not set.
+
+The ordinary form of the instructions has only a 128-byte range; the
+\c{NEAR} form is a 386 extension to the instruction set, and can
+span the full size of a segment. NASM will not override your choice
+of jump instruction: if you want \c{Jcc NEAR}, you have to use the
+\c{NEAR} keyword.
+
+The \c{SHORT} keyword is allowed on the first form of the
+instruction, for clarity, but is not necessary.
+
+\H{insLAHF} \i\c{LAHF}: Load AH from Flags
+
+\c LAHF                          ; 9F                   [8086]
+
+\c{LAHF} sets the \c{AH} register according to the contents of the
+low byte of the flags word. See also \c{SAHF} (\k{insSAHF}).
+
+\H{insLAR} \i\c{LAR}: Load Access Rights
+
+\c LAR reg16,r/m16               ; o16 0F 02 /r         [286,PRIV]
+\c LAR reg32,r/m32               ; o32 0F 02 /r         [286,PRIV]
+
+\c{LAR} takes the segment selector specified by its source (second)
+operand, finds the corresponding segment descriptor in the GDT or
+LDT, and loads the access-rights byte of the descriptor into its
+destination (first) operand.
+
+\H{insLDS} \i\c{LDS}, \i\c{LES}, \i\c{LFS}, \i\c{LGS}, \i\c{LSS}: Load Far Pointer
+
+\c LDS reg16,mem                 ; o16 C5 /r            [8086]
+\c LDS reg32,mem                 ; o32 C5 /r            [8086]
+
+\c LES reg16,mem                 ; o16 C4 /r            [8086]
+\c LES reg32,mem                 ; o32 C4 /r            [8086]
+
+\c LFS reg16,mem                 ; o16 0F B4 /r         [386]
+\c LFS reg32,mem                 ; o32 0F B4 /r         [386]
+
+\c LGS reg16,mem                 ; o16 0F B5 /r         [386]
+\c LGS reg32,mem                 ; o32 0F B5 /r         [386]
+
+\c LSS reg16,mem                 ; o16 0F B2 /r         [386]
+\c LSS reg32,mem                 ; o32 0F B2 /r         [386]
+
+These instructions load an entire far pointer (16 or 32 bits of
+offset, plus 16 bits of segment) out of memory in one go. \c{LDS},
+for example, loads 16 or 32 bits from the given memory address into
+the given register (depending on the size of the register), then
+loads the \e{next} 16 bits from memory into \c{DS}. \c{LES},
+\c{LFS}, \c{LGS} and \c{LSS} work in the same way but use the other
+segment registers.
+
+\H{insLEA} \i\c{LEA}: Load Effective Address
+
+\c LEA reg16,mem                 ; o16 8D /r            [8086]
+\c LEA reg32,mem                 ; o32 8D /r            [8086]
+
+\c{LEA}, despite its syntax, does not access memory. It calculates
+the effective address specified by its second operand as if it were
+going to load or store data from it, but instead it stores the
+calculated address into the register specified by its first operand.
+This can be used to perform quite complex calculations (e.g. \c{LEA
+EAX,[EBX+ECX*4+100]}) in one instruction.
+
+\c{LEA}, despite being a purely arithmetic instruction which
+accesses no memory, still requires square brackets around its second
+operand, as if it were a memory reference.
+
+\H{insLEAVE} \i\c{LEAVE}: Destroy Stack Frame
+
+\c LEAVE                         ; C9                   [186]
+
+\c{LEAVE} destroys a stack frame of the form created by the
+\c{ENTER} instruction (see \k{insENTER}). It is functionally
+equivalent to \c{MOV ESP,EBP} followed by \c{POP EBP} (or \c{MOV
+SP,BP} followed by \c{POP BP} in 16-bit mode).
+
+\H{insLGDT} \i\c{LGDT}, \i\c{LIDT}, \i\c{LLDT}: Load Descriptor Tables
+
+\c LGDT mem                      ; 0F 01 /2             [286,PRIV]
+\c LIDT mem                      ; 0F 01 /3             [286,PRIV]
+\c LLDT r/m16                    ; 0F 00 /2             [286,PRIV]
+
+\c{LGDT} and \c{LIDT} both take a 6-byte memory area as an operand:
+they load a 32-bit linear address and a 16-bit size limit from that
+area (in the opposite order) into the GDTR (global descriptor table
+register) or IDTR (interrupt descriptor table register). These are
+the only instructions which directly use \e{linear} addresses,
+rather than segment/offset pairs.
+
+\c{LLDT} takes a segment selector as an operand. The processor looks
+up that selector in the GDT and stores the limit and base address
+given there into the LDTR (local descriptor table register).
+
+See also \c{SGDT}, \c{SIDT} and \c{SLDT} (\k{insSGDT}).
+
+\H{insLMSW} \i\c{LMSW}: Load Machine Status Word
+
+\c LMSW r/m16                    ; 0F 01 /6             [286,PRIV]
+
+\c{LMSW} loads the bottom four bits of the source operand into the
+bottom four bits of the \c{CR0} control register (or the Machine
+Status Word, on 286 processors). See also \c{SMSW} (\k{insSMSW}).
+
+\H{insLOADALL} \i\c{LOADALL}, \i\c{LOADALL286}: Load Processor State
+
+\c LOADALL                       ; 0F 07                [386,UNDOC]
+\c LOADALL286                    ; 0F 05                [286,UNDOC]
+
+This instruction, in its two different-opcode forms, is apparently
+supported on most 286 processors, some 386 and possibly some 486.
+The opcode differs between the 286 and the 386.
+
+The function of the instruction is to load all information relating
+to the state of the processor out of a block of memory: on the 286,
+this block is located implicitly at absolute address \c{0x800}, and
+on the 386 and 486 it is at \c{[ES:EDI]}.
+
+\H{insLODSB} \i\c{LODSB}, \i\c{LODSW}, \i\c{LODSD}: Load from String
+
+\c LODSB                         ; AC                   [8086]
+\c LODSW                         ; o16 AD               [8086]
+\c LODSD                         ; o32 AD               [386]
+
+\c{LODSB} loads a byte from \c{[DS:SI]} or \c{[DS:ESI]} into \c{AL}.
+It then increments or decrements (depending on the direction flag:
+increments if the flag is clear, decrements if it is set) \c{SI} or
+\c{ESI}.
+
+The register used is \c{SI} if the address size is 16 bits, and
+\c{ESI} if it is 32 bits. If you need to use an address size not
+equal to the current \c{BITS} setting, you can use an explicit
+\i\c{a16} or \i\c{a32} prefix.
+
+The segment register used to load from \c{[SI]} or \c{[ESI]} can be
+overridden by using a segment register name as a prefix (for
+example, \c{es lodsb}).
+
+\c{LODSW} and \c{LODSD} work in the same way, but they load a
+word or a doubleword instead of a byte, and increment or decrement
+the addressing registers by 2 or 4 instead of 1.
+
+\H{insLOOP} \i\c{LOOP}, \i\c{LOOPE}, \i\c{LOOPZ}, \i\c{LOOPNE}, \i\c{LOOPNZ}: Loop with Counter
+
+\c LOOP imm                      ; E2 rb                [8086]
+\c LOOP imm,CX                   ; a16 E2 rb            [8086]
+\c LOOP imm,ECX                  ; a32 E2 rb            [386]
+
+\c LOOPE imm                     ; E1 rb                [8086]
+\c LOOPE imm,CX                  ; a16 E1 rb            [8086]
+\c LOOPE imm,ECX                 ; a32 E1 rb            [386]
+\c LOOPZ imm                     ; E1 rb                [8086]
+\c LOOPZ imm,CX                  ; a16 E1 rb            [8086]
+\c LOOPZ imm,ECX                 ; a32 E1 rb            [386]
+
+\c LOOPNE imm                    ; E0 rb                [8086]
+\c LOOPNE imm,CX                 ; a16 E0 rb            [8086]
+\c LOOPNE imm,ECX                ; a32 E0 rb            [386]
+\c LOOPNZ imm                    ; E0 rb                [8086]
+\c LOOPNZ imm,CX                 ; a16 E0 rb            [8086]
+\c LOOPNZ imm,ECX                ; a32 E0 rb            [386]
+
+\c{LOOP} decrements its counter register (either \c{CX} or \c{ECX} -
+if one is not specified explicitly, the \c{BITS} setting dictates
+which is used) by one, and if the counter does not become zero as a
+result of this operation, it jumps to the given label. The jump has
+a range of 128 bytes.
+
+\c{LOOPE} (or its synonym \c{LOOPZ}) adds the additional condition
+that it only jumps if the counter is nonzero \e{and} the zero flag
+is set. Similarly, \c{LOOPNE} (and \c{LOOPNZ}) jumps only if the
+counter is nonzero and the zero flag is clear.
+
+\H{insLSL} \i\c{LSL}: Load Segment Limit
+
+\c LSL reg16,r/m16               ; o16 0F 03 /r         [286,PRIV]
+\c LSL reg32,r/m32               ; o32 0F 03 /r         [286,PRIV]
+
+\c{LSL} is given a segment selector in its source (second) operand;
+it computes the segment limit value by loading the segment limit
+field from the associated segment descriptor in the GDT or LDT.
+(This involves shifting left by 12 bits if the segment limit is
+page-granular, and not if it is byte-granular; so you end up with a
+byte limit in either case.) The segment limit obtained is then
+loaded into the destination (first) operand.
+
+\H{insLTR} \i\c{LTR}: Load Task Register
+
+\c LTR r/m16                     ; 0F 00 /3             [286,PRIV]
+
+\c{LTR} looks up the segment base and limit in the GDT or LDT
+descriptor specified by the segment selector given as its operand,
+and loads them into the Task Register.
+
+\H{insMOV} \i\c{MOV}: Move Data
+
+\c MOV r/m8,reg8                 ; 88 /r                [8086]
+\c MOV r/m16,reg16               ; o16 89 /r            [8086]
+\c MOV r/m32,reg32               ; o32 89 /r            [386]
+\c MOV reg8,r/m8                 ; 8A /r                [8086]
+\c MOV reg16,r/m16               ; o16 8B /r            [8086]
+\c MOV reg32,r/m32               ; o32 8B /r            [386]
+
+\c MOV reg8,imm8                 ; B0+r ib              [8086]
+\c MOV reg16,imm16               ; o16 B8+r iw          [8086]
+\c MOV reg32,imm32               ; o32 B8+r id          [386]
+\c MOV r/m8,imm8                 ; C6 /0 ib             [8086]
+\c MOV r/m16,imm16               ; o16 C7 /0 iw         [8086]
+\c MOV r/m32,imm32               ; o32 C7 /0 id         [386]
+
+\c MOV AL,memoffs8               ; A0 ow/od             [8086]
+\c MOV AX,memoffs16              ; o16 A1 ow/od         [8086]
+\c MOV EAX,memoffs32             ; o32 A1 ow/od         [386]
+\c MOV memoffs8,AL               ; A2 ow/od             [8086]
+\c MOV memoffs16,AX              ; o16 A3 ow/od         [8086]
+\c MOV memoffs32,EAX             ; o32 A3 ow/od         [386]
+
+\c MOV r/m16,segreg              ; o16 8C /r            [8086]
+\c MOV r/m32,segreg              ; o32 8C /r            [386]
+\c MOV segreg,r/m16              ; o16 8E /r            [8086]
+\c MOV segreg,r/m32              ; o32 8E /r            [386]
+
+\c MOV reg32,CR0/2/3/4           ; 0F 20 /r             [386]
+\c MOV reg32,DR0/1/2/3/6/7       ; 0F 21 /r             [386]
+\c MOV reg32,TR3/4/5/6/7         ; 0F 24 /r             [386]
+\c MOV CR0/2/3/4,reg32           ; 0F 22 /r             [386]
+\c MOV DR0/1/2/3/6/7,reg32       ; 0F 23 /r             [386]
+\c MOV TR3/4/5/6/7,reg32         ; 0F 26 /r             [386]
+
+\c{MOV} copies the contents of its source (second) operand into its
+destination (first) operand.
+
+In all forms of the \c{MOV} instruction, the two operands are the
+same size, except for moving between a segment register and an
+\c{r/m32} operand. These instructions are treated exactly like the
+corresponding 16-bit equivalent (so that, for example, \c{MOV
+DS,EAX} functions identically to \c{MOV DS,AX} but saves a prefix
+when in 32-bit mode), except that when a segment register is moved
+into a 32-bit destination, the top two bytes of the result are
+undefined.
+
+\c{MOV} may not use \c{CS} as a destination.
+
+\c{CR4} is only a supported register on the Pentium and above.
+
+\H{insMOVD} \i\c{MOVD}: Move Doubleword to/from MMX Register
+
+\c MOVD mmxreg,r/m32             ; 0F 6E /r             [PENT,MMX]
+\c MOVD r/m32,mmxreg             ; 0F 7E /r             [PENT,MMX]
+
+\c{MOVD} copies 32 bits from its source (second) operand into its
+destination (first) operand. When the destination is a 64-bit MMX
+register, the top 32 bits are set to zero.
+
+\H{insMOVQ} \i\c{MOVQ}: Move Quadword to/from MMX Register
+
+\c MOVQ mmxreg,r/m64             ; 0F 6F /r             [PENT,MMX]
+\c MOVQ r/m64,mmxreg             ; 0F 7F /r             [PENT,MMX]
+
+\c{MOVQ} copies 64 bits from its source (second) operand into its
+destination (first) operand.
+
+\H{insMOVSB} \i\c{MOVSB}, \i\c{MOVSW}, \i\c{MOVSD}: Move String
+
+\c MOVSB                         ; A4                   [8086]
+\c MOVSW                         ; o16 A5               [8086]
+\c MOVSD                         ; o32 A5               [386]
+
+\c{MOVSB} copies the byte at \c{[DS:SI]} or \c{[DS:ESI]} to
+\c{[ES:DI]} or \c{[ES:EDI]}. It then increments or decrements
+(depending on the direction flag: increments if the flag is clear,
+decrements if it is set) \c{SI} and \c{DI} (or \c{ESI} and \c{EDI}).
+
+The registers used are \c{SI} and \c{DI} if the address size is 16
+bits, and \c{ESI} and \c{EDI} if it is 32 bits. If you need to use
+an address size not equal to the current \c{BITS} setting, you can
+use an explicit \i\c{a16} or \i\c{a32} prefix.
+
+The segment register used to load from \c{[SI]} or \c{[ESI]} can be
+overridden by using a segment register name as a prefix (for
+example, \c{es movsb}). The use of \c{ES} for the store to \c{[DI]}
+or \c{[EDI]} cannot be overridden.
+
+\c{MOVSW} and \c{MOVSD} work in the same way, but they copy a word
+or a doubleword instead of a byte, and increment or decrement the
+addressing registers by 2 or 4 instead of 1.
+
+The \c{REP} prefix may be used to repeat the instruction \c{CX} (or
+\c{ECX} - again, the address size chooses which) times.
+
+\H{insMOVSX} \i\c{MOVSX}, \i\c{MOVZX}: Move Data with Sign or Zero Extend
+
+\c MOVSX reg16,r/m8              ; o16 0F BE /r         [386]
+\c MOVSX reg32,r/m8              ; o32 0F BE /r         [386]
+\c MOVSX reg32,r/m16             ; o32 0F BF /r         [386]
+
+\c MOVZX reg16,r/m8              ; o16 0F B6 /r         [386]
+\c MOVZX reg32,r/m8              ; o32 0F B6 /r         [386]
+\c MOVZX reg32,r/m16             ; o32 0F B7 /r         [386]
+
+\c{MOVSX} sign-extends its source (second) operand to the length of
+its destination (first) operand, and copies the result into the
+destination operand. \c{MOVZX} does the same, but zero-extends
+rather than sign-extending.
+
+\H{insMUL} \i\c{MUL}: Unsigned Integer Multiply
+
+\c MUL r/m8                      ; F6 /4                [8086]
+\c MUL r/m16                     ; o16 F7 /4            [8086]
+\c MUL r/m32                     ; o32 F7 /4            [386]
+
+\c{MUL} performs unsigned integer multiplication. The other operand
+to the multiplication, and the destination operand, are implicit, in
+the following way:
+
+\b For \c{MUL r/m8}, \c{AL} is multiplied by the given operand; the
+product is stored in \c{AX}.
+
+\b For \c{MUL r/m16}, \c{AX} is multiplied by the given operand;
+the product is stored in \c{DX:AX}.
+
+\b For \c{MUL r/m32}, \c{EAX} is multiplied by the given operand;
+the product is stored in \c{EDX:EAX}.
+
+Signed integer multiplication is performed by the \c{IMUL}
+instruction: see \k{insIMUL}.
+
+\H{insNEG} \i\c{NEG}, \i\c{NOT}: Two's and One's Complement
+
+\c NEG r/m8                      ; F6 /3                [8086]
+\c NEG r/m16                     ; o16 F7 /3            [8086]
+\c NEG r/m32                     ; o32 F7 /3            [386]
+
+\c NOT r/m8                      ; F6 /2                [8086]
+\c NOT r/m16                     ; o16 F7 /2            [8086]
+\c NOT r/m32                     ; o32 F7 /2            [386]
+
+\c{NEG} replaces the contents of its operand by the two's complement
+negation (invert all the bits and then add one) of the original
+value. \c{NOT}, similarly, performs one's complement (inverts all
+the bits).
+
+\H{insNOP} \i\c{NOP}: No Operation
+
+\c NOP                           ; 90                   [8086]
+
+\c{NOP} performs no operation. Its opcode is the same as that
+generated by \c{XCHG AX,AX} or \c{XCHG EAX,EAX} (depending on the
+processor mode; see \k{insXCHG}).
+
+\H{insOR} \i\c{OR}: Bitwise OR
+
+\c OR r/m8,reg8                  ; 08 /r                [8086]
+\c OR r/m16,reg16                ; o16 09 /r            [8086]
+\c OR r/m32,reg32                ; o32 09 /r            [386]
+
+\c OR reg8,r/m8                  ; 0A /r                [8086]
+\c OR reg16,r/m16                ; o16 0B /r            [8086]
+\c OR reg32,r/m32                ; o32 0B /r            [386]
+
+\c OR r/m8,imm8                  ; 80 /1 ib             [8086]
+\c OR r/m16,imm16                ; o16 81 /1 iw         [8086]
+\c OR r/m32,imm32                ; o32 81 /1 id         [386]
+
+\c OR r/m16,imm8                 ; o16 83 /1 ib         [8086]
+\c OR r/m32,imm8                 ; o32 83 /1 ib         [386]
+
+\c OR AL,imm8                    ; 0C ib                [8086]
+\c OR AX,imm16                   ; o16 0D iw            [8086]
+\c OR EAX,imm32                  ; o32 0D id            [386]
+
+\c{OR} performs a bitwise OR operation between its two operands
+(i.e. each bit of the result is 1 if and only if at least one of the
+corresponding bits of the two inputs was 1), and stores the result
+in the destination (first) operand.
+
+In the forms with an 8-bit immediate second operand and a longer
+first operand, the second operand is considered to be signed, and is
+sign-extended to the length of the first operand. In these cases,
+the \c{BYTE} qualifier is necessary to force NASM to generate this
+form of the instruction.
+
+The MMX instruction \c{POR} (see \k{insPOR}) performs the same
+operation on the 64-bit MMX registers.
+
+\H{insOUT} \i\c{OUT}: Output Data to I/O Port
+
+\c OUT imm8,AL                   ; E6 ib                [8086]
+\c OUT imm8,AX                   ; o16 E7 ib            [8086]
+\c OUT imm8,EAX                  ; o32 E7 ib            [386]
+\c OUT DX,AL                     ; EE                   [8086]
+\c OUT DX,AX                     ; o16 EF               [8086]
+\c OUT DX,EAX                    ; o32 EF               [386]
+
+\c{OUT} writes the contents of the given source register to the
+specified I/O port. The port number may be specified as an immediate
+value if it is between 0 and 255, and otherwise must be stored in
+\c{DX}. See also \c{IN} (\k{insIN}).
+
+\H{insOUTSB} \i\c{OUTSB}, \i\c{OUTSW}, \i\c{OUTSD}: Output String to I/O Port
+
+\c OUTSB                         ; 6E                   [186]
+
+\c OUTSW                         ; o16 6F               [186]
+
+\c OUTSD                         ; o32 6F               [386]
+
+\c{OUTSB} loads a byte from \c{[DS:SI]} or \c{[DS:ESI]} and writes
+it to the I/O port specified in \c{DX}. It then increments or
+decrements (depending on the direction flag: increments if the flag
+is clear, decrements if it is set) \c{SI} or \c{ESI}.
+
+The register used is \c{SI} if the address size is 16 bits, and
+\c{ESI} if it is 32 bits. If you need to use an address size not
+equal to the current \c{BITS} setting, you can use an explicit
+\i\c{a16} or \i\c{a32} prefix.
+
+The segment register used to load from \c{[SI]} or \c{[ESI]} can be
+overridden by using a segment register name as a prefix (for
+example, \c{es outsb}).
+
+\c{OUTSW} and \c{OUTSD} work in the same way, but they output a
+word or a doubleword instead of a byte, and increment or decrement
+the addressing registers by 2 or 4 instead of 1.
+
+The \c{REP} prefix may be used to repeat the instruction \c{CX} (or
+\c{ECX} - again, the address size chooses which) times.
+
+\H{insPACKSSDW} \i\c{PACKSSDW}, \i\c{PACKSSWB}, \i\c{PACKUSWB}: Pack Data
+
+\c PACKSSDW mmxreg,r/m64         ; 0F 6B /r             [PENT,MMX]
+\c PACKSSWB mmxreg,r/m64         ; 0F 63 /r             [PENT,MMX]
+\c PACKUSWB mmxreg,r/m64         ; 0F 67 /r             [PENT,MMX]
+
+All these instructions start by forming a notional 128-bit word by
+placing the source (second) operand on the left of the destination
+(first) operand. \c{PACKSSDW} then splits this 128-bit word into
+four doublewords, converts each to a word, and loads them side by
+side into the destination register; \c{PACKSSWB} and \c{PACKUSWB}
+both split the 128-bit word into eight words, convert each to a
+byte, and load \e{those} side by side into the destination
+register.
+
+\c{PACKSSDW} and \c{PACKSSWB} perform signed saturation when
+reducing the length of numbers: if the number is too large to fit
+into the reduced space, they replace it by the largest signed number
+(\c{7FFFh} or \c{7Fh}) that \e{will} fit, and if it is too small
+then they replace it by the smallest signed number (\c{8000h} or
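+\c{80h}) that will fit. \c{PACKUSWB} performs unsigned saturation:
+it treats its input as signed, but clips it to the range of an
+unsigned byte: negative numbers are replaced by zero, and numbers
+that are too large are replaced by the largest unsigned number
+(\c{0FFh}) that will fit.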
+
+\H{insPADDB} \i\c{PADDxx}: MMX Packed Addition
+
+\c PADDB mmxreg,r/m64            ; 0F FC /r             [PENT,MMX]
+\c PADDW mmxreg,r/m64            ; 0F FD /r             [PENT,MMX]
+\c PADDD mmxreg,r/m64            ; 0F FE /r             [PENT,MMX]
+
+\c PADDSB mmxreg,r/m64           ; 0F EC /r             [PENT,MMX]
+\c PADDSW mmxreg,r/m64           ; 0F ED /r             [PENT,MMX]
+
+\c PADDUSB mmxreg,r/m64          ; 0F DC /r             [PENT,MMX]
+\c PADDUSW mmxreg,r/m64          ; 0F DD /r             [PENT,MMX]
+
+\c{PADDxx} all perform packed addition between their two 64-bit
+operands, storing the result in the destination (first) operand. The
+\c{PADDxB} forms treat the 64-bit operands as vectors of eight
+bytes, and add each byte individually; \c{PADDxW} treat the operands
+as vectors of four words; and \c{PADDD} treats its operands as
+vectors of two doublewords.
+
+\c{PADDSB} and \c{PADDSW} perform signed saturation on the sum of
+each pair of bytes or words: if the result of an addition is too
+large or too small to fit into a signed byte or word result, it is
+clipped (saturated) to the largest or smallest value which \e{will}
+fit. \c{PADDUSB} and \c{PADDUSW} similarly perform unsigned
+saturation, clipping to \c{0FFh} or \c{0FFFFh} if the result is
+larger than that.
+
+\H{insPADDSIW} \i\c{PADDSIW}: MMX Packed Addition to Implicit
+Destination
+
+\c PADDSIW mmxreg,r/m64          ; 0F 51 /r             [CYRIX,MMX]
+
+\c{PADDSIW}, specific to the Cyrix extensions to the MMX instruction
+set, performs the same function as \c{PADDSW}, except that the
+result is not placed in the register specified by the first operand,
+but instead in the register whose number differs from the first
+operand only in the last bit. So \c{PADDSIW MM0,MM2} would put the
+result in \c{MM1}, but \c{PADDSIW MM1,MM2} would put the result in
+\c{MM0}.
+
+\H{insPAND} \i\c{PAND}, \i\c{PANDN}: MMX Bitwise AND and AND-NOT
+
+\c PAND mmxreg,r/m64             ; 0F DB /r             [PENT,MMX]
+\c PANDN mmxreg,r/m64            ; 0F DF /r             [PENT,MMX]
+
+\c{PAND} performs a bitwise AND operation between its two operands
+(i.e. each bit of the result is 1 if and only if the corresponding
+bits of the two inputs were both 1), and stores the result in the
+destination (first) operand.
+
+\c{PANDN} performs the same operation, but performs a one's
+complement operation on the destination (first) operand first.
+
+\H{insPAVEB} \i\c{PAVEB}: MMX Packed Average
+
+\c PAVEB mmxreg,r/m64            ; 0F 50 /r             [CYRIX,MMX]
+
+\c{PAVEB}, specific to the Cyrix MMX extensions, treats its two
+operands as vectors of eight unsigned bytes, and calculates the
+average of the corresponding bytes in the operands. The resulting
+vector of eight averages is stored in the first operand.
+
+\H{insPCMPEQB} \i\c{PCMPxx}: MMX Packed Comparison
+
+\c PCMPEQB mmxreg,r/m64          ; 0F 74 /r             [PENT,MMX]
+\c PCMPEQW mmxreg,r/m64          ; 0F 75 /r             [PENT,MMX]
+\c PCMPEQD mmxreg,r/m64          ; 0F 76 /r             [PENT,MMX]
+
+\c PCMPGTB mmxreg,r/m64          ; 0F 64 /r             [PENT,MMX]
+\c PCMPGTW mmxreg,r/m64          ; 0F 65 /r             [PENT,MMX]
+\c PCMPGTD mmxreg,r/m64          ; 0F 66 /r             [PENT,MMX]
+
+The \c{PCMPxx} instructions all treat their operands as vectors of
+bytes, words, or doublewords; corresponding elements of the source
+and destination are compared, and the corresponding element of the
+destination (first) operand is set to all zeros or all ones
+depending on the result of the comparison.
+
+\c{PCMPxxB} treats the operands as vectors of eight bytes,
+\c{PCMPxxW} treats them as vectors of four words, and \c{PCMPxxD} as
+two doublewords.
+
+\c{PCMPEQx} sets the corresponding element of the destination
+operand to all ones if the two elements compared are equal;
+\c{PCMPGTx} sets the destination element to all ones if the element
+of the first (destination) operand is greater (treated as a signed
+integer) than that of the second (source) operand.
+
+\H{insPDISTIB} \i\c{PDISTIB}: MMX Packed Distance and Accumulate
+with Implied Register
+
+\c PDISTIB mmxreg,mem64          ; 0F 54 /r             [CYRIX,MMX]
+
+\c{PDISTIB}, specific to the Cyrix MMX extensions, treats its two
+input operands as vectors of eight unsigned bytes. For each byte
+position, it finds the absolute difference between the bytes in that
+position in the two input operands, and adds that value to the byte
+in the same position in the implied output register. The addition is
+saturated to an unsigned byte in the same way as \c{PADDUSB}.
+
+The implied output register is found in the same way as \c{PADDSIW}
+(\k{insPADDSIW}).
+
+Note that \c{PDISTIB} cannot take a register as its second source
+operand.
+
+\H{insPMACHRIW} \i\c{PMACHRIW}: MMX Packed Multiply and Accumulate
+with Rounding
+
+\c PMACHRIW mmxreg,mem64         ; 0F 5E /r             [CYRIX,MMX]
+
+\c{PMACHRIW} acts almost identically to \c{PMULHRIW}
+(\k{insPMULHRW}), but instead of \e{storing} its result in the
+implied destination register, it \e{adds} its result, as four packed
+words, to the implied destination register. No saturation is done:
+the addition can wrap around.
+
+Note that \c{PMACHRIW} cannot take a register as its second source
+operand.
+
+\H{insPMADDWD} \i\c{PMADDWD}: MMX Packed Multiply and Add
+
+\c PMADDWD mmxreg,r/m64          ; 0F F5 /r             [PENT,MMX]
+
+\c{PMADDWD} treats its two inputs as vectors of four signed words.
+It multiplies corresponding elements of the two operands, giving
+four signed doubleword results. The top two of these are added and
+placed in the top 32 bits of the destination (first) operand; the
+bottom two are added and placed in the bottom 32 bits.
+
+\H{insPMAGW} \i\c{PMAGW}: MMX Packed Magnitude
+
+\c PMAGW mmxreg,r/m64            ; 0F 52 /r             [CYRIX,MMX]
+
+\c{PMAGW}, specific to the Cyrix MMX extensions, treats both its
+operands as vectors of four signed words. It compares the absolute
+values of the words in corresponding positions, and sets each word
+of the destination (first) operand to whichever of the two words in
+that position had the larger absolute value.
+
+\H{insPMULHRW} \i\c{PMULHRW}, \i\c{PMULHRIW}: MMX Packed Multiply
+High with Rounding
+
+\c PMULHRW mmxreg,r/m64          ; 0F 59 /r             [CYRIX,MMX]
+\c PMULHRIW mmxreg,r/m64         ; 0F 5D /r             [CYRIX,MMX]
+
+These instructions, specific to the Cyrix MMX extensions, treat
+their operands as vectors of four signed words. Words in
+corresponding positions are multiplied, to give a 32-bit value in
+which bits 30 and 31 are guaranteed equal. Bits 30 to 15 of this
+value (bit mask \c{0x7FFF8000}) are taken and stored in the
+corresponding position of the destination operand, after first
+rounding the low bit (equivalent to adding \c{0x4000} before
+extracting bits 30 to 15).
+
+For \c{PMULHRW}, the destination operand is the first operand; for
+\c{PMULHRIW} the destination operand is implied by the first operand
+in the manner of \c{PADDSIW} (\k{insPADDSIW}).
+
+\H{insPMULHW} \i\c{PMULHW}, \i\c{PMULLW}: MMX Packed Multiply
+
+\c PMULHW mmxreg,r/m64           ; 0F E5 /r             [PENT,MMX]
+\c PMULLW mmxreg,r/m64           ; 0F D5 /r             [PENT,MMX]
+
+\c{PMULxW} treats its two inputs as vectors of four signed words. It
+multiplies corresponding elements of the two operands, giving four
+signed doubleword results.
+
+\c{PMULHW} then stores the top 16 bits of each doubleword in the
+destination (first) operand; \c{PMULLW} stores the bottom 16 bits of
+each doubleword in the destination operand.
+
+\H{insPMVccZB} \i\c{PMVccZB}: MMX Packed Conditional Move
+
+\c PMVZB mmxreg,mem64            ; 0F 58 /r             [CYRIX,MMX]
+\c PMVNZB mmxreg,mem64           ; 0F 5A /r             [CYRIX,MMX]
+\c PMVLZB mmxreg,mem64           ; 0F 5B /r             [CYRIX,MMX]
+\c PMVGEZB mmxreg,mem64          ; 0F 5C /r             [CYRIX,MMX]
+
+These instructions, specific to the Cyrix MMX extensions, perform
+parallel conditional moves. The two input operands are treated as
+vectors of eight bytes. Each byte of the destination (first) operand
+is either written from the corresponding byte of the source (second)
+operand, or left alone, depending on the value of the byte in the
+\e{implied} operand (specified in the same way as \c{PADDSIW}, in
+\k{insPADDSIW}).
+
+\c{PMVZB} performs each move if the corresponding byte in the
+implied operand is zero. \c{PMVNZB} moves if the byte is non-zero.
+\c{PMVLZB} moves if the byte is less than zero, and \c{PMVGEZB}
+moves if the byte is greater than or equal to zero.
+
+Note that these instructions cannot take a register as their second
+source operand.
+
+\H{insPOP} \i\c{POP}: Pop Data from Stack
+
+\c POP reg16                     ; o16 58+r             [8086]
+\c POP reg32                     ; o32 58+r             [386]
+
+\c POP r/m16                     ; o16 8F /0            [8086]
+\c POP r/m32                     ; o32 8F /0            [386]
+
+\c POP CS                        ; 0F                   [8086,UNDOC]
+\c POP DS                        ; 1F                   [8086]
+\c POP ES                        ; 07                   [8086]
+\c POP SS                        ; 17                   [8086]
+\c POP FS                        ; 0F A1                [386]
+\c POP GS                        ; 0F A9                [386]
+
+\c{POP} loads a value from the stack (from \c{[SS:SP]} or
+\c{[SS:ESP]}) and then increments the stack pointer.
+
+The address-size attribute of the instruction determines whether
+\c{SP} or \c{ESP} is used as the stack pointer: to deliberately
+override the default given by the \c{BITS} setting, you can use an
+\i\c{a16} or \i\c{a32} prefix.
+
+The operand-size attribute of the instruction determines whether the
+stack pointer is incremented by 2 or 4: this means that segment
+register pops in \c{BITS 32} mode will pop 4 bytes off the stack and
+discard the upper two of them. If you need to override that, you can
+use an \i\c{o16} or \i\c{o32} prefix.
+
+The above opcode listings give two forms for general-purpose
+register pop instructions: for example, \c{POP BX} has the two forms
+\c{5B} and \c{8F C3}. NASM will always generate the shorter form
+when given \c{POP BX}. NDISASM will disassemble both.
+
+\c{POP CS} is not a documented instruction, and is not supported on
+any processor above the 8086 (since they use \c{0Fh} as an opcode
+prefix for instruction set extensions). However, at least some 8086
+processors do support it, and so NASM generates it for completeness.
+
+\H{insPOPA} \i\c{POPAx}: Pop All General-Purpose Registers
+
+\c POPA                          ; 61                   [186]
+\c POPAW                         ; o16 61               [186]
+\c POPAD                         ; o32 61               [386]
+
+\c{POPAW} pops a word from the stack into each of, successively,
+\c{DI}, \c{SI}, \c{BP}, nothing (it discards a word from the stack
+which was a placeholder for \c{SP}), \c{BX}, \c{DX}, \c{CX} and
+\c{AX}. It is intended to reverse the operation of \c{PUSHAW} (see
+\k{insPUSHA}), but it ignores the value for \c{SP} that was pushed
+on the stack by \c{PUSHAW}.
+
+\c{POPAD} pops twice as much data, and places the results in
+\c{EDI}, \c{ESI}, \c{EBP}, nothing (placeholder for \c{ESP}),
+\c{EBX}, \c{EDX}, \c{ECX} and \c{EAX}. It reverses the operation of
+\c{PUSHAD}.
+
+\c{POPA} is an alias mnemonic for either \c{POPAW} or \c{POPAD},
+depending on the current \c{BITS} setting.
+
+Note that the registers are popped in reverse order of their numeric
+values in opcodes (see \k{iref-rv}).
+
+\H{insPOPF} \i\c{POPFx}: Pop Flags Register
+
+\c POPF                          ; 9D                   [186]
+\c POPFW                         ; o16 9D               [186]
+\c POPFD                         ; o32 9D               [386]
+
+\c{POPFW} pops a word from the stack and stores it in the bottom 16
+bits of the flags register (or the whole flags register, on
+processors below a 386). \c{POPFD} pops a doubleword and stores it
+in the entire flags register.
+
+\c{POPF} is an alias mnemonic for either \c{POPFW} or \c{POPFD},
+depending on the current \c{BITS} setting.
+
+See also \c{PUSHF} (\k{insPUSHF}).
+
+\H{insPOR} \i\c{POR}: MMX Bitwise OR
+
+\c POR mmxreg,r/m64              ; 0F EB /r             [PENT,MMX]
+
+\c{POR} performs a bitwise OR operation between its two operands
+(i.e. each bit of the result is 1 if and only if at least one of the
+corresponding bits of the two inputs was 1), and stores the result
+in the destination (first) operand.
+
+\H{insPSLLD} \i\c{PSLLx}, \i\c{PSRLx}, \i\c{PSRAx}: MMX Bit Shifts
+
+\c PSLLW mmxreg,r/m64            ; 0F F1 /r             [PENT,MMX]
+\c PSLLW mmxreg,imm8             ; 0F 71 /6 ib          [PENT,MMX]
+
+\c PSLLD mmxreg,r/m64            ; 0F F2 /r             [PENT,MMX]
+\c PSLLD mmxreg,imm8             ; 0F 72 /6 ib          [PENT,MMX]
+
+\c PSLLQ mmxreg,r/m64            ; 0F F3 /r             [PENT,MMX]
+\c PSLLQ mmxreg,imm8             ; 0F 73 /6 ib          [PENT,MMX]
+
+\c PSRAW mmxreg,r/m64            ; 0F E1 /r             [PENT,MMX]
+\c PSRAW mmxreg,imm8             ; 0F 71 /4 ib          [PENT,MMX]
+
+\c PSRAD mmxreg,r/m64            ; 0F E2 /r             [PENT,MMX]
+\c PSRAD mmxreg,imm8             ; 0F 72 /4 ib          [PENT,MMX]
+
+\c PSRLW mmxreg,r/m64            ; 0F D1 /r             [PENT,MMX]
+\c PSRLW mmxreg,imm8             ; 0F 71 /2 ib          [PENT,MMX]
+
+\c PSRLD mmxreg,r/m64            ; 0F D2 /r             [PENT,MMX]
+\c PSRLD mmxreg,imm8             ; 0F 72 /2 ib          [PENT,MMX]
+
+\c PSRLQ mmxreg,r/m64            ; 0F D3 /r             [PENT,MMX]
+\c PSRLQ mmxreg,imm8             ; 0F 73 /2 ib          [PENT,MMX]
+
+\c{PSxxQ} perform simple bit shifts on the 64-bit MMX registers: the
+destination (first) operand is shifted left or right by the number of
+bits given in the source (second) operand, and the vacated bits are
+filled in with zeros (for a logical shift) or copies of the original
+sign bit (for an arithmetic right shift).
+
+\c{PSxxW} and \c{PSxxD} perform packed bit shifts: the destination
+operand is treated as a vector of four words or two doublewords, and
+each element is shifted individually, so bits shifted out of one
+element do not interfere with empty bits coming into the next.
+
+\c{PSLLx} and \c{PSRLx} perform logical shifts: the vacated bits at
+one end of the shifted number are filled with zeros. \c{PSRAx}
+performs an arithmetic right shift: the vacated bits at the top of
+the shifted number are filled with copies of the original top (sign)
+bit.
+
+\H{insPSUBB} \i\c{PSUBxx}: MMX Packed Subtraction
+
+\c PSUBB mmxreg,r/m64            ; 0F F8 /r             [PENT,MMX]
+\c PSUBW mmxreg,r/m64            ; 0F F9 /r             [PENT,MMX]
+\c PSUBD mmxreg,r/m64            ; 0F FA /r             [PENT,MMX]
+
+\c PSUBSB mmxreg,r/m64           ; 0F E8 /r             [PENT,MMX]
+\c PSUBSW mmxreg,r/m64           ; 0F E9 /r             [PENT,MMX]
+
+\c PSUBUSB mmxreg,r/m64          ; 0F D8 /r             [PENT,MMX]
+\c PSUBUSW mmxreg,r/m64          ; 0F D9 /r             [PENT,MMX]
+
+\c{PSUBxx} all perform packed subtraction between their two 64-bit
+operands, storing the result in the destination (first) operand. The
+\c{PSUBxB} forms treat the 64-bit operands as vectors of eight
+bytes, and subtract each byte individually; \c{PSUBxW} treat the operands
+as vectors of four words; and \c{PSUBD} treats its operands as
+vectors of two doublewords.
+
+In all cases, the elements of the operand on the right are
+subtracted from the corresponding elements of the operand on the
+left, not the other way round.
+
+\c{PSUBSB} and \c{PSUBSW} perform signed saturation on the
+difference of each pair of bytes or words: if the result of a
+subtraction is too large or too small to fit into a signed byte or
+word result, it is clipped (saturated) to the largest or smallest
+value which \e{will} fit. \c{PSUBUSB} and \c{PSUBUSW} similarly
+perform unsigned saturation, clipping to zero if the result would
+otherwise be negative.
+
+\H{insPSUBSIW} \i\c{PSUBSIW}: MMX Packed Subtract with Saturation to
+Implied Destination
+
+\c PSUBSIW mmxreg,r/m64          ; 0F 55 /r             [CYRIX,MMX]
+
+\c{PSUBSIW}, specific to the Cyrix extensions to the MMX instruction
+set, performs the same function as \c{PSUBSW}, except that the
+result is not placed in the register specified by the first operand,
+but instead in the implied destination register, specified as for
+\c{PADDSIW} (\k{insPADDSIW}).
+
+\H{insPUNPCKHBW} \i\c{PUNPCKxxx}: Unpack Data
+
+\c PUNPCKHBW mmxreg,r/m64        ; 0F 68 /r             [PENT,MMX]
+\c PUNPCKHWD mmxreg,r/m64        ; 0F 69 /r             [PENT,MMX]
+\c PUNPCKHDQ mmxreg,r/m64        ; 0F 6A /r             [PENT,MMX]
+
+\c PUNPCKLBW mmxreg,r/m64        ; 0F 60 /r             [PENT,MMX]
+\c PUNPCKLWD mmxreg,r/m64        ; 0F 61 /r             [PENT,MMX]
+\c PUNPCKLDQ mmxreg,r/m64        ; 0F 62 /r             [PENT,MMX]
+
+\c{PUNPCKxx} all treat their operands as vectors, and produce a new
+vector generated by interleaving elements from the two inputs. The
+\c{PUNPCKHxx} instructions start by throwing away the bottom half of
+each input operand, and the \c{PUNPCKLxx} instructions throw away
+the top half.
+
+The remaining elements, totalling 64 bits, are then interleaved into
+the destination, alternating elements from the second (source)
+operand and the first (destination) operand: so the leftmost element
+in the result always comes from the second operand, and the
+rightmost from the destination.
+
+\c{PUNPCKxBW} works a byte at a time, \c{PUNPCKxWD} a word at a
+time, and \c{PUNPCKxDQ} a doubleword at a time.
+
+So, for example, if the first operand held \c{0x7A6A5A4A3A2A1A0A}
+and the second held \c{0x7B6B5B4B3B2B1B0B}, then:
+
+\b \c{PUNPCKHBW} would return \c{0x7B7A6B6A5B5A4B4A}.
+
+\b \c{PUNPCKHWD} would return \c{0x7B6B7A6A5B4B5A4A}.
+
+\b \c{PUNPCKHDQ} would return \c{0x7B6B5B4B7A6A5A4A}.
+
+\b \c{PUNPCKLBW} would return \c{0x3B3A2B2A1B1A0B0A}.
+
+\b \c{PUNPCKLWD} would return \c{0x3B2B3A2A1B0B1A0A}.
+
+\b \c{PUNPCKLDQ} would return \c{0x3B2B1B0B3A2A1A0A}.
+
+\H{insPUSH} \i\c{PUSH}: Push Data on Stack
+
+\c PUSH reg16                    ; o16 50+r             [8086]
+\c PUSH reg32                    ; o32 50+r             [386]
+
+\c PUSH r/m16                    ; o16 FF /6            [8086]
+\c PUSH r/m32                    ; o32 FF /6            [386]
+
+\c PUSH CS                       ; 0E                   [8086]
+\c PUSH DS                       ; 1E                   [8086]
+\c PUSH ES                       ; 06                   [8086]
+\c PUSH SS                       ; 16                   [8086]
+\c PUSH FS                       ; 0F A0                [386]
+\c PUSH GS                       ; 0F A8                [386]
+
+\c PUSH imm8                     ; 6A ib                [286]
+\c PUSH imm16                    ; o16 68 iw            [286]
+\c PUSH imm32                    ; o32 68 id            [386]
+
+\c{PUSH} decrements the stack pointer (\c{SP} or \c{ESP}) by 2 or 4,
+and then stores the given value at \c{[SS:SP]} or \c{[SS:ESP]}.
+
+The address-size attribute of the instruction determines whether
+\c{SP} or \c{ESP} is used as the stack pointer: to deliberately
+override the default given by the \c{BITS} setting, you can use an
+\i\c{a16} or \i\c{a32} prefix.
+
+The operand-size attribute of the instruction determines whether the
+stack pointer is decremented by 2 or 4: this means that segment
+register pushes in \c{BITS 32} mode will push 4 bytes on the stack,
+of which the upper two are undefined. If you need to override that,
+you can use an \i\c{o16} or \i\c{o32} prefix.
+
+The above opcode listings give two forms for general-purpose
+\i{register push} instructions: for example, \c{PUSH BX} has the two
+forms \c{53} and \c{FF F3}. NASM will always generate the shorter
+form when given \c{PUSH BX}. NDISASM will disassemble both.
+
+Unlike the undocumented and barely supported \c{POP CS}, \c{PUSH CS}
+is a perfectly valid and sensible instruction, supported on all
+processors.
+
+The instruction \c{PUSH SP} may be used to distinguish an 8086 from
+later processors: on an 8086, the value of \c{SP} stored is the
+value it has \e{after} the push instruction, whereas on later
+processors it is the value \e{before} the push instruction.
+
+\H{insPUSHA} \i\c{PUSHAx}: Push All General-Purpose Registers
+
+\c PUSHA                         ; 60                   [186]
+\c PUSHAD                        ; o32 60               [386]
+\c PUSHAW                        ; o16 60               [186]
+
+\c{PUSHAW} pushes, in succession, \c{AX}, \c{CX}, \c{DX}, \c{BX},
+\c{SP}, \c{BP}, \c{SI} and \c{DI} on the stack, decrementing the
+stack pointer by a total of 16.
+
+\c{PUSHAD} pushes, in succession, \c{EAX}, \c{ECX}, \c{EDX},
+\c{EBX}, \c{ESP}, \c{EBP}, \c{ESI} and \c{EDI} on the stack,
+decrementing the stack pointer by a total of 32.
+
+In both cases, the value of \c{SP} or \c{ESP} pushed is its
+\e{original} value, as it had before the instruction was executed.
+
+\c{PUSHA} is an alias mnemonic for either \c{PUSHAW} or \c{PUSHAD},
+depending on the current \c{BITS} setting.
+
+Note that the registers are pushed in order of their numeric values
+in opcodes (see \k{iref-rv}).
+
+See also \c{POPA} (\k{insPOPA}).
+
+\H{insPUSHF} \i\c{PUSHFx}: Push Flags Register
+
+\c PUSHF                         ; 9C                   [186]
+\c PUSHFD                        ; o32 9C               [386]
+\c PUSHFW                        ; o16 9C               [186]
+
+\c{PUSHFW} pushes the bottom 16 bits of the flags register (or the
+whole flags register, on processors below a 386) on the stack.
+\c{PUSHFD} pushes the entire flags register on the stack as a
+doubleword.
+
+\c{PUSHF} is an alias mnemonic for either \c{PUSHFW} or \c{PUSHFD},
+depending on the current \c{BITS} setting.
+
+See also \c{POPF} (\k{insPOPF}).
+
+\H{insPXOR} \i\c{PXOR}: MMX Bitwise XOR
+
+\c PXOR mmxreg,r/m64             ; 0F EF /r             [PENT,MMX]
+
+\c{PXOR} performs a bitwise XOR operation between its two operands
+(i.e. each bit of the result is 1 if and only if exactly one of the
+corresponding bits of the two inputs was 1), and stores the result
+in the destination (first) operand.
+
+\H{insRCL} \i\c{RCL}, \i\c{RCR}: Bitwise Rotate through Carry Bit
+
+\c RCL r/m8,1                    ; D0 /2                [8086]
+\c RCL r/m8,CL                   ; D2 /2                [8086]
+\c RCL r/m8,imm8                 ; C0 /2 ib             [286]
+\c RCL r/m16,1                   ; o16 D1 /2            [8086]
+\c RCL r/m16,CL                  ; o16 D3 /2            [8086]
+\c RCL r/m16,imm8                ; o16 C1 /2 ib         [286]
+\c RCL r/m32,1                   ; o32 D1 /2            [386]
+\c RCL r/m32,CL                  ; o32 D3 /2            [386]
+\c RCL r/m32,imm8                ; o32 C1 /2 ib         [386]
+
+\c RCR r/m8,1                    ; D0 /3                [8086]
+\c RCR r/m8,CL                   ; D2 /3                [8086]
+\c RCR r/m8,imm8                 ; C0 /3 ib             [286]
+\c RCR r/m16,1                   ; o16 D1 /3            [8086]
+\c RCR r/m16,CL                  ; o16 D3 /3            [8086]
+\c RCR r/m16,imm8                ; o16 C1 /3 ib         [286]
+\c RCR r/m32,1                   ; o32 D1 /3            [386]
+\c RCR r/m32,CL                  ; o32 D3 /3            [386]
+\c RCR r/m32,imm8                ; o32 C1 /3 ib         [386]
+
+\c{RCL} and \c{RCR} perform a 9-bit, 17-bit or 33-bit bitwise
+rotation operation, involving the given source/destination (first)
+operand and the carry bit. Thus, for example, in the operation
+\c{RCL AL,1}, a 9-bit rotation is performed in which \c{AL} is
+shifted left by 1, the top bit of \c{AL} moves into the carry flag,
+and the original value of the carry flag is placed in the low bit of
+\c{AL}.
+
+The number of bits to rotate by is given by the second operand. Only
+the bottom five bits of the rotation count are considered by
+processors above the 8086.
+
+You can force the longer (286 and upwards, beginning with a \c{C1}
+byte) form of \c{RCL foo,1} by using a \c{BYTE} prefix: \c{RCL
+foo,BYTE 1}. Similarly with \c{RCR}.
+
+\H{insRDMSR} \i\c{RDMSR}: Read Model-Specific Registers
+
+\c RDMSR                         ; 0F 32                [PENT]
+
+\c{RDMSR} reads the processor Model-Specific Register (MSR) whose
+index is stored in \c{ECX}, and stores the result in \c{EDX:EAX}.
+See also \c{WRMSR} (\k{insWRMSR}).
+
+\H{insRDPMC} \i\c{RDPMC}: Read Performance-Monitoring Counters
+
+\c RDPMC                         ; 0F 33                [P6]
+
+\c{RDPMC} reads the processor performance-monitoring counter whose
+index is stored in \c{ECX}, and stores the result in \c{EDX:EAX}.
+
+\H{insRDTSC} \i\c{RDTSC}: Read Time-Stamp Counter
+
+\c RDTSC                         ; 0F 31                [PENT]
+
+\c{RDTSC} reads the processor's time-stamp counter into \c{EDX:EAX}.
+
+\H{insRET} \i\c{RET}, \i\c{RETF}, \i\c{RETN}: Return from Procedure Call
+
+\c RET                           ; C3                   [8086]
+\c RET imm16                     ; C2 iw                [8086]
+
+\c RETF                          ; CB                   [8086]
+\c RETF imm16                    ; CA iw                [8086]
+
+\c RETN                          ; C3                   [8086]
+\c RETN imm16                    ; C2 iw                [8086]
+
+\c{RET}, and its exact synonym \c{RETN}, pop \c{IP} or \c{EIP} from
+the stack and transfer control to the new address. Optionally, if a
+numeric operand is provided, they increment the stack pointer by a
+further \c{imm16} bytes after popping the return address.
+
+\c{RETF} executes a far return: after popping \c{IP}/\c{EIP}, it
+then pops \c{CS}, and \e{then} increments the stack pointer by the
+optional argument if present.
+
+\H{insROL} \i\c{ROL}, \i\c{ROR}: Bitwise Rotate
+
+\c ROL r/m8,1                    ; D0 /0                [8086]
+\c ROL r/m8,CL                   ; D2 /0                [8086]
+\c ROL r/m8,imm8                 ; C0 /0 ib             [286]
+\c ROL r/m16,1                   ; o16 D1 /0            [8086]
+\c ROL r/m16,CL                  ; o16 D3 /0            [8086]
+\c ROL r/m16,imm8                ; o16 C1 /0 ib         [286]
+\c ROL r/m32,1                   ; o32 D1 /0            [386]
+\c ROL r/m32,CL                  ; o32 D3 /0            [386]
+\c ROL r/m32,imm8                ; o32 C1 /0 ib         [386]
+
+\c ROR r/m8,1                    ; D0 /1                [8086]
+\c ROR r/m8,CL                   ; D2 /1                [8086]
+\c ROR r/m8,imm8                 ; C0 /1 ib             [286]
+\c ROR r/m16,1                   ; o16 D1 /1            [8086]
+\c ROR r/m16,CL                  ; o16 D3 /1            [8086]
+\c ROR r/m16,imm8                ; o16 C1 /1 ib         [286]
+\c ROR r/m32,1                   ; o32 D1 /1            [386]
+\c ROR r/m32,CL                  ; o32 D3 /1            [386]
+\c ROR r/m32,imm8                ; o32 C1 /1 ib         [386]
+
+\c{ROL} and \c{ROR} perform a bitwise rotation operation on the given
+source/destination (first) operand. Thus, for example, in the
+operation \c{ROL AL,1}, an 8-bit rotation is performed in which
+\c{AL} is shifted left by 1 and the original top bit of \c{AL} moves
+round into the low bit.
+
+The number of bits to rotate by is given by the second operand. Only
+the bottom 3, 4 or 5 bits (depending on the source operand size) of
+the rotation count are considered by processors above the 8086.
+
+You can force the longer (286 and upwards, beginning with a \c{C1}
+byte) form of \c{ROL foo,1} by using a \c{BYTE} prefix: \c{ROL
+foo,BYTE 1}. Similarly with \c{ROR}.
+
+\H{insRSM} \i\c{RSM}: Resume from System-Management Mode
+
+\c RSM                           ; 0F AA                [PENT]
+
+\c{RSM} returns the processor to its normal operating mode when it
+was in System-Management Mode.
+
+\H{insSAHF} \i\c{SAHF}: Store AH to Flags
+
+\c SAHF                          ; 9E                   [8086]
+
+\c{SAHF} sets the low byte of the flags word according to the
+contents of the \c{AH} register. See also \c{LAHF} (\k{insLAHF}).
+
+\H{insSAL} \i\c{SAL}, \i\c{SAR}: Bitwise Arithmetic Shifts
+
+\c SAL r/m8,1                    ; D0 /4                [8086]
+\c SAL r/m8,CL                   ; D2 /4                [8086]
+\c SAL r/m8,imm8                 ; C0 /4 ib             [286]
+\c SAL r/m16,1                   ; o16 D1 /4            [8086]
+\c SAL r/m16,CL                  ; o16 D3 /4            [8086]
+\c SAL r/m16,imm8                ; o16 C1 /4 ib         [286]
+\c SAL r/m32,1                   ; o32 D1 /4            [386]
+\c SAL r/m32,CL                  ; o32 D3 /4            [386]
+\c SAL r/m32,imm8                ; o32 C1 /4 ib         [386]
+
+\c SAR r/m8,1                    ; D0 /7                [8086]
+\c SAR r/m8,CL                   ; D2 /7                [8086]
+\c SAR r/m8,imm8                 ; C0 /7 ib             [286]
+\c SAR r/m16,1                   ; o16 D1 /7            [8086]
+\c SAR r/m16,CL                  ; o16 D3 /7            [8086]
+\c SAR r/m16,imm8                ; o16 C1 /7 ib         [286]
+\c SAR r/m32,1                   ; o32 D1 /7            [386]
+\c SAR r/m32,CL                  ; o32 D3 /7            [386]
+\c SAR r/m32,imm8                ; o32 C1 /7 ib         [386]
+
+\c{SAL} and \c{SAR} perform an arithmetic shift operation on the given
+source/destination (first) operand. The vacated bits are filled with
+zero for \c{SAL}, and with copies of the original high bit of the
+source operand for \c{SAR}.
+
+\c{SAL} is a synonym for \c{SHL} (see \k{insSHL}). NASM will
+assemble either one to the same code, but NDISASM will always
+disassemble that code as \c{SHL}.
+
+The number of bits to shift by is given by the second operand. Only
+the bottom five bits of the shift count are considered by processors
+above the 8086, whatever the size of the operand being shifted.
+
+You can force the longer (286 and upwards, beginning with a \c{C1}
+byte) form of \c{SAL foo,1} by using a \c{BYTE} prefix: \c{SAL
+foo,BYTE 1}. Similarly with \c{SAR}.
+
+\H{insSALC} \i\c{SALC}: Set AL from Carry Flag
+
+\c SALC                          ; D6                   [8086,UNDOC]
+
+\c{SALC} is an early undocumented instruction similar in concept to
+\c{SETcc} (\k{insSETcc}). Its function is to set \c{AL} to zero if
+the carry flag is clear, or to \c{0xFF} if it is set.
+
+\H{insSBB} \i\c{SBB}: Subtract with Borrow
+
+\c SBB r/m8,reg8                 ; 18 /r                [8086]
+\c SBB r/m16,reg16               ; o16 19 /r            [8086]
+\c SBB r/m32,reg32               ; o32 19 /r            [386]
+
+\c SBB reg8,r/m8                 ; 1A /r                [8086]
+\c SBB reg16,r/m16               ; o16 1B /r            [8086]
+\c SBB reg32,r/m32               ; o32 1B /r            [386]
+
+\c SBB r/m8,imm8                 ; 80 /3 ib             [8086]
+\c SBB r/m16,imm16               ; o16 81 /3 iw         [8086]
+\c SBB r/m32,imm32               ; o32 81 /3 id         [386]
+
+\c SBB r/m16,imm8                ; o16 83 /3 ib         [8086]
+\c SBB r/m32,imm8                ; o32 83 /3 ib         [386]
+
+\c SBB AL,imm8                   ; 1C ib                [8086]
+\c SBB AX,imm16                  ; o16 1D iw            [8086]
+\c SBB EAX,imm32                 ; o32 1D id            [386]
+
+\c{SBB} performs integer subtraction: it subtracts its second
+operand, plus the value of the carry flag, from its first, and
+leaves the result in its destination (first) operand. The flags are
+set according to the result of the operation: in particular, the
+carry flag is affected and can be used by a subsequent \c{SBB}
+instruction.
+
+In the forms with an 8-bit immediate second operand and a longer
+first operand, the second operand is considered to be signed, and is
+sign-extended to the length of the first operand. In these cases,
+the \c{BYTE} qualifier is necessary to force NASM to generate this
+form of the instruction.
+
+To subtract one number from another without also subtracting the
+contents of the carry flag, use \c{SUB} (\k{insSUB}).
+
+\H{insSCASB} \i\c{SCASB}, \i\c{SCASW}, \i\c{SCASD}: Scan String
+
+\c SCASB                         ; AE                   [8086]
+\c SCASW                         ; o16 AF               [8086]
+\c SCASD                         ; o32 AF               [386]
+
+\c{SCASB} compares the byte in \c{AL} with the byte at \c{[ES:DI]}
+or \c{[ES:EDI]}, and sets the flags accordingly. It then increments
+or decrements (depending on the direction flag: increments if the
+flag is clear, decrements if it is set) \c{DI} (or \c{EDI}).
+
+The register used is \c{DI} if the address size is 16 bits, and
+\c{EDI} if it is 32 bits. If you need to use an address size not
+equal to the current \c{BITS} setting, you can use an explicit
+\i\c{a16} or \i\c{a32} prefix.
+
+Segment override prefixes have no effect for this instruction: the
+use of \c{ES} for the load from \c{[DI]} or \c{[EDI]} cannot be
+overridden.
+
+\c{SCASW} and \c{SCASD} work in the same way, but they compare a
+word to \c{AX} or a doubleword to \c{EAX} instead of a byte to
+\c{AL}, and increment or decrement the addressing registers by 2 or
+4 instead of 1.
+
+The \c{REPE} and \c{REPNE} prefixes (equivalently, \c{REPZ} and
+\c{REPNZ}) may be used to repeat the instruction up to \c{CX} (or
+\c{ECX} - again, the address size chooses which) times until the
+first unequal or equal byte is found.
+
+\H{insSETcc} \i\c{SETcc}: Set Register from Condition
+
+\c SETcc r/m8                    ; 0F 90+cc /2          [386]
+
+\c{SETcc} sets the given 8-bit operand to zero if its condition is
+not satisfied, and to 1 if it is.
+
+\H{insSGDT} \i\c{SGDT}, \i\c{SIDT}, \i\c{SLDT}: Store Descriptor Table Pointers
+
+\c SGDT mem                      ; 0F 01 /0             [286,PRIV]
+\c SIDT mem                      ; 0F 01 /1             [286,PRIV]
+\c SLDT r/m16                    ; 0F 00 /0             [286,PRIV]
+
+\c{SGDT} and \c{SIDT} both take a 6-byte memory area as an operand:
+they store the contents of the GDTR (global descriptor table
+register) or IDTR (interrupt descriptor table register) into that
+area as a 16-bit size limit and a 32-bit linear address (in that
+order). These are the only instructions which directly use
+\e{linear} addresses, rather than segment/offset pairs.
+
+\c{SLDT} stores the segment selector corresponding to the LDT (local
+descriptor table) into the given operand.
+
+See also \c{LGDT}, \c{LIDT} and \c{LLDT} (\k{insLGDT}).
+
+\H{insSHL} \i\c{SHL}, \i\c{SHR}: Bitwise Logical Shifts
+
+\c SHL r/m8,1                    ; D0 /4                [8086]
+\c SHL r/m8,CL                   ; D2 /4                [8086]
+\c SHL r/m8,imm8                 ; C0 /4 ib             [286]
+\c SHL r/m16,1                   ; o16 D1 /4            [8086]
+\c SHL r/m16,CL                  ; o16 D3 /4            [8086]
+\c SHL r/m16,imm8                ; o16 C1 /4 ib         [286]
+\c SHL r/m32,1                   ; o32 D1 /4            [386]
+\c SHL r/m32,CL                  ; o32 D3 /4            [386]
+\c SHL r/m32,imm8                ; o32 C1 /4 ib         [386]
+
+\c SHR r/m8,1                    ; D0 /5                [8086]
+\c SHR r/m8,CL                   ; D2 /5                [8086]
+\c SHR r/m8,imm8                 ; C0 /5 ib             [286]
+\c SHR r/m16,1                   ; o16 D1 /5            [8086]
+\c SHR r/m16,CL                  ; o16 D3 /5            [8086]
+\c SHR r/m16,imm8                ; o16 C1 /5 ib         [286]
+\c SHR r/m32,1                   ; o32 D1 /5            [386]
+\c SHR r/m32,CL                  ; o32 D3 /5            [386]
+\c SHR r/m32,imm8                ; o32 C1 /5 ib         [386]
+
+\c{SHL} and \c{SHR} perform a logical shift operation on the given
+source/destination (first) operand. The vacated bits are filled with
+zero.
+
+A synonym for \c{SHL} is \c{SAL} (see \k{insSAL}). NASM will
+assemble either one to the same code, but NDISASM will always
+disassemble that code as \c{SHL}.
+
+The number of bits to shift by is given by the second operand. Only
+the bottom five bits of the shift count are considered by processors
+above the 8086, whatever the size of the operand being shifted.
+
+You can force the longer (286 and upwards, beginning with a \c{C1}
+byte) form of \c{SHL foo,1} by using a \c{BYTE} prefix: \c{SHL
+foo,BYTE 1}. Similarly with \c{SHR}.
+
+\H{insSHLD} \i\c{SHLD}, \i\c{SHRD}: Bitwise Double-Precision Shifts
+
+\c SHLD r/m16,reg16,imm8         ; o16 0F A4 /r ib      [386]
+\c SHLD r/m32,reg32,imm8         ; o32 0F A4 /r ib      [386]
+\c SHLD r/m16,reg16,CL           ; o16 0F A5 /r         [386]
+\c SHLD r/m32,reg32,CL           ; o32 0F A5 /r         [386]
+
+\c SHRD r/m16,reg16,imm8         ; o16 0F AC /r ib      [386]
+\c SHRD r/m32,reg32,imm8         ; o32 0F AC /r ib      [386]
+\c SHRD r/m16,reg16,CL           ; o16 0F AD /r         [386]
+\c SHRD r/m32,reg32,CL           ; o32 0F AD /r         [386]
+
+\c{SHLD} performs a double-precision left shift. It notionally places
+its second operand to the right of its first, then shifts the entire
+bit string thus generated to the left by a number of bits specified
+in the third operand. It then updates only the \e{first} operand
+according to the result of this. The second operand is not modified.
+
+\c{SHRD} performs the corresponding right shift: it notionally
+places the second operand to the \e{left} of the first, shifts the
+whole bit string right, and updates only the first operand.
+
+For example, if \c{EAX} holds \c{0x01234567} and \c{EBX} holds
+\c{0x89ABCDEF}, then the instruction \c{SHLD EAX,EBX,4} would update
+\c{EAX} to hold \c{0x12345678}. Under the same conditions, \c{SHRD
+EAX,EBX,4} would update \c{EAX} to hold \c{0xF0123456}.
+
+The number of bits to shift by is given by the third operand. Only
+the bottom 5 bits of the shift count are considered.
+
+\H{insSMI} \i\c{SMI}: System Management Interrupt
+
+\c SMI                           ; F1                   [386,UNDOC]
+
+This is an opcode apparently supported by some AMD processors (which
+is why it can generate the same opcode as \c{INT1}), and places the
+machine into system-management mode, a special debugging mode.
+
+\H{insSMSW} \i\c{SMSW}: Store Machine Status Word
+
+\c SMSW r/m16                    ; 0F 01 /4             [286,PRIV]
+
+\c{SMSW} stores the bottom half of the \c{CR0} control register (or
+the Machine Status Word, on 286 processors) into the destination
+operand. See also \c{LMSW} (\k{insLMSW}).
+
+\H{insSTC} \i\c{STC}, \i\c{STD}, \i\c{STI}: Set Flags
+
+\c STC                           ; F9                   [8086]
+\c STD                           ; FD                   [8086]
+\c STI                           ; FB                   [8086]
+
+These instructions set various flags. \c{STC} sets the carry flag;
+\c{STD} sets the direction flag; and \c{STI} sets the interrupt flag
+(thus enabling interrupts).
+
+To clear the carry, direction, or interrupt flags, use the \c{CLC},
+\c{CLD} and \c{CLI} instructions (\k{insCLC}). To invert the carry
+flag, use \c{CMC} (\k{insCMC}).
+
+\H{insSTOSB} \i\c{STOSB}, \i\c{STOSW}, \i\c{STOSD}: Store Byte to String
+
+\c STOSB                         ; AA                   [8086]
+\c STOSW                         ; o16 AB               [8086]
+\c STOSD                         ; o32 AB               [386]
+
+\c{STOSB} stores the byte in \c{AL} at \c{[ES:DI]} or \c{[ES:EDI]}.
+It then increments or decrements (depending on the direction flag:
+increments if the flag is clear, decrements if it is set) \c{DI} (or
+\c{EDI}). The flags are not affected.
+
+The register used is \c{DI} if the address size is 16 bits, and
+\c{EDI} if it is 32 bits. If you need to use an address size not
+equal to the current \c{BITS} setting, you can use an explicit
+\i\c{a16} or \i\c{a32} prefix.
+
+Segment override prefixes have no effect for this instruction: the
+use of \c{ES} for the store to \c{[DI]} or \c{[EDI]} cannot be
+overridden.
+
+\c{STOSW} and \c{STOSD} work in the same way, but they store the
+word in \c{AX} or the doubleword in \c{EAX} instead of the byte in
+\c{AL}, and increment or decrement the addressing registers by 2 or
+4 instead of 1.
+
+The \c{REP} prefix may be used to repeat the instruction \c{CX} (or
+\c{ECX} - again, the address size chooses which) times.
+
+\H{insSTR} \i\c{STR}: Store Task Register
+
+\c STR r/m16                     ; 0F 00 /1             [286,PRIV]
+
+\c{STR} stores the segment selector corresponding to the contents of
+the Task Register into its operand.
+
+\H{insSUB} \i\c{SUB}: Subtract Integers
+
+\c SUB r/m8,reg8                 ; 28 /r                [8086]
+\c SUB r/m16,reg16               ; o16 29 /r            [8086]
+\c SUB r/m32,reg32               ; o32 29 /r            [386]
+
+\c SUB reg8,r/m8                 ; 2A /r                [8086]
+\c SUB reg16,r/m16               ; o16 2B /r            [8086]
+\c SUB reg32,r/m32               ; o32 2B /r            [386]
+
+\c SUB r/m8,imm8                 ; 80 /5 ib             [8086]
+\c SUB r/m16,imm16               ; o16 81 /5 iw         [8086]
+\c SUB r/m32,imm32               ; o32 81 /5 id         [386]
+
+\c SUB r/m16,imm8                ; o16 83 /5 ib         [8086]
+\c SUB r/m32,imm8                ; o32 83 /5 ib         [386]
+
+\c SUB AL,imm8                   ; 2C ib                [8086]
+\c SUB AX,imm16                  ; o16 2D iw            [8086]
+\c SUB EAX,imm32                 ; o32 2D id            [386]
+
+\c{SUB} performs integer subtraction: it subtracts its second
+operand from its first, and leaves the result in its destination
+(first) operand. The flags are set according to the result of the
+operation: in particular, the carry flag is affected and can be used
+by a subsequent \c{SBB} instruction (\k{insSBB}).
+
+In the forms with an 8-bit immediate second operand and a longer
+first operand, the second operand is considered to be signed, and is
+sign-extended to the length of the first operand. In these cases,
+the \c{BYTE} qualifier is necessary to force NASM to generate this
+form of the instruction.
+
+\H{insTEST} \i\c{TEST}: Test Bits (notional bitwise AND)
+
+\c TEST r/m8,reg8                ; 84 /r                [8086]
+\c TEST r/m16,reg16              ; o16 85 /r            [8086]
+\c TEST r/m32,reg32              ; o32 85 /r            [386]
+
+\c TEST r/m8,imm8                ; F6 /0 ib             [8086]
+\c TEST r/m16,imm16              ; o16 F7 /0 iw         [8086]
+\c TEST r/m32,imm32              ; o32 F7 /0 id         [386]
+
+\c TEST AL,imm8                  ; A8 ib                [8086]
+\c TEST AX,imm16                 ; o16 A9 iw            [8086]
+\c TEST EAX,imm32                ; o32 A9 id            [386]
+
+\c{TEST} performs a `mental' bitwise AND of its two operands, and
+affects the flags as if the operation had taken place, but does not
+store the result of the operation anywhere.
+
+\H{insUMOV} \i\c{UMOV}: User Move Data
+
+\c UMOV r/m8,reg8                ; 0F 10 /r             [386,UNDOC]
+\c UMOV r/m16,reg16              ; o16 0F 11 /r         [386,UNDOC]
+\c UMOV r/m32,reg32              ; o32 0F 11 /r         [386,UNDOC]
+
+\c UMOV reg8,r/m8                ; 0F 12 /r             [386,UNDOC]
+\c UMOV reg16,r/m16              ; o16 0F 13 /r         [386,UNDOC]
+\c UMOV reg32,r/m32              ; o32 0F 13 /r         [386,UNDOC]
+
+This undocumented instruction is used by in-circuit emulators to
+access user memory (as opposed to host memory). It is used just like
+an ordinary memory/register or register/register \c{MOV}
+instruction, but accesses user space.
+
+\H{insVERR} \i\c{VERR}, \i\c{VERW}: Verify Segment Readability/Writability
+
+\c VERR r/m16                    ; 0F 00 /4             [286,PRIV]
+
+\c VERW r/m16                    ; 0F 00 /5             [286,PRIV]
+
+\c{VERR} sets the zero flag if the segment specified by the selector
+in its operand can be read from at the current privilege level.
+\c{VERW} sets the zero flag if the segment can be written.
+
+\H{insWAIT} \i\c{WAIT}: Wait for Floating-Point Processor
+
+\c WAIT                          ; 9B                   [8086]
+
+\c{WAIT}, on 8086 systems with a separate 8087 FPU, waits for the
+FPU to have finished any operation it is engaged in before
+continuing main processor operations, so that (for example) an FPU
+store to main memory can be guaranteed to have completed before the
+CPU tries to read the result back out.
+
+On higher processors, \c{WAIT} is unnecessary for this purpose, and
+it has the alternative purpose of ensuring that any pending unmasked
+FPU exceptions have happened before execution continues.
+
+\H{insWBINVD} \i\c{WBINVD}: Write Back and Invalidate Cache
+
+\c WBINVD                        ; 0F 09                [486]
+
+\c{WBINVD} invalidates and empties the processor's internal caches,
+and causes the processor to instruct external caches to do the same.
+It writes the contents of the caches back to memory first, so no
+data is lost. To flush the caches quickly without bothering to write
+the data back first, use \c{INVD} (\k{insINVD}).
+
+\H{insWRMSR} \i\c{WRMSR}: Write Model-Specific Registers
+
+\c WRMSR                         ; 0F 30                [PENT]
+
+\c{WRMSR} writes the value in \c{EDX:EAX} to the processor
+Model-Specific Register (MSR) whose index is stored in \c{ECX}. See
+also \c{RDMSR} (\k{insRDMSR}).
+
+\H{insXADD} \i\c{XADD}: Exchange and Add
+
+\c XADD r/m8,reg8                ; 0F C0 /r             [486]
+\c XADD r/m16,reg16              ; o16 0F C1 /r         [486]
+\c XADD r/m32,reg32              ; o32 0F C1 /r         [486]
+
+\c{XADD} exchanges the values in its two operands, and then adds
+them together and writes the result into the destination (first)
+operand. This instruction can be used with a \c{LOCK} prefix for
+multi-processor synchronisation purposes.
+
+\H{insXBTS} \i\c{XBTS}: Extract Bit String
+
+\c XBTS reg16,r/m16              ; o16 0F A6 /r         [386,UNDOC]
+\c XBTS reg32,r/m32              ; o32 0F A6 /r         [386,UNDOC]
+
+No clear documentation seems to be available for this instruction:
+the best I've been able to find reads `Takes a string of bits from
+the first operand and puts them in the second operand'. It is
+present only in early 386 processors, and conflicts with the opcodes
+for \c{CMPXCHG486}. NASM supports it only for completeness. Its
+counterpart is \c{IBTS} (see \k{insIBTS}).
+
+\H{insXCHG} \i\c{XCHG}: Exchange
+
+\c XCHG reg8,r/m8                ; 86 /r                [8086]
+\c XCHG reg16,r/m16              ; o16 87 /r            [8086]
+\c XCHG reg32,r/m32              ; o32 87 /r            [386]
+
+\c XCHG r/m8,reg8                ; 86 /r                [8086]
+\c XCHG r/m16,reg16              ; o16 87 /r            [8086]
+\c XCHG r/m32,reg32              ; o32 87 /r            [386]
+
+\c XCHG AX,reg16                 ; o16 90+r             [8086]
+\c XCHG EAX,reg32                ; o32 90+r             [386]
+\c XCHG reg16,AX                 ; o16 90+r             [8086]
+\c XCHG reg32,EAX                ; o32 90+r             [386]
+
+\c{XCHG} exchanges the values in its two operands. It can be used
+with a \c{LOCK} prefix for purposes of multi-processor
+synchronisation.
+
+\c{XCHG AX,AX} or \c{XCHG EAX,EAX} (depending on the \c{BITS}
+setting) generates the opcode \c{90h}, and so is a synonym for
+\c{NOP} (\k{insNOP}).
+
+\H{insXLATB} \i\c{XLATB}: Translate Byte in Lookup Table
+
+\c XLATB                         ; D7                   [8086]
+
+\c{XLATB} adds the value in \c{AL}, treated as an unsigned byte, to
+\c{BX} or \c{EBX}, and loads the byte from the resulting address (in
+the segment specified by \c{DS}) back into \c{AL}.
+
+The base register used is \c{BX} if the address size is 16 bits, and
+\c{EBX} if it is 32 bits. If you need to use an address size not
+equal to the current \c{BITS} setting, you can use an explicit
+\i\c{a16} or \i\c{a32} prefix.
+
+The segment register used to load from \c{[BX+AL]} or \c{[EBX+AL]}
+can be overridden by using a segment register name as a prefix (for
+example, \c{es xlatb}).
+
+\H{insXOR} \i\c{XOR}: Bitwise Exclusive OR
+
+\c XOR r/m8,reg8                 ; 30 /r                [8086]
+\c XOR r/m16,reg16               ; o16 31 /r            [8086]
+\c XOR r/m32,reg32               ; o32 31 /r            [386]
+
+\c XOR reg8,r/m8                 ; 32 /r                [8086]
+\c XOR reg16,r/m16               ; o16 33 /r            [8086]
+\c XOR reg32,r/m32               ; o32 33 /r            [386]
+
+\c XOR r/m8,imm8                 ; 80 /6 ib             [8086]
+\c XOR r/m16,imm16               ; o16 81 /6 iw         [8086]
+\c XOR r/m32,imm32               ; o32 81 /6 id         [386]
+
+\c XOR r/m16,imm8                ; o16 83 /6 ib         [8086]
+\c XOR r/m32,imm8                ; o32 83 /6 ib         [386]
+
+\c XOR AL,imm8                   ; 34 ib                [8086]
+\c XOR AX,imm16                  ; o16 35 iw            [8086]
+\c XOR EAX,imm32                 ; o32 35 id            [386]
+
+\c{XOR} performs a bitwise XOR operation between its two operands
+(i.e. each bit of the result is 1 if and only if exactly one of the
+corresponding bits of the two inputs was 1), and stores the result
+in the destination (first) operand.
+
+In the forms with an 8-bit immediate second operand and a longer
+first operand, the second operand is considered to be signed, and is
+sign-extended to the length of the first operand. In these cases,
+the \c{BYTE} qualifier is necessary to force NASM to generate this
+form of the instruction.
+
+The MMX instruction \c{PXOR} (see \k{insPXOR}) performs the same
+operation on the 64-bit MMX registers.
diff --git a/doc/rdsrc.pl b/doc/rdsrc.pl
new file mode 100644
index 00000000..1a981287
--- /dev/null
+++ b/doc/rdsrc.pl
@@ -0,0 +1,2134 @@
+#!/usr/bin/perl
+
+# Read the source-form of the NASM manual and generate the various
+# output forms.
+
+# TODO:
+#
+# PS output:
+# - show page numbers in printed output
+# - think about double-sided support (start all chapters on RHS,
+#   ie odd-numbered, pages).
+#
+# Ellipsis support would be nice.
+
+# Source-form features:
+# ---------------------
+# 
+# Bullet \b
+#   Bullets the paragraph. Rest of paragraph is indented to cope. In
+#   HTML, consecutive groups of bulleted paragraphs become unordered
+#   lists.
+# 
+# Emphasis \e{foobar}
+#   produces `_foobar_' in text and italics in HTML, PS, RTF
+# 
+# Inline code \c{foobar}
+#   produces ``foobar'' in text, and fixed-pitch font in HTML, PS, RTF
+# 
+# Display code
+# \c  line one
+# \c   line two
+#   produces fixed-pitch font where appropriate, and doesn't break
+#   pages except sufficiently far into the middle of a display.
+# 
+# Chapter, header and subheader
+# \C{intro} Introduction
+# \H{whatsnasm} What is NASM?
+# \S{free} NASM Is Free
+#   dealt with as appropriate. Chapters begin on new sides, possibly
+#   even new _pages_. (Sub)?headers are good places to begin new
+#   pages. Just _after_ a (sub)?header isn't.
+#   The keywords can be substituted with \K and \k.
+#
+# Keyword \K{cintro} \k{cintro}
+#   Expands to `Chapter 1', `Section 1.1', `Section 1.1.1'. \K has an
+#   initial capital whereas \k doesn't. In HTML, will produce
+#   hyperlinks.
+# 
+# Web link \W{http://foobar/}{text} or \W{mailto:me@here}\c{me@here}
+#   the \W prefix is ignored except in HTML; in HTML the last part
+#   becomes a hyperlink to the first part.
+# 
+# Literals \{ \} \\
+#   In case it's necessary, they expand to the real versions.
+# 
+# Nonbreaking hyphen \-
+#   Need more be said?
+# 
+# Source comment \#
+#   Causes everything after it on the line to be ignored by the
+#   source-form processor.
+#
+# Indexable word \i{foobar} (or \i\e{foobar} or \i\c{foobar}, equally)
+#   makes word appear in index, referenced to that point
+#   \i\c comes up in code style even in the index; \i\e doesn't come
+#   up in emphasised style.
+#
+# Indexable non-displayed word \I{foobar} or \I\c{foobar}
+#   just as \i{foobar} except that nothing is displayed for it
+#
+# Index rewrite
+# \IR{foobar} \c{foobar} operator, uses of
+#   tidies up the appearance in the index of something the \i or \I
+#   operator was applied to
+#
+# Index alias
+# \IA{foobar}{bazquux}
+#   aliases one index tag (as might be supplied to \i or \I) to
+#   another, so that \I{foobar} has the effect of \I{bazquux}, and
+#   \i{foobar} has the effect of \I{bazquux}foobar
+
+$diag = 1, shift @ARGV if $ARGV[0] eq "-d";
+# (a leading -d sets $diag, which enables the index-diagnostic dump below)
+$| = 1;
+# The document structure starts at a pseudo-node called "Top", at level 0.
+$tstruct_previtem = $node = "Top";
+$nodes = ($node);  # NOTE(review): sets scalar $nodes, but the index code iterates @nodes - confirm intent
+$tstruct_level{$tstruct_previtem} = 0;
+$tstruct_last[$tstruct_level{$tstruct_previtem}] = $tstruct_previtem;
+$MAXLEVEL = 10;  # really 3, but play safe ;-)
+
+# Read the file; pass a paragraph at a time to the paragraph processor.
+print "Reading input...";
+$pname = "para000000";  # name of the array that receives the next paragraph's words
+@pnames = @pflags = ();
+$para = undef;
+while (<>) {
+  chomp;
+  if (!/\S/ || /^\\I[AR]/) { # special case: \I[AR] implies new-paragraph
+    &got_para($para);
+    $para = undef;
+  }
+  if (/\S/) {
+    s/\\#.*$//; # strip comments
+    $para .= " " . $_;  # lines of one paragraph are joined with single spaces
+  }
+}
+&got_para($para);  # flush the final paragraph, which has no blank line after it
+print "done.\n";
+
+# Now we've read in the entire document and we know what all the
+# heading keywords refer to. Go through and fix up the \k references.
+print "Fixing up cross-references...";
+&fixup_xrefs;
+print "done.\n";
+
+# Sort the index tags, according to the slightly odd order I've decided on.
+print "Sorting index tags...";
+&indexsort;
+print "done.\n";
+
+if ($diag) {
+  print "Writing index-diagnostic file...";
+  &indexdiag;
+  print "done.\n";
+}
+
+# OK. Write out the various output files.
+print "Producing text output: ";
+&write_txt;
+print "done.\n";
+print "Producing HTML output: ";
+&write_html;
+print "done.\n";
+print "Producing PostScript output: ";
+&write_ps;
+print "done.\n";
+print "Producing Texinfo output: ";
+&write_texi;
+print "done.\n";
+print "Producing WinHelp output: ";
+&write_hlp;
+print "done.\n";
+
+sub got_para {
+  # Parse one source paragraph (a single string) into typed words in the
+  # array named by $pname, then file it in @pnames/@pflags or the index.
+  local ($_) = @_;
+  my ($pflags, $i, $w, $l, $t) = ("");  # `my $a, $b' would declare only $a
+  return if !/\S/;
+  @$pname = ();
+  # Strip off _leading_ spaces, then determine type of paragraph.
+  s/^\s*//;
+  $irewrite = undef;
+  if (/^\\c[^{]/) {
+    # A code paragraph. The paragraph-array will contain the simple
+    # strings which form each line of the paragraph.
+    $pflags = "code";
+    while (/^\\c (([^\\]|\\[^c])*)(.*)$/) {
+      $l = $1;
+      $_ = $3;
+      $l =~ s/\\\{/{/g;
+      $l =~ s/\\}/}/g;
+      $l =~ s/\\\\/\\/g;
+      push @$pname, $l;
+    }
+    $_ = ''; # suppress word-by-word code
+  } elsif (/^\\C/) {
+    # A chapter heading. Define the keyword and allocate a chapter
+    # number.
+    $cnum++;
+    $hnum = 0;
+    $snum = 0;
+    $xref = "chapter-$cnum";
+    $pflags = "chap $cnum :$xref";
+    die "badly formatted chapter heading: $_\n" if !/^\\C\{([^}]*)\}\s*(.*)$/;
+    $refs{$1} = "chapter $cnum";
+    $node = "Chapter $cnum";
+    &add_item($node, 1);
+    $xrefnodes{$node} = $xref; $nodexrefs{$xref} = $node;
+    $xrefs{$1} = $xref;
+    $_ = $2;
+    # the standard word-by-word code will happen next
+  } elsif (/^\\A/) {
+    # An appendix heading. Define the keyword and allocate an appendix
+    # letter.
+    $cnum++;
+    $cnum = 'A' if $cnum =~ /[0-9]+/;  # first appendix: switch from numbers to letters
+    $hnum = 0;
+    $snum = 0;
+    $xref = "appendix-$cnum";
+    $pflags = "appn $cnum :$xref";
+    die "badly formatted appendix heading: $_\n" if !/^\\A\{([^}]*)\}\s*(.*)$/;
+    $refs{$1} = "appendix $cnum";
+    $node = "Appendix $cnum";
+    &add_item($node, 1);
+    $xrefnodes{$node} = $xref; $nodexrefs{$xref} = $node;
+    $xrefs{$1} = $xref;
+    $_ = $2;
+    # the standard word-by-word code will happen next
+  } elsif (/^\\H/) {
+    # A major heading. Define the keyword and allocate a section number.
+    $hnum++;
+    $snum = 0;
+    $xref = "section-$cnum.$hnum";
+    $pflags = "head $cnum.$hnum :$xref";
+    die "badly formatted heading: $_\n" if !/^\\[HP]\{([^}]*)\}\s*(.*)$/;
+    $refs{$1} = "section $cnum.$hnum";
+    $node = "Section $cnum.$hnum";
+    &add_item($node, 2);
+    $xrefnodes{$node} = $xref; $nodexrefs{$xref} = $node;
+    $xrefs{$1} = $xref;
+    $_ = $2;
+    # the standard word-by-word code will happen next
+  } elsif (/^\\S/) {
+    # A sub-heading. Define the keyword and allocate a section number.
+    $snum++;
+    $xref = "section-$cnum.$hnum.$snum";
+    $pflags = "subh $cnum.$hnum.$snum :$xref";
+    die "badly formatted subheading: $_\n" if !/^\\S\{([^}]*)\}\s*(.*)$/;
+    $refs{$1} = "section $cnum.$hnum.$snum";
+    $node = "Section $cnum.$hnum.$snum";
+    &add_item($node, 3);
+    $xrefnodes{$node} = $xref; $nodexrefs{$xref} = $node;
+    $xrefs{$1} = $xref;
+    $_ = $2;
+    # the standard word-by-word code will happen next
+  } elsif (/^\\IR/) {
+    # An index-rewrite.
+    die "badly formatted index rewrite: $_\n" if !/^\\IR\{([^}]*)\}\s*(.*)$/;
+    $irewrite = $1;
+    $_ = $2;
+    # the standard word-by-word code will happen next
+  } elsif (/^\\IA/) {
+    # An index-alias.
+    die "badly formatted index alias: $_\n" if !/^\\IA\{([^}]*)\}\{([^}]*)\}\s*$/;
+    $idxalias{$1} = $2;
+    return; # avoid word-by-word code
+  } elsif (/^\\b/) {
+    # A bulleted paragraph. Strip off the initial \b and let the
+    # word-by-word code take care of the rest.
+    $pflags = "bull";
+    s/^\\b\s*//;
+  } else {
+    # A normal paragraph. Just set $pflags: the word-by-word code does
+    # the rest.
+    $pflags = "norm";
+  }
+
+  # The word-by-word code: unless @$pname is already defined (which it
+  # will be in the case of a code paragraph), split the paragraph up
+  # into words and push each on @$pname.
+  #
+  # Each thing pushed on @$pname should have a two-character type
+  # code followed by the text.
+  #
+  # Type codes are:
+  # "n " for normal
+  # "da" for a dash
+  # "es" for first emphasised word in emphasised bit
+  # "e " for emphasised in mid-emphasised-bit
+  # "ee" for last emphasised word in emphasised bit
+  # "eo" for single (only) emphasised word
+  # "c " for code
+  # "k " for cross-ref
+  # "kK" for capitalised cross-ref
+  # "w " for Web link
+  # "wc" for code-type Web link
+  # "x " for beginning of resolved cross-ref; generates no visible output,
+  #      and the text is the cross-reference code
+  # "xe" for end of resolved cross-ref; text is same as for "x ".
+  # "i " for point to be indexed: the text is the internal index into the
+  #      index-items arrays
+  # "sp" for space
+  while (/\S/) {
+    s/^\s*//, push @$pname, "sp" if /^\s/;
+    $indexing = $qindex = 0;
+    if (/^(\\[iI])?\\c/) {
+      $qindex = 1 if $1 eq "\\I";
+      $indexing = 1, s/^\\[iI]// if $1;
+      s/^\\c//;
+      die "badly formatted \\c: \\c$_\n" if !/\{(([^\\}]|\\.)*)\}(.*)$/;
+      $w = $1;
+      $_ = $3;
+      $w =~ s/\\\{/{/g;
+      $w =~ s/\\}/}/g;
+      $w =~ s/\\-/-/g;
+      $w =~ s/\\\\/\\/g;
+      (push @$pname,"i"),$lastp = $#$pname if $indexing;  # placeholder, filled in below
+      push @$pname,"c $w" if !$qindex;
+      $$pname[$lastp] = &addidx($node, $w, "c $w") if $indexing;
+    } elsif (/^\\[iIe]/) {
+      /^(\\[iI])?(\\e)?/;
+      $emph = 0;
+      $qindex = 1 if $1 eq "\\I";
+      $indexing = 1, $type = "\\i" if $1;
+      $emph = 1, $type = "\\e" if $2;
+      s/^(\\[iI])?(\\e)?//;  # strip exactly the prefixes recognised above
+      die "badly formatted $type: $type$_\n" if !/\{(([^\\}]|\\.)*)\}(.*)$/;
+      $w = $1;
+      $_ = $3;
+      $w =~ s/\\\{/{/g;
+      $w =~ s/\\}/}/g;
+      $w =~ s/\\-/-/g;
+      $w =~ s/\\\\/\\/g;
+      $t = $emph ? "es" : "n ";
+      @ientry = ();
+      (push @$pname,"i"),$lastp = $#$pname if $indexing;
+      foreach $i (split /\s+/,$w) {  # \e and \i can be multiple words
+        push @$pname,"$t$i","sp" if !$qindex;
+        ($ii=$i) =~ tr/A-Z/a-z/, push @ientry,"n $ii","sp" if $indexing;
+        $t = $emph ? "e " : "n ";
+      }
+      $w =~ tr/A-Z/a-z/, pop @ientry if $indexing;
+      $$pname[$lastp] = &addidx($node, $w, @ientry) if $indexing;
+      pop @$pname if !$qindex; # remove final space
+      if (substr($$pname[$#$pname],0,2) eq "es" && !$qindex) {
+        substr($$pname[$#$pname],0,2) = "eo";
+      } elsif ($emph && !$qindex) {
+        substr($$pname[$#$pname],0,2) = "ee";
+      }
+    } elsif (/^\\[kK]/) {
+      $t = "k ";
+      $t = "kK" if /^\\K/;
+      s/^\\[kK]//;
+      die "badly formatted \\k: \\k$_\n" if !/\{([^}]*)\}(.*)$/;
+      $_ = $2;
+      push @$pname,"$t$1";
+    } elsif (/^\\W/) {
+      s/^\\W//;
+      die "badly formatted \\W: \\W$_\n"
+          if !/\{([^}]*)\}(\\i)?(\\c)?\{(([^\\}]|\\.)*)\}(.*)$/;
+      $l = $1;
+      $w = $4;
+      $_ = $6;
+      $t = "w ";
+      $t = "wc" if $3 eq "\\c";
+      $indexing = 1 if $2;
+      $w =~ s/\\\{/{/g;
+      $w =~ s/\\}/}/g;
+      $w =~ s/\\-/-/g;
+      $w =~ s/\\\\/\\/g;
+      (push @$pname,"i"),$lastp = $#$pname if $indexing;
+      push @$pname,"$t<$l>$w";
+      $$pname[$lastp] = &addidx($node, $w, "c $w") if $indexing;
+    } else {
+      die "what the hell? $_\n" if !/^(([^\s\\\-]|\\[\\{}\-])*-?)(.*)$/;
+      die "painful death! $_\n" if !length $1;
+      $w = $1;
+      $_ = $3;
+      $w =~ s/\\\{/{/g;
+      $w =~ s/\\}/}/g;
+      $w =~ s/\\-/-/g;
+      $w =~ s/\\\\/\\/g;
+      if ($w eq "-") {
+        push @$pname,"da";
+      } else {
+        push @$pname,"n $w";
+      }
+    }
+  }
+  if (defined $irewrite && $irewrite ne '') {
+    &addidx(undef, $irewrite, @$pname);
+    @$pname = ();
+  } else {
+    push @pnames, $pname;
+    push @pflags, $pflags;
+    $pname++;  # string increment: "para000000" becomes "para000001"
+  }
+}
+
+sub addidx {
+  # Register index tag $text (after alias resolution) with display words
+  # @ientry; when $node is given, also note that the tag occurs there.
+  my ($node, $text, @ientry) = @_;
+  $text = $idxalias{$text} || $text;
+  # A rewrite (no node) always replaces the entry; a use only creates it.
+  if (!defined $node || $node eq '' || !$idxmap{$text}) {
+    $idxmap{$text} = [ @ientry ];  # own array per tag, not a symbolic ref
+  }
+  return unless $node;
+  $idxnodes{$node,$text} = 1;
+  return "i $text";
+}
+
+sub indexsort {
+  # Build @itags (the sorted index tags) and the comma bookkeeping hashes.
+  my ($iitem, $ientry, $i, $piitem, $pcval, $cval, $clrcval);
+  @itags = map { # get back the original data as the 1st elt of each list
+             $_->[0]
+           } sort { # compare auxiliary (non-first) elements of lists
+             $a->[1] cmp $b->[1] ||
+             $a->[2] cmp $b->[2] ||
+             $a->[0] cmp $b->[0]
+           } map { # transform array into list of 3-element lists
+             my $words = $idxmap{$_};
+             my $letters = substr($$words[0],2);
+             $letters =~ tr/A-Za-z//cd;  # primary key: letters of the first word only
+             [$_, uc($letters), substr($$words[0],0,2)]  # then its type code, then the tag
+           } keys %idxmap;
+
+  # Having done that, check for comma-hood.
+  $cval = 0;
+  foreach $iitem (@itags) {
+    $ientry = $idxmap{$iitem};
+    $clrcval = 1;
+    $pcval = $cval;
+    FL:for ($i=0; $i <= $#$ientry; $i++) {
+      if ($$ientry[$i] =~ /^(n .*,)(.*)/) {
+        $$ientry[$i] = $1;
+        splice @$ientry,$i+1,0,"n $2" if length $2;  # text after the comma becomes its own word
+        $commapos{$iitem} = $i+1;
+        $cval = join("\002", @$ientry[0..$i]);  # everything up to and including the comma
+        $clrcval = 0;
+        last FL;
+      }
+    }
+    $cval = undef if $clrcval;
+    $commanext{$iitem} = $commaafter{$piitem} = 1
+      if $cval and ($cval eq $pcval);  # same pre-comma text as the previous entry
+    $piitem = $iitem;
+  }
+}
+
+sub indexdiag {  # write each index tag, its text and its nodes to index.diag
+  my ($iitem, $ientry, $w, $ww, $foo, $node);
+  open INDEXDIAG,">index.diag" or die "unable to open index.diag: $!\n";
+  foreach $iitem (@itags) {
+    $ientry = $idxmap{$iitem};
+    print INDEXDIAG "<$iitem> ";
+    foreach $w (@$ientry) {
+      $ww = &word_txt($w);
+      print INDEXDIAG $ww unless $ww eq "\001";  # "\001" marks a word with no text output
+    }
+    print INDEXDIAG ":";
+    $foo = " ";  # separator: a space before the first node, ", " afterwards
+    foreach $node (@nodes) {
+      (print INDEXDIAG $foo,$node), $foo = ", " if $idxnodes{$node,$iitem};
+    }
+    print INDEXDIAG "\n";
+  }
+  close INDEXDIAG;
+}
+
+sub fixup_xrefs {
+  # Replace every \k or \K word with the words of the heading it refers to.
+  my ($pname, $p, $i, $j, $k, $caps, $repl, @repl);
+  for ($p=0; $p<=$#pnames; $p++) {
+    next if $pflags[$p] eq "code";
+    $pname = $pnames[$p];
+    for ($i=$#$pname; $i >= 0; $i--) {  # backwards, so splicing keeps $i valid
+      if ($$pname[$i] =~ /^k/) {
+        $k = $$pname[$i];
+        $caps = ($k =~ /^kK/);
+        $k = substr($k,2);
+        $repl = $refs{$k};
+        die "undefined keyword `$k'\n" unless $repl;
+        substr($repl,0,1) =~ tr/a-z/A-Z/ if $caps;
+        @repl = ();
+        push @repl,"x $xrefs{$k}";
+        foreach $j (split /\s+/,$repl) {
+          push @repl,"n $j";
+          push @repl,"sp";
+        }
+        pop @repl; # remove final space
+        push @repl,"xe$xrefs{$k}";
+        splice @$pname,$i,1,@repl;
+      }
+    }
+  }
+}
+
+sub write_txt {
+  # Write the plain-text manual, nasmdoc.txt, from @pnames and @pflags.
+  # This is called from the top level, so I won't bother using
+  # my or local.
+  # Open file.
+  print "writing file...";
+  open TEXT,">nasmdoc.txt" or die "unable to open nasmdoc.txt: $!\n";
+  select TEXT;
+
+  # Preamble.
+  $title = "The Netwide Assembler: NASM";
+  $spaces = ' ' x ((75-(length $title))/2);
+  ($underscore = $title) =~ s/./=/g;
+  print "$spaces$title\n$spaces$underscore\n";
+
+  for ($para = 0; $para <= $#pnames; $para++) {
+    $pname = $pnames[$para];
+    $pflags = $pflags[$para];
+    $ptype = substr($pflags,0,4);
+
+    print "\n"; # always one of these before a new paragraph
+
+    if ($ptype eq "chap") {
+      # Chapter heading. "Chapter N: Title" followed by a line of
+      # minus signs.
+      $pflags =~ /chap (.*) :(.*)/;
+      $title = "Chapter $1: ";
+      foreach $i (@$pname) {
+        $ww = &word_txt($i);
+        $title .= $ww unless $ww eq "\001";
+      }
+      print "$title\n";
+      $title =~ s/./-/g;
+      print "$title\n";
+    } elsif ($ptype eq "appn") {
+      # Appendix heading. "Appendix N: Title" followed by a line of
+      # minus signs.
+      $pflags =~ /appn (.*) :(.*)/;
+      $title = "Appendix $1: ";
+      foreach $i (@$pname) {
+        $ww = &word_txt($i);
+        $title .= $ww unless $ww eq "\001";
+      }
+      print "$title\n";
+      $title =~ s/./-/g;
+      print "$title\n";
+    } elsif ($ptype eq "head" || $ptype eq "subh") {
+      # Heading or subheading. Just a number and some text.
+      $pflags =~ /.... (.*) :(.*)/;
+      $title = sprintf "%6s ", $1;
+      foreach $i (@$pname) {
+        $ww = &word_txt($i);
+        $title .= $ww unless $ww eq "\001";
+      }
+      print "$title\n";
+    } elsif ($ptype eq "code") {
+      # Code paragraph. Emit each line with a seven character indent.
+      foreach $i (@$pname) {
+        warn "code line longer than 68 chars: $i\n" if length $i > 68;
+        print ' 'x7, $i, "\n";
+      }
+    } elsif ($ptype eq "bull" || $ptype eq "norm") {
+      # Ordinary paragraph, optionally bulleted. We wrap, with ragged
+      # 75-char right margin and either 7 or 11 char left margin
+      # depending on bullets.
+      if ($ptype eq "bull") {
+        $line = ' 'x7 . '(*) ';
+        $next = ' 'x11;
+      } else {
+        $line = $next = ' 'x7;
+      }
+      @a = @$pname;
+      $wd = $wprev = '';  # $wd collects words until the next place a line may break
+      do {
+        do { $w = &word_txt(shift @a) } while $w eq "\001"; # nasty hack
+        $wd .= $wprev;
+        if ($wprev =~ /-$/ || $w eq ' ' || $w eq '' || $w eq undef) {
+          if (length ($line . $wd) > 75) {
+            $line =~ s/\s*$//; # trim trailing spaces
+            print "$line\n";
+            $line = $next;
+            $wd =~ s/^\s*//; # trim leading spaces
+          }
+          $line .= $wd;
+          $wd = '';
+        }
+        $wprev = $w;
+      } while ($w ne '' && $w ne undef);  # word_txt gives undef once @a is exhausted
+      if ($line =~ /\S/) {
+        $line =~ s/\s*$//; # trim trailing spaces
+        print "$line\n";
+      }
+    }
+  }
+
+  # Close file.
+  select STDOUT;
+  close TEXT;
+}
+
+sub word_txt {
+  # Render one typed word (two-character type code + text) as plain text.
+  # Returns undef for an empty word and "\001" for words with no output.
+  my ($w) = @_;
+  return undef if !defined $w || $w eq '';
+  my $wtype = substr($w,0,2);
+  my $wmajt = substr($wtype,0,1);
+  $w = substr($w,2);
+  $w =~ s/^<[^>]*>// if $wmajt eq "w"; # remove web links
+  if ($wmajt eq "n" || $wtype eq "e " || $wtype eq "w ") {
+    return $w;
+  } elsif ($wtype eq "sp") {
+    return ' ';
+  } elsif ($wtype eq "da") {
+    return '-';
+  } elsif ($wmajt eq "c" || $wtype eq "wc") {
+    return "`${w}'";
+  } elsif ($wtype eq "es") {
+    return "_${w}";
+  } elsif ($wtype eq "ee") {
+    return "${w}_";
+  } elsif ($wtype eq "eo") {
+    return "_${w}_";
+  } elsif ($wmajt eq "x" || $wmajt eq "i") {
+    return "\001";
+  } else {
+    die "panic in word_txt: $wtype$w\n";
+  }
+}
+
+sub write_html {
+  # Write the HTML manual: a contents page (nasmdoc0.html), one file per
+  # chapter or appendix, and an index page (nasmdoci.html). Called from the
+  # top level, so no attempt is made to use my or local.
+  # Write contents file. Just the preamble, then a menu of links to the
+  # separate chapter files and the nodes therein.
+  print "writing contents file...";
+  open TEXT,">nasmdoc0.html";
+  select TEXT;
+  &html_preamble(0);
+  print "<p>This manual documents NASM, the Netwide Assembler: an assembler\n";
+  print "targetting the Intel x86 series of processors, with portable source.\n";
+  print "<p>";
+  for ($node = $tstruct_next{'Top'}; $node; $node = $tstruct_next{$node}) {
+    if ($tstruct_level{$node} == 1) {
+      # Invent a file name.
+      ($number = lc($xrefnodes{$node})) =~ s/.*-//;
+      $fname="nasmdocx.html";
+      substr($fname,8 - length $number, length $number) = $number;  # e.g. "1" gives nasmdoc1.html, "10" gives nasmdo10.html
+      $html_fnames{$node} = $fname;
+      $link = $fname;
+      print "<p>";
+    } else {
+      # Use the preceding filename plus a marker point.
+      $link = $fname . "#$xrefnodes{$node}";
+    }
+    $title = "$node: ";
+    $pname = $tstruct_pname{$node};
+    foreach $i (@$pname) {
+      $ww = &word_html($i);
+      $title .= $ww unless $ww eq "\001";
+    }
+    print "<a href=\"$link\">$title</a><br>\n";
+  }
+  print "<p><a href=\"nasmdoci.html\">Index</a>\n";
+  print "</body></html>\n";
+  select STDOUT;
+  close TEXT;
+
+  # Open a null file, to ensure output (eg random &html_jumppoints calls)
+  # goes _somewhere_.
+  print "writing chapter files...";
+  open TEXT,">/dev/null";  # NOTE(review): Unix-style path; presumably this open just fails elsewhere - confirm
+  select TEXT;
+  $html_lastf = '';
+
+  $in_list = 0;
+
+  for ($para = 0; $para <= $#pnames; $para++) {
+    $pname = $pnames[$para];
+    $pflags = $pflags[$para];
+    $ptype = substr($pflags,0,4);
+
+    $in_list = 0, print "</ul>\n" if $in_list && $ptype ne "bull";
+    if ($ptype eq "chap") {
+      # Chapter heading. Begin a new file.
+      $pflags =~ /chap (.*) :(.*)/;
+      $title = "Chapter $1: ";
+      $xref = $2;
+      &html_jumppoints; print "</body></html>\n"; select STDOUT; close TEXT;
+      $html_lastf = $html_fnames{$chapternode};  # the file just closed becomes "previous"
+      $chapternode = $nodexrefs{$xref};
+      $html_nextf = $html_fnames{$tstruct_mnext{$chapternode}};
+      open TEXT,">$html_fnames{$chapternode}"; select TEXT; &html_preamble(1);
+      foreach $i (@$pname) {
+        $ww = &word_html($i);
+        $title .= $ww unless $ww eq "\001";
+      }
+      $h = "<h2><a name=\"$xref\">$title</a></h2>\n";
+      print $h; print FULL $h;  # NOTE(review): FULL is not opened in this sub, and the appendix branch has no such print - confirm
+    } elsif ($ptype eq "appn") {
+      # Appendix heading. Begin a new file.
+      $pflags =~ /appn (.*) :(.*)/;
+      $title = "Appendix $1: ";
+      $xref = $2;
+      &html_jumppoints; print "</body></html>\n"; select STDOUT; close TEXT;
+      $html_lastf = $html_fnames{$chapternode};
+      $chapternode = $nodexrefs{$xref};
+      $html_nextf = $html_fnames{$tstruct_mnext{$chapternode}};
+      open TEXT,">$html_fnames{$chapternode}"; select TEXT; &html_preamble(1);
+      foreach $i (@$pname) {
+        $ww = &word_html($i);
+        $title .= $ww unless $ww eq "\001";
+      }
+      print "<h2><a name=\"$xref\">$title</a></h2>\n";
+    } elsif ($ptype eq "head" || $ptype eq "subh") {
+      # Heading or subheading.
+      $pflags =~ /.... (.*) :(.*)/;
+      $hdr = ($ptype eq "subh" ? "h4" : "h3");
+      $title = $1 . " ";
+      $xref = $2;
+      foreach $i (@$pname) {
+        $ww = &word_html($i);
+        $title .= $ww unless $ww eq "\001";
+      }
+      print "<$hdr><a name=\"$xref\">$title</a></$hdr>\n";
+    } elsif ($ptype eq "code") {
+      # Code paragraph.
+      print "<p><pre>\n";
+      foreach $i (@$pname) {
+	$w = $i;
+	$w =~ s/&/&amp;/g;
+	$w =~ s/</&lt;/g;
+	$w =~ s/>/&gt;/g;
+        print $w, "\n";
+      }
+      print "</pre>\n";
+    } elsif ($ptype eq "bull" || $ptype eq "norm") {
+      # Ordinary paragraph, optionally bulleted. Lines are wrapped at a
+      # ragged 75-char right margin; continuation lines get no indent, and
+      # a bulleted paragraph opens a <ul> if one is not already open.
+      if ($ptype eq "bull") {
+        $in_list = 1, print "<ul>\n" unless $in_list;
+        $line = '<li>';
+      } else {
+        $line = '<p>';
+      }
+      @a = @$pname;
+      $wd = $wprev = '';
+      do {
+        do { $w = &word_html(shift @a) } while $w eq "\001"; # nasty hack
+	$wd .= $wprev;
+	if ($w eq ' ' || $w eq '' || $w eq undef) {
+	  if (length ($line . $wd) > 75) {
+	    $line =~ s/\s*$//; # trim trailing spaces
+	    print "$line\n";
+	    $line = '';
+	    $wd =~ s/^\s*//; # trim leading spaces
+	  }
+	  $line .= $wd;
+	  $wd = '';
+	}
+	$wprev = $w;
+      } while ($w ne '' && $w ne undef);
+      if ($line =~ /\S/) {
+	$line =~ s/\s*$//; # trim trailing spaces
+	print "$line\n";
+      }
+    }
+  }
+
+  # Close whichever file was open.
+  &html_jumppoints;
+  print "</body></html>\n";
+  select STDOUT;
+  close TEXT;
+
+  print "\n   writing index file...";
+  open TEXT,">nasmdoci.html";
+  select TEXT;
+  &html_preamble(0);
+  print "<p align=center><a href=\"nasmdoc0.html\">Contents</a>\n";
+  print "<p>";
+  &html_index;
+  print "<p align=center><a href=\"nasmdoc0.html\">Contents</a>\n";
+  print "</body></html>\n";
+  select STDOUT;
+  close TEXT;
+}
+
+sub html_preamble { my ($want_nav) = @_; # true => also print the jump bar
+  print "<html><head><title>NASM Manual</title></head>\n";
+  print "<body><h1 align=center>The Netwide Assembler: NASM</h1>\n\n";
+  if ($want_nav) { &html_jumppoints }
+}
+
+sub html_jumppoints { # navigation bar: next / previous / contents / index
+  my $bar = "<p align=center>";
+  $bar .= "<a href=\"$html_nextf\">Next Chapter</a> |\n" if $html_nextf;
+  $bar .= "<a href=\"$html_lastf\">Previous Chapter</a> |\n" if $html_lastf;
+  $bar .= "<a href=\"nasmdoc0.html\">Contents</a> |\n";
+  print $bar . "<a href=\"nasmdoci.html\">Index</a>\n";
+}
+
+sub html_index { # print one wrapped "entry: ref, ref, ..." line per index tag
+  my ($itag, $ientry, @a, $sep, $w, $wd, $wprev, $line, $node);
+  # (The list must be parenthesised: a bare `my $x, $y' declares only $x.)
+  $chapternode = ''; # no "current file", so word_html keeps every file name
+  foreach $itag (@itags) {
+    $ientry = $idxmap{$itag};
+    @a = @$ientry;
+    push @a, "n :";
+    $sep = 0;
+    foreach $node (@nodes) { # one cross-reference per node citing this tag
+      next if !$idxnodes{$node,$itag};
+      push @a, "n ," if $sep;
+      push @a, "sp", "x $xrefnodes{$node}", "n $node", "xe$xrefnodes{$node}";
+      $sep = 1;
+    }
+    $line = $wd = $wprev = ''; # start each entry with clean wrap state
+    do {
+      do { $w = &word_html(shift @a) } while $w eq "\001"; # nasty hack
+      $wd .= $wprev;
+      if (!defined $w || $w eq ' ' || $w eq '') { # word boundary or end
+        if (length ($line . $wd) > 75) {
+          $line =~ s/\s*$//; # trim trailing spaces
+          print "$line\n";
+          $line = '';
+          $wd =~ s/^\s*//; # trim leading spaces
+        }
+        $line .= $wd;
+        $wd = '';
+      }
+      $wprev = $w;
+    } while (defined $w && $w ne ''); # word_html returns undef when @a is empty
+    if ($line =~ /\S/) {
+      $line =~ s/\s*$//; # trim trailing spaces
+      print "$line\n";
+    }
+    print "<br>\n";
+  }
+}
+
+sub word_html { # convert one tagged source word into its HTML fragment
+  my ($w) = @_;
+  my ($wtype, $wmajt, $pfx, $sfx); # parenthesised so all four are lexical
+
+  return undef if !defined $w || $w eq ''; # end of the word list
+
+  $wtype = substr($w,0,2);       # two-character word type...
+  $wmajt = substr($wtype,0,1);   # ...and its major type (first character)
+  $w = substr($w,2);             # the rest is the text itself
+  $pfx = $sfx = '';
+  $pfx = "<a href=\"$1\">", $sfx = "</a>", $w = $2
+    if $wmajt eq "w" && $w =~ /^<(.*)>(.*)$/; # web link: "<url>text"
+  $w =~ s/&/&amp;/g;
+  $w =~ s/</&lt;/g;
+  $w =~ s/>/&gt;/g;
+  if ($wmajt eq "n" || $wtype eq "e " || $wtype eq "w ") {
+    return $pfx . $w . $sfx;
+  } elsif ($wtype eq "sp") {
+    return ' ';
+  } elsif ($wtype eq "da") {
+    return '-'; # sadly, en-dashes are non-standard in HTML
+  } elsif ($wmajt eq "c" || $wtype eq "wc") {
+    return $pfx . "<code><nobr>${w}</nobr></code>" . $sfx;
+  } elsif ($wtype eq "es") {
+    return "<em>${w}";
+  } elsif ($wtype eq "ee") {
+    return "${w}</em>";
+  } elsif ($wtype eq "eo") {
+    return "<em>${w}</em>";
+  } elsif ($wtype eq "x ") {
+    # Magic: we must resolve the cross reference into file and marker
+    # parts, then dispose of the file part if it's us, and dispose of
+    # the marker part if the cross reference describes the top node of
+    # another file.
+    my $node = $nodexrefs{$w}; # find the node we're aiming at
+    my $level = $tstruct_level{$node}; # and its level
+    my ($up, $uplev) = ($node, $level-1);
+    $up = $tstruct_up{$up} while $uplev-- > 0; # `> 0': unknown node can't loop
+    my $file = ($up ne $chapternode) ? $html_fnames{$up} : "";
+    my $marker = ($level == 1 and $file) ? "" : "#$w";
+    return "<a href=\"$file$marker\">";
+  } elsif ($wtype eq "xe") {
+    return "</a>";
+  } elsif ($wmajt eq "i") {
+    return "\001"; # index marker: callers skip this sentinel
+  } else {
+    die "panic in word_html: $wtype$w\n";
+  }
+}
+
+sub write_ps {
+  # This is called from the top level, so I won't bother using
+  # my or local.
+  #
+  # Renders the document as PostScript into nasmdoc.ps in four stages:
+  # split every paragraph into lines, fabricate the contents lines,
+  # assign lines to pages (then patch the contents page numbers), and
+  # finally build the index and write everything out.
+
+  # First, set up the font metric arrays.
+  &font_metrics;
+
+  # First stage: reprocess the source arrays into a list of
+  # lines, each of which is a list of word-strings, each of
+  # which has a single-letter font code followed by text.
+  # Each line also has an associated type, which will be
+  # used for final alignment and font selection and things.
+  #
+  # Font codes are:
+  #   n == Normal
+  #   e == Emphasised
+  #   c == Code
+  #  ' ' == space (no following text required)
+  #  '-' == dash (no following text required)
+  #
+  # Line types are:
+  #   chap == Chapter or appendix heading.
+  #   head == Major heading.
+  #   subh == Sub-heading.
+  #   Ccha == Contents entry for a chapter.
+  #   Chea == Contents entry for a heading.
+  #   Csub == Contents entry for a subheading.
+  #   cone == Code paragraph with just this one line on it.
+  #   cbeg == First line of multi-line code paragraph.
+  #   cbdy == Interior line of multi-line code paragraph.
+  #   cend == Final line of multi-line code paragraph.
+  #   none == Normal paragraph with just this one line on it.
+  #   nbeg == First line of multi-line normal paragraph.
+  #   nbdy == Interior line of multi-line normal paragraph.
+  #   nend == Final line of multi-line normal paragraph.
+  #   bone == Bulleted paragraph with just this one line on it.
+  #   bbeg == First line of multi-line bulleted paragraph.
+  #   bbdy == Interior line of multi-line bulleted paragraph.
+  #   bend == Final line of multi-line bulleted paragraph.
+  print "line-breaks...";
+  $lname = "psline000000";
+  $lnamei = "idx" . $lname;
+  @lnames = @ltypes = ();
+
+  for ($para = 0; $para <= $#pnames; $para++) {
+    $pname = $pnames[$para];
+    $pflags = $pflags[$para];
+    $ptype = substr($pflags,0,4);
+
+    # New paragraph _ergo_ new line.
+    @line = ();
+    @lindex = (); # list of index tags referenced to this line
+
+    if ($ptype eq "chap") {
+      # Chapter heading. "Chapter N: Title" followed by a line of
+      # minus signs.
+      $pflags =~ /chap (.*) :(.*)/;
+      push @line, "nChapter", " ", "n$1:", " ";
+      foreach $i (@$pname) {
+        $ww = &word_ps($i);
+        push @line, $ww unless $ww eq "x";
+      }
+      @$lname = @line; @$lnamei = @lindex;
+      push @lnames, $lname++;
+      $lnamei = "idx" . $lname;
+      push @ltypes, "chap";
+    } elsif ($ptype eq "appn") {
+      # Appendix heading. "Appendix N: Title" followed by a line of
+      # minus signs.
+      $pflags =~ /appn (.*) :(.*)/;
+      push @line, "nAppendix", " ", "n$1:", " ";
+      foreach $i (@$pname) {
+        $ww = &word_ps($i);
+        push @line, $ww unless $ww eq "x";
+      }
+      @$lname = @line; @$lnamei = @lindex;
+      push @lnames, $lname++;
+      $lnamei = "idx" . $lname;
+      push @ltypes, "chap";
+    } elsif ($ptype eq "head" || $ptype eq "subh") {
+      # Heading or subheading. Just a number and some text. Placeholder
+      # words (word_ps's "x") are dropped, as for chapter headings.
+      $pflags =~ /.... (.*) :(.*)/;
+      push @line, "n$1";
+      foreach $i (@$pname) {
+        $ww = &word_ps($i);
+        push @line, $ww unless $ww eq "x";
+      }
+      @$lname = @line; @$lnamei = @lindex;
+      push @lnames, $lname++;
+      $lnamei = "idx" . $lname;
+      push @ltypes, $ptype;
+    } elsif ($ptype eq "code") {
+      # Code paragraph. Emit lines one at a time.
+      $type = "cbeg";
+      foreach $i (@$pname) {
+        @$lname = ("c$i");
+        push @lnames, $lname++;
+        $lnamei = "idx" . $lname;
+        push @ltypes, $type;
+        $type = "cbdy";
+      }
+      $ltypes[$#ltypes] = ($ltypes[$#ltypes] eq "cbeg" ? "cone" : "cend");
+    } elsif ($ptype eq "bull" || $ptype eq "norm") {
+      # Ordinary paragraph, optionally bulleted. We wrap, with ragged
+      # 75-char right margin and either 7 or 11 char left margin
+      # depending on bullets.
+      if ($ptype eq "bull") {
+        $width = 456; # leave 12-pt left indent for the bullet
+        $type = $begtype = "bbeg";
+        $bodytype = "bbdy";
+        $onetype = "bone";
+        $endtype = "bend";
+      } else {
+        $width = 468;
+        $type = $begtype = "nbeg";
+        $bodytype = "nbdy";
+        $onetype = "none";
+        $endtype = "nend";
+      }
+      @a = @$pname;
+      @line = @wd = ();
+      $linelen = 0;
+      $wprev = undef;
+      do {
+        do { $w = &word_ps(shift @a) } while ($w eq "x");
+        push @wd, $wprev if $wprev;
+        if ($wprev =~ /^n.*-$/ || $w eq ' ' || $w eq '' || $w eq undef) {
+          $wdlen = &len_ps(@wd);
+          if ($linelen + $wdlen > $width) {
+            pop @line while $line[$#line] eq ' '; # trim trailing spaces
+            @$lname = @line; @$lnamei = @lindex;
+            push @lnames, $lname++;
+            $lnamei = "idx" . $lname;
+            push @ltypes, $type;
+            $type = $bodytype;
+            @line = @lindex = ();
+            $linelen = 0;
+            shift @wd while $wd[0] eq ' '; # trim leading spaces
+          }
+          push @line, @wd;
+          $linelen += $wdlen;
+          @wd = ();
+        }
+        $wprev = $w;
+      } while ($w ne '' && $w ne undef);
+      if (@line) {
+        pop @line while $line[$#line] eq ' '; # trim trailing spaces
+        @$lname = @line; @$lnamei = @lindex;
+        push @lnames, $lname++;
+        $lnamei = "idx" . $lname;
+        push @ltypes, $type;
+        $type = $bodytype;
+      }
+      $ltypes[$#ltypes] =
+        ($ltypes[$#ltypes] eq $begtype ? $onetype : $endtype);
+    }
+  }
+
+  # We've now processed the document source into lines. Before we
+  # go on and do the page breaking, we'll fabricate a table of contents,
+  # line by line, and then after doing page breaks we'll go back and
+  # insert the page numbers into the contents entries.
+  print "building contents...";
+  @clnames = @cltypes = ();
+  $clname = "pscont000000";
+  @$clname = ("nContents"); # "chapter heading" for TOC
+  push @clnames,$clname++;
+  push @cltypes,"chap";
+  for ($i=0; $i<=$#lnames; $i++) {
+    $lname = $lnames[$i];
+    if ($ltypes[$i] =~ /^(chap|head|subh)/) {
+      @$clname = @$lname;
+      splice @$clname,1,0," " if ($ltypes[$i] !~ /chap/);
+      push @$clname,$i; # placeholder for page number
+      push @clnames,$clname++;
+      push @cltypes,"C" . substr($ltypes[$i],0,3);
+    }
+  }
+  @$clname = ("nIndex"); # contents entry for Index
+  push @$clname,$i;      # placeholder for page number
+  $idx_clname = $clname;
+  push @clnames,$clname++;
+  push @cltypes,"Ccha";
+  $contlen = $#clnames + 1;
+  unshift @lnames,@clnames;
+  unshift @ltypes,@cltypes;
+
+  # Second stage: now we have a list of lines, break them into pages.
+  # We do this by means of adding a third array in parallel with
+  # @lnames and @ltypes, called @lpages, in which we store the page
+  # number that each line resides on. We also add @ycoord which
+  # stores the vertical position of each line on the page.
+  #
+  # Page breaks may not come after line-types:
+  #   chap head subh cbeg nbeg bbeg
+  # and may not come before line-types:
+  #   cend nend bend
+  # They are forced before line-types:
+  #   chap
+  print "page-breaks...";
+  $pmax = 600; # ADJUSTABLE: maximum length of a page in points
+  $textht = 11; # ADJUSTABLE: height of a normal line in points
+  $spacing = 6; # ADJUSTABLE: space between paragraphs, in points
+  $headht = 14; # ADJUSTABLE: height of a major heading in points
+  $subht = 12; # ADJUSTABLE: height of a sub-heading in points
+  # $ht and $space carry the height of, and gap before, the next line.
+  $pstart = 0; # start line of current page
+  $plen = 0; # current length of current page
+  $pnum = 1; # number of current page
+  $bpt = -1; # last feasible break point
+  $i = 0; # line number
+  while ($i <= $#lnames) {
+    $lname = $lnames[$i];
+    # Add the height of this line (computed the last time we went round
+    # the loop, unless we're a chapter heading in which case we do it
+    # now) to the length of the current page. Also, _put_ this line on
+    # the current page, and allocate it a y-coordinate.
+    if ($ltypes[$i] =~ /^chap$/) {
+      $plen = 100; # ADJUSTABLE: space taken up by a chapter heading
+      $ycoord[$i] = 0; # chapter heading: y-coord doesn't matter
+    } else {
+      $ycoord[$i] = $plen + $space;
+      $plen += $space + $ht;
+    }
+    # See if we can break after this line.
+    # (The alternatives are grouped so both anchors apply to each type.)
+    $bpt = $i if $ltypes[$i] !~ /^(chap|head|subh|cbeg|nbeg|bbeg)$/ &&
+                 $ltypes[$i+1] !~ /^(cend|nend|bend)$/;
+    # Assume, to start with, that we don't break after this line.
+    $break = 0;
+    # See if a break is forced.
+    $break = 1, $bpt = $i if $ltypes[$i+1] eq "chap" || !$ltypes[$i+1];
+    # Otherwise, compute the height of the next line, and break if
+    # it would make this page too long.
+    $ht = $textht, $space = 0 if $ltypes[$i+1] =~ /^[nbc](bdy|end)$/;
+    $ht = $textht, $space = $spacing if $ltypes[$i+1] =~ /^[nbc](one|beg)$/;
+    $ht = $textht, $space = $spacing if $ltypes[$i+1] =~ /^C/;
+    $ht = $subht, $space = $spacing if $ltypes[$i+1] eq "subh";
+    $ht = $headht, $space = $spacing if $ltypes[$i+1] eq "head";
+    $break = 1 if $plen + $space + $ht > $pmax;
+    # Now, if we're breaking, assign page number $pnum to all lines up
+    # to $bpt, set $i == $bpt+1, and zero $space since we are at the
+    # start of a new page and don't want leading space.
+    if ($break) {
+      die "no feasible break point at all on page $pnum\n" if $bpt == -1;
+      for ($j = $pstart; $j <= $bpt; $j++) {
+        $lnamei = "idx" . $lnames[$j];
+        foreach $k (@$lnamei) {
+          ${$psidxpp{$k}}{$pnum} = 1;
+        }
+        $lpages[$j] = $pnum;
+      }
+      $pnum++;
+      $i = $bpt;
+      $bpt = -1;
+      $pstart = $i+1;
+      $plen = 0;
+      $space = 0;
+    }
+    $i++;
+  }
+
+  # Now fix up the TOC with page numbers.
+  print "\n   fixing up contents...";
+  for ($i=0; $i<=$#lnames; $i++) {
+    $lname = $lnames[$i];
+    if ($ltypes[$i] =~ /^C/) {
+      $j = pop @$lname;
+      push @$lname, "n" . $lpages[$j+$contlen];
+    }
+  }
+
+  # Having got page numbers for most stuff, generate an index.
+  print "building index...";
+  $iwid = 222;
+  $sep = 12;
+  $commaindent = 32;
+  foreach $k (@itags) {
+    @line = ();
+    $cmd = "index";
+    @idxentry = @{$idxmap{$k}};
+    if ($commaafter{$k} and !$commanext{$k}) {
+      # This line is a null line beginning a multiple entry. We must
+      # output the prefix on a line by itself.
+
+      @idxhead = splice @idxentry,0,$commapos{$k};
+      @line = ();
+      foreach $i (@idxhead) {
+        $ww = &word_ps($i);
+        push @line, $ww unless $ww eq "x";
+      }
+      &ps_idxout("index",\@line,[]);
+      $cmd = "iindex";
+      @line = ();
+    }
+    $cmd = "iindex", splice @idxentry,0,$commapos{$k} if $commanext{$k};
+    foreach $i (@idxentry) {
+      $ww = &word_ps($i);
+      push @line, $ww unless $ww eq "x";
+    }
+    $len = $iwid - $sep - &len_ps(@line);
+    warn "text for index tag `$k' is longer than one index line!\n"
+      if $len < -$sep;
+    @pp = ();
+    $inums = join(',',sort { $a <=> $b } keys %{$psidxpp{$k}});
+    while (length $inums) {
+      $inums =~ /^([^,]+,?)(.*)$/;
+      $inums = $2, $inum = $1;
+      @pnum = (" ", "n$inum");
+      $pnumlen = &len_ps(@pnum);
+      if ($pnumlen > $len) {
+        &ps_idxout($cmd,\@line,\@pp);
+        @pp = ();
+        @line = ();
+        $cmd = "index";
+        $len = $iwid - $sep;
+      }
+      push @pp, @pnum;
+      $len -= $pnumlen;
+    }
+    # The old guard `length @pp' was always true, so the final line has
+    # always been flushed; an entry with no page numbers still prints.
+    &ps_idxout($cmd,\@line,\@pp);
+  }
+  $$idx_clname[$#$idx_clname] = "n" . $pnum; # fix up TOC entry for index
+
+  print "writing file...";
+  open PS,">nasmdoc.ps" or die "unable to open nasmdoc.ps: $!\n";
+  select PS;
+  $page = $lpages[0];
+  &ps_header;
+  for ($i=0; $i<=$#lnames; $i++) {
+    &ps_throw_pg($page,$lpages[$i]) if $page != $lpages[$i];
+    $page = $lpages[$i];
+    &ps_out_line($ycoord[$i],$ltypes[$i],$lnames[$i]);
+  }
+  # Lay the index out in two columns per page. A line whose break flag
+  # (field 3) is zero travels with the line before it, so each column
+  # takes a whole group of lines or none of it.
+  $i = 0;
+  while ($i <= $#psindex) {
+    &ps_throw_pg($page, $pnum) if $page != $pnum;
+    $page = $pnum++;
+    $ypos = 0;
+    $ypos = 100, &ps_out_line(0, "chap", ["nIndex"]) if !$i;
+    $lines = ($pmax - $ypos) / $textht;
+    my $col; # ps_out_line hits this variable
+    PAGE:for ($col = 1; $col <= 2; $col++) {
+      $y = $ypos; $l = $lines;
+      COL: while ($l > 0) {
+        $j = $i+1;
+        $j++ while $psindex[$j] and ($psindex[$j][3] == 0); # find next break
+        last COL if $j-$i > $l or $i > $#psindex;
+        while ($i < $j) {
+          &ps_out_line($y, $psindex[$i][0] eq "index" ? "idl$col" : "ldl$col",
+                       $psindex[$i][1]);
+          &ps_out_line($y,"idr$col",$psindex[$i][2]);
+          $i++;
+          $y += $textht;
+          $l--;
+        }
+      }
+      last PAGE if $i > $#psindex;
+    }
+  }
+  &ps_trailer;
+  close PS;
+  select STDOUT;
+}
+
+sub ps_idxout { # queue one index line: [command, left words, right words, break]
+  my ($kind, $lhs, $rhs) = @_;
+  # The break flag is cleared (line stays with its predecessor) when this
+  # is not the first queued line and it has no left text or is an "iindex".
+  my $continuation = (@$lhs == 0 || $kind eq "iindex");
+  push @psindex, [$kind, [@$lhs], [@$rhs], (@psindex && $continuation) ? 0 : 1];
+}
+
+sub ps_header { # print the DSC header and the PostScript setup, open page one
+  @pshdr = (
+    '/sp (n ) def', # here it's sure not to get wrapped inside ()
+    '/nf /Times-Roman findfont 11 scalefont def',      # body: n/e/c fonts
+    '/ef /Times-Italic findfont 11 scalefont def',
+    '/cf /Courier findfont 11 scalefont def',
+    '/nc /Helvetica-Bold findfont 18 scalefont def',   # chapter headings
+    '/ec /Helvetica-Oblique findfont 18 scalefont def',
+    '/cc /Courier-Bold findfont 18 scalefont def',
+    '/nh /Helvetica-Bold findfont 14 scalefont def',   # headings
+    '/eh /Helvetica-Oblique findfont 14 scalefont def',
+    '/ch /Courier-Bold findfont 14 scalefont def',
+    '/ns /Helvetica-Bold findfont 12 scalefont def',   # subheadings
+    '/es /Helvetica-Oblique findfont 12 scalefont def',
+    '/cs /Courier-Bold findfont 12 scalefont def',
+    '/n 16#6E def /e 16#65 def /c 16#63 def', # character codes of n, e, c
+    '/chapter {',
+    '  100 620 moveto',
+    '  {',
+    '    dup 0 get',
+    '    dup n eq {pop nc setfont} {',
+    '      e eq {ec setfont} {cc setfont} ifelse',
+    '    } ifelse',
+    '    dup length 1 sub 1 exch getinterval show',
+    '  } forall',
+    '  0 setlinecap 3 setlinewidth',
+    '  newpath 100 610 moveto 468 0 rlineto stroke',
+    '} def',
+    '/heading {',
+    '  686 exch sub /y exch def /a exch def',
+    '  90 y moveto a 0 get dup length 1 sub 1 exch getinterval',
+    '  nh setfont dup stringwidth pop neg 0 rmoveto show',
+    '  100 y moveto',
+    '  a dup length 1 sub 1 exch getinterval {',
+    '    /s exch def',
+    '    s 0 get',
+    '    dup n eq {pop nh setfont} {',
+    '      e eq {eh setfont} {ch setfont} ifelse',
+    '    } ifelse',
+    '    s s length 1 sub 1 exch getinterval show',
+    '  } forall',
+    '} def',
+    '/subhead {',
+    '  688 exch sub /y exch def /a exch def',
+    '  90 y moveto a 0 get dup length 1 sub 1 exch getinterval',
+    '  ns setfont dup stringwidth pop neg 0 rmoveto show',
+    '  100 y moveto',
+    '  a dup length 1 sub 1 exch getinterval {',
+    '    /s exch def',
+    '    s 0 get',
+    '    dup n eq {pop ns setfont} {',
+    '      e eq {es setfont} {cs setfont} ifelse',
+    '    } ifelse',
+    '    s s length 1 sub 1 exch getinterval show',
+    '  } forall',
+    '} def',
+    '/disp { /j exch def',
+    '  568 exch sub exch 689 exch sub moveto',
+    '  {',
+    '    /s exch def',
+    '    s 0 get',
+    '    dup n eq {pop nf setfont} {',
+    '      e eq {ef setfont} {cf setfont} ifelse',
+    '    } ifelse',
+    '    s s length 1 sub 1 exch getinterval show',
+    '    s sp eq {j 0 rmoveto} if',
+    '  } forall',
+    '} def',
+    '/contents { /w exch def /y exch def /a exch def',
+    '  /yy 689 y sub def',
+    '  a a length 1 sub get dup length 1 sub 1 exch getinterval /s exch def',
+    '  nf setfont 568 s stringwidth pop sub /ex exch def',
+    '  ex yy moveto s show',
+    '  a 0 a length 1 sub getinterval y w 0 disp',
+    '  /sx currentpoint pop def nf setfont',
+    '  100 10 568 { /i exch def',
+    '    i 5 sub sx gt i 5 add ex lt and {',
+    '      i yy moveto (.) show',
+    '    } if',
+    '  } for',
+    '} def',
+    '/just { /w exch def /y exch def /a exch def',
+    '  /jj w def /spaces 0 def',
+    '  a {',
+    '    /s exch def',
+    '    s 0 get',
+    '    dup n eq {pop nf setfont} {',
+    '      e eq {ef setfont} {cf setfont} ifelse',
+    '    } ifelse',
+    '    s s length 1 sub 1 exch getinterval stringwidth pop',
+    '    jj exch sub /jj exch def',
+    '    s sp eq {/spaces spaces 1 add def} if',
+    '  } forall',
+    '  a y w jj spaces spaces 0 eq {pop pop 0} {div} ifelse disp',
+    '} def',
+    '/idl { 468 exch sub 0 disp } def',
+    '/ldl { 436 exch sub 0 disp } def',
+    '/idr { 222 add 468 exch sub /x exch def /y exch def /a exch def',
+    '  a {',
+    '    /s exch def',
+    '    s 0 get',
+    '    dup n eq {pop nf setfont} {',
+    '      e eq {ef setfont} {cf setfont} ifelse',
+    '    } ifelse',
+    '    s s length 1 sub 1 exch getinterval stringwidth pop',
+    '    x add /x exch def',
+    '  } forall',
+    '  a y x 0 disp',
+    '} def',
+    '/left {0 disp} def',
+    '/bullet {',
+    '  nf setfont dup 100 exch 689 exch sub moveto (\267) show',
+    '} def'
+  );
+  print "%!PS-Adobe-3.0\n";
+  print "%%BoundingBox: 95 95 590 705\n";
+  print "%%Creator: a nasty Perl script\n";
+  print "%%DocumentData: Clean7Bit\n";
+  print "%%Orientation: Portrait\n";
+  print "%%Pages: $lpages[$#lpages]\n"; # NOTE(review): index pages not counted
+  print "%%DocumentNeededResources: font Times-Roman Times-Italic\n";
+  print "%%+ font Helvetica-Bold Helvetica-Oblique Courier Courier-Bold\n";
+  print "%%EndComments\n%%BeginProlog\n%%EndProlog\n%%BeginSetup\nsave\n";
+  $pshdr = join(' ',@pshdr); # flatten the definitions, then re-wrap them
+  $pshdr =~ s/\s+/ /g;
+  while ($pshdr =~ /\S/) { # peel off chunks of at most 73 chars at a space
+    last if length($pshdr) < 72 || $pshdr !~ /^(.{0,72}\S)\s(.*)$/;
+    $pshdr = $2;
+    print "$1\n";
+  }
+  print "$pshdr\n" if $pshdr =~ /\S/; # whatever is left fits on one line
+  print "%%EndSetup\n";
+  &ps_initpg($lpages[0]);
+}
+
+sub ps_trailer { # close the final page, then emit the DSC trailer
+  &ps_donepg;
+  print "%%Trailer\n" . "restore\n" . "%%EOF\n";
+}
+
+sub ps_throw_pg { # end the current page and start the page numbered $_[1]
+  my $next_page = $_[1]; # the first argument (old page number) is not used
+  &ps_donepg;
+  &ps_initpg($next_page);
+}
+
+sub ps_initpg { # start a page: DSC page header, then save the graphics state
+  my $n = shift;
+  print "%%Page: ${n} ${n}\n";
+  print "%%BeginPageSetup\n" . "save\n" . "%%EndPageSetup\n";
+}
+
+sub ps_donepg { # finish a page: DSC page trailer, restore state, eject
+  print "%%PageTrailer\n" . "restore showpage\n";
+}
+
+sub ps_out_line {
+  # Print one line as a PostScript array of strings followed by the
+  # operands and operator that match its line type:
+  #   $ypos  - vertical position handed to the PostScript procedure
+  #   $ltype - line type (see write_ps), or idl/ldl/idr plus a column digit
+  #   $lname - reference (or symbolic name) of the array of words
+  # The words are only read; the caller's array is left untouched.
+  my ($ypos,$ltype,$lname) = @_;
+  my ($c,$d,$wid,$col,$left); # parenthesised so that all are lexical
+
+  print "[";
+  $col = 1; # current output column, counting the "[" just printed
+  foreach $c (@$lname) {
+    $d = $c; # escape a copy: $c aliases the caller's array element
+    $d = "n " if $d eq " ";
+    $d = "n\261" if $d eq "-";
+    $d =~ s/([\\\(\)])/\\$1/g; # backslash-escape \ ( )
+    $d =~ s/([^ -~])/sprintf("\\%03o",unpack("C",$1))/ge; # 3-digit octal
+    $d = "($d)";
+    $col = 0, print "\n" if $col>0 && $col+length $d > 77;
+    print $d;
+    $col += length $d;
+  }
+  print "\n" if $col > 60;
+  print "]";
+  if ($ltype =~ /^[nb](beg|bdy)$/) {
+    printf "%d %s%d just\n",
+      $ypos, ($ltype eq "bbeg" ? "bullet " : ""),
+      ($ltype =~ /^b/ ? 456 : 468);
+  } elsif ($ltype =~ /^[nb](one|end)$/) {
+    printf "%d %s%d left\n",
+      $ypos, ($ltype eq "bone" ? "bullet " : ""),
+      ($ltype =~ /^b/ ? 456 : 468);
+  } elsif ($ltype =~ /^c(one|beg|bdy|end)$/) {
+    print "$ypos 468 left\n";
+  } elsif ($ltype =~ /^C/) {
+    $wid = 468;
+    $wid = 456 if $ltype eq "Chea";
+    $wid = 444 if $ltype eq "Csub";
+    print "$ypos $wid contents\n";
+  } elsif ($ltype eq "chap") {
+    print "chapter\n";
+  } elsif ($ltype eq "head") {
+    print "$ypos heading\n";
+  } elsif ($ltype eq "subh") {
+    print "$ypos subhead\n";
+  } elsif ($ltype =~ /([il]d[lr])([12])/) {
+    $left = ($2 eq "2" ? 468-222 : 0); # column 2 starts 246 points across
+    print "$ypos $left $1\n";
+  }
+}
+
+sub word_ps { # convert one tagged source word into a PostScript word
+  my ($w) = @_;
+  my ($wtype, $wmajt); # parenthesised so that both are lexical
+
+  return undef if !defined $w || $w eq ''; # end of the word list
+
+  $wtype = substr($w,0,2);     # two-character word type
+  $wmajt = substr($wtype,0,1); # major type: its first character
+  $w = substr($w,2);           # the rest is the text
+  $w =~ s/<.*>// if $wmajt eq "w"; # remove web links
+  if ($wmajt eq "n" || $wtype eq "w ") {
+    return "n$w"; # normal font
+  } elsif ($wtype eq "sp") {
+    return ' ';
+  } elsif ($wtype eq "da") {
+    return '-';
+  } elsif ($wmajt eq "c" || $wtype eq "wc") {
+    return "c$w"; # code font
+  } elsif ($wmajt eq "e") {
+    return "e$w"; # emphasised font
+  } elsif ($wmajt eq "x") {
+    return "x"; # cross-reference markers produce no text; callers drop "x"
+  } elsif ($wtype eq "i ") {
+    push @lindex, $w; # record the index tag against the current line
+    return "x";
+  } else {
+    die "panic in word_ps: $wtype$w\n";
+  }
+}
+
+sub len_ps { # total width of a list of PostScript words, used for wrapping
+  my (@line) = @_;
+  my ($l, $size) = (0, 11/1000); # $size: used only for length calculations
+  my ($w, $f, $c);               # parenthesised so that all are lexical
+
+  foreach $w (@line) {
+    next if !defined $w || $w eq ''; # skip holes instead of stopping early
+    $w = "n " if $w eq " ";          # bare space and dash use the normal font
+    $w = "n\261" if $w eq "-";
+    $f = substr($w,0,1);             # the font code picks the metric array...
+    $f = "timesr" if $f eq "n";
+    $f = "timesi" if $f eq "e";
+    $f = "courr" if $f eq "c";
+    foreach $c (unpack 'C*',substr($w,1)) {
+      $l += $size * $$f[$c];         # ...found by name (symbolic reference)
+    }
+  }
+  return $l;
+}
+
+sub write_texi {
+  # This is called from the top level, so I won't bother using
+  # my or local.
+
+  # Open file.
+  print "writing file...";
+  open TEXT,">nasmdoc.texi" or die "unable to open nasmdoc.texi: $!\n";
+  select TEXT;
+
+  # Preamble.
+  print "\\input texinfo   \@c -*-texinfo-*-\n"; # `\\': keep the backslash
+  print "\@c \%**start of header\n";
+  print "\@setfilename nasm.info\n";
+  print "\@settitle NASM: The Netwide Assembler\n";
+  print "\@setchapternewpage odd\n";
+  print "\@c \%**end of header\n";
+  print "\n";
+  print "\@ifinfo\n";
+  print "This file documents NASM, the Netwide Assembler: an assembler\n";
+  print "targetting the Intel x86 series of processors, with portable source.\n";
+  print "\n";
+  print "Copyright 1997 Simon Tatham\n";
+  print "\n";
+  print "All rights reserved. This document is redistributable under the\n";
+  print "licence given in the file \"Licence\" distributed in the NASM archive.\n";
+  print "\@end ifinfo\n";
+  print "\n";
+  print "\@titlepage\n";
+  print "\@title NASM: The Netwide Assembler\n";
+  print "\@author Simon Tatham\n";
+  print "\n";
+  print "\@page\n";
+  print "\@vskip 0pt plus 1filll\n";
+  print "Copyright \@copyright{} 1997 Simon Tatham\n";
+  print "\n";
+  print "All rights reserved. This document is redistributable under the\n";
+  print "licence given in the file \"Licence\" distributed in the NASM archive.\n";
+  print "\@end titlepage\n";
+  print "\n";
+  print "\@node Top, $tstruct_next{'Top'}, (dir), (dir)\n";
+  print "\@top\n";
+  print "\n";
+  print "\@ifinfo\n";
+  print "This file documents NASM, the Netwide Assembler: an assembler\n";
+  print "targetting the Intel x86 series of processors, with portable source.\n";
+  print "\@end ifinfo\n";
+
+  $node = "Top";
+
+  $bulleting = 0;
+  for ($para = 0; $para <= $#pnames; $para++) {
+    $pname = $pnames[$para];
+    $pflags = $pflags[$para];
+    $ptype = substr($pflags,0,4);
+
+    $bulleting = 0, print "\@end itemize\n" if $bulleting && $ptype ne "bull";
+    print "\n"; # always one of these before a new paragraph
+
+    if ($ptype eq "chap") {
+      # Chapter heading. Begin a new node.
+      &texi_menu($node)
+        if $tstruct_level{$tstruct_next{$node}} > $tstruct_level{$node};
+      $pflags =~ /chap (.*) :(.*)/;
+      $node = "Chapter $1";
+      $title = "Chapter $1: ";
+      foreach $i (@$pname) {
+        $ww = &word_texi($i);
+        $title .= $ww unless $ww eq "\001";
+      }
+      print "\@node $node, $tstruct_next{$node}, $tstruct_prev{$node},";
+      print " $tstruct_up{$node}\n\@unnumbered $title\n";
+    } elsif ($ptype eq "appn") {
+      # Appendix heading. Begin a new node.
+      &texi_menu($node)
+        if $tstruct_level{$tstruct_next{$node}} > $tstruct_level{$node};
+      $pflags =~ /appn (.*) :(.*)/;
+      $node = "Appendix $1";
+      $title = "Appendix $1: ";
+      foreach $i (@$pname) {
+        $ww = &word_texi($i);
+        $title .= $ww unless $ww eq "\001";
+      }
+      print "\@node $node, $tstruct_next{$node}, $tstruct_prev{$node},";
+      print " $tstruct_up{$node}\n\@unnumbered $title\n";
+    } elsif ($ptype eq "head" || $ptype eq "subh") {
+      # Heading or subheading. Begin a new node.
+      &texi_menu($node)
+        if $tstruct_level{$tstruct_next{$node}} > $tstruct_level{$node};
+      $pflags =~ /.... (.*) :(.*)/;
+      $node = "Section $1";
+      $title = "$1. ";
+      foreach $i (@$pname) {
+        $ww = &word_texi($i);
+        $title .= $ww unless $ww eq "\001";
+      }
+      print "\@node $node, $tstruct_next{$node}, $tstruct_prev{$node},";
+      print " $tstruct_up{$node}\n\@unnumbered $title\n";
+    } elsif ($ptype eq "code") {
+      # Code paragraph. Surround with @example / @end example.
+      print "\@example\n";
+      foreach $i (@$pname) {
+        warn "code line longer than 68 chars: $i\n" if length $i > 68;
+        # Escape a copy: $i aliases the source paragraph, which stays as is.
+        $w = $i;
+        $w =~ s/([\@\{\}])/\@$1/g;
+        print "$w\n";
+      }
+      print "\@end example\n";
+    } elsif ($ptype eq "bull" || $ptype eq "norm") {
+      # Ordinary paragraph, optionally bulleted. We wrap, FWIW.
+      if ($ptype eq "bull") {
+        $bulleting = 1, print "\@itemize \@bullet\n" if !$bulleting;
+        print "\@item\n";
+      }
+      $line = '';
+      @a = @$pname;
+      $wd = $wprev = '';
+      do {
+        do { $w = &word_texi(shift @a); } while $w eq "\001"; # hack
+        $wd .= $wprev;
+        if ($wprev =~ /-$/ || $w eq ' ' || $w eq '' || $w eq undef) {
+          if (length ($line . $wd) > 75) {
+            $line =~ s/\s*$//; # trim trailing spaces
+            print "$line\n";
+            $line = '';
+            $wd =~ s/^\s*//; # trim leading spaces
+          }
+          $line .= $wd;
+          $wd = '';
+        }
+        $wprev = $w;
+      } while ($w ne '' && $w ne undef);
+      if ($line =~ /\S/) {
+        $line =~ s/\s*$//; # trim trailing spaces
+        print "$line\n";
+      }
+    }
+  }
+
+  # Write index.
+  &texi_index;
+
+  # Close file.
+  print "\n\@contents\n\@bye\n";
+  select STDOUT;
+  close TEXT;
+}
+
+# Side effect of this procedure: update global `texiwdlen' to be the length
+# in chars of the formatted version of the word.
+sub word_texi { # convert one tagged source word into Texinfo markup
+  my ($w) = @_;
+  my ($wtype, $wmajt, $wlen); # parenthesised so that all are lexical
+
+  return undef if !defined $w || $w eq ''; # end of the word list
+  $wtype = substr($w,0,2);
+  $wmajt = substr($wtype,0,1);
+  $w = substr($w,2);
+  $wlen = length $w; # measured before any escaping is added
+  $w =~ s/\@/\@\@/g;
+  $w =~ s/\{/\@\{/g;
+  $w =~ s/\}/\@\}/g;
+  $w =~ s/<.*>// if $wmajt eq "w"; # remove web links
+  substr($w,0,1) =~ tr/a-z/A-Z/, $capital = 0 if $capital;
+  if ($wmajt eq "n" || $wtype eq "e " || $wtype eq "w ") {
+    $texiwdlen = $wlen;
+    return $w;
+  } elsif ($wtype eq "sp") {
+    $texiwdlen = 1;
+    return ' ';
+  } elsif ($wtype eq "da") {
+    $texiwdlen = 2;
+    return '--';
+  } elsif ($wmajt eq "c" || $wtype eq "wc") {
+    $texiwdlen = 2 + $wlen;
+    return "\@code\{$w\}";
+  } elsif ($wtype eq "es") {
+    $texiwdlen = 1 + $wlen;
+    return "\@emph\{${w}";
+  } elsif ($wtype eq "ee") {
+    $texiwdlen = 1 + $wlen;
+    return "${w}\}";
+  } elsif ($wtype eq "eo") {
+    $texiwdlen = 2 + $wlen;
+    return "\@emph\{${w}\}";
+  } elsif ($wtype eq "x ") {
+    $texiwdlen = 0; # we don't need it in this case
+    $capital = 1; # hack: upper-case the first letter of the next word
+    return "\@ref\{";
+  } elsif ($wtype eq "xe") {
+    $texiwdlen = 0; # we don't need it in this case
+    return "\}";
+  } elsif ($wmajt eq "i") {
+    $texiwdlen = 0; # we don't need it in this case
+    return "\001"; # index marker: callers skip this sentinel
+  } else {
+    die "panic in word_texi: $wtype$w\n";
+  }
+}
+
+sub texi_menu { # print the @menu listing the nodes that follow $topitem
+  my ($topitem) = @_;
+  my ($item, $i, $mpname, $title, $wd); # parenthesised so all are lexical
+
+  $item = $tstruct_next{$topitem}; # start at the node after $topitem...
+  print "\@menu\n";
+  while ($item) {
+    $title = "";
+    $mpname = $tstruct_pname{$item}; # the word list of the node's title
+    foreach $i (@$mpname) {
+      $wd = &word_texi($i);
+      $title .= $wd unless $wd eq "\001";
+    }
+    print "* ${item}:: $title\n";
+    $item = $tstruct_mnext{$item};   # ...then follow the %tstruct_mnext chain
+  }
+  print "* Index::\n" if $topitem eq "Top";
+  print "\@end menu\n";
+}
+
+sub texi_index { # print the Index node: one menu line per (tag, node) pair
+  my ($itag, $ientry, @a, $wd, $item, $len, $i, $node); # all lexical now
+  my $subnums = "123456789ABCDEFGHIJKLMNOPQRSTU" .
+                "VWXYZabcdefghijklmnopqrstuvwxyz";
+
+  print "\@ifinfo\n\@node Index, , $FIXMElastnode, Top\n"; # prev node unset
+  print "\@unnumbered Index\n\n\@menu\n";
+
+  foreach $itag (@itags) {
+    $ientry = $idxmap{$itag};
+    @a = @$ientry;
+    $item = '';
+    $len = 0;
+    foreach $i (@a) { # build the entry text and its displayed length
+      $wd = &word_texi($i);
+      $item .= $wd, $len += $texiwdlen unless $wd eq "\001";
+    }
+    $i = 0; # counts this tag's references; indexes into $subnums
+    foreach $node (@nodes) {
+      next if !$idxnodes{$node,$itag};
+      printf "* %s%s (%s): %s.\n",
+          $item, " " x (40-$len), substr($subnums,$i++,1), $node;
+    }
+  }
+  print "\@end menu\n\@end ifinfo\n";
+}
+
+sub write_hlp { # write nasmdoc.hpj and nasmdoc.rtf into the current directory
+  # This is called from the top level, so I won't bother using
+  # my or local.
+
+  # Build the index-tag text forms.
+  print "building index entries...";
+  @hlp_index = map {
+                 my ($i,$ww); # parenthesised so that $ww is lexical as well
+		 my $ientry = $idxmap{$_};
+		 my $title = "";
+                 foreach $i (@$ientry) {
+		   $ww = &word_hlp($i,0);
+		   $title .= $ww unless $ww eq "\001";
+		 }
+		 $title;
+               } @itags;
+
+  # Write the HPJ project-description file.
+  print "writing .hpj file...";
+  open HPJ,">nasmdoc.hpj" or die "unable to open nasmdoc.hpj: $!\n";
+  print HPJ "[OPTIONS]\ncompress=true\n";
+  print HPJ "title=NASM: The Netwide Assembler\noldkeyphrase=no\n\n";
+  print HPJ "[FILES]\nnasmdoc.rtf\n\n";
+  print HPJ "[CONFIG]\n";
+  print HPJ 'CreateButton("btn_up", "&Up",'.
+            ' "JumpContents(`nasmdoc.hlp'."'".')")';
+  print HPJ "\nBrowseButtons()\n";
+  close HPJ;
+
+  # Open file.
+  print "\n   writing .rtf file...";
+  open TEXT,">nasmdoc.rtf" or die "unable to open nasmdoc.rtf: $!\n";
+  select TEXT; # every bare print below goes to the RTF file
+
+  # Preamble.
+  print "{\\rtf1\\ansi{\\fonttbl\n";
+  print "\\f0\\froman Times New Roman;\\f1\\fmodern Courier New;\n";
+  print "\\f2\\fswiss Arial;\\f3\\ftech Wingdings}\\deff0\n";
+  print "#{\\footnote Top}\n";
+  print "\${\\footnote Contents}\n";
+  print "+{\\footnote browse:00000}\n";
+  print "!{\\footnote DisableButton(\"btn_up\")}\n";
+  print "\\keepn\\f2\\b\\fs30\\sb0\n";
+  print "NASM: The Netwide Assembler\n";
+  print "\\par\\pard\\plain\\sb120\n";
+  print "This file documents NASM, the Netwide Assembler: an assembler \n";
+  print "targetting the Intel x86 series of processors, with portable source.\n";
+
+  $node = "Top";
+  $browse = 0; # browse-sequence number, bumped for every new node
+
+  $newpar = "\\par\\sb120\n"; # text emitted in front of the NEXT paragraph
+  for ($para = 0; $para <= $#pnames; $para++) {
+    $pname = $pnames[$para];
+    $pflags = $pflags[$para];
+    $ptype = substr($pflags,0,4);
+
+    print $newpar;
+    $newpar = "\\par\\sb120\n";
+
+    if ($ptype eq "chap") {
+      # Chapter heading. Begin a new node.
+      &hlp_menu($node)
+        if $tstruct_level{$tstruct_next{$node}} > $tstruct_level{$node};
+      $pflags =~ /chap (.*) :(.*)/;
+      $node = "Chapter $1";
+      $title = $footnotetitle = "Chapter $1: ";
+      foreach $i (@$pname) {
+        $ww = &word_hlp($i,1);
+	$title .= $ww, $footnotetitle .= &word_hlp($i,0) unless $ww eq "\001";
+      }
+      print "\\page\n";
+      printf "#{\\footnote %s}\n", &hlp_sectkw($node);
+      print "\${\\footnote $footnotetitle}\n";
+      printf "+{\\footnote browse:%05d}\n", ++$browse;
+      printf "!{\\footnote ChangeButtonBinding(\"btn_up\"," .
+             "\"JumpId(\`nasmdoc.hlp',\`%s')\");\n",
+	     &hlp_sectkw($tstruct_up{$node});
+      print "EnableButton(\"btn_up\")}\n";
+      &hlp_keywords($node);
+      print "\\keepn\\f2\\b\\fs30\\sb60\\sa60\n";
+      print "$title\n";
+      $newpar = "\\par\\pard\\plain\\sb120\n";
+    } elsif ($ptype eq "appn") {
+      # Appendix heading. Begin a new node.
+      &hlp_menu($node)
+        if $tstruct_level{$tstruct_next{$node}} > $tstruct_level{$node};
+      $pflags =~ /appn (.*) :(.*)/;
+      $node = "Appendix $1";
+      $title = $footnotetitle = "Appendix $1: ";
+      foreach $i (@$pname) {
+        $ww = &word_hlp($i,1);
+	$title .= $ww, $footnotetitle .= &word_hlp($i,0) unless $ww eq "\001";
+      }
+      print "\\page\n";
+      printf "#{\\footnote %s}\n", &hlp_sectkw($node);
+      print "\${\\footnote $footnotetitle}\n";
+      printf "+{\\footnote browse:%05d}\n", ++$browse;
+      printf "!{\\footnote ChangeButtonBinding(\"btn_up\"," .
+             "\"JumpId(\`nasmdoc.hlp',\`%s')\");\n",
+	     &hlp_sectkw($tstruct_up{$node});
+      print "EnableButton(\"btn_up\")}\n";
+      &hlp_keywords($node);
+      print "\\keepn\\f2\\b\\fs30\\sb60\\sa60\n";
+      print "$title\n";
+      $newpar = "\\par\\pard\\plain\\sb120\n";
+    } elsif ($ptype eq "head" || $ptype eq "subh") {
+      # Heading or subheading. Begin a new node.
+      &hlp_menu($node)
+        if $tstruct_level{$tstruct_next{$node}} > $tstruct_level{$node};
+      $pflags =~ /.... (.*) :(.*)/;
+      $node = "Section $1";
+      $title = $footnotetitle = "$1. ";
+      foreach $i (@$pname) {
+        $ww = &word_hlp($i,1);
+	$title .= $ww, $footnotetitle .= &word_hlp($i,0) unless $ww eq "\001";
+      }
+      print "\\page\n";
+      printf "#{\\footnote %s}\n", &hlp_sectkw($node);
+      print "\${\\footnote $footnotetitle}\n";
+      printf "+{\\footnote browse:%05d}\n", ++$browse;
+      printf "!{\\footnote ChangeButtonBinding(\"btn_up\"," .
+             "\"JumpId(\`nasmdoc.hlp',\`%s')\");\n",
+	     &hlp_sectkw($tstruct_up{$node});
+      print "EnableButton(\"btn_up\")}\n";
+      &hlp_keywords($node);
+      print "\\keepn\\f2\\b\\fs30\\sb60\\sa60\n";
+      print "$title\n";
+      $newpar = "\\par\\pard\\plain\\sb120\n";
+    } elsif ($ptype eq "code") {
+      # Code paragraph.
+      print "\\keep\\f1\\sb120\n";
+      foreach $i (@$pname) {
+        warn "code line longer than 68 chars: $i\n" if length $i > 68;
+	$i =~ s/\\/\\\\/g;
+	$i =~ s/\{/\\\{/g;
+	$i =~ s/\}/\\\}/g;
+        print "$i\\par\\sb0\n";
+      }
+      $newpar = "\\pard\\f0\\sb120\n";
+    } elsif ($ptype eq "bull" || $ptype eq "norm") {
+      # Ordinary paragraph, optionally bulleted. We wrap, FWIW.
+      if ($ptype eq "bull") {
+        print "\\tx360\\li360\\fi-360{\\f3\\'9F}\\tab\n";
+	$newpar = "\\par\\pard\\sb120\n";
+      } else {
+	$newpar = "\\par\\sb120\n";
+      }
+      $line = '';
+      @a = @$pname;
+      $wd = $wprev = '';
+      do {
+        do { $w = &word_hlp((shift @a),1); } while $w eq "\001"; # hack
+	$wd .= $wprev;
+	if ($w eq ' ' || $w eq '' || $w eq undef) {
+	  if (length ($line . $wd) > 75) {
+	    $line =~ s/\s*$//; # trim trailing spaces
+	    print "$line \n"; # and put one back
+	    $line = '';
+	    $wd =~ s/^\s*//; # trim leading spaces
+	  }
+	  $line .= $wd;
+	  $wd = '';
+	}
+	$wprev = $w;
+      } while ($w ne '' && $w ne undef);
+      if ($line =~ /\S/) {
+	$line =~ s/\s*$//; # trim trailing spaces
+	print "$line\n";
+      }
+    }
+  }
+
+  # Close file.
+  print "\\page}\n";
+  select STDOUT;
+  close TEXT;
+}
+
+sub word_hlp { # turn one tagged word into RTF; $docode selects the code font for code words
+  my ($w, $docode) = @_;
+  my ($wtype, $wmajt); # parenthesised, so $wmajt is lexical too
+
+  return undef if $w eq '' || $w eq undef;
+  $wtype = substr($w,0,2); # two-character type tag
+  $wmajt = substr($wtype,0,1); # its first character, the major type
+  $w = substr($w,2); # the text after the tag
+  $w =~ s/\\/\\\\/g;
+  $w =~ s/\{/\\\{/g;
+  $w =~ s/\}/\\\}/g;
+  $w =~ s/<.*>// if $wmajt eq "w"; # remove web links
+  substr($w,0,length($w)-1) =~ s/-/\\'AD/g if $wmajt ne "x"; #nonbreakhyphens
+  if ($wmajt eq "n" || $wtype eq "e " || $wtype eq "w ") {
+    return $w;
+  } elsif ($wtype eq "sp") {
+    return ' ';
+  } elsif ($wtype eq "da") {
+    return "\\'96";
+  } elsif ($wmajt eq "c" || $wtype eq "wc") {
+    $w =~ s/ /\\'A0/g; # make spaces non-breaking
+    return $docode ? "{\\f1 ${w}}" : $w;
+  } elsif ($wtype eq "es") {
+    return "{\\i ${w}"; # opens an italic group; "ee" below supplies the closing brace
+  } elsif ($wtype eq "ee") {
+    return "${w}}";
+  } elsif ($wtype eq "eo") {
+    return "{\\i ${w}}";
+  } elsif ($wtype eq "x ") {
+    return "{\\uldb ";
+  } elsif ($wtype eq "xe") {
+    $w = &hlp_sectkw($w);
+    return "}{\\v ${w}}";
+  } elsif ($wmajt eq "i") {
+    return "\001"; # callers test for "\001" and skip the word
+  } else {
+    die "panic in word_hlp: $wtype$w\n";
+  }
+}
+
+sub hlp_menu { # print a list of hyperlinks to the nodes below $topitem
+  my ($topitem) = @_;
+  my ($item, $kword, $i, $mpname, $title, $ww); # parenthesised: all lexical
+
+  $item = $tstruct_next{$topitem}; # write_hlp only calls us when this node is at a deeper level
+  print "\\li360\\fi-360\n";
+  while ($item) {
+    $title = "";
+    $mpname = $tstruct_pname{$item};
+    foreach $i (@$mpname) {
+      $ww = &word_hlp($i, 0);
+      $title .= $ww unless $ww eq "\001"; # "\001" is word_hlp's result for "i"-type words
+    }
+    $kword = &hlp_sectkw($item); # the link target used in the hidden text
+    print "{\\uldb ${item}: $title}{\\v $kword}\\par\\sb0\n";
+    $item = $tstruct_mnext{$item}; # next node recorded at the same level
+  }
+  print "\\pard\\sb120\n";
+}
+
+sub hlp_sectkw { # map a node name to its keyword form, e.g. "Section 1.2" -> "section_1_2"
+  my $kw = shift;
+  $kw =~ tr/A-Z/a-z/; # fold ASCII capitals
+  $kw =~ tr/- ./_/; # hyphen, space and dot all become "_"
+  return $kw;
+}
+
+sub hlp_keywords { # print the K footnote listing the index entries of $node, if it has any
+  my ($node) = @_;
+  my ($lead, $emitted, $k) = ("K{\\footnote ", 0);
+  for ($k = 0; $k <= $#itags; $k++) {
+    next unless $idxnodes{$node,$itags[$k]};
+    print $lead, $hlp_index[$k];
+    ($lead, $emitted) = (";\n", $emitted + 1); # later entries follow a separator
+  }
+  print "}\n" if $emitted;
+}
+
+# Make tree structures. $tstruct_* is top-level and global.
+sub add_item { # record $item, at depth $level, in the global $tstruct_* tables
+  my ($item, $level) = @_;
+  my $deeper;
+
+  push @nodes, $item;
+  $tstruct_pname{$item} = $pname; # title words come from the global $pname
+  $tstruct_level{$item} = $level;
+  $tstruct_prev{$item} = $tstruct_previtem;
+  $tstruct_next{$tstruct_previtem} = $item;
+  $tstruct_previtem = $item;
+  $tstruct_up{$item} = $tstruct_last[$level-1]; # latest item one level up
+  $tstruct_mnext{$tstruct_last[$level]} = $item; # chain from the previous item at this level
+  $tstruct_last[$level] = $item;
+  for ($deeper=$level+1; $deeper<$MAXLEVEL; $deeper++) { $tstruct_last[$deeper] = undef; }
+}
+
+# PostScript font metric data. Used for line breaking.
+sub font_metrics { # fill the global tables @timesr, @timesi and @courr (256 entries each)
+  @timesr = ( # presumably Times-Roman widths indexed by character code -- TODO confirm
+     250,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+     250, 333, 408, 500, 500, 833, 778, 333,
+     333, 333, 500, 564, 250, 333, 250, 278,
+     500, 500, 500, 500, 500, 500, 500, 500,
+     500, 500, 278, 278, 564, 564, 564, 444,
+     921, 722, 667, 667, 722, 611, 556, 722,
+     722, 333, 389, 722, 611, 889, 722, 722,
+     556, 722, 667, 556, 611, 722, 722, 944,
+     722, 722, 611, 333, 278, 333, 469, 500,
+     333, 444, 500, 444, 500, 444, 333, 500,
+     500, 278, 278, 500, 278, 778, 500, 500,
+     500, 500, 333, 389, 278, 500, 500, 722,
+     500, 500, 444, 480, 200, 480, 541,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0, 333, 500, 500, 167, 500, 500, 500,
+     500, 180, 444, 500, 333, 333, 556, 556,
+       0, 500, 500, 500, 250,   0, 453, 350,
+     333, 444, 444, 500,1000,1000,   0, 444,
+       0, 333, 333, 333, 333, 333, 333, 333,
+     333,   0, 333, 333,   0, 333, 333, 333,
+    1000,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0, 889,   0, 276,   0,   0,   0,   0,
+     611, 722, 889, 310,   0,   0,   0,   0,
+       0, 667,   0,   0,   0, 278,   0,   0,
+     278, 500, 722, 500,   0,   0,   0,   0
+  );
+  @timesi = ( # presumably Times-Italic widths, same layout -- TODO confirm
+     250,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+     250, 333, 420, 500, 500, 833, 778, 333,
+     333, 333, 500, 675, 250, 333, 250, 278,
+     500, 500, 500, 500, 500, 500, 500, 500,
+     500, 500, 333, 333, 675, 675, 675, 500,
+     920, 611, 611, 667, 722, 611, 611, 722,
+     722, 333, 444, 667, 556, 833, 667, 722,
+     611, 722, 611, 500, 556, 722, 611, 833,
+     611, 556, 556, 389, 278, 389, 422, 500,
+     333, 500, 500, 444, 500, 444, 278, 500,
+     500, 278, 278, 444, 278, 722, 500, 500,
+     500, 500, 389, 389, 278, 500, 444, 667,
+     444, 444, 389, 400, 275, 400, 541,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0, 389, 500, 500, 167, 500, 500, 500,
+     500, 214, 556, 500, 333, 333, 500, 500,
+       0, 500, 500, 500, 250,   0, 523, 350,
+     333, 556, 556, 500, 889,1000,   0, 500,
+       0, 333, 333, 333, 333, 333, 333, 333,
+     333,   0, 333, 333,   0, 333, 333, 333,
+     889,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0, 889,   0, 276,   0,   0,   0,   0,
+     556, 722, 944, 310,   0,   0,   0,   0,
+       0, 667,   0,   0,   0, 278,   0,   0,
+     278, 500, 667, 500,   0,   0,   0,   0
+  );
+  @courr = ( # presumably Courier widths: every non-zero entry is 600 -- TODO confirm
+     600,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0, 600, 600, 600, 600, 600, 600, 600,
+     600, 600, 600, 600, 600, 600, 600, 600,
+       0, 600, 600, 600, 600,   0, 600, 600,
+     600, 600, 600, 600, 600, 600,   0, 600,
+       0, 600, 600, 600, 600, 600, 600, 600,
+     600,   0, 600, 600,   0, 600, 600, 600,
+     600,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,
+       0, 600,   0, 600,   0,   0,   0,   0,
+     600, 600, 600, 600,   0,   0,   0,   0,
+       0, 600,   0,   0,   0, 600,   0,   0,
+     600, 600, 600, 600,   0,   0,   0,   0
+  );
+}
diff --git a/eval.c b/eval.c
new file mode 100644
index 00000000..0e81c928
--- /dev/null
+++ b/eval.c
@@ -0,0 +1,761 @@
+/* eval.c    expression evaluator for the Netwide Assembler
+ *
+ * The Netwide Assembler is copyright (C) 1996 Simon Tatham and
+ * Julian Hall. All rights reserved. The software is
+ * redistributable under the licence given in the file "Licence"
+ * distributed in the NASM archive.
+ *
+ * initial version 27/iii/95 by Simon Tatham
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "nasm.h"
+#include "nasmlib.h"
+#include "eval.h"
+
+static expr **tempexprs = NULL;
+static int ntempexprs, tempexprs_size = 0;
+#define TEMPEXPRS_DELTA 128
+
+static expr *tempexpr;
+static int ntempexpr, tempexpr_size;
+#define TEMPEXPR_DELTA 8
+
+static scanner scan;
+static void *scpriv;
+static struct tokenval *tokval;
+static efunc error;
+static int i;
+static int seg, ofs;
+static char *label = NULL, special_empty_string[] = "";
+static lfunc labelfunc;
+static struct ofmt *outfmt;
+static int *forward;
+
+static struct eval_hints *hint;
+
+/*
+ * Construct a temporary expression.
+ */
+static void begintemp(void) {
+    ntempexpr = tempexpr_size = 0;     /* no terms yet, no storage yet */
+    tempexpr = NULL;		       /* addtotemp() will nasm_realloc() it */
+}
+
+static void addtotemp(long type, long value) {
+    expr *slot;			       /* the element being appended */
+    while (tempexpr_size <= ntempexpr) {   /* full: grow by one chunk */
+	tempexpr_size += TEMPEXPR_DELTA;
+	tempexpr = nasm_realloc(tempexpr, tempexpr_size*sizeof(*tempexpr));
+    }
+    slot = tempexpr + ntempexpr++;
+    slot->type = type, slot->value = value;
+}
+
+static expr *finishtemp(void) {
+    addtotemp (0L, 0L);		       /* a type-0 entry ends the vector */
+    while (tempexprs_size <= ntempexprs) { /* registry full: grow it */
+	tempexprs_size += TEMPEXPRS_DELTA;
+	tempexprs = nasm_realloc(tempexprs, tempexprs_size*sizeof(*tempexprs));
+    }
+    tempexprs[ntempexprs] = tempexpr;  /* kept so evaluate() can free it later */
+    return tempexprs[ntempexprs++];
+}
+
+/*
+ * Add two vector datatypes. We have some bizarre behaviour on far-
+ * absolute segment types: we preserve them during addition _only_
+ * if one of the segments is a truly pure scalar.
+ */
+static expr *add_vectors(expr *p, expr *q) {
+    int preserve;		       /* copy far-absolute terms into the sum? */
+
+    preserve = is_really_simple(p) || is_really_simple(q);
+
+    begintemp();
+
+    /* merge step; NOTE(review): relies on both vectors being in ascending type order -- confirm */
+    while (p->type && q->type &&
+	   p->type < EXPR_SEGBASE+SEG_ABS &&
+	   q->type < EXPR_SEGBASE+SEG_ABS) {
+	int lasttype;		       /* type of the term just emitted */
+
+    	if (p->type > q->type) {
+	    addtotemp(q->type, q->value);
+	    lasttype = q++->type;
+	} else if (p->type < q->type) {
+	    addtotemp(p->type, p->value);
+	    lasttype = p++->type;
+	} else {		       /* *p and *q have same type */
+	    addtotemp(p->type, p->value + q->value);
+	    lasttype = p->type;
+	    p++, q++;
+	}
+	if (lasttype == EXPR_UNKNOWN) { /* stop here: nothing after an unknown term is copied */
+	    return finishtemp();
+	}
+    }
+    while (p->type &&		       /* copy what is left of p */
+	   (preserve || p->type < EXPR_SEGBASE+SEG_ABS)) {
+	addtotemp(p->type, p->value);
+	p++;
+    }
+    while (q->type &&		       /* copy what is left of q */
+	   (preserve || q->type < EXPR_SEGBASE+SEG_ABS)) {
+	addtotemp(q->type, q->value);
+	q++;
+    }
+
+    return finishtemp();
+}
+
+/*
+ * Multiply a vector by a scalar. Strip far-absolute segment part
+ * if present.
+ *
+ * Explicit treatment of UNKNOWN is not required in this routine,
+ * since it will silently do the Right Thing anyway.
+ *
+ * If `affect_hints' is set, we also change the hint type to
+ * NOTBASE if a MAKEBASE hint points at a register being
+ * multiplied. This allows [eax*1+ebx] to hint EBX rather than EAX
+ * as the base register.
+ */
+static expr *scalar_mult(expr *vect, long scalar, int affect_hints) {
+    expr *term;
+
+    /* scale, in place, every term below the far-absolute range */
+    for (term = vect; term->type && term->type < EXPR_SEGBASE+SEG_ABS; term++) {
+	term->value = scalar * (term->value);
+	if (affect_hints && hint && hint->type == EAH_MAKEBASE &&
+	    hint->base == term->type)
+	    hint->type = EAH_NOTBASE;
+    }
+    term->type = 0;		       /* truncate: drops any far-absolute part */
+
+    return vect;
+}
+
+static expr *scalarvect (long scalar) {
+    /* a fresh vector holding the single term <EXPR_SIMPLE, scalar> */
+    begintemp(), addtotemp(EXPR_SIMPLE, scalar);
+    return finishtemp();
+}
+
+static expr *unknown_expr (void) {
+    /* a fresh vector holding the single term <EXPR_UNKNOWN, 1> */
+    begintemp(), addtotemp(EXPR_UNKNOWN, 1L);
+    return finishtemp();
+}
+
+/*
+ * The SEG operator: calculate the segment part of a relocatable
+ * value. Return NULL, as usual, if an error occurs. Report the
+ * error too.
+ */
+static expr *segment_part (expr *e) {
+    long segno;			       /* not `seg': avoids shadowing the static */
+
+    if (is_unknown(e))
+	return unknown_expr();
+
+    /*
+     * A non-relocatable expression and a relocatable one without
+     * any segment draw the same complaint.
+     */
+    if (!is_reloc(e) || (segno = reloc_seg(e)) == NO_SEG) {
+	error(ERR_NONFATAL, "cannot apply SEG to a non-relocatable value");
+	return NULL;
+    }
+
+    if (segno & SEG_ABS)	       /* far-absolute: the segment is a plain number */
+	return scalarvect(segno & ~SEG_ABS);
+
+    if (segno & 1) {
+	error(ERR_NONFATAL, "SEG applied to something which"
+	      " is already a segment base");
+	return NULL;
+    }
+
+    segno = outfmt->segbase(segno+1);  /* ask the output format for the base */
+
+    begintemp();
+    addtotemp((segno == NO_SEG ? EXPR_UNKNOWN : EXPR_SEGBASE+segno), 1L);
+    return finishtemp();
+}
+
+/*
+ * Recursive-descent parser. Called with a single boolean operand,
+ * which is TRUE if the evaluation is critical (i.e. unresolved
+ * symbols are an error condition). Must update the global `i' to
+ * reflect the token after the parsed string. May return NULL.
+ *
+ * evaluate() should report its own errors: on return it is assumed
+ * that if NULL has been returned, the error has already been
+ * reported.
+ */
+
+/*
+ * Grammar parsed is:
+ *
+ * expr  : bexpr [ WRT expr6 ]
+ * bexpr : rexp0 or expr0 depending on relative-mode setting
+ * rexp0 : rexp1 [ {||} rexp1...]
+ * rexp1 : rexp2 [ {^^} rexp2...]
+ * rexp2 : rexp3 [ {&&} rexp3...]
+ * rexp3 : expr0 [ {=,==,<>,!=,<,>,<=,>=} expr0 ]
+ * expr0 : expr1 [ {|} expr1...]
+ * expr1 : expr2 [ {^} expr2...]
+ * expr2 : expr3 [ {&} expr3...]
+ * expr3 : expr4 [ {<<,>>} expr4...]
+ * expr4 : expr5 [ {+,-} expr5...]
+ * expr5 : expr6 [ {*,/,%,//,%%} expr6...]
+ * expr6 : { ~,+,-,SEG } expr6
+ *       | (bexpr)
+ *       | symbol
+ *       | $
+ *       | number
+ */
+
+static expr *rexp0(int), *rexp1(int), *rexp2(int), *rexp3(int);
+
+static expr *expr0(int), *expr1(int), *expr2(int), *expr3(int);
+static expr *expr4(int), *expr5(int), *expr6(int);
+
+static expr *(*bexpr)(int);
+
+static expr *rexp0(int critical) {
+    expr *e, *f;		       /* running result, right-hand operand */
+
+    e = rexp1(critical);	       /* rexp0 : rexp1 [ {||} rexp1...] */
+    if (!e)
+	return NULL;
+    while (i == TOKEN_DBL_OR) {
+	i = scan(scpriv, tokval);
+	f = rexp1(critical);
+	if (!f)
+	    return NULL;
+	if (!(is_simple(e) || is_just_unknown(e)) ||
+	    !(is_simple(f) || is_just_unknown(f))) {
+	    error(ERR_NONFATAL, "`||' operator may only be applied to"
+		  " scalar values");
+	}			       /* reported, but evaluation carries on */
+	if (is_just_unknown(e) || is_just_unknown(f))
+	    e = unknown_expr();
+	else			       /* C truth value: the result is 0 or 1 */
+	    e = scalarvect ((long) (reloc_value(e) || reloc_value(f)));
+    }
+    return e;
+}
+
+static expr *rexp1(int critical) {
+    expr *e, *f;		       /* running result, right-hand operand */
+
+    e = rexp2(critical);	       /* rexp1 : rexp2 [ {^^} rexp2...] */
+    if (!e)
+	return NULL;
+    while (i == TOKEN_DBL_XOR) {
+	i = scan(scpriv, tokval);
+	f = rexp2(critical);
+	if (!f)
+	    return NULL;
+	if (!(is_simple(e) || is_just_unknown(e)) ||
+	    !(is_simple(f) || is_just_unknown(f))) {
+	    error(ERR_NONFATAL, "`^^' operator may only be applied to"
+		  " scalar values");
+	}			       /* reported, but evaluation carries on */
+	if (is_just_unknown(e) || is_just_unknown(f))
+	    e = unknown_expr();
+	else			       /* `!' first, so only truth values are XORed */
+	    e = scalarvect ((long) (!reloc_value(e) ^ !reloc_value(f)));
+    }
+    return e;
+}
+
+static expr *rexp2(int critical) {
+    expr *e, *f;		       /* running result, right-hand operand */
+
+    e = rexp3(critical);	       /* rexp2 : rexp3 [ {&&} rexp3...] */
+    if (!e)
+	return NULL;
+    while (i == TOKEN_DBL_AND) {
+	i = scan(scpriv, tokval);
+	f = rexp3(critical);
+	if (!f)
+	    return NULL;
+	if (!(is_simple(e) || is_just_unknown(e)) ||
+	    !(is_simple(f) || is_just_unknown(f))) {
+	    error(ERR_NONFATAL, "`&&' operator may only be applied to"
+		  " scalar values");
+	}			       /* reported, but evaluation carries on */
+	if (is_just_unknown(e) || is_just_unknown(f))
+	    e = unknown_expr();
+	else			       /* C truth value: the result is 0 or 1 */
+	    e = scalarvect ((long) (reloc_value(e) && reloc_value(f)));
+    }
+    return e;
+}
+
+static expr *rexp3(int critical) {
+    expr *e, *f;
+    long v;			       /* 1 = true, 0 = false, -1 = unknown */
+
+    e = expr0(critical);	       /* rexp3 : expr0 [ {=,==,<>,!=,<,>,<=,>=} expr0 ] */
+    if (!e)
+	return NULL;
+    while (i == TOKEN_EQ || i == TOKEN_LT || i == TOKEN_GT ||
+	   i == TOKEN_NE || i == TOKEN_LE || i == TOKEN_GE) {
+	int j = i;		       /* the operator, saved before scanning on */
+	i = scan(scpriv, tokval);
+	f = expr0(critical);
+	if (!f)
+	    return NULL;
+	e = add_vectors (e, scalar_mult(f, -1L, FALSE));   /* e = e - f */
+	switch (j) {
+	  case TOKEN_EQ: case TOKEN_NE:
+	    if (is_unknown(e))
+		v = -1;		       /* means unknown */
+	    else if (!is_really_simple(e) || reloc_value(e) != 0)
+		v = (j == TOKEN_NE);   /* unequal, so return TRUE if NE */
+	    else
+		v = (j == TOKEN_EQ);   /* equal, so return TRUE if EQ */
+	    break;
+	  default:
+	    if (is_unknown(e))
+		v = -1;		       /* means unknown */
+	    else if (!is_really_simple(e)) {
+		error(ERR_NONFATAL, "`%s': operands differ by a non-scalar",
+		      (j == TOKEN_LE ? "<=" : j == TOKEN_LT ? "<" :
+		       j == TOKEN_GE ? ">=" : ">"));
+		v = 0;		       /* must set it to _something_ */
+	    } else {
+		long vv = reloc_value(e);  /* long, not int: never truncate the difference */
+		if (vv == 0)
+		    v = (j == TOKEN_LE || j == TOKEN_GE);
+		else if (vv > 0)
+		    v = (j == TOKEN_GE || j == TOKEN_GT);
+		else /* vv < 0 */
+		    v = (j == TOKEN_LE || j == TOKEN_LT);
+	    }
+	    break;
+	}
+	if (v == -1)
+	    e = unknown_expr();
+	else
+	    e = scalarvect(v);
+    }
+    return e;
+}
+
+static expr *expr0(int critical) {
+    expr *lhs, *rhs;
+
+    /* expr0 : expr1 [ {|} expr1...] */
+    lhs = expr1(critical);
+    if (lhs == NULL)
+	return NULL;
+    while (i == '|') {
+	i = scan(scpriv, tokval);
+	rhs = expr1(critical);
+	if (rhs == NULL)
+	    return NULL;
+	if ((!is_simple(lhs) && !is_just_unknown(lhs)) ||
+	    (!is_simple(rhs) && !is_just_unknown(rhs)))
+	    error(ERR_NONFATAL, "`|' operator may only be applied to"
+		  " scalar values");
+	if (!is_just_unknown(lhs) && !is_just_unknown(rhs))
+	    lhs = scalarvect (reloc_value(lhs) | reloc_value(rhs));
+	else
+	    lhs = unknown_expr();
+    }
+    return lhs;
+}
+
+static expr *expr1(int critical) {
+    expr *lhs, *rhs;
+
+    /* expr1 : expr2 [ {^} expr2...] */
+    lhs = expr2(critical);
+    if (lhs == NULL)
+	return NULL;
+    while (i == '^') {
+	i = scan(scpriv, tokval);
+	rhs = expr2(critical);
+	if (rhs == NULL)
+	    return NULL;
+	if ((!is_simple(lhs) && !is_just_unknown(lhs)) ||
+	    (!is_simple(rhs) && !is_just_unknown(rhs)))
+	    error(ERR_NONFATAL, "`^' operator may only be applied to"
+		  " scalar values");
+	if (!is_just_unknown(lhs) && !is_just_unknown(rhs))
+	    lhs = scalarvect (reloc_value(lhs) ^ reloc_value(rhs));
+	else
+	    lhs = unknown_expr();
+    }
+    return lhs;
+}
+
+static expr *expr2(int critical) {
+    expr *lhs, *rhs;
+
+    /* expr2 : expr3 [ {&} expr3...] */
+    lhs = expr3(critical);
+    if (lhs == NULL)
+	return NULL;
+    while (i == '&') {
+	i = scan(scpriv, tokval);
+	rhs = expr3(critical);
+	if (rhs == NULL)
+	    return NULL;
+	if ((!is_simple(lhs) && !is_just_unknown(lhs)) ||
+	    (!is_simple(rhs) && !is_just_unknown(rhs)))
+	    error(ERR_NONFATAL, "`&' operator may only be applied to"
+		  " scalar values");
+	if (!is_just_unknown(lhs) && !is_just_unknown(rhs))
+	    lhs = scalarvect (reloc_value(lhs) & reloc_value(rhs));
+	else
+	    lhs = unknown_expr();
+    }
+    return lhs;
+}
+
+static expr *expr3(int critical) {
+    expr *lhs, *rhs;
+    int op;
+
+    lhs = expr4(critical);
+    if (lhs == NULL)
+	return NULL;
+    while (i == TOKEN_SHL || i == TOKEN_SHR) {   /* expr3 : expr4 [ {<<,>>} expr4...] */
+	op = i;			       /* remember which shift before scanning on */
+	i = scan(scpriv, tokval);
+	rhs = expr4(critical);
+	if (rhs == NULL)
+	    return NULL;
+	if ((!is_simple(lhs) && !is_just_unknown(lhs)) ||
+	    (!is_simple(rhs) && !is_just_unknown(rhs))) {
+	    /* complain, but keep the left operand as the running result */
+	    error(ERR_NONFATAL, "shift operator may only be applied to"
+		  " scalar values");
+	    continue;
+	}
+	if (is_just_unknown(lhs) || is_just_unknown(rhs))
+	    lhs = unknown_expr();
+	else if (op == TOKEN_SHL)
+	    lhs = scalarvect (reloc_value(lhs) << reloc_value(rhs));
+	else			       /* TOKEN_SHR: shifts the unsigned value */
+	    lhs = scalarvect (((unsigned long)reloc_value(lhs)) >>
+			      reloc_value(rhs));
+    }
+    return lhs;
+}
+
+static expr *expr4(int critical) {
+    expr *sum, *term;
+    int op;
+
+    sum = expr5(critical);
+    if (sum == NULL)
+	return NULL;
+    /*
+     * expr4 : expr5 [ {+,-} expr5...]
+     */
+    while (i == '+' || i == '-') {
+	op = i;
+	i = scan(scpriv, tokval);
+	term = expr5(critical);
+	if (term == NULL)
+	    return NULL;
+	/* subtraction is addition of the negated right-hand side */
+	if (op == '-')
+	    term = scalar_mult(term, -1L, FALSE);
+	sum = add_vectors (sum, term);
+    }
+    return sum;
+}
+
+static expr *expr5(int critical) {
+    expr *e, *f;		       /* running result, right-hand operand */
+
+    e = expr6(critical);	       /* expr5 : expr6 [ {*,/,%,//,%%} expr6...] */
+    if (!e)
+	return NULL;
+    while (i == '*' || i == '/' || i == '%' ||
+	   i == TOKEN_SDIV || i == TOKEN_SMOD) {
+	int j = i;		       /* the operator, saved before scanning on */
+	i = scan(scpriv, tokval);
+	f = expr6(critical);
+	if (!f)
+	    return NULL;
+	if (j != '*' && (!(is_simple(e) || is_just_unknown(e)) ||
+			 !(is_simple(f) || is_just_unknown(f)))) {
+	    error(ERR_NONFATAL, "division operator may only be applied to"
+		  " scalar values");
+	    return NULL;
+	}
+	if (j != '*' && !is_unknown(f) && reloc_value(f) == 0) {
+	    error(ERR_NONFATAL, "division by zero");
+	    return NULL;
+	}
+	switch (j) {
+	  case '*':		       /* one side must be a scalar; the other is scaled in place */
+	    if (is_simple(e))
+		e = scalar_mult (f, reloc_value(e), TRUE);
+	    else if (is_simple(f))
+		e = scalar_mult (e, reloc_value(f), TRUE);
+	    else if (is_just_unknown(e) && is_just_unknown(f))
+		e = unknown_expr();
+	    else {
+		error(ERR_NONFATAL, "unable to multiply two "
+		      "non-scalar objects");
+		return NULL;
+	    }
+	    break;
+	  case '/':		       /* unsigned division */
+	    if (is_just_unknown(e) || is_just_unknown(f))
+		e = unknown_expr();
+	    else
+		e = scalarvect (((unsigned long)reloc_value(e)) /
+				((unsigned long)reloc_value(f)));
+	    break;
+	  case '%':		       /* unsigned remainder */
+	    if (is_just_unknown(e) || is_just_unknown(f))
+		e = unknown_expr();
+	    else
+		e = scalarvect (((unsigned long)reloc_value(e)) %
+				((unsigned long)reloc_value(f)));
+	    break;
+	  case TOKEN_SDIV:	       /* signed division */
+	    if (is_just_unknown(e) || is_just_unknown(f))
+		e = unknown_expr();
+	    else
+		e = scalarvect (((signed long)reloc_value(e)) /
+				((signed long)reloc_value(f)));
+	    break;
+	  case TOKEN_SMOD:	       /* signed remainder */
+	    if (is_just_unknown(e) || is_just_unknown(f))
+		e = unknown_expr();
+	    else
+		e = scalarvect (((signed long)reloc_value(e)) %
+				((signed long)reloc_value(f)));
+	    break;
+	}
+    }
+    return e;
+}
+
+static expr *expr6(int critical) {
+    long type;
+    expr *e;
+    long label_seg, label_ofs;
+
+    if (i == '-') {		       /* unary minus */
+	i = scan(scpriv, tokval);
+	e = expr6(critical);
+	if (!e)
+	    return NULL;
+	return scalar_mult (e, -1L, FALSE);
+    } else if (i == '+') {	       /* unary plus: nothing to do */
+	i = scan(scpriv, tokval);
+	return expr6(critical);
+    } else if (i == '~') {	       /* bitwise NOT, scalars only */
+	i = scan(scpriv, tokval);
+	e = expr6(critical);
+	if (!e)
+	    return NULL;
+	if (is_just_unknown(e))
+	    return unknown_expr();
+	else if (!is_simple(e)) {
+	    error(ERR_NONFATAL, "`~' operator may only be applied to"
+		  " scalar values");
+	    return NULL;
+	}
+	return scalarvect(~reloc_value(e));
+    } else if (i == TOKEN_SEG) {
+	i = scan(scpriv, tokval);
+	e = expr6(critical);
+	if (!e)
+	    return NULL;
+	e = segment_part(e);	       /* NULL when SEG cannot be applied */
+	if (e && is_unknown(e) && critical) {   /* never test a NULL result */
+	    error(ERR_NONFATAL, "unable to determine segment base");
+	    return NULL;
+	}
+	return e;		       /* if NULL, segment_part() has reported why */
+    } else if (i == '(') {
+	i = scan(scpriv, tokval);
+	e = bexpr(critical);
+	if (!e)
+	    return NULL;
+	if (i != ')') {
+	    error(ERR_NONFATAL, "expecting `)'");
+	    return NULL;
+	}
+	i = scan(scpriv, tokval);
+	return e;
+    } else if (i == TOKEN_NUM || i == TOKEN_REG || i == TOKEN_ID ||
+	       i == TOKEN_HERE || i == TOKEN_BASE) {
+	begintemp();
+	switch (i) {
+	  case TOKEN_NUM:
+	    addtotemp(EXPR_SIMPLE, tokval->t_integer);
+	    break;
+	  case TOKEN_REG:
+	    addtotemp(tokval->t_integer, 1L);
+	    if (hint && hint->type == EAH_NOHINT)
+		hint->base = tokval->t_integer, hint->type = EAH_MAKEBASE;
+	    break;
+	  case TOKEN_ID:
+	  case TOKEN_HERE:
+	  case TOKEN_BASE:
+	    /*
+	     * If "label" begins with "%", this indicates that no
+	     * symbol, Here or Base references are valid because we
+	     * are in preprocess-only mode.
+	     */
+	    if (*label == '%') {
+		error(ERR_NONFATAL,
+		      "%s not supported in preprocess-only mode",
+		      (i == TOKEN_ID ? "symbol references" :
+		       i == TOKEN_HERE ? "`$'" : "`$$'"));
+		addtotemp(EXPR_UNKNOWN, 1L);
+		break;
+	    }
+
+	    /*
+	     * Since the whole line is parsed before the label it
+	     * defines is given to the label manager, we have
+	     * problems with lines such as
+	     *
+	     *   end: TIMES 512-(end-start) DB 0
+	     *
+	     * where `end' is not known on pass one, despite not
+	     * really being a forward reference, and due to
+	     * criticality it is _needed_. Hence we check our label
+	     * against the currently defined one, and do our own
+	     * resolution of it if we have to.
+	     */
+	    type = EXPR_SIMPLE;	       /* might get overridden by UNKNOWN */
+	    if (i == TOKEN_BASE) {
+		label_seg = seg;
+		label_ofs = 0;
+	    } else if (i == TOKEN_HERE || !strcmp(tokval->t_charptr, label)) {
+		label_seg = seg;
+		label_ofs = ofs;
+	    } else if (!labelfunc(tokval->t_charptr,&label_seg,&label_ofs)) {
+		if (critical == 2) {
+		    error (ERR_NONFATAL, "symbol `%s' undefined",
+			   tokval->t_charptr);
+		    return NULL;
+		} else if (critical == 1) {
+		    error (ERR_NONFATAL, "symbol `%s' not defined before use",
+			   tokval->t_charptr);
+		    return NULL;
+		} else {	       /* not critical: record a forward reference */
+		    if (forward)
+			*forward = TRUE;
+		    type = EXPR_UNKNOWN;
+		    label_seg = NO_SEG;
+		    label_ofs = 1;
+		}
+	    }
+	    addtotemp(type, label_ofs);
+	    if (label_seg!=NO_SEG)
+		addtotemp(EXPR_SEGBASE + label_seg, 1L);
+	    break;
+	}
+	i = scan(scpriv, tokval);
+	return finishtemp();
+    } else {
+	error(ERR_NONFATAL, "expression syntax error");
+	return NULL;
+    }
+}
+
+void eval_global_info (struct ofmt *output, lfunc lookup_label) {
+    labelfunc = lookup_label;	       /* how expr6() resolves symbol names */
+    outfmt = output;		       /* supplies segbase() to segment_part() */
+}
+
+void eval_info (char *labelname, long segment, long offset) {
+    if (label != special_empty_string)
+	nasm_free (label);	       /* release the previous heap copy */
+    if (!labelname) {		       /* no label given: record the position instead */
+	label = special_empty_string;
+	seg = segment;
+	ofs = offset;
+	return;
+    }
+    label = nasm_strdup(labelname);
+}
+
+expr *evaluate (scanner sc, void *scprivate, struct tokenval *tv,
+		int *fwref, int critical, efunc report_error,
+		struct eval_hints *hints) {
+    expr *e;			       /* the expression proper */
+    expr *f = NULL;		       /* right-hand side of WRT, if there is one */
+
+    hint = hints;
+    if (hint)
+	hint->type = EAH_NOHINT;
+
+    if (critical & 0x10) {	       /* bit 0x10 selects the relational grammar */
+	critical &= ~0x10;
+	bexpr = rexp0;
+    } else
+	bexpr = expr0;
+
+    scan = sc;			       /* stash the arguments in file-level statics */
+    scpriv = scprivate;
+    tokval = tv;
+    error = report_error;
+    forward = fwref;
+
+    if (tokval->t_type == TOKEN_INVALID)   /* no current token: fetch one */
+	i = scan(scpriv, tokval);
+    else
+	i = tokval->t_type;
+
+    while (ntempexprs)		       /* free the vectors built by the previous call */
+	nasm_free (tempexprs[--ntempexprs]);
+
+    e = bexpr (critical);
+    if (!e)
+	return NULL;
+
+    if (i == TOKEN_WRT) {
+	i = scan(scpriv, tokval);      /* eat the WRT */
+	f = expr6 (critical);
+	if (!f)
+	    return NULL;
+    }
+    e = scalar_mult (e, 1L, FALSE);    /* strip far-absolute segment part */
+    if (f) {
+	expr *g;		       /* the WRT term to be added to e */
+	if (is_just_unknown(f))
+	    g = unknown_expr();
+	else {
+	    long value;
+	    begintemp();
+	    if (!is_reloc(f)) {
+		error(ERR_NONFATAL, "invalid right-hand operand to WRT");
+		return NULL;
+	    }
+	    value = reloc_seg(f);
+	    if (value == NO_SEG)       /* plain number: flag it with SEG_ABS */
+		value = reloc_value(f) | SEG_ABS;
+	    else if (!(value & SEG_ABS) && !(value % 2) && critical) {
+		error(ERR_NONFATAL, "invalid right-hand operand to WRT");
+		return NULL;
+	    }
+	    addtotemp(EXPR_WRT, value);
+	    g = finishtemp();
+	}
+	e = add_vectors (e, g);
+    }
+    return e;
+}
diff --git a/eval.h b/eval.h
new file mode 100644
index 00000000..26bde15a
--- /dev/null
+++ b/eval.h
@@ -0,0 +1,34 @@
+/* eval.h   header file for eval.c
+ *
+ * The Netwide Assembler is copyright (C) 1996 Simon Tatham and
+ * Julian Hall. All rights reserved. The software is
+ * redistributable under the licence given in the file "Licence"
+ * distributed in the NASM archive.
+ */
+
+#ifndef NASM_EVAL_H
+#define NASM_EVAL_H
+
+/*
+ * Called once to tell the evaluator what output format is
+ * providing segment-base details, and what function can be used to
+ * look labels up.
+ */
+void eval_global_info (struct ofmt *output, lfunc lookup_label);
+
+/*
+ * Called to set the information the evaluator needs: the value of
+ * $ is set from `segment' and `offset' if `labelname' is NULL, and
+ * otherwise the name of the current line's label is set from
+ * `labelname' instead.
+ */
+void eval_info (char *labelname, long segment, long offset);
+
+/*
+ * The evaluator itself.
+ */
+expr *evaluate (scanner sc, void *scprivate, struct tokenval *tv,
+		int *fwref, int critical, efunc report_error,
+		struct eval_hints *hints);
+
+#endif
diff --git a/insns.bas b/insns.bas
new file mode 100644
index 00000000..ea52bcdb
--- /dev/null
+++ b/insns.bas
@@ -0,0 +1,535 @@
+' INFO_1: Converter for INSNS.DAT to INSNSA.C and INSNSD.C
+'
+' INFO_2: Written by Mark Junker in 1997
+'         InterNet: mjs@prg.hannover.sgh-net.de
+'         FIDO:     Mark Junker@2:2437/47.21
+'
+' COMMENT: While writing this program I often asked myself whether it
+'          wouldn't be easier to write an interpreter for Perl scripts :]
+'
+' COMMENT: To start the program press SHIFT+F5 within the QBasic IDE
+'          or start it from the command-line with QBASIC /RUN INSNS
+'
+
+DEFINT A-Z                ' names without a type suffix default to INTEGER
+
+DECLARE FUNCTION ReplaceOp$ (a$)
+DECLARE FUNCTION StrTrimLeft$ (a$, b$)
+DECLARE FUNCTION StrTrimRight$ (a$, b$)
+DECLARE FUNCTION StrTrim$ (a$, b$)
+DECLARE SUB StrSplitString (SplitString$, SplitChars$, SplitField$(), SplitCount%)
+DECLARE FUNCTION Min% (a%, b%)
+DECLARE FUNCTION StrInstrLeft% (SearchStart%, SearchIn$, SearchFor$)
+DECLARE FUNCTION StrAscii% (a$)
+
+
+CONST MaxOpCodeBase = 3   ' only used to size OpCodeByte below
+CONST MaxOpCodeType = 8   ' only used to size OpCodeByte below
+
+CLS
+DIM LineData$(1 TO 2)     ' input line split at ";" - element 1 is the part that gets parsed
+DIM StrucData$(1 TO 5)    ' 1 = mnemonic, 2 = operands, 3 = code string, 4 = flags, 5 = operand count
+DIM OpCodeList$(0 TO 255) ' per first opcode byte: packed MKI$ indexes into instrux[]
+DIM OpCodeByte(1 TO MaxOpCodeType, 1 TO MaxOpCodeBase)   ' not referenced anywhere else in this file
+DIM OpCodeStat(1 TO 10)   ' don't need mode :)  (not referenced anywhere else in this file)
+
+Instructs$ = ""           ' CHR$(0)-terminated list of distinct mnemonics, filled while writing insnsa.c
+LineOfs$ = ""             ' index of insns.tmp: per record MKL$(position) + MKI$(length)
+
+OPEN "I", 1, "insns.dat"  ' #1: instruction table, sequential input
+OPEN "B", 3, "insns.tmp"  ' #3: binary scratch file, deleted again at the end of the run
+
+qt$ = CHR$(34)            ' double quote, for emitting C string literals
+crlf$ = CHR$(13) + CHR$(10)
+
+
+'
+' Pass 1: preprocess INSNS.DAT into length-prefixed records in insns.tmp
+'
+
+HexChar$ = "0123456789ABCDEF"   ' not referenced anywhere else in this file
+
+PRINT "Preprocessing INSNS.DAT"
+OpCodes = 0                     ' number of records written to insns.tmp
+OpCodeDebug = 0                 ' index the next entry will get in instrux[] (insnsd.c)
+NowLineOfs& = 1                 ' next write position in insns.tmp
+lineNr = 0
+WHILE NOT EOF(1)
+  lineNr = lineNr + 1
+  IF (lineNr AND 15) = 0 THEN   ' refresh the progress display every 16 lines
+    LOCATE , 1
+    PRINT lineNr, OpCodes, OpCodeDebug;
+  END IF
+
+  LINE INPUT #1, l$
+  CALL StrSplitString(l$, ";", LineData$(), SplitCount)   ' cut the line at ";"
+  IF SplitCount THEN
+    LineData$(1) = StrTrim$(LineData$(1), CHR$(9) + " ")  ' trim tabs and blanks
+    IF LEN(LineData$(1)) THEN
+      CALL StrSplitString(LineData$(1), " ", StrucData$(), cntSplit)
+      IF cntSplit <> 4 THEN
+        PRINT "line"; lineNr; " does not contain four fields"
+        END
+      END IF
+
+      tst$ = UCASE$(StrucData$(2))   ' --- field 2: operand list -> C flag expressions
+      res$ = ""
+      cnt% = 1                       ' operand count
+      isfirst = 1                    ' 1 while no "|" part of the current operand was emitted yet
+      op = 1                         ' start of the current token
+      p = StrInstrLeft(1, tst$ + ",", "|:,")   ' the appended "," terminates the last token
+      WHILE p
+        h$ = ReplaceOp$(MID$(tst$, op, p - op))
+        IF LEN(h$) THEN
+          SELECT CASE MID$(tst$, p, 1)
+          CASE ""                    ' p is on the appended ",": end of the operand list
+            IF isfirst THEN
+              res$ = res$ + h$
+            ELSE
+              res$ = res$ + "|" + h$
+            END IF
+            isfirst = 0
+          CASE ","                   ' end of one operand, another follows
+            IF isfirst THEN
+              res$ = res$ + h$ + ","
+            ELSE
+              res$ = res$ + "|" + h$ + ","
+            END IF
+            cnt% = cnt% + 1
+            isfirst = 1
+          CASE "|"                   ' more flags follow for the same operand
+            IF isfirst THEN
+              res$ = res$ + h$
+            ELSE
+              res$ = res$ + "|" + h$
+            END IF
+            isfirst = 0
+          CASE ":"                   ' "a:b" becomes two operands, the first flagged COLON
+            res$ = res$ + h$ + "|COLON,"   ' NOTE(review): isfirst is neither consulted nor reset here
+            cnt% = cnt% + 1
+          END SELECT
+        END IF
+        op = p + 1
+        p = StrInstrLeft(op, tst$ + ",", "|:,")
+      WEND
+      FOR a = cnt% + 1 TO 3          ' pad the list to three operands
+        res$ = res$ + ",0"
+      NEXT
+      StrucData$(2) = res$
+      IF LEFT$(res$, 2) = "0," THEN cnt% = cnt% - 1   ' first operand "0" (void) does not count
+      StrucData$(5) = LTRIM$(STR$(cnt%))
+
+      NoDebug = 0                    ' --- field 4: flag list -> IF_xxx|IF_yyy
+      res$ = ""
+      tst$ = UCASE$(StrucData$(4))
+      op = 1
+      p = INSTR(tst$ + ",", ",")
+      isfirst = 1
+      WHILE p
+        h$ = MID$(tst$, op, p - op)
+        IF h$ = "ND" THEN            ' ND is not emitted as a flag; it keeps the entry out of insnsd.c
+          NoDebug = 1
+        ELSE
+          IF isfirst THEN
+            res$ = res$ + "IF_" + h$
+          ELSE
+            res$ = res$ + "|IF_" + h$
+          END IF
+          isfirst = 0
+        END IF
+        op = p + 1
+        p = INSTR(op, tst$ + ",", ",")
+      WEND
+      StrucData$(4) = res$
+
+      tst$ = UCASE$(StrucData$(3))   ' --- field 3: code string -> first opcode byte(s)
+      SELECT CASE tst$
+      CASE "IGNORE"
+        GOTO skipOpCode
+      CASE "\0", "\340"              ' NOTE(review): counted even when ND is set, although insnsd.c then omits the entry
+        OpCodeDebug = OpCodeDebug + 1    ' entry is still emitted into instrux[], so keep the index in step
+        GOTO skipOpCode
+      END SELECT
+
+      AddRegs = 0
+      AddCCode = 0
+      NextIsOpCode = 0
+      opCodeVal$ = ""                ' every possible first opcode byte of this entry
+      op = 1
+      p = INSTR(tst$ + "\", "\")     ' tokens are separated by "\"; the first token is empty
+      DO WHILE p
+        h$ = MID$(tst$, op, p - op)
+        IF LEFT$(h$, 1) = "X" THEN   ' \xNN (upper-cased above): literal byte, taken as the opcode
+          opCodeVal$ = CHR$(VAL("&H" + MID$(h$, 2)))
+          EXIT DO
+        ELSE
+          SELECT CASE h$
+          CASE "1", "2", "3"
+            NextIsOpCode = 1
+          CASE "4"                   ' codes 4-7 each stand for a fixed set of opcode bytes
+            opCodeVal$ = CHR$(&H7) + CHR$(&H17) + CHR$(&H1F)
+            EXIT DO
+          CASE "5"
+            opCodeVal$ = CHR$(&HA1) + CHR$(&HA9)
+            EXIT DO
+          CASE "6"
+            opCodeVal$ = CHR$(&H6) + CHR$(&HE) + CHR$(&H16) + CHR$(&H1E)
+            EXIT DO
+          CASE "7"
+            opCodeVal$ = CHR$(&HA0) + CHR$(&HA8)
+            EXIT DO
+          CASE "10", "11", "12"      ' entry is listed under 8 consecutive opcode bytes (see EndNr)
+            NextIsOpCode = 1
+            AddRegs = VAL(h$) - 9
+          CASE "330"                 ' entry is listed under 16 consecutive opcode bytes (see EndNr)
+            NextIsOpCode = 1
+            AddCCode = VAL(h$) - 329
+          CASE "17"
+            opCodeVal$ = CHR$(0)
+            EXIT DO
+          CASE ELSE
+            IF NextIsOpCode THEN     ' an opcode byte was expected but something else was found
+              PRINT "Line:"; lineNr
+              PRINT "Unknown value: " + h$
+              END
+            END IF
+          END SELECT
+        END IF
+        op = p + 1
+        p = INSTR(op, tst$ + "\", "\")
+      LOOP
+      IF (p = 0) THEN                ' loop ran off the end without finding an opcode
+        PRINT "No opcode found in line"; lineNr
+        PRINT "Line:"
+        PRINT l$
+        END
+      END IF
+
+      IF NoDebug = 0 THEN            ' register the entry under each opcode byte it can start with
+        FOR a = 1 TO LEN(opCodeVal$)
+          h = ASC(MID$(opCodeVal$, a, 1))
+          OpCodeStr$ = MKI$(OpCodeDebug)
+          IF AddRegs THEN
+            EndNr = 7
+          ELSEIF AddCCode THEN
+            EndNr = 15
+          ELSE
+            EndNr = 0
+          END IF
+          FOR b = 0 TO EndNr
+            OpCodeList$(h + b) = OpCodeList$(h + b) + OpCodeStr$
+          NEXT
+        NEXT
+        OpCodeDebug = OpCodeDebug + 1
+      END IF
+
+skipOpCode:
+      OpCodes = OpCodes + 1          ' write the record: NoDebug byte + 5 length-prefixed fields
+      LineOfs$ = LineOfs$ + MKL$(NowLineOfs&)
+      LineLg = 1
+      h$ = CHR$(NoDebug)
+      PUT #3, NowLineOfs&, h$
+      NowLineOfs& = NowLineOfs& + 1
+      FOR a = 1 TO 5
+        lg = LEN(StrucData$(a))
+        h$ = CHR$(lg) + StrucData$(a)
+        PUT #3, NowLineOfs&, h$
+        NowLineOfs& = NowLineOfs& + lg + 1
+        LineLg = LineLg + lg + 1
+      NEXT
+      LineOfs$ = LineOfs$ + MKI$(LineLg)   ' total record length, read back with CVI later
+    END IF
+  END IF
+WEND
+LOCATE , 1
+PRINT lineNr, OpCodes, OpCodeDebug
+
+
+'
+' Pass 2: create insnsa.c - one instrux_<mnemonic>[] table per mnemonic
+'
+
+
+PRINT "Creating INSNSA.C"
+
+OPEN "O", 2, "insnsa.c"
+strBegStart$ = "static struct itemplate instrux_"
+strBegEnd$ = "[] = {"
+strEnd$ = "    {-1}" + crlf$ + "};" + crlf$   ' terminator row and closing brace of each table
+
+PRINT #2, "/* This file auto-generated from insns.dat by insns.bas - don't edit it */"
+PRINT #2, ""
+PRINT #2, "#include <stdio.h>"
+PRINT #2, "#include " + qt$ + "nasm.h" + qt$
+PRINT #2, "#include " + qt$ + "insns.h" + qt$
+PRINT #2, ""
+
+oldOpCode$ = ""                 ' mnemonic of the table currently being written
+pOfs = 1                        ' read position in the LineOfs$ index (6 bytes per record)
+FOR a = 1 TO OpCodes
+  LineOfs& = CVL(MID$(LineOfs$, pOfs, 4))
+  l$ = SPACE$(CVI(MID$(LineOfs$, pOfs + 4, 2)))   ' buffer of the record's length for GET
+  pOfs = pOfs + 6
+  GET #3, LineOfs&, l$
+
+  ' split data into fields
+  NoDebug = ASC(LEFT$(l$, 1))
+  pLn = 2
+  FOR b = 1 TO 5
+    lgLn = ASC(MID$(l$, pLn, 1))
+    StrucData$(b) = MID$(l$, pLn + 1, lgLn)
+    pLn = pLn + lgLn + 1
+  NEXT
+
+  IF oldOpCode$ <> StrucData$(1) THEN   ' mnemonic changed: close the old table, open a new one
+    Instructs$ = Instructs$ + StrucData$(1) + CHR$(0)
+    IF LEN(oldOpCode$) THEN PRINT #2, strEnd$
+    PRINT #2, strBegStart$ + StrucData$(1) + strBegEnd$
+    oldOpCode$ = StrucData$(1)
+  END IF
+  SELECT CASE UCASE$(StrucData$(3))
+  CASE "IGNORE"                 ' no row is written, but the table header above still is
+  CASE ELSE
+    PRINT #2, "    {I_" + oldOpCode$ + ", " + StrucData$(5) + ", {" + StrucData$(2) + "}, " + qt$ + StrucData$(3) + qt$ + ", " + StrucData$(4) + "},"
+  END SELECT
+NEXT
+IF LEN(oldOpCode$) THEN PRINT #2, strEnd$   ' close the last table
+
+PRINT #2, "struct itemplate *nasm_instructions[] = {"
+op = 1
+p = INSTR(Instructs$, CHR$(0))
+WHILE p                         ' one pointer per mnemonic, in order of first appearance
+  h$ = MID$(Instructs$, op, p - op)
+  PRINT #2, "    instrux_" + h$ + ","
+  op = p + 1
+  p = INSTR(op, Instructs$, CHR$(0))
+WEND
+PRINT #2, "};"
+
+CLOSE 2
+
+
+
+'
+' Pass 3: create insnsd.c - flat instrux[] table plus per-opcode-byte index tables
+'
+
+
+PRINT "Creating INSNSD.C"
+
+OPEN "O", 2, "insnsd.c"
+
+PRINT #2, "/* This file auto-generated from insns.dat by insns.bas - don't edit it */"
+PRINT #2, ""
+PRINT #2, "#include <stdio.h>"
+PRINT #2, "#include " + qt$ + "nasm.h" + qt$
+PRINT #2, "#include " + qt$ + "insns.h" + qt$
+PRINT #2, ""
+
+
+PRINT #2, "static struct itemplate instrux[] = {"
+pOfs = 1                        ' read position in the LineOfs$ index (6 bytes per record)
+FOR a = 1 TO OpCodes
+  LineOfs& = CVL(MID$(LineOfs$, pOfs, 4))
+  l$ = SPACE$(CVI(MID$(LineOfs$, pOfs + 4, 2)))   ' buffer of the record's length for GET
+  pOfs = pOfs + 6
+  GET #3, LineOfs&, l$
+
+  ' split data into fields
+  NoDebug = ASC(LEFT$(l$, 1))
+  pLn = 2
+  FOR b = 1 TO 5
+    lgLn = ASC(MID$(l$, pLn, 1))
+    StrucData$(b) = MID$(l$, pLn + 1, lgLn)
+    pLn = pLn + lgLn + 1
+  NEXT
+
+  IF NoDebug OR (UCASE$(StrucData$(3)) = "IGNORE") THEN
+    ' ignore this entry
+  ELSE
+    PRINT #2, "    {I_" + StrucData$(1) + ", " + StrucData$(5) + ", {" + StrucData$(2) + "}, " + qt$ + StrucData$(3) + qt$ + ", " + StrucData$(4) + "},"
+  END IF
+NEXT
+PRINT #2, "    {-1}" + crlf$ + "};" + crlf$
+
+
+OpCodeBegS$ = "static struct itemplate *itable_"
+OpCodeBegE$ = "[] = {"
+OpCodeEnd$ = "    NULL" + crlf$ + "};" + crlf$
+
+FOR a = 0 TO 255                ' one NULL-terminated itable_XX[] per opcode byte value
+  PRINT #2, OpCodeBegS$ + RIGHT$("00" + HEX$(a), 2) + OpCodeBegE$
+  h$ = OpCodeList$(a)
+  FOR b = 1 TO LEN(h$) STEP 2   ' each entry is a 2-byte MKI$ index into instrux[]
+    OpCodePos = CVI(MID$(h$, b, 2))
+    PRINT #2, "    instrux +" + STR$(OpCodePos) + ","
+  NEXT
+  PRINT #2, OpCodeEnd$
+NEXT
+
+PRINT #2, "struct itemplate **itable[] = {"
+FOR a = 0 TO 255
+  PRINT #2, "    itable_" + RIGHT$("00" + HEX$(a), 2) + ","
+NEXT
+PRINT #2, "};"
+
+CLOSE 2
+
+
+
+CLOSE 3
+KILL "insns.tmp"                ' the scratch file is no longer needed
+CLOSE 1
+SYSTEM
+
+FUNCTION ReplaceOp$ (a$)
+  ' Translate one operand keyword from insns.dat (case-insensitive) into
+  ' the flag expression written to the generated C tables.  Keywords
+  ' without a special mapping are returned upper-cased.
+  kw$ = UCASE$(a$)
+  SELECT CASE kw$
+  CASE "VOID", ""
+    flag$ = "0"
+  CASE "IMM"
+    flag$ = "IMMEDIATE"
+  CASE "MEM"
+    flag$ = "MEMORY"
+  CASE "MEM8", "MEM16", "MEM32", "MEM64", "MEM80"
+    flag$ = "MEMORY|BITS" + MID$(kw$, 4)
+  CASE "RM8", "RM16", "RM32"
+    flag$ = "REGMEM|BITS" + MID$(kw$, 3)
+  CASE "IMM8", "IMM16", "IMM32"
+    flag$ = "IMMEDIATE|BITS" + MID$(kw$, 4)
+  CASE ELSE
+    flag$ = kw$                ' includes REG8, REG16 and REG32
+  END SELECT
+  ReplaceOp$ = flag$
+END FUNCTION
+
+FUNCTION Min% (a%, b%)
+  IF b% <= a% THEN Min% = b% ELSE Min% = a%   ' smaller of the two integers
+END FUNCTION
+
+FUNCTION StrAscii (a$)
+  ' Character code of the first character of a$, or -1 for an empty string.
+  StrAscii = -1
+  IF LEN(a$) > 0 THEN
+    StrAscii = ASC(a$)
+  END IF
+END FUNCTION
+
+' same as =INSTR(SearchStart, SearchIn, ANY SearchFor$) in PowerBASIC(tm)
+'
+FUNCTION StrInstrLeft (SearchStart, SearchIn$, SearchFor$)
+ ' Position of the first character of SearchIn$, at or after SearchStart,
+ ' that equals any character of SearchFor$; 0 when there is none.
+ Best = 0
+ FOR k = 1 TO LEN(SearchFor$)
+  Hit = INSTR(SearchStart, SearchIn$, MID$(SearchFor$, k, 1))
+  IF Hit > 0 THEN
+   IF Best = 0 OR Hit < Best THEN Best = Hit
+  END IF
+ NEXT
+ StrInstrLeft = Best
+END FUNCTION
+
+'
+' This is a very crude version of this splitting routine.
+' At this time, it's not very useful :]
+'
+SUB StrSplitString (SplitString$, SplitChars$, SplitField$(), SplitCount)   ' split into SplitField$(); SplitCount = fields filled
+  StartIndex = LBOUND(SplitField$)
+  LastIndex = UBOUND(SplitField$)
+  ActualIndex& = StartIndex       ' array slot the next field goes into
+  SplitCount = 0
+
+  LastPos = 1                     ' start of the text not yet consumed
+  FoundPos = StrInstrLeft(LastPos, SplitString$, SplitChars$ + CHR$(34))   ' next separator or double quote
+  GetDirect = 0                   ' not used again in this routine
+  EndLoop = 0                     ' 0 = normal, 1 = array full, 2 = opening quote just seen, 3 = inside quotes
+  TempString$ = ""                ' field text collected so far
+  DO WHILE FoundPos > 0
+    FoundCharVal = StrAscii(MID$(SplitString$, FoundPos, 1))
+    PosDiff = (FoundPos - LastPos) + 1
+    SELECT CASE FoundCharVal
+    CASE 34                       ' double quote: keep the text before it, the quote itself is dropped
+      TempString$ = TempString$ + MID$(SplitString$, LastPos, PosDiff - 1)
+      SELECT CASE EndLoop
+      CASE 0                      ' opening quote
+        EndLoop = 2
+      CASE 3                      ' closing quote
+        EndLoop = 0
+      END SELECT
+    CASE ELSE                     ' separator: the field is complete
+      TempString$ = TempString$ + MID$(SplitString$, LastPos, PosDiff - 1)
+      SplitField$(ActualIndex&) = TempString$
+      TempString$ = ""
+      ActualIndex& = ActualIndex& + 1
+      IF ActualIndex& > LastIndex THEN   ' array is full: stop, the rest of the string is discarded
+        ActualIndex& = LastIndex
+        EndLoop = 1
+      END IF
+    END SELECT
+    SELECT CASE EndLoop
+    CASE 0
+      DO                          ' skip over a run of adjacent separators
+        LastPos = FoundPos + 1
+        FoundPos = StrInstrLeft(LastPos, SplitString$, SplitChars$)
+      LOOP WHILE LastPos = FoundPos
+      FoundPos = StrInstrLeft(LastPos, SplitString$, SplitChars$ + CHR$(34))
+    CASE 1
+      FoundPos = 0                ' terminates the outer loop
+      LastPos = LEN(SplitString$) + 1
+    CASE 2
+      EndLoop = 3
+      LastPos = FoundPos + 1
+      FoundPos = StrInstrLeft(LastPos, SplitString$, CHR$(34))   ' inside quotes only a quote ends the text
+      IF FoundPos = 0 THEN        ' unterminated quote: append one (this modifies the SplitString$ parameter)
+       SplitString$ = SplitString$ + CHR$(34)
+       FoundPos = LEN(SplitString$)
+      END IF
+    END SELECT
+  LOOP
+  IF EndLoop = 0 THEN             ' handle whatever follows the last separator
+    IF LEN(TempString$) > 0 THEN
+      SplitField$(ActualIndex&) = TempString$
+    ELSEIF LastPos <= LEN(SplitString$) THEN
+      SplitField$(ActualIndex&) = MID$(SplitString$, LastPos)
+    ELSE
+      ActualIndex& = ActualIndex& - 1   ' nothing left: the last slot was not used
+    END IF
+  END IF
+  FOR a = ActualIndex& + 1 TO LastIndex   ' blank the slots that received no field
+    SplitField$(a) = ""
+  NEXT
+  SplitCount = (ActualIndex& - StartIndex) + 1
+END SUB
+
+FUNCTION StrTrim$ (a$, b$)
+        lead$ = StrTrimLeft$(a$, b$): StrTrim$ = StrTrimRight$(lead$, b$)   ' trim characters of b$ from both ends of a$
+END FUNCTION
+
+FUNCTION StrTrimLeft$ (a$, b$) 'public: returns a$ without its leading characters that occur in b$
+        p = 0                          ' index of the character being examined
+        l = LEN(a$)
+        DO
+          p = p + 1
+          t$ = MID$(a$, p, 1)          ' "" once p has moved past the end of a$
+        LOOP WHILE (p <= l) AND (INSTR(b$, t$) > 0)   ' <= so the last character is tested too; p = l + 1 yields ""
+        StrTrimLeft$ = MID$(a$, p)
+END FUNCTION
+
+FUNCTION StrTrimRight$ (a$, b$) 'public
+        ' Strip every trailing character of a$ that occurs in b$.
+        '   a$ - string to trim
+        '   b$ - set of characters to remove from the right-hand end
+        ' keep ends up as the length of the part that survives
+        ' (0 when all of a$ consists of characters from b$).
+        keep = LEN(a$)
+        DO WHILE keep > 0
+          IF INSTR(b$, MID$(a$, keep, 1)) = 0 THEN EXIT DO
+          keep = keep - 1
+        LOOP
+        StrTrimRight$ = LEFT$(a$, keep)
+END FUNCTION
+
diff --git a/insns.dat b/insns.dat
index ea0afe7a..27436f1b 100644
--- a/insns.dat
+++ b/insns.dat
@@ -14,9 +14,9 @@
 
 AAA       void                \1\x37                        8086
 AAD       void                \2\xD5\x0A                    8086
-AAD       imm                 \1\xD5\24                     8086,UNDOC
+AAD       imm                 \1\xD5\24                     8086
 AAM       void                \2\xD4\x0A                    8086
-AAM       imm                 \1\xD4\24                     8086,UNDOC
+AAM       imm                 \1\xD4\24                     8086
 AAS       void                \1\x3F                        8086
 ADC       mem,reg8            \300\1\x10\101                8086,SM
 ADC       reg8,reg8           \300\1\x10\101                8086
@@ -225,7 +225,7 @@ FBLD      mem                 \300\1\xDF\204                8086,FPU
 FBSTP     mem80               \300\1\xDF\206                8086,FPU
 FBSTP     mem                 \300\1\xDF\206                8086,FPU
 FCHS      void                \2\xD9\xE0                    8086,FPU
-FCLEX     void                \2\xDB\xE2                    8086,FPU
+FCLEX     void                \3\x9B\xDB\xE2                8086,FPU
 FCMOVB    fpureg              \1\xDA\10\xC0                 P6,FPU
 FCMOVB    fpu0,fpureg         \1\xDA\11\xC0                 P6,FPU
 FCMOVBE   fpureg              \1\xDA\10\xD0                 P6,FPU
@@ -257,7 +257,7 @@ FCOMP     fpu0,fpureg         \1\xD8\11\xD8                 8086,FPU
 FCOMPP    void                \2\xDE\xD9                    8086,FPU
 FCOS      void                \2\xD9\xFF                    386,FPU
 FDECSTP   void                \2\xD9\xF6                    8086,FPU
-FDISI     void                \2\xDB\xE1                    8086,FPU
+FDISI     void                \3\x9B\xDB\xE1                8086,FPU
 FDIV      mem32               \300\1\xD8\206                8086,FPU
 FDIV      mem64               \300\1\xDC\206                8086,FPU
 FDIV      fpureg|to           \1\xDC\10\xF8                 8086,FPU
@@ -274,7 +274,7 @@ FDIVR     fpureg              \1\xD8\10\xF8                 8086,FPU
 FDIVR     fpu0,fpureg         \1\xD8\11\xF8                 8086,FPU
 FDIVRP    fpureg              \1\xDE\10\xF0                 8086,FPU
 FDIVRP    fpureg,fpu0         \1\xDE\10\xF0                 8086,FPU
-FENI      void                \2\xDB\xE0                    8086,FPU
+FENI      void                \3\x9B\xDB\xE0                8086,FPU
 FFREE     fpureg              \1\xDD\10\xC0                 8086,FPU
 FIADD     mem32               \300\1\xDA\200                8086,FPU
 FIADD     mem16               \300\1\xDE\200                8086,FPU
@@ -292,7 +292,7 @@ FILD      mem64               \300\1\xDF\205                8086,FPU
 FIMUL     mem32               \300\1\xDA\201                8086,FPU
 FIMUL     mem16               \300\1\xDE\201                8086,FPU
 FINCSTP   void                \2\xD9\xF7                    8086,FPU
-FINIT     void                \2\xDB\xE3                    8086,FPU
+FINIT     void                \3\x9B\xDB\xE3                8086,FPU
 FIST      mem32               \300\1\xDB\202                8086,FPU
 FIST      mem16               \300\1\xDF\202                8086,FPU
 FISTP     mem32               \300\1\xDB\203                8086,FPU
@@ -323,14 +323,23 @@ FMUL      fpureg              \1\xD8\10\xC8                 8086,FPU
 FMUL      fpu0,fpureg         \1\xD8\11\xC8                 8086,FPU
 FMULP     fpureg              \1\xDE\10\xC8                 8086,FPU
 FMULP     fpureg,fpu0         \1\xDE\10\xC8                 8086,FPU
+FNCLEX    void                \2\xDB\xE2                    8086,FPU
+FNDISI    void                \2\xDB\xE1                    8086,FPU
+FNENI     void                \2\xDB\xE0                    8086,FPU
+FNINIT    void                \2\xDB\xE3                    8086,FPU
 FNOP      void                \2\xD9\xD0                    8086,FPU
+FNSAVE    mem                 \300\1\xDD\206                8086,FPU
+FNSTCW    mem                 \300\1\xD9\207                8086,FPU,SW
+FNSTENV   mem                 \300\1\xD9\206                8086,FPU
+FNSTSW    mem                 \300\1\xDD\207                8086,FPU,SW
+FNSTSW    reg_ax              \2\xDF\xE0                    286,FPU
 FPATAN    void                \2\xD9\xF3                    8086,FPU
 FPREM     void                \2\xD9\xF8                    8086,FPU
 FPREM1    void                \2\xD9\xF5                    386,FPU
 FPTAN     void                \2\xD9\xF2                    8086,FPU
 FRNDINT   void                \2\xD9\xFC                    8086,FPU
 FRSTOR    mem                 \300\1\xDD\204                8086,FPU
-FSAVE     mem                 \300\1\xDD\206                8086,FPU
+FSAVE     mem                 \300\2\x9B\xDD\206            8086,FPU
 FSCALE    void                \2\xD9\xFD                    8086,FPU
 FSETPM    void                \2\xDB\xE4                    286,FPU
 FSIN      void                \2\xD9\xFE                    386,FPU
@@ -339,14 +348,14 @@ FSQRT     void                \2\xD9\xFA                    8086,FPU
 FST       mem32               \300\1\xD9\202                8086,FPU
 FST       mem64               \300\1\xDD\202                8086,FPU
 FST       fpureg              \1\xDD\10\xD0                 8086,FPU
-FSTCW     mem                 \300\1\xD9\207                8086,FPU,SW
-FSTENV    mem                 \300\1\xD9\206                8086,FPU
+FSTCW     mem                 \300\2\x9B\xD9\207            8086,FPU,SW
+FSTENV    mem                 \300\2\x9B\xD9\206            8086,FPU
 FSTP      mem32               \300\1\xD9\203                8086,FPU
 FSTP      mem64               \300\1\xDD\203                8086,FPU
 FSTP      mem80               \300\1\xDB\207                8086,FPU
 FSTP      fpureg              \1\xDD\10\xD8                 8086,FPU
-FSTSW     mem                 \300\1\xDD\207                8086,FPU,SW
-FSTSW     reg_ax              \2\xDF\xE0                    286,FPU
+FSTSW     mem                 \300\2\x9B\xDD\207            8086,FPU,SW
+FSTSW     reg_ax              \3\x9B\xDF\xE0                286,FPU
 FSUB      mem32               \300\1\xD8\204                8086,FPU
 FSUB      mem64               \300\1\xDC\204                8086,FPU
 FSUB      fpureg|to           \1\xDC\10\xE8                 8086,FPU
@@ -365,11 +374,13 @@ FSUBRP    fpureg              \1\xDE\10\xE0                 8086,FPU
 FSUBRP    fpureg,fpu0         \1\xDE\10\xE0                 8086,FPU
 FTST      void                \2\xD9\xE4                    8086,FPU
 FUCOM     fpureg              \1\xDD\10\xE0                 386,FPU
+FUCOM     fpu0,fpureg         \1\xDD\11\xE0                 386,FPU
 FUCOMI    fpureg              \1\xDB\10\xE8                 P6,FPU
 FUCOMI    fpu0,fpureg         \1\xDB\11\xE8                 P6,FPU
 FUCOMIP   fpureg              \1\xDF\10\xE8                 P6,FPU
 FUCOMIP   fpu0,fpureg         \1\xDF\11\xE8                 P6,FPU
 FUCOMP    fpureg              \1\xDD\10\xE8                 386,FPU
+FUCOMP    fpu0,fpureg         \1\xDD\11\xE8                 386,FPU
 FUCOMPP   void                \2\xDA\xE9                    386,FPU
 FXAM      void                \2\xD9\xE5                    8086,FPU
 FXCH      void                \2\xD9\xC9                    8086,FPU
@@ -384,7 +395,7 @@ IBTS      mem,reg16           \320\300\2\x0F\xA7\101        386,SW,UNDOC,ND
 IBTS      reg16,reg16         \320\300\2\x0F\xA7\101        386,UNDOC,ND
 IBTS      mem,reg32           \321\300\2\x0F\xA7\101        386,SD,UNDOC,ND
 IBTS      reg32,reg32         \321\300\2\x0F\xA7\101        386,UNDOC,ND
-ICEBP     void                \1\xF1                        286,UNDOC
+ICEBP     void                \1\xF1                        P6,ND
 IDIV      rm8                 \300\1\xF6\207                8086
 IDIV      rm16                \320\300\1\xF7\207            8086
 IDIV      rm32                \321\300\1\xF7\207            386
@@ -398,7 +409,7 @@ IMUL      reg32,reg32         \321\301\2\x0F\xAF\110        386
 IMUL      reg16,mem,imm8      \320\301\1\x6B\110\16         286,SM
 IMUL      reg16,reg16,imm8    \320\301\1\x6B\110\16         286
 IMUL      reg16,mem,imm       \320\301\1\x69\110\32         286,SM
-IMUL      reg16,reg16,imm     \320\301\1\x69\110\32         286
+IMUL      reg16,reg16,imm     \320\301\1\x69\110\32         286,SM
 IMUL      reg32,mem,imm8      \321\301\1\x6B\110\16         386,SM
 IMUL      reg32,reg32,imm8    \321\301\1\x6B\110\16         386
 IMUL      reg32,mem,imm       \321\301\1\x69\110\42         386,SM
@@ -423,8 +434,8 @@ INSB      void                \1\x6C                        186
 INSD      void                \321\1\x6D                    386
 INSW      void                \320\1\x6D                    186
 INT       imm                 \1\xCD\24                     8086
-INT01     void                \1\xF1                        286,UNDOC
-INT1      void                \1\xF1                        286,UNDOC
+INT01     void                \1\xF1                        P6,ND
+INT1      void                \1\xF1                        P6
 INT3      void                \1\xCC                        8086
 INTO      void                \1\xCE                        8086
 INVD      void                \2\x0F\x08                    486
@@ -506,16 +517,21 @@ LSS       reg32,mem           \321\301\2\x0F\xB2\110        386
 LTR       mem                 \300\1\x0F\17\203             286,PRIV
 LTR       mem16               \300\1\x0F\17\203             286,PRIV
 LTR       reg16               \300\1\x0F\17\203             286,PRIV
-MOV       mem,reg_cs          \300\1\x8C\101                8086,SM
-MOV       mem,reg_dess        \300\1\x8C\101                8086,SM
-MOV       mem,reg_fsgs        \300\1\x8C\101                386,SM
-MOV       reg16,reg_cs        \300\1\x8C\101                8086
-MOV       reg16,reg_dess      \300\1\x8C\101                8086
-MOV       reg16,reg_fsgs      \300\1\x8C\101                386
-MOV       reg_dess,mem        \301\1\x8E\110                8086,SM
-MOV       reg_dess,reg16      \301\1\x8E\110                8086
-MOV       reg_fsgs,mem        \301\1\x8E\110                386,SM
-MOV       reg_fsgs,reg16      \301\1\x8E\110                386
+MOV       mem,reg_cs          \320\300\1\x8C\201            8086,SM
+MOV       mem,reg_dess        \320\300\1\x8C\101            8086,SM
+MOV       mem,reg_fsgs        \320\300\1\x8C\101            386,SM
+MOV       reg16,reg_cs        \320\300\1\x8C\201            8086
+MOV       reg16,reg_dess      \320\300\1\x8C\101            8086
+MOV       reg16,reg_fsgs      \320\300\1\x8C\101            386
+MOV       rm32,reg_cs         \321\300\1\x8C\201            8086
+MOV       rm32,reg_dess       \321\300\1\x8C\101            8086
+MOV       rm32,reg_fsgs       \321\300\1\x8C\101            386
+MOV       reg_dess,mem        \320\301\1\x8E\110            8086,SM
+MOV       reg_fsgs,mem        \320\301\1\x8E\110            386,SM
+MOV       reg_dess,reg16      \320\301\1\x8E\110            8086
+MOV       reg_fsgs,reg16      \320\301\1\x8E\110            386
+MOV       reg_dess,rm32       \321\301\1\x8E\110            8086
+MOV       reg_fsgs,rm32       \321\301\1\x8E\110            386
 MOV       reg_al,mem_offs     \301\1\xA0\35                 8086,SM
 MOV       reg_ax,mem_offs     \301\320\1\xA1\35             8086,SM
 MOV       reg_eax,mem_offs    \301\321\1\xA1\35             386,SM
@@ -624,6 +640,8 @@ PADDD     mmxreg,mem          \301\2\x0F\xFE\110            PENT,MMX,SM
 PADDD     mmxreg,mmxreg       \2\x0F\xFE\110                PENT,MMX
 PADDSB    mmxreg,mem          \301\2\x0F\xEC\110            PENT,MMX,SM
 PADDSB    mmxreg,mmxreg       \2\x0F\xEC\110                PENT,MMX
+PADDSIW   mmxreg,mem          \301\2\x0F\x51\110            PENT,MMX,SM,CYRIX
+PADDSIW   mmxreg,mmxreg       \2\x0F\x51\110                PENT,MMX,CYRIX
 PADDSW    mmxreg,mem          \301\2\x0F\xED\110            PENT,MMX,SM
 PADDSW    mmxreg,mmxreg       \2\x0F\xED\110                PENT,MMX
 PADDUSB   mmxreg,mem          \301\2\x0F\xDC\110            PENT,MMX,SM
@@ -636,6 +654,8 @@ PAND      mmxreg,mem          \301\2\x0F\xDB\110            PENT,MMX,SM
 PAND      mmxreg,mmxreg       \2\x0F\xDB\110                PENT,MMX
 PANDN     mmxreg,mem          \301\2\x0F\xDF\110            PENT,MMX,SM
 PANDN     mmxreg,mmxreg       \2\x0F\xDF\110                PENT,MMX
+PAVEB     mmxreg,mem          \301\2\x0F\x50\110            PENT,MMX,SM,CYRIX
+PAVEB     mmxreg,mmxreg       \2\x0F\x50\110                PENT,MMX,CYRIX
 PCMPEQB   mmxreg,mem          \301\2\x0F\x74\110            PENT,MMX,SM
 PCMPEQB   mmxreg,mmxreg       \2\x0F\x74\110                PENT,MMX
 PCMPEQD   mmxreg,mem          \301\2\x0F\x76\110            PENT,MMX,SM
@@ -648,19 +668,31 @@ PCMPGTD   mmxreg,mem          \301\2\x0F\x66\110            PENT,MMX,SM
 PCMPGTD   mmxreg,mmxreg       \2\x0F\x66\110                PENT,MMX
 PCMPGTW   mmxreg,mem          \301\2\x0F\x65\110            PENT,MMX,SM
 PCMPGTW   mmxreg,mmxreg       \2\x0F\x65\110                PENT,MMX
+PDISTIB   mmxreg,mem          \301\2\x0F\x54\110            PENT,MMX,SM,CYRIX
+PMACHRIW  mmxreg,mem          \301\2\x0F\x5E\110            PENT,MMX,SM,CYRIX
 PMADDWD   mmxreg,mem          \301\2\x0F\xF5\110            PENT,MMX,SM
 PMADDWD   mmxreg,mmxreg       \2\x0F\xF5\110                PENT,MMX
+PMAGW     mmxreg,mem          \301\2\x0F\x52\110            PENT,MMX,SM,CYRIX
+PMAGW     mmxreg,mmxreg       \2\x0F\x52\110                PENT,MMX,CYRIX
+PMULHRW   mmxreg,mem          \301\2\x0F\x59\110            PENT,MMX,SM,CYRIX
+PMULHRW   mmxreg,mmxreg       \2\x0F\x59\110                PENT,MMX,CYRIX
+PMULHRIW  mmxreg,mem          \301\2\x0F\x5D\110            PENT,MMX,SM,CYRIX
+PMULHRIW  mmxreg,mmxreg       \2\x0F\x5D\110                PENT,MMX,CYRIX
 PMULHW    mmxreg,mem          \301\2\x0F\xE5\110            PENT,MMX,SM
 PMULHW    mmxreg,mmxreg       \2\x0F\xE5\110                PENT,MMX
 PMULLW    mmxreg,mem          \301\2\x0F\xD5\110            PENT,MMX,SM
 PMULLW    mmxreg,mmxreg       \2\x0F\xD5\110                PENT,MMX
-POP       mem16               \320\300\1\x8F\200            8086
-POP       mem32               \321\300\1\x8F\200            386
+PMVGEZB   mmxreg,mem          \301\2\x0F\x5C\110            PENT,MMX,SM,CYRIX
+PMVLZB    mmxreg,mem          \301\2\x0F\x5B\110            PENT,MMX,SM,CYRIX
+PMVNZB    mmxreg,mem          \301\2\x0F\x5A\110            PENT,MMX,SM,CYRIX
+PMVZB     mmxreg,mem          \301\2\x0F\x58\110            PENT,MMX,SM,CYRIX
+POP       reg16               \320\10\x58                   8086
+POP       reg32               \321\10\x58                   386
+POP       rm16                \320\300\1\x8F\200            8086
+POP       rm32                \321\300\1\x8F\200            386
 POP       reg_cs              \1\x0F                        8086,UNDOC,ND
 POP       reg_dess            \4                            8086
 POP       reg_fsgs            \1\x0F\5                      386
-POP       reg16               \320\10\x58                   8086
-POP       reg32               \321\10\x58                   386
 POPA      void                \322\1\x61                    186
 POPAD     void                \321\1\x61                    386
 POPAW     void                \320\1\x61                    186
@@ -699,6 +731,8 @@ PSUBD     mmxreg,mem          \301\2\x0F\xFA\110            PENT,MMX,SM
 PSUBD     mmxreg,mmxreg       \2\x0F\xFA\110                PENT,MMX
 PSUBSB    mmxreg,mem          \301\2\x0F\xE8\110            PENT,MMX,SM
 PSUBSB    mmxreg,mmxreg       \2\x0F\xE8\110                PENT,MMX
+PSUBSIW   mmxreg,mem          \301\2\x0F\x55\110            PENT,MMX,SM,CYRIX
+PSUBSIW   mmxreg,mmxreg       \2\x0F\x55\110                PENT,MMX,CYRIX
 PSUBSW    mmxreg,mem          \301\2\x0F\xE9\110            PENT,MMX,SM
 PSUBSW    mmxreg,mmxreg       \2\x0F\xE9\110                PENT,MMX
 PSUBUSB   mmxreg,mem          \301\2\x0F\xD8\110            PENT,MMX,SM
@@ -719,12 +753,12 @@ PUNPCKLDQ mmxreg,mem          \301\2\x0F\x62\110            PENT,MMX,SM
 PUNPCKLDQ mmxreg,mmxreg       \2\x0F\x62\110                PENT,MMX
 PUNPCKLWD mmxreg,mem          \301\2\x0F\x61\110            PENT,MMX,SM
 PUNPCKLWD mmxreg,mmxreg       \2\x0F\x61\110                PENT,MMX
-PUSH      mem16               \320\300\1\xFF\206            8086
-PUSH      mem32               \321\300\1\xFF\206            386
-PUSH      reg_fsgs            \1\x0F\7                      386
-PUSH      reg_sreg            \6                            8086
 PUSH      reg16               \320\10\x50                   8086
 PUSH      reg32               \321\10\x50                   386
+PUSH      rm16                \320\300\1\xFF\206            8086
+PUSH      rm32                \321\300\1\xFF\206            386
+PUSH      reg_fsgs            \1\x0F\7                      386
+PUSH      reg_sreg            \6                            8086
 PUSH      imm8                \1\x6A\14                     286
 PUSH      imm16               \320\1\x68\30                 286
 PUSH      imm32               \321\1\x68\40                 386
@@ -738,22 +772,22 @@ PXOR      mmxreg,mem          \301\2\x0F\xEF\110            PENT,MMX,SM
 PXOR      mmxreg,mmxreg       \2\x0F\xEF\110                PENT,MMX
 RCL       rm8,unity           \300\1\xD0\202                8086
 RCL       rm8,reg_cl          \300\1\xD2\202                8086
-RCL       rm8,imm             \300\1\xC0\202\25             286
+RCL       rm8,imm             \300\1\xC0\202\25             286,SB
 RCL       rm16,unity          \320\300\1\xD1\202            8086
 RCL       rm16,reg_cl         \320\300\1\xD3\202            8086
-RCL       rm16,imm            \320\300\1\xC1\202\25         286
+RCL       rm16,imm            \320\300\1\xC1\202\25         286,SB
 RCL       rm32,unity          \321\300\1\xD1\202            386
 RCL       rm32,reg_cl         \321\300\1\xD3\202            386
-RCL       rm32,imm            \321\300\1\xC1\202\25         386
+RCL       rm32,imm            \321\300\1\xC1\202\25         386,SB
 RCR       rm8,unity           \300\1\xD0\203                8086
 RCR       rm8,reg_cl          \300\1\xD2\203                8086
-RCR       rm8,imm             \300\1\xC0\203\25             286
+RCR       rm8,imm             \300\1\xC0\203\25             286,SB
 RCR       rm16,unity          \320\300\1\xD1\203            8086
 RCR       rm16,reg_cl         \320\300\1\xD3\203            8086
-RCR       rm16,imm            \320\300\1\xC1\203\25         286
+RCR       rm16,imm            \320\300\1\xC1\203\25         286,SB
 RCR       rm32,unity          \321\300\1\xD1\203            386
 RCR       rm32,reg_cl         \321\300\1\xD3\203            386
-RCR       rm32,imm            \321\300\1\xC1\203\25         386
+RCR       rm32,imm            \321\300\1\xC1\203\25         386,SB
 RDMSR     void                \2\x0F\x32                    PENT
 RDPMC     void                \2\x0F\x33                    P6
 RDTSC     void                \2\x0F\x31                    PENT
@@ -770,43 +804,43 @@ RETN      void                \1\xC3                        8086
 RETN      imm                 \1\xC2\30                     8086
 ROL       rm8,unity           \300\1\xD0\200                8086
 ROL       rm8,reg_cl          \300\1\xD2\200                8086
-ROL       rm8,imm             \300\1\xC0\200\25             286
+ROL       rm8,imm             \300\1\xC0\200\25             286,SB
 ROL       rm16,unity          \320\300\1\xD1\200            8086
 ROL       rm16,reg_cl         \320\300\1\xD3\200            8086
-ROL       rm16,imm            \320\300\1\xC1\200\25         286
+ROL       rm16,imm            \320\300\1\xC1\200\25         286,SB
 ROL       rm32,unity          \321\300\1\xD1\200            386
 ROL       rm32,reg_cl         \321\300\1\xD3\200            386
-ROL       rm32,imm            \321\300\1\xC1\200\25         386
+ROL       rm32,imm            \321\300\1\xC1\200\25         386,SB
 ROR       rm8,unity           \300\1\xD0\201                8086
 ROR       rm8,reg_cl          \300\1\xD2\201                8086
-ROR       rm8,imm             \300\1\xC0\201\25             286
+ROR       rm8,imm             \300\1\xC0\201\25             286,SB
 ROR       rm16,unity          \320\300\1\xD1\201            8086
 ROR       rm16,reg_cl         \320\300\1\xD3\201            8086
-ROR       rm16,imm            \320\300\1\xC1\201\25         286
+ROR       rm16,imm            \320\300\1\xC1\201\25         286,SB
 ROR       rm32,unity          \321\300\1\xD1\201            386
 ROR       rm32,reg_cl         \321\300\1\xD3\201            386
-ROR       rm32,imm            \321\300\1\xC1\201\25         386
+ROR       rm32,imm            \321\300\1\xC1\201\25         386,SB
 RSM       void                \2\x0F\xAA                    PENT
 SAHF      void                \1\x9E                        8086
 SAL       rm8,unity           \300\1\xD0\204                8086,ND
 SAL       rm8,reg_cl          \300\1\xD2\204                8086,ND
-SAL       rm8,imm             \300\1\xC0\204\25             286,ND
+SAL       rm8,imm             \300\1\xC0\204\25             286,ND,SB
 SAL       rm16,unity          \320\300\1\xD1\204            8086,ND
 SAL       rm16,reg_cl         \320\300\1\xD3\204            8086,ND
-SAL       rm16,imm            \320\300\1\xC1\204\25         286,ND
+SAL       rm16,imm            \320\300\1\xC1\204\25         286,ND,SB
 SAL       rm32,unity          \321\300\1\xD1\204            386,ND
 SAL       rm32,reg_cl         \321\300\1\xD3\204            386,ND
-SAL       rm32,imm            \321\300\1\xC1\204\25         386,ND
+SAL       rm32,imm            \321\300\1\xC1\204\25         386,ND,SB
 SALC      void                \1\xD6                        8086,UNDOC
 SAR       rm8,unity           \300\1\xD0\207                8086
 SAR       rm8,reg_cl          \300\1\xD2\207                8086
-SAR       rm8,imm             \300\1\xC0\207\25             286
+SAR       rm8,imm             \300\1\xC0\207\25             286,SB
 SAR       rm16,unity          \320\300\1\xD1\207            8086
 SAR       rm16,reg_cl         \320\300\1\xD3\207            8086
-SAR       rm16,imm            \320\300\1\xC1\207\25         286
+SAR       rm16,imm            \320\300\1\xC1\207\25         286,SB
 SAR       rm32,unity          \321\300\1\xD1\207            386
 SAR       rm32,reg_cl         \321\300\1\xD3\207            386
-SAR       rm32,imm            \321\300\1\xC1\207\25         386
+SAR       rm32,imm            \321\300\1\xC1\207\25         386,SB
 SBB       mem,reg8            \300\1\x18\101                8086,SM
 SBB       reg8,reg8           \300\1\x18\101                8086
 SBB       mem,reg16           \320\300\1\x19\101            8086,SM
@@ -836,13 +870,13 @@ SCASW     void                \320\1\xAF                    8086
 SGDT      mem                 \300\2\x0F\x01\200            286,PRIV
 SHL       rm8,unity           \300\1\xD0\204                8086
 SHL       rm8,reg_cl          \300\1\xD2\204                8086
-SHL       rm8,imm             \300\1\xC0\204\25             286
+SHL       rm8,imm             \300\1\xC0\204\25             286,SB
 SHL       rm16,unity          \320\300\1\xD1\204            8086
 SHL       rm16,reg_cl         \320\300\1\xD3\204            8086
-SHL       rm16,imm            \320\300\1\xC1\204\25         286
+SHL       rm16,imm            \320\300\1\xC1\204\25         286,SB
 SHL       rm32,unity          \321\300\1\xD1\204            386
 SHL       rm32,reg_cl         \321\300\1\xD3\204            386
-SHL       rm32,imm            \321\300\1\xC1\204\25         386
+SHL       rm32,imm            \321\300\1\xC1\204\25         386,SB
 SHLD      mem,reg16,imm       \300\320\2\x0F\xA4\101\26     386,SM2
 SHLD      reg16,reg16,imm     \300\320\2\x0F\xA4\101\26     386,SM2
 SHLD      mem,reg32,imm       \300\321\2\x0F\xA4\101\26     386,SM2
@@ -853,13 +887,13 @@ SHLD      mem,reg32,reg_cl    \300\321\2\x0F\xA5\101        386,SM
 SHLD      reg32,reg32,reg_cl  \300\321\2\x0F\xA5\101        386
 SHR       rm8,unity           \300\1\xD0\205                8086
 SHR       rm8,reg_cl          \300\1\xD2\205                8086
-SHR       rm8,imm             \300\1\xC0\205\25             286
+SHR       rm8,imm             \300\1\xC0\205\25             286,SB
 SHR       rm16,unity          \320\300\1\xD1\205            8086
 SHR       rm16,reg_cl         \320\300\1\xD3\205            8086
-SHR       rm16,imm            \320\300\1\xC1\205\25         286
+SHR       rm16,imm            \320\300\1\xC1\205\25         286,SB
 SHR       rm32,unity          \321\300\1\xD1\205            386
 SHR       rm32,reg_cl         \321\300\1\xD3\205            386
-SHR       rm32,imm            \321\300\1\xC1\205\25         386
+SHR       rm32,imm            \321\300\1\xC1\205\25         386,SB
 SHRD      mem,reg16,imm       \300\320\2\x0F\xAC\101\26     386,SM2
 SHRD      reg16,reg16,imm     \300\320\2\x0F\xAC\101\26     386,SM2
 SHRD      mem,reg32,imm       \300\321\2\x0F\xAC\101\26     386,SM2
@@ -874,6 +908,7 @@ SLDT      mem16               \300\1\x0F\17\200             286,PRIV
 SLDT      reg16               \300\1\x0F\17\200             286,PRIV
 SMI       void                \1\xF1                        386,UNDOC
 SMSW      mem                 \300\2\x0F\x01\204            286,PRIV
+SMSW      mem16               \300\2\x0F\x01\204            286,PRIV
 SMSW      reg16               \300\2\x0F\x01\204            286,PRIV
 STC       void                \1\xF9                        8086
 STD       void                \1\xFD                        8086
@@ -921,7 +956,7 @@ TEST      rm16,imm            \320\300\1\xF7\200\31         8086,SM
 TEST      rm32,imm            \321\300\1\xF7\200\41         386,SM
 TEST      mem,imm8            \300\1\xF6\200\21             8086,SM
 TEST      mem,imm16           \320\300\1\xF7\200\31         8086,SM
-TEST      mem,imm32           \321\300\1\xF7\200\41         386,UNDOC,SM
+TEST      mem,imm32           \321\300\1\xF7\200\41         386,SM
 UMOV      mem,reg8            \300\2\x0F\x10\101            386,UNDOC,SM
 UMOV      reg8,reg8           \300\2\x0F\x10\101            386,UNDOC
 UMOV      mem,reg16           \320\300\2\x0F\x11\101        386,UNDOC,SM
@@ -995,8 +1030,8 @@ XOR       mem,imm16           \320\300\1\x81\206\31         8086,SM
 XOR       mem,imm32           \321\300\1\x81\206\41         386,SM
 CMOVcc    reg16,mem           \320\301\1\x0F\330\x40\110    P6,SM
 CMOVcc    reg16,reg16         \320\301\1\x0F\330\x40\110    P6
-CMOVcc    reg32,mem           \320\301\1\x0F\330\x40\110    P6,SM
-CMOVcc    reg32,reg32         \320\301\1\x0F\330\x40\110    P6
+CMOVcc    reg32,mem           \321\301\1\x0F\330\x40\110    P6,SM
+CMOVcc    reg32,reg32         \321\301\1\x0F\330\x40\110    P6
 Jcc       imm|near            \322\1\x0F\330\x80\64         386
 Jcc       imm                 \330\x70\50                   8086
 Jcc       imm|short           \330\x70\50                   8086
diff --git a/insns.h b/insns.h
index 7ae5df39..1b637fc5 100644
--- a/insns.h
+++ b/insns.h
@@ -56,6 +56,7 @@ struct itemplate {
 #define IF_486    0x0400	       /* 486+ instruction */
 #define IF_PENT   0x0500	       /* Pentium instruction */
 #define IF_P6     0x0600	       /* P6 instruction */
+#define IF_CYRIX  0x0800	       /* Cyrix-specific instruction */
 #define IF_PMASK  0x0F00	       /* the mask for processor types */
 #define IF_PRIV   0x1000	       /* it's a privileged instruction */
 #define IF_UNDOC  0x2000	       /* it's an undocumented instruction */
diff --git a/insns.pl b/insns.pl
index def84bd7..9dc4dd45 100644
--- a/insns.pl
+++ b/insns.pl
@@ -9,7 +9,8 @@
 
 print STDERR "Reading insns.dat...\n";
 
-open (F, "insns.dat") || die "unable to open insns.dat";
+$fname = "insns.dat" unless $fname = $ARGV[0];
+open (F, $fname) || die "unable to open $fname";
 
 $line = 0;
 $opcodes = 0;
diff --git a/labels.c b/labels.c
index 7793a0a9..2c17c7c8 100644
--- a/labels.c
+++ b/labels.c
@@ -29,15 +29,20 @@
 #define PERMTS_SIZE  4096	       /* size of text blocks */
 
 /* values for label.defn.is_global */
+#define DEFINED_BIT 1
+#define GLOBAL_BIT 2
+#define EXTERN_BIT 4
+
 #define NOT_DEFINED_YET 0
-#define LOCAL_SYMBOL 1
-#define GLOBAL_SYMBOL 2
-#define GLOBAL_PLACEHOLDER 3
+#define TYPE_MASK 3
+#define LOCAL_SYMBOL (DEFINED_BIT)
+#define GLOBAL_PLACEHOLDER (GLOBAL_BIT)
+#define GLOBAL_SYMBOL (DEFINED_BIT|GLOBAL_BIT)
 
 union label {			       /* actual label structures */
     struct {
 	long segment, offset;
-        char *label;
+        char *label, *special;
 	int is_global;
     } defn;
     struct {
@@ -62,6 +67,8 @@ static char *perm_copy (char *string1, char *string2);
 
 static char *prevlabel;
 
+static int initialised = FALSE;
+
 /*
  * Internal routine: finds the `union label' corresponding to the
  * given label name. Creates a new one, if it isn't found, and if
@@ -107,6 +114,7 @@ static union label *find_label (char *label, int create) {
 
 	lfree[hash]->admin.movingon = BOGUS_VALUE;
 	lfree[hash]->defn.label = perm_copy (prev, label);
+	lfree[hash]->defn.special = NULL;
 	lfree[hash]->defn.is_global = NOT_DEFINED_YET;
 	return lfree[hash]++;
     } else
@@ -116,9 +124,11 @@ static union label *find_label (char *label, int create) {
 int lookup_label (char *label, long *segment, long *offset) {
     union label *lptr;
 
+    if (!initialised)
+	return 0;
+
     lptr = find_label (label, 0);
-    if (lptr && (lptr->defn.is_global == LOCAL_SYMBOL ||
-		 lptr->defn.is_global == GLOBAL_SYMBOL)) {
+    if (lptr && (lptr->defn.is_global & DEFINED_BIT)) {
 	*segment = lptr->defn.segment;
 	*offset = lptr->defn.offset;
 	return 1;
@@ -126,6 +136,19 @@ int lookup_label (char *label, long *segment, long *offset) {
 	return 0;
 }
 
+int is_extern (char *label) {
+    union label *lptr;
+
+    if (!initialised)
+	return 0;
+
+    lptr = find_label (label, 0);
+    if (lptr && (lptr->defn.is_global & EXTERN_BIT))
+	return 1;
+    else
+	return 0;
+}
+
 void define_label_stub (char *label, efunc error) {
     union label *lptr;
 
@@ -138,26 +161,22 @@ void define_label_stub (char *label, efunc error) {
     }
 }
 
-void define_label (char *label, long segment, long offset,
-		   struct ofmt *ofmt, efunc error) {
+void define_label (char *label, long segment, long offset, char *special,
+		   int is_norm, int isextrn, struct ofmt *ofmt, efunc error) {
     union label *lptr;
 
     lptr = find_label (label, 1);
-    switch (lptr->defn.is_global) {
-      case NOT_DEFINED_YET:
-	lptr->defn.is_global = LOCAL_SYMBOL;
-	break;
-      case GLOBAL_PLACEHOLDER:
-	lptr->defn.is_global = GLOBAL_SYMBOL;
-	break;
-      default:
+    if (lptr->defn.is_global & DEFINED_BIT) {
 	error(ERR_NONFATAL, "symbol `%s' redefined", label);
 	return;
     }
+    lptr->defn.is_global |= DEFINED_BIT;
+    if (isextrn)
+	lptr->defn.is_global |= EXTERN_BIT;
 
-    if (label[0] != '.')	       /* not local, but not special either */
+    if (label[0] != '.' && is_norm)    /* not local, but not special either */
 	prevlabel = lptr->defn.label;
-    else if (label[1] != '.' && !*prevlabel)
+    else if (label[0] == '.' && label[1] != '.' && !*prevlabel)
 	error(ERR_NONFATAL, "attempt to define a local label before any"
 	      " non-local labels");
 
@@ -165,25 +184,20 @@ void define_label (char *label, long segment, long offset,
     lptr->defn.offset = offset;
 
     ofmt->symdef (lptr->defn.label, segment, offset,
-		  lptr->defn.is_global == GLOBAL_SYMBOL);
+		  !!(lptr->defn.is_global & GLOBAL_BIT),
+		  special ? special : lptr->defn.special);
 }
 
-void define_common (char *label, long segment, long size,
+void define_common (char *label, long segment, long size, char *special,
 		    struct ofmt *ofmt, efunc error) {
     union label *lptr;
 
     lptr = find_label (label, 1);
-    switch (lptr->defn.is_global) {
-      case NOT_DEFINED_YET:
-	lptr->defn.is_global = LOCAL_SYMBOL;
-	break;
-      case GLOBAL_PLACEHOLDER:
-	lptr->defn.is_global = GLOBAL_SYMBOL;
-	break;
-      default:
+    if (lptr->defn.is_global & DEFINED_BIT) {
 	error(ERR_NONFATAL, "symbol `%s' redefined", label);
 	return;
     }
+    lptr->defn.is_global |= DEFINED_BIT;
 
     if (label[0] != '.')	       /* not local, but not special either */
 	prevlabel = lptr->defn.label;
@@ -194,10 +208,11 @@ void define_common (char *label, long segment, long size,
     lptr->defn.segment = segment;
     lptr->defn.offset = 0;
 
-    ofmt->symdef (lptr->defn.label, segment, size, 2);
+    ofmt->symdef (lptr->defn.label, segment, size, 2,
+		  special ? special : lptr->defn.special);
 }
 
-void declare_as_global (char *label, efunc error) {
+void declare_as_global (char *label, char *special, efunc error) {
     union label *lptr;
 
     if (islocal(label)) {
@@ -206,16 +221,18 @@ void declare_as_global (char *label, efunc error) {
 	return;
     }
     lptr = find_label (label, 1);
-    switch (lptr->defn.is_global) {
+    switch (lptr->defn.is_global & TYPE_MASK) {
       case NOT_DEFINED_YET:
 	lptr->defn.is_global = GLOBAL_PLACEHOLDER;
+	lptr->defn.special = special ? perm_copy(special, "") : NULL;
 	break;
       case GLOBAL_PLACEHOLDER:	       /* already done: silently ignore */
       case GLOBAL_SYMBOL:
 	break;
       case LOCAL_SYMBOL:
-	error(ERR_NONFATAL, "symbol `%s': [GLOBAL] directive must"
-	      " appear before symbol definition", label);
+	if (!lptr->defn.is_global & EXTERN_BIT)
+	    error(ERR_NONFATAL, "symbol `%s': GLOBAL directive must"
+		  " appear before symbol definition", label);
 	break;
     }
 }
@@ -241,12 +258,16 @@ int init_labels (void) {
 
     prevlabel = "";
 
+    initialised = TRUE;
+
     return 0;
 }
 
 void cleanup_labels (void) {
     int i;
 
+    initialised = FALSE;
+
     for (i=0; i<LABEL_HASHES; i++) {
 	union label *lptr, *lhold;
 
diff --git a/labels.h b/labels.h
index fb466ca1..111104bf 100644
--- a/labels.h
+++ b/labels.h
@@ -7,11 +7,12 @@
  */
 
 int lookup_label (char *label, long *segment, long *offset);
-void define_label (char *label, long segment, long offset,
-		   struct ofmt *ofmt, efunc error);
-void define_common (char *label, long segment, long size,
+int is_extern (char *label);
+void define_label (char *label, long segment, long offset, char *special,
+		   int is_norm, int isextrn, struct ofmt *ofmt, efunc error);
+void define_common (char *label, long segment, long size, char *special,
 		    struct ofmt *ofmt, efunc error);
 void define_label_stub (char *label, efunc error);
-void declare_as_global (char *label, efunc error);
+void declare_as_global (char *label, char *special, efunc error);
 int init_labels (void);
 void cleanup_labels (void);
diff --git a/macros.bas b/macros.bas
new file mode 100644
index 00000000..cad02c3c
--- /dev/null
+++ b/macros.bas
@@ -0,0 +1,175 @@
+' INFO_1: Converter for STANDARD.MAC to MACRO.C
+'
+' INFO_2: Written by Mark Junker in 1997
+'         InterNet: mjs@prg.hannover.sgh-net.de
+'         FIDO:     Mark Junker@2:2437/47.21
+'
+' COMMENT: To start the program press SHIFT+F5 within the QBasic IDE
+'          or start it from the command-line with QBASIC /RUN MACROS
+'
+
+DEFINT A-Z
+
+DECLARE FUNCTION StrTrimLeft$ (a$, b$)
+DECLARE FUNCTION StrTrimRight$ (a$, b$)
+DECLARE FUNCTION StrTrim$ (a$, b$)
+DECLARE SUB StrSplitString (SplitString$, SplitChars$, SplitField$(), SplitCount%)
+DECLARE FUNCTION Min% (a%, b%)
+DECLARE FUNCTION StrInstrLeft% (SearchStart%, SearchIn$, SearchFor$)
+DECLARE FUNCTION StrAscii% (a$)
+
+
+CLS
+DIM LineData$(1 TO 2)
+
+OPEN "I", 1, "STANDARD.MAC"
+OPEN "O", 2, "macros.c"
+
+PRINT #2, "/* This file auto-generated from standard.mac by macros.bas - don't edit it */"
+PRINT #2, ""
+PRINT #2, "static char *stdmac[] = {"
+
+WHILE NOT EOF(1)
+  LINE INPUT #1, l$
+  CALL StrSplitString(l$, ";", LineData$(), SplitCount)
+  IF SplitCount THEN
+    LineData$(1) = StrTrim$(LineData$(1), CHR$(9) + " ")
+    IF LEN(LineData$(1)) THEN
+      PRINT #2, "    " + CHR$(34) + LineData$(1) + CHR$(34) + ","
+    END IF
+  END IF
+WEND
+PRINT #2, "    NULL"
+PRINT #2, "};"
+
+CLOSE 2
+CLOSE 1
+SYSTEM
+
+FUNCTION Min% (a%, b%)
+  IF a% < b% THEN Min% = a% ELSE Min% = b%
+END FUNCTION
+
+FUNCTION StrAscii (a$)
+  IF LEN(a$) = 0 THEN
+    StrAscii = -1
+  ELSE
+    StrAscii = ASC(a$)
+  END IF
+END FUNCTION
+
+' same as =INSTR(SearchStart, SearchIn, ANY SearchFor$) in PowerBASIC(tm)
+'
+FUNCTION StrInstrLeft (SearchStart, SearchIn$, SearchFor$)
+ ValuesCount = LEN(SearchFor$)
+ MaxValue = LEN(SearchIn$) + 1
+ MinValue = MaxValue
+ FOR Counter1 = 1 TO ValuesCount
+  SearchChar$ = MID$(SearchFor$, Counter1, 1)
+  hVal2 = INSTR(SearchStart, SearchIn$, SearchChar$)
+  IF hVal2 > 0 THEN MinValue = Min%(hVal2, MinValue)
+ NEXT
+ IF MinValue = MaxValue THEN MinValue = 0
+ StrInstrLeft = MinValue
+END FUNCTION
+
+'
+' This is a very crude version of this splitting routine.
+' At this time, it's not very useful :]
+'
+SUB StrSplitString (SplitString$, SplitChars$, SplitField$(), SplitCount)
+  StartIndex = LBOUND(SplitField$)
+  LastIndex = UBOUND(SplitField$)
+  ActualIndex& = StartIndex
+  SplitCount = 0
+
+  LastPos = 1
+  FoundPos = StrInstrLeft(LastPos, SplitString$, SplitChars$ + CHR$(34))
+  GetDirect = 0
+  EndLoop = 0
+  TempString$ = ""
+  DO WHILE FoundPos > 0
+    FoundCharVal = StrAscii(MID$(SplitString$, FoundPos, 1))
+    PosDiff = (FoundPos - LastPos) + 1
+    SELECT CASE FoundCharVal
+    CASE 34
+      TempString$ = TempString$ + MID$(SplitString$, LastPos, PosDiff - 1)
+      SELECT CASE EndLoop
+      CASE 0
+        EndLoop = 2
+      CASE 3
+        EndLoop = 0
+      END SELECT
+    CASE ELSE
+      TempString$ = TempString$ + MID$(SplitString$, LastPos, PosDiff - 1)
+      SplitField$(ActualIndex&) = TempString$
+      TempString$ = ""
+      ActualIndex& = ActualIndex& + 1
+      IF ActualIndex& > LastIndex THEN
+        ActualIndex& = LastIndex
+        EndLoop = 1
+      END IF
+    END SELECT
+    SELECT CASE EndLoop
+    CASE 0
+      DO
+        LastPos = FoundPos + 1
+        FoundPos = StrInstrLeft(LastPos, SplitString$, SplitChars$)
+      LOOP WHILE LastPos = FoundPos
+      FoundPos = StrInstrLeft(LastPos, SplitString$, SplitChars$ + CHR$(34))
+    CASE 1
+      FoundPos = 0
+      LastPos = LEN(SplitString$) + 1
+    CASE 2
+      EndLoop = 3
+      LastPos = FoundPos + 1
+      FoundPos = StrInstrLeft(LastPos, SplitString$, CHR$(34))
+      IF FoundPos = 0 THEN
+       SplitString$ = SplitString$ + CHR$(34)
+       FoundPos = LEN(SplitString$)
+      END IF
+    END SELECT
+  LOOP
+  IF EndLoop = 0 THEN
+    IF LEN(TempString$) > 0 THEN
+      SplitField$(ActualIndex&) = TempString$
+    ELSEIF LastPos <= LEN(SplitString$) THEN
+      SplitField$(ActualIndex&) = MID$(SplitString$, LastPos)
+    ELSE
+      ActualIndex& = ActualIndex& - 1
+    END IF
+  END IF
+  FOR a = ActualIndex& + 1 TO LastIndex
+    SplitField$(a) = ""
+  NEXT
+  SplitCount = (ActualIndex& - StartIndex) + 1
+END SUB
+
+FUNCTION StrTrim$ (a$, b$)
+        StrTrim$ = StrTrimRight$(StrTrimLeft$(a$, b$), b$)
+END FUNCTION
+
+FUNCTION StrTrimLeft$ (a$, b$) 'public
+        p = 0
+        l = LEN(a$)
+        DO
+          p = p + 1
+          t$ = MID$(a$, p, 1)
+        LOOP WHILE (p < l) AND (INSTR(b$, t$) > 0)
+        StrTrimLeft$ = MID$(a$, p)
+END FUNCTION
+
+FUNCTION StrTrimRight$ (a$, b$) 'public
+        l = LEN(a$)
+        p = l + 1
+        DO
+          p = p - 1
+          IF p > 0 THEN
+            t$ = MID$(a$, p, 1)
+          ELSE
+            t$ = ""
+          END IF
+        LOOP WHILE (p > 0) AND (INSTR(b$, t$) > 0)
+        StrTrimRight$ = LEFT$(a$, p)
+END FUNCTION
+
diff --git a/macros.c b/macros.c
index c7e03ae1..e2c97adb 100644
--- a/macros.c
+++ b/macros.c
@@ -2,7 +2,9 @@
 
 static char *stdmac[] = {
     "%define __NASM_MAJOR__ 0",
-    "%define __NASM_MINOR__ 95",
+    "%define __NASM_MINOR__ 96",
+    "%define __FILE__",
+    "%define __LINE__",
     "%define __SECT__",
     "%imacro section 1+.nolist",
     "%define __SECT__ [section %1]",
@@ -20,6 +22,7 @@ static char *stdmac[] = {
     "%push struc",
     "%define %$strucname %1",
     "[absolute 0]",
+    "%$strucname:",
     "%endmacro",
     "%imacro endstruc 0.nolist",
     "%{$strucname}_size:",
@@ -39,29 +42,32 @@ static char *stdmac[] = {
     "times %{$strucname}_size-($-%$strucstart) db 0",
     "%pop",
     "%endmacro",
-    "%imacro extern 1+.nolist",
+    "%imacro align 1-2+.nolist nop",
+    "times ($$-$) & ((%1)-1) %2",
+    "%endmacro",
+    "%imacro alignb 1-2+.nolist resb 1",
+    "times ($$-$) & ((%1)-1) %2",
+    "%endmacro",
+    "%imacro extern 1-*.nolist",
+    "%rep %0",
     "[extern %1]",
+    "%rotate 1",
+    "%endrep",
     "%endmacro",
     "%imacro bits 1+.nolist",
     "[bits %1]",
     "%endmacro",
-    "%imacro global 1+.nolist",
+    "%imacro global 1-*.nolist",
+    "%rep %0",
     "[global %1]",
+    "%rotate 1",
+    "%endrep",
     "%endmacro",
-    "%imacro common 1+.nolist",
+    "%imacro common 1-*.nolist",
+    "%rep %0",
     "[common %1]",
-    "%endmacro",
-    "%imacro org 1+.nolist",
-    "[org %1]",
-    "%endmacro",
-    "%imacro group 1+.nolist",
-    "[group %1]",
-    "%endmacro",
-    "%imacro uppercase 1+.nolist",
-    "[uppercase %1]",
-    "%endmacro",
-    "%imacro library 1+.nolist",
-    "[library %1]",
+    "%rotate 1",
+    "%endrep",
     "%endmacro",
     NULL
 };
diff --git a/macros.pl b/macros.pl
index 733f7e1e..0a12bb0d 100644
--- a/macros.pl
+++ b/macros.pl
@@ -7,7 +7,8 @@
 # redistributable under the licence given in the file "Licence"
 # distributed in the NASM archive.
 
-open INPUT,"standard.mac" || die "unable to open standard.mac\n";
+$fname = "standard.mac" unless $fname = $ARGV[0];
+open (INPUT,$fname) || die "unable to open $fname\n"; # parens: || tests open
 open OUTPUT,">macros.c" || die "unable to open macros.c\n";
 
 print OUTPUT "/* This file auto-generated from standard.mac by macros.pl" .
diff --git a/misc/c16.mac b/misc/c16.mac
new file mode 100644
index 00000000..86e6bf92
--- /dev/null
+++ b/misc/c16.mac
@@ -0,0 +1,37 @@
+; NASM macro set to make interfacing to 16-bit programs easier -*- nasm -*-
+
+%imacro proc 1			; begin a procedure definition
+%push proc
+	  global %1
+%1:	  push bp
+	  mov bp,sp
+%ifdef FARCODE PASCAL		; arguments may start at bp+4 or bp+6
+%assign %$arg 6
+%else
+%assign %$arg 4
+%endif
+%define %$procname %1
+%endmacro
+
+%imacro arg 0-1 2		; used with the argument name as a label
+	  equ %$arg
+%assign %$arg %1+%$arg
+%endmacro
+
+%imacro endproc 0
+%ifnctx proc
+%error Mismatched `endproc'/`proc'
+%else
+          mov sp,bp
+          pop bp
+%ifdef PASCAL
+          retf %$arg
+%elifdef FARCODE
+	  retf
+%else
+	  retn
+%endif
+__end_%$procname:		; useful for calculating function size
+%pop
+%endif
+%endmacro
diff --git a/misc/c32.mac b/misc/c32.mac
new file mode 100644
index 00000000..a59acfde
--- /dev/null
+++ b/misc/c32.mac
@@ -0,0 +1,26 @@
+; NASM macro set to make interfacing to 32-bit programs easier -*- nasm -*-
+
+%imacro proc 1			; begin a procedure definition
+%push proc
+          global %1
+%1:       push ebp
+          mov ebp,esp
+%assign %$arg 8
+%define %$procname %1
+%endmacro
+
+%imacro arg 0-1 4		; used with the argument name as a label
+	  equ %$arg
+%assign %$arg %1+%$arg
+%endmacro
+
+%imacro endproc 0
+%ifnctx proc
+%error Mismatched `endproc'/`proc'
+%else
+	  leave
+	  ret
+__end_%$procname:		; useful for calculating function size
+%pop
+%endif
+%endmacro
diff --git a/misc/exasm.zip b/misc/exasm.zip
new file mode 100644
index 00000000..b4e9e58a
--- /dev/null
+++ b/misc/exasm.zip
diff --git a/misc/exebin.mac b/misc/exebin.mac
new file mode 100644
index 00000000..8d1eaf8c
--- /dev/null
+++ b/misc/exebin.mac
@@ -0,0 +1,57 @@
+; -*- nasm -*-
+; NASM macro file to allow the `bin' output format to generate
+; simple .EXE files by constructing the EXE header by hand.
+; Adapted from a contribution by Yann Guidon <whygee_corp@hol.fr>
+
+%define EXE_stack_size EXE_realstacksize
+
+%macro EXE_begin 0
+	  ORG 0E0h
+	  section .text
+
+header_start:
+	  db 4Dh,5Ah		; EXE file signature
+	  dw EXE_allocsize % 512
+	  dw (EXE_allocsize + 511) / 512
+	  dw 0			; relocation information: none
+	  dw (header_end-header_start)/16 ; header size in paragraphs
+	  dw (EXE_absssize + EXE_realstacksize) / 16 ; min extra mem
+	  dw (EXE_absssize + EXE_realstacksize) / 16 ; max extra mem
+	  dw -10h		; Initial SS (before fixup)
+	  dw EXE_endbss + EXE_realstacksize ; Initial SP (1K DPMI+1K STACK)
+	  dw 0			; (no) Checksum
+	  dw 100h		; Initial IP - start just after the header
+	  dw -10h		; Initial CS (before fixup)
+	  dw 0			; file offset to relocation table: none
+	  dw 0			; (no overlay)
+	  align 16,db 0
+header_end:
+
+EXE_startcode:
+	  section .data
+EXE_startdata:
+	  section .bss
+EXE_startbss:
+%endmacro
+
+%macro EXE_stack 1
+EXE_realstacksize equ %1
+%define EXE_stack_size EXE_bogusstacksize ; defeat EQU in EXE_end
+%endmacro
+
+%macro EXE_end 0
+	  section .text
+EXE_endcode:
+	  section .data
+EXE_enddata:
+	  section .bss
+	  alignb 4
+EXE_endbss:
+
+EXE_acodesize equ (EXE_endcode-EXE_startcode+3) & (~3)
+EXE_datasize equ EXE_enddata-EXE_startdata
+EXE_absssize equ (EXE_endbss-EXE_startbss+3) & (~3)
+EXE_allocsize equ EXE_acodesize + EXE_datasize
+
+EXE_stack_size equ 0x800	; default if nothing else was used
+%endmacro
diff --git a/misc/nasm.sl b/misc/nasm.sl
index c47d28b8..691254a4 100644
--- a/misc/nasm.sl
+++ b/misc/nasm.sl
@@ -294,6 +294,14 @@ define_keywords_n($1, nasm_kw_8, 8, 0);
 define_keywords_n($1, nasm_kw_9, 9, 0);
 define_keywords_n($1, nasm_kw_10, 10, 0);
 
+define_keywords_n($1, "org", 3, 1);
+define_keywords_n($1, "bitsiend", 4, 1);
+define_keywords_n($1, "aligngroupstruc", 5, 1);
+define_keywords_n($1, "alignbcommonexternglobalistruc", 6, 1);
+define_keywords_n($1, "sectionsegmentlibrary", 7, 1);
+define_keywords_n($1, "absoluteendstruc", 8, 1);
+define_keywords_n($1, "uppercase", 9, 1);
+
 !if (keymap_p ($1)) make_keymap ($1);
 definekey("nasm_bol_self_ins", ";", $1);
 definekey("nasm_bol_self_ins", "#", $1);
diff --git a/names.c b/names.c
index 9ef1e7b5..218ce5aa 100644
--- a/names.c
+++ b/names.c
@@ -8,7 +8,7 @@
  */
 
 static char *reg_names[] = {	       /* register names, as strings */
-    "\0", "ah", "al", "ax", "bh", "bl", "bp", "bx", "ch", "cl",
+    "ah", "al", "ax", "bh", "bl", "bp", "bx", "ch", "cl",
     "cr0", "cr2", "cr3", "cr4", "cs", "cx", "dh", "di", "dl", "dr0",
     "dr1", "dr2", "dr3", "dr6", "dr7", "ds", "dx", "eax", "ebp",
     "ebx", "ecx", "edi", "edx", "es", "esi", "esp", "fs", "gs",
@@ -32,37 +32,41 @@ static char *insn_names[] = {	       /* instruction names, as strings */
     "fidivr", "fild", "fimul", "fincstp", "finit", "fist", "fistp",
     "fisub", "fisubr", "fld", "fld1", "fldcw", "fldenv", "fldl2e",
     "fldl2t", "fldlg2", "fldln2", "fldpi", "fldz", "fmul", "fmulp",
-    "fnop", "fpatan", "fprem", "fprem1", "fptan", "frndint",
-    "frstor", "fsave", "fscale", "fsetpm", "fsin", "fsincos",
-    "fsqrt", "fst", "fstcw", "fstenv", "fstp", "fstsw", "fsub",
-    "fsubp", "fsubr", "fsubrp", "ftst", "fucom", "fucomi",
-    "fucomip", "fucomp", "fucompp", "fxam", "fxch", "fxtract",
-    "fyl2x", "fyl2xp1", "hlt", "ibts", "icebp", "idiv", "imul",
-    "in", "inc", "incbin", "insb", "insd", "insw", "int", "int1",
-    "int01", "int3", "into", "invd", "invlpg", "iret", "iretd",
-    "iretw", "jcxz", "jecxz", "jmp", "lahf", "lar", "lds", "lea",
-    "leave", "les", "lfs", "lgdt", "lgs", "lidt", "lldt", "lmsw",
-    "loadall", "loadall286", "lodsb", "lodsd", "lodsw", "loop",
-    "loope", "loopne", "loopnz", "loopz", "lsl", "lss", "ltr",
-    "mov", "movd", "movq", "movsb", "movsd", "movsw", "movsx",
-    "movzx", "mul", "neg", "nop", "not", "or", "out", "outsb",
-    "outsd", "outsw", "packssdw", "packsswb", "packuswb", "paddb",
-    "paddd", "paddsb", "paddsw", "paddusb", "paddusw", "paddw",
-    "pand", "pandn", "pcmpeqb", "pcmpeqd", "pcmpeqw", "pcmpgtb",
-    "pcmpgtd", "pcmpgtw", "pmaddwd", "pmulhw", "pmullw", "pop",
-    "popa", "popad", "popaw", "popf", "popfd", "popfw", "por",
-    "pslld", "psllq", "psllw", "psrad", "psraw", "psrld", "psrlq",
-    "psrlw", "psubb", "psubd", "psubsb", "psubsw", "psubusb",
-    "psubusw", "psubw", "punpckhbw", "punpckhdq", "punpckhwd",
-    "punpcklbw", "punpckldq", "punpcklwd", "push", "pusha",
-    "pushad", "pushaw", "pushf", "pushfd", "pushfw", "pxor", "rcl",
-    "rcr", "rdmsr", "rdpmc", "rdtsc", "resb", "resd", "resq",
-    "rest", "resw", "ret", "retf", "retn", "rol", "ror", "rsm",
-    "sahf", "sal", "salc", "sar", "sbb", "scasb", "scasd", "scasw",
-    "sgdt", "shl", "shld", "shr", "shrd", "sidt", "sldt", "smi",
-    "smsw", "stc", "std", "sti", "stosb", "stosd", "stosw", "str",
-    "sub", "test", "umov", "verr", "verw", "wait", "wbinvd",
-    "wrmsr", "xadd", "xbts", "xchg", "xlatb", "xor"
+    "fnclex", "fndisi", "fneni", "fninit", "fnop", "fnsave",
+    "fnstcw", "fnstenv", "fnstsw", "fpatan", "fprem", "fprem1",
+    "fptan", "frndint", "frstor", "fsave", "fscale", "fsetpm",
+    "fsin", "fsincos", "fsqrt", "fst", "fstcw", "fstenv", "fstp",
+    "fstsw", "fsub", "fsubp", "fsubr", "fsubrp", "ftst", "fucom",
+    "fucomi", "fucomip", "fucomp", "fucompp", "fxam", "fxch",
+    "fxtract", "fyl2x", "fyl2xp1", "hlt", "ibts", "icebp", "idiv",
+    "imul", "in", "inc", "incbin", "insb", "insd", "insw", "int",
+    "int1", "int01", "int3", "into", "invd", "invlpg", "iret",
+    "iretd", "iretw", "jcxz", "jecxz", "jmp", "lahf", "lar", "lds",
+    "lea", "leave", "les", "lfs", "lgdt", "lgs", "lidt", "lldt",
+    "lmsw", "loadall", "loadall286", "lodsb", "lodsd", "lodsw",
+    "loop", "loope", "loopne", "loopnz", "loopz", "lsl", "lss",
+    "ltr", "mov", "movd", "movq", "movsb", "movsd", "movsw",
+    "movsx", "movzx", "mul", "neg", "nop", "not", "or", "out",
+    "outsb", "outsd", "outsw", "packssdw", "packsswb", "packuswb",
+    "paddb", "paddd", "paddsb", "paddsiw", "paddsw", "paddusb",
+    "paddusw", "paddw", "pand", "pandn", "paveb", "pcmpeqb",
+    "pcmpeqd", "pcmpeqw", "pcmpgtb", "pcmpgtd", "pcmpgtw",
+    "pdistib", "pmachriw", "pmaddwd", "pmagw", "pmulhrw",
+    "pmulhriw", "pmulhw", "pmullw", "pmvgezb", "pmvlzb", "pmvnzb",
+    "pmvzb", "pop", "popa", "popad", "popaw", "popf", "popfd",
+    "popfw", "por", "pslld", "psllq", "psllw", "psrad", "psraw",
+    "psrld", "psrlq", "psrlw", "psubb", "psubd", "psubsb",
+    "psubsiw", "psubsw", "psubusb", "psubusw", "psubw", "punpckhbw",
+    "punpckhdq", "punpckhwd", "punpcklbw", "punpckldq", "punpcklwd",
+    "push", "pusha", "pushad", "pushaw", "pushf", "pushfd",
+    "pushfw", "pxor", "rcl", "rcr", "rdmsr", "rdpmc", "rdtsc",
+    "resb", "resd", "resq", "rest", "resw", "ret", "retf", "retn",
+    "rol", "ror", "rsm", "sahf", "sal", "salc", "sar", "sbb",
+    "scasb", "scasd", "scasw", "sgdt", "shl", "shld", "shr", "shrd",
+    "sidt", "sldt", "smi", "smsw", "stc", "std", "sti", "stosb",
+    "stosd", "stosw", "str", "sub", "test", "umov", "verr", "verw",
+    "wait", "wbinvd", "wrmsr", "xadd", "xbts", "xchg", "xlatb",
+    "xor"
 };
 
 static char *icn[] = {		       /* conditional instructions */
diff --git a/nasm.1 b/nasm.1
new file mode 100644
index 00000000..051da482
--- /dev/null
+++ b/nasm.1
@@ -0,0 +1,423 @@
+.TH NASM 1 "The Netwide Assembler Project"
+.SH NAME
+nasm \- the Netwide Assembler \- portable 80x86 assembler
+.SH SYNOPSIS
+.B nasm
+[
+.B \-f
+format
+] [
+.B \-o
+outfile
+] [
+.IR options ...
+] infile
+.br
+.B nasm \-h
+.br
+.B nasm \-r
+.SH DESCRIPTION
+The
+.B nasm
+command assembles the file
+.I infile
+and directs output to the file
+.I outfile
+if specified. If
+.I outfile
+is not specified,
+.B nasm
+will derive a default output file name from the name of its input
+file, usually by appending `.o' or `.obj', or by removing all
+extensions for a raw binary file. Failing that, the output file name
+will be `nasm.out'.
+.SS OPTIONS
+.TP
+.B \-h
+Causes
+.B nasm
+to exit immediately, after giving a summary of its invocation
+options, and listing all its supported output file formats.
+.TP
+.B \-a
+Causes
+.B nasm
+to assemble the given input file without first applying the macro
+preprocessor.
+.TP
+.B \-e
+Causes
+.B nasm
+to preprocess the given input file, and write the output to
+.I stdout
+(or the specified output file name), and not actually assemble
+anything.
+.TP
+.BI \-r
+Causes
+.B nasm
+to exit immediately, after displaying its version number.
+.TP
+.BI \-f " format"
+Specifies the output file format. Formats include
+.IR bin ,
+to produce flat-form binary files, and
+.I aout
+and
+.I elf
+to produce Linux a.out and ELF object files, respectively.
+.TP
+.BI \-o " outfile"
+Specifies a precise name for the output file, overriding
+.BR nasm 's
+default means of determining it.
+.TP
+.BI \-l " listfile"
+Causes an assembly listing to be directed to the given file, in
+which the original source is displayed on the right hand side (plus
+the source for included files and the expansions of multi-line
+macros) and the generated code is shown in hex on the left.
+.TP
+.B \-s
+Causes
+.B nasm
+to send its error messages and/or help text to
+.I stdout
+instead of
+.IR stderr .
+.TP
+.BI \-w [+-]foo
+Causes
+.B nasm
+to enable or disable certain classes of warning messages, for
+example
+.B \-w+orphan-labels
+or
+.B \-w-macro-params
+to, respectively, enable warnings about labels alone on lines or
+disable warnings about incorrect numbers of parameters in macro
+calls.
+.TP
+.BI \-i " directory"
+Adds a directory to the search path for include files. The directory
+specification must include the trailing slash, as it will be
+directly prepended to the name of the include file.
+.TP
+.BI \-p " file"
+Specifies a file to be pre-included, before the main source file
+starts to be processed.
+.TP
+.BI \-d " macro[=value]"
+Pre-defines a single-line macro.
+.PP
+.RE
+.SS SYNTAX
+This man page does not fully describe the syntax of
+.BR nasm 's
+assembly language, but does give a summary of the differences from
+other assemblers.
+.PP
+.I Registers
+have no leading `%' sign, unlike
+.BR gas ,
+and floating-point stack registers are referred to as
+.IR st0 ,
+.IR st1 ,
+and so on.
+.PP
+.I Floating-point instructions
+may use either the single-operand form or the double. A
+.I TO
+keyword is provided; thus, one could either write
+.PP
+.ti +15n
+fadd st0,st1
+.br
+.ti +15n
+fadd st1,st0
+.PP
+or one could use the alternative single-operand forms
+.PP
+.ti +15n
+fadd st1
+.br
+.ti +15n
+fadd to st1
+.PP
+.I Uninitialised storage
+is reserved using the
+.IR RESB ,
+.IR RESW ,
+.IR RESD ,
+.I RESQ
+and
+.I REST
+pseudo-opcodes, each taking one parameter which gives the number of
+bytes, words, doublewords, quadwords or ten-byte words to reserve.
+.PP
+.I Repetition
+of data items is not done by the
+.I DUP
+keyword as seen in DOS assemblers, but by the use of the
+.I TIMES
+prefix, like this:
+.PP
+.ti +6n
+.ta 9n
+message:	times 3 db 'abc'
+.br
+.ti +15n
+times 64-$+message db 0
+.PP
+which defines the string `abcabcabc', followed by the right number
+of zero bytes to make the total length up to 64 bytes.
+.PP
+.I Symbol references
+are always understood to be immediate (i.e. the address of the
+symbol), unless square brackets are used, in which case the contents
+of the memory location are used. Thus:
+.PP
+.ti +15n
+mov ax,wordvar
+.PP
+loads AX with the address of the variable `wordvar', whereas
+.PP
+.ti +15n
+mov ax,[wordvar]
+.br
+.ti +15n
+mov ax,[wordvar+1]
+.br
+.ti +15n
+mov ax,[es:wordvar+bx]
+.PP
+all refer to the
+.I contents
+of memory locations. The syntaxes
+.PP
+.ti +15n
+mov ax,es:wordvar[bx]
+.br
+.ti +15n
+es mov ax,wordvar[1]
+.PP
+are not legal at all, although the use of a segment register name as
+an instruction prefix is valid, and can be used with instructions
+such as
+.I LODSB
+which can't be overridden any other way.
+.PP
+.I Constants
+may be expressed numerically in most formats: a trailing H, Q or B
+denotes hex, octal or binary respectively, and a leading `0x' or `$'
+denotes hex as well. Leading zeros are not treated specially at all.
+Character constants may be enclosed in single or double quotes;
+there is no escape character. The ordering is little-endian
+(reversed), so that the character constant
+.I 'abcd'
+denotes 0x64636261 and not 0x61626364.
+.PP
+.I Local labels
+begin with a period, and their `locality' is granted by the
+assembler prepending the name of the previous non-local symbol. Thus
+declaring a label `.loop' after a label `label' has actually defined
+a symbol called `label.loop'.
+.SS DIRECTIVES
+.I SECTION name
+or
+.I SEGMENT name
+causes
+.B nasm
+to direct all following code to the named section. Section names
+vary with output file format, although most formats support the
+names
+.IR .text ,
+.I .data
+and
+.IR .bss .
+(The exception is the
+.I obj
+format, in which all segments are user-definable.)
+.PP
+.I ABSOLUTE address
+causes
+.B nasm
+to position its notional assembly point at an absolute address: so
+no code or data may be generated, but you can use
+.IR RESB ,
+.I RESW
+and
+.I RESD
+to move the assembly point further on, and you can define labels. So
+this directive may be used to define data structures. When you have
+finished doing absolute assembly, you must issue another
+.I SECTION
+directive to return to normal assembly.
+.PP
+.I BITS 16
+or
+.I BITS 32
+switches the default processor mode for which
+.B nasm
+is generating code: it is equivalent to
+.I USE16
+or
+.I USE32
+in DOS assemblers.
+.PP
+.I EXTERN symbol
+and
+.I GLOBAL symbol
+import and export symbol definitions, respectively, from and to
+other modules. Note that the
+.I GLOBAL
+directive must appear before the definition of the symbol it refers
+to.
+.PP
+.I STRUC strucname
+and
+.IR ENDSTRUC ,
+when used to bracket a number of
+.IR RESB ,
+.I RESW
+or similar instructions, define a data structure. In addition to
+defining the offsets of the structure members, the construct also
+defines a symbol for the size of the structure, which is simply the
+structure name with
+.I _size
+tacked on to the end.
+.SS FORMAT-SPECIFIC DIRECTIVES
+.I ORG address
+is used by the
+.I bin
+flat-form binary output format, and specifies the address at which
+the output code will eventually be loaded.
+.PP
+.I GROUP grpname seg1 seg2...
+is used by the
+.I obj
+(Microsoft 16-bit) output format, and defines segment groups. This
+format also uses
+.IR UPPERCASE ,
+which directs that all segment, group and symbol names output to the
+object file should be in uppercase. Note that the actual assembly is
+still case sensitive.
+.PP
+.I LIBRARY libname
+is used by the
+.I rdf
+output format, and causes a dependency record to be written to the
+output file which indicates that the program requires a certain
+library in order to run.
+.SS MACRO PREPROCESSOR
+Single-line macros are defined using the
+.I %define
+or
+.I %idefine
+commands, in a similar fashion to the C preprocessor. They can be
+overloaded with respect to number of parameters, although defining a
+macro with no parameters prevents the definition of any macro with
+the same name taking parameters, and vice versa.
+.I %define
+defines macros whose names match case-sensitively, whereas
+.I %idefine
+defines case-insensitive macros.
+.PP
+Multi-line macros are defined using
+.I %macro
+and
+.I %imacro
+(the distinction is the same as that between
+.I %define
+and
+.IR %idefine ),
+whose syntax is as follows:
+.PP
+.ti +6n
+%macro
+.I name
+.IR minprm [- maxprm "][+][.nolist] [" defaults ]
+.br
+.ti +15n
+<some lines of macro expansion text>
+.br
+.ti +6n
+%endmacro
+.PP
+Again, these macros may be overloaded. The trailing plus sign
+indicates that any parameters after the last one get subsumed, with
+their separating commas, into the last parameter. The
+.I defaults
+part can be used to specify defaults for unspecified macro
+parameters after
+.IR minprm .
+.I %endm
+is a valid synonym for
+.IR %endmacro .
+.PP
+To refer to the macro parameters within a macro expansion, you use
+.IR %1 ,
+.I %2
+and so on. You can also enforce that a macro parameter should
+contain a condition code by using
+.IR %+1 ,
+and you can invert the condition code by using
+.IR %-1 .
+You can also define a label specific to a macro invocation by
+prefixing it with a double % sign.
+.PP
+Files can be included using the
+.I %include
+directive, which works like C's #include.
+.PP
+The preprocessor has a `context stack', which may be used by one
+macro to store information that a later one will retrieve. You can
+push a context on the stack using
+.IR %push ,
+remove one using
+.IR %pop ,
+and change the name of the top context (without disturbing any
+associated definitions) using
+.IR %repl .
+Labels and
+.I %define
+macros specific to the top context may be defined by prefixing their
+names with %$, and things specific to the next context down with
+%$$, and so on.
+.PP
+Conditional assembly is done by means of
+.IR %ifdef ,
+.IR %ifndef ,
+.I %else
+and
+.I %endif
+as in C. (Except that
+.I %ifdef
+can accept several putative macro names, and will evaluate TRUE if
+any of them is defined.) In addition, the directives
+.I %ifctx
+and
+.I %ifnctx
+can be used to condition on the name of the top context on the
+context stack. The obvious set of `else-if' directives,
+.IR %elifdef ,
+.IR %elifndef ,
+.IR %elifctx
+and
+.IR %elifnctx
+are also supported.
+.SH BUGS
+There is a reported seg-fault on some (Linux) systems with some
+large source files. This appears to be very hard to reproduce. All
+other
+.I known
+bugs have been fixed...
+.SH RESTRICTIONS
+There is no support for symbol maps or debugging object-file
+records. The advanced features of the ELF and Win32
+object file formats are not supported, and there is no means for
+warning the programmer against using an instruction beyond the
+capability of the target processor.
+.SH SEE ALSO
+.BR as "(" 1 "),"
+.BR ld "(" 1 ")."
diff --git a/nasm.c b/nasm.c
index 1d663b36..a1e4dd8a 100644
--- a/nasm.c
+++ b/nasm.c
@@ -16,6 +16,7 @@
 #include "nasmlib.h"
 #include "preproc.h"
 #include "parser.h"
+#include "eval.h"
 #include "assemble.h"
 #include "labels.h"
 #include "outform.h"
@@ -32,7 +33,6 @@ static char *obuf;
 static char inname[FILENAME_MAX];
 static char outname[FILENAME_MAX];
 static char listname[FILENAME_MAX];
-static char realout[FILENAME_MAX];
 static int lineno;		       /* for error reporting */
 static int lineinc;		       /* set by [LINE] or [ONELINE] */
 static int globallineno;	       /* for forward-reference tracking */
@@ -44,7 +44,7 @@ static int sb = 16;		       /* by default */
 
 static int use_stdout = FALSE;	       /* by default, errors to stderr */
 
-static long current_seg;
+static long current_seg, abs_seg;
 static struct RAA *offsets;
 static long abs_offset;
 
@@ -62,7 +62,7 @@ static char currentfile[FILENAME_MAX];
  * doesn't do anything. Initial defaults are given here.
  */
 static char suppressed[1+ERR_WARN_MAX] = {
-    0, FALSE, TRUE
+    0, FALSE, TRUE, FALSE
 };
 
 /*
@@ -70,7 +70,7 @@ static char suppressed[1+ERR_WARN_MAX] = {
  * zero does nothing.
  */
 static char *suppressed_names[1+ERR_WARN_MAX] = {
-    NULL, "macro-params", "orphan-labels"
+    NULL, "macro-params", "orphan-labels", "number-overflow"
 };
 
 /*
@@ -79,7 +79,8 @@ static char *suppressed_names[1+ERR_WARN_MAX] = {
  */
 static char *suppressed_what[1+ERR_WARN_MAX] = {
     NULL, "macro calls with wrong no. of params",
-    "labels alone on lines without trailing `:'"
+    "labels alone on lines without trailing `:'",
+    "numeric constants greater than 0xFFFFFFFF"
 };
 
 /*
@@ -88,7 +89,7 @@ static char *suppressed_what[1+ERR_WARN_MAX] = {
  * not preprocess their source file.
  */
 
-static void no_pp_reset (char *, efunc, ListGen *);
+static void no_pp_reset (char *, int, efunc, evalfunc, ListGen *);
 static char *no_pp_getline (void);
 static void no_pp_cleanup (void);
 static Preproc no_pp = {
@@ -130,6 +131,10 @@ int main(int argc, char **argv) {
 	return 1;
     }
 
+    if (ofmt->stdmac)
+	pp_extra_stdmac (ofmt->stdmac);
+    eval_global_info (ofmt, lookup_label);
+
     if (preprocess_only) {
 	char *line;
 
@@ -140,12 +145,29 @@ int main(int argc, char **argv) {
 			      "unable to open output file `%s'", outname);
 	} else
 	    ofile = NULL;
-	preproc->reset (inname, report_error, &nasmlist);
+
+	eval_info ("%", 0L, 0L);       /* disallow labels, $ or $$ in exprs */
+
+	preproc->reset (inname, 2, report_error, evaluate, &nasmlist);
 	strcpy(currentfile,inname);
 	lineno = 0;
 	lineinc = 1;
 	while ( (line = preproc->getline()) ) {
+	    int ln, li;
+	    char buf[FILENAME_MAX];
+
 	    lineno += lineinc;
+	    /*
+	     * We must still check for %line directives, so that we
+	     * can report errors accurately.
+	     */
+	    if (!strncmp(line, "%line", 5) &&
+		sscanf(line, "%%line %d+%d %s", &ln, &li, buf) == 3) {
+		lineno = ln - li;
+		lineinc = li;
+		strncpy (currentfile, buf, FILENAME_MAX-1);
+		currentfile[FILENAME_MAX-1] = '\0';
+	    }
 	    if (ofile) {
 		fputs(line, ofile);
 		fputc('\n', ofile);
@@ -166,10 +188,7 @@ int main(int argc, char **argv) {
 	 * the name of the input file and then put that inside the
 	 * file.
 	 */
-	ofmt->filename (inname, realout, report_error);
-	if (!*outname) {
-	    strcpy(outname, realout);
-	}
+	ofmt->filename (inname, outname, report_error);
 
 	ofile = fopen(outname, "wb");
 	if (!ofile) {
@@ -182,7 +201,7 @@ int main(int argc, char **argv) {
 	 * init routines. (eg OS/2 defines the FLAT group)
 	 */
 	init_labels ();
-	ofmt->init (ofile, report_error, define_label);
+	ofmt->init (ofile, report_error, define_label, evaluate);
 	assemble_file (inname);
 	if (!terminate_after_phase) {
 	    ofmt->cleanup ();
@@ -377,19 +396,23 @@ static void parse_cmdline(int argc, char **argv) {
 }
 
 static void assemble_file (char *fname) {
-    char *value, *p, *line;
+    char *value, *p, *q, *special, *line;
     insn output_ins;
-    int i, rn_error;
-    long seg;
+    int i, rn_error, validid;
+    long seg, offs;
+    struct tokenval tokval;
+    expr *e;
 
     /* pass one */
     pass = 1;
     current_seg = ofmt->section(NULL, pass, &sb);
-    preproc->reset(fname, report_error, &nasmlist);
+    preproc->reset(fname, 1, report_error, evaluate, &nasmlist);
     strcpy(currentfile,fname);
     lineno = 0;
     lineinc = 1;
     globallineno = 0;
+    offs = get_curr_ofs;
+    eval_info (NULL, current_seg, offs);   /* set $ */
     while ( (line = preproc->getline()) ) {
 	lineno += lineinc;
 	globallineno++;
@@ -433,11 +456,33 @@ static void assemble_file (char *fname) {
 		    current_seg = seg;
 		}
 		break;
-	      case 2:	       /* [EXTERN label] */
+	      case 2:	       /* [EXTERN label:special] */
 		if (*value == '$')
 		    value++;	       /* skip initial $ if present */
-		declare_as_global (value, report_error);
-		define_label (value, seg_alloc(), 0L, ofmt, report_error);
+		q = value;
+		validid = TRUE;
+		if (!isidstart(*q))
+		    validid = FALSE;
+		while (*q && *q != ':') {
+		    if (!isidchar(*q))
+			validid = FALSE;
+		    q++;
+		}
+		if (!validid) {
+		    report_error (ERR_NONFATAL,
+				  "identifier expected after EXTERN");
+		    break;
+		}
+		if (*q == ':') {
+		    *q++ = '\0';
+		    special = q;
+		} else
+		    special = NULL;
+		if (!is_extern(value)) {   /* allow re-EXTERN to be ignored */
+		    declare_as_global (value, special, report_error);
+		    define_label (value, seg_alloc(), 0L, NULL, FALSE, TRUE,
+				  ofmt, report_error);
+		}
 		break;
 	      case 3:	       /* [BITS bits] */
 		switch (atoi(value)) {
@@ -452,39 +497,87 @@ static void assemble_file (char *fname) {
 		    break;
 		}
 		break;
-	      case 4:	       /* [GLOBAL symbol] */
+	      case 4:	       /* [GLOBAL symbol:special] */
 		if (*value == '$')
 		    value++;	       /* skip initial $ if present */
-		declare_as_global (value, report_error);
+		q = value;
+		validid = TRUE;
+		if (!isidstart(*q))
+		    validid = FALSE;
+		while (*q && *q != ':') {
+		    if (!isidchar(*q))
+			validid = FALSE;
+		    q++;
+		}
+		if (!validid) {
+		    report_error (ERR_NONFATAL,
+				  "identifier expected after GLOBAL");
+		    break;
+		}
+		if (*q == ':') {
+		    *q++ = '\0';
+		    special = q;
+		} else
+		    special = NULL;
+		declare_as_global (value, special, report_error);
 		break;
-	      case 5:	       /* [COMMON symbol size] */
+	      case 5:	       /* [COMMON symbol size:special] */
 		p = value;
-		while (*p && !isspace(*p))
+		validid = TRUE;
+		if (!isidstart(*p))
+		    validid = FALSE;
+		while (*p && !isspace(*p)) {
+		    if (!isidchar(*p))
+			validid = FALSE;
 		    p++;
+		}
+		if (!validid) {
+		    report_error (ERR_NONFATAL,
+				  "identifier expected after COMMON");
+		    break;
+		}
 		if (*p) {
 		    long size;
 
 		    while (*p && isspace(*p))
 			*p++ = '\0';
+		    q = p;
+		    while (*q && *q != ':')
+			q++;
+		    if (*q == ':') {
+			*q++ = '\0';
+			special = q;
+		    } else
+			special = NULL;
 		    size = readnum (p, &rn_error);
 		    if (rn_error)
 			report_error (ERR_NONFATAL, "invalid size specified"
 				      " in COMMON declaration");
 		    else
 			define_common (value, seg_alloc(), size,
-				       ofmt, report_error);
+				       special, ofmt, report_error);
 		} else
 		    report_error (ERR_NONFATAL, "no size specified in"
 				  " COMMON declaration");
 		break;
 	      case 6:		       /* [ABSOLUTE address] */
 		current_seg = NO_SEG;
-		abs_offset = readnum(value, &rn_error);
-		if (rn_error) {
-		    report_error (ERR_NONFATAL, "invalid address specified"
-				  " for ABSOLUTE directive");
+		stdscan_reset();
+		stdscan_bufptr = value;
+		tokval.t_type = TOKEN_INVALID;
+		e = evaluate(stdscan, NULL, &tokval, NULL, 1, report_error,
+			     NULL);
+		if (e) {
+		    if (!is_reloc(e))
+			report_error (ERR_NONFATAL, "cannot use non-"
+				      "relocatable expression as ABSOLUTE"
+				      " address");
+		    else {
+			abs_seg = reloc_seg(e);
+			abs_offset = reloc_value(e);
+		    }
+		} else
 		    abs_offset = 0x100;/* don't go near zero in case of / */
-		}
 		break;
 	      default:
 		if (!ofmt->directive (line+1, value, 1))
@@ -493,9 +586,8 @@ static void assemble_file (char *fname) {
 		break;
 	    }
 	} else {
-	    long offs = get_curr_ofs;
-	    parse_line (current_seg, offs, lookup_label,
-			1, line, &output_ins, ofmt, report_error);
+	    parse_line (1, line, &output_ins,
+			report_error, evaluate, eval_info);
 	    if (output_ins.forw_ref)
 		*(int *)saa_wstruct(forwrefs) = globallineno;
 
@@ -535,7 +627,7 @@ static void assemble_file (char *fname) {
 			define_label (output_ins.label,
 				      output_ins.oprs[0].segment,
 				      output_ins.oprs[0].offset,
-				      ofmt, report_error);
+				      NULL, FALSE, FALSE, ofmt, report_error);
 		    } else if (output_ins.operands == 2 &&
 			       (output_ins.oprs[0].type & IMMEDIATE) &&
 			       (output_ins.oprs[0].type & COLON) &&
@@ -547,15 +639,15 @@ static void assemble_file (char *fname) {
 			define_label (output_ins.label,
 				      output_ins.oprs[0].offset | SEG_ABS,
 				      output_ins.oprs[1].offset,
-				      ofmt, report_error);
+				      NULL, FALSE, FALSE, ofmt, report_error);
 		    } else
 			report_error(ERR_NONFATAL, "bad syntax for EQU");
 		}
 	    } else {
 		if (output_ins.label)
 		    define_label (output_ins.label,
-				  current_seg, offs,
-				  ofmt, report_error);
+				  current_seg==NO_SEG ? abs_seg : current_seg,
+				  offs, NULL, TRUE, FALSE, ofmt, report_error);
 		offs += insn_size (current_seg, offs, sb,
 				   &output_ins, report_error);
 		set_curr_ofs (offs);
@@ -563,6 +655,8 @@ static void assemble_file (char *fname) {
 	    cleanup_insn (&output_ins);
 	}
 	nasm_free (line);
+	offs = get_curr_ofs;
+	eval_info (NULL, current_seg, offs);   /* set $ */
     }
     preproc->cleanup();
 
@@ -589,11 +683,13 @@ static void assemble_file (char *fname) {
     current_seg = ofmt->section(NULL, pass, &sb);
     raa_free (offsets);
     offsets = raa_init();
-    preproc->reset(fname, report_error, &nasmlist);
+    preproc->reset(fname, 2, report_error, evaluate, &nasmlist);
     strcpy(currentfile,fname);
     lineno = 0;
     lineinc = 1;
     globallineno = 0;
+    offs = get_curr_ofs;
+    eval_info (NULL, current_seg, offs);   /* set $ */
     while ( (line = preproc->getline()) ) {
 	lineno += lineinc;
 	globallineno++;
@@ -619,7 +715,6 @@ static void assemble_file (char *fname) {
 
 	/* here we parse our directives; this is not handled by
 	 * the 'real' parser. */
-
 	if ( (i = getkw (line, &value)) ) {
 	    switch (i) {
 	      case 1:	       /* [SEGMENT n] */
@@ -631,6 +726,13 @@ static void assemble_file (char *fname) {
 		    current_seg = seg;
 		break;
 	      case 2:	       /* [EXTERN label] */
+		q = value;
+		while (*q && *q != ':')
+		    q++;
+		if (*q == ':') {
+		    *q++ = '\0';
+		    ofmt->symdef(value, 0L, 0L, 3, q);
+		}
 		break;
 	      case 3:	       /* [BITS bits] */
 		switch (atoi(value)) {
@@ -646,13 +748,42 @@ static void assemble_file (char *fname) {
 		}
 		break;
 	      case 4:		       /* [GLOBAL symbol] */
+		q = value;
+		while (*q && *q != ':')
+		    q++;
+		if (*q == ':') {
+		    *q++ = '\0';
+		    ofmt->symdef(value, 0L, 0L, 3, q);
+		}
 		break;
 	      case 5:		       /* [COMMON symbol size] */
+		q = value;
+		while (*q && *q != ':') {
+		    if (isspace(*q))
+			*q = '\0';
+		    q++;
+		}
+		if (*q == ':') {
+		    *q++ = '\0';
+		    ofmt->symdef(value, 0L, 0L, 3, q);
+		}
 		break;
 	      case 6:		       /* [ABSOLUTE addr] */
 		current_seg = NO_SEG;
-		abs_offset = readnum(value, &rn_error);
-		if (rn_error)
+		stdscan_reset();
+		stdscan_bufptr = value;
+		tokval.t_type = TOKEN_INVALID;
+		e = evaluate(stdscan, NULL, &tokval, NULL, 2, report_error,
+			     NULL);
+		if (e) {
+		    if (!is_reloc(e))
+			report_error (ERR_PANIC, "non-reloc ABSOLUTE address"
+				      " in pass two");
+		    else {
+			abs_seg = reloc_seg(e);
+			abs_offset = reloc_value(e);
+		    }
+		} else
 		    report_error (ERR_PANIC, "invalid ABSOLUTE address "
 				  "in pass two");
 		break;
@@ -662,9 +793,8 @@ static void assemble_file (char *fname) {
 		break;
 	    }
 	} else {
-	    long offs = get_curr_ofs;
-	    parse_line (current_seg, offs, lookup_label, 2,
-			line, &output_ins, ofmt, report_error);
+	    parse_line (2, line, &output_ins,
+			report_error, evaluate, eval_info);
 	    if (globallineno == forwline) {
 		int *p = saa_rstruct (forwrefs);
 		if (p)
@@ -702,7 +832,7 @@ static void assemble_file (char *fname) {
 			define_label (output_ins.label,
 				      output_ins.oprs[0].segment,
 				      output_ins.oprs[0].offset,
-				      ofmt, report_error);
+				      NULL, FALSE, FALSE, ofmt, report_error);
 		    } else if (output_ins.operands == 2 &&
 			       (output_ins.oprs[0].type & IMMEDIATE) &&
 			       (output_ins.oprs[0].type & COLON) &&
@@ -712,7 +842,7 @@ static void assemble_file (char *fname) {
 			define_label (output_ins.label,
 				      output_ins.oprs[0].offset | SEG_ABS,
 				      output_ins.oprs[1].offset,
-				      ofmt, report_error);
+				      NULL, FALSE, FALSE, ofmt, report_error);
 		    } else
 			report_error(ERR_NONFATAL, "bad syntax for EQU");
 		}
@@ -723,6 +853,9 @@ static void assemble_file (char *fname) {
 	    set_curr_ofs (offs);
 	}
 	nasm_free (line);
+
+	offs = get_curr_ofs;
+	eval_info (NULL, current_seg, offs);   /* set $ */
     }
     preproc->cleanup();
     nasmlist.cleanup();
@@ -786,6 +919,12 @@ static void report_error (int severity, char *fmt, ...) {
 	suppressed[ (severity & ERR_WARN_MASK) >> ERR_WARN_SHR ])
 	return;			       /* and bail out if so */
 
+    /*
+     * See if it's a pass-one only warning and we're not in pass one.
+     */
+    if ((severity & ERR_PASS1) && pass != 1)
+	return;
+
     if (severity & ERR_NOFILE)
 	fputs ("nasm: ", use_stdout ? stdout : stderr);
     else
@@ -839,6 +978,9 @@ static void register_output_formats(void) {
 #ifdef OF_AOUT
     extern struct ofmt of_aout;
 #endif
+#ifdef OF_AOUTB
+    extern struct ofmt of_aoutb;
+#endif
 #ifdef OF_COFF
     extern struct ofmt of_coff;
 #endif
@@ -856,9 +998,6 @@ static void register_output_formats(void) {
 #ifdef OF_WIN32
     extern struct ofmt of_win32;
 #endif
-#ifdef OF_OS2
-    extern struct ofmt of_os2;
-#endif
 #ifdef OF_RDF
     extern struct ofmt of_rdf;
 #endif
@@ -872,6 +1011,9 @@ static void register_output_formats(void) {
 #ifdef OF_AOUT
     ofmt_register (&of_aout);
 #endif
+#ifdef OF_AOUTB
+    ofmt_register (&of_aoutb);
+#endif
 #ifdef OF_COFF
     ofmt_register (&of_coff);
 #endif
@@ -887,9 +1029,6 @@ static void register_output_formats(void) {
 #ifdef OF_WIN32
     ofmt_register (&of_win32);
 #endif
-#ifdef OF_OS2
-    ofmt_register (&of_os2);
-#endif
 #ifdef OF_RDF
     ofmt_register (&of_rdf);
 #endif
@@ -906,14 +1045,18 @@ static void register_output_formats(void) {
 
 static FILE *no_pp_fp;
 static efunc no_pp_err;
+static ListGen *no_pp_list;
 
-static void no_pp_reset (char *file, efunc error, ListGen *listgen) {
+static void no_pp_reset (char *file, int pass, efunc error, evalfunc eval,
+			 ListGen *listgen) {	/* open `file' for reading; remember error and listing hooks */
     no_pp_err = error;
     no_pp_fp = fopen(file, "r");
     if (!no_pp_fp)
 	no_pp_err (ERR_FATAL | ERR_NOFILE,
 		   "unable to open input file `%s'", file);
-    (void) listgen;		       /* placate compilers */
+    no_pp_list = listgen;	       /* saved so no_pp_getline can hand each line read to the listing generator */
+    (void) pass;		       /* placate compilers */
+    (void) eval;		       /* placate compilers */
 }
 
 static char *no_pp_getline (void) {
@@ -954,6 +1097,8 @@ static char *no_pp_getline (void) {
      */
     buffer[strcspn(buffer, "\032")] = '\0';
 
+    no_pp_list->line (LIST_READ, buffer);
+
     return buffer;
 }
 
diff --git a/nasm.doc b/nasm.doc
deleted file mode 100644
index 264d5ba7..00000000
--- a/nasm.doc
+++ /dev/null
@@ -1,1769 +0,0 @@
-		     The Netwide Assembler, NASM
-		     ===========================
-
-Introduction
-============
-
-The Netwide Assembler grew out of an idea on comp.lang.asm.x86 (or
-possibly alt.lang.asm, I forget which), which was essentially that
-there didn't seem to be a good free x86-series assembler around, and
-that maybe someone ought to write one.
-
-- A86 is good, but not free, and in particular you don't get any
-  32-bit capability until you pay. It's DOS only, too.
-
-- GAS is free, and ports over DOS/Unix, but it's not very good,
-  since it's designed to be a back end to gcc, which always feeds it
-  correct code. So its error checking is minimal. Also its syntax is
-  horrible, from the point of view of anyone trying to actually
-  _write_ anything in it. Plus you can't write 16-bit code in it
-  (properly).
-
-- AS86 is Linux specific, and (my version at least) doesn't seem to
-  have much (or any) documentation.
-
-- MASM isn't very good. And it's expensive. And it runs only under
-  DOS.
-
-- TASM is better, but still strives for MASM compatibility, which
-  means millions of directives and tons of red tape. And its syntax
-  is essentially MASM's, with the contradictions and quirks that
-  entails (although it sorts out some of those by means of Ideal
-  mode). It's expensive too. And it's DOS only.
-
-So here, for your coding pleasure, is NASM. At present it's still in
-prototype stage - we don't promise that it can outperform any of
-these assemblers. But please, _please_ send us bug reports, fixes,
-helpful information, and anything else you can get your hands on
-(and thanks to the many people who've done this already! You all
-know who you are), and we'll improve it out of all recognition.
-Again.
-
-Please see the file `Licence' for the legalese.
-
-Getting Started: Installation
-=============================
-
-NASM is distributed in source form, in what we hope is totally
-ANSI-compliant C. It uses no non-portable code at all, that we know
-of. It ought to compile without change on any system you care to try
-it on. We also supply a pre-compiled 16-bit DOS binary.
-
-To install it, edit the Makefile to describe your C compiler, and
-type `make'. Then copy the binary to somewhere on your path. That's
-all - NASM relies on no files other than its own executable.
-Although if you're on a Unix system, you may also want to install
-the NASM manpage (`nasm.1'). You may also want to install the binary
-and manpage for the Netwide Disassembler, NDISASM (also see
-`ndisasm.doc').
-
-Running NASM
-============
-
-To assemble a file, you issue a command of the form
-
-	nasm -f <format> <filename> [-o <output>]
-
-For example,
-
-	nasm -f elf myfile.asm
-
-will assemble `myfile.asm' into an ELF object file `myfile.o'. And
-
-	nasm -f bin myfile.asm -o myfile.com
-
-will assemble `myfile.asm' into a raw binary program `myfile.com'.
-
-To produce a listing file, with the hex codes output from NASM
-displayed on the left of the original sources, use `-l' to give a
-listing file name, for example:
-
-	nasm -f coff myfile.asm -l myfile.lst
-
-To get further usage instructions from NASM, try typing `nasm -h'.
-This will also list the available output file formats, and what they
-are.
-
-If you use Linux but aren't sure whether your system is a.out or
-ELF, type `file /usr/bin/nasm' or wherever you put the NASM binary.
-If it says something like
-
-/usr/bin/nasm: ELF 32-bit LSB executable i386 (386 and up) Version 1
-
-then your system is ELF, and you should use `-f elf' when you want
-NASM to produce Linux object files. If it says
-
-/usr/bin/nasm: Linux/i386 demand-paged executable (QMAGIC)
-
-or something similar, your system is a.out, and you should use `-f
-aout' instead.
-
-Like Unix compilers and assemblers, NASM is silent unless it goes
-wrong: you won't see any output at all, unless it gives error
-messages.
-
-If you define an environment variable called NASM, the program will
-interpret it as a list of extra command-line options, processed
-before the real command line. This is probably most useful for
-defining an include-file search path by putting a lot of `-i'
-options in the NASM variable.
-
-The variable's value will be considered to be a space-separated list
-of options unless it begins with something other than a minus sign,
-in which case the first character will be taken as the separator.
-For example, if you want to define a macro whose value has a space
-in it, then setting the NASM variable to `-dNAME="my name"' won't
-work because the string will be split at the space into `-dNAME="my'
-and `name"', but setting it to `|-dNAME="my name"' will be fine
-because all further operands will be considered to be separated by
-vertical bars and so the space has no special meaning.	
-
-Quick Start for MASM Users
-==========================
-
-If you're used to writing programs with MASM, or with TASM in
-MASM-compatible (non-Ideal) mode, or with A86, this section attempts
-to outline the major differences between MASM's syntax and NASM's.
-If you're not already used to MASM, it's probably worth skipping
-this section.
-
-One simple difference is that NASM is case-sensitive. It makes a
-difference whether you call your label `foo', `Foo' or `FOO'. If
-you're assembling to the `obj' MS-DOS output format (or `os2'), you
-can invoke the `UPPERCASE' directive (documented below, in the
-Output Formats section) and ensure that all symbols exported to
-other code modules are forced to uppercase; but even then, _within_
-a single module, NASM will distinguish between labels differing only
-in case.
-
-There are also differences in some of the instructions and register
-names: for example, NASM calls the floating-point stack registers
-`st0', `st1' and so on, rather than MASM's `ST(0)' notation or A86's
-simple numeric `0'. And NASM doesn't support LODS, MOVS, STOS, SCAS,
-CMPS, INS, or OUTS, but only supports the size-specified versions
-LODSB, MOVSW, SCASD and so on.
-
-The _major_ difference, though, is the absence in NASM of variable
-typing. MASM will notice when you declare a variable as `var dw 0',
-and will remember that `var' is a WORD-type variable, so that
-instructions such as `mov var,2' can be unambiguously given the WORD
-size rather than BYTE or DWORD. NASM doesn't and won't do this. The
-statement `var dw 0' merely defines `var' to be a label marking a
-point in memory: no more and no less. It so happens that there are
-two bytes of data following that point in memory before the next
-line of code, but NASM doesn't remember or care. If you want to
-store the number 2 in such a variable, you must specify the size of
-the operation _always_: `mov word [var],2'. This is a deliberate
-design decision, _not_ a bug, so please could people not send us
-mail asking us to `fix' it...
-
-The above example also illustrates another important difference
-between MASM and NASM syntax: the use of OFFSET and of square
-brackets. In MASM, declaring `var dw 0' entitles you to code `mov
-ax,var' to get at the _contents_ of the variable, and you must write
-`mov ax,offset var' to get the _address_ of the variable. In NASM,
-`mov ax,var' gives you the address, and to get at the contents you
-must code `mov ax,[var]'. Again, this is a deliberate design
-decision, since it brings consistency to the syntax: `mov ax,[var]'
-and `mov ax,[bx]' both refer to the contents of memory and both have
-square brackets, whereas neither `mov ax,bx' nor `mov ax,var' refers
-to memory contents and so neither one has square brackets.
-
-This is even more confusing in A86, where declaring a label with a
-trailing colon defines it to be a `label' as opposed to a `variable'
-and causes A86 to adopt NASM-style semantics; so in A86, `mov
-ax,var' has different behaviour depending on whether `var' was
-declared as `var: dw 0' or `var dw 0'. NASM is very simple by
-comparison: _everything_ is a label. The OFFSET keyword is not
-required, and in fact constitutes a syntax error (though you can
-code `%define offset' to suppress the error messages if you want),
-and `var' always refers to the _address_ of the label whereas
-`[var]' refers to the _contents_.
-
-As an addendum to this point of syntax, it's also worth noting that
-the hybrid-style syntaxes supported by MASM and its clones, such as
-`mov ax,table[bx]', where a memory reference is denoted by one
-portion outside square brackets and another portion inside, are also
-not supported by NASM. The correct syntax for the above is `mov
-ax,[table+bx]'. Likewise, `mov ax,es:[di]' is wrong and `mov
-ax,[es:di]' is right.
-
-Writing Programs with NASM
-==========================
-
-Each line of a NASM source file should contain some combination of
-the four fields
-
-LABEL:	INSTRUCTION	OPERANDS	; COMMENT
-
-`LABEL' defines a label pointing to that point in the source. There
-are no restrictions on white space: labels may have white space
-before them, or not, as you please. The colon after the label is
-also optional. (Note that NASM can be made to give a warning when it
-sees a label which is the only thing on a line with no trailing
-colon, on the grounds that such a label might easily be a mistyped
-instruction name. The command line option `-w+orphan-labels' will
-enable this feature.)
-
-Valid characters in labels are letters, numbers, `_', `$', `#', `@',
-`~', `?', and `.'. The only characters which may be used as the
-_first_ character of an identifier are letters, `_' and `?', and
-(with special meaning: see `Local Labels') `.'. An identifier may
-also be prefixed with a $ sign to indicate that it is intended to be
-read as an identifier and not a reserved word; thus, if some other
-module you are linking with defines a symbol `eax', you can refer to
-`$eax' in NASM code to distinguish it from the register name.
-
-`INSTRUCTION' can be any machine opcode (Pentium and P6 opcodes, FPU
-opcodes, MMX opcodes and even undocumented opcodes are all
-supported). The instruction may be prefixed by LOCK, REP, REPE/REPZ
-or REPNE/REPNZ, in the usual way. Explicit address-size and operand-
-size prefixes A16, A32, O16 and O32 are provided - one example of
-their use is given in the `Unusual Instruction Sizes' section below.
-You can also use a segment register as a prefix: coding `es mov
-[bx],ax' is equivalent to coding `mov [es:bx],ax'. We recommend the
-latter syntax, since it is consistent with other syntactic features
-of the language, but for instructions such as `lodsb' there isn't
-anywhere to put a segment override except as a prefix. This is why
-we support it.
-
-The `INSTRUCTION' field may also contain some pseudo-opcodes: see
-the section on pseudo-opcodes for details.
-
-`OPERANDS' can be nonexistent, or huge, depending on the
-instruction, of course. When operands are registers, they are given
-simply as register names: `eax', `ss', `di' for example. NASM does
-_not_ use the GAS syntax, in which register names are prefixed by a
-`%' sign. Operands may also be effective addresses, or they may be
-constants or expressions. See the separate sections on these for
-details.
-
-`COMMENT' is anything after the first semicolon on the line,
-excluding semicolons inside quoted strings.
-
-Of course, all these fields are optional: the presence or absence of
-the OPERANDS field is required by the nature of the INSTRUCTION
-field, but any line may contain a LABEL or not, may contain an
-INSTRUCTION or not, and may contain a COMMENT or not, independently
-of each other.
-
-Lines may also contain nothing but a directive: see `Assembler
-Directives' below for details.
-
-NASM can currently not handle any line longer than 1024 characters.
-This may be fixed in a future release.
-
-Floating Point Instructions
-===========================
-
-NASM has support for assembling FPU opcodes. However, its syntax is
-not necessarily the same as anyone else's.
-
-NASM uses the notation `st0', `st1', etc. to denote the FPU stack
-registers. NASM also accepts a wide range of single-operand and
-two-operand forms of the instructions. For people who wish to use
-the single-operand form exclusively (this is in fact the `canonical'
-form from NASM's point of view, in that it is the form produced by
-the Netwide Disassembler), there is a TO keyword which makes
-available the opcodes which cannot be so easily accessed by one
-operand. Hence:
-
-	fadd st1	; this sets st0 := st0 + st1
-	fadd st0,st1	; so does this
-	fadd st1,st0	; this sets st1 := st1 + st0
-	fadd to st1	; so does this
-
-It's also worth noting that the FPU instructions that reference
-memory must use the prefixes DWORD, QWORD or TWORD to indicate what
-size of memory operand they refer to.
-
-NASM, in keeping with our policy of not trying to second-guess the
-programmer, will _never_ automatically insert WAIT instructions into
-your code stream. You must code WAIT yourself before _any_
-instruction that needs it. (Of course, on 286 processors or above,
-it isn't needed anyway...)
-
-NASM supports specification of floating point constants by means of
-`dd' (single precision), `dq' (double precision) and `dt' (extended
-precision). Floating-point _arithmetic_ is not done, due to
-portability constraints (not all platforms on which NASM can be run
-support the same floating point types), but simple constants can be
-specified. For example:
-
-gamma	dq 0.5772156649		; Euler's constant
-
-Pseudo-Opcodes
-==============
-
-Pseudo-opcodes are not real x86 machine opcodes, but are used in the
-instruction field anyway because that's the most convenient place to
-put them. The current pseudo-opcodes are DB, DW, DD, DQ and DT,
-their uninitialised counterparts RESB, RESW, RESD, RESQ and REST,
-the INCBIN command, the EQU command, and the TIMES prefix.
-
-DB, DW, DD, DQ and DT work as you would expect: they can each take
-an arbitrary number of operands, and when assembled, they generate
-nothing but those operands. All three of them can take string
-constants as operands. See the `Constants' section for details about
-string constants.
-
-RESB, RESW, RESD, RESQ and REST are designed to be used in the BSS
-section of a module: they declare _uninitialised_ storage space.
-Each takes a single operand, which is the number of bytes, words or
-doublewords to reserve. We do not support the MASM/TASM syntax of
-reserving uninitialised space by writing `DW ?' or similar: this is
-what we do instead. (But see `Critical Expressions' for a caveat on
-the nature of the operand.)
-
-(An aside: if you want to be able to write `DW ?' and have something
-vaguely useful happen, you can always code `? EQU 0'...)
-
-INCBIN is borrowed from the old Amiga assembler Devpac: it includes
-a binary file verbatim into the output file. This can be handy for
-(for example) including graphics and sound data directly into a game
-executable file. It can be called in one of these three ways:
-
-    INCBIN "file.dat"               ; include the whole file
-    INCBIN "file.dat",1024          ; skip the first 1024 bytes
-    INCBIN "file.dat",1024,512      ; skip the first 1024, and
-                                    ; actually include at most 512
-
-EQU defines a symbol to a specified value: when EQU is used, the
-LABEL field must be present. The action of EQU is to define the
-given label name to the value of its (only) operand. This definition
-is absolute, and cannot change later. So, for example,
-
-message db 'hello, world'
-msglen  equ $-message
-
-defines `msglen' to be the constant 12. `msglen' may not then be
-redefined later. This is not a preprocessor definition either: the
-value of `msglen' is evaluated _once_, using the value of `$' (see
-the section `Expressions' for details of `$') at the point of
-definition, rather than being evaluated wherever it is referenced
-and using the value of `$' at the point of reference. Note that the
-caveat in `Critical Expressions' applies to EQU too, at the moment.
-
-Finally, the TIMES prefix causes the instruction to be assembled
-multiple times. This is partly NASM's equivalent of the DUP syntax
-supported by MASM-compatible assemblers, in that one can do
-
-zerobuf: times 64 db 0
-
-or similar, but TIMES is more versatile than that. TIMES takes not
-just a numeric constant, but a numeric _expression_, so one can do
-things like
-
-buffer: db 'hello, world'
-        times 64-$+buffer db ' '
-
-which will store exactly enough spaces to make the total length of
-`buffer' up to 64. (See the section `Critical Expressions' for a
-caveat on the use of TIMES.) Finally, TIMES can be applied to
-ordinary opcodes, so you can code trivial unrolled loops in it:
-
-	times 100 movsb
-
-Note that there is no effective difference between `times 100 resb
-1' and `resb 100', except that the latter will be assembled about
-100 times faster due to the internal structure of the assembler.
-
-Note also that TIMES can't be applied to macros: the reason for this
-is that TIMES is processed after the macro phase, which allows the
-argument to TIMES to contain expressions such as `64-$+buffer' as
-above.
-
-Effective Addresses
-===================
-
-NASM's addressing scheme is very simple, although it can involve
-more typing than other assemblers. Where other assemblers
-distinguish between a _variable_ (label declared without a colon)
-and a _label_ (declared with a colon), and use different means of
-addressing the two, NASM is totally consistent.
-
-To refer to the contents of a memory location, square brackets are
-required. This applies to simple variables, computed offsets,
-segment overrides, effective addresses - _everything_. E.g.:
-
-wordvar dw 123
-	mov ax,[wordvar]
-	mov ax,[wordvar+1]
-	mov ax,[es:wordvar+bx]
-
-NASM does _not_ support the various strange syntaxes used by MASM
-and others, such as
-
-	mov ax,wordvar	; this is legal, but means something else
-	mov ax,es:wordvar[bx]	; not even slightly legal
-	es mov ax,wordvar[1]	; the prefix is OK, but not the rest
-
-If no square brackets are used, NASM interprets label references to
-mean the address of the label. Hence there is no need for MASM's
-OFFSET keyword, but
-
-	mov ax,wordvar
-
-loads AX with the _address_ of the variable `wordvar'.
-
-More complicated effective addresses are handled by enclosing them
-within square brackets as before:
-
-	mov eax,[ebp+2*edi+offset]
-	mov ax,[bx+di+8]
-
-NASM will cope with some fairly strange effective addresses, if you
-try it: provided your effective address expression evaluates
-_algebraically_ to something that the instruction set supports, it
-will be able to assemble it. For example,
-
-	mov eax,[ebx*5]		; actually assembles to [ebx+ebx*4]
-	mov ax,[bx-si+2*si]	; actually assembles to [bx+si]
-
-will both work.
-
-There is an ambiguity in the instruction set, which allows two forms
-of 32-bit effective address with equivalent meaning:
-
-	mov eax,[2*eax+0]
-	mov eax,[eax+eax]
-
-These two expressions clearly refer to the same address. The
-difference is that the first one, if assembled `as is', requires a
-four-byte offset to be stored as part of the instruction, so it
-takes up more space. NASM will generate the second (smaller) form
-for both of the above instructions, in an effort to save space.
-There is not, currently, any means for forcing NASM to generate the
-larger form of the instruction.
-
-An alternative syntax is supported, in which prefixing an operand
-with `&' is synonymous with enclosing it in square brackets. The
-square bracket syntax is the recommended one, however, and is the
-syntax generated by NDISASM. But, for example, `mov eax,&ebx+ecx' is
-equivalent to `mov eax,[ebx+ecx]'.
-
-Mixing 16 and 32 Bit Code: Unusual Instruction Sizes
-====================================================
-
-A number of assemblers seem to have trouble assembling instructions
-that use a different operand or address size from the one they are
-expecting; as86 is a good example, even though the Linux kernel boot
-process (which is assembled using as86) needs several such
-instructions and as86 can't do them.
-
-Instructions such as `mov eax,2' in 16-bit mode are easy, of course,
-and NASM can do them just as well as any other assembler. The
-difficult instructions are things like far jumps.
-
-Suppose you are in a 16-bit segment, in protected mode, and you want
-to execute a far jump to a point in a 32-bit segment. You need to
-code a 32-bit far jump in a 16-bit segment; not all assemblers will
-easily support this. NASM can, by means of the `word' and `dword'
-specifiers. So you can code
-
-	jmp 1234h:5678h	; this uses the default segment size
-	jmp word 1234h:5678h	; this is guaranteed to be 16-bit
-	jmp dword 1234h:56789ABCh ; and this is guaranteed 32-bit
-
-and NASM will generate correct code for them.
-
-Similarly, if you are coding in a 16-bit code segment, but trying to
-access memory in a 32-bit data segment, your effective addresses
-will want to be 32-bit. Of course as soon as you specify an
-effective address containing a 32-bit register, like `[eax]', the
-addressing is forced to be 32-bit anyway. But if you try to specify
-a simple offset, such as `[label]' or `[0x10000]', you will get the
-default address size, which in this case will be wrong. However,
-NASM allows you to code `[dword 0x10000]' to force a 32-bit address
-size, or conversely `[word wlabel]' to force 16 bits.
-
-Be careful not to confuse `word' and `dword' _inside_ the square
-brackets with _outside_: consider the instruction
-
-	mov word [dword 0x123456],0x7890
-
-which moves 16 bits of data to an address specified by a 32-bit
-offset. There is no contradiction between the `word' and `dword' in
-this instruction, since they modify different aspects of the
-functionality. Or, even more confusingly,
-
-	call dword far [fs:word 0x4321]
-
-which takes an address specified by a 16-bit offset, and extracts a
-48-bit DWORD FAR pointer from it to call.
-
-Using this effective-address syntax, the `dword' or `word' override
-may come before or after the segment override if any: NASM isn't
-fussy. Hence:
-
-	mov ax,[fs:dword 0x123456]
-	mov ax,[dword fs:0x123456]
-
-are equivalent forms, and generate the same code.
-
-The LOOP instruction comes in strange sizes, too: in a 16-bit
-segment it uses CX as its count register by default, and in a 32-bit
-segment it uses ECX. But it's possible to do either one in the other
-segment, and NASM will cope by letting you specify the count
-register as a second operand:
-
-	loop label		; uses CX or ECX depending on mode
-	loop label,cx		; always uses CX
-	loop label,ecx		; always uses ECX
-
-Finally, the string instructions LODSB, STOSB, MOVSB, CMPSB, SCASB,
-INSB, and OUTSB can all have strange address sizes: typically, in a
-16-bit segment they read from [DS:SI] and write to [ES:DI], and in a
-32-bit segment they read from [DS:ESI] and write to [ES:EDI].
-However, this can be changed by the use of the explicit address-size
-prefixes `a16' and `a32'. These prefixes generate null code if used
-in the same size segment as they specify, but generate an 0x67
-prefix otherwise. Hence `a16' generates no code in a 16-bit segment,
-but 0x67 in a 32-bit one, and vice versa. So `a16 lodsb' will always
-generate code to read a byte from [DS:SI], no matter what the size
-of the segment. There are also explicit operand-size override
-prefixes, `o16' and `o32', which will optionally generate 0x66
-bytes, but these are provided for completeness and should never have
-to be used. (Note that NASM does not support the LODS, STOS, MOVS
-etc. forms of the string instructions.)
-
-Constants
-=========
-
-NASM can accept three kinds of constant: _numeric_, _character_ and
-_string_ constants.
-
-Numeric constants are simply numbers. NASM supports a variety of
-syntaxes for expressing numbers in strange bases: you can do any of
-
-	100	; this is decimal
-	0x100	; hex
-	100h	; hex as well
-	$100	; hex again
-	100q	; octal
-	100b	; binary
-
-NASM does not support A86's syntax of treating anything with a
-leading zero as hex, nor does it support the C syntax of treating
-anything with a leading zero as octal. Leading zeros make no
-difference to NASM. (Except that, as usual, if you have a hex
-constant beginning with a letter, and you want to use the trailing-H
-syntax to represent it, you have to use a leading zero so that NASM
-will recognise it as a number instead of a label.)
-
-The `x' in `0x100', and the trailing `h', `q' and `b', may all be
-upper case if you want.
-
-Character constants consist of up to four characters enclosed in
-single or double quotes. No escape character is defined for
-including the quote character itself: if you want to declare a
-character constant containing a double quote, enclose it in single
-quotes, and vice versa.
-
-Character constants' values are worked out in terms of a
-little-endian computer: if you code
-
-	mov eax,'abcd'
-
-then if you were to examine the binary output from NASM, it would
-contain the visible string `abcd', which of course means that the
-actual value loaded into EAX would be 0x64636261, not 0x61626364.
-
-String constants are like character constants, only more so: if a
-character constant appearing as operand to a DB, DW or DD is longer
-than the word size involved (1, 2 or 4 respectively), it will be
-treated as a string constant instead, which is to say the
-concatenation of separate character constants.
-
-For example,
-
-	db 'hello, world'
-
-declares a twelve-character string constant. And
-
-	dd 'dontpanic'
-
-(a string constant) is equivalent to writing
-
-	dd 'dont','pani','c'
-
-(three character constants), so that what actually gets assembled is
-equivalent to
-
-	db 'dontpanic',0,0,0
-
-(It's worth noting that one of the reasons for the reversal of
-character constants is so that the instruction `dw "ab"' has the
-same meaning whether "ab" is treated as a character constant or a
-string constant. Hence there is less confusion.)
-
-Expressions
-===========
-
-Expressions in NASM can be formed of the following operators: `|'
-(bitwise OR), `^' (bitwise XOR), `&' (bitwise AND), `<<' and `>>'
-(logical bit shifts), `+', `-', `*' (ordinary addition, subtraction
-and multiplication), `/', `%' (unsigned division and modulo), `//',
-`%%' (signed division and modulo), `~' (bitwise NOT), and the
-operators SEG and WRT (see `SEG and WRT' below).
-
-The order of precedence is:
-
-|			lowest
-^
-&
-<< >>
-binary + and -
-* / % // %%
-unary + and -, ~, SEG	highest
-
-As usual, operators within a precedence level associate to the left
-(i.e. `2-3-4' evaluates the same way as `(2-3)-4').
-
-Note that since the `%' character is used by the preprocessor, it's
-worth making sure that the `%' and `%%' operators are followed by a
-space, to prevent the preprocessor trying to interpret them as
-macro-related things.
-
-A form of algebra is done by NASM when evaluating expressions: I
-have already stated that an effective address expression such as
-`[EAX*6-EAX]' will be recognised by NASM as algebraically equivalent
-to `[EAX*4+EAX]', and assembled as such. In addition, algebra can be
-done on labels as well: `label2*2-label1' is an acceptable way to
-define an address as far beyond `label2' as `label1' is before it.
-(In less algebraically capable assemblers, one might have to write
-that as `label2 + (label2-label1)', where the value of every
-sub-expression is either a valid address or a constant. NASM can of
-course cope with that version as well.)
-
-Expressions may also contain the special token `$', known as a Here
-token, which always evaluates to the address of the current assembly
-point. (That is, the address of the assembly point _before_ the
-current instruction gets assembled.) The special token `$$'
-evaluates to the address of the beginning of the current section;
-this can be used for alignment, as shown below:
-
-	times ($$-$) & 3 nop	; pad with NOPs to 4-byte boundary
-
-Note that this technique aligns to a four-byte boundary with respect
-to the beginning of the _segment_; if you can't guarantee that the
-segment itself begins on a four-byte boundary, this alignment is
-useless or worse. Be sure you know what kind of alignment you can
-guarantee to get out of your linker before you start trying to use
-TIMES to align to page boundaries. (Of course, the `obj' and `os2'
-file formats can happily cope with page alignment, provided you
-specify that segment attribute.)
-
-SEG and WRT
-===========
-
-NASM contains the capability for its object file formats (currently,
-only `obj' and its variant `os2' make use of this) to permit
-programs to directly refer to the segment-base values of their
-segments. This is achieved either by the object format defining the
-segment names as symbols (`obj' and `os2' do this), or by the use of
-the SEG operator.
-
-SEG is a unary prefix operator which, when applied to a symbol
-defined in a segment, will yield the segment base value of that
-segment. (In `obj' and `os2' format, symbols defined in segments
-which are grouped are considered to be primarily a member of the
-_group_, not the segment, and the return value of SEG reflects
-this.)
-
-SEG may be used for far pointers: it is guaranteed that for any
-symbol `sym', using the offset `sym' from the segment base `SEG sym'
-yields a correct pointer to the symbol. Hence you can code a far
-call by means of
-
-	CALL SEG routine:routine
-
-or store a far pointer in a data segment by
-
-	DW routine, SEG routine
-
-For convenience, NASM supports the forms
-
-	CALL FAR routine
-	JMP FAR routine
-
-as direct synonyms for the canonical syntax
-
-	CALL SEG routine:routine
-	JMP SEG routine:routine
-
-No alternative syntax for
-
-	DW routine, SEG routine
-
-is supported.
-
-Simply referring to `sym', for some symbol, will return the offset
-of `sym' from its _preferred_ segment base (as returned from `SEG
-sym'); sometimes, you may want to obtain the offset of `sym' from
-some _other_ segment base. (E.g. the offset of `sym' from the base
-of the segment it's in, where normally you'd get the offset from a
-group base). This is accomplished using the WRT (With Reference To)
-keyword: if `sym' is defined in segment `seg' but you want its
-offset relative to the beginning of segment `seg2', you can do
-
-	mov ax,sym WRT seg2
-
-The right-hand operand to WRT must be a segment-base value. You can
-also do `sym WRT SEG sym2' if you need to.
-
-Critical Expressions
-====================
-
-NASM is a two-pass assembler: it goes over the input once to
-determine the location of all the symbols, then once more to
-actually generate the output code. Most expressions are
-non-critical, in that if they contain a forward reference and hence
-their correct value is unknown during the first pass, it doesn't
-matter. However, arguments to RESB, RESW and RESD, and the argument
-to the TIMES prefix, can actually affect the _size_ of the generated
-code, and so it is critical that the expression can be evaluated
-correctly on the first pass. So in these situations, expressions may
-not contain forward references. This prevents NASM from having to
-sort out a mess such as
-
-	times (label-$) db 0
-label:	db 'where am I?'
-
-in which the TIMES argument could equally legally evaluate to
-_anything_, or perhaps even worse,
-
-	times (label-$+1) db 0
-label:	db 'NOW where am I?'
-
-in which any value for the TIMES argument is by definition invalid.
-
-Since NASM is a two-pass assembler, this criticality condition also
-applies to the argument to EQU. Suppose, if this were not the case,
-we were to have the setup
-
-  mov ax,a
-a equ b
-b:
-
-On pass one, `a' cannot be defined properly, since `b' is not known
-yet. On pass two, `b' is known, so line two can define `a' properly.
-Unfortunately, line 1 needed `a' to be defined properly, so this
-code will not assemble using only two passes.
-
-There's a related issue: in an effective address such as
-`[eax+offset]', the value of `offset' can be stored as either 1 or 4
-bytes. NASM will use the one-byte form if it knows it can, to save
-space, but will therefore be fooled by the following:
-
-  mov eax,[ebx+offset]
-offset equ 10
-
-In this case, although `offset' is a small value and could easily
-fit into the one-byte form of the instruction, when NASM sees the
-instruction in the first pass it doesn't know what `offset' is, and
-for all it knows `offset' could be a symbol requiring relocation. So
-it will allocate the full four bytes for the value of `offset'. This
-can be solved by defining `offset' before it's used.
-
-Local Labels
-============
-
-NASM takes its local label scheme mainly from the old Amiga
-assembler Devpac: a local label is one that begins with a period.
-The `localness' comes from the fact that local labels are associated
-with the previous non-local label, so that you may declare the same
-local label twice if a non-local one intervenes. Hence:
-
-label1	; some code
-.loop	; some more code
-	jne .loop
-	ret
-label2	; some code
-.loop	; some more code
-	jne .loop
-	ret
-
-In the above code, each `jne' instruction jumps to the line of code
-before it, since the `.loop' labels are distinct from each other.
-
-NASM, however, introduces an extra capability not present in Devpac,
-which is that the local labels are actually _defined_ in terms of
-their associated non-local label. So if you really have to, you can
-write
-
-label3	; some more code
-	; and some more
-	jmp label1.loop
-
-So although local labels are _usually_ local, it is possible to
-reference them from anywhere in your program, if you really have to.
-
-Assembler Directives
-====================
-
-Assembler directives appear on a line by themselves (apart from a
-comment). They come in two forms: user-level directives and
-primitive directives. Primitive directives are enclosed in square
-brackets (no white space may appear before the opening square
-bracket, although white space and a comment may come after the
-closing bracket), and were the only form of directive supported by
-earlier versions of NASM. User-level directives look the same, only
-without the square brackets, and are the more modern form. (They are
-implemented as macros expanding to primitive directives.) There is a
-distinction in functionality, which is explained below in the
-section on structures.
-
-Some directives are universal: they may be used in any situation,
-and do not change their syntax. The universal directives are listed
-below.
-
-`BITS 16' or `BITS 32' switches NASM into 16-bit or 32-bit mode.
-(This is equivalent to USE16 and USE32 segments, in TASM or MASM.)
-In 32-bit mode, instructions are prefixed with 0x66 or 0x67 prefixes
-when they use 16-bit data or addresses; in 16-bit mode, the reverse
-happens. NASM's default depends on the object format; the defaults
-are documented with the formats. (See `obj' and `os2', in
-particular, for some unusual behaviour.)
-
-`SECTION name' or `SEGMENT name' changes which section the code you
-write will be assembled into. Acceptable section names vary between
-output formats, but most formats (indeed, all formats at the moment)
-support the names `.text', `.data' and `.bss'. Note that `.bss' is
-an uninitialised data section, and so you will receive a warning
-from NASM if you try to assemble any code or data in it. The only
-thing you can do in `.bss' without triggering a warning is to use
-RESB, RESW and RESD. That's what they're for.
-
-`ABSOLUTE address' can be considered a different form of `SECTION',
-in that it must be overridden using a SECTION directive once you
-have finished using it. It is used to assemble notional code at an
-absolute offset address; of course, you can't actually assemble
-_code_ there, since no object file format is capable of putting the
-code in place, but you can use RESB, RESW and RESD, and you can
-define labels. Hence you could, for example, define a C-like data
-structure by means of
-
-	        absolute 0
-	stLong  resd 1
-	stWord  resw 1
-	stByte1 resb 1
-	stByte2 resb 1
-	st_size:
-	        segment .text
-
-and then carry on coding. This defines `stLong' to be zero, `stWord'
-to be 4, `stByte1' to be 6, `stByte2' to be 7 and `st_size' to be 8.
-So this has defined a data structure. The STRUC directive provides a
-nicer way to do this: see below.
-
-`EXTERN symbol' defines a symbol as being `external', in the C
-sense: `EXTERN' states that the symbol is _not_ defined in this
-module, but is defined elsewhere, and that you wish to _reference_
-it in this module.
-
-`GLOBAL symbol' defines a symbol as being global, in the sense that
-it is exported from this module and other modules may reference it.
-All symbols are local, unless declared as global. Note that the
-`GLOBAL' directive must appear before the definition of the symbol
-it refers to.
-
-`COMMON symbol size' defines a symbol as being common: it is
-declared to have the given size, and it is merged at link time with
-any declarations of the same symbol in other modules. This is not
-_fully_ supported in the `obj' or `os2' file format: see the section
-on `obj' for details.
-
-`STRUC structure' begins the definition of a data structure, and
-`ENDSTRUC' ends it. A structure similar to the one shown above (here
-with a single byte field followed by a 32-byte string field) may be
-defined using STRUC as follows:
-
-	        struc st
-	stLong  resd 1
-	stWord  resw 1
-	stByte  resb 1
-	stStr   resb 32
-	        endstruc
-
-Notice that this code still defines the symbol `st_size' to be the
-size of the structure. The `_size' suffix is automatically appended
-to the structure name. Notice also that the assembler takes care of
-remembering which section you were assembling in (whereas in the
-version using `ABSOLUTE' it was up to the programmer to sort that
-out).
-
-`ISTRUC structure' begins the declaration of an initialised instance
-of a data structure. You can then use the `AT' macro to assign
-values to the structure members, and `IEND' to finish. So, for
-example, given the structure `st' above:
-
-		istruc st
-		at stLong, dd 0x1234
-		at stWord, dw 23
-		at stByte, db 'q'
-		at stStr, db 'hello, world', 13, 10, 0
-		iend
-
-Note that there's nothing stopping the instruction after `at' from
-overflowing on to the next line if you want. So the above example
-could just as well have contained
-
-		at stStr, db 'hello, world'
-		db 13, 10, 0
-
-or even (if you prefer this style)
-
-		at stStr
-		db 'hello, world'
-		db 13, 10, 0
-
-Note also that the `ISTRUC' mechanism is implemented as a set of
-macros, and uses TIMES internally to achieve its effect; so the
-structure fields must be initialised in the same order as they were
-defined in.
-
-This is where user-level directives differ from primitives: the
-`SECTION' (and `SEGMENT') user-level directives don't just call the
-primitive versions, but they also `%define' the special preprocessor
-symbol `__SECT__' to be the primitive directive that specifies the
-current section. So the `ENDSTRUC' directive can remember what
-section the assembly was directed to before the structure definition
-began. For this reason, there is no primitive version of STRUC or
-ENDSTRUC - they are implemented in terms of ABSOLUTE and SECTION.
-This also means that if you use STRUC before explicitly announcing a
-target section, you should explicitly announce one after ENDSTRUC.
-
-Directives may also be specific to the output file format. At
-present, the `bin', `obj' and `os2' formats define extra directives,
-which are specified below.
-
-The Preprocessor
-================
-
-NASM contains a full-featured macro preprocessor, which supports
-conditional assembly, multi-level file inclusion, two forms of macro
-(single-line and multi-line), and a `context stack' mechanism for
-extra macro power. Preprocessor directives all begin with a `%'
-sign.
-
-Single-line macros
-------------------
-
-Single-line macros are defined in a similar way to C, using the
-`%define' command. Hence you can do:
-
-   %define ctrl 0x1F &
-   %define param(a,b) ((a)+(a)*(b))
-     mov byte [param(2,ebx)], ctrl 'D'
-
-which will expand to
-
-     mov byte [(2)+(2)*(ebx)], 0x1F & 'D'
-
-When the expansion of a single-line macro contains tokens which
-invoke another macro, the expansion is performed at invocation time,
-not at definition time. Thus the code
-
-   %define a(x) 1+b(x)
-   %define b(x) 2*x
-     mov ax,a(8)
-
-will evaluate in the expected way to `mov ax,1+2*8', even though the
-macro `b' wasn't defined at the time of definition of `a'.
-
-Macros defined with `%define' are case sensitive: after `%define foo
-bar', only `foo' will expand to `bar'; `Foo' or `FOO' will not. By
-using `%idefine' instead of `%define' (the `i' stands for
-`insensitive') you can define all the case variants of a macro at
-once, so that `%idefine foo bar' would cause `foo', `Foo' and `FOO'
-all to expand to `bar'.
-
-There is a mechanism which detects when a macro call has occurred as
-a result of a previous expansion of the same macro, to guard against
-circular references and infinite loops. If this happens, the
-preprocessor will only expand the first occurrence of the macro.
-Hence:
-
-   %define a(x) 1+a(x)
-     mov ax,a(3)        ; becomes 1+a(3) and expands no further
-
-This can be useful for doing things like this:
-
-   %macro extrn 1       ; see next section for explanation of `%macro'
-     extern _%1
-     %define %1 _%1
-   %endmacro
-
-which would avoid having to put leading underscores on external
-variables, because you could just code
-
-     extrn foo
-     mov ax,foo
-
-and it would expand as
-
-     extern _foo
-   %define foo _foo
-     mov ax,foo         ; becomes mov ax,_foo as required
-
-Single-line macros with parameters can be overloaded: it is possible
-to define two or more single-line macros with the same name, each
-taking a different number of parameters, and the macro processor
-will be able to distinguish between them. However, a parameterless
-single-line macro excludes the possibility of any macro of the same
-name _with_ parameters, and vice versa (though single-line macros
-may be redefined, keeping the same number of parameters, without
-error).
-
-You can pre-define single-line macros using the `-d' option on the
-NASM command line, such as
-
-    nasm filename -dDEBUG
-
-(and then you might have various conditional-assembly bits under
-`%ifdef DEBUG'), or possibly
-
-    nasm filename -dTYPE=4
-
-(which might allow you to re-assemble your code to do several
-different things depending on the value of TYPE).
-
-Multiple-line macros
---------------------
-
-These are defined using `%macro' and `%endmacro', so that simple things
-like this can be done:
-
-   %macro prologue 0
-     push ebp
-     mov ebp,esp
-   %endmacro
-
-This defines `prologue' to be a multi-line macro, taking no
-parameters, which expands to the two lines of code given.
-
-Similarly to single-line macros, multi-line macros are case-
-sensitive, unless you define them using `%imacro' instead of
-`%macro'.
-
-The `0' on the `%macro' line indicates that the macro `prologue'
-expects no parameters. Macros can be overloaded: if two macros are
-defined with the same name but different numbers of parameters, they
-will be treated as separate. Multi-line macros may not be redefined.
-
-The assembler will usually generate a warning if you code a line
-which looks like a macro call but involves a number of parameters
-which the macro in question isn't ready to support. (For example, if
-you code a macro `%macro foo 1' and also `%macro foo 3', then you
-write `foo a,b', a warning will be generated.) This feature can be
-disabled by the use of the command line option `-w-macro-params',
-since sometimes it's intentional (for example, you might define
-`%macro push 2' to allow you to push two registers at once; but
-`push ax' shouldn't then generate a warning).
-
-Macros taking parameters can be written using `%1', `%2' and so on
-to reference the parameters. So this code
-
-   %macro movs 2
-     push %2
-     pop %1
-   %endmacro
-     movs ds,cs
-
-will define a macro `movs' to perform an effective MOV operation
-from segment to segment register. The macro call given would of
-course expand to `push cs' followed by `pop ds'.
-
-You can define a label inside a macro in such a way as to make it
-unique to that macro call (so that repeated calls to the same macro
-won't produce multiple labels with the same name), by prefixing it
-with `%%'. So:
-
-   %macro retz 0
-     jnz %%skip
-     ret
-   %%skip:
-   %endmacro
-
-This defines a different label in place of `%%skip' every time it's
-called. (Of course the above code could have easily been coded using
-`jnz $+3', but not in more complex cases...) The actual label
-defined would be `..@2345.skip', where 2345 is replaced by some
-number that changes with each macro call. Users are warned to avoid
-defining labels of this shape themselves.
-
-Sometimes you want a macro to be able to accept arbitrarily many
-parameters and lump them into one. This can be done using the `+'
-modifier on the `%macro' line:
-
-   %macro fputs 2+
-   [section .data] ; this is done as a primitive to avoid
-                   ; disturbing the __SECT__ define
-   %%str db %2
-   %%end:
-   __SECT__        ; this expands to a whole [section xxx] primitive
-     mov dx,%%str
-     mov cx,%%end-%%str
-     mov bx,%1
-     call writefile
-   %endmacro
-     fputs [filehandle], "hi there", 13, 10
-
-This declares `fputs' to be a macro that accepts _at least two_
-parameters, and all parameters after the first one are lumped
-together as part of the last specified one (in this case %2). So in
-the macro call, `%1' expands to `[filehandle]' while `%2' expands to
-the whole remainder of the line: `"hi there", 13, 10'. Note also the
-switching of sections in the middle of this macro expansion, to
-ensure separation of data and code.
-
-There is an alternative mechanism for putting commas in macro
-parameters: instead of specifying the large-parameter-ness at macro
-definition time, you can specify it at macro call time, by the use
-of braces to surround a parameter which you want to contain commas.
-So:
-
-   %macro table_entry 2
-   %%start:
-     db %1
-     times 32-($-%%start) db 0
-     db %2
-     times 64-($-%%start) db 0
-   %endmacro
-     table_entry 'foo','bar'
-     table_entry 'megafoo', { 27,'[1mBAR!',27,'[m' }
-
-will expand to, effectively (actually, there will be labels present,
-but these have been omitted for clarity), the following:
-
-     db 'foo'
-     times 32-3 db 0
-     db 'bar'
-     times 64-35 db 0
-     db 'megafoo'
-     times 32-7 db 0
-     db 27,'[1mBAR!',27,'[m'
-     times 64-43 db 0
-
-Macro parameter expansions can be concatenated on to other tokens,
-so that you can do this:
-
-   %macro keytab_entry 2
-   keypos%1 equ $-keytab
-     db %2
-   %endmacro
-   keytab:
-     keytab_entry F1,128+1
-     keytab_entry F2,128+2
-     keytab_entry Return,13
-
-which will define labels called `keyposF1', `keyposF2' and
-`keyposReturn'. You can similarly do concatenations on the other
-end, such as `%1foo'. If you need to concatenate a digit on to the
-end of a macro parameter expansion, you can do this by enclosing the
-parameter number in braces: `%{1}' is always a valid synonym for
-`%1', and has the advantage that it can be legitimately prepended to
-a digit, as in `%{1}2', and cause no confusion with `%{12}'.
-Macro-specific labels and defines can be concatenated similarly:
-`%{%foo}bar' will succeed where `%%foobar' would cause confusion.
-(As it happens, `%%foobar' would work anyway, due to the format of
-macro-specific labels, but for clarity, `%{%foo}bar' is recommended
-if you _really_ want to do anything this perverse...)
-
-The parameter handling has a special case: it can treat a macro
-parameter specially if it's thought to contain a condition code. The
-reference `%+1' is identical to `%1' except that it will perform an
-initial sanity check to see if the parameter in question is a
-condition code; more usefully, the reference `%-1' will produce the
-_opposite_ condition code to the one specified in the parameter.
-This allows for things such as a conditional-MOV macro to be
-defined:
-
-   %macro movc 3
-     j%-1 %%skip
-     mov %2,%3
-   %%skip:
-   %endmacro
-     movc ae,ax,bx
-
-which will expand to something like
-
-     jnae ..@1234.skip
-     mov ax,bx
-   ..@1234.skip:
-
-Note that `%+1' will allow CXZ or ECXZ to be passed as condition
-codes, but `%-1' will of course be unable to invert them.
-
-Parameters can also be defaulted: you can define a macro which, for
-example, said
-
-   %macro strange 1-3 bx,3
-     < some expansion text >
-   %endmacro
-
-This macro takes between 1 and 3 parameters (inclusive); if
-parameter 2 is not specified it defaults to BX, and if parameter 3
-is not specified it defaults to 3. So the calls
-
-     strange dx,si,di
-     strange dx,si
-     strange dx
-
-would be equivalent to
-
-     strange dx,si,di
-     strange dx,si,3
-     strange dx,bx,3
-
-Defaults may be omitted, in which case they are taken to be blank.
-
-`%endm' is a valid synonym for `%endmacro'.
-
-The specification for the number of macro parameters can be suffixed
-with `.nolist' if you don't want the macro to be explicitly expanded
-in listing files:
-
-   %macro ping 1-2+.nolist
-     ; some stuff
-   %endmacro
-
-Standard Macros and `%clear'
-----------------------------
-
-NASM defines a set of standard macros, before the input file gets
-processed; these are primarily there in order to provide standard
-language features (such as structure support). However, it's
-conceivable that a user might want to write code that doesn't have
-the standard macros defined; you can achieve this by using the
-preprocessor directive `%clear' at the top of your program, which
-will undefine _everything_ that's defined by the preprocessor.
-
-In particular, NASM defines the symbols `__NASM_MAJOR__' and
-`__NASM_MINOR__' to be the major and minor version numbers of NASM.
-
-Conditional Assembly
---------------------
-
-Similarly to the C preprocessor, the commands `%ifdef' and `%endif'
-may be used to bracket a section of code, which will then only be
-assembled if at least one of the identifiers following `%ifdef' is
-defined as a single-line macro. The command `%ifndef' has opposite
-sense to `%ifdef', and `%else' can be placed between the `%if' and
-the `%endif' to work as expected. Since there is no analogue to C's
-`#if', there is no precise `elif' directive, but `%elifdef' and
-`%elifndef' work as expected.
-
-There is another family of `%if' constructs: `%ifctx', `%ifnctx',
-`%elifctx' and `%elifnctx', which operate on the context stack
-(described below).
-
-File Inclusion
---------------
-
-You can include a file using the `%include' directive. Included
-files are searched for in the current directory, and then in all
-directories specified on the command line with the `-i' option.
-(Note that the directories specified on the command line are
-directly prepended to the filename, so they must include the
-necessary trailing slash under DOS or Unix, or the equivalent on
-other systems.)
-
-This, again, works like C: `%include' is used to include a file. Of
-course it's quite likely you'd want to do the normal sort of thing
-inside the file:
-
-   %ifndef MY_MACROS_FILE
-   %define MY_MACROS_FILE
-     < go and define some macros >
-   %endif
-
-and then elsewhere
-
-   %include "my-macros-file"
-     < some code making use of the macros >
-
-so that it doesn't matter if the file accidentally gets included
-more than once.
-
-You can force an include file to be included without using a
-`%include' command, by specifying it as a pre-include file on the
-command line using the `-p' option.
-
-The Context Stack
------------------
-
-This is a feature which adds a whole extra level of power to NASM's
-macro capability. The context stack is an internal object within the
-preprocessor, which holds a stack of `contexts'. Each context has a
-name - just an identifier-type token - and can also have labels and
-`%define' macros associated with it. Other macros can manipulate the
-context stack: this is where the power comes in.
-
-To start with: the preprocessor command `%push' will create a new
-context with the given name, and push it on to the top of the stack.
-`%pop', taking no arguments, pops the top context off the stack and
-destroys it. `%repl' renames the top context without destroying any
-associated labels or macros, so it's distinct from doing `%pop'
-followed by `%push'. Finally, `%ifctx' and `%ifnctx' invoke
-conditional assembly based on the name of the top context. (The
-alternative forms `%elifctx' and `%elifnctx' are also available.)
-
-As well as the `%%foo' syntax to define labels specific to a macro
-call, there is also the syntax `%$foo' to define a label specific to
-the context currently on top of the stack. `%$$foo' can be used to
-refer to the context below that, or `%$$$foo' below that, and so on.
-
-This lot allows the definition of macro combinations that enclose
-other code, such as the following big example:
-
-   %macro if 1
-     %push if
-     j%-1 %$ifnot
-   %endmacro
-   %macro else 0
-     %ifctx if
-       %repl else
-       jmp %$ifend
-       %$ifnot:
-     %else
-       %error "expected `if' before `else'"
-     %endif
-   %endmacro
-   %macro endif 0
-     %ifctx if
-       %$ifnot:
-       %pop
-     %elifctx else
-       %$ifend:
-       %pop
-     %else
-       %error "expected `if' or `else' before `endif'"
-     %endif
-   %endmacro
-
-This will cope with a large `if/endif' construct _or_ an
-`if/else/endif', without flinching. So you can code:
-
-     cmp ax,bx
-     if ae
-       cmp bx,cx
-       if ae
-         mov ax,cx
-       else
-         mov ax,bx
-       endif
-     else
-       cmp ax,cx
-       if ae
-         mov ax,cx
-       endif
-     endif
-
-which will place the smallest out of AX, BX and CX into AX. Note the
-use of `%repl' to change the current context from `if' to `else'
-without disturbing the associated labels `%$ifend' and `%$ifnot';
-also note that the stack mechanism allows handling of nested IF
-statements without a hitch, and that conditional assembly is used in
-the `endif' macro in order to cope with the two possible forms with
-and without an `else'. Note also the directive `%error', which
-allows the user to report errors on improper invocation of a macro
-and so can catch unmatched `endif's at preprocess time.
-
-Output Formats
-==============
-
-The current output formats supported are `bin', `aout', `coff',
-`elf', `as86', `obj', `os2', `win32', `rdf', and the debug
-pseudo-format `dbg'.
-
-`bin': flat-form binary
------------------------
-
-This is at present the only output format that generates instantly
-runnable code: all the others produce object files that need linking
-before they become executable.
-
-`bin' output files contain no red tape at all: they simply contain
-the binary representation of the exact code you wrote.
-
-The `bin' format supports a format-specific directive, which is ORG.
-`ORG addr' declares that your code should be assembled as if it were
-to be loaded into memory at the address `addr'. So a DOS .COM file
-should state `ORG 0x100', and a DOS .SYS file should state `ORG 0'.
-There should be _one_ ORG directive, at most, in an assembly file:
-NASM does not support the use of ORG to jump around inside an object
-file, like MASM does. (See the `Bugs' section for a demonstration of
-the use of MASM's form of ORG to do something that NASM's won't do.)
-
-Like almost all formats (but not `obj' or `os2'), the `bin' format
-defines the section names `.text', `.data' and `.bss'. The layout is
-that `.text' comes first in the output file, followed by `.data',
-and notionally followed by `.bss'. So if you declare a BSS section
-in a flat binary file, references to the BSS section will refer to
-space past the end of the actual file. The `.data' and `.bss'
-sections are considered to be aligned on four-byte boundaries: this
-is achieved by inserting padding zero bytes between the end of the
-text section and the start of the data, if there is data present. Of
-course if no SECTION directives are present, everything will go into
-`.text', and you will get nothing in the output except the code you
-wrote.
-
-`bin' silently ignores GLOBAL directives, and will also not complain
-at EXTERN ones. You only get an error if you actually _reference_ an
-external symbol.
-
-Using the `bin' format, the default output filename is `filename'
-for inputs of `filename.asm'. If there is no extension to be
-removed, output will be placed in `nasm.out' and a warning will be
-generated.
-
-`bin' defaults to 16-bit assembly mode.
-
-`aout' and `elf': Linux object files
-------------------------------------
-
-These two object formats are the ones used under Linux. They have no
-format-specific directives, and their default output filename is
-`filename.o'.
-
-`aout' defines the three standard sections `.text', `.data' and
-`.bss'. `elf' also defines these three, but in addition it can
-support user-defined section names, which can be declared along with
-section attributes like this:
-
-	section foo align=32 exec
-	section bar write nobits
-
-The available options are:
-
-- A section can be `progbits' (the default) or `nobits'. `nobits'
-  sections are BSS: their contents are not stored in the object
-  file, and the only thing you can sensibly do in one is RESB.
-  `progbits' are normal sections.
-
-- A section can be `exec' (indicating that it contains executable
-  code), or `noexec' (the default).
-
-- A section can be `write' (indicating that it should be writable
-  when linked), or `nowrite' (the default).
-
-- A section can be `alloc' (indicating that its contents should be
-  loaded into program VM at load time; the default) or `noalloc'
-  (for storing comments and things that don't form part of the
-  loaded program).
-
-- You can specify a power of two for the section alignment by
-  writing `align=64' or similar.
-
-The attributes of the default sections `.text', `.data' and `.bss'
-can also be redefined from their defaults. The NASM defaults are:
-
-section .text align=16 alloc exec nowrite progbits
-section .data align=4 alloc write noexec progbits
-section .bss align=4 alloc write noexec nobits
-
-ELF is a much more featureful object-file format than a.out: in
-particular it has enough features to support the writing of position
-independent code by means of a global offset table, and position
-independent shared libraries by means of a procedure linkage table.
-Unfortunately NASM, as yet, does not support these extensions, and
-so NASM cannot be used to write shared library code under ELF. NASM
-also does not support the capability, in ELF, for specifying precise
-alignment constraints on common variables.
-
-Both `aout' and `elf' default to 32-bit assembly mode.
-
-`coff' and `win32': Common Object File Format
----------------------------------------------
-
-The `coff' format generates standard Unix COFF object files, which
-can be fed to (for example) the DJGPP linker. Its default output
-filename, like the other Unix formats, is `filename.o'.
-
-The `win32' format generates Microsoft Win32 (Windows 95 or
-Intel-platform Windows NT) object files, which nominally use the
-COFF standard, but in fact are not compatible. Its default output
-filename is `filename.obj'.
-
-`coff' and `win32' are not quite compatible formats, due to the fact
-that Microsoft's interpretation of the term `relative relocation'
-does not seem to be the same as the interpretation used by anyone
-else. It is therefore more correct to state that Win32 uses a
-_variant_ of COFF. The object files will not therefore produce
-correct output when fed to each other's linkers. (I've tried it!)
-
-In addition to this subtle incompatibility, Win32 also defines
-extensions to basic COFF, such as a mechanism for importing symbols
-from dynamic-link libraries at load time. NASM may eventually
-support this extension in the form of a format-specific directive.
-However, as yet, it does not. Neither the `coff' nor the `win32'
-output format has any format-specific directives.
-
-The Microsoft linker also has a small blind spot: it cannot
-correctly relocate a relative CALL or JMP to an absolute address.
-Hence all PC-relative CALLs or JMPs, when using the `win32' format,
-must have targets which are relative to sections, or to external
-symbols. You can't do
-	call 0x123456
-_even_ if you happen to know that there is executable code at that
-address. The linker simply won't get the reference right; so in the
-interests of not generating incorrect code, NASM will not allow this
-form of reference to be written to a Win32 object file. (Standard
-COFF, or at least the DJGPP linker, seems to be able to cope with
-this contingency. Although that may be due to the executable having
-a zero load address...)
-
-Note also that Borland Win32 compilers reportedly do not use this
-object file format: while Borland linkers will output Win32-COFF
-type executables, their object format is the same as the old DOS OBJ
-format. So if you are using a Borland compiler, don't use the
-`win32' object format, just use `obj' and declare all your segments
-as `USE32'.
-
-Both `coff' and `win32' support, in addition to the three standard
-section names `.text', `.data' and `.bss', the ability to define
-your own sections. Currently (this may change in the future) you can
-provide the options `text' (or `code'), `data' or `bss' to determine
-the type of section. Win32 also allows `info', which is an
-informational section type used by Microsoft C compilers to store
-linker directives. So you can do:
-
-    section .mysect code    ; defines an extra code section
-
-or maybe, in Win32,
-
-    section .drectve info   ; defines an MS-compatible directive section
-    db '-defaultlib:LIBC -defaultlib:OLDNAMES '
-
-to pass directives to the MS linker.
-
-Both `coff' and `win32' default to 32-bit assembly mode.
-
-`obj' and `os2': Microsoft 16-bit Object Module Format
-------------------------------------------------------
-
-The `obj' format generates 16-bit Microsoft object files, suitable
-for feeding to 16-bit versions of Microsoft C, and probably
-TLINK as well (although that hasn't been tested). The Use32
-extensions are supported.
-
-`obj' defines no special segment names: you can call segments what
-you like. Unlike the other formats, too, segment names are actually
-defined as symbols, so you can write
-
-	segment CODE
-	mov ax,CODE
-
-and get the _segment_ address of the segment, suitable for loading
-into a segment register.
-
-Segments can be declared with attributes:
-
-	SEGMENT CODE PRIVATE ALIGN=16 CLASS=CODE OVERLAY=OVL2 USE16
-
-You can specify segments to be PRIVATE, PUBLIC, COMMON or STACK;
-their alignment may be any power of two from 1 to 256 (although only
-1, 2, 4, 16 and 256 are really supported, so anything else gets
-rounded up to the next highest one of those); their class and
-overlay names may be specified. You may also specify segments to be
-USE16 or USE32. The defaults are PUBLIC, ALIGN=1, no class, no
-overlay, USE16.
-
-You can also specify that a segment is _absolute_ at a certain
-segment address:
-
-	SEGMENT SCREEN ABSOLUTE=0xB800
-
-The ABSOLUTE and ALIGN keywords are mutually exclusive.
-
-The format-specific directive GROUP allows segment grouping: `GROUP
-DGROUP DATA BSS' defines the group DGROUP to contain segments DATA
-and BSS.
-
-Segments are defined as part of their group by default: if variable
-`var' is declared in segment `data', which is part of group
-`dgroup', then the expression `SEG var' is equivalent to the
-expression `dgroup', and the expression `var' evaluates to the
-offset of the variable `var' relative to the beginning of the group
-`dgroup'. You must use the expression `var WRT data' to get the
-offset of the variable `var' relative to the beginning of its
-_segment_.
-
-NASM allows a segment to be part of more than one group (like A86,
-and unlike TASM), but will generate a warning (unlike A86!).
-References to the symbols in that segment will be resolved relative
-to the _first_ group it is defined in.
-
-The directive `UPPERCASE' causes all symbol, segment and group names
-output to the object file to be uppercased. The actual _assembly_ is
-still case sensitive.
-
-To avoid getting tangled up in NASM's local label mechanism, segment
-and group names have leading periods stripped when they are defined.
-Thus, the directive `SEGMENT .text' will define a segment called
-`text', which will clash with any other symbol called `text', and
-you will _not_ be able to reference the segment base as `.text', but
-only as `text'.
-
-Common variables in OBJ files can be `near' or `far': currently,
-NASM has a horribly grotty way to support that, which is that if you
-specify the common variable's size as negative, it will be near, and
-otherwise it will be far. The support isn't perfect: if you declare
-a far common variable both in a NASM assembly module and in a C
-program, you may well find the linker reports "mismatch in
-array-size" or some such. The reason for this is that far common
-variables are defined by means of _two_ size constants, which are
-multiplied to give the real size. Apparently the Microsoft linker
-(at least) likes both constants, not merely their product, to match
-up. This may be fixed in a future release.
-
-If the module you're writing is intended to contain the program
-entry point, you can declare this by defining the special label
-`..start' at the start point, either as a label or by EQU (although
-of course the normal caveats about EQU dependency still apply).
-
-`obj' has an unusual handling of assembly modes: instead of having a
-global default for the whole file, there is a separate default for
-each segment. Thus, each SEGMENT directive carries an implicit BITS
-directive with it, which switches to 16-bit or 32-bit mode depending
-on whether the segment is a Use16 or Use32 segment. If you want to
-place 32-bit code in a Use16 segment, you can use an explicit `BITS
-32' override, but if you switch temporarily away from that segment,
-you will have to repeat the override after coming back to it.
-
-If you're trying to build a .COM application by linking several .OBJ
-files together, you need to put `resb 0x100' at the front of the
-code segment in the first object file, since otherwise the linker
-will get the linking wrong.
-
-OS/2 uses an almost exactly similar file format to DOS, with a
-couple of differences, principally that OS/2 defines a pseudo-group
-called FLAT, containing no segments, and every relocation is made
-relative to that (so it would be equivalent to writing `label WRT
-FLAT' in place of `label' _throughout_ your code). Since this would
-be inconvenient to write code for, NASM implements the `os2' variant
-on `obj', which provides this FLAT group itself and automatically
-makes the default relocation format relative to FLAT.
-
-NOTE TO OS/2 USERS: The OS/2 output format is new in NASM version
-0.95. It hasn't been tested on any actual OS/2 systems, and I don't
-know for sure that it'll work properly. Any OS/2 users are
-encouraged to give it a thorough testing and report the results to
-me. Thanks!
-
-`as86': Linux as86 (bin86-0.3)
-------------------------------
-
-This output format attempts to replicate the format used to pass
-data between the Linux x86 assembler and linker, as86 and ld86. Its
-default file name, yet again, is `filename.o'. Its default
-segment-size attribute is 16 bits.
-
-`rdf': Relocatable Dynamic Object File Format
----------------------------------------------
-
-RDOFF was designed initially to test the object-file production
-interface to NASM. It soon became apparent that it could be enhanced
-for use in serious applications due to its simplicity; code to load
-and execute an RDOFF object module is very simple. It also contains
-enhancements to allow it to be linked with a dynamic link library at
-either run- or load- time, depending on how complex you wish to make
-your loader.
-
-The `rdoff' directory in the NASM distribution archive contains
-source for an RDF linker and loader to run under Linux.
-
-`rdf' has a default segment-size attribute of 32 bits.
-
-Debugging format: `dbg'
------------------------
-
-This output format is not built into NASM by default: it's for
-debugging purposes. It produces a debug dump of everything that the
-NASM assembly module feeds to the output driver, for the benefit of
-people trying to write their own output drivers.
-
-Common Problems
-===============
-
-A few problems that people repeatedly ask me about are documented
-here.
-
-NASM's design philosophy of generating exactly the code the
-programmer asks for, without second-guessing or re-interpreting, has
-been known to cause confusion in a couple of areas.
-
-Firstly, several people have complained that instructions such as
-`add esp,4' are assembled in a form that allocates a full four-byte
-offset field to store the `4' in, even though the instruction has a
-shorter form with a single-byte offset field which would work in
-this case. The answer is that NASM by design doesn't try to guess
-which one of these forms you want: if you want one, you code one,
-and if you want the other, you code the other. The other form is
-`add esp, byte 4'.
-
-Secondly, and similarly, I've had repeated questions about
-conditional jumps. The simple `jne label', in NASM, translates
-directly to the old 8086 form of the conditional jump, in which the
-offset can be up to 128 bytes (or thereabouts) in either direction.
-NASM won't automatically generate `je $+3 / jmp label' for labels
-that are further away, and neither will it generate the 386 long-
-offset form of the instruction. If you want the 386-specific
-conditional jump that's capable of reaching anywhere in the same
-segment as the jump instruction, you want `jne near label'. If you
-want an 8086-compatible `je' over another `jmp', code one
-explicitly, or define a macro to do so. NASM doesn't do either of
-these things for you, again by design.
-
-Bugs
-====
-
-Apart from the missing features (correct OBJ COMMON support, ELF
-alignment, ELF PIC support, etc.), there are no _known_ bugs.
-However, any you find, with patches if possible, should be sent to
-<jules@earthcorp.com> or <anakin@pobox.com>, and we'll try to fix
-them.
-
-Beware of Pentium-specific instructions: Intel have provided a macro
-file for MASM, to implement the eight or nine new Pentium opcodes as
-MASM macros. NASM does not generate the same code for the CMPXCHG8B
-instruction as these macros do: this is due to a bug in the _macro_,
-not in NASM. The macro works by generating an SIDT instruction (if I
-remember rightly), which has almost exactly the right form, then
-using ORG to back up a bit and do a DB over the top of one of the
-opcode bytes. The trouble is that Intel overlooked (or MASM syntax
-didn't let them allow for) the possibility that the SIDT instruction
-may contain an 0x66 or 0x67 operand or address size prefix. If this
-happens, the ORG will back up by the wrong amount, and the macro
-will generate incorrect code. NASM gets it right. This, also, is not
-a bug in NASM, so please don't report it as one. (Also please note
-that the ORG directive in NASM doesn't work this way, and so you
-can't do equivalent tricks with it...)
-
-That's All Folks!
-=================
-
-Enjoy using NASM! Please feel free to send me comments, or
-constructive criticism, or bug fixes, or requests, or general chat.
-
-Contributions are also welcome: if anyone knows anything about any
-other object file formats I should support, please feel free to send
-me documentation and some short example files (in my experience,
-documentation is useless without at _least_ one example), or even to
-write me an output module. OS/2 object files, in particular, spring
-to mind. I don't have OS/2, though.
-
-Please keep flames to a minimum: I have had some very angry e-mails
-in the past, condemning me for writing a useless assembler, that
-output in no useful format (at the time, that was true), generated
-incorrect code (several typos in the instruction table, since fixed)
-and took up too much memory and disk space (the price you pay for
-total portability, it seems). All these were criticisms I was happy
-to hear, but I didn't appreciate the flames that went with them.
-NASM _is_ still a prototype, and you use it at your own risk. I
-_think_ it works, and if it doesn't then I want to know about it,
-but I don't guarantee anything. So don't flame me, please. Blame,
-but don't flame.
-
-- Simon Tatham <anakin@pobox.com>, 21-Nov-96
diff --git a/nasm.h b/nasm.h
index dfee14a9..9f1fbc95 100644
--- a/nasm.h
+++ b/nasm.h
@@ -12,8 +12,8 @@
 #define NASM_NASM_H
 
 #define NASM_MAJOR_VER 0
-#define NASM_MINOR_VER 95
-#define NASM_VER "0.95"
+#define NASM_MINOR_VER 96
+#define NASM_VER "0.96"
 
 #ifndef NULL
 #define NULL 0
@@ -34,6 +34,15 @@
 #endif
 
 /*
+ * Name pollution problems: <time.h> on Digital UNIX pulls in some
+ * strange hardware header file which sees fit to define R_SP. We
+ * undefine it here so as not to break the enum below.
+ */
+#ifdef R_SP
+#undef R_SP
+#endif
+
+/*
  * We must declare the existence of this structure type up here,
  * since we have to reference it before we define it...
  */
@@ -66,15 +75,18 @@ typedef void (*efunc) (int severity, char *fmt, ...);
 #define ERR_OFFBY1 0x40		       /* report error as being on the line 
 					* we're just _about_ to read, not
 					* the one we've just read */
+#define ERR_PASS1 0x80		       /* only print this error on pass one */
+
 /*
  * These codes define specific types of suppressible warning.
  */
 #define ERR_WARN_MNP  0x0100	       /* macro-num-parameters warning */
 #define ERR_WARN_OL   0x0200	       /* orphan label (no colon, and
 					* alone on line) */
+#define ERR_WARN_NOV  0x0300	       /* numeric overflow */
 #define ERR_WARN_MASK 0xFF00	       /* the mask for this feature */
 #define ERR_WARN_SHR  8		       /* how far to shift right */
-#define ERR_WARN_MAX  2		       /* the highest numbered one */
+#define ERR_WARN_MAX  3		       /* the highest numbered one */
 
 /*
  * -----------------------
@@ -88,10 +100,14 @@ typedef void (*efunc) (int severity, char *fmt, ...);
 typedef int (*lfunc) (char *label, long *segment, long *offset);
 
 /*
- * And a label-definition function like this.
+ * And a label-definition function like this. The boolean parameter
+ * `is_norm' states whether the label is a `normal' label (which
+ * should affect the local-label system), or something odder like
+ * an EQU or a segment-base symbol, which shouldn't.
  */
-typedef void (*ldfunc) (char *label, long segment, long offset,
-			struct ofmt *ofmt, efunc error);
+typedef void (*ldfunc) (char *label, long segment, long offset, char *special,
+			int is_norm, int isextrn, struct ofmt *ofmt,
+			efunc error);
 
 /*
  * List-file generators should look like this:
@@ -153,14 +169,127 @@ typedef struct {
 } ListGen;
 
 /*
+ * The expression evaluator must be passed a scanner function; a
+ * standard scanner is provided as part of nasmlib.c. The
+ * preprocessor will use a different one. Scanners, and the
+ * token-value structures they return, look like this.
+ *
+ * The return value from the scanner is always a copy of the
+ * `t_type' field in the structure.
+ */
+struct tokenval {
+    int t_type;
+    long t_integer, t_inttwo;
+    char *t_charptr;
+};
+typedef int (*scanner) (void *private_data, struct tokenval *tv);
+
+/*
+ * Token types returned by the scanner, in addition to ordinary
+ * ASCII character values, and zero for end-of-string.
+ */
+enum {				       /* token types, other than chars */
+    TOKEN_INVALID = -1,		       /* a placeholder value */
+    TOKEN_EOS = 0,		       /* end of string */
+    TOKEN_EQ = '=', TOKEN_GT = '>', TOKEN_LT = '<',   /* aliases */
+    TOKEN_ID = 256, TOKEN_NUM, TOKEN_REG, TOKEN_INSN,  /* major token types */
+    TOKEN_ERRNUM,		       /* numeric constant with error in */
+    TOKEN_HERE, TOKEN_BASE,	       /* $ and $$ */
+    TOKEN_SPECIAL,		       /* BYTE, WORD, DWORD, FAR, NEAR, etc */
+    TOKEN_PREFIX,		       /* A32, O16, LOCK, REPNZ, TIMES, etc */
+    TOKEN_SHL, TOKEN_SHR,	       /* << and >> */
+    TOKEN_SDIV, TOKEN_SMOD,	       /* // and %% */
+    TOKEN_GE, TOKEN_LE, TOKEN_NE,      /* >=, <= and <> (!= is same as <>) */
+    TOKEN_DBL_AND, TOKEN_DBL_OR, TOKEN_DBL_XOR,   /* &&, || and ^^ */
+    TOKEN_SEG, TOKEN_WRT,	       /* SEG and WRT */
+    TOKEN_FLOAT			       /* floating-point constant */
+};
+
+/*
+ * Expression-evaluator datatype. Expressions, within the
+ * evaluator, are stored as an array of these beasts, terminated by
+ * a record with type==0. Mostly, it's a vector type: each type
+ * denotes some kind of a component, and the value denotes the
+ * multiple of that component present in the expression. The
+ * exception is the WRT type, whose `value' field denotes the
+ * segment to which the expression is relative. These segments will
+ * be segment-base types, i.e. either odd segment values or SEG_ABS
+ * types. So it is still valid to assume that anything with a
+ * `value' field of zero is insignificant.
+ */
+typedef struct {
+    long type;			       /* a register, or EXPR_xxx */
+    long value;			       /* must be >= 32 bits */
+} expr;
+
+/*
+ * The evaluator can also return hints about which of two registers
+ * used in an expression should be the base register. See also the
+ * `operand' structure.
+ */
+struct eval_hints {
+    int base;
+    int type;
+};
+
+/*
+ * The actual expression evaluator function looks like this. When
+ * called, it expects the first token of its expression to already
+ * be in `*tv'; if it is not, set tv->t_type to TOKEN_INVALID and
+ * it will start by calling the scanner.
+ *
+ * If a forward reference happens during evaluation, the evaluator
+ * must set `*fwref' to TRUE if `fwref' is non-NULL.
+ *
+ * `critical' is non-zero if the expression may not contain forward
+ * references. The evaluator will report its own error if this
+ * occurs; if `critical' is 1, the error will be "symbol not
+ * defined before use", whereas if `critical' is 2, the error will
+ * be "symbol undefined".
+ *
+ * If `critical' has bit 4 set (in addition to its main value: 0x11
+ * and 0x12 correspond to 1 and 2) then an extended expression
+ * syntax is recognised, in which relational operators such as =, <
+ * and >= are accepted, as well as low-precedence logical operators
+ * &&, ^^ and ||.
+ *
+ * If `hints' is non-NULL, it gets filled in with some hints as to
+ * the base register in complex effective addresses.
+ */
+typedef expr *(*evalfunc) (scanner sc, void *scprivate, struct tokenval *tv,
+			   int *fwref, int critical, efunc error,
+			   struct eval_hints *hints);
+
+/*
+ * There's also an auxiliary routine through which the evaluator
+ * needs to hear about the value of $ and the label (if any)
+ * defined on the current line.
+ */
+typedef void (*evalinfofunc) (char *labelname, long segment, long offset);
+
+/*
+ * Special values for expr->type. ASSUMPTION MADE HERE: the number
+ * of distinct register names (i.e. possible "type" fields for an
+ * expr structure) does not exceed 124 (EXPR_REG_START through
+ * EXPR_REG_END).
+ */
+#define EXPR_REG_START 1
+#define EXPR_REG_END 124
+#define EXPR_UNKNOWN 125L	       /* for forward references */
+#define EXPR_SIMPLE 126L
+#define EXPR_WRT 127L
+#define EXPR_SEGBASE 128L
+
+/*
  * Preprocessors ought to look like this:
  */
 typedef struct {
     /*
-     * Called at the start of a pass; given a file name, an error
-     * reporting function and a listing generator to talk to.
+     * Called at the start of a pass; given a file name, the number
+     * of the pass, an error reporting function, an evaluator
+     * function, and a listing generator to talk to.
      */
-    void (*reset) (char *, efunc, ListGen *);
+    void (*reset) (char *, int, efunc, evalfunc, ListGen *);
 
     /*
      * Called to fetch a line of preprocessed source. The line
@@ -252,9 +381,9 @@ enum {
 #define REG8      0x00201001L
 #define REG16     0x00201002L
 #define REG32     0x00201004L
+#define MMXREG    0x00201008L	       /* MMX registers */
 #define FPUREG    0x01000000L	       /* floating point stack registers */
 #define FPU0      0x01000800L	       /* FPU stack register zero */
-#define MMXREG    0x00001008L	       /* MMX registers */
 
 /* special register operands: these may be treated differently */
 #define REG_SMASK 0x00070000L	       /* a mask for the following */
@@ -290,13 +419,13 @@ enum {
  */
 
 enum {				       /* register names */
-    R_AH = 1, R_AL, R_AX, R_BH, R_BL, R_BP, R_BX, R_CH, R_CL, R_CR0,
-    R_CR2, R_CR3, R_CR4, R_CS, R_CX, R_DH, R_DI, R_DL, R_DR0, R_DR1,
-    R_DR2, R_DR3, R_DR6, R_DR7, R_DS, R_DX, R_EAX, R_EBP, R_EBX,
-    R_ECX, R_EDI, R_EDX, R_ES, R_ESI, R_ESP, R_FS, R_GS, R_MM0,
-    R_MM1, R_MM2, R_MM3, R_MM4, R_MM5, R_MM6, R_MM7, R_SI, R_SP,
-    R_SS, R_ST0, R_ST1, R_ST2, R_ST3, R_ST4, R_ST5, R_ST6, R_ST7,
-    R_TR3, R_TR4, R_TR5, R_TR6, R_TR7, REG_ENUM_LIMIT
+    R_AH = EXPR_REG_START, R_AL, R_AX, R_BH, R_BL, R_BP, R_BX, R_CH,
+    R_CL, R_CR0, R_CR2, R_CR3, R_CR4, R_CS, R_CX, R_DH, R_DI, R_DL,
+    R_DR0, R_DR1, R_DR2, R_DR3, R_DR6, R_DR7, R_DS, R_DX, R_EAX,
+    R_EBP, R_EBX, R_ECX, R_EDI, R_EDX, R_ES, R_ESI, R_ESP, R_FS,
+    R_GS, R_MM0, R_MM1, R_MM2, R_MM3, R_MM4, R_MM5, R_MM6, R_MM7,
+    R_SI, R_SP, R_SS, R_ST0, R_ST1, R_ST2, R_ST3, R_ST4, R_ST5,
+    R_ST6, R_ST7, R_TR3, R_TR4, R_TR5, R_TR6, R_TR7, REG_ENUM_LIMIT
 };
 
 enum {				       /* instruction names */
@@ -314,38 +443,41 @@ enum {				       /* instruction names */
     I_FIDIVR, I_FILD, I_FIMUL, I_FINCSTP, I_FINIT, I_FIST, I_FISTP,
     I_FISUB, I_FISUBR, I_FLD, I_FLD1, I_FLDCW, I_FLDENV, I_FLDL2E,
     I_FLDL2T, I_FLDLG2, I_FLDLN2, I_FLDPI, I_FLDZ, I_FMUL, I_FMULP,
-    I_FNOP, I_FPATAN, I_FPREM, I_FPREM1, I_FPTAN, I_FRNDINT,
-    I_FRSTOR, I_FSAVE, I_FSCALE, I_FSETPM, I_FSIN, I_FSINCOS,
-    I_FSQRT, I_FST, I_FSTCW, I_FSTENV, I_FSTP, I_FSTSW, I_FSUB,
-    I_FSUBP, I_FSUBR, I_FSUBRP, I_FTST, I_FUCOM, I_FUCOMI,
-    I_FUCOMIP, I_FUCOMP, I_FUCOMPP, I_FXAM, I_FXCH, I_FXTRACT,
-    I_FYL2X, I_FYL2XP1, I_HLT, I_IBTS, I_ICEBP, I_IDIV, I_IMUL,
-    I_IN, I_INC, I_INCBIN, I_INSB, I_INSD, I_INSW, I_INT, I_INT1,
-    I_INT01, I_INT3, I_INTO, I_INVD, I_INVLPG, I_IRET, I_IRETD,
-    I_IRETW, I_JCXZ, I_JECXZ, I_JMP, I_LAHF, I_LAR, I_LDS, I_LEA,
-    I_LEAVE, I_LES, I_LFS, I_LGDT, I_LGS, I_LIDT, I_LLDT, I_LMSW,
-    I_LOADALL, I_LOADALL286, I_LODSB, I_LODSD, I_LODSW, I_LOOP,
-    I_LOOPE, I_LOOPNE, I_LOOPNZ, I_LOOPZ, I_LSL, I_LSS, I_LTR,
-    I_MOV, I_MOVD, I_MOVQ, I_MOVSB, I_MOVSD, I_MOVSW, I_MOVSX,
-    I_MOVZX, I_MUL, I_NEG, I_NOP, I_NOT, I_OR, I_OUT, I_OUTSB,
-    I_OUTSD, I_OUTSW, I_PACKSSDW, I_PACKSSWB, I_PACKUSWB, I_PADDB,
-    I_PADDD, I_PADDSB, I_PADDSW, I_PADDUSB, I_PADDUSW, I_PADDW,
-    I_PAND, I_PANDN, I_PCMPEQB, I_PCMPEQD, I_PCMPEQW, I_PCMPGTB,
-    I_PCMPGTD, I_PCMPGTW, I_PMADDWD, I_PMULHW, I_PMULLW, I_POP,
-    I_POPA, I_POPAD, I_POPAW, I_POPF, I_POPFD, I_POPFW, I_POR,
-    I_PSLLD, I_PSLLQ, I_PSLLW, I_PSRAD, I_PSRAW, I_PSRLD, I_PSRLQ,
-    I_PSRLW, I_PSUBB, I_PSUBD, I_PSUBSB, I_PSUBSW, I_PSUBUSB,
-    I_PSUBUSW, I_PSUBW, I_PUNPCKHBW, I_PUNPCKHDQ, I_PUNPCKHWD,
-    I_PUNPCKLBW, I_PUNPCKLDQ, I_PUNPCKLWD, I_PUSH, I_PUSHA,
-    I_PUSHAD, I_PUSHAW, I_PUSHF, I_PUSHFD, I_PUSHFW, I_PXOR, I_RCL,
-    I_RCR, I_RDMSR, I_RDPMC, I_RDTSC, I_RESB, I_RESD, I_RESQ,
-    I_REST, I_RESW, I_RET, I_RETF, I_RETN, I_ROL, I_ROR, I_RSM,
-    I_SAHF, I_SAL, I_SALC, I_SAR, I_SBB, I_SCASB, I_SCASD, I_SCASW,
-    I_SGDT, I_SHL, I_SHLD, I_SHR, I_SHRD, I_SIDT, I_SLDT, I_SMI,
-    I_SMSW, I_STC, I_STD, I_STI, I_STOSB, I_STOSD, I_STOSW, I_STR,
-    I_SUB, I_TEST, I_UMOV, I_VERR, I_VERW, I_WAIT, I_WBINVD,
-    I_WRMSR, I_XADD, I_XBTS, I_XCHG, I_XLATB, I_XOR, I_CMOVcc,
-    I_Jcc, I_SETcc
+    I_FNCLEX, I_FNDISI, I_FNENI, I_FNINIT, I_FNOP, I_FNSAVE,
+    I_FNSTCW, I_FNSTENV, I_FNSTSW, I_FPATAN, I_FPREM, I_FPREM1,
+    I_FPTAN, I_FRNDINT, I_FRSTOR, I_FSAVE, I_FSCALE, I_FSETPM,
+    I_FSIN, I_FSINCOS, I_FSQRT, I_FST, I_FSTCW, I_FSTENV, I_FSTP,
+    I_FSTSW, I_FSUB, I_FSUBP, I_FSUBR, I_FSUBRP, I_FTST, I_FUCOM,
+    I_FUCOMI, I_FUCOMIP, I_FUCOMP, I_FUCOMPP, I_FXAM, I_FXCH,
+    I_FXTRACT, I_FYL2X, I_FYL2XP1, I_HLT, I_IBTS, I_ICEBP, I_IDIV,
+    I_IMUL, I_IN, I_INC, I_INCBIN, I_INSB, I_INSD, I_INSW, I_INT,
+    I_INT1, I_INT01, I_INT3, I_INTO, I_INVD, I_INVLPG, I_IRET,
+    I_IRETD, I_IRETW, I_JCXZ, I_JECXZ, I_JMP, I_LAHF, I_LAR, I_LDS,
+    I_LEA, I_LEAVE, I_LES, I_LFS, I_LGDT, I_LGS, I_LIDT, I_LLDT,
+    I_LMSW, I_LOADALL, I_LOADALL286, I_LODSB, I_LODSD, I_LODSW,
+    I_LOOP, I_LOOPE, I_LOOPNE, I_LOOPNZ, I_LOOPZ, I_LSL, I_LSS,
+    I_LTR, I_MOV, I_MOVD, I_MOVQ, I_MOVSB, I_MOVSD, I_MOVSW,
+    I_MOVSX, I_MOVZX, I_MUL, I_NEG, I_NOP, I_NOT, I_OR, I_OUT,
+    I_OUTSB, I_OUTSD, I_OUTSW, I_PACKSSDW, I_PACKSSWB, I_PACKUSWB,
+    I_PADDB, I_PADDD, I_PADDSB, I_PADDSIW, I_PADDSW, I_PADDUSB,
+    I_PADDUSW, I_PADDW, I_PAND, I_PANDN, I_PAVEB, I_PCMPEQB,
+    I_PCMPEQD, I_PCMPEQW, I_PCMPGTB, I_PCMPGTD, I_PCMPGTW,
+    I_PDISTIB, I_PMACHRIW, I_PMADDWD, I_PMAGW, I_PMULHRW,
+    I_PMULHRIW, I_PMULHW, I_PMULLW, I_PMVGEZB, I_PMVLZB, I_PMVNZB,
+    I_PMVZB, I_POP, I_POPA, I_POPAD, I_POPAW, I_POPF, I_POPFD,
+    I_POPFW, I_POR, I_PSLLD, I_PSLLQ, I_PSLLW, I_PSRAD, I_PSRAW,
+    I_PSRLD, I_PSRLQ, I_PSRLW, I_PSUBB, I_PSUBD, I_PSUBSB,
+    I_PSUBSIW, I_PSUBSW, I_PSUBUSB, I_PSUBUSW, I_PSUBW, I_PUNPCKHBW,
+    I_PUNPCKHDQ, I_PUNPCKHWD, I_PUNPCKLBW, I_PUNPCKLDQ, I_PUNPCKLWD,
+    I_PUSH, I_PUSHA, I_PUSHAD, I_PUSHAW, I_PUSHF, I_PUSHFD,
+    I_PUSHFW, I_PXOR, I_RCL, I_RCR, I_RDMSR, I_RDPMC, I_RDTSC,
+    I_RESB, I_RESD, I_RESQ, I_REST, I_RESW, I_RET, I_RETF, I_RETN,
+    I_ROL, I_ROR, I_RSM, I_SAHF, I_SAL, I_SALC, I_SAR, I_SBB,
+    I_SCASB, I_SCASD, I_SCASW, I_SGDT, I_SHL, I_SHLD, I_SHR, I_SHRD,
+    I_SIDT, I_SLDT, I_SMI, I_SMSW, I_STC, I_STD, I_STI, I_STOSB,
+    I_STOSD, I_STOSW, I_STR, I_SUB, I_TEST, I_UMOV, I_VERR, I_VERW,
+    I_WAIT, I_WBINVD, I_WRMSR, I_XADD, I_XBTS, I_XCHG, I_XLATB,
+    I_XOR, I_CMOVcc, I_Jcc, I_SETcc
 };
 
 enum {				       /* condition code names */
@@ -369,13 +501,27 @@ enum {				       /* extended operand types */
     EOT_NOTHING, EOT_DB_STRING, EOT_DB_NUMBER
 };
 
+enum {				       /* special EA flags */
+    EAF_BYTEOFFS = 1,		       /* force offset part to byte size */
+    EAF_WORDOFFS = 2,		       /* force offset part to [d]word size */
+    EAF_TIMESTWO = 4		       /* really do EAX*2 not EAX+EAX */
+};
+
+enum {				       /* values for `hinttype' */
+    EAH_NOHINT = 0,		       /* no hint at all - our discretion */
+    EAH_MAKEBASE = 1,		       /* try to make given reg the base */
+    EAH_NOTBASE = 2		       /* try _not_ to make reg the base */
+};
+
 typedef struct {		       /* operand to an instruction */
     long type;			       /* type of operand */
     int addr_size;		       /* 0 means default; 16; 32 */
     int basereg, indexreg, scale;      /* registers and scale involved */
+    int hintbase, hinttype;	       /* hint as to real base register */
     long segment;		       /* immediate segment, if needed */
     long offset;		       /* any immediate number */
     long wrt;			       /* segment base it's relative to */
+    int eaflags;		       /* special EA flags */
 } operand;
 
 typedef struct extop {		       /* extended operand */
@@ -423,13 +569,22 @@ struct ofmt {
     char *shortname;
 
     /*
+     * This, if non-NULL, is a NULL-terminated list of `char *'s
+     * pointing to extra standard macros supplied by the object
+     * format (e.g. a sensible initial default value of __SECT__,
+     * and user-level equivalents for any format-specific
+     * directives).
+     */
+    char **stdmac;
+
+    /*
      * This procedure is called at the start of an output session.
      * It tells the output format what file it will be writing to,
      * what routine to report errors through, and how to interface
-     * to the label manager if necessary. It also gives it a chance
-     * to do other initialisation.
+     * to the label manager and expression evaluator if necessary.
+     * It also gives it a chance to do other initialisation.
      */
-    void (*init) (FILE *fp, efunc error, ldfunc ldef);
+    void (*init) (FILE *fp, efunc error, ldfunc ldef, evalfunc eval);
 
     /*
      * This procedure is called by assemble() to write actual
@@ -465,8 +620,14 @@ struct ofmt {
      * re-entrancy is guaranteed in the label manager. However, the
      * label manager will in turn call this routine, so it should
      * be prepared to be re-entrant itself.
+     *
+     * The `special' parameter contains special information passed
+     * through from the command that defined the label: it may have
+     * been an EXTERN, a COMMON or a GLOBAL. The distinction should
+     * be obvious to the output format from the other parameters.
      */
-    void (*symdef) (char *name, long segment, long offset, int is_global);
+    void (*symdef) (char *name, long segment, long offset, int is_global,
+		    char *special);
 
     /*
      * This procedure is called when the source code requests a
@@ -492,6 +653,11 @@ struct ofmt {
      * required to produce in return a segment value which may be
      * different. It can map segment bases to absolute numbers by
      * means of returning SEG_ABS types.
+     *
+     * It should return NO_SEG if the segment base cannot be
+     * determined; the evaluator (which calls this routine) is
+     * responsible for throwing an error condition if that occurs
+     * in pass two or in a critical expression.
      */
     long (*segbase) (long segment);
 
@@ -516,8 +682,9 @@ struct ofmt {
      * the "init" routine - and is passed the name of the input
      * file from which this output file is being generated. It
      * should return its preferred name for the output file in
-     * `outfunc'. Since it is called before the driver is properly
-     * initialised, it has to be passed its error handler
+     * `outname', if outname[0] is '\0', and do nothing to
+     * `outname' otherwise. Since it is called before the driver is
+     * properly initialised, it has to be passed its error handler
      * separately.
      *
      * This procedure may also take its own copy of the input file
diff --git a/nasmlib.c b/nasmlib.c
index b2441c7b..bd671f55 100644
--- a/nasmlib.c
+++ b/nasmlib.c
@@ -102,6 +102,28 @@ char *nasm_strdup (char *s)
     return p;
 }
 
+#ifdef LOGALLOC
+char *nasm_strndup_log (char *file, int line, char *s, size_t len)
+#else
+char *nasm_strndup (char *s, size_t len)
+#endif
+{				       /* duplicate at most `len' chars of `s' */
+    char *p;
+    size_t size = len+1;	       /* size_t, so a large len can't overflow */
+
+    p = malloc(size);
+    if (!p)
+	nasm_malloc_error (ERR_FATAL | ERR_NOFILE, "out of memory");
+#ifdef LOGALLOC
+    else
+	fprintf(logfp, "%s %d strndup(%ld) returns %p\n",
+		file, line, (long)size, p);
+#endif
+    strncpy (p, s, len);	       /* copies no more than len characters */
+    p[len] = '\0';		       /* strncpy need not terminate: do it here */
+    return p;
+}
+
 int nasm_stricmp (char *s1, char *s2) {
     while (*s1 && toupper(*s1) == toupper(*s2))
 	s1++, s2++;
@@ -130,7 +152,8 @@ int nasm_strnicmp (char *s1, char *s2, int n) {
 long readnum (char *str, int *error) {
     char *r = str, *q;
     long radix;
-    long result;
+    unsigned long result, checklimit;
+    int warn = FALSE;
 
     *error = FALSE;
 
@@ -157,15 +180,42 @@ long readnum (char *str, int *error) {
     else
 	radix = 10;
 
+    /*
+     * If this number has been found for us by something other than
+     * the ordinary scanners, then it might be malformed by having
+     * nothing between the prefix and the suffix. Check this case
+     * now.
+     */
+    if (r >= q) {
+	*error = TRUE;
+	return 0;
+    }
+
+    /*
+     * `checklimit' is the largest value that fits in 32 bits.
+     * Appending a digit overflows exactly when radix*result +
+     * digit would exceed it, so the loop tests result against
+     * (checklimit - digit) / radix, which cannot itself overflow.
+     */
+    checklimit = 0xFFFFFFFFUL;
+
     result = 0;
     while (*r && r < q) {
 	if (*r<'0' || (*r>'9' && *r<'A') || numvalue(*r)>=radix) {
 	    *error = TRUE;
 	    return 0;
 	}
+	if (result > (checklimit - numvalue(*r)) / radix)
+	    warn = TRUE;
 	result = radix * result + numvalue(*r);
 	r++;
     }
+
+    if (warn)
+	nasm_malloc_error (ERR_WARNING | ERR_PASS1 | ERR_WARN_NOV,
+			   "numeric constant %s does not fit in 32 bits",
+			   str);
+
     return result;
 }
 
@@ -195,6 +245,8 @@ void standard_extension (char *inname, char *outname, char *extension,
 			 efunc error) {
     char *p, *q;
 
+    if (*outname)		       /* file name already exists, */
+	return;			       /* so do nothing */
     q = inname;
     p = outname;
     while (*q) *p++ = *q++;	       /* copy, and find end of string */
@@ -225,7 +277,21 @@ typedef struct RAA_LEAF RAA_LEAF;
 typedef struct RAA_BRANCH RAA_BRANCH;
 
 struct RAA {
+    /*
+     * Number of layers below this one to get to the real data. 0
+     * means this structure is a leaf, holding RAA_BLKSIZE real
+     * data items; 1 and above mean it's a branch, holding
+     * RAA_LAYERSIZE pointers to the next level branch or leaf
+     * structures.
+     */
     int layers;
+    /*
+     * Number of real data items spanned by one position in the
+     * `data' array at this level. This number is 1, trivially, for
+     * a leaf (level 0): for a level 1 branch it should be
+     * RAA_BLKSIZE, and for a level 2 branch it's
+     * RAA_LAYERSIZE*RAA_BLKSIZE.
+     */
     long stepsize;
     union RAA_UNION {
 	struct RAA_LEAF {
@@ -254,8 +320,8 @@ static struct RAA *real_raa_init (int layers) {
 	r = nasm_malloc (BRANCHSIZ);
 	memset (r->u.b.data, 0, sizeof(r->u.b.data));
 	r->layers = layers;
-	r->stepsize = 1L;
-	while (layers--)
+	r->stepsize = RAA_BLKSIZE;
+	while (--layers)
 	    r->stepsize *= RAA_LAYERSIZE;
     }
     return r;
@@ -541,3 +607,353 @@ void saa_fpwrite (struct SAA *s, FILE *fp) {
     while ( (data = saa_rbytes (s, &len)) )
 	fwrite (data, 1, len, fp);
 }
+
+/*
+ * Register, instruction, condition-code and prefix keywords used by
+ * the scanner; bsi() binary-searches them, so keep each list sorted.
+ */
+#include "names.c"
+static char *special_names[] = {       /* matches become TOKEN_SPECIAL */
+    "byte", "dword", "far", "long", "near", "nosplit", "qword",
+    "short", "to", "tword", "word"
+};
+static char *prefix_names[] = {	       /* matches become TOKEN_PREFIX */
+    "a16", "a32", "lock", "o16", "o32", "rep", "repe", "repne",
+    "repnz", "repz", "times"
+};
+
+
+/*
+ * Standard scanner routine used by parser.c and some output
+ * formats. It keeps a succession of temporary-storage strings in
+ * stdscan_tempstorage, which can be cleared using stdscan_reset.
+ */
+static char **stdscan_tempstorage = NULL;
+static int stdscan_tempsize = 0, stdscan_templen = 0;
+#define STDSCAN_TEMP_DELTA 256
+
+static void stdscan_pop(void) {	       /* free the newest temp string */
+    nasm_free (stdscan_tempstorage[--stdscan_templen]);
+}
+
+void stdscan_reset(void) {
+    while (stdscan_templen > 0)	       /* free newest first until empty */
+	nasm_free (stdscan_tempstorage[--stdscan_templen]);
+}
+
+static char *stdscan_copy(char *p, int len) {
+    char *dup = nasm_malloc(len+1);
+
+    /* NUL-terminated private copy of the first `len' chars of p */
+    strncpy (dup, p, len);
+    dup[len] = '\0';
+
+    if (stdscan_templen >= stdscan_tempsize) {   /* table full: grow it */
+	stdscan_tempsize += STDSCAN_TEMP_DELTA;
+	stdscan_tempstorage = nasm_realloc(stdscan_tempstorage,
+					   stdscan_tempsize*sizeof(char *));
+    }
+    stdscan_tempstorage[stdscan_templen] = dup; /* remember for freeing */
+    stdscan_templen++;
+    return dup;
+}
+
+char *stdscan_bufptr = NULL;
+/*
+ * Scan one token from stdscan_bufptr into *tv and return its type
+ * (also stored in tv->t_type); 0 means end of line or a comment.
+ */
+int stdscan (void *private_data, struct tokenval *tv) {
+    char ourcopy[256], *r, *s;
+
+    while (isspace(*stdscan_bufptr)) stdscan_bufptr++;
+    if (!*stdscan_bufptr)
+	return tv->t_type = 0;
+
+    /* we have a token; either an id, a number or a char */
+    if (isidstart(*stdscan_bufptr) ||
+	(*stdscan_bufptr == '$' && isidstart(stdscan_bufptr[1]))) {
+	/* now we've got an identifier */
+	int i;
+	int is_sym = FALSE;
+
+	if (*stdscan_bufptr == '$') {
+	    is_sym = TRUE;
+	    stdscan_bufptr++;
+	}
+
+ 	r = stdscan_bufptr++;
+	while (isidchar(*stdscan_bufptr)) stdscan_bufptr++;
+	tv->t_charptr = stdscan_copy(r, stdscan_bufptr - r);
+
+	/* lower-case into ourcopy for the keyword lookups; an over-long
+	 * name is truncated to fit rather than overrunning the buffer */
+	for (s=tv->t_charptr, r=ourcopy;
+	     *s && r < ourcopy+sizeof(ourcopy)-1; s++)
+	    *r++ = tolower (*s);
+	*r = '\0';
+	if (is_sym)
+	    return tv->t_type = TOKEN_ID;/* bypass all other checks */
+	/* right, so we have an identifier sitting in temp storage. now,
+	 * is it actually a register or instruction name, or what? */
+	if ((tv->t_integer=bsi(ourcopy, reg_names,
+			       elements(reg_names)))>=0) {
+	    tv->t_integer += EXPR_REG_START;
+	    return tv->t_type = TOKEN_REG;
+	} else if ((tv->t_integer=bsi(ourcopy, insn_names,
+				      elements(insn_names)))>=0) {
+	    return tv->t_type = TOKEN_INSN;
+	}
+	for (i=0; i<elements(icn); i++)
+	    if (!strncmp(ourcopy, icn[i], strlen(icn[i]))) {
+		char *p = ourcopy + strlen(icn[i]);
+		tv->t_integer = ico[i];
+		if ((tv->t_inttwo=bsi(p, conditions,
+					 elements(conditions)))>=0)
+		    return tv->t_type = TOKEN_INSN;
+	    }
+	if ((tv->t_integer=bsi(ourcopy, prefix_names,
+				  elements(prefix_names)))>=0) {
+	    tv->t_integer += PREFIX_ENUM_START;
+	    return tv->t_type = TOKEN_PREFIX;
+	}
+	if ((tv->t_integer=bsi(ourcopy, special_names,
+				  elements(special_names)))>=0)
+	    return tv->t_type = TOKEN_SPECIAL;
+	if (!strcmp(ourcopy, "seg"))
+	    return tv->t_type = TOKEN_SEG;
+	if (!strcmp(ourcopy, "wrt"))
+	    return tv->t_type = TOKEN_WRT;
+	return tv->t_type = TOKEN_ID;
+    } else if (*stdscan_bufptr == '$' && !isnumchar(stdscan_bufptr[1])) {
+	/* A $ with no hex number after it: either Here ($), the current
+	 * assembly location, or Base ($$), the current segment's base. */
+	stdscan_bufptr++;
+	if (*stdscan_bufptr == '$') {
+	    stdscan_bufptr++;
+	    return tv->t_type = TOKEN_BASE;
+	}
+	return tv->t_type = TOKEN_HERE;
+    } else if (isnumstart(*stdscan_bufptr)) {  /* now we've got a number */
+	int rn_error;
+
+	r = stdscan_bufptr++;
+	while (isnumchar(*stdscan_bufptr))
+	    stdscan_bufptr++;
+
+	if (*stdscan_bufptr == '.') {
+	    /* a floating point constant */
+	    stdscan_bufptr++;
+	    while (isnumchar(*stdscan_bufptr))
+		stdscan_bufptr++;
+	    tv->t_charptr = stdscan_copy(r, stdscan_bufptr - r);
+	    return tv->t_type = TOKEN_FLOAT;
+	}
+	r = stdscan_copy(r, stdscan_bufptr - r);
+	tv->t_integer = readnum(r, &rn_error);
+	stdscan_pop();
+	if (rn_error)
+	    return tv->t_type = TOKEN_ERRNUM;/* some malformation occurred */
+	tv->t_charptr = NULL;
+	return tv->t_type = TOKEN_NUM;
+    } else if (*stdscan_bufptr == '\'' ||
+	       *stdscan_bufptr == '"') {/* a char constant */
+    	char quote = *stdscan_bufptr++, *r;
+	r = tv->t_charptr = stdscan_bufptr;
+	while (*stdscan_bufptr && *stdscan_bufptr != quote) stdscan_bufptr++;
+	tv->t_inttwo = stdscan_bufptr - r;      /* store full version */
+	if (!*stdscan_bufptr)
+	    return tv->t_type = TOKEN_ERRNUM;       /* unmatched quotes */
+	tv->t_integer = 0;
+	r = stdscan_bufptr++;		       /* skip over final quote */
+	while (quote != *--r) {
+	    tv->t_integer = (tv->t_integer<<8) + (unsigned char) *r;
+	}
+	return tv->t_type = TOKEN_NUM;
+    } else if (*stdscan_bufptr == ';') {  /* a comment has happened - stay */
+	return tv->t_type = 0;
+    } else if (stdscan_bufptr[0] == '>' && stdscan_bufptr[1] == '>') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_SHR;
+    } else if (stdscan_bufptr[0] == '<' && stdscan_bufptr[1] == '<') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_SHL;
+    } else if (stdscan_bufptr[0] == '/' && stdscan_bufptr[1] == '/') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_SDIV;
+    } else if (stdscan_bufptr[0] == '%' && stdscan_bufptr[1] == '%') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_SMOD;
+    } else if (stdscan_bufptr[0] == '=' && stdscan_bufptr[1] == '=') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_EQ;
+    } else if (stdscan_bufptr[0] == '<' && stdscan_bufptr[1] == '>') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_NE;
+    } else if (stdscan_bufptr[0] == '!' && stdscan_bufptr[1] == '=') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_NE;
+    } else if (stdscan_bufptr[0] == '<' && stdscan_bufptr[1] == '=') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_LE;
+    } else if (stdscan_bufptr[0] == '>' && stdscan_bufptr[1] == '=') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_GE;
+    } else if (stdscan_bufptr[0] == '&' && stdscan_bufptr[1] == '&') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_DBL_AND;
+    } else if (stdscan_bufptr[0] == '^' && stdscan_bufptr[1] == '^') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_DBL_XOR;
+    } else if (stdscan_bufptr[0] == '|' && stdscan_bufptr[1] == '|') {
+	stdscan_bufptr += 2;
+	return tv->t_type = TOKEN_DBL_OR;
+    } else			       /* just an ordinary char */
+    	return tv->t_type = (unsigned char) (*stdscan_bufptr++);
+}
+
+/*
+ * Return TRUE if the argument is a simple scalar. (Or a far-
+ * absolute, which counts.)
+ */
+int is_simple (expr *vect) {
+    expr *e = vect;
+
+    for (; e->type && !e->value; e++)
+	;			       /* skip leading zero-valued terms */
+    if (!e->type)
+	return 1;		       /* nothing left: trivially simple */
+    if (e->type != EXPR_SIMPLE)
+	return 0;
+    for (e++; e->type && !e->value; e++)
+	;			       /* skip zero terms after the scalar */
+    return !(e->type && e->type < EXPR_SEGBASE+SEG_ABS);
+}
+
+/*
+ * Return TRUE if the argument is a simple scalar, _NOT_ a far-
+ * absolute.
+ */
+int is_really_simple (expr *vect) {
+    expr *e = vect;
+
+    while (e->type && !e->value)       /* skip leading zero-valued terms */
+	e++;
+    if (!e->type)
+	return 1;
+    if (e->type != EXPR_SIMPLE)
+	return 0;
+    for (e++; e->type && !e->value; e++)
+	;			       /* skip zero terms after the scalar */
+    return !e->type;		       /* any further term disqualifies */
+}
+
+/*
+ * Return TRUE if the argument is relocatable (i.e. a simple
+ * scalar, plus at most one segment-base, plus possibly a WRT).
+ */
+int is_reloc (expr *vect) {
+    expr *e = vect;
+
+    for (; e->type && !e->value; e++)
+	;			       /* skip leading zero-valued terms */
+    if (!e->type)
+	return 1;		       /* empty vector */
+    if (e->type < EXPR_SIMPLE)
+	return 0;		       /* term of a kind below EXPR_SIMPLE */
+    if (e->type == EXPR_SIMPLE) {
+	for (e++; e->type && !e->value; e++)
+	    ;			       /* step past the scalar part */
+	if (!e->type)
+	    return 1;		       /* pure scalar */
+    }
+    if (e->type != EXPR_WRT && !(e->value == 0 || e->value == 1))
+	return 0;		       /* segment base multiplier non-unity */
+    for (e++; e->type && (e->type == EXPR_WRT || !e->value); e++)
+	;			       /* skip WRT and zero-valued terms */
+
+    /* anything still left over means the vector is not relocatable */
+    return e->type ? 0 : 1;
+}
+
+/*
+ * Return TRUE if the argument contains an `unknown' part.
+ */
+int is_unknown(expr *vect) {
+    expr *e = vect;
+    while (e->type && e->type < EXPR_UNKNOWN) e++;  /* skip lower types */
+    return e->type == EXPR_UNKNOWN;
+}
+
+/*
+ * Return TRUE if the argument contains nothing but an `unknown'
+ * part.
+ */
+int is_just_unknown(expr *vect) {
+    expr *e = vect;
+    while (e->type && !e->value) e++;	/* find first non-zero term */
+    return e->type == EXPR_UNKNOWN;
+}
+
+/*
+ * Return the scalar part of a relocatable vector. (Including
+ * simple scalar vectors - those qualify as relocatable.)
+ */
+long reloc_value (expr *vect) {
+    expr *e = vect;
+
+    for (; e->type && !e->value; e++)
+	;			       /* find the first non-zero term */
+
+    /* only a leading EXPR_SIMPLE term contributes a scalar value */
+    return (e->type && e->type == EXPR_SIMPLE) ? e->value : 0;
+}
+
+/*
+ * Return the segment number of a relocatable vector, or NO_SEG for
+ * simple scalars.
+ */
+long reloc_seg (expr *vect) {
+    expr *e = vect;
+
+    /* ignore WRT terms and zero-valued terms throughout */
+    for (; e->type && (e->type == EXPR_WRT || !e->value); e++)
+	;
+    if (e->type == EXPR_SIMPLE)	       /* hop over the scalar part */
+	for (e++; e->type && (e->type == EXPR_WRT || !e->value); e++)
+	    ;
+
+    /* whatever is left is taken to be the segment-base term */
+    return e->type ? e->type - EXPR_SEGBASE : NO_SEG;
+}
+
+/*
+ * Return the WRT segment number of a relocatable vector, or NO_SEG
+ * if no WRT part is present.
+ */
+long reloc_wrt (expr *vect) {
+    expr *e = vect;
+    for (; e->type && e->type < EXPR_WRT; e++)
+	;			       /* advance to the WRT term, if any */
+    if (e->type != EXPR_WRT)
+	return NO_SEG;
+    return e->value;
+}
+
+/*
+ * Binary search.
+ */
+int bsi (char *string, char **array, int size) {
+    int lo = -1, hi = size;	       /* match, if any, lies in (lo,hi) */
+    while (hi-lo >= 2) {
+	int mid = (lo+hi)/2;
+	int cmp = strcmp(string, array[mid]);
+	if (cmp == 0)		       /* exact match */
+	    return mid;
+	if (cmp < 0)		       /* look below mid */
+	    hi = mid;
+	else			       /* look above mid */
+	    lo = mid;
+    }
+    return -1;			       /* not present */
+}
diff --git a/nasmlib.h b/nasmlib.h
index 21202a80..9168f183 100644
--- a/nasmlib.h
+++ b/nasmlib.h
@@ -32,15 +32,18 @@ void *nasm_malloc (size_t);
 void *nasm_realloc (void *, size_t);
 void nasm_free (void *);
 char *nasm_strdup (char *);
+char *nasm_strndup (char *, size_t);
 #else
 void *nasm_malloc_log (char *, int, size_t);
 void *nasm_realloc_log (char *, int, void *, size_t);
 void nasm_free_log (char *, int, void *);
 char *nasm_strdup_log (char *, int, char *);
+char *nasm_strndup_log (char *, int, char *, size_t);
 #define nasm_malloc(x) nasm_malloc_log(__FILE__,__LINE__,x)
 #define nasm_realloc(x,y) nasm_realloc_log(__FILE__,__LINE__,x,y)
 #define nasm_free(x) nasm_free_log(__FILE__,__LINE__,x)
 #define nasm_strdup(x) nasm_strdup_log(__FILE__,__LINE__,x)
+#define nasm_strndup(x,y) nasm_strndup_log(__FILE__,__LINE__,x,y)
 #endif
 #endif
 
@@ -136,4 +139,34 @@ void saa_fread (struct SAA *s, long posn, void *p, long len);   /* fixup */
 void saa_fwrite (struct SAA *s, long posn, void *p, long len);   /* fixup */
 void saa_fpwrite (struct SAA *, FILE *);
 
+#ifdef NASM_NASM_H
+/*
+ * Standard scanner.
+ */
+extern char *stdscan_bufptr;
+void stdscan_reset(void);
+int stdscan (void *private_data, struct tokenval *tv);
+#endif
+
+#ifdef NASM_NASM_H
+/*
+ * Library routines to manipulate expression data types.
+ */
+int is_reloc(expr *);
+int is_simple(expr *);
+int is_really_simple (expr *);
+int is_unknown(expr *);
+int is_just_unknown(expr *);
+long reloc_value(expr *);
+long reloc_seg(expr *);
+long reloc_wrt(expr *);
+#endif
+
+/*
+ * Binary search routine. Returns index into `array' of an entry
+ * matching `string', or <0 if no match. `array' is taken to
+ * contain `size' elements, sorted in ascending strcmp() order.
+ */
+int bsi (char *string, char **array, int size);
+
 #endif
diff --git a/ndisasm.1 b/ndisasm.1
new file mode 100644
index 00000000..78d313c3
--- /dev/null
+++ b/ndisasm.1
@@ -0,0 +1,117 @@
+.TH NDISASM 1 "The Netwide Assembler Project"
+.SH NAME
+ndisasm \- the Netwide Disassembler \- 80x86 binary file disassembler
+.SH SYNOPSIS
+.B ndisasm
+[
+.B \-o
+origin
+] [
+.B \-s
+sync-point [...]]
+[
+.B \-a
+|
+.B \-i
+] [
+.B \-b
+bits
+] [
+.B \-u
+] [
+.B \-e
+hdrlen
+] [
+.B \-k
+offset,length [...]]
+infile
+.br
+.B ndisasm \-h
+.br
+.B ndisasm \-r
+.SH DESCRIPTION
+The
+.B ndisasm
+command generates a disassembly listing of the binary file
+.I infile
+and directs it to stdout.
+.SS OPTIONS
+.TP
+.B \-h
+Causes
+.B ndisasm
+to exit immediately, after giving a summary of its invocation
+options.
+.TP
+.BI \-r
+Causes
+.B ndisasm
+to exit immediately, after displaying its version number.
+.TP
+.BI \-o " origin"
+Specifies the notional load address for the file. This option causes
+.B ndisasm
+to get the addresses it lists down the left hand margin, and the
+target addresses of PC-relative jumps and calls, right.
+.TP
+.BI \-s " sync-point"
+Manually specifies a synchronisation address, such that
+.B ndisasm
+will not output any machine instruction which encompasses bytes on
+both sides of the address. Hence the instruction which
+.I starts
+at that address will be correctly disassembled.
+.TP
+.BI \-e " hdrlen"
+Specifies a number of bytes to discard from the beginning of the
+file before starting disassembly. This does not count towards the
+calculation of the disassembly offset: the first
+.I disassembled
+instruction will be shown starting at the given load address.
+.TP
+.BI \-k " offset,length"
+Specifies that
+.I length
+bytes, starting from disassembly offset
+.IR offset ,
+should be skipped over without generating any output. The skipped
+bytes still count towards the calculation of the disassembly offset.
+.TP
+.BR \-a " or " \-i
+Enables automatic (or intelligent) sync mode, in which
+.B ndisasm
+will attempt to guess where synchronisation should be performed, by
+means of examining the target addresses of the relative jumps and
+calls it disassembles.
+.TP
+.BI \-b " bits"
+Specifies either 16-bit or 32-bit mode. The default is 16-bit mode.
+.TP
+.B \-u
+Specifies 32-bit mode, more compactly than using `\-b 32'.
+.PP
+.RE
+.SH RESTRICTIONS
+.B ndisasm
+only disassembles binary files: it has no understanding of the
+header information present in object or executable files. If you
+want to disassemble an object file, you should probably be using
+.BR objdump "(" 1 ")."
+.PP
+Auto-sync mode won't necessarily cure all your synchronisation
+problems: a sync marker can only be placed automatically if a jump
+or call instruction is found to refer to it
+.I before
+.B ndisasm
+actually disassembles that part of the code. Also, if spurious jumps
+or calls result from disassembling non-machine-code data, sync
+markers may get placed in strange places. Feel free to turn
+auto-sync off and go back to doing it manually if necessary.
+.PP
+.B ndisasm
+can only keep track of 8192 sync markers internally at once: this is
+to do with portability, since DOS machines don't take kindly to more
+than 64K being allocated at a time.
+.PP
+.SH SEE ALSO
+.BR objdump "(" 1 ")."
diff --git a/ndisasm.c b/ndisasm.c
index bfbad8a5..ffe6c129 100644
--- a/ndisasm.c
+++ b/ndisasm.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <errno.h>
 
 #include "nasm.h"
 #include "nasmlib.h"
@@ -169,6 +170,11 @@ int main(int argc, char **argv) {
     }
 
     fp = fopen(filename, "rb");
+    if (!fp) {
+	fprintf(stderr, "%s: unable to open `%s': %s\n",
+		pname, filename, strerror(errno));
+	return 1;
+    }
     if (initskip > 0)
 	skip (initskip, fp);
 
diff --git a/outaout.c b/outaout.c
index 2b5a381c..e4c7610c 100644
--- a/outaout.c
+++ b/outaout.c
@@ -16,20 +16,34 @@
 #include "nasmlib.h"
 #include "outform.h"
 
-#ifdef OF_AOUT
+#if defined OF_AOUT || defined OF_AOUTB
+
+#define RELTYPE_ABSOLUTE 0x00
+#define RELTYPE_RELATIVE 0x01
+#define RELTYPE_GOTPC    0x01   /* no explicit GOTPC in a.out */
+#define RELTYPE_GOTOFF   0x10
+#define RELTYPE_GOT      0x10   /* distinct from GOTOFF bcos sym not sect */
+#define RELTYPE_PLT      0x21
+#define RELTYPE_SYMFLAG  0x08
 
 struct Reloc {
     struct Reloc *next;
     long address;		       /* relative to _start_ of section */
     long symbol;		       /* symbol number or -ve section id */
     int bytes;			       /* 2 or 4 */
-    int relative;		       /* TRUE or FALSE */
+    int reltype;		       /* see above */
 };
 
 struct Symbol {
     long strpos;		       /* string table position of name */
     int type;			       /* symbol type - see flags below */
     long value;			       /* address, or COMMON variable size */
+    long size;			       /* size for data or function exports */
+    long segment;		       /* back-reference used by gsym_reloc */
+    struct Symbol *next;	       /* list of globals in each section */
+    struct Symbol *nextfwd;	       /* list of unresolved-size symbols */
+    char *name;			       /* for unresolved-size symbols */
+    long symnum;		       /* index into symbol table */
 };
 
 /*
@@ -43,9 +57,12 @@ struct Symbol {
 #define SECT_MASK 0xE		       /* mask out any of the above */
 
 /*
- * Another flag used in Symbol.type.
+ * More flags used in Symbol.type.
  */
 #define SYM_GLOBAL 1		       /* it's a global symbol */
+#define SYM_DATA 0x100		       /* used for shared libs */
+#define SYM_FUNCTION 0x200	       /* used for shared libs */
+#define SYM_WITH_SIZE 0x4000	       /* not output; internal only */
 
 /*
  * Bit more explanation of symbol types: SECT_xxx denotes a local
@@ -61,11 +78,10 @@ struct Section {
     unsigned long len, size, nrelocs;
     long index;
     struct Reloc *head, **tail;
+    struct Symbol *gsyms, *asym;
 };
 
-static struct Section stext, sdata;
-static unsigned long bsslen;
-static long bssindex;
+static struct Section stext, sdata, sbss;
 
 static struct SAA *syms;
 static unsigned long nsyms;
@@ -75,8 +91,14 @@ static struct RAA *bsym;
 static struct SAA *strs;
 static unsigned long strslen;
 
+static struct Symbol *fwds;
+
 static FILE *aoutfp;
 static efunc error;
+static evalfunc evaluate;
+
+static int bsd;
+static int is_pic;
 
 static void aout_write(void);
 static void aout_write_relocs(struct Reloc *);
@@ -85,24 +107,73 @@ static void aout_sect_write(struct Section *, unsigned char *, unsigned long);
 static void aout_pad_sections(void);
 static void aout_fixup_relocs(struct Section *);
 
-static void aout_init(FILE *fp, efunc errfunc, ldfunc ldef) {
+/*
+ * Special section numbers which are used to define special
+ * symbols, which can be used with WRT to provide PIC relocation
+ * types.
+ */
+static long aout_gotpc_sect, aout_gotoff_sect;
+static long aout_got_sect, aout_plt_sect;
+static long aout_sym_sect;
+
+static void aoutg_init(FILE *fp, efunc errfunc, ldfunc ldef, evalfunc eval) {
     aoutfp = fp;
     error = errfunc;
+    evaluate = eval;
     (void) ldef;		       /* placate optimisers */
     stext.data = saa_init(1L); stext.head = NULL; stext.tail = &stext.head;
     sdata.data = saa_init(1L); sdata.head = NULL; sdata.tail = &sdata.head;
-    stext.len = stext.size = sdata.len = sdata.size = bsslen = 0;
+    stext.len = stext.size = sdata.len = sdata.size = sbss.len = 0;
     stext.nrelocs = sdata.nrelocs = 0;
+    stext.gsyms = sdata.gsyms = sbss.gsyms = NULL;
     stext.index = seg_alloc();
     sdata.index = seg_alloc();
-    bssindex = seg_alloc();
+    sbss.index = seg_alloc();
+    stext.asym = sdata.asym = sbss.asym = NULL;
     syms = saa_init((long)sizeof(struct Symbol));
     nsyms = 0;
     bsym = raa_init();
     strs = saa_init(1L);
     strslen = 0;
+    fwds = NULL;
 }
 
+#ifdef OF_AOUT
+
+static void aout_init(FILE *fp, efunc errfunc, ldfunc ldef, evalfunc eval) {
+    bsd = FALSE;		       /* the non-BSD (Linux) flavour */
+    aoutg_init (fp, errfunc, ldef, eval);
+    /* none of the WRT special segments exist in this flavour */
+    aout_gotpc_sect = aout_gotoff_sect = NO_SEG;
+    aout_got_sect = aout_plt_sect = aout_sym_sect = NO_SEG;
+}
+
+#endif
+
+#ifdef OF_AOUTB
+
+extern struct ofmt of_aoutb;
+
+static void aoutb_init(FILE *fp, efunc errfunc, ldfunc ldef, evalfunc eval) {
+    static struct { char *name; long *sect; } specials[] = {
+	{ "..gotpc", &aout_gotpc_sect }, { "..gotoff", &aout_gotoff_sect },
+	{ "..got", &aout_got_sect }, { "..plt", &aout_plt_sect },
+	{ "..sym", &aout_sym_sect }
+    };
+    int i;
+
+    bsd = TRUE;
+    aoutg_init (fp, errfunc, ldef, eval);
+    is_pic = 0x00;		       /* may become 0x40 */
+    for (i = 0; i < (int)(sizeof(specials)/sizeof(*specials)); i++) {
+	*specials[i].sect = seg_alloc();   /* fresh segment per special symbol */
+	ldef(specials[i].name, *specials[i].sect+1, 0L, NULL, FALSE, FALSE,
+	     &of_aoutb, error);
+    }
+}
+
+#endif
+
 static void aout_cleanup(void) {
     struct Reloc *r;
 
@@ -143,21 +214,66 @@ static long aout_section_names (char *name, int pass, int *bits) {
     else if (!strcmp(name, ".data"))
 	return sdata.index;
     else if (!strcmp(name, ".bss"))
-	return bssindex;
+	return sbss.index;
     else
 	return NO_SEG;
 }
 
 static void aout_deflabel (char *name, long segment, long offset,
-			   int is_global) {
+			   int is_global, char *special) {
     int pos = strslen+4;
     struct Symbol *sym;
+    int special_used = FALSE;
 
     if (name[0] == '.' && name[1] == '.' && name[2] != '@') {
-	error (ERR_NONFATAL, "unrecognised special symbol `%s'", name);
+	/*
+	 * This is a NASM special symbol. We never allow it into
+	 * the a.out symbol table, even if it's a valid one. If it
+	 * _isn't_ a valid one, we should barf immediately.
+	 */
+	if (strcmp(name, "..gotpc") && strcmp(name, "..gotoff") &&
+	    strcmp(name, "..got") && strcmp(name, "..plt") &&
+	    strcmp(name, "..sym"))
+	    error (ERR_NONFATAL, "unrecognised special symbol `%s'", name);
 	return;
     }
 
+    if (is_global == 3) {
+	struct Symbol **s;
+	/*
+	 * Fix up a forward-reference symbol size from the first
+	 * pass.
+	 */
+	for (s = &fwds; *s; s = &(*s)->nextfwd)
+	    if (!strcmp((*s)->name, name)) {
+		struct tokenval tokval;
+		expr *e;
+		char *p = special;
+
+		while (*p && !isspace(*p)) p++;
+		while (*p && isspace(*p)) p++;
+		stdscan_reset();
+		stdscan_bufptr = p;
+		tokval.t_type = TOKEN_INVALID;
+		e = evaluate(stdscan, NULL, &tokval, NULL, 1, error, NULL);
+		if (e) {
+		    if (!is_simple(e))
+			error (ERR_NONFATAL, "cannot use relocatable"
+			       " expression as symbol size");
+		    else
+			(*s)->size = reloc_value(e);
+		}
+
+		/*
+		 * Remove it from the list of unresolved sizes.
+		 */
+		nasm_free ((*s)->name);
+		*s = (*s)->nextfwd;
+		return;
+	    }
+	return;			       /* it wasn't an important one */
+    }
+
     saa_wbytes (strs, name, (long)(1+strlen(name)));
     strslen += 1+strlen(name);
 
@@ -165,34 +281,110 @@ static void aout_deflabel (char *name, long segment, long offset,
 
     sym->strpos = pos;
     sym->type = is_global ? SYM_GLOBAL : 0;
+    sym->segment = segment;
     if (segment == NO_SEG)
 	sym->type |= SECT_ABS;
-    else if (segment == stext.index)
+    else if (segment == stext.index) {
 	sym->type |= SECT_TEXT;
-    else if (segment == sdata.index)
+	if (is_global) {
+	    sym->next = stext.gsyms;
+	    stext.gsyms = sym;
+	} else if (!stext.asym)
+	    stext.asym = sym;
+    } else if (segment == sdata.index) {
 	sym->type |= SECT_DATA;
-    else if (segment == bssindex)
+	if (is_global) {
+	    sym->next = sdata.gsyms;
+	    sdata.gsyms = sym;
+	} else if (!sdata.asym)
+	    sdata.asym = sym;
+    } else if (segment == sbss.index) {
 	sym->type |= SECT_BSS;
-    else
+	if (is_global) {
+	    sym->next = sbss.gsyms;
+	    sbss.gsyms = sym;
+	} else if (!sbss.asym)
+	    sbss.asym = sym;
+    } else
 	sym->type = SYM_GLOBAL;
     if (is_global == 2)
 	sym->value = offset;
     else
 	sym->value = (sym->type == SYM_GLOBAL ? 0 : offset);
 
+    if (is_global && sym->type != SYM_GLOBAL) {
+	/*
+	 * Global symbol exported _from_ this module. We must check
+	 * the special text for type information.
+	 */
+
+	if (special) {
+	    int n = strcspn(special, " ");
+
+	    if (!nasm_strnicmp(special, "function", n))
+		sym->type |= SYM_FUNCTION;
+	    else if (!nasm_strnicmp(special, "data", n) ||
+		     !nasm_strnicmp(special, "object", n))
+		sym->type |= SYM_DATA;
+	    else
+		error(ERR_NONFATAL, "unrecognised symbol type `%.*s'",
+		      n, special);
+	    if (special[n]) {
+		struct tokenval tokval;
+		expr *e;
+		int fwd = FALSE;
+
+		if (!bsd) {
+		    error(ERR_NONFATAL, "Linux a.out does not support"
+			  " symbol size information");
+		} else {
+		    while (special[n] && isspace(special[n]))
+			n++;
+		    /*
+		     * We have a size expression; attempt to
+		     * evaluate it.
+		     */
+		    sym->type |= SYM_WITH_SIZE;
+		    stdscan_reset();
+		    stdscan_bufptr = special+n;
+		    tokval.t_type = TOKEN_INVALID;
+		    e = evaluate(stdscan, NULL, &tokval, &fwd, 0, error, NULL);
+		    if (fwd) {
+			sym->nextfwd = fwds;
+			fwds = sym;
+			sym->name = nasm_strdup(name);
+		    } else if (e) {
+			if (!is_simple(e))
+			    error (ERR_NONFATAL, "cannot use relocatable"
+				   " expression as symbol size");
+			else
+			    sym->size = reloc_value(e);
+		    }
+		}
+	    }
+	    special_used = TRUE;
+	}
+    }
+
     /*
      * define the references from external-symbol segment numbers
      * to these symbol records.
      */
     if (segment != NO_SEG && segment != stext.index &&
-	segment != sdata.index && segment != bssindex)
+	segment != sdata.index && segment != sbss.index)
 	bsym = raa_write (bsym, segment, nsyms);
+    sym->symnum = nsyms;
 
     nsyms++;
+    if (sym->type & SYM_WITH_SIZE)
+	nsyms++;		       /* and another for the size */
+
+    if (special && !special_used)
+	error(ERR_NONFATAL, "no special symbol features supported here");
 }
 
 static void aout_add_reloc (struct Section *sect, long segment,
-			    int relative, int bytes) {
+			    int reltype, int bytes) {
     struct Reloc *r;
 
     r = *sect->tail = nasm_malloc(sizeof(struct Reloc));
@@ -203,25 +395,151 @@ static void aout_add_reloc (struct Section *sect, long segment,
     r->symbol = (segment == NO_SEG ? -SECT_ABS :
 		 segment == stext.index ? -SECT_TEXT :
 		 segment == sdata.index ? -SECT_DATA :
-		 segment == bssindex ? -SECT_BSS :
+		 segment == sbss.index ? -SECT_BSS :
 		 raa_read(bsym, segment));
-    r->relative = relative;
+    r->reltype = reltype;
+    if (r->symbol >= 0)
+	r->reltype |= RELTYPE_SYMFLAG;
+    r->bytes = bytes;
+
+    sect->nrelocs++;
+}
+
+/*
+ * This routine deals with ..got and ..sym relocations: the more
+ * complicated kinds. In shared-library writing, some relocations
+ * with respect to global symbols must refer to the precise symbol
+ * rather than referring to an offset from the base of the section
+ * _containing_ the symbol. Such relocations call to this routine,
+ * which searches the symbol list for the symbol in question.
+ *
+ * RELTYPE_GOT references require the _exact_ symbol address to be
+ * used; RELTYPE_ABSOLUTE references can be at an offset from the
+ * symbol. The boolean argument `exact' tells us this.
+ *
+ * Return value is the adjusted value of `addr', having become an
+ * offset from the symbol rather than the section. Should always be
+ * zero when returning from an exact call.
+ *
+ * Limitation: if you define two symbols at the same place,
+ * confusion will occur.
+ *
+ * Inefficiency: we search, currently, using a linked list which
+ * isn't even necessarily sorted.
+ */
+static long aout_add_gsym_reloc (struct Section *sect,
+				 long segment, long offset,
+				 int type, int bytes, int exact) {
+    struct Symbol *sym, *sm, *shead;
+    struct Reloc *r;
+
+    /* Is the target text, data, bss or an external symbol? */
+    shead = NULL;
+    if (segment == stext.index)
+	shead = stext.gsyms;
+    else if (segment == sdata.index)
+	shead = sdata.gsyms;
+    else if (segment == sbss.index)
+	shead = sbss.gsyms;
+    if (!shead) {
+	if (exact && offset != 0)
+	    error (ERR_NONFATAL, "unable to find a suitable global symbol"
+		   " for this reference");
+	else
+	    aout_add_reloc (sect, segment, type, bytes);
+	return offset;
+    }
+
+    if (exact) {
+	/* Find a symbol pointing _exactly_ at this one. */
+	for (sym = shead; sym; sym = sym->next)
+	    if (sym->value == offset)
+		break;
+    } else {
+	/*
+	 * Find the nearest symbol below this one.
+	 */
+	sym = NULL;
+	for (sm = shead; sm; sm = sm->next)
+	    if (sm->value <= offset && (!sym || sm->value > sym->value))
+		sym = sm;
+    }
+    if (!sym) {
+	if (exact) {
+	    error (ERR_NONFATAL, "unable to find a suitable global symbol"
+		   " for this reference");
+	    return 0;
+	}
+	/* no global at or below `offset': use a section-relative reloc */
+	aout_add_reloc (sect, segment, type, bytes);
+	return offset;
+    }
+
+    r = *sect->tail = nasm_malloc(sizeof(struct Reloc));
+    sect->tail = &r->next;
+    r->next = NULL;
+
+    r->address = sect->len;
+    r->symbol = sym->symnum;
+    r->reltype = type | RELTYPE_SYMFLAG;
     r->bytes = bytes;
 
     sect->nrelocs++;
+
+    return offset - sym->value;
+}
+
+/*
+ * This routine deals with ..gotoff relocations. These _must_ refer
+ * to a symbol, due to a perversity of *BSD's PIC implementation,
+ * and it must be a non-global one as well; so we store `asym', the
+ * first nonglobal symbol defined in each section, and always work
+ * from that. Relocation type is always RELTYPE_GOTOFF.
+ *
+ * Return value is the adjusted value of `addr', having become an
+ * offset from the `asym' symbol rather than the section.
+ */
+static long aout_add_gotoff_reloc (struct Section *sect, long segment,
+				   long offset, int bytes) {
+    struct Reloc *r;
+    struct Symbol *asym;
+
+    /* Find `asym' for the target section; it stays NULL for anything
+     * other than text, data or bss (e.g. an external symbol). */
+    asym = NULL;
+    if (segment == stext.index)
+	asym = stext.asym;
+    else if (segment == sdata.index)
+	asym = sdata.asym;
+    else if (segment == sbss.index)
+	asym = sbss.asym;
+    if (!asym) {
+	error (ERR_NONFATAL, "`..gotoff' relocations require a non-global"
+	       " symbol in the section");
+	return offset;		       /* no reloc emitted; value unadjusted */
+    }
+
+    r = *sect->tail = nasm_malloc(sizeof(struct Reloc));
+    sect->tail = &r->next;
+    r->next = NULL;
+
+    r->address = sect->len;
+    r->symbol = asym->symnum;
+    r->reltype = RELTYPE_GOTOFF;
+    r->bytes = bytes;
+
+    sect->nrelocs++;
+
+    return offset - asym->value;
+}
 
 static void aout_out (long segto, void *data, unsigned long type,
 		      long segment, long wrt) {
     struct Section *s;
     long realbytes = type & OUT_SIZMASK;
+    long addr;
     unsigned char mydata[4], *p;
 
-    if (wrt != NO_SEG) {
-	wrt = NO_SEG;		       /* continue to do _something_ */
-	error (ERR_NONFATAL, "WRT not supported by a.out output format");
-    }
-
     type &= OUT_TYPMASK;
 
     /*
@@ -238,7 +556,7 @@ static void aout_out (long segto, void *data, unsigned long type,
 	s = &stext;
     else if (segto == sdata.index)
 	s = &sdata;
-    else if (segto == bssindex)
+    else if (segto == sbss.index)
 	s = NULL;
     else {
 	error(ERR_WARNING, "attempt to assemble code in"
@@ -253,7 +571,7 @@ static void aout_out (long segto, void *data, unsigned long type,
 	    realbytes = 2;
 	else if (type == OUT_REL4ADR)
 	    realbytes = 4;
-	bsslen += realbytes;
+	sbss.len += realbytes;
 	return;
     }
 
@@ -264,24 +582,55 @@ static void aout_out (long segto, void *data, unsigned long type,
 		  (segto == stext.index ? "code" : "data"));
 	    aout_sect_write (s, NULL, realbytes);
 	} else
-	    bsslen += realbytes;
+	    sbss.len += realbytes;
     } else if (type == OUT_RAWDATA) {
 	if (segment != NO_SEG)
 	    error(ERR_PANIC, "OUT_RAWDATA with other than NO_SEG");
 	aout_sect_write (s, data, realbytes);
     } else if (type == OUT_ADDRESS) {
+	addr = *(long *)data;
 	if (segment != NO_SEG) {
 	    if (segment % 2) {
 		error(ERR_NONFATAL, "a.out format does not support"
 		      " segment base references");
-	    } else
-		aout_add_reloc (s, segment, FALSE, realbytes);
+	    } else {
+		if (wrt == NO_SEG) {
+		    aout_add_reloc (s, segment, RELTYPE_ABSOLUTE, realbytes);
+		} else if (!bsd) {
+		    error (ERR_NONFATAL, "Linux a.out format does not support"
+			   " any use of WRT");
+		    wrt = NO_SEG;      /* we can at least _try_ to continue */
+		} else if (wrt == aout_gotpc_sect+1) {
+		    is_pic = 0x40;
+		    aout_add_reloc (s, segment, RELTYPE_GOTPC, realbytes);
+		} else if (wrt == aout_gotoff_sect+1) {
+		    is_pic = 0x40;
+		    addr = aout_add_gotoff_reloc (s, segment,
+						  addr, realbytes);
+		} else if (wrt == aout_got_sect+1) {
+		    is_pic = 0x40;
+		    addr = aout_add_gsym_reloc (s, segment, addr, RELTYPE_GOT,
+						realbytes, TRUE);
+		} else if (wrt == aout_sym_sect+1) {
+		    addr = aout_add_gsym_reloc (s, segment, addr,
+						RELTYPE_ABSOLUTE, realbytes,
+						FALSE);
+		} else if (wrt == aout_plt_sect+1) {
+		    is_pic = 0x40;
+		    error(ERR_NONFATAL, "a.out format cannot produce non-PC-"
+			  "relative PLT references");
+		} else {
+		    error (ERR_NONFATAL, "a.out format does not support this"
+			   " use of WRT");
+		    wrt = NO_SEG;      /* we can at least _try_ to continue */
+		}
+	    }
 	}
 	p = mydata;
 	if (realbytes == 2)
-	    WRITESHORT (p, *(long *)data);
+	    WRITESHORT (p, addr);
 	else
-	    WRITELONG (p, *(long *)data);
+	    WRITELONG (p, addr);
 	aout_sect_write (s, mydata, realbytes);
     } else if (type == OUT_REL2ADR) {
 	if (segment == segto)
@@ -289,8 +638,27 @@ static void aout_out (long segto, void *data, unsigned long type,
 	if (segment != NO_SEG && segment % 2) {
 	    error(ERR_NONFATAL, "a.out format does not support"
 		  " segment base references");
-	} else
-	    aout_add_reloc (s, segment, TRUE, 2);
+	} else {
+	    if (wrt == NO_SEG) {
+		aout_add_reloc (s, segment, RELTYPE_RELATIVE, 2);
+	    } else if (!bsd) {
+		error (ERR_NONFATAL, "Linux a.out format does not support"
+		       " any use of WRT");
+		wrt = NO_SEG;      /* we can at least _try_ to continue */
+	    } else if (wrt == aout_plt_sect+1) {
+		is_pic = 0x40;
+		aout_add_reloc (s, segment, RELTYPE_PLT, 2);
+	    } else if (wrt == aout_gotpc_sect+1 ||
+		       wrt == aout_gotoff_sect+1 ||
+		       wrt == aout_got_sect+1) {
+		error(ERR_NONFATAL, "a.out format cannot produce PC-"
+		      "relative GOT references");
+	    } else {
+		error (ERR_NONFATAL, "a.out format does not support this"
+		       " use of WRT");
+		wrt = NO_SEG;      /* we can at least _try_ to continue */
+	    }
+	}
 	p = mydata;
 	WRITESHORT (p, *(long*)data-(realbytes + s->len));
 	aout_sect_write (s, mydata, 2L);
@@ -300,8 +668,27 @@ static void aout_out (long segto, void *data, unsigned long type,
 	if (segment != NO_SEG && segment % 2) {
 	    error(ERR_NONFATAL, "a.out format does not support"
 		  " segment base references");
-	} else
-	    aout_add_reloc (s, segment, TRUE, 4);
+	} else {
+	    if (wrt == NO_SEG) {
+		aout_add_reloc (s, segment, RELTYPE_RELATIVE, 4);
+	    } else if (!bsd) {
+		error (ERR_NONFATAL, "Linux a.out format does not support"
+		       " any use of WRT");
+		wrt = NO_SEG;      /* we can at least _try_ to continue */
+	    } else if (wrt == aout_plt_sect+1) {
+		is_pic = 0x40;
+		aout_add_reloc (s, segment, RELTYPE_PLT, 4);
+	    } else if (wrt == aout_gotpc_sect+1 ||
+		       wrt == aout_gotoff_sect+1 ||
+		       wrt == aout_got_sect+1) {
+		error(ERR_NONFATAL, "a.out format cannot produce PC-"
+		      "relative GOT references");
+	    } else {
+		error (ERR_NONFATAL, "a.out format does not support this"
+		       " use of WRT");
+		wrt = NO_SEG;      /* we can at least _try_ to continue */
+	    }
+	}
 	p = mydata;
 	WRITELONG (p, *(long*)data-(realbytes + s->len));
 	aout_sect_write (s, mydata, 4L);
@@ -315,9 +702,9 @@ static void aout_pad_sections(void) {
      * length is a multiple of four. (NOP == 0x90.) Also increase
      * the length of the BSS section similarly.
      */
-    aout_sect_write (&stext, pad, (-stext.len) & 3);
-    aout_sect_write (&sdata, pad, (-sdata.len) & 3);
-    bsslen = (bsslen + 3) & ~3;
+    aout_sect_write (&stext, pad, (-(long)stext.len) & 3);
+    aout_sect_write (&sdata, pad, (-(long)sdata.len) & 3);
+    sbss.len = (sbss.len + 3) & ~3;
 }
 
 /*
@@ -340,10 +727,12 @@ static void aout_fixup_relocs(struct Section *sect) {
 	saa_fread (sect->data, r->address, blk, (long)r->bytes);
 	p = q = blk;
 	l = *p++;
-	l += ((long)*p++) << 8;
-	if (r->bytes == 4) {
-	    l += ((long)*p++) << 16;
-	    l += ((long)*p++) << 24;
+	if (r->bytes > 1) {
+	    l += ((long)*p++) << 8;
+	    if (r->bytes == 4) {
+		l += ((long)*p++) << 16;
+		l += ((long)*p++) << 24;
+	    }
 	}
 	if (r->symbol == -SECT_DATA)
 	    l += stext.len;
@@ -351,8 +740,10 @@ static void aout_fixup_relocs(struct Section *sect) {
 	    l += stext.len + sdata.len;
 	if (r->bytes == 4)
 	    WRITELONG(q, l);
-	else
+	else if (r->bytes == 2)
 	    WRITESHORT(q, l);
+	else
+	    *q++ = l & 0xFF;
 	saa_fwrite (sect->data, r->address, blk, (long)r->bytes);
     }
 }
@@ -361,10 +752,11 @@ static void aout_write(void) {
     /*
      * Emit the a.out header.
      */
-    fwritelong (0x640107L, aoutfp);    /* OMAGIC, M_386, no flags */
+    /* OMAGIC, M_386 or MID_I386, no flags */
+    fwritelong (bsd ? 0x07018600 | is_pic : 0x640107L, aoutfp);
     fwritelong (stext.len, aoutfp);
     fwritelong (sdata.len, aoutfp);
-    fwritelong (bsslen, aoutfp);
+    fwritelong (sbss.len, aoutfp);
     fwritelong (nsyms * 12, aoutfp);   /* length of symbol table */
     fwritelong (0L, aoutfp);	       /* object files have no entry point */
     fwritelong (stext.nrelocs * 8, aoutfp);   /* size of text relocs */
@@ -401,12 +793,12 @@ static void aout_write_relocs (struct Reloc *r) {
 	fwritelong (r->address, aoutfp);
 
 	if (r->symbol >= 0)
-	    word2 = r->symbol | 0x8000000L;
+	    word2 = r->symbol;
 	else
 	    word2 = -r->symbol;
-	if (r->relative)
-	    word2 |= 0x1000000L;
-	word2 |= (r->bytes == 2 ? 0x2000000L : 0x4000000L);
+	word2 |= r->reltype << 24;
+	word2 |= (r->bytes == 1 ? 0 :
+		  r->bytes == 2 ? 0x2000000L : 0x4000000L);
 	fwritelong (word2, aoutfp);
 
 	r = r->next;
@@ -420,7 +812,7 @@ static void aout_write_syms (void) {
     for (i=0; i<nsyms; i++) {
 	struct Symbol *sym = saa_rstruct(syms);
 	fwritelong (sym->strpos, aoutfp);
-	fwritelong ((long)sym->type, aoutfp);
+	fwritelong ((long)sym->type & ~SYM_WITH_SIZE, aoutfp);
 	/*
 	 * Fix up the symbol value now we know the final section
 	 * sizes.
@@ -430,6 +822,15 @@ static void aout_write_syms (void) {
 	if ((sym->type & SECT_MASK) == SECT_BSS)
 	    sym->value += stext.len + sdata.len;
 	fwritelong (sym->value, aoutfp);
+	/*
+	 * Output a size record if necessary.
+	 */
+	if (sym->type & SYM_WITH_SIZE) {
+	    fwritelong(sym->strpos, aoutfp);
+	    fwritelong(0x0DL, aoutfp);  /* special value: means size */
+	    fwritelong(sym->size, aoutfp);
+	    i++;		       /* use up another of `nsyms' */
+	}
     }
 }
 
@@ -451,9 +852,19 @@ static void aout_filename (char *inname, char *outname, efunc error) {
     standard_extension (inname, outname, ".o", error);
 }
 
+static char *aout_stdmac[] = {	       /* macro text listed in struct ofmt */
+    "%define __SECT__ [section .text]",
+    NULL			       /* last entry of the list */
+};
+
+#endif /* OF_AOUT || OF_AOUTB */
+
+#ifdef OF_AOUT
+
 struct ofmt of_aout = {
-    "GNU a.out (i386) object files (e.g. Linux)",
+    "Linux a.out object files",
     "aout",
+    aout_stdmac,
     aout_init,
     aout_out,
     aout_deflabel,
@@ -464,4 +875,22 @@ struct ofmt of_aout = {
     aout_cleanup
 };
 
-#endif /* OF_AOUT */
+#endif
+
+#ifdef OF_AOUTB
+
+struct ofmt of_aoutb = {
+    "NetBSD/FreeBSD a.out object files",
+    "aoutb",
+    aout_stdmac,		       /* same macro list as of_aout */
+    aoutb_init,			       /* of_aout has aout_init in this slot */
+    aout_out,			       /* also used by of_aout */
+    aout_deflabel,		       /* also used by of_aout */
+    aout_section_names,
+    aout_segbase,
+    aout_directive,
+    aout_filename,
+    aout_cleanup		       /* also used by of_aout */
+};
+
+#endif
diff --git a/outas86.c b/outas86.c
index dd0656f2..f214d866 100644
--- a/outas86.c
+++ b/outas86.c
@@ -80,7 +80,7 @@ static void as86_write_section (struct Section *, int);
 static int as86_add_string (char *name);
 static void as86_sect_write(struct Section *, unsigned char *, unsigned long);
 
-static void as86_init(FILE *fp, efunc errfunc, ldfunc ldef) {
+static void as86_init(FILE *fp, efunc errfunc, ldfunc ldef, evalfunc eval) {
     as86fp = fp;
     error = errfunc;
     (void) ldef;		       /* placate optimisers */
@@ -158,9 +158,13 @@ static int as86_add_string (char *name) {
 }
 
 static void as86_deflabel (char *name, long segment, long offset,
-			   int is_global) {
+			   int is_global, char *special) {
     struct Symbol *sym;
 
+    if (special)
+	error (ERR_NONFATAL, "as86 format does not support any"
+	       " special symbol types");
+
     if (name[0] == '.' && name[1] == '.' && name[2] != '@') {
 	error (ERR_NONFATAL, "unrecognised special symbol `%s'", name);
 	return;
@@ -429,7 +433,7 @@ static void as86_write(void) {
 static void as86_set_rsize (int size) {
     if (as86_reloc_size != size) {
 	switch (as86_reloc_size = size) {
-	  case 1: fputc (0x01, as86fp); break;   /* shouldn't happen */
+	  case 1: fputc (0x01, as86fp); break;
 	  case 2: fputc (0x02, as86fp); break;
 	  case 4: fputc (0x03, as86fp); break;
 	  default: error (ERR_PANIC, "bizarre relocation size %d", size);
@@ -533,9 +537,15 @@ static void as86_filename (char *inname, char *outname, efunc error) {
     standard_extension (inname, outname, ".o", error);
 }
 
+static char *as86_stdmac[] = {	       /* macro text listed in of_as86 */
+    "%define __SECT__ [section .text]",
+    NULL			       /* last entry of the list */
+};
+
 struct ofmt of_as86 = {
     "Linux as86 (bin86 version 0.3) object files",
     "as86",
+    as86_stdmac,
     as86_init,
     as86_out,
     as86_deflabel,
diff --git a/outbin.c b/outbin.c
index da82abc5..3540739f 100644
--- a/outbin.c
+++ b/outbin.c
@@ -10,6 +10,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <ctype.h>
+
 #include "nasm.h"
 #include "nasmlib.h"
 #include "outform.h"
@@ -35,6 +37,8 @@ static struct Reloc {
     struct Section *target;
 } *relocs, **reloctail;
 
+static long data_align, bss_align;
+
 static long start_point;
 
 static void add_reloc (struct Section *s, long bytes, long secref,
@@ -51,7 +55,7 @@ static void add_reloc (struct Section *s, long bytes, long secref,
     r->target = s;
 }
 
-static void bin_init (FILE *afp, efunc errfunc, ldfunc ldef) {
+static void bin_init (FILE *afp, efunc errfunc, ldfunc ldef, evalfunc eval) {
     fp = afp;
 
     error = errfunc;
@@ -67,20 +71,22 @@ static void bin_init (FILE *afp, efunc errfunc, ldfunc ldef) {
     bssindex = seg_alloc();
     relocs = NULL;
     reloctail = &relocs;
+    data_align = bss_align = 4;
 }
 
 static void bin_cleanup (void) {
     struct Reloc *r;
-    long datapos, dataalign, bsspos;
+    long datapos, datagap, bsspos;
 
-    datapos = (start_point + textsect.length + 3) & ~3;/* align on 4 bytes */
-    dataalign = datapos - (start_point + textsect.length);
+    datapos = start_point + textsect.length;
+    datapos = (datapos + data_align-1) & ~(data_align-1);
+    datagap = datapos - (start_point + textsect.length);
+    bsspos = datapos + datasect.length;
+    bsspos = (bsspos + bss_align-1) & ~(bss_align-1);
 
     saa_rewind (textsect.contents);
     saa_rewind (datasect.contents);
 
-    bsspos = (datapos + datasect.length + 3) & ~3;
-
     for (r = relocs; r; r = r->next) {
 	unsigned char *p, *q, mydata[4];
 	long l;
@@ -88,10 +94,12 @@ static void bin_cleanup (void) {
 	saa_fread (r->target->contents, r->posn, mydata, r->bytes);
 	p = q = mydata;
 	l = *p++;
-	l += ((long)*p++) << 8;
-	if (r->bytes == 4) {
-	    l += ((long)*p++) << 16;
-	    l += ((long)*p++) << 24;
+	if (r->bytes > 1) {
+	    l += ((long)*p++) << 8;
+	    if (r->bytes == 4) {
+		l += ((long)*p++) << 16;
+		l += ((long)*p++) << 24;
+	    }
 	}
 
 	if (r->secref == textsect.index)
@@ -110,13 +118,16 @@ static void bin_cleanup (void) {
 
 	if (r->bytes == 4)
 	    WRITELONG(q, l);
-	else
+	else if (r->bytes == 2)
 	    WRITESHORT(q, l);
+	else
+	    *q++ = l & 0xFF;
 	saa_fwrite (r->target->contents, r->posn, mydata, r->bytes);
     }
     saa_fpwrite (textsect.contents, fp);
     if (datasect.length > 0) {
-	fwrite ("\0\0\0\0", dataalign, 1, fp);
+	while (datagap--)
+	    fputc('\0', fp);
 	saa_fpwrite (datasect.contents, fp);
     }
     fclose (fp);
@@ -240,7 +251,12 @@ static void bin_out (long segto, void *data, unsigned long type,
 }
 
 static void bin_deflabel (char *name, long segment, long offset,
-			  int is_global) {
+			  int is_global, char *special) {
+
+    if (special)
+	error (ERR_NONFATAL, "binary format does not support any"
+	       " special symbol types");
+
     if (name[0] == '.' && name[1] == '.' && name[2] != '@') {
 	error (ERR_NONFATAL, "unrecognised special symbol `%s'", name);
 	return;
@@ -253,6 +269,10 @@ static void bin_deflabel (char *name, long segment, long offset,
 }
 
 static long bin_secname (char *name, int pass, int *bits) {
+    int sec_index;
+    long *sec_align;
+    char *p;
+
     /*
      * Default is 16 bits.
      */
@@ -262,14 +282,40 @@ static long bin_secname (char *name, int pass, int *bits) {
     if (!name)
 	return textsect.index;
 
-    if (!strcmp(name, ".text"))
-	return textsect.index;
-    else if (!strcmp(name, ".data"))
-	return datasect.index;
-    else if (!strcmp(name, ".bss"))
-	return bssindex;
-    else
+    p = name;
+    while (*p && !isspace(*p)) p++;
+    if (*p) *p++ = '\0';
+    if (!strcmp(name, ".text")) {
+	sec_index = textsect.index;
+	sec_align = NULL;
+    } else if (!strcmp(name, ".data")) {
+	sec_index = datasect.index;
+	sec_align = &data_align;
+    } else if (!strcmp(name, ".bss")) {
+	sec_index = bssindex;
+	sec_align = &bss_align;
+    } else
 	return NO_SEG;
+
+    if (*p) {
+	if (!nasm_strnicmp(p,"align=",6)) {
+	    if (sec_align == NULL)
+		error(ERR_NONFATAL, "cannot specify an alignment to"
+		      " the `.text' section");
+	    else if (p[6+strspn(p+6,"0123456789")])
+		error(ERR_NONFATAL, "argument to `align' is not numeric");
+	    else {
+		unsigned int align = atoi(p+6);
+		if (!align || ((align-1) & align))
+		    error(ERR_NONFATAL, "argument to `align' is not a"
+			  " power of two");
+		else
+		    *sec_align = align;
+	    }
+	}
+    }
+
+    return sec_index;
 }
 
 static long bin_segbase (long segment) {
@@ -292,9 +338,18 @@ static void bin_filename (char *inname, char *outname, efunc error) {
     standard_extension (inname, outname, "", error);
 }
 
+static char *bin_stdmac[] = {	       /* macro text listed in of_bin */
+    "%define __SECT__ [section .text]",
+    "%imacro org 1+.nolist",	       /* `org x' ... */
+    "[org %1]",			       /* ... becomes `[org x]' */
+    "%endmacro",
+    NULL			       /* last entry of the list */
+};
+
 struct ofmt of_bin = {
     "flat-form binary files (e.g. DOS .COM, .SYS)",
     "bin",
+    bin_stdmac,
     bin_init,
     bin_out,
     bin_deflabel,
diff --git a/outcoff.c b/outcoff.c
index 21b9bac4..09e886ce 100644
--- a/outcoff.c
+++ b/outcoff.c
@@ -37,14 +37,11 @@
  * (2) Win32 doesn't bother putting any flags in the header flags
  * field (at offset 0x12 into the file).
  *
- * (3) Win32 puts some weird flags into the section header table.
- * It uses flags 0x80000000 (writable), 0x40000000 (readable) and
- * 0x20000000 (executable) in the expected combinations, which
- * standard COFF doesn't seem to bother with, but it also does
- * something else strange: it also flags code sections as
- * 0x00500000 and data/bss as 0x00300000. Even Microsoft's
- * documentation doesn't explain what these things mean. I just go
- * ahead and use them anyway - it seems to work.
+ * (3) Win32 uses some extra flags into the section header table:
+ * it defines flags 0x80000000 (writable), 0x40000000 (readable)
+ * and 0x20000000 (executable), and uses them in the expected
+ * combinations. It also defines 0x00100000 through 0x00700000 for
+ * section alignments of 1 through 64 bytes.
  *
  * (4) Both standard COFF and Win32 COFF seem to use the DWORD
  * field directly after the section name in the section header
@@ -53,8 +50,7 @@
  * to end starting at zero. Dunno why. Microsoft's documentation
  * lists this field as "Virtual Size of Section", which doesn't
  * seem to fit at all. In fact, Win32 even includes non-linked
- * sections such as .drectve in this calculation. Not that I can be
- * bothered with those things anyway.
+ * sections such as .drectve in this calculation.
  *
  * (5) Standard COFF does something very strange to common
  * variables: the relocation point for a common variable is as far
@@ -131,13 +127,15 @@ static void coff_section_header (char *, long, long, long, long, int, long);
 static void coff_write_relocs (struct Section *);
 static void coff_write_symbols (void);
 
-static void coff_win32_init(FILE *fp, efunc errfunc, ldfunc ldef) {
+static void coff_win32_init(FILE *fp, efunc errfunc,
+			    ldfunc ldef, evalfunc eval) {
     win32 = TRUE;
     (void) ldef;		       /* placate optimisers */
     coff_gen_init(fp, errfunc);
 }
 
-static void coff_std_init(FILE *fp, efunc errfunc, ldfunc ldef) {
+static void coff_std_init(FILE *fp, efunc errfunc,
+			  ldfunc ldef, evalfunc eval) {
     win32 = FALSE;
     (void) ldef;		       /* placate optimisers */
     coff_gen_init(fp, errfunc);
@@ -209,7 +207,7 @@ static int coff_make_section (char *name, unsigned long flags) {
 
 static long coff_section_names (char *name, int pass, int *bits) {
     char *p;
-    unsigned long flags;
+    unsigned long flags, align_and = ~0L, align_or = 0L;
     int i;
 
     /*
@@ -224,7 +222,7 @@ static long coff_section_names (char *name, int pass, int *bits) {
     p = name;
     while (*p && !isspace(*p)) p++;
     if (*p) *p++ = '\0';
-    if (strlen(p) > 8) {
+    if (strlen(name) > 8) {
 	error (ERR_WARNING, "COFF section names limited to 8 characters:"
 	       " truncating");
-	p[8] = '\0';
+	name[8] = '\0';		       /* cut the name, not the attributes */
@@ -237,7 +235,7 @@ static long coff_section_names (char *name, int pass, int *bits) {
 	while (*p && !isspace(*p)) p++;
 	if (*p) *p++ = '\0';
 	while (*p && isspace(*p)) p++;
-	
+
 	if (!nasm_stricmp(q, "code") || !nasm_stricmp(q, "text")) {
 	    flags = TEXT_FLAGS;
 	} else if (!nasm_stricmp(q, "data")) {
@@ -252,6 +250,32 @@ static long coff_section_names (char *name, int pass, int *bits) {
 		error (ERR_NONFATAL, "standard COFF does not support"
 		       " informational sections");
 	    }
+	} else if (!nasm_strnicmp(q,"align=",6)) {
+	    if (!win32)
+		error (ERR_NONFATAL, "standard COFF does not support"
+		       " section alignment specification");
+	    else {
+		if (q[6+strspn(q+6,"0123456789")])
+		    error(ERR_NONFATAL, "argument to `align' is not numeric");
+		else {
+		    unsigned int align = atoi(q+6);
+		    if (!align || ((align-1) & align))
+			error(ERR_NONFATAL, "argument to `align' is not a"
+			      " power of two");
+		    else if (align > 64)
+			error(ERR_NONFATAL, "Win32 cannot align sections"
+			      " to better than 64-byte boundaries");
+		    else {
+			align_and = ~0x00F00000L;
+			align_or = (align == 1 ? 0x00100000L :
+				    align == 2 ? 0x00200000L :
+				    align == 4 ? 0x00300000L :
+				    align == 8 ? 0x00400000L :
+				    align == 16 ? 0x00500000L :
+				    align == 32 ? 0x00600000L : 0x00700000L);
+		    }
+		}
+	    }
 	}
     }
 
@@ -259,18 +283,19 @@ static long coff_section_names (char *name, int pass, int *bits) {
 	if (!strcmp(name, sects[i]->name))
 	    break;
     if (i == nsects) {
-	if (!strcmp(name, ".text") && !flags)
-	    i = coff_make_section (name, TEXT_FLAGS);
-	else if (!strcmp(name, ".data") && !flags)
-	    i = coff_make_section (name, DATA_FLAGS);
-	else if (!strcmp(name, ".bss") && !flags)
-	    i = coff_make_section (name, BSS_FLAGS);
-	else if (flags)
-	    i = coff_make_section (name, flags);
-	else
-	    i = coff_make_section (name, TEXT_FLAGS);
+	if (!flags) {
+	    if (!strcmp(name, ".data"))
+		flags = DATA_FLAGS;
+	    else if (!strcmp(name, ".bss"))
+		flags = BSS_FLAGS;
+	    else
+		flags = TEXT_FLAGS;
+	}
+	i = coff_make_section (name, flags);
 	if (flags)
 	    sects[i]->flags = flags;
+	sects[i]->flags &= align_and;
+	sects[i]->flags |= align_or;
     } else if (pass == 1) {
 	if (flags)
 	    error (ERR_WARNING, "section attributes ignored on"
@@ -281,10 +306,14 @@ static long coff_section_names (char *name, int pass, int *bits) {
 }
 
 static void coff_deflabel (char *name, long segment, long offset,
-			   int is_global) {
+			   int is_global, char *special) {
     int pos = strslen+4;
     struct Symbol *sym;
 
+    if (special)		       /* reported, then carried on regardless */
+	error (ERR_NONFATAL, "COFF format does not support any"
+	       " special symbol types");
+
     if (name[0] == '.' && name[1] == '.' && name[2] != '@') {
 	error (ERR_NONFATAL, "unrecognised special symbol `%s'", name);
 	return;
@@ -430,8 +459,8 @@ static void coff_out (long segto, void *data, unsigned long type,
 	    error(ERR_PANIC, "OUT_RAWDATA with other than NO_SEG");
 	coff_sect_write (s, data, realbytes);
     } else if (type == OUT_ADDRESS) {
-	if (realbytes == 2 && (segment != NO_SEG || wrt != NO_SEG))
-	    error(ERR_NONFATAL, "COFF format does not support 16-bit"
+	if (realbytes != 4 && (segment != NO_SEG || wrt != NO_SEG))
+	    error(ERR_NONFATAL, "COFF format does not support non-32-bit"
 		  " relocations");
 	else {
 	    long fix = 0;
@@ -661,11 +690,17 @@ static void coff_win32_filename (char *inname, char *outname, efunc error) {
 
 #endif /* defined(OF_COFF) || defined(OF_WIN32) */
 
+static char *coff_stdmac[] = {	       /* listed in of_coff and of_win32 */
+    "%define __SECT__ [section .text]",
+    NULL			       /* last entry of the list */
+};
+
 #ifdef OF_COFF
 
 struct ofmt of_coff = {
     "COFF (i386) object files (e.g. DJGPP for DOS)",
     "coff",
+    coff_stdmac,
     coff_std_init,
     coff_out,
     coff_deflabel,
@@ -683,6 +718,7 @@ struct ofmt of_coff = {
 struct ofmt of_win32 = {
     "Microsoft Win32 (i386) object files",
     "win32",
+    coff_stdmac,
     coff_win32_init,
     coff_out,
     coff_deflabel,
diff --git a/outdbg.c b/outdbg.c
index e37ebdbf..b3d23a04 100644
--- a/outdbg.c
+++ b/outdbg.c
@@ -18,30 +18,39 @@
 
 #ifdef OF_DBG
 
+struct Section {		       /* one record per section name seen */
+    struct Section *next;	       /* next record in the list, or NULL */
+    long number;		       /* segment number from seg_alloc() */
+    char *name;			       /* owned copy, freed in dbg_cleanup */
+} *dbgsect;			       /* head of the list */
+
 FILE *dbgf;
 efunc dbgef;
 
-int segcode,segdata,segbss;
-
-static void dbg_init(FILE *fp, efunc errfunc, ldfunc ldef)
+static void dbg_init(FILE *fp, efunc errfunc, ldfunc ldef, evalfunc eval)
 {
-  dbgf = fp;
-  dbgef = errfunc;
-  (void) ldef;
-  segcode = seg_alloc();
-  segdata = seg_alloc();
-  segbss = seg_alloc();
-  fprintf(fp,"NASM Output format debug dump - code=%d,data=%d,bss=%d\n",
-	  segcode,segdata,segbss);
+    dbgf = fp;
+    dbgef = errfunc;
+    dbgsect = NULL;
+    (void) ldef;
+    fprintf(fp,"NASM Output format debug dump\n");
 }
 
 static void dbg_cleanup(void)
 {
-  fclose(dbgf);
+    while (dbgsect) {		       /* free every section record */
+	struct Section *tmp = dbgsect;
+	dbgsect = dbgsect->next;       /* step on before freeing the node */
+	nasm_free (tmp->name);
+	nasm_free (tmp);
+    }
+    fclose(dbgf);		       /* finally close the trace file */
 }
 
 static long dbg_section_names (char *name, int pass, int *bits)
 {
+    int seg;
+
     /*
      * We must have an initial default: let's make it 16.
      */
@@ -49,73 +58,91 @@ static long dbg_section_names (char *name, int pass, int *bits)
 	*bits = 16;
 
     if (!name)
-	return 0;
-
-    if (!strcmp(name, ".text"))
-	return segcode;
-    else if (!strcmp(name, ".data"))
-	return segdata;
-    else if (!strcmp(name, ".bss"))
-	return segbss;
-    else
-	return NO_SEG;
+	fprintf(dbgf, "section_name on init: returning %d\n",
+		seg = seg_alloc());
+    else {
+	int n = strcspn(name, " \t");  /* name ends at first blank/tab */
+	char *sname = nasm_strndup(name, n);
+	struct Section *s;
+
+	seg = NO_SEG;
+	for (s = dbgsect; s; s = s->next)
+	    if (!strcmp(s->name, sname))
+		seg = s->number;
+	if (seg == NO_SEG) {
+	    s = nasm_malloc(sizeof(*s));
+	    s->name = sname;	       /* the list record keeps the copy */
+	    s->number = seg = seg_alloc();
+	    s->next = dbgsect;
+	    dbgsect = s;
+	    fprintf(dbgf, "section_name %s (pass %d): returning %d\n",
+		    name, pass, seg);
+	} else
+	    nasm_free (sname);	       /* known section: copy not needed */
+    }
+    return seg;
 }
 
 static void dbg_deflabel (char *name, long segment, long offset,
-			   int is_global) {
-    fprintf(dbgf,"deflabel %s := %08lx:%08lx %s (%d)\n",name,segment,offset,
-	    is_global ? "global" : "local", is_global);
+			  int is_global, char *special) {
+    /* `special' may be NULL; print "" then, since NULL is invalid for %s */
+    fprintf(dbgf,"deflabel %s := %08lx:%08lx %s (%d)%s%s\n",
+	    name, segment, offset,
+	    is_global == 2 ? "common" : is_global ? "global" : "local",
+	    is_global, special ? ": " : "", special ? special : "");
 }
 
 static void dbg_out (long segto, void *data, unsigned long type,
-		      long segment, long wrt) {
-  long realbytes = type & OUT_SIZMASK;
-  long ldata;
-  int id;
-
-  type &= OUT_TYPMASK;
-
-  fprintf(dbgf,"out to %lx, len = %ld: ",segto,realbytes);
-
-  switch(type) {
-  case OUT_RESERVE:
-    fprintf(dbgf,"reserved.\n"); break;
-  case OUT_RAWDATA:
-    fprintf(dbgf,"raw data = ");
-    while (realbytes--) {
-      id = *(unsigned char *)data;
-      data = (char *)data + 1;
-      fprintf(dbgf,"%02x ",id);
+		     long segment, long wrt) {
+    long realbytes = type & OUT_SIZMASK;
+    long ldata;
+    int id;
+
+    type &= OUT_TYPMASK;
+
+    fprintf(dbgf,"out to %lx, len = %ld: ",segto,realbytes);
+
+    switch(type) {
+      case OUT_RESERVE:
+	fprintf(dbgf,"reserved.\n"); break;
+      case OUT_RAWDATA:
+	fprintf(dbgf,"raw data = ");
+	while (realbytes--) {
+	    id = *(unsigned char *)data;
+	    data = (char *)data + 1;
+	    fprintf(dbgf,"%02x ",id);
+	}
+	fprintf(dbgf,"\n"); break;
+      case OUT_ADDRESS:
+	ldata = 0; /* placate gcc */
+	if (realbytes == 1)
+	    ldata = *((char *)data);
+	else if (realbytes == 2)
+	    ldata = *((short *)data);
+	else if (realbytes == 4)
+	    ldata = *((long *)data);
+	fprintf(dbgf,"addr %08lx (seg %08lx, wrt %08lx)\n",ldata,
+		segment,wrt);break;
+      case OUT_REL2ADR:
+	fprintf(dbgf,"rel2adr %04x (seg %08lx)\n",(int)*(short *)data,segment);
+	break;
+      case OUT_REL4ADR:
+	fprintf(dbgf,"rel4adr %08lx (seg %08lx)\n",*(long *)data,segment);
+	break;
+      default:
+	fprintf(dbgf,"unknown\n");
+	break;
     }
-    fprintf(dbgf,"\n"); break;
-  case OUT_ADDRESS:
-    ldata = 0; /* placate gcc */
-    if (realbytes == 1)
-      ldata = *((char *)data);
-    else if (realbytes == 2)
-      ldata = *((short *)data);
-    else if (realbytes == 4)
-      ldata = *((long *)data);
-    fprintf(dbgf,"addr %08lx (seg %08lx, wrt %08lx)\n",ldata,
-	    segment,wrt);break;
-  case OUT_REL2ADR:
-    fprintf(dbgf,"rel2adr %04x (seg %08lx)\n",(int)*(short *)data,segment);
-    break;
-  case OUT_REL4ADR:
-    fprintf(dbgf,"rel4adr %08lx (seg %08lx)\n",*(long *)data,segment);
-    break;
-  default:
-    fprintf(dbgf,"unknown\n");
-    break;
-  }
 }
 
 static long dbg_segbase(long segment) {
-  return segment;
+    return segment;
 }
 
 static int dbg_directive (char *directive, char *value, int pass) {
-  return 0;
+    fprintf(dbgf, "directive [%s] value [%s] (pass %d)\n",
+	    directive, value, pass);
+    return 1;
 }
 
 static void dbg_filename (char *inname, char *outname, efunc error) {
@@ -125,6 +152,7 @@ static void dbg_filename (char *inname, char *outname, efunc error) {
 struct ofmt of_dbg = {
     "Trace of all info passed to output stage",
     "dbg",
+    NULL,
     dbg_init,
     dbg_out,
     dbg_deflabel,
diff --git a/outelf.c b/outelf.c
index 3c7f2763..6f6c1be6 100644
--- a/outelf.c
+++ b/outelf.c
@@ -18,18 +18,33 @@
 
 #ifdef OF_ELF
 
+/*
+ * Relocation types.
+ */
+#define R_386_32 1		       /* ordinary absolute relocation */
+#define R_386_PC32 2		       /* PC-relative relocation */
+#define R_386_GOT32 3		       /* an offset into GOT */
+#define R_386_PLT32 4		       /* a PC-relative offset into PLT */
+#define R_386_GOTOFF 9		       /* an offset from GOT base */
+#define R_386_GOTPC 10		       /* a PC-relative offset _to_ GOT */
+
 struct Reloc {
     struct Reloc *next;
     long address;		       /* relative to _start_ of section */
     long symbol;		       /* ELF symbol info thingy */
-    int relative;		       /* TRUE or FALSE */
+    int type;			       /* type of relocation */
 };
 
 struct Symbol {
     long strpos;		       /* string table position of name */
     long section;		       /* section ID of the symbol */
-    int type;			       /* TRUE or FALSE */
-    long value;			       /* address, or COMMON variable size */
+    int type;			       /* symbol type */
+    long value;			       /* address, or COMMON variable align */
+    long size;			       /* size of symbol */
+    long globnum;		       /* symbol table offset if global */
+    struct Symbol *next;	       /* list of globals in each section */
+    struct Symbol *nextfwd;	       /* list of unresolved-size symbols */
+    char *name;			       /* used temporarily if in above list */
 };
 
 #define SHT_PROGBITS 1
@@ -50,6 +65,7 @@ struct Section {
     struct SAA *rel;
     long rellen;
     struct Reloc *head, **tail;
+    struct Symbol *gsyms;	       /* global symbols in section */
 };
 
 #define SECT_DELTA 32
@@ -72,15 +88,22 @@ static unsigned long strslen;
 
 static FILE *elffp;
 static efunc error;
+static evalfunc evaluate;
+
+static struct Symbol *fwds;
 
 static char elf_module[FILENAME_MAX];
 
+extern struct ofmt of_elf;
+
 #define SHN_ABS 0xFFF1
 #define SHN_COMMON 0xFFF2
 #define SHN_UNDEF 0
 
 #define SYM_SECTION 0x04
 #define SYM_GLOBAL 0x10
+#define SYM_DATA 0x01
+#define SYM_FUNCTION 0x02
 
 #define GLOBAL_TEMP_BASE 6	       /* bigger than any constant sym id */
 
@@ -107,9 +130,19 @@ static struct SAA *elf_build_symtab (long *, long *);
 static struct SAA *elf_build_reltab (long *, struct Reloc *);
 static void add_sectname (char *, char *);
 
-static void elf_init(FILE *fp, efunc errfunc, ldfunc ldef) {
+/*
+ * Special section numbers which are used to define ELF special
+ * symbols, which can be used with WRT to provide PIC relocation
+ * types.
+ */
+static long elf_gotpc_sect, elf_gotoff_sect;
+static long elf_got_sect, elf_plt_sect;
+static long elf_sym_sect;
+
+static void elf_init(FILE *fp, efunc errfunc, ldfunc ldef, evalfunc eval) {
     elffp = fp;
     error = errfunc;
+    evaluate = eval;
     (void) ldef;		       /* placate optimisers */
     sects = NULL;
     nsects = sectlen = 0;
@@ -123,6 +156,20 @@ static void elf_init(FILE *fp, efunc errfunc, ldfunc ldef) {
     shstrtab = NULL;
     shstrtablen = shstrtabsize = 0;;
     add_sectname ("", "");
+
+    fwds = NULL;
+
+    elf_gotpc_sect = seg_alloc();
+    ldef("..gotpc", elf_gotpc_sect+1, 0L, NULL, FALSE, FALSE, &of_elf, error);
+    elf_gotoff_sect = seg_alloc();
+    ldef("..gotoff", elf_gotoff_sect+1, 0L, NULL, FALSE, FALSE,&of_elf,error);
+    elf_got_sect = seg_alloc();
+    ldef("..got", elf_got_sect+1, 0L, NULL, FALSE, FALSE, &of_elf, error);
+    elf_plt_sect = seg_alloc();
+    ldef("..plt", elf_plt_sect+1, 0L, NULL, FALSE, FALSE, &of_elf, error);
+    elf_sym_sect = seg_alloc();
+    ldef("..sym", elf_sym_sect+1, 0L, NULL, FALSE, FALSE, &of_elf, error);
+
     def_seg = seg_alloc();
 }
 
@@ -179,6 +226,7 @@ static int elf_make_section (char *name, int type, int flags, int align) {
     s->type = type;
     s->flags = flags;
     s->align = align;
+    s->gsyms = NULL;
 
     if (nsects >= sectlen)
 	sects = nasm_realloc (sects, (sectlen += SECT_DELTA)*sizeof(*sects));
@@ -286,15 +334,60 @@ static long elf_section_names (char *name, int pass, int *bits) {
 }
 
 static void elf_deflabel (char *name, long segment, long offset,
-			   int is_global) {
+			   int is_global, char *special) {
     int pos = strslen;
     struct Symbol *sym;
+    int special_used = FALSE;
 
     if (name[0] == '.' && name[1] == '.' && name[2] != '@') {
-	error (ERR_NONFATAL, "unrecognised special symbol `%s'", name);
+	/*
+	 * This is a NASM special symbol. We never allow it into
+	 * the ELF symbol table, even if it's a valid one. If it
+	 * _isn't_ a valid one, we should barf immediately.
+	 */
+	if (strcmp(name, "..gotpc") && strcmp(name, "..gotoff") &&
+	    strcmp(name, "..got") && strcmp(name, "..plt") &&
+	    strcmp(name, "..sym"))
+	    error (ERR_NONFATAL, "unrecognised special symbol `%s'", name);
 	return;
     }
 
+    if (is_global == 3) {
+	struct Symbol **s;
+	/*
+	 * Fix up a forward-reference symbol size from the first
+	 * pass.
+	 */
+	for (s = &fwds; *s; s = &(*s)->nextfwd)
+	    if (!strcmp((*s)->name, name)) {
+		struct tokenval tokval;
+		expr *e;
+		char *p = special;
+
+		while (*p && !isspace(*p)) p++;
+		while (*p && isspace(*p)) p++;
+		stdscan_reset();
+		stdscan_bufptr = p;
+		tokval.t_type = TOKEN_INVALID;
+		e = evaluate(stdscan, NULL, &tokval, NULL, 1, error, NULL);
+		if (e) {
+		    if (!is_simple(e))
+			error (ERR_NONFATAL, "cannot use relocatable"
+			       " expression as symbol size");
+		    else
+			(*s)->size = reloc_value(e);
+		}
+
+		/*
+		 * Remove it from the list of unresolved sizes.
+		 */
+		nasm_free ((*s)->name);
+		*s = (*s)->nextfwd;
+		return;
+	    }
+	return;			       /* it wasn't an important one */
+    }
+
     saa_wbytes (strs, name, (long)(1+strlen(name)));
     strslen += 1+strlen(name);
 
@@ -302,6 +395,7 @@ static void elf_deflabel (char *name, long segment, long offset,
 
     sym->strpos = pos;
     sym->type = is_global ? SYM_GLOBAL : 0;
+    sym->size = 0;
     if (segment == NO_SEG)
 	sym->section = SHN_ABS;
     else {
@@ -322,21 +416,96 @@ static void elf_deflabel (char *name, long segment, long offset,
     }
 
     if (is_global == 2) {
-	sym->value = offset;
+	sym->size = offset;
+	sym->value = 0;
 	sym->section = SHN_COMMON;
+	/*
+	 * We have a common variable. Check the special text to see
+	 * if it's a valid number and power of two; if so, store it
+	 * as the alignment for the common variable.
+	 */
+	if (special) {
+	    int err;
+	    sym->value = readnum (special, &err);
+	    if (err)
+		error(ERR_NONFATAL, "alignment constraint `%s' is not a"
+		      " valid number", special);
+	    else if ( (sym->value | (sym->value-1)) != 2*sym->value - 1)
+		error(ERR_NONFATAL, "alignment constraint `%s' is not a"
+		      " power of two", special);
+	}
+	special_used = TRUE;
     } else
 	sym->value = (sym->section == SHN_UNDEF ? 0 : offset);
 
     if (sym->type == SYM_GLOBAL) {
 	if (sym->section == SHN_UNDEF || sym->section == SHN_COMMON)
 	    bsym = raa_write (bsym, segment, nglobs);
+	else {
+	    /*
+	     * This is a global symbol; so we must add it to the linked
+	     * list of global symbols in its section. We'll push it on
+	     * the beginning of the list, because it doesn't matter
+	     * much which end we put it on and it's easier like this.
+	     *
+	     * In addition, we check the special text for symbol
+	     * type and size information.
+	     */
+	    sym->next = sects[sym->section-1]->gsyms;
+	    sects[sym->section-1]->gsyms = sym;
+
+	    if (special) {
+		int n = strcspn(special, " ");
+
+		if (!nasm_strnicmp(special, "function", n))
+		    sym->type |= SYM_FUNCTION;
+		else if (!nasm_strnicmp(special, "data", n) ||
+			 !nasm_strnicmp(special, "object", n))
+		    sym->type |= SYM_DATA;
+		else
+		    error(ERR_NONFATAL, "unrecognised symbol type `%.*s'",
+			  n, special);
+		if (special[n]) {
+		    struct tokenval tokval;
+		    expr *e;
+		    int fwd = FALSE;
+
+		    while (special[n] && isspace(special[n]))
+			n++;
+		    /*
+		     * We have a size expression; attempt to
+		     * evaluate it.
+		     */
+		    stdscan_reset();
+		    stdscan_bufptr = special+n;
+		    tokval.t_type = TOKEN_INVALID;
+		    e = evaluate(stdscan, NULL, &tokval, &fwd, 0, error, NULL);
+		    if (fwd) {
+			sym->nextfwd = fwds;
+			fwds = sym;
+			sym->name = nasm_strdup(name);
+		    } else if (e) {
+			if (!is_simple(e))
+			    error (ERR_NONFATAL, "cannot use relocatable"
+				   " expression as symbol size");
+			else
+			    sym->size = reloc_value(e);
+		    }
+		}
+		special_used = TRUE;
+	    }
+	}
+	sym->globnum = nglobs;
 	nglobs++;
     } else
 	nlocals++;
+
+    if (special && !special_used)
+	error(ERR_NONFATAL, "no special symbol features supported here");
 }
 
 static void elf_add_reloc (struct Section *sect, long segment,
-			    int relative) {
+			   int type) {
     struct Reloc *r;
 
     r = *sect->tail = nasm_malloc(sizeof(struct Reloc));
@@ -355,23 +524,106 @@ static void elf_add_reloc (struct Section *sect, long segment,
 	if (!r->symbol)
 	    r->symbol = GLOBAL_TEMP_BASE + raa_read(bsym, segment);
     }
-    r->relative = relative;
+    r->type = type;
+
+    sect->nrelocs++;
+}
+
+/*
+ * This routine deals with ..got and ..sym relocations: the more
+ * complicated kinds. In shared-library writing, some relocations
+ * with respect to global symbols must refer to the precise symbol
+ * rather than referring to an offset from the base of the section
+ * _containing_ the symbol. Such relocations call to this routine,
+ * which searches the symbol list for the symbol in question.
+ *
+ * R_386_GOT32 references require the _exact_ symbol address to be
+ * used; R_386_32 references can be at an offset from the symbol.
+ * The boolean argument `exact' tells us this.
+ *
+ * Return value is `offset' re-expressed relative to the symbol
+ * chosen (always zero for an exact call). If one of our sections
+ * has no suitable global symbol, we report an error and return 0.
+ *
+ * Limitation: if you define two symbols at the same place,
+ * confusion will occur.
+ *
+ * Inefficiency: we search, currently, using a linked list which
+ * isn't even necessarily sorted.
+ */
+static long elf_add_gsym_reloc (struct Section *sect,
+				long segment, long offset,
+				int type, int exact) {
+    struct Reloc *r;
+    struct Section *s;
+    struct Symbol *sym, *sm;
+    int i;
+
+    /*
+     * First look up the segment/offset pair and find a global
+     * symbol corresponding to it. If it's not one of our segments,
+     * then it must be an external symbol, in which case we're fine
+     * doing a normal elf_add_reloc after first sanity-checking
+     * that the offset from the symbol is zero.
+     */
+    s = NULL;
+    for (i=0; i<nsects; i++)
+	if (segment == sects[i]->index) {
+	    s = sects[i];
+	    break;
+	}
+    if (!s) {
+	if (exact && offset != 0)
+	    error (ERR_NONFATAL, "unable to find a suitable global symbol"
+		   " for this reference");
+	else
+	    elf_add_reloc (sect, segment, type);
+	return offset;
+    }
+
+    if (exact) {
+	/*
+	 * Find a symbol pointing _exactly_ at this one.
+	 */
+	for (sym = s->gsyms; sym; sym = sym->next)
+	    if (sym->value == offset)
+		break;
+    } else {
+	/*
+	 * Find the nearest symbol below this one.
+	 */
+	sym = NULL;
+	for (sm = s->gsyms; sm; sm = sm->next)
+	    if (sm->value <= offset && (!sym || sm->value > sym->value))
+		sym = sm;
+    }
+    if (!sym) {			       /* either search can come up empty */
+	error (ERR_NONFATAL, "unable to find a suitable global symbol"
+	       " for this reference");
+	return 0;
+    }
+
+    r = *sect->tail = nasm_malloc(sizeof(struct Reloc));
+    sect->tail = &r->next;
+    r->next = NULL;
+
+    r->address = sect->len;
+    r->symbol = GLOBAL_TEMP_BASE + sym->globnum;
+    r->type = type;
 
     sect->nrelocs++;
+
+    return offset - sym->value;
 }
 
 static void elf_out (long segto, void *data, unsigned long type,
 		      long segment, long wrt) {
     struct Section *s;
     long realbytes = type & OUT_SIZMASK;
+    long addr;
     unsigned char mydata[4], *p;
     int i;
 
-    if (wrt != NO_SEG) {
-	wrt = NO_SEG;		       /* continue to do _something_ */
-	error (ERR_NONFATAL, "WRT not supported by ELF output format");
-    }
-
     type &= OUT_TYPMASK;
 
     /*
@@ -421,20 +673,45 @@ static void elf_out (long segto, void *data, unsigned long type,
 	    error(ERR_PANIC, "OUT_RAWDATA with other than NO_SEG");
 	elf_sect_write (s, data, realbytes);
     } else if (type == OUT_ADDRESS) {
-	if (wrt != NO_SEG)
-	    error(ERR_NONFATAL, "ELF format does not support WRT types");
+	addr = *(long *)data;
 	if (segment != NO_SEG) {
 	    if (segment % 2) {
 		error(ERR_NONFATAL, "ELF format does not support"
 		      " segment base references");
-	    } else
-		elf_add_reloc (s, segment, FALSE);
+	    } else {
+		if (wrt == NO_SEG) {
+		    elf_add_reloc (s, segment, R_386_32);
+		} else if (wrt == elf_gotpc_sect+1) {
+		    /*
+		     * The user will supply GOT relative to $$. ELF
+		     * will let us have GOT relative to $. So we
+		     * need to fix up the data item by $-$$.
+		     */
+		    addr += s->len;
+		    elf_add_reloc (s, segment, R_386_GOTPC);
+		} else if (wrt == elf_gotoff_sect+1) {
+		    elf_add_reloc (s, segment, R_386_GOTOFF);
+		} else if (wrt == elf_got_sect+1) {
+		    addr = elf_add_gsym_reloc (s, segment, addr,
+					       R_386_GOT32, TRUE);
+		} else if (wrt == elf_sym_sect+1) {
+		    addr = elf_add_gsym_reloc (s, segment, addr,
+					       R_386_32, FALSE);
+		} else if (wrt == elf_plt_sect+1) {
+		    error(ERR_NONFATAL, "ELF format cannot produce non-PC-"
+			  "relative PLT references");
+		} else {
+		    error (ERR_NONFATAL, "ELF format does not support this"
+			   " use of WRT");
+		    wrt = NO_SEG;      /* we can at least _try_ to continue */
+		}
+	    }
 	}
 	p = mydata;
-	if (realbytes == 2 && segment != NO_SEG)
-	    error (ERR_NONFATAL, "ELF format does not support 16-bit"
+	if (realbytes != 4 && segment != NO_SEG)
+	    error (ERR_NONFATAL, "ELF format does not support non-32-bit"
 		   " relocations");
-	WRITELONG (p, *(long *)data);
+	WRITELONG (p, addr);
 	elf_sect_write (s, mydata, realbytes);
     } else if (type == OUT_REL2ADR) {
 	error (ERR_NONFATAL, "ELF format does not support 16-bit"
@@ -445,8 +722,22 @@ static void elf_out (long segto, void *data, unsigned long type,
 	if (segment != NO_SEG && segment % 2) {
 	    error(ERR_NONFATAL, "ELF format does not support"
 		  " segment base references");
-	} else
-	    elf_add_reloc (s, segment, TRUE);
+	} else {
+	    if (wrt == NO_SEG) {
+		elf_add_reloc (s, segment, R_386_PC32);
+	    } else if (wrt == elf_plt_sect+1) {
+		elf_add_reloc (s, segment, R_386_PLT32);
+	    } else if (wrt == elf_gotpc_sect+1 ||
+		       wrt == elf_gotoff_sect+1 ||
+		       wrt == elf_got_sect+1) {
+		error(ERR_NONFATAL, "ELF format cannot produce PC-"
+		      "relative GOT references");
+	    } else {
+		error (ERR_NONFATAL, "ELF format does not support this"
+		       " use of WRT");
+		wrt = NO_SEG;      /* we can at least _try_ to continue */
+	    }
+	}
 	p = mydata;
 	WRITELONG (p, *(long*)data - realbytes);
 	elf_sect_write (s, mydata, 4L);
@@ -614,16 +905,13 @@ static struct SAA *elf_build_symtab (long *len, long *local) {
      */
     saa_rewind (syms);
     while ( (sym = saa_rstruct (syms)) ) {
-	if (sym->type == SYM_GLOBAL)
+	if (sym->type & SYM_GLOBAL)
 	    continue;
 	p = entry;
 	WRITELONG (p, sym->strpos);
 	WRITELONG (p, sym->value);
-	if (sym->section == SHN_COMMON)
-	    WRITELONG (p, sym->value);
-	else
-	    WRITELONG (p, 0);
-	WRITESHORT (p, 0);	       /* local non-typed thing */
+	WRITELONG (p, sym->size);
+	WRITESHORT (p, sym->type);     /* local non-typed thing */
 	WRITESHORT (p, sym->section);
 	saa_wbytes (s, entry, 16L);
         *len += 16;
@@ -635,16 +923,13 @@ static struct SAA *elf_build_symtab (long *len, long *local) {
      */
     saa_rewind (syms);
     while ( (sym = saa_rstruct (syms)) ) {
-	if (sym->type != SYM_GLOBAL)
+	if (!(sym->type & SYM_GLOBAL))
 	    continue;
 	p = entry;
 	WRITELONG (p, sym->strpos);
 	WRITELONG (p, sym->value);
-	if (sym->section == SHN_COMMON)
-	    WRITELONG (p, sym->value);
-	else
-	    WRITELONG (p, 0);
-	WRITESHORT (p, SYM_GLOBAL);    /* global non-typed thing */
+	WRITELONG (p, sym->size);
+	WRITESHORT (p, sym->type);     /* SYM_GLOBAL plus any type bits */
 	WRITESHORT (p, sym->section);
 	saa_wbytes (s, entry, 16L);
 	*len += 16;
@@ -671,7 +956,7 @@ static struct SAA *elf_build_reltab (long *len, struct Reloc *r) {
 
 	p = entry;
 	WRITELONG (p, r->address);
-	WRITELONG (p, (sym << 8) + (r->relative ? 2 : 1));
+	WRITELONG (p, (sym << 8) + r->type);
 	saa_wbytes (s, entry, 8L);
 	*len += 8;
 
@@ -737,9 +1022,15 @@ static void elf_filename (char *inname, char *outname, efunc error) {
     standard_extension (inname, outname, ".o", error);
 }
 
+static char *elf_stdmac[] = {
+    "%define __SECT__ [section .text]",
+    NULL
+};
+
 struct ofmt of_elf = {
     "ELF32 (i386) object files (e.g. Linux)",
     "elf",
+    elf_stdmac,
     elf_init,
     elf_out,
     elf_deflabel,
diff --git a/outform.h b/outform.h
index a77e534e..e23f3c79 100644
--- a/outform.h
+++ b/outform.h
@@ -17,8 +17,8 @@
  * OF_name                -- ensure that output format 'name' is included
  * OF_NO_name             -- remove output format 'name'
  * OF_DOS                 -- ensure that 'obj', 'bin' & 'win32' are included.
- * OF_UNIX                -- ensure that 'aout', 'coff' and 'elf' are in.
- * OF_OTHERS              -- ensure that 'bin', 'as86', 'os2' & 'rdf' are in.
+ * OF_UNIX                -- ensure that 'aout', 'aoutb', 'coff', 'elf' are in.
+ * OF_OTHERS              -- ensure that 'bin', 'as86' & 'rdf' are in.
  * OF_ALL                 -- ensure that all formats are included.
  *
  * OF_DEFAULT=of_name     -- ensure that 'name' is the default format.
@@ -60,7 +60,7 @@ void ofmt_register (struct ofmt *);
 
 /* ====configurable info begins here==== */
 /* formats configurable:
- * bin,obj,elf,aout,coff,win32,as86,rdf */
+ * bin,obj,elf,aout,aoutb,coff,win32,as86,rdf */
 
 /* process options... */
 
@@ -77,9 +77,6 @@ void ofmt_register (struct ofmt *);
 #ifndef OF_OBJ
 #define OF_OBJ
 #endif
-#ifndef OF_OS2
-#define OF_OS2
-#endif
 #ifndef OF_ELF
 #define OF_ELF
 #endif
@@ -89,6 +86,9 @@ void ofmt_register (struct ofmt *);
 #ifndef OF_AOUT
 #define OF_AOUT
 #endif
+#ifndef OF_AOUTB
+#define OF_AOUTB
+#endif
 #ifndef OF_WIN32
 #define OF_WIN32
 #endif
@@ -117,6 +117,9 @@ void ofmt_register (struct ofmt *);
 #ifndef OF_AOUT
 #define OF_AOUT
 #endif
+#ifndef OF_AOUTB
+#define OF_AOUTB
+#endif
 #ifndef OF_COFF
 #define OF_COFF
 #endif
@@ -135,9 +138,6 @@ void ofmt_register (struct ofmt *);
 #ifndef OF_RDF
 #define OF_RDF
 #endif
-#ifndef OF_OS2
-#define OF_OS2
-#endif
 #endif
 
 /* finally... override any format specifically specifed to be off */
@@ -153,6 +153,9 @@ void ofmt_register (struct ofmt *);
 #ifdef OF_NO_AOUT
 #undef OF_AOUT
 #endif
+#ifdef OF_NO_AOUTB
+#undef OF_AOUTB
+#endif
 #ifdef OF_NO_COFF
 #undef OF_COFF
 #endif
@@ -165,9 +168,6 @@ void ofmt_register (struct ofmt *);
 #ifdef OF_NO_RDF
 #undef OF_RDF
 #endif
-#ifdef OF_NO_OS2
-#undef OF_OS2
-#endif
 
 #ifndef OF_DEFAULT
 #define OF_DEFAULT of_bin
diff --git a/outobj.c b/outobj.c
index 544ec663..0a64c6ae 100644
--- a/outobj.c
+++ b/outobj.c
@@ -22,6 +22,7 @@ static char obj_infile[FILENAME_MAX];
 static int obj_uppercase;
 
 static efunc error;
+static evalfunc evaluate;
 static ldfunc deflabel;
 static FILE *ofp;
 static long first_seg;
@@ -35,6 +36,9 @@ static int any_segs;
 
 static unsigned char record[RECORD_MAX], *recptr;
 
+struct Segment;			       /* need to know these structs exist */
+struct Group;
+
 static struct Public {
     struct Public *next;
     char *name;
@@ -46,13 +50,27 @@ static struct External {
     struct External *next;
     char *name;
     long commonsize;
-} *exthead, **exttail;
+    long commonelem;		       /* element size if FAR, else zero */
+    int index;			       /* OBJ-file external index */
+    enum {
+	DEFWRT_NONE,		       /* no unusual default-WRT */
+	DEFWRT_STRING,		       /* a string we don't yet understand */
+	DEFWRT_SEGMENT,		       /* a segment */
+	DEFWRT_GROUP		       /* a group */
+    } defwrt_type;
+    union {
+	char *string;
+	struct Segment *seg;
+	struct Group *grp;
+    } defwrt_ptr;
+    struct External *next_dws;	       /* next with DEFWRT_STRING */
+} *exthead, **exttail, *dws;
 
 static int externals;
 
 static struct ExtBack {
     struct ExtBack *next;
-    int index[EXT_BLKSIZ];
+    struct External *exts[EXT_BLKSIZ];
 } *ebhead, **ebtail;
 
 static struct Segment {
@@ -85,7 +103,7 @@ static struct Group {
 	long index;
 	char *name;
     } segs[GROUP_MAX];		       /* ...in this */
-} *grphead, **grptail, *obj_grp_needs_update, *defgrp;
+} *grphead, **grptail, *obj_grp_needs_update;
 
 static struct ObjData {
     struct ObjData *next;
@@ -97,9 +115,28 @@ static struct ObjData {
     unsigned char fixupp[RECORD_MAX], *fptr;
 } *datahead, *datacurr, **datatail;
 
-static long obj_entry_seg, obj_entry_ofs;
+static struct ImpDef {
+    struct ImpDef *next;
+    char *extname;
+    char *libname;
+    unsigned int impindex;
+    char *impname;
+} *imphead, **imptail;
+
+static struct ExpDef {
+    struct ExpDef *next;
+    char *intname;
+    char *extname;
+    unsigned int ordinal;
+    int flags;
+} *exphead, **exptail;
+
+#define EXPDEF_FLAG_ORDINAL  0x80
+#define EXPDEF_FLAG_RESIDENT 0x40
+#define EXPDEF_FLAG_NODATA   0x20
+#define EXPDEF_MASK_PARMCNT  0x1F
 
-static int os2;
+static long obj_entry_seg, obj_entry_ofs;
 
 enum RecordID {			       /* record ID codes */
 
@@ -140,9 +177,10 @@ static unsigned char *obj_write_value(unsigned char *, unsigned long);
 static void obj_record(int, unsigned char *, unsigned char *);
 static int obj_directive (char *, char *, int);
 
-static void obj_init (FILE *fp, efunc errfunc, ldfunc ldef) {
+static void obj_init (FILE *fp, efunc errfunc, ldfunc ldef, evalfunc eval) {
     ofp = fp;
     error = errfunc;
+    evaluate = eval;
     deflabel = ldef;
     first_seg = seg_alloc();
     any_segs = FALSE;
@@ -150,6 +188,11 @@ static void obj_init (FILE *fp, efunc errfunc, ldfunc ldef) {
     fpubtail = &fpubhead;
     exthead = NULL;
     exttail = &exthead;
+    imphead = NULL;
+    imptail = &imphead;
+    exphead = NULL;
+    exptail = &exphead;
+    dws = NULL;
     externals = 0;
     ebhead = NULL;
     ebtail = &ebhead;
@@ -161,22 +204,6 @@ static void obj_init (FILE *fp, efunc errfunc, ldfunc ldef) {
     datatail = &datahead;
     obj_entry_seg = NO_SEG;
     obj_uppercase = FALSE;
-
-    if (os2) {
-	obj_directive ("group", "FLAT", 1);
-	defgrp = grphead;
-    } else
-	defgrp = NULL;
-}
-
-static void dos_init (FILE *fp, efunc errfunc, ldfunc ldef) {
-    os2 = FALSE;
-    obj_init (fp, errfunc, ldef);
-}
-
-static void os2_init (FILE *fp, efunc errfunc, ldfunc ldef) {
-    os2 = TRUE;
-    obj_init (fp, errfunc, ldef);
 }
 
 static void obj_cleanup (void) {
@@ -188,6 +215,7 @@ static void obj_cleanup (void) {
 	while (segtmp->pubhead) {
 	    struct Public *pubtmp = segtmp->pubhead;
 	    segtmp->pubhead = pubtmp->next;
+	    nasm_free (pubtmp->name);
 	    nasm_free (pubtmp);
 	}
 	nasm_free (segtmp);
@@ -195,6 +223,7 @@ static void obj_cleanup (void) {
     while (fpubhead) {
 	struct Public *pubtmp = fpubhead;
 	fpubhead = fpubhead->next;
+	nasm_free (pubtmp->name);
 	nasm_free (pubtmp);
     }
     while (exthead) {
@@ -202,6 +231,21 @@ static void obj_cleanup (void) {
 	exthead = exthead->next;
 	nasm_free (exttmp);
     }
+    while (imphead) {
+	struct ImpDef *imptmp = imphead;
+	imphead = imphead->next;
+	nasm_free (imptmp->extname);
+	nasm_free (imptmp->libname);
+	nasm_free (imptmp->impname);   /* nasm_free won't mind if it's NULL */
+	nasm_free (imptmp);
+    }
+    while (exphead) {
+	struct ExpDef *exptmp = exphead;
+	exphead = exphead->next;
+	nasm_free (exptmp->extname);
+	nasm_free (exptmp->intname);
+	nasm_free (exptmp);
+    }
     while (ebhead) {
 	struct ExtBack *ebtmp = ebhead;
 	ebhead = ebhead->next;
@@ -219,8 +263,34 @@ static void obj_cleanup (void) {
     }
 }
 
+static void obj_ext_set_defwrt (struct External *ext, char *id) {
+    struct Segment *seg;
+    struct Group *grp;
+
+    for (seg = seghead; seg; seg = seg->next)  /* segments are tried first */
+	if (!strcmp(seg->name, id)) {
+	    ext->defwrt_type = DEFWRT_SEGMENT;
+	    ext->defwrt_ptr.seg = seg;
+	    nasm_free (id);	       /* resolved: the name string is freed */
+	    return;
+	}
+
+    for (grp = grphead; grp; grp = grp->next)  /* ...then groups */
+	if (!strcmp(grp->name, id)) {
+	    ext->defwrt_type = DEFWRT_GROUP;
+	    ext->defwrt_ptr.grp = grp;
+	    nasm_free (id);	       /* resolved: the name string is freed */
+	    return;
+	}
+
+    ext->defwrt_type = DEFWRT_STRING;  /* no match yet: keep `id' and */
+    ext->defwrt_ptr.string = id;       /* push ext onto the dws list */
+    ext->next_dws = dws;
+    dws = ext;
+}
+
 static void obj_deflabel (char *name, long segment,
-			  long offset, int is_global) {
+			  long offset, int is_global, char *special) {
     /*
      * We have three cases:
      *
@@ -241,6 +311,13 @@ static void obj_deflabel (char *name, long segment,
     struct ExtBack *eb;
     struct Segment *seg;
     int i;
+    int used_special = FALSE;	       /* have we used the special text? */
+
+    /*
+     * If it's a special-retry from pass two, discard it.
+     */
+    if (is_global == 3)
+	return;
 
     /*
      * First check for the double-period, signifying something
@@ -278,10 +355,13 @@ static void obj_deflabel (char *name, long segment,
 	    pub = *fpubtail = nasm_malloc(sizeof(*pub));
 	    fpubtail = &pub->next;
 	    pub->next = NULL;
-	    pub->name = name;
+	    pub->name = nasm_strdup(name);
 	    pub->offset = offset;
 	    pub->segment = (segment == NO_SEG ? 0 : segment & ~SEG_ABS);
 	}
+	if (special)
+	    error(ERR_NONFATAL, "OBJ supports no special symbol features"
+		  " for this symbol type");
 	return;
     }
 
@@ -306,9 +386,12 @@ static void obj_deflabel (char *name, long segment,
 		pub = *seg->pubtail = nasm_malloc(sizeof(*pub));
 		seg->pubtail = &pub->next;
 		pub->next = NULL;
-		pub->name = name;
+		pub->name = nasm_strdup(name);
 		pub->offset = offset;
 	    }
+	    if (special)
+		error(ERR_NONFATAL, "OBJ supports no special symbol features"
+		      " for this symbol type");
 	    return;
 	}
 
@@ -319,11 +402,97 @@ static void obj_deflabel (char *name, long segment,
     ext->next = NULL;
     exttail = &ext->next;
     ext->name = name;
-    if (is_global == 2)
+    ext->defwrt_type = DEFWRT_NONE;
+    if (is_global == 2) {
 	ext->commonsize = offset;
-    else
+	ext->commonelem = 1;	       /* default FAR */
+    } else
 	ext->commonsize = 0;
 
+    /*
+     * Now process the special text, if any, to find default-WRT
+     * specifications and common-variable element-size and near/far
+     * specifications.
+     */
+    while (special && *special) {
+	used_special = TRUE;
+
+	/*
+	 * We might have a default-WRT specification.
+	 */
+	if (!nasm_strnicmp(special, "wrt", 3)) {
+	    char *p;
+	    int len;
+	    special += 3;
+	    special += strspn(special, " \t");
+	    p = nasm_strndup(special, len = strcspn(special, ":"));
+	    obj_ext_set_defwrt (ext, p);
+	    special += len;
+	    if (*special && *special != ':')
+		error(ERR_NONFATAL, "`:' expected in special symbol"
+		      " text for `%s'", ext->name);
+	    else if (*special == ':')
+		special++;
+	}
+
+	/*
+	 * The NEAR or FAR keywords specify nearness or farness:
+	 * FAR sets the element size to 1, NEAR sets it to zero.
+	 */
+	if (!nasm_strnicmp(special, "far", 3)) {
+	    if (ext->commonsize)
+		ext->commonelem = 1;
+	    else
+		error(ERR_NONFATAL, "`%s': `far' keyword may only be applied"
+		      " to common variables\n", ext->name);
+	    special += 3;
+	    special += strspn(special, " \t");
+	} else if (!nasm_strnicmp(special, "near", 4)) {
+	    if (ext->commonsize)
+		ext->commonelem = 0;
+	    else
+		error(ERR_NONFATAL, "`%s': `near' keyword may only be applied"
+		      " to common variables\n", ext->name);
+	    special += 4;
+	    special += strspn(special, " \t");
+	}
+
+	/*
+	 * If it's a common, and anything else remains on the line
+	 * before a further colon, evaluate it as an expression and
+	 * use that as the element size. Forward references aren't
+	 * allowed.
+	 */
+	if (*special == ':')
+	    special++;
+	else if (*special) {
+	    if (ext->commonsize) {
+		expr *e;
+		struct tokenval tokval;
+
+		stdscan_reset();
+		stdscan_bufptr = special;
+		tokval.t_type = TOKEN_INVALID;
+		e = evaluate(stdscan, NULL, &tokval, NULL, 1, error, NULL);
+		if (e) {
+		    if (!is_simple(e))
+			error (ERR_NONFATAL, "cannot use relocatable"
+			       " expression as common-variable element size");
+		    else
+			ext->commonelem = reloc_value(e);
+		}
+		special = stdscan_bufptr;
+	    } else {
+		error (ERR_NONFATAL, "`%s': element-size specifications only"
+		       " apply to common variables", ext->name);
+		while (*special && *special != ':')
+		    special++;
+		if (*special == ':')
+		    special++;
+	    }
+	}
+    }
+
     i = segment/2;
     eb = ebhead;
     if (!eb) {
@@ -341,7 +510,12 @@ static void obj_deflabel (char *name, long segment,
 	}
 	i -= EXT_BLKSIZ;
     }
-    eb->index[i] = ++externals;
+    eb->exts[i] = ext;
+    ext->index = ++externals;
+
+    if (special && !used_special)
+	error(ERR_NONFATAL, "OBJ supports no special symbol features"
+	      " for this symbol type");
 }
 
 static void obj_out (long segto, void *data, unsigned long type,
@@ -400,6 +574,8 @@ static void obj_out (long segto, void *data, unsigned long type,
 	}
     } else if (realtype == OUT_ADDRESS || realtype == OUT_REL2ADR ||
 	       realtype == OUT_REL4ADR) {
+	int rsize;
+
 	if (segment == NO_SEG && realtype != OUT_ADDRESS)
 	    error(ERR_NONFATAL, "relative call to absolute address not"
 		  " supported by OBJ format");
@@ -407,10 +583,14 @@ static void obj_out (long segto, void *data, unsigned long type,
 	    error(ERR_NONFATAL, "far-absolute relocations not supported"
 		  " by OBJ format");
 	ldata = *(long *)data;
-	if (realtype == OUT_REL2ADR)
+	if (realtype == OUT_REL2ADR) {
 	    ldata += (size-2);
-	if (realtype == OUT_REL4ADR)
+	    size = 2;
+	}
+	if (realtype == OUT_REL4ADR) {
 	    ldata += (size-4);
+	    size = 4;
+	}
 	if (obj_ledata_space(seg) < 4 || !obj_fixup_free(seg))
 	    obj_ledata_new(seg);
 	if (size == 2)
@@ -418,8 +598,22 @@ static void obj_out (long segto, void *data, unsigned long type,
 	else
 	    datacurr->lptr = obj_write_dword (datacurr->lptr, ldata);
 	datacurr->nonempty = TRUE;
+	rsize = size;
+	if (segment != NO_SEG && segment < SEG_ABS && segment % 2 &&
+	    size == 4) {
+	    /*
+	     * A 4-byte segment-base relocation such as `MOV EAX,SEG
+	     * foo'; NO_SEG (plain constants) must not be taken for
+	     * one. OBJ can't do these, but if the constant's low 16
+	     * bits are zero, a 2-byte one on the low word will do.
+	     */
+	    rsize = 2;
+	    if (ldata & 0xFFFF)
+		error(ERR_NONFATAL, "OBJ format cannot handle complex"
+		      " dword-size segment base references");
+	}
 	if (segment != NO_SEG)
-	    obj_write_fixup (datacurr, size,
+	    obj_write_fixup (datacurr, rsize,
 			     (realtype == OUT_REL2ADR ||
 			      realtype == OUT_REL4ADR ? 0 : 0x4000),
 			     segment, wrt,
@@ -479,6 +673,13 @@ static void obj_write_fixup (struct ObjData *data, int bytes,
     long tidx, fidx;
     struct Segment *s = NULL;
     struct Group *g = NULL;
+    struct External *e = NULL;
+
+    if (bytes == 1) {
+	error(ERR_NONFATAL, "`obj' output driver does not support"
+	      " one-byte relocations");
+	return;
+    }
 
     locat = 0x8000 | segrel | offset;
     if (seg % 2) {
@@ -486,8 +687,8 @@ static void obj_write_fixup (struct ObjData *data, int bytes,
 	locat |= 0x800;
 	seg--;
 	if (bytes != 2)
-	    error(ERR_NONFATAL, "OBJ format can only handle 2-byte"
-		  " segment base references");
+	    error(ERR_PANIC, "OBJ: 4-byte segment base fixup got"
+		  " through sanity check");
     } else {
 	base = FALSE;
 	if (bytes == 2)
@@ -527,7 +728,7 @@ static void obj_write_fixup (struct ObjData *data, int bytes,
 		i -= EXT_BLKSIZ;
 	    }
 	    if (eb)
-		method = 6, tidx = eb->index[i];
+		method = 6, e = eb->exts[i], tidx = e->index;
 	    else
 		error(ERR_PANIC,
 		      "unrecognised segment value in obj_write_fixup");
@@ -536,17 +737,28 @@ static void obj_write_fixup (struct ObjData *data, int bytes,
 
     /*
      * If no WRT given, assume the natural default, which is method
-     * F5 unless we are doing an OFFSET fixup for a grouped
-     * segment, in which case we require F1 (group). Oh, and in
-     * OS/2 mode we're in F1 (group) on `defgrp' _always_, by
-     * default.
+     * F5 unless:
+     *
+     * - we are doing an OFFSET fixup for a grouped segment, in
+     *   which case we require F1 (group).
+     *
+     * - we are doing an OFFSET fixup for an external with a
+     *   default WRT, in which case we must honour the default WRT.
      */
     if (wrt == NO_SEG) {
-	if (os2)
-	    method |= 0x10, fidx = defgrp->obj_index;
-	else if (!base && s && s->grp)
+	if (!base && s && s->grp)
 	    method |= 0x10, fidx = s->grp->obj_index;
-	else
+	else if (!base && e && e->defwrt_type != DEFWRT_NONE) {
+	    if (e->defwrt_type == DEFWRT_SEGMENT)
+		method |= 0x00, fidx = e->defwrt_ptr.seg->obj_index;
+	    else if (e->defwrt_type == DEFWRT_GROUP)
+		method |= 0x10, fidx = e->defwrt_ptr.grp->obj_index;
+	    else {
+		error(ERR_NONFATAL, "default WRT specification for"
+		      " external `%s' unresolved", e->name);
+		method |= 0x50, fidx = -1; /* got to do _something_ */
+	    }
+	} else
 	    method |= 0x50, fidx = -1;
     } else {
 	/*
@@ -575,7 +787,7 @@ static void obj_write_fixup (struct ObjData *data, int bytes,
 		    i -= EXT_BLKSIZ;
 		}
 		if (eb)
-		    method |= 0x20, fidx = eb->index[i];
+		    method |= 0x20, fidx = eb->exts[i]->index;
 		else
 		    error(ERR_PANIC,
 			  "unrecognised WRT value in obj_write_fixup");
@@ -603,6 +815,7 @@ static long obj_segment (char *name, int pass, int *bits) {
     } else {
 	struct Segment *seg;
 	struct Group *grp;
+	struct External **extp;
 	int obj_idx, i, attrs, rn_error;
 	char *p;
 
@@ -686,7 +899,32 @@ static long obj_segment (char *name, int pass, int *bits) {
 		seg->use32 = FALSE;
 	    else if (!nasm_stricmp(p, "use32"))
 		seg->use32 = TRUE;
-	    else if (!nasm_strnicmp(p, "class=", 6))
+	    else if (!nasm_stricmp(p, "flat")) {
+		/*
+		 * This segment is an OS/2 FLAT segment. That means
+		 * that its default group is group FLAT, even if
+		 * the group FLAT does not explicitly _contain_ the
+		 * segment.
+		 * 
+		 * When we see this, we must create the group
+		 * `FLAT', containing no segments, if it does not
+		 * already exist; then we must set the default
+		 * group of this segment to be the FLAT group.
+		 */
+		struct Group *grp;
+		for (grp = grphead; grp; grp = grp->next)
+		    if (!strcmp(grp->name, "FLAT"))
+			break;
+		if (!grp) {
+		    obj_directive ("group", "FLAT", 1);
+		    for (grp = grphead; grp; grp = grp->next)
+			if (!strcmp(grp->name, "FLAT"))
+			    break;
+		    if (!grp)
+			error (ERR_PANIC, "failure to define FLAT?!");
+		}
+		seg->grp = grp;
+	    } else if (!nasm_strnicmp(p, "class=", 6))
 		seg->segclass = nasm_strdup(p+6);
 	    else if (!nasm_strnicmp(p, "overlay=", 8))
 		seg->overlay = nasm_strdup(p+8);
@@ -703,6 +941,7 @@ static long obj_segment (char *name, int pass, int *bits) {
 		  case 4:	       /* DWORD */
 		  case 16:	       /* PARA */
 		  case 256:	       /* PAGE */
+		  case 4096:	       /* PharLap extension */
 		    break;
 		  case 8:
 		    error(ERR_WARNING, "OBJ format does not support alignment"
@@ -716,6 +955,13 @@ static long obj_segment (char *name, int pass, int *bits) {
 			  " of %d: rounding up to 256", seg->align);
 		    seg->align = 256;
 		    break;
+		  case 512:
+		  case 1024:
+		  case 2048:
+		    error(ERR_WARNING, "OBJ format does not support alignment"
+			  " of %d: rounding up to 4096", seg->align);
+		    seg->align = 4096;
+		    break;
 		  default:
 		    error(ERR_NONFATAL, "invalid alignment value %d",
 			  seg->align);
@@ -732,9 +978,11 @@ static long obj_segment (char *name, int pass, int *bits) {
 
 	obj_seg_needs_update = seg;
 	if (seg->align >= SEG_ABS)
-	    deflabel (name, NO_SEG, seg->align - SEG_ABS, &of_obj, error);
+	    deflabel (name, NO_SEG, seg->align - SEG_ABS,
+		      NULL, FALSE, FALSE, &of_obj, error);
 	else
-	    deflabel (name, seg->index+1, 0L, &of_obj, error);
+	    deflabel (name, seg->index+1, 0L,
+		      NULL, FALSE, FALSE, &of_obj, error);
 	obj_seg_needs_update = NULL;
 
 	/*
@@ -756,6 +1004,22 @@ static long obj_segment (char *name, int pass, int *bits) {
 	    }
 	}
 
+	/*
+	 * Walk through the list of externals with unresolved
+	 * default-WRT clauses, and resolve any that point at this
+	 * segment.
+	 */
+	extp = &dws;
+	while (*extp) {
+	    if ((*extp)->defwrt_type == DEFWRT_STRING &&
+		!strcmp((*extp)->defwrt_ptr.string, seg->name)) {
+		(*extp)->defwrt_type = DEFWRT_SEGMENT;
+		(*extp)->defwrt_ptr.seg = seg;
+		*extp = (*extp)->next_dws;
+	    } else
+		extp = &(*extp)->next_dws;
+	}
+
 	if (seg->use32)
 	    *bits = 32;
 	else
@@ -770,6 +1034,7 @@ static int obj_directive (char *directive, char *value, int pass) {
 	if (pass == 1) {
 	    struct Group *grp;
 	    struct Segment *seg;
+	    struct External **extp;
 	    int obj_idx;
 
 	    q = value;
@@ -813,7 +1078,8 @@ static int obj_directive (char *directive, char *value, int pass) {
 	    grp->name = NULL;
 
 	    obj_grp_needs_update = grp;
-	    deflabel (v, grp->index+1, 0L, &of_obj, error);
+	    deflabel (v, grp->index+1, 0L,
+		      NULL, FALSE, FALSE, &of_obj, error);
 	    obj_grp_needs_update = NULL;
 
 	    while (*q) {
@@ -852,6 +1118,22 @@ static int obj_directive (char *directive, char *value, int pass) {
 		    grp->segs[grp->nentries++].name = nasm_strdup(p);
 		}
 	    }
+
+	    /*
+	     * Walk through the list of externals with unresolved
+	     * default-WRT clauses, and resolve any that point at
+	     * this group.
+	     */
+	    extp = &dws;
+	    while (*extp) {
+		if ((*extp)->defwrt_type == DEFWRT_STRING &&
+		    !strcmp((*extp)->defwrt_ptr.string, grp->name)) {
+		    (*extp)->defwrt_type = DEFWRT_GROUP;
+		    (*extp)->defwrt_ptr.grp = grp;
+		    *extp = (*extp)->next_dws;
+	    } else
+		    extp = &(*extp)->next_dws;
+	    }
 	}
 	return 1;
     }
@@ -859,6 +1141,129 @@ static int obj_directive (char *directive, char *value, int pass) {
 	obj_uppercase = TRUE;
 	return 1;
     }
+    if (!strcmp(directive, "import")) {   /* value: extname libname [impname|ordinal] */
+	char *q, *extname, *libname, *impname;
+
+	if (pass == 2)
+	    return 1;		       /* ignore in pass two */
+	extname = q = value;	       /* first word: symbol name */
+	while (*q && !isspace((unsigned char)*q))   /* cast: plain char may be negative */
+	    q++;
+	if (isspace((unsigned char)*q)) {
+	    *q++ = '\0';	       /* terminate the word in place */
+	    while (*q && isspace((unsigned char)*q))
+		q++;
+	}
+
+	libname = q;		       /* second word: library name */
+	while (*q && !isspace((unsigned char)*q))
+	    q++;
+	if (isspace((unsigned char)*q)) {
+	    *q++ = '\0';
+	    while (*q && isspace((unsigned char)*q))
+		q++;
+	}
+
+	impname = q;		       /* remainder of the line; may be empty */
+
+	if (!*extname || !*libname)
+	    error(ERR_NONFATAL, "`import' directive requires symbol name"
+		  " and library name");
+	else {
+	    struct ImpDef *imp;
+	    int err = FALSE;
+
+	    imp = *imptail = nasm_malloc(sizeof(struct ImpDef));   /* append to list */
+	    imptail = &imp->next;
+	    imp->next = NULL;
+	    imp->extname = nasm_strdup(extname);
+	    imp->libname = nasm_strdup(libname);
+	    imp->impindex = readnum(impname, &err);
+	    if (!*impname || err)      /* empty or non-numeric: keep it as a name */
+		imp->impname = nasm_strdup(impname);
+	    else		       /* numeric: impindex holds the value */
+		imp->impname = NULL;
+	}
+
+	return 1;
+    }
+    if (!strcmp(directive, "export")) {   /* value: intname [extname [qualifiers...]] */
+	char *q, *extname, *intname, *v;
+	struct ExpDef *export;
+	int flags = 0;
+	unsigned int ordinal = 0;
+
+	if (pass == 2)
+	    return 1;		       /* ignore in pass two */
+	intname = q = value;	       /* first word */
+	while (*q && !isspace((unsigned char)*q))   /* cast: plain char may be negative */
+	    q++;
+	if (isspace((unsigned char)*q)) {
+	    *q++ = '\0';	       /* terminate the word in place */
+	    while (*q && isspace((unsigned char)*q))
+		q++;
+	}
+
+	extname = q;		       /* second word, if any */
+	while (*q && !isspace((unsigned char)*q))
+	    q++;
+	if (isspace((unsigned char)*q)) {
+	    *q++ = '\0';
+	    while (*q && isspace((unsigned char)*q))
+		q++;
+	}
+
+	if (!*intname) {
+	    error(ERR_NONFATAL, "`export' directive requires export name");
+	    return 1;
+	}
+	if (!*extname) {	       /* only one word was given: */
+	    extname = intname;	       /* use it as the external name */
+	    intname = "";	       /* and leave the internal name empty */
+	}
+	while (*q) {		       /* each remaining word is a qualifier */
+	    v = q;
+	    while (*q && !isspace((unsigned char)*q))
+		q++;
+	    if (isspace((unsigned char)*q)) {
+		*q++ = '\0';
+		while (*q && isspace((unsigned char)*q))
+		    q++;
+	    }
+	    if (!nasm_stricmp(v, "resident"))
+		flags |= EXPDEF_FLAG_RESIDENT;
+	    else if (!nasm_stricmp(v, "nodata"))
+		flags |= EXPDEF_FLAG_NODATA;
+	    else if (!nasm_strnicmp(v, "parm=", 5)) {
+		int err = FALSE;
+		flags |= EXPDEF_MASK_PARMCNT & readnum(v+5, &err);   /* masked into flags */
+		if (err) {
+		    error(ERR_NONFATAL,
+			  "value `%s' for `parm' is non-numeric", v+5);
+		    return 1;
+		}
+	    } else {		       /* anything else must be a numeric ordinal */
+		int err = FALSE;
+		ordinal = readnum(v, &err);
+		if (err) {
+		    error(ERR_NONFATAL, "unrecognised export qualifier `%s'",
+			  v);
+		    return 1;
+		}
+		flags |= EXPDEF_FLAG_ORDINAL;
+	    }
+	}
+
+	export = *exptail = nasm_malloc(sizeof(struct ExpDef));   /* append to list */
+	exptail = &export->next;
+	export->next = NULL;
+	export->extname = nasm_strdup(extname);
+	export->intname = nasm_strdup(intname);
+	export->ordinal = ordinal;
+	export->flags = flags;
+
+	return 1;
+    }
     return 0;
 }
 
@@ -872,8 +1277,35 @@ static long obj_segbase (long segment) {
 	if (seg->index == segment-1)
 	    break;
 
-    if (!seg)
+    if (!seg) {
+	/*
+	 * Might be an external with a default WRT.
+	 */
+	long i = segment/2;	       /* used below as an index into the ExtBack chain */
+	struct ExtBack *eb = ebhead;
+	struct External *e;
+
+	while (i > EXT_BLKSIZ) {       /* NOTE(review): lets i == EXT_BLKSIZ through; confirm against exts[] size */
+	    if (eb)
+		eb = eb->next;
+	    else
+		break;
+	    i -= EXT_BLKSIZ;
+	}
+	if (eb) {
+	    e = eb->exts[i];	       /* NOTE(review): dereferenced unchecked; presumably always set - verify */
+	    if (e->defwrt_type == DEFWRT_NONE)
+		return segment;	       /* fine */
+	    else if (e->defwrt_type == DEFWRT_SEGMENT)
+		return e->defwrt_ptr.seg->index+1;   /* default WRT resolved to a segment */
+	    else if (e->defwrt_type == DEFWRT_GROUP)
+		return e->defwrt_ptr.grp->index+1;   /* default WRT resolved to a group */
+	    else if (e->defwrt_type == DEFWRT_STRING)
+		return NO_SEG;	       /* can't tell what it is */
+	}
+
 	return segment;		       /* not one of ours - leave it alone */
+    }
 
     if (seg->align >= SEG_ABS)
 	return seg->align;	       /* absolute segment */
@@ -894,6 +1326,8 @@ static void obj_write_file (void) {
     struct Public *pub;
     struct External *ext;
     struct ObjData *data;
+    struct ImpDef *imp;
+    struct ExpDef *export;
     static char boast[] = "The Netwide Assembler " NASM_VER;
     int lname_idx, rectype;
 
@@ -913,6 +1347,41 @@ static void obj_write_file (void) {
     obj_record (COMENT, record, recptr);
 
     /*
+     * Write the IMPDEF records, if any.
+     */
+    for (imp = imphead; imp; imp = imp->next) {
+	recptr = record;
+	recptr = obj_write_rword (recptr, 0xA0);   /* comment class A0 */
+	recptr = obj_write_byte (recptr, 1);   /* subfunction 1: IMPDEF */
+	if (imp->impname)
+	    recptr = obj_write_byte (recptr, 0);   /* import by name */
+	else
+	    recptr = obj_write_byte (recptr, 1);   /* import by ordinal */
+	recptr = obj_write_name (recptr, imp->extname);
+	recptr = obj_write_name (recptr, imp->libname);
+	if (imp->impname)
+	    recptr = obj_write_name (recptr, imp->impname);
+	else
+	    recptr = obj_write_word (recptr, imp->impindex);
+	obj_record (COMENT, record, recptr);
+    }
+
+    /*
+     * Write the EXPDEF records, if any.
+     */
+    for (export = exphead; export; export = export->next) {
+	recptr = record;
+	recptr = obj_write_rword (recptr, 0xA0);   /* comment class A0 */
+	recptr = obj_write_byte (recptr, 2);   /* subfunction 2: EXPDEF */
+	recptr = obj_write_byte (recptr, export->flags);
+	recptr = obj_write_name (recptr, export->extname);
+	recptr = obj_write_name (recptr, export->intname);
+	if (export->flags & EXPDEF_FLAG_ORDINAL)   /* ordinal word only if one was given */
+	    recptr = obj_write_word (recptr, export->ordinal);
+	obj_record (COMENT, record, recptr);
+    }
+
+    /*
      * Write the first LNAMES record, containing LNAME one, which
      * is null. Also initialise the LNAME counter.
      */
@@ -961,10 +1430,12 @@ static void obj_write_file (void) {
 	/* A field */
 	if (seg->align >= SEG_ABS)
 	    acbp |= 0x00;
-	else if (seg->align >= 256) {
-	    if (seg->align > 256)
+	else if (seg->align >= 4096) {
+	    if (seg->align > 4096)
 		error(ERR_NONFATAL, "segment `%s' requires more alignment"
 		      " than OBJ format supports", seg->name);
+	    acbp |= 0xC0;	       /* PharLap extension */
+	} else if (seg->align >= 256) {
 	    acbp |= 0x80;
 	} else if (seg->align >= 16) {
 	    acbp |= 0x60;
@@ -1000,11 +1471,11 @@ static void obj_write_file (void) {
      */
     recptr = record;
     for (grp = grphead; grp; grp = grp->next) {
-	recptr = obj_write_name (recptr, grp->name);
-	if (recptr - record > 1024) {
+	if (recptr - record + strlen(grp->name)+2 > 1024) {
 	    obj_record (LNAMES, record, recptr);
 	    recptr = record;
 	}
+	recptr = obj_write_name (recptr, grp->name);
     }
     if (recptr > record)
 	obj_record (LNAMES, record, recptr);
@@ -1083,28 +1554,29 @@ static void obj_write_file (void) {
     recptr = record;
     for (ext = exthead; ext; ext = ext->next) {
 	if (ext->commonsize == 0) {
-	    recptr = obj_write_name (recptr, ext->name);
-	    recptr = obj_write_index (recptr, 0);
-	    if (recptr - record > 1024) {
+	    /* dj@delorie.com: check for buffer overrun before we overrun it */
+	    if (recptr - record + strlen(ext->name)+2 > RECORD_MAX) {
 		obj_record (EXTDEF, record, recptr);
 		recptr = record;
 	    }
+	    recptr = obj_write_name (recptr, ext->name);
+	    recptr = obj_write_index (recptr, 0);
 	} else {
 	    if (recptr > record)
 		obj_record (EXTDEF, record, recptr);
 	    recptr = record;
-	    if (ext->commonsize > 0) {
+	    if (ext->commonsize) {
 		recptr = obj_write_name (recptr, ext->name);
 		recptr = obj_write_index (recptr, 0);
-		recptr = obj_write_byte (recptr, 0x61);/* far communal */
-		recptr = obj_write_value (recptr, 1L);
-		recptr = obj_write_value (recptr, ext->commonsize);
-		obj_record (COMDEF, record, recptr);
-	    } else if (ext->commonsize < 0) {
-		recptr = obj_write_name (recptr, ext->name);
-		recptr = obj_write_index (recptr, 0);
-		recptr = obj_write_byte (recptr, 0x62);/* near communal */
-		recptr = obj_write_value (recptr, ext->commonsize);
+		if (ext->commonelem) {
+		    recptr = obj_write_byte (recptr, 0x61);/* far communal */
+		    recptr = obj_write_value (recptr, (ext->commonsize /
+						       ext->commonelem));
+		    recptr = obj_write_value (recptr, ext->commonelem);
+		} else {
+		    recptr = obj_write_byte (recptr, 0x62);/* near communal */
+		    recptr = obj_write_value (recptr, ext->commonsize);
+		}
 		obj_record (COMDEF, record, recptr);
 	    }
 	    recptr = record;
@@ -1115,12 +1587,12 @@ static void obj_write_file (void) {
 
     /*
      * Write a COMENT record stating that the linker's first pass
-     * may stop processing at this point. Exception is if we're in
-     * OS/2 mode and our MODEND record specifies a start point, in
-     * which case, according to the OS/2 documentation, this COMENT
-     * should be omitted.
+     * may stop processing at this point. Exception is if our
+     * MODEND record specifies a start point, in which case,
+     * according to some variants of the documentation, this COMENT
+     * should be omitted. So we'll omit it just in case.
      */
-    if (!os2 || obj_entry_seg == NO_SEG) {
+    if (obj_entry_seg == NO_SEG) {
 	recptr = record;
 	recptr = obj_write_rword (recptr, 0x40A2);
 	recptr = obj_write_byte (recptr, 1);
@@ -1262,13 +1734,31 @@ static void obj_record(int type, unsigned char *start, unsigned char *end) {
     fwrite (start, 1, end-start, ofp);
     while (start < end)
 	cksum += *start++;
-    fputc ( (-cksum) & 0xFF, ofp);
+    fputc ( (-(long)cksum) & 0xFF, ofp);
 }
 
+static char *obj_stdmac[] = {	       /* NULL-terminated macro lines; referenced from of_obj */
+    "%define __SECT__ [section .text]",   /* __SECT__ starts out as [section .text] */
+    "%imacro group 1+.nolist",	       /* `group ...' expands to [group ...] */
+    "[group %1]",
+    "%endmacro",
+    "%imacro uppercase 1+.nolist",     /* `uppercase ...' expands to [uppercase ...] */
+    "[uppercase %1]",
+    "%endmacro",
+    "%imacro export 1+.nolist",	       /* `export ...' expands to [export ...] */
+    "[export %1]",
+    "%endmacro",
+    "%imacro import 1+.nolist",	       /* `import ...' expands to [import ...] */
+    "[import %1]",
+    "%endmacro",
+    NULL			       /* end-of-list marker */
+};
+
 struct ofmt of_obj = {
     "Microsoft MS-DOS 16-bit OMF object files",
     "obj",
-    dos_init,
+    obj_stdmac,
+    obj_init,
     obj_out,
     obj_deflabel,
     obj_segment,
@@ -1277,18 +1767,4 @@ struct ofmt of_obj = {
     obj_filename,
     obj_cleanup
 };
-
-struct ofmt of_os2 = {
-    "OS/2 object files (variant of OMF)",
-    "os2",
-    os2_init,
-    obj_out,
-    obj_deflabel,
-    obj_segment,
-    obj_segbase,
-    obj_directive,
-    obj_filename,
-    obj_cleanup
-};
-
 #endif /* OF_OBJ */
diff --git a/outrdf.c b/outrdf.c
index dd877751..6b01c751 100644
--- a/outrdf.c
+++ b/outrdf.c
@@ -187,7 +187,7 @@ static efunc error;
 static int segtext,segdata,segbss;
 static long bsslength;
 
-static void rdf_init(FILE *fp, efunc errfunc, ldfunc ldef)
+static void rdf_init(FILE *fp, efunc errfunc, ldfunc ldef, evalfunc eval)
 {
   ofile = fp;
   error = errfunc;
@@ -261,7 +261,8 @@ static void write_dll_rec(struct DLLRec *r)
     membufwrite(header,r->libname,strlen(r->libname) + 1);
 }
 
-static void rdf_deflabel(char *name, long segment, long offset, int is_global)
+static void rdf_deflabel(char *name, long segment, long offset,
+			 int is_global, char *special)
 {
   struct ExportRec r;
   struct ImportRec ri;
@@ -269,28 +270,23 @@ static void rdf_deflabel(char *name, long segment, long offset, int is_global)
   static int warned_common = 0;
 #endif
 
+  if (special)
+    error (ERR_NONFATAL, "RDOFF format does not support any"
+	   " special symbol types");
+
   if (name[0] == '.' && name[1] == '.' && name[2] != '@') {
     error (ERR_NONFATAL, "unrecognised special symbol `%s'", name);
     return;
   }
 
-  if (is_global && segment > 4) {
+  if (is_global == 2) {
 #ifdef VERBOSE_WARNINGS
-    if (! warned_common) {
-      error(ERR_WARNING,"common declarations not supported... using extern");
+    if (!warned_common) {
+      error(ERR_WARNING,"common declarations not supported: using extern");
       warned_common = 1;
     }
 #endif
-    is_global = 0;
-  }
-
-  if (is_global) {
-    r.type = 3;
-    r.segment = segment;
-    r.offset = offset;
-    strncpy(r.label,name,32);
-    r.label[32] = 0;
-    write_export_rec(&r);
+    is_global = 1;
   }
 
   if (segment > 4) {   /* EXTERN declaration */
@@ -299,6 +295,13 @@ static void rdf_deflabel(char *name, long segment, long offset, int is_global)
     strncpy(ri.label,name,32);
     ri.label[32] = 0;
     write_import_rec(&ri);
+  } else if (is_global) {
+    r.type = 3;
+    r.segment = segment;
+    r.offset = offset;
+    strncpy(r.label,name,32);
+    r.label[32] = 0;
+    write_export_rec(&r);
   }
 }
 
@@ -484,9 +487,18 @@ static void rdf_filename (char *inname, char *outname, efunc error) {
   standard_extension(inname,outname,".rdf",error);
 }
 
+static char *rdf_stdmac[] = {	       /* NULL-terminated macro lines; referenced from of_rdf */
+    "%define __SECT__ [section .text]",   /* __SECT__ starts out as [section .text] */
+    "%imacro library 1+.nolist",       /* `library ...' expands to [library ...] */
+    "[library %1]",
+    "%endmacro",
+    NULL			       /* end-of-list marker */
+};
+
 struct ofmt of_rdf = {
   "Relocatable Dynamic Object File Format v1.1",
   "rdf",
+  rdf_stdmac,
   rdf_init,
   rdf_out,
   rdf_deflabel,
diff --git a/parser.c b/parser.c
index f031ec0e..db465cd9 100644
--- a/parser.c
+++ b/parser.c
@@ -19,8 +19,6 @@
 #include "parser.h"
 #include "float.h"
 
-#include "names.c"
-
 static long reg_flags[] = {	       /* sizes and special flags */
     0, REG8, REG_AL, REG_AX, REG8, REG8, REG16, REG16, REG8, REG_CL,
     REG_CREG, REG_CREG, REG_CREG, REG_CR4, REG_CS, REG_CX, REG8,
@@ -34,105 +32,32 @@ static long reg_flags[] = {	       /* sizes and special flags */
 };
 
 enum {				       /* special tokens */
-    S_BYTE, S_DWORD, S_FAR, S_LONG, S_NEAR, S_QWORD, S_SHORT, S_TO,
-    S_TWORD, S_WORD
-};
-
-static char *special_names[] = {       /* and the actual text */
-    "byte", "dword", "far", "long", "near", "qword", "short", "to",
-    "tword", "word"
-};
-
-static char *prefix_names[] = {
-    "a16", "a32", "lock", "o16", "o32", "rep", "repe", "repne",
-    "repnz", "repz", "times"
-};
-
-/*
- * Evaluator datatype. Expressions, within the evaluator, are
- * stored as an array of these beasts, terminated by a record with
- * type==0. Mostly, it's a vector type: each type denotes some kind
- * of a component, and the value denotes the multiple of that
- * component present in the expression. The exception is the WRT
- * type, whose `value' field denotes the segment to which the
- * expression is relative. These segments will be segment-base
- * types, i.e. either odd segment values or SEG_ABS types. So it is
- * still valid to assume that anything with a `value' field of zero
- * is insignificant.
- */
-typedef struct {
-    long type;			       /* a register, or EXPR_xxx */
-    long value;			       /* must be >= 32 bits */
-} expr;
-
-static void eval_reset(void);
-static expr *evaluate(int);
-
-/*
- * ASSUMPTION MADE HERE. The number of distinct register names
- * (i.e. possible "type" fields for an expr structure) does not
- * exceed 126.
- */
-#define EXPR_SIMPLE 126
-#define EXPR_WRT 127
-#define EXPR_SEGBASE 128
-
-static int is_reloc(expr *);
-static int is_simple(expr *);
-static int is_really_simple (expr *);
-static long reloc_value(expr *);
-static long reloc_seg(expr *);
-static long reloc_wrt(expr *);
-
-enum {				       /* token types, other than chars */
-    TOKEN_ID = 256, TOKEN_NUM, TOKEN_REG, TOKEN_INSN, TOKEN_ERRNUM,
-    TOKEN_HERE, TOKEN_BASE, TOKEN_SPECIAL, TOKEN_PREFIX, TOKEN_SHL,
-    TOKEN_SHR, TOKEN_SDIV, TOKEN_SMOD, TOKEN_SEG, TOKEN_WRT,
-    TOKEN_FLOAT
-};
-
-struct tokenval {
-    long t_integer, t_inttwo;
-    char *t_charptr;
+    S_BYTE, S_DWORD, S_FAR, S_LONG, S_NEAR, S_NOSPLIT, S_QWORD,
+    S_SHORT, S_TO, S_TWORD, S_WORD
 };
 
-static char tempstorage[1024], *q;
-static int bsi (char *string, char **array, int size);/* binary search */
-
-static int nexttoken (void);
 static int is_comma_next (void);
 
-static char *bufptr;
 static int i;
 static struct tokenval tokval;
-static lfunc labelfunc;
 static efunc error;
-static char *label;
-static struct ofmt *outfmt;
-
-static long seg, ofs;
-
-static int forward;
 
-insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
-		  char *buffer, insn *result, struct ofmt *output,
-		  efunc errfunc) {
+insn *parse_line (int pass, char *buffer, insn *result,
+		  efunc errfunc, evalfunc evaluate, evalinfofunc einfo) {
     int operand;
     int critical;
+    struct eval_hints hints;
 
-    forward = result->forw_ref = FALSE;
-    q = tempstorage;
-    bufptr = buffer;
-    labelfunc = lookup_label;
-    outfmt = output;
+    result->forw_ref = FALSE;
     error = errfunc;
-    seg = segment;
-    ofs = offset;
-    label = "";
+    einfo ("", 0L, 0L);
 
-    i = nexttoken();
+    stdscan_reset();
+    stdscan_bufptr = buffer;
+    i = stdscan(NULL, &tokval);
 
     result->eops = NULL;	       /* must do this, whatever happens */
+    result->operands = 0;	       /* must initialise this */
 
     if (i==0) {			       /* blank line - ignore */
 	result->label = NULL;	       /* so, no label on it */
@@ -149,10 +74,11 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
     }
 
     if (i == TOKEN_ID) {	       /* there's a label here */
-	label = result->label = tokval.t_charptr;
-	i = nexttoken();
+	result->label = tokval.t_charptr;
+	einfo (result->label, 0L, 0L);
+	i = stdscan(NULL, &tokval);
 	if (i == ':') {		       /* skip over the optional colon */
-	    i = nexttoken();
+	    i = stdscan(NULL, &tokval);
 	} else if (i == 0 && pass == 1) {
 	    error (ERR_WARNING|ERR_WARN_OL,
 		   "label alone on a line without a colon might be in error");
@@ -176,9 +102,9 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 	if (i == TOKEN_PREFIX && tokval.t_integer == P_TIMES) {
 	    expr *value;
 
-	    i = nexttoken();
-	    eval_reset();
-	    value = evaluate (pass);
+	    i = stdscan(NULL, &tokval);
+	    value = evaluate (stdscan, NULL, &tokval, NULL, pass, error, NULL);
+	    i = tokval.t_type;
 	    if (!value) {	       /* but, error in evaluator */
 		result->opcode = -1;   /* unrecoverable parse error: */
 		return result;	       /* ignore this instruction */
@@ -199,14 +125,28 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 		       "instruction has more than %d prefixes", MAXPREFIX);
 	    else
 		result->prefixes[result->nprefix++] = tokval.t_integer;
-	    i = nexttoken();
+	    i = stdscan(NULL, &tokval);
 	}
     }
 
     if (i != TOKEN_INSN) {
-	error (ERR_NONFATAL, "parser: instruction expected");
-	result->opcode = -1;
-	return result;
+	if (result->nprefix > 0 && i == 0) {
+	    /*
+	     * Instruction prefixes are present, but no actual
+	     * instruction. This is allowed: at this point we
+	     * invent a notional instruction of RESB 0.
+	     */
+	    result->opcode = I_RESB;
+	    result->operands = 1;
+	    result->oprs[0].type = IMMEDIATE;
+	    result->oprs[0].offset = 0L;
+	    result->oprs[0].segment = result->oprs[0].wrt = NO_SEG;
+	    return result;
+	} else {
+	    error (ERR_NONFATAL, "parser: instruction expected");
+	    result->opcode = -1;
+	    return result;
+	}
     }
 
     result->opcode = tokval.t_integer;
@@ -246,7 +186,7 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 	 * Begin to read the DB/DW/DD/DQ/DT operands.
 	 */
 	while (1) {
-	    i = nexttoken();
+	    i = stdscan(NULL, &tokval);
 	    if (i == 0)
 		break;
 	    eop = *tail = nasm_malloc(sizeof(extop));
@@ -259,7 +199,7 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 		eop->type = EOT_DB_STRING;
 		eop->stringval = tokval.t_charptr;
 		eop->stringlen = tokval.t_inttwo;
-		i = nexttoken();       /* eat the comma */
+		i = stdscan(NULL, &tokval);       /* eat the comma */
 		continue;
 	    }
 
@@ -267,18 +207,17 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 		long sign = +1L;
 
 		if (i == '-') {
-		    char *save = bufptr;
-		    i = nexttoken();
+		    char *save = stdscan_bufptr;
+		    i = stdscan(NULL, &tokval);
 		    sign = -1L;
 		    if (i != TOKEN_FLOAT) {
-			bufptr = save;
-			i = '-';
+			stdscan_bufptr = save;
+			i = tokval.t_type = '-';
 		    }
 		}
 
 		if (i == TOKEN_FLOAT) {
 		    eop->type = EOT_DB_STRING;
-		    eop->stringval = q;
 		    if (result->opcode == I_DD)
 			eop->stringlen = 4;
 		    else if (result->opcode == I_DQ)
@@ -291,39 +230,44 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 			      result->opcode == I_DW ? 'W' : 'B');
 			eop->type = EOT_NOTHING;
 		    }
-		    q += eop->stringlen;
+		    eop = nasm_realloc(eop, sizeof(extop)+eop->stringlen);
+		    eop->stringval = (char *)eop + sizeof(extop);
 		    if (!float_const (tokval.t_charptr, sign,
 				      (unsigned char *)eop->stringval,
 				      eop->stringlen, error))
 			eop->type = EOT_NOTHING;
-		    i = nexttoken();       /* eat the comma */
+		    i = stdscan(NULL, &tokval);       /* eat the comma */
 		    continue;
 		}
 	    }
 
 	    /* anything else */ {
 		expr *value;
-		eval_reset();
-		value = evaluate (critical);
-		if (!value) {	       /* but, error in evaluator */
+		value = evaluate (stdscan, NULL, &tokval, NULL,
+				  critical, error, NULL);
+		i = tokval.t_type;
+		if (!value) {	       /* error in evaluator */
 		    result->opcode = -1;/* unrecoverable parse error: */
 		    return result;     /* ignore this instruction */
 		}
-		if (is_reloc(value)) {
+		if (is_unknown(value)) {
+		    eop->type = EOT_DB_NUMBER;
+		    eop->offset = 0;   /* doesn't matter what we put */
+		    eop->segment = eop->wrt = NO_SEG;   /* likewise */
+		} else if (is_reloc(value)) {
 		    eop->type = EOT_DB_NUMBER;
 		    eop->offset = reloc_value(value);
 		    eop->segment = reloc_seg(value);
 		    eop->wrt = reloc_wrt(value);
 		} else {
 		    error (ERR_NONFATAL,
-			   "`%s' operand %d: expression is not simple"
-			   " or relocatable",
-			   insn_names[result->opcode], oper_num);
+			   "operand %d: expression is not simple"
+			   " or relocatable", oper_num);
 		}
 	    }
 
 	    /*
-	     * We're about to call nexttoken(), which will eat the
+	     * We're about to call stdscan(), which will eat the
 	     * comma that we're currently sitting on between
 	     * arguments. However, we'd better check first that it
 	     * _is_ a comma.
@@ -331,8 +275,8 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 	    if (i == 0)		       /* also could be EOL */
 		break;
 	    if (i != ',') {
-		error (ERR_NONFATAL, "comma expected after `%s' operand %d",
-		       insn_names[result->opcode], oper_num);
+		error (ERR_NONFATAL, "comma expected after operand %d",
+		       oper_num);
 		result->opcode = -1;/* unrecoverable parse error: */
 		return result;     /* ignore this instruction */
 	    }
@@ -374,12 +318,13 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
      * of these, separated by commas, and terminated by a zero token. */
 
     for (operand = 0; operand < 3; operand++) {
-	expr *seg, *value;	       /* used most of the time */
+	expr *value;		       /* used most of the time */
 	int mref;		       /* is this going to be a memory ref? */
 	int bracket;		       /* is it a [] mref, or a & mref? */
 
 	result->oprs[operand].addr_size = 0;/* have to zero this whatever */
-	i = nexttoken();
+	result->oprs[operand].eaflags = 0;   /* and this */
+	i = stdscan(NULL, &tokval);
 	if (i == 0) break;	       /* end of operands: get out of here */
 	result->oprs[operand].type = 0;   /* so far, no override */
 	while (i == TOKEN_SPECIAL)	{/* size specifiers */
@@ -413,45 +358,63 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 		result->oprs[operand].type |= SHORT;
 		break;
 	    }
-	    i = nexttoken();
+	    i = stdscan(NULL, &tokval);
 	}
 
 	if (i == '[' || i == '&') {    /* memory reference */
 	    mref = TRUE;
 	    bracket = (i == '[');
-	    i = nexttoken();	    
+	    i = stdscan(NULL, &tokval);	    
 	    if (i == TOKEN_SPECIAL) {  /* check for address size override */
 		switch ((int)tokval.t_integer) {
+		  case S_NOSPLIT:
+		    result->oprs[operand].eaflags |= EAF_TIMESTWO;
+		    break;
+		  case S_BYTE:
+		    result->oprs[operand].eaflags |= EAF_BYTEOFFS;
+		    break;
 		  case S_WORD:
 		    result->oprs[operand].addr_size = 16;
+		    result->oprs[operand].eaflags |= EAF_WORDOFFS;
 		    break;
 		  case S_DWORD:
 		  case S_LONG:
 		    result->oprs[operand].addr_size = 32;
+		    result->oprs[operand].eaflags |= EAF_WORDOFFS;
 		    break;
 		  default:
 		    error (ERR_NONFATAL, "invalid size specification in"
 			   " effective address");
 		}
-		i = nexttoken();
+		i = stdscan(NULL, &tokval);
 	    }
 	} else {		       /* immediate operand, or register */
 	    mref = FALSE;
 	    bracket = FALSE;	       /* placate optimisers */
 	}
 
-	eval_reset();
-
-	value = evaluate (critical);
-	if (forward)
-	    result->forw_ref = TRUE;
+	value = evaluate (stdscan, NULL, &tokval,
+			  &result->forw_ref, critical, error, &hints);
+	i = tokval.t_type;
 	if (!value) {		       /* error in evaluator */
 	    result->opcode = -1;       /* unrecoverable parse error: */
 	    return result;	       /* ignore this instruction */
 	}
 	if (i == ':' && mref) {	       /* it was seg:offset */
-	    seg = value;	       /* so shift this into the segment */
-	    i = nexttoken();	       /* then skip the colon */
+	    /*
+	     * Process the segment override.
+	     */
+	    if (value[1].type!=0 || value->value!=1 ||
+		REG_SREG & ~reg_flags[value->type])
+		error (ERR_NONFATAL, "invalid segment override");
+	    else if (result->nprefix == MAXPREFIX)
+		error (ERR_NONFATAL,
+		       "instruction has more than %d prefixes",
+		       MAXPREFIX);
+	    else
+		result->prefixes[result->nprefix++] = value->type;
+
+	    i = stdscan(NULL, &tokval);	       /* then skip the colon */
 	    if (i == TOKEN_SPECIAL) {  /* another check for size override */
 		switch ((int)tokval.t_integer) {
 		  case S_WORD:
@@ -465,30 +428,30 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 		    error (ERR_NONFATAL, "invalid size specification in"
 			   " effective address");
 		}
-		i = nexttoken();
+		i = stdscan(NULL, &tokval);
 	    }
-	    value = evaluate (critical);
-	    if (forward)
-		result->forw_ref = TRUE;	
+	    value = evaluate (stdscan, NULL, &tokval,
+			      &result->forw_ref, critical, error, &hints);
+	    i = tokval.t_type;
 	    /* and get the offset */
 	    if (!value) {	       /* but, error in evaluator */
 		result->opcode = -1;   /* unrecoverable parse error: */
 		return result;	       /* ignore this instruction */
 	    }
-	} else seg = NULL;
+	}
 	if (mref && bracket) {	       /* find ] at the end */
 	    if (i != ']') {
 		error (ERR_NONFATAL, "parser: expecting ]");
 		do {		       /* error recovery again */
-		    i = nexttoken();
+		    i = stdscan(NULL, &tokval);
 		} while (i != 0 && i != ',');
 	    } else		       /* we got the required ] */
-		i = nexttoken();
+		i = stdscan(NULL, &tokval);
 	} else {		       /* immediate operand */
 	    if (i != 0 && i != ',' && i != ':') {
 		error (ERR_NONFATAL, "comma or end of line expected");
 		do {		       /* error recovery */
-		    i = nexttoken();
+		    i = stdscan(NULL, &tokval);
 		} while (i != 0 && i != ',');
 	    } else if (i == ':') {
 		result->oprs[operand].type |= COLON;
@@ -503,28 +466,18 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 	    int b, i, s;	       /* basereg, indexreg, scale */
 	    long o;		       /* offset */
 
-	    if (seg) {		       /* segment override */
-		if (seg[1].type!=0 || seg->value!=1 ||
-		    REG_SREG & ~reg_flags[seg->type])
-		    error (ERR_NONFATAL, "invalid segment override");
-		else if (result->nprefix == MAXPREFIX)
-		    error (ERR_NONFATAL,
-			   "instruction has more than %d prefixes",
-			   MAXPREFIX);
-		else
-		    result->prefixes[result->nprefix++] = seg->type;
-	    }
-
 	    b = i = -1, o = s = 0;
+	    result->oprs[operand].hintbase = hints.base;
+	    result->oprs[operand].hinttype = hints.type;
 
-	    if (e->type < EXPR_SIMPLE) {   /* this bit's a register */
+	    if (e->type <= EXPR_REG_END) {   /* this bit's a register */
 		if (e->value == 1) /* in fact it can be basereg */
 		    b = e->type;
 		else	       /* no, it has to be indexreg */
 		    i = e->type, s = e->value;
 		e++;
 	    }
-	    if (e->type && e->type < EXPR_SIMPLE) {/* it's a second register */
+	    if (e->type && e->type <= EXPR_REG_END) {/* it's a 2nd register */
 		if (e->value != 1) {   /* it has to be indexreg */
 		    if (i != -1) {     /* but it can't be */
 			error(ERR_NONFATAL, "invalid effective address");
@@ -541,46 +494,54 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 		e++;
 	    }
 	    if (e->type != 0) {	       /* is there an offset? */
-		if (e->type < EXPR_SIMPLE) {/* in fact, is there an error? */
+		if (e->type <= EXPR_REG_END) {/* in fact, is there an error? */
 		    error (ERR_NONFATAL, "invalid effective address");
 		    result->opcode = -1;
 		    return result;
 		} else {
-		    if (e->type == EXPR_SIMPLE) {
-			o = e->value;
-			e++;
-		    }
-		    if (e->type == EXPR_WRT) {
-			result->oprs[operand].wrt = e->value;
-			e++;
-		    } else
-			result->oprs[operand].wrt = NO_SEG;
-		    /*
-		     * Look for a segment base type.
-		     */
-		    if (e->type && e->type < EXPR_SEGBASE) {
-			error (ERR_NONFATAL, "invalid effective address");
-			result->opcode = -1;
-			return result;
-		    }
-		    while (e->type && e->value == 0)
-			e++;
-		    if (e->type && e->value != 1) {
-			error (ERR_NONFATAL, "invalid effective address");
-			result->opcode = -1;
-			return result;
-		    }
-		    if (e->type) {
-			result->oprs[operand].segment = e->type-EXPR_SEGBASE;
-			e++;
-		    } else
-			result->oprs[operand].segment = NO_SEG;
-		    while (e->type && e->value == 0)
-			e++;
-		    if (e->type) {
-			error (ERR_NONFATAL, "invalid effective address");
-			result->opcode = -1;
-			return result;
+		    if (e->type == EXPR_UNKNOWN) {
+			o = 0;	       /* doesn't matter what */
+			result->oprs[operand].wrt = NO_SEG;   /* nor this */
+			result->oprs[operand].segment = NO_SEG;  /* or this */
+			while (e->type) e++;   /* go to the end of the line */
+		    } else {
+			if (e->type == EXPR_SIMPLE) {
+			    o = e->value;
+			    e++;
+			}
+			if (e->type == EXPR_WRT) {
+			    result->oprs[operand].wrt = e->value;
+			    e++;
+			} else
+			    result->oprs[operand].wrt = NO_SEG;
+			/*
+			 * Look for a segment base type.
+			 */
+			if (e->type && e->type < EXPR_SEGBASE) {
+			    error (ERR_NONFATAL, "invalid effective address");
+			    result->opcode = -1;
+			    return result;
+			}
+			while (e->type && e->value == 0)
+			    e++;
+			if (e->type && e->value != 1) {
+			    error (ERR_NONFATAL, "invalid effective address");
+			    result->opcode = -1;
+			    return result;
+			}
+			if (e->type) {
+			    result->oprs[operand].segment =
+				e->type - EXPR_SEGBASE;
+			    e++;
+			} else
+			    result->oprs[operand].segment = NO_SEG;
+			while (e->type && e->value == 0)
+			    e++;
+			if (e->type) {
+			    error (ERR_NONFATAL, "invalid effective address");
+			    result->opcode = -1;
+			    return result;
+			}
 		    }
 		}
 	    } else {
@@ -603,7 +564,12 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 	    result->oprs[operand].scale = s;
 	    result->oprs[operand].offset = o;
 	} else {		       /* it's not a memory reference */
-	    if (is_reloc(value)) {     /* it's immediate */
+	    if (is_just_unknown(value)) {     /* it's immediate but unknown */
+		result->oprs[operand].type |= IMMEDIATE;
+		result->oprs[operand].offset = 0;   /* don't care */
+		result->oprs[operand].segment = NO_SEG; /* don't care again */
+		result->oprs[operand].wrt = NO_SEG;/* still don't care */
+	    } else if (is_reloc(value)) {     /* it's immediate */
 		result->oprs[operand].type |= IMMEDIATE;
 		result->oprs[operand].offset = reloc_value(value);
 		result->oprs[operand].segment = reloc_seg(value);
@@ -645,153 +611,13 @@ insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
 
 static int is_comma_next (void) {
     char *p;
+    int i;
+    struct tokenval tv;
 
-    p = bufptr;
-    while (isspace(*p)) p++;
-    return (*p == ',' || *p == ';' || !*p);
-}
-
-/*
- * This tokeniser routine has only one side effect, that of
- * updating `bufptr'. Hence by saving `bufptr', lookahead may be
- * performed.
- */
-
-static int nexttoken (void) {
-    char ourcopy[256], *r, *s;
-
-    while (isspace(*bufptr)) bufptr++;
-    if (!*bufptr) return 0;
-
-    /* we have a token; either an id, a number or a char */
-    if (isidstart(*bufptr) ||
-	(*bufptr == '$' && isidstart(bufptr[1]))) {
-	/* now we've got an identifier */
-	int i;
-	int is_sym = FALSE;
-
-	if (*bufptr == '$') {
-	    is_sym = TRUE;
-	    bufptr++;
-	}
-
- 	tokval.t_charptr = q;
-	*q++ = *bufptr++;
-	while (isidchar(*bufptr)) *q++ = *bufptr++;
-	*q++ = '\0';
-	for (s=tokval.t_charptr, r=ourcopy; *s; s++)
-	    *r++ = tolower (*s);
-	*r = '\0';
-	if (is_sym)
-	    return TOKEN_ID;	       /* bypass all other checks */
-	/* right, so we have an identifier sitting in temp storage. now,
-	 * is it actually a register or instruction name, or what? */
-	if ((tokval.t_integer=bsi(ourcopy, reg_names,
-				  elements(reg_names)))>=0)
-	    return TOKEN_REG;
-	if ((tokval.t_integer=bsi(ourcopy, insn_names,
-				  elements(insn_names)))>=0)
-	    return TOKEN_INSN;
-	for (i=0; i<elements(icn); i++)
-	    if (!strncmp(ourcopy, icn[i], strlen(icn[i]))) {
-		char *p = ourcopy + strlen(icn[i]);
-		tokval.t_integer = ico[i];
-		if ((tokval.t_inttwo=bsi(p, conditions,
-					 elements(conditions)))>=0)
-		    return TOKEN_INSN;
-	    }
-	if ((tokval.t_integer=bsi(ourcopy, prefix_names,
-				  elements(prefix_names)))>=0) {
-	    tokval.t_integer += PREFIX_ENUM_START;
-	    return TOKEN_PREFIX;
-	}
-	if ((tokval.t_integer=bsi(ourcopy, special_names,
-				  elements(special_names)))>=0)
-	    return TOKEN_SPECIAL;
-	if (!strcmp(ourcopy, "seg"))
-	    return TOKEN_SEG;
-	if (!strcmp(ourcopy, "wrt"))
-	    return TOKEN_WRT;
-	return TOKEN_ID;
-    } else if (*bufptr == '$' && !isnumchar(bufptr[1])) {
-	/*
-	 * It's a $ sign with no following hex number; this must
-	 * mean it's a Here token ($), evaluating to the current
-	 * assembly location, or a Base token ($$), evaluating to
-	 * the base of the current segment.
-	 */
-	bufptr++;
-	if (*bufptr == '$') {
-	    bufptr++;
-	    return TOKEN_BASE;
-	}
-	return TOKEN_HERE;
-    } else if (isnumstart(*bufptr)) {	       /* now we've got a number */
-	char *r = q;
-	int rn_error;
-
-	*q++ = *bufptr++;
-	while (isnumchar(*bufptr)) {
-	    *q++ = *bufptr++;
-	}
-	if (*bufptr == '.') {
-	    /*
-	     * a floating point constant
-	     */
-	    *q++ = *bufptr++;
-	    while (isnumchar(*bufptr)) {
-		*q++ = *bufptr++;
-	    }
-	    *q++ = '\0';
-	    tokval.t_charptr = r;
-	    return TOKEN_FLOAT;
-	}
-	*q++ = '\0';
-	tokval.t_integer = readnum(r, &rn_error);
-	if (rn_error)
-	    return TOKEN_ERRNUM;       /* some malformation occurred */
-	tokval.t_charptr = NULL;
-	return TOKEN_NUM;
-    } else if (*bufptr == '\'' || *bufptr == '"') {/* a char constant */
-    	char quote = *bufptr++, *r;
-	r = tokval.t_charptr = bufptr;
-	while (*bufptr && *bufptr != quote) bufptr++;
-	tokval.t_inttwo = bufptr - r;      /* store full version */
-	if (!*bufptr)
-	    return TOKEN_ERRNUM;       /* unmatched quotes */
-	tokval.t_integer = 0;
-	r = bufptr++;		       /* skip over final quote */
-	while (quote != *--r) {
-	    tokval.t_integer = (tokval.t_integer<<8) + (unsigned char) *r;
-	}
-	return TOKEN_NUM;
-    } else if (*bufptr == ';') {       /* a comment has happened - stay */
-	return 0;
-    } else if ((*bufptr == '>' || *bufptr == '<' ||
-		*bufptr == '/' || *bufptr == '%') && bufptr[1] == *bufptr) {
-	bufptr += 2;
-	return (bufptr[-2] == '>' ? TOKEN_SHR :
-		bufptr[-2] == '<' ? TOKEN_SHL :
-		bufptr[-2] == '/' ? TOKEN_SDIV :
-		TOKEN_SMOD);
-    } else			       /* just an ordinary char */
-    	return (unsigned char) (*bufptr++);
-}
-
-/* return index of "string" in "array", or -1 if no match. */
-static int bsi (char *string, char **array, int size) {
-    int i = -1, j = size;	       /* always, i < index < j */
-    while (j-i >= 2) {
-	int k = (i+j)/2;
-	int l = strcmp(string, array[k]);
-	if (l<0)		       /* it's in the first half */
-	    j = k;
-	else if (l>0)		       /* it's in the second half */
-	    i = k;
-	else			       /* we've got it :) */
-	    return k;
-    }
-    return -1;			       /* we haven't got it :( */
+    p = stdscan_bufptr;
+    i = stdscan (NULL, &tv);
+    stdscan_bufptr = p;
+    return (i == ',' || i == ';' || !i);
 }
 
 void cleanup_insn (insn *i) {
@@ -803,562 +629,3 @@ void cleanup_insn (insn *i) {
 	nasm_free (e);
     }
 }
-
-/* ------------- Evaluator begins here ------------------ */
-
-static expr exprtempstorage[1024], *tempptr;   /* store exprs in here */
-
-/*
- * Add two vector datatypes. We have some bizarre behaviour on far-
- * absolute segment types: we preserve them during addition _only_
- * if one of the segments is a truly pure scalar.
- */
-static expr *add_vectors(expr *p, expr *q) {
-    expr *r = tempptr;
-    int preserve;
-
-    preserve = is_really_simple(p) || is_really_simple(q);
-
-    while (p->type && q->type &&
-	   p->type < EXPR_SEGBASE+SEG_ABS &&
-	   q->type < EXPR_SEGBASE+SEG_ABS)
-    	if (p->type > q->type) {
-	    tempptr->type = q->type;
-	    tempptr->value = q->value;
-	    tempptr++, q++;
-	} else if (p->type < q->type) {
-	    tempptr->type = p->type;
-	    tempptr->value = p->value;
-	    tempptr++, p++;
-	} else {		       /* *p and *q have same type */
-	    tempptr->type = p->type;
-	    tempptr->value = p->value + q->value;
-	    tempptr++, p++, q++;
-	}
-    while (p->type &&
-	   (preserve || p->type < EXPR_SEGBASE+SEG_ABS)) {
-	tempptr->type = p->type;
-	tempptr->value = p->value;
-	tempptr++, p++;
-    }
-    while (q->type &&
-	   (preserve || q->type < EXPR_SEGBASE+SEG_ABS)) {
-	tempptr->type = q->type;
-	tempptr->value = q->value;
-	tempptr++, q++;
-    }
-    (tempptr++)->type = 0;
-
-    return r;
-}
-
-/*
- * Multiply a vector by a scalar. Strip far-absolute segment part
- * if present.
- */
-static expr *scalar_mult(expr *vect, long scalar) {
-    expr *p = vect;
-
-    while (p->type && p->type < EXPR_SEGBASE+SEG_ABS) {
-	p->value = scalar * (p->value);
-	p++;
-    }
-    p->type = 0;
-
-    return vect;
-}
-
-static expr *scalarvect (long scalar) {
-    expr *p = tempptr;
-    tempptr->type = EXPR_SIMPLE;
-    tempptr->value = scalar;
-    tempptr++;
-    tempptr->type = 0;
-    tempptr++;
-    return p;
-}
-
-/*
- * Return TRUE if the argument is a simple scalar. (Or a far-
- * absolute, which counts.)
- */
-static int is_simple (expr *vect) {
-    while (vect->type && !vect->value)
-    	vect++;
-    if (!vect->type)
-	return 1;
-    if (vect->type != EXPR_SIMPLE)
-	return 0;
-    do {
-	vect++;
-    } while (vect->type && !vect->value);
-    if (vect->type && vect->type < EXPR_SEGBASE+SEG_ABS) return 0;
-    return 1;
-}
-
-/*
- * Return TRUE if the argument is a simple scalar, _NOT_ a far-
- * absolute.
- */
-static int is_really_simple (expr *vect) {
-    while (vect->type && !vect->value)
-    	vect++;
-    if (!vect->type)
-	return 1;
-    if (vect->type != EXPR_SIMPLE)
-	return 0;
-    do {
-	vect++;
-    } while (vect->type && !vect->value);
-    if (vect->type) return 0;
-    return 1;
-}
-
-/*
- * Return TRUE if the argument is relocatable (i.e. a simple
- * scalar, plus at most one segment-base, plus possibly a WRT).
- */
-static int is_reloc (expr *vect) {
-    while (vect->type && !vect->value)
-    	vect++;
-    if (!vect->type)
-	return 1;
-    if (vect->type < EXPR_SIMPLE)
-	return 0;
-    if (vect->type == EXPR_SIMPLE) {
-	do {
-	    vect++;
-	} while (vect->type && !vect->value);
-	if (!vect->type)
-	    return 1;
-    }
-    if (vect->type != EXPR_WRT && vect->value != 0 && vect->value != 1)
-	return 0;		       /* segment base multiplier non-unity */
-    do {
-	vect++;
-    } while (vect->type && (vect->type == EXPR_WRT || !vect->value));
-    if (!vect->type)
-	return 1;
-    return 1;
-}
-
-/*
- * Return the scalar part of a relocatable vector. (Including
- * simple scalar vectors - those qualify as relocatable.)
- */
-static long reloc_value (expr *vect) {
-    while (vect->type && !vect->value)
-    	vect++;
-    if (!vect->type) return 0;
-    if (vect->type == EXPR_SIMPLE)
-	return vect->value;
-    else
-	return 0;
-}
-
-/*
- * Return the segment number of a relocatable vector, or NO_SEG for
- * simple scalars.
- */
-static long reloc_seg (expr *vect) {
-    while (vect->type && (vect->type == EXPR_WRT || !vect->value))
-    	vect++;
-    if (vect->type == EXPR_SIMPLE) {
-	do {
-	    vect++;
-	} while (vect->type && (vect->type == EXPR_WRT || !vect->value));
-    }
-    if (!vect->type)
-	return NO_SEG;
-    else
-	return vect->type - EXPR_SEGBASE;
-}
-
-/*
- * Return the WRT segment number of a relocatable vector, or NO_SEG
- * if no WRT part is present.
- */
-static long reloc_wrt (expr *vect) {
-    while (vect->type && vect->type < EXPR_WRT)
-    	vect++;
-    if (vect->type == EXPR_WRT) {
-	return vect->value;
-    } else
-	return NO_SEG;
-}
-
-static void eval_reset(void) {
-    tempptr = exprtempstorage;	       /* initialise temporary storage */
-}
-
-/*
- * The SEG operator: calculate the segment part of a relocatable
- * value. Return NULL, as usual, if an error occurs. Report the
- * error too.
- */
-static expr *segment_part (expr *e) {
-    long seg;
-
-    if (!is_reloc(e)) {
-	error(ERR_NONFATAL, "cannot apply SEG to a non-relocatable value");
-	return NULL;
-    }
-
-    seg = reloc_seg(e);
-    if (seg == NO_SEG) {
-	error(ERR_NONFATAL, "cannot apply SEG to a non-relocatable value");
-	return NULL;
-    } else if (seg & SEG_ABS)
-	return scalarvect(seg & ~SEG_ABS);
-    else {
-	expr *f = tempptr++;
-	tempptr++->type = 0;
-	f->type = EXPR_SEGBASE+outfmt->segbase(seg+1);
-	f->value = 1;
-	return f;
-    }
-}
-
-/*
- * Recursive-descent parser. Called with a single boolean operand,
- * which is TRUE if the evaluation is critical (i.e. unresolved
- * symbols are an error condition). Must update the global `i' to
- * reflect the token after the parsed string. May return NULL.
- *
- * evaluate() should report its own errors: on return it is assumed
- * that if NULL has been returned, the error has already been
- * reported.
- */
-
-/*
- * Grammar parsed is:
- *
- * expr  : expr0 [ WRT expr6 ]
- * expr0 : expr1 [ {|} expr1]
- * expr1 : expr2 [ {^} expr2]
- * expr2 : expr3 [ {&} expr3]
- * expr3 : expr4 [ {<<,>>} expr4...]
- * expr4 : expr5 [ {+,-} expr5...]
- * expr5 : expr6 [ {*,/,%,//,%%} expr6...]
- * expr6 : { ~,+,-,SEG } expr6
- *       | (expr0)
- *       | symbol
- *       | $
- *       | number
- */
-
-static expr *expr0(int), *expr1(int), *expr2(int), *expr3(int);
-static expr *expr4(int), *expr5(int), *expr6(int);
-
-static expr *expr0(int critical) {
-    expr *e, *f;
-
-    e = expr1(critical);
-    if (!e)
-	return NULL;
-    while (i == '|') {
-	i = nexttoken();
-	f = expr1(critical);
-	if (!f)
-	    return NULL;
-	if (!is_simple(e) || !is_simple(f)) {
-	    error(ERR_NONFATAL, "`|' operator may only be applied to"
-		  " scalar values");
-	}
-	e = scalarvect (reloc_value(e) | reloc_value(f));
-    }
-    return e;
-}
-
-static expr *expr1(int critical) {
-    expr *e, *f;
-
-    e = expr2(critical);
-    if (!e)
-	return NULL;
-    while (i == '^') {
-	i = nexttoken();
-	f = expr2(critical);
-	if (!f)
-	    return NULL;
-	if (!is_simple(e) || !is_simple(f)) {
-	    error(ERR_NONFATAL, "`^' operator may only be applied to"
-		  " scalar values");
-	}
-	e = scalarvect (reloc_value(e) ^ reloc_value(f));
-    }
-    return e;
-}
-
-static expr *expr2(int critical) {
-    expr *e, *f;
-
-    e = expr3(critical);
-    if (!e)
-	return NULL;
-    while (i == '&') {
-	i = nexttoken();
-	f = expr3(critical);
-	if (!f)
-	    return NULL;
-	if (!is_simple(e) || !is_simple(f)) {
-	    error(ERR_NONFATAL, "`&' operator may only be applied to"
-		  " scalar values");
-	}
-	e = scalarvect (reloc_value(e) & reloc_value(f));
-    }
-    return e;
-}
-
-static expr *expr3(int critical) {
-    expr *e, *f;
-
-    e = expr4(critical);
-    if (!e)
-	return NULL;
-    while (i == TOKEN_SHL || i == TOKEN_SHR) {
-	int j = i;
-	i = nexttoken();
-	f = expr4(critical);
-	if (!f)
-	    return NULL;
-	if (!is_simple(e) || !is_simple(f)) {
-	    error(ERR_NONFATAL, "shift operator may only be applied to"
-		  " scalar values");
-	}
-	switch (j) {
-	  case TOKEN_SHL:
-	    e = scalarvect (reloc_value(e) << reloc_value(f));
-	    break;
-	  case TOKEN_SHR:
-	    e = scalarvect (((unsigned long)reloc_value(e)) >>
-			    reloc_value(f));
-	    break;
-	}
-    }
-    return e;
-}
-
-static expr *expr4(int critical) {
-    expr *e, *f;
-
-    e = expr5(critical);
-    if (!e)
-	return NULL;
-    while (i == '+' || i == '-') {
-	int j = i;
-	i = nexttoken();
-	f = expr5(critical);
-	if (!f)
-	    return NULL;
-	switch (j) {
-	  case '+':
-	    e = add_vectors (e, f);
-	    break;
-	  case '-':
-	    e = add_vectors (e, scalar_mult(f, -1L));
-	    break;
-	}
-    }
-    return e;
-}
-
-static expr *expr5(int critical) {
-    expr *e, *f;
-
-    e = expr6(critical);
-    if (!e)
-	return NULL;
-    while (i == '*' || i == '/' || i == '*' ||
-	   i == TOKEN_SDIV || i == TOKEN_SMOD) {
-	int j = i;
-	i = nexttoken();
-	f = expr6(critical);
-	if (!f)
-	    return NULL;
-	if (j != '*' && (!is_simple(e) || !is_simple(f))) {
-	    error(ERR_NONFATAL, "division operator may only be applied to"
-		  " scalar values");
-	    return NULL;
-	}
-	if (j != '*' && reloc_value(f) == 0) {
-	    error(ERR_NONFATAL, "division by zero");
-	    return NULL;
-	}
-	switch (j) {
-	  case '*':
-	    if (is_simple(e))
-		e = scalar_mult (f, reloc_value(e));
-	    else if (is_simple(f))
-		e = scalar_mult (e, reloc_value(f));
-	    else {
-		error(ERR_NONFATAL, "unable to multiply two "
-		      "non-scalar objects");
-		return NULL;
-	    }
-	    break;
-	  case '/':
-	    e = scalarvect (((unsigned long)reloc_value(e)) /
-			    ((unsigned long)reloc_value(f)));
-	    break;
-	  case '%':
-	    e = scalarvect (((unsigned long)reloc_value(e)) %
-			    ((unsigned long)reloc_value(f)));
-	    break;
-	  case TOKEN_SDIV:
-	    e = scalarvect (((signed long)reloc_value(e)) /
-			    ((signed long)reloc_value(f)));
-	    break;
-	  case TOKEN_SMOD:
-	    e = scalarvect (((signed long)reloc_value(e)) %
-			    ((signed long)reloc_value(f)));
-	    break;
-	}
-    }
-    return e;
-}
-
-static expr *expr6(int critical) {
-    expr *e;
-    long label_seg, label_ofs;
-
-    if (i == '-') {
-	i = nexttoken();
-	e = expr6(critical);
-	if (!e)
-	    return NULL;
-	return scalar_mult (e, -1L);
-    } else if (i == '+') {
-	i = nexttoken();
-	return expr6(critical);
-    } else if (i == '~') {
-	i = nexttoken();
-	e = expr6(critical);
-	if (!e)
-	    return NULL;
-	if (!is_simple(e)) {
-	    error(ERR_NONFATAL, "`~' operator may only be applied to"
-		  " scalar values");
-	    return NULL;
-	}
-	return scalarvect(~reloc_value(e));
-    } else if (i == TOKEN_SEG) {
-	i = nexttoken();
-	e = expr6(critical);
-	if (!e)
-	    return NULL;
-	return segment_part(e);
-    } else if (i == '(') {
-	i = nexttoken();
-	e = expr0(critical);
-	if (!e)
-	    return NULL;
-	if (i != ')') {
-	    error(ERR_NONFATAL, "expecting `)'");
-	    return NULL;
-	}
-	i = nexttoken();
-	return e;
-    } else if (i == TOKEN_NUM || i == TOKEN_REG || i == TOKEN_ID ||
-	       i == TOKEN_HERE || i == TOKEN_BASE) {
-	e = tempptr;
-	switch (i) {
-	  case TOKEN_NUM:
-	    e->type = EXPR_SIMPLE;
-	    e->value = tokval.t_integer;
-	    break;
-	  case TOKEN_REG:
-	    e->type = tokval.t_integer;
-	    e->value = 1;
-	    break;
-	  case TOKEN_ID:
-	  case TOKEN_HERE:
-	  case TOKEN_BASE:
-	    /*
-	     * Since the whole line is parsed before the label it
-	     * defines is given to the label manager, we have
-	     * problems with lines such as
-	     *
-	     *   end: TIMES 512-(end-start) DB 0
-	     *
-	     * where `end' is not known on pass one, despite not
-	     * really being a forward reference, and due to
-	     * criticality it is _needed_. Hence we check our label
-	     * against the currently defined one, and do our own
-	     * resolution of it if we have to.
-	     */
-	    if (i == TOKEN_BASE) {
-		label_seg = seg;
-		label_ofs = 0;
-	    } else if (i == TOKEN_HERE || !strcmp(tokval.t_charptr, label)) {
-		label_seg = seg;
-		label_ofs = ofs;
-	    } else if (!labelfunc(tokval.t_charptr, &label_seg, &label_ofs)) {
-		if (critical == 2) {
-		    error (ERR_NONFATAL, "symbol `%s' undefined",
-			   tokval.t_charptr);
-		    return NULL;
-		} else if (critical == 1) {
-		    error (ERR_NONFATAL, "symbol `%s' not defined before use",
-			   tokval.t_charptr);
-		    return NULL;
-		} else {
-		    forward = TRUE;
-		    label_seg = seg;
-		    label_ofs = ofs;
-		}
-	    }
-	    e->type = EXPR_SIMPLE;
-	    e->value = label_ofs;
-	    if (label_seg!=NO_SEG) {
-		tempptr++;
-		tempptr->type = EXPR_SEGBASE + label_seg;
-		tempptr->value = 1;
-	    }
-	    break;
-	}
-	tempptr++;
-	tempptr->type = 0;
-	tempptr++;
-	i = nexttoken();
-	return e;
-    } else {
-	error(ERR_NONFATAL, "expression syntax error");
-	return NULL;
-    }
-}
-
-static expr *evaluate (int critical) {
-    expr *e;
-    expr *f = NULL;
-
-    e = expr0 (critical);
-    if (!e)
-	return NULL;
-
-    if (i == TOKEN_WRT) {
-	i = nexttoken();	       /* eat the WRT */
-	f = expr6 (critical);
-	if (!f)
-	    return NULL;
-    }
-    e = scalar_mult (e, 1L);	       /* strip far-absolute segment part */
-    if (f) {
-	expr *g = tempptr++;
-	tempptr++->type = 0;
-	g->type = EXPR_WRT;
-	if (!is_reloc(f)) {
-	    error(ERR_NONFATAL, "invalid right-hand operand to WRT");
-	    return NULL;
-	}
-	g->value = reloc_seg(f);
-	if (g->value == NO_SEG)
-	    g->value = reloc_value(f) | SEG_ABS;
-	else if (!(g->value & SEG_ABS) && !(g->value % 2) && critical) {
-	    error(ERR_NONFATAL, "invalid right-hand operand to WRT");
-	    return NULL;
-	}
-	e = add_vectors (e, g);
-    }
-    return e;
-}
diff --git a/parser.h b/parser.h
index 2c58d743..0681cd03 100644
--- a/parser.h
+++ b/parser.h
@@ -10,9 +10,8 @@
 #ifndef NASM_PARSER_H
 #define NASM_PARSER_H
 
-insn *parse_line (long segment, long offset, lfunc lookup_label, int pass,
-		  char *buffer, insn *result, struct ofmt *output,
-		  efunc error);
+insn *parse_line (int pass, char *buffer, insn *result,
+		  efunc error, evalfunc evaluate, evalinfofunc einfo);
 void cleanup_insn (insn *instruction);
 
 #endif
diff --git a/preproc.c b/preproc.c
index 574e852f..4318e33f 100644
--- a/preproc.c
+++ b/preproc.c
@@ -13,6 +13,7 @@
 #include <stddef.h>
 #include <string.h>
 #include <ctype.h>
+#include <limits.h>
 
 #include "nasm.h"
 #include "nasmlib.h"
@@ -39,7 +40,21 @@ struct SMacro {
 };
 
 /*
- * Store the definition of a multi-line macro.
+ * Store the definition of a multi-line macro. This is also used to
+ * store the interiors of `%rep...%endrep' blocks, which are
+ * effectively self-re-invoking multi-line macros which simply
+ * don't have a name or bother to appear in the hash tables. %rep
+ * blocks are signified by having a NULL `name' field.
+ *
+ * In a MMacro describing a `%rep' block, the `in_progress' field
+ * isn't merely boolean, but gives the number of repeats left to
+ * run.
+ *
+ * The `next' field is used for storing MMacros in hash tables; the
+ * `next_active' field is for stacking them on istk entries.
+ *
+ * When a MMacro is being expanded, `params', `iline', `nparam',
+ * `paramlen', `rotate' and `unique' are local to the invocation.
  */
 struct MMacro {
     MMacro *next;
@@ -50,7 +65,13 @@ struct MMacro {
     int nolist;			       /* is this macro listing-inhibited? */
     int in_progress;
     Token **defaults, *dlist;
+    int ndefs;			       /* number of default parameters */
     Line *expansion;
+
+    MMacro *next_active;
+    Token **params, *iline;
+    int nparam, rotate, *paramlen;
+    unsigned long unique;
 };
 
 /*
@@ -90,7 +111,7 @@ struct Context {
 struct Token {
     Token *next;
     char *text;
-    SMacro *mac;		       /* associated macro for TOK_MAC_END */
+    SMacro *mac;		       /* associated macro for TOK_SMAC_END */
     int type;
 };
 enum {
@@ -116,10 +137,10 @@ enum {
  *
  * Some of these structures, rather than being actual lines, are
  * markers delimiting the end of the expansion of a given macro.
- * This is for use in the cycle-tracking code. Such structures have
- * `finishes' non-NULL, and `first' NULL. All others have
- * `finishes' NULL, but `first' may still be NULL if the line is
- * blank.
+ * This is for use in the cycle-tracking and %rep-handling code.
+ * Such structures have `finishes' non-NULL, and `first' NULL. All
+ * others have `finishes' NULL, but `first' may still be NULL if
+ * the line is blank.
  */
 struct Line {
     Line *next;
@@ -138,6 +159,7 @@ struct Include {
     Line *expansion;
     char *fname;
     int lineno, lineinc;
+    MMacro *mstk;		       /* stack of active macros/reps */
 };
 
 /*
@@ -211,11 +233,41 @@ static int inverse_ccs[] = {
     c_Z, c_NO, c_NP, c_PO, c_PE, c_NS, c_NZ
 };
 
+/*
+ * Directive names.
+ */
+static char *directives[] = {	       /* in ascending strcmp() order */
+    "%assign", "%clear", "%define", "%elif", "%elifctx", "%elifdef",
+    "%elifid", "%elifidn", "%elifidni", "%elifnctx", "%elifndef",
+    "%elifnid", "%elifnidn", "%elifnidni", "%elifnnum", "%elifnstr",
+    "%elifnum", "%elifstr", "%else", "%endif", "%endm", "%endmacro",
+    "%endrep", "%error", "%exitrep", "%iassign", "%idefine", "%if",
+    "%ifctx", "%ifdef", "%ifid", "%ifidn", "%ifidni", "%ifnctx",
+    "%ifndef", "%ifnid", "%ifnidn", "%ifnidni", "%ifnnum",
+    "%ifnstr", "%ifnum", "%ifstr", "%imacro", "%include", "%line",
+    "%macro", "%pop", "%push", "%rep", "%repl", "%rotate"
+};
+enum {				       /* PP_x == index of "%x" in directives[] */
+    PP_ASSIGN, PP_CLEAR, PP_DEFINE, PP_ELIF, PP_ELIFCTX, PP_ELIFDEF,
+    PP_ELIFID, PP_ELIFIDN, PP_ELIFIDNI, PP_ELIFNCTX, PP_ELIFNDEF,
+    PP_ELIFNID, PP_ELIFNIDN, PP_ELIFNIDNI, PP_ELIFNNUM, PP_ELIFNSTR,
+    PP_ELIFNUM, PP_ELIFSTR, PP_ELSE, PP_ENDIF, PP_ENDM, PP_ENDMACRO,
+    PP_ENDREP, PP_ERROR, PP_EXITREP, PP_IASSIGN, PP_IDEFINE, PP_IF,
+    PP_IFCTX, PP_IFDEF, PP_IFID, PP_IFIDN, PP_IFIDNI, PP_IFNCTX,
+    PP_IFNDEF, PP_IFNID, PP_IFNIDN, PP_IFNIDNI, PP_IFNNUM,
+    PP_IFNSTR, PP_IFNUM, PP_IFSTR, PP_IMACRO, PP_INCLUDE, PP_LINE,
+    PP_MACRO, PP_POP, PP_PUSH, PP_REP, PP_REPL, PP_ROTATE
+};
+
+
 static Context *cstk;
 static Include *istk;
 static IncPath *ipath = NULL;
 
 static efunc error;
+static evalfunc evaluate;
+
+static int pass;
 
 static unsigned long unique;	       /* unique identifier numbers */
 
@@ -241,7 +293,8 @@ static MMacro *mmacros[NHASH];
 static SMacro *smacros[NHASH];
 
 /*
- * The multi-line macro we are currently defining, if any.
+ * The multi-line macro we are currently defining, or the %rep
+ * block we are currently reading, if any.
  */
 static MMacro *defining;
 
@@ -258,6 +311,19 @@ static MMacro *defining;
 static char **stdmacpos;
 
 /*
+ * The extra standard macros that come from the object format, if
+ * any.
+ */
+static char **extrastdmac = NULL;
+int any_extrastdmac;
+
+/*
+ * Forward declarations.
+ */
+static Token *expand_smacro (Token *tline);
+static void update_fileline (int which);
+
+/*
  * The pre-preprocessing stage... This function translates line
  * number indications as they emerge from GNU cpp (`# lineno "file"
  * flags') into NASM preprocessor line number indications (`%line
@@ -360,12 +426,11 @@ static void ctx_pop (void) {
  * knows which source file the current output has really come from.
  */
 static void line_sync (void) {
-    char text[80];
+    char text[30+FILENAME_MAX];	       /* "%line N+M " prefix plus file name */
     sprintf(text, "%%line %d+%d %s",
 	    (istk->expansion ? istk->lineno - istk->lineinc : istk->lineno),
 	    (istk->expansion ? 0 : istk->lineinc), istk->fname);
-    if (linesync)
-	free (linesync);
+    nasm_free (linesync);   /* NOTE(review): NULL guard dropped; presumably nasm_free accepts NULL - confirm */
     linesync = nasm_strdup(text);
 }
 
@@ -383,6 +448,11 @@ static char *read_line (void) {
     if (stdmacpos) {
 	if (*stdmacpos) {
 	    char *ret = nasm_strdup(*stdmacpos++);
+	    if (!*stdmacpos && any_extrastdmac) {
+		stdmacpos = extrastdmac;
+		any_extrastdmac = FALSE;
+		return ret;
+	    }
 	    /*
 	     * Nasty hack: here we push the contents of `predef' on
 	     * to the top-level expansion stack, since this is the
@@ -415,6 +485,7 @@ static char *read_line (void) {
 	} else {
 	    stdmacpos = NULL;
 	    line_sync();
+	    update_fileline(3);	       /* update __FILE__ and __LINE__ */
 	}
     }
 
@@ -428,6 +499,7 @@ static char *read_line (void) {
 	p += strlen(p);
 	if (p > buffer && p[-1] == '\n') {
 	    istk->lineno += istk->lineinc;
+	    update_fileline(1);	       /* update __LINE__ only */
 	    break;
 	}
 	if (p-buffer > bufsize-10) {
@@ -491,7 +563,7 @@ static Token *tokenise (char *line) {
 		while (*p && isidchar(*p))
 		    p++;
 	    }
-	} else if (isidstart(*p)) {
+	} else if (isidstart(*p) || (*p == '$' && isidstart(p[1]))) {
 	    type = TOK_ID;
 	    p++;
 	    while (*p && isidchar(*p))
@@ -533,14 +605,26 @@ static Token *tokenise (char *line) {
 	    while (*p) p++;
 	} else {
 	    /*
-	     * Anything else is an operator of some kind; with the
-	     * exceptions of >>, <<, // and %%, all operator tokens
-	     * are single-character.
+	     * Anything else is an operator of some kind. We check
+	     * for all the double-character operators (>>, <<, //,
+	     * %%, <=, >=, ==, !=, <>, &&, ||, ^^), but anything
+	     * else is a single-character operator.
 	     */
-	    char c = *p++;
 	    type = TOK_OTHER;
-	    if ( (c == '>' || c == '<' || c == '/' || c == '%') && *p == c)
+	    if ((p[0] == '>' && p[1] == '>') ||
+		(p[0] == '<' && p[1] == '<') ||
+		(p[0] == '/' && p[1] == '/') ||
+		(p[0] == '%' && p[1] == '%') ||
+		(p[0] == '<' && p[1] == '=') ||
+		(p[0] == '>' && p[1] == '=') ||
+		(p[0] == '=' && p[1] == '=') ||
+		(p[0] == '!' && p[1] == '=') ||
+		(p[0] == '<' && p[1] == '>') ||
+		(p[0] == '&' && p[1] == '&') ||
+		(p[0] == '|' && p[1] == '|') ||
+		(p[0] == '^' && p[1] == '^'))
 		p++;
+	    p++;
 	}
 	if (type != TOK_COMMENT) {
 	    *tail = t = nasm_malloc (sizeof(Token));
@@ -590,6 +674,79 @@ static char *detoken (Token *tlist) {
 }
 
 /*
+ * A scanner, suitable for use by the expression evaluator, which
+ * operates on a line of Tokens. Expects a pointer to a pointer to
+ * the first token in the line to be passed in as its private_data
+ * field.
+ */
+static int ppscan(void *private_data, struct tokenval *tokval) {
+    Token **tlineptr = private_data;
+    Token *tline;
+    /* Every return below also records the token code in tokval->t_type. */
+    do {
+	tline = *tlineptr;
+	*tlineptr = tline ? tline->next : NULL;   /* advance caller's cursor */
+    } while (tline && (tline->type == TOK_WHITESPACE ||
+		       tline->type == TOK_COMMENT));
+
+    if (!tline)
+	return tokval->t_type = TOKEN_EOS;     /* ran off the end of the line */
+    /* A lone "$" is the Here token; exactly "$$" is the Base token. */
+    if (tline->text[0] == '$' && !tline->text[1])
+	return tokval->t_type = TOKEN_HERE;
+    if (tline->text[0] == '$' && tline->text[1] == '$' && !tline->text[2])
+	return tokval->t_type = TOKEN_BASE;
+
+    if (tline->type == TOK_ID) {
+	tokval->t_charptr = tline->text;
+	if (tline->text[0] == '$') {
+	    tokval->t_charptr++;       /* drop the `$'; skips the SEG test */
+	    return tokval->t_type = TOKEN_ID;
+	}
+
+	/*
+	 * This is the only special case we actually need to worry
+	 * about in this restricted context.
+	 */
+	if (!nasm_stricmp(tline->text, "seg"))
+	    return tokval->t_type = TOKEN_SEG;
+
+	return tokval->t_type = TOKEN_ID;
+    }
+
+    if (tline->type == TOK_NUMBER) {
+	int rn_error;
+
+	tokval->t_integer = readnum(tline->text, &rn_error);
+	if (rn_error)
+	    return tokval->t_type = TOKEN_ERRNUM;
+	tokval->t_charptr = NULL;
+	return tokval->t_type = TOKEN_NUM;
+    }
+
+    if (tline->type == TOK_OTHER) {    /* two-character operators */
+	if (!strcmp(tline->text, "<<")) return tokval->t_type = TOKEN_SHL;
+	if (!strcmp(tline->text, ">>")) return tokval->t_type = TOKEN_SHR;
+	if (!strcmp(tline->text, "//")) return tokval->t_type = TOKEN_SDIV;
+	if (!strcmp(tline->text, "%%")) return tokval->t_type = TOKEN_SMOD;
+	if (!strcmp(tline->text, "==")) return tokval->t_type = TOKEN_EQ;
+	if (!strcmp(tline->text, "<>")) return tokval->t_type = TOKEN_NE;
+	if (!strcmp(tline->text, "!=")) return tokval->t_type = TOKEN_NE;
+	if (!strcmp(tline->text, "<=")) return tokval->t_type = TOKEN_LE;
+	if (!strcmp(tline->text, ">=")) return tokval->t_type = TOKEN_GE;
+	if (!strcmp(tline->text, "&&")) return tokval->t_type = TOKEN_DBL_AND;
+	if (!strcmp(tline->text, "^^")) return tokval->t_type = TOKEN_DBL_XOR;
+	if (!strcmp(tline->text, "||")) return tokval->t_type = TOKEN_DBL_OR;
+    }
+
+    /*
+     * We have no other options: just return the first character of
+     * the token text.
+     */
+    return tokval->t_type = tline->text[0];
+}
+
+/*
  * Return the Context structure associated with a %$ token. Return
  * NULL, having _already_ reported an error condition, if the
  * context stack isn't deep enough for the supplied number of $
@@ -708,6 +865,42 @@ static int smacro_defined (char *name, int nparam, SMacro **defn) {
 }
 
 /*
+ * Refresh the __FILE__ and __LINE__ single-line macros from the
+ * current `istk' entry. __LINE__ is rebuilt when bit 0 of `which'
+ * is set. __FILE__ is meant to follow bit 1, but the test below
+ * uses mask 3, so bit 0 on its own refreshes __FILE__ as well.
+ *
+ * A macro that no longer exists (a `%clear' has happened) is
+ * simply skipped; that is not an error condition.
+ */
+static void update_fileline(int which) {
+    SMacro *defn;
+    Token *tok;
+    char linebuf[20];
+
+    if ((which & 3) != 0 && smacro_defined ("__FILE__", 0, &defn) && defn) {
+	free_tlist(defn->expansion);
+	defn->expansion = tok = nasm_malloc(sizeof(Token));
+	/* FIXME: fname holding both sorts of quote is not coped with */
+	tok->text = nasm_malloc(strlen(istk->fname) + 3);
+	sprintf(tok->text, "\"%s\"", istk->fname);
+	tok->type = TOK_STRING;
+	tok->mac = NULL;
+	tok->next = NULL;
+    }
+
+    if ((which & 1) != 0 && smacro_defined ("__LINE__", 0, &defn) && defn) {
+	free_tlist(defn->expansion);
+	defn->expansion = tok = nasm_malloc(sizeof(Token));
+	sprintf(linebuf, "%d", istk->lineno - istk->lineinc);
+	tok->text = nasm_strdup(linebuf);
+	tok->type = TOK_NUMBER;
+	tok->mac = NULL;
+	tok->next = NULL;
+    }
+}
+
+/*
  * Count and mark off the parameters in a multi-line macro call.
  * This is called both from within the multi-line macro expansion
  * code, and also to mark off the default parameters when provided
@@ -758,16 +951,178 @@ static void count_mmac_params (Token *t, int *nparam, Token ***params) {
 }
 
 /*
+ * Evaluate the condition of the `%if'-family directive `i' over
+ * the tokens `tline' that follow it. Returns non-zero for true,
+ * zero for false, or -1 once an error has been reported. We must
+ * free the tline we get passed, and do so on every path.
+ */
+static int if_condition (Token *tline, int i) {
+    int j, casesense;
+    Token *t, *tt, **tptr, *origline;
+    struct tokenval tokval;
+    expr *evalresult;
+
+    origline = tline;
+
+    switch (i) {
+      case PP_IFCTX: case PP_ELIFCTX:
+      case PP_IFNCTX: case PP_ELIFNCTX:
+	j = FALSE;		       /* have we matched yet? */
+	if (!cstk)
+	    error(ERR_FATAL|ERR_OFFBY1,
+		  "`%s': context stack is empty", directives[i]);
+	else while (tline) {
+	    if (tline->type == TOK_WHITESPACE)
+		tline = tline->next;
+	    if (!tline || tline->type != TOK_ID) {
+		error(ERR_NONFATAL|ERR_OFFBY1,
+		      "`%s' expects context identifiers", directives[i]);
+		free_tlist (origline);
+		return -1;
+	    }
+	    if (!nasm_stricmp(tline->text, cstk->name))
+		j = TRUE;
+	    tline = tline->next;
+	}
+	if (i == PP_IFNCTX || i == PP_ELIFNCTX)
+	    j = !j;
+	free_tlist (origline);
+	return j;
+
+      case PP_IFDEF: case PP_ELIFDEF:
+      case PP_IFNDEF: case PP_ELIFNDEF:
+	j = FALSE;		       /* have we matched yet? */
+	while (tline) {
+	    if (tline->type == TOK_WHITESPACE)
+		tline = tline->next;
+	    if (!tline || (tline->type != TOK_ID &&
+			   (tline->type != TOK_PREPROC_ID ||
+			    tline->text[1] != '$'))) {
+		error(ERR_NONFATAL|ERR_OFFBY1,
+		      "`%s' expects macro identifiers",
+		      directives[i]);    /* name the directive really used */
+		free_tlist (origline);
+		return -1;
+	    }
+	    if (smacro_defined(tline->text, 0, NULL))
+		j = TRUE;
+	    tline = tline->next;
+	}
+	if (i == PP_IFNDEF || i == PP_ELIFNDEF)
+	    j = !j;
+	free_tlist (origline);
+	return j;
+
+      case PP_IFIDN: case PP_ELIFIDN: case PP_IFNIDN: case PP_ELIFNIDN:
+      case PP_IFIDNI: case PP_ELIFIDNI: case PP_IFNIDNI: case PP_ELIFNIDNI:
+	tline = expand_smacro(tline);
+	t = tt = tline;
+	while (tt && (tt->type != TOK_OTHER || strcmp(tt->text, ",")))
+	    tt = tt->next;
+	if (!tt) {
+	    error(ERR_NONFATAL, "`%s' expects two comma-separated arguments",
+		  directives[i]);
+	    free_tlist (tline);
+	    return -1;
+	}
+	tt = tt->next;
+	casesense = (i == PP_IFIDN || i == PP_ELIFIDN ||
+		     i == PP_IFNIDN || i == PP_ELIFNIDN);
+	j = TRUE;		       /* assume equality unless proved not */
+	/* t walks the first argument, tt the second, skipping whitespace */
+	while ((t->type != TOK_OTHER || strcmp(t->text, ",")) && tt) {
+	    if (tt->type == TOK_OTHER && !strcmp(tt->text, ",")) {
+		error(ERR_NONFATAL, "`%s': more than one comma on line",
+		      directives[i]);
+		free_tlist (tline);
+		return -1;
+	    }
+	    if (t->type == TOK_WHITESPACE) {
+		t = t->next;
+	    } else if (tt->type == TOK_WHITESPACE) {
+		tt = tt->next;
+	    } else if (tt->type != t->type ||
+		       (casesense ? strcmp(tt->text, t->text) :
+			nasm_stricmp(tt->text, t->text))) {
+		j = FALSE;	       /* found mismatching tokens */
+		break;
+	    } else {
+		t = t->next;
+		tt = tt->next;
+	    }
+	}
+	if ((t->type != TOK_OTHER || strcmp(t->text, ",")) || tt)
+	    j = FALSE;		       /* trailing gunk on one end or other */
+	if (i == PP_IFNIDN || i == PP_ELIFNIDN ||
+	    i == PP_IFNIDNI || i == PP_ELIFNIDNI)
+	    j = !j;		       /* all four negated forms invert */
+	free_tlist (tline);
+	return j;
+
+      case PP_IFID: case PP_ELIFID: case PP_IFNID: case PP_ELIFNID:
+      case PP_IFNUM: case PP_ELIFNUM: case PP_IFNNUM: case PP_ELIFNNUM:
+      case PP_IFSTR: case PP_ELIFSTR: case PP_IFNSTR: case PP_ELIFNSTR:
+	tline = expand_smacro(tline);
+	t = tline;
+	while (t && t->type == TOK_WHITESPACE)
+	    t = t->next;
+	j = FALSE;		       /* placate optimiser */
+	switch (i) {		       /* t is NULL for an empty argument */
+	  case PP_IFID: case PP_ELIFID: case PP_IFNID: case PP_ELIFNID:
+	    j = (t && t->type == TOK_ID);
+	    break;
+	  case PP_IFNUM: case PP_ELIFNUM: case PP_IFNNUM: case PP_ELIFNNUM:
+	    j = (t && t->type == TOK_NUMBER);
+	    break;
+	  case PP_IFSTR: case PP_ELIFSTR: case PP_IFNSTR: case PP_ELIFNSTR:
+	    j = (t && t->type == TOK_STRING);
+	    break;
+	}
+	if (i == PP_IFNID || i == PP_ELIFNID ||
+	    i == PP_IFNNUM || i == PP_ELIFNNUM ||
+	    i == PP_IFNSTR || i == PP_ELIFNSTR)
+	    j = !j;
+	free_tlist (tline);
+	return j;
+
+      case PP_IF: case PP_ELIF:
+	t = tline = expand_smacro(tline);
+	tptr = &t;
+	tokval.t_type = TOKEN_INVALID;
+	evalresult = evaluate (ppscan, tptr, &tokval,
+			       NULL, pass | 0x10, error, NULL);
+	free_tlist (tline);
+	if (!evalresult)
+	    return -1;
+	if (tokval.t_type)
+	    error(ERR_WARNING|ERR_OFFBY1,
+		  "trailing garbage after expression ignored");
+	if (!is_simple(evalresult)) {
+	    error(ERR_NONFATAL|ERR_OFFBY1,
+		  "non-constant value given to `%s'", directives[i]);
+	    return -1;
+	}
+	return reloc_value(evalresult) != 0;
+
+      default:
+	error(ERR_FATAL|ERR_OFFBY1,
+	      "preprocessor directive `%s' not yet implemented",
+	      directives[i]);
+	free_tlist (origline);
+	return -1;		       /* yeah, right */
+    }
+}
+
+/*
  * Find out if a line contains a preprocessor directive, and deal
  * with it if so.
  * 
- * If a directive _is_ found, the line will never be de-tokenised
- * as is, so we have carte blanche to fiddle with it and adjust
- * token values.
+ * If a directive _is_ found, we are expected to free_tlist() the
+ * line.
  *
  * Return values go like this:
  * 
- * bit 0 is set if a directive was found
+ * bit 0 is set if a directive was found (so the line gets freed)
  * bit 1 is set if a blank line should be emitted
  * bit 2 is set if a re-sync line number comment should be emitted
  *
@@ -776,26 +1131,19 @@ static void count_mmac_params (Token *t, int *nparam, Token ***params) {
  * which both are set)
  */
 static int do_directive (Token *tline) {
-    static char *directives[] = {
-	"%clear", "%define", "%elifctx", "%elifdef", "%elifnctx",
-	"%elifndef", "%else", "%endif", "%endm", "%endmacro", "%error",
-	"%idefine", "%ifctx", "%ifdef", "%ifnctx", "%ifndef", "%imacro",
-	"%include", "%line", "%macro", "%pop", "%push", "%repl"
-    };
-    enum {
-	PP_CLEAR, PP_DEFINE, PP_ELIFCTX, PP_ELIFDEF, PP_ELIFNCTX,
-	PP_ELIFNDEF, PP_ELSE, PP_ENDIF, PP_ENDM, PP_ENDMACRO, PP_ERROR,
-	PP_IDEFINE, PP_IFCTX, PP_IFDEF, PP_IFNCTX, PP_IFNDEF, PP_IMACRO,
-	PP_INCLUDE, PP_LINE, PP_MACRO, PP_POP, PP_PUSH, PP_REPL
-    };
-    int i, j, k, m, nparam;
+    int i, j, k, m, nparam, nolist;
     char *p, *mname;
     Include *inc;
     Context *ctx;
     Cond *cond;
     SMacro *smac, **smhead;
     MMacro *mmac;
-    Token *t, *tt, *param_start, *macro_start, *last;
+    Token *t, *tt, *param_start, *macro_start, *last, **tptr, *origline;
+    Line *l;
+    struct tokenval tokval;
+    expr *evalresult;
+
+    origline = tline;
 
     if (tline && tline->type == TOK_WHITESPACE)
 	tline = tline->next;
@@ -820,22 +1168,39 @@ static int do_directive (Token *tline) {
 
     /*
      * If we're in a non-emitting branch of a condition construct,
+     * or walking to the end of an already terminated %rep block,
      * we should ignore all directives except for condition
      * directives.
      */
-    if (istk->conds && !emitting(istk->conds->state) &&
-	i != PP_IFCTX && i != PP_IFDEF && i != PP_IFNCTX && i != PP_IFNDEF &&
-	i!=PP_ELIFCTX && i!=PP_ELIFDEF && i!=PP_ELIFNCTX && i!=PP_ELIFNDEF &&
+    if (((istk->conds && !emitting(istk->conds->state)) ||
+	 (istk->mstk && !istk->mstk->in_progress)) &&
+	i != PP_IF && i != PP_ELIF &&
+	i != PP_IFCTX && i != PP_ELIFCTX &&
+	i != PP_IFDEF && i != PP_ELIFDEF &&
+	i != PP_IFID && i != PP_ELIFID &&
+	i != PP_IFIDN && i != PP_ELIFIDN &&
+	i != PP_IFIDNI && i != PP_ELIFIDNI &&
+	i != PP_IFNCTX && i != PP_ELIFNCTX &&
+	i != PP_IFNDEF && i != PP_ELIFNDEF &&
+	i != PP_IFNID && i != PP_ELIFNID &&
+	i != PP_IFNIDN && i != PP_ELIFNIDN &&
+	i != PP_IFNIDNI && i != PP_ELIFNIDNI &&
+	i != PP_IFNNUM && i != PP_ELIFNNUM &&
+	i != PP_IFNSTR && i != PP_ELIFNSTR &&
+	i != PP_IFNUM && i != PP_ELIFNUM &&
+	i != PP_IFSTR && i != PP_ELIFSTR &&
 	i != PP_ELSE && i != PP_ENDIF)
 	return 0;
 
     /*
-     * If we're defining a macro, we should ignore all directives
-     * except for %macro/%imacro (which generate an error) and
-     * %endm/%endmacro.
+     * If we're defining a macro or reading a %rep block, we should
+     * ignore all directives except for %macro/%imacro (which
+     * generate an error), %endm/%endmacro, and (only if we're in a
+     * %rep block) %endrep.
      */
     if (defining && i != PP_MACRO && i != PP_IMACRO &&
-	i != PP_ENDMACRO && i != PP_ENDM)
+	i != PP_ENDMACRO && i != PP_ENDM &&
+	(defining->name || i != PP_ENDREP))
 	return 0;
 
     if (j != -2) {
@@ -867,6 +1232,7 @@ static int do_directive (Token *tline) {
 		nasm_free (s);
 	    }
 	}
+	free_tlist (origline);
 	return 3;
 
       case PP_INCLUDE:
@@ -876,6 +1242,7 @@ static int do_directive (Token *tline) {
 	if (!tline || (tline->type != TOK_STRING &&
 		       tline->type != TOK_INTERNAL_STRING)) {
 	    error(ERR_NONFATAL|ERR_OFFBY1, "`%%include' expects a file name");
+	    free_tlist (origline);
 	    return 3;		       /* but we did _something_ */
 	}
 	if (tline->next)
@@ -893,8 +1260,11 @@ static int do_directive (Token *tline) {
 	inc->fname = nasm_strdup(p);
 	inc->lineno = inc->lineinc = 1;
 	inc->expansion = NULL;
+	inc->mstk = NULL;
 	istk = inc;
 	list->uplevel (LIST_INCLUDE);
+	update_fileline(3);	       /* update __FILE__ and __LINE__ */
+	free_tlist (origline);
 	return 5;
 
       case PP_PUSH:
@@ -904,6 +1274,7 @@ static int do_directive (Token *tline) {
 	if (!tline || tline->type != TOK_ID) {
 	    error(ERR_NONFATAL|ERR_OFFBY1,
 		  "`%%push' expects a context identifier");
+	    free_tlist (origline);
 	    return 3;		       /* but we did _something_ */
 	}
 	if (tline->next)
@@ -915,6 +1286,7 @@ static int do_directive (Token *tline) {
 	ctx->name = nasm_strdup(tline->text);
 	ctx->number = unique++;
 	cstk = ctx;
+	free_tlist (origline);
 	break;
 
       case PP_REPL:
@@ -924,6 +1296,7 @@ static int do_directive (Token *tline) {
 	if (!tline || tline->type != TOK_ID) {
 	    error(ERR_NONFATAL|ERR_OFFBY1,
 		  "`%%repl' expects a context identifier");
+	    free_tlist (origline);
 	    return 3;		       /* but we did _something_ */
 	}
 	if (tline->next)
@@ -936,6 +1309,7 @@ static int do_directive (Token *tline) {
 	    nasm_free (cstk->name);
 	    cstk->name = nasm_strdup(tline->text);
 	}
+	free_tlist (origline);
 	break;
 
       case PP_POP:
@@ -947,6 +1321,7 @@ static int do_directive (Token *tline) {
 		  "`%%pop': context stack is already empty");
 	else
 	    ctx_pop();
+	free_tlist (origline);
 	break;
 
       case PP_ERROR:
@@ -956,6 +1331,7 @@ static int do_directive (Token *tline) {
 	if (!tline || tline->type != TOK_STRING) {
 	    error(ERR_NONFATAL|ERR_OFFBY1,
 		  "`%%error' expects an error string");
+	    free_tlist (origline);
 	    return 3;		       /* but we did _something_ */
 	}
 	if (tline->next)
@@ -964,136 +1340,71 @@ static int do_directive (Token *tline) {
 	p = tline->text+1;	       /* point past the quote to the name */
 	p[strlen(p)-1] = '\0';	       /* remove the trailing quote */
 	error(ERR_NONFATAL|ERR_OFFBY1, "user error: %s", p);
+	free_tlist (origline);
 	break;
 
+      case PP_IF:
       case PP_IFCTX:
-      case PP_IFNCTX:
-	tline = tline->next;
-	if (istk->conds && !emitting(istk->conds->state))
-	    j = COND_NEVER;
-	else {
-	    j = FALSE;		       /* have we matched yet? */
-	    if (!cstk)
-		error(ERR_FATAL|ERR_OFFBY1,
-		      "`%%if%sctx': context stack is empty",
-		      (i==PP_IFNCTX ? "n" : ""));
-	    else while (tline) {
-		if (tline->type == TOK_WHITESPACE)
-		    tline = tline->next;
-		if (!tline || tline->type != TOK_ID) {
-		    error(ERR_NONFATAL|ERR_OFFBY1,
-			  "`%%ifctx' expects context identifiers");
-		    return 3;	       /* but we did _something_ */
-		}
-		if (!nasm_stricmp(tline->text, cstk->name))
-		    j = TRUE;
-		tline = tline->next;
-	    }
-	    if (i == PP_IFNCTX)
-		j = !j;
-	    j = (j ? COND_IF_TRUE : COND_IF_FALSE);
-	}
-	cond = nasm_malloc(sizeof(Cond));
-	cond->next = istk->conds;
-	cond->state = j;
-	istk->conds = cond;
-	return 1;
-
-      case PP_ELIFCTX:
-      case PP_ELIFNCTX:
-	tline = tline->next;
-	if (!istk->conds)
-	    error(ERR_FATAL|ERR_OFFBY1, "`%%elif%sctx': no matching `%%if'",
-		  (i==PP_ELIFNCTX ? "n" : ""));
-	if (emitting(istk->conds->state) || istk->conds->state == COND_NEVER)
-	    istk->conds->state = COND_NEVER;
-	else {
-	    j = FALSE;		       /* have we matched yet? */
-	    if (!cstk)
-		error(ERR_FATAL|ERR_OFFBY1,
-		      "`%%elif%sctx': context stack is empty",
-		      (i==PP_ELIFNCTX ? "n" : ""));
-	    else while (tline) {
-		if (tline->type == TOK_WHITESPACE)
-		    tline = tline->next;
-		if (!tline || tline->type != TOK_ID) {
-		    error(ERR_NONFATAL|ERR_OFFBY1,
-			  "`%%elif%sctx' expects context identifiers",
-			  (i==PP_ELIFNCTX ? "n" : ""));
-		    return 3;	       /* but we did _something_ */
-		}
-		if (!nasm_stricmp(tline->text, cstk->name))
-		    j = TRUE;
-		tline = tline->next;
-	    }
-	    if (i == PP_ELIFNCTX)
-		j = !j;
-	    istk->conds->state = (j ? COND_IF_TRUE : COND_IF_FALSE);
-	}
-	return 1;
-
       case PP_IFDEF:
+      case PP_IFID:
+      case PP_IFIDN:
+      case PP_IFIDNI:
+      case PP_IFNCTX:
       case PP_IFNDEF:
-	tline = tline->next;
+      case PP_IFNID:
+      case PP_IFNIDN:
+      case PP_IFNIDNI:
+      case PP_IFNNUM:
+      case PP_IFNSTR:
+      case PP_IFNUM:
+      case PP_IFSTR:
 	if (istk->conds && !emitting(istk->conds->state))
 	    j = COND_NEVER;
 	else {
-	    j = FALSE;		       /* have we matched yet? */
-	    while (tline) {
-		if (tline->type == TOK_WHITESPACE)
-		    tline = tline->next;
-		if (!tline || (tline->type != TOK_ID &&
-			       (tline->type != TOK_PREPROC_ID ||
-				tline->text[1] != '$'))) {
-		    error(ERR_NONFATAL|ERR_OFFBY1,
-			  "`%%if%sdef' expects macro identifiers",
-			  (i==PP_ELIFNDEF ? "n" : ""));
-		    return 3;	       /* but we did _something_ */
-		}
-		if (smacro_defined(tline->text, 0, NULL))
-		    j = TRUE;
-		tline = tline->next;
-	    }
-	    if (i == PP_IFNDEF)
-		j = !j;
-	    j = (j ? COND_IF_TRUE : COND_IF_FALSE);
+	    j = if_condition(tline->next, i);
+	    tline->next = NULL;	       /* it got freed */
+	    free_tlist (origline);
+	    if (j < 0)
+		return 3;
+	    else
+		j = j ? COND_IF_TRUE : COND_IF_FALSE;
 	}
 	cond = nasm_malloc(sizeof(Cond));
 	cond->next = istk->conds;
 	cond->state = j;
 	istk->conds = cond;
-	return 1;
+	return (j == COND_IF_TRUE ? 3 : 1);
 
+      case PP_ELIF:
+      case PP_ELIFCTX:
       case PP_ELIFDEF:
+      case PP_ELIFID:
+      case PP_ELIFIDN:
+      case PP_ELIFIDNI:
+      case PP_ELIFNCTX:
       case PP_ELIFNDEF:
-	tline = tline->next;
+      case PP_ELIFNID:
+      case PP_ELIFNIDN:
+      case PP_ELIFNIDNI:
+      case PP_ELIFNNUM:
+      case PP_ELIFNSTR:
+      case PP_ELIFNUM:
+      case PP_ELIFSTR:
 	if (!istk->conds)
-	    error(ERR_FATAL|ERR_OFFBY1, "`%%elif%sctx': no matching `%%if'",
-		  (i==PP_ELIFNCTX ? "n" : ""));
+	    error(ERR_FATAL|ERR_OFFBY1, "`%s': no matching `%%if'",
+		  directives[i]);
 	if (emitting(istk->conds->state) || istk->conds->state == COND_NEVER)
 	    istk->conds->state = COND_NEVER;
 	else {
-	    j = FALSE;		       /* have we matched yet? */
-	    while (tline) {
-		if (tline->type == TOK_WHITESPACE)
-		    tline = tline->next;
-		if (!tline || (tline->type != TOK_ID &&
-			       (tline->type != TOK_PREPROC_ID ||
-				tline->text[1] != '$'))) {
-		    error(ERR_NONFATAL|ERR_OFFBY1,
-			  "`%%elif%sdef' expects macro identifiers",
-			  (i==PP_ELIFNDEF ? "n" : ""));
-		    return 3;	       /* but we did _something_ */
-		}
-		if (smacro_defined(tline->text, 0, NULL))
-		    j = TRUE;
-		tline = tline->next;
-	    }
-	    if (i == PP_ELIFNDEF)
-		j = !j;
-	    istk->conds->state = (j ? COND_IF_TRUE : COND_IF_FALSE);
+	    j = if_condition(tline->next, i);
+	    tline->next = NULL;	       /* it got freed */
+	    free_tlist (origline);
+	    if (j < 0)
+		return 3;
+	    else
+		istk->conds->state = j ? COND_IF_TRUE : COND_IF_FALSE;
 	}
-	return 1;
+	return (istk->conds->state == COND_IF_TRUE ? 5 : 1);
 
       case PP_ELSE:
 	if (tline->next)
@@ -1106,7 +1417,8 @@ static int do_directive (Token *tline) {
 	    istk->conds->state = COND_ELSE_FALSE;
 	else
 	    istk->conds->state = COND_ELSE_TRUE;
-	return 1;
+	free_tlist (origline);
+	return 5;
 
       case PP_ENDIF:
 	if (tline->next)
@@ -1118,6 +1430,7 @@ static int do_directive (Token *tline) {
 	cond = istk->conds;
 	istk->conds = cond->next;
 	nasm_free (cond);
+	free_tlist (origline);
 	return 5;
 
       case PP_MACRO:
@@ -1159,7 +1472,10 @@ static int do_directive (Token *tline) {
 	if (tline && tline->next && tline->next->type == TOK_OTHER &&
 	    !strcmp(tline->next->text, "-")) {
 	    tline = tline->next->next;
-	    if (!tline || tline->type != TOK_NUMBER)
+	    if (tline && tline->type == TOK_OTHER &&
+		!strcmp(tline->text, "*"))
+		defining->nparam_max = INT_MAX;
+	    else if (!tline || tline->type != TOK_NUMBER)
 		error (ERR_NONFATAL|ERR_OFFBY1,
 		       "`%%%smacro' expects a parameter count after `-'",
 		       (i == PP_IMACRO ? "i" : ""));
@@ -1199,21 +1515,16 @@ static int do_directive (Token *tline) {
 	 * Handle default parameters.
 	 */
 	if (tline && tline->next) {
-	    int np, want_np;
-
 	    defining->dlist = tline->next;
 	    tline->next = NULL;
-	    count_mmac_params (defining->dlist, &np, &defining->defaults);
-	    want_np = defining->nparam_max - defining->nparam_min;
-	    defining->defaults = nasm_realloc (defining->defaults,
-					       want_np*sizeof(Token *));
-	    while (np < want_np)
-		defining->defaults[np++] = NULL;
+	    count_mmac_params (defining->dlist, &defining->ndefs,
+			       &defining->defaults);
 	} else {
 	    defining->dlist = NULL;
 	    defining->defaults = NULL;
 	}
 	defining->expansion = NULL;
+	free_tlist (origline);
 	return 1;
 
       case PP_ENDM:
@@ -1227,8 +1538,128 @@ static int do_directive (Token *tline) {
 	defining->next = mmacros[k];
 	mmacros[k] = defining;
 	defining = NULL;
+	free_tlist (origline);
 	return 5;
 
+      case PP_ROTATE:
+	if (tline->next && tline->next->type == TOK_WHITESPACE)
+	    tline = tline->next;
+	t = expand_smacro(tline->next);
+	tline->next = NULL;
+	free_tlist (origline);
+	tline = t;
+	tptr = &t;
+	tokval.t_type = TOKEN_INVALID;
+	evalresult = evaluate (ppscan, tptr, &tokval, NULL, pass, error, NULL);
+	free_tlist (tline);
+	if (!evalresult)
+	    return 3;
+	if (tokval.t_type)
+	    error(ERR_WARNING|ERR_OFFBY1,
+		  "trailing garbage after expression ignored");
+	if (!is_simple(evalresult)) {
+	    error(ERR_NONFATAL|ERR_OFFBY1,
+		  "non-constant value given to `%%rotate'");
+	    return 3;
+	}
+	mmac = istk->mstk;
+	while (mmac && !mmac->name)    /* avoid mistaking %reps for macros */
+	    mmac = mmac->next_active;
+	if (!mmac)
+	    error(ERR_NONFATAL, "`%rotate' invoked outside a macro call");
+	mmac->rotate = mmac->rotate + reloc_value(evalresult);
+	if (mmac->rotate < 0)
+	    mmac->rotate = mmac->nparam - (-mmac->rotate) % mmac->nparam;
+	mmac->rotate %= mmac->nparam;
+	return 1;
+
+      case PP_REP:
+	nolist = FALSE;
+	tline = tline->next;
+	if (tline->next && tline->next->type == TOK_WHITESPACE)
+	    tline = tline->next;
+	if (tline->next && tline->next->type == TOK_ID &&
+	    !nasm_stricmp(tline->next->text, ".nolist")) {
+	    tline = tline->next;
+	    nolist = TRUE;
+	}
+	t = expand_smacro(tline->next);
+	tline->next = NULL;
+	free_tlist (origline);
+	tline = t;
+	tptr = &t;
+	tokval.t_type = TOKEN_INVALID;
+	evalresult = evaluate (ppscan, tptr, &tokval, NULL, pass, error, NULL);
+	free_tlist (tline);
+	if (!evalresult)
+	    return 3;
+	if (tokval.t_type)
+	    error(ERR_WARNING|ERR_OFFBY1,
+		  "trailing garbage after expression ignored");
+	if (!is_simple(evalresult)) {
+	    error(ERR_NONFATAL|ERR_OFFBY1,
+		  "non-constant value given to `%%rep'");
+	    return 3;
+	}
+	defining = nasm_malloc(sizeof(MMacro));
+	defining->name = NULL;	       /* flags this macro as a %rep block */
+	defining->casesense = 0;
+	defining->plus = FALSE;
+	defining->nolist = nolist;
+	defining->in_progress = reloc_value(evalresult) + 1;
+	defining->nparam_min = defining->nparam_max = 0;
+	defining->expansion = NULL;
+	defining->next_active = istk->mstk;
+	return 1;
+
+      case PP_ENDREP:
+	if (!defining) {
+	    error (ERR_NONFATAL|ERR_OFFBY1,
+		   "`%%endrep': no matching `%%rep'");
+	    return 3;
+	}
+
+	/*
+	 * Now we have a "macro" defined - although it has no name
+	 * and we won't be entering it in the hash tables - we must
+	 * push a macro-end marker for it on to istk->expansion.
+	 * After that, it will take care of propagating itself (a
+	 * macro-end marker line for a macro which is really a %rep
+	 * block will cause the macro to be re-expanded, complete
+	 * with another macro-end marker to ensure the process
+	 * continues) until the whole expansion is forcibly removed
+	 * from istk->expansion by a %exitrep.
+	 */
+    	l = nasm_malloc(sizeof(Line));
+	l->next = istk->expansion;
+	l->finishes = defining;
+	l->first = NULL;
+	istk->expansion = l;
+
+	istk->mstk = defining;
+
+	list->uplevel (defining->nolist ? LIST_MACRO_NOLIST : LIST_MACRO);
+	defining = NULL;
+	free_tlist (origline);
+	return 1;		       /* the expansion will line-sync */
+
+      case PP_EXITREP:
+	/*
+	 * We must search along istk->expansion until we hit a
+	 * macro-end marker for a macro with no name. Then we set
+	 * its `in_progress' flag to 0.
+	 */
+	for (l = istk->expansion; l; l = l->next)
+	    if (l->finishes && !l->finishes->name)
+		break;
+
+	if (l->finishes && !l->finishes->name)
+	    l->finishes->in_progress = 0;
+	else
+	    error (ERR_NONFATAL, "`%%exitrep' not within `%%rep' block");
+	free_tlist (origline);
+	return 1;		       /* the end marker will line-sync */
+
       case PP_DEFINE:
       case PP_IDEFINE:
 	tline = tline->next;
@@ -1240,6 +1671,7 @@ static int do_directive (Token *tline) {
 	    error (ERR_NONFATAL|ERR_OFFBY1,
 		   "`%%%sdefine' expects a macro identifier",
 		   (i == PP_IDEFINE ? "i" : ""));
+	    free_tlist (origline);
 	    return 3;
 	}
 	mname = tline->text;
@@ -1271,12 +1703,14 @@ static int do_directive (Token *tline) {
 		if (!tline) {
 		    error (ERR_NONFATAL|ERR_OFFBY1,
 			   "parameter identifier expected");
+		    free_tlist (origline);
 		    return 3;
 		}
 		if (tline->type != TOK_ID) {
 		    error (ERR_NONFATAL|ERR_OFFBY1,
 			   "`%s': parameter identifier expected",
 			   tline->text);
+		    free_tlist (origline);
 		    return 3;
 		}
 		tline->type = TOK_SMAC_PARAM + nparam++;
@@ -1292,6 +1726,7 @@ static int do_directive (Token *tline) {
 		    strcmp(tline->text, ")")) {
 		    error (ERR_NONFATAL|ERR_OFFBY1,
 			   "`)' expected to terminate macro template");
+		    free_tlist (origline);
 		    return 3;
 		}
 		break;
@@ -1348,6 +1783,105 @@ static int do_directive (Token *tline) {
 	smac->nparam = nparam;
 	smac->expansion = macro_start;
 	smac->in_progress = FALSE;
+	free_tlist (origline);
+	return 3;
+
+      case PP_ASSIGN:
+      case PP_IASSIGN:
+	tline = tline->next;
+	if (tline && tline->type == TOK_WHITESPACE)
+	    tline = tline->next;
+	if (!tline || (tline->type != TOK_ID &&
+		       (tline->type != TOK_PREPROC_ID ||
+			tline->text[1] != '$'))) {
+	    error (ERR_NONFATAL|ERR_OFFBY1,
+		   "`%%%sassign' expects a macro identifier",
+		   (i == PP_IASSIGN ? "i" : ""));
+	    free_tlist (origline);
+	    return 3;
+	}
+	mname = tline->text;
+	if (tline->type == TOK_ID) {
+	    p = tline->text;
+	    smhead = &smacros[hash(mname)];
+	} else {
+	    ctx = get_ctx (tline->text);
+	    if (ctx == NULL) {
+		free_tlist (origline);
+		return 3;
+	    } else {
+		p = tline->text+1;
+		p += strspn(p, "$");
+		smhead = &ctx->localmac;
+	    }
+	}
+	last = tline;
+	tline = tline->next;
+	last->next = NULL;
+
+	tline = expand_smacro (tline);
+	t = tline;
+	tptr = &t;
+	tokval.t_type = TOKEN_INVALID;
+	evalresult = evaluate (ppscan, tptr, &tokval, NULL, pass, error, NULL);
+	free_tlist (tline);
+	if (!evalresult) {
+	    free_tlist (origline);
+	    return 3;
+	}
+
+	if (tokval.t_type)
+	    error(ERR_WARNING|ERR_OFFBY1,
+		  "trailing garbage after expression ignored");
+
+	if (!is_simple(evalresult)) {
+	    error(ERR_NONFATAL|ERR_OFFBY1,
+		  "non-constant value given to `%%%sassign'",
+		  (i == PP_IASSIGN ? "i" : ""));
+	    free_tlist (origline);
+	    return 3;
+	}
+
+	macro_start = nasm_malloc(sizeof(*macro_start));
+	macro_start->next = NULL;
+	{
+	    char numbuf[20];
+	    sprintf(numbuf, "%ld", reloc_value(evalresult));
+	    macro_start->text = nasm_strdup(numbuf);
+	}
+	macro_start->mac = NULL;
+	macro_start->type = TOK_NUMBER;
+
+	/*
+	 * We now have a macro name, an implicit parameter count of
+	 * zero, and a numeric token to use as an expansion. Create
+	 * and store an SMacro.
+	 */
+	if (smacro_defined (mname, 0, &smac)) {
+	    if (!smac)
+		error (ERR_WARNING|ERR_OFFBY1,
+		       "single-line macro `%s' defined both with and"
+		       " without parameters", mname);
+	    else {
+		/*
+		 * We're redefining, so we have to take over an
+		 * existing SMacro structure. This means freeing
+		 * what was already in it.
+		 */
+		nasm_free (smac->name);
+		free_tlist (smac->expansion);
+	    }
+	} else {
+	    smac = nasm_malloc(sizeof(SMacro));
+	    smac->next = *smhead;
+	    *smhead = smac;
+	}
+	smac->name = nasm_strdup(p);
+	smac->casesense = (i == PP_ASSIGN);
+	smac->nparam = 0;
+	smac->expansion = macro_start;
+	smac->in_progress = FALSE;
+	free_tlist (origline);
 	return 3;
 
       case PP_LINE:
@@ -1359,6 +1893,7 @@ static int do_directive (Token *tline) {
 	    tline = tline->next;
 	if (!tline || tline->type != TOK_NUMBER) {
 	    error (ERR_NONFATAL|ERR_OFFBY1, "`%%line' expects line number");
+	    free_tlist (origline);
 	    return 3;
 	}
 	k = readnum(tline->text, &j);
@@ -1369,6 +1904,7 @@ static int do_directive (Token *tline) {
 	    if (!tline || tline->type != TOK_NUMBER) {
 		error (ERR_NONFATAL|ERR_OFFBY1,
 		       "`%%line' expects line increment");
+		free_tlist (origline);
 		return 3;
 	    }
 	    m = readnum(tline->text, &j);
@@ -1378,23 +1914,200 @@ static int do_directive (Token *tline) {
 	    tline = tline->next;
 	istk->lineno = k;
 	istk->lineinc = m;
+	update_fileline(3);	       /* update __FILE__ and __LINE__ */
 	if (tline) {
 	    char *s = detoken(tline);
 	    nasm_free (istk->fname);
 	    istk->fname = s;
 	}
+	free_tlist (origline);
 	return 5;
 
       default:
 	error(ERR_FATAL|ERR_OFFBY1,
 	      "preprocessor directive `%s' not yet implemented",
-	      directives[k]);
+	      directives[i]);
 	break;
     }
     return 3;
 }
 
 /*
+ * Ensure that a macro parameter contains a condition code and
+ * nothing else. Return the condition code index if so, or -1
+ * otherwise (including when no parameter, i.e. NULL, is passed).
+ */
+static int find_cc (Token *t) {
+    Token *tt;
+    int i, j, k, m;
+
+    if (t && t->type == TOK_WHITESPACE)
+	t = t->next;
+    if (!t || t->type != TOK_ID)       /* absent or empty parameter */
+	return -1;
+    tt = t->next;
+    if (tt && tt->type == TOK_WHITESPACE)
+	tt = tt->next;
+    if (tt && (tt->type != TOK_OTHER || strcmp(tt->text, ",")))
+	return -1;		       /* something other than `,' follows */
+
+    i = -1;			       /* binary chop over conditions[] */
+    j = sizeof(conditions)/sizeof(*conditions);
+    while (j-i > 1) {
+	k = (j+i) / 2;
+	m = nasm_stricmp(t->text, conditions[k]);
+	if (m == 0) {
+	    i = k;
+	    j = -2;		       /* sentinel: exact match found */
+	    break;
+	} else if (m < 0) {
+	    j = k;
+	} else
+	    i = k;
+    }
+    if (j != -2)
+	return -1;
+    return i;
+}
+
+/*
+ * Expand MMacro-local things in a line, returning the new token list:
+ * parameter references (%0, %n, %+n, %-n) and local labels (%%foo).
+ */
+static Token *expand_mmac_params (Token *tline) {
+    Token *t, *tt, *ttt, **tail, *thead;
+
+    tail = &thead;		       /* where the next output token goes */
+    thead = NULL;
+
+    while (tline) {
+	if (tline->type == TOK_PREPROC_ID &&
+	    (tline->text[1] == '+' || tline->text[1] == '-' ||
+	     tline->text[1] == '%' ||
+	     (tline->text[1] >= '0' && tline->text[1] <= '9'))) {
+	    char *text = NULL;
+	    int type = 0, cc;	       /* type = 0 to placate optimisers */
+	    char tmpbuf[30];
+	    int n, i;
+	    MMacro *mac;
+
+	    t = tline;
+	    tline = tline->next;
+
+	    mac = istk->mstk;
+	    while (mac && !mac->name)  /* avoid mistaking %reps for macros */
+		mac = mac->next_active;
+	    if (!mac)
+		error(ERR_NONFATAL, "`%s': not in a macro call", t->text);
+	    else switch (t->text[1]) {
+		/*
+		 * We have to make a substitution of one of the
+		 * forms %1, %-1, %+1, %%foo, %0.
+		 */
+	      case '0':
+		type = TOK_NUMBER;
+		sprintf(tmpbuf, "%d", mac->nparam);
+		text = nasm_strdup(tmpbuf);
+		break;
+	      case '%':
+		type = TOK_ID;
+		sprintf(tmpbuf, "..@%lu.", mac->unique);
+		text = nasm_malloc(strlen(tmpbuf)+strlen(t->text+2)+1);
+		strcpy(text, tmpbuf);
+		strcat(text, t->text+2);
+		break;
+	      case '-':
+		n = atoi(t->text+2)-1;
+		if (n < 0 || n >= mac->nparam)
+		    tt = NULL;	       /* no such parameter */
+		else {
+		    if (mac->nparam > 1)
+			n = (n + mac->rotate) % mac->nparam;
+		    tt = mac->params[n];
+		}
+		cc = tt ? find_cc (tt) : -1;
+		if (cc == -1) {
+		    error (ERR_NONFATAL|ERR_OFFBY1,
+			   "macro parameter %d is not a condition code",
+			   n+1);
+		    text = NULL;
+		} else {
+		    type = TOK_ID;
+		    if (inverse_ccs[cc] == -1) {
+			error (ERR_NONFATAL|ERR_OFFBY1,
+			       "condition code `%s' is not invertible",
+			       conditions[cc]);
+			text = NULL;
+		    } else
+			text = nasm_strdup(conditions[inverse_ccs[cc]]);
+		}
+		break;
+	      case '+':
+		n = atoi(t->text+2)-1;
+		if (n < 0 || n >= mac->nparam)
+		    tt = NULL;	       /* no such parameter */
+		else {
+		    if (mac->nparam > 1)
+			n = (n + mac->rotate) % mac->nparam;
+		    tt = mac->params[n];
+		}
+		cc = tt ? find_cc (tt) : -1;
+		if (cc == -1) {
+		    error (ERR_NONFATAL|ERR_OFFBY1,
+			   "macro parameter %d is not a condition code",
+			   n+1);
+		    text = NULL;
+		} else {
+		    type = TOK_ID;
+		    text = nasm_strdup(conditions[cc]);
+		}
+		break;
+	      default:
+		n = atoi(t->text+1)-1;
+		if (n < 0 || n >= mac->nparam)
+		    tt = NULL;	       /* expands to nothing */
+		else {
+		    if (mac->nparam > 1)
+			n = (n + mac->rotate) % mac->nparam;
+		    tt = mac->params[n];
+		}
+		if (tt) {	       /* append a copy of the parameter */
+		    for (i=0; i<mac->paramlen[n]; i++) {
+			ttt = *tail = nasm_malloc(sizeof(Token));
+			ttt->next = NULL;
+			tail = &ttt->next;
+			ttt->type = tt->type;
+			ttt->text = nasm_strdup(tt->text);
+			ttt->mac = NULL;
+			tt = tt->next;
+		    }
+		}
+		text = NULL;       /* we've done it here */
+		break;
+	    }
+	    nasm_free (t->text);       /* the %-token itself is consumed */
+	    nasm_free (t);
+	    if (text) {		       /* single replacement token, if any */
+		t = *tail = nasm_malloc(sizeof(Token));
+		t->next = NULL;
+		tail = &t->next;
+		t->type = type;
+		t->text = text;
+		t->mac = NULL;
+	    }
+	    continue;
+	} else {		       /* ordinary token: move it across */
+	    t = *tail = tline;
+	    tline = tline->next;
+	    t->mac = NULL;
+	    t->next = NULL;
+	    tail = &t->next;
+	}
+    }
+    return thead;
+}
+
+/*
  * Expand all single-line macro calls made in the given line.
  * Return the expanded version of the line. The original is deemed
  * to be destroyed in the process. (In reality we'll just move
@@ -1453,6 +2166,7 @@ static Token *expand_smacro (Token *tline) {
 		}
 	    }
 	}
+
 	if (!tline)
 	    break;
 	/*
@@ -1650,44 +2364,6 @@ static Token *expand_smacro (Token *tline) {
 }
 
 /*
- * Ensure that a macro parameter contains a condition code and
- * nothing else. Return the condition code index if so, or -1
- * otherwise.
- */
-static int find_cc (Token *t) {
-    Token *tt;
-    int i, j, k, m;
-
-    if (t && t->type == TOK_WHITESPACE)
-	t = t->next;
-    if (t->type != TOK_ID)
-	return -1;
-    tt = t->next;
-    if (tt && tt->type == TOK_WHITESPACE)
-	tt = tt->next;
-    if (tt && (tt->type != TOK_OTHER || strcmp(tt->text, ",")))
-	return -1;
-
-    i = -1;
-    j = sizeof(conditions)/sizeof(*conditions);
-    while (j-i > 1) {
-	k = (j+i) / 2;
-	m = nasm_stricmp(t->text, conditions[k]);
-	if (m == 0) {
-	    i = k;
-	    j = -2;
-	    break;
-	} else if (m < 0) {
-	    j = k;
-	} else
-	    i = k;
-    }
-    if (j != -2)
-	return -1;
-    return i;
-}
-
-/*
  * Determine whether the given line constitutes a multi-line macro
  * call, and return the MMacro structure called if so. Doesn't have
  * to check for an initial label - that's taken care of in
@@ -1741,22 +2417,29 @@ static MMacro *is_mmacro (Token *tline, Token ***params_array) {
 	     * It's right, and we can use it. Add its default
 	     * parameters to the end of our list if necessary.
 	     */
-	    params = nasm_realloc (params, (m->nparam_max+1)*sizeof(*params));
-	    if (m->defaults) {
-		while (nparam < m->nparam_max) {
+	    if (m->defaults && nparam < m->nparam_min + m->ndefs) {
+		params = nasm_realloc (params, ((m->nparam_min+m->ndefs+1) *
+						sizeof(*params)));
+		while (nparam < m->nparam_min + m->ndefs) {
 		    params[nparam] = m->defaults[nparam - m->nparam_min];
 		    nparam++;
 		}
-	    } else {
-		while (nparam < m->nparam_max) {
-		    params[nparam] = NULL;
-		    nparam++;
-		}
 	    }
 	    /*
+	     * If we've gone over the maximum parameter count (and
+	     * we're in Plus mode), ignore parameters beyond
+	     * nparam_max.
+	     */
+	    if (m->plus && nparam > m->nparam_max)
+		nparam = m->nparam_max;
+	    /*
 	     * Then terminate the parameter list, and leave.
 	     */
-	    params[m->nparam_max] = NULL;
+	    if (!params) {	       /* need this special case */
+		params = nasm_malloc(sizeof(*params));
+		nparam = 0;
+	    }
+	    params[nparam] = NULL;
 	    *params_array = params;
 	    return m;
 	}
@@ -1787,10 +2470,10 @@ static MMacro *is_mmacro (Token *tline, Token ***params_array) {
  * line sync is needed (2 if it is). Otherwise return 0.
  */
 static int expand_mmacro (Token *tline) {
-    Token *label = NULL, **params, *t, *tt, *ttt, *last = NULL;
+    Token *label = NULL, **params, *t, *tt, *last = NULL;
     MMacro *m = NULL;
     Line *l, *ll;
-    int i, n, nparam, *paramlen;
+    int i, nparam, *paramlen;
     int need_sync = FALSE;
 
     t = tline;
@@ -1870,23 +2553,33 @@ static int expand_mmacro (Token *tline) {
     /*
      * OK, we have a MMacro structure together with a set of
      * parameters. We must now go through the expansion and push
-     * _copies_ of each Line on to istk->expansion, having first
-     * substituted for most % tokens (%1, %+1, %-1, %%foo). Note
-     * that %$bar, %$$baz, %$$$quux, and so on, do not get
-     * substituted here but rather have to wait until the
-     * single-line macro substitution process. This is because they
-     * don't just crop up in macro definitions, but can appear
-     * anywhere they like.
+     * copies of each Line on to istk->expansion. Substitution of
+     * parameter tokens and macro-local tokens doesn't get done
+     * until the single-line macro substitution process; this is
+     * because delaying them allows us to change the semantics
+     * later through %rotate.
      *
-     * First, push an end marker on to istk->expansion, and mark
-     * this macro as in progress.
+     * First, push an end marker on to istk->expansion, mark this
+     * macro as in progress, and set up its invocation-specific
+     * variables.
      */
     ll = nasm_malloc(sizeof(Line));
     ll->next = istk->expansion;
     ll->finishes = m;
     ll->first = NULL;
     istk->expansion = ll;
+
     m->in_progress = TRUE;
+    m->params = params;
+    m->iline = tline;
+    m->nparam = nparam;
+    m->rotate = 0;
+    m->paramlen = paramlen;
+    m->unique = unique++;
+
+    m->next_active = istk->mstk;
+    istk->mstk = m;
+
     for (l = m->expansion; l; l = l->next) {
 	Token **tail;
 
@@ -1897,90 +2590,12 @@ static int expand_mmacro (Token *tline) {
 	tail = &ll->first;
 
 	for (t = l->first; t; t = t->next) {
-	    char *text;
-	    int type = 0, cc;	       /* type = 0 to placate optimisers */
-	    char tmpbuf[30];
-
-	    if (t->type == TOK_PREPROC_ID &&
-		(t->text[1] == '+' || t->text[1] == '-' ||
-		 t->text[1] == '%' ||
-		 (t->text[1] >= '0' && t->text[1] <= '9'))) {
-		/*
-		 * We have to make a substitution of one of the
-		 * forms %1, %-1, %+1, %%foo.
-		 */
-		switch (t->text[1]) {
-		  case '%':
-		    type = TOK_ID;
-		    sprintf(tmpbuf, "..@%lu.", unique);
-		    text = nasm_malloc(strlen(tmpbuf)+strlen(t->text+2)+1);
-		    strcpy(text, tmpbuf);
-		    strcat(text, t->text+2);
-		    break;
-		  case '-':
-		    n = atoi(t->text+2)-1;
-		    tt = params[n];
-		    cc = find_cc (tt);
-		    if (cc == -1) {
-			error (ERR_NONFATAL|ERR_OFFBY1,
-			       "macro parameter %d is not a condition code",
-			       n+1);
-			text = NULL;
-		    } else {
-			type = TOK_ID;
-			if (inverse_ccs[cc] == -1) {
-			    error (ERR_NONFATAL|ERR_OFFBY1,
-				   "condition code `%s' is not invertible",
-				   conditions[cc]);
-			    text = NULL;
-			} else
-			    text = nasm_strdup(conditions[inverse_ccs[cc]]);
-		    }
-		    break;
-		  case '+':
-		    n = atoi(t->text+2)-1;
-		    tt = params[n];
-		    cc = find_cc (tt);
-		    if (cc == -1) {
-			error (ERR_NONFATAL|ERR_OFFBY1,
-			       "macro parameter %d is not a condition code",
-			       n+1);
-			text = NULL;
-		    } else {
-			type = TOK_ID;
-			text = nasm_strdup(conditions[cc]);
-		    }
-		    break;
-		  default:
-		    n = atoi(t->text+1)-1;
-		    if (n < nparam) {
-			ttt = params[n];
-			for (i=0; i<paramlen[n]; i++) {
-			    tt = *tail = nasm_malloc(sizeof(Token));
-			    tt->next = NULL;
-			    tail = &tt->next;
-			    tt->type = ttt->type;
-			    tt->text = nasm_strdup(ttt->text);
-			    tt->mac = NULL;
-			    ttt = ttt->next;
-			}
-		    }
-		    text = NULL;       /* we've done it here */
-		    break;
-		}
-	    } else {
-		type = t->type;
-		text = nasm_strdup(t->text);
-	    }
-
-	    if (text) {
-		tt = *tail = nasm_malloc(sizeof(Token));
-		tt->next = NULL;
-		tail = &tt->next;
-		tt->type = type;
-		tt->text = text;
-		tt->mac = NULL;
-	    }
+	    tt = *tail = nasm_malloc(sizeof(Token));
+	    tt->next = NULL;
+	    tail = &tt->next;
+	    tt->type = t->type;
+	    tt->text = nasm_strdup(t->text);
+	    tt->mac = NULL;
 	}
 
 	istk->expansion = ll;
@@ -1996,20 +2611,13 @@ static int expand_mmacro (Token *tline) {
 	istk->expansion->first = label;
     }
 
-    /*
-     * Clean up.
-     */
-    unique++;
-    nasm_free (paramlen);
-    nasm_free (params);
-    free_tlist (tline);
-
     list->uplevel (m->nolist ? LIST_MACRO_NOLIST : LIST_MACRO);
 
     return need_sync ? 2 : 1;
 }
 
-static void pp_reset (char *file, efunc errfunc, ListGen *listgen) {
+static void pp_reset (char *file, int apass, efunc errfunc, evalfunc eval,
+		      ListGen *listgen) {
     int h;
 
     error = errfunc;
@@ -2019,6 +2627,7 @@ static void pp_reset (char *file, efunc errfunc, ListGen *listgen) {
     istk->next = NULL;
     istk->conds = NULL;
     istk->expansion = NULL;
+    istk->mstk = NULL;
     istk->fp = fopen(file, "r");
     istk->fname = nasm_strdup(file);
     istk->lineno = istk->lineinc = 1;
@@ -2031,7 +2640,10 @@ static void pp_reset (char *file, efunc errfunc, ListGen *listgen) {
     }
     unique = 0;
     stdmacpos = stdmac;
+    any_extrastdmac = (extrastdmac != NULL);
     list = listgen;
+    evaluate = eval;
+    pass = apass;
 }
 
 static char *pp_getline (void) {
@@ -2053,12 +2665,65 @@ static char *pp_getline (void) {
 	tline = NULL;
 	while (istk->expansion && istk->expansion->finishes) {
 	    Line *l = istk->expansion;
-	    l->finishes->in_progress = FALSE;
-	    istk->expansion = l->next;
-	    nasm_free (l);
-	    list->downlevel (LIST_MACRO);
-	    if (!istk->expansion)
+	    if (!l->finishes->name && l->finishes->in_progress > 1) {
+		Line *ll;
+
+		/*
+		 * This is a macro-end marker for a macro with no
+		 * name, which means it's not really a macro at all
+		 * but a %rep block, and the `in_progress' field is
+		 * more than 1, meaning that we still need to
+		 * repeat. (1 means the natural last repetition; 0
+		 * means termination by %exitrep.) We have
+		 * therefore expanded up to the %endrep, and must
+		 * push the whole block on to the expansion buffer
+		 * again. We don't bother to remove the macro-end
+		 * marker: we'd only have to generate another one
+		 * if we did.
+		 */
+		l->finishes->in_progress--;
+		for (l = l->finishes->expansion; l; l = l->next) {
+		    Token *t, *tt, **tail;
+
+		    ll = nasm_malloc(sizeof(Line));
+		    ll->next = istk->expansion;
+		    ll->finishes = NULL;
+		    ll->first = NULL;
+		    tail = &ll->first;
+
+		    for (t = l->first; t; t = t->next) {
+			if (t->text) {
+			    tt = *tail = nasm_malloc(sizeof(Token));
+			    tt->next = NULL;
+			    tail = &tt->next;
+			    tt->type = t->type;
+			    tt->text = nasm_strdup(t->text);
+			    tt->mac = NULL;
+			}
+		    }
+
+		    istk->expansion = ll;
+		}
 		line_sync();
+	    } else {
+		if (istk->mstk->name) {
+		    /*
+		     * This was a real macro call, not a %rep, and
+		     * therefore the parameter information needs to
+		     * be freed.
+		     */
+		    nasm_free(istk->mstk->params);
+		    free_tlist(istk->mstk->iline);
+		    nasm_free(istk->mstk->paramlen);
+		}
+		istk->mstk = istk->mstk->next_active;
+		l->finishes->in_progress = FALSE;
+		istk->expansion = l->next;
+		nasm_free (l);
+		list->downlevel (LIST_MACRO);
+		if (!istk->expansion)
+		    line_sync();
+	    }
 	}
 	if (istk->expansion) {
 	    char *p;
@@ -2091,6 +2756,7 @@ static char *pp_getline (void) {
 		    return NULL;
 		else
 		    line_sync();
+		update_fileline(3);    /* update __FILE__ and __LINE__ */
 		line = read_line();
 	    }
 	    line = prepreproc(line);
@@ -2099,11 +2765,21 @@ static char *pp_getline (void) {
 	}
 
 	/*
+	 * We must expand MMacro parameters and MMacro-local labels
+	 * _before_ we plunge into directive processing, to cope
+	 * with things like `%define something %1' such as STRUC
+	 * uses. Unless we're _defining_ a MMacro, in which case
+	 * those tokens should be left alone to go into the
+	 * definition.
+	 */
+	if (!defining)
+	    tline = expand_mmac_params(tline);
+
+	/*
 	 * Check the line to see if it's a preprocessor directive.
 	 */
 	ret = do_directive(tline);
 	if (ret & 1) {
-	    free_tlist (tline);
 	    if (ret & 4)
 		line_sync();
 	    if ((ret & 2) && !stdmacpos) {/* give a blank line to the output */
@@ -2135,6 +2811,17 @@ static char *pp_getline (void) {
 	     */
 	    free_tlist(tline);
 	    continue;
+	} else if (istk->mstk && !istk->mstk->in_progress) {
+	    /*
+	     * We're in a %rep block which has been terminated, so
+	     * we're walking through to the %endrep without
+	     * emitting anything. Emit nothing at all, not even a
+	     * blank line: when we emerge from the %rep block we'll
+	     * give a line-number directive so we keep our place
+	     * correctly.
+	     */
+	    free_tlist(tline);
+	    continue;
 	} else {
 	    tline = expand_smacro(tline);
 	    ret = expand_mmacro(tline);
@@ -2189,6 +2876,7 @@ static void pp_cleanup (void) {
 	    mmacros[h] = mmacros[h]->next;
 	    nasm_free (m->name);
 	    free_tlist (m->dlist);
+	    nasm_free (m->defaults);
 	    free_llist (m->expansion);
 	    nasm_free (m);
 	}
@@ -2275,6 +2963,10 @@ void pp_pre_define (char *definition) {
     predef = l;
 }
 
+void pp_extra_stdmac (char **macros) {
+    extrastdmac = macros;
+}
+
 Preproc nasmpp = {
     pp_reset,
     pp_getline,
diff --git a/preproc.h b/preproc.h
index fea3e8cb..c70548f0 100644
--- a/preproc.h
+++ b/preproc.h
@@ -12,6 +12,7 @@
 void pp_include_path (char *);
 void pp_pre_include (char *);
 void pp_pre_define (char *);
+void pp_extra_stdmac (char **);
 
 extern Preproc nasmpp;
 
diff --git a/rdoff/Makefile.in b/rdoff/Makefile.in
new file mode 100644
index 00000000..5ab409de
--- /dev/null
+++ b/rdoff/Makefile.in
@@ -0,0 +1,74 @@
+#
+# Auto-configuring Makefile for RDOFF object file utils; part of the
+# Netwide Assembler
+#
+# The Netwide Assembler is copyright (C) 1996 Simon Tatham and
+# Julian Hall. All rights reserved. The software is
+# redistributable under the licence given in the file "Licence"
+# distributed in the NASM archive.
+
+top_srcdir = @top_srcdir@
+srcdir = @srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+bindir = @bindir@
+mandir = @mandir@
+
+CC = @CC@
+CFLAGS = @CFLAGS@ @GCCFLAGS@ -I$(top_srcdir)
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+LN_S = @LN_S@
+
+LDRDFLIBS = rdoff.o nasmlib.o symtab.o collectn.o rdlib.o
+RDXLIBS = rdoff.o rdfload.o symtab.o collectn.o
+
+.c.o:
+	$(CC) -c $(CFLAGS) $<
+
+all: rdfdump ldrdf rdx rdflib rdf2bin rdf2com
+
+rdfdump: rdfdump.o
+	$(CC) -o rdfdump rdfdump.o
+
+ldrdf: ldrdf.o $(LDRDFLIBS)
+	$(CC) -o ldrdf ldrdf.o $(LDRDFLIBS)
+rdx: rdx.o $(RDXLIBS)
+	$(CC) -o rdx rdx.o $(RDXLIBS)
+rdflib: rdflib.o
+	$(CC) -o rdflib rdflib.o
+rdf2bin: rdf2bin.o $(RDXLIBS) nasmlib.o
+	$(CC) -o rdf2bin rdf2bin.o $(RDXLIBS) nasmlib.o
+rdf2com:
+	$(LN_S) rdf2bin rdf2com
+
+rdf2bin.o: rdf2bin.c
+rdfdump.o: rdfdump.c
+rdoff.o: rdoff.c rdoff.h
+ldrdf.o: ldrdf.c rdoff.h $(top_srcdir)/nasmlib.h symtab.h collectn.h rdlib.h
+symtab.o: symtab.c symtab.h
+collectn.o: collectn.c collectn.h
+rdx.o: rdx.c rdoff.h rdfload.h symtab.h
+rdfload.o: rdfload.c rdfload.h rdoff.h collectn.h symtab.h
+rdlib.o: rdlib.c rdlib.h
+rdflib.o: rdflib.c
+
+nasmlib.o: $(top_srcdir)/nasmlib.c
+	$(CC) -c $(CFLAGS) $(top_srcdir)/nasmlib.c
+
+clean:
+	rm -f *.o rdfdump ldrdf rdx rdflib rdf2bin rdf2com
+
+spotless: clean
+	rm -f Makefile
+
+install: rdfdump ldrdf rdx rdflib rdf2bin rdf2com
+	$(INSTALL_PROGRAM) rdfdump $(bindir)/rdfdump
+	$(INSTALL_PROGRAM) ldrdf $(bindir)/ldrdf
+	$(INSTALL_PROGRAM) rdx $(bindir)/rdx
+	$(INSTALL_PROGRAM) rdflib $(bindir)/rdflib
+	$(INSTALL_PROGRAM) rdf2bin $(bindir)/rdf2bin
+	cd $(bindir) && rm -f rdf2com && $(LN_S) rdf2bin rdf2com # drop any stale link so re-install works
diff --git a/rdoff/Makefile.unx b/rdoff/Makefile.unx
new file mode 100644
index 00000000..f1558396
--- /dev/null
+++ b/rdoff/Makefile.unx
@@ -0,0 +1,73 @@
+# Generated automatically from Makefile.in by configure.
+#
+# Auto-configuring Makefile for RDOFF object file utils; part of the
+# Netwide Assembler
+#
+# The Netwide Assembler is copyright (C) 1996 Simon Tatham and
+# Julian Hall. All rights reserved. The software is
+# redistributable under the licence given in the file "Licence"
+# distributed in the NASM archive.
+
+# You may need to adjust these values.
+
+prefix = /usr/local
+CC = cc
+CFLAGS = -O -I..
+
+# You _shouldn't_ need to adjust anything below this line.
+
+exec_prefix = ${prefix}
+bindir = ${exec_prefix}/bin
+mandir = ${prefix}/man
+
+INSTALL = /usr/bin/install -c
+INSTALL_PROGRAM = ${INSTALL}
+INSTALL_DATA = ${INSTALL} -m 644
+LN_S = ln -s
+
+LDRDFLIBS = rdoff.o nasmlib.o symtab.o collectn.o rdlib.o
+RDXLIBS = rdoff.o rdfload.o symtab.o collectn.o
+
+.c.o:
+	$(CC) -c $(CFLAGS) $*.c
+
+all: rdfdump ldrdf rdx rdflib rdf2bin rdf2com
+
+rdfdump: rdfdump.o
+	$(CC) -o rdfdump rdfdump.o
+
+ldrdf: ldrdf.o $(LDRDFLIBS)
+	$(CC) -o ldrdf ldrdf.o $(LDRDFLIBS)
+rdx: rdx.o $(RDXLIBS)
+	$(CC) -o rdx rdx.o $(RDXLIBS)
+rdflib: rdflib.o
+	$(CC) -o rdflib rdflib.o
+rdf2bin: rdf2bin.o $(RDXLIBS) nasmlib.o
+	$(CC) -o rdf2bin rdf2bin.o $(RDXLIBS) nasmlib.o
+rdf2com:
+	$(LN_S) rdf2bin rdf2com
+
+rdf2bin.o: rdf2bin.c
+rdfdump.o: rdfdump.c
+rdoff.o: rdoff.c rdoff.h
+ldrdf.o: ldrdf.c rdoff.h ../nasmlib.h symtab.h collectn.h rdlib.h
+symtab.o: symtab.c symtab.h
+collectn.o: collectn.c collectn.h
+rdx.o: rdx.c rdoff.h rdfload.h symtab.h
+rdfload.o: rdfload.c rdfload.h rdoff.h collectn.h symtab.h
+rdlib.o: rdlib.c rdlib.h
+rdflib.o: rdflib.c
+
+nasmlib.o: ../nasmlib.c ../nasmlib.h ../names.c ../nasm.h
+	$(CC) -c $(CFLAGS) ../nasmlib.c
+
+clean:
+	rm -f *.o rdfdump ldrdf rdx rdflib rdf2bin rdf2com
+
+install: rdfdump ldrdf rdx rdflib rdf2bin rdf2com
+	$(INSTALL_PROGRAM) rdfdump $(bindir)/rdfdump
+	$(INSTALL_PROGRAM) ldrdf $(bindir)/ldrdf
+	$(INSTALL_PROGRAM) rdx $(bindir)/rdx
+	$(INSTALL_PROGRAM) rdflib $(bindir)/rdflib
+	$(INSTALL_PROGRAM) rdf2bin $(bindir)/rdf2bin
+	cd $(bindir) && rm -f rdf2com && $(LN_S) rdf2bin rdf2com # drop any stale link so re-install works
diff --git a/rdoff/ldrdf.c b/rdoff/ldrdf.c
index e2541fa4..9e4a215d 100644
--- a/rdoff/ldrdf.c
+++ b/rdoff/ldrdf.c
@@ -24,7 +24,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "nasm.h"
 #include "rdoff.h"
 #include "nasmlib.h"
 #include "symtab.h"
@@ -419,8 +418,11 @@ void link_segments(void)
 		relto = r->r.segment == 0 ? mod->coderel : mod->datarel;
 	    }
 	    else
+	    {
 		bRelative = 0;		/* non-relative - need to relocate
 					 * at load time			*/
+		relto = 0;	       /* placate optimiser warnings */
+	    }
 
 	    /* calculate absolute offset of reference, not rel to beginning of
 	       segment */
diff --git a/rdoff/rdf.doc b/rdoff/rdf.doc
new file mode 100644
index 00000000..300c2bc5
--- /dev/null
+++ b/rdoff/rdf.doc
@@ -0,0 +1,99 @@
+RDOFF: Relocatable Dynamically-linked Object File Format
+========================================================
+
+RDOFF was designed initially to test the object-file production
+interface to NASM. It soon became apparent that it could be enhanced
+for use in serious applications due to its simplicity; code to load
+and execute an RDOFF object module is very simple. It also contains
+enhancements to allow it to be linked with a dynamic link library at
+either run- or load- time, depending on how complex you wish to make
+your loader.
+
+The RDOFF format (version 1.1, as produced by NASM v0.91) is defined
+as follows:
+
+The first six bytes of the file contain the string 'RDOFF1'. Other
+versions of the format may contain a last character other than
+'1' - all little endian versions of the file will always contain an
+ASCII character with value greater than 32. If RDOFF is used on a
+big-endian machine at some point in the future, the version will be
+encoded in decimal rather than ASCII, so will be below 32.
+
+All multi-byte fields following this are encoded in either little- or
+big-endian format depending on the system described by this version
+information. Object files should be encoded in the endianness of
+their target machine; files of incorrect endianness will be rejected
+by the loader - this means that loaders do not need to convert
+endianness, as RDOFF has been designed with simplicity of loading at
+the forefront of the design requirements.
+
+The next 4 byte field is the length of the header in bytes. The
+header consists of a sequence of variable length records. Each
+record's type is identified by the first byte of the record. Record
+types 1-4 are currently supported. Record type 5 will be added in
+the near future, when I implement BSS segments. Record type 6 may be
+to do with debugging, when I get debugging implemented.
+
+Type 1: Relocation
+==================
+
+Offset  Length  Description
+0       1       Type (contains 1)
+1       1       Segment that contains reference (0 = text, 1 = data)
+                Add 64 to this number to indicate a relative linkage
+                to an external symbol (see notes)
+2       4       Offset of reference
+6       1       Length of reference (1,2 or 4 bytes)
+7       2       Segment to which reference is made (0 = text, 1 =
+                data, 2 = BSS [when implemented]) others are external
+                symbols.
+
+Total length = 9 bytes
+
+Type 2: Symbol Import
+=====================
+
+0       1       Type (2)
+1       2       Segment number that will be used in references to this
+                symbol.
+3       ?       Null terminated string containing label (up to 32
+                chars) to match against exports in linkage.
+
+Type 3: Symbol Export
+=====================
+
+0       1       Type (3)
+1       1       Segment containing object to be exported (0/1/2)
+2       4       Offset within segment
+6       ?       Null terminated string containing label to export (32
+                char maximum length)
+
+Type 4: Dynamic Link Library
+============================
+
+0       1       Type (4)
+1       ?       Library name (up to 128 chars)
+
+Type 5: Reserve BSS
+===================
+
+0       1       Type (5)
+1       4       Amount of BSS space to reserve in bytes
+
+Total length: 5 bytes
+
+-----------------------------------------------------------------------------
+
+Following the header is the text (code) segment. This is preceded by
+a 4-byte integer, which is its length in bytes. This is followed by
+the length of the data segment (also 4 bytes), and finally the data
+segment.
+
+Notes
+=====
+
+Relative linking: The number stored at the address is offset
+required from the imported symbol, with the address of the end of
+the instruction subtracted from it. This means that the linker can
+simply add the address of the label relative to the beginning of the
+current segment to it.
diff --git a/rdoff/rdfdump.c b/rdoff/rdfdump.c
index bc55a974..080c2e73 100644
--- a/rdoff/rdfdump.c
+++ b/rdoff/rdfdump.c
@@ -57,7 +57,7 @@ void print_header(long length) {
     case 3:             /* export record */
       fread(&s,1,1,infile);
       fread(&o,4,1,infile);
-      l = 0;
+      ll = 0;
       do {
 	fread(&buf[ll],1,1,infile);
       } while (buf[ll++]);
@@ -65,7 +65,7 @@ void print_header(long length) {
       length -= ll + 6;
       break;
     case 4:		/* DLL record */
-      l = 0;
+      ll = 0;
       do {
 	fread(&buf[ll],1,1,infile);
       } while (buf[ll++]);
@@ -88,6 +88,7 @@ int main(int argc,char **argv) {
   char id[7];
   long l;
   int verbose = 0;
+  long offset;
 
   puts("RDOFF Dump utility v1.1 (C) Copyright 1996 Julian R Hall");
 
@@ -133,9 +134,15 @@ int main(int argc,char **argv) {
   fread(&l,4,1,infile);
   l = translatelong(l);
   printf("\nText segment length = %ld bytes\n",l);
+  offset = 0;
   while(l--) {
     fread(id,1,1,infile);
-    if (verbose) printf("  %02x",(int) (unsigned char)id[0]);
+    if (verbose) {
+      if (offset % 16 == 0)
+	printf("\n%08lx ", offset);
+      printf(" %02x",(int) (unsigned char)id[0]);
+      offset++;
+    }
   }
   if (verbose) printf("\n\n");
 
@@ -145,9 +152,13 @@ int main(int argc,char **argv) {
 
   if (verbose)
   {
+    offset = 0;
     while (l--) {
       fread(id,1,1,infile);
-      printf("  %02x",(int) (unsigned char) id[0]);
+      if (offset % 16 == 0)
+	printf("\n%08lx ", offset);
+      printf(" %02x",(int) (unsigned char) id[0]);
+      offset++;
     }
     printf("\n");
   }
diff --git a/rdoff/test/Makefile b/rdoff/test/Makefile
new file mode 100644
index 00000000..8e9f42e2
--- /dev/null
+++ b/rdoff/test/Makefile
@@ -0,0 +1,2 @@
+clean:
+	rm -f *.rdf *.rdx
diff --git a/rdoff/test/makelib b/rdoff/test/makelib
new file mode 100644
index 00000000..baa46766
--- /dev/null
+++ b/rdoff/test/makelib
@@ -0,0 +1,14 @@
+
+LIBNAME=$1			# first argument: the library name
+
+if [ -z "$LIBNAME" ]; then	# nothing to work on: report usage and stop
+	echo 'Usage: makelib <library name> <module> [...]' >&2; exit 1
+fi
+shift				# the remaining arguments are the modules
+
+rdflib c "$LIBNAME"
+
+for FILE in "$@"; do		# quoted so that odd file names are not split
+	rdflib a "$LIBNAME" "$FILE" "$FILE"
+done
+  
diff --git a/rdoff/test/rdftest1.asm b/rdoff/test/rdftest1.asm
new file mode 100644
index 00000000..76f1e43e
--- /dev/null
+++ b/rdoff/test/rdftest1.asm
@@ -0,0 +1,54 @@
+	;; program to test RDOFF production and linkage
+
+	;; items to test include:
+	;;	[1] relocation within the same segment in each module
+	;;	[2] relocation to different segments in same module
+	;;	[3] relocation to same segment in different module
+	;;	[4] relocation to different segment in different module
+	;;	[5] relative relocation to same module
+	;;	[6] relative relocation to different module
+	;;	[7] correct generation of BSS addresses
+
+[SECTION .text]
+[BITS 32]
+	
+_main:
+	mov ax,localdata	; [2] (16 bit) => 66 b8 0000
+	mov eax,localdata2	; [2] (32 bit) => b8 0000000a
+
+[EXTERN _fardata]
+
+	mov eax,[_fardata]	; [4] => a1 00000000 (+20)
+	mov cx,next		; [1] => 66 b9 0012
+next:
+	call localproc		; [5] => e8 00000019
+
+[EXTERN _farproc]
+	mov eax,_farproc	; [3] => b8 00000000 (+40+0)
+	call _farproc		; [6] => e8 -$ (-0+40+0) (=1f)
+
+	mov eax,localbss	; [7] => b8 00000000
+
+[GLOBAL _term]
+_term:	xor ax,ax		; => 66 31 c0
+	int 21h			; => cd 21
+	jmp _term		; => e9 -0a (=fffffff6)
+
+localproc:	
+	ret			; => c3
+
+[GLOBAL _test1proc]
+_test1proc:
+	call localproc		; [5] => e8 -$ (-0+0+?) (=-6=fffffffa)
+	ret			; => c3
+			
+[SECTION .data]
+[GLOBAL localdata2]
+localdata:	db 'localdata',0
+localdata2:	db 'localdata2',0
+farref:		dd _fardata	; [3] => 0 (+20)
+localref:	dd _main	; [2] => 0 (+0)
+
+[SECTION .bss]
+localbss:	resw 4		; reserve 8 bytes BSS
+	
+\ No newline at end of file
diff --git a/rdoff/test/rdftest2.asm b/rdoff/test/rdftest2.asm
new file mode 100644
index 00000000..25b8c189
--- /dev/null
+++ b/rdoff/test/rdftest2.asm
@@ -0,0 +1,33 @@
+	;; rdftest2.asm - test linkage and generation of RDOFF files
+
+[SECTION .text]
+[BITS 32]
+
+[GLOBAL _farproc]
+[EXTERN _test1proc]
+[EXTERN localdata2]
+[EXTERN _term]
+_farproc:
+	
+	mov bx,localdata2	; [4] 0 => 66 bb 000a(+0)
+	mov eax,_term		; [3] 5 => b8 00000000(+26+0)
+	call _test1proc		; [6] A => e8 fffffff2(-40+0+31)(=ffffffe3)
+
+	mov eax,_farproc	; [1] => b8 00000000(+40)
+	add eax,[_fardata]	; [2] => 03 05 00000000(+20)
+
+	mov ebx,mybssdata	; [7] => bb 00000000(+08)
+	call myproc		; [5] => e8 00000001
+	ret
+
+myproc:
+	add eax,ebx
+	ret
+	
+[SECTION .data]
+[GLOBAL _fardata]
+_fardata:	dw _term	; [4]
+_localref:	dd _farproc	; [2]
+
+[SECTION .bss]
+mybssdata:	resw 1
diff --git a/rdoff/test/rdtlib.asm b/rdoff/test/rdtlib.asm
new file mode 100644
index 00000000..6c2b8ec9
--- /dev/null
+++ b/rdoff/test/rdtlib.asm
@@ -0,0 +1,48 @@
+	;; library functions for rdtmain - test of rdx linking and execution
+
+	;; library function = _strcmp, defined as in C
+
+[SECTION .text]
+[BITS 32]
+
+[GLOBAL _strcmp]
+_strcmp:
+	push ebp
+	mov ebp,esp
+
+	;; ebp+8 = first parameter, ebp+12 = second
+
+	mov eax,[ebp+8]		; only eax/ecx/edx are used, so no callee-saved
+	mov edx,[ebp+12]	; register (esi/edi/ebx) is clobbered for a C caller
+
+.loop:
+	mov cl,byte [eax]	; cl = current byte of first string
+	mov ch,byte [edx]	; ch = current byte of second string
+	cmp cl,ch		; unsigned byte comparison
+	jb .below
+	ja .above
+	or cl,cl		; bytes equal: a zero byte ends both strings
+	jz .match
+	inc eax
+	inc edx
+	jmp .loop
+
+.below:	
+	mov eax,-1		; first string sorts before second
+	pop ebp
+	ret
+	
+.above:
+	mov eax,1		; first string sorts after second
+	pop ebp
+	ret
+
+.match:
+	xor eax,eax		; strings identical: return 0
+	pop ebp
+	ret
+
+[SECTION .data]
+[GLOBAL _message]
+
+_message:	db 'hello',0
+\ No newline at end of file
diff --git a/rdoff/test/rdtmain.asm b/rdoff/test/rdtmain.asm
new file mode 100644
index 00000000..626a2e29
--- /dev/null
+++ b/rdoff/test/rdtmain.asm
@@ -0,0 +1,47 @@
+	;; rdtmain - main part of test program for RDX execution.
+	;; returns true (0) if its parameter equals the phrase "hello"
+	;; "hello" is stored in the library part, to complicate the
+	;; linkage.
+
+	;; assemble and link with the following commands:
+	;; nasm -f rdf rdtmain.asm
+	;; nasm -f rdf rdtlib.asm
+	;; ldrdf rdtmain.rdf rdtlib.rdf -o rdxtest.rdx
+
+	;; run with 'rdx rdxtest.rdx [parameters]' on a Linux (or possibly
+	;; other 32 bit OS) system (x86 architectures only!)
+	;; try using '&& echo Yes' afterwards to find out when it returns 0.
+	
+[EXTERN _strcmp]		; strcmp is an imported function
+[EXTERN _message]		; imported data
+[SECTION .text]
+[BITS 32]
+
+	;; main(int argc,char **argv)
+[GLOBAL _main]
+_main:
+	push ebp
+	mov ebp,esp
+
+	;; ebp+8 = argc, ebp+12 = argv
+
+	cmp dword [ebp+8],2
+	jb error		; cause error if < 1 parameters
+
+	mov eax, [ebp+12]	; eax = argv
+
+	mov edx, [eax+4]	; edx = argv[1] (edx is caller-saved; ebx is not)
+	mov ecx, _message	; ecx = "hello"
+
+	push ecx		; second argument to _strcmp
+	push edx		; first argument to _strcmp
+	call _strcmp		; compare strings
+	add esp,8		; caller clears stack
+	
+	pop ebp
+	ret			; return return value of _strcmp
+	
+error:
+	mov eax,2		; return 2 on error
+	pop ebp
+	ret
diff --git a/rdoff/test/testlib.asm b/rdoff/test/testlib.asm
new file mode 100644
index 00000000..6ee3d89a
--- /dev/null
+++ b/rdoff/test/testlib.asm
@@ -0,0 +1,18 @@
+; program to test retrieval of and linkage to modules in libraries by
+; ldrdf
+
+[SECTION .text]
+[GLOBAL _main]
+[EXTERN _strcmp]
+
+_main:
+	push dword string1
+	push dword string2
+	call _strcmp
+	add esp,8		; doh! clear up stack ;-)
+	ret
+
+[SECTION .data]
+
+string1:	db 'abc',0	; try changing these strings and see
+string2:	db 'abd',0	; what happens!
diff --git a/standard.mac b/standard.mac
index 5653ba62..92aab583 100644
--- a/standard.mac
+++ b/standard.mac
@@ -1,7 +1,15 @@
-; Standard macro set for NASM 0.95
+; Standard macro set for NASM 0.96 -*- nasm -*-
+; Note that although some user-level forms of directives are defined
+; here, not all of them are: the user-level form of a format-specific
+; directive should be defined in the module for that directive.
 
 %define __NASM_MAJOR__ 0
-%define __NASM_MINOR__ 95
+%define __NASM_MINOR__ 96
+
+; These two need to be defined, though the actual definitions will
+; be constantly updated during preprocessing.
+%define __FILE__
+%define __LINE__
 
 %define __SECT__		; it ought to be defined, even if as nothing
 
@@ -23,6 +31,7 @@
 %push struc
 %define %$strucname %1
 [absolute 0]
+%$strucname:			; allow definition of `.member' to work sanely
 %endmacro
 %imacro endstruc 0.nolist
 %{$strucname}_size:
@@ -44,34 +53,34 @@ __SECT__
 %pop
 %endmacro
 
-%imacro extern 1+.nolist
+%imacro align 1-2+.nolist nop
+	  times ($$-$) & ((%1)-1) %2
+%endmacro
+%imacro alignb 1-2+.nolist resb 1
+	  times ($$-$) & ((%1)-1) %2
+%endmacro
+
+%imacro extern 1-*.nolist
+%rep %0
 [extern %1]
+%rotate 1
+%endrep
 %endmacro
 
 %imacro bits 1+.nolist
 [bits %1]
 %endmacro
 
-%imacro global 1+.nolist
+%imacro global 1-*.nolist
+%rep %0
 [global %1]
+%rotate 1
+%endrep
 %endmacro
 
-%imacro common 1+.nolist
+%imacro common 1-*.nolist
+%rep %0
 [common %1]
-%endmacro
-
-%imacro org 1+.nolist
-[org %1]
-%endmacro
-
-%imacro group 1+.nolist
-[group %1]
-%endmacro
-
-%imacro uppercase 1+.nolist
-[uppercase %1]
-%endmacro
-
-%imacro library 1+.nolist
-[library %1]
+%rotate 1
+%endrep
 %endmacro
diff --git a/test/Makefile b/test/Makefile
index 5f0e5c6f..bdb55a62 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,2 +1,2 @@
 clean:
-	rm -f *.o *.obj *.com bintest inctest
+	rm -f *test *.com *.o *.obj *so *.exe
diff --git a/test/aoutso.asm b/test/aoutso.asm
new file mode 100644
index 00000000..b95b66ee
--- /dev/null
+++ b/test/aoutso.asm
@@ -0,0 +1,96 @@
+; test source file for assembling to NetBSD/FreeBSD a.out shared library
+; build with:
+;    nasm -f aoutb aoutso.asm
+;    ld -Bshareable -o aoutso.so aoutso.o
+; test with:
+;    cc -o aoutso aouttest.c aoutso.so
+;    ./aoutso
+
+; This file should test the following:
+; [1] Define and export a global text-section symbol
+; [2] Define and export a global data-section symbol
+; [3] Define and export a global BSS-section symbol
+; [4] Define a non-global text-section symbol
+; [5] Define a non-global data-section symbol
+; [6] Define a non-global BSS-section symbol
+; [7] Define a COMMON symbol
+; [8] Define a NASM local label
+; [9] Reference a NASM local label
+; [10] Import an external symbol
+; [11] Make a PC-relative call to an external symbol
+; [12] Reference a text-section symbol in the text section
+; [13] Reference a data-section symbol in the text section
+; [14] Reference a BSS-section symbol in the text section
+; [15] Reference a text-section symbol in the data section
+; [16] Reference a data-section symbol in the data section
+; [17] Reference a BSS-section symbol in the data section
+
+	  BITS 32		; generate 32-bit code
+	  EXTERN __GLOBAL_OFFSET_TABLE_	; GOT base symbol, used by _greet to find the GOT at run time
+	  GLOBAL _lrotate:function ; [1]
+	  GLOBAL _greet:function ; [1]
+	  GLOBAL _asmstr:data _asmstr.end-_asmstr ; [2] exported with an explicit size expression
+	  GLOBAL _textptr:data 4 ; [2]
+	  GLOBAL _selfptr:data 4 ; [2]
+	  GLOBAL _integer:data 4 ; [3]
+	  EXTERN _printf	; [10]
+	  COMMON _commvar 4	; [7] 4-byte common variable
+
+	  SECTION .text
+
+; prototype: long lrotate(long x, int num);  (returns x rotated left by num bits)
+_lrotate:			; [1]
+	  push ebp		; set up a conventional EBP stack frame
+	  mov ebp,esp
+	  mov eax,[ebp+8]	; eax = x (first stack argument)
+	  mov ecx,[ebp+12]	; ecx = num (second stack argument), used as the loop counter
+.label	  rol eax,1		; [4] [8] rotate left by one bit per pass
+	  loop .label		; [9] [12] decrement ecx, repeat while non-zero (num == 0 wraps and runs 2^32 passes)
+	  mov esp,ebp		; tear the frame down again
+	  pop ebp
+	  ret			; rotated value is left in eax
+
+; prototype: void greet(void);  (sets localint = _integer + 1, then printf()s the three variables)
+_greet	  push ebx		; we'll use EBX for GOT, so save it
+	  call .getgot		; pushes the run-time address of .getgot
+.getgot:  pop ebx		; ebx = run-time address of .getgot
+	  add ebx,__GLOBAL_OFFSET_TABLE_ + $$ - .getgot wrt ..gotpc	; adjust ebx so that it points at the GOT
+	  mov eax,[ebx+_integer wrt ..got] ; [14] eax = address of _integer, fetched from its GOT entry
+	  mov eax,[eax]		; eax = value of _integer
+	  inc eax
+	  mov [ebx+localint wrt ..gotoff],eax ; [14] localint = _integer + 1 (addressed GOT-relative)
+	  mov eax,[ebx+_commvar wrt ..got]	; eax = address of _commvar, from its GOT entry
+	  push dword [eax]	; last printf argument: value of _commvar
+	  mov eax,[ebx+localptr wrt ..gotoff] ; [13] eax = contents of localptr, i.e. address of localint
+	  push dword [eax]	; third printf argument: value of localint
+	  mov eax,[ebx+_integer wrt ..got] ; [1] [14] (NOTE(review): the [1] tag looks stray here - confirm)
+	  push dword [eax]	; second printf argument: value of _integer
+	  lea eax,[ebx+_printfstr wrt ..gotoff]	; eax = address of the format string
+	  push eax		; [13] first printf argument: format string
+	  call _printf wrt ..plt ; [11]
+	  add esp,16		; drop the four dword arguments pushed above
+	  pop ebx		; restore the caller's EBX
+	  ret
+
+	  SECTION .data
+
+; a NUL-terminated string
+_asmstr	  db 'hello, world', 0	; [2]
+.end				; _asmstr.end: marks the end of the string for the GLOBAL size expression
+
+; the format string passed to _printf by _greet
+_printfstr db "integer==%d, localint==%d, commvar=%d"	; continued on the next line
+	  db 10, 0		; newline and NUL terminator
+
+; some pointers
+localptr  dd localint		; [5] [17] non-global pointer to localint, dereferenced in _greet
+_textptr  dd _greet wrt ..sym	; [15] exported pointer to _greet
+_selfptr  dd _selfptr wrt ..sym	; [16] exported pointer to itself
+
+	  SECTION .bss
+
+; an exported integer (one uninitialised dword)
+_integer  resd 1		; [3]
+
+; a non-global integer (one uninitialised dword)
+localint  resd 1		; [6]
diff --git a/test/binexe.asm b/test/binexe.asm
new file mode 100644
index 00000000..ab852fb9
--- /dev/null
+++ b/test/binexe.asm
@@ -0,0 +1,32 @@
+; Demonstration of how to write an entire .EXE format program by using
+; the `exebin.mac' macro package.
+; To build:
+;    nasm -fbin binexe.asm -o binexe.exe -ipath
+; (where `path' is such as to allow the %include directive to find
+; exebin.mac)
+; To test:
+;    binexe
+; (should print `hello, world')
+
+%include "exebin.mac"
+
+	  EXE_begin		; macro from exebin.mac; presumably starts the EXE image - verify against exebin.mac
+	  EXE_stack 64		; demonstrates overriding the 0x800 default
+
+	  section .text
+
+	  mov ax,cs		; copy CS into DS via AX (segment registers cannot be moved to one another directly)
+	  mov ds,ax		; assumes the data is addressable through CS - TODO confirm against exebin.mac
+
+	  mov dx,hello		; DS:DX -> '$'-terminated message
+	  mov ah,9		; DOS function 09h: print string
+	  int 0x21
+
+	  mov ax,0x4c00		; DOS function 4Ch: terminate, exit code 0 in AL
+	  int 0x21
+
+	  section .data
+
+hello:	  db 'hello, world', 13, 10, '$'	; CR, LF, then the '$' terminator function 09h expects
+
+	  EXE_end		; macro from exebin.mac; presumably finishes the EXE image - verify against exebin.mac
diff --git a/test/elfso.asm b/test/elfso.asm
new file mode 100644
index 00000000..5adb6339
--- /dev/null
+++ b/test/elfso.asm
@@ -0,0 +1,97 @@
+; test source file for assembling to ELF shared library
+; build with:
+;    nasm -f elf elfso.asm
+;    ld -shared -o elfso.so elfso.o
+; test with:
+;    gcc -o elfso elftest.c ./elfso.so
+;    ./elfso
+; (assuming your gcc is ELF, and you're running bash)
+
+; This file should test the following:
+; [1] Define and export a global text-section symbol
+; [2] Define and export a global data-section symbol
+; [3] Define and export a global BSS-section symbol
+; [4] Define a non-global text-section symbol
+; [5] Define a non-global data-section symbol
+; [6] Define a non-global BSS-section symbol
+; [7] Define a COMMON symbol
+; [8] Define a NASM local label
+; [9] Reference a NASM local label
+; [10] Import an external symbol
+; [11] Make a PC-relative call to an external symbol
+; [12] Reference a text-section symbol in the text section
+; [13] Reference a data-section symbol in the text section
+; [14] Reference a BSS-section symbol in the text section
+; [15] Reference a text-section symbol in the data section
+; [16] Reference a data-section symbol in the data section
+; [17] Reference a BSS-section symbol in the data section
+
+	  BITS 32		; generate 32-bit code
+	  GLOBAL lrotate:function ; [1]
+	  GLOBAL greet:function	; [1]
+	  GLOBAL asmstr:data asmstr.end-asmstr ; [2] exported with an explicit size expression
+	  GLOBAL textptr:data 4	; [2]
+	  GLOBAL selfptr:data 4	; [2]
+	  GLOBAL integer:data 4	; [3]
+	  EXTERN printf		; [10]
+	  COMMON commvar 4:4	; [7] 4-byte common variable; the :4 suffix requests 4-byte alignment
+	  EXTERN _GLOBAL_OFFSET_TABLE_	; GOT base symbol, used by greet to find the GOT at run time
+
+	  SECTION .text
+
+; prototype: long lrotate(long x, int num);  (returns x rotated left by num bits)
+lrotate:			; [1]
+	  push ebp		; set up a conventional EBP stack frame
+	  mov ebp,esp
+	  mov eax,[ebp+8]	; eax = x (first stack argument)
+	  mov ecx,[ebp+12]	; ecx = num (second stack argument), used as the loop counter
+.label	  rol eax,1		; [4] [8] rotate left by one bit per pass
+	  loop .label		; [9] [12] decrement ecx, repeat while non-zero (num == 0 wraps and runs 2^32 passes)
+	  mov esp,ebp		; tear the frame down again
+	  pop ebp
+	  ret			; rotated value is left in eax
+
+; prototype: void greet(void);  (sets localint = integer + 1, then printf()s the three variables)
+greet	  push ebx		; we'll use EBX for GOT, so save it
+	  call .getgot		; pushes the run-time address of .getgot
+.getgot:  pop ebx		; ebx = run-time address of .getgot
+	  add ebx,_GLOBAL_OFFSET_TABLE_ + $$ - .getgot wrt ..gotpc	; adjust ebx so that it points at the GOT
+	  mov eax,[ebx+integer wrt ..got] ; [14] eax = address of integer, fetched from its GOT entry
+	  mov eax,[eax]		; eax = value of integer
+	  inc eax
+	  mov [ebx+localint wrt ..gotoff],eax ; [14] localint = integer + 1 (addressed GOT-relative)
+	  mov eax,[ebx+commvar wrt ..got]	; eax = address of commvar, from its GOT entry
+	  push dword [eax]	; last printf argument: value of commvar
+	  mov eax,[ebx+localptr wrt ..gotoff] ; [13] eax = contents of localptr, i.e. address of localint
+	  push dword [eax]	; third printf argument: value of localint
+	  mov eax,[ebx+integer wrt ..got] ; [1] [14] (NOTE(review): the [1] tag looks stray here - confirm)
+	  push dword [eax]	; second printf argument: value of integer
+	  lea eax,[ebx+printfstr wrt ..gotoff]	; eax = address of the format string
+	  push eax		; [13] first printf argument: format string
+	  call printf wrt ..plt	; [11]
+	  add esp,16		; drop the four dword arguments pushed above
+	  pop ebx		; restore the caller's EBX
+	  ret
+
+	  SECTION .data
+
+; a NUL-terminated string
+asmstr	  db 'hello, world', 0	; [2]
+.end				; asmstr.end: marks the end of the string for the GLOBAL size expression
+
+; the format string passed to printf by greet
+printfstr db "integer==%d, localint==%d, commvar=%d"	; continued on the next line
+	  db 10, 0		; newline and NUL terminator
+
+; some pointers
+localptr  dd localint		; [5] [17] non-global pointer to localint, dereferenced in greet
+textptr	  dd greet wrt ..sym	; [15] exported pointer to greet
+selfptr	  dd selfptr wrt ..sym	; [16] exported pointer to itself
+
+	  SECTION .bss
+
+; an exported integer (one uninitialised dword)
+integer	  resd 1		; [3]
+
+; a non-global integer (one uninitialised dword)
+localint  resd 1		; [6]
diff --git a/test/objexe.asm b/test/objexe.asm
new file mode 100644
index 00000000..9959f40f
--- /dev/null
+++ b/test/objexe.asm
@@ -0,0 +1,30 @@
+; Demonstration of how to write an entire .EXE format program as a .OBJ
+; file to be linked. Tested with the VAL free linker.
+; To build:
+;    nasm -fobj objexe.asm
+;    val objexe.obj,objexe.exe;
+; To test:
+;    objexe
+; (should print `hello, world')
+	  
+	  segment code
+
+..start:  mov ax,data		; ..start marks the program entry point; ax = segment address of `data'
+	  mov ds,ax		; DS -> data segment (segment registers cannot take an immediate)
+	  mov ax,stack		; ax = segment address of `stack'
+	  mov ss,ax
+	  mov sp,stacktop	; SS:SP -> top of the 64 bytes reserved below
+
+	  mov dx,hello		; DS:DX -> '$'-terminated message
+	  mov ah,9		; DOS function 09h: print string
+	  int 0x21
+
+	  mov ax,0x4c00		; DOS function 4Ch: terminate, exit code 0 in AL
+	  int 0x21
+
+	  segment data
+hello:	  db 'hello, world', 13, 10, '$'	; CR, LF, then the '$' terminator function 09h expects
+
+	  segment stack stack	; segment named `stack', declared with the STACK attribute
+	  resb 64		; 64 bytes of stack space
+stacktop:			; label just past the reserved bytes, loaded into SP above