summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--ChangeLog274
-rw-r--r--Makefile.in22
-rw-r--r--NEWS63
-rw-r--r--aclocal.m437
-rw-r--r--arm/fat/aes-decrypt-internal-2.asm35
-rw-r--r--arm/fat/aes-decrypt-internal.asm35
-rw-r--r--arm/fat/aes-encrypt-internal-2.asm35
-rw-r--r--arm/fat/aes-encrypt-internal.asm35
-rw-r--r--arm/fat/salsa20-core-internal-2.asm37
-rw-r--r--arm/fat/sha1-compress-2.asm37
-rw-r--r--arm/fat/sha256-compress-2.asm37
-rw-r--r--arm/fat/sha3-permute-2.asm37
-rw-r--r--arm/fat/sha512-compress-2.asm37
-rw-r--r--arm/fat/umac-nh-2.asm37
-rw-r--r--arm/fat/umac-nh-n-2.asm37
-rw-r--r--arm/memxor.asm271
-rw-r--r--arm/memxor3.asm315
-rw-r--r--arm/v6/aes-decrypt-internal.asm2
-rw-r--r--arm/v6/aes-encrypt-internal.asm2
-rw-r--r--arm/v6/sha1-compress.asm1
-rw-r--r--arm/v6/sha256-compress.asm1
-rw-r--r--asm.m425
-rw-r--r--cbc.h6
-rw-r--r--chacha-poly1305.c68
-rw-r--r--chacha-poly1305.h2
-rw-r--r--chacha-set-nonce.c9
-rw-r--r--chacha.h5
-rw-r--r--config.m4.in2
-rw-r--r--configure.ac89
-rw-r--r--ctr.h3
-rw-r--r--eax.h20
-rw-r--r--ecc-add-eh.c6
-rw-r--r--ecc-add-ehh.c18
-rw-r--r--ecc-add-jja.c6
-rw-r--r--ecc-add-jjj.c7
-rw-r--r--ecc-dup-eh.c6
-rw-r--r--ecc-dup-jj.c6
-rw-r--r--ecc-eh-to-a.c8
-rw-r--r--ecc-internal.h109
-rw-r--r--ecc-j-to-a.c7
-rw-r--r--ecc-mul-a-eh.c15
-rw-r--r--ecc-mul-a.c15
-rw-r--r--ecc-mul-g-eh.c7
-rw-r--r--ecc-mul-g.c7
-rw-r--r--ecc.h154
-rw-r--r--fat-arm.c267
-rw-r--r--fat-setup.h167
-rw-r--r--fat-x86_64.c187
-rw-r--r--gcm.h16
-rw-r--r--memxor-internal.h73
-rw-r--r--memxor.c287
-rw-r--r--memxor3.c292
-rw-r--r--misc/plan.html11
-rw-r--r--salsa20-core-internal.c7
-rw-r--r--sha1-compress.c7
-rw-r--r--sha256-compress.c7
-rw-r--r--sha3-permute.c7
-rw-r--r--sha512-compress.c7
-rw-r--r--testsuite/Makefile.in2
-rw-r--r--testsuite/chacha-poly1305-test.c33
-rw-r--r--testsuite/chacha-test.c27
-rw-r--r--umac-nh-n.c8
-rw-r--r--umac-nh.c7
-rw-r--r--x86_64/aesni/aes-decrypt-internal.asm100
-rw-r--r--x86_64/aesni/aes-encrypt-internal.asm100
-rw-r--r--x86_64/fat/aes-decrypt-internal-2.asm35
-rw-r--r--x86_64/fat/aes-decrypt-internal.asm35
-rw-r--r--x86_64/fat/aes-encrypt-internal-2.asm35
-rw-r--r--x86_64/fat/aes-encrypt-internal.asm35
-rw-r--r--x86_64/fat/cpuid.asm58
-rw-r--r--x86_64/fat/memxor-2.asm36
-rw-r--r--x86_64/fat/memxor.asm35
-rw-r--r--x86_64/memxor.asm161
-rw-r--r--x86_64/memxor3.asm263
75 files changed, 3307 insertions, 986 deletions
diff --git a/.gitignore b/.gitignore
index 2227de34..73b54b3e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,6 +50,7 @@ core
/ecc-256.h
/ecc-384.h
/ecc-521.h
+/ecc-25519.h
/nettle.aux
/nettle.cp
/nettle.cps
diff --git a/ChangeLog b/ChangeLog
index 79c81cfb..faaf7e0d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,273 @@
+2015-01-30 Niels Möller <nisse@lysator.liu.se>
+
+ Update chacha-poly1305 for draft-irtf-cfrg-chacha20-poly1305-08.
+ * chacha-poly1305.h (CHACHA_POLY1305_NONCE_SIZE): Increase to 12
+ bytes, i.e., CHACHA_NONCE96_SIZE.
+ * chacha-poly1305.c (chacha_poly1305_set_nonce): Use
+ chacha_set_nonce96.
+ (poly1305_pad): New function.
+ (chacha_poly1305_encrypt): Use poly1305_pad.
+ (chacha_poly1305_digest): Call poly1305_pad, and format length
+ fields as a single poly1305 block.
+
+ * chacha-set-nonce.c (chacha_set_nonce96): New function.
+ * chacha.h (CHACHA_NONCE96_SIZE): New constant.
+ * testsuite/chacha-test.c: Add test for chacha with 96-bit nonce.
+
+2015-01-27 Niels Möller <nisse@lysator.liu.se>
+
+ * ecc.h: Deleted declarations of unused itch functions. Moved
+ declarations of internal functions to...
+ * ecc-internal.h: ...new location. Also added a leading
+ underscore on the symbols.
+ (ecc_a_to_j, ecc_j_to_a, ecc_eh_to_a, ecc_dup_jj, ecc_add_jja)
+ (ecc_add_jjj, ecc_dup_eh, ecc_add_eh, ecc_add_ehh, ecc_mul_g)
+ (ecc_mul_a, ecc_mul_g_eh, ecc_mul_a_eh): Affected functions.
+
+2015-01-26 Niels Möller <nisse@lysator.liu.se>
+
+ * ecc-add-eh.c (ecc_add_eh_itch): Deleted.
+ * ecc-add-ehh.c (ecc_add_ehh_itch): Deleted.
+ * ecc-add-jja.c (ecc_add_jja_itch): Deleted.
+ * ecc-add-jjj.c (ecc_add_jjj_itch): Deleted.
+ * ecc-dup-eh.c (ecc_dup_eh_itch): Deleted.
+ * ecc-dup-jj.c (ecc_dup_jj_itch): Deleted.
+ * ecc-eh-to-a.c (ecc_eh_to_a_itch): Deleted.
+ * ecc-j-to-a.c (ecc_j_to_a_itch): Deleted.
+ * ecc-mul-a-eh.c (ecc_mul_a_eh_itch): Deleted.
+ * ecc-mul-a.c (ecc_mul_a_itch): Deleted.
+ * ecc-mul-g-eh.c (ecc_mul_g_eh_itch): Deleted.
+ * ecc-mul-g.c (ecc_mul_g_itch): Deleted.
+
+2015-01-25 Niels Möller <nisse@lysator.liu.se>
+
+ * arm/fat/sha1-compress-2.asm: New file.
+ * arm/fat/sha256-compress-2.asm: Likewise.
+ * fat-arm.c (fat_init): Setup for use of additional v6 assembly
+ functions.
+
+ * sha1-compress.c: Prepare for fat build with C and assembly
+ implementations.
+ * sha256-compress.c: Likewise.
+
+ * fat-setup.h (sha1_compress_func, sha256_compress_func): New typedefs.
+
+ * configure.ac (asm_nettle_optional_list): Added
+ sha1-compress-2.asm and sha256-compress-2.asm, and corresponding
+ HAVE_NATIVE_*.
+
+ From Martin Storsjö:
+ * arm: Add .arch directives for armv6. This allows building these
+ files as part of a fat build, even if the assembler by default
+ targets a lower architecture version.
+
+2015-01-23 Niels Möller <nisse@lysator.liu.se>
+
+ * fat-setup.h (DEFINE_FAT_FUNC): Check value of function pointer,
+ before calling fat_init. Should be correct even without memory
+ barrier.
+ * fat-x86_64.c (fat_init): Deleted static variable initialized.
+ The checks of the relevant pointer in DEFINE_FAT_FUNC is more
+ robust.
+ * fat-arm.c (fat_init): Likewise.
+
+2015-01-21 Niels Möller <nisse@lysator.liu.se>
+
+ * fat-arm.c (fat_init): Setup for use of neon assembly functions.
+
+ * arm/fat/salsa20-core-internal-2.asm: New file.
+ * arm/fat/sha3-permute-2.asm: New file.
+ * arm/fat/sha512-compress-2.asm: New file.
+ * arm/fat/umac-nh-2.asm: New file.
+ * arm/fat/umac-nh-n-2.asm: New file.
+
+ * salsa20-core-internal.c: Prepare for fat build with C and
+ assembly implementations.
+ * sha512-compress.c: Likewise.
+ * sha3-permute.c: Likewise.
+ * umac-nh.c: Likewise.
+ * umac-nh-n.c: Likewise.
+
+ * configure.ac (asm_nettle_optional_list): Added more *-2.asm
+ files, and corresponding HAVE_NATIVE_* defines. Recognize PROLOGUE
+ macro in asm files, also when not at the start of the line.
+
+2015-01-20 Niels Möller <nisse@lysator.liu.se>
+
+ * fat-arm.c (get_arm_features): Check NETTLE_FAT_OVERRIDE
+ environment variable.
+
+ * fat-x86_64.c (get_x86_features): New function. Check
+ NETTLE_FAT_OVERRIDE environment variable.
+ (fat_init): Use it.
+
+ * fat-setup.h (secure_getenv) [!HAVE_SECURE_GETENV]: Dummy
+ definition, returning NULL.
+ (ENV_OVERRIDE): New constant.
+
+ * configure.ac: Check for secure_getenv function.
+
+2015-01-19 Niels Möller <nisse@lysator.liu.se>
+
+ * configure.ac: Fat library setup for arm.
+ * fat-arm.c: New file.
+ * arm/fat/aes-encrypt-internal.asm: New file.
+ * arm/fat/aes-encrypt-internal-2.asm: New file.
+ * arm/fat/aes-decrypt-internal.asm: New file.
+ * arm/fat/aes-decrypt-internal-2.asm: New file.
+
+ * Makefile.in (DISTFILES): Added fat-setup.h.
+
+ * fat-setup.h: New file, declarations moved from...
+ * fat-x86_64.c: ... old location
+
+2015-01-17 Niels Möller <nisse@lysator.liu.se>
+
+ * fat-x86_64.c (DECLARE_FAT_FUNC, DEFINE_FAT_FUNC)
+ (DECLARE_FAT_FUNC_VAR): New macros, to define needed resolver and
+ wrapper functions.
+
+ * config.m4.in (SYMBOL_PREFIX): Define from autoconf
+ ASM_SYMBOL_PREFIX.
+ (C_NAME): Move definition to...
+ * asm.m4 (C_NAME): Define here, also take fat_transform.
+ (fat_suffix): Replaced by...
+ (fat_transform): New macro, taking symbol name as argument.
+ Updated all uses of fat_suffix.
+ * fat-x86_64.c: Updated for internal "_nettle" prefix on
+ cpu-specific memxor functions.
+
+ * fat-x86_64.c: Set up for sse2 vs non-sse2 memxor. Patch by Nikos
+ Mavrogiannopoulos.
+ * configure.ac (asm_nettle_optional_list): Added memxor-2.asm.
+ * x86_64/fat/memxor-2.asm: New file.
+ * x86_64/fat/memxor.asm: New file.
+
+ * x86_64/memxor.asm: Use ifdef, not ifelse, for testing USE_SSE2.
+
+2015-01-16 Niels Möller <nisse@lysator.liu.se>
+
+ * configure.ac (OPT_NETTLE_SOURCES): New substituted variable.
+ (asm_path): Fixed x86_64 fat setup. Include only x86_64 and
+ x86_64/fat in the asm_path. Put fat-x86_64.c in
+ OPT_NETTLE_SOURCES, with no symlinking.
+
+ * fat-x86_64.c: Renamed,...
+ * x86_64/fat/fat.c: ... from old name.
+
+2015-01-13 Niels Möller <nisse@lysator.liu.se>
+
+ * x86_64/fat/fat.c: For constructor hack, check
+ HAVE_GCC_ATTRIBUTE, not __GNUC__. Also support sun compilers, as
+ suggested by Nikos Mavrogiannopoulos, and attach the constructor
+ attribute directly to fat_init.
+ (fat_constructor): Deleted wrapper function.
+
+ * x86_64/fat/fat.c: New file, initialization for x86_64 fat
+ library.
+
+ * x86_64/fat/cpuid.asm (_nettle_cpuid): New file and function.
+
+ * x86_64/fat/aes-encrypt-internal.asm: New file, including
+ x86_64/aes-encrypt-internal.asm, after setting fat_suffix to
+ _x86_64.
+ * x86_64/fat/aes-decrypt-internal.asm: New file, analogous setup.
+ * x86_64/fat/aes-encrypt-internal-2.asm: New file, including
+ x86_64/aesni/aes-encrypt-internal.asm, after setting fat_suffix to
+ _aesni.
+ * x86_64/fat/aes-decrypt-internal-2.asm: New file, analogous
+ setup.
+
+ * configure.ac: New command line option --enable-fat.
+ (asm_nettle_optional_list): Added cpuid.asm, fat.c,
+ aes-encrypt-internal-2.asm, and aes-decrypt-internal-2.asm.
+
+ * asm.m4 (fat_suffix): New suffix added to symbol names.
+
+ * x86_64/aesni/aes-encrypt-internal.asm: Use explicit .byte
+ sequences for aes instructions, don't rely on assembler support.
+ * x86_64/aesni/aes-decrypt-internal.asm: Likewise.
+
+ * aclocal.m4 (NETTLE_CHECK_IFUNC): New macro, checking for ifunc
+ and setting HAVE_LINK_IFUNC if working.
+ * configure.ac: Use it.
+
+2015-01-12 Niels Möller <nisse@lysator.liu.se>
+
+ * asm.m4 (DECLARE_FUNC): New macro, extracted from PROLOGUE.
+ (PROLOGUE): Use it.
+
+ * configure.ac (OPT_NETTLE_OBJS, OPT_HOGWEED_OBJS): Renamed
+ substituted variables, and list the object files rather than
+ source files.
+ (OPT_ASM_NETTLE_SOURCES, OPT_ASM_HOGWEED_SOURCES): ...Old names.
+ * Makefile.in (OPT_NETTLE_OBJS, OPT_HOGWEED_OBJS): Use new
+ variables.
+
+2015-01-11 Niels Möller <nisse@lysator.liu.se>
+
+ * x86_64/aesni/aes-decrypt-internal.asm: New file.
+ * x86_64/aesni/aes-encrypt-internal.asm: New file.
+ * configure.ac: New configure flag --enable-x86-aesni.
+
+ * aclocal.m4 (LSH_RPATH_INIT): Handle freebsd, in the same way as
+ gnu/linux, with -Wl,-rpath,.
+
+ Merged memxor-reorg changes, starting at 2014-10-23.
+
+2015-01-10 Niels Möller <nisse@lysator.liu.se>
+
+ * arm/memxor.asm (memxor3): Moved to new file.
+ * arm/memxor3.asm: New file.
+
+2014-11-24 Niels Möller <nisse@lysator.liu.se>
+
+ * x86_64/memxor3.asm (memxor3): New file, code moved from old
+ memxor.asm.
+ * x86_64/memxor.asm (memxor): Rewritten, no longer jumps into
+ memxor3.
+
+ * configure.ac (asm_replace_list): Added memxor.asm and
+ memxor3.asm.
+
+2014-10-23 Niels Möller <nisse@lysator.liu.se>
+
+ * configure.ac (IF_ASM): New substituted variable.
+ * testsuite/Makefile.in (VALGRIND): Allow partial loads only when
+ build includes assembly files.
+
+ * memxor-internal.h (READ_PARTIAL): New macro.
+ * memxor.c (memxor_different_alignment): Avoid out-of-bounds
+ reads, corresponding to valgrind's --partial-loads-ok. Use
+ READ_PARTIAL.
+ * memxor3.c: Analogous changes for unaligned operations.
+
+ * configure.ac (asm_replace_list): Deleted memxor.asm, now
+ incompatible with the memxor/memxor3 split.
+
+ * memxor3.c: New file, split off from memxor.c.
+ * memxor-internal.h: New file, declarations shared by memxor.c and
+ memxor3.c.
+ * memxor.c: memxor3 functions moved out from this file.
+ * Makefile.in (nettle_SOURCES): Added memxor3.c.
+ (DISTFILES): Added memxor-internal.h.
+
+ * memxor.c (memxor_common_alignment, memxor_different_alignment)
+ (memxor): Change loop order, iterate from the end.
+ (memxor3_common_alignment): Unroll twice.
+ (word_t): On x86_64, unconditionally define as uint64_t, to get 64
+ bits also in M$ windows. Replaced all uses of SIZEOF_LONG.
+
+2014-12-12 Niels Möller <nisse@lysator.liu.se>
+
+ * cbc.h (CBC_ENCRYPT, CBC_DECRYPT): Make type-checking hack
+ stricter, warn if type of length argument is smaller than size_t.
+ * ctr.h (CTR_CRYPT): Likewise.
+ * eax.h (EAX_SET_KEY, EAX_SET_NONCE, EAX_UPDATE, EAX_ENCRYPT)
+ (EAX_DECRYPT, EAX_DIGEST): Likewise.
+ * gcm.h (GCM_SET_KEY, GCM_ENCRYPT, GCM_DECRYPT, GCM_DIGEST):
+ Likewise.
+
2014-12-08 Niels Möller <nisse@lysator.liu.se>
* aclocal.m4 (LD_VERSION_SCRIPT): Linker scripts no longer located
@@ -9162,8 +9432,8 @@
computing n = p * q.
* rsa-compat.c: Adapted to new private key struct.
- * rsa_md5.c: Likesize.
- * rsa_sha1.c: Likesize.
+ * rsa_md5.c: Likewise.
+ * rsa_sha1.c: Likewise.
* rsa.c (rsa_check_size): New function, for computing and checking
the size of the modulo in octets.
diff --git a/Makefile.in b/Makefile.in
index ebd2bb1e..fce79ea2 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -12,9 +12,10 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = $(INSTALL_PROGRAM) -s
MKDIR_P = @MKDIR_P@
-OPT_ASM_NETTLE_SOURCES = @OPT_ASM_NETTLE_SOURCES@
-OPT_ASM_HOGWEED_SOURCES = @OPT_ASM_HOGWEED_SOURCES@
+OPT_NETTLE_OBJS = @OPT_NETTLE_OBJS@
+OPT_HOGWEED_OBJS = @OPT_HOGWEED_OBJS@
+OPT_NETTLE_SOURCES = @OPT_NETTLE_SOURCES@
OPT_HOGWEED_SOURCES = @IF_MINI_GMP@ mini-gmp.c
SUBDIRS = tools testsuite examples
@@ -109,7 +110,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c \
knuth-lfib.c \
md2.c md2-meta.c md4.c md4-meta.c \
md5.c md5-compress.c md5-compat.c md5-meta.c \
- memxor.c \
+ memxor.c memxor3.c \
nettle-meta-aeads.c nettle-meta-armors.c \
nettle-meta-ciphers.c nettle-meta-hashes.c \
pbkdf2.c pbkdf2-hmac-sha1.c pbkdf2-hmac-sha256.c \
@@ -134,7 +135,8 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c \
umac-poly64.c umac-poly128.c umac-set-key.c \
umac32.c umac64.c umac96.c umac128.c \
write-be32.c write-le32.c write-le64.c \
- yarrow256.c yarrow_key_event.c
+ yarrow256.c yarrow_key_event.c \
+ $(OPT_NETTLE_SOURCES)
hogweed_SOURCES = sexp.c sexp-format.c \
sexp-transport.c sexp-transport-format.c \
@@ -204,6 +206,10 @@ SOURCES = $(nettle_SOURCES) $(hogweed_SOURCES) \
$(getopt_SOURCES) $(internal_SOURCES) \
aesdata.c desdata.c twofishdata.c shadata.c gcmdata.c eccdata.c
+# FIXME: $(SOURCES) includes $(OPT_NETTLE_SOURCES) and
+# $(OPT_HOGWEED_SOURCES), which means that inclusion in this list
+# depends on which sources were included in the build. It needs to
+# always include all files, with no duplicates.
DISTFILES = $(SOURCES) $(HEADERS) getopt.h getopt_int.h \
.bootstrap run-tests \
aclocal.m4 configure.ac \
@@ -216,15 +222,15 @@ DISTFILES = $(SOURCES) $(HEADERS) getopt.h getopt_int.h \
$(des_headers) descore.README \
aes-internal.h camellia-internal.h serpent-internal.h \
cast128_sboxes.h desinfo.h desCode.h \
- nettle-internal.h nettle-write.h \
- gmp-glue.h ecc-internal.h \
+ memxor-internal.h nettle-internal.h nettle-write.h \
+ gmp-glue.h ecc-internal.h fat-setup.h \
mini-gmp.h mini-gmp.c asm.m4 \
nettle.texinfo nettle.info nettle.html nettle.pdf sha-example.c
# Rules building static libraries
-nettle_OBJS = $(nettle_SOURCES:.c=.$(OBJEXT)) $(OPT_ASM_NETTLE_SOURCES:.asm=.$(OBJEXT))
+nettle_OBJS = $(nettle_SOURCES:.c=.$(OBJEXT)) $(OPT_NETTLE_OBJS)
-hogweed_OBJS = $(hogweed_SOURCES:.c=.$(OBJEXT)) $(OPT_ASM_HOGWEED_SOURCES:.asm=.$(OBJEXT))
+hogweed_OBJS = $(hogweed_SOURCES:.c=.$(OBJEXT)) $(OPT_HOGWEED_OBJS)
libnettle.a: $(nettle_OBJS)
-rm -f $@
diff --git a/NEWS b/NEWS
index f7e46ebf..4ff80c01 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,66 @@
+NEWS for the Nettle 3.1 release
+
+ This release adds a couple of new features.
+
+XXX The shared library names???
+
+ Bug fixes:
+
+ * Eliminate out-of-bounds reads in the C implementation of
+ memxor (related to valgrind's --partial-loads-ok flag).
+
+ Interface changes:
+
+ * Declarations of many internal functions are moved from ecc.h
+ to ecc-internal.h. The functions are undocumented, and
+ luckily they're apparently also unused by applications, so I
+ don't expect any problems from this change.
+
+ New features:
+
+ * Support for curve25519 and for EdDSA25519 signatures.
+
+ * Support for "fat builds" on x86_64 and arm, where the
+ implementation of certain functions is selected at run-time
+ depending on available cpu features. Configure with
+ --enable-fat to try this out. If it turns out to work well
+ enough, it will likely be enabled by default in later
+ releases.
+
+ * Support for building the hogweed library (public key
+ support) using "mini-gmp", a small but slower implementation
+ of a subset of the GMP interfaces. Note that builds using
+ mini-gmp are *not* binary compatible with regular builds,
+ and more likely to leak side-channel information.
+
+ One intended use case is for small embedded applications
+ which need to verify digital signatures.
+
+XXX * The shared libraries are now built with versioned symbols.
+ Should reduce problems in case a program links explicitly
+ both to nettle and/or hogweed, and to gnutls, and the
+ program and gnutls expects different versions.
+
+ Optimizations:
+
+ * New x86_64 implementation of AES, using the "aesni"
+ instructions. Autodetected in fat builds. In non-fat builds,
+ it has to be enabled explicitly with --enable-x86-aesni.
+
+ Build system:
+
+ * Use the same object files for both static and shared
+ libraries. This eliminates the *.po object files which were
+ confusing to some tools (as well as humans). Like before,
+ PIC code is used by default; to build a non-pic static
+ library, configure with --disable-pic --disable-shared.
+
+ Miscellaneous:
+
+ * Made type-checking hack in CBC_ENCRYPT and similar macros
+ stricter, to generate warnings if they are used with
+ functions which have a length argument smaller than size_t.
+
NEWS for the Nettle 3.0 release
This is a major release, including several interface changes,
diff --git a/aclocal.m4 b/aclocal.m4
index 28ec2705..debcf9c7 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -109,7 +109,7 @@ case "$host_os" in
RPATHFLAG=-R
fi
;;
- linux*) RPATHFLAG="-Wl,-rpath," ;;
+ linux*|freebsd*) RPATHFLAG="-Wl,-rpath," ;;
*) RPATHFLAG="" ;;
esac
@@ -643,6 +643,41 @@ foo:
fi
])
+dnl NETTLE_CHECK_IFUNC
+dnl ------------------
+dnl Check if __attribute__ ((ifunc(...))) works
+AC_DEFUN([NETTLE_CHECK_IFUNC],
+[AC_REQUIRE([AC_PROG_CC])
+AC_CACHE_CHECK([for ifunc support],
+ nettle_cv_link_ifunc,
+ AC_LINK_IFELSE([AC_LANG_PROGRAM([
+static int
+foo_imp(int x)
+{
+ return 1;
+}
+
+typedef void void_func (void);
+
+static void_func *
+foo_resolv(void)
+{
+ return (void_func *) foo_imp;
+}
+
+int foo (int x) __attribute__ ((ifunc("foo_resolv")));
+],[
+ return foo(0);
+
+])],
+[nettle_cv_link_ifunc=yes],
+[nettle_cv_link_ifunc=no]))
+AH_TEMPLATE([HAVE_LINK_IFUNC], [Define if compiler and linker supports __attribute__ ifunc])
+if test "x$nettle_cv_link_ifunc" = xyes ; then
+ AC_DEFINE(HAVE_LINK_IFUNC)
+fi
+])
+
dnl @synopsis AX_CREATE_STDINT_H [( HEADER-TO-GENERATE [, HEADERS-TO-CHECK])]
dnl
dnl the "ISO C9X: 7.18 Integer types <stdint.h>" section requires the
diff --git a/arm/fat/aes-decrypt-internal-2.asm b/arm/fat/aes-decrypt-internal-2.asm
new file mode 100644
index 00000000..2110f310
--- /dev/null
+++ b/arm/fat/aes-decrypt-internal-2.asm
@@ -0,0 +1,35 @@
+C arm/fat/aes-decrypt-internal-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+define(<fat_transform>, <$1_armv6>)
+include_src(<arm/v6/aes-decrypt-internal.asm>)
diff --git a/arm/fat/aes-decrypt-internal.asm b/arm/fat/aes-decrypt-internal.asm
new file mode 100644
index 00000000..8d763889
--- /dev/null
+++ b/arm/fat/aes-decrypt-internal.asm
@@ -0,0 +1,35 @@
+C arm/fat/aes-decrypt-internal.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+define(<fat_transform>, <$1_arm>)
+include_src(<arm/aes-decrypt-internal.asm>)
diff --git a/arm/fat/aes-encrypt-internal-2.asm b/arm/fat/aes-encrypt-internal-2.asm
new file mode 100644
index 00000000..490a52be
--- /dev/null
+++ b/arm/fat/aes-encrypt-internal-2.asm
@@ -0,0 +1,35 @@
+C arm/fat/aes-encrypt-internal-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+define(<fat_transform>, <$1_armv6>)
+include_src(<arm/v6/aes-encrypt-internal.asm>)
diff --git a/arm/fat/aes-encrypt-internal.asm b/arm/fat/aes-encrypt-internal.asm
new file mode 100644
index 00000000..e695a289
--- /dev/null
+++ b/arm/fat/aes-encrypt-internal.asm
@@ -0,0 +1,35 @@
+C arm/fat/aes-encrypt-internal.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+define(<fat_transform>, <$1_arm>)
+include_src(<arm/aes-encrypt-internal.asm>)
diff --git a/arm/fat/salsa20-core-internal-2.asm b/arm/fat/salsa20-core-internal-2.asm
new file mode 100644
index 00000000..64d90302
--- /dev/null
+++ b/arm/fat/salsa20-core-internal-2.asm
@@ -0,0 +1,37 @@
+C arm/fat/salsa20-core-internal-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_salsa20_core) picked up by configure
+
+define(<fat_transform>, <$1_neon>)
+include_src(<arm/neon/salsa20-core-internal.asm>)
diff --git a/arm/fat/sha1-compress-2.asm b/arm/fat/sha1-compress-2.asm
new file mode 100644
index 00000000..c326befd
--- /dev/null
+++ b/arm/fat/sha1-compress-2.asm
@@ -0,0 +1,37 @@
+C arm/fat/sha1-compress-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_sha1_compress) picked up by configure
+
+define(<fat_transform>, <$1_armv6>)
+include_src(<arm/v6/sha1-compress.asm>)
diff --git a/arm/fat/sha256-compress-2.asm b/arm/fat/sha256-compress-2.asm
new file mode 100644
index 00000000..e1babb37
--- /dev/null
+++ b/arm/fat/sha256-compress-2.asm
@@ -0,0 +1,37 @@
+C arm/fat/sha256-compress-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_sha256_compress) picked up by configure
+
+define(<fat_transform>, <$1_armv6>)
+include_src(<arm/v6/sha256-compress.asm>)
diff --git a/arm/fat/sha3-permute-2.asm b/arm/fat/sha3-permute-2.asm
new file mode 100644
index 00000000..b423a762
--- /dev/null
+++ b/arm/fat/sha3-permute-2.asm
@@ -0,0 +1,37 @@
+C arm/fat/sha3-permute-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_sha3_permute) picked up by configure
+
+define(<fat_transform>, <_$1_neon>)
+include_src(<arm/neon/sha3-permute.asm>)
diff --git a/arm/fat/sha512-compress-2.asm b/arm/fat/sha512-compress-2.asm
new file mode 100644
index 00000000..428604e0
--- /dev/null
+++ b/arm/fat/sha512-compress-2.asm
@@ -0,0 +1,37 @@
+C arm/fat/sha512-compress-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_sha512_compress) picked up by configure
+
+define(<fat_transform>, <$1_neon>)
+include_src(<arm/neon/sha512-compress.asm>)
diff --git a/arm/fat/umac-nh-2.asm b/arm/fat/umac-nh-2.asm
new file mode 100644
index 00000000..fc97cc6b
--- /dev/null
+++ b/arm/fat/umac-nh-2.asm
@@ -0,0 +1,37 @@
+C arm/fat/umac-nh-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_umac_nh) picked up by configure
+
+define(<fat_transform>, <$1_neon>)
+include_src(<arm/neon/umac-nh.asm>)
diff --git a/arm/fat/umac-nh-n-2.asm b/arm/fat/umac-nh-n-2.asm
new file mode 100644
index 00000000..32b7a830
--- /dev/null
+++ b/arm/fat/umac-nh-n-2.asm
@@ -0,0 +1,37 @@
+C arm/fat/umac-nh-n-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_umac_nh_n) picked up by configure
+
+define(<fat_transform>, <$1_neon>)
+include_src(<arm/neon/umac-nh-n.asm>)
diff --git a/arm/memxor.asm b/arm/memxor.asm
index fd0f6330..a50e91bc 100644
--- a/arm/memxor.asm
+++ b/arm/memxor.asm
@@ -227,274 +227,3 @@ PROLOGUE(nettle_memxor)
b .Lmemxor_bytes
EPILOGUE(nettle_memxor)
-
-define(<DST>, <r0>)
-define(<AP>, <r1>)
-define(<BP>, <r2>)
-define(<N>, <r3>)
-undefine(<CNT>)
-undefine(<TNC>)
-
-C Temporaries r4-r7
-define(<ACNT>, <r8>)
-define(<ATNC>, <r10>)
-define(<BCNT>, <r11>)
-define(<BTNC>, <r12>)
-
- C memxor3(void *dst, const void *a, const void *b, size_t n)
- .align 2
-PROLOGUE(nettle_memxor3)
- cmp N, #0
- beq .Lmemxor3_ret
-
- push {r4,r5,r6,r7,r8,r10,r11}
- cmp N, #7
-
- add AP, N
- add BP, N
- add DST, N
-
- bcs .Lmemxor3_large
-
- C Simple byte loop
-.Lmemxor3_bytes:
- ldrb r4, [AP, #-1]!
- ldrb r5, [BP, #-1]!
- eor r4, r5
- strb r4, [DST, #-1]!
- subs N, #1
- bne .Lmemxor3_bytes
-
-.Lmemxor3_done:
- pop {r4,r5,r6,r7,r8,r10,r11}
-.Lmemxor3_ret:
- bx lr
-
-.Lmemxor3_align_loop:
- ldrb r4, [AP, #-1]!
- ldrb r5, [BP, #-1]!
- eor r5, r4
- strb r5, [DST, #-1]!
- sub N, #1
-
-.Lmemxor3_large:
- tst DST, #3
- bne .Lmemxor3_align_loop
-
- C We have at least 4 bytes left to do here.
- sub N, #4
- ands ACNT, AP, #3
- lsl ACNT, #3
- beq .Lmemxor3_a_aligned
-
- ands BCNT, BP, #3
- lsl BCNT, #3
- bne .Lmemxor3_uu
-
- C Swap
- mov r4, AP
- mov AP, BP
- mov BP, r4
-
-.Lmemxor3_au:
- C NOTE: We have the relevant shift count in ACNT, not BCNT
-
- C AP is aligned, BP is not
- C v original SRC
- C +-------+------+
- C |SRC-4 |SRC |
- C +---+---+------+
- C |DST-4 |
- C +-------+
- C
- C With little-endian, we need to do
- C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC)
- rsb ATNC, ACNT, #32
- bic BP, #3
-
- ldr r4, [BP]
-
- tst N, #4
- itet eq
- moveq r5, r4
- subne N, #4
- beq .Lmemxor3_au_odd
-
-.Lmemxor3_au_loop:
- ldr r5, [BP, #-4]!
- ldr r6, [AP, #-4]!
- eor r6, r6, r4, lsl ATNC
- eor r6, r6, r5, lsr ACNT
- str r6, [DST, #-4]!
-.Lmemxor3_au_odd:
- ldr r4, [BP, #-4]!
- ldr r6, [AP, #-4]!
- eor r6, r6, r5, lsl ATNC
- eor r6, r6, r4, lsr ACNT
- str r6, [DST, #-4]!
- subs N, #8
- bcs .Lmemxor3_au_loop
- adds N, #8
- beq .Lmemxor3_done
-
- C Leftover bytes in r4, low end
- ldr r5, [AP, #-4]
- eor r4, r5, r4, lsl ATNC
-
-.Lmemxor3_au_leftover:
- C Store a byte at a time
- ror r4, #24
- strb r4, [DST, #-1]!
- subs N, #1
- beq .Lmemxor3_done
- subs ACNT, #8
- sub AP, #1
- bne .Lmemxor3_au_leftover
- b .Lmemxor3_bytes
-
-.Lmemxor3_a_aligned:
- ands ACNT, BP, #3
- lsl ACNT, #3
- bne .Lmemxor3_au ;
-
- C a, b and dst all have the same alignment.
- subs N, #8
- bcc .Lmemxor3_aligned_word_end
-
- C This loop runs at 8 cycles per iteration. It has been
- C observed running at only 7 cycles, for this speed, the loop
- C started at offset 0x2ac in the object file.
-
- C FIXME: consider software pipelining, similarly to the memxor
- C loop.
-
-.Lmemxor3_aligned_word_loop:
- ldmdb AP!, {r4,r5,r6}
- ldmdb BP!, {r7,r8,r10}
- subs N, #12
- eor r4, r7
- eor r5, r8
- eor r6, r10
- stmdb DST!, {r4, r5,r6}
- bcs .Lmemxor3_aligned_word_loop
-
-.Lmemxor3_aligned_word_end:
- C We have 0-11 bytes left to do, and N holds number of bytes -12.
- adds N, #4
- bcc .Lmemxor3_aligned_lt_8
- C Do 8 bytes more, leftover is in N
- ldmdb AP!, {r4, r5}
- ldmdb BP!, {r6, r7}
- eor r4, r6
- eor r5, r7
- stmdb DST!, {r4,r5}
- beq .Lmemxor3_done
- b .Lmemxor3_bytes
-
-.Lmemxor3_aligned_lt_8:
- adds N, #4
- bcc .Lmemxor3_aligned_lt_4
-
- ldr r4, [AP,#-4]!
- ldr r5, [BP,#-4]!
- eor r4, r5
- str r4, [DST,#-4]!
- beq .Lmemxor3_done
- b .Lmemxor3_bytes
-
-.Lmemxor3_aligned_lt_4:
- adds N, #4
- beq .Lmemxor3_done
- b .Lmemxor3_bytes
-
-.Lmemxor3_uu:
-
- cmp ACNT, BCNT
- bic AP, #3
- bic BP, #3
- rsb ATNC, ACNT, #32
-
- bne .Lmemxor3_uud
-
- C AP and BP are unaligned in the same way
-
- ldr r4, [AP]
- ldr r6, [BP]
- eor r4, r6
-
- tst N, #4
- itet eq
- moveq r5, r4
- subne N, #4
- beq .Lmemxor3_uu_odd
-
-.Lmemxor3_uu_loop:
- ldr r5, [AP, #-4]!
- ldr r6, [BP, #-4]!
- eor r5, r6
- lsl r4, ATNC
- eor r4, r4, r5, lsr ACNT
- str r4, [DST, #-4]!
-.Lmemxor3_uu_odd:
- ldr r4, [AP, #-4]!
- ldr r6, [BP, #-4]!
- eor r4, r6
- lsl r5, ATNC
- eor r5, r5, r4, lsr ACNT
- str r5, [DST, #-4]!
- subs N, #8
- bcs .Lmemxor3_uu_loop
- adds N, #8
- beq .Lmemxor3_done
-
- C Leftover bytes in a4, low end
- ror r4, ACNT
-.Lmemxor3_uu_leftover:
- ror r4, #24
- strb r4, [DST, #-1]!
- subs N, #1
- beq .Lmemxor3_done
- subs ACNT, #8
- bne .Lmemxor3_uu_leftover
- b .Lmemxor3_bytes
-
-.Lmemxor3_uud:
- C Both AP and BP unaligned, and in different ways
- rsb BTNC, BCNT, #32
-
- ldr r4, [AP]
- ldr r6, [BP]
-
- tst N, #4
- ittet eq
- moveq r5, r4
- moveq r7, r6
- subne N, #4
- beq .Lmemxor3_uud_odd
-
-.Lmemxor3_uud_loop:
- ldr r5, [AP, #-4]!
- ldr r7, [BP, #-4]!
- lsl r4, ATNC
- eor r4, r4, r6, lsl BTNC
- eor r4, r4, r5, lsr ACNT
- eor r4, r4, r7, lsr BCNT
- str r4, [DST, #-4]!
-.Lmemxor3_uud_odd:
- ldr r4, [AP, #-4]!
- ldr r6, [BP, #-4]!
- lsl r5, ATNC
- eor r5, r5, r7, lsl BTNC
- eor r5, r5, r4, lsr ACNT
- eor r5, r5, r6, lsr BCNT
- str r5, [DST, #-4]!
- subs N, #8
- bcs .Lmemxor3_uud_loop
- adds N, #8
- beq .Lmemxor3_done
-
- C FIXME: More clever left-over handling? For now, just adjust pointers.
- add AP, AP, ACNT, lsr #3
- add BP, BP, BCNT, lsr #3
- b .Lmemxor3_bytes
-EPILOGUE(nettle_memxor3)
diff --git a/arm/memxor3.asm b/arm/memxor3.asm
new file mode 100644
index 00000000..139fd208
--- /dev/null
+++ b/arm/memxor3.asm
@@ -0,0 +1,315 @@
+C arm/memxor3.asm
+
+ifelse(<
+ Copyright (C) 2013, 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Possible speedups:
+C
+C The ldm instruction can do load two registers per cycle,
+C if the address is two-word aligned. Or three registers in two
+C cycles, regardless of alignment.
+
+C Register usage:
+
+define(<DST>, <r0>)
+define(<AP>, <r1>)
+define(<BP>, <r2>)
+define(<N>, <r3>)
+
+C Temporaries r4-r7
+define(<ACNT>, <r8>)
+define(<ATNC>, <r10>)
+define(<BCNT>, <r11>)
+define(<BTNC>, <r12>)
+
+ .syntax unified
+
+ .file "memxor3.asm"
+
+ .text
+ .arm
+
+ C memxor3(void *dst, const void *a, const void *b, size_t n)
+ .align 2
+PROLOGUE(nettle_memxor3)
+ cmp N, #0
+ beq .Lmemxor3_ret
+
+ push {r4,r5,r6,r7,r8,r10,r11}
+ cmp N, #7
+
+ add AP, N
+ add BP, N
+ add DST, N
+
+ bcs .Lmemxor3_large
+
+ C Simple byte loop
+.Lmemxor3_bytes:
+ ldrb r4, [AP, #-1]!
+ ldrb r5, [BP, #-1]!
+ eor r4, r5
+ strb r4, [DST, #-1]!
+ subs N, #1
+ bne .Lmemxor3_bytes
+
+.Lmemxor3_done:
+ pop {r4,r5,r6,r7,r8,r10,r11}
+.Lmemxor3_ret:
+ bx lr
+
+.Lmemxor3_align_loop:
+ ldrb r4, [AP, #-1]!
+ ldrb r5, [BP, #-1]!
+ eor r5, r4
+ strb r5, [DST, #-1]!
+ sub N, #1
+
+.Lmemxor3_large:
+ tst DST, #3
+ bne .Lmemxor3_align_loop
+
+ C We have at least 4 bytes left to do here.
+ sub N, #4
+ ands ACNT, AP, #3
+ lsl ACNT, #3
+ beq .Lmemxor3_a_aligned
+
+ ands BCNT, BP, #3
+ lsl BCNT, #3
+ bne .Lmemxor3_uu
+
+ C Swap
+ mov r4, AP
+ mov AP, BP
+ mov BP, r4
+
+.Lmemxor3_au:
+ C NOTE: We have the relevant shift count in ACNT, not BCNT
+
+ C AP is aligned, BP is not
+ C v original SRC
+ C +-------+------+
+ C |SRC-4 |SRC |
+ C +---+---+------+
+ C |DST-4 |
+ C +-------+
+ C
+ C With little-endian, we need to do
+ C DST[i-i] ^= (SRC[i-i] >> CNT) ^ (SRC[i] << TNC)
+ rsb ATNC, ACNT, #32
+ bic BP, #3
+
+ ldr r4, [BP]
+
+ tst N, #4
+ itet eq
+ moveq r5, r4
+ subne N, #4
+ beq .Lmemxor3_au_odd
+
+.Lmemxor3_au_loop:
+ ldr r5, [BP, #-4]!
+ ldr r6, [AP, #-4]!
+ eor r6, r6, r4, lsl ATNC
+ eor r6, r6, r5, lsr ACNT
+ str r6, [DST, #-4]!
+.Lmemxor3_au_odd:
+ ldr r4, [BP, #-4]!
+ ldr r6, [AP, #-4]!
+ eor r6, r6, r5, lsl ATNC
+ eor r6, r6, r4, lsr ACNT
+ str r6, [DST, #-4]!
+ subs N, #8
+ bcs .Lmemxor3_au_loop
+ adds N, #8
+ beq .Lmemxor3_done
+
+ C Leftover bytes in r4, low end
+ ldr r5, [AP, #-4]
+ eor r4, r5, r4, lsl ATNC
+
+.Lmemxor3_au_leftover:
+ C Store a byte at a time
+ ror r4, #24
+ strb r4, [DST, #-1]!
+ subs N, #1
+ beq .Lmemxor3_done
+ subs ACNT, #8
+ sub AP, #1
+ bne .Lmemxor3_au_leftover
+ b .Lmemxor3_bytes
+
+.Lmemxor3_a_aligned:
+ ands ACNT, BP, #3
+ lsl ACNT, #3
+ bne .Lmemxor3_au ;
+
+ C a, b and dst all have the same alignment.
+ subs N, #8
+ bcc .Lmemxor3_aligned_word_end
+
+ C This loop runs at 8 cycles per iteration. It has been
+ C observed running at only 7 cycles, for this speed, the loop
+ C started at offset 0x2ac in the object file.
+
+ C FIXME: consider software pipelining, similarly to the memxor
+ C loop.
+
+.Lmemxor3_aligned_word_loop:
+ ldmdb AP!, {r4,r5,r6}
+ ldmdb BP!, {r7,r8,r10}
+ subs N, #12
+ eor r4, r7
+ eor r5, r8
+ eor r6, r10
+ stmdb DST!, {r4, r5,r6}
+ bcs .Lmemxor3_aligned_word_loop
+
+.Lmemxor3_aligned_word_end:
+ C We have 0-11 bytes left to do, and N holds number of bytes -12.
+ adds N, #4
+ bcc .Lmemxor3_aligned_lt_8
+ C Do 8 bytes more, leftover is in N
+ ldmdb AP!, {r4, r5}
+ ldmdb BP!, {r6, r7}
+ eor r4, r6
+ eor r5, r7
+ stmdb DST!, {r4,r5}
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
+
+.Lmemxor3_aligned_lt_8:
+ adds N, #4
+ bcc .Lmemxor3_aligned_lt_4
+
+ ldr r4, [AP,#-4]!
+ ldr r5, [BP,#-4]!
+ eor r4, r5
+ str r4, [DST,#-4]!
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
+
+.Lmemxor3_aligned_lt_4:
+ adds N, #4
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
+
+.Lmemxor3_uu:
+
+ cmp ACNT, BCNT
+ bic AP, #3
+ bic BP, #3
+ rsb ATNC, ACNT, #32
+
+ bne .Lmemxor3_uud
+
+ C AP and BP are unaligned in the same way
+
+ ldr r4, [AP]
+ ldr r6, [BP]
+ eor r4, r6
+
+ tst N, #4
+ itet eq
+ moveq r5, r4
+ subne N, #4
+ beq .Lmemxor3_uu_odd
+
+.Lmemxor3_uu_loop:
+ ldr r5, [AP, #-4]!
+ ldr r6, [BP, #-4]!
+ eor r5, r6
+ lsl r4, ATNC
+ eor r4, r4, r5, lsr ACNT
+ str r4, [DST, #-4]!
+.Lmemxor3_uu_odd:
+ ldr r4, [AP, #-4]!
+ ldr r6, [BP, #-4]!
+ eor r4, r6
+ lsl r5, ATNC
+ eor r5, r5, r4, lsr ACNT
+ str r5, [DST, #-4]!
+ subs N, #8
+ bcs .Lmemxor3_uu_loop
+ adds N, #8
+ beq .Lmemxor3_done
+
+ C Leftover bytes in a4, low end
+ ror r4, ACNT
+.Lmemxor3_uu_leftover:
+ ror r4, #24
+ strb r4, [DST, #-1]!
+ subs N, #1
+ beq .Lmemxor3_done
+ subs ACNT, #8
+ bne .Lmemxor3_uu_leftover
+ b .Lmemxor3_bytes
+
+.Lmemxor3_uud:
+ C Both AP and BP unaligned, and in different ways
+ rsb BTNC, BCNT, #32
+
+ ldr r4, [AP]
+ ldr r6, [BP]
+
+ tst N, #4
+ ittet eq
+ moveq r5, r4
+ moveq r7, r6
+ subne N, #4
+ beq .Lmemxor3_uud_odd
+
+.Lmemxor3_uud_loop:
+ ldr r5, [AP, #-4]!
+ ldr r7, [BP, #-4]!
+ lsl r4, ATNC
+ eor r4, r4, r6, lsl BTNC
+ eor r4, r4, r5, lsr ACNT
+ eor r4, r4, r7, lsr BCNT
+ str r4, [DST, #-4]!
+.Lmemxor3_uud_odd:
+ ldr r4, [AP, #-4]!
+ ldr r6, [BP, #-4]!
+ lsl r5, ATNC
+ eor r5, r5, r7, lsl BTNC
+ eor r5, r5, r4, lsr ACNT
+ eor r5, r5, r6, lsr BCNT
+ str r5, [DST, #-4]!
+ subs N, #8
+ bcs .Lmemxor3_uud_loop
+ adds N, #8
+ beq .Lmemxor3_done
+
+ C FIXME: More clever left-over handling? For now, just adjust pointers.
+ add AP, AP, ACNT, lsr #3
+ add BP, BP, BCNT, lsr #3
+ b .Lmemxor3_bytes
+EPILOGUE(nettle_memxor3)
diff --git a/arm/v6/aes-decrypt-internal.asm b/arm/v6/aes-decrypt-internal.asm
index 28d8f6f8..3eab3ebc 100644
--- a/arm/v6/aes-decrypt-internal.asm
+++ b/arm/v6/aes-decrypt-internal.asm
@@ -30,6 +30,8 @@ ifelse(<
not, see http://www.gnu.org/licenses/.
>)
+ .arch armv6
+
include_src(<arm/aes.m4>)
define(<PARAM_ROUNDS>, <r0>)
diff --git a/arm/v6/aes-encrypt-internal.asm b/arm/v6/aes-encrypt-internal.asm
index f7f47698..e4fa25d0 100644
--- a/arm/v6/aes-encrypt-internal.asm
+++ b/arm/v6/aes-encrypt-internal.asm
@@ -30,6 +30,8 @@ ifelse(<
not, see http://www.gnu.org/licenses/.
>)
+ .arch armv6
+
include_src(<arm/aes.m4>)
C Benchmarked at at 706, 870, 963 cycles/block on cortex A9,
diff --git a/arm/v6/sha1-compress.asm b/arm/v6/sha1-compress.asm
index fc1ebf05..59d6297e 100644
--- a/arm/v6/sha1-compress.asm
+++ b/arm/v6/sha1-compress.asm
@@ -31,6 +31,7 @@ ifelse(<
>)
.file "sha1-compress.asm"
+ .arch armv6
define(<STATE>, <r0>)
define(<INPUT>, <r1>)
diff --git a/arm/v6/sha256-compress.asm b/arm/v6/sha256-compress.asm
index 93c12b1a..75607789 100644
--- a/arm/v6/sha256-compress.asm
+++ b/arm/v6/sha256-compress.asm
@@ -31,6 +31,7 @@ ifelse(<
>)
.file "sha256-compress.asm"
+ .arch armv6
define(<STATE>, <r0>)
define(<INPUT>, <r1>)
diff --git a/asm.m4 b/asm.m4
index d59191e1..bbc90bd4 100644
--- a/asm.m4
+++ b/asm.m4
@@ -12,22 +12,25 @@ changecom()dnl
dnl Including files from the srcdir
define(<include_src>, <include(srcdir/$1)>)dnl
-dnl Pseudo ops
+dnl default definition, changed in fat builds
+define(<fat_transform>, <$1>)
+define(<C_NAME>, <SYMBOL_PREFIX<>fat_transform($1)>)
-define(<PROLOGUE>,
+dnl Pseudo ops
+define(<DECLARE_FUNC>,
<ifelse(ELF_STYLE,yes,
-<.globl C_NAME($1)
-.type C_NAME($1),TYPE_FUNCTION
-C_NAME($1):>,
-COFF_STYLE, <yes>,
-<.globl C_NAME($1)
-.def C_NAME($1)
+<.type $1,TYPE_FUNCTION>,
+COFF_STYLE, yes,
+<.def $1
.scl 2
.type 32
-.endef
-C_NAME($1):>,
+.endef>,
+<>)>)
+
+define(<PROLOGUE>,
<.globl C_NAME($1)
-C_NAME($1):>)>)
+DECLARE_FUNC(C_NAME($1))
+C_NAME($1):>)
define(<EPILOGUE>,
<ifelse(ELF_STYLE,yes,
diff --git a/cbc.h b/cbc.h
index bd0e2a22..93b2e739 100644
--- a/cbc.h
+++ b/cbc.h
@@ -64,14 +64,16 @@ memcpy((ctx)->iv, (data), sizeof((ctx)->iv))
/* NOTE: Avoid using NULL, as we don't include anything defining it. */
#define CBC_ENCRYPT(self, f, length, dst, src) \
-(0 ? ((f)(&(self)->ctx, 0, (void *)0, (void *)0)) \
+ (0 ? ((f)(&(self)->ctx, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0)) \
: cbc_encrypt((void *) &(self)->ctx, \
(nettle_cipher_func *) (f), \
sizeof((self)->iv), (self)->iv, \
(length), (dst), (src)))
#define CBC_DECRYPT(self, f, length, dst, src) \
-(0 ? ((f)(&(self)->ctx, 0, (void *)0, (void *)0)) \
+ (0 ? ((f)(&(self)->ctx, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0)) \
: cbc_decrypt((void *) &(self)->ctx, \
(nettle_cipher_func *) (f), \
sizeof((self)->iv), (self)->iv, \
diff --git a/chacha-poly1305.c b/chacha-poly1305.c
index 35c4bfe7..c5109b86 100644
--- a/chacha-poly1305.c
+++ b/chacha-poly1305.c
@@ -2,7 +2,7 @@
AEAD mechanism based on chacha and poly1305.
- Copyright (C) 2014 Niels Möller
+ Copyright (C) 2014, 2015 Niels Möller
This file is part of GNU Nettle.
@@ -31,6 +31,20 @@
not, see http://www.gnu.org/licenses/.
*/
+/* This implements chacha-poly1305 according to
+ draft-irtf-cfrg-chacha20-poly1305-08. The inputs to poly1305 are:
+
+ associated data
+ zero padding
+ ciphertext
+ zero padding
+ length of associated data (64-bit, little endian)
+ length of ciphertext (64-bit, little endian)
+
+ where the padding fields are 0-15 zero bytes, filling up to a
+ 16-byte boundary.
+*/
+
#if HAVE_CONFIG_H
# include "config.h"
#endif
@@ -62,7 +76,7 @@ chacha_poly1305_set_nonce (struct chacha_poly1305_ctx *ctx,
uint8_t subkey[32];
} u;
- chacha_set_nonce (&ctx->chacha, nonce);
+ chacha_set_nonce96 (&ctx->chacha, nonce);
/* Generate authentication key */
_chacha_core (u.x, ctx->chacha.state, CHACHA_ROUNDS);
poly1305_set_key (&ctx->poly1305, u.subkey);
@@ -84,6 +98,17 @@ poly1305_update (struct chacha_poly1305_ctx *ctx,
MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
}
+static void
+poly1305_pad (struct chacha_poly1305_ctx *ctx)
+{
+ if (ctx->index)
+ {
+ memset (ctx->block + ctx->index, 0,
+ POLY1305_BLOCK_SIZE - ctx->index);
+ _poly1305_block(&ctx->poly1305, ctx->block, 1);
+ ctx->index = 0;
+ }
+}
void
chacha_poly1305_update (struct chacha_poly1305_ctx *ctx,
size_t length, const uint8_t *data)
@@ -102,12 +127,8 @@ chacha_poly1305_encrypt (struct chacha_poly1305_ctx *ctx,
return;
assert (ctx->data_size % CHACHA_POLY1305_BLOCK_SIZE == 0);
- if (!ctx->data_size)
- {
- uint8_t buf[8];
- LE_WRITE_UINT64 (buf, ctx->auth_size);
- poly1305_update (ctx, sizeof(buf), buf);
- }
+ poly1305_pad (ctx);
+
chacha_crypt (&ctx->chacha, length, dst, src);
poly1305_update (ctx, length, dst);
ctx->data_size += length;
@@ -121,12 +142,8 @@ chacha_poly1305_decrypt (struct chacha_poly1305_ctx *ctx,
return;
assert (ctx->data_size % CHACHA_POLY1305_BLOCK_SIZE == 0);
- if (!ctx->data_size)
- {
- uint8_t buf[8];
- LE_WRITE_UINT64 (buf, ctx->auth_size);
- poly1305_update (ctx, sizeof(buf), buf);
- }
+ poly1305_pad (ctx);
+
poly1305_update (ctx, length, src);
chacha_crypt (&ctx->chacha, length, dst, src);
ctx->data_size += length;
@@ -136,27 +153,14 @@ void
chacha_poly1305_digest (struct chacha_poly1305_ctx *ctx,
size_t length, uint8_t *digest)
{
- uint8_t buf[8];
- if (!ctx->data_size)
- {
- LE_WRITE_UINT64 (buf, ctx->auth_size);
- poly1305_update (ctx, sizeof(buf), buf);
- }
- LE_WRITE_UINT64 (buf, ctx->data_size);
- poly1305_update (ctx, sizeof(buf), buf);
+ uint8_t buf[16];
- /* Final bytes. FIXME: Duplicated in poly1305_aes128.c */
- if (ctx->index > 0)
- {
- assert (ctx->index < POLY1305_BLOCK_SIZE);
+ poly1305_pad (ctx);
+ LE_WRITE_UINT64 (buf, ctx->auth_size);
+ LE_WRITE_UINT64 (buf + 8, ctx->data_size);
- ctx->block[ctx->index] = 1;
- memset (ctx->block + ctx->index + 1,
- 0, POLY1305_BLOCK_SIZE - 1 - ctx->index);
+ _poly1305_block (&ctx->poly1305, buf, 1);
- _poly1305_block (&ctx->poly1305, ctx->block, 0);
- }
-
poly1305_digest (&ctx->poly1305, &ctx->s);
memcpy (digest, &ctx->s.b, length);
}
diff --git a/chacha-poly1305.h b/chacha-poly1305.h
index 9c2688b1..ce40b77a 100644
--- a/chacha-poly1305.h
+++ b/chacha-poly1305.h
@@ -53,7 +53,7 @@ extern "C" {
#define CHACHA_POLY1305_BLOCK_SIZE 64
/* FIXME: Any need for 128-bit variant? */
#define CHACHA_POLY1305_KEY_SIZE 32
-#define CHACHA_POLY1305_NONCE_SIZE CHACHA_NONCE_SIZE
+#define CHACHA_POLY1305_NONCE_SIZE CHACHA_NONCE96_SIZE
#define CHACHA_POLY1305_DIGEST_SIZE 16
struct chacha_poly1305_ctx
diff --git a/chacha-set-nonce.c b/chacha-set-nonce.c
index e73babce..607f176b 100644
--- a/chacha-set-nonce.c
+++ b/chacha-set-nonce.c
@@ -59,3 +59,12 @@ chacha_set_nonce(struct chacha_ctx *ctx, const uint8_t *nonce)
ctx->state[14] = LE_READ_UINT32(nonce + 0);
ctx->state[15] = LE_READ_UINT32(nonce + 4);
}
+
+void
+chacha_set_nonce96(struct chacha_ctx *ctx, const uint8_t *nonce)
+{
+ ctx->state[12] = 0;
+ ctx->state[13] = LE_READ_UINT32(nonce + 0);
+ ctx->state[14] = LE_READ_UINT32(nonce + 4);
+ ctx->state[15] = LE_READ_UINT32(nonce + 8);
+}
diff --git a/chacha.h b/chacha.h
index 41df7076..3f082834 100644
--- a/chacha.h
+++ b/chacha.h
@@ -45,6 +45,7 @@ extern "C" {
/* Name mangling */
#define chacha_set_key nettle_chacha_set_key
#define chacha_set_nonce nettle_chacha_set_nonce
+#define chacha_set_nonce96 nettle_chacha_set_nonce96
#define chacha_crypt nettle_chacha_crypt
#define _chacha_core _nettle_chacha_core
@@ -52,6 +53,7 @@ extern "C" {
#define CHACHA_KEY_SIZE 32
#define CHACHA_BLOCK_SIZE 64
#define CHACHA_NONCE_SIZE 8
+#define CHACHA_NONCE96_SIZE 12
#define _CHACHA_STATE_LENGTH 16
@@ -78,6 +80,9 @@ void
chacha_set_nonce(struct chacha_ctx *ctx, const uint8_t *nonce);
void
+chacha_set_nonce96(struct chacha_ctx *ctx, const uint8_t *nonce);
+
+void
chacha_crypt(struct chacha_ctx *ctx, size_t length,
uint8_t *dst, const uint8_t *src);
diff --git a/config.m4.in b/config.m4.in
index 3f7700bb..e39c880c 100644
--- a/config.m4.in
+++ b/config.m4.in
@@ -1,5 +1,5 @@
define(<srcdir>, <<@srcdir@>>)dnl
-define(<C_NAME>, <@ASM_SYMBOL_PREFIX@><$1>)dnl
+define(<SYMBOL_PREFIX>, <@ASM_SYMBOL_PREFIX@><$1>)dnl
define(<ELF_STYLE>, <@ASM_ELF_STYLE@>)dnl
define(<COFF_STYLE>, <@ASM_COFF_STYLE@>)dnl
define(<TYPE_FUNCTION>, <@ASM_TYPE_FUNCTION@>)dnl
diff --git a/configure.ac b/configure.ac
index 5a366424..7892aef7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -68,10 +68,17 @@ AC_ARG_ENABLE(documentation,
AC_HELP_STRING([--disable-documentation], [Omit building and installing the documentation. (default=auto)]),,
[enable_documentation=auto])
+AC_ARG_ENABLE(fat, AC_HELP_STRING([--enable-fat], [Enable fat library build (default=no)]),,
+ [enable_fat=no])
+
AC_ARG_ENABLE(arm-neon,
AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
[enable_arm_neon=auto])
+AC_ARG_ENABLE(x86-aesni,
+ AC_HELP_STRING([--enable-x86-aesni], [Enable x86_64 aes instructions. (default=no)]),,
+ [enable_x86_aesni=no])
+
AC_ARG_ENABLE(mini-gmp,
AC_HELP_STRING([--enable-mini-gmp], [Enable mini-gmp, used instead of libgmp.]),,
[enable_mini_gmp=no])
@@ -94,6 +101,8 @@ LSH_RPATH_INIT([`echo $with_lib_path | sed 's/:/ /g'` \
# Checks for programs.
AC_PROG_CC
+NETTLE_CHECK_IFUNC
+
# When $CC foo.c -o foo creates both foo and foo.exe, autoconf picks
# up the foo.exe and sets exeext to .exe. That is correct for cygwin,
# which has some kind of magic link from foo to foo.exe, but not for
@@ -234,6 +243,8 @@ if test "x$ABI" != xstandard ; then
fi
fi
+OPT_NETTLE_SOURCES=""
+
# Select assembler code
asm_path=
if test "x$enable_assembler" = xyes ; then
@@ -244,6 +255,12 @@ if test "x$enable_assembler" = xyes ; then
[x86_64 | amd64])
if test "$ABI" = 64 ; then
asm_path=x86_64
+ if test "x$enable_fat" = xyes ; then
+ asm_path="x86_64/fat $asm_path"
+ OPT_NETTLE_SOURCES="fat-x86_64.c $OPT_NETTLE_SOURCES"
+ elif test "x$enable_x86_aesni" = xyes ; then
+ asm_path="x86_64/aesni $asm_path"
+ fi
else
asm_path=x86
fi
@@ -255,18 +272,25 @@ if test "x$enable_assembler" = xyes ; then
asm_path=sparc32
fi
;;
- armv6* | armv7*)
- NETTLE_CHECK_ARM_NEON
+ arm*)
+ asm_path=arm
+ if test "x$enable_fat" = xyes ; then
+ asm_path="arm/fat $asm_path"
+ OPT_NETTLE_SOURCES="fat-arm.c $OPT_NETTLE_SOURCES"
+ else
+ case "$host_cpu" in
+ armv6* | armv7*)
+ NETTLE_CHECK_ARM_NEON
- asm_path="arm/v6 arm"
+ asm_path="arm/v6 arm"
- if test "x$enable_arm_neon" = xyes ; then
- asm_path="arm/neon $asm_path"
+ if test "x$enable_arm_neon" = xyes ; then
+ asm_path="arm/neon $asm_path"
+ fi
+ ;;
+ esac
fi
;;
- arm*)
- asm_path=arm
- ;;
*)
enable_assembler=no
;;
@@ -277,7 +301,8 @@ fi
# to a new object file).
asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
arcfour-crypt.asm camellia-crypt-internal.asm \
- md5-compress.asm memxor.asm poly1305-internal.asm \
+ md5-compress.asm memxor.asm memxor3.asm \
+ poly1305-internal.asm \
chacha-core-internal.asm \
salsa20-crypt.asm salsa20-core-internal.asm \
serpent-encrypt.asm serpent-decrypt.asm \
@@ -285,15 +310,20 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used.
-asm_nettle_optional_list="gcm-hash8.asm"
+asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
+ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
+ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
+ sha3-permute-2.asm sha512-compress-2.asm \
+ umac-nh-n-2.asm umac-nh-2.asm"
+
asm_hogweed_optional_list=""
if test "x$enable_public_key" = "xyes" ; then
asm_hogweed_optional_list="ecc-192-modp.asm ecc-224-modp.asm \
ecc-25519-modp.asm ecc-256-redc.asm ecc-384-modp.asm ecc-521-modp.asm"
fi
-OPT_ASM_NETTLE_SOURCES=""
-OPT_ASM_HOGWEED_SOURCES=""
+OPT_NETTLE_OBJS=""
+OPT_HOGWEED_OBJS=""
asm_file_list=""
@@ -312,6 +342,8 @@ if test "x$enable_assembler" = xyes ; then
dnl Workaround for AC_CONFIG_LINKS, which complains if we use the
dnl same destination argument $tmp_f multiple times.
for tmp_n in $asm_nettle_optional_list ; do
+ dnl Note extra pair of [] in sed expression
+ tmp_b=`echo "$tmp_n" | sed 's/\.[[^.]]*$//'`
for asm_dir in $asm_path ; do
if test -f "$srcdir/$asm_dir/$tmp_n"; then
asm_file_list="$asm_file_list $tmp_n"
@@ -320,14 +352,16 @@ if test "x$enable_assembler" = xyes ; then
AC_DEFINE_UNQUOTED(HAVE_NATIVE_$tmp_func)
eval HAVE_NATIVE_$tmp_func=yes
done <<EOF
-[`sed -n 's/[^ ]*PROLOGUE(_*\(nettle_\)*\([^)]*\)).*$/\2/p' < "$srcdir/$asm_dir/$tmp_n"`]
+[`sed -n 's/^.*[^ ]*PROLOGUE(_*\(nettle_\)*\([^)]*\)).*$/\2/p' < "$srcdir/$asm_dir/$tmp_n"`]
EOF
- OPT_ASM_NETTLE_SOURCES="$OPT_ASM_NETTLE_SOURCES $tmp_n"
+ OPT_NETTLE_OBJS="$OPT_NETTLE_OBJS $tmp_b"'.$(OBJEXT)'
break
fi
done
done
for tmp_h in $asm_hogweed_optional_list ; do
+ dnl Note extra pair of [] in sed expression
+ tmp_b=`echo "$tmp_h" | sed 's/\.[[^.]]*$//'`
for asm_dir in $asm_path ; do
if test -f "$srcdir/$asm_dir/$tmp_h"; then
asm_file_list="$asm_file_list $tmp_h"
@@ -338,7 +372,7 @@ EOF
done <<EOF
[`sed -n 's/[^ ]*PROLOGUE(_*\(nettle_\)*\([^)]*\)).*$/\2/p' < "$srcdir/$asm_dir/$tmp_h"`]
EOF
- OPT_ASM_HOGWEED_SOURCES="$OPT_ASM_HOGWEED_SOURCES $tmp_h"
+ OPT_HOGWEED_OBJS="$OPT_HOGWEED_OBJS $tmp_b"'.$(OBJEXT)'
break
fi
done
@@ -358,9 +392,16 @@ EOF
esac
fi
-AC_SUBST([OPT_ASM_NETTLE_SOURCES])
-AC_SUBST([OPT_ASM_HOGWEED_SOURCES])
+AC_SUBST([OPT_NETTLE_OBJS])
+AC_SUBST([OPT_HOGWEED_OBJS])
+AC_SUBST([OPT_NETTLE_SOURCES])
AC_SUBST([ASM_RODATA])
+if test "x$enable_assembler" = xyes ; then
+ IF_ASM=''
+else
+ IF_ASM='#'
+fi
+AC_SUBST([IF_ASM])
AH_VERBATIM([HAVE_NATIVE],
[/* Define to 1 each of the following for which a native (ie. CPU specific)
@@ -376,7 +417,14 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_ecc_384_redc
#undef HAVE_NATIVE_ecc_521_modp
#undef HAVE_NATIVE_ecc_521_redc
-#undef HAVE_NATIVE_gcm_hash8])
+#undef HAVE_NATIVE_gcm_hash8
+#undef HAVE_NATIVE_salsa20_core
+#undef HAVE_NATIVE_sha1_compress
+#undef HAVE_NATIVE_sha256_compress
+#undef HAVE_NATIVE_sha512_compress
+#undef HAVE_NATIVE_sha3_permute
+#undef HAVE_NATIVE_umac_nh
+#undef HAVE_NATIVE_umac_nh_n])
if test "x$enable_pic" = xyes; then
LSH_CCPIC
@@ -652,8 +700,9 @@ AC_CHECK_HEADERS([valgrind/memcheck.h])
LSH_FUNC_ALLOCA
LSH_FUNC_STRERROR
-# Used in the testsuite
-AC_CHECK_FUNCS(getline)
+# getenv_secure is used for fat overrides,
+# getline is used in the testsuite
+AC_CHECK_FUNCS(secure_getenv getline)
AC_C_BIGENDIAN
LSH_GCC_ATTRIBUTES
diff --git a/ctr.h b/ctr.h
index b67fdc5d..7dd06a2d 100644
--- a/ctr.h
+++ b/ctr.h
@@ -57,7 +57,8 @@ ctr_crypt(const void *ctx, nettle_cipher_func *f,
memcpy((ctx)->ctr, (data), sizeof((ctx)->ctr))
#define CTR_CRYPT(self, f, length, dst, src) \
-(0 ? ((f)(&(self)->ctx, 0, NULL, NULL)) \
+ (0 ? ((f)(&(self)->ctx, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0)) \
: ctr_crypt((void *) &(self)->ctx, \
(nettle_cipher_func *) (f), \
sizeof((self)->ctr), (self)->ctr, \
diff --git a/eax.h b/eax.h
index 10749b4e..e9747f3c 100644
--- a/eax.h
+++ b/eax.h
@@ -114,36 +114,42 @@ eax_digest (struct eax_ctx *eax, const struct eax_key *key,
#define EAX_SET_KEY(ctx, set_key, encrypt, data) \
do { \
(set_key)(&(ctx)->cipher, (data)); \
- if (0) (encrypt) (&(ctx)->cipher, 0, (void *) 0, (void *) 0); \
+ if (0) (encrypt) (&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0); \
eax_set_key (&(ctx)->key, &(ctx)->cipher, (nettle_cipher_func *) encrypt); \
} while (0)
-#define EAX_SET_NONCE(ctx, encrypt, length, nonce) \
- (0 ? (encrypt) (&(ctx)->cipher, 0, (void *) 0, (void *) 0) \
+#define EAX_SET_NONCE(ctx, encrypt, length, nonce) \
+ (0 ? (encrypt) (&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0) \
: eax_set_nonce (&(ctx)->eax, &(ctx)->key, \
&(ctx)->cipher, (nettle_cipher_func *) (encrypt), \
(length), (nonce)))
#define EAX_UPDATE(ctx, encrypt, length, data) \
- (0 ? (encrypt) (&(ctx)->cipher, 0, (void *) 0, (void *) 0) \
+ (0 ? (encrypt) (&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0) \
: eax_update (&(ctx)->eax, &(ctx)->key, \
&(ctx)->cipher, (nettle_cipher_func *) (encrypt), \
(length), (data)))
#define EAX_ENCRYPT(ctx, encrypt, length, dst, src) \
- (0 ? (encrypt) (&(ctx)->cipher, 0, (void *) 0, (void *) 0) \
+ (0 ? (encrypt) (&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0) \
: eax_encrypt (&(ctx)->eax, &(ctx)->key, \
&(ctx)->cipher, (nettle_cipher_func *) (encrypt), \
(length), (dst), (src)))
#define EAX_DECRYPT(ctx, encrypt, length, dst, src) \
- (0 ? (encrypt) (&(ctx)->cipher, 0, (void *) 0, (void *) 0) \
+ (0 ? (encrypt) (&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0) \
: eax_decrypt (&(ctx)->eax, &(ctx)->key, \
&(ctx)->cipher, (nettle_cipher_func *) (encrypt), \
(length), (dst), (src)))
#define EAX_DIGEST(ctx, encrypt, length, digest) \
- (0 ? (encrypt) (&(ctx)->cipher, 0, (void *) 0, (void *) 0) \
+ (0 ? (encrypt) (&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0) \
: eax_digest (&(ctx)->eax, &(ctx)->key, \
&(ctx)->cipher, (nettle_cipher_func *) (encrypt), \
(length), (digest)))
diff --git a/ecc-add-eh.c b/ecc-add-eh.c
index 34b39f03..a16be4cb 100644
--- a/ecc-add-eh.c
+++ b/ecc-add-eh.c
@@ -36,12 +36,6 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_add_eh_itch (const struct ecc_curve *ecc)
-{
- return ECC_ADD_EH_ITCH (ecc->p.size);
-}
-
/* Add two points on an Edwards curve, with result and first point in
homogeneous coordinates. */
void
diff --git a/ecc-add-ehh.c b/ecc-add-ehh.c
index 46a91492..8fdc9ec3 100644
--- a/ecc-add-ehh.c
+++ b/ecc-add-ehh.c
@@ -36,12 +36,6 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_add_ehh_itch (const struct ecc_curve *ecc)
-{
- return ECC_ADD_EHH_ITCH (ecc->p.size);
-}
-
/* Add two points on an Edwards curve, in homogeneous coordinates */
void
ecc_add_ehh (const struct ecc_curve *ecc,
@@ -61,25 +55,25 @@ ecc_add_ehh (const struct ecc_curve *ecc,
#define z3 (r + 2*ecc->p.size)
/* Formulas (from djb,
- http://www.hyperelliptic.org/EFD/g1p/auto-edwards-projective.html#doubling-dbl-2007-bl):
+ http://www.hyperelliptic.org/EFD/g1p/auto-edwards-projective.html#addition-add-2007-bl):
Computation Operation Live variables
C = x1*x2 mul C
D = y1*y2 mul C, D
- T = (x1+y1)(x2+y2) - C - D C, D, T
+ T = (x1+y1)(x2+y2) - C - D, mul C, D, T
E = b*C*D 2 mul C, E, T (Replace C <-- D - C)
A = z1*z2 mul A, C, E, T
B = A^2 sqr A, B, C, E, T
F = B - E A, B, C, E, F, T
G = B + E A, C, F, G, T
- x3 = A*F*T 3 mul A, C, G
+ x3 = A*F*T 2 mul A, C, G
y3 = A*G*(D-C) 2 mul F, G
z3 = F*G mul
- But when working with the twist curve, we need to the factor
- x1*x2. We need to switch sign in y3 expressions, and swap F and
- G.
+ But when working with the twist curve, we have to negate the
+ factor C = x1*x2. We change subtract to add in the y3
+ expression, and swap F and G.
*/
#define C scratch
#define D (scratch + ecc->p.size)
diff --git a/ecc-add-jja.c b/ecc-add-jja.c
index 40f5a0cf..9b5cab9d 100644
--- a/ecc-add-jja.c
+++ b/ecc-add-jja.c
@@ -49,12 +49,6 @@
+ p = q ==> r = 0, invalid
*/
-mp_size_t
-ecc_add_jja_itch (const struct ecc_curve *ecc)
-{
- return ECC_ADD_JJA_ITCH (ecc->p.size);
-}
-
void
ecc_add_jja (const struct ecc_curve *ecc,
mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
diff --git a/ecc-add-jjj.c b/ecc-add-jjj.c
index d298b517..1143e79a 100644
--- a/ecc-add-jjj.c
+++ b/ecc-add-jjj.c
@@ -38,13 +38,6 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_add_jjj_itch (const struct ecc_curve *ecc)
-{
- /* Needs 8 * ecc->p.size */
- return ECC_ADD_JJJ_ITCH (ecc->p.size);
-}
-
void
ecc_add_jjj (const struct ecc_curve *ecc,
mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
diff --git a/ecc-dup-eh.c b/ecc-dup-eh.c
index ab6b8418..2a5c5a07 100644
--- a/ecc-dup-eh.c
+++ b/ecc-dup-eh.c
@@ -36,12 +36,6 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_dup_eh_itch (const struct ecc_curve *ecc)
-{
- return ECC_DUP_EH_ITCH (ecc->p.size);
-}
-
/* Double a point on an Edwards curve, in homogeneous coordinates */
void
ecc_dup_eh (const struct ecc_curve *ecc,
diff --git a/ecc-dup-jj.c b/ecc-dup-jj.c
index 7466976a..8e1cf36c 100644
--- a/ecc-dup-jj.c
+++ b/ecc-dup-jj.c
@@ -42,12 +42,6 @@
+ p = 0 ==> r = 0, correct!
*/
-mp_size_t
-ecc_dup_jj_itch (const struct ecc_curve *ecc)
-{
- return ECC_DUP_JJ_ITCH (ecc->p.size);
-}
-
void
ecc_dup_jj (const struct ecc_curve *ecc,
mp_limb_t *r, const mp_limb_t *p,
diff --git a/ecc-eh-to-a.c b/ecc-eh-to-a.c
index 95f30a7c..2acaacb1 100644
--- a/ecc-eh-to-a.c
+++ b/ecc-eh-to-a.c
@@ -38,14 +38,6 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_eh_to_a_itch (const struct ecc_curve *ecc)
-{
- /* Needs 2*ecc->p.size + scratch for ecc_modq_inv */
- return ECC_EH_TO_A_ITCH (ecc->p.size, ecc->p.invert_itch);
-}
-
-
/* Convert from homogeneous coordinates on the Edwards curve to affine
coordinates. */
void
diff --git a/ecc-internal.h b/ecc-internal.h
index f3a21bc8..5e0a94d9 100644
--- a/ecc-internal.h
+++ b/ecc-internal.h
@@ -53,6 +53,19 @@
#define ecc_mod _nettle_ecc_mod
#define ecc_mod_inv _nettle_ecc_mod_inv
#define ecc_hash _nettle_ecc_hash
+#define ecc_a_to_j _nettle_ecc_a_to_j
+#define ecc_j_to_a _nettle_ecc_j_to_a
+#define ecc_eh_to_a _nettle_ecc_eh_to_a
+#define ecc_dup_jj _nettle_ecc_dup_jj
+#define ecc_add_jja _nettle_ecc_add_jja
+#define ecc_add_jjj _nettle_ecc_add_jjj
+#define ecc_dup_eh _nettle_ecc_dup_eh
+#define ecc_add_eh _nettle_ecc_add_eh
+#define ecc_add_ehh _nettle_ecc_add_ehh
+#define ecc_mul_g _nettle_ecc_mul_g
+#define ecc_mul_a _nettle_ecc_mul_a
+#define ecc_mul_g_eh _nettle_ecc_mul_g_eh
+#define ecc_mul_a_eh _nettle_ecc_mul_a_eh
#define cnd_copy _nettle_cnd_copy
#define sec_add_1 _nettle_sec_add_1
#define sec_sub_1 _nettle_sec_sub_1
@@ -248,6 +261,102 @@ ecc_hash (const struct ecc_modulo *m,
mp_limb_t *hp,
size_t length, const uint8_t *digest);
+/* Converts a point P in affine coordinates into a point R in jacobian
+ coordinates. */
+void
+ecc_a_to_j (const struct ecc_curve *ecc,
+ mp_limb_t *r, const mp_limb_t *p);
+
+/* Converts a point P in jacobian coordinates into a point R in affine
+ coordinates. If op == 1, produce x coordinate only. If op == 2,
+ produce the x coordiante only, and in also it modulo q. FIXME: For
+ the public interface, have separate for the three cases, and use
+ this flag argument only for the internal ecc->h_to_a function. */
+void
+ecc_j_to_a (const struct ecc_curve *ecc,
+ int op,
+ mp_limb_t *r, const mp_limb_t *p,
+ mp_limb_t *scratch);
+
+/* Converts a point P on an Edwards curve to affine coordinates on
+ the corresponding Montgomery curve. */
+void
+ecc_eh_to_a (const struct ecc_curve *ecc,
+ int op,
+ mp_limb_t *r, const mp_limb_t *p,
+ mp_limb_t *scratch);
+
+/* Group operations */
+
+/* Point doubling, with jacobian input and output. Corner cases:
+ Correctly sets R = 0 (r_Z = 0) if p = 0 or 2p = 0. */
+void
+ecc_dup_jj (const struct ecc_curve *ecc,
+ mp_limb_t *r, const mp_limb_t *p,
+ mp_limb_t *scratch);
+
+/* Point addition, with jacobian output, one jacobian input and one
+ affine input. Corner cases: Fails for the cases
+
+ P = Q != 0 Duplication of non-zero point
+ P = 0, Q != 0 or P != 0, Q = 0 One input zero
+
+ Correctly gives R = 0 if P = Q = 0 or P = -Q. */
+void
+ecc_add_jja (const struct ecc_curve *ecc,
+ mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
+ mp_limb_t *scratch);
+
+/* Point addition with Jacobian input and output. */
+void
+ecc_add_jjj (const struct ecc_curve *ecc,
+ mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
+ mp_limb_t *scratch);
+
+/* Point doubling on an Edwards curve, with homogeneous
+ cooordinates. */
+void
+ecc_dup_eh (const struct ecc_curve *ecc,
+ mp_limb_t *r, const mp_limb_t *p,
+ mp_limb_t *scratch);
+
+void
+ecc_add_eh (const struct ecc_curve *ecc,
+ mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
+ mp_limb_t *scratch);
+
+void
+ecc_add_ehh (const struct ecc_curve *ecc,
+ mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
+ mp_limb_t *scratch);
+
+/* Computes N * the group generator. N is an array of ecc_size()
+ limbs. It must be in the range 0 < N < group order, then R != 0,
+ and the algorithm can work without any intermediate values getting
+ to zero. */
+void
+ecc_mul_g (const struct ecc_curve *ecc, mp_limb_t *r,
+ const mp_limb_t *np, mp_limb_t *scratch);
+
+/* Computes N * P. The scalar N is the same as for ecc_mul_g. P is a
+ non-zero point on the curve, in affine coordinates. Output R is a
+ non-zero point, in Jacobian coordinates. */
+void
+ecc_mul_a (const struct ecc_curve *ecc,
+ mp_limb_t *r,
+ const mp_limb_t *np, const mp_limb_t *p,
+ mp_limb_t *scratch);
+
+void
+ecc_mul_g_eh (const struct ecc_curve *ecc, mp_limb_t *r,
+ const mp_limb_t *np, mp_limb_t *scratch);
+
+void
+ecc_mul_a_eh (const struct ecc_curve *ecc,
+ mp_limb_t *r,
+ const mp_limb_t *np, const mp_limb_t *p,
+ mp_limb_t *scratch);
+
void
cnd_copy (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n);
diff --git a/ecc-j-to-a.c b/ecc-j-to-a.c
index 2e48b94d..eca10f0f 100644
--- a/ecc-j-to-a.c
+++ b/ecc-j-to-a.c
@@ -38,13 +38,6 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_j_to_a_itch (const struct ecc_curve *ecc)
-{
- /* Needs 2*ecc->size + scratch for ecc_modq_inv */
- return ECC_J_TO_A_ITCH (ecc->p.size);
-}
-
void
ecc_j_to_a (const struct ecc_curve *ecc,
int op,
diff --git a/ecc-mul-a-eh.c b/ecc-mul-a-eh.c
index 2e273349..cf743236 100644
--- a/ecc-mul-a-eh.c
+++ b/ecc-mul-a-eh.c
@@ -38,17 +38,12 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_mul_a_eh_itch (const struct ecc_curve *ecc)
-{
- /* Binary algorithm needs 6*ecc->p.size + scratch for ecc_add_ehh,
- total 13 ecc->p.size
+/* Binary algorithm needs 6*ecc->p.size + scratch for ecc_add_ehh,
+ total 13 ecc->p.size
- Window algorithm needs (3<<w) * ecc->p.size for the table,
- 3*ecc->p.size for a temporary point, and scratch for
- ecc_add_ehh. */
- return ECC_MUL_A_EH_ITCH (ecc->p.size);
-}
+ Window algorithm needs (3<<w) * ecc->p.size for the table,
+ 3*ecc->p.size for a temporary point, and scratch for
+ ecc_add_ehh. */
#if ECC_MUL_A_EH_WBITS == 0
void
diff --git a/ecc-mul-a.c b/ecc-mul-a.c
index 9b2be3d8..cb9c7d41 100644
--- a/ecc-mul-a.c
+++ b/ecc-mul-a.c
@@ -40,17 +40,12 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_mul_a_itch (const struct ecc_curve *ecc)
-{
- /* Binary algorithm needs 6*ecc->p.size + scratch for ecc_add_jja.
- Current total is 12 ecc->p.size, at most 864 bytes.
+/* Binary algorithm needs 6*ecc->p.size + scratch for ecc_add_jja.
+ Current total is 12 ecc->p.size, at most 864 bytes.
- Window algorithm needs (3<<w) * ecc->p.size for the table,
- 3*ecc->p.size for a temporary point, and scratch for
- ecc_add_jjj. */
- return ECC_MUL_A_ITCH (ecc->p.size);
-}
+ Window algorithm needs (3<<w) * ecc->p.size for the table,
+ 3*ecc->p.size for a temporary point, and scratch for
+ ecc_add_jjj. */
#if ECC_MUL_A_WBITS == 0
void
diff --git a/ecc-mul-g-eh.c b/ecc-mul-g-eh.c
index fc0f565c..a945494d 100644
--- a/ecc-mul-g-eh.c
+++ b/ecc-mul-g-eh.c
@@ -40,13 +40,6 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_mul_g_eh_itch (const struct ecc_curve *ecc)
-{
- /* Needs 3*ecc->p.size + scratch for ecc_add_jja. */
- return ECC_MUL_G_EH_ITCH (ecc->p.size);
-}
-
void
ecc_mul_g_eh (const struct ecc_curve *ecc, mp_limb_t *r,
const mp_limb_t *np, mp_limb_t *scratch)
diff --git a/ecc-mul-g.c b/ecc-mul-g.c
index b2dcb404..c4a1b5bb 100644
--- a/ecc-mul-g.c
+++ b/ecc-mul-g.c
@@ -40,13 +40,6 @@
#include "ecc.h"
#include "ecc-internal.h"
-mp_size_t
-ecc_mul_g_itch (const struct ecc_curve *ecc)
-{
- /* Needs 3*ecc->p.size + scratch for ecc_add_jja. */
- return ECC_MUL_G_ITCH (ecc->p.size);
-}
-
void
ecc_mul_g (const struct ecc_curve *ecc, mp_limb_t *r,
const mp_limb_t *np, mp_limb_t *scratch)
diff --git a/ecc.h b/ecc.h
index a003a1ed..c67ccdc3 100644
--- a/ecc.h
+++ b/ecc.h
@@ -58,31 +58,6 @@ extern "C" {
#define ecc_size nettle_ecc_size
#define ecc_size_a nettle_ecc_size_a
#define ecc_size_j nettle_ecc_size_j
-#define ecc_a_to_j nettle_ecc_a_to_j
-#define ecc_j_to_a_itch nettle_ecc_j_to_a_itch
-#define ecc_j_to_a nettle_ecc_j_to_a
-#define ecc_eh_to_a_itch nettle_ecc_eh_to_a_itch
-#define ecc_eh_to_a nettle_ecc_eh_to_a
-#define ecc_dup_jj_itch nettle_ecc_dup_jj_itch
-#define ecc_dup_jj nettle_ecc_dup_jj
-#define ecc_add_jja_itch nettle_ecc_add_jja_itch
-#define ecc_add_jja nettle_ecc_add_jja
-#define ecc_add_jjj_itch nettle_ecc_add_jjj_itch
-#define ecc_add_jjj nettle_ecc_add_jjj
-#define ecc_dup_eh_itch nettle_ecc_dup_eh_itch
-#define ecc_dup_eh nettle_ecc_dup_eh
-#define ecc_add_eh_itch nettle_ecc_add_eh_itch
-#define ecc_add_eh nettle_ecc_add_eh
-#define ecc_add_ehh_itch nettle_ecc_add_ehh_itch
-#define ecc_add_ehh nettle_ecc_add_ehh
-#define ecc_mul_g_itch nettle_ecc_mul_g_itch
-#define ecc_mul_g nettle_ecc_mul_g
-#define ecc_mul_a_itch nettle_ecc_mul_a_itch
-#define ecc_mul_a nettle_ecc_mul_a
-#define ecc_mul_g_eh_itch nettle_ecc_mul_g_eh_itch
-#define ecc_mul_g_eh nettle_ecc_mul_g_eh
-#define ecc_mul_a_eh_itch nettle_ecc_mul_a_eh_itch
-#define ecc_mul_a_eh nettle_ecc_mul_a_eh
struct ecc_curve;
@@ -174,133 +149,8 @@ ecc_size_a (const struct ecc_curve *ecc);
mp_size_t
ecc_size_j (const struct ecc_curve *ecc);
-/* FIXME: Rename the low-level (and side-channel silent) functions to
- _ecc_*, and provide public ecc_* functions which handle the
- infinity points properly? */
-
-/* Converts a point P in affine coordinates into a point R in jacobian
- coordinates. */
-void
-ecc_a_to_j (const struct ecc_curve *ecc,
- mp_limb_t *r, const mp_limb_t *p);
-
-/* Converts a point P in jacobian coordinates into a point R in affine
- coordinates. If op == 1, produce x coordinate only. If op == 2,
- produce the x coordiante only, and in also it modulo q. FIXME: For
- the public interface, have separate for the three cases, and use
- this flag argument only for the internal ecc->h_to_a function. */
-mp_size_t
-ecc_j_to_a_itch (const struct ecc_curve *ecc);
-void
-ecc_j_to_a (const struct ecc_curve *ecc,
- int op,
- mp_limb_t *r, const mp_limb_t *p,
- mp_limb_t *scratch);
-
-/* Converts a point P on an Edwards curve to affine coordinates on
- the corresponding Montgomery curve. */
-
-mp_size_t
-ecc_eh_to_a_itch (const struct ecc_curve *ecc);
-void
-ecc_eh_to_a (const struct ecc_curve *ecc,
- int op,
- mp_limb_t *r, const mp_limb_t *p,
- mp_limb_t *scratch);
-
-/* Group operations */
-
-/* Point doubling, with jacobian input and output. Corner cases:
- Correctly sets R = 0 (r_Z = 0) if p = 0 or 2p = 0. */
-mp_size_t
-ecc_dup_jj_itch (const struct ecc_curve *ecc);
-void
-ecc_dup_jj (const struct ecc_curve *ecc,
- mp_limb_t *r, const mp_limb_t *p,
- mp_limb_t *scratch);
-
-
-/* Point addition, with jacobian output, one jacobian input and one
- affine input. Corner cases: Fails for the cases
-
- P = Q != 0 Duplication of non-zero point
- P = 0, Q != 0 or P != 0, Q = 0 One input zero
-
- Correctly gives R = 0 if P = Q = 0 or P = -Q. */
-mp_size_t
-ecc_add_jja_itch (const struct ecc_curve *ecc);
-void
-ecc_add_jja (const struct ecc_curve *ecc,
- mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
- mp_limb_t *scratch);
-
-/* Point addition with Jacobian input and output. */
-mp_size_t
-ecc_add_jjj_itch (const struct ecc_curve *ecc);
-void
-ecc_add_jjj (const struct ecc_curve *ecc,
- mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
- mp_limb_t *scratch);
-
-/* FIXME: Use a generic ecc_dup, ecc_add, for any type of curve. */
-/* Point doubling on an Edwards curve, with homogeneous
- cooordinates. */
-mp_size_t
-ecc_dup_eh_itch (const struct ecc_curve *ecc);
-void
-ecc_dup_eh (const struct ecc_curve *ecc,
- mp_limb_t *r, const mp_limb_t *p,
- mp_limb_t *scratch);
-
-mp_size_t
-ecc_add_eh_itch (const struct ecc_curve *ecc);
-void
-ecc_add_eh (const struct ecc_curve *ecc,
- mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
- mp_limb_t *scratch);
-
-mp_size_t
-ecc_add_ehh_itch (const struct ecc_curve *ecc);
-void
-ecc_add_ehh (const struct ecc_curve *ecc,
- mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
- mp_limb_t *scratch);
-
-/* Computes N * the group generator. N is an array of ecc_size()
- limbs. It must be in the range 0 < N < group order, then R != 0,
- and the algorithm can work without any intermediate values getting
- to zero. */
-mp_size_t
-ecc_mul_g_itch (const struct ecc_curve *ecc);
-void
-ecc_mul_g (const struct ecc_curve *ecc, mp_limb_t *r,
- const mp_limb_t *np, mp_limb_t *scratch);
-
-/* Computes N * P. The scalar N is the same as for ecc_mul_g. P is a
- non-zero point on the curve, in affine coordinates. Output R is a
- non-zero point, in Jacobian coordinates. */
-mp_size_t
-ecc_mul_a_itch (const struct ecc_curve *ecc);
-void
-ecc_mul_a (const struct ecc_curve *ecc,
- mp_limb_t *r,
- const mp_limb_t *np, const mp_limb_t *p,
- mp_limb_t *scratch);
-
-mp_size_t
-ecc_mul_g_eh_itch (const struct ecc_curve *ecc);
-void
-ecc_mul_g_eh (const struct ecc_curve *ecc, mp_limb_t *r,
- const mp_limb_t *np, mp_limb_t *scratch);
-
-mp_size_t
-ecc_mul_a_eh_itch (const struct ecc_curve *ecc);
-void
-ecc_mul_a_eh (const struct ecc_curve *ecc,
- mp_limb_t *r,
- const mp_limb_t *np, const mp_limb_t *p,
- mp_limb_t *scratch);
-
+/* FIXME: Define a generic ecc_dup, ecc_add, for any type of curve. Do
+ they need to handle infinity points? */
#ifdef __cplusplus
}
diff --git a/fat-arm.c b/fat-arm.c
new file mode 100644
index 00000000..1156499d
--- /dev/null
+++ b/fat-arm.c
@@ -0,0 +1,267 @@
+/* fat-arm.c
+
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "nettle-types.h"
+
+#include "aes-internal.h"
+#include "fat-setup.h"
+
+struct arm_features
+{
+ /* /proc/cpuinfo "CPU Architecture" doesn't correspond exactly to
+ ARM architecture version, but it's good enough for our purposes.
+ Will be set to 5, 6, 7 or 8. */
+ unsigned arch_version;
+ int have_neon;
+};
+
+#define SKIP(s, slen, literal, llen) \
+ (((slen) >= (llen) && memcmp ((s), (literal), llen) == 0) \
+ ? ((slen) -= (llen), (s) += (llen), 1) : 0)
+#define MATCH(s, slen, literal, llen) \
+ ((slen) == (llen) && memcmp ((s), (literal), llen) == 0)
+
+static void
+get_arm_features (struct arm_features *features)
+{
+ const char *s;
+ features->arch_version = 5;
+ features->have_neon = 0;
+
+ s = secure_getenv (ENV_OVERRIDE);
+ if (s)
+ for (;;)
+ {
+ const char *sep = strchr (s, ',');
+ size_t length = sep ? (size_t) (sep - s) : strlen(s);
+
+ if (SKIP (s, length, "arch:", 5))
+ {
+ if (length == 1 && *s >= '0' && *s <= '9')
+ features->arch_version = *s - '0';
+ }
+ else if (MATCH (s, length, "neon", 4))
+ features->have_neon = 1;
+ if (!sep)
+ break;
+ s = sep + 1;
+ }
+ else
+ {
+ FILE *f;
+ char line[200];
+ int seen_arch = 0;
+ int seen_features = 0;
+
+ f = fopen ("/proc/cpuinfo", "r");
+ if (!f)
+ return;
+ while (seen_features + seen_arch < 2
+ && fgets (line, sizeof(line), f))
+ {
+ char *sep;
+ char *p;
+ sep = strchr (line, ':');
+ if (!sep)
+ continue;
+ for (p = sep; p - line > 0 && p[-1] == '\t'; p--)
+ ;
+
+ *p = '\0';
+ p = sep+1;
+
+ if (strcmp (line, "Features") == 0)
+ {
+ features->have_neon = (strstr (p, " neon ") != NULL);
+ seen_features = 1;
+ }
+ else if (strcmp (line, "CPU architecture") == 0)
+ {
+ /* Don't use strtol, since it's locale dependent. */
+ while (p[0] == ' ')
+ p++;
+ if (p[0] > '5' && p[0] <= '9')
+ features->arch_version = p[0] - '0';
+ else if (strcmp (p, "AArch64") == 0)
+ features->arch_version = 8;
+ seen_arch = 1;
+ }
+ }
+ if (features->arch_version >= 8)
+ {
+ /* Neon is not required, and maybe not listed in feature flags */
+ features->have_neon = 1;
+ }
+ fclose (f);
+ }
+}
+
+DECLARE_FAT_FUNC(_nettle_aes_encrypt, aes_crypt_internal_func)
+DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, arm)
+DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, armv6)
+
+DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
+DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, arm)
+DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, armv6)
+
+DECLARE_FAT_FUNC(_nettle_salsa20_core, salsa20_core_func)
+DECLARE_FAT_FUNC_VAR(salsa20_core, salsa20_core_func, c)
+DECLARE_FAT_FUNC_VAR(salsa20_core, salsa20_core_func, neon)
+
+DECLARE_FAT_FUNC(_nettle_sha1_compress, sha1_compress_func)
+DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, c)
+DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, armv6)
+
+DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
+DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, armv6)
+
+DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c)
+DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, neon)
+
+DECLARE_FAT_FUNC(nettle_sha3_permute, sha3_permute_func)
+DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, c)
+DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, neon)
+
+DECLARE_FAT_FUNC(_nettle_umac_nh, umac_nh_func)
+DECLARE_FAT_FUNC_VAR(umac_nh, umac_nh_func, c);
+DECLARE_FAT_FUNC_VAR(umac_nh, umac_nh_func, neon);
+
+DECLARE_FAT_FUNC(_nettle_umac_nh_n, umac_nh_n_func)
+DECLARE_FAT_FUNC_VAR(umac_nh_n, umac_nh_n_func, c);
+DECLARE_FAT_FUNC_VAR(umac_nh_n, umac_nh_n_func, neon);
+
+static void CONSTRUCTOR
+fat_init (void)
+{
+ struct arm_features features;
+ int verbose;
+
+ get_arm_features (&features);
+
+ verbose = getenv (ENV_VERBOSE) != NULL;
+ if (verbose)
+ fprintf (stderr, "libnettle: cpu features: arch:%d%s\n",
+ features.arch_version,
+ features.have_neon ? ",neon" : "");
+
+ if (features.arch_version >= 6)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: enabling armv6 code.\n");
+ _nettle_aes_encrypt_vec = _nettle_aes_encrypt_armv6;
+ _nettle_aes_decrypt_vec = _nettle_aes_decrypt_armv6;
+ _nettle_sha1_compress_vec = _nettle_sha1_compress_armv6;
+ _nettle_sha256_compress_vec = _nettle_sha256_compress_armv6;
+ }
+ else
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: not enabling armv6 code.\n");
+ _nettle_aes_encrypt_vec = _nettle_aes_encrypt_arm;
+ _nettle_aes_decrypt_vec = _nettle_aes_decrypt_arm;
+ _nettle_sha1_compress_vec = _nettle_sha1_compress_c;
+ _nettle_sha256_compress_vec = _nettle_sha256_compress_c;
+ }
+ if (features.have_neon)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: enabling neon code.\n");
+ _nettle_salsa20_core_vec = _nettle_salsa20_core_neon;
+ _nettle_sha512_compress_vec = _nettle_sha512_compress_neon;
+ nettle_sha3_permute_vec = _nettle_sha3_permute_neon;
+ _nettle_umac_nh_vec = _nettle_umac_nh_neon;
+ _nettle_umac_nh_n_vec = _nettle_umac_nh_n_neon;
+ }
+ else
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: not enabling neon code.\n");
+ _nettle_salsa20_core_vec = _nettle_salsa20_core_c;
+ _nettle_sha512_compress_vec = _nettle_sha512_compress_c;
+ nettle_sha3_permute_vec = _nettle_sha3_permute_c;
+ _nettle_umac_nh_vec = _nettle_umac_nh_c;
+ _nettle_umac_nh_n_vec = _nettle_umac_nh_n_c;
+ }
+}
+
+DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
+ (unsigned rounds, const uint32_t *keys,
+ const struct aes_table *T,
+ size_t length, uint8_t *dst,
+ const uint8_t *src),
+ (rounds, keys, T, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
+ (unsigned rounds, const uint32_t *keys,
+ const struct aes_table *T,
+ size_t length, uint8_t *dst,
+ const uint8_t *src),
+ (rounds, keys, T, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_salsa20_core, void,
+ (uint32_t *dst, const uint32_t *src, unsigned rounds),
+ (dst, src, rounds))
+
+DEFINE_FAT_FUNC(_nettle_sha1_compress, void,
+ (uint32_t *state, const uint8_t *input),
+ (state, input))
+
+DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
+ (uint32_t *state, const uint8_t *input, const uint32_t *k),
+ (state, input, k))
+
+DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
+ (uint64_t *state, const uint8_t *input, const uint64_t *k),
+ (state, input, k))
+
+DEFINE_FAT_FUNC(nettle_sha3_permute, void,
+ (struct sha3_state *state), (state))
+
+DEFINE_FAT_FUNC(_nettle_umac_nh, uint64_t,
+ (const uint32_t *key, unsigned length, const uint8_t *msg),
+ (key, length, msg))
+
+DEFINE_FAT_FUNC(_nettle_umac_nh_n, void,
+ (uint64_t *out, unsigned n, const uint32_t *key,
+ unsigned length, const uint8_t *msg),
+ (out, n, key, length, msg))
+
diff --git a/fat-setup.h b/fat-setup.h
new file mode 100644
index 00000000..09e8c371
--- /dev/null
+++ b/fat-setup.h
@@ -0,0 +1,167 @@
+/* fat-setup.h
+
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+*/
+
+/* Fat library initialization works as follows. The main function is
+ fat_init. We try to do initialization only once, but since it is
+ idempotent, there's no harm if it is in some cases called multiple
+ times from several threads. For correctness, we rely on atomic
+ writes, but not on memory barriers or any other synchronization
+ mechanism.
+
+ The fat_init function checks the cpuid flags, and sets function
+ pointers, e.g, _nettle_aes_encrypt_vec, to point to the appropriate
+ implementation.
+
+ To get everything hooked in, we use a belt-and-suspenders approach.
+
+ We try to register fat_init as a constructor function to be called
+ at load time. If this is unavailable or non-working, we instead
+ arrange fat_init to be called lazily.
+
+ For the actual indirection, there are two cases.
+
+ * If ifunc support is available, function pointers are statically
+ initialized to NULL, and we register resolver functions, e.g.,
+ _nettle_aes_encrypt_resolve, which call fat_init, and then return
+ the function pointer, e.g., the value of _nettle_aes_encrypt_vec.
+
+ * If ifunc is not available, we have to define a wrapper function
+ to jump via the function pointer. (FIXME: For internal calls, we
+ could do this as a macro).
+
+ We statically initialize each function pointer to point to a
+ special initialization function, e.g., _nettle_aes_encrypt_init,
+ which calls fat_init, and then invokes the right function. This
+ way, all pointers are setup correctly at the first call to any
+ fat function.
+
+ And atomic writes are required for correctness in the case that
+ several threads do "first call to any fat function" at the same
+ time.
+*/
+
+#if HAVE_GCC_ATTRIBUTE
+# define CONSTRUCTOR __attribute__ ((constructor))
+#else
+# define CONSTRUCTOR
+# if defined (__sun)
+# pragma init(fat_init)
+# endif
+#endif
+
+#if !HAVE_SECURE_GETENV
+#define secure_getenv(s) NULL
+#endif
+
+#define ENV_VERBOSE "NETTLE_FAT_VERBOSE"
+#define ENV_OVERRIDE "NETTLE_FAT_OVERRIDE"
+
+/* DECLARE_FAT_FUNC(name, ftype)
+ *
+ * name is the public function, e.g., _nettle_aes_encrypt.
+ * ftype is its type, e.g., aes_crypt_internal_func.
+ *
+ * DECLARE_FAT_VAR(name, type, var)
+ *
+ * name is name without _nettle prefix.
+ * type is its type.
+ * var is the variant, used as a suffix on the symbol name.
+ *
+ * DEFINE_FAT_FUNC(name, rtype, prototype, args)
+ *
+ * name is the public function.
+ * rtype its return type.
+ * prototype is the list of formal arguments, with types.
+ * args contain the argument list without any types.
+ */
+
+#if HAVE_LINK_IFUNC
+#define IFUNC(resolve) __attribute__ ((ifunc (resolve)))
+#define DECLARE_FAT_FUNC(name, ftype) \
+ ftype name IFUNC(#name"_resolve"); \
+ static ftype *name##_vec = NULL;
+
+#define DEFINE_FAT_FUNC(name, rtype, prototype, args) \
+ static void_func * name##_resolve(void) \
+ { \
+ if (getenv (ENV_VERBOSE)) \
+ fprintf (stderr, "libnettle: "#name"_resolve\n"); \
+ if (!name##_vec) \
+ fat_init(); \
+ return (void_func *) name##_vec; \
+ }
+
+#else /* !HAVE_LINK_IFUNC */
+#define DECLARE_FAT_FUNC(name, ftype) \
+ ftype name; \
+ static ftype name##_init; \
+ static ftype *name##_vec = name##_init;
+
+#define DEFINE_FAT_FUNC(name, rtype, prototype, args) \
+ rtype name prototype \
+ { \
+ return name##_vec args; \
+ } \
+ static rtype name##_init prototype { \
+ if (getenv (ENV_VERBOSE)) \
+ fprintf (stderr, "libnettle: "#name"_init\n"); \
+ if (name##_vec == name##_init) \
+ fat_init(); \
+ assert (name##_vec != name##_init); \
+ return name##_vec args; \
+ }
+#endif /* !HAVE_LINK_IFUNC */
+
+#define DECLARE_FAT_FUNC_VAR(name, type, var) \
+ type _nettle_##name##_##var;
+
+typedef void void_func (void);
+
+typedef void aes_crypt_internal_func (unsigned rounds, const uint32_t *keys,
+ const struct aes_table *T,
+ size_t length, uint8_t *dst,
+ const uint8_t *src);
+
+typedef void *(memxor_func)(void *dst, const void *src, size_t n);
+
+typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds);
+
+typedef void sha1_compress_func(uint32_t *state, const uint8_t *input);
+typedef void sha256_compress_func(uint32_t *state, const uint8_t *input, const uint32_t *k);
+
+struct sha3_state;
+typedef void sha3_permute_func (struct sha3_state *state);
+
+typedef void sha512_compress_func (uint64_t *state, const uint8_t *input, const uint64_t *k);
+
+typedef uint64_t umac_nh_func (const uint32_t *key, unsigned length, const uint8_t *msg);
+typedef void umac_nh_n_func (uint64_t *out, unsigned n, const uint32_t *key,
+ unsigned length, const uint8_t *msg);
diff --git a/fat-x86_64.c b/fat-x86_64.c
new file mode 100644
index 00000000..2e97d1e5
--- /dev/null
+++ b/fat-x86_64.c
@@ -0,0 +1,187 @@
+/* fat-x86_64.c
+
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+*/
+
+#define _GNU_SOURCE
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "nettle-types.h"
+
+#include "aes-internal.h"
+#include "memxor.h"
+#include "fat-setup.h"
+
+void _nettle_cpuid (uint32_t input, uint32_t regs[4]);
+
+struct x86_features
+{
+ enum x86_vendor { X86_OTHER, X86_INTEL, X86_AMD } vendor;
+ int have_aesni;
+};
+
+#define SKIP(s, slen, literal, llen) \
+ (((slen) >= (llen) && memcmp ((s), (literal), llen) == 0) \
+ ? ((slen) -= (llen), (s) += (llen), 1) : 0)
+#define MATCH(s, slen, literal, llen) \
+ ((slen) == (llen) && memcmp ((s), (literal), llen) == 0)
+
+static void
+get_x86_features (struct x86_features *features)
+{
+ const char *s;
+ features->vendor = X86_OTHER;
+ features->have_aesni = 0;
+
+ s = secure_getenv (ENV_OVERRIDE);
+ if (s)
+ for (;;)
+ {
+ const char *sep = strchr (s, ',');
+ size_t length = sep ? (size_t) (sep - s) : strlen(s);
+
+ if (SKIP (s, length, "vendor:", 7))
+ {
+ if (MATCH(s, length, "intel", 5))
+ features->vendor = X86_INTEL;
+ else if (MATCH(s, length, "amd", 3))
+ features->vendor = X86_AMD;
+
+ }
+ else if (MATCH (s, length, "aesni", 5))
+ features->have_aesni = 1;
+ if (!sep)
+ break;
+ s = sep + 1;
+ }
+ else
+ {
+ uint32_t cpuid_data[4];
+ _nettle_cpuid (0, cpuid_data);
+ if (memcmp (cpuid_data + 1, "Genu" "ntel" "ineI", 12) == 0)
+ features->vendor = X86_INTEL;
+ else if (memcmp (cpuid_data + 1, "Auth" "cAMD" "enti", 12) == 0)
+ features->vendor = X86_AMD;
+
+ _nettle_cpuid (1, cpuid_data);
+ if (cpuid_data[2] & 0x02000000)
+ features->have_aesni = 1;
+ }
+}
+
+DECLARE_FAT_FUNC(_nettle_aes_encrypt, aes_crypt_internal_func)
+DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, x86_64)
+DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, aesni)
+
+DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
+DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, x86_64)
+DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, aesni)
+
+DECLARE_FAT_FUNC(nettle_memxor, memxor_func)
+DECLARE_FAT_FUNC_VAR(memxor, memxor_func, x86_64)
+DECLARE_FAT_FUNC_VAR(memxor, memxor_func, sse2)
+
+/* This function should usually be called only once, at startup. But
+ it is idempotent, and on x86, pointer updates are atomic, so
+ there's no danger if it is called simultaneously from multiple
+ threads. */
+static void CONSTRUCTOR
+fat_init (void)
+{
+ struct x86_features features;
+ int verbose;
+
+ /* FIXME: Replace all getenv calls by getenv_secure? */
+ verbose = getenv (ENV_VERBOSE) != NULL;
+ if (verbose)
+ fprintf (stderr, "libnettle: fat library initialization.\n");
+
+ get_x86_features (&features);
+ if (verbose)
+ {
+ const char * const vendor_names[3] =
+ { "other", "intel", "amd" };
+ fprintf (stderr, "libnettle: cpu features: vendor:%s%s\n",
+ vendor_names[features.vendor],
+ features.have_aesni ? ",aesni" : "");
+ }
+ if (features.have_aesni)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: using aes instructions.\n");
+ _nettle_aes_encrypt_vec = _nettle_aes_encrypt_aesni;
+ _nettle_aes_decrypt_vec = _nettle_aes_decrypt_aesni;
+ }
+ else
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: not using aes instructions.\n");
+ _nettle_aes_encrypt_vec = _nettle_aes_encrypt_x86_64;
+ _nettle_aes_decrypt_vec = _nettle_aes_decrypt_x86_64;
+ }
+
+ if (features.vendor == X86_INTEL)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: intel SSE2 will be used for memxor.\n");
+ nettle_memxor_vec = _nettle_memxor_sse2;
+ }
+ else
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: intel SSE2 will not be used for memxor.\n");
+ nettle_memxor_vec = _nettle_memxor_x86_64;
+ }
+}
+
+DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
+ (unsigned rounds, const uint32_t *keys,
+ const struct aes_table *T,
+ size_t length, uint8_t *dst,
+ const uint8_t *src),
+ (rounds, keys, T, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
+ (unsigned rounds, const uint32_t *keys,
+ const struct aes_table *T,
+ size_t length, uint8_t *dst,
+ const uint8_t *src),
+ (rounds, keys, T, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_memxor, void *,
+ (void *dst, const void *src, size_t n),
+ (dst, src, n))
diff --git a/gcm.h b/gcm.h
index c157f829..766019ae 100644
--- a/gcm.h
+++ b/gcm.h
@@ -151,10 +151,11 @@ gcm_digest(struct gcm_ctx *ctx, const struct gcm_key *key,
{ struct gcm_key key; struct gcm_ctx gcm; type cipher; }
/* NOTE: Avoid using NULL, as we don't include anything defining it. */
-#define GCM_SET_KEY(ctx, set_key, encrypt, gcm_key) \
+#define GCM_SET_KEY(ctx, set_key, encrypt, gcm_key) \
do { \
- (set_key)(&(ctx)->cipher, (gcm_key)); \
- if (0) (encrypt)(&(ctx)->cipher, 0, (void *)0, (void *)0); \
+ (set_key)(&(ctx)->cipher, (gcm_key)); \
+ if (0) (encrypt)(&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0); \
gcm_set_key(&(ctx)->key, &(ctx)->cipher, \
(nettle_cipher_func *) (encrypt)); \
} while (0)
@@ -166,19 +167,22 @@ gcm_digest(struct gcm_ctx *ctx, const struct gcm_key *key,
gcm_update(&(ctx)->gcm, &(ctx)->key, (length), (data))
#define GCM_ENCRYPT(ctx, encrypt, length, dst, src) \
- (0 ? (encrypt)(&(ctx)->cipher, 0, (void *)0, (void *)0) \
+ (0 ? (encrypt)(&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0) \
: gcm_encrypt(&(ctx)->gcm, &(ctx)->key, &(ctx)->cipher, \
(nettle_cipher_func *) (encrypt), \
(length), (dst), (src)))
#define GCM_DECRYPT(ctx, encrypt, length, dst, src) \
- (0 ? (encrypt)(&(ctx)->cipher, 0, (void *)0, (void *)0) \
+ (0 ? (encrypt)(&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0) \
: gcm_decrypt(&(ctx)->gcm, &(ctx)->key, &(ctx)->cipher, \
(nettle_cipher_func *) (encrypt), \
(length), (dst), (src)))
#define GCM_DIGEST(ctx, encrypt, length, digest) \
- (0 ? (encrypt)(&(ctx)->cipher, 0, (void *)0, (void *)0) \
+ (0 ? (encrypt)(&(ctx)->cipher, ~(size_t) 0, \
+ (uint8_t *) 0, (const uint8_t *) 0) \
: gcm_digest(&(ctx)->gcm, &(ctx)->key, &(ctx)->cipher, \
(nettle_cipher_func *) (encrypt), \
(length), (digest)))
diff --git a/memxor-internal.h b/memxor-internal.h
new file mode 100644
index 00000000..dbb5e990
--- /dev/null
+++ b/memxor-internal.h
@@ -0,0 +1,73 @@
+/* memxor-internal.h
+
+ Copyright (C) 2010, 2014 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef NETTLE_MEMXOR_INTERNAL_H_INCLUDED
+#define NETTLE_MEMXOR_INTERNAL_H_INCLUDED
+
+#include "nettle-types.h"
+
+/* The word_t type is intended to be the native word size. */
+#if defined(__x86_64__) || defined(__arch64__)
+/* Including on M$ windows, where unsigned long is only 32 bits */
+typedef uint64_t word_t;
+#else
+typedef unsigned long int word_t;
+#endif
+
+#define ALIGN_OFFSET(p) ((uintptr_t) (p) % sizeof(word_t))
+
+#ifndef WORDS_BIGENDIAN
+#define MERGE(w0, sh_1, w1, sh_2) \
+ (((w0) >> (sh_1)) | ((w1) << (sh_2)))
+#else
+#define MERGE(w0, sh_1, w1, sh_2) \
+ (((w0) << (sh_1)) | ((w1) >> (sh_2)))
+#endif
+
+#ifndef WORDS_BIGENDIAN
+#define READ_PARTIAL(r,p,n) do { \
+ word_t _rp_x; \
+ unsigned _rp_i; \
+ for (_rp_i = (n), _rp_x = (p)[--_rp_i]; _rp_i > 0;) \
+ _rp_x = (_rp_x << CHAR_BIT) | (p)[--_rp_i]; \
+ (r) = _rp_x; \
+ } while (0)
+#else
+#define READ_PARTIAL(r,p,n) do { \
+ word_t _rp_x; \
+ unsigned _rp_i; \
+ for (_rp_x = (p)[0], _rp_i = 1; _rp_i < (n); _rp_i++) \
+ _rp_x = (_rp_x << CHAR_BIT) | (p)[_rp_i]; \
+ (r) = _rp_x; \
+ } while (0)
+#endif
+
+#endif /* NETTLE_MEMXOR_INTERNAL_H_INCLUDED */
diff --git a/memxor.c b/memxor.c
index 99f46f37..36306ac8 100644
--- a/memxor.c
+++ b/memxor.c
@@ -1,6 +1,6 @@
/* memxor.c
- Copyright (C) 2010 Niels Möller
+ Copyright (C) 2010, 2014 Niels Möller
This file is part of GNU Nettle.
@@ -37,26 +37,11 @@
# include "config.h"
#endif
+#include <assert.h>
#include <limits.h>
#include "memxor.h"
-
-/* For uintptr_t */
-#include "nettle-types.h"
-
-typedef unsigned long int word_t;
-
-#if SIZEOF_LONG & (SIZEOF_LONG - 1)
-#error Word size must be a power of two
-#endif
-
-#define ALIGN_OFFSET(p) ((uintptr_t) (p) % sizeof(word_t))
-
-#ifndef WORDS_BIGENDIAN
-#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
-#else
-#define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2)))
-#endif
+#include "memxor-internal.h"
#define WORD_T_THRESH 16
@@ -70,13 +55,14 @@ memxor_common_alignment (word_t *dst, const word_t *src, size_t n)
if (n & 1)
{
- *dst++ ^= *src++;
n--;
+ dst[n] ^= src[n];
}
- for (; n >= 2; dst += 2, src += 2, n -= 2)
+ while (n >= 2)
{
- dst[0] ^= src[0];
- dst[1] ^= src[1];
+ n -= 2;
+ dst[n+1] ^= src[n+1];
+ dst[n] ^= src[n];
}
}
@@ -84,35 +70,52 @@ memxor_common_alignment (word_t *dst, const word_t *src, size_t n)
words, not bytes. Assumes we can read complete words at the start
and end of the src operand. */
static void
-memxor_different_alignment (word_t *dst, const char *src, size_t n)
+memxor_different_alignment (word_t *dst, const unsigned char *src, size_t n)
{
- size_t i;
int shl, shr;
const word_t *src_word;
unsigned offset = ALIGN_OFFSET (src);
word_t s0, s1;
+ assert (n > 0);
shl = CHAR_BIT * offset;
shr = CHAR_BIT * (sizeof(word_t) - offset);
- src_word = (const word_t *) ((uintptr_t) src & -SIZEOF_LONG);
+ src_word = (const word_t *) ((uintptr_t) src & -sizeof(word_t));
+
+ /* Read top offset bytes, in native byte order. */
+ READ_PARTIAL (s0, (unsigned char *) &src_word[n], offset);
+#ifdef WORDS_BIGENDIAN
+ s0 <<= shr; /* FIXME: Eliminate this shift? */
+#endif
- /* FIXME: Unroll four times, like memcmp? */
- i = n & 1;
- s0 = src_word[i];
- if (i)
+ /* Do n-1 regular iterations */
+ if (n & 1)
+ s1 = s0;
+ else
{
- s1 = src_word[0];
- dst[0] ^= MERGE (s1, shl, s0, shr);
+ n--;
+ s1 = src_word[n];
+ dst[n] ^= MERGE (s1, shl, s0, shr);
}
- for (; i < n; i += 2)
+ assert (n & 1);
+ while (n > 2)
{
- s1 = src_word[i+1];
- dst[i] ^= MERGE(s0, shl, s1, shr);
- s0 = src_word[i+2];
- dst[i+1] ^= MERGE(s1, shl, s0, shr);
+ n -= 2;
+ s0 = src_word[n+1];
+ dst[n+1] ^= MERGE(s0, shl, s1, shr);
+ s1 = src_word[n]; /* FIXME: Overread on last iteration */
+ dst[n] ^= MERGE(s1, shl, s0, shr);
}
+ assert (n == 1);
+ /* Read low wordsize - offset bytes */
+ READ_PARTIAL (s0, src, sizeof(word_t) - offset);
+#ifndef WORDS_BIGENDIAN
+ s0 <<= shl; /* FIXME: eliminate shift? */
+#endif /* !WORDS_BIGENDIAN */
+
+ dst[0] ^= MERGE(s0, shl, s1, shr);
}
/* Performance, Intel SU1400 (x86_64): 0.25 cycles/byte aligned, 0.45
@@ -123,216 +126,36 @@ memxor_different_alignment (word_t *dst, const char *src, size_t n)
void *
memxor(void *dst_in, const void *src_in, size_t n)
{
- char *dst = dst_in;
- const char *src = src_in;
-
- if (n >= WORD_T_THRESH)
- {
- /* There are at least some bytes to compare. No need to test
- for N == 0 in this alignment loop. */
- while (ALIGN_OFFSET (dst))
- {
- *dst++ ^= *src++;
- n--;
- }
- if (ALIGN_OFFSET (src))
- memxor_different_alignment ((word_t *) dst, src, n / sizeof(word_t));
- else
- memxor_common_alignment ((word_t *) dst, (const word_t *) src, n / sizeof(word_t));
-
- dst += n & -SIZEOF_LONG;
- src += n & -SIZEOF_LONG;
- n = n & (SIZEOF_LONG - 1);
- }
- for (; n > 0; n--)
- *dst++ ^= *src++;
-
- return dst_in;
-}
-
-
-/* XOR word-aligned areas. n is the number of words, not bytes. */
-static void
-memxor3_common_alignment (word_t *dst,
- const word_t *a, const word_t *b, size_t n)
-{
- /* FIXME: Require n > 0? */
- while (n-- > 0)
- dst[n] = a[n] ^ b[n];
-}
-
-static void
-memxor3_different_alignment_b (word_t *dst,
- const word_t *a, const char *b, unsigned offset, size_t n)
-{
- int shl, shr;
- const word_t *b_word;
-
- word_t s0, s1;
-
- shl = CHAR_BIT * offset;
- shr = CHAR_BIT * (sizeof(word_t) - offset);
-
- b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG);
-
- if (n & 1)
- {
- n--;
- s1 = b_word[n];
- s0 = b_word[n+1];
- dst[n] = a[n] ^ MERGE (s1, shl, s0, shr);
- }
- else
- s1 = b_word[n];
-
- while (n > 0)
- {
- n -= 2;
- s0 = b_word[n+1];
- dst[n+1] = a[n+1] ^ MERGE(s0, shl, s1, shr);
- s1 = b_word[n];
- dst[n] = a[n] ^ MERGE(s1, shl, s0, shr);
- }
-}
-
-static void
-memxor3_different_alignment_ab (word_t *dst,
- const char *a, const char *b,
- unsigned offset, size_t n)
-{
- int shl, shr;
- const word_t *a_word;
- const word_t *b_word;
-
- word_t s0, s1;
-
- shl = CHAR_BIT * offset;
- shr = CHAR_BIT * (sizeof(word_t) - offset);
-
- a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG);
- b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG);
-
- if (n & 1)
- {
- n--;
- s1 = a_word[n] ^ b_word[n];
- s0 = a_word[n+1] ^ b_word[n+1];
- dst[n] = MERGE (s1, shl, s0, shr);
- }
- else
- s1 = a_word[n] ^ b_word[n];
-
- while (n > 0)
- {
- n -= 2;
- s0 = a_word[n+1] ^ b_word[n+1];
- dst[n+1] = MERGE(s0, shl, s1, shr);
- s1 = a_word[n] ^ b_word[n];
- dst[n] = MERGE(s1, shl, s0, shr);
- }
-}
-
-static void
-memxor3_different_alignment_all (word_t *dst,
- const char *a, const char *b,
- unsigned a_offset, unsigned b_offset,
- size_t n)
-{
- int al, ar, bl, br;
- const word_t *a_word;
- const word_t *b_word;
-
- word_t a0, a1, b0, b1;
-
- al = CHAR_BIT * a_offset;
- ar = CHAR_BIT * (sizeof(word_t) - a_offset);
- bl = CHAR_BIT * b_offset;
- br = CHAR_BIT * (sizeof(word_t) - b_offset);
-
- a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG);
- b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG);
-
- if (n & 1)
- {
- n--;
- a1 = a_word[n]; a0 = a_word[n+1];
- b1 = b_word[n]; b0 = b_word[n+1];
-
- dst[n] = MERGE (a1, al, a0, ar) ^ MERGE (b1, bl, b0, br);
- }
- else
- {
- a1 = a_word[n];
- b1 = b_word[n];
- }
-
- while (n > 0)
- {
- n -= 2;
- a0 = a_word[n+1]; b0 = b_word[n+1];
- dst[n+1] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br);
- a1 = a_word[n]; b1 = b_word[n];
- dst[n] = MERGE(a1, al, a0, ar) ^ MERGE(b1, bl, b0, br);
- }
-}
-
-/* Current implementation processes data in descending order, to
- support overlapping operation with one of the sources overlapping
- the start of the destination area. This feature is used only
- internally by cbc decrypt, and it is not advertised or documented
- to nettle users. */
-void *
-memxor3(void *dst_in, const void *a_in, const void *b_in, size_t n)
-{
- char *dst = dst_in;
- const char *a = a_in;
- const char *b = b_in;
+ unsigned char *dst = dst_in;
+ const unsigned char *src = src_in;
if (n >= WORD_T_THRESH)
{
unsigned i;
- unsigned a_offset;
- unsigned b_offset;
+ unsigned offset;
size_t nwords;
-
+ /* There are at least some bytes to compare. No need to test
+ for N == 0 in this alignment loop. */
for (i = ALIGN_OFFSET(dst + n); i > 0; i--)
{
n--;
- dst[n] = a[n] ^ b[n];
+ dst[n] ^= src[n];
}
-
- a_offset = ALIGN_OFFSET(a + n);
- b_offset = ALIGN_OFFSET(b + n);
-
+ offset = ALIGN_OFFSET(src + n);
nwords = n / sizeof (word_t);
n %= sizeof (word_t);
- if (a_offset == b_offset)
- {
- if (!a_offset)
- memxor3_common_alignment((word_t *) (dst + n),
- (const word_t *) (a + n),
- (const word_t *) (b + n), nwords);
- else
- memxor3_different_alignment_ab((word_t *) (dst + n),
- a + n, b + n, a_offset,
- nwords);
- }
- else if (!a_offset)
- memxor3_different_alignment_b((word_t *) (dst + n),
- (const word_t *) (a + n), b + n,
- b_offset, nwords);
- else if (!b_offset)
- memxor3_different_alignment_b((word_t *) (dst + n),
- (const word_t *) (b + n), a + n,
- a_offset, nwords);
+ if (offset)
+ memxor_different_alignment ((word_t *) (dst+n), src+n, nwords);
else
- memxor3_different_alignment_all((word_t *) (dst + n), a + n, b + n,
- a_offset, b_offset, nwords);
-
+ memxor_common_alignment ((word_t *) (dst+n),
+ (const word_t *) (src+n), nwords);
+ }
+ while (n > 0)
+ {
+ n--;
+ dst[n] ^= src[n];
}
- while (n-- > 0)
- dst[n] = a[n] ^ b[n];
return dst;
}
diff --git a/memxor3.c b/memxor3.c
new file mode 100644
index 00000000..fe208bf1
--- /dev/null
+++ b/memxor3.c
@@ -0,0 +1,292 @@
+/* memxor3.c
+
+ Copyright (C) 2010, 2014 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+*/
+
+/* Implementation inspired by memcmp in glibc, contributed to the FSF
+ by Torbjorn Granlund.
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+#include <limits.h>
+
+#include "memxor.h"
+#include "memxor-internal.h"
+
+#define WORD_T_THRESH 16
+
+/* XOR word-aligned areas. n is the number of words, not bytes. */
+static void
+memxor3_common_alignment (word_t *dst,
+ const word_t *a, const word_t *b, size_t n)
+{
+ /* FIXME: Require n > 0? */
+ if (n & 1)
+ {
+ n--;
+ dst[n] = a[n] ^ b[n];
+ }
+ while (n > 0)
+ {
+ n -= 2;
+ dst[n+1] = a[n+1] ^ b[n+1];
+ dst[n] = a[n] ^ b[n];
+ }
+}
+
+static void
+memxor3_different_alignment_b (word_t *dst,
+ const word_t *a, const unsigned char *b,
+ unsigned offset, size_t n)
+{
+ int shl, shr;
+ const word_t *b_word;
+
+ word_t s0, s1;
+
+ assert (n > 0);
+
+ shl = CHAR_BIT * offset;
+ shr = CHAR_BIT * (sizeof(word_t) - offset);
+
+ b_word = (const word_t *) ((uintptr_t) b & -sizeof(word_t));
+
+ /* Read top offset bytes, in native byte order. */
+ READ_PARTIAL (s0, (unsigned char *) &b_word[n], offset);
+#ifdef WORDS_BIGENDIAN
+ s0 <<= shr;
+#endif
+
+ if (n & 1)
+ s1 = s0;
+ else
+ {
+ n--;
+ s1 = b_word[n];
+ dst[n] = a[n] ^ MERGE (s1, shl, s0, shr);
+ }
+
+ while (n > 2)
+ {
+ n -= 2;
+ s0 = b_word[n+1];
+ dst[n+1] = a[n+1] ^ MERGE(s0, shl, s1, shr);
+ s1 = b_word[n];
+ dst[n] = a[n] ^ MERGE(s1, shl, s0, shr);
+ }
+ assert (n == 1);
+ /* Read low wordsize - offset bytes */
+ READ_PARTIAL (s0, b, sizeof(word_t) - offset);
+#ifndef WORDS_BIGENDIAN
+ s0 <<= shl;
+#endif /* !WORDS_BIGENDIAN */
+
+ dst[0] = a[0] ^ MERGE(s0, shl, s1, shr);
+}
+
+static void
+memxor3_different_alignment_ab (word_t *dst,
+ const unsigned char *a, const unsigned char *b,
+ unsigned offset, size_t n)
+{
+ int shl, shr;
+ const word_t *a_word;
+ const word_t *b_word;
+
+ word_t s0, s1, t;
+
+ assert (n > 0);
+
+ shl = CHAR_BIT * offset;
+ shr = CHAR_BIT * (sizeof(word_t) - offset);
+
+ a_word = (const word_t *) ((uintptr_t) a & -sizeof(word_t));
+ b_word = (const word_t *) ((uintptr_t) b & -sizeof(word_t));
+
+ /* Read top offset bytes, in native byte order. */
+ READ_PARTIAL (s0, (unsigned char *) &a_word[n], offset);
+ READ_PARTIAL (t, (unsigned char *) &b_word[n], offset);
+ s0 ^= t;
+#ifdef WORDS_BIGENDIAN
+ s0 <<= shr;
+#endif
+
+ if (n & 1)
+ s1 = s0;
+ else
+ {
+ n--;
+ s1 = a_word[n] ^ b_word[n];
+ dst[n] = MERGE (s1, shl, s0, shr);
+ }
+
+ while (n > 2)
+ {
+ n -= 2;
+ s0 = a_word[n+1] ^ b_word[n+1];
+ dst[n+1] = MERGE(s0, shl, s1, shr);
+ s1 = a_word[n] ^ b_word[n];
+ dst[n] = MERGE(s1, shl, s0, shr);
+ }
+ assert (n == 1);
+ /* Read low wordsize - offset bytes */
+ READ_PARTIAL (s0, a, sizeof(word_t) - offset);
+ READ_PARTIAL (t, b, sizeof(word_t) - offset);
+ s0 ^= t;
+#ifndef WORDS_BIGENDIAN
+ s0 <<= shl;
+#endif /* !WORDS_BIGENDIAN */
+
+ dst[0] = MERGE(s0, shl, s1, shr);
+}
+
+static void
+memxor3_different_alignment_all (word_t *dst,
+ const unsigned char *a, const unsigned char *b,
+ unsigned a_offset, unsigned b_offset,
+ size_t n)
+{
+ int al, ar, bl, br;
+ const word_t *a_word;
+ const word_t *b_word;
+
+ word_t a0, a1, b0, b1;
+
+ al = CHAR_BIT * a_offset;
+ ar = CHAR_BIT * (sizeof(word_t) - a_offset);
+ bl = CHAR_BIT * b_offset;
+ br = CHAR_BIT * (sizeof(word_t) - b_offset);
+
+ a_word = (const word_t *) ((uintptr_t) a & -sizeof(word_t));
+ b_word = (const word_t *) ((uintptr_t) b & -sizeof(word_t));
+
+ /* Read top offset bytes, in native byte order. */
+ READ_PARTIAL (a0, (unsigned char *) &a_word[n], a_offset);
+ READ_PARTIAL (b0, (unsigned char *) &b_word[n], b_offset);
+#ifdef WORDS_BIGENDIAN
+ a0 <<= ar;
+ b0 <<= br;
+#endif
+
+ if (n & 1)
+ {
+ a1 = a0; b1 = b0;
+ }
+ else
+ {
+ n--;
+ a1 = a_word[n];
+ b1 = b_word[n];
+
+ dst[n] = MERGE (a1, al, a0, ar) ^ MERGE (b1, bl, b0, br);
+ }
+ while (n > 2)
+ {
+ n -= 2;
+ a0 = a_word[n+1]; b0 = b_word[n+1];
+ dst[n+1] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br);
+ a1 = a_word[n]; b1 = b_word[n];
+ dst[n] = MERGE(a1, al, a0, ar) ^ MERGE(b1, bl, b0, br);
+ }
+ assert (n == 1);
+ /* Read low wordsize - offset bytes */
+ READ_PARTIAL (a0, a, sizeof(word_t) - a_offset);
+ READ_PARTIAL (b0, b, sizeof(word_t) - b_offset);
+#ifndef WORDS_BIGENDIAN
+ a0 <<= al;
+ b0 <<= bl;
+#endif /* !WORDS_BIGENDIAN */
+
+ dst[0] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br);
+}
+
+/* Current implementation processes data in descending order, to
+ support overlapping operation with one of the sources overlapping
+ the start of the destination area. This feature is used only
+ internally by cbc decrypt, and it is not advertised or documented
+ to nettle users. */
+void *
+memxor3(void *dst_in, const void *a_in, const void *b_in, size_t n)
+{
+ unsigned char *dst = dst_in;
+ const unsigned char *a = a_in;
+ const unsigned char *b = b_in;
+
+ if (n >= WORD_T_THRESH)
+ {
+ unsigned i;
+ unsigned a_offset;
+ unsigned b_offset;
+ size_t nwords;
+
+ for (i = ALIGN_OFFSET(dst + n); i > 0; i--)
+ {
+ n--;
+ dst[n] = a[n] ^ b[n];
+ }
+
+ a_offset = ALIGN_OFFSET(a + n);
+ b_offset = ALIGN_OFFSET(b + n);
+
+ nwords = n / sizeof (word_t);
+ n %= sizeof (word_t);
+
+ if (a_offset == b_offset)
+ {
+ if (!a_offset)
+ memxor3_common_alignment((word_t *) (dst + n),
+ (const word_t *) (a + n),
+ (const word_t *) (b + n), nwords);
+ else
+ memxor3_different_alignment_ab((word_t *) (dst + n),
+ a + n, b + n, a_offset,
+ nwords);
+ }
+ else if (!a_offset)
+ memxor3_different_alignment_b((word_t *) (dst + n),
+ (const word_t *) (a + n), b + n,
+ b_offset, nwords);
+ else if (!b_offset)
+ memxor3_different_alignment_b((word_t *) (dst + n),
+ (const word_t *) (b + n), a + n,
+ a_offset, nwords);
+ else
+ memxor3_different_alignment_all((word_t *) (dst + n), a + n, b + n,
+ a_offset, b_offset, nwords);
+
+ }
+ while (n-- > 0)
+ dst[n] = a[n] ^ b[n];
+
+ return dst;
+}
diff --git a/misc/plan.html b/misc/plan.html
index bc42981e..93ea1bc5 100644
--- a/misc/plan.html
+++ b/misc/plan.html
@@ -14,7 +14,7 @@
<h1> Nettle release plans </h1>
<p> This is an attempt at defining a development target for
Nettle-3.1, inspired by similar pages for recent GMP releases.
- [Last updated 2014-10-22]</p>
+ [Last updated 2015-01-27]</p>
<p class='should'>
This really ought to be done before release
</p>
@@ -31,7 +31,7 @@
<h1> Plans for nettle-3.1 </h1>
<h2> Interface changes </h2>
- <p class='should'>
+ <p class='done'>
Review public functions in ecc.h, move some to ecc-internal.h, to
enable sane support for other types of curves.
</p>
@@ -62,7 +62,7 @@
</p>
<h2> Optimizations </h2>
- <p class='should'>
+ <p class='done'>
Support for using AES acceleration.
</p>
@@ -90,10 +90,13 @@
</p>
<h2> Build system </h2>
- <p class='should'>
+ <p class='done'>
Support for fat binaries on ARM and x86_64, selecting code at
runtime depending on cpu capabilities.
</p>
+ <p class='should'>
+ Fix the handling of optional C source files with make dist.
+ </p>
<p class='done'>
Stop using the nonstandard <tt>.po</tt> extension.
</p>
diff --git a/salsa20-core-internal.c b/salsa20-core-internal.c
index 08cd8367..c26057d5 100644
--- a/salsa20-core-internal.c
+++ b/salsa20-core-internal.c
@@ -48,6 +48,13 @@
#include "macros.h"
+/* For fat builds */
+#if HAVE_NATIVE_salsa20_core
+void
+_nettle_salsa20_core_c(uint32_t *dst, const uint32_t *src, unsigned rounds);
+#define _nettle_salsa20_core _nettle_salsa20_core_c
+#endif
+
#ifndef SALSA20_DEBUG
# define SALSA20_DEBUG 0
#endif
diff --git a/sha1-compress.c b/sha1-compress.c
index 769a4cfb..377b9c1e 100644
--- a/sha1-compress.c
+++ b/sha1-compress.c
@@ -129,6 +129,13 @@
#define subRound(a, b, c, d, e, f, k, data) \
( e += ROTL32( 5, a ) + f( b, c, d ) + k + data, b = ROTL32( 30, b ) )
+/* For fat builds */
+#if HAVE_NATIVE_sha1_compress
+void
+_nettle_sha1_compress_c(uint32_t *state, const uint8_t *input);
+#define _nettle_sha1_compress _nettle_sha1_compress_c
+#endif
+
/* Perform the SHA transformation. Note that this code, like MD5, seems to
break some optimizing compilers due to the complexity of the expressions
and the size of the basic block. It may be necessary to split it into
diff --git a/sha256-compress.c b/sha256-compress.c
index 5c32f931..156c8cf9 100644
--- a/sha256-compress.c
+++ b/sha256-compress.c
@@ -116,6 +116,13 @@
h += S0(a) + Majority(a,b,c); \
} while (0)
+/* For fat builds */
+#if HAVE_NATIVE_sha256_compress
+void
+_nettle_sha256_compress_c(uint32_t *state, const uint8_t *input, const uint32_t *k);
+#define _nettle_sha256_compress _nettle_sha256_compress_c
+#endif
+
void
_nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
{
diff --git a/sha3-permute.c b/sha3-permute.c
index b80e6258..14fb0d4c 100644
--- a/sha3-permute.c
+++ b/sha3-permute.c
@@ -41,6 +41,13 @@
#define SHA3_ROUNDS 24
+/* For fat builds */
+#if HAVE_NATIVE_sha3_permute
+void
+_nettle_sha3_permute_c(struct sha3_state *state);
+#define nettle_sha3_permute _nettle_sha3_permute_c
+#endif
+
void
sha3_permute (struct sha3_state *state)
{
diff --git a/sha512-compress.c b/sha512-compress.c
index 59cda41f..24007f28 100644
--- a/sha512-compress.c
+++ b/sha512-compress.c
@@ -59,6 +59,13 @@
/* A block, treated as a sequence of 64-bit words. */
#define SHA512_DATA_LENGTH 16
+/* For fat builds */
+#if HAVE_NATIVE_sha512_compress
+void
+_nettle_sha512_compress_c (uint64_t *state, const uint8_t *input, const uint64_t *k);
+#define _nettle_sha512_compress _nettle_sha512_compress_c
+#endif
+
/* The SHA512 functions. The Choice function is the same as the SHA1
function f1, and the majority function is the same as the SHA1 f3
function, and the same as for SHA256. */
diff --git a/testsuite/Makefile.in b/testsuite/Makefile.in
index 96c0bc80..6bc1907c 100644
--- a/testsuite/Makefile.in
+++ b/testsuite/Makefile.in
@@ -114,7 +114,7 @@ $(TARGETS) $(EXTRA_TARGETS): testutils.$(OBJEXT) ../nettle-internal.$(OBJEXT) \
# --partial-loads-ok=yes is needed for memxor's handling of unaligned
# data.
-VALGRIND = valgrind --error-exitcode=1 --leak-check=full --partial-loads-ok=yes --show-reachable=yes
+VALGRIND = valgrind --error-exitcode=1 --leak-check=full --show-reachable=yes @IF_ASM@ --partial-loads-ok=yes
# The PATH update is for locating dlls on w*ndows.
check: $(TS_ALL)
diff --git a/testsuite/chacha-poly1305-test.c b/testsuite/chacha-poly1305-test.c
index 2f320f32..313e8226 100644
--- a/testsuite/chacha-poly1305-test.c
+++ b/testsuite/chacha-poly1305-test.c
@@ -4,13 +4,30 @@
void
test_main(void)
{
- /* From draft-agl-tls-chacha20poly1305-04 */
+ /* From draft-irtf-cfrg-chacha20-poly1305-08 */
test_aead (&nettle_chacha_poly1305, NULL,
- SHEX("4290bcb154173531f314af57f3be3b50"
- "06da371ece272afa1b5dbdd1100a1007"), /* key */
- SHEX("87e229d4500845a079c0"), /* auth data */
- SHEX("86d09974840bded2a5ca"), /* plain text */
- SHEX("e3e446f7ede9a19b62a4"), /* ciphertext */
- SHEX("cd7cf67be39c794a"), /* nonce */
- SHEX("677dabf4e3d24b876bb284753896e1d6")); /* tag */
+ SHEX("8081828384858687 88898a8b8c8d8e8f"
+ "9091929394959697 98999a9b9c9d9e9f"),
+ SHEX("50515253c0c1c2c3 c4c5c6c7"),
+ SHEX("4c61646965732061 6e642047656e746c"
+ "656d656e206f6620 74686520636c6173"
+ "73206f6620273939 3a20496620492063"
+ "6f756c64206f6666 657220796f75206f"
+ "6e6c79206f6e6520 74697020666f7220"
+ "7468652066757475 72652c2073756e73"
+ "637265656e20776f 756c642062652069"
+ "742e"),
+ SHEX("d31a8d34648e60db7b86afbc53ef7ec2"
+ "a4aded51296e08fea9e2b5a736ee62d6"
+ "3dbea45e8ca9671282fafb69da92728b"
+ "1a71de0a9e060b2905d6a5b67ecd3b36"
+ "92ddbd7f2d778b8c9803aee328091b58"
+ "fab324e4fad675945585808b4831d7bc"
+ "3ff4def08e4b7a9de576d26586cec64b"
+ "6116"),
+ /* The draft splits the nonce into a "common part" and an
+	     iv, and it seems the "common part" is the first 4
+ bytes. */
+ SHEX("0700000040414243 44454647"),
+ SHEX("1ae10b594f09e26a 7e902ecbd0600691"));
}
diff --git a/testsuite/chacha-test.c b/testsuite/chacha-test.c
index 8c5630da..9edb9410 100644
--- a/testsuite/chacha-test.c
+++ b/testsuite/chacha-test.c
@@ -44,20 +44,30 @@ test_chacha(const struct tstring *key, const struct tstring *nonce,
ASSERT (key->length == CHACHA_KEY_SIZE);
chacha_set_key (&ctx, key->data);
- ASSERT (nonce->length == CHACHA_NONCE_SIZE);
if (rounds == 20)
{
uint8_t *data = xalloc (expected->length + 2);
- data++;
size_t length;
+ data++;
for (length = 1; length <= expected->length; length++)
{
data[-1] = 17;
memset (data, 0, length);
data[length] = 17;
- chacha_set_nonce(&ctx, nonce->data);
+ if (nonce->length == CHACHA_NONCE_SIZE)
+ chacha_set_nonce(&ctx, nonce->data);
+ else if (nonce->length == CHACHA_NONCE96_SIZE)
+ {
+ chacha_set_nonce96(&ctx, nonce->data);
+ /* Use initial counter 1, for
+ draft-irtf-cfrg-chacha20-poly1305-08 test cases. */
+ ctx.state[12]++;
+ }
+ else
+ die ("Bad nonce size %u.\n", (unsigned) nonce->length);
+
chacha_crypt (&ctx, length, data, data);
ASSERT (data[-1] == 17);
@@ -84,6 +94,7 @@ test_chacha(const struct tstring *key, const struct tstring *nonce,
numbers of rounds. */
uint32_t out[_CHACHA_STATE_LENGTH];
ASSERT (expected->length == CHACHA_BLOCK_SIZE);
+ ASSERT (nonce->length == CHACHA_NONCE_SIZE);
chacha_set_nonce(&ctx, nonce->data);
_chacha_core (out, ctx.state, rounds);
@@ -622,4 +633,14 @@ test_main(void)
"ae2c4c90225ba9ea 14d518f55929dea0"
"98ca7a6ccfe61227 053c84e49a4a3332"),
20);
+
+ /* From draft-irtf-cfrg-chacha20-poly1305-08, with 96-bit nonce */
+ test_chacha(SHEX("0001020304050607 08090a0b0c0d0e0f"
+ "1011121314151617 18191a1b1c1d1e1f"),
+ SHEX("000000090000004a 00000000"),
+ SHEX("10f1e7e4d13b5915 500fdd1fa32071c4"
+ "c7d1f4c733c06803 0422aa9ac3d46c4e"
+ "d2826446079faa09 14c2d705d98b02a2"
+ "b5129cd1de164eb9 cbd083e8a2503c4e"),
+ 20);
}
diff --git a/umac-nh-n.c b/umac-nh-n.c
index e4430b0e..e9233716 100644
--- a/umac-nh-n.c
+++ b/umac-nh-n.c
@@ -39,6 +39,14 @@
#include "umac.h"
#include "macros.h"
+/* For fat builds */
+#if HAVE_NATIVE_umac_nh_n
+void
+_nettle_umac_nh_n_c (uint64_t *out, unsigned n, const uint32_t *key,
+ unsigned length, const uint8_t *msg);
+#define _nettle_umac_nh_n _nettle_umac_nh_n_c
+#endif
+
void
_umac_nh_n (uint64_t *out, unsigned n, const uint32_t *key,
unsigned length, const uint8_t *msg)
diff --git a/umac-nh.c b/umac-nh.c
index 9d371755..ab1b392a 100644
--- a/umac-nh.c
+++ b/umac-nh.c
@@ -38,6 +38,13 @@
#include "umac.h"
#include "macros.h"
+/* For fat builds */
+#if HAVE_NATIVE_umac_nh
+uint64_t
+_nettle_umac_nh_c (const uint32_t *key, unsigned length, const uint8_t *msg);
+#define _nettle_umac_nh _nettle_umac_nh_c
+#endif
+
uint64_t
_umac_nh (const uint32_t *key, unsigned length, const uint8_t *msg)
{
diff --git a/x86_64/aesni/aes-decrypt-internal.asm b/x86_64/aesni/aes-decrypt-internal.asm
new file mode 100644
index 00000000..412e8d31
--- /dev/null
+++ b/x86_64/aesni/aes-decrypt-internal.asm
@@ -0,0 +1,100 @@
+C x86_64/aesni/aes-decrypt-internal.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Input argument
+define(<ROUNDS>, <%rdi>)
+define(<KEYS>, <%rsi>)
+C define(<TABLE>, <%rdx>) C Unused here
+define(<LENGTH>,<%rcx>)
+define(<DST>, <%r8>)
+define(<SRC>, <%r9>)
+
+C Round counter
+define(<CNT>, <%rdx>)
+C Subkey pointer
+define(<KEY>, <%rax>)
+
+dnl aesdec %xmm1, %xmm0
+define(<AESDEC>, <.byte 0x66, 0x0f, 0x38, 0xde, 0xc1>)
+dnl aesdeclast %xmm1, %xmm0
+define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>)
+
+ .file "aes-decrypt-internal.asm"
+
+ C _aes_decrypt(unsigned rounds, const uint32_t *keys,
+ C const struct aes_table *T,
+ C size_t length, uint8_t *dst,
+	C			 const uint8_t *src)
+ .text
+ ALIGN(16)
+PROLOGUE(_nettle_aes_decrypt)
+ W64_ENTRY(6, 2)
+ shr $4, LENGTH
+ test LENGTH, LENGTH
+ jz .Lend
+
+ decl XREG(ROUNDS)
+
+.Lblock_loop:
+ mov ROUNDS, CNT
+ mov KEYS, KEY
+ movups (SRC), %xmm0
+ C FIXME: Better alignment of subkeys, so we can use movaps.
+ movups (KEY), %xmm1
+ pxor %xmm1, %xmm0
+
+ C FIXME: Could use some unrolling. Also all subkeys fit in
+ C registers, so they could be loaded once (on W64 we would
+ C need to save and restore some xmm registers, though).
+
+.Lround_loop:
+ add $16, KEY
+
+ movups (KEY), %xmm1
+ AESDEC C %xmm1, %xmm0
+ decl XREG(CNT)
+ jnz .Lround_loop
+
+ movups 16(KEY), %xmm1
+ AESDECLAST C %xmm1, %xmm0
+
+ movups %xmm0, (DST)
+ add $16, SRC
+ add $16, DST
+ dec LENGTH
+ jnz .Lblock_loop
+
+.Lend:
+ W64_EXIT(6, 2)
+ ret
+EPILOGUE(_nettle_aes_decrypt)
diff --git a/x86_64/aesni/aes-encrypt-internal.asm b/x86_64/aesni/aes-encrypt-internal.asm
new file mode 100644
index 00000000..07f17b25
--- /dev/null
+++ b/x86_64/aesni/aes-encrypt-internal.asm
@@ -0,0 +1,100 @@
+C x86_64/aesni/aes-encrypt-internal.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Input arguments, in System V AMD64 order (%rdi, %rsi, %rdx, %rcx,
+C %r8, %r9); on W64 the W64_ENTRY macro is assumed to map the Windows
+C ABI onto these registers -- confirm in x86_64/machine.m4.
+define(<ROUNDS>, <%rdi>)
+define(<KEYS>, <%rsi>)
+C define(<TABLE>, <%rdx>)	C Unused here
+define(<LENGTH>,<%rcx>)
+define(<DST>, <%r8>)
+define(<SRC>, <%r9>)
+
+C Round counter (reuses %rdx, free since TABLE is unused)
+define(<CNT>, <%rdx>)
+C Subkey pointer
+define(<KEY>, <%rax>)
+
+C Hard-coded instruction encodings, fixed to the operands
+C %xmm1, %xmm0, for assemblers without AES-NI support.
+dnl aesenc %xmm1, %xmm0
+define(<AESENC>, <.byte 0x66, 0x0f, 0x38, 0xdc, 0xc1>)
+dnl aesenclast %xmm1, %xmm0
+define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>)
+
+ .file "aes-encrypt-internal.asm"
+
+ C _aes_encrypt(unsigned rounds, const uint32_t *keys,
+ C const struct aes_table *T,
+ C size_t length, uint8_t *dst,
+ C uint8_t *src)
+ .text
+ ALIGN(16)
+PROLOGUE(_nettle_aes_encrypt)
+	W64_ENTRY(6, 2)
+	C LENGTH is in bytes; convert to the number of 16-byte blocks,
+	C and return early if there is nothing to do.
+	shr	$4, LENGTH
+	test	LENGTH, LENGTH
+	jz	.Lend
+
+	C Per block: one whitening xor with the first subkey, then
+	C ROUNDS-1 AESENC rounds, then a final AESENCLAST; so the inner
+	C loop runs ROUNDS-1 times.
+	decl	XREG(ROUNDS)
+
+.Lblock_loop:
+	mov	ROUNDS, CNT
+	mov	KEYS, KEY
+	movups	(SRC), %xmm0
+	C FIXME: Better alignment of subkeys, so we can use movaps.
+	movups	(KEY), %xmm1
+	pxor	%xmm1, %xmm0	C Initial whitening with first subkey
+
+	C FIXME: Could use some unrolling. Also all subkeys fit in
+	C registers, so they could be loaded once (on W64 we would
+	C need to save and restore some xmm registers, though).
+
+.Lround_loop:
+	add	$16, KEY
+
+	C AESENC is a fixed encoding of aesenc %xmm1, %xmm0
+	movups	(KEY), %xmm1
+	AESENC	C %xmm1, %xmm0
+	decl	XREG(CNT)
+	jnz	.Lround_loop
+
+	C Last round, with the final subkey at 16(KEY)
+	movups	16(KEY), %xmm1
+	AESENCLAST	C %xmm1, %xmm0
+
+	movups	%xmm0, (DST)
+	add	$16, SRC
+	add	$16, DST
+	dec	LENGTH
+	jnz	.Lblock_loop
+
+.Lend:
+	W64_EXIT(6, 2)
+	ret
+EPILOGUE(_nettle_aes_encrypt)
diff --git a/x86_64/fat/aes-decrypt-internal-2.asm b/x86_64/fat/aes-decrypt-internal-2.asm
new file mode 100644
index 00000000..2dd45959
--- /dev/null
+++ b/x86_64/fat/aes-decrypt-internal-2.asm
@@ -0,0 +1,35 @@
+C x86_64/fat/aes-decrypt-internal-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Rename the symbol: $1 becomes $1_aesni, giving the AES-NI build of
+C _aes_decrypt a distinct name for runtime dispatch (presumably applied
+C by PROLOGUE/EPILOGUE via asm.m4 -- confirm against fat-setup.h).
+define(<fat_transform>, <$1_aesni>)
+include_src(<x86_64/aesni/aes-decrypt-internal.asm>)
diff --git a/x86_64/fat/aes-decrypt-internal.asm b/x86_64/fat/aes-decrypt-internal.asm
new file mode 100644
index 00000000..26738d66
--- /dev/null
+++ b/x86_64/fat/aes-decrypt-internal.asm
@@ -0,0 +1,35 @@
+C x86_64/fat/aes-decrypt-internal.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Rename the symbol: $1 becomes $1_x86_64, so the plain (table-based)
+C implementation serves as the fallback variant in fat builds
+C (presumably applied by PROLOGUE/EPILOGUE via asm.m4 -- confirm).
+define(<fat_transform>, <$1_x86_64>)
+include_src(<x86_64/aes-decrypt-internal.asm>)
diff --git a/x86_64/fat/aes-encrypt-internal-2.asm b/x86_64/fat/aes-encrypt-internal-2.asm
new file mode 100644
index 00000000..2a5ce7b1
--- /dev/null
+++ b/x86_64/fat/aes-encrypt-internal-2.asm
@@ -0,0 +1,35 @@
+C x86_64/fat/aes-encrypt-internal-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Rename the symbol: $1 becomes $1_aesni, giving the AES-NI build of
+C _aes_encrypt a distinct name for runtime dispatch (presumably applied
+C by PROLOGUE/EPILOGUE via asm.m4 -- confirm against fat-setup.h).
+define(<fat_transform>, <$1_aesni>)
+include_src(<x86_64/aesni/aes-encrypt-internal.asm>)
diff --git a/x86_64/fat/aes-encrypt-internal.asm b/x86_64/fat/aes-encrypt-internal.asm
new file mode 100644
index 00000000..f0bdf59e
--- /dev/null
+++ b/x86_64/fat/aes-encrypt-internal.asm
@@ -0,0 +1,35 @@
+C x86_64/fat/aes-encrypt-internal.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Rename the symbol: $1 becomes $1_x86_64, so the plain (table-based)
+C implementation serves as the fallback variant in fat builds
+C (presumably applied by PROLOGUE/EPILOGUE via asm.m4 -- confirm).
+define(<fat_transform>, <$1_x86_64>)
+include_src(<x86_64/aes-encrypt-internal.asm>)
diff --git a/x86_64/fat/cpuid.asm b/x86_64/fat/cpuid.asm
new file mode 100644
index 00000000..16a66d57
--- /dev/null
+++ b/x86_64/fat/cpuid.asm
@@ -0,0 +1,58 @@
+C x86_64/fat/cpuid.asm
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Input argument
+C cpuid input: %edi
+C output pointer: %rsi
+
+ .file "cpuid.asm"
+
+ C void _nettle_cpuid(uint32_t in, uint32_t *out)
+
+ .text
+ ALIGN(16)
+PROLOGUE(_nettle_cpuid)
+	C W64_ENTRY/W64_EXIT take (arg count, xmm register count), as in
+	C every other routine in this tree; the one-argument form left
+	C the macro's second parameter empty and broke the W64 path.
+	W64_ENTRY(2, 0)
+	C cpuid clobbers %rbx, which is callee-saved in both the SysV
+	C and Windows ABIs.
+	push	%rbx
+
+	movl	%edi, %eax
+	C Zero the sub-leaf index: leaves such as 7 give undefined
+	C results unless %ecx is set.
+	xorl	%ecx, %ecx
+	cpuid
+	mov	%eax, (%rsi)
+	mov	%ebx, 4(%rsi)
+	mov	%ecx, 8(%rsi)
+	mov	%edx, 12(%rsi)
+
+	pop	%rbx
+	W64_EXIT(2, 0)
+	ret
+EPILOGUE(_nettle_cpuid)
+
diff --git a/x86_64/fat/memxor-2.asm b/x86_64/fat/memxor-2.asm
new file mode 100644
index 00000000..e3bf9da8
--- /dev/null
+++ b/x86_64/fat/memxor-2.asm
@@ -0,0 +1,36 @@
+C x86_64/fat/memxor-2.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C SSE2 variant: the leading underscore in the transform yields names
+C like _memxor_sse2 (presumably matching fat-setup.h -- confirm).
+define(<fat_transform>, <_$1_sse2>)
+C Defining USE_SSE2 enables the ifdef(<USE_SSE2>) paths in memxor.asm.
+C NOTE(review): ifdef tests definedness, not value; make sure the base
+C memxor.asm no longer contains define(<USE_SSE2>, <no>), or both fat
+C variants would get the SSE2 code -- confirm.
+define(<USE_SSE2>, <yes>)
+include_src(<x86_64/memxor.asm>)
diff --git a/x86_64/fat/memxor.asm b/x86_64/fat/memxor.asm
new file mode 100644
index 00000000..be33d273
--- /dev/null
+++ b/x86_64/fat/memxor.asm
@@ -0,0 +1,35 @@
+C x86_64/fat/memxor.asm
+
+
+ifelse(<
+ Copyright (C) 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Generic variant, named _memxor_x86_64; built without USE_SSE2
+C defined, so the ifdef(<USE_SSE2>) blocks in memxor.asm are omitted.
+define(<fat_transform>, <_$1_x86_64>)
+include_src(<x86_64/memxor.asm>)
diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm
index e14e31a1..f07f0017 100644
--- a/x86_64/memxor.asm
+++ b/x86_64/memxor.asm
@@ -1,7 +1,7 @@
C x86_64/memxor.asm
ifelse(<
- Copyright (C) 2010, Niels Möller
+ Copyright (C) 2010, 2014, Niels Möller
This file is part of GNU Nettle.
@@ -32,9 +32,8 @@ ifelse(<
C Register usage:
define(<DST>, <%rax>) C Originally in %rdi
-define(<AP>, <%rsi>)
-define(<BP>, <%rdx>)
-define(<N>, <%r10>)
+define(<SRC>, <%rsi>)
+define(<N>, <%rdx>)
define(<TMP>, <%r8>)
define(<TMP2>, <%r9>)
define(<CNT>, <%rdi>)
@@ -53,20 +52,7 @@ define(<USE_SSE2>, <no>)
PROLOGUE(nettle_memxor)
W64_ENTRY(3, 0)
- mov %rdx, %r10
- mov %rdi, %rdx
- jmp .Lmemxor3_entry
-EPILOGUE(nettle_memxor)
- C memxor3(void *dst, const void *a, const void *b, size_t n)
- C %rdi %rsi %rdx %rcx
- ALIGN(16)
-
-PROLOGUE(nettle_memxor3)
- W64_ENTRY(4, 0)
- C %cl needed for shift count, so move away N
- mov %rcx, N
-.Lmemxor3_entry:
test N, N
C Get number of unaligned bytes at the end
C %rdi is used as CNT, %rax as DST and as return value
@@ -87,94 +73,17 @@ PROLOGUE(nettle_memxor3)
.Lalign_loop:
sub $1, N
- movb (AP, N), LREG(TMP)
- xorb (BP, N), LREG(TMP)
- movb LREG(TMP), (DST, N)
+ movb (SRC, N), LREG(TMP)
+ xorb LREG(TMP), (DST, N)
sub $1, CNT
jnz .Lalign_loop
.Laligned:
-ifelse(USE_SSE2, yes, <
+ifdef(<USE_SSE2>, <
cmp $16, N
jnc .Lsse2_case
>)
- C Check for the case that AP and BP have the same alignment,
- C but different from DST.
- mov AP, TMP
- sub BP, TMP
- test $7, TMP
- jnz .Lno_shift_case
- mov AP, %rcx
- sub DST, %rcx
- and $7, %rcx
- jz .Lno_shift_case
- sub %rcx, AP
- sub %rcx, BP
- shl $3, %rcx
-
- C Unrolling, with aligned values alternating in S0 and S1
- test $8, N
- jnz .Lshift_odd
- mov (AP, N), S1
- xor (BP, N), S1
- jmp .Lshift_next
-
-.Lshift_odd:
- mov -8(AP, N), S1
- mov (AP, N), S0
- xor -8(BP, N), S1
- xor (BP, N), S0
- mov S1, TMP
- shr %cl, TMP
- neg %cl
- shl %cl, S0
- neg %cl
-
- or S0, TMP
- mov TMP, -8(DST, N)
- sub $8, N
- jz .Ldone
- jmp .Lshift_next
-
- ALIGN(16)
-.Lshift_loop:
- mov 8(AP, N), S0
- xor 8(BP, N), S0
- mov S0, TMP
- shr %cl, TMP
- neg %cl
- shl %cl, S1
- neg %cl
- or S1, TMP
- mov TMP, 8(DST, N)
-
- mov (AP, N), S1
- xor (BP, N), S1
- mov S1, TMP
- shr %cl, TMP
- neg %cl
- shl %cl, S0
- neg %cl
- or S0, TMP
- mov TMP, (DST, N)
-.Lshift_next:
- sub $16, N
- C FIXME: Handle the case N == 16 specially,
- C like in the non-shifted case?
-C ja .Lshift_loop
-C jz .Ldone
- jnc .Lshift_loop
-
- add $15, N
- jnc .Ldone
-
- shr $3, %rcx
- add %rcx, AP
- add %rcx, BP
- jmp .Lfinal_loop
-
-.Lno_shift_case:
C Next destination word is -8(DST, N)
C Setup for unrolling
test $8, N
@@ -183,21 +92,18 @@ C jz .Ldone
sub $8, N
jz .Lone_word
- mov (AP, N), TMP
- xor (BP, N), TMP
- mov TMP, (DST, N)
+ mov (SRC, N), TMP
+ xor TMP, (DST, N)
jmp .Lword_next
ALIGN(16)
.Lword_loop:
- mov 8(AP, N), TMP
- mov (AP, N), TMP2
- xor 8(BP, N), TMP
- xor (BP, N), TMP2
- mov TMP, 8(DST, N)
- mov TMP2, (DST, N)
+ mov 8(SRC, N), TMP
+ mov (SRC, N), TMP2
+ xor TMP, 8(DST, N)
+ xor TMP2, (DST, N)
.Lword_next:
sub $16, N
@@ -205,51 +111,45 @@ C jz .Ldone
jnz .Lfinal
C Final operation is word aligned
- mov 8(AP, N), TMP
- xor 8(BP, N), TMP
- mov TMP, 8(DST, N)
+ mov 8(SRC, N), TMP
+ xor TMP, 8(DST, N)
.Lone_word:
- mov (AP, N), TMP
- xor (BP, N), TMP
- mov TMP, (DST, N)
+ mov (SRC, N), TMP
+ xor TMP, (DST, N)
- C ENTRY might have been 3 args, too, but it doesn't matter for the exit
- W64_EXIT(4, 0)
+ W64_EXIT(3, 0)
ret
.Lfinal:
add $15, N
.Lfinal_loop:
- movb (AP, N), LREG(TMP)
- xorb (BP, N), LREG(TMP)
- movb LREG(TMP), (DST, N)
+ movb (SRC, N), LREG(TMP)
+ xorb LREG(TMP), (DST, N)
.Lfinal_next:
sub $1, N
jnc .Lfinal_loop
.Ldone:
- C ENTRY might have been 3 args, too, but it doesn't matter for the exit
- W64_EXIT(4, 0)
+ W64_EXIT(3, 0)
ret
-ifelse(USE_SSE2, yes, <
+ifdef(<USE_SSE2>, <
.Lsse2_case:
lea (DST, N), TMP
test $8, TMP
jz .Lsse2_next
sub $8, N
- mov (AP, N), TMP
- xor (BP, N), TMP
- mov TMP, (DST, N)
+ mov (SRC, N), TMP
+ xor TMP, (DST, N)
jmp .Lsse2_next
ALIGN(16)
.Lsse2_loop:
- movdqu (AP, N), %xmm0
- movdqu (BP, N), %xmm1
+ movdqu (SRC, N), %xmm0
+ movdqa (DST, N), %xmm1
pxor %xmm0, %xmm1
movdqa %xmm1, (DST, N)
.Lsse2_next:
@@ -261,14 +161,13 @@ ifelse(USE_SSE2, yes, <
jnz .Lfinal
C Final operation is aligned
- movdqu (AP), %xmm0
- movdqu (BP), %xmm1
+ movdqu (SRC), %xmm0
+ movdqa (DST), %xmm1
pxor %xmm0, %xmm1
movdqa %xmm1, (DST)
- C ENTRY might have been 3 args, too, but it doesn't matter for the exit
- W64_EXIT(4, 0)
+
+ W64_EXIT(3, 0)
ret
>)
-
-EPILOGUE(nettle_memxor3)
+EPILOGUE(nettle_memxor)
diff --git a/x86_64/memxor3.asm b/x86_64/memxor3.asm
new file mode 100644
index 00000000..8ff3e79c
--- /dev/null
+++ b/x86_64/memxor3.asm
@@ -0,0 +1,263 @@
+C x86_64/memxor3.asm
+
+ifelse(<
+ Copyright (C) 2010, 2014 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+define(<DST>, <%rax>) C Originally in %rdi
+define(<AP>, <%rsi>)
+define(<BP>, <%rdx>)
+C N is moved out of %rcx at entry, since %cl is needed for shift counts
+define(<N>, <%r10>)
+define(<TMP>, <%r8>)
+define(<TMP2>, <%r9>)
+define(<CNT>, <%rdi>)
+define(<S0>, <%r11>)
+C Overlap is safe: CNT is dead after the alignment loop, and S1 is
+C used only in the later shift case
+define(<S1>, <%rdi>) C Overlaps with CNT
+
+C Set to yes to build the SSE2 code path
+define(<USE_SSE2>, <no>)
+
+ .file "memxor3.asm"
+
+ .text
+
+ C memxor3(void *dst, const void *a, const void *b, size_t n)
+ C %rdi %rsi %rdx %rcx
+ ALIGN(16)
+
+PROLOGUE(nettle_memxor3)
+	W64_ENTRY(4, 0)
+	C %cl needed for shift count, so move away N
+	mov	%rcx, N
+C NOTE(review): .Lmemxor3_entry is unreferenced now that memxor lives
+C in its own file; candidate for removal.
+.Lmemxor3_entry:
+	test	N, N
+	C Get number of unaligned bytes at the end
+	C %rdi is used as CNT, %rax as DST and as return value
+	mov	%rdi, %rax
+	jz	.Ldone
+	C The buffers are processed from the end (index N) downwards.
+	add	N, CNT
+	and	$7, CNT
+
+	jz	.Laligned
+
+	cmp	$8, N
+	jc	.Lfinal_next
+
+	C FIXME: Instead of this loop, could try cmov with memory
+	C destination, as a sequence of one 8-bit, one 16-bit and one
+	C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+	C that step has to use a conditional).
+.Lalign_loop:
+
+	C Byte at a time until DST + N is 8-byte aligned
+	sub	$1, N
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+	sub	$1, CNT
+	jnz	.Lalign_loop
+
+.Laligned:
+ifelse(USE_SSE2, yes, <
+	cmp	$16, N
+	jnc	.Lsse2_case
+>)
+	C Check for the case that AP and BP have the same alignment,
+	C but different from DST.
+	mov	AP, TMP
+	sub	BP, TMP
+	test	$7, TMP
+	jnz	.Lno_shift_case
+	mov	AP, %rcx
+	sub	DST, %rcx
+	and	$7, %rcx
+	jz	.Lno_shift_case
+	C Align AP and BP down to 8 bytes; %rcx becomes the bit shift
+	C needed to realign loaded words with DST.
+	sub	%rcx, AP
+	sub	%rcx, BP
+	shl	$3, %rcx
+
+	C Unrolling, with aligned values alternating in S0 and S1
+	test	$8, N
+	jnz	.Lshift_odd
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	jmp	.Lshift_next
+
+.Lshift_odd:
+	mov	-8(AP, N), S1
+	mov	(AP, N), S0
+	xor	-8(BP, N), S1
+	xor	(BP, N), S0
+	C Combine adjacent xored words: %cl is negated around the shl
+	C to get the complementary (64 - %cl) shift count.
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg	%cl
+
+	or	S0, TMP
+	mov	TMP, -8(DST, N)
+	sub	$8, N
+	jz	.Ldone
+	jmp	.Lshift_next
+
+	ALIGN(16)
+
+.Lshift_loop:
+	C Two destination words per iteration, alternating the roles
+	C of S0 and S1 so each aligned word is loaded only once.
+	mov	8(AP, N), S0
+	xor	8(BP, N), S0
+	mov	S0, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S1
+	neg	%cl
+	or	S1, TMP
+	mov	TMP, 8(DST, N)
+
+	mov	(AP, N), S1
+	xor	(BP, N), S1
+	mov	S1, TMP
+	shr	%cl, TMP
+	neg	%cl
+	shl	%cl, S0
+	neg	%cl
+	or	S0, TMP
+	mov	TMP, (DST, N)
+.Lshift_next:
+	sub	$16, N
+	C FIXME: Handle the case N == 16 specially,
+	C like in the non-shifted case?
+C	ja	.Lshift_loop
+C	jz	.Ldone
+	jnc	.Lshift_loop
+
+	add	$15, N
+	jnc	.Ldone
+
+	C Undo the earlier alignment of AP and BP before falling back
+	C to the byte-wise final loop.
+	shr	$3, %rcx
+	add	%rcx, AP
+	add	%rcx, BP
+	jmp	.Lfinal_loop
+
+.Lno_shift_case:
+	C Next destination word is -8(DST, N)
+	C Setup for unrolling
+	test	$8, N
+	jz	.Lword_next
+
+	sub	$8, N
+	jz	.Lone_word
+
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+
+	jmp	.Lword_next
+
+	ALIGN(16)
+
+.Lword_loop:
+	C Two 8-byte words per iteration
+	mov	8(AP, N), TMP
+	mov	(AP, N), TMP2
+	xor	8(BP, N), TMP
+	xor	(BP, N), TMP2
+	mov	TMP, 8(DST, N)
+	mov	TMP2, (DST, N)
+
+.Lword_next:
+	sub	$16, N
+	ja	.Lword_loop	C Not zero and no carry
+	jnz	.Lfinal
+
+	C Final operation is word aligned
+	mov	8(AP, N), TMP
+	xor	8(BP, N), TMP
+	mov	TMP, 8(DST, N)
+
+.Lone_word:
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+
+	C Matches W64_ENTRY(4, 0) at function entry
+	W64_EXIT(4, 0)
+	ret
+
+.Lfinal:
+	add	$15, N
+
+.Lfinal_loop:
+	C Trailing bytes, one at a time, counting N down to zero
+	movb	(AP, N), LREG(TMP)
+	xorb	(BP, N), LREG(TMP)
+	movb	LREG(TMP), (DST, N)
+.Lfinal_next:
+	sub	$1, N
+	jnc	.Lfinal_loop
+
+.Ldone:
+	C Matches W64_ENTRY(4, 0) at function entry
+	W64_EXIT(4, 0)
+	ret
+
+ifelse(USE_SSE2, yes, <
+
+.Lsse2_case:
+	C Do one 8-byte step if needed, so that DST + N is 16-byte
+	C aligned for the movdqa stores below.
+	lea	(DST, N), TMP
+	test	$8, TMP
+	jz	.Lsse2_next
+	sub	$8, N
+	mov	(AP, N), TMP
+	xor	(BP, N), TMP
+	mov	TMP, (DST, N)
+	jmp	.Lsse2_next
+
+	ALIGN(16)
+.Lsse2_loop:
+	C Sources may be unaligned (movdqu); destination is aligned
+	movdqu	(AP, N), %xmm0
+	movdqu	(BP, N), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST, N)
+.Lsse2_next:
+	sub	$16, N
+	ja	.Lsse2_loop
+
+	C FIXME: See if we can do a full word first, before the
+	C byte-wise final loop.
+	jnz	.Lfinal
+
+	C Final operation is aligned
+	movdqu	(AP), %xmm0
+	movdqu	(BP), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST)
+	C Matches W64_ENTRY(4, 0) at function entry
+	W64_EXIT(4, 0)
+	ret
+>)
+
+
+EPILOGUE(nettle_memxor3)