-rw-r--r--  Makefile.am        |    1
-rw-r--r--  configure.host     |   12
-rw-r--r--  src/x86/darwin.S   |  444
-rw-r--r--  src/x86/darwin64.S |  416
-rw-r--r--  src/x86/ffi.c      |   29
-rw-r--r--  src/x86/sysv.S     |  884
-rw-r--r--  src/x86/unix64.S   |  390
7 files changed, 896 insertions(+), 1280 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index dfdcea6..6fb3d47 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -144,7 +144,6 @@ EXTRA_libffi_la_SOURCES = \
src/x86/ffi.c src/x86/sysv.S \
src/x86/ffiw64.c src/x86/win64.S \
src/x86/ffi64.c src/x86/unix64.S \
- src/x86/darwin64.S src/x86/darwin.S \
src/xtensa/ffi.c src/xtensa/sysv.S
TARGET_OBJ = @TARGET_OBJ@
diff --git a/configure.host b/configure.host
index bc3e838..5ee632c 100644
--- a/configure.host
+++ b/configure.host
@@ -84,7 +84,12 @@ case "${host}" in
;;
i?86-*-darwin* | x86_64-*-darwin*)
- TARGET=X86_DARWIN; TARGETDIR=x86
+ TARGETDIR=x86
+ if test $ac_cv_sizeof_size_t = 4; then
+ TARGET=X86_DARWIN
+ else
+ TARGET=X86_64
+ fi
;;
i?86-*-* | x86_64-*-* | amd64-*)
@@ -237,7 +242,7 @@ case "${TARGET}" in
POWERPC_FREEBSD)
SOURCES="ffi.c ffi_sysv.c sysv.S ppc_closure.S"
;;
- X86 | X86_FREEBSD | X86_WIN32)
+ X86 | X86_DARWIN | X86_FREEBSD | X86_WIN32)
SOURCES="ffi.c sysv.S"
;;
X86_64)
@@ -246,9 +251,6 @@ case "${TARGET}" in
X86_WIN64)
SOURCES="ffiw64.c win64.S"
;;
- X86_DARWIN)
- SOURCES="ffi.c darwin.S ffi64.c darwin64.S"
- ;;
esac
# If we failed to configure SOURCES, we can't do anything.
diff --git a/src/x86/darwin.S b/src/x86/darwin.S
deleted file mode 100644
index 8f0f070..0000000
--- a/src/x86/darwin.S
+++ /dev/null
@@ -1,444 +0,0 @@
-/* -----------------------------------------------------------------------
- darwin.S - Copyright (c) 1996, 1998, 2001, 2002, 2003, 2005 Red Hat, Inc.
- Copyright (C) 2008 Free Software Foundation, Inc.
-
- X86 Foreign Function Interface
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- ``Software''), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- DEALINGS IN THE SOFTWARE.
- -----------------------------------------------------------------------
- */
-
-#ifndef __x86_64__
-
-#define LIBFFI_ASM
-#include <fficonfig.h>
-#include <ffi.h>
-
-.text
-
-.globl _ffi_prep_args
-
- .align 4
-.globl _ffi_call_SYSV
-
-_ffi_call_SYSV:
-.LFB1:
- pushl %ebp
-.LCFI0:
- movl %esp,%ebp
-.LCFI1:
- subl $8,%esp
- /* Make room for all of the new args. */
- movl 16(%ebp),%ecx
- subl %ecx,%esp
-
- movl %esp,%eax
-
- /* Place all of the ffi_prep_args in position */
- subl $8,%esp
- pushl 12(%ebp)
- pushl %eax
- call *8(%ebp)
-
- /* Return stack to previous state and call the function */
- addl $16,%esp
-
- call *28(%ebp)
-
- /* Load %ecx with the return type code */
- movl 20(%ebp),%ecx
-
- /* Protect %esi. We're going to pop it in the epilogue. */
- pushl %esi
-
- /* If the return value pointer is NULL, assume no return value. */
- cmpl $0,24(%ebp)
- jne 0f
-
- /* Even if there is no space for the return value, we are
- obliged to handle floating-point values. */
- cmpl $FFI_TYPE_FLOAT,%ecx
- jne noretval
- fstp %st(0)
-
- jmp epilogue
-0:
- .align 4
- call 1f
-.Lstore_table:
- .long noretval-.Lstore_table /* FFI_TYPE_VOID */
- .long retint-.Lstore_table /* FFI_TYPE_INT */
- .long retfloat-.Lstore_table /* FFI_TYPE_FLOAT */
- .long retdouble-.Lstore_table /* FFI_TYPE_DOUBLE */
- .long retlongdouble-.Lstore_table /* FFI_TYPE_LONGDOUBLE */
- .long retuint8-.Lstore_table /* FFI_TYPE_UINT8 */
- .long retsint8-.Lstore_table /* FFI_TYPE_SINT8 */
- .long retuint16-.Lstore_table /* FFI_TYPE_UINT16 */
- .long retsint16-.Lstore_table /* FFI_TYPE_SINT16 */
- .long retint-.Lstore_table /* FFI_TYPE_UINT32 */
- .long retint-.Lstore_table /* FFI_TYPE_SINT32 */
- .long retint64-.Lstore_table /* FFI_TYPE_UINT64 */
- .long retint64-.Lstore_table /* FFI_TYPE_SINT64 */
- .long retstruct-.Lstore_table /* FFI_TYPE_STRUCT */
- .long retint-.Lstore_table /* FFI_TYPE_POINTER */
- .long retstruct1b-.Lstore_table /* FFI_TYPE_SMALL_STRUCT_1B */
- .long retstruct2b-.Lstore_table /* FFI_TYPE_SMALL_STRUCT_2B */
-1:
- pop %esi
- add (%esi, %ecx, 4), %esi
- jmp *%esi
-
- /* Sign/zero extend as appropriate. */
-retsint8:
- movsbl %al, %eax
- jmp retint
-
-retsint16:
- movswl %ax, %eax
- jmp retint
-
-retuint8:
- movzbl %al, %eax
- jmp retint
-
-retuint16:
- movzwl %ax, %eax
- jmp retint
-
-retfloat:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- fstps (%ecx)
- jmp epilogue
-
-retdouble:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- fstpl (%ecx)
- jmp epilogue
-
-retlongdouble:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- fstpt (%ecx)
- jmp epilogue
-
-retint64:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- movl %eax,0(%ecx)
- movl %edx,4(%ecx)
- jmp epilogue
-
-retstruct1b:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- movb %al,0(%ecx)
- jmp epilogue
-
-retstruct2b:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- movw %ax,0(%ecx)
- jmp epilogue
-
-retint:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- movl %eax,0(%ecx)
-
-retstruct:
- /* Nothing to do! */
-
-noretval:
-epilogue:
- popl %esi
- movl %ebp,%esp
- popl %ebp
- ret
-
-.LFE1:
-.ffi_call_SYSV_end:
-
- .align 4
-FFI_HIDDEN (ffi_closure_SYSV)
-.globl _ffi_closure_SYSV
-
-_ffi_closure_SYSV:
-.LFB2:
- pushl %ebp
-.LCFI2:
- movl %esp, %ebp
-.LCFI3:
- subl $40, %esp
- leal -24(%ebp), %edx
- movl %edx, -12(%ebp) /* resp */
- leal 8(%ebp), %edx
- movl %edx, 4(%esp) /* args = __builtin_dwarf_cfa () */
- leal -12(%ebp), %edx
- movl %edx, (%esp) /* &resp */
- movl %ebx, 8(%esp)
-.LCFI7:
- call L_ffi_closure_SYSV_inner$stub
- movl 8(%esp), %ebx
- movl -12(%ebp), %ecx
- cmpl $FFI_TYPE_INT, %eax
- je .Lcls_retint
-
- /* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
- FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32. */
- cmpl $FFI_TYPE_UINT64, %eax
- jge 0f
- cmpl $FFI_TYPE_UINT8, %eax
- jge .Lcls_retint
-
-0: cmpl $FFI_TYPE_FLOAT, %eax
- je .Lcls_retfloat
- cmpl $FFI_TYPE_DOUBLE, %eax
- je .Lcls_retdouble
- cmpl $FFI_TYPE_LONGDOUBLE, %eax
- je .Lcls_retldouble
- cmpl $FFI_TYPE_SINT64, %eax
- je .Lcls_retllong
- cmpl $FFI_TYPE_SMALL_STRUCT_1B, %eax
- je .Lcls_retstruct1b
- cmpl $FFI_TYPE_SMALL_STRUCT_2B, %eax
- je .Lcls_retstruct2b
- cmpl $FFI_TYPE_STRUCT, %eax
- je .Lcls_retstruct
-.Lcls_epilogue:
- movl %ebp, %esp
- popl %ebp
- ret
-.Lcls_retint:
- movl (%ecx), %eax
- jmp .Lcls_epilogue
-.Lcls_retfloat:
- flds (%ecx)
- jmp .Lcls_epilogue
-.Lcls_retdouble:
- fldl (%ecx)
- jmp .Lcls_epilogue
-.Lcls_retldouble:
- fldt (%ecx)
- jmp .Lcls_epilogue
-.Lcls_retllong:
- movl (%ecx), %eax
- movl 4(%ecx), %edx
- jmp .Lcls_epilogue
-.Lcls_retstruct1b:
- movsbl (%ecx), %eax
- jmp .Lcls_epilogue
-.Lcls_retstruct2b:
- movswl (%ecx), %eax
- jmp .Lcls_epilogue
-.Lcls_retstruct:
- lea -8(%ebp),%esp
- movl %ebp, %esp
- popl %ebp
- ret $4
-.LFE2:
-
-#if !FFI_NO_RAW_API
-
-#define RAW_CLOSURE_CIF_OFFSET ((FFI_TRAMPOLINE_SIZE + 3) & ~3)
-#define RAW_CLOSURE_FUN_OFFSET (RAW_CLOSURE_CIF_OFFSET + 4)
-#define RAW_CLOSURE_USER_DATA_OFFSET (RAW_CLOSURE_FUN_OFFSET + 4)
-#define CIF_FLAGS_OFFSET 20
-
- .align 4
-FFI_HIDDEN (ffi_closure_raw_SYSV)
-.globl _ffi_closure_raw_SYSV
-
-_ffi_closure_raw_SYSV:
-.LFB3:
- pushl %ebp
-.LCFI4:
- movl %esp, %ebp
-.LCFI5:
- pushl %esi
-.LCFI6:
- subl $36, %esp
- movl RAW_CLOSURE_CIF_OFFSET(%eax), %esi /* closure->cif */
- movl RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
- movl %edx, 12(%esp) /* user_data */
- leal 8(%ebp), %edx /* __builtin_dwarf_cfa () */
- movl %edx, 8(%esp) /* raw_args */
- leal -24(%ebp), %edx
- movl %edx, 4(%esp) /* &res */
- movl %esi, (%esp) /* cif */
- call *RAW_CLOSURE_FUN_OFFSET(%eax) /* closure->fun */
- movl CIF_FLAGS_OFFSET(%esi), %eax /* rtype */
- cmpl $FFI_TYPE_INT, %eax
- je .Lrcls_retint
-
- /* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
- FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32. */
- cmpl $FFI_TYPE_UINT64, %eax
- jge 0f
- cmpl $FFI_TYPE_UINT8, %eax
- jge .Lrcls_retint
-0:
- cmpl $FFI_TYPE_FLOAT, %eax
- je .Lrcls_retfloat
- cmpl $FFI_TYPE_DOUBLE, %eax
- je .Lrcls_retdouble
- cmpl $FFI_TYPE_LONGDOUBLE, %eax
- je .Lrcls_retldouble
- cmpl $FFI_TYPE_SINT64, %eax
- je .Lrcls_retllong
-.Lrcls_epilogue:
- addl $36, %esp
- popl %esi
- popl %ebp
- ret
-.Lrcls_retint:
- movl -24(%ebp), %eax
- jmp .Lrcls_epilogue
-.Lrcls_retfloat:
- flds -24(%ebp)
- jmp .Lrcls_epilogue
-.Lrcls_retdouble:
- fldl -24(%ebp)
- jmp .Lrcls_epilogue
-.Lrcls_retldouble:
- fldt -24(%ebp)
- jmp .Lrcls_epilogue
-.Lrcls_retllong:
- movl -24(%ebp), %eax
- movl -20(%ebp), %edx
- jmp .Lrcls_epilogue
-.LFE3:
-#endif
-
-.section __IMPORT,__jump_table,symbol_stubs,self_modifying_code+pure_instructions,5
-L_ffi_closure_SYSV_inner$stub:
- .indirect_symbol _ffi_closure_SYSV_inner
- hlt ; hlt ; hlt ; hlt ; hlt
-
-
-.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
-EH_frame1:
- .set L$set$0,LECIE1-LSCIE1
- .long L$set$0
-LSCIE1:
- .long 0x0
- .byte 0x1
- .ascii "zR\0"
- .byte 0x1
- .byte 0x7c
- .byte 0x8
- .byte 0x1
- .byte 0x10
- .byte 0xc
- .byte 0x5
- .byte 0x4
- .byte 0x88
- .byte 0x1
- .align 2
-LECIE1:
-.globl _ffi_call_SYSV.eh
-_ffi_call_SYSV.eh:
-LSFDE1:
- .set L$set$1,LEFDE1-LASFDE1
- .long L$set$1
-LASFDE1:
- .long LASFDE1-EH_frame1
- .long .LFB1-.
- .set L$set$2,.LFE1-.LFB1
- .long L$set$2
- .byte 0x0
- .byte 0x4
- .set L$set$3,.LCFI0-.LFB1
- .long L$set$3
- .byte 0xe
- .byte 0x8
- .byte 0x84
- .byte 0x2
- .byte 0x4
- .set L$set$4,.LCFI1-.LCFI0
- .long L$set$4
- .byte 0xd
- .byte 0x4
- .align 2
-LEFDE1:
-.globl _ffi_closure_SYSV.eh
-_ffi_closure_SYSV.eh:
-LSFDE2:
- .set L$set$5,LEFDE2-LASFDE2
- .long L$set$5
-LASFDE2:
- .long LASFDE2-EH_frame1
- .long .LFB2-.
- .set L$set$6,.LFE2-.LFB2
- .long L$set$6
- .byte 0x0
- .byte 0x4
- .set L$set$7,.LCFI2-.LFB2
- .long L$set$7
- .byte 0xe
- .byte 0x8
- .byte 0x84
- .byte 0x2
- .byte 0x4
- .set L$set$8,.LCFI3-.LCFI2
- .long L$set$8
- .byte 0xd
- .byte 0x4
- .align 2
-LEFDE2:
-
-#if !FFI_NO_RAW_API
-
-.globl _ffi_closure_raw_SYSV.eh
-_ffi_closure_raw_SYSV.eh:
-LSFDE3:
- .set L$set$10,LEFDE3-LASFDE3
- .long L$set$10
-LASFDE3:
- .long LASFDE3-EH_frame1
- .long .LFB3-.
- .set L$set$11,.LFE3-.LFB3
- .long L$set$11
- .byte 0x0
- .byte 0x4
- .set L$set$12,.LCFI4-.LFB3
- .long L$set$12
- .byte 0xe
- .byte 0x8
- .byte 0x84
- .byte 0x2
- .byte 0x4
- .set L$set$13,.LCFI5-.LCFI4
- .long L$set$13
- .byte 0xd
- .byte 0x4
- .byte 0x4
- .set L$set$14,.LCFI6-.LCFI5
- .long L$set$14
- .byte 0x85
- .byte 0x3
- .align 2
-LEFDE3:
-
-#endif
-
-#endif /* ifndef __x86_64__ */
diff --git a/src/x86/darwin64.S b/src/x86/darwin64.S
deleted file mode 100644
index 2f7394e..0000000
--- a/src/x86/darwin64.S
+++ /dev/null
@@ -1,416 +0,0 @@
-/* -----------------------------------------------------------------------
- darwin64.S - Copyright (c) 2006 Free Software Foundation, Inc.
- Copyright (c) 2008 Red Hat, Inc.
- derived from unix64.S
-
- x86-64 Foreign Function Interface for Darwin.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- ``Software''), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR
- OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- OTHER DEALINGS IN THE SOFTWARE.
- ----------------------------------------------------------------------- */
-
-#ifdef __x86_64__
-#define LIBFFI_ASM
-#include <fficonfig.h>
-#include <ffi.h>
-
- .file "darwin64.S"
-.text
-
-/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
- void *raddr, void (*fnaddr)(void));
-
- Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
- for this function. This has been allocated by ffi_call. We also
- deallocate some of the stack that has been alloca'd. */
-
- .align 3
- .globl _ffi_call_unix64
-
-_ffi_call_unix64:
-LUW0:
- movq (%rsp), %r10 /* Load return address. */
- leaq (%rdi, %rsi), %rax /* Find local stack base. */
- movq %rdx, (%rax) /* Save flags. */
- movq %rcx, 8(%rax) /* Save raddr. */
- movq %rbp, 16(%rax) /* Save old frame pointer. */
- movq %r10, 24(%rax) /* Relocate return address. */
- movq %rax, %rbp /* Finalize local stack frame. */
-LUW1:
- movq %rdi, %r10 /* Save a copy of the register area. */
- movq %r8, %r11 /* Save a copy of the target fn. */
- movl %r9d, %eax /* Set number of SSE registers. */
-
- /* Load up all argument registers. */
- movq (%r10), %rdi
- movq 8(%r10), %rsi
- movq 16(%r10), %rdx
- movq 24(%r10), %rcx
- movq 32(%r10), %r8
- movq 40(%r10), %r9
- testl %eax, %eax
- jnz Lload_sse
-Lret_from_load_sse:
-
- /* Deallocate the reg arg area. */
- leaq 176(%r10), %rsp
-
- /* Call the user function. */
- call *%r11
-
- /* Deallocate stack arg area; local stack frame in redzone. */
- leaq 24(%rbp), %rsp
-
- movq 0(%rbp), %rcx /* Reload flags. */
- movq 8(%rbp), %rdi /* Reload raddr. */
- movq 16(%rbp), %rbp /* Reload old frame pointer. */
-LUW2:
-
- /* The first byte of the flags contains the FFI_TYPE. */
- movzbl %cl, %r10d
- leaq Lstore_table(%rip), %r11
- movslq (%r11, %r10, 4), %r10
- addq %r11, %r10
- jmp *%r10
-
-Lstore_table:
- .long Lst_void-Lstore_table /* FFI_TYPE_VOID */
- .long Lst_sint32-Lstore_table /* FFI_TYPE_INT */
- .long Lst_float-Lstore_table /* FFI_TYPE_FLOAT */
- .long Lst_double-Lstore_table /* FFI_TYPE_DOUBLE */
- .long Lst_ldouble-Lstore_table /* FFI_TYPE_LONGDOUBLE */
- .long Lst_uint8-Lstore_table /* FFI_TYPE_UINT8 */
- .long Lst_sint8-Lstore_table /* FFI_TYPE_SINT8 */
- .long Lst_uint16-Lstore_table /* FFI_TYPE_UINT16 */
- .long Lst_sint16-Lstore_table /* FFI_TYPE_SINT16 */
- .long Lst_uint32-Lstore_table /* FFI_TYPE_UINT32 */
- .long Lst_sint32-Lstore_table /* FFI_TYPE_SINT32 */
- .long Lst_int64-Lstore_table /* FFI_TYPE_UINT64 */
- .long Lst_int64-Lstore_table /* FFI_TYPE_SINT64 */
- .long Lst_struct-Lstore_table /* FFI_TYPE_STRUCT */
- .long Lst_int64-Lstore_table /* FFI_TYPE_POINTER */
-
- .text
- .align 3
-Lst_void:
- ret
- .align 3
-Lst_uint8:
- movzbq %al, %rax
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_sint8:
- movsbq %al, %rax
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_uint16:
- movzwq %ax, %rax
- movq %rax, (%rdi)
- .align 3
-Lst_sint16:
- movswq %ax, %rax
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_uint32:
- movl %eax, %eax
- movq %rax, (%rdi)
- .align 3
-Lst_sint32:
- cltq
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_int64:
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_float:
- movss %xmm0, (%rdi)
- ret
- .align 3
-Lst_double:
- movsd %xmm0, (%rdi)
- ret
-Lst_ldouble:
- fstpt (%rdi)
- ret
- .align 3
-Lst_struct:
- leaq -20(%rsp), %rsi /* Scratch area in redzone. */
-
- /* We have to locate the values now, and since we don't want to
- write too much data into the user's return value, we spill the
- value to a 16 byte scratch area first. Bits 8, 9, and 10
- control where the values are located. Only one of the three
- bits will be set; see ffi_prep_cif_machdep for the pattern. */
- movd %xmm0, %r10
- movd %xmm1, %r11
- testl $0x100, %ecx
- cmovnz %rax, %rdx
- cmovnz %r10, %rax
- testl $0x200, %ecx
- cmovnz %r10, %rdx
- testl $0x400, %ecx
- cmovnz %r10, %rax
- cmovnz %r11, %rdx
- movq %rax, (%rsi)
- movq %rdx, 8(%rsi)
-
- /* Bits 12-31 contain the true size of the structure. Copy from
- the scratch area to the true destination. */
- shrl $12, %ecx
- rep movsb
- ret
-
- /* Many times we can avoid loading any SSE registers at all.
- It's not worth an indirect jump to load the exact set of
- SSE registers needed; zero or all is a good compromise. */
- .align 3
-LUW3:
-Lload_sse:
- movdqa 48(%r10), %xmm0
- movdqa 64(%r10), %xmm1
- movdqa 80(%r10), %xmm2
- movdqa 96(%r10), %xmm3
- movdqa 112(%r10), %xmm4
- movdqa 128(%r10), %xmm5
- movdqa 144(%r10), %xmm6
- movdqa 160(%r10), %xmm7
- jmp Lret_from_load_sse
-
-LUW4:
- .align 3
- .globl _ffi_closure_unix64
-
-_ffi_closure_unix64:
-LUW5:
- /* The carry flag is set by the trampoline iff SSE registers
- are used. Don't clobber it before the branch instruction. */
- leaq -200(%rsp), %rsp
-LUW6:
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- jc Lsave_sse
-Lret_from_save_sse:
-
- movq %r10, %rdi
- leaq 176(%rsp), %rsi
- movq %rsp, %rdx
- leaq 208(%rsp), %rcx
- call _ffi_closure_unix64_inner
-
- /* Deallocate stack frame early; return value is now in redzone. */
- addq $200, %rsp
-LUW7:
-
- /* The first byte of the return value contains the FFI_TYPE. */
- movzbl %al, %r10d
- leaq Lload_table(%rip), %r11
- movslq (%r11, %r10, 4), %r10
- addq %r11, %r10
- jmp *%r10
-
-Lload_table:
- .long Lld_void-Lload_table /* FFI_TYPE_VOID */
- .long Lld_int32-Lload_table /* FFI_TYPE_INT */
- .long Lld_float-Lload_table /* FFI_TYPE_FLOAT */
- .long Lld_double-Lload_table /* FFI_TYPE_DOUBLE */
- .long Lld_ldouble-Lload_table /* FFI_TYPE_LONGDOUBLE */
- .long Lld_int8-Lload_table /* FFI_TYPE_UINT8 */
- .long Lld_int8-Lload_table /* FFI_TYPE_SINT8 */
- .long Lld_int16-Lload_table /* FFI_TYPE_UINT16 */
- .long Lld_int16-Lload_table /* FFI_TYPE_SINT16 */
- .long Lld_int32-Lload_table /* FFI_TYPE_UINT32 */
- .long Lld_int32-Lload_table /* FFI_TYPE_SINT32 */
- .long Lld_int64-Lload_table /* FFI_TYPE_UINT64 */
- .long Lld_int64-Lload_table /* FFI_TYPE_SINT64 */
- .long Lld_struct-Lload_table /* FFI_TYPE_STRUCT */
- .long Lld_int64-Lload_table /* FFI_TYPE_POINTER */
-
- .text
- .align 3
-Lld_void:
- ret
- .align 3
-Lld_int8:
- movzbl -24(%rsp), %eax
- ret
- .align 3
-Lld_int16:
- movzwl -24(%rsp), %eax
- ret
- .align 3
-Lld_int32:
- movl -24(%rsp), %eax
- ret
- .align 3
-Lld_int64:
- movq -24(%rsp), %rax
- ret
- .align 3
-Lld_float:
- movss -24(%rsp), %xmm0
- ret
- .align 3
-Lld_double:
- movsd -24(%rsp), %xmm0
- ret
- .align 3
-Lld_ldouble:
- fldt -24(%rsp)
- ret
- .align 3
-Lld_struct:
- /* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
- %rax/%xmm0, %xmm0/%xmm1. We collapse two by always loading
- both rdx and xmm1 with the second word. For the remaining,
- bit 8 set means xmm0 gets the second word, and bit 9 means
- that rax gets the second word. */
- movq -24(%rsp), %rcx
- movq -16(%rsp), %rdx
- movq -16(%rsp), %xmm1
- testl $0x100, %eax
- cmovnz %rdx, %rcx
- movd %rcx, %xmm0
- testl $0x200, %eax
- movq -24(%rsp), %rax
- cmovnz %rdx, %rax
- ret
-
- /* See the comment above Lload_sse; the same logic applies here. */
- .align 3
-LUW8:
-Lsave_sse:
- movdqa %xmm0, 48(%rsp)
- movdqa %xmm1, 64(%rsp)
- movdqa %xmm2, 80(%rsp)
- movdqa %xmm3, 96(%rsp)
- movdqa %xmm4, 112(%rsp)
- movdqa %xmm5, 128(%rsp)
- movdqa %xmm6, 144(%rsp)
- movdqa %xmm7, 160(%rsp)
- jmp Lret_from_save_sse
-
-LUW9:
-.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
-EH_frame1:
- .set L$set$0,LECIE1-LSCIE1 /* CIE Length */
- .long L$set$0
-LSCIE1:
- .long 0x0 /* CIE Identifier Tag */
- .byte 0x1 /* CIE Version */
- .ascii "zR\0" /* CIE Augmentation */
- .byte 0x1 /* uleb128 0x1; CIE Code Alignment Factor */
- .byte 0x78 /* sleb128 -8; CIE Data Alignment Factor */
- .byte 0x10 /* CIE RA Column */
- .byte 0x1 /* uleb128 0x1; Augmentation size */
- .byte 0x10 /* FDE Encoding (pcrel sdata4) */
- .byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */
- .byte 0x7 /* uleb128 0x7 */
- .byte 0x8 /* uleb128 0x8 */
- .byte 0x90 /* DW_CFA_offset, column 0x10 */
- .byte 0x1
- .align 3
-LECIE1:
- .globl _ffi_call_unix64.eh
-_ffi_call_unix64.eh:
-LSFDE1:
- .set L$set$1,LEFDE1-LASFDE1 /* FDE Length */
- .long L$set$1
-LASFDE1:
- .long LASFDE1-EH_frame1 /* FDE CIE offset */
- .quad LUW0-. /* FDE initial location */
- .set L$set$2,LUW4-LUW0 /* FDE address range */
- .quad L$set$2
- .byte 0x0 /* Augmentation size */
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$3,LUW1-LUW0
- .long L$set$3
-
- /* New stack frame based off rbp. This is a itty bit of unwind
- trickery in that the CFA *has* changed. There is no easy way
- to describe it correctly on entry to the function. Fortunately,
- it doesn't matter too much since at all points we can correctly
- unwind back to ffi_call. Note that the location to which we
- moved the return address is (the new) CFA-8, so from the
- perspective of the unwind info, it hasn't moved. */
- .byte 0xc /* DW_CFA_def_cfa, %rbp offset 32 */
- .byte 0x6
- .byte 0x20
- .byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */
- .byte 0x2
- .byte 0xa /* DW_CFA_remember_state */
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$4,LUW2-LUW1
- .long L$set$4
- .byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */
- .byte 0x7
- .byte 0x8
- .byte 0xc0+6 /* DW_CFA_restore, %rbp */
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$5,LUW3-LUW2
- .long L$set$5
- .byte 0xb /* DW_CFA_restore_state */
-
- .align 3
-LEFDE1:
- .globl _ffi_closure_unix64.eh
-_ffi_closure_unix64.eh:
-LSFDE3:
- .set L$set$6,LEFDE3-LASFDE3 /* FDE Length */
- .long L$set$6
-LASFDE3:
- .long LASFDE3-EH_frame1 /* FDE CIE offset */
- .quad LUW5-. /* FDE initial location */
- .set L$set$7,LUW9-LUW5 /* FDE address range */
- .quad L$set$7
- .byte 0x0 /* Augmentation size */
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$8,LUW6-LUW5
- .long L$set$8
- .byte 0xe /* DW_CFA_def_cfa_offset */
- .byte 208,1 /* uleb128 208 */
- .byte 0xa /* DW_CFA_remember_state */
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$9,LUW7-LUW6
- .long L$set$9
- .byte 0xe /* DW_CFA_def_cfa_offset */
- .byte 0x8
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$10,LUW8-LUW7
- .long L$set$10
- .byte 0xb /* DW_CFA_restore_state */
-
- .align 3
-LEFDE3:
- .subsections_via_symbols
-
-#endif /* __x86_64__ */
diff --git a/src/x86/ffi.c b/src/x86/ffi.c
index 4c96c6d..c4d740a 100644
--- a/src/x86/ffi.c
+++ b/src/x86/ffi.c
@@ -332,13 +332,28 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
else
{
size_t za = ALIGN (z, FFI_SIZEOF_ARG);
+ size_t align = FFI_SIZEOF_ARG;
+
+ /* Alignment rules for arguments are quite complex. Vectors and
+ structures with 16 byte alignment get it. Note that long double
+ on Darwin does have 16 byte alignment, and does not get this
+ alignment if passed directly; a structure with a long double
+ inside, however, would get 16 byte alignment. Since libffi does
+ not support vectors, we need not concern ourselves with other
+ cases. */
+ if (t == FFI_TYPE_STRUCT && ty->alignment >= 16)
+ align = 16;
+
if (dir < 0)
{
+ /* ??? These reverse argument ABIs are probably too old
+ to have cared about alignment. Someone should check. */
argp -= za;
memcpy (argp, valp, z);
}
else
{
+ argp = (char *)ALIGN (argp, align);
memcpy (argp, valp, z);
argp += za;
}
@@ -419,8 +434,9 @@ ffi_closure_inner (struct closure_frame *frame, char *stack)
arg_types = cif->arg_types;
for (i = 0; i < n; ++i)
{
- size_t z = arg_types[i]->size;
- int t = arg_types[i]->type;
+ ffi_type *ty = arg_types[i];
+ size_t z = ty->size;
+ int t = ty->type;
void *valp;
if (z <= FFI_SIZEOF_ARG && t != FFI_TYPE_STRUCT)
@@ -441,13 +457,22 @@ ffi_closure_inner (struct closure_frame *frame, char *stack)
else
{
size_t za = ALIGN (z, FFI_SIZEOF_ARG);
+ size_t align = FFI_SIZEOF_ARG;
+
+ /* See the comment in ffi_call_int. */
+ if (t == FFI_TYPE_STRUCT && ty->alignment >= 16)
+ align = 16;
+
if (dir < 0)
{
+ /* ??? These reverse argument ABIs are probably too old
+ to have cared about alignment. Someone should check. */
argp -= za;
valp = argp;
}
else
{
+ argp = (char *)ALIGN (argp, align);
valp = argp;
argp += za;
}
diff --git a/src/x86/sysv.S b/src/x86/sysv.S
index ebd1693..6043c67 100644
--- a/src/x86/sysv.S
+++ b/src/x86/sysv.S
@@ -30,7 +30,6 @@
#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>
-#include <ffi_cfi.h>
#include "internal.h"
#define C2(X, Y) X ## Y
@@ -41,6 +40,12 @@
# define C(X) X
#endif
+#ifdef X86_DARWIN
+# define L(X) C1(L, X)
+#else
+# define L(X) C1(.L, X)
+#endif
+
#ifdef __ELF__
# define ENDF(X) .type X,@function; .size X, . - X
#else
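
The C1/C2 pair above is the standard two-level token paste: the extra level of indirection forces both arguments to be macro-expanded (e.g. __USER_LABEL_PREFIX__) before ## glues them, so L(X) becomes LX on Darwin and .LX elsewhere. A minimal C sketch of the idiom, with PREFIX and mylabel as illustrative stand-ins (an actual .L prefix would not be a valid C identifier):

    #include <stdio.h>

    #define C2(X, Y) X ## Y
    #define C1(X, Y) C2(X, Y)   /* indirection so X and Y expand first */

    #define PREFIX my           /* stand-in for __USER_LABEL_PREFIX__ */
    #define SYM(X) C1(PREFIX, X)

    int SYM(label) = 42;        /* expands to: int mylabel = 42; */

    int
    main (void)
    {
      printf ("%d\n", SYM(label)); /* prints 42 */
      return 0;
    }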
@@ -60,14 +65,14 @@
actual table. The entry points into the table are all 8 bytes.
The use of ORG asserts that we're at the correct location. */
/* ??? The clang assembler doesn't handle .org with symbolic expressions. */
-#ifdef __clang__
-# define E(X) .align 8
+#if defined(__clang__) || defined(__APPLE__)
+# define E(BASE, X) .balign 8
#else
-# define E(X) .align 8; .org 0b + X * 8
+# define E(BASE, X) .balign 8; .org BASE + X * 8
#endif
.text
- .align 16
+ .balign 16
.globl ffi_call_i386
FFI_HIDDEN(ffi_call_i386)
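
The E() macro keeps every handler in a fixed 8-byte slot, so dispatch is simply a jump to table_base + type * 8; on GNU as, the .org directive turns an oversized handler into a build error, an assertion the clang/Apple branch has to forgo. A rough C analogue of that assertion, using offsetof where the assembler uses .org (the struct and field names here are illustrative):

    #include <stddef.h>

    #define SLOT 8

    struct handler { unsigned char code[SLOT]; }; /* one 8-byte entry */

    struct jump_table {
      struct handler ret_float;   /* slot 0 */
      struct handler ret_double;  /* slot 1 */
      struct handler ret_ldouble; /* slot 2 */
    };

    /* The moral equivalent of ".org BASE + X * 8": fail the build if
       an entry has drifted from its expected offset.  */
    _Static_assert (offsetof (struct jump_table, ret_double) == 1 * SLOT,
                    "slot 1 overflowed");
    _Static_assert (offsetof (struct jump_table, ret_ldouble) == 2 * SLOT,
                    "slot 2 overflowed");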
@@ -83,7 +88,8 @@
*/
ffi_call_i386:
- cfi_startproc
+L(UW0):
+ # cfi_startproc
movl (%esp), %eax /* move the return address */
movl %ebp, (%ecx) /* store %ebp into local frame */
movl %eax, 4(%ecx) /* store retaddr into local frame */
@@ -96,8 +102,9 @@ ffi_call_i386:
moved the return address is (the new) CFA-4, so from the
perspective of the unwind info, it hasn't moved. */
movl %ecx, %ebp
- cfi_def_cfa(%ebp, 8)
- cfi_rel_offset(%ebp, 0)
+L(UW1):
+ # cfi_def_cfa(%ebp, 8)
+ # cfi_rel_offset(%ebp, 0)
movl %edx, %esp /* set outgoing argument stack */
movl 20+R_EAX*4(%ebp), %eax /* set register arguments */
@@ -108,80 +115,86 @@ ffi_call_i386:
movl 12(%ebp), %ecx /* load return type code */
movl %ebx, 8(%ebp) /* preserve %ebx */
- cfi_rel_offset(%ebx, 8)
+L(UW2):
+ # cfi_rel_offset(%ebx, 8)
andl $X86_RET_TYPE_MASK, %ecx
#ifdef __PIC__
- call __x86.get_pc_thunk.bx
-1: leal 0f-1b(%ebx, %ecx, 8), %ebx
+ call C(__x86.get_pc_thunk.bx)
+L(pc1):
+ leal L(store_table)-L(pc1)(%ebx, %ecx, 8), %ebx
#else
- leal 0f(,%ecx, 8), %ebx
+ leal L(store_table)(,%ecx, 8), %ebx
#endif
movl 16(%ebp), %ecx /* load result address */
jmp *%ebx
- .align 8
-0:
-E(X86_RET_FLOAT)
+ .balign 8
+L(store_table):
+E(L(store_table), X86_RET_FLOAT)
fstps (%ecx)
- jmp 9f
-E(X86_RET_DOUBLE)
+ jmp L(e1)
+E(L(store_table), X86_RET_DOUBLE)
fstpl (%ecx)
- jmp 9f
-E(X86_RET_LDOUBLE)
+ jmp L(e1)
+E(L(store_table), X86_RET_LDOUBLE)
fstpt (%ecx)
- jmp 9f
-E(X86_RET_SINT8)
+ jmp L(e1)
+E(L(store_table), X86_RET_SINT8)
movsbl %al, %eax
mov %eax, (%ecx)
- jmp 9f
-E(X86_RET_SINT16)
+ jmp L(e1)
+E(L(store_table), X86_RET_SINT16)
movswl %ax, %eax
mov %eax, (%ecx)
- jmp 9f
-E(X86_RET_UINT8)
+ jmp L(e1)
+E(L(store_table), X86_RET_UINT8)
movzbl %al, %eax
mov %eax, (%ecx)
- jmp 9f
-E(X86_RET_UINT16)
+ jmp L(e1)
+E(L(store_table), X86_RET_UINT16)
movzwl %ax, %eax
mov %eax, (%ecx)
- jmp 9f
-E(X86_RET_INT64)
+ jmp L(e1)
+E(L(store_table), X86_RET_INT64)
movl %edx, 4(%ecx)
/* fallthru */
-E(X86_RET_INT32)
+E(L(store_table), X86_RET_INT32)
movl %eax, (%ecx)
/* fallthru */
-E(X86_RET_VOID)
-9: movl 8(%ebp), %ebx
+E(L(store_table), X86_RET_VOID)
+L(e1):
+ movl 8(%ebp), %ebx
movl %ebp, %esp
popl %ebp
- cfi_remember_state
- cfi_def_cfa(%esp, 4)
- cfi_restore(%ebx)
- cfi_restore(%ebp)
+L(UW3):
+ # cfi_remember_state
+ # cfi_def_cfa(%esp, 4)
+ # cfi_restore(%ebx)
+ # cfi_restore(%ebp)
ret
- cfi_restore_state
-
-E(X86_RET_STRUCTPOP)
- jmp 9b
-E(X86_RET_STRUCTARG)
- jmp 9b
-E(X86_RET_STRUCT_1B)
+L(UW4):
+ # cfi_restore_state
+
+E(L(store_table), X86_RET_STRUCTPOP)
+ jmp L(e1)
+E(L(store_table), X86_RET_STRUCTARG)
+ jmp L(e1)
+E(L(store_table), X86_RET_STRUCT_1B)
movb %al, (%ecx)
- jmp 9b
-E(X86_RET_STRUCT_2B)
+ jmp L(e1)
+E(L(store_table), X86_RET_STRUCT_2B)
movw %ax, (%ecx)
- jmp 9b
+ jmp L(e1)
/* Fill out the table so that bad values are predictable. */
-E(X86_RET_UNUSED14)
+E(L(store_table), X86_RET_UNUSED14)
ud2
-E(X86_RET_UNUSED15)
+E(L(store_table), X86_RET_UNUSED15)
ud2
- cfi_endproc
+L(UW5):
+ # cfi_endproc
ENDF(ffi_call_i386)
/* The inner helper is declared as
@@ -212,190 +225,242 @@ ENDF(ffi_call_i386)
movl %ecx, 32(%esp); \
movl %eax, 36(%esp)
+# define FFI_CLOSURE_CALL_INNER(UW) \
+ movl %esp, %ecx; /* load closure_data */ \
+ leal closure_FS+4(%esp), %edx; /* load incoming stack */ \
+ call ffi_closure_inner
+#define FFI_CLOSURE_MASK_AND_JUMP(N, UW) \
+ andl $X86_RET_TYPE_MASK, %eax; \
+ leal L(C1(load_table,N))(, %eax, 8), %eax; \
+ jmp *%eax
#ifdef __PIC__
-/* We're going to always load the got register here, even if .hidden says
- we're going to avoid the PLT call. We'll use the got register in
- FFI_CLOSURE_MASK_AND_JUMP. */
-# if defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
-# define PLT(X) X
+# if defined X86_DARWIN || defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
+# undef FFI_CLOSURE_MASK_AND_JUMP
+# define FFI_CLOSURE_MASK_AND_JUMP(N, UW) \
+ andl $X86_RET_TYPE_MASK, %eax; \
+ call C(__x86.get_pc_thunk.dx); \
+L(C1(pc,N)): \
+ leal L(C1(load_table,N))-L(C1(pc,N))(%edx, %eax, 8), %eax; \
+ jmp *%eax
# else
-# define PLT(X) X@PLT
-# endif
-# define FFI_CLOSURE_CALL_INNER \
+# define FFI_CLOSURE_CALL_INNER_SAVE_EBX
+# undef FFI_CLOSURE_CALL_INNER
+# define FFI_CLOSURE_CALL_INNER(UWN) \
movl %esp, %ecx; /* load closure_data */ \
leal closure_FS+4(%esp), %edx; /* load incoming stack */ \
movl %ebx, 40(%esp); /* save ebx */ \
- cfi_rel_offset(%ebx, 40); \
- call __x86.get_pc_thunk.bx; /* load got register */ \
+L(C1(UW,UWN)): \
+ # cfi_rel_offset(%ebx, 40); \
+ call C(__x86.get_pc_thunk.bx); /* load got register */ \
addl $C(_GLOBAL_OFFSET_TABLE_), %ebx; \
- call PLT(ffi_closure_inner)
-#define FFI_CLOSURE_MASK_AND_JUMP \
+ call ffi_closure_inner@PLT
+# undef FFI_CLOSURE_MASK_AND_JUMP
+# define FFI_CLOSURE_MASK_AND_JUMP(N, UWN) \
andl $X86_RET_TYPE_MASK, %eax; \
- leal 0f@GOTOFF(%ebx, %eax, 8), %eax; \
+ leal L(C1(load_table,N))@GOTOFF(%ebx, %eax, 8), %eax; \
movl 40(%esp), %ebx; /* restore ebx */ \
- cfi_restore(%ebx); \
- jmp *%eax
-#else
-# define FFI_CLOSURE_CALL_INNER \
- movl %esp, %ecx; /* load closure_data */ \
- leal closure_FS+4(%esp), %edx; /* load incoming stack */ \
- call ffi_closure_inner
-#define FFI_CLOSURE_MASK_AND_JUMP \
- andl $X86_RET_TYPE_MASK, %eax; \
- leal 0f(, %eax, 8), %eax; \
+L(C1(UW,UWN)): \
+ # cfi_restore(%ebx); \
jmp *%eax
+# endif /* DARWIN || HIDDEN */
#endif /* __PIC__ */
-#define FFI_GO_CLOSURE(suffix, chain, t1, t2) \
- .align 16; \
- .globl C(C1(ffi_go_closure_,suffix)); \
- FFI_HIDDEN(C(C1(ffi_go_closure_,suffix))); \
-C(C1(ffi_go_closure_,suffix)): \
- cfi_startproc; \
- subl $closure_FS, %esp; \
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */ \
- cfi_def_cfa_offset(closure_FS + 4); \
- FFI_CLOSURE_SAVE_REGS; \
- movl 4(chain), t1; /* copy cif */ \
- movl 8(chain), t2; /* copy fun */ \
- movl t1, 28(%esp); \
- movl t2, 32(%esp); \
- movl chain, 36(%esp); /* closure is user_data */ \
- jmp 88f; \
- cfi_endproc; \
-ENDF(C(C1(ffi_go_closure_,suffix)))
-
-FFI_GO_CLOSURE(EAX, %eax, %edx, %ecx)
-FFI_GO_CLOSURE(ECX, %ecx, %edx, %eax)
+ .balign 16
+ .globl C(ffi_go_closure_EAX)
+ FFI_HIDDEN(C(ffi_go_closure_EAX))
+C(ffi_go_closure_EAX):
+L(UW6):
+ # cfi_startproc
+ subl $closure_FS, %esp
+L(UW7):
+ # cfi_def_cfa_offset(closure_FS + 4)
+ FFI_CLOSURE_SAVE_REGS
+ movl 4(%eax), %edx /* copy cif */
+ movl 8(%eax), %ecx /* copy fun */
+ movl %edx, 28(%esp)
+ movl %ecx, 32(%esp)
+ movl %eax, 36(%esp) /* closure is user_data */
+ jmp L(do_closure_i386)
+L(UW8):
+ # cfi_endproc
+ENDF(C(ffi_go_closure_EAX))
+
+ .balign 16
+ .globl C(ffi_go_closure_ECX)
+ FFI_HIDDEN(C(ffi_go_closure_ECX))
+C(ffi_go_closure_ECX):
+L(UW9):
+ # cfi_startproc
+ subl $closure_FS, %esp
+L(UW10):
+ # cfi_def_cfa_offset(closure_FS + 4)
+ FFI_CLOSURE_SAVE_REGS
+ movl 4(%ecx), %edx /* copy cif */
+ movl 8(%ecx), %eax /* copy fun */
+ movl %edx, 28(%esp)
+ movl %eax, 32(%esp)
+ movl %ecx, 36(%esp) /* closure is user_data */
+ jmp L(do_closure_i386)
+L(UW11):
+ # cfi_endproc
+ENDF(C(ffi_go_closure_ECX))
/* The closure entry points are reached from the ffi_closure trampoline.
On entry, %eax contains the address of the ffi_closure. */
- .align 16
+ .balign 16
.globl C(ffi_closure_i386)
FFI_HIDDEN(C(ffi_closure_i386))
C(ffi_closure_i386):
- cfi_startproc
+L(UW12):
+ # cfi_startproc
subl $closure_FS, %esp
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */
- cfi_def_cfa_offset(closure_FS + 4)
+L(UW13):
+ # cfi_def_cfa_offset(closure_FS + 4)
FFI_CLOSURE_SAVE_REGS
FFI_CLOSURE_COPY_TRAMP_DATA
-88: /* Entry point from preceeding Go closures. */
+ /* Entry point from preceding Go closures. */
+L(do_closure_i386):
- FFI_CLOSURE_CALL_INNER
- FFI_CLOSURE_MASK_AND_JUMP
+ FFI_CLOSURE_CALL_INNER(14)
+ FFI_CLOSURE_MASK_AND_JUMP(2, 15)
- .align 8
-0:
-E(X86_RET_FLOAT)
+ .balign 8
+L(load_table2):
+E(L(load_table2), X86_RET_FLOAT)
flds (%esp)
- jmp 9f
-E(X86_RET_DOUBLE)
+ jmp L(e2)
+E(L(load_table2), X86_RET_DOUBLE)
fldl (%esp)
- jmp 9f
-E(X86_RET_LDOUBLE)
+ jmp L(e2)
+E(L(load_table2), X86_RET_LDOUBLE)
fldt (%esp)
- jmp 9f
-E(X86_RET_SINT8)
+ jmp L(e2)
+E(L(load_table2), X86_RET_SINT8)
movsbl (%esp), %eax
- jmp 9f
-E(X86_RET_SINT16)
+ jmp L(e2)
+E(L(load_table2), X86_RET_SINT16)
movswl (%esp), %eax
- jmp 9f
-E(X86_RET_UINT8)
+ jmp L(e2)
+E(L(load_table2), X86_RET_UINT8)
movzbl (%esp), %eax
- jmp 9f
-E(X86_RET_UINT16)
+ jmp L(e2)
+E(L(load_table2), X86_RET_UINT16)
movzwl (%esp), %eax
- jmp 9f
-E(X86_RET_INT64)
+ jmp L(e2)
+E(L(load_table2), X86_RET_INT64)
movl 4(%esp), %edx
/* fallthru */
-E(X86_RET_INT32)
+E(L(load_table2), X86_RET_INT32)
movl (%esp), %eax
/* fallthru */
-E(X86_RET_VOID)
-9: addl $closure_FS, %esp
- cfi_adjust_cfa_offset(-closure_FS)
+E(L(load_table2), X86_RET_VOID)
+L(e2):
+ addl $closure_FS, %esp
+L(UW16):
+ # cfi_adjust_cfa_offset(-closure_FS)
ret
- cfi_adjust_cfa_offset(closure_FS)
-E(X86_RET_STRUCTPOP)
+L(UW17):
+ # cfi_adjust_cfa_offset(closure_FS)
+E(L(load_table2), X86_RET_STRUCTPOP)
addl $closure_FS, %esp
- cfi_adjust_cfa_offset(-closure_FS)
+L(UW18):
+ # cfi_adjust_cfa_offset(-closure_FS)
ret $4
- cfi_adjust_cfa_offset(closure_FS)
-E(X86_RET_STRUCTARG)
+L(UW19):
+ # cfi_adjust_cfa_offset(closure_FS)
+E(L(load_table2), X86_RET_STRUCTARG)
movl (%esp), %eax
- jmp 9b
-E(X86_RET_STRUCT_1B)
+ jmp L(e2)
+E(L(load_table2), X86_RET_STRUCT_1B)
movzbl (%esp), %eax
- jmp 9b
-E(X86_RET_STRUCT_2B)
+ jmp L(e2)
+E(L(load_table2), X86_RET_STRUCT_2B)
movzwl (%esp), %eax
- jmp 9b
+ jmp L(e2)
/* Fill out the table so that bad values are predictable. */
-E(X86_RET_UNUSED14)
+E(L(load_table2), X86_RET_UNUSED14)
ud2
-E(X86_RET_UNUSED15)
+E(L(load_table2), X86_RET_UNUSED15)
ud2
- cfi_endproc
+L(UW20):
+ # cfi_endproc
ENDF(C(ffi_closure_i386))
-FFI_GO_CLOSURE(STDCALL, %ecx, %edx, %eax)
+ .balign 16
+ .globl C(ffi_go_closure_STDCALL)
+ FFI_HIDDEN(C(ffi_go_closure_STDCALL))
+C(ffi_go_closure_STDCALL):
+L(UW21):
+ # cfi_startproc
+ subl $closure_FS, %esp
+L(UW22):
+ # cfi_def_cfa_offset(closure_FS + 4)
+ FFI_CLOSURE_SAVE_REGS
+ movl 4(%ecx), %edx /* copy cif */
+ movl 8(%ecx), %eax /* copy fun */
+ movl %edx, 28(%esp)
+ movl %eax, 32(%esp)
+ movl %ecx, 36(%esp) /* closure is user_data */
+ jmp L(do_closure_STDCALL)
+L(UW23):
+ # cfi_endproc
+ENDF(C(ffi_go_closure_STDCALL))
/* For REGISTER, we have no available parameter registers, and so we
enter here having pushed the closure onto the stack. */
- .align 16
+ .balign 16
.globl C(ffi_closure_REGISTER)
FFI_HIDDEN(C(ffi_closure_REGISTER))
C(ffi_closure_REGISTER):
- cfi_startproc
- cfi_def_cfa(%esp, 8)
- cfi_offset(%eip, -8)
+L(UW24):
+ # cfi_startproc
+ # cfi_def_cfa(%esp, 8)
+ # cfi_offset(%eip, -8)
subl $closure_FS-4, %esp
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */
- cfi_def_cfa_offset(closure_FS + 4)
-
+L(UW25):
+ # cfi_def_cfa_offset(closure_FS + 4)
FFI_CLOSURE_SAVE_REGS
-
movl closure_FS-4(%esp), %ecx /* load retaddr */
movl closure_FS(%esp), %eax /* load closure */
movl %ecx, closure_FS(%esp) /* move retaddr */
- jmp 0f
-
- cfi_endproc
+ jmp L(do_closure_REGISTER)
+L(UW26):
+ # cfi_endproc
ENDF(C(ffi_closure_REGISTER))
/* For STDCALL (and others), we need to pop N bytes of arguments off
the stack following the closure. The amount needing to be popped
is returned to us from ffi_closure_inner. */
- .align 16
+ .balign 16
.globl C(ffi_closure_STDCALL)
FFI_HIDDEN(C(ffi_closure_STDCALL))
C(ffi_closure_STDCALL):
- cfi_startproc
+L(UW27):
+ # cfi_startproc
subl $closure_FS, %esp
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */
- cfi_def_cfa_offset(closure_FS + 4)
+L(UW28):
+ # cfi_def_cfa_offset(closure_FS + 4)
FFI_CLOSURE_SAVE_REGS
-0: /* Entry point from ffi_closure_REGISTER. */
+ /* Entry point from ffi_closure_REGISTER. */
+L(do_closure_REGISTER):
FFI_CLOSURE_COPY_TRAMP_DATA
-88: /* Entry point from preceeding Go closure. */
+ /* Entry point from preceding Go closure. */
+L(do_closure_STDCALL):
- FFI_CLOSURE_CALL_INNER
+ FFI_CLOSURE_CALL_INNER(29)
movl %eax, %ecx
shrl $X86_RET_POP_SHIFT, %ecx /* isolate pop count */
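
ffi_closure_inner returns a word that packs the return-type code in the low bits and, for STDCALL-style closures, the number of caller bytes to pop above them; the andl/shrl pair seen here unpacks it. A small C sketch of that unpacking, with constants assumed to match src/x86/internal.h at the time of this change:

    /* Assumed layout: low 4 bits = X86_RET_* type code, remaining
       bits = bytes the callee must pop on return (STDCALL et al).  */
    #define X86_RET_TYPE_MASK  15
    #define X86_RET_POP_SHIFT  4

    static unsigned ret_type (unsigned flags)  { return flags & X86_RET_TYPE_MASK; }
    static unsigned pop_bytes (unsigned flags) { return flags >> X86_RET_POP_SHIFT; }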
@@ -403,100 +468,98 @@ C(ffi_closure_STDCALL):
movl closure_FS(%esp), %edx /* move return address */
movl %edx, (%ecx)
- /* New pseudo-stack frame based off ecx. This is unwind trickery
- in that the CFA *has* changed, to the proper popped stack address.
- Note that the location to which we moved the return address
- is the new CFA-4, so that's unchanged. */
- cfi_def_cfa(%ecx, 4)
- /* Normally esp is unwound to CFA + the caller's ARGS_SIZE.
- We've just set the CFA to that final value. Tell the unwinder
- to restore esp from CFA without the ARGS_SIZE:
- DW_CFA_val_expression %esp, DW_OP_call_frame_cfa. */
- cfi_escape(0x16, 4, 1, 0x9c)
-
- FFI_CLOSURE_MASK_AND_JUMP
-
- .align 8
-0:
-E(X86_RET_FLOAT)
+ /* From this point on, the value of %esp upon return is %ecx+4,
+ and we've copied the return address to %ecx to make return easy.
+ There's no point in representing this in the unwind info, as
+ there is always a window between the mov and the ret which
+ will be wrong from one point of view or another. */
+
+ FFI_CLOSURE_MASK_AND_JUMP(3, 30)
+
+ .balign 8
+L(load_table3):
+E(L(load_table3), X86_RET_FLOAT)
flds (%esp)
movl %ecx, %esp
ret
-E(X86_RET_DOUBLE)
+E(L(load_table3), X86_RET_DOUBLE)
fldl (%esp)
movl %ecx, %esp
ret
-E(X86_RET_LDOUBLE)
+E(L(load_table3), X86_RET_LDOUBLE)
fldt (%esp)
movl %ecx, %esp
ret
-E(X86_RET_SINT8)
+E(L(load_table3), X86_RET_SINT8)
movsbl (%esp), %eax
movl %ecx, %esp
ret
-E(X86_RET_SINT16)
+E(L(load_table3), X86_RET_SINT16)
movswl (%esp), %eax
movl %ecx, %esp
ret
-E(X86_RET_UINT8)
+E(L(load_table3), X86_RET_UINT8)
movzbl (%esp), %eax
movl %ecx, %esp
ret
-E(X86_RET_UINT16)
+E(L(load_table3), X86_RET_UINT16)
movzwl (%esp), %eax
movl %ecx, %esp
ret
-E(X86_RET_INT64)
+E(L(load_table3), X86_RET_INT64)
popl %eax
popl %edx
movl %ecx, %esp
ret
-E(X86_RET_INT32)
+E(L(load_table3), X86_RET_INT32)
movl (%esp), %eax
movl %ecx, %esp
ret
-E(X86_RET_VOID)
+E(L(load_table3), X86_RET_VOID)
movl %ecx, %esp
ret
-E(X86_RET_STRUCTPOP)
+E(L(load_table3), X86_RET_STRUCTPOP)
movl %ecx, %esp
ret
-E(X86_RET_STRUCTARG)
+E(L(load_table3), X86_RET_STRUCTARG)
movl (%esp), %eax
movl %ecx, %esp
ret
-E(X86_RET_STRUCT_1B)
+E(L(load_table3), X86_RET_STRUCT_1B)
movzbl (%esp), %eax
movl %ecx, %esp
ret
-E(X86_RET_STRUCT_2B)
+E(L(load_table3), X86_RET_STRUCT_2B)
movzwl (%esp), %eax
movl %ecx, %esp
ret
/* Fill out the table so that bad values are predictable. */
-E(X86_RET_UNUSED14)
+E(L(load_table3), X86_RET_UNUSED14)
ud2
-E(X86_RET_UNUSED15)
+E(L(load_table3), X86_RET_UNUSED15)
ud2
- cfi_endproc
+L(UW31):
+ # cfi_endproc
ENDF(C(ffi_closure_STDCALL))
#if !FFI_NO_RAW_API
#define raw_closure_S_FS (16+16+12)
- .align 16
+ .balign 16
.globl C(ffi_closure_raw_SYSV)
FFI_HIDDEN(C(ffi_closure_raw_SYSV))
C(ffi_closure_raw_SYSV):
- cfi_startproc
+L(UW32):
+ # cfi_startproc
subl $raw_closure_S_FS, %esp
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */
- cfi_def_cfa_offset(raw_closure_S_FS + 4)
+L(UW33):
+ # cfi_def_cfa_offset(raw_closure_S_FS + 4)
movl %ebx, raw_closure_S_FS-4(%esp)
- cfi_rel_offset(%ebx, raw_closure_S_FS-4)
+L(UW34):
+ # cfi_rel_offset(%ebx, raw_closure_S_FS-4)
movl FFI_TRAMPOLINE_SIZE+8(%eax), %edx /* load cl->user_data */
movl %edx, 12(%esp)
@@ -511,96 +574,108 @@ C(ffi_closure_raw_SYSV):
movl 20(%ebx), %eax /* load cif->flags */
andl $X86_RET_TYPE_MASK, %eax
#ifdef __PIC__
- call __x86.get_pc_thunk.bx
-1: leal 0f-1b(%ebx, %eax, 8), %eax
+ call C(__x86.get_pc_thunk.bx)
+L(pc4):
+ leal L(load_table4)-L(pc4)(%ebx, %eax, 8), %eax
#else
- leal 0f(,%eax, 8), %eax
+ leal L(load_table4)(,%eax, 8), %eax
#endif
movl raw_closure_S_FS-4(%esp), %ebx
- cfi_restore(%ebx)
+L(UW35):
+ # cfi_restore(%ebx)
jmp *%eax
- .align 8
-0:
-E(X86_RET_FLOAT)
+ .balign 8
+L(load_table4):
+E(L(load_table4), X86_RET_FLOAT)
flds 16(%esp)
- jmp 9f
-E(X86_RET_DOUBLE)
+ jmp L(e4)
+E(L(load_table4), X86_RET_DOUBLE)
fldl 16(%esp)
- jmp 9f
-E(X86_RET_LDOUBLE)
+ jmp L(e4)
+E(L(load_table4), X86_RET_LDOUBLE)
fldt 16(%esp)
- jmp 9f
-E(X86_RET_SINT8)
+ jmp L(e4)
+E(L(load_table4), X86_RET_SINT8)
movsbl 16(%esp), %eax
- jmp 9f
-E(X86_RET_SINT16)
+ jmp L(e4)
+E(L(load_table4), X86_RET_SINT16)
movswl 16(%esp), %eax
- jmp 9f
-E(X86_RET_UINT8)
+ jmp L(e4)
+E(L(load_table4), X86_RET_UINT8)
movzbl 16(%esp), %eax
- jmp 9f
-E(X86_RET_UINT16)
+ jmp L(e4)
+E(L(load_table4), X86_RET_UINT16)
movzwl 16(%esp), %eax
- jmp 9f
-E(X86_RET_INT64)
+ jmp L(e4)
+E(L(load_table4), X86_RET_INT64)
movl 16+4(%esp), %edx
/* fallthru */
-E(X86_RET_INT32)
+E(L(load_table4), X86_RET_INT32)
movl 16(%esp), %eax
/* fallthru */
-E(X86_RET_VOID)
-9: addl $raw_closure_S_FS, %esp
- cfi_adjust_cfa_offset(-raw_closure_S_FS)
+E(L(load_table4), X86_RET_VOID)
+L(e4):
+ addl $raw_closure_S_FS, %esp
+L(UW36):
+ # cfi_adjust_cfa_offset(-raw_closure_S_FS)
ret
- cfi_adjust_cfa_offset(raw_closure_S_FS)
-E(X86_RET_STRUCTPOP)
+L(UW37):
+ # cfi_adjust_cfa_offset(raw_closure_S_FS)
+E(L(load_table4), X86_RET_STRUCTPOP)
addl $raw_closure_S_FS, %esp
- cfi_adjust_cfa_offset(-raw_closure_S_FS)
+L(UW38):
+ # cfi_adjust_cfa_offset(-raw_closure_S_FS)
ret $4
- cfi_adjust_cfa_offset(raw_closure_S_FS)
-E(X86_RET_STRUCTARG)
+L(UW39):
+ # cfi_adjust_cfa_offset(raw_closure_S_FS)
+E(L(load_table4), X86_RET_STRUCTARG)
movl 16(%esp), %eax
- jmp 9b
-E(X86_RET_STRUCT_1B)
+ jmp L(e4)
+E(L(load_table4), X86_RET_STRUCT_1B)
movzbl 16(%esp), %eax
- jmp 9b
-E(X86_RET_STRUCT_2B)
+ jmp L(e4)
+E(L(load_table4), X86_RET_STRUCT_2B)
movzwl 16(%esp), %eax
- jmp 9b
+ jmp L(e4)
/* Fill out the table so that bad values are predictable. */
-E(X86_RET_UNUSED14)
+E(L(load_table4), X86_RET_UNUSED14)
ud2
-E(X86_RET_UNUSED15)
+E(L(load_table4), X86_RET_UNUSED15)
ud2
- cfi_endproc
+L(UW40):
+ # cfi_endproc
ENDF(C(ffi_closure_raw_SYSV))
-#undef raw_closure_S_FS
#define raw_closure_T_FS (16+16+8)
- .align 16
+ .balign 16
.globl C(ffi_closure_raw_THISCALL)
FFI_HIDDEN(C(ffi_closure_raw_THISCALL))
C(ffi_closure_raw_THISCALL):
- cfi_startproc
+L(UW41):
+ # cfi_startproc
/* Rearrange the stack such that %ecx is the first argument.
This means moving the return address. */
popl %edx
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */
- cfi_def_cfa_offset(0)
- cfi_register(%eip, %edx)
+L(UW42):
+ # cfi_def_cfa_offset(0)
+ # cfi_register(%eip, %edx)
pushl %ecx
- cfi_adjust_cfa_offset(4)
+L(UW43):
+ # cfi_adjust_cfa_offset(4)
pushl %edx
- cfi_adjust_cfa_offset(4)
- cfi_rel_offset(%eip, 0)
+L(UW44):
+ # cfi_adjust_cfa_offset(4)
+ # cfi_rel_offset(%eip, 0)
subl $raw_closure_T_FS, %esp
- cfi_adjust_cfa_offset(raw_closure_T_FS)
+L(UW45):
+ # cfi_adjust_cfa_offset(raw_closure_T_FS)
movl %ebx, raw_closure_T_FS-4(%esp)
- cfi_rel_offset(%ebx, raw_closure_T_FS-4)
+L(UW46):
+ # cfi_rel_offset(%ebx, raw_closure_T_FS-4)
movl FFI_TRAMPOLINE_SIZE+8(%eax), %edx /* load cl->user_data */
movl %edx, 12(%esp)
@@ -615,90 +690,329 @@ C(ffi_closure_raw_THISCALL):
movl 20(%ebx), %eax /* load cif->flags */
andl $X86_RET_TYPE_MASK, %eax
#ifdef __PIC__
- call __x86.get_pc_thunk.bx
-1: leal 0f-1b(%ebx, %eax, 8), %eax
+ call C(__x86.get_pc_thunk.bx)
+L(pc5):
+ leal L(load_table5)-L(pc5)(%ebx, %eax, 8), %eax
#else
- leal 0f(,%eax, 8), %eax
+ leal L(load_table5)(,%eax, 8), %eax
#endif
movl raw_closure_T_FS-4(%esp), %ebx
- cfi_restore(%ebx)
+L(UW47):
+ # cfi_restore(%ebx)
jmp *%eax
- .align 8
-0:
-E(X86_RET_FLOAT)
+ .balign 8
+L(load_table5):
+E(L(load_table5), X86_RET_FLOAT)
flds 16(%esp)
- jmp 9f
-E(X86_RET_DOUBLE)
+ jmp L(e5)
+E(L(load_table5), X86_RET_DOUBLE)
fldl 16(%esp)
- jmp 9f
-E(X86_RET_LDOUBLE)
+ jmp L(e5)
+E(L(load_table5), X86_RET_LDOUBLE)
fldt 16(%esp)
- jmp 9f
-E(X86_RET_SINT8)
+ jmp L(e5)
+E(L(load_table5), X86_RET_SINT8)
movsbl 16(%esp), %eax
- jmp 9f
-E(X86_RET_SINT16)
+ jmp L(e5)
+E(L(load_table5), X86_RET_SINT16)
movswl 16(%esp), %eax
- jmp 9f
-E(X86_RET_UINT8)
+ jmp L(e5)
+E(L(load_table5), X86_RET_UINT8)
movzbl 16(%esp), %eax
- jmp 9f
-E(X86_RET_UINT16)
+ jmp L(e5)
+E(L(load_table5), X86_RET_UINT16)
movzwl 16(%esp), %eax
- jmp 9f
-E(X86_RET_INT64)
+ jmp L(e5)
+E(L(load_table5), X86_RET_INT64)
movl 16+4(%esp), %edx
/* fallthru */
-E(X86_RET_INT32)
+E(L(load_table5), X86_RET_INT32)
movl 16(%esp), %eax
/* fallthru */
-E(X86_RET_VOID)
-9: addl $raw_closure_T_FS, %esp
- cfi_adjust_cfa_offset(-raw_closure_T_FS)
+E(L(load_table5), X86_RET_VOID)
+L(e5):
+ addl $raw_closure_T_FS, %esp
+L(UW48):
+ # cfi_adjust_cfa_offset(-raw_closure_T_FS)
/* Remove the extra %ecx argument we pushed. */
ret $4
- cfi_adjust_cfa_offset(raw_closure_T_FS)
-E(X86_RET_STRUCTPOP)
+L(UW49):
+ # cfi_adjust_cfa_offset(raw_closure_T_FS)
+E(L(load_table5), X86_RET_STRUCTPOP)
addl $raw_closure_T_FS, %esp
- cfi_adjust_cfa_offset(-raw_closure_T_FS)
+L(UW50):
+ # cfi_adjust_cfa_offset(-raw_closure_T_FS)
ret $8
- cfi_adjust_cfa_offset(raw_closure_T_FS)
-E(X86_RET_STRUCTARG)
+L(UW51):
+ # cfi_adjust_cfa_offset(raw_closure_T_FS)
+E(L(load_table5), X86_RET_STRUCTARG)
movl 16(%esp), %eax
- jmp 9b
-E(X86_RET_STRUCT_1B)
+ jmp L(e5)
+E(L(load_table5), X86_RET_STRUCT_1B)
movzbl 16(%esp), %eax
- jmp 9b
-E(X86_RET_STRUCT_2B)
+ jmp L(e5)
+E(L(load_table5), X86_RET_STRUCT_2B)
movzwl 16(%esp), %eax
- jmp 9b
+ jmp L(e5)
/* Fill out the table so that bad values are predictable. */
-E(X86_RET_UNUSED14)
+E(L(load_table5), X86_RET_UNUSED14)
ud2
-E(X86_RET_UNUSED15)
+E(L(load_table5), X86_RET_UNUSED15)
ud2
- cfi_endproc
+L(UW52):
+ # cfi_endproc
ENDF(C(ffi_closure_raw_THISCALL))
#endif /* !FFI_NO_RAW_API */
+#ifdef X86_DARWIN
+# define COMDAT(X) \
+ .section __TEXT,__textcoal_nt,coalesced,pure_instructions; \
+ .weak_definition X; \
+ .private_extern X
+#elif defined __ELF__
+# define COMDAT(X) \
+ .section .text.X,"axG",@progbits,X,comdat; \
+ .globl X; \
+ FFI_HIDDEN(X)
+#else
+# define COMDAT(X)
+#endif
+
#if defined(__PIC__)
- .section .text.__x86.get_pc_thunk.bx,"axG",@progbits,__x86.get_pc_thunk.bx,comdat
- .globl __x86.get_pc_thunk.bx
- .hidden __x86.get_pc_thunk.bx
- .type __x86.get_pc_thunk.bx,@function
-__x86.get_pc_thunk.bx:
- cfi_startproc
+ COMDAT(C(__x86.get_pc_thunk.bx))
+C(__x86.get_pc_thunk.bx):
movl (%esp), %ebx
ret
- cfi_endproc
- .size __x86.get_pc_thunk.bx, . - __x86.get_pc_thunk.bx
+ENDF(C(__x86.get_pc_thunk.bx))
+# if defined X86_DARWIN || defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
+ COMDAT(C(__x86.get_pc_thunk.dx))
+C(__x86.get_pc_thunk.dx):
+ movl (%esp), %edx
+ ret
+ENDF(C(__x86.get_pc_thunk.dx))
+#endif /* DARWIN || HIDDEN */
#endif /* __PIC__ */
+/* Sadly, OSX cctools-as doesn't understand .cfi directives at all. */
+
+#ifdef __APPLE__
+.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
+EHFrame0:
+#elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
+.section .eh_frame,"a",@unwind
+#else
+.section .eh_frame,"a",@progbits
+#endif
+
+#ifdef HAVE_AS_X86_PCREL
+# define PCREL(X) X - .
+#else
+# define PCREL(X) X@rel
+#endif
+
+/* Simplify advancing between labels. Assume DW_CFA_advance_loc1 fits. */
+#define ADV(N, P) .byte 2, L(N)-L(P)
+
+ .balign 4
+L(CIE):
+ .set L(set0),L(ECIE)-L(SCIE)
+ .long L(set0) /* CIE Length */
+L(SCIE):
+ .long 0 /* CIE Identifier Tag */
+ .byte 1 /* CIE Version */
+ .ascii "zR\0" /* CIE Augmentation */
+ .byte 1 /* CIE Code Alignment Factor */
+ .byte 0x7c /* CIE Data Alignment Factor */
+ .byte 0x8 /* CIE RA Column */
+ .byte 1 /* Augmentation size */
+ .byte 0x1b /* FDE Encoding (pcrel sdata4) */
+ .byte 0xc, 4, 4 /* DW_CFA_def_cfa, %esp offset 4 */
+ .byte 0x80+8, 1 /* DW_CFA_offset, %eip offset 1*-4 */
+ .balign 4
+L(ECIE):
+
+ .set L(set1),L(EFDE1)-L(SFDE1)
+ .long L(set1) /* FDE Length */
+L(SFDE1):
+ .long L(SFDE1)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW0)) /* Initial location */
+ .long L(UW5)-L(UW0) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW1, UW0)
+ .byte 0xc, 5, 8 /* DW_CFA_def_cfa, %ebp 8 */
+ .byte 0x80+5, 2 /* DW_CFA_offset, %ebp 2*-4 */
+ ADV(UW2, UW1)
+ .byte 0x80+3, 0 /* DW_CFA_offset, %ebx 0*-4 */
+ ADV(UW3, UW2)
+ .byte 0xa /* DW_CFA_remember_state */
+ .byte 0xc, 4, 4 /* DW_CFA_def_cfa, %esp 4 */
+ .byte 0xc0+3 /* DW_CFA_restore, %ebx */
+ .byte 0xc0+5 /* DW_CFA_restore, %ebp */
+ ADV(UW4, UW3)
+ .byte 0xb /* DW_CFA_restore_state */
+ .balign 4
+L(EFDE1):
+
+ .set L(set2),L(EFDE2)-L(SFDE2)
+ .long L(set2) /* FDE Length */
+L(SFDE2):
+ .long L(SFDE2)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW6)) /* Initial location */
+ .long L(UW8)-L(UW6) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW7, UW6)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE2):
+
+ .set L(set3),L(EFDE3)-L(SFDE3)
+ .long L(set3) /* FDE Length */
+L(SFDE3):
+ .long L(SFDE3)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW9)) /* Initial location */
+ .long L(UW11)-L(UW9) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW10, UW9)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE3):
+
+ .set L(set4),L(EFDE4)-L(SFDE4)
+ .long L(set4) /* FDE Length */
+L(SFDE4):
+ .long L(SFDE4)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW12)) /* Initial location */
+ .long L(UW20)-L(UW12) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW13, UW12)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+#ifdef FFI_CLOSURE_CALL_INNER_SAVE_EBX
+ ADV(UW14, UW13)
+ .byte 0x80+3, (40-(closure_FS+4))/-4 /* DW_CFA_offset %ebx */
+ ADV(UW15, UW14)
+ .byte 0xc0+3 /* DW_CFA_restore %ebx */
+ ADV(UW16, UW15)
+#else
+ ADV(UW16, UW13)
+#endif
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW17, UW16)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ ADV(UW18, UW17)
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW19, UW18)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE4):
+
+ .set L(set5),L(EFDE5)-L(SFDE5)
+ .long L(set5) /* FDE Length */
+L(SFDE5):
+ .long L(SFDE5)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW21)) /* Initial location */
+ .long L(UW23)-L(UW21) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW22, UW21)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE5):
+
+ .set L(set6),L(EFDE6)-L(SFDE6)
+ .long L(set6) /* FDE Length */
+L(SFDE6):
+ .long L(SFDE6)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW24)) /* Initial location */
+ .long L(UW26)-L(UW24) /* Address range */
+ .byte 0 /* Augmentation size */
+ .byte 0xe, 8 /* DW_CFA_def_cfa_offset */
+ .byte 0x80+8, 2 /* DW_CFA_offset %eip, 2*-4 */
+ ADV(UW25, UW24)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE6):
+
+ .set L(set7),L(EFDE7)-L(SFDE7)
+ .long L(set7) /* FDE Length */
+L(SFDE7):
+ .long L(SFDE7)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW27)) /* Initial location */
+ .long L(UW31)-L(UW27) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW28, UW27)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+#ifdef FFI_CLOSURE_CALL_INNER_SAVE_EBX
+ ADV(UW29, UW28)
+ .byte 0x80+3, (40-(closure_FS+4))/-4 /* DW_CFA_offset %ebx */
+ ADV(UW30, UW29)
+ .byte 0xc0+3 /* DW_CFA_restore %ebx */
+#endif
+ .balign 4
+L(EFDE7):
+
+#if !FFI_NO_RAW_API
+ .set L(set8),L(EFDE8)-L(SFDE8)
+ .long L(set8) /* FDE Length */
+L(SFDE8):
+ .long L(SFDE8)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW32)) /* Initial location */
+ .long L(UW40)-L(UW32) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW33, UW32)
+ .byte 0xe, raw_closure_S_FS+4 /* DW_CFA_def_cfa_offset */
+ ADV(UW34, UW33)
+ .byte 0x80+3, 2 /* DW_CFA_offset %ebx 2*-4 */
+ ADV(UW35, UW34)
+ .byte 0xc0+3 /* DW_CFA_restore %ebx */
+ ADV(UW36, UW35)
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW37, UW36)
+ .byte 0xe, raw_closure_S_FS+4 /* DW_CFA_def_cfa_offset */
+ ADV(UW38, UW37)
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW39, UW38)
+ .byte 0xe, raw_closure_S_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE8):
+
+ .set L(set9),L(EFDE9)-L(SFDE9)
+ .long L(set9) /* FDE Length */
+L(SFDE9):
+ .long L(SFDE9)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW41)) /* Initial location */
+ .long L(UW52)-L(UW41) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW42, UW41)
+ .byte 0xe, 0 /* DW_CFA_def_cfa_offset */
+ .byte 0x9, 8, 2 /* DW_CFA_register %eip, %edx */
+ ADV(UW43, UW42)
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW44, UW43)
+ .byte 0xe, 8 /* DW_CFA_def_cfa_offset */
+ .byte 0x80+8, 2 /* DW_CFA_offset %eip 2*-4 */
+ ADV(UW45, UW44)
+ .byte 0xe, raw_closure_T_FS+8 /* DW_CFA_def_cfa_offset */
+ ADV(UW46, UW45)
+ .byte 0x80+3, 3 /* DW_CFA_offset %ebx 3*-4 */
+ ADV(UW47, UW46)
+ .byte 0xc0+3 /* DW_CFA_restore %ebx */
+ ADV(UW48, UW47)
+ .byte 0xe, 8 /* DW_CFA_def_cfa_offset */
+ ADV(UW49, UW48)
+ .byte 0xe, raw_closure_T_FS+8 /* DW_CFA_def_cfa_offset */
+ ADV(UW50, UW49)
+ .byte 0xe, 8 /* DW_CFA_def_cfa_offset */
+ ADV(UW51, UW50)
+ .byte 0xe, raw_closure_T_FS+8 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE9):
+#endif /* !FFI_NO_RAW_API */
+
#endif /* ifndef __x86_64__ */
+
#if defined __ELF__ && defined __linux__
.section .note.GNU-stack,"",@progbits
#endif
diff --git a/src/x86/unix64.S b/src/x86/unix64.S
index 58cb153..f9f9163 100644
--- a/src/x86/unix64.S
+++ b/src/x86/unix64.S
@@ -30,21 +30,41 @@
#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>
-#include <ffi_cfi.h>
#include "internal64.h"
.text
+#define C2(X, Y) X ## Y
+#define C1(X, Y) C2(X, Y)
+#ifdef __USER_LABEL_PREFIX__
+# define C(X) C1(__USER_LABEL_PREFIX__, X)
+#else
+# define C(X) X
+#endif
+
+#ifdef __APPLE__
+# define L(X) C1(L, X)
+#else
+# define L(X) C1(.L, X)
+#endif
+
+#ifdef __ELF__
+# define PLT(X) X@PLT
+# define ENDF(X) .type X,@function; .size X, . - X
+#else
+# define PLT(X) X
+# define ENDF(X)
+#endif
+
/* This macro allows the safe creation of jump tables without an
actual table. The entry points into the table are all 8 bytes.
The use of ORG asserts that we're at the correct location. */
/* ??? The clang assembler doesn't handle .org with symbolic expressions. */
-.macro E index
- .align 8
-#ifndef __clang__
- .org 0b + \index * 8, 0x90
+#if defined(__clang__) || defined(__APPLE__)
+# define E(BASE, X) .balign 8
+#else
+# define E(BASE, X) .balign 8; .org BASE + X * 8
#endif
-.endm
/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
void *raddr, void (*fnaddr)(void));
@@ -53,13 +73,12 @@
for this function. This has been allocated by ffi_call. We also
deallocate some of the stack that has been alloca'd. */
- .align 8
- .globl ffi_call_unix64
- .type ffi_call_unix64,@function
- FFI_HIDDEN(ffi_call_unix64)
+ .balign 8
+ .globl C(ffi_call_unix64)
+ FFI_HIDDEN(C(ffi_call_unix64))
-ffi_call_unix64:
- cfi_startproc
+C(ffi_call_unix64):
+L(UW0):
movq (%rsp), %r10 /* Load return address. */
leaq (%rdi, %rsi), %rax /* Find local stack base. */
movq %rdx, (%rax) /* Save flags. */
@@ -75,8 +94,9 @@ ffi_call_unix64:
unwind back to ffi_call. Note that the location to which we
moved the return address is (the new) CFA-8, so from the
perspective of the unwind info, it hasn't moved. */
- cfi_def_cfa(%rbp, 32)
- cfi_rel_offset(%rbp, 16)
+L(UW1):
+ /* cfi_def_cfa(%rbp, 32) */
+ /* cfi_rel_offset(%rbp, 16) */
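+	/* The commented-out cfi_* lines document what the hand-written
+	   FDE below emits at ADV(UW1, UW0): DW_CFA_def_cfa %rbp, 32 and
+	   DW_CFA_offset %rbp.  */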
movq %rdi, %r10 /* Save a copy of the register area. */
movq %r8, %r11 /* Save a copy of the target fn. */
@@ -91,8 +111,8 @@ ffi_call_unix64:
movq 0x28(%r10), %r9
movl 0xb0(%r10), %eax
testl %eax, %eax
- jnz .Lload_sse
-.Lret_from_load_sse:
+ jnz L(load_sse)
+L(ret_from_load_sse):
/* Deallocate the reg arg area, except for r10, then load via pop. */
leaq 0xb8(%r10), %rsp
@@ -107,94 +127,98 @@ ffi_call_unix64:
movq 0(%rbp), %rcx /* Reload flags. */
movq 8(%rbp), %rdi /* Reload raddr. */
movq 16(%rbp), %rbp /* Reload old frame pointer. */
- cfi_remember_state
- cfi_def_cfa(%rsp, 8)
- cfi_restore(%rbp)
+L(UW2):
+ /* cfi_remember_state */
+ /* cfi_def_cfa(%rsp, 8) */
+ /* cfi_restore(%rbp) */
/* The first byte of the flags contains the FFI_TYPE. */
cmpb $UNIX64_RET_LAST, %cl
movzbl %cl, %r10d
- leaq 0f(%rip), %r11
- ja 9f
+ leaq L(store_table)(%rip), %r11
+ ja L(sa)
leaq (%r11, %r10, 8), %r10
/* Prep for the structure cases: scratch area in redzone. */
leaq -20(%rsp), %rsi
jmp *%r10
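+	/* Each E() entry below is exactly 8 bytes wide, so the scaled
+	   index computed above lands on the matching handler.  */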
- .align 8
-0:
-E UNIX64_RET_VOID
+ .balign 8
+L(store_table):
+E(L(store_table), UNIX64_RET_VOID)
ret
-E UNIX64_RET_UINT8
+E(L(store_table), UNIX64_RET_UINT8)
movzbl %al, %eax
movq %rax, (%rdi)
ret
-E UNIX64_RET_UINT16
+E(L(store_table), UNIX64_RET_UINT16)
movzwl %ax, %eax
movq %rax, (%rdi)
ret
-E UNIX64_RET_UINT32
+E(L(store_table), UNIX64_RET_UINT32)
movl %eax, %eax
movq %rax, (%rdi)
ret
-E UNIX64_RET_SINT8
+E(L(store_table), UNIX64_RET_SINT8)
movsbq %al, %rax
movq %rax, (%rdi)
ret
-E UNIX64_RET_SINT16
+E(L(store_table), UNIX64_RET_SINT16)
movswq %ax, %rax
movq %rax, (%rdi)
ret
-E UNIX64_RET_SINT32
+E(L(store_table), UNIX64_RET_SINT32)
cltq
movq %rax, (%rdi)
ret
-E UNIX64_RET_INT64
+E(L(store_table), UNIX64_RET_INT64)
movq %rax, (%rdi)
ret
-E UNIX64_RET_XMM32
+E(L(store_table), UNIX64_RET_XMM32)
movd %xmm0, (%rdi)
ret
-E UNIX64_RET_XMM64
+E(L(store_table), UNIX64_RET_XMM64)
movq %xmm0, (%rdi)
ret
-E UNIX64_RET_X87
+E(L(store_table), UNIX64_RET_X87)
fstpt (%rdi)
ret
-E UNIX64_RET_X87_2
+E(L(store_table), UNIX64_RET_X87_2)
fstpt (%rdi)
fstpt 16(%rdi)
ret
-E UNIX64_RET_ST_XMM0_RAX
+E(L(store_table), UNIX64_RET_ST_XMM0_RAX)
movq %rax, 8(%rsi)
- jmp 3f
-E UNIX64_RET_ST_RAX_XMM0
+ jmp L(s3)
+E(L(store_table), UNIX64_RET_ST_RAX_XMM0)
movq %xmm0, 8(%rsi)
- jmp 2f
-E UNIX64_RET_ST_XMM0_XMM1
+ jmp L(s2)
+E(L(store_table), UNIX64_RET_ST_XMM0_XMM1)
movq %xmm1, 8(%rsi)
- jmp 3f
-E UNIX64_RET_ST_RAX_RDX
+ jmp L(s3)
+E(L(store_table), UNIX64_RET_ST_RAX_RDX)
movq %rdx, 8(%rsi)
-2: movq %rax, (%rsi)
+L(s2):
+ movq %rax, (%rsi)
shrl $UNIX64_SIZE_SHIFT, %ecx
rep movsb
ret
- .align 8
-3: movq %xmm0, (%rsi)
+ .balign 8
+L(s3):
+ movq %xmm0, (%rsi)
shrl $UNIX64_SIZE_SHIFT, %ecx
rep movsb
ret
-9: call abort@PLT
+L(sa): call PLT(C(abort))
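+	/* Reached only when the type byte exceeds UNIX64_RET_LAST,
+	   i.e. the flags word is corrupt.  */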
/* Many times we can avoid loading any SSE registers at all.
It's not worth an indirect jump to load the exact set of
SSE registers needed; zero or all is a good compromise. */
- .align 2
- cfi_restore_state
-.Lload_sse:
+ .balign 2
+L(UW3):
+ /* cfi_restore_state */
+L(load_sse):
movdqa 0x30(%r10), %xmm0
movdqa 0x40(%r10), %xmm1
movdqa 0x50(%r10), %xmm2
@@ -203,10 +227,10 @@ E UNIX64_RET_ST_RAX_RDX
movdqa 0x80(%r10), %xmm5
movdqa 0x90(%r10), %xmm6
movdqa 0xa0(%r10), %xmm7
- jmp .Lret_from_load_sse
+ jmp L(ret_from_load_sse)
- cfi_endproc
- .size ffi_call_unix64,.-ffi_call_unix64
+L(UW4):
+ENDF(C(ffi_call_unix64))
/* 6 general registers, 8 vector registers,
32 bytes of rvalue, 8 bytes of alignment. */
@@ -218,16 +242,15 @@ E UNIX64_RET_ST_RAX_RDX
/* The location of rvalue within the red zone after deallocating the frame. */
#define ffi_closure_RED_RVALUE (ffi_closure_OFS_RVALUE - ffi_closure_FS)
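+/* Presumably ffi_closure_FS therefore totals 6*8 + 8*16 + 32 + 8 = 216
+   bytes, within the 120..247 range the uleb128 encodings below assume.  */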
- .align 2
- .globl ffi_closure_unix64_sse
- .type ffi_closure_unix64_sse,@function
- FFI_HIDDEN(ffi_closure_unix64_sse)
+ .balign 2
+ .globl C(ffi_closure_unix64_sse)
+ FFI_HIDDEN(C(ffi_closure_unix64_sse))
-ffi_closure_unix64_sse:
- cfi_startproc
+C(ffi_closure_unix64_sse):
+L(UW5):
subq $ffi_closure_FS, %rsp
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */
- cfi_def_cfa_offset(ffi_closure_FS + 8)
+L(UW6):
+ /* cfi_adjust_cfa_offset(ffi_closure_FS) */
movdqa %xmm0, ffi_closure_OFS_V+0x00(%rsp)
movdqa %xmm1, ffi_closure_OFS_V+0x10(%rsp)
@@ -237,22 +260,21 @@ ffi_closure_unix64_sse:
movdqa %xmm5, ffi_closure_OFS_V+0x50(%rsp)
movdqa %xmm6, ffi_closure_OFS_V+0x60(%rsp)
movdqa %xmm7, ffi_closure_OFS_V+0x70(%rsp)
- jmp 0f
+ jmp L(sse_entry1)
- cfi_endproc
- .size ffi_closure_unix64_sse,.-ffi_closure_unix64_sse
+L(UW7):
+ENDF(C(ffi_closure_unix64_sse))
- .align 2
- .globl ffi_closure_unix64
- .type ffi_closure_unix64,@function
- FFI_HIDDEN(ffi_closure_unix64)
+ .balign 2
+ .globl C(ffi_closure_unix64)
+ FFI_HIDDEN(C(ffi_closure_unix64))
-ffi_closure_unix64:
- cfi_startproc
+C(ffi_closure_unix64):
+L(UW8):
subq $ffi_closure_FS, %rsp
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */
- cfi_def_cfa_offset(ffi_closure_FS + 8)
-0:
+L(UW9):
+ /* cfi_adjust_cfa_offset(ffi_closure_FS) */
+L(sse_entry1):
movq %rdi, ffi_closure_OFS_G+0x00(%rsp)
movq %rsi, ffi_closure_OFS_G+0x08(%rsp)
movq %rdx, ffi_closure_OFS_G+0x10(%rsp)
@@ -269,95 +291,97 @@ ffi_closure_unix64:
movq FFI_TRAMPOLINE_SIZE+8(%r10), %rsi /* Load fun */
movq FFI_TRAMPOLINE_SIZE+16(%r10), %rdx /* Load user_data */
#endif
-.Ldo_closure:
+L(do_closure):
leaq ffi_closure_OFS_RVALUE(%rsp), %rcx /* Load rvalue */
movq %rsp, %r8 /* Load reg_args */
leaq ffi_closure_FS+8(%rsp), %r9 /* Load argp */
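+	/* ffi_closure_FS+8(%rsp) skips our frame plus the return
+	   address, yielding the caller's on-stack argument area.  */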
- call ffi_closure_unix64_inner
+ call C(ffi_closure_unix64_inner)
/* Deallocate stack frame early; return value is now in redzone. */
addq $ffi_closure_FS, %rsp
- cfi_adjust_cfa_offset(-ffi_closure_FS)
+L(UW10):
+ /* cfi_adjust_cfa_offset(-ffi_closure_FS) */
/* The first byte of the return value contains the FFI_TYPE. */
cmpb $UNIX64_RET_LAST, %al
movzbl %al, %r10d
- leaq 0f(%rip), %r11
- ja 9f
+ leaq L(load_table)(%rip), %r11
+ ja L(la)
leaq (%r11, %r10, 8), %r10
leaq ffi_closure_RED_RVALUE(%rsp), %rsi
jmp *%r10
- .align 8
-0:
-E UNIX64_RET_VOID
+ .balign 8
+L(load_table):
+E(L(load_table), UNIX64_RET_VOID)
ret
-E UNIX64_RET_UINT8
+E(L(load_table), UNIX64_RET_UINT8)
movzbl (%rsi), %eax
ret
-E UNIX64_RET_UINT16
+E(L(load_table), UNIX64_RET_UINT16)
movzwl (%rsi), %eax
ret
-E UNIX64_RET_UINT32
+E(L(load_table), UNIX64_RET_UINT32)
movl (%rsi), %eax
ret
-E UNIX64_RET_SINT8
+E(L(load_table), UNIX64_RET_SINT8)
movsbl (%rsi), %eax
ret
-E UNIX64_RET_SINT16
+E(L(load_table), UNIX64_RET_SINT16)
movswl (%rsi), %eax
ret
-E UNIX64_RET_SINT32
+E(L(load_table), UNIX64_RET_SINT32)
movl (%rsi), %eax
ret
-E UNIX64_RET_INT64
+E(L(load_table), UNIX64_RET_INT64)
movq (%rsi), %rax
ret
-E UNIX64_RET_XMM32
+E(L(load_table), UNIX64_RET_XMM32)
movd (%rsi), %xmm0
ret
-E UNIX64_RET_XMM64
+E(L(load_table), UNIX64_RET_XMM64)
movq (%rsi), %xmm0
ret
-E UNIX64_RET_X87
+E(L(load_table), UNIX64_RET_X87)
fldt (%rsi)
ret
-E UNIX64_RET_X87_2
+E(L(load_table), UNIX64_RET_X87_2)
fldt 16(%rsi)
fldt (%rsi)
ret
-E UNIX64_RET_ST_XMM0_RAX
+E(L(load_table), UNIX64_RET_ST_XMM0_RAX)
movq 8(%rsi), %rax
- jmp 3f
-E UNIX64_RET_ST_RAX_XMM0
+ jmp L(l3)
+E(L(load_table), UNIX64_RET_ST_RAX_XMM0)
movq 8(%rsi), %xmm0
- jmp 2f
-E UNIX64_RET_ST_XMM0_XMM1
+ jmp L(l2)
+E(L(load_table), UNIX64_RET_ST_XMM0_XMM1)
movq 8(%rsi), %xmm1
- jmp 3f
-E UNIX64_RET_ST_RAX_RDX
+ jmp L(l3)
+E(L(load_table), UNIX64_RET_ST_RAX_RDX)
movq 8(%rsi), %rdx
-2: movq (%rsi), %rax
+L(l2):
+ movq (%rsi), %rax
ret
- .align 8
-3: movq (%rsi), %xmm0
+ .balign 8
+L(l3):
+ movq (%rsi), %xmm0
ret
-9: call abort@PLT
+L(la): call PLT(C(abort))
- cfi_endproc
- .size ffi_closure_unix64,.-ffi_closure_unix64
+L(UW11):
+ENDF(C(ffi_closure_unix64))
- .align 2
- .globl ffi_go_closure_unix64_sse
- .type ffi_go_closure_unix64_sse,@function
- FFI_HIDDEN(ffi_go_closure_unix64_sse)
+ .balign 2
+ .globl C(ffi_go_closure_unix64_sse)
+ FFI_HIDDEN(C(ffi_go_closure_unix64_sse))
-ffi_go_closure_unix64_sse:
- cfi_startproc
+C(ffi_go_closure_unix64_sse):
+L(UW12):
subq $ffi_closure_FS, %rsp
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */
- cfi_def_cfa_offset(ffi_closure_FS + 8)
+L(UW13):
+ /* cfi_adjust_cfa_offset(ffi_closure_FS) */
movdqa %xmm0, ffi_closure_OFS_V+0x00(%rsp)
movdqa %xmm1, ffi_closure_OFS_V+0x10(%rsp)
@@ -367,22 +391,21 @@ ffi_go_closure_unix64_sse:
movdqa %xmm5, ffi_closure_OFS_V+0x50(%rsp)
movdqa %xmm6, ffi_closure_OFS_V+0x60(%rsp)
movdqa %xmm7, ffi_closure_OFS_V+0x70(%rsp)
- jmp 0f
+ jmp L(sse_entry2)
- cfi_endproc
- .size ffi_go_closure_unix64_sse,.-ffi_go_closure_unix64_sse
+L(UW14):
+ENDF(C(ffi_go_closure_unix64_sse))
- .align 2
- .globl ffi_go_closure_unix64
- .type ffi_go_closure_unix64,@function
- FFI_HIDDEN(ffi_go_closure_unix64)
+ .balign 2
+ .globl C(ffi_go_closure_unix64)
+ FFI_HIDDEN(C(ffi_go_closure_unix64))
-ffi_go_closure_unix64:
- cfi_startproc
+C(ffi_go_closure_unix64):
+L(UW15):
subq $ffi_closure_FS, %rsp
- /* Note clang bug 21515: adjust_cfa_offset error across endproc. */
- cfi_def_cfa_offset(ffi_closure_FS + 8)
-0:
+L(UW16):
+ /* cfi_adjust_cfa_offset(ffi_closure_FS) */
+L(sse_entry2):
movq %rdi, ffi_closure_OFS_G+0x00(%rsp)
movq %rsi, ffi_closure_OFS_G+0x08(%rsp)
movq %rdx, ffi_closure_OFS_G+0x10(%rsp)
@@ -399,10 +422,123 @@ ffi_go_closure_unix64:
movq 16(%r10), %rsi /* Load fun */
movq %r10, %rdx /* Load closure (user_data) */
#endif
- jmp .Ldo_closure
+ jmp L(do_closure)
+
+L(UW17):
+ENDF(C(ffi_go_closure_unix64))
+
+/* Sadly, OSX cctools-as doesn't understand .cfi directives at all. */
+
+#ifdef __APPLE__
+.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
+EHFrame0:
+#elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
+.section .eh_frame,"a",@unwind
+#else
+.section .eh_frame,"a",@progbits
+#endif
- cfi_endproc
- .size ffi_go_closure_unix64,.-ffi_go_closure_unix64
+#ifdef HAVE_AS_X86_PCREL
+# define PCREL(X) X - .
+#else
+# define PCREL(X) X@rel
+#endif
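+
+/* PCREL(L(UW0)) expresses an FDE initial location as a PC-relative
+   offset, matching the pcrel sdata4 encoding (0x1b) declared in the
+   CIE below.  */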
+
+/* Simplify advancing between labels.  Assume the label delta fits in
+   DW_CFA_advance_loc1's single-byte operand.  */
+#define ADV(N, P) .byte 2, L(N)-L(P)
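+/* E.g. ADV(UW1, UW0) assembles to the two bytes {2, L(UW1)-L(UW0)}:
+   DW_CFA_advance_loc1 followed by the code delta between the labels.  */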
+
+ .balign 8
+L(CIE):
+ .set L(set0),L(ECIE)-L(SCIE)
+ .long L(set0) /* CIE Length */
+L(SCIE):
+ .long 0 /* CIE Identifier Tag */
+ .byte 1 /* CIE Version */
+ .ascii "zR\0" /* CIE Augmentation */
+ .byte 1 /* CIE Code Alignment Factor */
+	.byte 0x78			/* CIE Data Alignment Factor (sleb128 for -8) */
+ .byte 0x10 /* CIE RA Column */
+ .byte 1 /* Augmentation size */
+ .byte 0x1b /* FDE Encoding (pcrel sdata4) */
+ .byte 0xc, 7, 8 /* DW_CFA_def_cfa, %rsp offset 8 */
+ .byte 0x80+16, 1 /* DW_CFA_offset, %rip offset 1*-8 */
+ .balign 8
+L(ECIE):
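+
+/* The CIE establishes the entry state shared by every FDE below:
+   CFA = %rsp + 8, with the return %rip saved at CFA - 8.  */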
+
+ .set L(set1),L(EFDE1)-L(SFDE1)
+ .long L(set1) /* FDE Length */
+L(SFDE1):
+ .long L(SFDE1)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW0)) /* Initial location */
+ .long L(UW4)-L(UW0) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW1, UW0)
+ .byte 0xc, 6, 32 /* DW_CFA_def_cfa, %rbp 32 */
+ .byte 0x80+6, 2 /* DW_CFA_offset, %rbp 2*-8 */
+ ADV(UW2, UW1)
+ .byte 0xa /* DW_CFA_remember_state */
+ .byte 0xc, 7, 8 /* DW_CFA_def_cfa, %rsp 8 */
+ .byte 0xc0+6 /* DW_CFA_restore, %rbp */
+ ADV(UW3, UW2)
+ .byte 0xb /* DW_CFA_restore_state */
+ .balign 8
+L(EFDE1):
+
+ .set L(set2),L(EFDE2)-L(SFDE2)
+ .long L(set2) /* FDE Length */
+L(SFDE2):
+ .long L(SFDE2)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW5)) /* Initial location */
+ .long L(UW7)-L(UW5) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW6, UW5)
+ .byte 0xe /* DW_CFA_def_cfa_offset */
+	.byte ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS+8 < 256 */
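+					/* FS+8 >= 128 keeps bit 7 set, so the
+					   first byte doubles as the uleb128
+					   continuation marker before the
+					   closing byte 1.  */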
+ .balign 8
+L(EFDE2):
+
+ .set L(set3),L(EFDE3)-L(SFDE3)
+ .long L(set3) /* FDE Length */
+L(SFDE3):
+ .long L(SFDE3)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW8)) /* Initial location */
+ .long L(UW11)-L(UW8) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW9, UW8)
+ .byte 0xe /* DW_CFA_def_cfa_offset */
+	.byte ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS+8 < 256 */
+ ADV(UW10, UW9)
+	.byte 0xe, 8			/* DW_CFA_def_cfa_offset 8 */
+	.balign 8
+L(EFDE3):
+
+ .set L(set4),L(EFDE4)-L(SFDE4)
+ .long L(set4) /* FDE Length */
+L(SFDE4):
+ .long L(SFDE4)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW12)) /* Initial location */
+ .long L(UW14)-L(UW12) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW13, UW12)
+ .byte 0xe /* DW_CFA_def_cfa_offset */
+	.byte ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS+8 < 256 */
+ .balign 8
+L(EFDE4):
+
+ .set L(set5),L(EFDE5)-L(SFDE5)
+ .long L(set5) /* FDE Length */
+L(SFDE5):
+ .long L(SFDE5)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW15)) /* Initial location */
+ .long L(UW17)-L(UW15) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW16, UW15)
+ .byte 0xe /* DW_CFA_def_cfa_offset */
+	.byte ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS+8 < 256 */
+ .balign 8
+L(EFDE5):
+#ifdef __APPLE__
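+	/* Allow the Mach-O linker to dead-strip at symbol granularity.  */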
+ .subsections_via_symbols
+#endif
#endif /* __x86_64__ */
#if defined __ELF__ && defined __linux__