diff options
author | hjl-tools <hjl.tools@gmail.com> | 2020-02-21 19:08:06 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-21 22:08:06 -0500 |
commit | 7855656148b96c7070ec362d2a73af840025a2b7 (patch) | |
tree | 8a22c616b7e88ea007d8c454f94f7a70f2aa1565 | |
parent | 4d6d2866ae43e55325e8ee96561221804602cd7a (diff) | |
download | libffi-7855656148b96c7070ec362d2a73af840025a2b7.tar.gz |
x86: Add indirect branch tracking support (#540)
Intel Control-flow Enforcement Technology (CET), documented in the Intel SDM:
https://software.intel.com/en-us/articles/intel-sdm
consists of shadow stack (SHSTK) and indirect branch tracking (IBT). When
CET is enabled, ELF object files must be marked with a .note.gnu.property
section. When Intel CET is enabled, include <cet.h> in assembly code
to mark it as supporting Intel CET.
Also, when IBT is enabled, every indirect branch target must start with
an ENDBR instruction; the notrack prefix can be used to disable IBT on
an individual indirect branch. <cet.h> defines _CET_ENDBR, which can be
used in assembly code to emit the ENDBR instruction. If <cet.h> isn't
included, define _CET_ENDBR as empty so that _CET_ENDBR can still be used
in assembly code.
The trampoline must be enlarged to add an ENDBR instruction unconditionally;
ENDBR is a NOP on non-CET processors. This is required regardless of whether
libffi itself is built with CET enabled, since libffi.so will be marked in
the legacy bitmap, but the trampoline won't be. Update the library version
for the larger FFI_TRAMPOLINE_SIZE.
This fixes:
https://github.com/libffi/libffi/issues/474
Tested with
$ CC="gcc -Wl,-z,cet-report=error -fcf-protection" CXX="g++ -Wl,-z,cet-report=error -fcf-protection" .../configure
on Linux CET machines in i686, x32 and x86-64 modes.
-rw-r--r-- | Makefile.am | 3 | ||||
-rw-r--r-- | libtool-version | 2 | ||||
-rw-r--r-- | src/x86/ffi.c | 11 | ||||
-rw-r--r-- | src/x86/ffi64.c | 18 | ||||
-rw-r--r-- | src/x86/ffitarget.h | 17 | ||||
-rw-r--r-- | src/x86/ffiw64.c | 18 | ||||
-rw-r--r-- | src/x86/sysv.S | 17 | ||||
-rw-r--r-- | src/x86/unix64.S | 60 | ||||
-rw-r--r-- | src/x86/win64.S | 5 |
9 files changed, 120 insertions, 31 deletions
diff --git a/Makefile.am b/Makefile.am index 4fd6193..563e9f2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -144,7 +144,8 @@ endif libffi_version_info = -version-info `grep -v '^\#' $(srcdir)/libtool-version` libffi.map: $(top_srcdir)/libffi.map.in - $(COMPILE) -D$(TARGET) -E -x assembler-with-cpp -o $@ $< + $(COMPILE) -D$(TARGET) -DGENERATE_LIBFFI_MAP \ + -E -x assembler-with-cpp -o $@ $< libffi_la_LDFLAGS = -no-undefined $(libffi_version_info) $(libffi_version_script) $(LTLDFLAGS) $(AM_LTLDFLAGS) libffi_la_DEPENDENCIES = $(libffi_la_LIBADD) $(libffi_version_dep) diff --git a/libtool-version b/libtool-version index e4f5aa2..607fee5 100644 --- a/libtool-version +++ b/libtool-version @@ -26,4 +26,4 @@ # release, then set age to 0. # # CURRENT:REVISION:AGE -8:0:1 +9:0:1 diff --git a/src/x86/ffi.c b/src/x86/ffi.c index 9a59218..e247322 100644 --- a/src/x86/ffi.c +++ b/src/x86/ffi.c @@ -557,13 +557,16 @@ ffi_prep_closure_loc (ffi_closure* closure, return FFI_BAD_ABI; } + /* endbr32. */ + *(UINT32 *) tramp = 0xfb1e0ff3; + /* movl or pushl immediate. 
*/ - tramp[0] = op; - *(void **)(tramp + 1) = codeloc; + tramp[4] = op; + *(void **)(tramp + 5) = codeloc; /* jmp dest */ - tramp[5] = 0xe9; - *(unsigned *)(tramp + 6) = (unsigned)dest - ((unsigned)codeloc + 10); + tramp[9] = 0xe9; + *(unsigned *)(tramp + 10) = (unsigned)dest - ((unsigned)codeloc + 10); closure->cif = cif; closure->fun = fun; diff --git a/src/x86/ffi64.c b/src/x86/ffi64.c index dec331c..ed82e23 100644 --- a/src/x86/ffi64.c +++ b/src/x86/ffi64.c @@ -728,13 +728,15 @@ ffi_prep_closure_loc (ffi_closure* closure, void *user_data, void *codeloc) { - static const unsigned char trampoline[16] = { - /* leaq -0x7(%rip),%r10 # 0x0 */ - 0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff, - /* jmpq *0x3(%rip) # 0x10 */ - 0xff, 0x25, 0x03, 0x00, 0x00, 0x00, - /* nopl (%rax) */ - 0x0f, 0x1f, 0x00 + static const unsigned char trampoline[24] = { + /* endbr64 */ + 0xf3, 0x0f, 0x1e, 0xfa, + /* leaq -0xb(%rip),%r10 # 0x0 */ + 0x4c, 0x8d, 0x15, 0xf5, 0xff, 0xff, 0xff, + /* jmpq *0x7(%rip) # 0x18 */ + 0xff, 0x25, 0x07, 0x00, 0x00, 0x00, + /* nopl 0(%rax) */ + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00 }; void (*dest)(void); char *tramp = closure->tramp; @@ -752,7 +754,7 @@ ffi_prep_closure_loc (ffi_closure* closure, dest = ffi_closure_unix64; memcpy (tramp, trampoline, sizeof(trampoline)); - *(UINT64 *)(tramp + 16) = (uintptr_t)dest; + *(UINT64 *)(tramp + sizeof (trampoline)) = (uintptr_t)dest; closure->cif = cif; closure->fun = fun; diff --git a/src/x86/ffitarget.h b/src/x86/ffitarget.h index 85ccedf..a34f3e5 100644 --- a/src/x86/ffitarget.h +++ b/src/x86/ffitarget.h @@ -136,12 +136,25 @@ typedef enum ffi_abi { #if defined (X86_64) || defined(X86_WIN64) \ || (defined (__x86_64__) && defined (X86_DARWIN)) -# define FFI_TRAMPOLINE_SIZE 24 +/* 4 bytes of ENDBR64 + 7 bytes of LEA + 6 bytes of JMP + 7 bytes of NOP + + 8 bytes of pointer. 
*/ +# define FFI_TRAMPOLINE_SIZE 32 # define FFI_NATIVE_RAW_API 0 #else -# define FFI_TRAMPOLINE_SIZE 12 +/* 4 bytes of ENDBR32 + 5 bytes of MOV + 5 bytes of JMP + 2 unused + bytes. */ +# define FFI_TRAMPOLINE_SIZE 16 # define FFI_NATIVE_RAW_API 1 /* x86 has native raw api support */ #endif +#if !defined(GENERATE_LIBFFI_MAP) && defined(__ASSEMBLER__) \ + && defined(__CET__) +# include <cet.h> +# define _CET_NOTRACK notrack +#else +# define _CET_ENDBR +# define _CET_NOTRACK +#endif + #endif diff --git a/src/x86/ffiw64.c b/src/x86/ffiw64.c index b68f69c..034dffd 100644 --- a/src/x86/ffiw64.c +++ b/src/x86/ffiw64.c @@ -196,13 +196,15 @@ EFI64(ffi_prep_closure_loc)(ffi_closure* closure, void *user_data, void *codeloc) { - static const unsigned char trampoline[16] = { - /* leaq -0x7(%rip),%r10 # 0x0 */ - 0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff, - /* jmpq *0x3(%rip) # 0x10 */ - 0xff, 0x25, 0x03, 0x00, 0x00, 0x00, - /* nopl (%rax) */ - 0x0f, 0x1f, 0x00 + static const unsigned char trampoline[FFI_TRAMPOLINE_SIZE - 8] = { + /* endbr64 */ + 0xf3, 0x0f, 0x1e, 0xfa, + /* leaq -0xb(%rip),%r10 # 0x0 */ + 0x4c, 0x8d, 0x15, 0xf5, 0xff, 0xff, 0xff, + /* jmpq *0x7(%rip) # 0x18 */ + 0xff, 0x25, 0x07, 0x00, 0x00, 0x00, + /* nopl 0(%rax) */ + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00 }; char *tramp = closure->tramp; @@ -216,7 +218,7 @@ EFI64(ffi_prep_closure_loc)(ffi_closure* closure, } memcpy (tramp, trampoline, sizeof(trampoline)); - *(UINT64 *)(tramp + 16) = (uintptr_t)ffi_closure_win64; + *(UINT64 *)(tramp + sizeof (trampoline)) = (uintptr_t)ffi_closure_win64; closure->cif = cif; closure->fun = fun; diff --git a/src/x86/sysv.S b/src/x86/sysv.S index 7c9598c..6d56483 100644 --- a/src/x86/sysv.S +++ b/src/x86/sysv.S @@ -92,6 +92,7 @@ ffi_call_i386: L(UW0): # cfi_startproc + _CET_ENDBR #if !HAVE_FASTCALL movl 4(%esp), %ecx movl 8(%esp), %edx @@ -133,7 +134,7 @@ L(pc1): leal L(store_table)(,%ecx, 8), %ebx #endif movl 16(%ebp), %ecx /* load result address */ - jmp *%ebx + _CET_NOTRACK 
jmp *%ebx .balign 8 L(store_table): @@ -256,7 +257,7 @@ ENDF(ffi_call_i386) andl $X86_RET_TYPE_MASK, %eax; \ leal L(C1(load_table,N))(, %eax, 8), %edx; \ movl closure_CF(%esp), %eax; /* optimiztic load */ \ - jmp *%edx + _CET_NOTRACK jmp *%edx #ifdef __PIC__ # if defined X86_DARWIN || defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE @@ -267,7 +268,7 @@ ENDF(ffi_call_i386) L(C1(pc,N)): \ leal L(C1(load_table,N))-L(C1(pc,N))(%edx, %eax, 8), %edx; \ movl closure_CF(%esp), %eax; /* optimiztic load */ \ - jmp *%edx + _CET_NOTRACK jmp *%edx # else # define FFI_CLOSURE_CALL_INNER_SAVE_EBX # undef FFI_CLOSURE_CALL_INNER @@ -286,7 +287,7 @@ L(C1(UW,UWN)): \ L(C1(UW,UWN)): \ /* cfi_restore(%ebx); */ \ movl closure_CF(%esp), %eax; /* optimiztic load */ \ - jmp *%edx + _CET_NOTRACK jmp *%edx # endif /* DARWIN || HIDDEN */ #endif /* __PIC__ */ @@ -296,6 +297,7 @@ L(C1(UW,UWN)): \ C(ffi_go_closure_EAX): L(UW6): # cfi_startproc + _CET_ENDBR subl $closure_FS, %esp L(UW7): # cfi_def_cfa_offset(closure_FS + 4) @@ -316,6 +318,7 @@ ENDF(C(ffi_go_closure_EAX)) C(ffi_go_closure_ECX): L(UW9): # cfi_startproc + _CET_ENDBR subl $closure_FS, %esp L(UW10): # cfi_def_cfa_offset(closure_FS + 4) @@ -340,6 +343,7 @@ ENDF(C(ffi_go_closure_ECX)) C(ffi_closure_i386): L(UW12): # cfi_startproc + _CET_ENDBR subl $closure_FS, %esp L(UW13): # cfi_def_cfa_offset(closure_FS + 4) @@ -423,6 +427,7 @@ ENDF(C(ffi_closure_i386)) C(ffi_go_closure_STDCALL): L(UW21): # cfi_startproc + _CET_ENDBR subl $closure_FS, %esp L(UW22): # cfi_def_cfa_offset(closure_FS + 4) @@ -448,6 +453,7 @@ L(UW24): # cfi_startproc # cfi_def_cfa(%esp, 8) # cfi_offset(%eip, -8) + _CET_ENDBR subl $closure_FS-4, %esp L(UW25): # cfi_def_cfa_offset(closure_FS + 4) @@ -470,6 +476,7 @@ ENDF(C(ffi_closure_REGISTER)) C(ffi_closure_STDCALL): L(UW27): # cfi_startproc + _CET_ENDBR subl $closure_FS, %esp L(UW28): # cfi_def_cfa_offset(closure_FS + 4) @@ -576,6 +583,7 @@ ENDF(C(ffi_closure_STDCALL)) C(ffi_closure_raw_SYSV): L(UW32): # cfi_startproc + 
_CET_ENDBR subl $raw_closure_S_FS, %esp L(UW33): # cfi_def_cfa_offset(raw_closure_S_FS + 4) @@ -679,6 +687,7 @@ ENDF(C(ffi_closure_raw_SYSV)) C(ffi_closure_raw_THISCALL): L(UW41): # cfi_startproc + _CET_ENDBR /* Rearrange the stack such that %ecx is the first argument. This means moving the return address. */ popl %edx diff --git a/src/x86/unix64.S b/src/x86/unix64.S index 41563f5..ee3c04f 100644 --- a/src/x86/unix64.S +++ b/src/x86/unix64.S @@ -42,7 +42,11 @@ #if defined(__clang__) || defined(__APPLE__) || (defined (__sun__) && defined(__svr4__)) # define E(BASE, X) .balign 8 #else -# define E(BASE, X) .balign 8; .org BASE + X * 8 +# ifdef __CET__ +# define E(BASE, X) .balign 8; .org BASE + X * 16 +# else +# define E(BASE, X) .balign 8; .org BASE + X * 8 +# endif #endif /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags, @@ -58,6 +62,7 @@ C(ffi_call_unix64): L(UW0): + _CET_ENDBR movq (%rsp), %r10 /* Load return address. */ leaq (%rdi, %rsi), %rax /* Find local stack base. */ movq %rdx, (%rax) /* Save flags. */ @@ -116,6 +121,11 @@ L(UW2): movzbl %cl, %r10d leaq L(store_table)(%rip), %r11 ja L(sa) +#ifdef __CET__ + /* NB: Originally, each slot is 8 byte. 4 bytes of ENDBR64 + + 4 bytes NOP padding double slot size to 16 bytes. */ + addl %r10d, %r10d +#endif leaq (%r11, %r10, 8), %r10 /* Prep for the structure cases: scratch area in redzone. 
*/ @@ -125,57 +135,73 @@ L(UW2): .balign 8 L(store_table): E(L(store_table), UNIX64_RET_VOID) + _CET_ENDBR ret E(L(store_table), UNIX64_RET_UINT8) + _CET_ENDBR movzbl %al, %eax movq %rax, (%rdi) ret E(L(store_table), UNIX64_RET_UINT16) + _CET_ENDBR movzwl %ax, %eax movq %rax, (%rdi) ret E(L(store_table), UNIX64_RET_UINT32) + _CET_ENDBR movl %eax, %eax movq %rax, (%rdi) ret E(L(store_table), UNIX64_RET_SINT8) + _CET_ENDBR movsbq %al, %rax movq %rax, (%rdi) ret E(L(store_table), UNIX64_RET_SINT16) + _CET_ENDBR movswq %ax, %rax movq %rax, (%rdi) ret E(L(store_table), UNIX64_RET_SINT32) + _CET_ENDBR cltq movq %rax, (%rdi) ret E(L(store_table), UNIX64_RET_INT64) + _CET_ENDBR movq %rax, (%rdi) ret E(L(store_table), UNIX64_RET_XMM32) + _CET_ENDBR movd %xmm0, (%rdi) ret E(L(store_table), UNIX64_RET_XMM64) + _CET_ENDBR movq %xmm0, (%rdi) ret E(L(store_table), UNIX64_RET_X87) + _CET_ENDBR fstpt (%rdi) ret E(L(store_table), UNIX64_RET_X87_2) + _CET_ENDBR fstpt (%rdi) fstpt 16(%rdi) ret E(L(store_table), UNIX64_RET_ST_XMM0_RAX) + _CET_ENDBR movq %rax, 8(%rsi) jmp L(s3) E(L(store_table), UNIX64_RET_ST_RAX_XMM0) + _CET_ENDBR movq %xmm0, 8(%rsi) jmp L(s2) E(L(store_table), UNIX64_RET_ST_XMM0_XMM1) + _CET_ENDBR movq %xmm1, 8(%rsi) jmp L(s3) E(L(store_table), UNIX64_RET_ST_RAX_RDX) + _CET_ENDBR movq %rdx, 8(%rsi) L(s2): movq %rax, (%rsi) @@ -227,6 +253,7 @@ ENDF(C(ffi_call_unix64)) C(ffi_closure_unix64_sse): L(UW5): + _CET_ENDBR subq $ffi_closure_FS, %rsp L(UW6): /* cfi_adjust_cfa_offset(ffi_closure_FS) */ @@ -250,6 +277,7 @@ ENDF(C(ffi_closure_unix64_sse)) C(ffi_closure_unix64): L(UW8): + _CET_ENDBR subq $ffi_closure_FS, %rsp L(UW9): /* cfi_adjust_cfa_offset(ffi_closure_FS) */ @@ -286,6 +314,11 @@ L(UW10): movzbl %al, %r10d leaq L(load_table)(%rip), %r11 ja L(la) +#ifdef __CET__ + /* NB: Originally, each slot is 8 byte. 4 bytes of ENDBR64 + + 4 bytes NOP padding double slot size to 16 bytes. 
*/ + addl %r10d, %r10d +#endif leaq (%r11, %r10, 8), %r10 leaq ffi_closure_RED_RVALUE(%rsp), %rsi jmp *%r10 @@ -293,51 +326,67 @@ L(UW10): .balign 8 L(load_table): E(L(load_table), UNIX64_RET_VOID) + _CET_ENDBR ret E(L(load_table), UNIX64_RET_UINT8) + _CET_ENDBR movzbl (%rsi), %eax ret E(L(load_table), UNIX64_RET_UINT16) + _CET_ENDBR movzwl (%rsi), %eax ret E(L(load_table), UNIX64_RET_UINT32) + _CET_ENDBR movl (%rsi), %eax ret E(L(load_table), UNIX64_RET_SINT8) + _CET_ENDBR movsbl (%rsi), %eax ret E(L(load_table), UNIX64_RET_SINT16) + _CET_ENDBR movswl (%rsi), %eax ret E(L(load_table), UNIX64_RET_SINT32) + _CET_ENDBR movl (%rsi), %eax ret E(L(load_table), UNIX64_RET_INT64) + _CET_ENDBR movq (%rsi), %rax ret E(L(load_table), UNIX64_RET_XMM32) + _CET_ENDBR movd (%rsi), %xmm0 ret E(L(load_table), UNIX64_RET_XMM64) + _CET_ENDBR movq (%rsi), %xmm0 ret E(L(load_table), UNIX64_RET_X87) + _CET_ENDBR fldt (%rsi) ret E(L(load_table), UNIX64_RET_X87_2) + _CET_ENDBR fldt 16(%rsi) fldt (%rsi) ret E(L(load_table), UNIX64_RET_ST_XMM0_RAX) + _CET_ENDBR movq 8(%rsi), %rax jmp L(l3) E(L(load_table), UNIX64_RET_ST_RAX_XMM0) + _CET_ENDBR movq 8(%rsi), %xmm0 jmp L(l2) E(L(load_table), UNIX64_RET_ST_XMM0_XMM1) + _CET_ENDBR movq 8(%rsi), %xmm1 jmp L(l3) E(L(load_table), UNIX64_RET_ST_RAX_RDX) + _CET_ENDBR movq 8(%rsi), %rdx L(l2): movq (%rsi), %rax @@ -358,6 +407,7 @@ ENDF(C(ffi_closure_unix64)) C(ffi_go_closure_unix64_sse): L(UW12): + _CET_ENDBR subq $ffi_closure_FS, %rsp L(UW13): /* cfi_adjust_cfa_offset(ffi_closure_FS) */ @@ -381,6 +431,7 @@ ENDF(C(ffi_go_closure_unix64_sse)) C(ffi_go_closure_unix64): L(UW15): + _CET_ENDBR subq $ffi_closure_FS, %rsp L(UW16): /* cfi_adjust_cfa_offset(ffi_closure_FS) */ @@ -424,7 +475,12 @@ EHFrame0: #endif /* Simplify advancing between labels. Assume DW_CFA_advance_loc1 fits. */ -#define ADV(N, P) .byte 2, L(N)-L(P) +#ifdef __CET__ +/* Use DW_CFA_advance_loc2 when IBT is enabled. 
*/ +# define ADV(N, P) .byte 3; .2byte L(N)-L(P) +#else +# define ADV(N, P) .byte 2, L(N)-L(P) +#endif .balign 8 L(CIE): diff --git a/src/x86/win64.S b/src/x86/win64.S index 2c334c8..57c0e65 100644 --- a/src/x86/win64.S +++ b/src/x86/win64.S @@ -48,6 +48,7 @@ SEH(.seh_proc ffi_call_win64) C(ffi_call_win64): cfi_startproc + _CET_ENDBR /* Set up the local stack frame and install it in rbp/rsp. */ movq (%rsp), %rax movq %rbp, (arg1) @@ -80,7 +81,7 @@ C(ffi_call_win64): cmpl $FFI_TYPE_SMALL_STRUCT_4B, %ecx leaq (%r10, %rcx, 8), %r10 ja 99f - jmp *%r10 + _CET_NOTRACK jmp *%r10 /* Below, we're space constrained most of the time. Thus we eschew the modern "mov, pop, ret" sequence (5 bytes) for "leave, ret" (2 bytes). */ @@ -176,6 +177,7 @@ E(0b, FFI_TYPE_SMALL_STRUCT_4B) SEH(.seh_proc ffi_go_closure_win64) C(ffi_go_closure_win64): cfi_startproc + _CET_ENDBR /* Save all integer arguments into the incoming reg stack space. */ movq %rcx, 8(%rsp) movq %rdx, 16(%rsp) @@ -196,6 +198,7 @@ C(ffi_go_closure_win64): SEH(.seh_proc ffi_closure_win64) C(ffi_closure_win64): cfi_startproc + _CET_ENDBR /* Save all integer arguments into the incoming reg stack space. */ movq %rcx, 8(%rsp) movq %rdx, 16(%rsp) |