Diffstat (limited to 'lib/xray/xray_trampoline_x86_64.S')
-rw-r--r--  lib/xray/xray_trampoline_x86_64.S  163
1 file changed, 102 insertions(+), 61 deletions(-)
diff --git a/lib/xray/xray_trampoline_x86_64.S b/lib/xray/xray_trampoline_x86_64.S
index b9fef6dad..ffbfb5c7e 100644
--- a/lib/xray/xray_trampoline_x86_64.S
+++ b/lib/xray/xray_trampoline_x86_64.S
@@ -13,42 +13,51 @@
//
//===----------------------------------------------------------------------===//
+#include "../builtins/assembly.h"
+
.macro SAVE_REGISTERS
- subq $200, %rsp
- movupd %xmm0, 184(%rsp)
- movupd %xmm1, 168(%rsp)
- movupd %xmm2, 152(%rsp)
- movupd %xmm3, 136(%rsp)
- movupd %xmm4, 120(%rsp)
- movupd %xmm5, 104(%rsp)
- movupd %xmm6, 88(%rsp)
- movupd %xmm7, 72(%rsp)
- movq %rdi, 64(%rsp)
- movq %rax, 56(%rsp)
- movq %rdx, 48(%rsp)
- movq %rsi, 40(%rsp)
- movq %rcx, 32(%rsp)
- movq %r8, 24(%rsp)
- movq %r9, 16(%rsp)
+ subq $192, %rsp
+ .cfi_def_cfa_offset 200
+ // At this point, the stack pointer should be aligned to an 8-byte boundary,
+ // because any call instruction that comes after this will push another 8
+ // bytes and therefore align it to a 16-byte boundary.
+ movq %rbp, 184(%rsp)
+ movupd %xmm0, 168(%rsp)
+ movupd %xmm1, 152(%rsp)
+ movupd %xmm2, 136(%rsp)
+ movupd %xmm3, 120(%rsp)
+ movupd %xmm4, 104(%rsp)
+ movupd %xmm5, 88(%rsp)
+ movupd %xmm6, 72(%rsp)
+ movupd %xmm7, 56(%rsp)
+ movq %rdi, 48(%rsp)
+ movq %rax, 40(%rsp)
+ movq %rdx, 32(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rcx, 16(%rsp)
+ movq %r8, 8(%rsp)
+ movq %r9, 0(%rsp)
.endm
.macro RESTORE_REGISTERS
- movupd 184(%rsp), %xmm0
- movupd 168(%rsp), %xmm1
- movupd 152(%rsp), %xmm2
- movupd 136(%rsp), %xmm3
- movupd 120(%rsp), %xmm4
- movupd 104(%rsp), %xmm5
- movupd 88(%rsp) , %xmm6
- movupd 72(%rsp) , %xmm7
- movq 64(%rsp), %rdi
- movq 56(%rsp), %rax
- movq 48(%rsp), %rdx
- movq 40(%rsp), %rsi
- movq 32(%rsp), %rcx
- movq 24(%rsp), %r8
- movq 16(%rsp), %r9
- addq $200, %rsp
+ movq 184(%rsp), %rbp
+ movupd 168(%rsp), %xmm0
+ movupd 152(%rsp), %xmm1
+ movupd 136(%rsp), %xmm2
+ movupd 120(%rsp), %xmm3
+ movupd 104(%rsp), %xmm4
+ movupd 88(%rsp), %xmm5
+ movupd 72(%rsp), %xmm6
+ movupd 56(%rsp), %xmm7
+ movq 48(%rsp), %rdi
+ movq 40(%rsp), %rax
+ movq 32(%rsp), %rdx
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rcx
+ movq 8(%rsp), %r8
+ movq 0(%rsp), %r9
+ addq $192, %rsp
+ .cfi_def_cfa_offset 8
.endm
.text
@@ -62,8 +71,6 @@
__xray_FunctionEntry:
.cfi_startproc
- pushq %rbp
- .cfi_def_cfa_offset 16
SAVE_REGISTERS
// This load has to be atomic, it's concurrent with __xray_patch().
@@ -78,7 +85,6 @@ __xray_FunctionEntry:
callq *%rax
.Ltmp0:
RESTORE_REGISTERS
- popq %rbp
retq
.Ltmp1:
.size __xray_FunctionEntry, .Ltmp1-__xray_FunctionEntry
@@ -94,14 +100,13 @@ __xray_FunctionExit:
// Save the important registers first. Since we're assuming that this
// function is only jumped into, we only preserve the registers for
// returning.
- pushq %rbp
- .cfi_def_cfa_offset 16
subq $56, %rsp
- .cfi_def_cfa_offset 32
- movupd %xmm0, 40(%rsp)
- movupd %xmm1, 24(%rsp)
- movq %rax, 16(%rsp)
- movq %rdx, 8(%rsp)
+ .cfi_def_cfa_offset 64
+ movq %rbp, 48(%rsp)
+ movupd %xmm0, 32(%rsp)
+ movupd %xmm1, 16(%rsp)
+ movq %rax, 8(%rsp)
+ movq %rdx, 0(%rsp)
movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax
testq %rax,%rax
je .Ltmp2
@@ -111,12 +116,13 @@ __xray_FunctionExit:
callq *%rax
.Ltmp2:
// Restore the important registers.
- movupd 40(%rsp), %xmm0
- movupd 24(%rsp), %xmm1
- movq 16(%rsp), %rax
- movq 8(%rsp), %rdx
+ movq 48(%rsp), %rbp
+ movupd 32(%rsp), %xmm0
+ movupd 16(%rsp), %xmm1
+ movq 8(%rsp), %rax
+ movq 0(%rsp), %rdx
addq $56, %rsp
- popq %rbp
+ .cfi_def_cfa_offset 8
retq
.Ltmp3:
.size __xray_FunctionExit, .Ltmp3-__xray_FunctionExit
@@ -129,12 +135,6 @@ __xray_FunctionExit:
.type __xray_FunctionTailExit,@function
__xray_FunctionTailExit:
.cfi_startproc
- // Save the important registers as in the entry trampoline, but indicate that
- // this is an exit. In the future, we will introduce a new entry type that
- // differentiates between a normal exit and a tail exit, but we'd have to do
- // this and increment the version number for the header.
- pushq %rbp
- .cfi_def_cfa_offset 16
SAVE_REGISTERS
movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax
@@ -142,12 +142,11 @@ __xray_FunctionTailExit:
je .Ltmp4
movl %r10d, %edi
- movl $1, %esi
+ movl $2, %esi
callq *%rax
.Ltmp4:
RESTORE_REGISTERS
- popq %rbp
retq
.Ltmp5:
.size __xray_FunctionTailExit, .Ltmp5-__xray_FunctionTailExit
@@ -160,8 +159,6 @@ __xray_FunctionTailExit:
.type __xray_ArgLoggerEntry,@function
__xray_ArgLoggerEntry:
.cfi_startproc
- pushq %rbp
- .cfi_def_cfa_offset 16
SAVE_REGISTERS
// Again, these function pointer loads must be atomic; MOV is fine.
@@ -175,16 +172,60 @@ __xray_ArgLoggerEntry:
je .Larg1entryFail
.Larg1entryLog:
- movq %rdi, %rdx // first argument will become the third
- xorq %rsi, %rsi // XRayEntryType::ENTRY into the second
- movl %r10d, %edi // 32-bit function ID becomes the first
+
+ // First argument will become the third
+ movq %rdi, %rdx
+
+ // XRayEntryType::LOG_ARGS_ENTRY into the second
+ mov $0x3, %esi
+
+ // 32-bit function ID becomes the first
+ movl %r10d, %edi
callq *%rax
.Larg1entryFail:
RESTORE_REGISTERS
- popq %rbp
retq
.Larg1entryEnd:
.size __xray_ArgLoggerEntry, .Larg1entryEnd-__xray_ArgLoggerEntry
.cfi_endproc
+
+//===----------------------------------------------------------------------===//
+
+ .global __xray_CustomEvent
+ .align 16, 0x90
+ .type __xray_CustomEvent,@function
+__xray_CustomEvent:
+ .cfi_startproc
+ SAVE_REGISTERS
+
+ // We take two arguments to this trampoline, which should be in rdi and rsi
+ // already. We also make sure that we stash %rax because we use that register
+ // to call the logging handler.
+ movq _ZN6__xray22XRayPatchedCustomEventE(%rip), %rax
+ testq %rax,%rax
+ je .LcustomEventCleanup
+
+ // At this point we know that rdi and rsi already have the data, so we just
+ // call the logging handler, after aligning the stack to a 16-byte boundary.
+ // The approach we're taking here uses additional stack space to stash the
+ // stack pointer twice before aligning it: whether the stack started out
+ // 16-byte or only 8-byte aligned, one copy of the original stack pointer
+ // always ends up at 8(%rsp) after the andq, so we can restore it from there
+ // once the handler returns.
+ pushq %rsp
+ pushq (%rsp)
+ andq $-0x10, %rsp
+ callq *%rax
+ movq 8(%rsp), %rsp
+
+.LcustomEventCleanup:
+ RESTORE_REGISTERS
+ retq
+
+.Ltmp8:
+ .size __xray_CustomEvent, .Ltmp8-__xray_CustomEvent
+ .cfi_endproc
+
+NO_EXEC_STACK_DIRECTIVE
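
For context on the %esi immediates in the trampolines above: they forward
control to whatever handler is stored in __xray::XRayPatchedFunction, passing
the 32-bit function ID in %edi and an XRayEntryType value in %esi (ENTRY = 0
for __xray_FunctionEntry, EXIT = 1 for __xray_FunctionExit, and, after this
change, TAIL = 2 for __xray_FunctionTailExit). The following is a minimal
consumer-side sketch, not part of this commit, assuming the public API declared
in compiler-rt's include/xray/xray_interface.h (__xray_set_handler,
__xray_patch, and friends):

#include <cstdint>
#include <cstdio>

#include "xray/xray_interface.h"  // XRayEntryType, __xray_set_handler, ...

// Handler reached through XRayPatchedFunction: FuncId arrives in %edi and
// Type in %esi, exactly as the trampolines set them up.
void SimpleHandler(int32_t FuncId, XRayEntryType Type) {
  std::fprintf(stderr, "xray: function %d, entry type %d\n", FuncId,
               static_cast<int>(Type));
}

int main() {
  __xray_set_handler(SimpleHandler);  // stored into XRayPatchedFunction
  __xray_patch();                     // rewrite the sleds to call the trampolines
  // ... run instrumented code; SimpleHandler sees ENTRY/EXIT/TAIL events ...
  __xray_unpatch();
  __xray_remove_handler();
  return 0;
}

Compiled with -fxray-instrument (and typically a low
-fxray-instruction-threshold so that small functions also get sleds), the
instrumented functions route through the patched sleds into these trampolines.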
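
__xray_ArgLoggerEntry now reports XRayEntryType::LOG_ARGS_ENTRY (3) rather than
reusing ENTRY, and forwards the instrumented function's first argument to the
handler as a third parameter. A sketch of that path, assuming xray_interface.h
also declares __xray_set_handler_arg1 and that Clang's xray_log_args attribute
is available (the function and handler names here are illustrative):

#include <cstdint>
#include <cstdio>

#include "xray/xray_interface.h"

// Arg1 handler: the trampoline passes the function ID, LOG_ARGS_ENTRY, and the
// callee's first argument (the original %rdi, moved into %rdx before the call).
void Arg1Handler(int32_t FuncId, XRayEntryType Type, uint64_t Arg1) {
  std::fprintf(stderr, "xray: function %d, type %d, arg1 %llu\n", FuncId,
               static_cast<int>(Type),
               static_cast<unsigned long long>(Arg1));
}

// xray_log_args(1) asks the compiler to emit the arg1-logging sled, which is
// what routes control through __xray_ArgLoggerEntry.
[[clang::xray_always_instrument, clang::xray_log_args(1)]]
void ProcessRequest(uint64_t RequestId) {
  (void)RequestId;  // real work elided
}

int main() {
  __xray_set_handler_arg1(Arg1Handler);
  __xray_patch();
  ProcessRequest(42);
  return 0;
}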
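
The new __xray_CustomEvent trampoline calls whatever is stored in
__xray::XRayPatchedCustomEvent, leaving the event buffer in %rdi and its length
in %rsi untouched. A sketch of the consumer side, assuming a Clang that
provides the __xray_customevent builtin and an xray_interface.h that declares
__xray_set_customevent_handler with a void(void *, std::size_t) handler type
(both are assumptions about the surrounding patches, not shown in this diff):

#include <cstddef>
#include <cstdio>

#include "xray/xray_interface.h"

// Custom-event handler: receives the buffer and length that were passed to
// __xray_customevent.
void EventHandler(void *Buffer, std::size_t Size) {
  std::fwrite(Buffer, 1, Size, stderr);
  std::fputc('\n', stderr);
}

[[clang::xray_always_instrument]]
void DoWork() {
  static const char Msg[] = "did some work";
  // Lowered by the compiler to a custom-event sled, which (once patched) jumps
  // into __xray_CustomEvent above.
  __xray_customevent(Msg, sizeof(Msg) - 1);
}

int main() {
  __xray_set_customevent_handler(EventHandler);
  __xray_patch();
  DoWork();
  return 0;
}

The pushq %rsp / pushq (%rsp) / andq $-0x10, %rsp sequence in the trampoline
ensures the handler runs on a 16-byte aligned stack, and a stashed copy of the
old stack pointer can always be reloaded from 8(%rsp) afterwards.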