summaryrefslogtreecommitdiff
path: root/sysdeps/aarch64/dl-machine.h
diff options
context:
space:
mode:
authorAdhemerval Zanella <adhemerval.zanella@linaro.org>2021-08-04 15:30:56 +0000
committerAdhemerval Zanella <adhemerval.zanella@linaro.org>2021-09-10 15:07:38 -0300
commitc8315ccd30fcecc1b93a9bc3f073010190a86e05 (patch)
tree5f8cc1af7a951206e25b72eeca7875561f024a4f /sysdeps/aarch64/dl-machine.h
parent171fdd4bd4f337001db053721477add60d205ed8 (diff)
downloadglibc-c8315ccd30fcecc1b93a9bc3f073010190a86e05.tar.gz
elf: Add SVE support for aarch64 rtld-audit (azanella/ld-audit-fixes)
To implement this, lazy binding is enabled when profiling or auditing is used, even when STO_AARCH64_VARIANT_PCS is set. Also, to avoid incurring performance penalties on architectures without SVE, the PLT entrypoint is set to a newer one, _dl_runtime_profile_sve, which is used iff 'hwcap' has the HWCAP_SVE bit set. This should be a fair assumption since SVE has a defined set of registers for argument passing and return values. A new ABI with either different argument passing or different registers would require a different PLT entry, but I assume this would require another symbol flag anyway (or at least a different ELF mark to indicate so). The '_dl_runtime_profile_sve' profile entrypoint assumes the largest SVE register size possible (2048 bits) and thus it requires a quite large stack (8976 bytes). I think it would be possible to make the stack requirement dynamic depending on the vector length, but it would make the PLT audit function way more complex. It extends La_aarch64_vector with a long double pointer to a stack-allocated buffer to hold the SVE Z register, along with a pointer to hold the P registers on La_aarch64_regs. It means that if 'lr_sve' is 0 in either La_aarch64_regs or La_aarch64_retval, the La_aarch64_vector contains the floating-point registers, which can be accessed directly (non-SVE hardware). Otherwise, 'La_aarch64_vector.z' points to a memory area that holds up to 'lr_sve' bytes for the Z registers, which can be loaded with the svld1 intrinsic for instance (as tst-audit28.c does). The P registers follow the same logic, with each La_aarch64_regs.lr_sve_pregs pointing to an area of memory 'lr_sve/8' in size.
So, to access the FP register as float you can use: static inline float regs_vec_to_float (const La_aarch64_regs *regs, int idx) { float r; if (regs->lr_sve == 0) r = regs->lr_vreg[idx].s; else memcpy (&r, &regs->lr_vreg[idx].z[0], sizeof (r)); return r; } This patch is not complete yet: tst-audit28 does not check whether the compiler supports SVE (we would need a configure check to disable it in that case), I need to add a proper comment for the _dl_runtime_profile_sve stack layout, and the test needs to check for P register state clobbering. I also haven't checked the performance penalties of this approach, and the way I am saving/restoring the SVE registers could probably be optimized. In any case, I checked on an SVE machine and at least the testcase works as expected without any regressions. I also did a sniff test on a non-SVE machine.
Diffstat (limited to 'sysdeps/aarch64/dl-machine.h')
-rw-r--r--sysdeps/aarch64/dl-machine.h14
1 files changed, 12 insertions, 2 deletions
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 3e10cb462f..38fa07f111 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -66,6 +66,9 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
ElfW(Addr) *got;
extern void _dl_runtime_resolve (ElfW(Word));
extern void _dl_runtime_profile (ElfW(Word));
+#if HAVE_AARCH64_SVE_ASM
+ extern void _dl_runtime_profile_sve (ElfW(Word));
+#endif
got = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]);
if (got[1])
@@ -82,7 +85,12 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
end in this function. */
if ( profile)
{
- got[2] = (ElfW(Addr)) &_dl_runtime_profile;
+#if HAVE_AARCH64_SVE_ASM
+ if (GLRO(dl_hwcap) & HWCAP_SVE)
+ got[2] = (ElfW(Addr)) &_dl_runtime_profile_sve;
+ else
+#endif
+ got[2] = (ElfW(Addr)) &_dl_runtime_profile;
if (GLRO(dl_profile) != NULL
&& _dl_name_match_p (GLRO(dl_profile), l))
@@ -382,6 +390,7 @@ __attribute__ ((always_inline))
elf_machine_lazy_rel (struct link_map *map,
ElfW(Addr) l_addr,
const ElfW(Rela) *reloc,
+ int profile,
int skip_ifunc)
{
ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset);
@@ -389,7 +398,8 @@ elf_machine_lazy_rel (struct link_map *map,
/* Check for unexpected PLT reloc type. */
if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))
{
- if (__glibc_unlikely (map->l_info[DT_AARCH64 (VARIANT_PCS)] != NULL))
+ if (__glibc_unlikely (map->l_info[DT_AARCH64 (VARIANT_PCS)] != NULL)
+ && profile == 0)
{
/* Check the symbol table for variant PCS symbols. */
const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);