summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSzabolcs Nagy <szabolcs.nagy@arm.com>2021-02-16 12:55:13 +0000
committerSzabolcs Nagy <szabolcs.nagy@arm.com>2021-04-13 08:43:40 +0100
commitb116855de71098ef7dd2875dd3237f8f3ecc12c2 (patch)
tree6da94b9ca548a5042d743626fdbe1990fa5221a2
parentf8ea2b9982e39fd950d157f5dba31121ceb51df3 (diff)
downloadglibc-nsz/bug19329-v2.tar.gz
RFC elf: Fix slow tls access after dlopen [BZ #19924]nsz/bug19329-v2
In short: __tls_get_addr checks the global generation counter, _dl_update_slotinfo updates up to the generation of the accessed module. If the global generation is newer than geneneration of the module then __tls_get_addr keeps hitting the slow path that updates the dtv. Possible approaches i can see: 1. update to global generation instead of module, 2. check the module generation in the fast path. This patch is 1.: it needs additional sync (load acquire) so the slotinfo list is up to date with the observed global generation. Approach 2. would require walking the slotinfo list at all times. I don't know how to make that fast with many modules. Note: in the x86_64 version of dl-tls.c the generation is only loaded once, since relaxed mo is not faster than acquire mo load. I have not benchmarked this yet.
-rw-r--r--elf/dl-close.c2
-rw-r--r--elf/dl-open.c8
-rw-r--r--elf/dl-reloc.c5
-rw-r--r--elf/dl-tls.c28
-rw-r--r--sysdeps/generic/ldsodefs.h3
-rw-r--r--sysdeps/x86_64/dl-tls.c4
6 files changed, 23 insertions, 27 deletions
diff --git a/elf/dl-close.c b/elf/dl-close.c
index 9f31532f41..45f8a7fe31 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -780,7 +780,7 @@ _dl_close_worker (struct link_map *map, bool force)
if (__glibc_unlikely (newgen == 0))
_dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n");
/* Can be read concurrently. */
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+ atomic_store_release (&GL(dl_tls_generation), newgen);
if (tls_free_end == GL(dl_tls_static_used))
GL(dl_tls_static_used) = tls_free_start;
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 661f26977e..5b9816e4e8 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -400,7 +400,7 @@ update_tls_slotinfo (struct link_map *new)
_dl_fatal_printf (N_("\
TLS generation counter wrapped! Please report this."));
/* Can be read concurrently. */
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+ atomic_store_release (&GL(dl_tls_generation), newgen);
/* We need a second pass for static tls data, because
_dl_update_slotinfo must not be run while calls to
@@ -417,8 +417,8 @@ TLS generation counter wrapped! Please report this."));
now, but we can delay updating the DTV. */
imap->l_need_tls_init = 0;
#ifdef SHARED
- /* Update the slot information data for at least the
- generation of the DSO we are allocating data for. */
+ /* Update the slot information data for the current
+ generation. */
/* FIXME: This can terminate the process on memory
allocation failure. It is not possible to raise
@@ -426,7 +426,7 @@ TLS generation counter wrapped! Please report this."));
_dl_update_slotinfo would have to be split into two
operations, similar to resize_scopes and update_scopes
above. This is related to bug 16134. */
- _dl_update_slotinfo (imap->l_tls_modid);
+ _dl_update_slotinfo (imap->l_tls_modid, newgen);
#endif
GL(dl_init_static_tls) (imap);
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
index c2df26deea..427669d769 100644
--- a/elf/dl-reloc.c
+++ b/elf/dl-reloc.c
@@ -111,11 +111,10 @@ _dl_try_allocate_static_tls (struct link_map *map, bool optional)
if (map->l_real->l_relocated)
{
#ifdef SHARED
+// TODO: it is not clear why we need to update the DTV here, add comment
if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation),
0))
- /* Update the slot information data for at least the generation of
- the DSO we are allocating data for. */
- (void) _dl_update_slotinfo (map->l_tls_modid);
+ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation));
#endif
GL(dl_init_static_tls) (map);
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index b0257185e9..b51a4f3a19 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -701,7 +701,7 @@ allocate_and_init (struct link_map *map)
struct link_map *
-_dl_update_slotinfo (unsigned long int req_modid)
+_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
{
struct link_map *the_map = NULL;
dtv_t *dtv = THREAD_DTV ();
@@ -718,19 +718,12 @@ _dl_update_slotinfo (unsigned long int req_modid)
code and therefore add to the slotinfo list. This is a problem
since we must not pick up any information about incomplete work.
The solution to this is to ignore all dtv slots which were
- created after the one we are currently interested. We know that
- dynamic loading for this module is completed and this is the last
- load operation we know finished. */
- unsigned long int idx = req_modid;
+ created after the generation we are interested in. We know that
+ dynamic loading for this generation is completed and this is the
+ last load operation we know finished. */
struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
- while (idx >= listp->len)
- {
- idx -= listp->len;
- listp = listp->next;
- }
-
- if (dtv[0].counter < listp->slotinfo[idx].gen)
+ if (dtv[0].counter < new_gen)
{
/* CONCURRENCY NOTES:
@@ -751,7 +744,6 @@ _dl_update_slotinfo (unsigned long int req_modid)
other entries are racy. However updating a non-relevant dtv
entry does not affect correctness. For a relevant module m,
max_modid >= modid of m. */
- size_t new_gen = listp->slotinfo[idx].gen;
size_t total = 0;
size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
assert (max_modid >= req_modid);
@@ -894,9 +886,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)
static struct link_map *
__attribute_noinline__
-update_get_addr (GET_ADDR_ARGS)
+update_get_addr (GET_ADDR_ARGS, size_t gen)
{
- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE);
+ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen);
dtv_t *dtv = THREAD_DTV ();
void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -931,7 +923,11 @@ __tls_get_addr (GET_ADDR_ARGS)
by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */
size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
- return update_get_addr (GET_ADDR_PARAM);
+ {
+// TODO: needs comment update if we rely on consistent generation with slotinfo
+ gen = atomic_load_acquire (&GL(dl_tls_generation));
+ return update_get_addr (GET_ADDR_PARAM, gen);
+ }
void *p = dtv[GET_ADDR_MODULE].pointer.val;
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index ea3f7a69d0..614463f016 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1224,7 +1224,8 @@ extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add)
/* Update slot information data for at least the generation of the
module with the given index. */
-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
+extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
+ size_t gen)
attribute_hidden;
/* Look up the module's TLS block as for __tls_get_addr,
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index 24ef560b71..4ded8dd6b9 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
{
dtv_t *dtv = THREAD_DTV ();
- size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+ size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
- return update_get_addr (GET_ADDR_PARAM);
+ return update_get_addr (GET_ADDR_PARAM, gen);
return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
}