Diffstat (limited to 'rts/sm')
-rw-r--r--  rts/sm/BlockAlloc.c  |  15
-rw-r--r--  rts/sm/CNF.c         |  16
-rw-r--r--  rts/sm/Compact.c     |  26
-rw-r--r--  rts/sm/Compact.h     |  12
-rw-r--r--  rts/sm/Evac.c        | 130
-rw-r--r--  rts/sm/GC.c          | 232
-rw-r--r--  rts/sm/GC.h          |  10
-rw-r--r--  rts/sm/GCThread.h    |   6
-rw-r--r--  rts/sm/GCUtils.c     |  31
-rw-r--r--  rts/sm/GCUtils.h     |   6
-rw-r--r--  rts/sm/MarkWeak.c    |  17
-rw-r--r--  rts/sm/OSMem.h       |   1
-rw-r--r--  rts/sm/Sanity.c      |  23
-rw-r--r--  rts/sm/Scav.c        | 217
-rw-r--r--  rts/sm/Storage.c     | 126
15 files changed, 482 insertions, 386 deletions
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index 2a02ecc9c5..bbb4f8a6c1 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -210,6 +210,12 @@ void recordFreedBlocks(uint32_t node, uint32_t n) Allocation -------------------------------------------------------------------------- */ +STATIC_INLINE bdescr * +tail_of (bdescr *bd) +{ + return bd + bd->blocks - 1; +} + STATIC_INLINE void initGroup(bdescr *head) { @@ -223,7 +229,7 @@ initGroup(bdescr *head) // mblocks don't have bdescrs; freeing these is handled in a // different way by free_mblock_group(). if (head->blocks > 1 && head->blocks <= BLOCKS_PER_MBLOCK) { - bdescr *last = head + head->blocks-1; + bdescr *last = tail_of(head); last->blocks = 0; last->link = head; } @@ -285,13 +291,6 @@ free_list_insert (uint32_t node, bdescr *bd) dbl_link_onto(bd, &free_list[node][ln]); } - -STATIC_INLINE bdescr * -tail_of (bdescr *bd) -{ - return bd + bd->blocks - 1; -} - // After splitting a group, the last block of each group must have a // tail that points to the head block, to keep our invariants for // coalescing. diff --git a/rts/sm/CNF.c b/rts/sm/CNF.c index c12f53a120..6bc58cde75 100644 --- a/rts/sm/CNF.c +++ b/rts/sm/CNF.c @@ -722,14 +722,14 @@ verify_consistency_block (StgCompactNFData *str, StgCompactNFDataBlock *block) p += arr_words_sizeW((StgArrBytes*)p); break; - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: verify_mut_arr_ptrs(str, (StgMutArrPtrs*)p); p += mut_arr_ptrs_sizeW((StgMutArrPtrs*)p); break; - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: { uint32_t i; StgSmallMutArrPtrs *arr = (StgSmallMutArrPtrs*)p; @@ -969,14 +969,14 @@ fixup_block(StgCompactNFDataBlock *block, StgWord *fixup_table, uint32_t count) p += arr_words_sizeW((StgArrBytes*)p); break; - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: fixup_mut_arr_ptrs(fixup_table, count, (StgMutArrPtrs*)p); p += mut_arr_ptrs_sizeW((StgMutArrPtrs*)p); break; - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: { uint32_t i; StgSmallMutArrPtrs *arr = (StgSmallMutArrPtrs*)p; diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c index 0e2fea8990..004e042069 100644 --- a/rts/sm/Compact.c +++ b/rts/sm/Compact.c @@ -25,7 +25,8 @@ #include "Trace.h" #include "Weak.h" #include "MarkWeak.h" -#include "Stable.h" +#include "StablePtr.h" +#include "StableName.h" // Turn off inlining when debugging - it obfuscates things #if defined(DEBUG) @@ -212,7 +213,7 @@ thread_static( StgClosure* p ) p = *THUNK_STATIC_LINK(p); continue; case FUN_STATIC: - p = *FUN_STATIC_LINK(p); + p = *STATIC_LINK(info,p); continue; case CONSTR: case CONSTR_NOCAF: @@ -482,8 +483,8 @@ update_fwd_large( bdescr *bd ) case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgMutArrPtrs *a; @@ -497,8 +498,8 @@ update_fwd_large( bdescr *bd ) case SMALL_MUT_ARR_PTRS_CLEAN: case SMALL_MUT_ARR_PTRS_DIRTY: - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgSmallMutArrPtrs *a; @@ -682,8 +683,8 @@ thread_obj 
(const StgInfoTable *info, StgPtr p) case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgMutArrPtrs *a; @@ -698,8 +699,8 @@ thread_obj (const StgInfoTable *info, StgPtr p) case SMALL_MUT_ARR_PTRS_CLEAN: case SMALL_MUT_ARR_PTRS_DIRTY: - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgSmallMutArrPtrs *a; @@ -1000,7 +1001,10 @@ compact(StgClosure *static_objects) thread_static(static_objects /* ToDo: ok? */); // the stable pointer table - threadStableTables((evac_fn)thread_root, NULL); + threadStablePtrTable((evac_fn)thread_root, NULL); + + // the stable name table + threadStableNameTable((evac_fn)thread_root, NULL); // the CAF list (used by GHCi) markCAFs((evac_fn)thread_root, NULL); diff --git a/rts/sm/Compact.h b/rts/sm/Compact.h index 6dcb50b1aa..63abfc7180 100644 --- a/rts/sm/Compact.h +++ b/rts/sm/Compact.h @@ -20,8 +20,8 @@ mark(StgPtr p, bdescr *bd) { uint32_t offset_within_block = p - bd->start; // in words StgPtr bitmap_word = (StgPtr)bd->u.bitmap + - (offset_within_block / (sizeof(W_)*BITS_PER_BYTE)); - StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1)); + (offset_within_block / BITS_IN(W_)); + StgWord bit_mask = (StgWord)1 << (offset_within_block & (BITS_IN(W_) - 1)); *bitmap_word |= bit_mask; } @@ -30,8 +30,8 @@ unmark(StgPtr p, bdescr *bd) { uint32_t offset_within_block = p - bd->start; // in words StgPtr bitmap_word = (StgPtr)bd->u.bitmap + - (offset_within_block / (sizeof(W_)*BITS_PER_BYTE)); - StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1)); + (offset_within_block / BITS_IN(W_)); + StgWord bit_mask = (StgWord)1 << (offset_within_block & (BITS_IN(W_) - 1)); *bitmap_word &= ~bit_mask; } @@ -40,8 +40,8 @@ is_marked(StgPtr p, bdescr *bd) { uint32_t offset_within_block = p - bd->start; // in words StgPtr bitmap_word = (StgPtr)bd->u.bitmap + - (offset_within_block / (sizeof(W_)*BITS_PER_BYTE)); - StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1)); + (offset_within_block / BITS_IN(W_)); + StgWord bit_mask = (StgWord)1 << (offset_within_block & (BITS_IN(W_)- 1)); return (*bitmap_word & bit_mask); } diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index fb1af0f692..289031945d 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -28,10 +28,6 @@ #include "CNF.h" #include "Scav.h" -#if defined(PROF_SPIN) && defined(THREADED_RTS) && defined(PARALLEL_GC) -StgWord64 whitehole_spin = 0; -#endif - #if defined(THREADED_RTS) && !defined(PARALLEL_GC) #define evacuate(p) evacuate1(p) #define evacuate_BLACKHOLE(p) evacuate_BLACKHOLE1(p) @@ -47,7 +43,7 @@ StgWord64 whitehole_spin = 0; */ #define MAX_THUNK_SELECTOR_DEPTH 16 -static void eval_thunk_selector (StgClosure **q, StgSelector * p, bool); +static void eval_thunk_selector (StgClosure **q, StgSelector *p, bool); STATIC_INLINE void evacuate_large(StgPtr p); /* ----------------------------------------------------------------------------- @@ -197,8 +193,9 @@ spin: info = xchg((StgPtr)&src->header.info, (W_)&stg_WHITEHOLE_info); if (info == (W_)&stg_WHITEHOLE_info) { #if defined(PROF_SPIN) - whitehole_spin++; + whitehole_gc_spin++; #endif + busy_wait_nop(); goto spin; } if (IS_FORWARDING_PTR(info)) { @@ -281,14 +278,7 @@ evacuate_large(StgPtr p) } // remove from large_object 
list - if (bd->u.back) { - bd->u.back->link = bd->link; - } else { // first object in the list - gen->large_objects = bd->link; - } - if (bd->link) { - bd->link->u.back = bd->u.back; - } + dbl_link_remove(bd, &gen->large_objects); /* link it on to the evacuated large object list of the destination gen */ @@ -417,14 +407,7 @@ evacuate_compact (StgPtr p) } // remove from compact_objects list - if (bd->u.back) { - bd->u.back->link = bd->link; - } else { // first object in the list - gen->compact_objects = bd->link; - } - if (bd->link) { - bd->link->u.back = bd->u.back; - } + dbl_link_remove(bd, &gen->compact_objects); /* link it on to the evacuated compact object list of the destination gen */ @@ -539,14 +522,14 @@ loop: switch (info->type) { case THUNK_STATIC: - if (info->srt_bitmap != 0) { + if (info->srt != 0) { evacuate_static_object(THUNK_STATIC_LINK((StgClosure *)q), q); } return; case FUN_STATIC: - if (info->srt_bitmap != 0) { - evacuate_static_object(FUN_STATIC_LINK((StgClosure *)q), q); + if (info->srt != 0 || info->layout.payload.ptrs != 0) { + evacuate_static_object(STATIC_LINK(info,(StgClosure *)q), q); } return; @@ -707,9 +690,6 @@ loop: case THUNK_1_1: case THUNK_2_0: case THUNK_0_2: -#if defined(NO_PROMOTE_THUNKS) -#error bitrotted -#endif copy(p,info,q,sizeofW(StgThunk)+2,gen_no); return; @@ -753,6 +733,19 @@ loop: copy(p,info,q,sizeofW(StgInd),gen_no); return; } + // Note [BLACKHOLE pointing to IND] + // + // BLOCKING_QUEUE can be overwritten by IND (see + // wakeBlockingQueue()). However, when this happens we must + // be updating the BLACKHOLE, so the BLACKHOLE's indirectee + // should now point to the value. + // + // The mutator might observe an inconsistent state, because + // the writes are happening in another thread, so it's + // possible for the mutator to follow an indirectee and find + // an IND. But this should never happen in the GC, because + // the mutators are all stopped and the writes have + // completed. ASSERT(i != &stg_IND_info); } q = r; @@ -818,16 +811,16 @@ loop: case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // just copy the block copy(p,info,q,mut_arr_ptrs_sizeW((StgMutArrPtrs *)q),gen_no); return; case SMALL_MUT_ARR_PTRS_CLEAN: case SMALL_MUT_ARR_PTRS_DIRTY: - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // just copy the block copy(p,info,q,small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs *)q),gen_no); return; @@ -898,9 +891,16 @@ evacuate_BLACKHOLE(StgClosure **p) bd = Bdescr((P_)q); - // blackholes can't be in a compact, or large - ASSERT((bd->flags & (BF_COMPACT | BF_LARGE)) == 0); + // blackholes can't be in a compact + ASSERT((bd->flags & BF_COMPACT) == 0); + // blackholes *can* be in a large object: when raiseAsync() creates an + // AP_STACK the payload might be large enough to create a large object. + // See #14497. + if (bd->flags & BF_LARGE) { + evacuate_large((P_)q); + return; + } if (bd->flags & BF_EVACUATED) { if (bd->gen_no < gct->evac_gen_no) { gct->failed_to_evac = true; @@ -934,23 +934,34 @@ evacuate_BLACKHOLE(StgClosure **p) copy(p,info,q,sizeofW(StgInd),gen_no); } -/* ----------------------------------------------------------------------------- - Evaluate a THUNK_SELECTOR if possible. 
+/* ---------------------------------------------------------------------------- + Update a chain of thunk selectors with the given value. All selectors in the + chain become IND pointing to the value, except when there is a loop (i.e. + the value of a THUNK_SELECTOR is the THUNK_SELECTOR itself), in that case we + leave the selector as-is. + + p is the current selector to update. In eval_thunk_selector we make a list + from selectors using ((StgThunk*)p)->payload[0] for the link field and use + that field to traverse the chain here. + + val is the final value of the selector chain. + + A chain is formed when we've got something like: - p points to a THUNK_SELECTOR that we want to evaluate. The - result of "evaluating" it will be evacuated and a pointer to the - to-space closure will be returned. + let x = C1 { f1 = e1 } + y = C2 { f2 = f1 x } + z = f2 y - If the THUNK_SELECTOR could not be evaluated (its selectee is still - a THUNK, for example), then the THUNK_SELECTOR itself will be - evacuated. + Here the chain (p) we get when evacuating z is: + + [ f2 y, f1 x ] + + and val is e1. -------------------------------------------------------------------------- */ + static void unchain_thunk_selectors(StgSelector *p, StgClosure *val) { - StgSelector *prev; - - prev = NULL; while (p) { ASSERT(p->header.info == &stg_WHITEHOLE_info); @@ -960,7 +971,7 @@ unchain_thunk_selectors(StgSelector *p, StgClosure *val) // not evacuate it), so in this case val is in from-space. // ASSERT(!HEAP_ALLOCED_GC(val) || Bdescr((P_)val)->gen_no > N || (Bdescr((P_)val)->flags & BF_EVACUATED)); - prev = (StgSelector*)((StgClosure *)p)->payload[0]; + StgSelector *prev = (StgSelector*)((StgClosure *)p)->payload[0]; // Update the THUNK_SELECTOR with an indirection to the // value. The value is still in from-space at this stage. @@ -997,8 +1008,18 @@ unchain_thunk_selectors(StgSelector *p, StgClosure *val) } } +/* ----------------------------------------------------------------------------- + Evaluate a THUNK_SELECTOR if possible. + + p points to a THUNK_SELECTOR that we want to evaluate. + + If the THUNK_SELECTOR could not be evaluated (its selectee is still a THUNK, + for example), then the THUNK_SELECTOR itself will be evacuated depending on + the evac parameter. + -------------------------------------------------------------------------- */ + static void -eval_thunk_selector (StgClosure **q, StgSelector * p, bool evac) +eval_thunk_selector (StgClosure **q, StgSelector *p, bool evac) // NB. for legacy reasons, p & q are swapped around :( { uint32_t field; @@ -1007,7 +1028,6 @@ eval_thunk_selector (StgClosure **q, StgSelector * p, bool evac) StgClosure *selectee; StgSelector *prev_thunk_selector; bdescr *bd; - StgClosure *val; prev_thunk_selector = NULL; // this is a chain of THUNK_SELECTORs that we are going to update @@ -1057,9 +1077,14 @@ selector_chain: // In threaded mode, we'll use WHITEHOLE to lock the selector // thunk while we evaluate it. { - do { + while(true) { info_ptr = xchg((StgPtr)&p->header.info, (W_)&stg_WHITEHOLE_info); - } while (info_ptr == (W_)&stg_WHITEHOLE_info); + if (info_ptr != (W_)&stg_WHITEHOLE_info) { break; } +#if defined(PROF_SPIN) + ++whitehole_gc_spin; +#endif + busy_wait_nop(); + } // make sure someone else didn't get here first... 
if (IS_FORWARDING_PTR(info_ptr) || @@ -1127,7 +1152,7 @@ selector_loop: info->layout.payload.nptrs)); // Select the right field from the constructor - val = selectee->payload[field]; + StgClosure *val = selectee->payload[field]; #if defined(PROFILING) // For the purposes of LDV profiling, we have destroyed @@ -1159,6 +1184,8 @@ selector_loop: val = ((StgInd *)val)->indirectee; goto val_loop; case THUNK_SELECTOR: + // Use payload to make a list of thunk selectors, to be + // used in unchain_thunk_selectors ((StgClosure*)p)->payload[0] = (StgClosure *)prev_thunk_selector; prev_thunk_selector = p; p = (StgSelector*)val; @@ -1273,5 +1300,4 @@ bale_out: copy(q,(const StgInfoTable *)info_ptr,(StgClosure *)p,THUNK_SELECTOR_sizeW(),bd->dest_no); } unchain_thunk_selectors(prev_thunk_selector, *q); - return; } diff --git a/rts/sm/GC.c b/rts/sm/GC.c index aa804a8b76..70d6d8efe5 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -28,6 +28,7 @@ #include "Sparks.h" #include "Sweep.h" +#include "Arena.h" #include "Storage.h" #include "RtsUtils.h" #include "Apply.h" @@ -45,9 +46,15 @@ #include "RetainerProfile.h" #include "LdvProfile.h" #include "RaiseAsync.h" -#include "Stable.h" +#include "StableName.h" +#include "StablePtr.h" #include "CheckUnload.h" #include "CNF.h" +#include "RtsFlags.h" + +#if defined(PROFILING) +#include "RetainerProfile.h" +#endif #include <string.h> // for memset() #include <unistd.h> @@ -89,6 +96,8 @@ * * We build up a static object list while collecting generations 0..N, * which is then appended to the static object list of generation N+1. + * + * See also: Note [STATIC_LINK fields] in Storage.h. */ /* N is the oldest generation being collected, where the generations @@ -112,8 +121,6 @@ uint32_t mutlist_MUTVARS, mutlist_TVAR_WATCH_QUEUE, mutlist_TREC_CHUNK, mutlist_TREC_HEADER, - mutlist_ATOMIC_INVARIANT, - mutlist_INVARIANT_CHECK_QUEUE, mutlist_OTHERS; #endif @@ -122,7 +129,9 @@ uint32_t mutlist_MUTVARS, gc_thread **gc_threads = NULL; #if !defined(THREADED_RTS) -StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]; +// Must be aligned to 64-bytes to meet stated 64-byte alignment of gen_workspace +StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)] + ATTRIBUTE_ALIGNED(64); #endif // Number of threads running in *this* GC. Affects how many @@ -132,6 +141,13 @@ uint32_t n_gc_threads; // For stats: static long copied; // *words* copied & scavenged during this GC +#if defined(PROF_SPIN) && defined(THREADED_RTS) +// spin and yield counts for the quasi-SpinLock in waitForGcThreads +volatile StgWord64 waitForGcThreads_spin = 0; +volatile StgWord64 waitForGcThreads_yield = 0; +volatile StgWord64 whitehole_gc_spin = 0; +#endif + bool work_stealing; uint32_t static_flag = STATIC_FLAG_B; @@ -188,7 +204,9 @@ GarbageCollect (uint32_t collect_gen, { bdescr *bd; generation *gen; - StgWord live_blocks, live_words, par_max_copied, par_balanced_copied; + StgWord live_blocks, live_words, par_max_copied, par_balanced_copied, + gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield, + any_work, no_work, scav_find_work; #if defined(THREADED_RTS) gc_thread *saved_gct; #endif @@ -221,8 +239,9 @@ GarbageCollect (uint32_t collect_gen, // tell the stats department that we've started a GC stat_startGC(cap, gct); - // lock the StablePtr table - stableLock(); + // Lock the StablePtr table. This prevents FFI calls manipulating + // the table from occurring during GC. 
+ stablePtrLock(); #if defined(DEBUG) mutlist_MUTVARS = 0; @@ -232,8 +251,6 @@ GarbageCollect (uint32_t collect_gen, mutlist_TVAR_WATCH_QUEUE = 0; mutlist_TREC_CHUNK = 0; mutlist_TREC_HEADER = 0; - mutlist_ATOMIC_INVARIANT = 0; - mutlist_INVARIANT_CHECK_QUEUE = 0; mutlist_OTHERS = 0; #endif @@ -385,17 +402,15 @@ GarbageCollect (uint32_t collect_gen, markScheduler(mark_root, gct); -#if defined(RTS_USER_SIGNALS) - // mark the signal handlers (signals should be already blocked) - markSignalHandlers(mark_root, gct); -#endif - // Mark the weak pointer list, and prepare to detect dead weak pointers. markWeakPtrList(); initWeakForGC(); // Mark the stable pointer table. - markStableTables(mark_root, gct); + markStablePtrTable(mark_root, gct); + + // Remember old stable name addresses. + rememberOldStableNameAddresses (); /* ------------------------------------------------------------------------- * Repeatedly scavenge all the areas we know about until there's no @@ -421,7 +436,7 @@ GarbageCollect (uint32_t collect_gen, shutdown_gc_threads(gct->thread_index, idle_cap); // Now see which stable names are still alive. - gcStableTables(); + gcStableNameTable(); #if defined(THREADED_RTS) if (n_gc_threads == 1) { @@ -461,32 +476,53 @@ GarbageCollect (uint32_t collect_gen, copied = 0; par_max_copied = 0; par_balanced_copied = 0; + gc_spin_spin = 0; + gc_spin_yield = 0; + mut_spin_spin = 0; + mut_spin_yield = 0; + any_work = 0; + no_work = 0; + scav_find_work = 0; { uint32_t i; uint64_t par_balanced_copied_acc = 0; + const gc_thread* thread; for (i=0; i < n_gc_threads; i++) { copied += gc_threads[i]->copied; } for (i=0; i < n_gc_threads; i++) { + thread = gc_threads[i]; if (n_gc_threads > 1) { debugTrace(DEBUG_gc,"thread %d:", i); - debugTrace(DEBUG_gc," copied %ld", gc_threads[i]->copied * sizeof(W_)); - debugTrace(DEBUG_gc," scanned %ld", gc_threads[i]->scanned * sizeof(W_)); - debugTrace(DEBUG_gc," any_work %ld", gc_threads[i]->any_work); - debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work); - debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work); + debugTrace(DEBUG_gc," copied %ld", + thread->copied * sizeof(W_)); + debugTrace(DEBUG_gc," scanned %ld", + thread->scanned * sizeof(W_)); + debugTrace(DEBUG_gc," any_work %ld", + thread->any_work); + debugTrace(DEBUG_gc," no_work %ld", + thread->no_work); + debugTrace(DEBUG_gc," scav_find_work %ld", + thread->scav_find_work); + +#if defined(THREADED_RTS) && defined(PROF_SPIN) + gc_spin_spin += thread->gc_spin.spin; + gc_spin_yield += thread->gc_spin.yield; + mut_spin_spin += thread->mut_spin.spin; + mut_spin_yield += thread->mut_spin.yield; +#endif + + any_work += thread->any_work; + no_work += thread->no_work; + scav_find_work += thread->scav_find_work; + + par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied); + par_balanced_copied_acc += + stg_min(n_gc_threads * gc_threads[i]->copied, copied); } - par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied); - par_balanced_copied_acc += - stg_min(n_gc_threads * gc_threads[i]->copied, copied); } - if (n_gc_threads == 1) { - par_max_copied = 0; - par_balanced_copied = 0; - } - else - { + if (n_gc_threads > 1) { // See Note [Work Balance] for an explanation of this computation par_balanced_copied = (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) / @@ -521,13 +557,11 @@ GarbageCollect (uint32_t collect_gen, copied += mut_list_size; debugTrace(DEBUG_gc, - "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d 
TREC_HEADERs, %d ATOMIC_INVARIANTs, %d INVARIANT_CHECK_QUEUEs, %d others)", + "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d TREC_HEADERs, %d others)", (unsigned long)(mut_list_size * sizeof(W_)), mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE, mutlist_TREC_CHUNK, mutlist_TREC_HEADER, - mutlist_ATOMIC_INVARIANT, - mutlist_INVARIANT_CHECK_QUEUE, mutlist_OTHERS); } @@ -701,15 +735,15 @@ GarbageCollect (uint32_t collect_gen, if (major_gc) { gcCAFs(); } #endif - // Update the stable pointer hash table. - updateStableTables(major_gc); + // Update the stable name hash table + updateStableNameTable(major_gc); // unlock the StablePtr table. Must be before scheduleFinalizers(), // because a finalizer may call hs_free_fun_ptr() or // hs_free_stable_ptr(), both of which access the StablePtr table. - stableUnlock(); + stablePtrUnlock(); - // Must be after stableUnlock(), because it might free stable ptrs. + // Must be after stablePtrUnlock(), because it might free stable ptrs. if (major_gc) { checkUnload (gct->scavenged_static_objects); } @@ -751,24 +785,51 @@ GarbageCollect (uint32_t collect_gen, ACQUIRE_SM_LOCK; if (major_gc) { - W_ need, got; - need = BLOCKS_TO_MBLOCKS(n_alloc_blocks); - got = mblocks_allocated; + W_ need_prealloc, need_live, need, got; + uint32_t i; + + need_live = 0; + for (i = 0; i < RtsFlags.GcFlags.generations; i++) { + need_live += genLiveBlocks(&generations[i]); + } + need_live = stg_max(RtsFlags.GcFlags.minOldGenSize, need_live); + + need_prealloc = 0; + for (i = 0; i < n_nurseries; i++) { + need_prealloc += nurseries[i].n_blocks; + } + need_prealloc += RtsFlags.GcFlags.largeAllocLim; + need_prealloc += countAllocdBlocks(exec_block); + need_prealloc += arenaBlocks(); +#if defined(PROFILING) + if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) { + need_prealloc = retainerStackBlocks(); + } +#endif + /* If the amount of data remains constant, next major GC we'll - require (F+1)*need. We leave (F+2)*need in order to reduce - repeated deallocation and reallocation. */ - need = (RtsFlags.GcFlags.oldGenFactor + 2) * need; + * require (F+1)*live + prealloc. We leave (F+2)*live + prealloc + * in order to reduce repeated deallocation and reallocation. #14702 + */ + need = need_prealloc + (RtsFlags.GcFlags.oldGenFactor + 2) * need_live; + + /* Also, if user set heap size, do not drop below it. + */ + need = stg_max(RtsFlags.GcFlags.heapSizeSuggestion, need); + /* But with a large nursery, the above estimate might exceed * maxHeapSize. A large resident set size might make the OS * kill this process, or swap unnecessarily. Therefore we * ensure that our estimate does not exceed maxHeapSize. */ if (RtsFlags.GcFlags.maxHeapSize != 0) { - W_ max = BLOCKS_TO_MBLOCKS(RtsFlags.GcFlags.maxHeapSize); - if (need > max) { - need = max; - } + need = stg_min(RtsFlags.GcFlags.maxHeapSize, need); } + + need = BLOCKS_TO_MBLOCKS(need); + + got = mblocks_allocated; + if (got > need) { returnMemoryToOS(got - need); } @@ -797,7 +858,9 @@ GarbageCollect (uint32_t collect_gen, // ok, GC over: tell the stats department what happened. 
stat_endGC(cap, gct, live_words, copied, live_blocks * BLOCK_SIZE_W - live_words /* slop */, - N, n_gc_threads, par_max_copied, par_balanced_copied); + N, n_gc_threads, par_max_copied, par_balanced_copied, + gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield, + any_work, no_work, scav_find_work); #if defined(RTS_USER_SIGNALS) if (RtsFlags.MiscFlags.install_signal_handlers) { @@ -825,11 +888,6 @@ static void heapOverflow(void) Initialise the gc_thread structures. -------------------------------------------------------------------------- */ -#define GC_THREAD_INACTIVE 0 -#define GC_THREAD_STANDING_BY 1 -#define GC_THREAD_RUNNING 2 -#define GC_THREAD_WAITING_TO_CONTINUE 3 - static void new_gc_thread (uint32_t n, gc_thread *t) { @@ -1132,6 +1190,9 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[]) const uint32_t me = cap->no; uint32_t i, j; bool retry = true; + Time t0, t1, t2; + + t0 = t1 = t2 = getProcessElapsedTime(); while(retry) { for (i=0; i < n_threads; i++) { @@ -1151,8 +1212,32 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[]) } } if (!retry) break; +#if defined(PROF_SPIN) + waitForGcThreads_yield++; +#endif yieldThread(); } + + t2 = getProcessElapsedTime(); + if (RtsFlags.GcFlags.longGCSync != 0 && + t2 - t1 > RtsFlags.GcFlags.longGCSync) { + /* call this every longGCSync of delay */ + rtsConfig.longGCSync(cap->no, t2 - t0); + t1 = t2; + } + if (retry) { +#if defined(PROF_SPIN) + // This is a bit strange, we'll get more yields than spins. + // I guess that means it's not a spin-lock at all, but these + // numbers are still useful (I think). + waitForGcThreads_spin++; +#endif + } + } + + if (RtsFlags.GcFlags.longGCSync != 0 && + t2 - t0 > RtsFlags.GcFlags.longGCSync) { + rtsConfig.longGCSyncEnd(t2 - t0); } } @@ -1324,7 +1409,7 @@ prepare_collected_gen (generation *gen) bdescr *bitmap_bdescr; StgWord *bitmap; - bitmap_size = gen->n_old_blocks * BLOCK_SIZE / (sizeof(W_)*BITS_PER_BYTE); + bitmap_size = gen->n_old_blocks * BLOCK_SIZE / BITS_IN(W_); if (bitmap_size > 0) { bitmap_bdescr = allocGroup((StgWord)BLOCK_ROUND_UP(bitmap_size) @@ -1342,7 +1427,7 @@ prepare_collected_gen (generation *gen) // block descriptor. for (bd=gen->old_blocks; bd != NULL; bd = bd->link) { bd->u.bitmap = bitmap; - bitmap += BLOCK_SIZE_W / (sizeof(W_)*BITS_PER_BYTE); + bitmap += BLOCK_SIZE_W / BITS_IN(W_); // Also at this point we set the BF_MARKED flag // for this block. The invariant is that @@ -1446,9 +1531,6 @@ collect_gct_blocks (void) take a global lock. Here we collect those blocks from the cap->pinned_object_blocks lists and put them on the main g0->large_object list. - - Returns: the number of words allocated this way, for stats - purposes. -------------------------------------------------------------------------- */ static void @@ -1744,8 +1826,8 @@ resize_nursery (void) Sanity code for CAF garbage collection. With DEBUG turned on, we manage a CAF list in addition to the SRT - mechanism. After GC, we run down the CAF list and blackhole any - CAFs which have been garbage collected. This means we get an error + mechanism. After GC, we run down the CAF list and make any + CAFs which have been garbage collected GCD_CAF. This means we get an error whenever the program tries to enter a garbage collected CAF. 
Any garbage collected CAFs are taken off the CAF list at the same @@ -1771,7 +1853,10 @@ static void gcCAFs(void) info = get_itbl((StgClosure*)p); ASSERT(info->type == IND_STATIC); - if (p->static_link == NULL) { + // See Note [STATIC_LINK fields] in Storage.h + // This condition identifies CAFs that have just been GC'd and + // don't have static_link==3 which means they should be ignored. + if ((((StgWord)(p->static_link)&STATIC_BITS) | prev_static_flag) != 3) { debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", p); SET_INFO((StgClosure*)p,&stg_GCD_CAF_info); // stub it if (prev == NULL) { @@ -1788,3 +1873,28 @@ static void gcCAFs(void) debugTrace(DEBUG_gccafs, "%d CAFs live", i); } #endif + + +/* ----------------------------------------------------------------------------- + The GC can leave some work for the mutator to do before the next + GC, provided the work can be safely overlapped with mutation. This + can help reduce the GC pause time. + + The mutator can call doIdleGCWork() any time it likes, but + preferably when it is idle. It's safe for multiple capabilities to + call doIdleGCWork(). + + When 'all' is + * false: doIdleGCWork() should only take a short, bounded, amount + of time. + * true: doIdleGCWork() will complete all the outstanding GC work. + + The return value is + * true if there's more to do (only if 'all' is false). + * false otherwise. + -------------------------------------------------------------------------- */ + +bool doIdleGCWork(Capability *cap STG_UNUSED, bool all) +{ + return runSomeFinalizers(all); +} diff --git a/rts/sm/GC.h b/rts/sm/GC.h index c6b0c13a46..437a25f8d9 100644 --- a/rts/sm/GC.h +++ b/rts/sm/GC.h @@ -26,6 +26,8 @@ typedef void (*evac_fn)(void *user, StgClosure **root); StgClosure * isAlive ( StgClosure *p ); void markCAFs ( evac_fn evac, void *user ); +bool doIdleGCWork(Capability *cap, bool all); + extern uint32_t N; extern bool major_gc; @@ -40,13 +42,13 @@ extern uint32_t mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS, mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE, mutlist_TREC_CHUNK, - mutlist_TREC_HEADER, - mutlist_ATOMIC_INVARIANT, - mutlist_INVARIANT_CHECK_QUEUE; + mutlist_TREC_HEADER; #endif #if defined(PROF_SPIN) && defined(THREADED_RTS) -extern StgWord64 whitehole_spin; +extern volatile StgWord64 whitehole_gc_spin; +extern volatile StgWord64 waitForGcThreads_spin; +extern volatile StgWord64 waitForGcThreads_yield; #endif void gcWorkerThread (Capability *cap); diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h index bb206db64c..e865dabe5d 100644 --- a/rts/sm/GCThread.h +++ b/rts/sm/GCThread.h @@ -116,6 +116,12 @@ typedef struct gen_workspace_ { of the GC threads ------------------------------------------------------------------------- */ +/* values for the wakeup field */ +#define GC_THREAD_INACTIVE 0 +#define GC_THREAD_STANDING_BY 1 +#define GC_THREAD_RUNNING 2 +#define GC_THREAD_WAITING_TO_CONTINUE 3 + typedef struct gc_thread_ { Capability *cap; diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c index 0373c2b925..31b2913a37 100644 --- a/rts/sm/GCUtils.c +++ b/rts/sm/GCUtils.c @@ -81,6 +81,14 @@ freeChain_sync(bdescr *bd) RELEASE_SPIN_LOCK(&gc_alloc_block_sync); } +void +freeGroup_sync(bdescr *bd) +{ + ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + freeGroup(bd); + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); +} + /* ----------------------------------------------------------------------------- Workspace utilities -------------------------------------------------------------------------- */ @@ -261,7 +269,7 @@ todo_block_full (uint32_t 
size, gen_workspace *ws) // object. However, if the object we're copying is // larger than a block, then we might have an empty // block here. - freeGroup(bd); + freeGroup_sync(bd); } else { push_scanned_block(bd, ws); } @@ -341,24 +349,3 @@ alloc_todo_block (gen_workspace *ws, uint32_t size) return ws->todo_free; } - -/* ----------------------------------------------------------------------------- - * Debugging - * -------------------------------------------------------------------------- */ - -#if defined(DEBUG) -void -printMutableList(bdescr *bd) -{ - StgPtr p; - - debugBelch("mutable list %p: ", bd); - - for (; bd != NULL; bd = bd->link) { - for (p = bd->start; p < bd->free; p++) { - debugBelch("%p (%s), ", (void *)*p, info_type((StgClosure *)*p)); - } - } - debugBelch("\n"); -} -#endif /* DEBUG */ diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h index 2e2d4b199d..8b6040769e 100644 --- a/rts/sm/GCUtils.h +++ b/rts/sm/GCUtils.h @@ -31,6 +31,7 @@ INLINE_HEADER bdescr *allocBlockOnNode_sync(uint32_t node) } void freeChain_sync(bdescr *bd); +void freeGroup_sync(bdescr *bd); void push_scanned_block (bdescr *bd, gen_workspace *ws); StgPtr todo_block_full (uint32_t size, gen_workspace *ws); @@ -50,11 +51,6 @@ isPartiallyFull(bdescr *bd) return (bd->free + WORK_UNIT_WORDS < bd->start + BLOCK_SIZE_W); } - -#if defined(DEBUG) -void printMutableList (bdescr *bd); -#endif - // Version of recordMutableGen for use during GC. This uses the // mutable lists attached to the current gc_thread structure, which // are the same as the mutable lists on the Capability. diff --git a/rts/sm/MarkWeak.c b/rts/sm/MarkWeak.c index 9a077b3d14..88037f6a34 100644 --- a/rts/sm/MarkWeak.c +++ b/rts/sm/MarkWeak.c @@ -32,11 +32,11 @@ /* ----------------------------------------------------------------------------- Weak Pointers - traverse_weak_ptr_list is called possibly many times during garbage + traverseWeakPtrList is called possibly many times during garbage collection. It returns a flag indicating whether it did any work (i.e. called evacuate on any live pointers). - Invariant: traverse_weak_ptr_list is called when the heap is in an + Invariant: traverseWeakPtrList is called when the heap is in an idempotent state. That means that there are no pending evacuate/scavenge operations. This invariant helps the weak pointer code decide which weak pointers are dead - if there are no @@ -60,7 +60,7 @@ Now, we discover which *threads* are still alive. Pointers to threads from the all_threads and main thread lists are the - weakest of all: a pointers from the finalizer of a dead weak + weakest of all: a pointer from the finalizer of a dead weak pointer can keep a thread alive. Any threads found to be unreachable are evacuated and placed on the resurrected_threads list so we can send them a signal later. @@ -72,7 +72,7 @@ -------------------------------------------------------------------------- */ /* Which stage of processing various kinds of weak pointer are we at? - * (see traverse_weak_ptr_list() below for discussion). + * (see traverseWeakPtrList() below for discussion). */ typedef enum { WeakPtrs, WeakThreads, WeakDone } WeakStage; static WeakStage weak_stage; @@ -185,7 +185,7 @@ traverseWeakPtrList(void) } default: - barf("traverse_weak_ptr_list"); + barf("traverseWeakPtrList"); return true; } } @@ -344,7 +344,7 @@ static void tidyThreadList (generation *gen) if (tmp == NULL) { // not alive (yet): leave this thread on the - // old_all_threads list. + // old_threads list. 
prev = &(t->global_link); } else { @@ -378,14 +378,13 @@ static void checkWeakPtrSanity(StgWeak *hd, StgWeak *tl) void collectFreshWeakPtrs() { uint32_t i; - generation *gen = &generations[0]; // move recently allocated weak_ptr_list to the old list as well for (i = 0; i < n_capabilities; i++) { Capability *cap = capabilities[i]; if (cap->weak_ptr_list_tl != NULL) { IF_DEBUG(sanity, checkWeakPtrSanity(cap->weak_ptr_list_hd, cap->weak_ptr_list_tl)); - cap->weak_ptr_list_tl->link = gen->weak_ptr_list; - gen->weak_ptr_list = cap->weak_ptr_list_hd; + cap->weak_ptr_list_tl->link = g0->weak_ptr_list; + g0->weak_ptr_list = cap->weak_ptr_list_hd; cap->weak_ptr_list_tl = NULL; cap->weak_ptr_list_hd = NULL; } else { diff --git a/rts/sm/OSMem.h b/rts/sm/OSMem.h index 3b0cee9630..7dd0efdc23 100644 --- a/rts/sm/OSMem.h +++ b/rts/sm/OSMem.h @@ -18,6 +18,7 @@ void osFreeAllMBlocks(void); size_t getPageSize (void); StgWord64 getPhysicalMemorySize (void); void setExecutable (void *p, W_ len, bool exec); +bool osBuiltWithNumaSupport(void); // See #14956 bool osNumaAvailable(void); uint32_t osNumaNodes(void); uint64_t osNumaMask(void); diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c index 53b101024a..8d4171b1cd 100644 --- a/rts/sm/Sanity.c +++ b/rts/sm/Sanity.c @@ -380,8 +380,8 @@ checkClosure( const StgClosure* p ) case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: { StgMutArrPtrs* a = (StgMutArrPtrs *)p; uint32_t i; @@ -391,6 +391,18 @@ checkClosure( const StgClosure* p ) return mut_arr_ptrs_sizeW(a); } + case SMALL_MUT_ARR_PTRS_CLEAN: + case SMALL_MUT_ARR_PTRS_DIRTY: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: + { + StgSmallMutArrPtrs *a = (StgSmallMutArrPtrs *)p; + for (uint32_t i = 0; i < a->ptrs; i++) { + ASSERT(LOOKS_LIKE_CLOSURE_PTR(a->payload[i])); + } + return small_mut_arr_ptrs_sizeW(a); + } + case TSO: checkTSO((StgTSO *)p); return sizeofW(StgTSO); @@ -535,7 +547,8 @@ checkTSO(StgTSO *tso) ASSERT(next == END_TSO_QUEUE || info == &stg_MVAR_TSO_QUEUE_info || info == &stg_TSO_info || - info == &stg_WHITEHOLE_info); // happens due to STM doing lockTSO() + info == &stg_WHITEHOLE_info); // used to happen due to STM doing + // lockTSO(), might not happen now if ( tso->why_blocked == BlockedOnMVar || tso->why_blocked == BlockedOnMVarRead @@ -677,7 +690,7 @@ checkStaticObjects ( StgClosure* static_objects ) break; case FUN_STATIC: - p = *FUN_STATIC_LINK((StgClosure *)p); + p = *STATIC_LINK(info,(StgClosure *)p); break; case CONSTR: @@ -859,7 +872,7 @@ void findSlop(bdescr *bd) slop = (bd->blocks * BLOCK_SIZE_W) - (bd->free - bd->start); if (slop > (1024/sizeof(W_))) { debugBelch("block at %p (bdescr %p) has %" FMT_Word "KB slop\n", - bd->start, bd, slop / (1024/sizeof(W_))); + bd->start, bd, slop / (1024/(W_)sizeof(W_))); } } } diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c index 1ae8a4c19b..2f61914e55 100644 --- a/rts/sm/Scav.c +++ b/rts/sm/Scav.c @@ -11,6 +11,37 @@ * * ---------------------------------------------------------------------------*/ +/* ---------------------------------------------------------------------------- + We have two main scavenge functions: + + - scavenge_block(bdescr *bd) + - scavenge_one(StgPtr p) + + As the names and parameters suggest, first one scavenges a whole block while + the second one only scavenges one object. This however is not the only + difference. 
scavenge_block scavenges all SRTs while scavenge_one only + scavenges SRTs of stacks. The reason is because scavenge_one is called in two + cases: + + - When scavenging a mut_list + - When scavenging a large object + + We don't have to scavenge SRTs when scavenging a mut_list, because we only + scavenge mut_lists in minor GCs, and static objects are only collected in + major GCs. + + However, because scavenge_one is also used to scavenge large objects (which + are scavenged even in major GCs), we need to deal with SRTs of large + objects. We never allocate large FUNs and THUNKs, but we allocate large + STACKs (e.g. in threadStackOverflow), and stack frames can have SRTs. So + scavenge_one skips FUN and THUNK SRTs but scavenges stack frame SRTs. + + In summary, in a major GC: + + - scavenge_block() scavenges all SRTs + - scavenge_one() scavenges only stack frame SRTs + ------------------------------------------------------------------------- */ + #include "PosixSource.h" #include "Rts.h" @@ -329,105 +360,17 @@ scavenge_AP (StgAP *ap) Scavenge SRTs -------------------------------------------------------------------------- */ -/* Similar to scavenge_large_bitmap(), but we don't write back the - * pointers we get back from evacuate(). - */ -static void -scavenge_large_srt_bitmap( StgLargeSRT *large_srt ) -{ - uint32_t i, j, size; - StgWord bitmap; - StgClosure **p; - - size = (uint32_t)large_srt->l.size; - p = (StgClosure **)large_srt->srt; - - for (i = 0; i < size / BITS_IN(W_); i++) { - bitmap = large_srt->l.bitmap[i]; - // skip zero words: bitmaps can be very sparse, and this helps - // performance a lot in some cases. - if (bitmap != 0) { - for (j = 0; j < BITS_IN(W_); j++) { - if ((bitmap & 1) != 0) { - evacuate(p); - } - p++; - bitmap = bitmap >> 1; - } - } else { - p += BITS_IN(W_); - } - } - if (size % BITS_IN(W_) != 0) { - bitmap = large_srt->l.bitmap[i]; - for (j = 0; j < size % BITS_IN(W_); j++) { - if ((bitmap & 1) != 0) { - evacuate(p); - } - p++; - bitmap = bitmap >> 1; - } - } -} - -/* evacuate the SRT. If srt_bitmap is zero, then there isn't an - * srt field in the info table. That's ok, because we'll - * never dereference it. - */ -STATIC_INLINE GNUC_ATTR_HOT void -scavenge_srt (StgClosure **srt, uint32_t srt_bitmap) -{ - uint32_t bitmap; - StgClosure **p; - - bitmap = srt_bitmap; - p = srt; - - if (bitmap == (StgHalfWord)(-1)) { - scavenge_large_srt_bitmap( (StgLargeSRT *)srt ); - return; - } - - while (bitmap != 0) { - if ((bitmap & 1) != 0) { -#if defined(COMPILING_WINDOWS_DLL) - // Special-case to handle references to closures hiding out in DLLs, since - // double indirections required to get at those. The code generator knows - // which is which when generating the SRT, so it stores the (indirect) - // reference to the DLL closure in the table by first adding one to it. - // We check for this here, and undo the addition before evacuating it. - // - // If the SRT entry hasn't got bit 0 set, the SRT entry points to a - // closure that's fixed at link-time, and no extra magic is required. 
- if ( (W_)(*srt) & 0x1 ) { - evacuate( (StgClosure**) ((W_) (*srt) & ~0x1)); - } else { - evacuate(p); - } -#else - evacuate(p); -#endif - } - p++; - bitmap = bitmap >> 1; - } -} - - STATIC_INLINE GNUC_ATTR_HOT void scavenge_thunk_srt(const StgInfoTable *info) { StgThunkInfoTable *thunk_info; - uint32_t bitmap; if (!major_gc) return; thunk_info = itbl_to_thunk_itbl(info); - bitmap = thunk_info->i.srt_bitmap; - if (bitmap) { - // don't read srt_offset if bitmap==0, because it doesn't exist - // and so the memory might not be readable. - scavenge_srt((StgClosure **)GET_SRT(thunk_info), bitmap); + if (thunk_info->i.srt) { + StgClosure *srt = (StgClosure*)GET_SRT(thunk_info); + evacuate(&srt); } } @@ -435,16 +378,13 @@ STATIC_INLINE GNUC_ATTR_HOT void scavenge_fun_srt(const StgInfoTable *info) { StgFunInfoTable *fun_info; - uint32_t bitmap; if (!major_gc) return; fun_info = itbl_to_fun_itbl(info); - bitmap = fun_info->i.srt_bitmap; - if (bitmap) { - // don't read srt_offset if bitmap==0, because it doesn't exist - // and so the memory might not be readable. - scavenge_srt((StgClosure **)GET_FUN_SRT(fun_info), bitmap); + if (fun_info->i.srt) { + StgClosure *srt = (StgClosure*)GET_FUN_SRT(fun_info); + evacuate(&srt); } } @@ -737,18 +677,16 @@ scavenge_block (bdescr *bd) break; } - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { p = scavenge_mut_arr_ptrs((StgMutArrPtrs*)p); - // If we're going to put this object on the mutable list, then - // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that. if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -780,8 +718,8 @@ scavenge_block (bdescr *bd) break; } - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgPtr next; @@ -791,12 +729,10 @@ scavenge_block (bdescr *bd) evacuate((StgClosure **)p); } - // If we're going to put this object on the mutable list, then - // set its info ptr to SMALL_MUT_ARR_PTRS_FROZEN0 to indicate that. if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1133,20 +1069,18 @@ scavenge_mark_stack(void) break; } - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgPtr q = p; scavenge_mut_arr_ptrs((StgMutArrPtrs *)p); - // If we're going to put this object on the mutable list, then - // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that. 
if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1180,8 +1114,8 @@ scavenge_mark_stack(void) break; } - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgPtr next, q = p; @@ -1191,12 +1125,10 @@ scavenge_mark_stack(void) evacuate((StgClosure **)p); } - // If we're going to put this object on the mutable list, then - // set its info ptr to SMALL_MUT_ARR_PTRS_FROZEN0 to indicate that. if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1365,7 +1297,7 @@ scavenge_one(StgPtr p) case WEAK: // This WEAK object will not be considered by tidyWeakList during this - // collection because it is in a generation >= N, but it is on the + // collection because it is in a generation > N, but it is on the // mutable list so we must evacuate all of its pointers because some // of them may point into a younger generation. scavengeLiveWeak((StgWeak *)p); @@ -1457,18 +1389,16 @@ scavenge_one(StgPtr p) break; } - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: { // follow everything scavenge_mut_arr_ptrs((StgMutArrPtrs *)p); - // If we're going to put this object on the mutable list, then - // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that. if (gct->failed_to_evac) { - ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1502,8 +1432,8 @@ scavenge_one(StgPtr p) break; } - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: { // follow everything StgPtr next, q=p; @@ -1513,12 +1443,10 @@ scavenge_one(StgPtr p) evacuate((StgClosure **)p); } - // If we're going to put this object on the mutable list, then - // set its info ptr to SMALL_MUT_ARR_PTRS_FROZEN0 to indicate that. 
if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1653,8 +1581,8 @@ scavenge_mutable_list(bdescr *bd, generation *gen) mutlist_MUTVARS++; break; case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: mutlist_MUTARRS++; break; case MVAR_CLEAN: barf("MVAR_CLEAN on mutable list"); @@ -1669,10 +1597,6 @@ scavenge_mutable_list(bdescr *bd, generation *gen) mutlist_TVAR_WATCH_QUEUE++; else if (((StgClosure*)p)->header.info == &stg_TREC_HEADER_info) mutlist_TREC_HEADER++; - else if (((StgClosure*)p)->header.info == &stg_ATOMIC_INVARIANT_info) - mutlist_ATOMIC_INVARIANT++; - else if (((StgClosure*)p)->header.info == &stg_INVARIANT_CHECK_QUEUE_info) - mutlist_INVARIANT_CHECK_QUEUE++; else mutlist_OTHERS++; break; @@ -1690,6 +1614,7 @@ scavenge_mutable_list(bdescr *bd, generation *gen) // switch (get_itbl((StgClosure *)p)->type) { case MUT_ARR_PTRS_CLEAN: + case SMALL_MUT_ARR_PTRS_CLEAN: recordMutableGen_GC((StgClosure *)p,gen_no); continue; case MUT_ARR_PTRS_DIRTY: @@ -1813,7 +1738,11 @@ scavenge_static(void) case FUN_STATIC: scavenge_fun_srt(info); - break; + /* fallthrough */ + + // a FUN_STATIC can also be an SRT, so it may have pointer + // fields. See Note [SRTs] in CmmBuildInfoTables, specifically + // the [FUN] optimisation. case CONSTR: case CONSTR_NOCAF: @@ -1979,8 +1908,10 @@ scavenge_stack(StgPtr p, StgPtr stack_end) p = scavenge_small_bitmap(p, size, bitmap); follow_srt: - if (major_gc) - scavenge_srt((StgClosure **)GET_SRT(info), info->i.srt_bitmap); + if (major_gc && info->i.srt) { + StgClosure *srt = (StgClosure*)GET_SRT(info); + evacuate(&srt); + } continue; case RET_BCO: { diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index ffaed5f17c..dcc5b3a3c7 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -197,11 +197,7 @@ initStorage (void) #if defined(THREADED_RTS) initSpinLock(&gc_alloc_block_sync); -#if defined(PROF_SPIN) - whitehole_spin = 0; #endif -#endif - N = 0; for (n = 0; n < n_numa_nodes; n++) { @@ -224,6 +220,7 @@ initStorage (void) void storageAddCapabilities (uint32_t from, uint32_t to) { uint32_t n, g, i, new_n_nurseries; + nursery *old_nurseries; if (RtsFlags.GcFlags.nurseryChunkSize == 0) { new_n_nurseries = to; @@ -233,6 +230,7 @@ void storageAddCapabilities (uint32_t from, uint32_t to) stg_max(to, total_alloc / RtsFlags.GcFlags.nurseryChunkSize); } + old_nurseries = nurseries; if (from > 0) { nurseries = stgReallocBytes(nurseries, new_n_nurseries * sizeof(struct nursery_), @@ -244,8 +242,9 @@ void storageAddCapabilities (uint32_t from, uint32_t to) // we've moved the nurseries, so we have to update the rNursery // pointers from the Capabilities. - for (i = 0; i < to; i++) { - capabilities[i]->r.rNursery = &nurseries[i]; + for (i = 0; i < from; i++) { + uint32_t index = capabilities[i]->r.rNursery - old_nurseries; + capabilities[i]->r.rNursery = &nurseries[index]; } /* The allocation area. 
Policy: keep the allocation area @@ -307,21 +306,21 @@ freeStorage (bool free_heap) The entry code for every CAF does the following: - - calls newCaf, which builds a CAF_BLACKHOLE on the heap and atomically + - calls newCAF, which builds a CAF_BLACKHOLE on the heap and atomically updates the CAF with IND_STATIC pointing to the CAF_BLACKHOLE - - if newCaf returns zero, it re-enters the CAF (see Note [atomic + - if newCAF returns zero, it re-enters the CAF (see Note [atomic CAF entry]) - pushes an update frame pointing to the CAF_BLACKHOLE - Why do we build an BLACKHOLE in the heap rather than just updating + Why do we build a BLACKHOLE in the heap rather than just updating the thunk directly? It's so that we only need one kind of update frame - otherwise we'd need a static version of the update frame too, and various other parts of the RTS that deal with update frames would also need special cases for static update frames. - newCaf() does the following: + newCAF() does the following: - atomically locks the CAF (see [atomic CAF entry]) @@ -339,7 +338,7 @@ freeStorage (bool free_heap) ------------------ Note [atomic CAF entry] - With THREADED_RTS, newCaf() is required to be atomic (see + With THREADED_RTS, newCAF() is required to be atomic (see #5558). This is because if two threads happened to enter the same CAF simultaneously, they would create two distinct CAF_BLACKHOLEs, and so the normal threadPaused() machinery for detecting duplicate @@ -359,7 +358,7 @@ freeStorage (bool free_heap) - we must be able to *revert* CAFs that have been evaluated, to their pre-evaluated form. - To do this, we use an additional CAF list. When newCaf() is + To do this, we use an additional CAF list. When newCAF() is called on a dynamically-loaded CAF, we add it to the CAF list instead of the old-generation mutable list, and save away its old info pointer (in caf->saved_info) for later reversion. @@ -796,6 +795,20 @@ move_STACK (StgStack *src, StgStack *dest) dest->sp = (StgPtr)dest->sp + diff; } +STATIC_INLINE void +accountAllocation(Capability *cap, W_ n) +{ + TICK_ALLOC_HEAP_NOCTR(WDS(n)); + CCS_ALLOC(cap->r.rCCCS,n); + if (cap->r.rCurrentTSO != NULL) { + // cap->r.rCurrentTSO->alloc_limit -= n*sizeof(W_) + ASSIGN_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit), + (PK_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit)) + - n*sizeof(W_))); + } + +} + /* ----------------------------------------------------------------------------- StgPtr allocate (Capability *cap, W_ n) @@ -812,21 +825,37 @@ move_STACK (StgStack *src, StgStack *dest) that operation fails, then the whole process will be killed. -------------------------------------------------------------------------- */ +/* + * Allocate some n words of heap memory; terminating + * on heap overflow + */ StgPtr allocate (Capability *cap, W_ n) { + StgPtr p = allocateMightFail(cap, n); + if (p == NULL) { + reportHeapOverflow(); + // heapOverflow() doesn't exit (see #2592), but we aren't + // in a position to do a clean shutdown here: we + // either have to allocate the memory or exit now. + // Allocating the memory would be bad, because the user + // has requested that we not exceed maxHeapSize, so we + // just exit. 
+ stg_exit(EXIT_HEAPOVERFLOW); + } + return p; +} + +/* + * Allocate some n words of heap memory; returning NULL + * on heap overflow + */ +StgPtr +allocateMightFail (Capability *cap, W_ n) +{ bdescr *bd; StgPtr p; - TICK_ALLOC_HEAP_NOCTR(WDS(n)); - CCS_ALLOC(cap->r.rCCCS,n); - if (cap->r.rCurrentTSO != NULL) { - // cap->r.rCurrentTSO->alloc_limit -= n*sizeof(W_) - ASSIGN_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit), - (PK_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit)) - - n*sizeof(W_))); - } - if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) { // The largest number of words such that // the computation of req_blocks will not overflow. @@ -845,16 +874,12 @@ allocate (Capability *cap, W_ n) req_blocks >= HS_INT32_MAX) // avoid overflow when // calling allocGroup() below { - reportHeapOverflow(); - // heapOverflow() doesn't exit (see #2592), but we aren't - // in a position to do a clean shutdown here: we - // either have to allocate the memory or exit now. - // Allocating the memory would be bad, because the user - // has requested that we not exceed maxHeapSize, so we - // just exit. - stg_exit(EXIT_HEAPOVERFLOW); + return NULL; } + // Only credit allocation after we've passed the size check above + accountAllocation(cap, n); + ACQUIRE_SM_LOCK bd = allocGroupOnNode(cap->node,req_blocks); dbl_link_onto(bd, &g0->large_objects); @@ -870,6 +895,7 @@ allocate (Capability *cap, W_ n) /* small allocation (<LARGE_OBJECT_THRESHOLD) */ + accountAllocation(cap, n); bd = cap->r.rCurrentAlloc; if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { @@ -955,7 +981,8 @@ allocate (Capability *cap, W_ n) to pinned ByteArrays, not scavenging is ok. This function is called by newPinnedByteArray# which immediately - fills the allocated memory with a MutableByteArray#. + fills the allocated memory with a MutableByteArray#. Note that + this returns NULL on heap overflow. ------------------------------------------------------------------------- */ StgPtr @@ -967,20 +994,16 @@ allocatePinned (Capability *cap, W_ n) // If the request is for a large object, then allocate() // will give us a pinned object anyway. if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) { - p = allocate(cap, n); - Bdescr(p)->flags |= BF_PINNED; - return p; - } - - TICK_ALLOC_HEAP_NOCTR(WDS(n)); - CCS_ALLOC(cap->r.rCCCS,n); - if (cap->r.rCurrentTSO != NULL) { - // cap->r.rCurrentTSO->alloc_limit -= n*sizeof(W_); - ASSIGN_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit), - (PK_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit)) - - n*sizeof(W_))); + p = allocateMightFail(cap, n); + if (p == NULL) { + return NULL; + } else { + Bdescr(p)->flags |= BF_PINNED; + return p; + } } + accountAllocation(cap, n); bd = cap->pinned_object_block; // If we don't have a block of pinned objects yet, or the current @@ -1135,7 +1158,7 @@ dirty_MVAR(StgRegTable *reg, StgClosure *p) * -------------------------------------------------------------------------- */ /* ----------------------------------------------------------------------------- - * [Note allocation accounting] + * Note [allocation accounting] * * - When cap->r.rCurrentNusery moves to a new block in the nursery, * we add the size of the used portion of the previous block to @@ -1241,16 +1264,15 @@ W_ gcThreadLiveBlocks (uint32_t i, uint32_t g) * to store bitmaps and the mark stack. Note: blocks_needed does not * include the blocks in the nursery. * - * Assume: all data currently live will remain live. Generationss + * Assume: all data currently live will remain live. 
Generations * that will be collected next time will therefore need twice as many * blocks since all the data will be copied. */ extern W_ calcNeeded (bool force_major, memcount *blocks_needed) { - W_ needed = 0, blocks; - uint32_t g, N; - generation *gen; + W_ needed = 0; + uint32_t N; if (force_major) { N = RtsFlags.GcFlags.generations - 1; @@ -1258,12 +1280,12 @@ calcNeeded (bool force_major, memcount *blocks_needed) N = 0; } - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - gen = &generations[g]; + for (uint32_t g = 0; g < RtsFlags.GcFlags.generations; g++) { + generation *gen = &generations[g]; - blocks = gen->n_blocks // or: gen->n_words / BLOCK_SIZE_W (?) - + gen->n_large_blocks - + gen->n_compact_blocks; + W_ blocks = gen->n_blocks // or: gen->n_words / BLOCK_SIZE_W (?) + + gen->n_large_blocks + + gen->n_compact_blocks; // we need at least this much space needed += blocks; |
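
The Storage.c hunks above split heap allocation into allocate(), which reports the overflow and calls stg_exit(EXIT_HEAPOVERFLOW), and allocateMightFail()/allocatePinned(), which now return NULL when the request would exceed maxHeapSize. A minimal caller-side sketch of that contract follows; only the allocate/allocateMightFail/allocatePinned signatures come from the diff, while the wrapper name and its error handling are illustrative assumptions.

    /* Sketch: the NULL-on-overflow contract introduced in this diff.
     * allocateMightFail()/allocatePinned() are the Storage.c entry points;
     * the wrapper below and its recovery policy are hypothetical. */
    #include "Rts.h"   /* Capability, W_, StgPtr -- RTS-internal header */

    StgPtr allocWordsOrNull (Capability *cap, W_ n, bool pinned)
    {
        StgPtr p = pinned ? allocatePinned(cap, n)
                          : allocateMightFail(cap, n);
        if (p == NULL) {
            /* The request exceeded maxHeapSize; let the caller recover.
             * Plain allocate() would instead call reportHeapOverflow()
             * and stg_exit(EXIT_HEAPOVERFLOW). */
            return NULL;
        }
        return p;
    }

The split keeps the old terminate-on-overflow behaviour in allocate() itself while letting other callers handle a failed allocation themselves.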