author | Lorry Tar Creator <lorry-tar-importer@baserock.org> | 2015-02-17 17:25:57 +0000
---|---|---
committer | <> | 2015-03-17 16:26:24 +0000
commit | 780b92ada9afcf1d58085a83a0b9e6bc982203d1 (patch) |
tree | 598f8b9fa431b228d29897e798de4ac0c1d3d970 /src/mp/mp_alloc.c |
parent | 7a2660ba9cc2dc03a69ddfcfd95369395cc87444 (diff) |
download | berkeleydb-db-6.1.23.tar.gz |
Diffstat (limited to 'src/mp/mp_alloc.c')
-rw-r--r-- | src/mp/mp_alloc.c | 320
1 file changed, 240 insertions, 80 deletions
diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c
index dc331215..011f54c6 100644
--- a/src/mp/mp_alloc.c
+++ b/src/mp/mp_alloc.c
@@ -1,7 +1,7 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates.  All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates.  All rights reserved.
  *
  * $Id$
  */
@@ -22,8 +22,112 @@
 #endif
 
 /*
+ * __memp_bh_unreachable --
+ *
+ *	Determine whether this buffer can never be seen again: is the next
+ *	newer version visible to the same transaction which sees this one?
+ *	If both versions are visible to the same transaction, there is no
+ *	reason to keep the older one: it can be purged.
+ *
+ *	If this buffer has a more recent version, and there is a transaction
+ *	with a read_lsn between this buffer's and that more recent version's,
+ *	the buffer is visible to at least that transaction, so return FALSE.
+ *	Otherwise return TRUE.
+ *
+ *	txns:            3/10     2/10      2/5  2/1            1/10
+ *	vers:   3/15     2/15     2/14      2/10      2/8       1/150
+ *	        vis      vis      unreach   vis       unreach   vis
+ *	who     new      3/10               2/10                2/5, 2/1
+ *	sees    txns
+ *
+ *	Note: in the above example, the page was allocated after txn 1/10
+ *	started.  1/10 would not see any version of the page.
+ *
+ * PUBLIC: int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int));
+ */
+int
+__memp_bh_unreachable(env, bhp, snapshots, n_snapshots)
+	ENV *env;
+	BH *bhp;
+	DB_LSN *snapshots;
+	int n_snapshots;
+{
+	BH *newer_bhp;
+	DB_LSN b_vlsn, n_vlsn;
+	int i, ret;
+#ifdef DIAGNOSTIC
+	DB_MPOOL *dbmp;
+	DB_MSGBUF mb;
+	MPOOLFILE *bh_mfp;
+#endif
+
+	/*
+	 * The buffer can't be purged if it is being used, or is the most
+	 * recent version, or the next newer version isn't a copy yet.
+	 */
+	if (BH_REFCOUNT(bhp) != 0 ||
+	    (newer_bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) == NULL ||
+	    newer_bhp->td_off == INVALID_ROFF)
+		return (FALSE);
+
+	/*
+	 * Find the visibility LSNs for this buffer (b_vlsn) and the more
+	 * recent, newer buffer (n_vlsn).  If the newer version hasn't
+	 * committed yet, this bhp could still be needed.
+	 */
+	n_vlsn = *VISIBLE_LSN(env, newer_bhp);
+	if (IS_MAX_LSN(n_vlsn))
+		return (FALSE);
+	if (bhp->td_off == INVALID_ROFF)
+		INIT_LSN(b_vlsn);
+	else
+		b_vlsn = *VISIBLE_LSN(env, bhp);
+
+	ret = TRUE;
+	/*
+	 * Look for a transaction which is between n_vlsn and b_vlsn -
+	 * determining that bhp is reachable.  Stop looking once the
+	 * transactions get so small (old) that they precede the buffer's
+	 * version; no earlier txn could be between n_vlsn and b_vlsn.
+	 */
+	for (i = 0;
+	    i < n_snapshots && LOG_COMPARE(&snapshots[i], &b_vlsn) >= 0;
+	    i++) {
+		if (LOG_COMPARE(&snapshots[i], &n_vlsn) < 0) {
+			/*
+			 * This txn can see (started after) bhp, but not
+			 * newer_bhp (which committed after this txn started).
+			 */
+			ret = FALSE;
+			break;
+		}
+	}
+
+#ifdef DIAGNOSTIC
+	if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) {
+		dbmp = env->mp_handle;
+		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+		DB_MSGBUF_INIT(&mb);
+		__db_msgadd(env, &mb,
+    "bh_unreachable %s pgno %d %s %lu/%lu %x newer %lu/%lu txn #%d in\n",
+		    __memp_fns(dbmp, bh_mfp), bhp->pgno,
+		    ret ? "purgeable" : "needed",
+		    (u_long)b_vlsn.file, (u_long)b_vlsn.offset, bhp->flags,
+		    (u_long)n_vlsn.file, (u_long)n_vlsn.offset, i);
+		for (i = 0; i != n_snapshots; i++)
+			__db_msgadd(env, &mb, " %lu/%lu",
+			    (u_long)snapshots[i].file,
+			    (u_long)snapshots[i].offset);
+		DB_MSGBUF_FLUSH(env, &mb);
+	}
+#endif
+	return (ret);
+}
+
+/*
  * __memp_alloc --
- *	Allocate some space from a cache region.
+ *	Allocate some space from a cache region.  If the region is full then
+ *	reuse one or more cache buffers.
  *
  * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
  * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
@@ -39,7 +143,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
 {
 	BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp;
 	BH_FROZEN_PAGE *frozen_bhp;
-	DB_LSN oldest_reader, vlsn;
+	DB_LSN *snapshots, vlsn;
 	DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
 	ENV *env;
 	MPOOL *c_mp;
@@ -49,7 +153,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
 	u_int32_t dirty_eviction, high_priority, priority, versions;
 	u_int32_t priority_saved, put_counter, lru_generation, total_buckets;
 	int aggressive, alloc_freeze, b_lock, giveup;
-	int h_locked, need_free, obsolete, ret, write_error;
+	int h_locked, need_free, n_snapshots, obsolete, ret, write_error;
 	u_int8_t *endp;
 	void *p;
 
@@ -58,11 +162,10 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
 	dbht = R_ADDR(infop, c_mp->htab);
 	hp_end = &dbht[c_mp->htab_buckets];
 	hp_saved = NULL;
-	priority_saved = 0;
-	write_error = 0;
-
+	snapshots = NULL;
+	priority_saved = write_error = 0;
 	buckets = buffers = put_counter = total_buckets = versions = 0;
-	aggressive = alloc_freeze = giveup = h_locked = 0;
+	aggressive = alloc_freeze = giveup = h_locked = n_snapshots = 0;
 
 	/*
 	 * If we're allocating a buffer, and the one we're discarding is the
@@ -138,13 +241,15 @@ found:	if (offsetp != NULL)
 			    c_mp->stat.st_alloc_pages, buffers, infop->id);
 		}
 #endif
-		return (0);
+		goto done;
 	} else if (giveup || c_mp->pages == 0) {
 		MPOOL_REGION_UNLOCK(env, infop);
 
 		__db_errx(env, DB_STR("3017",
 		    "unable to allocate space from the buffer cache"));
-		return ((ret == ENOMEM && write_error != 0) ? EIO : ret);
+		if (ret == ENOMEM && write_error != 0)
+			ret = EIO;
+		goto done;
 	}
 
 search:
@@ -158,7 +263,6 @@ search:
 	lru_generation = c_mp->lru_generation;
 
 	ret = 0;
-	MAX_LSN(oldest_reader);
 
 	/*
 	 * We re-attempt the allocation every time we've freed 3 times what
@@ -222,6 +326,13 @@ search:
 			goto alloc;
 		MPOOL_REGION_UNLOCK(env, infop);
 
+		/* Refresh the list of mvcc reader transactions. */
+		if (snapshots != NULL)
+			__os_free(env, snapshots);
+		if ((ret = __txn_get_readers(
+		    env, &snapshots, &n_snapshots)) != 0)
+			goto err;
+
 		aggressive++;
 		/*
 		 * Once aggressive, we consider all buffers. By setting
@@ -266,13 +377,6 @@ search:
 		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
 			continue;
 
-		/* Set aggressive if we have already searched for too long. */
-		if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) {
-			aggressive = 1;
-			/* Once aggressive, we consider all buffers. */
-			high_priority = MPOOL_LRU_MAX;
-		}
-
 		/* Unlock the region and lock the hash bucket. */
 		MPOOL_REGION_UNLOCK(env, infop);
 		MUTEX_READLOCK(env, hp->mtx_hash);
@@ -280,29 +384,45 @@
 		b_lock = 0;
 
 		/*
+		 * Set aggressive to consider all buffers if we have already
+		 * searched in too many buckets.
+		 */
+		if (buckets > MPOOL_ALLOC_SEARCH_LIMIT && aggressive == 0) {
+			aggressive = 1;
+			/* Once aggressive, we consider all buffers. */
+			high_priority = MPOOL_LRU_MAX;
+			if (snapshots == NULL && (ret = __txn_get_readers(
+			    env, &snapshots, &n_snapshots)) != 0)
+				goto err;
+		}
+
+		/*
 		 * Find a buffer we can use.
+		 * Skip over refcount > 0 buffers; we can't get rid of them.
 		 *
-		 * We use the lowest-LRU singleton buffer if we find one and
-		 * it's better than the result of another hash bucket we've
+		 * Without MVCC we use the lowest-LRU singleton buffer we find
+		 * that's better than the result of another hash bucket we've
 		 * reviewed.  We do not use a buffer which has a priority
 		 * greater than high_priority unless we are being aggressive.
 		 *
-		 * With MVCC buffers, the situation is more complicated: we
-		 * don't want to free a buffer out of the middle of an MVCC
-		 * chain, since that requires I/O.  So, walk the buffers,
-		 * looking for an obsolete buffer at the end of an MVCC chain.
-		 * Once a buffer becomes obsolete, its LRU priority is
-		 * irrelevant because that version can never be accessed again.
+		 * MVCC requires looking at additional factors: we don't want
+		 * to free a still-relevant buffer out of the middle of an
+		 * MVCC chain, since that requires freezing - lots of I/O.
+		 * So, walk the buffers, looking for an obsolete buffer at
+		 * the end of the MVCC chain.  Once a buffer becomes obsolete,
+		 * its LRU priority is irrelevant because that version can
+		 * never be accessed again.
 		 *
 		 * If we don't find any obsolete MVCC buffers, we will get
 		 * aggressive, and in that case consider the lowest priority
 		 * buffer within a chain.
-		 *
-		 * Ignore referenced buffers, we can't get rid of them.
 		 */
retry_search:	bhp = NULL;
 		bucket_priority = high_priority;
 		obsolete = 0;
+		if (n_snapshots > 0 && LOG_COMPARE(&snapshots[n_snapshots - 1],
+		    &hp->old_reader) > 0)
+			hp->old_reader = snapshots[n_snapshots - 1];
 		SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) {
 			/*
 			 * First, do the standard LRU check for singletons.
@@ -340,55 +460,63 @@ retry_search:	bhp = NULL;
 			    mvcc_bhp != NULL;
 			    oldest_bhp = mvcc_bhp,
 			    mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) {
+				DB_ASSERT(env, mvcc_bhp !=
+				    SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
 #ifdef MPOOL_ALLOC_SEARCH_DYN
 				if (aggressive == 0 &&
-				    ++high_priority >= c_mp->lru_priority)
+				    ++high_priority >= c_mp->lru_priority) {
 					aggressive = 1;
+					if (snapshots == NULL && (ret =
+					    __txn_get_readers(env,
+					    &snapshots, &n_snapshots)) != 0)
+						goto err;
+				}
 #endif
-				DB_ASSERT(env, mvcc_bhp !=
-				    SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
-				if ((aggressive < 2 &&
-				    ++versions < (buffers >> 2)) ||
-				    BH_REFCOUNT(mvcc_bhp) != 0)
+				if (n_snapshots > 0 &&
+				    __memp_bh_unreachable(env,
+				    mvcc_bhp, snapshots, n_snapshots)) {
+					oldest_bhp = mvcc_bhp;
+					goto is_obsolete;
+				}
+				if (bhp != NULL &&
+				    mvcc_bhp->priority >= bhp->priority)
+					continue;
+				if (BH_REFCOUNT(mvcc_bhp) != 0)
+					continue;
+				/*
+				 * Since taking still-relevant versions
+				 * requires freezing, skip over them at low
+				 * aggression levels unless we see that a high
+				 * proportion of buffers (over 1/4) are MVCC
+				 * copies.
+				 */
+				if (aggressive < 2 &&
+				    ++versions < (buffers >> 2))
 					continue;
 				buffers++;
-				if (!F_ISSET(mvcc_bhp, BH_FROZEN) &&
-				    (bhp == NULL ||
-				    bhp->priority > mvcc_bhp->priority)) {
-					if (bhp != NULL)
-						atomic_dec(env, &bhp->ref);
-					bhp = mvcc_bhp;
-					atomic_inc(env, &bhp->ref);
-				}
+				if (F_ISSET(mvcc_bhp, BH_FROZEN))
+					continue;
+				/*
+				 * Select mvcc_bhp as the current best
+				 * candidate, releasing the current candidate,
+				 * if any.
+				 */
+				if (bhp != NULL)
+					atomic_dec(env, &bhp->ref);
+				bhp = mvcc_bhp;
+				atomic_inc(env, &bhp->ref);
 			}
 
 			/*
 			 * oldest_bhp is the last buffer on the MVCC chain, and
 			 * an obsolete buffer at the end of the MVCC chain gets
-			 * used without further search.  Before checking for
-			 * obsolescence, update the cached oldest reader LSN in
-			 * the bucket if it is older than the call's
-			 * oldest_reader.
+			 * used without further search.
 			 */
 			if (BH_REFCOUNT(oldest_bhp) != 0)
 				continue;
 
-			if (LOG_COMPARE(&oldest_reader, &hp->old_reader) > 0) {
-				if (IS_MAX_LSN(oldest_reader) &&
-				    (ret = __txn_oldest_reader(
-				    env, &oldest_reader)) != 0) {
-					MUTEX_UNLOCK(env, hp->mtx_hash);
-					if (bhp != NULL)
-						atomic_dec(env, &bhp->ref);
-					return (ret);
-				}
-				if (LOG_COMPARE(&oldest_reader,
-				    &hp->old_reader) > 0)
-					hp->old_reader = oldest_reader;
-			}
-
 			if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) {
 				if (aggressive < 2)
 					buffers++;
+is_obsolete:
 				obsolete = 1;
 				if (bhp != NULL)
 					atomic_dec(env, &bhp->ref);
@@ -410,10 +538,18 @@
 
 		/*
 		 * Compare two hash buckets and select the one with the lower
-		 * priority.  Performance testing showed looking at two improves
-		 * the LRU-ness and looking at more only does a little better.
+		 * priority, except with MVCC at high aggression levels.
+		 * Performance testing shows looking at two improves the
+		 * LRU-ness and looking at more only does a little better.
 		 */
 		if (hp_saved == NULL) {
+			/*
+			 * At high aggressive levels when mvcc is active, stop
+			 * looking for a candidate once one has been found.
+			 * Freezing takes more time than writing out to a db.
+			 */
+			if (aggressive > 1 && n_snapshots > 1)
+				goto this_buffer;
 			hp_saved = hp;
 			priority_saved = priority;
 			goto next_hb;
@@ -487,11 +623,15 @@ this_buffer:	/*
 		/* We cannot block as the caller is probably holding locks. */
 		if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) {
-			if (ret != DB_LOCK_NOTGRANTED)
-				return (ret);
+			if (ret != DB_LOCK_NOTGRANTED) {
+				goto err;
+			}
+			ret = 0;
 			goto next_hb;
 		}
 
 		F_SET(bhp, BH_EXCLUSIVE);
+		if (obsolete)
+			F_SET(bhp, BH_UNREACHABLE);
 		b_lock = 1;
 
 		/* Someone may have grabbed it while we got the lock. */
@@ -557,7 +697,7 @@ this_buffer:	/*
 			F_CLR(bhp, BH_EXCLUSIVE);
 			MUTEX_UNLOCK(env, bhp->mtx_buf);
 			DB_ASSERT(env, !h_locked);
-			return (ret);
+			goto err;
 		}
 	}
 
@@ -573,16 +713,25 @@ this_buffer:	/*
 		if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) ||
 		    (SH_CHAIN_HASNEXT(bhp, vc) &&
 		    SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off &&
-		    !BH_OBSOLETE(bhp, hp->old_reader, vlsn)))
+		    !(obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))) {
+			if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+				__db_msg(env,
+    "memp_alloc next_hb past bhp %lx flags %x ref %d %lx/%lx",
+				    (u_long)R_OFFSET(infop, bhp), bhp->flags,
+				    BH_REFCOUNT(bhp),
+				    (u_long)R_OFFSET(infop,
+				    SH_CHAIN_NEXTP(bhp, vc, __bh)),
+				    (u_long)R_OFFSET(infop,
+				    SH_CHAIN_PREVP(bhp, vc, __bh)));
 			goto next_hb;
+		}
 
 		/*
 		 * If the buffer is frozen, thaw it and look for another one
-		 * we can use.  (Calling __memp_bh_freeze above will not
-		 * mark bhp BH_FROZEN.)
+		 * we can use.  (Calling __memp_bh_freeze above will not mark
+		 * this bhp BH_FROZEN; it creates another frozen one.)
 		 */
 		if (F_ISSET(bhp, BH_FROZEN)) {
-			DB_ASSERT(env, obsolete || SH_CHAIN_SINGLETON(bhp, vc));
+			DB_ASSERT(env, SH_CHAIN_SINGLETON(bhp, vc) ||
+			    obsolete ||
+			    BH_OBSOLETE(bhp, hp->old_reader, vlsn));
 			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
 			if (!F_ISSET(bhp, BH_THAWED)) {
 				/*
@@ -592,10 +741,10 @@ this_buffer:	/*
 				 */
 				if ((ret = __memp_bh_thaw(dbmp,
 				    infop, hp, bhp, NULL)) != 0)
-					return (ret);
+					goto done;
 				MUTEX_READLOCK(env, hp->mtx_hash);
 			} else {
-				need_free = (atomic_dec(env, &bhp->ref) == 0);
+				need_free = atomic_dec(env, &bhp->ref) == 0;
 				F_CLR(bhp, BH_EXCLUSIVE);
 				MUTEX_UNLOCK(env, bhp->mtx_buf);
 				if (need_free) {
@@ -626,7 +775,10 @@ this_buffer:	/*
 		if (alloc_freeze) {
 			if ((ret = __memp_bhfree(dbmp,
 			    infop, bh_mfp, hp, bhp, 0)) != 0)
-				return (ret);
+				goto err;
+			DB_ASSERT(env, bhp->mtx_buf != MUTEX_INVALID);
+			if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
+				goto err;
 
 			b_lock = 0;
 			h_locked = 0;
@@ -654,23 +806,21 @@ this_buffer:	/*
 		}
 
 		/*
-		 * Check to see if the buffer is the size we're looking for.
-		 * If so, we can simply reuse it.  Otherwise, free the buffer
-		 * and its space and keep looking.
+		 * If the buffer is the size we're looking for, we can simply
+		 * reuse it.  Otherwise, free it and keep looking.
 		 */
 		if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) {
 			if ((ret = __memp_bhfree(dbmp,
 			    infop, bh_mfp, hp, bhp, 0)) != 0)
-				return (ret);
+				goto err;
 			p = bhp;
 			goto found;
 		}
 
 		freed_space += sizeof(*bhp) + bh_mfp->pagesize;
-		if ((ret =
-		    __memp_bhfree(dbmp, infop,
-		    bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
-			return (ret);
+		if ((ret = __memp_bhfree(dbmp,
+		    infop, bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
+			goto err;
 
 		/* Reset "aggressive" and "write_error" if we free any space. */
 		if (aggressive > 1)
@@ -689,12 +839,14 @@ next_hb:	if (bhp != NULL) {
 			if (b_lock) {
 				F_CLR(bhp, BH_EXCLUSIVE);
 				MUTEX_UNLOCK(env, bhp->mtx_buf);
+				b_lock = 0;
 			}
 		}
 		if (h_locked)
 			MUTEX_UNLOCK(env, hp->mtx_hash);
 		h_locked = 0;
 	}
+	obsolete = 0;
 	MPOOL_REGION_LOCK(env, infop);
 
 	/*
@@ -706,7 +858,15 @@ next_hb:	if (bhp != NULL) {
 		if (freed_space >= 3 * len)
 			goto alloc;
 	}
-	/* NOTREACHED */
+err:
+	if (h_locked) {
+		MUTEX_UNLOCK(env, hp->mtx_hash);
+		h_locked = 0;
+	}
+done:
+	if (snapshots != NULL)
+		__os_free(env, snapshots);
+	return (ret);
 }
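To make the new visibility test concrete, here is a small standalone C model of the check `__memp_bh_unreachable` performs, driven by the txns/vers example in the function's header comment. The `lsn` struct, `lsn_cmp()`, and `version_unreachable()` below are simplified stand-ins for `DB_LSN`, `LOG_COMPARE()`, and the real function; they are illustrative assumptions, not code from this commit.

```c
/*
 * Standalone model of the __memp_bh_unreachable visibility test.
 * lsn, lsn_cmp(), and version_unreachable() are simplified stand-ins
 * for DB_LSN, LOG_COMPARE(), and the function added in the diff above.
 */
#include <stdio.h>

struct lsn { unsigned int file, offset; };

/* Compare two LSNs the way LOG_COMPARE does: returns <0, 0, or >0. */
static int
lsn_cmp(const struct lsn *a, const struct lsn *b)
{
	if (a->file != b->file)
		return (a->file < b->file ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}

/*
 * A version whose visibility LSN is b_vlsn is unreachable when no snapshot
 * transaction's read LSN falls in the half-open window [b_vlsn, n_vlsn),
 * where n_vlsn belongs to the next newer committed version.  The snapshots
 * array must be sorted newest-first, matching the scan in the diff above.
 */
static int
version_unreachable(const struct lsn *snapshots, int n_snapshots,
    const struct lsn *b_vlsn, const struct lsn *n_vlsn)
{
	int i;

	for (i = 0;
	    i < n_snapshots && lsn_cmp(&snapshots[i], b_vlsn) >= 0; i++)
		if (lsn_cmp(&snapshots[i], n_vlsn) < 0)
			return (0);	/* This txn still sees the version. */
	return (1);			/* No txn can ever see it again. */
}

int
main(void)
{
	/* Read LSNs of the open snapshot txns, newest first. */
	const struct lsn txns[] =
	    { {3, 10}, {2, 10}, {2, 5}, {2, 1}, {1, 10} };
	/* Visibility LSNs of the version chain, newest first. */
	const struct lsn vers[] =
	    { {3, 15}, {2, 15}, {2, 14}, {2, 10}, {2, 8}, {1, 150} };
	int i;

	/* The newest version (3/15) is always kept; check the older ones. */
	for (i = 1; i < 6; i++)
		printf("version %u/%u: %s\n", vers[i].file, vers[i].offset,
		    version_unreachable(txns, 5, &vers[i], &vers[i - 1]) ?
		    "unreach" : "vis");
	return (0);
}
```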
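Under those assumptions, the sketch prints `vis` for versions 2/15, 2/10, and 1/150 and `unreach` for 2/14 and 2/8, agreeing with the table in the header comment. Two details mirror the real loop: the window is half-open, so a reader whose read LSN equals b_vlsn still sees bhp while one at or above n_vlsn reads the newer version instead; and the snapshot list is scanned newest-first, which is why the loop may stop at the first read LSN that precedes b_vlsn, and why `__memp_alloc` can treat `snapshots[n_snapshots - 1]` as the oldest reader when it updates `hp->old_reader`.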