diff options
Diffstat (limited to 'bdb/env/env_recover.c')
-rw-r--r-- | bdb/env/env_recover.c | 747 |
1 files changed, 544 insertions, 203 deletions
diff --git a/bdb/env/env_recover.c b/bdb/env/env_recover.c index bc5e4760584..fbe3b345b0d 100644 --- a/bdb/env/env_recover.c +++ b/bdb/env/env_recover.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ @@ -9,9 +9,9 @@ #ifndef lint static const char copyright[] = - "Copyright (c) 1996-2000\nSleepycat Software Inc. All rights reserved.\n"; + "Copyright (c) 1996-2002\nSleepycat Software Inc. All rights reserved.\n"; static const char revid[] = - "$Id: env_recover.c,v 11.33 2001/01/04 22:38:42 ubell Exp $"; + "$Id: env_recover.c,v 11.97 2002/08/22 17:43:22 margo Exp $"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -32,37 +32,65 @@ static const char revid[] = #endif #include "db_int.h" -#include "db_page.h" -#include "db_dispatch.h" -#include "db_am.h" -#include "log.h" -#include "txn.h" - -static float __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int)); -static int __log_earliest __P((DB_ENV *, int32_t *, DB_LSN *)); +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/rep.h" +#include "dbinc/txn.h" +#include "dbinc/db_am.h" + +static int __log_backup __P((DB_ENV *, DB_LOGC *, DB_LSN *, DB_LSN *)); +static int __log_earliest __P((DB_ENV *, DB_LOGC *, int32_t *, DB_LSN *)); +static double __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int)); /* * __db_apprec -- - * Perform recovery. + * Perform recovery. If max_lsn is non-NULL, then we are trying + * to synchronize this system up with another system that has a max + * LSN of max_lsn, so we need to roll back sufficiently far for that + * to work. See __log_backup for details. 
* - * PUBLIC: int __db_apprec __P((DB_ENV *, u_int32_t)); + * PUBLIC: int __db_apprec __P((DB_ENV *, DB_LSN *, u_int32_t)); */ int -__db_apprec(dbenv, flags) +__db_apprec(dbenv, max_lsn, flags) DB_ENV *dbenv; + DB_LSN *max_lsn; u_int32_t flags; { DBT data; - DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, open_lsn; + DB_LOGC *logc; + DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, stop_lsn; DB_TXNREGION *region; __txn_ckp_args *ckp_args; time_t now, tlow; - float nfiles; - int32_t low; - int is_thread, progress, ret; + int32_t log_size, low; + double nfiles; + int have_rec, is_thread, progress, ret, t_ret; + int (**dtab) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + size_t dtabsize; + u_int32_t hi_txn, lockid, txnid; + char *p, *pass, t1[60], t2[60]; void *txninfo; - COMPQUIET(nfiles, (float)0); + COMPQUIET(nfiles, (double)0); + + logc = NULL; + ckp_args = NULL; + dtab = NULL; + hi_txn = TXN_MAXIMUM; + lockid = DB_LOCK_INVALIDID; + txninfo = NULL; + pass = "initial"; + + /* + * XXX + * Get the log size. No locking required because we're single-threaded + * during recovery. + */ + log_size = + ((LOG *)(((DB_LOG *)dbenv->lg_handle)->reginfo.primary))->log_size; /* * Save the state of the thread flag -- we don't need it on at the @@ -70,60 +98,83 @@ __db_apprec(dbenv, flags) */ is_thread = F_ISSET(dbenv, DB_ENV_THREAD) ? 1 : 0; F_CLR(dbenv, DB_ENV_THREAD); + + /* Set in-recovery flags. */ F_SET((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER); + region = ((DB_TXNMGR *)dbenv->tx_handle)->reginfo.primary; + F_SET(region, TXN_IN_RECOVERY); + + /* Allocate a cursor for the log. */ + if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0) + goto err; /* - * If the user is specifying recover to a particular point in time, - * verify that the logs present are sufficient to do this. + * If the user is specifying recovery to a particular point in time + * or to a particular LSN, find the point to start recovery from. 
*/ ZERO_LSN(lowlsn); - if (dbenv->tx_timestamp != 0) { - if ((ret = __log_earliest(dbenv, &low, &lowlsn)) != 0) - return (ret); + if (max_lsn != NULL) { + if ((ret = __log_backup(dbenv, logc, max_lsn, &lowlsn)) != 0) + goto err; + } else if (dbenv->tx_timestamp != 0) { + if ((ret = __log_earliest(dbenv, logc, &low, &lowlsn)) != 0) + goto err; if ((int32_t)dbenv->tx_timestamp < low) { - char t1[30], t2[30]; - - strcpy(t1, ctime(&dbenv->tx_timestamp)); + (void)snprintf(t1, sizeof(t1), + "%s", ctime(&dbenv->tx_timestamp)); + if ((p = strchr(t1, '\n')) != NULL) + *p = '\0'; tlow = (time_t)low; - strcpy(t2, ctime(&tlow)); + (void)snprintf(t2, sizeof(t2), "%s", ctime(&tlow)); + if ((p = strchr(t2, '\n')) != NULL) + *p = '\0'; __db_err(dbenv, - "Invalid recovery timestamp %.*s; earliest time is %.*s", - 24, t1, 24, t2); - return (EINVAL); + "Invalid recovery timestamp %s; earliest time is %s", + t1, t2); + ret = EINVAL; + goto err; } } - /* Initialize the transaction list. */ - if ((ret = __db_txnlist_init(dbenv, &txninfo)) != 0) - return (ret); - /* * Recovery is done in three passes: * Pass #0: - * We need to find the position from which we will open files - * We need to open files beginning with the last to next - * checkpoint because we might have crashed after writing the - * last checkpoint record, but before having written out all - * the open file information. + * We need to find the position from which we will open files. + * We need to open files beginning with the earlier of the + * most recent checkpoint LSN and a checkpoint LSN before the + * recovery timestamp, if specified. We need to be before the + * most recent checkpoint LSN because we are going to collect + * information about which transactions were begun before we + * start rolling forward. 
Those that were should never be undone + * because queue cannot use LSNs to determine what operations can + * safely be aborted and it cannot rollback operations in + * transactions for which there may be records not processed + * during recovery. We need to consider earlier points in time + * in case we are recovering to a particular timestamp. * * Pass #1: - * Read forward through the log from the second to last checkpoint - * opening and closing files so that at the end of the log we have - * the "current" set of files open. + * Read forward through the log from the position found in pass 0 + * opening and closing files, and recording transactions for which + * we've seen their first record (the transaction's prev_lsn is + * 0,0). At the end of this pass, we know all transactions for + * which we've seen begins and we have the "current" set of files + * open. * * Pass #2: * Read backward through the log undoing any uncompleted TXNs. - * There are three cases: - * 1. If doing catastrophic recovery, we read to the beginning - * of the log + * There are four cases: + * 1. If doing catastrophic recovery, we read to the + * beginning of the log * 2. If we are doing normal recovery, then we have to roll - * back to the most recent checkpoint that occurs - * before the most recent checkpoint LSN, which is - * returned by __log_findckp(). + * back to the most recent checkpoint LSN. * 3. If we are recovering to a point in time, then we have * to roll back to the checkpoint whose ckp_lsn is earlier * than the specified time. __log_earliest will figure * this out for us. + * 4. If we are recovering back to a particular LSN, then + * we have to roll back to the checkpoint whose ckp_lsn + * is earlier than the max_lsn. __log_backup will figure + * that out for us. * In case 3, "uncompleted TXNs" include all those who committed * after the user's specified timestamp. * @@ -133,6 +184,14 @@ __db_apprec(dbenv, flags) * specified rollback point). 
During this pass, checkpoint * file information is ignored, and file openings and closings * are redone. + * + * ckp_lsn -- lsn of the last checkpoint or the first in the log. + * first_lsn -- the lsn where the forward passes begin. + * last_lsn -- the last lsn in the log, used for feedback + * lowlsn -- the lsn we are rolling back to, if we are recovering + * to a point in time. + * lsn -- temporary use lsn. + * stop_lsn -- the point at which forward roll should stop */ /* @@ -143,132 +202,209 @@ __db_apprec(dbenv, flags) * same amount of time (a false assumption) and then use the %-age * of the amount of log traversed to figure out how much of the * pass we've accomplished. + * + * If we can't find any log records, we're kind of done. */ +#ifdef UMRW + ZERO_LSN(last_lsn); +#endif memset(&data, 0, sizeof(data)); - if (dbenv->db_feedback != NULL && - (ret = log_get(dbenv, &last_lsn, &data, DB_LAST)) != 0) - goto out; + if ((ret = logc->get(logc, &last_lsn, &data, DB_LAST)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + else + __db_err(dbenv, "Last log record not found"); + goto err; + } + + do { + /* txnid is after rectype, which is a u_int32. */ + memcpy(&txnid, + (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid)); + + if (txnid != 0) + break; + } while ((ret = logc->get(logc, &lsn, &data, DB_PREV)) == 0); /* - * Pass #0 - * Find the second to last checkpoint in the log. This is the point - * from which we want to begin pass #1 (the open files pass). + * There are no transactions, so there is nothing to do unless + * we're recovering to an LSN. If we are, we need to proceed since + * we'll still need to do a vtruncate based on information we haven't + * yet collected. 
*/ - ckp_args = NULL; + if (ret == DB_NOTFOUND) { + ret = 0; + if (max_lsn == NULL) + goto done; + } + if (ret != 0) + goto err; - if (LF_ISSET(DB_RECOVER_FATAL)) { - if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) { - if (ret == DB_NOTFOUND) - ret = 0; - else - __db_err(dbenv, "First log record not found"); - goto out; + hi_txn = txnid; + + /* + * Pass #0 + * Find the LSN from which we begin OPENFILES. + * + * If this is a catastrophic recovery, or if no checkpoint exists + * in the log, the LSN is the first LSN in the log. + * + * Otherwise, it is the minimum of (1) the LSN in the last checkpoint + * and (2) the LSN in the checkpoint before any specified recovery + * timestamp or max_lsn. + */ + /* + * Get the first LSN in the log; it's an initial default + * even if this is not a catastrophic recovery. + */ + if ((ret = logc->get(logc, &ckp_lsn, &data, DB_FIRST)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + else + __db_err(dbenv, "First log record not found"); + goto err; + } + first_lsn = ckp_lsn; + have_rec = 1; + + if (!LF_ISSET(DB_RECOVER_FATAL)) { + if ((ret = __txn_getckp(dbenv, &ckp_lsn)) == 0 && + (ret = logc->get(logc, &ckp_lsn, &data, DB_SET)) == 0) { + /* We have a recent checkpoint. This is LSN (1). */ + if ((ret = __txn_ckp_read(dbenv, + data.data, &ckp_args)) != 0) { + __db_err(dbenv, + "Invalid checkpoint record at [%ld][%ld]", + (u_long)ckp_lsn.file, + (u_long)ckp_lsn.offset); + goto err; + } + first_lsn = ckp_args->ckp_lsn; + have_rec = 0; } - open_lsn = ckp_lsn; - } else if ((ret = - log_get(dbenv, &ckp_lsn, &data, DB_CHECKPOINT)) != 0) { + /* - * If we don't find a checkpoint, start from the beginning. - * If that fails, we're done. Note, we do not require that - * there be log records if we're performing recovery. + * If LSN (2) exists, use it if it's before LSN (1). + * (If LSN (1) doesn't exist, first_lsn is the + * beginning of the log, so will "win" this check.) 
+ * + * XXX + * In the recovery-to-a-timestamp case, lowlsn is chosen by + * __log_earliest, and is the checkpoint LSN of the + * *earliest* checkpoint in the unreclaimed log. I + * (krinsky) believe that we could optimize this by looking + * instead for the LSN of the *latest* checkpoint before + * the timestamp of interest, but I'm not sure that this + * is worth doing right now. (We have to look for lowlsn + * and low anyway, to make sure the requested timestamp is + * somewhere in the logs we have, and all that's required + * is that we pick *some* checkpoint after the beginning of + * the logs and before the timestamp.) */ -first: if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) { - if (ret == DB_NOTFOUND) - ret = 0; - else - __db_err(dbenv, "First log record not found"); - goto out; + if ((dbenv->tx_timestamp != 0 || max_lsn != NULL) && + log_compare(&lowlsn, &first_lsn) < 0) { + DB_ASSERT(have_rec == 0); + first_lsn = lowlsn; } - open_lsn = ckp_lsn; - } else if ((ret = __txn_ckp_read(dbenv, data.data, &ckp_args)) != 0) { - __db_err(dbenv, "Invalid checkpoint record at [%ld][%ld]\n", - (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset); - goto out; - } else if (IS_ZERO_LSN(ckp_args->last_ckp) || - (ret = log_get(dbenv, &ckp_args->last_ckp, &data, DB_SET)) != 0) - goto first; - else - open_lsn = ckp_args->last_ckp; + } + + /* Get the record at first_lsn if we don't have it already. 
*/ + if (!have_rec && + (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0) { + __db_err(dbenv, "Checkpoint LSN record [%ld][%ld] not found", + (u_long)first_lsn.file, (u_long)first_lsn.offset); + goto err; + } if (dbenv->db_feedback != NULL) { - if (last_lsn.file == open_lsn.file) - nfiles = (float)(last_lsn.offset - open_lsn.offset) / - dbenv->lg_max; + if (last_lsn.file == first_lsn.file) + nfiles = (double) + (last_lsn.offset - first_lsn.offset) / log_size; else - nfiles = (float)(last_lsn.file - open_lsn.file) + - (float)(dbenv->lg_max - open_lsn.offset + - last_lsn.offset) / dbenv->lg_max; + nfiles = (double)(last_lsn.file - first_lsn.file) + + (double)(log_size - first_lsn.offset + + last_lsn.offset) / log_size; /* We are going to divide by nfiles; make sure it isn't 0. */ if (nfiles == 0) - nfiles = (float)0.001; + nfiles = (double)0.001; } + /* Find a low txnid. */ + ret = 0; + do { + /* txnid is after rectype, which is a u_int32. */ + memcpy(&txnid, + (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid)); + + if (txnid != 0) + break; + } while ((ret = logc->get(logc, &lsn, &data, DB_NEXT)) == 0); + /* - * Pass #1 - * Now, ckp_lsn is either the lsn of the last checkpoint - * or the lsn of the first record in the log. Open_lsn is - * the second to last checkpoint or the beinning of the log; - * begin the open files pass from that lsn, and proceed to - * the end of the log. + * There are no transactions and we're not recovering to an LSN (see + * above), so there is nothing to do. 
*/ - lsn = open_lsn; - for (;;) { - if (dbenv->db_feedback != NULL) { - progress = (int)(33 * (__lsn_diff(&open_lsn, - &last_lsn, &lsn, dbenv->lg_max, 1) / nfiles)); - dbenv->db_feedback(dbenv, DB_RECOVER, progress); - } - ret = __db_dispatch(dbenv, - &data, &lsn, DB_TXN_OPENFILES, txninfo); - if (ret != 0 && ret != DB_TXN_CKP) - goto msgerr; - if ((ret = log_get(dbenv, &lsn, &data, DB_NEXT)) != 0) { - if (ret == DB_NOTFOUND) - break; - goto out; - } + if (ret == DB_NOTFOUND) { + ret = 0; + if (max_lsn == NULL) + goto done; } + /* Reset to the first lsn. */ + if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0) + goto err; + + /* Initialize the transaction list. */ + if ((ret = + __db_txnlist_init(dbenv, txnid, hi_txn, max_lsn, &txninfo)) != 0) + goto err; + + /* + * Pass #1 + * Run forward through the log starting at the first relevant lsn. + */ + if ((ret = __env_openfiles(dbenv, logc, + txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0) + goto err; + /* * Pass #2. * - * Before we can begin pass #2, backward roll phase, we determine how - * far back in the log to recover. If we are doing catastrophic - * recovery, then we go as far back as we have files. If we are - * doing normal recovery, we go as back to the most recent checkpoint - * that occurs before the most recent checkpoint LSN. If we are - * recovering to a point in time, then rollback to the checkpoint whose - * ckp_lsn precedes the first log record (and then roll forward to - * the appropriate timestamp in Pass #3). + * We used first_lsn to tell us how far back we need to recover, + * use it here. */ - if (LF_ISSET(DB_RECOVER_FATAL)) { - ZERO_LSN(first_lsn); - } else if (dbenv->tx_timestamp != 0) - first_lsn = lowlsn; - else - if ((ret = __log_findckp(dbenv, &first_lsn)) == DB_NOTFOUND) { - /* - * We don't require that log files exist if recovery - * was specified. 
- */ - ret = 0; - goto out; - } if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) __db_err(dbenv, "Recovery starting from [%lu][%lu]", (u_long)first_lsn.file, (u_long)first_lsn.offset); - for (ret = log_get(dbenv, &lsn, &data, DB_LAST); - ret == 0 && log_compare(&lsn, &first_lsn) > 0; - ret = log_get(dbenv, &lsn, &data, DB_PREV)) { + /* + * If we are doing client recovery, then we need to allocate + * the page-info lock table. + */ + if (max_lsn != NULL) { + if ((ret = __rep_lockpgno_init(dbenv, &dtab, &dtabsize)) != 0) + goto err; + if ((ret = dbenv->lock_id(dbenv, &lockid)) != 0) + goto err; + } + + pass = "backward"; + for (ret = logc->get(logc, &lsn, &data, DB_LAST); + ret == 0 && log_compare(&lsn, &first_lsn) >= 0; + ret = logc->get(logc, &lsn, &data, DB_PREV)) { if (dbenv->db_feedback != NULL) { - progress = 34 + (int)(33 * (__lsn_diff(&open_lsn, - &last_lsn, &lsn, dbenv->lg_max, 0) / nfiles)); + progress = 34 + (int)(33 * (__lsn_diff(&first_lsn, + &last_lsn, &lsn, log_size, 0) / nfiles)); dbenv->db_feedback(dbenv, DB_RECOVER, progress); } - ret = __db_dispatch(dbenv, - &data, &lsn, DB_TXN_BACKWARD_ROLL, txninfo); + if (max_lsn != NULL && (ret = __rep_lockpages(dbenv, + dtab, dtabsize, &lsn, NULL, NULL, lockid)) != 0) + continue; + + ret = __db_dispatch(dbenv, dbenv->recover_dtab, + dbenv->recover_dtab_size, &data, &lsn, + DB_TXN_BACKWARD_ROLL, txninfo); if (ret != 0) { if (ret != DB_TXN_CKP) goto msgerr; @@ -277,63 +413,128 @@ first: if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) { } } if (ret != 0 && ret != DB_NOTFOUND) - goto out; + goto err; /* - * Pass #3. + * Pass #3. If we are recovering to a timestamp or to an LSN, + * we need to make sure that we don't roll-forward beyond that + * point because there may be non-transactional operations (e.g., + * closes that would fail). 
The last_lsn variable is used for + * feedback calculations, but use it to set an initial stopping + * point for the forward pass, and then reset appropriately to + * derive a real stop_lsn that tells how far the forward pass + * should go. */ - for (ret = log_get(dbenv, &lsn, &data, DB_NEXT); - ret == 0; ret = log_get(dbenv, &lsn, &data, DB_NEXT)) { + pass = "forward"; + stop_lsn = last_lsn; + if (max_lsn != NULL || dbenv->tx_timestamp != 0) + stop_lsn = ((DB_TXNHEAD *)txninfo)->maxlsn; + + for (ret = logc->get(logc, &lsn, &data, DB_NEXT); + ret == 0; ret = logc->get(logc, &lsn, &data, DB_NEXT)) { + /* + * If we are recovering to a timestamp or an LSN, + * we need to make sure that we don't try to roll + * forward beyond the soon-to-be end of log. + */ + if (log_compare(&lsn, &stop_lsn) > 0) + break; + if (dbenv->db_feedback != NULL) { - progress = 67 + (int)(33 * (__lsn_diff(&open_lsn, - &last_lsn, &lsn, dbenv->lg_max, 1) / nfiles)); + progress = 67 + (int)(33 * (__lsn_diff(&first_lsn, + &last_lsn, &lsn, log_size, 1) / nfiles)); dbenv->db_feedback(dbenv, DB_RECOVER, progress); } - ret = __db_dispatch(dbenv, - &data, &lsn, DB_TXN_FORWARD_ROLL, txninfo); + ret = __db_dispatch(dbenv, dbenv->recover_dtab, + dbenv->recover_dtab_size, &data, &lsn, + DB_TXN_FORWARD_ROLL, txninfo); if (ret != 0) { if (ret != DB_TXN_CKP) goto msgerr; else ret = 0; } + } - if (ret != DB_NOTFOUND) - goto out; + if (ret != 0 && ret != DB_NOTFOUND) + goto err; /* - * Process any pages that were on the limbo list - * and move them to the free list. Do this - * before checkpointing the database. + * Process any pages that were on the limbo list and move them to + * the free list. Do this before checkpointing the database. */ - if ((ret = __db_do_the_limbo(dbenv, txninfo)) != 0) - goto out; + if ((ret = __db_do_the_limbo(dbenv, NULL, NULL, txninfo)) != 0) + goto err; - /* - * Now set the last checkpoint lsn and the current time, - * take a checkpoint, and reset the txnid. 
- */ - (void)time(&now); - region = ((DB_TXNMGR *)dbenv->tx_handle)->reginfo.primary; - region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid; - region->last_ckp = ckp_lsn; - region->time_ckp = (u_int32_t)now; + if (max_lsn == NULL) + region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid; - /* - * Take two checkpoints so that we don't re-recover any of the - * work we've already done. - */ - if ((ret = txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0) - goto out; + /* Take a checkpoint here to force any dirty data pages to disk. */ + if (dbenv->tx_timestamp != 0) { + region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn; + __log_vtruncate(dbenv, &((DB_TXNHEAD *)txninfo)->maxlsn, + &((DB_TXNHEAD *)txninfo)->ckplsn); + } - /* Now close all the db files that are open. */ - __log_close_files(dbenv); + if ((ret = dbenv->txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0) + goto err; - if ((ret = txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0) - goto out; - region->last_txnid = TXN_MINIMUM; + /* Close all the db files that are open. */ + if ((ret = __dbreg_close_files(dbenv)) != 0) + goto err; + + if (max_lsn != NULL) { + region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn; + + /* We are going to truncate, so we'd best close the cursor. */ + if (logc != NULL && (ret = logc->close(logc, 0)) != 0) + goto err; + __log_vtruncate(dbenv, + max_lsn, &((DB_TXNHEAD *)txninfo)->ckplsn); + + /* + * Now we need to open files that should be open in order for + * client processing to continue. However, since we've + * truncated the log, we need to recompute from where the + * openfiles pass should begin. + */ + if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0) + goto err; + if ((ret = logc->get(logc, &first_lsn, &data, DB_FIRST)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + else + __db_err(dbenv, "First log record not found"); + goto err; + } + if ((ret = __txn_getckp(dbenv, &first_lsn)) == 0 && + (ret = logc->get(logc, &first_lsn, &data, DB_SET)) == 0) { + /* We have a recent checkpoint. 
This is LSN (1). */ + if ((ret = __txn_ckp_read(dbenv, + data.data, &ckp_args)) != 0) { + __db_err(dbenv, + "Invalid checkpoint record at [%ld][%ld]", + (u_long)first_lsn.file, + (u_long)first_lsn.offset); + goto err; + } + first_lsn = ckp_args->ckp_lsn; + } + if ((ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0) + goto err; + if ((ret = __env_openfiles(dbenv, logc, + txninfo, &data, &first_lsn, NULL, nfiles, 1)) != 0) + goto err; + } else if (region->stat.st_nrestores == 0) + /* + * If there are no prepared transactions that need resolution, + * we need to reset the transaction ID space and log this fact. + */ + if ((ret = __txn_reset(dbenv)) != 0) + goto err; if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) { + (void)time(&now); __db_err(dbenv, "Recovery complete at %.24s", ctime(&now)); __db_err(dbenv, "%s %lx %s [%lu][%lu]", "Maximum transaction ID", @@ -344,18 +545,41 @@ first: if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) { } if (0) { -msgerr: __db_err(dbenv, "Recovery function for LSN %lu %lu failed", - (u_long)lsn.file, (u_long)lsn.offset); +msgerr: __db_err(dbenv, + "Recovery function for LSN %lu %lu failed on %s pass", + (u_long)lsn.file, (u_long)lsn.offset, pass); } -out: if (is_thread) - F_SET(dbenv, DB_ENV_THREAD); - __db_txnlist_end(dbenv, txninfo); +done: +err: if (lockid != DB_LOCK_INVALIDID) { + if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0) + ret = t_ret; + + if ((t_ret = + dbenv->lock_id_free(dbenv, lockid)) != 0 && ret == 0) + ret = t_ret; + } + + if (logc != NULL && (t_ret = logc->close(logc, 0)) != 0 && ret == 0) + ret = t_ret; + + if (txninfo != NULL) + __db_txnlist_end(dbenv, txninfo); + + if (dtab != NULL) + __os_free(dbenv, dtab); + if (ckp_args != NULL) - __os_free(ckp_args, sizeof(*ckp_args)); - F_CLR((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER); + __os_free(dbenv, ckp_args); dbenv->tx_timestamp = 0; + + /* Restore the state of the thread flag, clear in-recovery flags. 
*/ + if (is_thread) + F_SET(dbenv, DB_ENV_THREAD); + F_CLR((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER); + F_CLR(region, TXN_IN_RECOVERY); + return (ret); } @@ -365,13 +589,13 @@ out: if (is_thread) * we are moving backward, we are computing high - current. max is * the number of bytes per logfile. */ -static float +static double __lsn_diff(low, high, current, max, is_forward) DB_LSN *low, *high, *current; u_int32_t max; int is_forward; { - float nf; + double nf; /* * There are three cases in each direction. If you are in the @@ -382,27 +606,78 @@ __lsn_diff(low, high, current, max, is_forward) */ if (is_forward) { if (current->file == low->file) - nf = (float)(current->offset - low->offset) / max; + nf = (double)(current->offset - low->offset) / max; else if (current->offset < low->offset) - nf = (float)(current->file - low->file - 1) + - (float)(max - low->offset + current->offset) / max; + nf = (double)(current->file - low->file - 1) + + (double)(max - low->offset + current->offset) / max; else - nf = (float)(current->file - low->file) + - (float)(current->offset - low->offset) / max; + nf = (double)(current->file - low->file) + + (double)(current->offset - low->offset) / max; } else { if (current->file == high->file) - nf = (float)(high->offset - current->offset) / max; + nf = (double)(high->offset - current->offset) / max; else if (current->offset > high->offset) - nf = (float)(high->file - current->file - 1) + - (float)(max - current->offset + high->offset) / max; + nf = (double)(high->file - current->file - 1) + + (double) + (max - current->offset + high->offset) / max; else - nf = (float)(high->file - current->file) + - (float)(high->offset - current->offset) / max; + nf = (double)(high->file - current->file) + + (double)(high->offset - current->offset) / max; } return (nf); } /* + * __log_backup -- + * + * This is used to find the earliest log record to process when a client + * is trying to sync up with a master whose max LSN is less than this + * client's 
max lsn; we want to roll back everything after that. + * + * Find the latest checkpoint whose ckp_lsn is less than or equal to the max lsn. + */ +static int +__log_backup(dbenv, logc, max_lsn, start_lsn) + DB_ENV *dbenv; + DB_LOGC *logc; + DB_LSN *max_lsn, *start_lsn; +{ + DB_LSN lsn; + DBT data; + __txn_ckp_args *ckp_args; + int ret; + + memset(&data, 0, sizeof(data)); + ckp_args = NULL; + + /* + * Follow checkpoints through the log until we find one with + * a ckp_lsn less than or equal to max_lsn. + */ + if ((ret = __txn_getckp(dbenv, &lsn)) != 0) + goto err; + while ((ret = logc->get(logc, &lsn, &data, DB_SET)) == 0) { + if ((ret = __txn_ckp_read(dbenv, data.data, &ckp_args)) != 0) + return (ret); + if (log_compare(&ckp_args->ckp_lsn, max_lsn) <= 0) { + *start_lsn = ckp_args->ckp_lsn; + break; + } + + lsn = ckp_args->prev_lsn; + if (IS_ZERO_LSN(lsn)) + break; + __os_free(dbenv, ckp_args); + } + + if (ckp_args != NULL) + __os_free(dbenv, ckp_args); +err: if (IS_ZERO_LSN(*start_lsn) && (ret == 0 || ret == DB_NOTFOUND)) + ret = logc->get(logc, start_lsn, &data, DB_FIRST); + return (ret); +} + +/* * __log_earliest -- * * Return the earliest recovery point for the log files present. The @@ -410,8 +685,9 @@ __lsn_diff(low, high, current, max, is_forward) * whose checkpoint LSN is greater than the first LSN we process. */ static int -__log_earliest(dbenv, lowtime, lowlsn) +__log_earliest(dbenv, logc, lowtime, lowlsn) DB_ENV *dbenv; + DB_LOGC *logc; int32_t *lowtime; DB_LSN *lowlsn; { @@ -427,19 +703,17 @@ __log_earliest(dbenv, lowtime, lowlsn) * record whose ckp_lsn is greater than first_lsn. 
*/ - for (ret = log_get(dbenv, &first_lsn, &data, DB_FIRST); - ret == 0; ret = log_get(dbenv, &lsn, &data, DB_NEXT)) { - if (ret != 0) - break; + for (ret = logc->get(logc, &first_lsn, &data, DB_FIRST); + ret == 0; ret = logc->get(logc, &lsn, &data, DB_NEXT)) { memcpy(&rectype, data.data, sizeof(rectype)); - if (rectype != DB_txn_ckp) + if (rectype != DB___txn_ckp) continue; if ((ret = __txn_ckp_read(dbenv, data.data, &ckpargs)) == 0) { cmp = log_compare(&ckpargs->ckp_lsn, &first_lsn); *lowlsn = ckpargs->ckp_lsn; *lowtime = ckpargs->timestamp; - __os_free(ckpargs, 0); + __os_free(dbenv, ckpargs); if (cmp >= 0) break; } @@ -447,3 +721,70 @@ __log_earliest(dbenv, lowtime, lowlsn) return (ret); } + +/* + * __env_openfiles -- + * Perform the pass of recovery that opens files. This is used + * both during regular recovery and an initial call to txn_recover (since + * we need files open in order to abort prepared, but not yet committed + * transactions). + * + * See the comments in db_apprec for a detailed description of the + * various recovery passes. + * + * If we are not doing feedback processing (i.e., we are doing txn_recover + * processing and in_recovery is zero), then last_lsn can be NULL. + * + * PUBLIC: int __env_openfiles __P((DB_ENV *, DB_LOGC *, + * PUBLIC: void *, DBT *, DB_LSN *, DB_LSN *, double, int)); + */ +int +__env_openfiles(dbenv, logc, txninfo, + data, open_lsn, last_lsn, nfiles, in_recovery) + DB_ENV *dbenv; + DB_LOGC *logc; + void *txninfo; + DBT *data; + DB_LSN *open_lsn, *last_lsn; + int in_recovery; + double nfiles; +{ + DB_LSN lsn; + u_int32_t log_size; + int progress, ret; + + /* + * XXX + * Get the log size. No locking required because we're single-threaded + * during recovery. 
+ */ + log_size = + ((LOG *)(((DB_LOG *)dbenv->lg_handle)->reginfo.primary))->log_size; + + lsn = *open_lsn; + for (;;) { + if (in_recovery && dbenv->db_feedback != NULL) { + DB_ASSERT(last_lsn != NULL); + progress = (int)(33 * (__lsn_diff(open_lsn, + last_lsn, &lsn, log_size, 1) / nfiles)); + dbenv->db_feedback(dbenv, DB_RECOVER, progress); + } + ret = __db_dispatch(dbenv, + dbenv->recover_dtab, dbenv->recover_dtab_size, data, &lsn, + in_recovery ? DB_TXN_OPENFILES : DB_TXN_POPENFILES, + txninfo); + if (ret != 0 && ret != DB_TXN_CKP) { + __db_err(dbenv, + "Recovery function for LSN %lu %lu failed", + (u_long)lsn.file, (u_long)lsn.offset); + break; + } + if ((ret = logc->get(logc, &lsn, data, DB_NEXT)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + break; + } + } + + return (ret); +} |