summaryrefslogtreecommitdiff
path: root/bdb/btree/bt_verify.c
diff options
context:
space:
mode:
authorunknown <tim@threads.polyesthetic.msg>2001-03-04 19:42:05 -0500
committerunknown <tim@threads.polyesthetic.msg>2001-03-04 19:42:05 -0500
commitec6ae091617bdfdca9e65e8d3e65b950d234f676 (patch)
tree9dd732e08dba156ee3d7635caedc0dc3107ecac6 /bdb/btree/bt_verify.c
parent87d70fb598105b64b538ff6b81eef9da626255b1 (diff)
downloadmariadb-git-ec6ae091617bdfdca9e65e8d3e65b950d234f676.tar.gz
Import changeset
Diffstat (limited to 'bdb/btree/bt_verify.c')
-rw-r--r--bdb/btree/bt_verify.c2237
1 files changed, 2237 insertions, 0 deletions
diff --git a/bdb/btree/bt_verify.c b/bdb/btree/bt_verify.c
new file mode 100644
index 00000000000..9f8647e7e2a
--- /dev/null
+++ b/bdb/btree/bt_verify.c
@@ -0,0 +1,2237 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ *
+ * $Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_verify.h"
+#include "btree.h"
+
+static int __bam_safe_getdata __P((DB *, PAGE *, u_int32_t, int, DBT *, int *));
+static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+static int __bam_vrfy_treeorder __P((DB *, db_pgno_t, PAGE *, BINTERNAL *,
+ BINTERNAL *, int (*)(DB *, const DBT *, const DBT *), u_int32_t));
+static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+
+#define OKFLAGS (DB_AGGRESSIVE | DB_NOORDERCHK | DB_SALVAGE)
+
+/*
+ * __bam_vrfy_meta --
+ * Verify the btree-specific part of a metadata page.
+ *
+ * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ int isbad, t_ret, ret;
+ db_indx_t ovflsize;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ isbad = 0;
+
+ /*
+ * If VRFY_INCOMPLETE is not set, then we didn't come through
+ * __db_vrfy_pagezero and didn't incompletely
+ * check this page--we haven't checked it at all.
+ * Thus we need to call __db_vrfy_meta and check the common fields.
+ *
+ * If VRFY_INCOMPLETE is set, we've already done all the same work
+ * in __db_vrfy_pagezero, so skip the check.
+ */
+ if (!F_ISSET(pip, VRFY_INCOMPLETE) &&
+ (ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /* bt_minkey: must be >= 2; must produce sensible ovflsize */
+
+ /* avoid division by zero */
+ ovflsize = meta->minkey > 0 ?
+ B_MINKEY_TO_OVFLSIZE(meta->minkey, dbp->pgsize) : 0;
+
+ if (meta->minkey < 2 ||
+ ovflsize > B_MINKEY_TO_OVFLSIZE(DEFMINKEYPAGE, dbp->pgsize)) {
+ pip->bt_minkey = 0;
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Nonsensical bt_minkey value %lu on metadata page %lu",
+ (u_long)meta->minkey, (u_long)pgno));
+ } else
+ pip->bt_minkey = meta->minkey;
+
+ /* bt_maxkey: no constraints (XXX: right?) */
+ pip->bt_maxkey = meta->maxkey;
+
+ /* re_len: no constraints on this (may be zero or huge--we make rope) */
+ pip->re_len = meta->re_len;
+
+ /*
+ * The root must not be current page or 0 and it must be within
+ * database. If this metadata page is the master meta data page
+ * of the file, then the root page had better be page 1.
+ */
+ pip->root = 0;
+ if (meta->root == PGNO_INVALID
+ || meta->root == pgno || !IS_VALID_PGNO(meta->root) ||
+ (pgno == PGNO_BASE_MD && meta->root != 1)) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Nonsensical root page %lu on metadata page %lu",
+ (u_long)meta->root, (u_long)vdp->last_pgno));
+ } else
+ pip->root = meta->root;
+
+ /* Flags. */
+ if (F_ISSET(&meta->dbmeta, BTM_RENUMBER))
+ F_SET(pip, VRFY_IS_RRECNO);
+
+ if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) {
+ /*
+ * If this is a master db meta page, it had better not have
+ * duplicates.
+ */
+ if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Btree metadata page %lu has both duplicates and multiple databases",
+ (u_long)pgno));
+ }
+ F_SET(pip, VRFY_HAS_SUBDBS);
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_DUP))
+ F_SET(pip, VRFY_HAS_DUPS);
+ if (F_ISSET(&meta->dbmeta, BTM_DUPSORT))
+ F_SET(pip, VRFY_HAS_DUPSORT);
+ if (F_ISSET(&meta->dbmeta, BTM_RECNUM))
+ F_SET(pip, VRFY_HAS_RECNUMS);
+ if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((dbp->dbenv,
+ "Btree metadata page %lu illegally has both recnums and dups",
+ (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_RECNO)) {
+ F_SET(pip, VRFY_IS_RECNO);
+ dbp->type = DB_RECNO;
+ } else if (F_ISSET(pip, VRFY_IS_RRECNO)) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Metadata page %lu has renumber flag set but is not recno",
+ (u_long)pgno));
+ }
+
+ if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((dbp->dbenv,
+ "Recno metadata page %lu specifies duplicates",
+ (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN))
+ F_SET(pip, VRFY_IS_FIXEDLEN);
+ else if (pip->re_len > 0) {
+ /*
+ * It's wrong to have an re_len if it's not a fixed-length
+ * database
+ */
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "re_len of %lu in non-fixed-length database",
+ (u_long)pip->re_len));
+ }
+
+ /*
+ * We do not check that the rest of the page is 0, because it may
+ * not be and may still be correct.
+ */
+
+err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ram_vrfy_leaf --
+ * Verify a recno leaf page.
+ *
+ * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ int ret, t_ret, isbad;
+ u_int32_t re_len_guess, len;
+
+ isbad = 0;
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if ((ret = __db_fchk(dbp->dbenv,
+ "__ram_vrfy_leaf", flags, OKFLAGS)) != 0)
+ goto err;
+
+ if (TYPE(h) != P_LRECNO) {
+ /* We should not have been called. */
+ TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_leaf", pgno, TYPE(h));
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * Verify inp[]. Return immediately if it returns DB_VERIFY_BAD;
+ * further checks are dangerous.
+ */
+ if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((dbp->dbenv,
+ "Recno database has dups on page %lu", (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Walk through inp and see if the lengths of all the records are the
+ * same--if so, this may be a fixed-length database, and we want to
+ * save off this value. We know inp to be safe if we've gotten this
+ * far.
+ */
+ re_len_guess = 0;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ bk = GET_BKEYDATA(h, i);
+ /* KEYEMPTY. Go on. */
+ if (B_DISSET(bk->type))
+ continue;
+ if (bk->type == B_OVERFLOW)
+ len = ((BOVERFLOW *)bk)->tlen;
+ else if (bk->type == B_KEYDATA)
+ len = bk->len;
+ else {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Nonsensical type for item %lu, page %lu",
+ (u_long)i, (u_long)pgno));
+ continue;
+ }
+ if (re_len_guess == 0)
+ re_len_guess = len;
+
+ /*
+ * Is this item's len the same as the last one's? If not,
+ * reset to 0 and break--we don't have a single re_len.
+ * Otherwise, go on to the next item.
+ */
+ if (re_len_guess != len) {
+ re_len_guess = 0;
+ break;
+ }
+ }
+ pip->re_len = re_len_guess;
+
+ /* Save off record count. */
+ pip->rec_cnt = NUM_ENT(h);
+
+err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __bam_vrfy --
+ * Verify a btree leaf or internal page.
+ *
+ * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+
+ isbad = 0;
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ break;
+ default:
+ TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy", pgno, TYPE(h));
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * The record count is, on internal pages, stored in an overloaded
+ * next_pgno field. Save it off; we'll verify it when we check
+ * overall database structure. We could overload the field
+ * in VRFY_PAGEINFO, too, but this seems gross, and space
+ * is not at such a premium.
+ */
+ pip->rec_cnt = RE_NREC(h);
+
+ /*
+ * Verify inp[].
+ */
+ if (TYPE(h) == P_IRECNO) {
+ if ((ret = __ram_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+ } else if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ EPRINT((dbp->dbenv,
+ "item order check on page %lu unsafe: skipping",
+ (u_long)pgno));
+ } else if (!LF_ISSET(DB_NOORDERCHK) && (ret =
+ __bam_vrfy_itemorder(dbp, vdp, h, pgno, 0, 0, 0, flags)) != 0) {
+ /*
+ * We know that the elements of inp are reasonable.
+ *
+ * Check that elements fall in the proper order.
+ */
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __ram_vrfy_inp --
+ * Verify that all entries in a P_IRECNO inp[] array are reasonable,
+ * and count them. Note that P_LRECNO uses __bam_vrfy_inp;
+ * P_IRECNOs are a special, and simpler, case, since they have
+ * RINTERNALs rather than BKEYDATA/BINTERNALs.
+ */
+static int
+__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ RINTERNAL *ri;
+ VRFY_CHILDINFO child;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+ u_int32_t himark, i, offset, nentries;
+ u_int8_t *pagelayout, *p;
+
+ isbad = 0;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ nentries = 0;
+ pagelayout = NULL;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_IRECNO) {
+ TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_inp", pgno, TYPE(h));
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ himark = dbp->pgsize;
+ if ((ret =
+ __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pagelayout)) != 0)
+ goto err;
+ memset(pagelayout, 0, dbp->pgsize);
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if ((u_int8_t *)h->inp + i >= (u_int8_t *)h + himark) {
+ EPRINT((dbp->dbenv,
+ "Page %lu entries listing %lu overlaps data",
+ (u_long)pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ offset = h->inp[i];
+ /*
+ * Check that the item offset is reasonable: it points
+ * somewhere after the inp array and before the end of the
+ * page.
+ */
+ if (offset <= (u_int32_t)((u_int8_t *)h->inp + i -
+ (u_int8_t *)h) ||
+ offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Bad offset %lu at page %lu index %lu",
+ (u_long)offset, (u_long)pgno, (u_long)i));
+ continue;
+ }
+
+ /* Update the high-water mark (what HOFFSET should be) */
+ if (offset < himark)
+ himark = offset;
+
+ nentries++;
+
+ /* Make sure this RINTERNAL is not multiply referenced. */
+ ri = GET_RINTERNAL(h, i);
+ if (pagelayout[offset] == 0) {
+ pagelayout[offset] = 1;
+ child.pgno = ri->pgno;
+ child.type = V_RECNO;
+ child.nrecs = ri->nrecs;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ } else {
+ EPRINT((dbp->dbenv,
+ "RINTERNAL structure at offset %lu, page %lu referenced twice",
+ (u_long)offset, (u_long)pgno));
+ isbad = 1;
+ }
+ }
+
+ for (p = pagelayout + himark;
+ p < pagelayout + dbp->pgsize;
+ p += RINTERNAL_SIZE)
+ if (*p != 1) {
+ EPRINT((dbp->dbenv,
+ "Gap between items at offset %lu, page %lu",
+ (u_long)(p - pagelayout), (u_long)pgno));
+ isbad = 1;
+ }
+
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu",
+ (u_long)(HOFFSET(h)), (u_long)himark));
+ isbad = 1;
+ }
+
+ *nentriesp = nentries;
+
+err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pagelayout != NULL)
+ __os_free(pagelayout, dbp->pgsize);
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_inp --
+ * Verify that all entries in inp[] array are reasonable;
+ * count them.
+ */
+static int
+__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ VRFY_CHILDINFO child;
+ VRFY_PAGEINFO *pip;
+ int isbad, initem, isdupitem, ret, t_ret;
+ u_int32_t himark, offset; /* These would be db_indx_ts but for algnmt.*/
+ u_int32_t i, endoff, nentries;
+ u_int8_t *pagelayout;
+
+ isbad = isdupitem = 0;
+ nentries = 0;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ break;
+ default:
+ /*
+ * In the salvager, we might call this from a page which
+ * we merely suspect is a btree page. Otherwise, it
+ * shouldn't get called--if it is, that's a verifier bug.
+ */
+ if (LF_ISSET(DB_SALVAGE))
+ break;
+ TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_inp", pgno, TYPE(h));
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Loop through inp[], the array of items, until we either
+ * run out of entries or collide with the data. Keep track
+ * of h_offset in himark.
+ *
+ * For each element in inp[i], make sure it references a region
+ * that starts after the end of the inp array (as defined by
+ * NUM_ENT(h)), ends before the beginning of the page, doesn't
+ * overlap any other regions, and doesn't have a gap between
+ * it and the region immediately after it.
+ */
+ himark = dbp->pgsize;
+ if ((ret = __os_malloc(dbp->dbenv,
+ dbp->pgsize, NULL, &pagelayout)) != 0)
+ goto err;
+ memset(pagelayout, 0, dbp->pgsize);
+ for (i = 0; i < NUM_ENT(h); i++) {
+
+ ret = __db_vrfy_inpitem(dbp,
+ h, pgno, i, 1, flags, &himark, &offset);
+ if (ret == DB_VERIFY_BAD) {
+ isbad = 1;
+ continue;
+ } else if (ret == DB_VERIFY_FATAL) {
+ isbad = 1;
+ goto err;
+ } else if (ret != 0)
+ DB_ASSERT(0);
+
+ /*
+ * We now have a plausible beginning for the item, and we know
+ * its length is safe.
+ *
+ * Mark the beginning and end in pagelayout so we can make sure
+ * items have no overlaps or gaps.
+ */
+ bk = GET_BKEYDATA(h, i);
+#define ITEM_BEGIN 1
+#define ITEM_END 2
+ if (pagelayout[offset] == 0)
+ pagelayout[offset] = ITEM_BEGIN;
+ else if (pagelayout[offset] == ITEM_BEGIN) {
+ /*
+ * Having two inp entries that point at the same patch
+ * of page is legal if and only if the page is
+ * a btree leaf and they're onpage duplicate keys--
+ * that is, if (i % P_INDX) == 0.
+ */
+ if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) {
+ /* Flag for later. */
+ F_SET(pip, VRFY_HAS_DUPS);
+
+ /* Bump up nentries so we don't undercount. */
+ nentries++;
+
+ /*
+ * We'll check to make sure the end is
+ * equal, too.
+ */
+ isdupitem = 1;
+ } else {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Duplicated item %lu on page %lu",
+ (u_long)i, (u_long)pgno));
+ }
+ }
+
+ /*
+ * Mark the end. Its location varies with the page type
+ * and the item type.
+ *
+ * If the end already has a sign other than 0, do nothing--
+ * it's an overlap that we'll catch later.
+ */
+ switch(B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ if (TYPE(h) == P_IBTREE)
+ /* It's a BINTERNAL. */
+ endoff = offset + BINTERNAL_SIZE(bk->len) - 1;
+ else
+ endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
+ break;
+ case B_DUPLICATE:
+ /*
+ * Flag that we have dups; we'll check whether
+ * that's okay during the structure check.
+ */
+ F_SET(pip, VRFY_HAS_DUPS);
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ /*
+ * Overflow entries on internal pages are stored
+ * as the _data_ of a BINTERNAL; overflow entries
+ * on leaf pages are stored as the entire entry.
+ */
+ endoff = offset +
+ ((TYPE(h) == P_IBTREE) ?
+ BINTERNAL_SIZE(BOVERFLOW_SIZE) :
+ BOVERFLOW_SIZE) - 1;
+ break;
+ default:
+ /*
+ * We'll complain later; for now, just mark
+ * a minimum.
+ */
+ endoff = offset + BKEYDATA_SIZE(0) - 1;
+ break;
+ }
+
+ /*
+ * If this is an onpage duplicate key we've seen before,
+ * the end had better coincide too.
+ */
+ if (isdupitem && pagelayout[endoff] != ITEM_END) {
+ EPRINT((dbp->dbenv,
+ "Duplicated item %lu on page %lu",
+ (u_long)i, (u_long)pgno));
+ isbad = 1;
+ } else if (pagelayout[endoff] == 0)
+ pagelayout[endoff] = ITEM_END;
+ isdupitem = 0;
+
+ /*
+ * There should be no deleted items in a quiescent tree,
+ * except in recno.
+ */
+ if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Item %lu on page %lu marked deleted",
+ (u_long)i, (u_long)pgno));
+ }
+
+ /*
+ * Check the type and such of bk--make sure it's reasonable
+ * for the pagetype.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ /*
+ * This is a normal, non-overflow BKEYDATA or BINTERNAL.
+ * The only thing to check is the len, and that's
+ * already been done.
+ */
+ break;
+ case B_DUPLICATE:
+ if (TYPE(h) == P_IBTREE) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Duplicate page referenced by internal btree page %lu at item %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ } else if (TYPE(h) == P_LRECNO) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Duplicate page referenced by recno page %lu at item %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ }
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ bo = (TYPE(h) == P_IBTREE) ?
+ (BOVERFLOW *)(((BINTERNAL *)bk)->data) :
+ (BOVERFLOW *)bk;
+
+ if (B_TYPE(bk->type) == B_OVERFLOW)
+ /* Make sure tlen is reasonable. */
+ if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Impossible tlen %lu, item %lu, page %lu",
+ (u_long)bo->tlen, (u_long)i,
+ (u_long)pgno));
+ /* Don't save as a child. */
+ break;
+ }
+
+ if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
+ bo->pgno == PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Offpage item %lu, page %lu has bad pgno",
+ (u_long)i, (u_long)pgno));
+ /* Don't save as a child. */
+ break;
+ }
+
+ child.pgno = bo->pgno;
+ child.type = (B_TYPE(bk->type) == B_OVERFLOW ?
+ V_OVERFLOW : V_DUPLICATE);
+ child.tlen = bo->tlen;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ break;
+ default:
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Item %lu on page %lu of invalid type %lu",
+ (u_long)i, (u_long)pgno));
+ break;
+ }
+ }
+
+ /*
+ * Now, loop through and make sure the items are contiguous and
+ * non-overlapping.
+ */
+ initem = 0;
+ for (i = himark; i < dbp->pgsize; i++)
+ if (initem == 0)
+ switch (pagelayout[i]) {
+ case 0:
+ /* May be just for alignment. */
+ if (i != ALIGN(i, sizeof(u_int32_t)))
+ continue;
+
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Gap between items, page %lu offset %lu",
+ (u_long)pgno, (u_long)i));
+ /* Find the end of the gap */
+ for ( ; pagelayout[i + 1] == 0 &&
+ (size_t)(i + 1) < dbp->pgsize; i++)
+ ;
+ break;
+ case ITEM_BEGIN:
+ /* We've found an item. Check its alignment. */
+ if (i != ALIGN(i, sizeof(u_int32_t))) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Offset %lu page %lu unaligned",
+ (u_long)i, (u_long)pgno));
+ }
+ initem = 1;
+ nentries++;
+ break;
+ case ITEM_END:
+ /*
+ * We've hit the end of an item even though
+ * we don't think we're in one; must
+ * be an overlap.
+ */
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Overlapping items, page %lu offset %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ default:
+ /* Should be impossible. */
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+ else
+ switch (pagelayout[i]) {
+ case 0:
+ /* In the middle of an item somewhere. Okay. */
+ break;
+ case ITEM_END:
+ /* End of an item; switch to out-of-item mode.*/
+ initem = 0;
+ break;
+ case ITEM_BEGIN:
+ /*
+ * Hit a second item beginning without an
+ * end. Overlap.
+ */
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Overlapping items, page %lu offset %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ }
+
+ (void)__os_free(pagelayout, dbp->pgsize);
+
+ /* Verify HOFFSET. */
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu",
+ (u_long)HOFFSET(h), (u_long)himark));
+ isbad = 1;
+ }
+
+err: if (nentriesp != NULL)
+ *nentriesp = nentries;
+
+ if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_itemorder --
+ * Make sure the items on a page sort correctly.
+ *
+ * Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are
+ * reasonable; be sure that __bam_vrfy_inp has been called first.
+ *
+ * If ovflok is set, it also assumes that overflow page chains
+ * hanging off the current page have been sanity-checked, and so we
+ * can use __bam_cmp to verify their ordering. If it is not set,
+ * and we run into an overflow page, carp and return DB_VERIFY_BAD;
+ * we shouldn't be called if any exist.
+ *
+ * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, PAGE *,
+ * PUBLIC: db_pgno_t, u_int32_t, int, int, u_int32_t));
+ */
+int
+__bam_vrfy_itemorder(dbp, vdp, h, pgno, nentries, ovflok, hasdups, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t nentries;
+ int ovflok, hasdups;
+ u_int32_t flags;
+{
+ DBT dbta, dbtb, dup1, dup2, *p1, *p2, *tmp;
+ BTREE *bt;
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ int cmp, freedup1, freedup2, isbad, ret, t_ret;
+ int (*dupfunc) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ void *buf1, *buf2, *tmpbuf;
+
+ /*
+ * We need to work in the ORDERCHKONLY environment where we might
+ * not have a pip, but we also may need to work in contexts where
+ * NUM_ENT isn't safe.
+ */
+ if (vdp != NULL) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ nentries = pip->entries;
+ } else
+ pip = NULL;
+
+ ret = isbad = 0;
+ bo = NULL; /* Shut up compiler. */
+
+ memset(&dbta, 0, sizeof(DBT));
+ F_SET(&dbta, DB_DBT_REALLOC);
+
+ memset(&dbtb, 0, sizeof(DBT));
+ F_SET(&dbtb, DB_DBT_REALLOC);
+
+ buf1 = buf2 = NULL;
+
+ DB_ASSERT(!LF_ISSET(DB_NOORDERCHK));
+
+ dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+ if (TYPE(h) == P_LDUP)
+ func = dupfunc;
+ else {
+ func = __bam_defcmp;
+ if (dbp->bt_internal != NULL) {
+ bt = (BTREE *)dbp->bt_internal;
+ if (bt->bt_compare != NULL)
+ func = bt->bt_compare;
+ }
+ }
+
+ /*
+ * We alternate our use of dbta and dbtb so that we can walk
+ * through the page key-by-key without copying a dbt twice.
+ * p1 is always the dbt for index i - 1, and p2 for index i.
+ */
+ p1 = &dbta;
+ p2 = &dbtb;
+
+ /*
+ * Loop through the entries. nentries ought to contain the
+ * actual count, and so is a safe way to terminate the loop; whether
+ * we inc. by one or two depends on whether we're a leaf page--
+ * on a leaf page, we care only about keys. On internal pages
+ * and LDUP pages, we want to check the order of all entries.
+ *
+ * Note that on IBTREE pages, we start with item 1, since item
+ * 0 doesn't get looked at by __bam_cmp.
+ */
+ for (i = (TYPE(h) == P_IBTREE) ? 1 : 0; i < nentries;
+ i += (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX) {
+ /*
+ * Put key i-1, now in p2, into p1, by swapping DBTs and bufs.
+ */
+ tmp = p1;
+ p1 = p2;
+ p2 = tmp;
+ tmpbuf = buf1;
+ buf1 = buf2;
+ buf2 = tmpbuf;
+
+ /*
+ * Get key i into p2.
+ */
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(h, i);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(bi->data);
+ goto overflow;
+ } else {
+ p2->data = bi->data;
+ p2->size = bi->len;
+ }
+
+ /*
+ * The leftmost key on an internal page must be
+ * len 0, since it's just a placeholder and
+ * automatically sorts less than all keys.
+ *
+ * XXX
+ * This criterion does not currently hold!
+ * See todo list item #1686. Meanwhile, it's harmless
+ * to just not check for it.
+ */
+#if 0
+ if (i == 0 && bi->len != 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Lowest key on internal page %lu of nonzero length",
+ (u_long)pgno));
+ }
+#endif
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ bk = GET_BKEYDATA(h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ goto overflow;
+ } else {
+ p2->data = bk->data;
+ p2->size = bk->len;
+ }
+ break;
+ default:
+ /*
+ * This means our caller screwed up and sent us
+ * an inappropriate page.
+ */
+ TYPE_ERR_PRINT(dbp->dbenv,
+ "__bam_vrfy_itemorder", pgno, TYPE(h))
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (0) {
+ /*
+ * If ovflok != 1, we can't safely go chasing
+ * overflow pages with the normal routines now;
+ * they might be unsafe or nonexistent. Mark this
+ * page as incomplete and return.
+ *
+ * Note that we don't need to worry about freeing
+ * buffers, since they can't have been allocated
+ * if overflow items are unsafe.
+ */
+overflow: if (!ovflok) {
+ F_SET(pip, VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * Overflow items are safe to chase. Do so.
+ * Fetch the overflow item into p2->data,
+ * NULLing it or reallocing it as appropriate.
+ *
+ * (We set p2->data to buf2 before the call
+ * so we're sure to realloc if we can and if p2
+ * was just pointing at a non-overflow item.)
+ */
+ p2->data = buf2;
+ if ((ret = __db_goff(dbp,
+ p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Error %lu in fetching overflow item %lu, page %lu",
+ (u_long)ret, (u_long)i, (u_long)pgno));
+ }
+ /* In case it got realloc'ed and thus changed. */
+ buf2 = p2->data;
+ }
+
+ /* Compare with the last key. */
+ if (p1->data != NULL && p2->data != NULL) {
+ cmp = func(dbp, p1, p2);
+
+ /* comparison succeeded */
+ if (cmp > 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Out-of-order key, page %lu item %lu",
+ (u_long)pgno, (u_long)i));
+ /* proceed */
+ } else if (cmp == 0) {
+ /*
+ * If they compared equally, this
+ * had better be a (sub)database with dups.
+ * Mark it so we can check during the
+ * structure check.
+ */
+ if (pip != NULL)
+ F_SET(pip, VRFY_HAS_DUPS);
+ else if (hasdups == 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Database with no duplicates has duplicated keys on page %lu",
+ (u_long)pgno));
+ }
+
+ /*
+ * If we're a btree leaf, check to see
+ * if the data items of these on-page dups are
+ * in sorted order. If not, flag this, so
+ * that we can make sure during the
+ * structure checks that the DUPSORT flag
+ * is unset.
+ *
+ * At this point i points to a duplicate key.
+ * Compare the datum before it (same key)
+ * to the datum after it, i.e. i-1 to i+1.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ /*
+ * Unsafe; continue and we'll pick
+ * up the bogus nentries later.
+ */
+ if (i + 1 >= (db_indx_t)nentries)
+ continue;
+
+ /*
+ * We don't bother with clever memory
+ * management with on-page dups,
+ * as it's only really a big win
+ * in the overflow case, and overflow
+ * dups are probably (?) rare.
+ */
+ if (((ret = __bam_safe_getdata(dbp,
+ h, i - 1, ovflok, &dup1,
+ &freedup1)) != 0) ||
+ ((ret = __bam_safe_getdata(dbp,
+ h, i + 1, ovflok, &dup2,
+ &freedup2)) != 0))
+ goto err;
+
+ /*
+ * If either of the data are NULL,
+ * it's because they're overflows and
+ * it's not safe to chase them now.
+ * Mark an incomplete and return.
+ */
+ if (dup1.data == NULL ||
+ dup2.data == NULL) {
+ DB_ASSERT(!ovflok);
+ F_SET(pip, VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * If the dups are out of order,
+ * flag this. It's not an error
+ * until we do the structure check
+ * and see whether DUPSORT is set.
+ */
+ if (dupfunc(dbp, &dup1, &dup2) > 0)
+ F_SET(pip, VRFY_DUPS_UNSORTED);
+
+ if (freedup1)
+ __os_free(dup1.data, 0);
+ if (freedup2)
+ __os_free(dup2.data, 0);
+ }
+ }
+ }
+ }
+
+err: if (pip != NULL &&
+ ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) && ret == 0)
+ ret = t_ret;
+
+ if (buf1 != NULL)
+ __os_free(buf1, 0);
+ if (buf2 != NULL)
+ __os_free(buf2, 0);
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_structure --
+ * Verify the tree structure of a btree database (including the master
+ * database containing subdbs).
+ *
+ * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_vrfy_structure(dbp, vdp, meta_pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ u_int32_t flags;
+{
+ DB *pgset;
+ VRFY_PAGEINFO *mip, *rip;
+ db_pgno_t root, p;
+ int t_ret, ret;
+ u_int32_t nrecs, level, relen, stflags;
+
+ mip = rip = 0;
+ pgset = vdp->pgset;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0)
+ return (ret);
+
+ if ((ret = __db_vrfy_pgset_get(pgset, meta_pgno, (int *)&p)) != 0)
+ goto err;
+ if (p != 0) {
+ EPRINT((dbp->dbenv,
+ "Btree metadata page number %lu observed twice",
+ (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __db_vrfy_pgset_inc(pgset, meta_pgno)) != 0)
+ goto err;
+
+ root = mip->root;
+
+ if (root == 0) {
+ EPRINT((dbp->dbenv,
+ "Btree metadata page %lu has no root", (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0)
+ goto err;
+
+ switch (rip->type) {
+ case P_IBTREE:
+ case P_LBTREE:
+ stflags = flags | ST_TOPLEVEL;
+ if (F_ISSET(mip, VRFY_HAS_DUPS))
+ stflags |= ST_DUPOK;
+ if (F_ISSET(mip, VRFY_HAS_DUPSORT))
+ stflags |= ST_DUPSORT;
+ if (F_ISSET(mip, VRFY_HAS_RECNUMS))
+ stflags |= ST_RECNUM;
+ ret = __bam_vrfy_subtree(dbp,
+ vdp, root, NULL, NULL, stflags, NULL, NULL, NULL);
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ stflags = flags | ST_RECNUM | ST_IS_RECNO | ST_TOPLEVEL;
+ if (mip->re_len > 0)
+ stflags |= ST_RELEN;
+ if ((ret = __bam_vrfy_subtree(dbp, vdp,
+ root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0)
+ goto err;
+ /*
+ * Even if mip->re_len > 0, re_len may come back zero if the
+ * tree is empty. It should be okay to just skip the check in
+ * this case, as if there are any non-deleted keys at all,
+ * that should never happen.
+ */
+ if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) {
+ EPRINT((dbp->dbenv,
+ "Recno database with meta page %lu has bad re_len %lu",
+ (u_long)meta_pgno, (u_long)relen));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ ret = 0;
+ break;
+ case P_LDUP:
+ EPRINT((dbp->dbenv,
+ "Duplicate tree referenced from metadata page %lu",
+ (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ break;
+ default:
+ EPRINT((dbp->dbenv,
+ "Btree root of incorrect type %lu on meta page %lu",
+ (u_long)rip->type, (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+err: if (mip != NULL &&
+ ((t_ret = __db_vrfy_putpageinfo(vdp, mip)) != 0) && ret == 0)
+ t_ret = ret;
+ if (rip != NULL &&
+ ((t_ret = __db_vrfy_putpageinfo(vdp, rip)) != 0) && ret == 0)
+ t_ret = ret;
+ return (ret);
+}
+
+/*
+ * __bam_vrfy_subtree--
+ * Verify a subtree (or entire) btree with specified root.
+ *
+ * Note that this is public because it must be called to verify
+ * offpage dup trees, including from hash.
+ *
+ * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
+ * PUBLIC: void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_vrfy_subtree(dbp,
+ vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ void *l, *r;
+ u_int32_t flags, *levelp, *nrecsp, *relenp;
+{
+ BINTERNAL *li, *ri, *lp, *rp;
+ DB *pgset;
+ DBC *cc;
+ PAGE *h;
+ VRFY_CHILDINFO *child;
+ VRFY_PAGEINFO *pip;
+ db_recno_t nrecs, child_nrecs;
+ db_indx_t i;
+ int ret, t_ret, isbad, toplevel, p;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t level, child_level, stflags, child_relen, relen;
+
+ ret = isbad = 0;
+ nrecs = 0;
+ h = NULL;
+ relen = 0;
+ rp = (BINTERNAL *)r;
+ lp = (BINTERNAL *)l;
+
+ /* Provide feedback on our progress to the application. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ cc = NULL;
+ level = pip->bt_level;
+
+ toplevel = LF_ISSET(ST_TOPLEVEL);
+ LF_CLR(ST_TOPLEVEL);
+
+ /*
+ * We are recursively descending a btree, starting from the root
+ * and working our way out to the leaves.
+ *
+ * There are four cases we need to deal with:
+ * 1. pgno is a recno leaf page. Any children are overflows.
+ * 2. pgno is a duplicate leaf page. Any children
+ * are overflow pages; traverse them, and then return
+ * level and nrecs.
+ * 3. pgno is an ordinary leaf page. Check whether dups are
+ * allowed, and if so, traverse any off-page dups or
+ * overflows. Then return nrecs and level.
+ * 4. pgno is a recno internal page. Recursively check any
+ * child pages, making sure their levels are one lower
+ * and their nrecs sum to ours.
+ * 5. pgno is a btree internal page. Same as #4, plus we
+ * must verify that for each pair of BINTERNAL entries
+ * N and N+1, the leftmost item on N's child sorts
+ * greater than N, and the rightmost item on N's child
+ * sorts less than N+1.
+ *
+ * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
+ * we need to verify the internal sort order is correct if,
+ * due to overflow items, we were not able to do so earlier.
+ */
+ switch (pip->type) {
+ case P_LRECNO:
+ case P_LDUP:
+ case P_LBTREE:
+ /*
+ * Cases 1, 2 and 3 (overflow pages are common to all three);
+ * traverse child list, looking for overflows.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_OVERFLOW &&
+ (ret = __db_vrfy_ovfl_structure(dbp, vdp,
+ child->pgno, child->tlen,
+ flags | ST_OVFL_LEAF)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* Case 1 */
+ if (pip->type == P_LRECNO) {
+ if (!LF_ISSET(ST_IS_RECNO) &&
+ !(LF_ISSET(ST_DUPOK) && !LF_ISSET(ST_DUPSORT))) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Recno leaf page %lu in non-recno tree",
+ (u_long)pgno));
+ goto done;
+ }
+ goto leaf;
+ } else if (LF_ISSET(ST_IS_RECNO)) {
+ /*
+ * It's a non-recno leaf. Had better not be a recno
+ * subtree.
+ */
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Non-recno leaf page %lu in recno tree",
+ (u_long)pgno));
+ goto done;
+ }
+
+ /* Case 2--no more work. */
+ if (pip->type == P_LDUP)
+ goto leaf;
+
+ /* Case 3 */
+
+ /* Check if we have any dups. */
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ /* If dups aren't allowed in this btree, trouble. */
+ if (!LF_ISSET(ST_DUPOK)) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Duplicates on page %lu in non-dup btree",
+ (u_long)pgno));
+ } else {
+ /*
+ * We correctly have dups. If any are off-page,
+ * traverse those btrees recursively.
+ */
+ if ((ret =
+ __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child);
+ ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child)) {
+ stflags = flags | ST_RECNUM | ST_DUPSET;
+ /* Skip any overflow entries. */
+ if (child->type == V_DUPLICATE) {
+ if ((ret = __db_vrfy_duptype(
+ dbp, vdp, child->pgno,
+ stflags)) != 0) {
+ isbad = 1;
+ /* Next child. */
+ continue;
+ }
+ if ((ret = __bam_vrfy_subtree(
+ dbp, vdp, child->pgno, NULL,
+ NULL, stflags, NULL, NULL,
+ NULL)) != 0) {
+ if (ret !=
+ DB_VERIFY_BAD)
+ goto err;
+ else
+ isbad = 1;
+ }
+ }
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /*
+ * If VRFY_DUPS_UNSORTED is set,
+ * ST_DUPSORT had better not be.
+ */
+ if (F_ISSET(pip, VRFY_DUPS_UNSORTED) &&
+ LF_ISSET(ST_DUPSORT)) {
+ EPRINT((dbp->dbenv,
+ "Unsorted duplicate set at page %lu in sorted-dup database",
+ (u_long)pgno));
+ isbad = 1;
+ }
+ }
+ }
+ goto leaf;
+ break;
+ case P_IBTREE:
+ case P_IRECNO:
+ /* We handle these below. */
+ break;
+ default:
+ /*
+ * If a P_IBTREE or P_IRECNO contains a reference to an
+ * invalid page, we'll wind up here; handle it gracefully.
+ * Note that the code at the "done" label assumes that the
+ * current page is a btree/recno one of some sort; this
+ * is not the case here, so we goto err.
+ */
+ EPRINT((dbp->dbenv,
+ "Page %lu is of inappropriate type %lu",
+ (u_long)pgno, (u_long)pip->type));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Cases 4 & 5: This is a btree or recno internal page. For each child,
+ * recurse, keeping a running count of nrecs and making sure the level
+ * is always reasonable.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_RECNO) {
+ if (pip->type != P_IRECNO) {
+ TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_subtree",
+ pgno, pip->type);
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno,
+ NULL, NULL, flags, &child_level, &child_nrecs,
+ &child_relen)) != 0) {
+ if (ret != DB_VERIFY_BAD)
+ goto done;
+ else
+ isbad = 1;
+ }
+
+ if (LF_ISSET(ST_RELEN)) {
+ if (relen == 0)
+ relen = child_relen;
+ /*
+ * child_relen may be zero if the child subtree
+ * is empty.
+ */
+ else if (child_relen > 0 &&
+ relen != child_relen) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Recno page %lu returned bad re_len",
+ (u_long)child->pgno));
+ }
+ if (relenp)
+ *relenp = relen;
+ }
+ if (LF_ISSET(ST_RECNUM))
+ nrecs += child_nrecs;
+ if (level != child_level + 1) {
+ isbad = 1;
+ EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu",
+ "Recno level incorrect on page ",
+ (u_long)child->pgno, ": got ",
+ (u_long)child_level, ", expected ",
+ (u_long)(level - 1)));
+ }
+ } else if (child->type == V_OVERFLOW &&
+ (ret = __db_vrfy_ovfl_structure(dbp, vdp,
+ child->pgno, child->tlen, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* We're done with case 4. */
+ if (pip->type == P_IRECNO)
+ goto done;
+
+ /*
+ * Case 5. Btree internal pages.
+ * As described above, we need to iterate through all the
+ * items on the page and make sure that our children sort appropriately
+ * with respect to them.
+ *
+ * For each entry, li will be the "left-hand" key for the entry
+ * itself, which must sort lower than all entries on its child;
+ * ri will be the key to its right, which must sort greater.
+ */
+ if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ goto err;
+ for (i = 0; i < pip->entries; i += O_INDX) {
+ li = GET_BINTERNAL(h, i);
+ ri = (i + O_INDX < pip->entries) ?
+ GET_BINTERNAL(h, i + O_INDX) : NULL;
+
+ /*
+ * The leftmost key is forcibly sorted less than all entries,
+ * so don't bother passing it.
+ */
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno,
+ i == 0 ? NULL : li, ri, flags, &child_level,
+ &child_nrecs, NULL)) != 0) {
+ if (ret != DB_VERIFY_BAD)
+ goto done;
+ else
+ isbad = 1;
+ }
+
+ if (LF_ISSET(ST_RECNUM)) {
+ /*
+ * Keep a running tally on the actual record count so
+ * we can return it to our parent (if we have one) or
+ * compare it to the NRECS field if we're a root page.
+ */
+ nrecs += child_nrecs;
+
+ /*
+ * Make sure the actual record count of the child
+ * is equal to the value in the BINTERNAL structure.
+ */
+ if (li->nrecs != child_nrecs) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Item %lu page %lu has incorrect record count of %lu, should be %lu",
+ (u_long)i, (u_long)pgno, (u_long)li->nrecs,
+ (u_long)child_nrecs));
+ }
+ }
+
+ if (level != child_level + 1) {
+ isbad = 1;
+ EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu",
+ "Btree level incorrect on page ", (u_long)li->pgno,
+ ": got ", (u_long)child_level, ", expected ",
+ (u_long)(level - 1)));
+ }
+ }
+
+ if (0) {
+leaf: level = LEAFLEVEL;
+ if (LF_ISSET(ST_RECNUM))
+ nrecs = pip->rec_cnt;
+
+ /* XXX
+ * We should verify that the record count on a leaf page
+ * is the sum of the number of keys and the number of
+ * records in its off-page dups. This requires looking
+ * at the page again, however, and it may all be changing
+ * soon, so for now we don't bother.
+ */
+
+ if (LF_ISSET(ST_RELEN) && relenp)
+ *relenp = pip->re_len;
+ }
+done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) {
+ /*
+ * During the page-by-page pass, item order verification was
+ * not finished due to the presence of overflow items. If
+ * isbad == 0, though, it's now safe to do so, as we've
+ * traversed any child overflow pages. Do it.
+ */
+ if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ goto err;
+ if ((ret = __bam_vrfy_itemorder(dbp,
+ vdp, h, pgno, 0, 1, 0, flags)) != 0)
+ goto err;
+ F_CLR(pip, VRFY_INCOMPLETE);
+ }
+
+ /*
+ * Our parent has sent us BINTERNAL pointers to parent records
+ * so that we can verify our place with respect to them. If it's
+ * appropriate--we have a default sort function--verify this.
+ */
+ if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) && lp != NULL) {
+ if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ goto err;
+
+ /*
+ * __bam_vrfy_treeorder needs to know what comparison function
+ * to use. If ST_DUPSET is set, we're in a duplicate tree
+ * and we use the duplicate comparison function; otherwise,
+ * use the btree one. If unset, use the default, of course.
+ */
+ func = LF_ISSET(ST_DUPSET) ? dbp->dup_compare :
+ ((BTREE *)dbp->bt_internal)->bt_compare;
+ if (func == NULL)
+ func = __bam_defcmp;
+
+ if ((ret = __bam_vrfy_treeorder(
+ dbp, pgno, h, lp, rp, func, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+
+ /*
+ * This is guaranteed to succeed for leaf pages, but no harm done.
+ *
+ * Internal pages below the top level do not store their own
+ * record numbers, so we skip them.
+ */
+ if (LF_ISSET(ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Bad record count on page %lu: got %lu, expected %lu",
+ (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt));
+ }
+
+ if (levelp)
+ *levelp = level;
+ if (nrecsp)
+ *nrecsp = nrecs;
+
+ pgset = vdp->pgset;
+ if ((ret = __db_vrfy_pgset_get(pgset, pgno, &p)) != 0)
+ goto err;
+ if (p != 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv, "Page %lu linked twice", (u_long)pgno));
+ } else if ((ret = __db_vrfy_pgset_inc(pgset, pgno)) != 0)
+ goto err;
+
+err: if (h != NULL && (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_treeorder --
+ * Verify that the lowest key on a page sorts greater than the
+ * BINTERNAL which points to it (lp), and the highest key
+ * sorts less than the BINTERNAL above that (rp).
+ *
+ * If lp is NULL, this means that it was the leftmost key on the
+ * parent, which (regardless of sort function) sorts less than
+ * all keys. No need to check it.
+ *
+ * If rp is NULL, lp was the highest key on the parent, so there's
+ * no higher key we must sort less than.
+ */
+static int
+__bam_vrfy_treeorder(dbp, pgno, h, lp, rp, func, flags)
+ DB *dbp;
+ db_pgno_t pgno;
+ PAGE *h;
+ BINTERNAL *lp, *rp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t flags;
+{
+ BOVERFLOW *bo;
+ DBT dbt;
+ db_indx_t last;
+ int ret, cmp;
+
+ memset(&dbt, 0, sizeof(DBT));
+ F_SET(&dbt, DB_DBT_MALLOC);
+ ret = 0;
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LDUP:
+ last = NUM_ENT(h) - O_INDX;
+ break;
+ case P_LBTREE:
+ last = NUM_ENT(h) - P_INDX;
+ break;
+ default:
+ TYPE_ERR_PRINT(dbp->dbenv,
+ "__bam_vrfy_treeorder", pgno, TYPE(h));
+ DB_ASSERT(0);
+ return (EINVAL);
+ }
+
+ /*
+ * The key on page h, the child page, is more likely to be
+ * an overflow page, so we pass its offset, rather than lp/rp's,
+ * into __bam_cmp. This will take advantage of __db_moff.
+ */
+
+ /*
+ * Skip first-item check if we're an internal page--the first
+ * entry on an internal page is treated specially by __bam_cmp,
+ * so what's on the page shouldn't matter. (Plus, since we're passing
+ * our page and item 0 as to __bam_cmp, we'll sort before our
+ * parent and falsely report a failure.)
+ */
+ if (lp != NULL && TYPE(h) != P_IBTREE) {
+ if (lp->type == B_KEYDATA) {
+ dbt.data = lp->data;
+ dbt.size = lp->len;
+ } else if (lp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)lp->data;
+ if ((ret = __db_goff(dbp, &dbt, bo->tlen, bo->pgno,
+ NULL, NULL)) != 0)
+ return (ret);
+ } else {
+ DB_ASSERT(0);
+ EPRINT((dbp->dbenv,
+ "Unknown type for internal record"));
+ return (EINVAL);
+ }
+
+ /* On error, fall through, free if neeeded, and return. */
+ if ((ret = __bam_cmp(dbp, &dbt, h, 0, func, &cmp)) == 0) {
+ if (cmp > 0) {
+ EPRINT((dbp->dbenv,
+ "First item on page %lu sorted greater than parent entry",
+ (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((dbp->dbenv,
+ "First item on page %lu had comparison error",
+ (u_long)PGNO(h)));
+
+ if (dbt.data != lp->data)
+ __os_free(dbt.data, 0);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (rp != NULL) {
+ if (rp->type == B_KEYDATA) {
+ dbt.data = rp->data;
+ dbt.size = rp->len;
+ } else if (rp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)rp->data;
+ if ((ret = __db_goff(dbp, &dbt, bo->tlen, bo->pgno,
+ NULL, NULL)) != 0)
+ return (ret);
+ } else {
+ DB_ASSERT(0);
+ EPRINT((dbp->dbenv,
+ "Unknown type for internal record"));
+ return (EINVAL);
+ }
+
+ /* On error, fall through, free if neeeded, and return. */
+ if ((ret = __bam_cmp(dbp, &dbt, h, last, func, &cmp)) == 0) {
+ if (cmp < 0) {
+ EPRINT((dbp->dbenv,
+ "Last item on page %lu sorted greater than parent entry",
+ (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((dbp->dbenv,
+ "Last item on page %lu had comparison error",
+ (u_long)PGNO(h)));
+
+ if (dbt.data != rp->data)
+ __os_free(dbt.data, 0);
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_salvage --
+ * Safely dump out anything that looks like a key on an alleged
+ * btree leaf page.
+ *
+ * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), DBT *,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ DBT *key;
+ u_int32_t flags;
+{
+ DBT dbt, unkdbt;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ db_indx_t i, beg, end;
+ u_int32_t himark;
+ u_int8_t *pgmap;
+ void *ovflbuf;
+ int t_ret, ret, err_ret;
+
+ /* Shut up lint. */
+ COMPQUIET(end, 0);
+
+ ovflbuf = pgmap = NULL;
+ err_ret = ret = 0;
+
+ memset(&dbt, 0, sizeof(DBT));
+ dbt.flags = DB_DBT_REALLOC;
+
+ memset(&unkdbt, 0, sizeof(DBT));
+ unkdbt.size = strlen("UNKNOWN") + 1;
+ unkdbt.data = "UNKNOWN";
+
+ /*
+ * Allocate a buffer for overflow items. Start at one page;
+ * __db_safe_goff will realloc as needed.
+ */
+ if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &ovflbuf)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_AGGRESSIVE)) {
+ if ((ret =
+ __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pgmap)) != 0)
+ goto err;
+ memset(pgmap, 0, dbp->pgsize);
+ }
+
+ /*
+ * Loop through the inp array, spitting out key/data pairs.
+ *
+ * If we're salvaging normally, loop from 0 through NUM_ENT(h).
+ * If we're being aggressive, loop until we hit the end of the page--
+ * NUM_ENT() may be bogus.
+ */
+ himark = dbp->pgsize;
+ for (i = 0;; i += O_INDX) {
+ /* If we're not aggressive, break when we hit NUM_ENT(h). */
+ if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h))
+ break;
+
+ /* Verify the current item. */
+ ret = __db_vrfy_inpitem(dbp,
+ h, pgno, i, 1, flags, &himark, NULL);
+ /* If this returned a fatality, it's time to break. */
+ if (ret == DB_VERIFY_FATAL) {
+ /*
+ * Don't return DB_VERIFY_FATAL; it's private
+ * and means only that we can't go on with this
+ * page, not with the whole database. It's
+ * not even an error if we've run into it
+ * after NUM_ENT(h).
+ */
+ ret = (i < NUM_ENT(h)) ? DB_VERIFY_BAD : 0;
+ break;
+ }
+
+ /*
+ * If this returned 0, it's safe to print or (carefully)
+ * try to fetch.
+ */
+ if (ret == 0) {
+ /*
+ * We only want to print deleted items if
+ * DB_AGGRESSIVE is set.
+ */
+ bk = GET_BKEYDATA(h, i);
+ if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type))
+ continue;
+
+ /*
+ * We're going to go try to print the next item. If
+ * key is non-NULL, we're a dup page, so we've got to
+ * print the key first, unless SA_SKIPFIRSTKEY is set
+ * and we're on the first entry.
+ */
+ if (key != NULL &&
+ (i != 0 || !LF_ISSET(SA_SKIPFIRSTKEY)))
+ if ((ret = __db_prdbt(key,
+ 0, " ", handle, callback, 0, NULL)) != 0)
+ err_ret = ret;
+
+ beg = h->inp[i];
+ switch (B_TYPE(bk->type)) {
+ case B_DUPLICATE:
+ end = beg + BOVERFLOW_SIZE - 1;
+ /*
+ * If we're not on a normal btree leaf page,
+ * there shouldn't be off-page
+ * dup sets. Something's confused; just
+ * drop it, and the code to pick up unlinked
+ * offpage dup sets will print it out
+ * with key "UNKNOWN" later.
+ */
+ if (pgtype != P_LBTREE)
+ break;
+
+ bo = (BOVERFLOW *)bk;
+
+ /*
+ * If the page number is unreasonable, or
+ * if this is supposed to be a key item,
+ * just spit out "UNKNOWN"--the best we
+ * can do is run into the data items in the
+ * unlinked offpage dup pass.
+ */
+ if (!IS_VALID_PGNO(bo->pgno) ||
+ (i % P_INDX == 0)) {
+ /* Not much to do on failure. */
+ if ((ret = __db_prdbt(&unkdbt, 0, " ",
+ handle, callback, 0, NULL)) != 0)
+ err_ret = ret;
+ break;
+ }
+
+ if ((ret = __db_salvage_duptree(dbp,
+ vdp, bo->pgno, &dbt, handle, callback,
+ flags | SA_SKIPFIRSTKEY)) != 0)
+ err_ret = ret;
+
+ break;
+ case B_KEYDATA:
+ end = ALIGN(beg + bk->len, sizeof(u_int32_t)) - 1;
+ dbt.data = bk->data;
+ dbt.size = bk->len;
+ if ((ret = __db_prdbt(&dbt,
+ 0, " ", handle, callback, 0, NULL)) != 0)
+ err_ret = ret;
+ break;
+ case B_OVERFLOW:
+ end = beg + BOVERFLOW_SIZE - 1;
+ bo = (BOVERFLOW *)bk;
+ if ((ret = __db_safe_goff(dbp, vdp,
+ bo->pgno, &dbt, &ovflbuf, flags)) != 0) {
+ err_ret = ret;
+ /* We care about err_ret more. */
+ (void)__db_prdbt(&unkdbt, 0, " ",
+ handle, callback, 0, NULL);
+ break;
+ }
+ if ((ret = __db_prdbt(&dbt,
+ 0, " ", handle, callback, 0, NULL)) != 0)
+ err_ret = ret;
+ break;
+ default:
+ /*
+ * We should never get here; __db_vrfy_inpitem
+ * should not be returning 0 if bk->type
+ * is unrecognizable.
+ */
+ DB_ASSERT(0);
+ return (EINVAL);
+ }
+
+ /*
+ * If we're being aggressive, mark the beginning
+ * and end of the item; we'll come back and print
+ * whatever "junk" is in the gaps in case we had
+ * any bogus inp elements and thereby missed stuff.
+ */
+ if (LF_ISSET(DB_AGGRESSIVE)) {
+ pgmap[beg] = ITEM_BEGIN;
+ pgmap[end] = ITEM_END;
+ }
+ }
+ }
+
+ /*
+ * If i is odd and this is a btree leaf, we've printed out a key but not
+ * a datum; fix this imbalance by printing an "UNKNOWN".
+ */
+ if (pgtype == P_LBTREE && (i % P_INDX == 1) && ((ret =
+ __db_prdbt(&unkdbt, 0, " ", handle, callback, 0, NULL)) != 0))
+ err_ret = ret;
+
+err: if (pgmap != NULL)
+ __os_free(pgmap, 0);
+ __os_free(ovflbuf, 0);
+
+ /* Mark this page as done. */
+ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ return (t_ret);
+
+ return ((err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __bam_salvage_walkdupint --
+ * Walk a known-good btree or recno internal page which is part of
+ * a dup tree, calling __db_salvage_duptree on each child page.
+ *
+ * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *,
+ * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ DBT *key;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ RINTERNAL *ri;
+ BINTERNAL *bi;
+ int ret, t_ret;
+ db_indx_t i;
+
+ ret = 0;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, bi->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ case P_IRECNO:
+ ri = GET_RINTERNAL(h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, ri->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ break;
+ default:
+ __db_err(dbp->dbenv,
+ "__bam_salvage_walkdupint called on non-int. page");
+ DB_ASSERT(0);
+ return (EINVAL);
+ }
+ /* Pass SA_SKIPFIRSTKEY, if set, on to the 0th child only. */
+ flags &= ~LF_ISSET(SA_SKIPFIRSTKEY);
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_meta2pgset --
+ * Given a known-good meta page, return in pgsetp a 0-terminated list of
+ * db_pgno_t's corresponding to the pages in the btree.
+ *
+ * We do this by a somewhat sleazy method, to avoid having to traverse the
+ * btree structure neatly: we walk down the left side to the very
+ * first leaf page, then we mark all the pages in the chain of
+ * NEXT_PGNOs (being wary of cycles and invalid ones), then we
+ * consolidate our scratch array into a nice list, and return. This
+ * avoids the memory management hassles of recursion and the
+ * trouble of walking internal pages--they just don't matter, except
+ * for the left branch.
+ *
+ * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: u_int32_t, DB *));
+ */
+int
+__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *btmeta;
+ u_int32_t flags;
+ DB *pgset;
+{
+ BINTERNAL *bi;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_pgno_t current, p;
+ int err_ret, ret;
+
+ h = NULL;
+ ret = err_ret = 0;
+ DB_ASSERT(pgset != NULL);
+ for (current = btmeta->root;;) {
+ if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = memp_fget(dbp->mpf, &current, 0, &h)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ if ((ret = __bam_vrfy(dbp,
+ vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+ if (TYPE(h) == P_IBTREE) {
+ bi = GET_BINTERNAL(h, 0);
+ current = bi->pgno;
+ } else { /* P_IRECNO */
+ ri = GET_RINTERNAL(h, 0);
+ current = ri->pgno;
+ }
+ break;
+ case P_LBTREE:
+ case P_LRECNO:
+ goto traverse;
+ default:
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+ /*
+ * At this point, current is the pgno of leaf page h, the 0th in the
+ * tree we're concerned with.
+ */
+traverse:
+ while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
+ if (h == NULL &&
+ (ret = memp_fget(dbp->mpf, &current, 0, &h) != 0)) {
+ err_ret = ret;
+ break;
+ }
+
+ if ((ret = __db_vrfy_pgset_get(pgset, current, (int *)&p)) != 0)
+ goto err;
+
+ if (p != 0) {
+ /*
+ * We've found a cycle. Return success anyway--
+ * our caller may as well use however much of
+ * the pgset we've come up with.
+ */
+ break;
+ }
+ if ((ret = __db_vrfy_pgset_inc(pgset, current)) != 0)
+ goto err;
+
+ current = NEXT_PGNO(h);
+ if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+err: if (h != NULL)
+ (void)memp_fput(dbp->mpf, h, 0);
+
+ return (ret == 0 ? err_ret : ret);
+}
+
+/*
+ * __bam_safe_getdata --
+ *
+ * Utility function for __bam_vrfy_itemorder. Safely gets the datum at
+ * index i, page h, and sticks it in DBT dbt. If ovflok is 1 and i's an
+ * overflow item, we do a safe_goff to get the item and signal that we need
+ * to free dbt->data; if ovflok is 0, we leaves the DBT zeroed.
+ */
+static int
+__bam_safe_getdata(dbp, h, i, ovflok, dbt, freedbtp)
+ DB *dbp;
+ PAGE *h;
+ u_int32_t i;
+ int ovflok;
+ DBT *dbt;
+ int *freedbtp;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+
+ memset(dbt, 0, sizeof(DBT));
+ *freedbtp = 0;
+
+ bk = GET_BKEYDATA(h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ if (!ovflok)
+ return (0);
+
+ bo = (BOVERFLOW *)bk;
+ F_SET(dbt, DB_DBT_MALLOC);
+
+ *freedbtp = 1;
+ return (__db_goff(dbp, dbt, bo->tlen, bo->pgno, NULL, NULL));
+ } else {
+ dbt->data = bk->data;
+ dbt->size = bk->len;
+ }
+
+ return (0);
+}