author    | unknown <tim@threads.polyesthetic.msg> | 2001-03-04 19:42:05 -0500
committer | unknown <tim@threads.polyesthetic.msg> | 2001-03-04 19:42:05 -0500
commit    | ec6ae091617bdfdca9e65e8d3e65b950d234f676 (patch)
tree      | 9dd732e08dba156ee3d7635caedc0dc3107ecac6 /bdb/btree/bt_verify.c
parent    | 87d70fb598105b64b538ff6b81eef9da626255b1 (diff)
Import changeset
Diffstat (limited to 'bdb/btree/bt_verify.c')
-rw-r--r-- | bdb/btree/bt_verify.c | 2237
1 file changed, 2237 insertions, 0 deletions
diff --git a/bdb/btree/bt_verify.c b/bdb/btree/bt_verify.c new file mode 100644 index 00000000000..9f8647e7e2a --- /dev/null +++ b/bdb/btree/bt_verify.c @@ -0,0 +1,2237 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999, 2000 + * Sleepycat Software. All rights reserved. + * + * $Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $ + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_verify.h" +#include "btree.h" + +static int __bam_safe_getdata __P((DB *, PAGE *, u_int32_t, int, DBT *, int *)); +static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + db_indx_t *, u_int32_t)); +static int __bam_vrfy_treeorder __P((DB *, db_pgno_t, PAGE *, BINTERNAL *, + BINTERNAL *, int (*)(DB *, const DBT *, const DBT *), u_int32_t)); +static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + db_indx_t *, u_int32_t)); + +#define OKFLAGS (DB_AGGRESSIVE | DB_NOORDERCHK | DB_SALVAGE) + +/* + * __bam_vrfy_meta -- + * Verify the btree-specific part of a metadata page. + * + * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *, + * PUBLIC: db_pgno_t, u_int32_t)); + */ +int +__bam_vrfy_meta(dbp, vdp, meta, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + BTMETA *meta; + db_pgno_t pgno; + u_int32_t flags; +{ + VRFY_PAGEINFO *pip; + int isbad, t_ret, ret; + db_indx_t ovflsize; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + isbad = 0; + + /* + * If VRFY_INCOMPLETE is not set, then we didn't come through + * __db_vrfy_pagezero and didn't incompletely + * check this page--we haven't checked it at all. + * Thus we need to call __db_vrfy_meta and check the common fields. + * + * If VRFY_INCOMPLETE is set, we've already done all the same work + * in __db_vrfy_pagezero, so skip the check. + */ + if (!F_ISSET(pip, VRFY_INCOMPLETE) && + (ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* bt_minkey: must be >= 2; must produce sensible ovflsize */ + + /* avoid division by zero */ + ovflsize = meta->minkey > 0 ? + B_MINKEY_TO_OVFLSIZE(meta->minkey, dbp->pgsize) : 0; + + if (meta->minkey < 2 || + ovflsize > B_MINKEY_TO_OVFLSIZE(DEFMINKEYPAGE, dbp->pgsize)) { + pip->bt_minkey = 0; + isbad = 1; + EPRINT((dbp->dbenv, + "Nonsensical bt_minkey value %lu on metadata page %lu", + (u_long)meta->minkey, (u_long)pgno)); + } else + pip->bt_minkey = meta->minkey; + + /* bt_maxkey: no constraints (XXX: right?) */ + pip->bt_maxkey = meta->maxkey; + + /* re_len: no constraints on this (may be zero or huge--we make rope) */ + pip->re_len = meta->re_len; + + /* + * The root must not be current page or 0 and it must be within + * database. If this metadata page is the master meta data page + * of the file, then the root page had better be page 1. + */ + pip->root = 0; + if (meta->root == PGNO_INVALID + || meta->root == pgno || !IS_VALID_PGNO(meta->root) || + (pgno == PGNO_BASE_MD && meta->root != 1)) { + isbad = 1; + EPRINT((dbp->dbenv, + "Nonsensical root page %lu on metadata page %lu", + (u_long)meta->root, (u_long)vdp->last_pgno)); + } else + pip->root = meta->root; + + /* Flags. 
*/ + if (F_ISSET(&meta->dbmeta, BTM_RENUMBER)) + F_SET(pip, VRFY_IS_RRECNO); + + if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) { + /* + * If this is a master db meta page, it had better not have + * duplicates. + */ + if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) { + isbad = 1; + EPRINT((dbp->dbenv, + "Btree metadata page %lu has both duplicates and multiple databases", + (u_long)pgno)); + } + F_SET(pip, VRFY_HAS_SUBDBS); + } + + if (F_ISSET(&meta->dbmeta, BTM_DUP)) + F_SET(pip, VRFY_HAS_DUPS); + if (F_ISSET(&meta->dbmeta, BTM_DUPSORT)) + F_SET(pip, VRFY_HAS_DUPSORT); + if (F_ISSET(&meta->dbmeta, BTM_RECNUM)) + F_SET(pip, VRFY_HAS_RECNUMS); + if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) { + EPRINT((dbp->dbenv, + "Btree metadata page %lu illegally has both recnums and dups", + (u_long)pgno)); + isbad = 1; + } + + if (F_ISSET(&meta->dbmeta, BTM_RECNO)) { + F_SET(pip, VRFY_IS_RECNO); + dbp->type = DB_RECNO; + } else if (F_ISSET(pip, VRFY_IS_RRECNO)) { + isbad = 1; + EPRINT((dbp->dbenv, + "Metadata page %lu has renumber flag set but is not recno", + (u_long)pgno)); + } + + if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) { + EPRINT((dbp->dbenv, + "Recno metadata page %lu specifies duplicates", + (u_long)pgno)); + isbad = 1; + } + + if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN)) + F_SET(pip, VRFY_IS_FIXEDLEN); + else if (pip->re_len > 0) { + /* + * It's wrong to have an re_len if it's not a fixed-length + * database + */ + isbad = 1; + EPRINT((dbp->dbenv, + "re_len of %lu in non-fixed-length database", + (u_long)pip->re_len)); + } + + /* + * We do not check that the rest of the page is 0, because it may + * not be and may still be correct. + */ + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __ram_vrfy_leaf -- + * Verify a recno leaf page. + * + * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__ram_vrfy_leaf(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + BKEYDATA *bk; + VRFY_PAGEINFO *pip; + db_indx_t i; + int ret, t_ret, isbad; + u_int32_t re_len_guess, len; + + isbad = 0; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + if ((ret = __db_fchk(dbp->dbenv, + "__ram_vrfy_leaf", flags, OKFLAGS)) != 0) + goto err; + + if (TYPE(h) != P_LRECNO) { + /* We should not have been called. */ + TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_leaf", pgno, TYPE(h)); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + /* + * Verify (and, if relevant, save off) page fields common to + * all PAGEs. + */ + if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * Verify inp[]. Return immediately if it returns DB_VERIFY_BAD; + * further checks are dangerous. + */ + if ((ret = __bam_vrfy_inp(dbp, + vdp, h, pgno, &pip->entries, flags)) != 0) + goto err; + + if (F_ISSET(pip, VRFY_HAS_DUPS)) { + EPRINT((dbp->dbenv, + "Recno database has dups on page %lu", (u_long)pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + + /* + * Walk through inp and see if the lengths of all the records are the + * same--if so, this may be a fixed-length database, and we want to + * save off this value. We know inp to be safe if we've gotten this + * far. + */ + re_len_guess = 0; + for (i = 0; i < NUM_ENT(h); i++) { + bk = GET_BKEYDATA(h, i); + /* KEYEMPTY. Go on. 
*/ + if (B_DISSET(bk->type)) + continue; + if (bk->type == B_OVERFLOW) + len = ((BOVERFLOW *)bk)->tlen; + else if (bk->type == B_KEYDATA) + len = bk->len; + else { + isbad = 1; + EPRINT((dbp->dbenv, + "Nonsensical type for item %lu, page %lu", + (u_long)i, (u_long)pgno)); + continue; + } + if (re_len_guess == 0) + re_len_guess = len; + + /* + * Is this item's len the same as the last one's? If not, + * reset to 0 and break--we don't have a single re_len. + * Otherwise, go on to the next item. + */ + if (re_len_guess != len) { + re_len_guess = 0; + break; + } + } + pip->re_len = re_len_guess; + + /* Save off record count. */ + pip->rec_cnt = NUM_ENT(h); + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : 0); +} + +/* + * __bam_vrfy -- + * Verify a btree leaf or internal page. + * + * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__bam_vrfy(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + VRFY_PAGEINFO *pip; + int ret, t_ret, isbad; + + isbad = 0; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + switch (TYPE(h)) { + case P_IBTREE: + case P_IRECNO: + case P_LBTREE: + case P_LDUP: + break; + default: + TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy", pgno, TYPE(h)); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + /* + * Verify (and, if relevant, save off) page fields common to + * all PAGEs. + */ + if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * The record count is, on internal pages, stored in an overloaded + * next_pgno field. Save it off; we'll verify it when we check + * overall database structure. We could overload the field + * in VRFY_PAGEINFO, too, but this seems gross, and space + * is not at such a premium. + */ + pip->rec_cnt = RE_NREC(h); + + /* + * Verify inp[]. + */ + if (TYPE(h) == P_IRECNO) { + if ((ret = __ram_vrfy_inp(dbp, + vdp, h, pgno, &pip->entries, flags)) != 0) + goto err; + } else if ((ret = __bam_vrfy_inp(dbp, + vdp, h, pgno, &pip->entries, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + EPRINT((dbp->dbenv, + "item order check on page %lu unsafe: skipping", + (u_long)pgno)); + } else if (!LF_ISSET(DB_NOORDERCHK) && (ret = + __bam_vrfy_itemorder(dbp, vdp, h, pgno, 0, 0, 0, flags)) != 0) { + /* + * We know that the elements of inp are reasonable. + * + * Check that elements fall in the proper order. + */ + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : 0); +} + +/* + * __ram_vrfy_inp -- + * Verify that all entries in a P_IRECNO inp[] array are reasonable, + * and count them. Note that P_LRECNO uses __bam_vrfy_inp; + * P_IRECNOs are a special, and simpler, case, since they have + * RINTERNALs rather than BKEYDATA/BINTERNALs. 
+ */ +static int +__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + db_indx_t *nentriesp; + u_int32_t flags; +{ + RINTERNAL *ri; + VRFY_CHILDINFO child; + VRFY_PAGEINFO *pip; + int ret, t_ret, isbad; + u_int32_t himark, i, offset, nentries; + u_int8_t *pagelayout, *p; + + isbad = 0; + memset(&child, 0, sizeof(VRFY_CHILDINFO)); + nentries = 0; + pagelayout = NULL; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + if (TYPE(h) != P_IRECNO) { + TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_inp", pgno, TYPE(h)); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + himark = dbp->pgsize; + if ((ret = + __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pagelayout)) != 0) + goto err; + memset(pagelayout, 0, dbp->pgsize); + for (i = 0; i < NUM_ENT(h); i++) { + if ((u_int8_t *)h->inp + i >= (u_int8_t *)h + himark) { + EPRINT((dbp->dbenv, + "Page %lu entries listing %lu overlaps data", + (u_long)pgno, (u_long)i)); + ret = DB_VERIFY_BAD; + goto err; + } + offset = h->inp[i]; + /* + * Check that the item offset is reasonable: it points + * somewhere after the inp array and before the end of the + * page. + */ + if (offset <= (u_int32_t)((u_int8_t *)h->inp + i - + (u_int8_t *)h) || + offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) { + isbad = 1; + EPRINT((dbp->dbenv, + "Bad offset %lu at page %lu index %lu", + (u_long)offset, (u_long)pgno, (u_long)i)); + continue; + } + + /* Update the high-water mark (what HOFFSET should be) */ + if (offset < himark) + himark = offset; + + nentries++; + + /* Make sure this RINTERNAL is not multiply referenced. */ + ri = GET_RINTERNAL(h, i); + if (pagelayout[offset] == 0) { + pagelayout[offset] = 1; + child.pgno = ri->pgno; + child.type = V_RECNO; + child.nrecs = ri->nrecs; + if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0) + goto err; + } else { + EPRINT((dbp->dbenv, + "RINTERNAL structure at offset %lu, page %lu referenced twice", + (u_long)offset, (u_long)pgno)); + isbad = 1; + } + } + + for (p = pagelayout + himark; + p < pagelayout + dbp->pgsize; + p += RINTERNAL_SIZE) + if (*p != 1) { + EPRINT((dbp->dbenv, + "Gap between items at offset %lu, page %lu", + (u_long)(p - pagelayout), (u_long)pgno)); + isbad = 1; + } + + if ((db_indx_t)himark != HOFFSET(h)) { + EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu", + (u_long)(HOFFSET(h)), (u_long)himark)); + isbad = 1; + } + + *nentriesp = nentries; + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + if (pagelayout != NULL) + __os_free(pagelayout, dbp->pgsize); + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy_inp -- + * Verify that all entries in inp[] array are reasonable; + * count them. 
+ */ +static int +__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + db_indx_t *nentriesp; + u_int32_t flags; +{ + BKEYDATA *bk; + BOVERFLOW *bo; + VRFY_CHILDINFO child; + VRFY_PAGEINFO *pip; + int isbad, initem, isdupitem, ret, t_ret; + u_int32_t himark, offset; /* These would be db_indx_ts but for algnmt.*/ + u_int32_t i, endoff, nentries; + u_int8_t *pagelayout; + + isbad = isdupitem = 0; + nentries = 0; + memset(&child, 0, sizeof(VRFY_CHILDINFO)); + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + switch (TYPE(h)) { + case P_IBTREE: + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + break; + default: + /* + * In the salvager, we might call this from a page which + * we merely suspect is a btree page. Otherwise, it + * shouldn't get called--if it is, that's a verifier bug. + */ + if (LF_ISSET(DB_SALVAGE)) + break; + TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_inp", pgno, TYPE(h)); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + /* + * Loop through inp[], the array of items, until we either + * run out of entries or collide with the data. Keep track + * of h_offset in himark. + * + * For each element in inp[i], make sure it references a region + * that starts after the end of the inp array (as defined by + * NUM_ENT(h)), ends before the beginning of the page, doesn't + * overlap any other regions, and doesn't have a gap between + * it and the region immediately after it. + */ + himark = dbp->pgsize; + if ((ret = __os_malloc(dbp->dbenv, + dbp->pgsize, NULL, &pagelayout)) != 0) + goto err; + memset(pagelayout, 0, dbp->pgsize); + for (i = 0; i < NUM_ENT(h); i++) { + + ret = __db_vrfy_inpitem(dbp, + h, pgno, i, 1, flags, &himark, &offset); + if (ret == DB_VERIFY_BAD) { + isbad = 1; + continue; + } else if (ret == DB_VERIFY_FATAL) { + isbad = 1; + goto err; + } else if (ret != 0) + DB_ASSERT(0); + + /* + * We now have a plausible beginning for the item, and we know + * its length is safe. + * + * Mark the beginning and end in pagelayout so we can make sure + * items have no overlaps or gaps. + */ + bk = GET_BKEYDATA(h, i); +#define ITEM_BEGIN 1 +#define ITEM_END 2 + if (pagelayout[offset] == 0) + pagelayout[offset] = ITEM_BEGIN; + else if (pagelayout[offset] == ITEM_BEGIN) { + /* + * Having two inp entries that point at the same patch + * of page is legal if and only if the page is + * a btree leaf and they're onpage duplicate keys-- + * that is, if (i % P_INDX) == 0. + */ + if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) { + /* Flag for later. */ + F_SET(pip, VRFY_HAS_DUPS); + + /* Bump up nentries so we don't undercount. */ + nentries++; + + /* + * We'll check to make sure the end is + * equal, too. + */ + isdupitem = 1; + } else { + isbad = 1; + EPRINT((dbp->dbenv, + "Duplicated item %lu on page %lu", + (u_long)i, (u_long)pgno)); + } + } + + /* + * Mark the end. Its location varies with the page type + * and the item type. + * + * If the end already has a sign other than 0, do nothing-- + * it's an overlap that we'll catch later. + */ + switch(B_TYPE(bk->type)) { + case B_KEYDATA: + if (TYPE(h) == P_IBTREE) + /* It's a BINTERNAL. */ + endoff = offset + BINTERNAL_SIZE(bk->len) - 1; + else + endoff = offset + BKEYDATA_SIZE(bk->len) - 1; + break; + case B_DUPLICATE: + /* + * Flag that we have dups; we'll check whether + * that's okay during the structure check. 
+ */ + F_SET(pip, VRFY_HAS_DUPS); + /* FALLTHROUGH */ + case B_OVERFLOW: + /* + * Overflow entries on internal pages are stored + * as the _data_ of a BINTERNAL; overflow entries + * on leaf pages are stored as the entire entry. + */ + endoff = offset + + ((TYPE(h) == P_IBTREE) ? + BINTERNAL_SIZE(BOVERFLOW_SIZE) : + BOVERFLOW_SIZE) - 1; + break; + default: + /* + * We'll complain later; for now, just mark + * a minimum. + */ + endoff = offset + BKEYDATA_SIZE(0) - 1; + break; + } + + /* + * If this is an onpage duplicate key we've seen before, + * the end had better coincide too. + */ + if (isdupitem && pagelayout[endoff] != ITEM_END) { + EPRINT((dbp->dbenv, + "Duplicated item %lu on page %lu", + (u_long)i, (u_long)pgno)); + isbad = 1; + } else if (pagelayout[endoff] == 0) + pagelayout[endoff] = ITEM_END; + isdupitem = 0; + + /* + * There should be no deleted items in a quiescent tree, + * except in recno. + */ + if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) { + isbad = 1; + EPRINT((dbp->dbenv, + "Item %lu on page %lu marked deleted", + (u_long)i, (u_long)pgno)); + } + + /* + * Check the type and such of bk--make sure it's reasonable + * for the pagetype. + */ + switch (B_TYPE(bk->type)) { + case B_KEYDATA: + /* + * This is a normal, non-overflow BKEYDATA or BINTERNAL. + * The only thing to check is the len, and that's + * already been done. + */ + break; + case B_DUPLICATE: + if (TYPE(h) == P_IBTREE) { + isbad = 1; + EPRINT((dbp->dbenv, + "Duplicate page referenced by internal btree page %lu at item %lu", + (u_long)pgno, (u_long)i)); + break; + } else if (TYPE(h) == P_LRECNO) { + isbad = 1; + EPRINT((dbp->dbenv, + "Duplicate page referenced by recno page %lu at item %lu", + (u_long)pgno, (u_long)i)); + break; + } + /* FALLTHROUGH */ + case B_OVERFLOW: + bo = (TYPE(h) == P_IBTREE) ? + (BOVERFLOW *)(((BINTERNAL *)bk)->data) : + (BOVERFLOW *)bk; + + if (B_TYPE(bk->type) == B_OVERFLOW) + /* Make sure tlen is reasonable. */ + if (bo->tlen > dbp->pgsize * vdp->last_pgno) { + isbad = 1; + EPRINT((dbp->dbenv, + "Impossible tlen %lu, item %lu, page %lu", + (u_long)bo->tlen, (u_long)i, + (u_long)pgno)); + /* Don't save as a child. */ + break; + } + + if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno || + bo->pgno == PGNO_INVALID) { + isbad = 1; + EPRINT((dbp->dbenv, + "Offpage item %lu, page %lu has bad pgno", + (u_long)i, (u_long)pgno)); + /* Don't save as a child. */ + break; + } + + child.pgno = bo->pgno; + child.type = (B_TYPE(bk->type) == B_OVERFLOW ? + V_OVERFLOW : V_DUPLICATE); + child.tlen = bo->tlen; + if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0) + goto err; + break; + default: + isbad = 1; + EPRINT((dbp->dbenv, + "Item %lu on page %lu of invalid type %lu", + (u_long)i, (u_long)pgno)); + break; + } + } + + /* + * Now, loop through and make sure the items are contiguous and + * non-overlapping. + */ + initem = 0; + for (i = himark; i < dbp->pgsize; i++) + if (initem == 0) + switch (pagelayout[i]) { + case 0: + /* May be just for alignment. */ + if (i != ALIGN(i, sizeof(u_int32_t))) + continue; + + isbad = 1; + EPRINT((dbp->dbenv, + "Gap between items, page %lu offset %lu", + (u_long)pgno, (u_long)i)); + /* Find the end of the gap */ + for ( ; pagelayout[i + 1] == 0 && + (size_t)(i + 1) < dbp->pgsize; i++) + ; + break; + case ITEM_BEGIN: + /* We've found an item. Check its alignment. 
*/ + if (i != ALIGN(i, sizeof(u_int32_t))) { + isbad = 1; + EPRINT((dbp->dbenv, + "Offset %lu page %lu unaligned", + (u_long)i, (u_long)pgno)); + } + initem = 1; + nentries++; + break; + case ITEM_END: + /* + * We've hit the end of an item even though + * we don't think we're in one; must + * be an overlap. + */ + isbad = 1; + EPRINT((dbp->dbenv, + "Overlapping items, page %lu offset %lu", + (u_long)pgno, (u_long)i)); + break; + default: + /* Should be impossible. */ + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + else + switch (pagelayout[i]) { + case 0: + /* In the middle of an item somewhere. Okay. */ + break; + case ITEM_END: + /* End of an item; switch to out-of-item mode.*/ + initem = 0; + break; + case ITEM_BEGIN: + /* + * Hit a second item beginning without an + * end. Overlap. + */ + isbad = 1; + EPRINT((dbp->dbenv, + "Overlapping items, page %lu offset %lu", + (u_long)pgno, (u_long)i)); + break; + } + + (void)__os_free(pagelayout, dbp->pgsize); + + /* Verify HOFFSET. */ + if ((db_indx_t)himark != HOFFSET(h)) { + EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu", + (u_long)HOFFSET(h), (u_long)himark)); + isbad = 1; + } + +err: if (nentriesp != NULL) + *nentriesp = nentries; + + if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + + return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy_itemorder -- + * Make sure the items on a page sort correctly. + * + * Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are + * reasonable; be sure that __bam_vrfy_inp has been called first. + * + * If ovflok is set, it also assumes that overflow page chains + * hanging off the current page have been sanity-checked, and so we + * can use __bam_cmp to verify their ordering. If it is not set, + * and we run into an overflow page, carp and return DB_VERIFY_BAD; + * we shouldn't be called if any exist. + * + * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, PAGE *, + * PUBLIC: db_pgno_t, u_int32_t, int, int, u_int32_t)); + */ +int +__bam_vrfy_itemorder(dbp, vdp, h, pgno, nentries, ovflok, hasdups, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t nentries; + int ovflok, hasdups; + u_int32_t flags; +{ + DBT dbta, dbtb, dup1, dup2, *p1, *p2, *tmp; + BTREE *bt; + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + VRFY_PAGEINFO *pip; + db_indx_t i; + int cmp, freedup1, freedup2, isbad, ret, t_ret; + int (*dupfunc) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *)); + void *buf1, *buf2, *tmpbuf; + + /* + * We need to work in the ORDERCHKONLY environment where we might + * not have a pip, but we also may need to work in contexts where + * NUM_ENT isn't safe. + */ + if (vdp != NULL) { + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + nentries = pip->entries; + } else + pip = NULL; + + ret = isbad = 0; + bo = NULL; /* Shut up compiler. */ + + memset(&dbta, 0, sizeof(DBT)); + F_SET(&dbta, DB_DBT_REALLOC); + + memset(&dbtb, 0, sizeof(DBT)); + F_SET(&dbtb, DB_DBT_REALLOC); + + buf1 = buf2 = NULL; + + DB_ASSERT(!LF_ISSET(DB_NOORDERCHK)); + + dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare; + if (TYPE(h) == P_LDUP) + func = dupfunc; + else { + func = __bam_defcmp; + if (dbp->bt_internal != NULL) { + bt = (BTREE *)dbp->bt_internal; + if (bt->bt_compare != NULL) + func = bt->bt_compare; + } + } + + /* + * We alternate our use of dbta and dbtb so that we can walk + * through the page key-by-key without copying a dbt twice. 
+ * p1 is always the dbt for index i - 1, and p2 for index i. + */ + p1 = &dbta; + p2 = &dbtb; + + /* + * Loop through the entries. nentries ought to contain the + * actual count, and so is a safe way to terminate the loop; whether + * we inc. by one or two depends on whether we're a leaf page-- + * on a leaf page, we care only about keys. On internal pages + * and LDUP pages, we want to check the order of all entries. + * + * Note that on IBTREE pages, we start with item 1, since item + * 0 doesn't get looked at by __bam_cmp. + */ + for (i = (TYPE(h) == P_IBTREE) ? 1 : 0; i < nentries; + i += (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX) { + /* + * Put key i-1, now in p2, into p1, by swapping DBTs and bufs. + */ + tmp = p1; + p1 = p2; + p2 = tmp; + tmpbuf = buf1; + buf1 = buf2; + buf2 = tmpbuf; + + /* + * Get key i into p2. + */ + switch (TYPE(h)) { + case P_IBTREE: + bi = GET_BINTERNAL(h, i); + if (B_TYPE(bi->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)(bi->data); + goto overflow; + } else { + p2->data = bi->data; + p2->size = bi->len; + } + + /* + * The leftmost key on an internal page must be + * len 0, since it's just a placeholder and + * automatically sorts less than all keys. + * + * XXX + * This criterion does not currently hold! + * See todo list item #1686. Meanwhile, it's harmless + * to just not check for it. + */ +#if 0 + if (i == 0 && bi->len != 0) { + isbad = 1; + EPRINT((dbp->dbenv, + "Lowest key on internal page %lu of nonzero length", + (u_long)pgno)); + } +#endif + break; + case P_LBTREE: + case P_LDUP: + bk = GET_BKEYDATA(h, i); + if (B_TYPE(bk->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + goto overflow; + } else { + p2->data = bk->data; + p2->size = bk->len; + } + break; + default: + /* + * This means our caller screwed up and sent us + * an inappropriate page. + */ + TYPE_ERR_PRINT(dbp->dbenv, + "__bam_vrfy_itemorder", pgno, TYPE(h)) + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + if (0) { + /* + * If ovflok != 1, we can't safely go chasing + * overflow pages with the normal routines now; + * they might be unsafe or nonexistent. Mark this + * page as incomplete and return. + * + * Note that we don't need to worry about freeing + * buffers, since they can't have been allocated + * if overflow items are unsafe. + */ +overflow: if (!ovflok) { + F_SET(pip, VRFY_INCOMPLETE); + goto err; + } + + /* + * Overflow items are safe to chase. Do so. + * Fetch the overflow item into p2->data, + * NULLing it or reallocing it as appropriate. + * + * (We set p2->data to buf2 before the call + * so we're sure to realloc if we can and if p2 + * was just pointing at a non-overflow item.) + */ + p2->data = buf2; + if ((ret = __db_goff(dbp, + p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) { + isbad = 1; + EPRINT((dbp->dbenv, + "Error %lu in fetching overflow item %lu, page %lu", + (u_long)ret, (u_long)i, (u_long)pgno)); + } + /* In case it got realloc'ed and thus changed. */ + buf2 = p2->data; + } + + /* Compare with the last key. */ + if (p1->data != NULL && p2->data != NULL) { + cmp = func(dbp, p1, p2); + + /* comparison succeeded */ + if (cmp > 0) { + isbad = 1; + EPRINT((dbp->dbenv, + "Out-of-order key, page %lu item %lu", + (u_long)pgno, (u_long)i)); + /* proceed */ + } else if (cmp == 0) { + /* + * If they compared equally, this + * had better be a (sub)database with dups. + * Mark it so we can check during the + * structure check. 
+ */ + if (pip != NULL) + F_SET(pip, VRFY_HAS_DUPS); + else if (hasdups == 0) { + isbad = 1; + EPRINT((dbp->dbenv, + "Database with no duplicates has duplicated keys on page %lu", + (u_long)pgno)); + } + + /* + * If we're a btree leaf, check to see + * if the data items of these on-page dups are + * in sorted order. If not, flag this, so + * that we can make sure during the + * structure checks that the DUPSORT flag + * is unset. + * + * At this point i points to a duplicate key. + * Compare the datum before it (same key) + * to the datum after it, i.e. i-1 to i+1. + */ + if (TYPE(h) == P_LBTREE) { + /* + * Unsafe; continue and we'll pick + * up the bogus nentries later. + */ + if (i + 1 >= (db_indx_t)nentries) + continue; + + /* + * We don't bother with clever memory + * management with on-page dups, + * as it's only really a big win + * in the overflow case, and overflow + * dups are probably (?) rare. + */ + if (((ret = __bam_safe_getdata(dbp, + h, i - 1, ovflok, &dup1, + &freedup1)) != 0) || + ((ret = __bam_safe_getdata(dbp, + h, i + 1, ovflok, &dup2, + &freedup2)) != 0)) + goto err; + + /* + * If either of the data are NULL, + * it's because they're overflows and + * it's not safe to chase them now. + * Mark an incomplete and return. + */ + if (dup1.data == NULL || + dup2.data == NULL) { + DB_ASSERT(!ovflok); + F_SET(pip, VRFY_INCOMPLETE); + goto err; + } + + /* + * If the dups are out of order, + * flag this. It's not an error + * until we do the structure check + * and see whether DUPSORT is set. + */ + if (dupfunc(dbp, &dup1, &dup2) > 0) + F_SET(pip, VRFY_DUPS_UNSORTED); + + if (freedup1) + __os_free(dup1.data, 0); + if (freedup2) + __os_free(dup2.data, 0); + } + } + } + } + +err: if (pip != NULL && + ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) && ret == 0) + ret = t_ret; + + if (buf1 != NULL) + __os_free(buf1, 0); + if (buf2 != NULL) + __os_free(buf2, 0); + + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy_structure -- + * Verify the tree structure of a btree database (including the master + * database containing subdbs). 
+ * + * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__bam_vrfy_structure(dbp, vdp, meta_pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t meta_pgno; + u_int32_t flags; +{ + DB *pgset; + VRFY_PAGEINFO *mip, *rip; + db_pgno_t root, p; + int t_ret, ret; + u_int32_t nrecs, level, relen, stflags; + + mip = rip = 0; + pgset = vdp->pgset; + + if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0) + return (ret); + + if ((ret = __db_vrfy_pgset_get(pgset, meta_pgno, (int *)&p)) != 0) + goto err; + if (p != 0) { + EPRINT((dbp->dbenv, + "Btree metadata page number %lu observed twice", + (u_long)meta_pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + if ((ret = __db_vrfy_pgset_inc(pgset, meta_pgno)) != 0) + goto err; + + root = mip->root; + + if (root == 0) { + EPRINT((dbp->dbenv, + "Btree metadata page %lu has no root", (u_long)meta_pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + + if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0) + goto err; + + switch (rip->type) { + case P_IBTREE: + case P_LBTREE: + stflags = flags | ST_TOPLEVEL; + if (F_ISSET(mip, VRFY_HAS_DUPS)) + stflags |= ST_DUPOK; + if (F_ISSET(mip, VRFY_HAS_DUPSORT)) + stflags |= ST_DUPSORT; + if (F_ISSET(mip, VRFY_HAS_RECNUMS)) + stflags |= ST_RECNUM; + ret = __bam_vrfy_subtree(dbp, + vdp, root, NULL, NULL, stflags, NULL, NULL, NULL); + break; + case P_IRECNO: + case P_LRECNO: + stflags = flags | ST_RECNUM | ST_IS_RECNO | ST_TOPLEVEL; + if (mip->re_len > 0) + stflags |= ST_RELEN; + if ((ret = __bam_vrfy_subtree(dbp, vdp, + root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0) + goto err; + /* + * Even if mip->re_len > 0, re_len may come back zero if the + * tree is empty. It should be okay to just skip the check in + * this case, as if there are any non-deleted keys at all, + * that should never happen. + */ + if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) { + EPRINT((dbp->dbenv, + "Recno database with meta page %lu has bad re_len %lu", + (u_long)meta_pgno, (u_long)relen)); + ret = DB_VERIFY_BAD; + goto err; + } + ret = 0; + break; + case P_LDUP: + EPRINT((dbp->dbenv, + "Duplicate tree referenced from metadata page %lu", + (u_long)meta_pgno)); + ret = DB_VERIFY_BAD; + break; + default: + EPRINT((dbp->dbenv, + "Btree root of incorrect type %lu on meta page %lu", + (u_long)rip->type, (u_long)meta_pgno)); + ret = DB_VERIFY_BAD; + break; + } + +err: if (mip != NULL && + ((t_ret = __db_vrfy_putpageinfo(vdp, mip)) != 0) && ret == 0) + t_ret = ret; + if (rip != NULL && + ((t_ret = __db_vrfy_putpageinfo(vdp, rip)) != 0) && ret == 0) + t_ret = ret; + return (ret); +} + +/* + * __bam_vrfy_subtree-- + * Verify a subtree (or entire) btree with specified root. + * + * Note that this is public because it must be called to verify + * offpage dup trees, including from hash. 
+ * + * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *, + * PUBLIC: void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *)); + */ +int +__bam_vrfy_subtree(dbp, + vdp, pgno, l, r, flags, levelp, nrecsp, relenp) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t pgno; + void *l, *r; + u_int32_t flags, *levelp, *nrecsp, *relenp; +{ + BINTERNAL *li, *ri, *lp, *rp; + DB *pgset; + DBC *cc; + PAGE *h; + VRFY_CHILDINFO *child; + VRFY_PAGEINFO *pip; + db_recno_t nrecs, child_nrecs; + db_indx_t i; + int ret, t_ret, isbad, toplevel, p; + int (*func) __P((DB *, const DBT *, const DBT *)); + u_int32_t level, child_level, stflags, child_relen, relen; + + ret = isbad = 0; + nrecs = 0; + h = NULL; + relen = 0; + rp = (BINTERNAL *)r; + lp = (BINTERNAL *)l; + + /* Provide feedback on our progress to the application. */ + if (!LF_ISSET(DB_SALVAGE)) + __db_vrfy_struct_feedback(dbp, vdp); + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + cc = NULL; + level = pip->bt_level; + + toplevel = LF_ISSET(ST_TOPLEVEL); + LF_CLR(ST_TOPLEVEL); + + /* + * We are recursively descending a btree, starting from the root + * and working our way out to the leaves. + * + * There are four cases we need to deal with: + * 1. pgno is a recno leaf page. Any children are overflows. + * 2. pgno is a duplicate leaf page. Any children + * are overflow pages; traverse them, and then return + * level and nrecs. + * 3. pgno is an ordinary leaf page. Check whether dups are + * allowed, and if so, traverse any off-page dups or + * overflows. Then return nrecs and level. + * 4. pgno is a recno internal page. Recursively check any + * child pages, making sure their levels are one lower + * and their nrecs sum to ours. + * 5. pgno is a btree internal page. Same as #4, plus we + * must verify that for each pair of BINTERNAL entries + * N and N+1, the leftmost item on N's child sorts + * greater than N, and the rightmost item on N's child + * sorts less than N+1. + * + * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE), + * we need to verify the internal sort order is correct if, + * due to overflow items, we were not able to do so earlier. + */ + switch (pip->type) { + case P_LRECNO: + case P_LDUP: + case P_LBTREE: + /* + * Cases 1, 2 and 3 (overflow pages are common to all three); + * traverse child list, looking for overflows. + */ + if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) + goto err; + for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0; + ret = __db_vrfy_ccnext(cc, &child)) + if (child->type == V_OVERFLOW && + (ret = __db_vrfy_ovfl_structure(dbp, vdp, + child->pgno, child->tlen, + flags | ST_OVFL_LEAF)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto done; + } + + if ((ret = __db_vrfy_ccclose(cc)) != 0) + goto err; + cc = NULL; + + /* Case 1 */ + if (pip->type == P_LRECNO) { + if (!LF_ISSET(ST_IS_RECNO) && + !(LF_ISSET(ST_DUPOK) && !LF_ISSET(ST_DUPSORT))) { + isbad = 1; + EPRINT((dbp->dbenv, + "Recno leaf page %lu in non-recno tree", + (u_long)pgno)); + goto done; + } + goto leaf; + } else if (LF_ISSET(ST_IS_RECNO)) { + /* + * It's a non-recno leaf. Had better not be a recno + * subtree. + */ + isbad = 1; + EPRINT((dbp->dbenv, + "Non-recno leaf page %lu in recno tree", + (u_long)pgno)); + goto done; + } + + /* Case 2--no more work. */ + if (pip->type == P_LDUP) + goto leaf; + + /* Case 3 */ + + /* Check if we have any dups. */ + if (F_ISSET(pip, VRFY_HAS_DUPS)) { + /* If dups aren't allowed in this btree, trouble. 
*/ + if (!LF_ISSET(ST_DUPOK)) { + isbad = 1; + EPRINT((dbp->dbenv, + "Duplicates on page %lu in non-dup btree", + (u_long)pgno)); + } else { + /* + * We correctly have dups. If any are off-page, + * traverse those btrees recursively. + */ + if ((ret = + __db_vrfy_childcursor(vdp, &cc)) != 0) + goto err; + for (ret = __db_vrfy_ccset(cc, pgno, &child); + ret == 0; + ret = __db_vrfy_ccnext(cc, &child)) { + stflags = flags | ST_RECNUM | ST_DUPSET; + /* Skip any overflow entries. */ + if (child->type == V_DUPLICATE) { + if ((ret = __db_vrfy_duptype( + dbp, vdp, child->pgno, + stflags)) != 0) { + isbad = 1; + /* Next child. */ + continue; + } + if ((ret = __bam_vrfy_subtree( + dbp, vdp, child->pgno, NULL, + NULL, stflags, NULL, NULL, + NULL)) != 0) { + if (ret != + DB_VERIFY_BAD) + goto err; + else + isbad = 1; + } + } + } + + if ((ret = __db_vrfy_ccclose(cc)) != 0) + goto err; + cc = NULL; + + /* + * If VRFY_DUPS_UNSORTED is set, + * ST_DUPSORT had better not be. + */ + if (F_ISSET(pip, VRFY_DUPS_UNSORTED) && + LF_ISSET(ST_DUPSORT)) { + EPRINT((dbp->dbenv, + "Unsorted duplicate set at page %lu in sorted-dup database", + (u_long)pgno)); + isbad = 1; + } + } + } + goto leaf; + break; + case P_IBTREE: + case P_IRECNO: + /* We handle these below. */ + break; + default: + /* + * If a P_IBTREE or P_IRECNO contains a reference to an + * invalid page, we'll wind up here; handle it gracefully. + * Note that the code at the "done" label assumes that the + * current page is a btree/recno one of some sort; this + * is not the case here, so we goto err. + */ + EPRINT((dbp->dbenv, + "Page %lu is of inappropriate type %lu", + (u_long)pgno, (u_long)pip->type)); + ret = DB_VERIFY_BAD; + goto err; + } + + /* + * Cases 4 & 5: This is a btree or recno internal page. For each child, + * recurse, keeping a running count of nrecs and making sure the level + * is always reasonable. + */ + if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) + goto err; + for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0; + ret = __db_vrfy_ccnext(cc, &child)) + if (child->type == V_RECNO) { + if (pip->type != P_IRECNO) { + TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_subtree", + pgno, pip->type); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno, + NULL, NULL, flags, &child_level, &child_nrecs, + &child_relen)) != 0) { + if (ret != DB_VERIFY_BAD) + goto done; + else + isbad = 1; + } + + if (LF_ISSET(ST_RELEN)) { + if (relen == 0) + relen = child_relen; + /* + * child_relen may be zero if the child subtree + * is empty. + */ + else if (child_relen > 0 && + relen != child_relen) { + isbad = 1; + EPRINT((dbp->dbenv, + "Recno page %lu returned bad re_len", + (u_long)child->pgno)); + } + if (relenp) + *relenp = relen; + } + if (LF_ISSET(ST_RECNUM)) + nrecs += child_nrecs; + if (level != child_level + 1) { + isbad = 1; + EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu", + "Recno level incorrect on page ", + (u_long)child->pgno, ": got ", + (u_long)child_level, ", expected ", + (u_long)(level - 1))); + } + } else if (child->type == V_OVERFLOW && + (ret = __db_vrfy_ovfl_structure(dbp, vdp, + child->pgno, child->tlen, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto done; + } + + if ((ret = __db_vrfy_ccclose(cc)) != 0) + goto err; + cc = NULL; + + /* We're done with case 4. */ + if (pip->type == P_IRECNO) + goto done; + + /* + * Case 5. Btree internal pages. 
+ * As described above, we need to iterate through all the + * items on the page and make sure that our children sort appropriately + * with respect to them. + * + * For each entry, li will be the "left-hand" key for the entry + * itself, which must sort lower than all entries on its child; + * ri will be the key to its right, which must sort greater. + */ + if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + goto err; + for (i = 0; i < pip->entries; i += O_INDX) { + li = GET_BINTERNAL(h, i); + ri = (i + O_INDX < pip->entries) ? + GET_BINTERNAL(h, i + O_INDX) : NULL; + + /* + * The leftmost key is forcibly sorted less than all entries, + * so don't bother passing it. + */ + if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno, + i == 0 ? NULL : li, ri, flags, &child_level, + &child_nrecs, NULL)) != 0) { + if (ret != DB_VERIFY_BAD) + goto done; + else + isbad = 1; + } + + if (LF_ISSET(ST_RECNUM)) { + /* + * Keep a running tally on the actual record count so + * we can return it to our parent (if we have one) or + * compare it to the NRECS field if we're a root page. + */ + nrecs += child_nrecs; + + /* + * Make sure the actual record count of the child + * is equal to the value in the BINTERNAL structure. + */ + if (li->nrecs != child_nrecs) { + isbad = 1; + EPRINT((dbp->dbenv, + "Item %lu page %lu has incorrect record count of %lu, should be %lu", + (u_long)i, (u_long)pgno, (u_long)li->nrecs, + (u_long)child_nrecs)); + } + } + + if (level != child_level + 1) { + isbad = 1; + EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu", + "Btree level incorrect on page ", (u_long)li->pgno, + ": got ", (u_long)child_level, ", expected ", + (u_long)(level - 1))); + } + } + + if (0) { +leaf: level = LEAFLEVEL; + if (LF_ISSET(ST_RECNUM)) + nrecs = pip->rec_cnt; + + /* XXX + * We should verify that the record count on a leaf page + * is the sum of the number of keys and the number of + * records in its off-page dups. This requires looking + * at the page again, however, and it may all be changing + * soon, so for now we don't bother. + */ + + if (LF_ISSET(ST_RELEN) && relenp) + *relenp = pip->re_len; + } +done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) { + /* + * During the page-by-page pass, item order verification was + * not finished due to the presence of overflow items. If + * isbad == 0, though, it's now safe to do so, as we've + * traversed any child overflow pages. Do it. + */ + if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + goto err; + if ((ret = __bam_vrfy_itemorder(dbp, + vdp, h, pgno, 0, 1, 0, flags)) != 0) + goto err; + F_CLR(pip, VRFY_INCOMPLETE); + } + + /* + * Our parent has sent us BINTERNAL pointers to parent records + * so that we can verify our place with respect to them. If it's + * appropriate--we have a default sort function--verify this. + */ + if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) && lp != NULL) { + if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + goto err; + + /* + * __bam_vrfy_treeorder needs to know what comparison function + * to use. If ST_DUPSET is set, we're in a duplicate tree + * and we use the duplicate comparison function; otherwise, + * use the btree one. If unset, use the default, of course. + */ + func = LF_ISSET(ST_DUPSET) ? 
dbp->dup_compare : + ((BTREE *)dbp->bt_internal)->bt_compare; + if (func == NULL) + func = __bam_defcmp; + + if ((ret = __bam_vrfy_treeorder( + dbp, pgno, h, lp, rp, func, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + } + + /* + * This is guaranteed to succeed for leaf pages, but no harm done. + * + * Internal pages below the top level do not store their own + * record numbers, so we skip them. + */ + if (LF_ISSET(ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) { + isbad = 1; + EPRINT((dbp->dbenv, + "Bad record count on page %lu: got %lu, expected %lu", + (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt)); + } + + if (levelp) + *levelp = level; + if (nrecsp) + *nrecsp = nrecs; + + pgset = vdp->pgset; + if ((ret = __db_vrfy_pgset_get(pgset, pgno, &p)) != 0) + goto err; + if (p != 0) { + isbad = 1; + EPRINT((dbp->dbenv, "Page %lu linked twice", (u_long)pgno)); + } else if ((ret = __db_vrfy_pgset_inc(pgset, pgno)) != 0) + goto err; + +err: if (h != NULL && (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy_treeorder -- + * Verify that the lowest key on a page sorts greater than the + * BINTERNAL which points to it (lp), and the highest key + * sorts less than the BINTERNAL above that (rp). + * + * If lp is NULL, this means that it was the leftmost key on the + * parent, which (regardless of sort function) sorts less than + * all keys. No need to check it. + * + * If rp is NULL, lp was the highest key on the parent, so there's + * no higher key we must sort less than. + */ +static int +__bam_vrfy_treeorder(dbp, pgno, h, lp, rp, func, flags) + DB *dbp; + db_pgno_t pgno; + PAGE *h; + BINTERNAL *lp, *rp; + int (*func) __P((DB *, const DBT *, const DBT *)); + u_int32_t flags; +{ + BOVERFLOW *bo; + DBT dbt; + db_indx_t last; + int ret, cmp; + + memset(&dbt, 0, sizeof(DBT)); + F_SET(&dbt, DB_DBT_MALLOC); + ret = 0; + + switch (TYPE(h)) { + case P_IBTREE: + case P_LDUP: + last = NUM_ENT(h) - O_INDX; + break; + case P_LBTREE: + last = NUM_ENT(h) - P_INDX; + break; + default: + TYPE_ERR_PRINT(dbp->dbenv, + "__bam_vrfy_treeorder", pgno, TYPE(h)); + DB_ASSERT(0); + return (EINVAL); + } + + /* + * The key on page h, the child page, is more likely to be + * an overflow page, so we pass its offset, rather than lp/rp's, + * into __bam_cmp. This will take advantage of __db_moff. + */ + + /* + * Skip first-item check if we're an internal page--the first + * entry on an internal page is treated specially by __bam_cmp, + * so what's on the page shouldn't matter. (Plus, since we're passing + * our page and item 0 as to __bam_cmp, we'll sort before our + * parent and falsely report a failure.) + */ + if (lp != NULL && TYPE(h) != P_IBTREE) { + if (lp->type == B_KEYDATA) { + dbt.data = lp->data; + dbt.size = lp->len; + } else if (lp->type == B_OVERFLOW) { + bo = (BOVERFLOW *)lp->data; + if ((ret = __db_goff(dbp, &dbt, bo->tlen, bo->pgno, + NULL, NULL)) != 0) + return (ret); + } else { + DB_ASSERT(0); + EPRINT((dbp->dbenv, + "Unknown type for internal record")); + return (EINVAL); + } + + /* On error, fall through, free if neeeded, and return. 
*/ + if ((ret = __bam_cmp(dbp, &dbt, h, 0, func, &cmp)) == 0) { + if (cmp > 0) { + EPRINT((dbp->dbenv, + "First item on page %lu sorted greater than parent entry", + (u_long)PGNO(h))); + ret = DB_VERIFY_BAD; + } + } else + EPRINT((dbp->dbenv, + "First item on page %lu had comparison error", + (u_long)PGNO(h))); + + if (dbt.data != lp->data) + __os_free(dbt.data, 0); + if (ret != 0) + return (ret); + } + + if (rp != NULL) { + if (rp->type == B_KEYDATA) { + dbt.data = rp->data; + dbt.size = rp->len; + } else if (rp->type == B_OVERFLOW) { + bo = (BOVERFLOW *)rp->data; + if ((ret = __db_goff(dbp, &dbt, bo->tlen, bo->pgno, + NULL, NULL)) != 0) + return (ret); + } else { + DB_ASSERT(0); + EPRINT((dbp->dbenv, + "Unknown type for internal record")); + return (EINVAL); + } + + /* On error, fall through, free if neeeded, and return. */ + if ((ret = __bam_cmp(dbp, &dbt, h, last, func, &cmp)) == 0) { + if (cmp < 0) { + EPRINT((dbp->dbenv, + "Last item on page %lu sorted greater than parent entry", + (u_long)PGNO(h))); + ret = DB_VERIFY_BAD; + } + } else + EPRINT((dbp->dbenv, + "Last item on page %lu had comparison error", + (u_long)PGNO(h))); + + if (dbt.data != rp->data) + __os_free(dbt.data, 0); + } + + return (ret); +} + +/* + * __bam_salvage -- + * Safely dump out anything that looks like a key on an alleged + * btree leaf page. + * + * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, + * PUBLIC: PAGE *, void *, int (*)(void *, const void *), DBT *, + * PUBLIC: u_int32_t)); + */ +int +__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t pgno; + u_int32_t pgtype; + PAGE *h; + void *handle; + int (*callback) __P((void *, const void *)); + DBT *key; + u_int32_t flags; +{ + DBT dbt, unkdbt; + BKEYDATA *bk; + BOVERFLOW *bo; + db_indx_t i, beg, end; + u_int32_t himark; + u_int8_t *pgmap; + void *ovflbuf; + int t_ret, ret, err_ret; + + /* Shut up lint. */ + COMPQUIET(end, 0); + + ovflbuf = pgmap = NULL; + err_ret = ret = 0; + + memset(&dbt, 0, sizeof(DBT)); + dbt.flags = DB_DBT_REALLOC; + + memset(&unkdbt, 0, sizeof(DBT)); + unkdbt.size = strlen("UNKNOWN") + 1; + unkdbt.data = "UNKNOWN"; + + /* + * Allocate a buffer for overflow items. Start at one page; + * __db_safe_goff will realloc as needed. + */ + if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &ovflbuf)) != 0) + return (ret); + + if (LF_ISSET(DB_AGGRESSIVE)) { + if ((ret = + __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pgmap)) != 0) + goto err; + memset(pgmap, 0, dbp->pgsize); + } + + /* + * Loop through the inp array, spitting out key/data pairs. + * + * If we're salvaging normally, loop from 0 through NUM_ENT(h). + * If we're being aggressive, loop until we hit the end of the page-- + * NUM_ENT() may be bogus. + */ + himark = dbp->pgsize; + for (i = 0;; i += O_INDX) { + /* If we're not aggressive, break when we hit NUM_ENT(h). */ + if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h)) + break; + + /* Verify the current item. */ + ret = __db_vrfy_inpitem(dbp, + h, pgno, i, 1, flags, &himark, NULL); + /* If this returned a fatality, it's time to break. */ + if (ret == DB_VERIFY_FATAL) { + /* + * Don't return DB_VERIFY_FATAL; it's private + * and means only that we can't go on with this + * page, not with the whole database. It's + * not even an error if we've run into it + * after NUM_ENT(h). + */ + ret = (i < NUM_ENT(h)) ? DB_VERIFY_BAD : 0; + break; + } + + /* + * If this returned 0, it's safe to print or (carefully) + * try to fetch. 
+ */ + if (ret == 0) { + /* + * We only want to print deleted items if + * DB_AGGRESSIVE is set. + */ + bk = GET_BKEYDATA(h, i); + if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type)) + continue; + + /* + * We're going to go try to print the next item. If + * key is non-NULL, we're a dup page, so we've got to + * print the key first, unless SA_SKIPFIRSTKEY is set + * and we're on the first entry. + */ + if (key != NULL && + (i != 0 || !LF_ISSET(SA_SKIPFIRSTKEY))) + if ((ret = __db_prdbt(key, + 0, " ", handle, callback, 0, NULL)) != 0) + err_ret = ret; + + beg = h->inp[i]; + switch (B_TYPE(bk->type)) { + case B_DUPLICATE: + end = beg + BOVERFLOW_SIZE - 1; + /* + * If we're not on a normal btree leaf page, + * there shouldn't be off-page + * dup sets. Something's confused; just + * drop it, and the code to pick up unlinked + * offpage dup sets will print it out + * with key "UNKNOWN" later. + */ + if (pgtype != P_LBTREE) + break; + + bo = (BOVERFLOW *)bk; + + /* + * If the page number is unreasonable, or + * if this is supposed to be a key item, + * just spit out "UNKNOWN"--the best we + * can do is run into the data items in the + * unlinked offpage dup pass. + */ + if (!IS_VALID_PGNO(bo->pgno) || + (i % P_INDX == 0)) { + /* Not much to do on failure. */ + if ((ret = __db_prdbt(&unkdbt, 0, " ", + handle, callback, 0, NULL)) != 0) + err_ret = ret; + break; + } + + if ((ret = __db_salvage_duptree(dbp, + vdp, bo->pgno, &dbt, handle, callback, + flags | SA_SKIPFIRSTKEY)) != 0) + err_ret = ret; + + break; + case B_KEYDATA: + end = ALIGN(beg + bk->len, sizeof(u_int32_t)) - 1; + dbt.data = bk->data; + dbt.size = bk->len; + if ((ret = __db_prdbt(&dbt, + 0, " ", handle, callback, 0, NULL)) != 0) + err_ret = ret; + break; + case B_OVERFLOW: + end = beg + BOVERFLOW_SIZE - 1; + bo = (BOVERFLOW *)bk; + if ((ret = __db_safe_goff(dbp, vdp, + bo->pgno, &dbt, &ovflbuf, flags)) != 0) { + err_ret = ret; + /* We care about err_ret more. */ + (void)__db_prdbt(&unkdbt, 0, " ", + handle, callback, 0, NULL); + break; + } + if ((ret = __db_prdbt(&dbt, + 0, " ", handle, callback, 0, NULL)) != 0) + err_ret = ret; + break; + default: + /* + * We should never get here; __db_vrfy_inpitem + * should not be returning 0 if bk->type + * is unrecognizable. + */ + DB_ASSERT(0); + return (EINVAL); + } + + /* + * If we're being aggressive, mark the beginning + * and end of the item; we'll come back and print + * whatever "junk" is in the gaps in case we had + * any bogus inp elements and thereby missed stuff. + */ + if (LF_ISSET(DB_AGGRESSIVE)) { + pgmap[beg] = ITEM_BEGIN; + pgmap[end] = ITEM_END; + } + } + } + + /* + * If i is odd and this is a btree leaf, we've printed out a key but not + * a datum; fix this imbalance by printing an "UNKNOWN". + */ + if (pgtype == P_LBTREE && (i % P_INDX == 1) && ((ret = + __db_prdbt(&unkdbt, 0, " ", handle, callback, 0, NULL)) != 0)) + err_ret = ret; + +err: if (pgmap != NULL) + __os_free(pgmap, 0); + __os_free(ovflbuf, 0); + + /* Mark this page as done. */ + if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0) + return (t_ret); + + return ((err_ret != 0) ? err_ret : ret); +} + +/* + * __bam_salvage_walkdupint -- + * Walk a known-good btree or recno internal page which is part of + * a dup tree, calling __db_salvage_duptree on each child page. 
+ * + * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *, + * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t)); + */ +int +__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + DBT *key; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + RINTERNAL *ri; + BINTERNAL *bi; + int ret, t_ret; + db_indx_t i; + + ret = 0; + for (i = 0; i < NUM_ENT(h); i++) { + switch (TYPE(h)) { + case P_IBTREE: + bi = GET_BINTERNAL(h, i); + if ((t_ret = __db_salvage_duptree(dbp, + vdp, bi->pgno, key, handle, callback, flags)) != 0) + ret = t_ret; + case P_IRECNO: + ri = GET_RINTERNAL(h, i); + if ((t_ret = __db_salvage_duptree(dbp, + vdp, ri->pgno, key, handle, callback, flags)) != 0) + ret = t_ret; + break; + default: + __db_err(dbp->dbenv, + "__bam_salvage_walkdupint called on non-int. page"); + DB_ASSERT(0); + return (EINVAL); + } + /* Pass SA_SKIPFIRSTKEY, if set, on to the 0th child only. */ + flags &= ~LF_ISSET(SA_SKIPFIRSTKEY); + } + + return (ret); +} + +/* + * __bam_meta2pgset -- + * Given a known-good meta page, return in pgsetp a 0-terminated list of + * db_pgno_t's corresponding to the pages in the btree. + * + * We do this by a somewhat sleazy method, to avoid having to traverse the + * btree structure neatly: we walk down the left side to the very + * first leaf page, then we mark all the pages in the chain of + * NEXT_PGNOs (being wary of cycles and invalid ones), then we + * consolidate our scratch array into a nice list, and return. This + * avoids the memory management hassles of recursion and the + * trouble of walking internal pages--they just don't matter, except + * for the left branch. + * + * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *, + * PUBLIC: u_int32_t, DB *)); + */ +int +__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset) + DB *dbp; + VRFY_DBINFO *vdp; + BTMETA *btmeta; + u_int32_t flags; + DB *pgset; +{ + BINTERNAL *bi; + PAGE *h; + RINTERNAL *ri; + db_pgno_t current, p; + int err_ret, ret; + + h = NULL; + ret = err_ret = 0; + DB_ASSERT(pgset != NULL); + for (current = btmeta->root;;) { + if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) { + err_ret = DB_VERIFY_BAD; + goto err; + } + if ((ret = memp_fget(dbp->mpf, ¤t, 0, &h)) != 0) { + err_ret = ret; + goto err; + } + + switch (TYPE(h)) { + case P_IBTREE: + case P_IRECNO: + if ((ret = __bam_vrfy(dbp, + vdp, h, current, flags | DB_NOORDERCHK)) != 0) { + err_ret = ret; + goto err; + } + if (TYPE(h) == P_IBTREE) { + bi = GET_BINTERNAL(h, 0); + current = bi->pgno; + } else { /* P_IRECNO */ + ri = GET_RINTERNAL(h, 0); + current = ri->pgno; + } + break; + case P_LBTREE: + case P_LRECNO: + goto traverse; + default: + err_ret = DB_VERIFY_BAD; + goto err; + } + + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + err_ret = ret; + h = NULL; + } + + /* + * At this point, current is the pgno of leaf page h, the 0th in the + * tree we're concerned with. + */ +traverse: + while (IS_VALID_PGNO(current) && current != PGNO_INVALID) { + if (h == NULL && + (ret = memp_fget(dbp->mpf, ¤t, 0, &h) != 0)) { + err_ret = ret; + break; + } + + if ((ret = __db_vrfy_pgset_get(pgset, current, (int *)&p)) != 0) + goto err; + + if (p != 0) { + /* + * We've found a cycle. Return success anyway-- + * our caller may as well use however much of + * the pgset we've come up with. 
+ */ + break; + } + if ((ret = __db_vrfy_pgset_inc(pgset, current)) != 0) + goto err; + + current = NEXT_PGNO(h); + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + err_ret = ret; + h = NULL; + } + +err: if (h != NULL) + (void)memp_fput(dbp->mpf, h, 0); + + return (ret == 0 ? err_ret : ret); +} + +/* + * __bam_safe_getdata -- + * + * Utility function for __bam_vrfy_itemorder. Safely gets the datum at + * index i, page h, and sticks it in DBT dbt. If ovflok is 1 and i's an + * overflow item, we do a safe_goff to get the item and signal that we need + * to free dbt->data; if ovflok is 0, we leaves the DBT zeroed. + */ +static int +__bam_safe_getdata(dbp, h, i, ovflok, dbt, freedbtp) + DB *dbp; + PAGE *h; + u_int32_t i; + int ovflok; + DBT *dbt; + int *freedbtp; +{ + BKEYDATA *bk; + BOVERFLOW *bo; + + memset(dbt, 0, sizeof(DBT)); + *freedbtp = 0; + + bk = GET_BKEYDATA(h, i); + if (B_TYPE(bk->type) == B_OVERFLOW) { + if (!ovflok) + return (0); + + bo = (BOVERFLOW *)bk; + F_SET(dbt, DB_DBT_MALLOC); + + *freedbtp = 1; + return (__db_goff(dbp, dbt, bo->tlen, bo->pgno, NULL, NULL)); + } else { + dbt->data = bk->data; + dbt->size = bk->len; + } + + return (0); +} |
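
The file above implements the btree-specific half of Berkeley DB's verifier and salvager. As a usage illustration only, the sketch below shows how an application might drive these routines through the public DB handle, assuming the Berkeley DB 3.x API of this era (db_create, DB->verify, db_strerror); the helper name verify_and_salvage is made up for the example, and the flag names come from the OKFLAGS mask in bt_verify.c.

/*
 * Hypothetical driver, assuming the Berkeley DB 3.x API: run a
 * structural verify of a btree database file, then a salvage pass
 * that dumps whatever looks like key/data pairs to stdout.
 */
#include <stdio.h>
#include <db.h>

int
verify_and_salvage(fname)
	const char *fname;
{
	DB *dbp;
	int ret;

	/* DB->verify discards the handle, so create a fresh one per call. */
	if ((ret = db_create(&dbp, NULL, 0)) != 0) {
		fprintf(stderr, "db_create: %s\n", db_strerror(ret));
		return (ret);
	}

	/* Structure check only; DB_NOORDERCHK skips the key-order checks. */
	if ((ret = dbp->verify(dbp, fname, NULL, NULL, DB_NOORDERCHK)) != 0)
		fprintf(stderr, "verify(%s): %s\n", fname, db_strerror(ret));

	/* Salvage pass; DB_AGGRESSIVE walks pages even if NUM_ENT looks bogus. */
	if ((ret = db_create(&dbp, NULL, 0)) != 0)
		return (ret);
	if ((ret = dbp->verify(dbp, fname, NULL, stdout,
	    DB_SALVAGE | DB_AGGRESSIVE)) != 0)
		fprintf(stderr, "salvage(%s): %s\n", fname, db_strerror(ret));

	return (ret);
}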