42 files changed, 11409 insertions, 0 deletions
diff --git a/db/Makefile b/db/Makefile
new file mode 100644
index 0000000000..c7a385a486
--- /dev/null
+++ b/db/Makefile
@@ -0,0 +1,28 @@
+# Makefile for 4.4BSD db code in GNU C library.
+# This code is taken verbatim from the BSD db 1.85 package.  Only this
+# Makefile and compat.h were written for GNU libc, and the header files
+# moved up to this directory.
+
+subdir = db
+
+dbdirs = btree db hash mpool recno
+vpath %.c $(dbdirs)
+
+routines := bt_close bt_conv bt_debug bt_delete bt_get bt_open bt_overflow \
+	    bt_page bt_put bt_search bt_seq bt_split bt_utils		   \
+	    db								   \
+	    hash hash_bigkey hash_buf hash_func hash_log2 hash_page	   \
+	    ndbm							   \
+	    mpool							   \
+	    rec_close rec_delete rec_get rec_open rec_put rec_search	   \
+	    rec_seq rec_utils
+
+headers		:= db.h mpool.h ndbm.h
+distribute	:= compat.h					\
+		   btree/btree.h btree/extern.h			\
+		   hash/extern.h hash/hash.h hash/page.h	\
+		   recno/extern.h recno/recno.h
+
+include ../Rules
+
+CPPFLAGS += -D__DBINTERFACE_PRIVATE
diff --git a/db/btree/bt_close.c b/db/btree/bt_close.c
new file mode 100644
index 0000000000..27f9ab660f
--- /dev/null
+++ b/db/btree/bt_close.c
@@ -0,0 +1,182 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_close.c	8.7 (Berkeley) 8/17/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <db.h>
+#include "btree.h"
+
+static int bt_meta __P((BTREE *));
+
+/*
+ * __bt_close --
+ *	Flush a btree to its file, then release the tree and its handle.
+ *
+ * Parameters:
+ *	dbp:	handle to close; freed along with dbp->internal
+ *
+ * Returns:
+ *	RET_SUCCESS, or RET_ERROR if the sync, pool close or close(2) fails.
+ */
+int
+__bt_close(dbp)
+	DB *dbp;
+{
+	BTREE *tree;
+	int filedes;
+
+	tree = dbp->internal;
+
+	/* A page may still be pinned from an earlier call; release it. */
+	if (tree->bt_pinned != NULL) {
+		mpool_put(tree->bt_mp, tree->bt_pinned, 0);
+		tree->bt_pinned = NULL;
+	}
+
+	/* Sync, then close the pool; on failure return before any free. */
+	if (__bt_sync(dbp, 0) == RET_ERROR ||
+	    mpool_close(tree->bt_mp) == RET_ERROR)
+		return (RET_ERROR);
+
+	/* Release the cursor key and the bt_rkey/bt_rdata buffers. */
+	if (tree->bt_cursor.key.data != NULL) {
+		free(tree->bt_cursor.key.data);
+		tree->bt_cursor.key.data = NULL;
+		tree->bt_cursor.key.size = 0;
+	}
+	if (tree->bt_rkey.data != NULL) {
+		free(tree->bt_rkey.data);
+		tree->bt_rkey.data = NULL;
+		tree->bt_rkey.size = 0;
+	}
+	if (tree->bt_rdata.data != NULL) {
+		free(tree->bt_rdata.data);
+		tree->bt_rdata.data = NULL;
+		tree->bt_rdata.size = 0;
+	}
+
+	filedes = tree->bt_fd;
+	free(tree);
+	free(dbp);
+	if (close(filedes) != 0)
+		return (RET_ERROR);
+	return (RET_SUCCESS);
+}
+
+/*
+ * __bt_sync -- flush a modified btree through its memory pool.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *
+ * Returns:
+ *	RET_SUCCESS, RET_ERROR.
+ */
+int
+__bt_sync(dbp, flags)
+	const DB *dbp;
+	u_int flags;		/* must be 0, else EINVAL */
+{
+	BTREE *tree;
+	int rval;
+
+	tree = dbp->internal;
+
+	/* A page may still be pinned from an earlier call; release it. */
+	if (tree->bt_pinned != NULL) {
+		mpool_put(tree->bt_mp, tree->bt_pinned, 0);
+		tree->bt_pinned = NULL;
+	}
+
+	/* No sync flags are defined, so every non-zero value is rejected. */
+	if (flags != 0) {
+		errno = EINVAL;
+		return (RET_ERROR);
+	}
+
+	/* Nothing to write, or the meta page must be refreshed first. */
+	if (F_ISSET(tree, B_INMEM | B_RDONLY) || !F_ISSET(tree, B_MODIFIED))
+		return (RET_SUCCESS);
+	if (F_ISSET(tree, B_METADIRTY) && bt_meta(tree) == RET_ERROR)
+		return (RET_ERROR);
+
+	rval = mpool_sync(tree->bt_mp);
+	if (rval == RET_SUCCESS)
+		F_CLR(tree, B_MODIFIED);
+	return (rval);
+}
+
+/*
+ * bt_meta -- refresh the meta page (P_META) from the in-core tree.
+ *
+ * Parameters:
+ *	t:	tree
+ *
+ * Returns:
+ *	RET_SUCCESS, or RET_ERROR if the meta page cannot be fetched.
+ */
+static int
+bt_meta(t)
+	BTREE *t;
+{
+	BTMETA meta;
+	void *page;
+
+	page = mpool_get(t->bt_mp, P_META, 0);
+	if (page == NULL)
+		return (RET_ERROR);
+
+	/* Build the header locally, then copy it over the pinned page. */
+	meta.magic = BTREEMAGIC;
+	meta.version = BTREEVERSION;
+	meta.psize = t->bt_psize;
+	meta.free = t->bt_free;
+	meta.nrecs = t->bt_nrecs;
+	meta.flags = F_ISSET(t, SAVEMETA);
+	memmove(page, &meta, sizeof(meta));
+	mpool_put(t->bt_mp, page, MPOOL_DIRTY);	/* hand it back dirty */
+	return (RET_SUCCESS);
+}
diff --git a/db/btree/bt_conv.c b/db/btree/bt_conv.c
new file mode 100644
index 0000000000..1cb208b14d
--- /dev/null
+++ b/db/btree/bt_conv.c
@@ -0,0 +1,221 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_conv.c	8.5 (Berkeley) 8/17/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+
+#include <stdio.h>
+
+#include <db.h>
+#include "btree.h"
+
+static void mswap __P((PAGE *));
+
+/*
+ * __BT_PGIN, __BT_PGOUT --
+ *	Byte-swap a page in place between the host's number layout and the
+ *	one stored on disk; no-ops unless the tree has B_NEEDSWAP set.
+ *	t is the tree (a BTREE * passed as void *), pg the page number
+ *	(P_META selects the meta page layout) and pp the page to convert.
+ *	NOTE(review): in the P_BLEAF case the P_BIGDATA branch always steps
+ *	sizeof(u_int32_t) before swapping; with P_BIGKEY clear that step
+ *	starts right after the flag byte -- confirm this is intended.
+ */
+void
+__bt_pgin(t, pg, pp)
+	void *t;
+	pgno_t pg;
+	void *pp;
+{
+	PAGE *h;
+	indx_t i, top;
+	u_char flags;
+	char *p;
+
+	if (!F_ISSET(((BTREE *)t), B_NEEDSWAP))
+		return;
+	if (pg == P_META) {
+		mswap(pp);
+		return;
+	}
+
+	h = pp;
+	M_32_SWAP(h->pgno);	/* header first, then NEXTINDEX(h) below */
+	M_32_SWAP(h->prevpg);
+	M_32_SWAP(h->nextpg);
+	M_32_SWAP(h->flags);
+	M_16_SWAP(h->lower);
+	M_16_SWAP(h->upper);
+
+	top = NEXTINDEX(h);
+	if ((h->flags & P_TYPE) == P_BINTERNAL)
+		for (i = 0; i < top; i++) {
+			M_16_SWAP(h->linp[i]);	/* before locating the entry */
+			p = (char *)GETBINTERNAL(h, i);
+			P_32_SWAP(p);		/* first 32-bit word */
+			p += sizeof(u_int32_t);
+			P_32_SWAP(p);		/* second word, a pgno_t */
+			p += sizeof(pgno_t);
+			if (*(u_char *)p & P_BIGKEY) {	/* flag byte */
+				p += sizeof(u_char);
+				P_32_SWAP(p);	/* two more words follow it */
+				p += sizeof(pgno_t);
+				P_32_SWAP(p);
+			}
+		}
+	else if ((h->flags & P_TYPE) == P_BLEAF)
+		for (i = 0; i < top; i++) {
+			M_16_SWAP(h->linp[i]);	/* before locating the entry */
+			p = (char *)GETBLEAF(h, i);
+			P_32_SWAP(p);		/* first 32-bit word */
+			p += sizeof(u_int32_t);
+			P_32_SWAP(p);		/* second 32-bit word */
+			p += sizeof(u_int32_t);
+			flags = *(u_char *)p;	/* flag byte after the words */
+			if (flags & (P_BIGKEY | P_BIGDATA)) {
+				p += sizeof(u_char);
+				if (flags & P_BIGKEY) {
+					P_32_SWAP(p);	/* pgno_t, then a word */
+					p += sizeof(pgno_t);
+					P_32_SWAP(p);
+				}
+				if (flags & P_BIGDATA) {
+					p += sizeof(u_int32_t);
+					P_32_SWAP(p);	/* pgno_t, then a word */
+					p += sizeof(pgno_t);
+					P_32_SWAP(p);
+				}
+			}
+		}
+}
+
+void
+__bt_pgout(t, pg, pp)
+	void *t;		/* the tree (cast to BTREE *) */
+	pgno_t pg;		/* page number; P_META is the meta page */
+	void *pp;		/* page to convert, swapped in place */
+{
+	PAGE *h;
+	indx_t i, top;
+	u_char flags;
+	char *p;
+	/* Counterpart of __bt_pgin: values are used first, swapped after. */
+	if (!F_ISSET(((BTREE *)t), B_NEEDSWAP))
+		return;
+	if (pg == P_META) {
+		mswap(pp);
+		return;
+	}
+
+	h = pp;
+	top = NEXTINDEX(h);	/* taken before the header is swapped */
+	if ((h->flags & P_TYPE) == P_BINTERNAL)
+		for (i = 0; i < top; i++) {
+			p = (char *)GETBINTERNAL(h, i);
+			P_32_SWAP(p);		/* first 32-bit word */
+			p += sizeof(u_int32_t);
+			P_32_SWAP(p);		/* second word, a pgno_t */
+			p += sizeof(pgno_t);
+			if (*(u_char *)p & P_BIGKEY) {	/* flag byte */
+				p += sizeof(u_char);
+				P_32_SWAP(p);	/* two more words follow it */
+				p += sizeof(pgno_t);
+				P_32_SWAP(p);
+			}
+			M_16_SWAP(h->linp[i]);	/* after the entry is done */
+		}
+	else if ((h->flags & P_TYPE) == P_BLEAF)
+		for (i = 0; i < top; i++) {
+			p = (char *)GETBLEAF(h, i);
+			P_32_SWAP(p);		/* first 32-bit word */
+			p += sizeof(u_int32_t);
+			P_32_SWAP(p);		/* second 32-bit word */
+			p += sizeof(u_int32_t);
+			flags = *(u_char *)p;	/* flag byte after the words */
+			if (flags & (P_BIGKEY | P_BIGDATA)) {
+				p += sizeof(u_char);
+				if (flags & P_BIGKEY) {
+					P_32_SWAP(p);	/* pgno_t, then a word */
+					p += sizeof(pgno_t);
+					P_32_SWAP(p);
+				}
+				if (flags & P_BIGDATA) {
+					p += sizeof(u_int32_t);
+					P_32_SWAP(p);	/* pgno_t, then a word */
+					p += sizeof(pgno_t);
+					P_32_SWAP(p);
+				}
+			}
+			M_16_SWAP(h->linp[i]);	/* after the entry is done */
+		}
+
+	M_32_SWAP(h->pgno);	/* header last: it was still needed above */
+	M_32_SWAP(h->prevpg);
+	M_32_SWAP(h->nextpg);
+	M_32_SWAP(h->flags);
+	M_16_SWAP(h->lower);
+	M_16_SWAP(h->upper);
+}
+
+/*
+ * mswap -- byte-swap the meta page in place.
+ *
+ * The meta data is six consecutive 32-bit words: magic, version, psize,
+ * free, nrecs and flags, in that order.
+ *
+ * Parameters:
+ *	pg:	meta page to convert
+ */
+static void
+mswap(pg)
+	PAGE *pg;
+{
+	char *word;
+	int left;
+
+	word = (char *)pg;
+
+	/*
+	 * Swap the word under the pointer, then step on to the next one.
+	 */
+	for (left = 6; left > 0; --left) {
+		P_32_SWAP(word);
+		word += sizeof(u_int32_t);
+	}
+}
diff --git a/db/btree/bt_debug.c b/db/btree/bt_debug.c
new file mode 100644
index 0000000000..3aefbe7622
--- /dev/null
+++ b/db/btree/bt_debug.c
@@ -0,0 +1,329 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_debug.c	8.5 (Berkeley) 8/17/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <db.h>
+#include "btree.h"
+
+#ifdef DEBUG
+/*
+ * __bt_dump -- print a tree summary line, then every page, on stderr
+ *	Arguments are cast to the types their conversions expect.
+ * Parameters:
+ *	dbp:	pointer to the DB
+ */
+void
+__bt_dump(dbp)
+	DB *dbp;
+{
+	BTREE *t;
+	PAGE *h;
+	pgno_t i;
+	char *sep;
+
+	t = dbp->internal;
+	(void)fprintf(stderr, "%s: pgsz %d",
+	    F_ISSET(t, B_INMEM) ? "memory" : "disk", (int)t->bt_psize);
+	if (F_ISSET(t, R_RECNO))
+		(void)fprintf(stderr, " keys %lu", (u_long)t->bt_nrecs);
+#undef X
+#define	X(flag, name) \
+	if (F_ISSET(t, flag)) { \
+		(void)fprintf(stderr, "%s%s", sep, name); \
+		sep = ", "; \
+	}
+	/* sep becomes ", " after the first name, i.e. once "(" is out. */
+	sep = " flags (";
+	X(R_FIXLEN,	"FIXLEN");
+	X(B_INMEM,	"INMEM");
+	X(B_NODUPS,	"NODUPS");
+	X(B_RDONLY,	"RDONLY");
+	X(R_RECNO,	"RECNO");
+	X(B_METADIRTY,"METADIRTY");
+	/* Always end the line; close the list only if it was opened. */
+	(void)fprintf(stderr, "%s\n", sep[0] == ',' ? ")" : "");
+#undef X
+
+	for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) {
+		__bt_dpage(h);
+		(void)mpool_put(t->bt_mp, h, 0);
+	}
+}
+
+/*
+ * __bt_dmpage -- dump the meta page on stderr; h is read as a BTMETA.
+ *	Fields are cast to u_long so they match the %lx/%lu conversions.
+ * Parameters:
+ *	h:	pointer to the PAGE
+ */
+void
+__bt_dmpage(h)
+	PAGE *h;
+{
+	BTMETA *m;
+	char *sep;
+
+	m = (BTMETA *)h;
+	(void)fprintf(stderr, "magic %lx\n", (u_long)m->magic);
+	(void)fprintf(stderr, "version %lu\n", (u_long)m->version);
+	(void)fprintf(stderr, "psize %lu\n", (u_long)m->psize);
+	(void)fprintf(stderr, "free %lu\n", (u_long)m->free);
+	(void)fprintf(stderr, "nrecs %lu\n", (u_long)m->nrecs);
+	(void)fprintf(stderr, "flags %lu", (u_long)m->flags);
+#undef X
+#define	X(flag, name) \
+	if (m->flags & flag) { \
+		(void)fprintf(stderr, "%s%s", sep, name); \
+		sep = ", "; \
+	}
+	if (m->flags) {
+		sep = " (";	/* replaced by ", " after the first name */
+		X(B_NODUPS,	"NODUPS");
+		X(R_RECNO,	"RECNO");
+		(void)fprintf(stderr, ")");
+	}
+}
+
+/*
+ * __bt_dnpage -- dump the page with the given number
+ *
+ * Nothing is printed when the page cannot be fetched from the pool.
+ * The page is released again once it has been dumped.
+ */
+void
+__bt_dnpage(dbp, pgno)
+	DB *dbp;		/* pointer to the DB */
+	pgno_t pgno;		/* number of the page to dump */
+{
+	BTREE *tree;
+	PAGE *page;
+
+	tree = dbp->internal;
+	if ((page = mpool_get(tree->bt_mp, pgno, 0)) == NULL)
+		return;
+	__bt_dpage(page);
+	(void)mpool_put(tree->bt_mp, page, 0);
+}
+
+/*
+ * __bt_dpage -- dump a page on stderr: flags, links, then each entry.
+ *	Byte strings are printed with explicit lengths and page numbers
+ *	are cast to u_long to match their %lu conversions.
+ * Parameters: h is a pointer to the PAGE.
+ */
+void
+__bt_dpage(h)
+	PAGE *h;
+{
+	BINTERNAL *bi;
+	BLEAF *bl;
+	RINTERNAL *ri;
+	RLEAF *rl;
+	indx_t cur, top;
+	char *sep;
+
+	(void)fprintf(stderr, "    page %d: (", h->pgno);
+#undef X
+#define	X(flag, name) \
+	if (h->flags & flag) { \
+		(void)fprintf(stderr, "%s%s", sep, name); \
+		sep = ", "; \
+	}
+	sep = "";
+	X(P_BINTERNAL,	"BINTERNAL")		/* types */
+	X(P_BLEAF,	"BLEAF")
+	X(P_RINTERNAL,	"RINTERNAL")		/* types */
+	X(P_RLEAF,	"RLEAF")
+	X(P_OVERFLOW,	"OVERFLOW")
+	X(P_PRESERVE,	"PRESERVE");
+	(void)fprintf(stderr, ")\n");
+#undef X
+
+	(void)fprintf(stderr, "\tprev %2d next %2d", h->prevpg, h->nextpg);
+	if (h->flags & P_OVERFLOW)	/* overflow pages: links only */
+		return;
+	top = NEXTINDEX(h);
+	(void)fprintf(stderr, " lower %3d upper %3d nextind %d\n",
+	    h->lower, h->upper, top);
+	for (cur = 0; cur < top; cur++) {
+		(void)fprintf(stderr, "\t[%03d] %4d ", cur, h->linp[cur]);
+		switch (h->flags & P_TYPE) {
+		case P_BINTERNAL:
+			bi = GETBINTERNAL(h, cur);
+			(void)fprintf(stderr,
+			    "size %03d pgno %03d", bi->ksize, bi->pgno);
+			if (bi->flags & P_BIGKEY)
+				(void)fprintf(stderr, " (indirect)");
+			else if (bi->ksize)
+				(void)fprintf(stderr,
+				    " {%.*s}", (int)bi->ksize, bi->bytes);
+			break;
+		case P_RINTERNAL:
+			ri = GETRINTERNAL(h, cur);
+			(void)fprintf(stderr, "entries %03d pgno %03d",
+				ri->nrecs, ri->pgno);
+			break;
+		case P_BLEAF:
+			bl = GETBLEAF(h, cur);
+			if (bl->flags & P_BIGKEY)
+				(void)fprintf(stderr,
+				    "big key page %lu size %u/",
+				    (u_long)*(pgno_t *)bl->bytes,
+				    *(u_int32_t *)(bl->bytes + sizeof(pgno_t)));
+			else if (bl->ksize)	/* bounded: no NUL assumed */
+				(void)fprintf(stderr,
+				    "%.*s/", (int)bl->ksize, bl->bytes);
+			if (bl->flags & P_BIGDATA)
+				(void)fprintf(stderr,
+				    "big data page %lu size %u",
+				    (u_long)*(pgno_t *)(bl->bytes + bl->ksize),
+				    *(u_int32_t *)(bl->bytes + bl->ksize +
+				    sizeof(pgno_t)));
+			else if (bl->dsize)
+				(void)fprintf(stderr, "%.*s",
+				    (int)bl->dsize, bl->bytes + bl->ksize);
+			break;
+		case P_RLEAF:
+			rl = GETRLEAF(h, cur);
+			if (rl->flags & P_BIGDATA)
+				(void)fprintf(stderr,
+				    "big data page %lu size %u",
+				    (u_long)*(pgno_t *)rl->bytes,
+				    *(u_int32_t *)(rl->bytes + sizeof(pgno_t)));
+			else if (rl->dsize)
+				(void)fprintf(stderr,
+				    "%.*s", (int)rl->dsize, rl->bytes);
+			break;
+		}
+		(void)fprintf(stderr, "\n");
+	}
+}
+#endif
+
+#ifdef STATISTICS
+/*
+ * __bt_stat -- gather the tree statistics and print them on stderr.
+ *	Page counts are kept as u_long so that they match the %ld/%lu
+ *	conversions and the byte totals computed from them.
+ * Parameters: dbp is a pointer to the DB.
+ */
+void
+__bt_stat(dbp)
+	DB *dbp;
+{
+	extern u_long bt_cache_hit, bt_cache_miss, bt_pfxsaved, bt_rootsplit;
+	extern u_long bt_sortsplit, bt_split;
+	BTREE *t;
+	PAGE *h;
+	pgno_t i;
+	u_long ifree, lfree, nkeys, pcont, pinternal, pleaf;
+	int levels;
+
+	t = dbp->internal;
+	pcont = pinternal = pleaf = 0;
+	nkeys = ifree = lfree = 0;
+	for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) {
+		switch (h->flags & P_TYPE) {
+		case P_BINTERNAL:
+		case P_RINTERNAL:
+			++pinternal;
+			ifree += h->upper - h->lower;
+			break;
+		case P_BLEAF:
+		case P_RLEAF:
+			++pleaf;
+			lfree += h->upper - h->lower;
+			nkeys += NEXTINDEX(h);
+			break;
+		case P_OVERFLOW:
+			++pcont;
+			break;
+		}
+		(void)mpool_put(t->bt_mp, h, 0);
+	}
+
+	/* Count the levels of the tree. */
+	for (i = P_ROOT, levels = 0 ;; ++levels) {
+		if ((h = mpool_get(t->bt_mp, i, 0)) == NULL)
+			break;		/* page unavailable: stop descending */
+		if (h->flags & (P_BLEAF|P_RLEAF)) {
+			if (levels == 0)
+				levels = 1;
+			(void)mpool_put(t->bt_mp, h, 0);
+			break;
+		}
+		i = F_ISSET(t, R_RECNO) ?
+		    GETRINTERNAL(h, 0)->pgno : GETBINTERNAL(h, 0)->pgno;
+		(void)mpool_put(t->bt_mp, h, 0);
+	}
+
+	(void)fprintf(stderr, "%d level%s with %ld keys",
+	    levels, levels == 1 ? "" : "s", nkeys);
+	if (F_ISSET(t, R_RECNO))
+		(void)fprintf(stderr, " (%ld header count)", (long)t->bt_nrecs);
+	(void)fprintf(stderr,
+	    "\n%lu pages (leaf %ld, internal %ld, overflow %ld)\n",
+	    pinternal + pleaf + pcont, pleaf, pinternal, pcont);
+	(void)fprintf(stderr, "%ld cache hits, %ld cache misses\n",
+	    bt_cache_hit, bt_cache_miss);
+	(void)fprintf(stderr, "%ld splits (%ld root splits, %ld sort splits)\n",
+	    bt_split, bt_rootsplit, bt_sortsplit);
+	pleaf *= t->bt_psize - BTDATAOFF;	/* pages -> usable bytes */
+	if (pleaf)
+		(void)fprintf(stderr,
+		    "%.0f%% leaf fill (%ld bytes used, %ld bytes free)\n",
+		    ((double)(pleaf - lfree) / pleaf) * 100,
+		    pleaf - lfree, lfree);
+	pinternal *= t->bt_psize - BTDATAOFF;	/* pages -> usable bytes */
+	if (pinternal)
+		(void)fprintf(stderr,
+		    "%.0f%% internal fill (%ld bytes used, %ld bytes free)\n",
+		    ((double)(pinternal - ifree) / pinternal) * 100,
+		    pinternal - ifree, ifree);
+	if (bt_pfxsaved)
+		(void)fprintf(stderr, "prefix checking removed %lu bytes.\n",
+		    bt_pfxsaved);
+}
+#endif
diff --git a/db/btree/bt_delete.c b/db/btree/bt_delete.c
new file mode 100644
index 0000000000..ece1ab656e
--- /dev/null
+++ b/db/btree/bt_delete.c
@@ -0,0 +1,657 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_delete.c	8.13 (Berkeley) 7/28/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <db.h>
+#include "btree.h"
+
+static int __bt_bdelete __P((BTREE *, const DBT *));
+static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int));
+static int __bt_pdelete __P((BTREE *, PAGE *));
+static int __bt_relink __P((BTREE *, PAGE *));
+static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *));
+
+/*
+ * __bt_delete --
+ *	Delete by key (flags 0) or at the cursor (flags R_CURSOR).  Fails
+ *	with EPERM on a B_RDONLY tree and with EINVAL for other flags or a
+ *	cursor lacking CURS_INIT; otherwise returns the delete's status.
+ */
+int
+__bt_delete(dbp, key, flags)
+	const DB *dbp;
+	const DBT *key;
+	u_int flags;
+{
+	BTREE *t;
+	CURSOR *c;
+	PAGE *h;
+	int status;
+
+	t = dbp->internal;
+
+	/* Toss any page pinned across calls. */
+	if (t->bt_pinned != NULL) {
+		mpool_put(t->bt_mp, t->bt_pinned, 0);
+		t->bt_pinned = NULL;
+	}
+
+	/* Check for change to a read-only tree. */
+	if (F_ISSET(t, B_RDONLY)) {
+		errno = EPERM;
+		return (RET_ERROR);
+	}
+
+	switch (flags) {
+	case 0:
+		status = __bt_bdelete(t, key);
+		break;
+	case R_CURSOR:
+		/*
+		 * Delete at the cursor: needs CURS_INIT (a scan was started);
+		 * CURS_ACQUIRE, CURS_AFTER or CURS_BEFORE gives RET_SPECIAL.
+		 */
+		c = &t->bt_cursor;
+		if (F_ISSET(c, CURS_INIT)) {
+			if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
+				return (RET_SPECIAL);
+			if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
+				return (RET_ERROR);
+
+			/*
+			 * If the page is about to be emptied, we'll need to
+			 * delete it, which means we have to acquire a stack.
+			 */
+			if (NEXTINDEX(h) == 1)
+				if (__bt_stkacq(t, &h, &t->bt_cursor))
+					return (RET_ERROR);	/* h already put */
+
+			status = __bt_dleaf(t, NULL, h, c->pg.index);
+
+			if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) {
+				if (__bt_pdelete(t, h))
+					return (RET_ERROR);
+			} else
+				mpool_put(t->bt_mp,
+				    h, status == RET_SUCCESS ? MPOOL_DIRTY : 0);
+			break;
+		}
+		/* FALLTHROUGH: the cursor has no CURS_INIT */
+	default:
+		errno = EINVAL;
+		return (RET_ERROR);
+	}
+	if (status == RET_SUCCESS)
+		F_SET(t, B_MODIFIED);	/* makes the next __bt_sync write */
+	return (status);
+}
+
+/*
+ * __bt_stkacq --
+ *	Acquire a stack so we can delete a cursor entry: search for the
+ *	cursor's key, then follow the leaf chain right and, failing that,
+ *	left, looking for the cursor's page.
+ * Parameters:
+ *	  t:	tree
+ *	 hp:	in: pinned PAGE pointer (released); out: new pin of c->pg.pgno
+ *	  c:	pointer to the cursor
+ * Returns:
+ *	0 on success, 1 on failure
+ */
+static int
+__bt_stkacq(t, hp, c)
+	BTREE *t;
+	PAGE **hp;
+	CURSOR *c;
+{
+	BINTERNAL *bi;
+	EPG *e;
+	EPGNO *parent;
+	PAGE *h;
+	indx_t index;
+	pgno_t pgno;
+	recno_t nextpg, prevpg;	/* NOTE(review): these hold page numbers */
+	int exact, level;
+	/* NOTE(review): if BT_POP runs dry below, h and index are stale. */
+	/*
+	 * Find the first occurrence of the key in the tree.  Toss the
+	 * currently locked page so we don't hit an already-locked page.
+	 */
+	h = *hp;
+	mpool_put(t->bt_mp, h, 0);
+	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
+		return (1);
+	h = e->page;
+
+	/* See if we got it in one shot. */
+	if (h->pgno == c->pg.pgno)
+		goto ret;
+
+	/*
+	 * Move right, looking for the page.  At each move we have to move
+	 * up the stack until we don't have to move to the next page.  If
+	 * we have to change pages at an internal level, we have to fix the
+	 * stack back up.
+	 */
+	while (h->pgno != c->pg.pgno) {
+		if ((nextpg = h->nextpg) == P_INVALID)
+			break;		/* end of the chain: try leftwards */
+		mpool_put(t->bt_mp, h, 0);
+
+		/* Move up the stack. */
+		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
+			/* Get the parent page. */
+			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+				return (1);
+
+			/* Move to the next index. */
+			if (parent->index != NEXTINDEX(h) - 1) {
+				index = parent->index + 1;
+				BT_PUSH(t, h->pgno, index);
+				break;	/* h stays pinned for the descent */
+			}
+			mpool_put(t->bt_mp, h, 0);
+		}
+
+		/* Restore the stack. */
+		while (level--) {
+			/* Push the next level down onto the stack. */
+			bi = GETBINTERNAL(h, index);
+			pgno = bi->pgno;
+			BT_PUSH(t, pgno, 0);	/* enter the child at index 0 */
+
+			/* Lose the currently pinned page. */
+			mpool_put(t->bt_mp, h, 0);
+
+			/* Get the next level down. */
+			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
+				return (1);
+			index = 0;
+		}
+		mpool_put(t->bt_mp, h, 0);
+		if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)
+			return (1);
+	}
+
+	if (h->pgno == c->pg.pgno)
+		goto ret;
+
+	/* Reacquire the original stack. */
+	mpool_put(t->bt_mp, h, 0);
+	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
+		return (1);
+	h = e->page;
+
+	/*
+	 * Move left, looking for the page.  At each move we have to move
+	 * up the stack until we don't have to change pages to move to the
+	 * next page.  If we have to change pages at an internal level, we
+	 * have to fix the stack back up.
+	 */
+	while (h->pgno != c->pg.pgno) {
+		if ((prevpg = h->prevpg) == P_INVALID)
+			break;		/* start of the chain: give up */
+		mpool_put(t->bt_mp, h, 0);
+
+		/* Move up the stack. */
+		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
+			/* Get the parent page. */
+			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+				return (1);
+
+			/* Move to the previous index. */
+			if (parent->index != 0) {
+				index = parent->index - 1;
+				BT_PUSH(t, h->pgno, index);
+				break;	/* h stays pinned for the descent */
+			}
+			mpool_put(t->bt_mp, h, 0);
+		}
+
+		/* Restore the stack. */
+		while (level--) {
+			/* Push the next level down onto the stack. */
+			bi = GETBINTERNAL(h, index);
+			pgno = bi->pgno;
+
+			/* Lose the currently pinned page. */
+			mpool_put(t->bt_mp, h, 0);
+
+			/* Get the next level down. */
+			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
+				return (1);
+
+			index = NEXTINDEX(h) - 1;	/* enter at the last index */
+			BT_PUSH(t, pgno, index);
+		}
+		mpool_put(t->bt_mp, h, 0);
+		if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)
+			return (1);
+	}
+	
+
+ret:	mpool_put(t->bt_mp, h, 0);	/* drop h; the cursor page is re-pinned */
+	return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);
+}
+
+/*
+ * __bt_bdelete --
+ *	Delete all key/data pairs matching the specified key, searching
+ *	again whenever matches may continue on a neighbouring page.
+ * Parameters:
+ *	  t:	tree
+ *	key:	key to delete
+ * Returns:
+ *	RET_SUCCESS; RET_SPECIAL if the key was not found (and nothing has
+ *	been deleted yet); RET_ERROR.
+ */
+static int
+__bt_bdelete(t, key)
+	BTREE *t;
+	const DBT *key;
+{
+	EPG *e;
+	PAGE *h;
+	int deleted, exact, redo;
+
+	deleted = 0;		/* set once any record has been removed */
+
+	/* Find any matching record; __bt_search pins the page. */
+loop:	if ((e = __bt_search(t, key, &exact)) == NULL)
+		return (deleted ? RET_SUCCESS : RET_ERROR);
+	if (!exact) {
+		mpool_put(t->bt_mp, e->page, 0);
+		return (deleted ? RET_SUCCESS : RET_SPECIAL);
+	}
+
+	/*
+	 * Delete forward, then delete backward, from the found key.  If
+	 * there are duplicates and we reach either side of the page, do
+	 * the key search again, so that we get them all.
+	 */
+	redo = 0;
+	h = e->page;
+	do {
+		if (__bt_dleaf(t, key, h, e->index)) {
+			mpool_put(t->bt_mp, h, 0);
+			return (RET_ERROR);
+		}
+		if (F_ISSET(t, B_NODUPS)) {	/* one match at most: done */
+			if (NEXTINDEX(h) == 0) {
+				if (__bt_pdelete(t, h))
+					return (RET_ERROR);
+			} else
+				mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+			return (RET_SUCCESS);
+		}
+		deleted = 1;
+	} while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);
+
+	/* Check for right-hand edge of the page. */
+	if (e->index == NEXTINDEX(h))
+		redo = 1;
+
+	/* Delete from the key to the beginning of the page. */
+	while (e->index-- > 0) {	/* decremented before the compare */
+		if (__bt_cmp(t, key, e) != 0)
+			break;
+		if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) {
+			mpool_put(t->bt_mp, h, 0);
+			return (RET_ERROR);
+		}
+		if (e->index == 0)	/* reached the left-hand edge */
+			redo = 1;
+	}
+
+	/* Check for an empty page. */
+	if (NEXTINDEX(h) == 0) {
+		if (__bt_pdelete(t, h))
+			return (RET_ERROR);
+		goto loop;
+	}
+
+	/* Put the page. */
+	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+
+	if (redo)
+		goto loop;
+	return (RET_SUCCESS);
+}
+
+/*
+ * __bt_pdelete --
+ *	Delete a single page from the tree: remove its key from the parent
+ *	found on the stack (walking up while parents empty), then unlink
+ *	and free the page itself unless it is the root.
+ * Parameters:
+ *	t:	tree
+ *	h:	leaf page
+ *
+ * Returns:
+ *	RET_SUCCESS, RET_ERROR; the final relink/free result is 0 or 1.
+ * Side-effects:
+ *	mpool_put's the page
+ */
+static int
+__bt_pdelete(t, h)
+	BTREE *t;
+	PAGE *h;
+{
+	BINTERNAL *bi;
+	PAGE *pg;
+	EPGNO *parent;
+	indx_t cnt, index, *ip, offset;
+	u_int32_t nksize;
+	char *from;
+
+	/*
+	 * Walk the parent page stack -- a LIFO stack of the pages that were
+	 * traversed when we searched for the page where the delete occurred.
+	 * Each stack entry is a page number and a page index offset.  The
+	 * offset is for the page traversed on the search.  We've just deleted
+	 * a page, so we have to delete the key from the parent page.
+	 *
+	 * If the delete from the parent page makes it empty, this process may
+	 * continue all the way up the tree.  We stop if we reach the root page
+	 * (which is never deleted, it's just not worth the effort) or if the
+	 * delete does not empty the page.
+	 */
+	while ((parent = BT_POP(t)) != NULL) {
+		/* Get the parent page. */
+		if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+			return (RET_ERROR);	/* NOTE(review): h stays pinned */
+		
+		index = parent->index;
+		bi = GETBINTERNAL(pg, index);
+
+		/* Free any overflow pages. */
+		if (bi->flags & P_BIGKEY &&
+		    __ovfl_delete(t, bi->bytes) == RET_ERROR) {
+			mpool_put(t->bt_mp, pg, 0);
+			return (RET_ERROR);	/* NOTE(review): h stays pinned */
+		}
+
+		/*
+		 * Free the parent if it has only the one key and it's not the
+		 * root page. If it's the rootpage, turn it back into an empty
+		 * leaf page.
+		 */
+		if (NEXTINDEX(pg) == 1)
+			if (pg->pgno == P_ROOT) {
+				pg->lower = BTDATAOFF;
+				pg->upper = t->bt_psize;
+				pg->flags = P_BLEAF;
+			} else {	/* non-root parent holding one key */
+				if (__bt_relink(t, pg) || __bt_free(t, pg))
+					return (RET_ERROR);
+				continue;	/* go on to the next stack entry */
+			}
+		else {		/* the parent keeps its other keys */
+			/* Pack remaining key items at the end of the page. */
+			nksize = NBINTERNAL(bi->ksize);
+			from = (char *)pg + pg->upper;
+			memmove(from + nksize, from, (char *)bi - from);
+			pg->upper += nksize;
+
+			/* Adjust indices' offsets, shift the indices down. */
+			offset = pg->linp[index];
+			for (cnt = index, ip = &pg->linp[0]; cnt--; ++ip)
+				if (ip[0] < offset)
+					ip[0] += nksize;
+			for (cnt = NEXTINDEX(pg) - index; --cnt; ++ip)
+				ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1];
+			pg->lower -= sizeof(indx_t);
+		}
+
+		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
+		break;		/* this parent was kept: stop walking up */
+	}
+
+	/* Free the leaf page, as long as it wasn't the root. */
+	if (h->pgno == P_ROOT) {
+		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+		return (RET_SUCCESS);
+	}
+	return (__bt_relink(t, h) || __bt_free(t, h));	/* 0 or 1 */
+}
+
+/*
+ * __bt_dleaf --
+ *	Delete a single record from a leaf page.
+ *
+ * Parameters:
+ *	t:	tree
+ *    key:	referenced key (only handed on to __bt_curdel)
+ *	h:	page; it is modified in place and not mpool_put here
+ *	index:	index on page to delete
+ *
+ * Returns:
+ *	RET_SUCCESS, RET_ERROR.
+ */
+int
+__bt_dleaf(t, key, h, index)
+	BTREE *t;
+	const DBT *key;
+	PAGE *h;
+	u_int index;
+{
+	BLEAF *bl;
+	indx_t cnt, *ip, offset;
+	u_int32_t nbytes;
+	void *to;
+	char *from;
+
+	/* If this record is referenced by the cursor, delete the cursor. */
+	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
+	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
+	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == index &&
+	    __bt_curdel(t, key, h, index))
+		return (RET_ERROR);
+
+	/* If the entry uses overflow pages, make them available for reuse. */
+	to = bl = GETBLEAF(h, index);
+	if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR)
+		return (RET_ERROR);
+	if (bl->flags & P_BIGDATA &&
+	    __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR)
+		return (RET_ERROR);
+
+	/* Close the gap: slide the items stored below this one up over it. */
+	nbytes = NBLEAF(bl);
+	from = (char *)h + h->upper;
+	memmove(from + nbytes, from, (char *)to - from);
+	h->upper += nbytes;
+
+	/* Fix offsets of the items that moved; drop index's slot from linp. */
+	offset = h->linp[index];
+	for (cnt = index, ip = &h->linp[0]; cnt--; ++ip)
+		if (ip[0] < offset)
+			ip[0] += nbytes;
+	for (cnt = NEXTINDEX(h) - index; --cnt; ++ip)
+		ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];
+	h->lower -= sizeof(indx_t);
+
+	/* A cursor positioned later on this page has moved down one slot. */
+	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
+	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
+	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > index)
+		--t->bt_cursor.pg.index;
+
+	return (RET_SUCCESS);
+}
+
+/*
+ * __bt_curdel --
+ *	Delete the cursor: move it to a duplicate, else save the key in it.
+ *
+ * Parameters:
+ *	t:	tree
+ *    key:	referenced key (or NULL)
+ *	h:	page
+ *  index:	index on page to delete
+ *
+ * Returns:
+ *	RET_SUCCESS, RET_ERROR.
+ */
+static int
+__bt_curdel(t, key, h, index)
+	BTREE *t;
+	const DBT *key;
+	PAGE *h;
+	u_int index;
+{
+	CURSOR *c;
+	EPG e;
+	PAGE *pg;
+	int curcopy, status;
+
+	/*
+	 * If there are duplicates, move forward or backward to one (at dup2).
+	 * Otherwise, copy the key into the cursor area and set CURS_ACQUIRE.
+	 */
+	c = &t->bt_cursor;
+	F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE);
+
+	curcopy = 0;
+	if (!F_ISSET(t, B_NODUPS)) {
+		/*
+		 * We're going to have to do comparisons.  If we weren't
+		 * provided a copy of the key, i.e. the user is deleting
+		 * the current cursor position, get one.
+		 */
+		if (key == NULL) {
+			e.page = h;
+			e.index = index;
+			if ((status = __bt_ret(t, &e,
+			    &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS)
+				return (status);
+			curcopy = 1;	/* key is saved; skip __bt_ret below */
+			key = &c->key;
+		}
+		/* Check previous key, if not at the beginning of the page. */
+		if (index > 0) { 
+			e.page = h;
+			e.index = index - 1;
+			if (__bt_cmp(t, key, &e) == 0) {
+				F_SET(c, CURS_BEFORE);
+				goto dup2;
+			}
+		}
+		/* Check next key, if not at the end of the page. */
+		if (index < NEXTINDEX(h) - 1) {
+			e.page = h;
+			e.index = index + 1;
+			if (__bt_cmp(t, key, &e) == 0) {
+				F_SET(c, CURS_AFTER);
+				goto dup2;
+			}
+		}
+		/* Check the last key of the previous page if at index 0. */
+		if (index == 0 && h->prevpg != P_INVALID) {
+			if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
+				return (RET_ERROR);
+			e.page = pg;
+			e.index = NEXTINDEX(pg) - 1;
+			if (__bt_cmp(t, key, &e) == 0) {
+				F_SET(c, CURS_BEFORE);
+				goto dup1;
+			}
+			mpool_put(t->bt_mp, pg, 0);
+		}
+		/* Check the first key of the next page if at the last index. */
+		if (index == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) {
+			if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
+				return (RET_ERROR);
+			e.page = pg;
+			e.index = 0;
+			if (__bt_cmp(t, key, &e) == 0) {
+				F_SET(c, CURS_AFTER);
+dup1:				mpool_put(t->bt_mp, pg, 0);
+dup2:				c->pg.pgno = e.page->pgno;
+				c->pg.index = e.index;
+				return (RET_SUCCESS);
+			}
+			mpool_put(t->bt_mp, pg, 0);
+		}
+	}
+	e.page = h;
+	e.index = index;
+	if (curcopy || (status =
+	    __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) {
+		F_SET(c, CURS_ACQUIRE);
+		return (RET_SUCCESS);
+	}
+	return (status);
+}
+
+/*
+ * __bt_relink -- Link a page's neighbors to each other, bypassing it.
+ *
+ * Parameters: t is the tree, h the page to be deleted.
+ * Returns: 0, or RET_ERROR if a neighboring page cannot be fetched.
+ */
+static int
+__bt_relink(t, h)
+	BTREE *t;
+	PAGE *h;
+{
+	PAGE *nbr;
+
+	if (h->nextpg != P_INVALID) {
+		nbr = mpool_get(t->bt_mp, h->nextpg, 0);
+		if (nbr == NULL)
+			return (RET_ERROR);
+		nbr->prevpg = h->prevpg;
+		mpool_put(t->bt_mp, nbr, MPOOL_DIRTY);
+	}
+	if (h->prevpg != P_INVALID) {
+		nbr = mpool_get(t->bt_mp, h->prevpg, 0);
+		if (nbr == NULL)
+			return (RET_ERROR);
+		nbr->nextpg = h->nextpg;
+		mpool_put(t->bt_mp, nbr, MPOOL_DIRTY);
+	}
+	return (0);
+}
diff --git a/db/btree/bt_get.c b/db/btree/bt_get.c
new file mode 100644
index 0000000000..74824c73f4
--- /dev/null
+++ b/db/btree/bt_get.c
@@ -0,0 +1,105 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_get.c	8.6 (Berkeley) 7/20/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <db.h>
+#include "btree.h"
+
+/*
+ * __BT_GET -- Look up a key in the btree and return its data.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *	key:	key to find
+ *	data:	filled in with the matching record's data
+ *	flags:	must be zero; anything else fails with EINVAL
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS, or RET_SPECIAL if the key is not found.
+ */
+int
+__bt_get(dbp, key, data, flags)
+	const DB *dbp;
+	const DBT *key;
+	DBT *data;
+	u_int flags;
+{
+	BTREE *tree = dbp->internal;
+	EPG *hit;
+	int matched, rval;
+
+	/* A page left pinned by an earlier call is no longer needed. */
+	if (tree->bt_pinned != NULL) {
+		mpool_put(tree->bt_mp, tree->bt_pinned, 0);
+		tree->bt_pinned = NULL;
+	}
+
+	/* No flags are defined for get; reject any that are set. */
+	if (flags != 0) {
+		errno = EINVAL;
+		return (RET_ERROR);
+	}
+
+	/* Find the key; anything short of an exact match is "not found". */
+	hit = __bt_search(tree, key, &matched);
+	if (hit == NULL)
+		return (RET_ERROR);
+	if (matched == 0) {
+		mpool_put(tree->bt_mp, hit->page, 0);
+		return (RET_SPECIAL);
+	}
+
+	rval = __bt_ret(tree, hit, NULL, NULL, data, &tree->bt_rdata, 0);
+
+	/*
+	 * Keep the page pinned until the next call, unless the user is
+	 * doing concurrent access: then the data was copied, so toss it.
+	 */
+	if (!F_ISSET(tree, B_DB_LOCK))
+		tree->bt_pinned = hit->page;
+	else
+		mpool_put(tree->bt_mp, hit->page, 0);
+	return (rval);
+}
diff --git a/db/btree/bt_open.c b/db/btree/bt_open.c
new file mode 100644
index 0000000000..f052249777
--- /dev/null
+++ b/db/btree/bt_open.c
@@ -0,0 +1,444 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_open.c	8.10 (Berkeley) 8/17/94";
+#endif /* LIBC_SCCS and not lint */
+
+/*
+ * Implementation of btree access method for 4.4BSD.
+ *
+ * The design here was originally based on that of the btree access method
+ * used in the Postgres database system at UC Berkeley.  This implementation
+ * is wholly independent of the Postgres code.
+ */
+
+#include <sys/param.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <db.h>
+#include "btree.h"
+
+#ifdef DEBUG			/* Debug builds accept pages as small as 128. */
+#undef	MINPSIZE
+#define	MINPSIZE	128
+#endif
+
+static int byteorder __P((void));	/* host byte order, or 0 if unknown */
+static int nroot __P((BTREE *));	/* create the root of a new tree */
+static int tmp __P((void));		/* open an unlinked temporary file */
+
+/*
+ * __BT_OPEN -- Open a btree.
+ *
+ * Checks the open parameters, creates and fills the DB and BTREE structs,
+ * opens (or creates) the backing file and attaches a buffer pool to it.
+ *
+ * Parameters:
+ *	fname:	filename (NULL for in-memory trees)
+ *	flags:	open flag bits
+ *	mode:	open permission bits
+ *	openinfo: BTREEINFO pointer (NULL selects the defaults)
+ *	dflags:	DB_LOCK, DB_SHMEM and DB_TXN global flag bits
+ *
+ * Returns:
+ *	NULL on failure, pointer to DB on success.
+ */
+DB *
+__bt_open(fname, flags, mode, openinfo, dflags)
+	const char *fname;
+	int flags, mode, dflags;
+	const BTREEINFO *openinfo;
+{
+	struct stat sb;
+	BTMETA m;
+	BTREE *t;
+	BTREEINFO b;
+	DB *dbp;
+	pgno_t ncache;
+	ssize_t nr;
+	int machine_lorder, saved_errno;
+
+	t = NULL;
+
+	/*
+	 * Intention is to make sure all of the user's selections are okay
+	 * here and then use them without checking.  Can't be complete, since
+	 * we don't know the right page size, lorder or flags until the backing
+	 * file is opened.  Also, the file's page size can cause the cachesize
+	 * to change.
+	 */
+	machine_lorder = byteorder();
+	if (openinfo) {
+		b = *openinfo;
+
+		/* Flags: R_DUP. */
+		if (b.flags & ~(R_DUP))
+			goto einval;
+
+		/*
+		 * Page size must be indx_t aligned and >= MINPSIZE.  Default
+		 * page size is set farther on, based on the underlying file
+		 * transfer size.
+		 */
+		if (b.psize &&
+		    (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
+		    b.psize & (sizeof(indx_t) - 1)))
+			goto einval;
+
+		/* Minimum number of keys per page; absolute minimum is 2. */
+		if (b.minkeypage) {
+			if (b.minkeypage < 2)
+				goto einval;
+		} else
+			b.minkeypage = DEFMINKEYPAGE;
+
+		/* If no comparison, use default comparison and prefix. */
+		if (b.compare == NULL) {
+			b.compare = __bt_defcmp;
+			if (b.prefix == NULL)
+				b.prefix = __bt_defpfx;
+		}
+
+		if (b.lorder == 0)
+			b.lorder = machine_lorder;
+	} else {
+		b.compare = __bt_defcmp;
+		b.cachesize = 0;
+		b.flags = 0;
+		b.lorder = machine_lorder;
+		b.minkeypage = DEFMINKEYPAGE;
+		b.prefix = __bt_defpfx;
+		b.psize = 0;
+	}
+
+	/* Check for the ubiquitous PDP-11. */
+	if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
+		goto einval;
+
+	/* Allocate and initialize (zero-filled) DB and BTREE structures. */
+	if ((t = calloc(1, sizeof(BTREE))) == NULL)
+		goto err;
+	t->bt_fd = -1;			/* Don't close unopened fd on error. */
+	t->bt_lorder = b.lorder;
+	t->bt_order = NOT;
+	t->bt_cmp = b.compare;
+	t->bt_pfx = b.prefix;
+	t->bt_rfd = -1;
+
+	if ((t->bt_dbp = dbp = calloc(1, sizeof(DB))) == NULL)
+		goto err;
+	if (t->bt_lorder != machine_lorder)
+		F_SET(t, B_NEEDSWAP);
+
+	dbp->type = DB_BTREE;
+	dbp->internal = t;
+	dbp->close = __bt_close;
+	dbp->del = __bt_delete;
+	dbp->fd = __bt_fd;
+	dbp->get = __bt_get;
+	dbp->put = __bt_put;
+	dbp->seq = __bt_seq;
+	dbp->sync = __bt_sync;
+
+	/*
+	 * If no file name was supplied, this is an in-memory btree and we
+	 * open a backing temporary file.  Otherwise, it's a disk-based tree.
+	 */
+	if (fname) {
+		switch (flags & O_ACCMODE) {
+		case O_RDONLY:
+			F_SET(t, B_RDONLY);
+			break;
+		case O_RDWR:
+			break;
+		case O_WRONLY:
+		default:
+			goto einval;
+		}
+
+		if ((t->bt_fd = open(fname, flags, mode)) < 0)
+			goto err;
+	} else {
+		if ((flags & O_ACCMODE) != O_RDWR)
+			goto einval;
+		if ((t->bt_fd = tmp()) == -1)
+			goto err;
+		F_SET(t, B_INMEM);
+	}
+
+	if (fcntl(t->bt_fd, F_SETFD, FD_CLOEXEC) == -1)
+		goto err;
+
+	if (fstat(t->bt_fd, &sb))
+		goto err;
+	if (sb.st_size) {
+		if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
+			goto err;
+		if (nr != sizeof(BTMETA))
+			goto eftype;
+
+		/*
+		 * Read in the meta-data.  This can change the notion of what
+		 * the lorder, page size and flags are, and, when the page size
+		 * changes, the cachesize value can change too.  If the user
+		 * specified the wrong byte order for an existing database, we
+		 * don't bother to return an error, we just clear the NEEDSWAP
+		 * bit.
+		 */
+		if (m.magic == BTREEMAGIC)
+			F_CLR(t, B_NEEDSWAP);
+		else {
+			F_SET(t, B_NEEDSWAP);
+			M_32_SWAP(m.magic);
+			M_32_SWAP(m.version);
+			M_32_SWAP(m.psize);
+			M_32_SWAP(m.free);
+			M_32_SWAP(m.nrecs);
+			M_32_SWAP(m.flags);
+		}
+		if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
+			goto eftype;
+		if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
+		    m.psize & (sizeof(indx_t) - 1))
+			goto eftype;
+		if (m.flags & ~SAVEMETA)
+			goto eftype;
+		b.psize = m.psize;
+		F_SET(t, m.flags);
+		t->bt_free = m.free;
+		t->bt_nrecs = m.nrecs;
+	} else {
+		/*
+		 * Set the page size to the best value for I/O to this file.
+		 * Don't overflow the page offset type.
+		 */
+		if (b.psize == 0) {
+			b.psize = sb.st_blksize;
+			if (b.psize < MINPSIZE)
+				b.psize = MINPSIZE;
+			if (b.psize > MAX_PAGE_OFFSET + 1)
+				b.psize = MAX_PAGE_OFFSET + 1;
+		}
+
+		/* Set flag if duplicates permitted. */
+		if (!(b.flags & R_DUP))
+			F_SET(t, B_NODUPS);
+
+		t->bt_free = P_INVALID;
+		t->bt_nrecs = 0;
+		F_SET(t, B_METADIRTY);
+	}
+
+	t->bt_psize = b.psize;
+
+	/* Set the cache size; must be a multiple of the page size. */
+	if (b.cachesize && b.cachesize & (b.psize - 1))
+		b.cachesize += (~b.cachesize & (b.psize - 1)) + 1;
+	if (b.cachesize < b.psize * MINCACHE)
+		b.cachesize = b.psize * MINCACHE;
+
+	/* Calculate number of pages to cache. */
+	ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;
+
+	/*
+	 * The btree data structure requires that at least two keys can fit on
+	 * a page, but other than that there's no fixed requirement.  The user
+	 * specified a minimum number per page, and we translated that into the
+	 * number of bytes a key/data pair can use before being placed on an
+	 * overflow page.  This calculation includes the page header, the size
+	 * of the index referencing the leaf item and the size of the leaf item
+	 * structure.  Also, don't let the user specify a minkeypage such that
+	 * a key/data pair won't fit even if both key and data are on overflow
+	 * pages.
+	 */
+	t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage -
+	    (sizeof(indx_t) + NBLEAFDBT(0, 0));
+	if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
+		t->bt_ovflsize =
+		    NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);
+
+	/* Initialize the buffer pool. */
+	if ((t->bt_mp =
+	    mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
+		goto err;
+	if (!F_ISSET(t, B_INMEM))
+		mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
+
+	/* Create a root page if new tree. */
+	if (nroot(t) == RET_ERROR)
+		goto err;
+
+	/* Global flags. */
+	if (dflags & DB_LOCK)
+		F_SET(t, B_DB_LOCK);
+	if (dflags & DB_SHMEM)
+		F_SET(t, B_DB_SHMEM);
+	if (dflags & DB_TXN)
+		F_SET(t, B_DB_TXN);
+
+	return (dbp);
+
+einval:	errno = EINVAL;
+	goto err;
+
+eftype:	errno = EFTYPE;
+	goto err;
+
+err:	saved_errno = errno;		/* cleanup must not clobber the cause */
+	if (t) {
+		if (t->bt_mp != NULL)
+			(void)mpool_close(t->bt_mp);
+		free(t->bt_dbp);
+		if (t->bt_fd != -1)
+			(void)close(t->bt_fd);
+		free(t);
+	}
+	errno = saved_errno;
+	return (NULL);
+}
+
+/*
+ * NROOT -- Create the meta-data and root pages of a new tree; a tree whose
+ *	    page 0 already exists is left untouched.
+ * Parameters:
+ *	t:	tree
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS
+ */
+static int
+nroot(t)
+	BTREE *t;
+{
+	PAGE *meta, *root;
+	pgno_t npg;
+
+	if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) {
+		mpool_put(t->bt_mp, meta, 0);
+		return (RET_SUCCESS);
+	}
+	if (errno != EINVAL)		/* It's OK to not exist. */
+		return (RET_ERROR);
+	errno = 0;
+
+	if ((meta = mpool_new(t->bt_mp, &npg)) == NULL)
+		return (RET_ERROR);
+	if ((root = mpool_new(t->bt_mp, &npg)) == NULL || npg != P_ROOT) {
+		if (root != NULL)	/* don't leave new pages pinned */
+			mpool_put(t->bt_mp, root, 0);
+		mpool_put(t->bt_mp, meta, 0);
+		return (RET_ERROR);
+	}
+	root->pgno = npg;
+	root->prevpg = root->nextpg = P_INVALID;
+	root->lower = BTDATAOFF;
+	root->upper = t->bt_psize;
+	root->flags = P_BLEAF;
+	memset(meta, 0, t->bt_psize);
+	mpool_put(t->bt_mp, meta, MPOOL_DIRTY);
+	mpool_put(t->bt_mp, root, MPOOL_DIRTY);
+	return (RET_SUCCESS);
+}
+
+static int
+tmp()
+{
+	sigset_t all, saved;
+	int fd;
+	char *dir, path[MAXPATHLEN];
+
+	/* Build a mkstemp template under $TMPDIR, falling back to /tmp. */
+	if ((dir = getenv("TMPDIR")) == NULL)
+		dir = "/tmp";
+	(void)snprintf(path, sizeof(path), "%s/bt.XXXXXX", dir);
+	/* Create and unlink the file with every signal blocked. */
+	(void)sigfillset(&all);
+	(void)sigprocmask(SIG_BLOCK, &all, &saved);
+	if ((fd = mkstemp(path)) != -1)
+		(void)unlink(path);
+	(void)sigprocmask(SIG_SETMASK, &saved, NULL);
+	return(fd);
+}
+
+static int
+byteorder()
+{
+	u_int32_t probe;
+	u_char lead;
+
+	/* The byte stored first in memory gives the host's order away. */
+	probe = 0x01020304;
+	lead = *(u_char *)&probe;
+
+	if (lead == 1)
+		return (BIG_ENDIAN);
+	if (lead == 4)
+		return (LITTLE_ENDIAN);
+	/* Some other layout: report the order as unknown. */
+	return (0);
+}
+
+int
+__bt_fd(dbp)
+	const DB *dbp;
+{
+	BTREE *tree = dbp->internal;
+
+	/* Release whatever page an earlier call left pinned. */
+	if (tree->bt_pinned != NULL) {
+		mpool_put(tree->bt_mp, tree->bt_pinned, 0);
+		tree->bt_pinned = NULL;
+	}
+
+	/*
+	 * A disk-based tree hands back its descriptor; an in-memory
+	 * database can't have one and fails with ENOENT.
+	 */
+	if (!F_ISSET(tree, B_INMEM))
+		return (tree->bt_fd);
+	errno = ENOENT;
+	return (-1);
+}
diff --git a/db/btree/bt_overflow.c b/db/btree/bt_overflow.c
new file mode 100644
index 0000000000..b28b8e0471
--- /dev/null
+++ b/db/btree/bt_overflow.c
@@ -0,0 +1,228 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_overflow.c	8.5 (Berkeley) 7/16/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <db.h>
+#include "btree.h"
+
+/*
+ * Big key/data code.
+ *
+ * Big key and data entries are stored on linked lists of pages.  The initial
+ * reference is a byte string, stored with the key or data, that holds the
+ * page number and size.  The actual record is stored in a chain of pages
+ * linked by the nextpg field of the PAGE header.
+ *
+ * The first page of the chain has a special property.  If the record is used
+ * by an internal page, it cannot be deleted and the P_PRESERVE bit will be set
+ * in the header.
+ *
+ * XXX
+ * A single DBT is written to each chain, so a lot of space on the last page
+ * is wasted.  This is a fairly major bug for some data sets.
+ */
+
+/*
+ * __OVFL_GET -- Get an overflow key/data item.
+ *
+ * Parameters:
+ *	t:	tree
+ *	p:	pointer to { pgno_t, u_int32_t }
+ *	ssz:	set to the item's size
+ *	buf:	storage address
+ *	bufsz:	storage size
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS
+ */
+int
+__ovfl_get(t, p, ssz, buf, bufsz)
+	BTREE *t;
+	void *p;
+	size_t *ssz;
+	void **buf;
+	size_t *bufsz;
+{
+	PAGE *h;
+	pgno_t pg;
+	size_t nb, plen;
+	u_int32_t sz;
+
+	memmove(&pg, p, sizeof(pgno_t));
+	memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(u_int32_t));
+	*ssz = sz;
+#ifdef DEBUG
+	if (pg == P_INVALID || sz == 0)
+		abort();
+#endif
+	/* Make the buffer bigger as necessary (p is free for reuse by now). */
+	if (*bufsz < sz) {
+		p = *buf == NULL ? malloc(sz) : realloc(*buf, sz);
+		if (p == NULL)
+			return (RET_ERROR);
+		*buf = p;		/* a failure above keeps the old buffer */
+		*bufsz = sz;
+	}
+
+	/*
+	 * Step through the linked list of pages, copying the data on each one
+	 * into the buffer.  Never copy more than the data's length.
+	 */
+	plen = t->bt_psize - BTDATAOFF;
+	for (p = *buf;; p = (char *)p + nb, pg = h->nextpg) {
+		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+			return (RET_ERROR);
+
+		nb = MIN(sz, plen);
+		memmove(p, (char *)h + BTDATAOFF, nb);
+		mpool_put(t->bt_mp, h, 0);
+		if ((sz -= nb) == 0)
+			break;
+	}
+	return (RET_SUCCESS);
+}
+
+/*
+ * __OVFL_PUT -- Store an overflow key/data item.
+ *
+ * Parameters:
+ *	t:	tree
+ *	dbt:	DBT to store
+ *	pg:	set to the number of the first page of the chain
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS
+ */
+int
+__ovfl_put(t, dbt, pg)
+	BTREE *t;
+	const DBT *dbt;
+	pgno_t *pg;
+{
+	PAGE *h, *last;
+	void *p;
+	pgno_t npg;
+	size_t nb, plen;
+	u_int32_t sz;
+
+	/*
+	 * Allocate pages and copy the key/data record into them.  Store the
+	 * number of the first page in the chain.
+	 */
+	plen = t->bt_psize - BTDATAOFF;
+	for (last = NULL, p = dbt->data, sz = dbt->size;;
+	    p = (char *)p + plen, last = h) {
+		if ((h = __bt_new(t, &npg)) == NULL) {
+			if (last != NULL)	/* unpin the chain's tail */
+				mpool_put(t->bt_mp, last, MPOOL_DIRTY);
+			return (RET_ERROR);
+		}
+		h->pgno = npg;
+		h->nextpg = h->prevpg = P_INVALID;
+		h->flags = P_OVERFLOW;
+		h->lower = h->upper = 0;
+
+		nb = MIN(sz, plen);
+		memmove((char *)h + BTDATAOFF, p, nb);
+		if (last) {
+			last->nextpg = h->pgno;
+			mpool_put(t->bt_mp, last, MPOOL_DIRTY);
+		} else
+			*pg = h->pgno;
+		if ((sz -= nb) == 0) {
+			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+			break;
+		}
+	}
+	return (RET_SUCCESS);
+}
+
+/*
+ * __OVFL_DELETE -- Delete an overflow chain (unless P_PRESERVE is set).
+ *
+ * Parameters:
+ *	t:	tree
+ *	p:	pointer to { pgno_t, u_int32_t }
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS
+ */
+int
+__ovfl_delete(t, p)
+	BTREE *t;
+	void *p;
+{
+	PAGE *h;
+	pgno_t pg;
+	size_t plen;
+	u_int32_t sz;
+
+	memmove(&pg, p, sizeof(pgno_t));
+	memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(u_int32_t));
+#ifdef DEBUG
+	if (pg == P_INVALID || sz == 0)
+		abort();
+#endif
+	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+		return (RET_ERROR);
+
+	/* Don't delete chains used by internal pages. */
+	if (h->flags & P_PRESERVE) {
+		mpool_put(t->bt_mp, h, 0);
+		return (RET_SUCCESS);
+	}
+
+	/* Step through the chain, calling the free routine for each page. */
+	for (plen = t->bt_psize - BTDATAOFF;; sz -= plen) {
+		pg = h->nextpg;
+		if (__bt_free(t, h) == RET_ERROR)
+			return (RET_ERROR);
+		if (sz <= plen)
+			break;
+		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+			return (RET_ERROR);
+	}
+	return (RET_SUCCESS);
+}
diff --git a/db/btree/bt_page.c b/db/btree/bt_page.c
new file mode 100644
index 0000000000..0d9d138d5c
--- /dev/null
+++ b/db/btree/bt_page.c
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_page.c	8.3 (Berkeley) 7/14/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <stdio.h>
+
+#include <db.h>
+#include "btree.h"
+
+/*
+ * __bt_free -- Put a page on the freelist.
+ *
+ * Parameters:
+ *	t:	tree
+ *	h:	page to free
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS
+ *
+ * Side-effect:
+ *	mpool_put's the page and flags the tree's meta-data as dirty.
+ */
+int
+__bt_free(t, h)
+	BTREE *t;
+	PAGE *h;
+{
+	/* Insert the page at the head of the free list. */
+	h->prevpg = P_INVALID;
+	h->nextpg = t->bt_free;
+	t->bt_free = h->pgno;
+	F_SET(t, B_METADIRTY);		/* the new list head must reach disk */
+
+	/* Make sure the page gets written back. */
+	return (mpool_put(t->bt_mp, h, MPOOL_DIRTY));
+}
+
+/*
+ * __bt_new -- Get a new page, preferably from the freelist.
+ *
+ * Parameters:
+ *	t:	tree
+ *	npg:	storage for page number.
+ *
+ * Returns:
+ *	Pointer to a page, NULL on error.
+ */
+PAGE *
+__bt_new(t, npg)
+	BTREE *t;
+	pgno_t *npg;
+{
+	PAGE *h;
+
+	if (t->bt_free != P_INVALID &&
+	    (h = mpool_get(t->bt_mp, t->bt_free, 0)) != NULL) {
+		*npg = t->bt_free;
+		t->bt_free = h->nextpg;
+		F_SET(t, B_METADIRTY);	/* the list head changed */
+		return (h);
+	}
+	return (mpool_new(t->bt_mp, npg));
+}
diff --git a/db/btree/bt_put.c b/db/btree/bt_put.c
new file mode 100644
index 0000000000..952be09e55
--- /dev/null
+++ b/db/btree/bt_put.c
@@ -0,0 +1,320 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_put.c	8.8 (Berkeley) 7/26/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <db.h>
+#include "btree.h"
+
+static EPG *bt_fast __P((BTREE *, const DBT *, const DBT *, int *));
+
+/*
+ * __BT_PUT -- Add a btree item to the tree.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *	key:	key
+ *	data:	data
+ *	flags:	0, R_NOOVERWRITE or R_CURSOR (needs a usable cursor); else EINVAL
+ *
+ * Returns:
+ *	RET_SUCCESS, RET_ERROR (errno is EPERM for a read-only tree), or
+ *	RET_SPECIAL if the key is already in the tree with R_NOOVERWRITE.
+ */
+int
+__bt_put(dbp, key, data, flags)
+	const DB *dbp;
+	DBT *key;
+	const DBT *data;
+	u_int flags;
+{
+	BTREE *t;
+	DBT tkey, tdata;
+	EPG *e;
+	PAGE *h;
+	indx_t index, nxtindex;
+	pgno_t pg;
+	u_int32_t nbytes;
+	int dflags, exact, status;
+	char *dest, db[NOVFLSIZE], kb[NOVFLSIZE];
+
+	t = dbp->internal;
+
+	/* Toss any page pinned across calls. */
+	if (t->bt_pinned != NULL) {
+		mpool_put(t->bt_mp, t->bt_pinned, 0);
+		t->bt_pinned = NULL;
+	}
+
+	/* Check for change to a read-only tree. */
+	if (F_ISSET(t, B_RDONLY)) {
+		errno = EPERM;
+		return (RET_ERROR);
+	}
+
+	switch (flags) {
+	case 0:
+	case R_NOOVERWRITE:
+		break;
+	case R_CURSOR:
+		/*
+		 * If flags is R_CURSOR, put the cursor.  Must already
+		 * have started a scan and not have already deleted it.
+		 */
+		if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
+		    !F_ISSET(&t->bt_cursor,
+		        CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
+			break;
+		/* FALLTHROUGH */
+	default:
+		errno = EINVAL;
+		return (RET_ERROR);
+	}
+
+	/*
+	 * If the key/data pair won't fit on a page, store it on overflow
+	 * pages.  Only put the key on the overflow page if the pair are
+	 * still too big after moving the data to an overflow page.
+	 *
+	 * XXX
+	 * If the insert fails later on, the overflow pages aren't recovered.
+	 */
+	dflags = 0;
+	if (key->size + data->size > t->bt_ovflsize) {
+		if (key->size > t->bt_ovflsize) {
+storekey:		if (__ovfl_put(t, key, &pg) == RET_ERROR)
+				return (RET_ERROR);
+			tkey.data = kb;
+			tkey.size = NOVFLSIZE;
+			memmove(kb, &pg, sizeof(pgno_t));
+			memmove(kb + sizeof(pgno_t),
+			    &key->size, sizeof(u_int32_t));
+			dflags |= P_BIGKEY;
+			key = &tkey;
+		}
+		if (key->size + data->size > t->bt_ovflsize) {
+			if (__ovfl_put(t, data, &pg) == RET_ERROR)
+				return (RET_ERROR);
+			tdata.data = db;
+			tdata.size = NOVFLSIZE;
+			memmove(db, &pg, sizeof(pgno_t));
+			memmove(db + sizeof(pgno_t),
+			    &data->size, sizeof(u_int32_t));
+			dflags |= P_BIGDATA;
+			data = &tdata;
+		}
+		if (key->size + data->size > t->bt_ovflsize)
+			goto storekey;
+	}
+
+	/* Replace the record at the cursor: pin its page, then delete it. */
+	if (flags == R_CURSOR) {
+		if ((h = mpool_get(t->bt_mp, t->bt_cursor.pg.pgno, 0)) == NULL)
+			return (RET_ERROR);
+		index = t->bt_cursor.pg.index;
+		goto delete;
+	}
+
+	/*
+	 * Find the key to delete, or, the location at which to insert.
+	 * Bt_fast and __bt_search both pin the returned page.
+	 */
+	if (t->bt_order == NOT || (e = bt_fast(t, key, data, &exact)) == NULL)
+		if ((e = __bt_search(t, key, &exact)) == NULL)
+			return (RET_ERROR);
+	h = e->page;
+	index = e->index;
+
+	/*
+	 * Add the key/data pair to the tree.  If an identical key is already
+	 * in the tree, and R_NOOVERWRITE is set, RET_SPECIAL is returned.  If
+	 * R_NOOVERWRITE is not set, the key is either added (if duplicates are
+	 * permitted) or the existing entry is deleted and then replaced.
+	 */
+	switch (flags) {
+	case R_NOOVERWRITE:
+		if (!exact)
+			break;
+		mpool_put(t->bt_mp, h, 0);
+		return (RET_SPECIAL);
+	default:
+		if (!exact || !F_ISSET(t, B_NODUPS))
+			break;
+		/*
+		 * !!!
+		 * Note, the delete may empty the page, so we need to put a
+		 * new entry into the page immediately.
+		 */
+delete:		if (__bt_dleaf(t, key, h, index) == RET_ERROR) {
+			mpool_put(t->bt_mp, h, 0);
+			return (RET_ERROR);
+		}
+		break;
+	}
+
+	/*
+	 * If not enough room, or the user has put a ceiling on the number of
+	 * keys permitted in the page, split the page.  The split code will
+	 * insert the key and data and unpin the current page.  If inserting
+	 * into the offset array, shift the pointers up.
+	 */
+	nbytes = NBLEAFDBT(key->size, data->size);
+	if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
+		if ((status = __bt_split(t, h, key,
+		    data, dflags, nbytes, index)) != RET_SUCCESS)
+			return (status);
+		goto success;
+	}
+
+	if (index < (nxtindex = NEXTINDEX(h)))
+		memmove(h->linp + index + 1, h->linp + index,
+		    (nxtindex - index) * sizeof(indx_t));
+	h->lower += sizeof(indx_t);
+
+	h->linp[index] = h->upper -= nbytes;
+	dest = (char *)h + h->upper;
+	WR_BLEAF(dest, key, data, dflags);
+
+	/* If the cursor is on this page, adjust it as necessary. */
+	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
+	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
+	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index >= index)
+		++t->bt_cursor.pg.index;
+
+	if (t->bt_order == NOT)		/* Seed the hint used by bt_fast. */
+		if (h->nextpg == P_INVALID) {
+			if (index == NEXTINDEX(h) - 1) {
+				t->bt_order = FORWARD;
+				t->bt_last.index = index;
+				t->bt_last.pgno = h->pgno;
+			}
+		} else if (h->prevpg == P_INVALID) {
+			if (index == 0) {
+				t->bt_order = BACK;
+				t->bt_last.index = 0;
+				t->bt_last.pgno = h->pgno;
+			}
+		}
+
+	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+
+success:	/* NOTE(review): can R_SETCURSOR get past the flags switch? */
+	if (flags == R_SETCURSOR)
+		__bt_setcur(t, e->page->pgno, e->index);
+
+	F_SET(t, B_MODIFIED);
+	return (RET_SUCCESS);
+}
+
+#ifdef STATISTICS
+u_long bt_cache_hit, bt_cache_miss;
+#endif
+
+/*
+ * BT_FAST -- Do a quick check for sorted data.
+ *
+ * Parameters:
+ *	t:	tree
+ *	key:	key to insert
+ *	data:	data to insert (only its size is examined here)
+ *	exactp:	on a hit, set to whether key equals the remembered record
+ *
+ * Returns:
+ *	&t->bt_cur with its page pinned, or NULL (hint dropped) on a miss.
+ */
+static EPG *
+bt_fast(t, key, data, exactp)
+	BTREE *t;
+	const DBT *key, *data;
+	int *exactp;
+{
+	PAGE *pp;
+	u_int32_t need;
+	int rel;
+
+	/* Start from the page and slot remembered in t->bt_last. */
+	pp = mpool_get(t->bt_mp, t->bt_last.pgno, 0);
+	if (pp == NULL) {
+		t->bt_order = NOT;
+		return (NULL);
+	}
+	t->bt_cur.page = pp;
+	t->bt_cur.index = t->bt_last.index;
+
+	/* No room left: a real search is needed to get the split stack. */
+	need = NBLEAFDBT(key->size, data->size);
+	if (pp->upper - pp->lower < need + sizeof(indx_t))
+		goto miss;
+
+	if (t->bt_order == FORWARD) {
+		/* Need the last slot, no following page, key not smaller. */
+		if (pp->nextpg != P_INVALID ||
+		    t->bt_cur.index != NEXTINDEX(pp) - 1 ||
+		    (rel = __bt_cmp(t, key, &t->bt_cur)) < 0)
+			goto miss;
+		if (rel != 0)
+			++t->bt_cur.index;
+		t->bt_last.index = t->bt_cur.index;
+	} else {
+		/* Need the first slot, no preceding page, key not larger. */
+		if (pp->prevpg != P_INVALID || t->bt_cur.index != 0 ||
+		    (rel = __bt_cmp(t, key, &t->bt_cur)) > 0)
+			goto miss;
+		t->bt_last.index = 0;
+	}
+	*exactp = rel == 0;
+#ifdef STATISTICS
+	++bt_cache_hit;
+#endif
+	return (&t->bt_cur);
+
+miss:	/* Forget the hint and release the page. */
+#ifdef STATISTICS
+	++bt_cache_miss;
+#endif
+	t->bt_order = NOT;
+	mpool_put(t->bt_mp, pp, 0);
+	return (NULL);
+}
diff --git a/db/btree/bt_search.c b/db/btree/bt_search.c
new file mode 100644
index 0000000000..485afcbbf0
--- /dev/null
+++ b/db/btree/bt_search.c
@@ -0,0 +1,213 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_search.c	8.8 (Berkeley) 7/31/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <stdio.h>
+
+#include <db.h>
+#include "btree.h"
+
+static int __bt_snext __P((BTREE *, PAGE *, const DBT *, int *));
+static int __bt_sprev __P((BTREE *, PAGE *, const DBT *, int *));
+
+/*
+ * __bt_search --
+ *	Search a btree for a key.
+ *
+ * Parameters:
+ *	t:	tree to search
+ *	key:	key to find
+ *	exactp:	pointer to exact match flag
+ *
+ * Returns:
+ *	&t->bt_cur, holding the matching record if any, else the location
+ *	the key would occupy if inserted; its page is left pinned.  NULL
+ *	is returned if a page cannot be fetched from the pool.
+ */
+EPG *
+__bt_search(t, key, exactp)
+	BTREE *t;
+	const DBT *key;
+	int *exactp;
+{
+	PAGE *h;
+	indx_t base, index, lim;
+	pgno_t pg;
+	int cmp;
+
+	BT_CLR(t);
+	for (pg = P_ROOT;;) {
+		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+			return (NULL);
+
+		/* Do a binary search on the current page. */
+		t->bt_cur.page = h;
+		for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) {
+			t->bt_cur.index = index = base + (lim >> 1);
+			if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) {
+				if (h->flags & P_BLEAF) {
+					*exactp = 1;
+					return (&t->bt_cur);
+				}
+				goto next;	/* Internal page: descend. */
+			}
+			if (cmp > 0) {	/* Key lies in the upper half. */
+				base = index + 1;
+				--lim;
+			}
+		}
+
+		/*
+		 * If it's a leaf page, we're almost done.  If no duplicates
+		 * are allowed, or we have an exact match, we're done.  Else,
+		 * it's possible that there were matching keys on this page,
+		 * which were later deleted, and we're on a page with no matches
+		 * while there are matches on other pages.  If at the start or
+		 * end of a page, check the adjacent page.
+		 */
+		if (h->flags & P_BLEAF) {
+			if (!F_ISSET(t, B_NODUPS)) {
+				if (base == 0 &&
+				    h->prevpg != P_INVALID &&
+				    __bt_sprev(t, h, key, exactp))
+					return (&t->bt_cur);
+				if (base == NEXTINDEX(h) &&
+				    h->nextpg != P_INVALID &&
+				    __bt_snext(t, h, key, exactp))
+					return (&t->bt_cur);
+			}
+			*exactp = 0;
+			t->bt_cur.index = base;
+			return (&t->bt_cur);
+		}
+
+		/*
+		 * No match found.  Base is the smallest index greater than
+		 * key and may be zero or a last + 1 index.  If it's non-zero,
+		 * decrement by one, and record the internal page which should
+		 * be a parent page for the key.  If a split later occurs, the
+		 * inserted page will be to the right of the saved page.
+		 */
+		index = base ? base - 1 : base;
+
+next:		BT_PUSH(t, h->pgno, index);
+		pg = GETBINTERNAL(h, index)->pgno;
+		mpool_put(t->bt_mp, h, 0);
+	}
+}
+
+/*
+ * __bt_snext --
+ *	Check for an exact match after the key.
+ *
+ * Parameters:
+ *	t:	tree
+ *	h:	current page
+ *	key:	key
+ *	exactp:	pointer to exact match flag; set to 1 on a match only
+ *
+ * Returns:
+ *	1 if the next page's first record matches: h is released and
+ *	t->bt_cur refers to that record, its page pinned.  Otherwise 0,
+ *	with h not released and t->bt_cur untouched.
+ */
+static int
+__bt_snext(t, h, key, exactp)
+	BTREE *t;
+	PAGE *h;
+	const DBT *key;
+	int *exactp;
+{
+	EPG nxt;
+
+	/* Only the first record of the following page can be a match. */
+	nxt.index = 0;
+	nxt.page = mpool_get(t->bt_mp, h->nextpg, 0);
+	if (nxt.page == NULL)
+		return (0);
+	if (__bt_cmp(t, key, &nxt) != 0) {
+		mpool_put(t->bt_mp, nxt.page, 0);
+		return (0);
+	}
+	mpool_put(t->bt_mp, h, 0);	/* Drop h, keep the matching page. */
+	t->bt_cur = nxt;
+	*exactp = 1;
+	return (1);
+}
+
+/*
+ * __bt_sprev --
+ *	Check for an exact match before the key.
+ *
+ * Parameters:
+ *	t:	tree
+ *	h:	current page
+ *	key:	key
+ *	exactp:	pointer to exact match flag; set to 1 on a match only
+ *
+ * Returns:
+ *	1 if the previous page's last record matches: h is released and
+ *	t->bt_cur refers to that record, its page pinned.  Otherwise 0,
+ *	with h not released and t->bt_cur untouched.
+ */
+static int
+__bt_sprev(t, h, key, exactp)
+	BTREE *t;
+	PAGE *h;
+	const DBT *key;
+	int *exactp;
+{
+	EPG prv;
+
+	/* Only the last record of the preceding page can be a match. */
+	prv.page = mpool_get(t->bt_mp, h->prevpg, 0);
+	if (prv.page == NULL)
+		return (0);
+	prv.index = NEXTINDEX(prv.page) - 1;
+	if (__bt_cmp(t, key, &prv) != 0) {
+		mpool_put(t->bt_mp, prv.page, 0);
+		return (0);
+	}
+	mpool_put(t->bt_mp, h, 0);	/* Drop h, keep the matching page. */
+	t->bt_cur = prv;
+	*exactp = 1;
+	return (1);
+}
diff --git a/db/btree/bt_seq.c b/db/btree/bt_seq.c
new file mode 100644
index 0000000000..303b481903
--- /dev/null
+++ b/db/btree/bt_seq.c
@@ -0,0 +1,460 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_seq.c	8.7 (Berkeley) 7/20/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <db.h>
+#include "btree.h"
+
+static int __bt_first __P((BTREE *, const DBT *, EPG *, int *));
+static int __bt_seqadv __P((BTREE *, EPG *, int));
+static int __bt_seqset __P((BTREE *, EPG *, DBT *, int));
+
+/*
+ * Sequential scan support.
+ *
+ * The tree can be scanned sequentially, starting from either end of the
+ * tree or from any specific key.  A scan request before any scanning is
+ * done is initialized as starting from the least node.
+ */
+
+/*
+ * __bt_seq --
+ *	Btree sequential scan interface.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *	key:	key for positioning (R_CURSOR) and return value
+ *	data:	data return value
+ *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV; else EINVAL.
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
+ */
+int
+__bt_seq(dbp, key, data, flags)
+	const DB *dbp;
+	DBT *key, *data;
+	u_int flags;
+{
+	BTREE *t;
+	EPG rec;
+	int rval;
+
+	t = dbp->internal;
+
+	/* Release whatever page the previous call left pinned. */
+	if (t->bt_pinned != NULL) {
+		mpool_put(t->bt_mp, t->bt_pinned, 0);
+		t->bt_pinned = NULL;
+	}
+
+	/* Only the five scan requests are accepted. */
+	switch (flags) {
+	case R_CURSOR:
+	case R_FIRST:
+	case R_LAST:
+	case R_NEXT:
+	case R_PREV:
+		break;
+	default:
+		errno = EINVAL;
+		return (RET_ERROR);
+	}
+
+	/*
+	 * A relative move on an initialized cursor advances the scan; any
+	 * other request, or R_NEXT/R_PREV before a scan exists, sets the
+	 * position outright.  Both helpers pin the page if successful.
+	 */
+	if ((flags == R_NEXT || flags == R_PREV) &&
+	    F_ISSET(&t->bt_cursor, CURS_INIT))
+		rval = __bt_seqadv(t, &rec, flags);
+	else
+		rval = __bt_seqset(t, &rec, key, flags);
+	if (rval != RET_SUCCESS)
+		return (rval);
+
+	/* Remember the new position, then copy the record out. */
+	__bt_setcur(t, rec.page->pgno, rec.index);
+	rval = __bt_ret(t, &rec, key, &t->bt_rkey, data, &t->bt_rdata, 0);
+
+	/*
+	 * Under concurrent access (B_DB_LOCK) the key/data were copied, so
+	 * toss the page; otherwise keep it pinned until the next call.
+	 */
+	if (F_ISSET(t, B_DB_LOCK))
+		mpool_put(t->bt_mp, rec.page, 0);
+	else
+		t->bt_pinned = rec.page;
+	return (rval);
+}
+
+/*
+ * __bt_seqset --
+ *	Position the sequential scan on a given key or on one end of the
+ *	tree.
+ *
+ * Parameters:
+ *	t:	tree
+ *	ep:	storage for the page/index of the record positioned on
+ *	key:	key for initial scan position; consulted for R_CURSOR only
+ *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
+ *		(R_NEXT is handled like R_FIRST, R_PREV like R_LAST)
+ *
+ * Side effects:
+ *	Pins the page the cursor references.
+ *
+ * Returns:
+ *	RET_SUCCESS; RET_SPECIAL if the tree is empty or, for R_CURSOR, no
+ *	key qualifies; RET_ERROR if a page can't be fetched or, for R_CURSOR,
+ *	the key is NULL or zero length (errno set to EINVAL).
+ */
+static int
+__bt_seqset(t, ep, key, flags)
+	BTREE *t;
+	EPG *ep;
+	DBT *key;
+	int flags;
+{
+	PAGE *pp;
+	indx_t slot;
+	pgno_t pgno;
+	int exact, leftmost;
+
+	/* Decide what to look for; a keyed scan is delegated outright. */
+	switch (flags) {
+	case R_CURSOR:				/* Keyed scan. */
+		/*
+		 * Find the first instance of the key or the smallest key
+		 * which is greater than or equal to the specified key.
+		 */
+		if (key->data == NULL || key->size == 0) {
+			errno = EINVAL;
+			return (RET_ERROR);
+		}
+		/* The exact-match flag __bt_first reports is not needed. */
+		return (__bt_first(t, key, ep, &exact));
+	case R_FIRST:				/* First record. */
+	case R_NEXT:
+		leftmost = 1;
+		break;
+	case R_LAST:				/* Last record. */
+	case R_PREV:
+		leftmost = 0;
+		break;
+	default:
+		/* Not passed by the caller in this file; changes nothing. */
+		return (RET_SUCCESS);
+	}
+
+	/*
+	 * Walk down one edge of the tree, following the first entry of each
+	 * internal page for the left-hand side or the last entry for the
+	 * right-hand side, until a leaf is reached.  Each internal page is
+	 * released before the next one is fetched; the leaf is kept.
+	 */
+	for (pgno = P_ROOT;;) {
+		pp = mpool_get(t->bt_mp, pgno, 0);
+		if (pp == NULL)
+			return (RET_ERROR);
+
+		/* Check for an empty tree. */
+		if (NEXTINDEX(pp) == 0) {
+			mpool_put(t->bt_mp, pp, 0);
+			return (RET_SPECIAL);
+		}
+
+		/* The entry of interest on this page. */
+		slot = leftmost ? 0 : NEXTINDEX(pp) - 1;
+		if (pp->flags & (P_BLEAF | P_RLEAF))
+			break;
+		/* Internal page: descend through the chosen entry. */
+		pgno = GETBINTERNAL(pp, slot)->pgno;
+		mpool_put(t->bt_mp, pp, 0);
+	}
+
+	/* Hand back the leaf, still pinned, and the chosen entry. */
+	ep->page = pp;
+	ep->index = slot;
+	return (RET_SUCCESS);
+}
+
+/*
+ * __bt_seqadv --
+ *	Advance the sequential scan.
+ *
+ * Parameters:
+ *	t:	tree
+ *	ep:	storage for the position of the record moved to
+ *	flags:	R_NEXT, R_PREV
+ *
+ * Side effects: pins the page the new key/data record is on.
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
+ */
+static int
+__bt_seqadv(t, ep, flags)
+	BTREE *t;
+	EPG *ep;
+	int flags;
+{
+	CURSOR *c;
+	PAGE *h;
+	indx_t index;
+	pgno_t pg;
+	int exact;
+
+	/*
+	 * There are a couple of states that we can be in.  The cursor has
+	 * been initialized by the time we get here, but that's all we know.
+	 */
+	c = &t->bt_cursor;
+
+	/*
+	 * The cursor was deleted where there weren't any duplicate records,
+	 * so the key was saved.  Find out where that key would go in the
+	 * current tree.  It doesn't matter if the returned key is an exact
+	 * match or not -- if it's an exact match, the record was added after
+	 * the delete so we can just return it.  If not, as long as there's
+	 * a record there, return it.
+	 */
+	if (F_ISSET(c, CURS_ACQUIRE))
+		return (__bt_first(t, &c->key, ep, &exact));
+
+	/* Get the page referenced by the cursor. */
+	if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
+		return (RET_ERROR);
+
+	/*
+	 * Find the next/previous record in the tree and point the cursor at
+	 * it.  The cursor may not be moved until a new key has been found.
+	 */
+	switch (flags) {
+	case R_NEXT:			/* Next record. */
+		/*
+		 * The cursor was deleted in duplicate records, and moved
+		 * forward to a record that has yet to be returned.  Clear
+		 * that flag, and return the record.
+		 */
+		if (F_ISSET(c, CURS_AFTER))
+			goto usecurrent;
+		index = c->pg.index;
+		if (++index == NEXTINDEX(h)) {
+			pg = h->nextpg;
+			mpool_put(t->bt_mp, h, 0);
+			if (pg == P_INVALID)
+				return (RET_SPECIAL);
+			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+				return (RET_ERROR);
+			index = 0;
+		}
+		break;
+	case R_PREV:			/* Previous record. */
+		/*
+		 * The cursor was deleted in duplicate records, and moved
+		 * backward to a record that has yet to be returned.  Clear
+		 * that flag, and return the record.
+		 */
+		if (F_ISSET(c, CURS_BEFORE)) {
+usecurrent:		F_CLR(c, CURS_AFTER | CURS_BEFORE);
+			ep->page = h;
+			ep->index = c->pg.index;
+			return (RET_SUCCESS);
+		}
+		index = c->pg.index;
+		if (index == 0) {
+			pg = h->prevpg;
+			mpool_put(t->bt_mp, h, 0);
+			if (pg == P_INVALID)
+				return (RET_SPECIAL);
+			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+				return (RET_ERROR);
+			index = NEXTINDEX(h) - 1;
+		} else
+			--index;
+		break;
+	}
+
+	ep->page = h;
+	ep->index = index;
+	return (RET_SUCCESS);
+}
+
+/*
+ * __bt_first --
+ *	Find the first entry greater than or equal to a key.
+ *
+ * Parameters:
+ *	t:	the tree
+ *    key:	the key
+ *  erval:	return EPG; on RET_SUCCESS its page is pinned
+ * exactp:	pointer to exact match flag
+ *
+ * Returns:
+ *	RET_SUCCESS with the entry in erval, RET_SPECIAL if no such key
+ *	exists, or RET_ERROR if the search or a page fetch fails.
+ */
+static int
+__bt_first(t, key, erval, exactp)
+	BTREE *t;
+	const DBT *key;
+	EPG *erval;
+	int *exactp;
+{
+	PAGE *h, *hprev;
+	EPG *ep, save;
+	pgno_t pg;
+
+	/*
+	 * Find any matching record; __bt_search pins the page.
+	 *
+	 * If it's an exact match and duplicates are possible, walk backwards
+	 * in the tree until we find the first one.  Otherwise, make sure it's
+	 * a valid key (__bt_search may return an index just past the end of a
+	 * page) and return it.
+	 */
+	if ((ep = __bt_search(t, key, exactp)) == NULL)
+		return (RET_ERROR);
+	if (*exactp) {
+		if (F_ISSET(t, B_NODUPS)) {
+			*erval = *ep;
+			return (RET_SUCCESS);
+		}
+
+		/*
+		 * Walk backwards, as long as the entry matches and there are
+		 * keys left in the tree.  Save a copy of each match in case
+		 * we go too far.
+		 */
+		save = *ep;
+		h = ep->page;
+		do {
+			if (save.page->pgno != ep->page->pgno) {
+				mpool_put(t->bt_mp, save.page, 0);
+				save = *ep;
+			} else
+				save.index = ep->index;
+
+			/*
+			 * Don't unpin the page the last (or original) match
+			 * was on, but make sure it's unpinned if an error
+			 * occurs.
+			 */
+			if (ep->index == 0) {
+				if (h->prevpg == P_INVALID)
+					break;
+				pg = h->prevpg;	/* Read before h is released. */
+				if (h->pgno != save.page->pgno)
+					mpool_put(t->bt_mp, h, 0);
+				/* Leave h alone until the fetch has succeeded. */
+				hprev = mpool_get(t->bt_mp, pg, 0);
+				if (hprev == NULL) {
+					mpool_put(t->bt_mp, save.page, 0);
+					return (RET_ERROR);
+				}
+				ep->page = h = hprev;
+				ep->index = NEXTINDEX(h);
+			}
+			--ep->index;
+		} while (__bt_cmp(t, key, ep) == 0);
+
+		/*
+		 * Reach here with the last page that was looked at pinned,
+		 * which may or may not be the same as the last (or original)
+		 * match page.  If it's not useful, release it.
+		 */
+		if (h->pgno != save.page->pgno)
+			mpool_put(t->bt_mp, h, 0);
+
+		*erval = save;
+		return (RET_SUCCESS);
+	}
+
+	/* If at the end of a page, find the next entry. */
+	if (ep->index == NEXTINDEX(ep->page)) {
+		h = ep->page;
+		pg = h->nextpg;
+		mpool_put(t->bt_mp, h, 0);
+		if (pg == P_INVALID)
+			return (RET_SPECIAL);
+		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+			return (RET_ERROR);
+		ep->index = 0;
+		ep->page = h;
+	}
+	*erval = *ep;
+	return (RET_SUCCESS);
+}
+
+/*
+ * __bt_setcur --
+ *	Aim the cursor at an entry; drops any saved key and state flags.
+ *
+ * Parameters:
+ *	t:	the tree
+ *   pgno:	page number of the entry
+ *  index:	index of the entry within that page
+ */
+void
+__bt_setcur(t, pgno, index)
+	BTREE *t;
+	pgno_t pgno;
+	u_int index;
+{
+	CURSOR *c;
+
+	c = &t->bt_cursor;
+	if (c->key.data != NULL) {	/* Lose any already deleted key. */
+		free(c->key.data);
+		c->key.data = NULL;
+		c->key.size = 0;
+	}
+	c->pg.pgno = pgno;		/* Update the cursor. */
+	c->pg.index = index;
+	F_CLR(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
+	F_SET(c, CURS_INIT);
+}
diff --git a/db/btree/bt_split.c b/db/btree/bt_split.c
new file mode 100644
index 0000000000..1646d82159
--- /dev/null
+++ b/db/btree/bt_split.c
@@ -0,0 +1,827 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_split.c	8.9 (Berkeley) 7/26/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <db.h>
+#include "btree.h"
+
+static int	 bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
+static PAGE	*bt_page
+		    __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
+static int	 bt_preserve __P((BTREE *, pgno_t));
+static PAGE	*bt_psplit
+		    __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t));
+static PAGE	*bt_root
+		    __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
+static int	 bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
+static recno_t	 rec_total __P((PAGE *));
+
+#ifdef STATISTICS
+u_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
+#endif /* STATISTICS */
+
+/*
+ * __BT_SPLIT -- Split a page, insert the new item and fix up the parents.
+ *
+ * Parameters:
+ *	t:	tree
+ *	sp:	page to split
+ *	key:	key to insert
+ *	data:	data to insert
+ *	flags:	BIGKEY/BIGDATA flags
+ *	ilen:	insert length
+ *	argskip: index to leave open (copied into the local skip)
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS
+ */
+int
+__bt_split(t, sp, key, data, flags, ilen, argskip)
+	BTREE *t;
+	PAGE *sp;
+	const DBT *key, *data;
+	int flags;
+	size_t ilen;
+	u_int32_t argskip;
+{
+	BINTERNAL *bi;
+	BLEAF *bl, *tbl;
+	DBT a, b;
+	EPGNO *parent;
+	PAGE *h, *l, *r, *lchild, *rchild;
+	indx_t nxtindex;
+	u_int16_t skip;
+	u_int32_t n, nbytes, nksize;
+	int parentsplit;
+	char *dest;
+
+	/*
+	 * Split the page into two pages, l and r.  The split routines return
+	 * a pointer to the page into which the key should be inserted, with
+	 * skip set to the index to use on that page, and leave l and r pinned.
+	 * A failure here returns RET_ERROR without panicking the tree.
+	 */
+	skip = argskip;
+	h = sp->pgno == P_ROOT ?
+	    bt_root(t, sp, &l, &r, &skip, ilen) :
+	    bt_page(t, sp, &l, &r, &skip, ilen);
+	if (h == NULL)
+		return (RET_ERROR);
+
+	/*
+	 * Insert the new key/data pair into the leaf page.  (Key inserts
+	 * always cause a leaf page to split first.)
+	 */
+	h->linp[skip] = h->upper -= ilen;
+	dest = (char *)h + h->upper;
+	if (F_ISSET(t, R_RECNO))
+		WR_RLEAF(dest, data, flags)
+	else
+		WR_BLEAF(dest, key, data, flags)
+
+	/* If the root page was split, make it look right. */
+	if (sp->pgno == P_ROOT &&
+	    (F_ISSET(t, R_RECNO) ?
+	    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
+		goto err2;
+
+	/*
+	 * Now we walk the parent page stack -- a LIFO stack of the pages that
+	 * were traversed when we searched for the page that split.  Each stack
+	 * entry is a page number and a page index offset.  The offset is for
+	 * the page traversed on the search.  We've just split a page, so we
+	 * have to insert a new key into the parent page.
+	 *
+	 * If the insert into the parent page causes it to split, may have to
+	 * continue splitting all the way up the tree.  We stop if the root
+	 * splits or the page inserted into didn't have to split to hold the
+	 * new key.  Some algorithms replace the key for the old page as well
+	 * as the new page.  We don't, as there's no reason to believe that the
+	 * first key on the old page is any better than the key we have, and,
+	 * in the case of a key being placed at index 0 causing the split, the
+	 * key is unavailable.
+	 *
+	 * There are a maximum of 5 pages pinned at any time.  We keep the left
+	 * and right pages pinned while working on the parent.   The 5 are the
+	 * two children, left parent and right parent (when the parent splits)
+	 * and the root page or the overflow key page when calling bt_preserve.
+	 * This code must make sure that all pins are released other than the
+	 * root page or overflow page which is unlocked elsewhere.
+	 */
+	while ((parent = BT_POP(t)) != NULL) {
+		lchild = l;
+		rchild = r;
+
+		/* Get the parent page. */
+		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+			goto err2;
+
+	 	/*
+		 * The new key goes ONE AFTER the index, because the split
+		 * was to the right.
+		 */
+		skip = parent->index + 1;
+
+		/*
+		 * Calculate the space needed on the parent page.
+		 *
+		 * Prefix trees: space hack when inserting into BINTERNAL
+		 * pages.  Retain only what's needed to distinguish between
+		 * the new entry and the LAST entry on the page to its left.
+		 * If the keys compare equal, retain the entire key.  Note,
+		 * we don't touch overflow keys, and the entire key must be
+		 * retained for the next-to-left most key on the leftmost
+		 * page of each level, or the search will fail.  Applicable
+		 * ONLY to internal pages that have leaf pages as children.
+		 * Further reduction of the key between pairs of internal
+		 * pages loses too much information.
+		 */
+		switch (rchild->flags & P_TYPE) {
+		case P_BINTERNAL:
+			bi = GETBINTERNAL(rchild, 0);
+			nbytes = NBINTERNAL(bi->ksize);
+			break;
+		case P_BLEAF:
+			bl = GETBLEAF(rchild, 0);
+			nbytes = NBINTERNAL(bl->ksize);
+			if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
+			    (h->prevpg != P_INVALID || skip > 1)) {
+				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
+				a.size = tbl->ksize;
+				a.data = tbl->bytes;
+				b.size = bl->ksize;
+				b.data = bl->bytes;
+				nksize = t->bt_pfx(&a, &b);
+				n = NBINTERNAL(nksize);
+				if (n < nbytes) {
+#ifdef STATISTICS
+					bt_pfxsaved += nbytes - n;
+#endif
+					nbytes = n;
+				} else
+					nksize = 0;
+			} else
+				nksize = 0;
+			break;
+		case P_RINTERNAL:
+		case P_RLEAF:
+			nbytes = NRINTERNAL;
+			break;
+		default:
+			abort();
+		}
+
+		/* Split the parent page if necessary or shift the indices. */
+		if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
+			sp = h;
+			h = h->pgno == P_ROOT ?
+			    bt_root(t, h, &l, &r, &skip, nbytes) :
+			    bt_page(t, h, &l, &r, &skip, nbytes);
+			if (h == NULL)
+				goto err1;
+			parentsplit = 1;
+		} else {
+			if (skip < (nxtindex = NEXTINDEX(h)))
+				memmove(h->linp + skip + 1, h->linp + skip,
+				    (nxtindex - skip) * sizeof(indx_t));
+			h->lower += sizeof(indx_t);
+			parentsplit = 0;
+		}
+
+		/* Insert the key into the parent page. */
+		switch (rchild->flags & P_TYPE) {
+		case P_BINTERNAL:
+			h->linp[skip] = h->upper -= nbytes;
+			dest = (char *)h + h->linp[skip];
+			memmove(dest, bi, nbytes);
+			((BINTERNAL *)dest)->pgno = rchild->pgno;
+			break;
+		case P_BLEAF:
+			h->linp[skip] = h->upper -= nbytes;
+			dest = (char *)h + h->linp[skip];
+			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
+			    rchild->pgno, bl->flags & P_BIGKEY);
+			memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
+			if (bl->flags & P_BIGKEY &&
+			    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
+				goto err1;
+			break;
+		case P_RINTERNAL:
+			/*
+			 * Update the left page count.  If split
+			 * added at index 0, fix the correct page.
+			 */
+			if (skip > 0)
+				dest = (char *)h + h->linp[skip - 1];
+			else
+				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
+			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
+			((RINTERNAL *)dest)->pgno = lchild->pgno;
+
+			/* Update the right page count. */
+			h->linp[skip] = h->upper -= nbytes;
+			dest = (char *)h + h->linp[skip];
+			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
+			((RINTERNAL *)dest)->pgno = rchild->pgno;
+			break;
+		case P_RLEAF:
+			/*
+			 * Update the left page count.  If split
+			 * added at index 0, fix the correct page.
+			 */
+			if (skip > 0)
+				dest = (char *)h + h->linp[skip - 1];
+			else
+				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
+			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
+			((RINTERNAL *)dest)->pgno = lchild->pgno;
+
+			/* Update the right page count. */
+			h->linp[skip] = h->upper -= nbytes;
+			dest = (char *)h + h->linp[skip];
+			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
+			((RINTERNAL *)dest)->pgno = rchild->pgno;
+			break;
+		default:
+			abort();
+		}
+
+		/* No parent split: unpin the parent page and stop climbing. */
+		if (!parentsplit) {
+			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+			break;
+		}
+
+		/* If the root page was split, make it look right. */
+		if (sp->pgno == P_ROOT &&
+		    (F_ISSET(t, R_RECNO) ?
+		    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
+			goto err1;
+
+		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
+		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
+	}
+
+	/* Unpin the held pages. */
+	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
+	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
+
+	/* NOTE(review): nothing here clears entries left on the stack. */
+	return (RET_SUCCESS);
+
+	/*
+	 * A failure in the loop above leaves the tree inconsistent; unpin what
+	 * we hold and panic.  NOTE(review): err1 falls through to err2, so if
+	 * l/r still equal lchild/rchild they are put twice -- confirm intent.
+	 */
+err1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
+	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
+
+err2:	mpool_put(t->bt_mp, l, 0);
+	mpool_put(t->bt_mp, r, 0);
+	__dbpanic(t->bt_dbp);
+	return (RET_ERROR);
+}
+
+/*
+ * BT_PAGE -- Split a non-root page of a btree.
+ *
+ * Parameters:
+ *	t:	tree
+ *	h:	page to split
+ *	lp:	pointer to left page pointer
+ *	rp:	pointer to right page pointer
+ *	skip:	pointer to index to leave open
+ *	ilen:	insert length
+ *
+ * Returns:
+ *	Pointer to page in which to insert, or NULL on error (r is unpinned).
+ */
+static PAGE *
+bt_page(t, h, lp, rp, skip, ilen)
+	BTREE *t;
+	PAGE *h, **lp, **rp;
+	indx_t *skip;
+	size_t ilen;
+{
+	PAGE *l, *r, *tp;
+	pgno_t npg;
+
+#ifdef STATISTICS
+	++bt_split;
+#endif
+	/* Put the new right page for the split into place. */
+	if ((r = __bt_new(t, &npg)) == NULL)
+		return (NULL);
+	r->pgno = npg;
+	r->lower = BTDATAOFF;
+	r->upper = t->bt_psize;
+	r->nextpg = h->nextpg;
+	r->prevpg = h->pgno;
+	r->flags = h->flags & P_TYPE;
+
+	/*
+	 * If we're splitting the last page on a level because we're appending
+	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
+	 * sorted.  Adding an empty page on the side of the level is less work
+	 * and can push the fill factor much higher than normal.  If we're
+	 * wrong it's no big deal, we'll just do the split the right way next
+	 * time.  It may look like it's equally easy to do a similar hack for
+	 * reverse sorted data, that is, split the tree left, but it's not.
+	 * Don't even try.
+	 */
+	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
+#ifdef STATISTICS
+		++bt_sortsplit;
+#endif
+		h->nextpg = r->pgno;
+		r->lower = BTDATAOFF + sizeof(indx_t);
+		*skip = 0;
+		*lp = h;
+		*rp = r;
+		return (r);
+	}
+
+	/* Put the new left page for the split into place. */
+	if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
+		mpool_put(t->bt_mp, r, 0);
+		return (NULL);
+	}
+#ifdef PURIFY
+	memset(l, 0xff, t->bt_psize);
+#endif
+	l->pgno = h->pgno;
+	l->nextpg = r->pgno;
+	l->prevpg = h->prevpg;
+	l->lower = BTDATAOFF;
+	l->upper = t->bt_psize;
+	l->flags = h->flags & P_TYPE;
+
+	/* Fix up the previous pointer of the page after the split page. */
+	if (h->nextpg != P_INVALID) {
+		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
+			free(l);
+			mpool_put(t->bt_mp, r, 0);	/* XXX mpool_free */
+			return (NULL);
+		}
+		tp->prevpg = r->pgno;
+		mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
+	}
+
+	/*
+	 * Split right.  The key/data pairs aren't sorted in the btree page so
+	 * it's simpler to copy the data from the split page onto two new pages
+	 * instead of copying half the data to the right page and compacting
+	 * the left page in place.  Since the left page can't change, we have
+	 * to swap the original and the allocated left page after the split.
+	 */
+	tp = bt_psplit(t, h, l, r, skip, ilen);
+
+	/* Move the new left page onto the old left page. */
+	memmove(h, l, t->bt_psize);
+	if (tp == l)
+		tp = h;
+	free(l);
+
+	*lp = h;
+	*rp = r;
+	return (tp);
+}
+
+/*
+ * BT_ROOT -- Split the root page of a btree.
+ *
+ * Parameters:
+ *	t:	tree
+ *	h:	root page
+ *	lp:	pointer to left page pointer
+ *	rp:	pointer to right page pointer
+ *	skip:	pointer to index to leave open
+ *	ilen:	insert length
+ *
+ * Returns:
+ *	Pointer to page in which to insert, or NULL on error (l, r unpinned).
+ */
+static PAGE *
+bt_root(t, h, lp, rp, skip, ilen)
+	BTREE *t;
+	PAGE *h, **lp, **rp;
+	indx_t *skip;
+	size_t ilen;
+{
+	PAGE *l, *r, *tp;
+	pgno_t lnpg, rnpg;
+
+#ifdef STATISTICS
+	++bt_split;
+	++bt_rootsplit;
+#endif
+	/* Get the new left and right pages; unpin l if r can't be had. */
+	if ((l = __bt_new(t, &lnpg)) == NULL)
+		return (NULL);
+	if ((r = __bt_new(t, &rnpg)) == NULL) {
+		mpool_put(t->bt_mp, l, 0);
+		return (NULL);
+	}
+	l->pgno = r->prevpg = lnpg;
+	r->pgno = l->nextpg = rnpg;
+	l->prevpg = r->nextpg = P_INVALID;
+	l->lower = r->lower = BTDATAOFF;
+	l->upper = r->upper = t->bt_psize;
+	l->flags = r->flags = h->flags & P_TYPE;
+
+	/* Split the root page. */
+	tp = bt_psplit(t, h, l, r, skip, ilen);
+	*lp = l;
+	*rp = r;
+	return (tp);
+}
+
+/*
+ * BT_RROOT -- Rebuild the recno root page after it has been split.
+ *
+ * Parameters:
+ *	t:	tree
+ *	h:	root page
+ *	l:	left page
+ *	r:	right page
+ *
+ * Returns:
+ *	RET_SUCCESS (this routine has no failure path)
+ */
+static int
+bt_rroot(t, h, l, r)
+	BTREE *t;
+	PAGE *h, *l, *r;
+{
+	recno_t cnt;
+	char *dest;
+
+	/* First slot: the left child, stored at the very end of the page. */
+	h->upper = t->bt_psize - NRINTERNAL;
+	h->linp[0] = h->upper;
+	dest = (char *)h + h->upper;
+	cnt = l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l);
+	WR_RINTERNAL(dest, cnt, l->pgno);
+
+	/* Second slot: the right child, stored just below the first. */
+	h->upper -= NRINTERNAL;
+	h->linp[1] = h->upper;
+	dest = (char *)h + h->upper;
+	cnt = r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r);
+	WR_RINTERNAL(dest, cnt, r->pgno);
+	/* Two entries, recno internal type; put the root back dirty. */
+	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
+	h->flags = (h->flags & ~P_TYPE) | P_RINTERNAL;
+	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+	return (RET_SUCCESS);
+}
+
+/*
+ * BT_BROOT -- Fix up the btree root page after it has been split.
+ *
+ * Parameters:
+ *	t:	tree
+ *	h:	root page
+ *	l:	left page
+ *	r:	right page
+ *
+ * Returns:
+ *	RET_SUCCESS; RET_ERROR if bt_preserve fails (root is not put back)
+ */
+static int
+bt_broot(t, h, l, r)
+	BTREE *t;
+	PAGE *h, *l, *r;
+{
+	BINTERNAL *bi;
+	BLEAF *bl;
+	u_int32_t nbytes;
+	char *dest;
+
+	/*
+	 * If the root page was a leaf page, change it into an internal page.
+	 * We copy the key we split on (but not the key's data, in the case of
+	 * a leaf page) to the new root page.  The left-most key on any level
+	 * is never used by the comparison code, so it is left empty.
+	 * NOTE(review): the P_BLEAF case below writes its entry with flags 0
+	 * even for a P_BIGKEY key, unlike __bt_split -- confirm.
+	 */
+	nbytes = NBINTERNAL(0);
+	h->linp[0] = h->upper = t->bt_psize - nbytes;
+	dest = (char *)h + h->upper;
+	WR_BINTERNAL(dest, 0, l->pgno, 0);
+
+	switch (h->flags & P_TYPE) {
+	case P_BLEAF:
+		bl = GETBLEAF(r, 0);
+		nbytes = NBINTERNAL(bl->ksize);
+		h->linp[1] = h->upper -= nbytes;
+		dest = (char *)h + h->upper;
+		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
+		memmove(dest, bl->bytes, bl->ksize);
+
+		/*
+		 * If the key is on an overflow page, mark the overflow chain
+		 * so it isn't deleted when the leaf copy of the key is deleted.
+		 */
+		if (bl->flags & P_BIGKEY &&
+		    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
+			return (RET_ERROR);
+		break;
+	case P_BINTERNAL:
+		bi = GETBINTERNAL(r, 0);
+		nbytes = NBINTERNAL(bi->ksize);
+		h->linp[1] = h->upper -= nbytes;
+		dest = (char *)h + h->upper;
+		memmove(dest, bi, nbytes);
+		((BINTERNAL *)dest)->pgno = r->pgno;
+		break;
+	default:
+		abort();
+	}
+
+	/* There are two keys on the page. */
+	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
+
+	/* Unpin the root page, set to btree internal page. */
+	h->flags &= ~P_TYPE;
+	h->flags |= P_BINTERNAL;
+	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+
+	return (RET_SUCCESS);
+}
+
+/*
+ * BT_PSPLIT -- Do the real work of splitting the page.
+ *
+ * Parameters:
+ *	t:	tree
+ *	h:	page to be split
+ *	l:	page to put lower half of data
+ *	r:	page to put upper half of data
+ *	pskip:	pointer to index to leave open (rebased if it lands on r)
+ *	ilen:	insert length
+ *
+ * Returns:
+ *	Pointer to page in which to insert (either l or r).
+ */
+static PAGE *
+bt_psplit(t, h, l, r, pskip, ilen)
+	BTREE *t;
+	PAGE *h, *l, *r;
+	indx_t *pskip;
+	size_t ilen;
+{
+	BINTERNAL *bi;
+	BLEAF *bl;
+	CURSOR *c;
+	RLEAF *rl;
+	PAGE *rval;
+	void *src;
+	indx_t full, half, nxt, off, skip, top, used;
+	u_int32_t nbytes;
+	int bigkeycnt, isbigkey;
+
+	/*
+	 * Split the data to the left and right pages.  Leave the skip index
+	 * open.  Additionally, make some effort not to split on an overflow
+	 * key.  This makes internal page processing faster and can save
+	 * space as overflow keys used by internal pages are never deleted.
+	 */
+	bigkeycnt = 0;
+	skip = *pskip;
+	full = t->bt_psize - BTDATAOFF;
+	half = full / 2;
+	used = 0;
+	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
+		if (skip == off) {
+			nbytes = ilen;
+			isbigkey = 0;		/* XXX: not really known. */
+		} else
+			switch (h->flags & P_TYPE) {
+			case P_BINTERNAL:
+				src = bi = GETBINTERNAL(h, nxt);
+				nbytes = NBINTERNAL(bi->ksize);
+				isbigkey = bi->flags & P_BIGKEY;
+				break;
+			case P_BLEAF:
+				src = bl = GETBLEAF(h, nxt);
+				nbytes = NBLEAF(bl);
+				isbigkey = bl->flags & P_BIGKEY;
+				break;
+			case P_RINTERNAL:
+				src = GETRINTERNAL(h, nxt);
+				nbytes = NRINTERNAL;
+				isbigkey = 0;
+				break;
+			case P_RLEAF:
+				src = rl = GETRLEAF(h, nxt);
+				nbytes = NRLEAF(rl);
+				isbigkey = 0;
+				break;
+			default:
+				abort();
+			}
+
+		/*
+		 * If the key/data pairs are substantial fractions of the max
+		 * possible size for the page, it's possible to get situations
+		 * where we decide to try and copy too much onto the left page.
+		 * Make sure that doesn't happen.
+		 */
+		if (skip <= off && used + nbytes >= full) {
+			--off;
+			break;
+		}
+
+		/* Copy the key/data pair, if not the skipped index. */
+		if (skip != off) {
+			++nxt;
+
+			l->linp[off] = l->upper -= nbytes;
+			memmove((char *)l + l->upper, src, nbytes);
+		}
+
+		used += nbytes;
+		if (used >= half) {
+			if (!isbigkey || bigkeycnt == 3)
+				break;
+			else
+				++bigkeycnt;
+		}
+	}
+
+	/*
+	 * Off is the last offset that's valid for the left page.
+	 * Nxt is the first offset to be placed on the right page.
+	 */
+	l->lower += (off + 1) * sizeof(indx_t);
+
+	/*
+	 * If splitting the page that the cursor was on, the cursor has to be
+	 * adjusted to point to the same record as before the split.  If the
+	 * cursor is at or past the skipped slot, the cursor is incremented by
+	 * one.  If the cursor is on the right page, it is decremented by the
+	 * number of records split to the left page.
+	 */
+	c = &t->bt_cursor;
+	if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
+		if (c->pg.index >= skip)
+			++c->pg.index;
+		if (c->pg.index < nxt)			/* Left page. */
+			c->pg.pgno = l->pgno;
+		else {					/* Right page. */
+			c->pg.pgno = r->pgno;
+			c->pg.index -= nxt;
+		}
+	}
+
+	/*
+	 * If the skipped index was on the left page, just return that page.
+	 * Otherwise, adjust the skip index to reflect the new position on
+	 * the right page.
+	 */
+	if (skip <= off) {
+		skip = 0;
+		rval = l;
+	} else {
+		rval = r;
+		*pskip -= nxt;
+	}
+
+	for (off = 0; nxt < top; ++off) {
+		if (skip == nxt) {
+			++off;
+			skip = 0;
+		}
+		switch (h->flags & P_TYPE) {
+		case P_BINTERNAL:
+			src = bi = GETBINTERNAL(h, nxt);
+			nbytes = NBINTERNAL(bi->ksize);
+			break;
+		case P_BLEAF:
+			src = bl = GETBLEAF(h, nxt);
+			nbytes = NBLEAF(bl);
+			break;
+		case P_RINTERNAL:
+			src = GETRINTERNAL(h, nxt);
+			nbytes = NRINTERNAL;
+			break;
+		case P_RLEAF:
+			src = rl = GETRLEAF(h, nxt);
+			nbytes = NRLEAF(rl);
+			break;
+		default:
+			abort();
+		}
+		++nxt;
+		r->linp[off] = r->upper -= nbytes;
+		memmove((char *)r + r->upper, src, nbytes);
+	}
+	r->lower += off * sizeof(indx_t);
+
+	/* If the key is being appended to the page, leave an index slot. */
+	if (skip == top)
+		r->lower += sizeof(indx_t);
+
+	return (rval);
+}
+
+/*
+ * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
+ *
+ * Sets P_PRESERVE on the first page of the chain so that it is not
+ * reclaimed when the leaf record referencing it is deleted.
+ *
+ * Parameters:
+ *	t:	tree
+ *	pg:	page number of first page in the chain.
+ *
+ * Returns:
+ *	RET_SUCCESS, RET_ERROR.
+ */
+static int
+bt_preserve(t, pg)
+	BTREE *t;
+	pgno_t pg;
+{
+	PAGE *first;
+
+	first = mpool_get(t->bt_mp, pg, 0);
+	if (first != NULL) {
+		first->flags |= P_PRESERVE;
+		mpool_put(t->bt_mp, first, MPOOL_DIRTY);
+		return (RET_SUCCESS);
+	}
+	return (RET_ERROR);
+}
+
+/*
+ * REC_TOTAL -- Return the number of recno entries below a page.
+ *
+ * Parameters:
+ *	h:	page
+ *
+ * Returns:
+ *	The sum of the nrecs fields of the page's RINTERNAL entries.
+ *
+ * XXX: bt_psplit could compute these, but passing them back up to
+ * bt_split/bt_rroot is not very clean.
+ */
+static recno_t
+rec_total(h)
+	PAGE *h;
+{
+	recno_t total;
+	indx_t i, cnt;
+
+	total = 0;
+	cnt = NEXTINDEX(h);
+	for (i = 0; i < cnt; ++i)
+		total += GETRINTERNAL(h, i)->nrecs;
+	return (total);
+}
diff --git a/db/btree/bt_utils.c b/db/btree/bt_utils.c
new file mode 100644
index 0000000000..9c1438eb84
--- /dev/null
+++ b/db/btree/bt_utils.c
@@ -0,0 +1,260 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_utils.c	8.8 (Berkeley) 7/20/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <db.h>
+#include "btree.h"
+
+/*
+ * __bt_ret --
+ *	Fill in the user's key and/or data DBTs from a leaf entry.
+ *
+ * Parameters:
+ *	t:	tree
+ *	e:	page/index of the leaf entry to return
+ *	key:	user's key structure (NULL to skip the key)
+ *	rkey:	buffer descriptor used when the key must be copied
+ *	data:	user's data structure (NULL to skip the data)
+ *	rdata:	buffer descriptor used when the data must be copied
+ *	copy:	copy the item even if the tree is not B_DB_LOCK
+ *
+ * Returns:
+ *	RET_SUCCESS, RET_ERROR.
+ */
+int
+__bt_ret(t, e, key, rkey, data, rdata, copy)
+	BTREE *t;
+	EPG *e;
+	DBT *key, *rkey, *data, *rdata;
+	int copy;
+{
+	BLEAF *bl;
+	void *mem;
+	size_t need;
+	int docopy;
+
+	bl = GETBLEAF(e->page, e->index);
+	docopy = copy || F_ISSET(t, B_DB_LOCK);
+
+	/*
+	 * Overflow items are always assembled in the caller's buffer; other
+	 * items are returned in place on the page unless a copy is wanted.
+	 */
+	if (key != NULL) {
+		if (bl->flags & P_BIGKEY) {
+			if (__ovfl_get(t, bl->bytes,
+			    &key->size, &rkey->data, &rkey->size))
+				return (RET_ERROR);
+			key->data = rkey->data;
+		} else if (docopy) {
+			if (bl->ksize > rkey->size) {
+				mem = rkey->data == NULL ? malloc(bl->ksize) :
+				    realloc(rkey->data, bl->ksize);
+				if (mem == NULL)
+					return (RET_ERROR);
+				rkey->data = mem;
+				rkey->size = bl->ksize;
+			}
+			memmove(rkey->data, bl->bytes, bl->ksize);
+			key->size = bl->ksize;
+			key->data = rkey->data;
+		} else {
+			key->size = bl->ksize;
+			key->data = bl->bytes;
+		}
+	}
+	if (data == NULL)
+		return (RET_SUCCESS);
+
+	if (bl->flags & P_BIGDATA) {
+		if (__ovfl_get(t, bl->bytes + bl->ksize,
+		    &data->size, &rdata->data, &rdata->size))
+			return (RET_ERROR);
+		data->data = rdata->data;
+		return (RET_SUCCESS);
+	}
+	if (!docopy) {
+		data->size = bl->dsize;
+		data->data = bl->bytes + bl->ksize;
+		return (RET_SUCCESS);
+	}
+	/* Use +1 in case the first record retrieved is 0 length. */
+	need = bl->dsize + 1;
+	if (need > rdata->size) {
+		mem = rdata->data == NULL ?
+		    malloc(need) : realloc(rdata->data, need);
+		if (mem == NULL)
+			return (RET_ERROR);
+		rdata->data = mem;
+		rdata->size = need;
+	}
+	memmove(rdata->data, bl->bytes + bl->ksize, bl->dsize);
+	data->size = bl->dsize;
+	data->data = rdata->data;
+	return (RET_SUCCESS);
+}
+
+/*
+ * __BT_CMP -- Compare a user key against the key stored at a page entry.
+ *
+ * Parameters:
+ *	t:	tree
+ *	k1:	DBT pointer of first arg to comparison
+ *	e:	pointer to EPG naming the stored entry
+ *
+ * Returns:
+ *	< 0 if k1 is < record
+ *	= 0 if k1 is = record
+ *	> 0 if k1 is > record
+ */
+int
+__bt_cmp(t, k1, e)
+	BTREE *t;
+	const DBT *k1;
+	EPG *e;
+{
+	BINTERNAL *bi;
+	BLEAF *bl;
+	DBT k2;
+	PAGE *pg;
+	void *ovfl;
+	int isleaf;
+
+	pg = e->page;
+	isleaf = (pg->flags & P_BLEAF) != 0;
+
+	/*
+	 * The first key of the left-most internal page on a level compares
+	 * below every user key, so it never has to be updated on insert.
+	 */
+	if (!isleaf && e->index == 0 && pg->prevpg == P_INVALID)
+		return (1);
+
+	/* Locate the stored key, or the overflow reference in its place. */
+	ovfl = NULL;
+	if (isleaf) {
+		bl = GETBLEAF(pg, e->index);
+		if (!(bl->flags & P_BIGKEY)) {
+			k2.size = bl->ksize;
+			k2.data = bl->bytes;
+		} else
+			ovfl = bl->bytes;
+	} else {
+		bi = GETBINTERNAL(pg, e->index);
+		if (!(bi->flags & P_BIGKEY)) {
+			k2.size = bi->ksize;
+			k2.data = bi->bytes;
+		} else
+			ovfl = bi->bytes;
+	}
+	if (ovfl == NULL)
+		return ((*t->bt_cmp)(k1, &k2));
+
+	/* NOTE(review): RET_ERROR comes back through the result int. */
+	if (__ovfl_get(t, ovfl,
+	    &k2.size, &t->bt_rdata.data, &t->bt_rdata.size))
+		return (RET_ERROR);
+	k2.data = t->bt_rdata.data;
+	return ((*t->bt_cmp)(k1, &k2));
+}
+
+/*
+ * __BT_DEFCMP -- Default comparison routine (bytewise, shorter key first).
+ *
+ * Parameters:
+ *	a:	DBT #1
+ *	b: 	DBT #2
+ *
+ * Returns:
+ *	< 0 if a is < b
+ *	= 0 if a is = b
+ *	> 0 if a is > b
+ */
+int
+__bt_defcmp(a, b)
+	const DBT *a, *b;
+{
+	register size_t len;
+	register u_char *p1, *p2;
+
+	len = MIN(a->size, b->size);
+	for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
+		if (*p1 != *p2)
+			return ((int)*p1 - (int)*p2);
+
+	/*
+	 * The common prefix is equal, so the shorter key sorts first.  Compare
+	 * the sizes rather than subtracting them: the difference of two size_t
+	 * values need not fit in an int, and truncating it can flip the sign.
+	 */
+	return (a->size < b->size ? -1 : a->size > b->size);
+}
+
+/*
+ * __BT_DEFPFX -- Default prefix routine.
+ *
+ * Parameters:
+ *	a:	DBT #1
+ *	b: 	DBT #2
+ *
+ * Returns:
+ *	Number of leading bytes of b needed to tell it apart from a.
+ */
+size_t
+__bt_defpfx(a, b)
+	const DBT *a, *b;
+{
+	register u_char *lhs, *rhs;
+	register size_t i, shared;
+
+	lhs = a->data;
+	rhs = b->data;
+	shared = MIN(a->size, b->size);
+	for (i = 0; i < shared; ++i)
+		if (lhs[i] != rhs[i])
+			return (i + 1);
+	/* a->size must be <= b->size, or they wouldn't be in this order. */
+	return (a->size < b->size ? a->size + 1 : a->size);
+}
diff --git a/db/btree/btree.h b/db/btree/btree.h
new file mode 100644
index 0000000000..36d35c998b
--- /dev/null
+++ b/db/btree/btree.h
@@ -0,0 +1,383 @@
+/*-
+ * Copyright (c) 1991, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)btree.h	8.11 (Berkeley) 8/17/94
+ */
+
+/* Set/clear/test bits f in p->flags.  F_SET/F_CLR expand unparenthesized. */
+#define	F_SET(p, f)	(p)->flags |= (f)
+#define	F_CLR(p, f)	(p)->flags &= ~(f)
+#define	F_ISSET(p, f)	((p)->flags & (f))
+
+#include <mpool.h>
+
+#define	DEFMINKEYPAGE	(2)		/* Minimum keys per page */
+#define	MINCACHE	(5)		/* Minimum cached pages */
+#define	MINPSIZE	(512)		/* Minimum page size */
+
+/*
+ * Page 0 of a btree file contains a copy of the meta-data.  This page is also
+ * used as an out-of-band page, i.e. page pointers that point to nowhere point
+ * to page 0.  Page 1 is the root of the btree.
+ */
+#define	P_INVALID	 0		/* Invalid page number (== P_META). */
+#define	P_META		 0		/* Tree metadata page number. */
+#define	P_ROOT		 1		/* Tree root page number. */
+
+/*
+ * There are five page layouts in the btree: btree internal pages (BINTERNAL),
+ * btree leaf pages (BLEAF), recno internal pages (RINTERNAL), recno leaf pages
+ * (RLEAF) and overflow pages.  All five page types have a page header (PAGE).
+ * This implementation requires that values within structures NOT be padded.
+ * (ANSI C permits random padding.)  If your compiler pads randomly you'll have
+ * to do some work to get this package to run.
+ */
+typedef struct _page {
+	pgno_t	pgno;			/* this page's page number */
+	pgno_t	prevpg;			/* left sibling */
+	pgno_t	nextpg;			/* right sibling */
+
+#define	P_BINTERNAL	0x01		/* btree internal page */
+#define	P_BLEAF		0x02		/* btree leaf page */
+#define	P_OVERFLOW	0x04		/* overflow page */
+#define	P_RINTERNAL	0x08		/* recno internal page */
+#define	P_RLEAF		0x10		/* recno leaf page */
+#define P_TYPE		0x1f		/* type mask */
+#define	P_PRESERVE	0x20		/* never delete this chain of pages */
+	u_int32_t flags;		/* P_xxx bits above */
+
+	indx_t	lower;			/* lower bound of free space on page */
+	indx_t	upper;			/* upper bound of free space on page */
+	indx_t	linp[1];		/* indx_t-aligned VAR. LENGTH DATA */
+} PAGE;
+
+/* Offset of linp[] in a PAGE, and the number of linp[] slots in use. */
+#define	BTDATAOFF							\
+	(sizeof(pgno_t) + sizeof(pgno_t) + sizeof(pgno_t) +		\
+	    sizeof(u_int32_t) + sizeof(indx_t) + sizeof(indx_t))
+#define	NEXTINDEX(p)	(((p)->lower - BTDATAOFF) / sizeof(indx_t))
+
+/*
+ * For pages other than overflow pages, there is an array of offsets into the
+ * rest of the page immediately following the page header.  Each offset is to
+ * an item which is unique to the type of page.  The lower offset is just
+ * past the last filled-in index.  The upper offset is the first item on the
+ * page.  Offsets are from the beginning of the page.
+ *
+ * If an item is too big to store on a single page, a flag is set and the item
+ * is a { page, size } pair such that the page is the first page of an overflow
+ * chain with size bytes of item.  Overflow pages are simply bytes without any
+ * external structure.
+ *
+ * The page number and size fields in the items are pgno_t-aligned so they can
+ * be manipulated without copying.  (This presumes that 32 bit items can be
+ * manipulated on this system.)
+ */
+#define	LALIGN(n)	(((n) + sizeof(pgno_t) - 1) & ~(sizeof(pgno_t) - 1))
+#define	NOVFLSIZE	(sizeof(pgno_t) + sizeof(u_int32_t))	/* { page, size } */
+
+/*
+ * For the btree internal pages, the item is a key.  BINTERNALs are {key, pgno}
+ * pairs, such that the key compares less than or equal to all of the records
+ * on that page.  For a tree without duplicate keys, an internal page with two
+ * consecutive keys, a and b, will have all records greater than or equal to a
+ * and less than b stored on the page associated with a.  Duplicate keys are
+ * somewhat special and can cause duplicate internal and leaf page records and
+ * some minor modifications of the above rule.
+ */
+typedef struct _binternal {
+	u_int32_t ksize;		/* key size */
+	pgno_t	pgno;			/* page number stored on */
+#define	P_BIGDATA	0x01		/* overflow data */
+#define	P_BIGKEY	0x02		/* overflow key */
+	u_char	flags;
+	char	bytes[1];		/* key (or its overflow reference) */
+} BINTERNAL;
+
+/* Get the page's BINTERNAL structure at index indx. */
+#define	GETBINTERNAL(pg, indx)						\
+	((BINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
+
+/* Get the LALIGN-padded size of an entry whose key is len bytes. */
+#define NBINTERNAL(len)							\
+	LALIGN(sizeof(u_int32_t) + sizeof(pgno_t) + sizeof(u_char) + (len))
+
+/* Write a BINTERNAL header at p and advance p; key bytes are not copied. */
+#define	WR_BINTERNAL(p, size, pgno, flags) {				\
+	*(u_int32_t *)p = size;						\
+	p += sizeof(u_int32_t);						\
+	*(pgno_t *)p = pgno;						\
+	p += sizeof(pgno_t);						\
+	*(u_char *)p = flags;						\
+	p += sizeof(u_char);						\
+}
+
+/*
+ * For the recno internal pages, the item is a page number with the number of
+ * keys found on that page and below.
+ */
+typedef struct _rinternal {
+	recno_t	nrecs;			/* number of records */
+	pgno_t	pgno;			/* page number stored below */
+} RINTERNAL;
+
+/* Get the page's RINTERNAL structure at index indx. */
+#define	GETRINTERNAL(pg, indx)						\
+	((RINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
+
+/* Get the number of bytes in the entry (LALIGN-padded). */
+#define NRINTERNAL							\
+	LALIGN(sizeof(recno_t) + sizeof(pgno_t))
+
+/* Copy a RINTERNAL entry to the page; p ends up pointing at the pgno. */
+#define	WR_RINTERNAL(p, nrecs, pgno) {					\
+	*(recno_t *)p = nrecs;						\
+	p += sizeof(recno_t);						\
+	*(pgno_t *)p = pgno;						\
+}
+
+/* For the btree leaf pages, the item is a key and data pair. */
+typedef struct _bleaf {
+	u_int32_t	ksize;		/* size of key */
+	u_int32_t	dsize;		/* size of data */
+	u_char	flags;			/* P_BIGDATA, P_BIGKEY */
+	char	bytes[1];		/* key bytes, then data bytes */
+} BLEAF;
+
+/* Get the page's BLEAF structure at index indx. */
+#define	GETBLEAF(pg, indx)						\
+	((BLEAF *)((char *)(pg) + (pg)->linp[indx]))
+
+/* Get the number of bytes in the entry p (LALIGN-padded). */
+#define NBLEAF(p)	NBLEAFDBT((p)->ksize, (p)->dsize)
+
+/* Get the number of bytes in the user's key/data pair. */
+#define NBLEAFDBT(ksize, dsize)						\
+	LALIGN(sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_char) +	\
+	    (ksize) + (dsize))
+
+/* Copy a BLEAF entry to the page; p ends up pointing at the data bytes. */
+#define	WR_BLEAF(p, key, data, flags) {					\
+	*(u_int32_t *)p = key->size;					\
+	p += sizeof(u_int32_t);						\
+	*(u_int32_t *)p = data->size;					\
+	p += sizeof(u_int32_t);						\
+	*(u_char *)p = flags;						\
+	p += sizeof(u_char);						\
+	memmove(p, key->data, key->size);				\
+	p += key->size;							\
+	memmove(p, data->data, data->size);				\
+}
+
+/* For the recno leaf pages, the item is a data entry. */
+typedef struct _rleaf {
+	u_int32_t	dsize;		/* size of data */
+	u_char	flags;			/* P_BIGDATA */
+	char	bytes[1];		/* data bytes */
+} RLEAF;
+
+/* Get the page's RLEAF structure at index indx. */
+#define	GETRLEAF(pg, indx)						\
+	((RLEAF *)((char *)(pg) + (pg)->linp[indx]))
+
+/* Get the number of bytes in the entry p (LALIGN-padded). */
+#define NRLEAF(p)	NRLEAFDBT((p)->dsize)
+
+/* Get the number of bytes from the user's data. */
+#define	NRLEAFDBT(dsize)						\
+	LALIGN(sizeof(u_int32_t) + sizeof(u_char) + (dsize))
+
+/* Copy a RLEAF entry to the page; p ends up pointing at the data bytes. */
+#define	WR_RLEAF(p, data, flags) {					\
+	*(u_int32_t *)p = data->size;					\
+	p += sizeof(u_int32_t);						\
+	*(u_char *)p = flags;						\
+	p += sizeof(u_char);						\
+	memmove(p, data->data, data->size);				\
+}
+
+/*
+ * A record in the tree is either a pointer to a page and an index in the page
+ * (EPG) or a page number and an index (EPGNO).  These structures are used as
+ * a cursor, stack entry and search returns as well as to pass records around.
+ *
+ * One comment about searches.  Internal page searches must find the largest
+ * record less than key in the tree so that descents work.  Leaf page searches
+ * must find the smallest record greater than key so that the returned index
+ * is the record's correct position for insertion.
+ */
+typedef struct _epgno {
+	pgno_t	pgno;			/* the page number */
+	indx_t	index;			/* the index on the page */
+} EPGNO;
+
+typedef struct _epg {
+	PAGE	*page;			/* the (pinned) page */
+	indx_t	 index;			/* the index on the page */
+} EPG;
+
+/*
+ * About cursors.  The cursor (and the page that contained the key/data pair
+ * that it referenced) can be deleted, which makes things a bit tricky.  If
+ * there are no duplicates of the cursor key in the tree (i.e. B_NODUPS is set
+ * or there simply aren't any duplicates of the key) we copy the key that it
+ * referenced when it's deleted, and reacquire a new cursor key if the cursor
+ * is used again.  If there are duplicate keys, we move to the next/previous
+ * key, and set a flag so that we know what happened.  NOTE: if duplicate (to
+ * the cursor) keys are added to the tree during this process, it is undefined
+ * if they will be returned or not in a cursor scan.
+ *
+ * The flags determine the possible states of the cursor:
+ *
+ * CURS_INIT	The cursor references *something*.
+ * CURS_ACQUIRE	The cursor was deleted, and a key has been saved so that
+ *		we can reacquire the right position in the tree.
+ * CURS_AFTER, CURS_BEFORE
+ *		The cursor was deleted, and now references a key/data pair
+ *		that has not yet been returned, either before or after the
+ *		deleted key/data pair.
+ * XXX
+ * This structure is broken out so that we can eventually offer multiple
+ * cursors as part of the DB interface.
+ */
+typedef struct _cursor {
+	EPGNO	 pg;			/* B: Saved tree reference. */
+	DBT	 key;			/* B: Saved key, or key.data == NULL. */
+	recno_t	 rcursor;		/* R: recno cursor (1-based) */
+
+#define	CURS_ACQUIRE	0x01		/*  B: Cursor needs to be reacquired. */
+#define	CURS_AFTER	0x02		/*  B: Unreturned cursor after key. */
+#define	CURS_BEFORE	0x04		/*  B: Unreturned cursor before key. */
+#define	CURS_INIT	0x08		/* RB: Cursor initialized. */
+	u_int8_t flags;			/* CURS_xxx bits above */
+} CURSOR;
+
+/*
+ * The metadata of the tree.  The nrecs field is used only by the RECNO code.
+ * This is because the btree doesn't really need it and it requires that every
+ * put or delete call modify the metadata.
+ */
+typedef struct _btmeta {
+	u_int32_t	magic;		/* magic number */
+	u_int32_t	version;	/* version */
+	u_int32_t	psize;		/* page size */
+	u_int32_t	free;		/* page number of first free page */
+	u_int32_t	nrecs;		/* R: number of records */
+
+#define	SAVEMETA	(B_NODUPS | R_RECNO)	/* flag bits kept on disk */
+	u_int32_t	flags;		/* bt_flags & SAVEMETA */
+} BTMETA;
+
+/* The in-memory btree/recno data structure. */
+typedef struct _btree {
+	MPOOL	 *bt_mp;		/* memory pool cookie */
+
+	DB	 *bt_dbp;		/* pointer to enclosing DB */
+
+	EPG	  bt_cur;		/* current (pinned) page */
+	PAGE	 *bt_pinned;		/* page pinned across calls */
+
+	CURSOR	  bt_cursor;		/* cursor */
+
+#define	BT_PUSH(t, p, i) {						\
+	t->bt_sp->pgno = p; 						\
+	t->bt_sp->index = i; 						\
+	++t->bt_sp;							\
+}
+#define	BT_POP(t)	(t->bt_sp == t->bt_stack ? NULL : --t->bt_sp)
+#define	BT_CLR(t)	(t->bt_sp = t->bt_stack)
+	EPGNO	  bt_stack[50];		/* parent stack; BT_PUSH unchecked */
+	EPGNO	 *bt_sp;		/* current stack pointer */
+
+	DBT	  bt_rkey;		/* returned key */
+	DBT	  bt_rdata;		/* returned data */
+
+	int	  bt_fd;		/* tree file descriptor */
+
+	pgno_t	  bt_free;		/* next free page */
+	u_int32_t bt_psize;		/* page size */
+	indx_t	  bt_ovflsize;		/* cut-off for key/data overflow */
+	int	  bt_lorder;		/* byte order */
+					/* sorted order */
+	enum { NOT, BACK, FORWARD } bt_order;
+	EPGNO	  bt_last;		/* last insert */
+
+					/* B: key comparison function */
+	int	(*bt_cmp) __P((const DBT *, const DBT *));
+					/* B: prefix comparison function */
+	size_t	(*bt_pfx) __P((const DBT *, const DBT *));
+					/* R: recno input function */
+	int	(*bt_irec) __P((struct _btree *, recno_t));
+
+	FILE	 *bt_rfp;		/* R: record FILE pointer */
+	int	  bt_rfd;		/* R: record file descriptor */
+
+	caddr_t	  bt_cmap;		/* R: current point in mapped space */
+	caddr_t	  bt_smap;		/* R: start of mapped space */
+	caddr_t   bt_emap;		/* R: end of mapped space */
+	size_t	  bt_msize;		/* R: size of mapped region. */
+
+	recno_t	  bt_nrecs;		/* R: number of records */
+	size_t	  bt_reclen;		/* R: fixed record length */
+	u_char	  bt_bval;		/* R: delimiting byte/pad character */
+
+/*
+ * NB:
+ * B_NODUPS and R_RECNO are stored on disk (see SAVEMETA), and may not be
+ * changed.
+ */
+#define	B_INMEM		0x00001		/* in-memory tree */
+#define	B_METADIRTY	0x00002		/* need to write metadata */
+#define	B_MODIFIED	0x00004		/* tree modified */
+#define	B_NEEDSWAP	0x00008		/* if byte order requires swapping */
+#define	B_RDONLY	0x00010		/* read-only tree */
+
+#define	B_NODUPS	0x00020		/* no duplicate keys permitted */
+#define	R_RECNO		0x00080		/* record oriented tree */
+
+#define	R_CLOSEFP	0x00040		/* opened a file pointer */
+#define	R_EOF		0x00100		/* end of input file reached. */
+#define	R_FIXLEN	0x00200		/* fixed length records */
+#define	R_MEMMAPPED	0x00400		/* memory mapped file. */
+#define	R_INMEM		0x00800		/* in-memory file */
+#define	R_MODIFIED	0x01000		/* modified file */
+#define	R_RDONLY	0x02000		/* read-only file */
+
+#define	B_DB_LOCK	0x04000		/* DB_LOCK specified. */
+#define	B_DB_SHMEM	0x08000		/* DB_SHMEM specified. */
+#define	B_DB_TXN	0x10000		/* DB_TXN specified. */
+	u_int32_t flags;
+} BTREE;
+
+#include "extern.h"
diff --git a/db/btree/extern.h b/db/btree/extern.h
new file mode 100644
index 0000000000..ebd9c54923
--- /dev/null
+++ b/db/btree/extern.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 1991, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)extern.h	8.10 (Berkeley) 7/20/94
+ */
+
+int	 __bt_close __P((DB *));
+int	 __bt_cmp __P((BTREE *, const DBT *, EPG *));
+int	 __bt_crsrdel __P((BTREE *, EPGNO *));
+int	 __bt_defcmp __P((const DBT *, const DBT *));	/* default compare */
+size_t	 __bt_defpfx __P((const DBT *, const DBT *));	/* default prefix */
+int	 __bt_delete __P((const DB *, const DBT *, u_int));
+int	 __bt_dleaf __P((BTREE *, const DBT *, PAGE *, u_int));
+int	 __bt_fd __P((const DB *));
+int	 __bt_free __P((BTREE *, PAGE *));
+int	 __bt_get __P((const DB *, const DBT *, DBT *, u_int));
+PAGE	*__bt_new __P((BTREE *, pgno_t *));
+void	 __bt_pgin __P((void *, pgno_t, void *));
+void	 __bt_pgout __P((void *, pgno_t, void *));
+int	 __bt_push __P((BTREE *, pgno_t, int));
+int	 __bt_put __P((const DB *dbp, DBT *, const DBT *, u_int));
+int	 __bt_ret __P((BTREE *, EPG *, DBT *, DBT *, DBT *, DBT *, int));
+EPG	*__bt_search __P((BTREE *, const DBT *, int *));
+int	 __bt_seq __P((const DB *, DBT *, DBT *, u_int));
+void	 __bt_setcur __P((BTREE *, pgno_t, u_int));
+int	 __bt_split __P((BTREE *, PAGE *,
+	    const DBT *, const DBT *, int, size_t, u_int32_t));
+int	 __bt_sync __P((const DB *, u_int));
+
+int	 __ovfl_delete __P((BTREE *, void *));
+int	 __ovfl_get __P((BTREE *, void *, size_t *, void **, size_t *));
+int	 __ovfl_put __P((BTREE *, const DBT *, pgno_t *));
+
+#ifdef DEBUG
+void	 __bt_dnpage __P((DB *, pgno_t));
+void	 __bt_dpage __P((PAGE *));
+void	 __bt_dump __P((DB *));
+#endif /* DEBUG */
+#ifdef STATISTICS
+void	 __bt_stat __P((DB *));
+#endif /* STATISTICS */
diff --git a/db/compat.h b/db/compat.h
new file mode 100644
index 0000000000..706e58265d
--- /dev/null
+++ b/db/compat.h
@@ -0,0 +1,49 @@
+/* Values for building 4.4 BSD db routines in the GNU C library.  */
+
+#ifndef _compat_h_
+#define _compat_h_
+
+#include <fcntl.h>
+
+/*
+ * If open(2) cannot take lock flags, define them as no-op zero bits.
+ * Note, this allows races to happen.
+ */
+#ifndef O_EXLOCK			/* 4.4BSD extension. */
+#define	O_EXLOCK	0
+#endif
+
+#ifndef O_SHLOCK			/* 4.4BSD extension. */
+#define	O_SHLOCK	0
+#endif
+
+#include <errno.h>
+
+#ifndef EFTYPE
+#define	EFTYPE		EINVAL		/* POSIX 1003.1 format errno. */
+#endif
+
+#include <unistd.h>
+#include <limits.h>
+
+#ifndef _POSIX_VDISABLE			/* POSIX 1003.1 disabling char. */
+#define	_POSIX_VDISABLE	0		/* Some systems used 0. */
+#endif
+
+#include <termios.h>
+
+#ifndef	TCSASOFT			/* 4.4BSD extension. */
+#define	TCSASOFT	0
+#endif
+
+#include <sys/param.h>
+
+#ifndef	MAX				/* Usually found in <sys/param.h>. */
+#define	MAX(_a,_b)	((_a)<(_b)?(_b):(_a))	/* may evaluate an arg twice */
+#endif
+#ifndef	MIN				/* Usually found in <sys/param.h>. */
+#define	MIN(_a,_b)	((_a)<(_b)?(_a):(_b))	/* may evaluate an arg twice */
+#endif
+
+
+#endif /* compat.h */
diff --git a/db/db.h b/db/db.h
new file mode 100644
index 0000000000..13c7495d64
--- /dev/null
+++ b/db/db.h
@@ -0,0 +1,236 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)db.h	8.7 (Berkeley) 6/16/94
+ */
+
+#ifndef _DB_H_
+#define	_DB_H_
+
+#include <sys/types.h>
+#include <sys/cdefs.h>
+
+#include <limits.h>
+
+#ifdef __DBINTERFACE_PRIVATE
+#include <compat.h>
+#endif
+
+#define	RET_ERROR	-1		/* Return values of the DB methods. */
+#define	RET_SUCCESS	 0
+#define	RET_SPECIAL	 1
+
+#ifndef	__BIT_TYPES_DEFINED__
+#define	__BIT_TYPES_DEFINED__
+typedef	__signed char		   int8_t;
+typedef	unsigned char		 u_int8_t;
+typedef	short			  int16_t;
+typedef	unsigned short		u_int16_t;
+typedef	int			  int32_t;	/* assumes a 32-bit int */
+typedef	unsigned int		u_int32_t;
+#ifdef WE_DONT_NEED_QUADS		/* NB: 64-bit types only when defined */
+typedef	long long		  int64_t;
+typedef	unsigned long long	u_int64_t;
+#endif /* WE_DONT_NEED_QUADS */
+#endif /* !__BIT_TYPES_DEFINED__ */
+
+#define	MAX_PAGE_NUMBER	0xffffffff	/* >= # of pages in a file */
+typedef u_int32_t	pgno_t;
+#define	MAX_PAGE_OFFSET	65535		/* >= # of bytes in a page */
+typedef u_int16_t	indx_t;
+#define	MAX_REC_NUMBER	0xffffffff	/* >= # of records in a tree */
+typedef u_int32_t	recno_t;
+
+/* Key/data structure -- a Data-Base Thang. */
+typedef struct {
+	void	*data;			/* data */
+	size_t	 size;			/* data length in bytes */
+} DBT;
+
+/* Routine flags (the u_int argument of the DB methods). */
+#define	R_CURSOR	1		/* del, put, seq */
+#define	__R_UNUSED	2		/* UNUSED */
+#define	R_FIRST		3		/* seq */
+#define	R_IAFTER	4		/* put (RECNO) */
+#define	R_IBEFORE	5		/* put (RECNO) */
+#define	R_LAST		6		/* seq (BTREE, RECNO) */
+#define	R_NEXT		7		/* seq */
+#define	R_NOOVERWRITE	8		/* put */
+#define	R_PREV		9		/* seq (BTREE, RECNO) */
+#define	R_SETCURSOR	10		/* put (RECNO) */
+#define	R_RECNOSYNC	11		/* sync (RECNO) */
+
+typedef enum { DB_BTREE, DB_HASH, DB_RECNO } DBTYPE;	/* access methods */
+
+/*
+ * !!!
+ * The following flags are included in the dbopen(3) call as part of the
+ * open(2) flags.  In order to avoid conflicts with the open flags, start
+ * at the top of the 16 or 32-bit number space and work our way down.  If
+ * the open flags were significantly expanded in the future, it could be
+ * a problem.  Wish I'd left another flags word in the dbopen call.
+ *
+ * !!!
+ * None of this stuff is implemented yet.  The only reason that it's here
+ * is so that the access methods can skip copying the key/data pair when
+ * the DB_LOCK flag isn't set.
+ */
+#if UINT_MAX > 65535			/* int is wider than 16 bits */
+#define	DB_LOCK		0x20000000	/* Do locking. */
+#define	DB_SHMEM	0x40000000	/* Use shared memory. */
+#define	DB_TXN		0x80000000	/* Do transactions. */
+#else
+#define	DB_LOCK		    0x2000	/* Do locking. */
+#define	DB_SHMEM	    0x4000	/* Use shared memory. */
+#define	DB_TXN		    0x8000	/* Do transactions. */
+#endif
+
+/* Access method handle: type tag, method pointers, and private state. */
+typedef struct __db {
+	DBTYPE type;			/* Underlying db type. */
+	int (*close)	__P((struct __db *));
+	int (*del)	__P((const struct __db *, const DBT *, u_int));
+	int (*get)	__P((const struct __db *, const DBT *, DBT *, u_int));
+	int (*put)	__P((const struct __db *, DBT *, const DBT *, u_int));
+	int (*seq)	__P((const struct __db *, DBT *, DBT *, u_int));
+	int (*sync)	__P((const struct __db *, u_int));
+	void *internal;			/* Access method private. */
+	int (*fd)	__P((const struct __db *));
+} DB;
+
+#define	BTREEMAGIC	0x053162
+#define	BTREEVERSION	3
+
+/* Parameters for the btree routines (dbopen's openinfo for DB_BTREE). */
+typedef struct {
+#define	R_DUP		0x01	/* duplicate keys */
+	u_long	flags;
+	u_int	cachesize;	/* bytes to cache */
+	int	maxkeypage;	/* maximum keys per page */
+	int	minkeypage;	/* minimum keys per page */
+	u_int	psize;		/* page size */
+	int	(*compare)	/* comparison function */
+	    __P((const DBT *, const DBT *));
+	size_t	(*prefix)	/* prefix function */
+	    __P((const DBT *, const DBT *));
+	int	lorder;		/* byte order */
+} BTREEINFO;
+
+#define	HASHMAGIC	0x061561
+#define	HASHVERSION	2
+
+/* Parameters for the hashing routines (dbopen's openinfo for DB_HASH). */
+typedef struct {
+	u_int	bsize;		/* bucket size */
+	u_int	ffactor;	/* fill factor */
+	u_int	nelem;		/* number of elements */
+	u_int	cachesize;	/* bytes to cache */
+	u_int32_t		/* hash function */
+		(*hash) __P((const void *, size_t));
+	int	lorder;		/* byte order */
+} HASHINFO;
+
+/* Parameters for the record routines (dbopen's openinfo for DB_RECNO). */
+typedef struct {
+#define	R_FIXEDLEN	0x01	/* fixed-length records */
+#define	R_NOKEY		0x02	/* key not required */
+#define	R_SNAPSHOT	0x04	/* snapshot the input */
+	u_long	flags;
+	u_int	cachesize;	/* bytes to cache */
+	u_int	psize;		/* page size */
+	int	lorder;		/* byte order */
+	size_t	reclen;		/* record length (fixed-length records) */
+	u_char	bval;		/* delimiting byte (variable-length records) */
+	char	*bfname;	/* btree file name */ 
+} RECNOINFO;
+
+#ifdef __DBINTERFACE_PRIVATE
+/*
+ * Little endian <==> big endian 32-bit swap macros.
+ *	M_32_SWAP	swap a memory location (a is an lvalue)
+ *	P_32_SWAP	swap a referenced memory location (a is a pointer)
+ *	P_32_COPY	swap from one location to another (a to b, both lvalues)
+ */
+#define	M_32_SWAP(a) {							\
+	u_int32_t _tmp = a;						\
+	((char *)&a)[0] = ((char *)&_tmp)[3];				\
+	((char *)&a)[1] = ((char *)&_tmp)[2];				\
+	((char *)&a)[2] = ((char *)&_tmp)[1];				\
+	((char *)&a)[3] = ((char *)&_tmp)[0];				\
+}
+#define	P_32_SWAP(a) {							\
+	u_int32_t _tmp = *(u_int32_t *)a;				\
+	((char *)a)[0] = ((char *)&_tmp)[3];				\
+	((char *)a)[1] = ((char *)&_tmp)[2];				\
+	((char *)a)[2] = ((char *)&_tmp)[1];				\
+	((char *)a)[3] = ((char *)&_tmp)[0];				\
+}
+#define	P_32_COPY(a, b) {						\
+	((char *)&(b))[0] = ((char *)&(a))[3];				\
+	((char *)&(b))[1] = ((char *)&(a))[2];				\
+	((char *)&(b))[2] = ((char *)&(a))[1];				\
+	((char *)&(b))[3] = ((char *)&(a))[0];				\
+}
+
+/*
+ * Little endian <==> big endian 16-bit swap macros.
+ *	M_16_SWAP	swap a memory location (a is an lvalue)
+ *	P_16_SWAP	swap a referenced memory location (a is a pointer)
+ *	P_16_COPY	swap from one location to another (a to b, both lvalues)
+ */
+#define	M_16_SWAP(a) {							\
+	u_int16_t _tmp = a;						\
+	((char *)&a)[0] = ((char *)&_tmp)[1];				\
+	((char *)&a)[1] = ((char *)&_tmp)[0];				\
+}
+#define	P_16_SWAP(a) {							\
+	u_int16_t _tmp = *(u_int16_t *)a;				\
+	((char *)a)[0] = ((char *)&_tmp)[1];				\
+	((char *)a)[1] = ((char *)&_tmp)[0];				\
+}
+#define	P_16_COPY(a, b) {						\
+	((char *)&(b))[0] = ((char *)&(a))[1];				\
+	((char *)&(b))[1] = ((char *)&(a))[0];				\
+}
+#endif /* __DBINTERFACE_PRIVATE */
+
+__BEGIN_DECLS
+DB *dbopen __P((const char *, int, int, DBTYPE, const void *));
+
+#ifdef __DBINTERFACE_PRIVATE		/* per-method openers used by dbopen() */
+DB	*__bt_open __P((const char *, int, int, const BTREEINFO *, int));
+DB	*__hash_open __P((const char *, int, int, const HASHINFO *, int));
+DB	*__rec_open __P((const char *, int, int, const RECNOINFO *, int));
+void	 __dbpanic __P((DB *dbp));
+#endif /* __DBINTERFACE_PRIVATE */
+__END_DECLS
+#endif /* !_DB_H_ */
diff --git a/db/db/db.c b/db/db/db.c
new file mode 100644
index 0000000000..a18f056db8
--- /dev/null
+++ b/db/db/db.c
@@ -0,0 +1,99 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)db.c	8.4 (Berkeley) 2/21/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <db.h>
+
+DB *
+dbopen(fname, flags, mode, type, openinfo)
+	const char *fname;
+	int flags, mode;
+	DBTYPE type;
+	const void *openinfo;
+{
+	int dbflags, oflags, valid;
+
+#define	DB_FLAGS	(DB_LOCK | DB_SHMEM | DB_TXN)
+#define	USE_OPEN_FLAGS							\
+	(O_CREAT | O_EXCL | O_EXLOCK | O_NONBLOCK | O_RDONLY |		\
+	 O_RDWR | O_SHLOCK | O_TRUNC)
+
+	/* Split flags into the open(2) part and the DB part; no other bits. */
+	valid = (flags & ~(USE_OPEN_FLAGS | DB_FLAGS)) == 0;
+	oflags = flags & USE_OPEN_FLAGS;
+	dbflags = flags & DB_FLAGS;
+	if (valid && type == DB_BTREE)
+		return (__bt_open(fname, oflags, mode, openinfo, dbflags));
+	if (valid && type == DB_HASH)
+		return (__hash_open(fname, oflags, mode, openinfo, dbflags));
+	if (valid && type == DB_RECNO)
+		return (__rec_open(fname, oflags, mode, openinfo, dbflags));
+
+	errno = EINVAL;		/* stray flag bit or unknown access method */
+	return (NULL);
+}
+
+static int
+__dberr()
+{
+	return (RET_ERROR);	/* failing stub used by __dbpanic() */
+}
+
+/*
+ * __DBPANIC -- Disable a DB handle: every method but close now fails.
+ *
+ * Parameters:
+ *	dbp:	pointer to the DB structure.
+ */
+void
+__dbpanic(dbp)
+	DB *dbp;
+{
+	int (*fail)() = __dberr;	/* close is deliberately left intact */
+	dbp->get = fail;
+	dbp->put = fail;
+	dbp->del = fail;
+	dbp->seq = fail;
+	dbp->sync = fail;
+	dbp->fd = fail;
+}
diff --git a/db/hash/extern.h b/db/hash/extern.h
new file mode 100644
index 0000000000..3167e6d0f7
--- /dev/null
+++ b/db/hash/extern.h
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 1991, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)extern.h	8.4 (Berkeley) 6/16/94
+ */
+
+BUFHEAD	*__add_ovflpage __P((HTAB *, BUFHEAD *));
+int	 __addel __P((HTAB *, BUFHEAD *, const DBT *, const DBT *));
+int	 __big_delete __P((HTAB *, BUFHEAD *));
+int	 __big_insert __P((HTAB *, BUFHEAD *, const DBT *, const DBT *));
+int	 __big_keydata __P((HTAB *, BUFHEAD *, DBT *, DBT *, int));
+int	 __big_return __P((HTAB *, BUFHEAD *, int, DBT *, int));
+int	 __big_split __P((HTAB *, BUFHEAD *, BUFHEAD *, BUFHEAD *,
+		int, u_int32_t, SPLIT_RETURN *));
+int	 __buf_free __P((HTAB *, int, int));
+void	 __buf_init __P((HTAB *, int));
+u_int32_t	 __call_hash __P((HTAB *, char *, int));	/* hash.c: key -> bucket number */
+int	 __delpair __P((HTAB *, BUFHEAD *, int));
+int	 __expand_table __P((HTAB *));	/* hash.c: add one bucket, 0 ok / -1 error */
+int	 __find_bigpair __P((HTAB *, BUFHEAD *, int, char *, int));
+u_int16_t	 __find_last_page __P((HTAB *, BUFHEAD **));
+void	 __free_ovflpage __P((HTAB *, BUFHEAD *));
+BUFHEAD	*__get_buf __P((HTAB *, u_int32_t, BUFHEAD *, int));
+int	 __get_page __P((HTAB *, char *, u_int32_t, int, int, int));
+int	 __ibitmap __P((HTAB *, int, int, int));
+u_int32_t	 __log2 __P((u_int32_t));
+int	 __put_page __P((HTAB *, char *, u_int32_t, int, int));
+void	 __reclaim_buf __P((HTAB *, BUFHEAD *));
+int	 __split_page __P((HTAB *, u_int32_t, u_int32_t));
+
+/*
+ * Default hash routine.  hash.c stores this pointer in the table's
+ * hash member whenever the caller's HASHINFO does not name a hash
+ * function of its own.
+ */
+extern u_int32_t (*__default_hash) __P((const void *, size_t));
+
+#ifdef HASH_STATISTICS	/* event counters, defined in hash.c */
+extern int hash_accesses, hash_collisions, hash_expansions, hash_overflows;
+#endif
diff --git a/db/hash/hash.c b/db/hash/hash.c
new file mode 100644
index 0000000000..4b7b732a8f
--- /dev/null
+++ b/db/hash/hash.c
@@ -0,0 +1,994 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)hash.c	8.9 (Berkeley) 6/16/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#ifdef DEBUG
+#include <assert.h>
+#endif
+
+#include <db.h>
+#include "hash.h"
+#include "page.h"
+#include "extern.h"
+
+static int   alloc_segs __P((HTAB *, int));	/* allocate directory and segments */
+static int   flush_meta __P((HTAB *));	/* write header and bitmap pages */
+static int   hash_access __P((HTAB *, ACTION, DBT *, DBT *));	/* worker for get/put/delete */
+static int   hash_close __P((DB *));	/* installed as DB close */
+static int   hash_delete __P((const DB *, const DBT *, u_int32_t));	/* installed as DB del */
+static int   hash_fd __P((const DB *));	/* installed as DB fd */
+static int   hash_get __P((const DB *, const DBT *, DBT *, u_int32_t));	/* installed as DB get */
+static int   hash_put __P((const DB *, DBT *, const DBT *, u_int32_t));	/* installed as DB put */
+static void *hash_realloc __P((SEGMENT **, int, int));	/* grow a block, zeroing the new tail */
+static int   hash_seq __P((const DB *, DBT *, DBT *, u_int32_t));	/* installed as DB seq */
+static int   hash_sync __P((const DB *, u_int32_t));	/* installed as DB sync */
+static int   hdestroy __P((HTAB *));	/* flush and free a table */
+static HTAB *init_hash __P((HTAB *, const char *, HASHINFO *));	/* defaults for a new table */
+static int   init_htab __P((HTAB *, int));	/* size a new table */
+#if BYTE_ORDER == LITTLE_ENDIAN	/* header swapping is only compiled on little-endian hosts */
+static void  swap_header __P((HTAB *));	/* byte-swap the header in place */
+static void  swap_header_copy __P((HASHHDR *, HASHHDR *));	/* byte-swapped copy of a header */
+#endif
+
+/* Fast arithmetic, relying on powers of 2, */
+#define MOD(x, y)		((x) & ((y) - 1))
+
+#define RETURN_ERROR(ERR, LOC)	{ save_errno = ERR; goto LOC; }
+
+/* Return values */
+#define	SUCCESS	 (0)
+#define	ERROR	(-1)
+#define	ABNORMAL (1)
+
+#ifdef HASH_STATISTICS
+int hash_accesses, hash_collisions, hash_expansions, hash_overflows;
+#endif
+
+/************************** INTERFACE ROUTINES ***************************/
+/* OPEN/CLOSE */
+
+/*
+ * __hash_open -- open an existing hash table or create a new one, and
+ * wrap it in a DB handle whose methods are the hash_* routines.
+ *
+ * Parameters:
+ *	file:	path of the backing file; NULL always creates a new table
+ *		with no file descriptor.
+ *	flags:	open(2) flags; an access mode of O_WRONLY is rejected.
+ *	mode:	file mode handed to open(2).
+ *	info:	optional creation/tuning directives, may be NULL.
+ *	dflags:	DB level flags; not examined by this routine.
+ *
+ * Returns:
+ *	The new DB handle, or NULL with errno set.
+ */
+extern DB *
+__hash_open(file, flags, mode, info, dflags)
+	const char *file;
+	int flags, mode, dflags;
+	const HASHINFO *info;	/* Special directives for create */
+{
+	HTAB *hashp;
+	struct stat statbuf;
+	DB *dbp;
+	int bpages, hdrsize, new_table, nsegs, save_errno;
+
+	if ((flags & O_ACCMODE) == O_WRONLY) {
+		errno = EINVAL;
+		return (NULL);
+	}
+
+	if (!(hashp = (HTAB *)calloc(1, sizeof(HTAB))))
+		return (NULL);
+	hashp->fp = -1;
+
+	/*
+	 * Even if user wants write only, we need to be able to read
+	 * the actual file, so we need to open it read/write. But, the
+	 * field in the hashp structure needs to be accurate so that
+	 * we can check accesses.
+	 */
+	hashp->flags = flags;
+
+	/*
+	 * The table is new when there is no file, when the caller asked
+	 * for truncation, or when the file does not exist yet.
+	 */
+	new_table = 0;
+	if (!file || (flags & O_TRUNC) ||
+	    (stat(file, &statbuf) && (errno == ENOENT))) {
+		if (errno == ENOENT)
+			errno = 0; /* Just in case someone looks at errno */
+		new_table = 1;
+	}
+	if (file) {
+		if ((hashp->fp = open(file, flags, mode)) == -1)
+			RETURN_ERROR(errno, error0);
+		/* Third argument 1 sets the close-on-exec descriptor flag. */
+		(void)fcntl(hashp->fp, F_SETFD, 1);
+	}
+	if (new_table) {
+		/*
+		 * NOTE(review): on failure hashp is overwritten with NULL,
+		 * so error1/error0 below neither close the descriptor nor
+		 * free the table.  Whether that leaks depends on whether the
+		 * failing path inside init_hash already destroyed the table
+		 * -- confirm against init_hash/init_htab.
+		 */
+		if (!(hashp = init_hash(hashp, file, (HASHINFO *)info)))
+			RETURN_ERROR(errno, error1);
+	} else {
+		/* Table already exists */
+		if (info && info->hash)
+			hashp->hash = info->hash;
+		else
+			hashp->hash = __default_hash;
+
+		hdrsize = read(hashp->fp, &hashp->hdr, sizeof(HASHHDR));
+#if BYTE_ORDER == LITTLE_ENDIAN
+		swap_header(hashp);
+#endif
+		if (hdrsize == -1)
+			RETURN_ERROR(errno, error1);
+		if (hdrsize != sizeof(HASHHDR))
+			RETURN_ERROR(EFTYPE, error1);
+		/* Verify file type, versions and hash function */
+		if (hashp->MAGIC != HASHMAGIC)
+			RETURN_ERROR(EFTYPE, error1);
+#define	OLDHASHVERSION	1
+		if (hashp->VERSION != HASHVERSION &&
+		    hashp->VERSION != OLDHASHVERSION)
+			RETURN_ERROR(EFTYPE, error1);
+		if (hashp->hash(CHARKEY, sizeof(CHARKEY)) != hashp->H_CHARKEY)
+			RETURN_ERROR(EFTYPE, error1);
+		/*
+		 * Figure out how many segments we need.  Max_Bucket is the
+		 * maximum bucket number, so the number of buckets is
+		 * max_bucket + 1.
+		 */
+		nsegs = (hashp->MAX_BUCKET + 1 + hashp->SGSIZE - 1) /
+			 hashp->SGSIZE;
+		hashp->nsegs = 0;
+		if (alloc_segs(hashp, nsegs))
+			/*
+			 * If alloc_segs fails, table will have been destroyed
+			 * and errno will have been set.
+			 */
+			return (NULL);
+		/*
+		 * Read in bitmaps
+		 *
+		 * NOTE(review): bpages is derived from header fields read
+		 * from the file and is not range checked before it sizes
+		 * the memset over mapp -- confirm the bound.
+		 */
+		bpages = (hashp->SPARES[hashp->OVFL_POINT] +
+		    (hashp->BSIZE << BYTE_SHIFT) - 1) >>
+		    (hashp->BSHIFT + BYTE_SHIFT);
+
+		hashp->nmaps = bpages;
+		(void)memset(&hashp->mapp[0], 0, bpages * sizeof(u_int32_t *));
+	}
+
+	/* Initialize Buffer Manager */
+	if (info && info->cachesize)
+		__buf_init(hashp, info->cachesize);
+	else
+		__buf_init(hashp, DEF_BUFSIZE);
+
+	hashp->new_file = new_table;
+	hashp->save_file = file && (hashp->flags & O_RDWR);
+	hashp->cbucket = -1;
+	if (!(dbp = (DB *)malloc(sizeof(DB)))) {
+		/* hdestroy may change errno; keep malloc's value. */
+		save_errno = errno;
+		hdestroy(hashp);
+		errno = save_errno;
+		return (NULL);
+	}
+	dbp->internal = hashp;
+	dbp->close = hash_close;
+	dbp->del = hash_delete;
+	dbp->fd = hash_fd;
+	dbp->get = hash_get;
+	dbp->put = hash_put;
+	dbp->seq = hash_seq;
+	dbp->sync = hash_sync;
+	dbp->type = DB_HASH;
+
+#ifdef DEBUG
+	/* The table pointer is printed with %p; %x expects an unsigned int. */
+	(void)fprintf(stderr,
+"%s\n%s%p\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%d\n%s%x\n%s%x\n%s%d\n%s%d\n",
+	    "init_htab:",
+	    "TABLE POINTER   ", (void *)hashp,
+	    "BUCKET SIZE     ", hashp->BSIZE,
+	    "BUCKET SHIFT    ", hashp->BSHIFT,
+	    "DIRECTORY SIZE  ", hashp->DSIZE,
+	    "SEGMENT SIZE    ", hashp->SGSIZE,
+	    "SEGMENT SHIFT   ", hashp->SSHIFT,
+	    "FILL FACTOR     ", hashp->FFACTOR,
+	    "MAX BUCKET      ", hashp->MAX_BUCKET,
+	    "OVFL POINT	     ", hashp->OVFL_POINT,
+	    "LAST FREED      ", hashp->LAST_FREED,
+	    "HIGH MASK       ", hashp->HIGH_MASK,
+	    "LOW  MASK       ", hashp->LOW_MASK,
+	    "NSEGS           ", hashp->nsegs,
+	    "NKEYS           ", hashp->NKEYS);
+#endif
+#ifdef HASH_STATISTICS
+	hash_overflows = hash_accesses = hash_collisions = hash_expansions = 0;
+#endif
+	return (dbp);
+
+	/* error1: a descriptor may be open; error0: only the table exists. */
+error1:
+	if (hashp != NULL)
+		(void)close(hashp->fp);
+
+error0:
+	free(hashp);
+	errno = save_errno;
+	return (NULL);
+}
+
+/*
+ * hash_close -- DB close method: destroy the table behind the handle,
+ * then release the handle itself.
+ *
+ * Returns ERROR for a NULL handle, otherwise the result of hdestroy.
+ */
+static int
+hash_close(dbp)
+	DB *dbp;
+{
+	int status;
+
+	if (dbp == NULL)
+		return (ERROR);
+
+	status = hdestroy((HTAB *)dbp->internal);
+	free(dbp);
+	return (status);
+}
+
+/*
+ * hash_fd -- DB fd method: report the table's file descriptor.
+ *
+ * Returns ERROR for a NULL handle, -1 with errno set to ENOENT when the
+ * table has no descriptor, otherwise the descriptor.
+ */
+static int
+hash_fd(dbp)
+	const DB *dbp;
+{
+	HTAB *table;
+
+	if (dbp == NULL)
+		return (ERROR);
+
+	table = (HTAB *)dbp->internal;
+	if (table->fp != -1)
+		return (table->fp);
+
+	errno = ENOENT;
+	return (-1);
+}
+
+/************************** LOCAL CREATION ROUTINES **********************/
+/*
+ * init_hash -- fill in the parameters of a brand new table: built-in
+ * defaults first, then the file system's preferred block size, then any
+ * overrides from `info', and finally size the table with init_htab.
+ *
+ * Parameters:
+ *	hashp:	zeroed table to initialize.
+ *	file:	backing file name, or NULL when there is none.
+ *	info:	caller supplied overrides, or NULL.
+ *
+ * Returns:
+ *	hashp on success, NULL on failure.  errno is EINVAL for a bad
+ *	bsize or lorder; the stat and init_htab failure paths set no
+ *	errno here.
+ */
+static HTAB *
+init_hash(hashp, file, info)
+	HTAB *hashp;
+	const char *file;
+	HASHINFO *info;
+{
+	struct stat statbuf;
+	int nelem;
+
+	nelem = 1;
+	hashp->NKEYS = 0;
+	hashp->LORDER = BYTE_ORDER;
+	hashp->BSIZE = DEF_BUCKET_SIZE;
+	hashp->BSHIFT = DEF_BUCKET_SHIFT;
+	hashp->SGSIZE = DEF_SEGSIZE;
+	hashp->SSHIFT = DEF_SEGSIZE_SHIFT;
+	hashp->DSIZE = DEF_DIRSIZE;
+	hashp->FFACTOR = DEF_FFACTOR;
+	hashp->hash = __default_hash;
+	memset(hashp->SPARES, 0, sizeof(hashp->SPARES));
+	memset(hashp->BITMAPS, 0, sizeof (hashp->BITMAPS));
+
+	/* Fix bucket size to be optimal for file system */
+	if (file != NULL) {
+		if (stat(file, &statbuf))
+			return (NULL);
+		hashp->BSIZE = statbuf.st_blksize;
+		/*
+		 * A file system may report a preferred block size above
+		 * the largest page this code accepts; apply the same
+		 * MAX_BSIZE limit that is enforced on info->bsize below.
+		 */
+		if (hashp->BSIZE > MAX_BSIZE)
+			hashp->BSIZE = MAX_BSIZE;
+		hashp->BSHIFT = __log2(hashp->BSIZE);
+	}
+
+	if (info) {
+		if (info->bsize) {
+			/* Round pagesize up to power of 2 */
+			hashp->BSHIFT = __log2(info->bsize);
+			hashp->BSIZE = 1 << hashp->BSHIFT;
+			if (hashp->BSIZE > MAX_BSIZE) {
+				errno = EINVAL;
+				return (NULL);
+			}
+		}
+		if (info->ffactor)
+			hashp->FFACTOR = info->ffactor;
+		if (info->hash)
+			hashp->hash = info->hash;
+		if (info->nelem)
+			nelem = info->nelem;
+		if (info->lorder) {
+			if (info->lorder != BIG_ENDIAN &&
+			    info->lorder != LITTLE_ENDIAN) {
+				errno = EINVAL;
+				return (NULL);
+			}
+			hashp->LORDER = info->lorder;
+		}
+	}
+	/* init_htab should destroy the table and set errno if it fails */
+	if (init_htab(hashp, nelem))
+		return (NULL);
+	else
+		return (hashp);
+}
+/*
+ * init_htab -- derive the initial bucket count of a new table from the
+ * expected element count, set the matching header fields and allocate
+ * the segments.
+ *
+ * This calls alloc_segs which may run out of memory.  Alloc_segs will
+ * destroy the table and set errno, so the error is simply passed along.
+ *
+ * Returns 0 on No Error
+ */
+static int
+init_htab(hashp, nelem)
+	HTAB *hashp;
+	int nelem;
+{
+	register int bucket_cnt, seg_cnt;
+	int shift;
+
+	/*
+	 * Elements divided by the fill factor gives the wanted number of
+	 * buckets; space is set up for the next power of two at or above
+	 * that, with a minimum of two.
+	 */
+	nelem = (nelem - 1) / hashp->FFACTOR + 1;
+	shift = __log2(MAX(nelem, 2));
+	bucket_cnt = 1 << shift;
+
+	hashp->LAST_FREED = 2;
+	hashp->OVFL_POINT = shift;
+	hashp->SPARES[shift] = shift + 1;
+	hashp->SPARES[shift + 1] = shift + 1;
+
+	/* First bitmap page is at: splitpoint `shift' page offset 1 */
+	if (__ibitmap(hashp, OADDR_OF(shift, 1), shift + 1, 0) != 0)
+		return (-1);
+
+	hashp->MAX_BUCKET = hashp->LOW_MASK = bucket_cnt - 1;
+	hashp->HIGH_MASK = (bucket_cnt << 1) - 1;
+	hashp->HDRPAGES = ((MAX(sizeof(HASHHDR), MINHDRSIZE) - 1) >>
+	    hashp->BSHIFT) + 1;
+
+	/* Segment count, likewise rounded to a power of two. */
+	seg_cnt = (bucket_cnt - 1) / hashp->SGSIZE + 1;
+	seg_cnt = 1 << __log2(seg_cnt);
+
+	/* The directory must hold at least that many segments. */
+	if (hashp->DSIZE < seg_cnt)
+		hashp->DSIZE = seg_cnt;
+	return (alloc_segs(hashp, seg_cnt));
+}
+
+/********************** DESTROY/CLOSE ROUTINES ************************/
+
+/*
+ * hdestroy -- flushes any changes to the file if necessary and destroys
+ * the hashp structure, freeing all allocated space.
+ *
+ * Tear-down always runs to completion; the first error seen is
+ * remembered and reported at the end.
+ *
+ * Returns:
+ *	SUCCESS, or ERROR with errno set to the first failure recorded.
+ */
+static int
+hdestroy(hashp)
+	HTAB *hashp;
+{
+	int i, save_errno;
+
+	save_errno = 0;
+
+#ifdef HASH_STATISTICS
+	/* The four counters are plain ints, hence %d rather than %ld. */
+	(void)fprintf(stderr, "hdestroy: accesses %d collisions %d\n",
+	    hash_accesses, hash_collisions);
+	(void)fprintf(stderr, "hdestroy: expansions %d\n",
+	    hash_expansions);
+	(void)fprintf(stderr, "hdestroy: overflows %d\n",
+	    hash_overflows);
+	/*
+	 * NOTE(review): %ld below assumes NKEYS is a long; its declaration
+	 * is not visible here -- confirm against hash.h.
+	 */
+	(void)fprintf(stderr, "keys %ld maxp %d segmentcount %d\n",
+	    hashp->NKEYS, hashp->MAX_BUCKET, hashp->nsegs);
+
+	for (i = 0; i < NCACHED; i++)
+		(void)fprintf(stderr,
+		    "spares[%d] = %d\n", i, hashp->SPARES[i]);
+#endif
+	/*
+	 * Call on buffer manager to free buffers, and if required,
+	 * write them to disk.
+	 */
+	if (__buf_free(hashp, 1, hashp->save_file))
+		save_errno = errno;
+	if (hashp->dir) {
+		free(*hashp->dir);	/* Free initial segments */
+		/* Free extra segments */
+		while (hashp->exsegs--)
+			free(hashp->dir[--hashp->nsegs]);
+		free(hashp->dir);
+	}
+	if (flush_meta(hashp) && !save_errno)
+		save_errno = errno;
+	/* Free bitmaps; free(NULL) is a no-op, so empty slots need no test. */
+	for (i = 0; i < hashp->nmaps; i++)
+		free(hashp->mapp[i]);
+
+	if (hashp->fp != -1)
+		(void)close(hashp->fp);
+
+	free(hashp);
+
+	if (save_errno) {
+		errno = save_errno;
+		return (ERROR);
+	}
+	return (SUCCESS);
+}
+/*
+ * hash_sync -- DB sync method: write modified pages to disk.
+ *
+ * Only a flags value of 0 is accepted.  Nothing is written for a table
+ * that is not being saved to a file.
+ *
+ * Returns:
+ *	 0 == OK
+ *	-1 ERROR
+ */
+static int
+hash_sync(dbp, flags)
+	const DB *dbp;
+	u_int32_t flags;
+{
+	HTAB *table;
+
+	if (flags != 0) {
+		errno = EINVAL;
+		return (ERROR);
+	}
+	if (dbp == NULL)
+		return (ERROR);
+
+	table = (HTAB *)dbp->internal;
+	if (table->save_file) {
+		if (__buf_free(table, 0, 1) != 0 || flush_meta(table) != 0)
+			return (ERROR);
+		/* Once flushed, the table no longer counts as new. */
+		table->new_file = 0;
+	}
+	return (0);
+}
+
+/*
+ * flush_meta -- write the table header to the start of the file, then
+ * hand every non-NULL bitmap in mapp to __put_page.  Does nothing for a
+ * table that is not being saved.
+ *
+ * Returns:
+ *	 0 == OK
+ *	-1 indicates that errno should be set
+ */
+static int
+flush_meta(hashp)
+	HTAB *hashp;
+{
+#if BYTE_ORDER == LITTLE_ENDIAN
+	HASHHDR swapped;
+#endif
+	HASHHDR *out;
+	int fd, map, nwritten;
+
+	if (!hashp->save_file)
+		return (0);
+
+	/* Stamp the identifying fields before the header goes out. */
+	hashp->MAGIC = HASHMAGIC;
+	hashp->VERSION = HASHVERSION;
+	hashp->H_CHARKEY = hashp->hash(CHARKEY, sizeof(CHARKEY));
+
+	fd = hashp->fp;
+#if BYTE_ORDER == LITTLE_ENDIAN
+	/* Write a byte-swapped copy; the in-core header stays as it is. */
+	out = &swapped;
+	swap_header_copy(&hashp->hdr, out);
+#else
+	out = &hashp->hdr;
+#endif
+
+	if (lseek(fd, (off_t)0, SEEK_SET) == -1)
+		return (-1);
+	nwritten = write(fd, out, sizeof(HASHHDR));
+	if (nwritten == -1)
+		return (-1);
+	if (nwritten != sizeof(HASHHDR)) {
+		/* Short write of the header. */
+		errno = EFTYPE;
+		hashp->errno = errno;
+		return (-1);
+	}
+
+	for (map = 0; map < NCACHED; map++) {
+		if (!hashp->mapp[map])
+			continue;
+		if (__put_page(hashp, (char *)hashp->mapp[map],
+		    hashp->BITMAPS[map], 0, 1))
+			return (-1);
+	}
+	return (0);
+}
+
+/*******************************SEARCH ROUTINES *****************************/
+/*
+ * hash_get -- DB get method: look up `key' and describe its data in
+ * `data'.  Any non-zero flag is rejected with EINVAL.
+ *
+ * All the access routines return:
+ *	 0 on SUCCESS
+ *	 1 to indicate an external ERROR (i.e. key not found, etc)
+ *	-1 to indicate an internal ERROR (i.e. out of memory, etc)
+ */
+static int
+hash_get(dbp, key, data, flag)
+	const DB *dbp;
+	const DBT *key;
+	DBT *data;
+	u_int32_t flag;
+{
+	HTAB *table;
+
+	table = (HTAB *)dbp->internal;
+	if (flag == 0)
+		return (hash_access(table, HASH_GET, (DBT *)key, data));
+
+	table->errno = errno = EINVAL;
+	return (ERROR);
+}
+
+/*
+ * hash_put -- DB put method: store `data' under `key'.
+ *
+ * `flag' must be 0 or R_NOOVERWRITE (else EINVAL); a table opened
+ * O_RDONLY is refused with EPERM.  R_NOOVERWRITE selects HASH_PUTNEW,
+ * anything else HASH_PUT.
+ *
+ * Returns ERROR on a rejected request, otherwise the hash_access result.
+ */
+static int
+hash_put(dbp, key, data, flag)
+	const DB *dbp;
+	DBT *key;
+	const DBT *data;
+	u_int32_t flag;
+{
+	HTAB *table;
+	ACTION op;
+
+	table = (HTAB *)dbp->internal;
+	if (flag != 0 && flag != R_NOOVERWRITE) {
+		table->errno = errno = EINVAL;
+		return (ERROR);
+	}
+	if ((table->flags & O_ACCMODE) == O_RDONLY) {
+		table->errno = errno = EPERM;
+		return (ERROR);
+	}
+
+	if (flag == R_NOOVERWRITE)
+		op = HASH_PUTNEW;
+	else
+		op = HASH_PUT;
+	return (hash_access(table, op, (DBT *)key, (DBT *)data));
+}
+
+/*
+ * hash_delete -- DB del method: remove the pair stored under `key'.
+ *
+ * `flag' must be 0 or R_CURSOR (else EINVAL); beyond that check its
+ * value makes no difference.  A table opened O_RDONLY is refused with
+ * EPERM.
+ *
+ * Returns ERROR on a rejected request, otherwise the hash_access result.
+ */
+static int
+hash_delete(dbp, key, flag)
+	const DB *dbp;
+	const DBT *key;
+	u_int32_t flag;		/* 0 or R_CURSOR */
+{
+	HTAB *table;
+
+	table = (HTAB *)dbp->internal;
+	if (flag != 0 && flag != R_CURSOR) {
+		table->errno = errno = EINVAL;
+		return (ERROR);
+	}
+	if ((table->flags & O_ACCMODE) == O_RDONLY) {
+		table->errno = errno = EPERM;
+		return (ERROR);
+	}
+	return (hash_access(table, HASH_DELETE, (DBT *)key, NULL));
+}
+
+/*
+ * hash_access -- common worker behind hash_get, hash_put and hash_delete.
+ * Assume that hashp has been set in wrapper routine.
+ *
+ * Walks the bucket chain that `key' hashes to looking for the key, then
+ * carries out `action'.  The first buffer of the chain is pinned
+ * (BUF_PIN) while the chain is in use and is unpinned again on every
+ * return that follows the pin.
+ *
+ * Returns:
+ *	SUCCESS		the action was carried out
+ *	ABNORMAL	key absent (HASH_GET, HASH_DELETE) or key already
+ *			present (HASH_PUTNEW)
+ *	ERROR		a helper routine failed
+ */
+static int
+hash_access(hashp, action, key, val)
+	HTAB *hashp;
+	ACTION action;
+	DBT *key, *val;
+{
+	register BUFHEAD *rbufp;
+	BUFHEAD *bufp, *save_bufp;
+	register u_int16_t *bp;
+	register int n, ndx, off, size;
+	register char *kp;
+	u_int16_t pageno;
+
+#ifdef HASH_STATISTICS
+	hash_accesses++;
+#endif
+
+	off = hashp->BSIZE;
+	size = key->size;
+	kp = (char *)key->data;
+	rbufp = __get_buf(hashp, __call_hash(hashp, kp, size), NULL, 0);
+	if (!rbufp)
+		return (ERROR);
+	save_bufp = rbufp;
+
+	/* Pin the bucket chain */
+	rbufp->flags |= BUF_PIN;
+	/*
+	 * bp[0] of a page is its entry count; entries follow in pairs.
+	 * `off' tracks the end of the key currently being examined so that
+	 * the key length can be computed as off - *bp.
+	 */
+	for (bp = (u_int16_t *)rbufp->page, n = *bp++, ndx = 1; ndx < n;)
+		if (bp[1] >= REAL_KEY) {
+			/* Real key/data pair */
+			if (size == off - *bp &&
+			    memcmp(kp, rbufp->page + *bp, size) == 0)
+				goto found;
+			off = bp[1];
+#ifdef HASH_STATISTICS
+			hash_collisions++;
+#endif
+			bp += 2;
+			ndx += 2;
+		} else if (bp[1] == OVFLPAGE) {
+			/* Continue the scan on the overflow page. */
+			rbufp = __get_buf(hashp, *bp, rbufp, 0);
+			if (!rbufp) {
+				save_bufp->flags &= ~BUF_PIN;
+				return (ERROR);
+			}
+			/* FOR LOOP INIT */
+			bp = (u_int16_t *)rbufp->page;
+			n = *bp++;
+			ndx = 1;
+			off = hashp->BSIZE;
+		} else if (bp[1] < REAL_KEY) {
+			/* Big key/data pair: let the big-pair code compare. */
+			if ((ndx =
+			    __find_bigpair(hashp, rbufp, ndx, kp, size)) > 0)
+				goto found;
+			if (ndx == -2) {
+				bufp = rbufp;
+				if (!(pageno =
+				    __find_last_page(hashp, &bufp))) {
+					ndx = 0;
+					rbufp = bufp;
+					break;	/* FOR */
+				}
+				rbufp = __get_buf(hashp, pageno, bufp, 0);
+				if (!rbufp) {
+					save_bufp->flags &= ~BUF_PIN;
+					return (ERROR);
+				}
+				/* FOR LOOP INIT */
+				bp = (u_int16_t *)rbufp->page;
+				n = *bp++;
+				ndx = 1;
+				off = hashp->BSIZE;
+			} else {
+				save_bufp->flags &= ~BUF_PIN;
+				return (ERROR);
+			}
+		}
+
+	/* Not found */
+	switch (action) {
+	case HASH_PUT:
+	case HASH_PUTNEW:
+		if (__addel(hashp, rbufp, key, val)) {
+			save_bufp->flags &= ~BUF_PIN;
+			return (ERROR);
+		} else {
+			save_bufp->flags &= ~BUF_PIN;
+			return (SUCCESS);
+		}
+	case HASH_GET:
+	case HASH_DELETE:
+	default:
+		save_bufp->flags &= ~BUF_PIN;
+		return (ABNORMAL);
+	}
+
+found:
+	switch (action) {
+	case HASH_PUTNEW:
+		save_bufp->flags &= ~BUF_PIN;
+		return (ABNORMAL);
+	case HASH_GET:
+		bp = (u_int16_t *)rbufp->page;
+		if (bp[ndx + 1] < REAL_KEY) {
+			if (__big_return(hashp, rbufp, ndx, val, 0)) {
+				/* Do not leave the chain head pinned. */
+				save_bufp->flags &= ~BUF_PIN;
+				return (ERROR);
+			}
+		} else {
+			val->data = (u_char *)rbufp->page + (int)bp[ndx + 1];
+			val->size = bp[ndx] - bp[ndx + 1];
+		}
+		break;
+	case HASH_PUT:
+		/* Replace: drop the old pair, then add the new one. */
+		if ((__delpair(hashp, rbufp, ndx)) ||
+		    (__addel(hashp, rbufp, key, val))) {
+			save_bufp->flags &= ~BUF_PIN;
+			return (ERROR);
+		}
+		break;
+	case HASH_DELETE:
+		if (__delpair(hashp, rbufp, ndx)) {
+			/* Do not leave the chain head pinned. */
+			save_bufp->flags &= ~BUF_PIN;
+			return (ERROR);
+		}
+		break;
+	default:
+		abort();
+	}
+	save_bufp->flags &= ~BUF_PIN;
+	return (SUCCESS);
+}
+
+static int	/* SUCCESS, ABNORMAL when the scan is exhausted, ERROR on failure */
+hash_seq(dbp, key, data, flag)	/* DB seq method: return the next pair of a sequential scan */
+	const DB *dbp;
+	DBT *key, *data;	/* filled in with the pair found */
+	u_int32_t flag;	/* 0, R_FIRST or R_NEXT */
+{
+	register u_int32_t bucket;
+	register BUFHEAD *bufp;
+	HTAB *hashp;
+	u_int16_t *bp, ndx;
+
+	hashp = (HTAB *)dbp->internal;
+	if (flag && flag != R_FIRST && flag != R_NEXT) {
+		hashp->errno = errno = EINVAL;
+		return (ERROR);
+	}
+#ifdef HASH_STATISTICS
+	hash_accesses++;
+#endif
+	if ((hashp->cbucket < 0) || (flag == R_FIRST)) {	/* (re)start the scan at bucket 0 */
+		hashp->cbucket = 0;
+		hashp->cndx = 1;
+		hashp->cpage = NULL;
+	}
+
+	for (bp = NULL; !bp || !bp[0]; ) {	/* until bp is a page with a non-zero bp[0] */
+		if (!(bufp = hashp->cpage)) {	/* no current page: find the next bucket with entries */
+			for (bucket = hashp->cbucket;
+			    bucket <= hashp->MAX_BUCKET;
+			    bucket++, hashp->cndx = 1) {
+				bufp = __get_buf(hashp, bucket, NULL, 0);
+				if (!bufp)
+					return (ERROR);
+				hashp->cpage = bufp;
+				bp = (u_int16_t *)bufp->page;
+				if (bp[0])
+					break;
+			}
+			hashp->cbucket = bucket;
+			if (hashp->cbucket > hashp->MAX_BUCKET) {	/* ran past the last bucket */
+				hashp->cbucket = -1;	/* the next call starts over */
+				return (ABNORMAL);
+			}
+		} else
+			bp = (u_int16_t *)hashp->cpage->page;
+
+#ifdef DEBUG
+		assert(bp);
+		assert(bufp);
+#endif
+		while (bp[hashp->cndx + 1] == OVFLPAGE) {	/* current entry points at an overflow page */
+			bufp = hashp->cpage =
+			    __get_buf(hashp, bp[hashp->cndx], bufp, 0);
+			if (!bufp)
+				return (ERROR);
+			bp = (u_int16_t *)(bufp->page);
+			hashp->cndx = 1;
+		}
+		if (!bp[0]) {	/* empty page: go on to the next bucket */
+			hashp->cpage = NULL;
+			++hashp->cbucket;
+		}
+	}
+	ndx = hashp->cndx;
+	if (bp[ndx + 1] < REAL_KEY) {	/* big pair: this routine leaves the cursor untouched here */
+		if (__big_keydata(hashp, bufp, key, data, 1))
+			return (ERROR);
+	} else {
+		key->data = (u_char *)hashp->cpage->page + bp[ndx];
+		key->size = (ndx > 1 ? bp[ndx - 1] : hashp->BSIZE) - bp[ndx];	/* key ends at the previous offset, or at BSIZE for the first pair */
+		data->data = (u_char *)hashp->cpage->page + bp[ndx + 1];
+		data->size = bp[ndx] - bp[ndx + 1];
+		ndx += 2;	/* step the cursor past this pair */
+		if (ndx > bp[0]) {	/* page used up: the next call starts on the next bucket */
+			hashp->cpage = NULL;
+			hashp->cbucket++;
+			hashp->cndx = 1;
+		} else
+			hashp->cndx = ndx;
+	}
+	return (SUCCESS);
+}
+
+/********************************* UTILITIES ************************/
+
+/*
+ * __expand_table -- add one bucket to the table, growing the segment
+ * directory and allocating a new segment when the new bucket needs
+ * them, then split the matching old bucket into the new one.
+ *
+ * Returns:
+ *	 0 ==> OK
+ *	-1 ==> Error
+ */
+extern int
+__expand_table(hashp)
+	HTAB *hashp;
+{
+	u_int32_t old_bucket, new_bucket;
+	int dirsize, new_segnum, spare_ndx;
+
+#ifdef HASH_STATISTICS
+	hash_expansions++;
+#endif
+	new_bucket = ++hashp->MAX_BUCKET;
+	old_bucket = (hashp->MAX_BUCKET & hashp->LOW_MASK);
+
+	new_segnum = new_bucket >> hashp->SSHIFT;
+
+	/* Check if we need a new segment */
+	if (new_segnum >= hashp->nsegs) {
+		/* Check if we need to expand directory */
+		if (new_segnum >= hashp->DSIZE) {
+			/* Reallocate directory; dirsize is in bytes. */
+			dirsize = hashp->DSIZE * sizeof(SEGMENT *);
+			if (!hash_realloc(&hashp->dir, dirsize, dirsize << 1))
+				return (-1);
+			/*
+			 * DSIZE counts directory entries, not bytes, so
+			 * double the entry count.  Storing the byte count
+			 * here would overstate the directory's capacity
+			 * and defeat the growth check above next time.
+			 */
+			hashp->DSIZE <<= 1;
+		}
+		if ((hashp->dir[new_segnum] =
+		    (SEGMENT)calloc(hashp->SGSIZE, sizeof(SEGMENT))) == NULL)
+			return (-1);
+		hashp->exsegs++;
+		hashp->nsegs++;
+	}
+	/*
+	 * If the split point is increasing (MAX_BUCKET's log base 2
+	 * increases), we need to copy the current contents of the spare
+	 * split bucket to the next bucket.
+	 */
+	spare_ndx = __log2(hashp->MAX_BUCKET + 1);
+	if (spare_ndx > hashp->OVFL_POINT) {
+		hashp->SPARES[spare_ndx] = hashp->SPARES[hashp->OVFL_POINT];
+		hashp->OVFL_POINT = spare_ndx;
+	}
+
+	if (new_bucket > hashp->HIGH_MASK) {
+		/* Starting a new doubling */
+		hashp->LOW_MASK = hashp->HIGH_MASK;
+		hashp->HIGH_MASK = new_bucket | hashp->LOW_MASK;
+	}
+	/* Relocate records to the new bucket */
+	return (__split_page(hashp, old_bucket, new_bucket));
+}
+
+/*
+ * hash_realloc -- replace the block at *p_ptr by a new one of `newsize'
+ * bytes holding the first `oldsize' bytes of the old block followed by
+ * zeroes.  When the allocation fails the old block and *p_ptr are left
+ * untouched.
+ *
+ * If realloc guarantees that the pointer is not destroyed if the realloc
+ * fails, then this routine can go away.
+ *
+ * Returns the new block, or NULL when malloc fails.
+ */
+static void *
+hash_realloc(p_ptr, oldsize, newsize)
+	SEGMENT **p_ptr;
+	int oldsize, newsize;
+{
+	register void *fresh;
+
+	fresh = malloc(newsize);
+	if (fresh == NULL)
+		return (NULL);
+
+	memmove(fresh, *p_ptr, oldsize);
+	memset((char *)fresh + oldsize, 0, newsize - oldsize);
+	free(*p_ptr);
+	*p_ptr = fresh;
+	return (fresh);
+}
+
+/*
+ * __call_hash -- map a key to its bucket number: hash the key, mask
+ * with HIGH_MASK, and fall back to LOW_MASK when the result lies beyond
+ * MAX_BUCKET.
+ */
+extern u_int32_t
+__call_hash(hashp, k, len)
+	HTAB *hashp;
+	char *k;
+	int len;
+{
+	int hashval, slot;
+
+	hashval = hashp->hash(k, len);
+	slot = hashval & hashp->HIGH_MASK;
+	if (slot > hashp->MAX_BUCKET)
+		slot &= hashp->LOW_MASK;
+	return (slot);
+}
+
+/*
+ * alloc_segs -- allocate the segment directory (DSIZE entries) and one
+ * contiguous block holding `nsegs' segments, and point the first
+ * `nsegs' directory entries into that block.  On error, destroy the
+ * table and set errno.
+ *
+ * Returns 0 on success
+ */
+static int
+alloc_segs(hashp, nsegs)
+	HTAB *hashp;
+	int nsegs;
+{
+	register int seg;
+	register SEGMENT block;
+	int saved;
+
+	hashp->dir = (SEGMENT *)calloc(hashp->DSIZE, sizeof(SEGMENT *));
+	if (hashp->dir == NULL)
+		goto nomem;
+
+	/* Allocate segments */
+	block = (SEGMENT)calloc(nsegs << hashp->SSHIFT, sizeof(SEGMENT));
+	if (block == NULL)
+		goto nomem;
+
+	for (seg = 0; seg < nsegs; seg++) {
+		hashp->dir[seg] = &block[seg << hashp->SSHIFT];
+		hashp->nsegs++;
+	}
+	return (0);
+
+nomem:
+	/* hdestroy may change errno; report the allocation failure. */
+	saved = errno;
+	(void)hdestroy(hashp);
+	errno = saved;
+	return (-1);
+}
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+/*
+ * swap_header_copy -- store a byte-swapped copy of the header *srcp in
+ * *destp, field by field; *srcp itself is not modified here.
+ */
+static void
+swap_header_copy(srcp, destp)
+	HASHHDR *srcp, *destp;
+{
+	int slot;
+
+	/* Scalar fields, all 32 bits wide. */
+	P_32_COPY(srcp->magic, destp->magic);
+	P_32_COPY(srcp->version, destp->version);
+	P_32_COPY(srcp->lorder, destp->lorder);
+	P_32_COPY(srcp->bsize, destp->bsize);
+	P_32_COPY(srcp->bshift, destp->bshift);
+	P_32_COPY(srcp->dsize, destp->dsize);
+	P_32_COPY(srcp->ssize, destp->ssize);
+	P_32_COPY(srcp->sshift, destp->sshift);
+	P_32_COPY(srcp->ovfl_point, destp->ovfl_point);
+	P_32_COPY(srcp->last_freed, destp->last_freed);
+	P_32_COPY(srcp->max_bucket, destp->max_bucket);
+	P_32_COPY(srcp->high_mask, destp->high_mask);
+	P_32_COPY(srcp->low_mask, destp->low_mask);
+	P_32_COPY(srcp->ffactor, destp->ffactor);
+	P_32_COPY(srcp->nkeys, destp->nkeys);
+	P_32_COPY(srcp->hdrpages, destp->hdrpages);
+	P_32_COPY(srcp->h_charkey, destp->h_charkey);
+
+	/* Per-slot arrays: 32 bit spares, then 16 bit bitmap addresses. */
+	for (slot = 0; slot < NCACHED; slot++) {
+		P_32_COPY(srcp->spares[slot], destp->spares[slot]);
+	}
+	for (slot = 0; slot < NCACHED; slot++) {
+		P_16_COPY(srcp->bitmaps[slot], destp->bitmaps[slot]);
+	}
+}
+
+/*
+ * swap_header -- byte-swap every field of the table's in-core header
+ * in place.
+ */
+static void
+swap_header(hashp)
+	HTAB *hashp;
+{
+	HASHHDR *hdr;
+	int slot;
+
+	hdr = &hashp->hdr;
+
+	/* Scalar fields, all 32 bits wide. */
+	M_32_SWAP(hdr->magic);
+	M_32_SWAP(hdr->version);
+	M_32_SWAP(hdr->lorder);
+	M_32_SWAP(hdr->bsize);
+	M_32_SWAP(hdr->bshift);
+	M_32_SWAP(hdr->dsize);
+	M_32_SWAP(hdr->ssize);
+	M_32_SWAP(hdr->sshift);
+	M_32_SWAP(hdr->ovfl_point);
+	M_32_SWAP(hdr->last_freed);
+	M_32_SWAP(hdr->max_bucket);
+	M_32_SWAP(hdr->high_mask);
+	M_32_SWAP(hdr->low_mask);
+	M_32_SWAP(hdr->ffactor);
+	M_32_SWAP(hdr->nkeys);
+	M_32_SWAP(hdr->hdrpages);
+	M_32_SWAP(hdr->h_charkey);
+
+	/* Per-slot arrays: 32 bit spares, then 16 bit bitmap addresses. */
+	for (slot = 0; slot < NCACHED; slot++) {
+		M_32_SWAP(hdr->spares[slot]);
+	}
+	for (slot = 0; slot < NCACHED; slot++) {
+		M_16_SWAP(hdr->bitmaps[slot]);
+	}
+}
+#endif
diff --git a/db/hash/hash.h b/db/hash/hash.h
new file mode 100644
index 0000000000..913e82b400
--- /dev/null
+++ b/db/hash/hash.h
@@ -0,0 +1,293 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)hash.h	8.3 (Berkeley) 5/31/94
+ */
+
+/* Operations: request codes carried around as type ACTION */
+typedef enum {
+	HASH_GET, HASH_PUT, HASH_PUTNEW, HASH_DELETE, HASH_FIRST, HASH_NEXT
+} ACTION;
+
+/* Buffer Management structures: a BUFHEAD carries one page plus its links */
+typedef struct _bufhead BUFHEAD;
+
+struct _bufhead {
+	BUFHEAD		*prev;		/* LRU links */
+	BUFHEAD		*next;		/* LRU links */
+	BUFHEAD		*ovfl;		/* Overflow page buffer header */
+	u_int32_t	 addr;		/* Address of this page */
+	char		*page;		/* Actual page data */
+	char	 	flags;		/* Bitwise OR of the BUF_* values */
+#define	BUF_MOD		0x0001		/* set by code that alters the page */
+#define BUF_DISK	0x0002
+#define	BUF_BUCKET	0x0004		/* the bit IS_BUCKET() tests */
+#define	BUF_PIN		0x0008
+};
+
+#define IS_BUCKET(X)	((X) & BUF_BUCKET)	/* X: presumably BUFHEAD.flags */
+
+typedef BUFHEAD **SEGMENT;	/* HTAB.dir is declared as SEGMENT * */
+
+/* Hash Table Information; swap_header{,_copy}() name each hdr field in turn */
+typedef struct hashhdr {		/* Disk resident portion */
+	int		magic;		/* Magic NO for hash tables */
+	int		version;	/* Version ID */
+	u_int32_t	lorder;		/* Byte Order */
+	int		bsize;		/* Bucket/Page Size */
+	int		bshift;		/* Bucket shift */
+	int		dsize;		/* Directory Size */
+	int		ssize;		/* Segment Size */
+	int		sshift;		/* Segment shift */
+	int		ovfl_point;	/* Where overflow pages are being
+					 * allocated */
+	int		last_freed;	/* Last overflow page freed */
+	int		max_bucket;	/* ID of Maximum bucket in use */
+	int		high_mask;	/* Mask to modulo into entire table */
+	int		low_mask;	/* Mask to modulo into lower half of
+					 * table */
+	int		ffactor;	/* Fill factor */
+	int		nkeys;		/* Number of keys in hash table */
+	int		hdrpages;	/* Size of table header */
+	int		h_charkey;	/* value of hash(CHARKEY) */
+#define NCACHED	32			/* number of bit maps and spare
+					 * points */
+	int		spares[NCACHED];/* spare pages for overflow */
+	u_int16_t	bitmaps[NCACHED];	/* address of overflow page
+						 * bitmaps */
+} HASHHDR;
+
+typedef struct htab	 {		/* Memory resident data structure */
+	HASHHDR 	hdr;		/* Header */
+	int		nsegs;		/* Number of allocated segments */
+	int		exsegs;		/* Number of extra allocated 
+					 * segments */
+	u_int32_t			/* Hash function */
+	    (*hash)__P((const void *, size_t));
+	int		flags;		/* Flag values */
+	int		fp;		/* File pointer (held as an int) */
+	char		*tmp_buf;	/* Temporary Buffer for BIG data */
+	char		*tmp_key;	/* Temporary Buffer for BIG keys */
+	BUFHEAD 	*cpage;		/* Current page */
+	int		cbucket;	/* Current bucket */
+	int		cndx;		/* Index of next item on cpage */
+	int		errno;		/* Error Number -- for DBM compatibility;
+					 * NOTE(review): breaks if errno is a macro */
+	int		new_file;	/* Indicates if fd is backing store 
+					 * or no */
+	int		save_file;	/* Indicates whether we need to flush 
+					 * file at
+					 * exit */
+	u_int32_t	*mapp[NCACHED];	/* Pointers to page maps */
+	int		nmaps;		/* Initial number of bitmaps */
+	int		nbufs;		/* Number of buffers left to 
+					 * allocate */
+	BUFHEAD 	bufhead;	/* Header of buffer lru list */
+	SEGMENT 	*dir;		/* Hash Bucket directory */
+} HTAB;
+
+/*
+ * Constants: size limits, defaults and bitmap arithmetic helpers
+ */
+#define	MAX_BSIZE		65536		/* 2^16 */
+#define MIN_BUFFERS		6
+#define MINHDRSIZE		512
+#define DEF_BUFSIZE		65536		/* 64 K */
+#define DEF_BUCKET_SIZE		4096
+#define DEF_BUCKET_SHIFT	12		/* log2(BUCKET) */
+#define DEF_SEGSIZE		256
+#define DEF_SEGSIZE_SHIFT	8		/* log2(SEGSIZE)	 */
+#define DEF_DIRSIZE		256
+#define DEF_FFACTOR		65536
+#define MIN_FFACTOR		4
+#define SPLTMAX			8
+#define CHARKEY			"%$sniglet^&"	/* see HASHHDR.h_charkey */
+#define NUMKEY			1038583
+#define BYTE_SHIFT		3	/* presumably log2(bits per byte) */
+#define INT_TO_BYTE		2	/* presumably log2(bytes per int) */
+#define INT_BYTE_SHIFT		5	/* presumably log2(BITS_PER_MAP) */
+#define ALL_SET			((u_int32_t)0xFFFFFFFF)	/* all 32 bits on */
+#define ALL_CLEAR		0
+
+#define PTROF(X)	((BUFHEAD *)((ptrdiff_t)(X)&~0x3))	/* drop both tag bits */
+#define ISMOD(X)	((u_int32_t)(ptrdiff_t)(X)&0x1)	/* tag bit 0x1 set? */
+#define DOMOD(X)	((X) = (char *)((ptrdiff_t)(X)|0x1))	/* set tag bit 0x1 */
+#define ISDISK(X)	((u_int32_t)(ptrdiff_t)(X)&0x2)	/* tag bit 0x2 set? */
+#define DODISK(X)	((X) = (char *)((ptrdiff_t)(X)|0x2))	/* set tag bit 0x2 */
+
+#define BITS_PER_MAP	32
+
+/* Clear/set/test bit N of the map at A; unsigned 1 as N%BITS_PER_MAP can be 31 */
+#define CLRBIT(A, N)	((A)[(N)/BITS_PER_MAP] &= ~((u_int32_t)1<<((N)%BITS_PER_MAP)))
+#define SETBIT(A, N)	((A)[(N)/BITS_PER_MAP] |= ((u_int32_t)1<<((N)%BITS_PER_MAP)))
+#define ISSET(A, N)	((A)[(N)/BITS_PER_MAP] & ((u_int32_t)1<<((N)%BITS_PER_MAP)))
+
+/* Overflow management */
+/*
+ * Overflow page numbers are allocated per split point.  At each doubling of
+ * the table, we can allocate extra pages.  So, in an overflow page number
+ * the top 5 bits give the split point and the lower 11 bits give the page
+ * at that split point (pages within split points are numbered starting
+ * with 1).
+ */
+
+#define SPLITSHIFT	11
+#define SPLITMASK	0x7FF
+#define SPLITNUM(N)	(((u_int32_t)(N)) >> SPLITSHIFT)
+#define OPAGENUM(N)	((N) & SPLITMASK)
+#define	OADDR_OF(S,O)	((u_int32_t)((u_int32_t)(S) << SPLITSHIFT) + (O))
+/* Both macros below are single parenthesized expressions using `hashp'. */
+#define BUCKET_TO_PAGE(B) \
+	((B) + hashp->HDRPAGES + ((B) ? hashp->SPARES[__log2((B)+1)-1] : 0))
+#define OADDR_TO_PAGE(B) 	\
+	(BUCKET_TO_PAGE ( (1 << SPLITNUM((B))) -1 ) + OPAGENUM((B)))
+
+/*
+ * page.h contains a detailed description of the page format.
+ *
+ * Normally, keys and data are accessed from offset tables in the top of
+ * each page which point to the beginning of the key and data.  There are
+ * four flag values which may be stored in these offset tables which indicate
+ * the following:
+ *
+ *
+ * OVFLPAGE	Rather than a key data pair, this pair contains
+ *		the address of an overflow page.  The format of
+ *		the pair is:
+ *		    OVERFLOW_PAGE_NUMBER OVFLPAGE
+ *
+ * PARTIAL_KEY	This must be the first key/data pair on a page
+ *		and implies that page contains only a partial key.
+ *		That is, the key is too big to fit on a single page
+ *		so it starts on this page and continues on the next.
+ *		The format of the page is:
+ *		    KEY_OFF PARTIAL_KEY OVFL_PAGENO OVFLPAGE
+ *		
+ *		    KEY_OFF -- offset of the beginning of the key
+ *		    PARTIAL_KEY -- 1
+ *		    OVFL_PAGENO - page number of the next overflow page
+ *		    OVFLPAGE -- 0
+ *
+ * FULL_KEY	This must be the first key/data pair on the page.  It
+ *		is used in two cases.
+ *
+ *		Case 1:
+ *		    There is a complete key on the page but no data
+ *		    (because it wouldn't fit).  The next page contains
+ *		    the data.
+ *
+ *		    Page format is:
+ *		    KEY_OFF FULL_KEY OVFL_PAGENO OVFLPAGE
+ *
+ *		    KEY_OFF -- offset of the beginning of the key
+ *		    FULL_KEY -- 2
+ *		    OVFL_PAGENO - page number of the next overflow page
+ *		    OVFLPAGE -- 0
+ *
+ *		Case 2:
+ *		    This page contains no key, but part of a large
+ *		    data field, which is continued on the next page.
+ *
+ *		    Page format is:
+ *		    DATA_OFF FULL_KEY OVFL_PAGENO OVFLPAGE
+ *
+ *		    DATA_OFF -- offset of the beginning of the data on
+ *				this page
+ *		    FULL_KEY -- 2
+ *		    OVFL_PAGENO - page number of the next overflow page
+ *		    OVFLPAGE -- 0
+ *
+ * FULL_KEY_DATA 
+ *		This must be the first key/data pair on the page.
+ *		There are two cases:
+ *
+ *		Case 1:
+ *		    This page contains a key and the beginning of the
+ *		    data field, but the data field is continued on the
+ *		    next page.
+ *
+ *		    Page format is:
+ *		    KEY_OFF FULL_KEY_DATA OVFL_PAGENO DATA_OFF
+ *
+ *		    KEY_OFF -- offset of the beginning of the key
+ *		    FULL_KEY_DATA -- 3
+ *		    OVFL_PAGENO - page number of the next overflow page
+ *		    DATA_OFF -- offset of the beginning of the data
+ *
+ *		Case 2:
+ *		    This page contains the last page of a big data pair.
+ *		    There is no key, only the  tail end of the data
+ *		    on this page.
+ *
+ *		    Page format is:
+ *		    DATA_OFF FULL_KEY_DATA <OVFL_PAGENO> <OVFLPAGE>
+ *
+ *		    DATA_OFF -- offset of the beginning of the data on
+ *				this page
+ *		    FULL_KEY_DATA -- 3
+ *		    OVFL_PAGENO - page number of the next overflow page
+ *		    OVFLPAGE -- 0
+ *
+ *		    OVFL_PAGENO and OVFLPAGE are optional (they are
+ *		    not present if there is no next page).
+ */
+
+#define OVFLPAGE	0	/* pair holds an overflow page address */
+#define PARTIAL_KEY	1	/* key continues on the next page */
+#define FULL_KEY	2	/* key complete, or a middle chunk of data */
+#define FULL_KEY_DATA	3	/* key plus start of data, or tail of data */
+#define	REAL_KEY	4	/* presumably first non-tag value -- TODO confirm */
+
+/* Short hands for HTAB.hdr members, written as e.g. hashp->BSIZE */
+#define BSIZE		hdr.bsize
+#define BSHIFT		hdr.bshift
+#define DSIZE		hdr.dsize
+#define SGSIZE		hdr.ssize
+#define SSHIFT		hdr.sshift
+#define LORDER		hdr.lorder
+#define OVFL_POINT	hdr.ovfl_point
+#define	LAST_FREED	hdr.last_freed
+#define MAX_BUCKET	hdr.max_bucket
+#define FFACTOR		hdr.ffactor
+#define HIGH_MASK	hdr.high_mask
+#define LOW_MASK	hdr.low_mask
+#define NKEYS		hdr.nkeys
+#define HDRPAGES	hdr.hdrpages
+#define SPARES		hdr.spares
+#define BITMAPS		hdr.bitmaps
+#define VERSION		hdr.version
+#define MAGIC		hdr.magic
+#define NEXT_FREE	hdr.next_free	/* NOTE(review): HASHHDR has no next_free */
+#define H_CHARKEY	hdr.h_charkey
diff --git a/db/hash/hash_bigkey.c b/db/hash/hash_bigkey.c
new file mode 100644
index 0000000000..578314a645
--- /dev/null
+++ b/db/hash/hash_bigkey.c
@@ -0,0 +1,667 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)hash_bigkey.c	8.3 (Berkeley) 5/31/94";
+#endif /* LIBC_SCCS and not lint */
+
+/*
+ * PACKAGE: hash
+ * DESCRIPTION:
+ *	Big key/data handling for the hashing package.
+ *
+ * ROUTINES:
+ * External
+ *	__big_keydata
+ *	__big_split
+ *	__big_insert
+ *	__big_return
+ *	__big_delete
+ *	__find_last_page, __find_bigpair
+ * Internal
+ *	collect_key
+ *	collect_data
+ */
+
+#include <sys/param.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef DEBUG
+#include <assert.h>
+#endif
+
+#include <db.h>
+#include "hash.h"
+#include "page.h"
+#include "extern.h"
+
+static int collect_key __P((HTAB *, BUFHEAD *, int, DBT *, int));
+static int collect_data __P((HTAB *, BUFHEAD *, int, int));
+
+/*
+ * __big_insert -- insert a key/data pair that is too big for a normal slot.
+ * Key bytes are laid down first, then data bytes; whenever more room is
+ * needed another page is chained on with __add_ovflpage().  A page of key
+ * bytes is tagged PARTIAL_KEY; the page on which the key ends is retagged
+ * FULL_KEY or FULL_KEY_DATA.  Data pages are tagged FULL_KEY, the last one
+ * FULL_KEY_DATA.
+ * Returns 0 ==> OK, -1 ==> ERROR (__add_ovflpage() returned NULL).
+ */
+extern int
+__big_insert(hashp, bufp, key, val)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+	const DBT *key, *val;
+{
+	register u_int16_t *p;
+	int key_size, n, val_size;
+	u_int16_t space, move_bytes, off;
+	char *cp, *key_data, *val_data;
+
+	cp = bufp->page;		/* Character pointer of p. */
+	p = (u_int16_t *)cp;
+
+	key_data = (char *)key->data;
+	key_size = key->size;		/* key bytes still to be stored */
+	val_data = (char *)val->data;
+	val_size = val->size;		/* data bytes still to be stored */
+
+	/* First move the Key */
+	for (space = FREESPACE(p) - BIGOVERHEAD; key_size;
+	    space = FREESPACE(p) - BIGOVERHEAD) {
+		move_bytes = MIN(space, key_size);
+		off = OFFSET(p) - move_bytes;
+		memmove(cp + off, key_data, move_bytes);
+		key_size -= move_bytes;
+		key_data += move_bytes;
+		n = p[0];		/* p[0]: index of last slot in use */
+		p[++n] = off;		/* slot 1 of the pair: chunk offset */
+		p[0] = ++n;
+		FREESPACE(p) = off - PAGE_META(n);
+		OFFSET(p) = off;
+		p[n] = PARTIAL_KEY;	/* slot 2 of the pair: the tag */
+		bufp = __add_ovflpage(hashp, bufp);
+		if (!bufp)
+			return (-1);
+		/* Re-read: p[n - 2] below is the tag slot written above. */
+		n = p[0];
+		if (!key_size)
+			if (FREESPACE(p)) {
+				/* Key done, room left: start the data here. */
+				move_bytes = MIN(FREESPACE(p), val_size);
+				off = OFFSET(p) - move_bytes;
+				p[n] = off;
+				memmove(cp + off, val_data, move_bytes);
+				val_data += move_bytes;
+				val_size -= move_bytes;
+				p[n - 2] = FULL_KEY_DATA;
+				FREESPACE(p) = FREESPACE(p) - move_bytes;
+				OFFSET(p) = off;
+			} else	/* pairs with the FREESPACE(p) test */
+				p[n - 2] = FULL_KEY;
+		p = (u_int16_t *)bufp->page;	/* continue on the new page */
+		cp = bufp->page;
+		bufp->flags |= BUF_MOD;
+	}
+
+	/* Now move the data */
+	for (space = FREESPACE(p) - BIGOVERHEAD; val_size;
+	    space = FREESPACE(p) - BIGOVERHEAD) {
+		move_bytes = MIN(space, val_size);
+		/*
+		 * Here's the hack to make sure that if the data ends on the
+		 * same page as the key ends, FREESPACE is at least one.
+		 */
+		if (space == val_size && val_size == val->size)
+			move_bytes--;
+		off = OFFSET(p) - move_bytes;
+		memmove(cp + off, val_data, move_bytes);
+		val_size -= move_bytes;
+		val_data += move_bytes;
+		n = p[0];
+		p[++n] = off;
+		p[0] = ++n;
+		FREESPACE(p) = off - PAGE_META(n);
+		OFFSET(p) = off;
+		if (val_size) {
+			p[n] = FULL_KEY;	/* more data follows */
+			bufp = __add_ovflpage(hashp, bufp);
+			if (!bufp)
+				return (-1);
+			cp = bufp->page;
+			p = (u_int16_t *)cp;
+		} else
+			p[n] = FULL_KEY_DATA;	/* last page of the pair */
+		bufp->flags |= BUF_MOD;
+	}
+	return (0);
+}
+
+/*
+ * Called when bufp's page contains a partial key (index should be 1).
+ *
+ * Walks the pair's page chain, handing each page left behind to
+ * __free_ovflpage(), then rewrites bufp's page as an empty page that links
+ * to whatever followed the pair.  bufp is meant to survive: the page
+ * pointing to it is lost, so its pointer cannot be removed.
+ *
+ * Returns 0 => OK, -1 => ERROR (__get_buf() returned NULL).
+ * Decrements hashp->NKEYS on success.
+ */
+extern int
+__big_delete(hashp, bufp)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+{
+	register BUFHEAD *last_bfp, *rbufp;
+	u_int16_t *bp, pageno;
+	int key_done, n;
+
+	rbufp = bufp;
+	last_bfp = NULL;
+	bp = (u_int16_t *)bufp->page;
+	pageno = 0;
+	key_done = 0;
+
+	while (!key_done || (bp[2] != FULL_KEY_DATA)) {
+		if (bp[2] == FULL_KEY || bp[2] == FULL_KEY_DATA)
+			key_done = 1;
+
+		/*
+		 * If there is freespace left on a FULL_KEY_DATA page, then
+		 * the data is short and fits entirely on this page, and this
+		 * is the last page.
+		 */
+		if (bp[2] == FULL_KEY_DATA && FREESPACE(bp))
+			break;
+		pageno = bp[bp[0] - 1];
+		rbufp->flags |= BUF_MOD;
+		rbufp = __get_buf(hashp, pageno, rbufp, 0);
+		if (last_bfp)
+			__free_ovflpage(hashp, last_bfp);
+		last_bfp = rbufp;
+		if (!rbufp)
+			return (-1);		/* Error. */
+		bp = (u_int16_t *)rbufp->page;
+	}
+
+	/*
+	 * If we get here then rbufp points to the last page of the big
+	 * key/data pair.  Bufp points to the first one -- it should now be
+	 * empty pointing to the next page after this pair.  Can't free it
+	 * because we don't have the page pointing to it.
+	 */
+
+	/* This is information from the last page of the pair. */
+	n = bp[0];
+	pageno = bp[n - 1];
+
+	/* Now, bp is the first page of the pair. */
+	bp = (u_int16_t *)bufp->page;
+	if (n > 2) {
+		/* There is an overflow page. */
+		bp[1] = pageno;
+		bp[2] = OVFLPAGE;
+		bufp->ovfl = rbufp->ovfl;
+	} else
+		/* This is the last page. */
+		bufp->ovfl = NULL;
+	n -= 2;
+	bp[0] = n;
+	FREESPACE(bp) = hashp->BSIZE - PAGE_META(n);
+	OFFSET(bp) = hashp->BSIZE - 1;
+
+	bufp->flags |= BUF_MOD;
+	if (rbufp)	/* NOTE(review): rbufp == bufp after a first-pass break */
+		__free_ovflpage(hashp, rbufp);
+	if (last_bfp != NULL && last_bfp != rbufp)  /* NULL if loop broke at once */
+		__free_ovflpage(hashp, last_bfp);
+
+	hashp->NKEYS--;
+	return (0);
+}
+/*
+ * Compare key[0..size) with the big key that starts at slot ndx of bufp,
+ * following PARTIAL_KEY chunks onto further pages via __get_buf().  Returns:
+ *  ndx (1 once another page was fetched) = key found
+ * -2 = key not found (bytes or length differ)
+ * -3 = error (__get_buf() failed)
+ */
+extern int
+__find_bigpair(hashp, bufp, ndx, key, size)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+	int ndx;
+	char *key;
+	int size;
+{
+	register u_int16_t *bp;
+	register char *p;
+	int ksize;		/* key bytes not yet compared; bounds each memcmp */
+	u_int16_t bytes;	/* length of the key chunk on the current page */
+	char *kkey;		/* next uncompared byte of the caller's key */
+
+	bp = (u_int16_t *)bufp->page;
+	p = bufp->page;
+	ksize = size;
+	kkey = key;
+
+	for (bytes = hashp->BSIZE - bp[ndx];
+	    bytes <= ksize && bp[ndx + 1] == PARTIAL_KEY;
+	    bytes = hashp->BSIZE - bp[ndx]) {
+		if (memcmp(p + bp[ndx], kkey, bytes))
+			return (-2);
+		kkey += bytes;
+		ksize -= bytes;
+		bufp = __get_buf(hashp, bp[ndx + 2], bufp, 0);
+		if (!bufp)
+			return (-3);
+		p = bufp->page;
+		bp = (u_int16_t *)p;
+		ndx = 1;
+	}
+
+	if (bytes != ksize || memcmp(p + bp[ndx], kkey, bytes)) {
+#ifdef HASH_STATISTICS
+		++hash_collisions;
+#endif
+		return (-2);
+	} else
+		return (ndx);
+}
+
+/*
+ * Given the buffer pointer of the first overflow page of a big pair,
+ * find the end of the big pair.
+ *
+ * On success *bpp is set to the buffer header of the last page of the pair
+ * and the return value is the pageno of the overflow page following it, or
+ * 0 if there isn't any (i.e. big pair is the last key in the bucket).
+ * If __get_buf() fails, 0 is returned as well and *bpp is left untouched.
+ */
+extern u_int16_t
+__find_last_page(hashp, bpp)
+	HTAB *hashp;
+	BUFHEAD **bpp;
+{
+	BUFHEAD *bufp;
+	u_int16_t *bp, pageno;
+	int n;
+
+	bufp = *bpp;
+	bp = (u_int16_t *)bufp->page;
+	for (;;) {
+		n = bp[0];
+
+		/*
+		 * This is the last page if: the tag is FULL_KEY_DATA and
+		 * either there are only 2 entries, or the OVFLPAGE marker
+		 * is explicit, or there is freespace on the page.
+		 */
+		if (bp[2] == FULL_KEY_DATA &&
+		    ((n == 2) || (bp[n] == OVFLPAGE) || (FREESPACE(bp))))
+			break;
+
+		pageno = bp[n - 1];	/* next page of the pair */
+		bufp = __get_buf(hashp, pageno, bufp, 0);
+		if (!bufp)
+			return (0);	/* Need to indicate an error! */
+		bp = (u_int16_t *)bufp->page;
+	}
+
+	*bpp = bufp;
+	if (bp[0] > 2)		/* something is linked after the pair */
+		return (bp[3]);
+	else
+		return (0);
+}
+
+/*
+ * Return in val the data of the big pair whose key starts at slot ndx of bufp
+ * (ndx should be 1).  0 on success, -1 on error; set_current != 0 also moves
+ * hashp's cpage/cbucket/cndx cursor.  Multi-page data lands in hashp->tmp_buf.
+ */
+extern int
+__big_return(hashp, bufp, ndx, val, set_current)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+	int ndx;
+	DBT *val;
+	int set_current;
+{
+	BUFHEAD *save_p;
+	u_int16_t *bp, len, off, save_addr;
+	char *tp;
+
+	bp = (u_int16_t *)bufp->page;
+	while (bp[ndx + 1] == PARTIAL_KEY) {	/* skip pages of pure key */
+		bufp = __get_buf(hashp, bp[bp[0] - 1], bufp, 0);
+		if (!bufp)
+			return (-1);
+		bp = (u_int16_t *)bufp->page;
+		ndx = 1;
+	}
+
+	if (bp[ndx + 1] == FULL_KEY) {
+		/* Key ends here without data; data starts on the next page. */
+		bufp = __get_buf(hashp, bp[bp[0] - 1], bufp, 0);
+		if (!bufp)
+			return (-1);
+		bp = (u_int16_t *)bufp->page;
+		save_p = bufp;
+		save_addr = save_p->addr;
+		off = bp[1];
+		len = 0;
+	} else
+		if (!FREESPACE(bp)) {
+			/*
+			 * This is a hack.  We can't distinguish between
+			 * FULL_KEY_DATA that contains complete data or
+			 * incomplete data, so we require that if the data
+			 * is complete, there is at least 1 byte of free
+			 * space left.
+			 */
+			off = bp[bp[0]];
+			len = bp[1] - off;	/* data bytes on this page */
+			save_p = bufp;
+			save_addr = bufp->addr;
+			bufp = __get_buf(hashp, bp[bp[0] - 1], bufp, 0);
+			if (!bufp)
+				return (-1);
+			bp = (u_int16_t *)bufp->page;
+		} else {
+			/* The data is all on one page. */
+			tp = (char *)bp;
+			off = bp[bp[0]];
+			val->data = (u_char *)tp + off;	/* points into page */
+			val->size = bp[1] - off;
+			if (set_current) {
+				if (bp[0] == 2) {	/* No more buckets in
+							 * chain */
+					hashp->cpage = NULL;
+					hashp->cbucket++;
+					hashp->cndx = 1;
+				} else {
+					hashp->cpage = __get_buf(hashp,
+					    bp[bp[0] - 1], bufp, 0);
+					if (!hashp->cpage)
+						return (-1);
+					hashp->cndx = 1;
+					if (!((u_int16_t *)
+					    hashp->cpage->page)[0]) {
+						hashp->cbucket++;
+						hashp->cpage = NULL;
+					}
+				}
+			}
+			return (0);
+		}
+
+	val->size = collect_data(hashp, bufp, (int)len, set_current);
+	if (val->size == -1)
+		return (-1);
+	if (save_p->addr != save_addr) {	/* save_p now holds another page */
+		/* We are pretty short on buffers. */
+		errno = EINVAL;			/* OUT OF BUFFERS */
+		return (-1);
+	}
+	memmove(hashp->tmp_buf, (save_p->page) + off, len);
+	val->data = (u_char *)hashp->tmp_buf;
+	return (0);
+}
+/*
+ * Count how big the total datasize is by recursing through the pages.  Then
+ * allocate hashp->tmp_buf and copy the data as you recurse up.  len is the
+ * byte count before this page; returns the total, or -1 on any failure.
+ */
+static int
+collect_data(hashp, bufp, len, set)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+	int len, set;
+{
+	register u_int16_t *bp;
+	register char *p;
+	BUFHEAD *xbp;
+	u_int16_t save_addr;
+	int mylen, totlen;
+
+	p = bufp->page;
+	bp = (u_int16_t *)p;
+	mylen = hashp->BSIZE - bp[1];	/* data bytes held by this page */
+	save_addr = bufp->addr;		/* to detect bufp being reused below */
+
+	if (bp[2] == FULL_KEY_DATA) {		/* End of Data */
+		totlen = len + mylen;
+		if (hashp->tmp_buf)
+			free(hashp->tmp_buf);
+		if ((hashp->tmp_buf = (char *)malloc(totlen)) == NULL)
+			return (-1);
+		if (set) {	/* position hashp's cursor after the pair */
+			hashp->cndx = 1;
+			if (bp[0] == 2) {	/* No more buckets in chain */
+				hashp->cpage = NULL;
+				hashp->cbucket++;
+			} else {
+				hashp->cpage =
+				    __get_buf(hashp, bp[bp[0] - 1], bufp, 0);
+				if (!hashp->cpage)
+					return (-1);
+				else if (!((u_int16_t *)hashp->cpage->page)[0]) {
+					hashp->cbucket++;
+					hashp->cpage = NULL;
+				}
+			}
+		}
+	} else {
+		xbp = __get_buf(hashp, bp[bp[0] - 1], bufp, 0);
+		if (!xbp || ((totlen =
+		    collect_data(hashp, xbp, len + mylen, set)) < 1))
+			return (-1);	/* a total below 1 counts as failure */
+	}
+	if (bufp->addr != save_addr) {
+		errno = EINVAL;			/* Out of buffers. */
+		return (-1);
+	}
+	memmove(&hashp->tmp_buf[len], (bufp->page) + bp[1], mylen);
+	return (totlen);
+}
+
+/*
+ * Hand back the key (via hashp->tmp_key) and the data (val) of a big pair.
+ */
+extern int
+__big_keydata(hashp, bufp, key, val, set)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+	DBT *key, *val;
+	int set;
+{
+	/* collect_key() signals failure with -1, which is stored as is. */
+	if ((key->size = collect_key(hashp, bufp, 0, val, set)) == -1)
+		return (-1);
+	key->data = (u_char *)hashp->tmp_key;
+	return (0);
+}
+
+/*
+ * Count how big the total key size is by recursing through the pages.  Then
+ * collect the data, allocate hashp->tmp_key and copy the key as you recurse
+ * up.  Returns the total key length, or -1 on any failure.
+ */
+static int
+collect_key(hashp, bufp, len, val, set)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+	int len;
+	DBT *val;
+	int set;
+{
+	BUFHEAD *xbp;
+	char *p;
+	int mylen, totlen;
+	u_int16_t *bp, save_addr;
+
+	p = bufp->page;
+	bp = (u_int16_t *)p;
+	mylen = hashp->BSIZE - bp[1];	/* key bytes held by this page */
+
+	save_addr = bufp->addr;		/* to detect bufp being reused below */
+	totlen = len + mylen;
+	if (bp[2] == FULL_KEY || bp[2] == FULL_KEY_DATA) {    /* End of Key. */
+		if (hashp->tmp_key != NULL)
+			free(hashp->tmp_key);
+		if ((hashp->tmp_key = (char *)malloc(totlen)) == NULL)
+			return (-1);
+		if (__big_return(hashp, bufp, 1, val, set))	/* fills val */
+			return (-1);
+	} else {
+		xbp = __get_buf(hashp, bp[bp[0] - 1], bufp, 0);
+		if (!xbp || ((totlen =
+		    collect_key(hashp, xbp, totlen, val, set)) < 1))
+			return (-1);	/* a total below 1 counts as failure */
+	}
+	if (bufp->addr != save_addr) {
+		errno = EINVAL;		/* MIS -- OUT OF BUFFERS */
+		return (-1);
+	}
+	memmove(&hashp->tmp_key[len], (bufp->page) + bp[1], mylen);
+	return (totlen);
+}
+
+/*
+ * Re-home a big key/data pair while a bucket is split: hook it onto op or np
+ * and report through ret where further keys for each bucket should go.
+ * Returns 0 => OK, -1 => error (__big_keydata/__get_buf/__add_ovflpage).
+ */
+extern int
+__big_split(hashp, op, np, big_keyp, addr, obucket, ret)
+	HTAB *hashp;
+	BUFHEAD *op;	/* Pointer to where to put keys that go in old bucket */
+	BUFHEAD *np;	/* Pointer to new bucket page */
+			/* Pointer to first page containing the big key/data */
+	BUFHEAD *big_keyp;
+	int addr;	/* Address of big_keyp */
+	u_int32_t   obucket;/* Old Bucket */
+	SPLIT_RETURN *ret;
+{
+	register BUFHEAD *tmpp;
+	register u_int16_t *tp;
+	BUFHEAD *bp;
+	DBT key, val;
+	u_int32_t change;
+	u_int16_t free_space, n, off;
+
+	bp = big_keyp;	/* first page; big_keyp itself is advanced below */
+
+	/* Now figure out where the big key/data goes */
+	if (__big_keydata(hashp, big_keyp, &key, &val, 0))
+		return (-1);
+	change = (__call_hash(hashp, key.data, key.size) != obucket);
+
+	if ((ret->next_addr = __find_last_page(hashp, &big_keyp)) != 0) {
+		if (!(ret->nextp =
+		    __get_buf(hashp, ret->next_addr, big_keyp, 0)))
+			return (-1);
+	} else
+		ret->nextp = NULL;
+
+	/* Now make one of np/op point to the big key/data pair */
+#ifdef DEBUG
+	assert(np->ovfl == NULL);
+#endif
+	if (change)
+		tmpp = np;
+	else
+		tmpp = op;
+
+	tmpp->flags |= BUF_MOD;
+#ifdef DEBUG1
+	(void)fprintf(stderr,
+	    "BIG_SPLIT: %d->ovfl was %d is now %d\n", tmpp->addr,
+	    (tmpp->ovfl ? tmpp->ovfl->addr : 0), (bp ? bp->addr : 0));
+#endif
+	tmpp->ovfl = bp;	/* one of op/np point to big_keyp */
+	tp = (u_int16_t *)tmpp->page;
+#ifdef DEBUG
+	assert(FREESPACE(tp) >= OVFLSIZE);
+#endif
+	n = tp[0];
+	off = OFFSET(tp);
+	free_space = FREESPACE(tp);
+	tp[++n] = (u_int16_t)addr;
+	tp[++n] = OVFLPAGE;
+	tp[0] = n;
+	OFFSET(tp) = off;
+	FREESPACE(tp) = free_space - OVFLSIZE;
+
+	/*
+	 * Finally, set the new and old return values. BIG_KEYP contains a
+	 * pointer to the last page of the big key_data pair. Make sure that
+	 * big_keyp has no following page (2 elements) or create an empty
+	 * following page.
+	 */
+
+	ret->newp = np;
+	ret->oldp = op;
+
+	tp = (u_int16_t *)big_keyp->page;
+	big_keyp->flags |= BUF_MOD;
+	if (tp[0] > 2) {
+		/*
+		 * There may be either one or two offsets on this page.  If
+		 * there is one, then the overflow page is linked on normally
+		 * and tp[4] is OVFLPAGE.  If there are two, tp[4] contains
+		 * the second offset and needs to get stuffed in after the
+		 * next overflow page is added.
+		 */
+		n = tp[4];
+		free_space = FREESPACE(tp);
+		off = OFFSET(tp);
+		tp[0] -= 2;
+		FREESPACE(tp) = free_space + OVFLSIZE;
+		OFFSET(tp) = off;
+		tmpp = __add_ovflpage(hashp, big_keyp);
+		if (!tmpp)
+			return (-1);
+		tp[4] = n;
+	} else
+		tmpp = big_keyp;
+
+	if (change)
+		ret->newp = tmpp;
+	else
+		ret->oldp = tmpp;
+	return (0);
+}
diff --git a/db/hash/hash_buf.c b/db/hash/hash_buf.c
new file mode 100644
index 0000000000..92e1f933ad
--- /dev/null
+++ b/db/hash/hash_buf.c
@@ -0,0 +1,355 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)hash_buf.c	8.5 (Berkeley) 7/15/94";
+#endif /* LIBC_SCCS and not lint */
+
+/*
+ * PACKAGE: hash
+ *
+ * DESCRIPTION:
+ *	Contains buffer management
+ *
+ * ROUTINES:
+ * External
+ *	__buf_init
+ *	__get_buf
+ *	__buf_free
+ *	__reclaim_buf
+ * Internal
+ *	newbuf
+ */
+
+#include <sys/param.h>
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef DEBUG
+#include <assert.h>
+#endif
+
+#include <db.h>
+#include "hash.h"
+#include "page.h"
+#include "extern.h"
+
+static BUFHEAD *newbuf __P((HTAB *, u_int32_t, BUFHEAD *));
+
+/* Unlink B from its place in the lru */
+#define BUF_REMOVE(B) { \
+	(B)->prev->next = (B)->next; \
+	(B)->next->prev = (B)->prev; \
+}
+
+/* Insert B after P */
+#define BUF_INSERT(B, P) { \
+	(B)->next = (P)->next; \
+	(B)->prev = (P); \
+	(P)->next = (B); \
+	(B)->next->prev = (B); \
+}
+
+#define	MRU	hashp->bufhead.next
+#define	LRU	hashp->bufhead.prev
+
+#define MRU_INSERT(B)	BUF_INSERT((B), &hashp->bufhead)
+#define LRU_INSERT(B)	BUF_INSERT((B), LRU)
+
+/*
+ * Return the buffer holding page "addr", or NULL on failure.  If prev_bp is
+ * NULL, addr is a bucket index looked up through the directory.  Otherwise
+ * prev_bp is the page previous to the overflow page we are trying to find.
+ *
+ * CAVEAT:  The buffer header accessed via prev_bp's ovfl field may no longer
+ * be valid.  Therefore, you must always verify that its address matches the
+ * address you are seeking.
+ */
+extern BUFHEAD *
+__get_buf(hashp, addr, prev_bp, newpage)
+	HTAB *hashp;
+	u_int32_t addr;
+	BUFHEAD *prev_bp;
+	int newpage;	/* If prev_bp set, indicates a new overflow page. */
+{
+	register BUFHEAD *bp;
+	register u_int32_t is_disk_mask;
+	register int is_disk, segment_ndx;
+	SEGMENT segp;
+
+	is_disk = 0;
+	is_disk_mask = 0;
+	if (prev_bp) {
+		bp = prev_bp->ovfl;
+		if (!bp || (bp->addr != addr))
+			bp = NULL;
+		if (!newpage)
+			is_disk = BUF_DISK;
+	} else {
+		/* Grab buffer out of directory */
+		segment_ndx = addr & (hashp->SGSIZE - 1);
+
+		/* valid segment ensured by __call_hash() */
+		segp = hashp->dir[addr >> hashp->SSHIFT];
+#ifdef DEBUG
+		assert(segp != NULL);
+#endif
+		bp = PTROF(segp[segment_ndx]);
+		is_disk_mask = ISDISK(segp[segment_ndx]);
+		is_disk = is_disk_mask || !hashp->new_file;
+	}
+	/* Not resident: get a buffer and load it.  Resident: move it to MRU. */
+	if (!bp) {
+		bp = newbuf(hashp, addr, prev_bp);
+		if (!bp ||
+		    __get_page(hashp, bp->page, addr, !prev_bp, is_disk, 0))
+			return (NULL);
+		if (!prev_bp)
+			segp[segment_ndx] =
+			    (BUFHEAD *)((ptrdiff_t)bp | is_disk_mask);
+	} else {
+		BUF_REMOVE(bp);
+		MRU_INSERT(bp);
+	}
+	return (bp);
+}
+
+/*
+ * We need a buffer for this page. Either allocate one, or evict a resident
+ * one (if we have as many buffers as we're allowed) and put this one in.
+ * The buffer is inserted at the MRU end; overflow pages are linked to prev_bp.
+ * If newbuf finds an error (returning NULL), it also sets errno.
+ */
+static BUFHEAD *
+newbuf(hashp, addr, prev_bp)
+	HTAB *hashp;
+	u_int32_t addr;
+	BUFHEAD *prev_bp;
+{
+	register BUFHEAD *bp;		/* The buffer we're going to use */
+	register BUFHEAD *xbp;		/* Temp pointer */
+	register BUFHEAD *next_xbp;
+	SEGMENT segp;
+	int segment_ndx;
+	u_int16_t oaddr, *shortp;
+
+	oaddr = 0;
+	bp = LRU;
+	/*
+	 * Allocate a fresh buffer while the pool still has budget (nbufs),
+	 * or when the LRU buffer is pinned and so cannot be evicted.
+	 */
+	if (hashp->nbufs || (bp->flags & BUF_PIN)) {
+		/* Allocate a new one */
+		if ((bp = (BUFHEAD *)malloc(sizeof(BUFHEAD))) == NULL)
+			return (NULL);
+#ifdef PURIFY
+		memset(bp, 0xff, sizeof(BUFHEAD));
+#endif
+		if ((bp->page = (char *)malloc(hashp->BSIZE)) == NULL) {
+			free(bp);
+			return (NULL);
+		}
+#ifdef PURIFY
+		memset(bp->page, 0xff, hashp->BSIZE);
+#endif
+		if (hashp->nbufs)
+			hashp->nbufs--;
+	} else {
+		/* Kick someone out */
+		BUF_REMOVE(bp);
+		/*
+		 * If this is an overflow page with addr 0, it's already been
+		 * flushed back in an overflow chain and initialized.
+		 */
+		if ((bp->addr != 0) || (bp->flags & BUF_BUCKET)) {
+			/*
+			 * Set oaddr before __put_page so that you get it
+			 * before bytes are swapped.
+			 */
+			shortp = (u_int16_t *)bp->page;
+			if (shortp[0])
+				oaddr = shortp[shortp[0] - 1];
+			if ((bp->flags & BUF_MOD) && __put_page(hashp, bp->page,
+			    bp->addr, (int)IS_BUCKET(bp->flags), 0))
+				return (NULL);
+			/*
+			 * Update the pointer to this page (i.e. invalidate it).
+			 *
+			 * If this is a new file (i.e. we created it at open
+			 * time), make sure that we mark pages which have been
+			 * written to disk so we retrieve them from disk later,
+			 * rather than allocating new pages.
+			 */
+			if (IS_BUCKET(bp->flags)) {
+				segment_ndx = bp->addr & (hashp->SGSIZE - 1);
+				segp = hashp->dir[bp->addr >> hashp->SSHIFT];
+#ifdef DEBUG
+				assert(segp != NULL);
+#endif
+
+				if (hashp->new_file &&
+				    ((bp->flags & BUF_MOD) ||
+				    ISDISK(segp[segment_ndx])))
+					segp[segment_ndx] = (BUFHEAD *)BUF_DISK;
+				else
+					segp[segment_ndx] = NULL;
+			}
+			/*
+			 * Since overflow pages can only be accessed by means
+			 * of their bucket, free overflow pages associated with
+			 * this bucket.
+			 */
+			for (xbp = bp; xbp->ovfl;) {
+				next_xbp = xbp->ovfl;
+				xbp->ovfl = 0;
+				xbp = next_xbp;
+
+				/* Check that ovfl pointer is up to date. */
+				if (IS_BUCKET(xbp->flags) ||
+				    (oaddr != xbp->addr))
+					break;
+
+				shortp = (u_int16_t *)xbp->page;
+				if (shortp[0])
+					/* set before __put_page */
+					oaddr = shortp[shortp[0] - 1];
+				if ((xbp->flags & BUF_MOD) && __put_page(hashp,
+				    xbp->page, xbp->addr, 0, 0))
+					return (NULL);
+				xbp->addr = 0;
+				xbp->flags = 0;
+				BUF_REMOVE(xbp);
+				LRU_INSERT(xbp);
+			}
+		}
+	}
+
+	/* Now assign this buffer */
+	bp->addr = addr;
+#ifdef DEBUG1
+	(void)fprintf(stderr, "NEWBUF1: %d->ovfl was %d is now %d\n",
+	    bp->addr, (bp->ovfl ? bp->ovfl->addr : 0), 0);
+#endif
+	bp->ovfl = NULL;
+	if (prev_bp) {
+		/*
+		 * If prev_bp is set, this is an overflow page, hook it in to
+		 * the buffer overflow links.
+		 */
+#ifdef DEBUG1
+		(void)fprintf(stderr, "NEWBUF2: %d->ovfl was %d is now %d\n",
+		    prev_bp->addr, (prev_bp->ovfl ? prev_bp->ovfl->addr : 0),
+		    (bp ? bp->addr : 0));
+#endif
+		prev_bp->ovfl = bp;
+		bp->flags = 0;
+	} else
+		bp->flags = BUF_BUCKET;
+	MRU_INSERT(bp);
+	return (bp);
+}
+
+extern void
+__buf_init(hashp, nbytes)
+	HTAB *hashp;
+	int nbytes;
+{
+	BUFHEAD *head;
+	int want;
+
+	/* Round the byte budget up to whole pages, with a floor. */
+	want = (nbytes + hashp->BSIZE - 1) >> hashp->BSHIFT;
+	if (want < MIN_BUFFERS)
+		want = MIN_BUFFERS;
+	hashp->nbufs = want;
+
+	/*
+	 * An empty LRU chain is the list head linked to itself.  The
+	 * head's ovfl, flags, page and addr fields are not touched here:
+	 * this space is calloc'd, so they are already null.
+	 * (Presumably by the code that allocates the HTAB -- not shown.)
+	 */
+	head = &hashp->bufhead;
+	head->prev = head;
+	head->next = head;
+}
+
+extern int
+__buf_free(hashp, do_free, to_disk)
+	HTAB *hashp;
+	int do_free, to_disk;
+{
+	BUFHEAD *cur;
+
+	/* A null LRU link means the buffer manager was never initialized. */
+	if (LRU == NULL)
+		return (0);
+	cur = LRU;
+	while (cur != &hashp->bufhead) {
+		/* Write back only valid (addressed or bucket), modified pages. */
+		if (to_disk && (cur->addr || IS_BUCKET(cur->flags)) &&
+		    (cur->flags & BUF_MOD) && __put_page(hashp, cur->page,
+		    cur->addr, IS_BUCKET(cur->flags), 0))
+			return (-1);
+		if (!do_free) {
+			cur = cur->prev;
+			continue;
+		}
+		/* Release page and header, then restart from the new LRU. */
+		if (cur->page)
+			free(cur->page);
+		BUF_REMOVE(cur);
+		free(cur);
+		cur = LRU;
+	}
+	return (0);
+}
+
+extern void
+__reclaim_buf(hashp, bp)
+	HTAB *hashp;
+	BUFHEAD *bp;
+{
+	/* Move bp to the LRU end of the chain and mark it unassigned. */
+	BUF_REMOVE(bp);
+	LRU_INSERT(bp);
+	bp->addr = bp->flags = 0;
+	bp->ovfl = 0;
+}
diff --git a/db/hash/hash_func.c b/db/hash/hash_func.c
new file mode 100644
index 0000000000..a5ec434ee9
--- /dev/null
+++ b/db/hash/hash_func.c
@@ -0,0 +1,212 @@
+/*-
+ * Copyright (c) 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)hash_func.c	8.2 (Berkeley) 2/21/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <db.h>
+#include "hash.h"
+#include "page.h"
+#include "extern.h"
+
+static u_int32_t hash1 __P((const void *, size_t));
+static u_int32_t hash2 __P((const void *, size_t));
+static u_int32_t hash3 __P((const void *, size_t));
+static u_int32_t hash4 __P((const void *, size_t));
+
+/* Global default hash function */
+u_int32_t (*__default_hash) __P((const void *, size_t)) = hash4;
+
+/*
+ * HASH FUNCTIONS
+ *
+ * Assume that we've already split the bucket to which this key hashes,
+ * calculate that bucket, and check that in fact we did already split it.
+ *
+ * This came from ejb's hsearch.
+ */
+
+#define PRIME1		37
+#define PRIME2		1048583
+
+static u_int32_t
+hash1(keyarg, len)
+	const void *keyarg;
+	register size_t len;
+{
+	register const u_char *p = keyarg;
+	register u_int32_t acc = 0;
+
+	/* Convert string to integer: fold in each byte, offset by ' '. */
+	for (; len != 0; len--, p++)
+		acc = (acc * PRIME1) ^ (*p - ' ');
+	acc %= PRIME2;
+	return (acc);
+}
+
+/*
+ * Phong's linear congruential hash
+ */
+#define dcharhash(h, c)	((h) = 0x63c63cd9*(h) + 0x9c39c33d + (c))
+
+static u_int32_t
+hash2(keyarg, len)
+	const void *keyarg;
+	size_t len;
+{
+	register const u_char *key, *end;
+	register u_int32_t h;
+
+	/*
+	 * Apply one congruential step per key byte, NUL bytes included.
+	 * The loop condition alone bounds the walk: key is advanced one
+	 * byte at a time and so reaches end without ever passing it.
+	 */
+	h = 0;
+	end = (const u_char *)keyarg + len;
+	for (key = keyarg; key != end; key++)
+		dcharhash(h, *key);
+	return (h);
+}
+
+/*
+ * This is INCREDIBLY ugly, but fast.  We break the key up into 8 byte
+ * units.  The switch jumps into the loop body so the first pass handles the
+ * "leftover bytes" (len % 8, or a full 8 when that is 0); every other pass
+ * performs 8 HASHC's.  Essentially, this saves us 7 cmp & branch
+ * instructions.  If heavily used enough, it's worth the ugly coding.
+ *
+ * OZ's original sdbm hash
+ */
+static u_int32_t
+hash3(keyarg, len)
+	const void *keyarg;
+	register size_t len;
+{
+	register const u_char *key;
+	register size_t loop;
+	register u_int32_t h;
+
+#define HASHC   h = *key++ + 65599 * h
+
+	h = 0;
+	key = keyarg;
+	if (len > 0) {
+		loop = (len + 8 - 1) >> 3;
+
+		switch (len & (8 - 1)) {
+		case 0:
+			do {
+				HASHC;
+				/* FALLTHROUGH */
+		case 7:
+				HASHC;
+				/* FALLTHROUGH */
+		case 6:
+				HASHC;
+				/* FALLTHROUGH */
+		case 5:
+				HASHC;
+				/* FALLTHROUGH */
+		case 4:
+				HASHC;
+				/* FALLTHROUGH */
+		case 3:
+				HASHC;
+				/* FALLTHROUGH */
+		case 2:
+				HASHC;
+				/* FALLTHROUGH */
+		case 1:
+				HASHC;
+			} while (--loop);
+		}
+	}
+	return (h);
+}
+
+/* Hash function from Chris Torek; returns 0 for a zero-length key. */
+static u_int32_t
+hash4(keyarg, len)
+	const void *keyarg;
+	register size_t len;
+{
+	register const u_char *key;
+	register size_t loop;
+	register u_int32_t h;
+
+#define HASH4a   h = (h << 5) - h + *key++;
+#define HASH4b   h = (h << 5) + h + *key++;
+#define HASH4 HASH4b
+	/* HASH4b is h * 33 + byte; HASH4a (h * 31 + byte) is not used here. */
+	h = 0;
+	key = keyarg;
+	if (len > 0) {
+		loop = (len + 8 - 1) >> 3;
+
+		switch (len & (8 - 1)) {
+		case 0:
+			do {
+				HASH4;
+				/* FALLTHROUGH */
+		case 7:
+				HASH4;
+				/* FALLTHROUGH */
+		case 6:
+				HASH4;
+				/* FALLTHROUGH */
+		case 5:
+				HASH4;
+				/* FALLTHROUGH */
+		case 4:
+				HASH4;
+				/* FALLTHROUGH */
+		case 3:
+				HASH4;
+				/* FALLTHROUGH */
+		case 2:
+				HASH4;
+				/* FALLTHROUGH */
+		case 1:
+				HASH4;
+			} while (--loop);
+		}
+	}
+	return (h);
+}
diff --git a/db/hash/hash_log2.c b/db/hash/hash_log2.c
new file mode 100644
index 0000000000..c8c56bff2d
--- /dev/null
+++ b/db/hash/hash_log2.c
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)hash_log2.c	8.2 (Berkeley) 5/31/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <db.h>
+
+u_int32_t
+__log2(num)
+	u_int32_t num;
+{
+	register u_int32_t i, limit;
+
+	/* Smallest i with 2^i >= num; stop if limit wraps to 0 (i == 32). */
+	for (i = 0, limit = 1; limit != 0 && limit < num; limit <<= 1, i++);
+	return (i);
+}
diff --git a/db/hash/hash_page.c b/db/hash/hash_page.c
new file mode 100644
index 0000000000..e1dfe6b8d6
--- /dev/null
+++ b/db/hash/hash_page.c
@@ -0,0 +1,944 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)hash_page.c	8.7 (Berkeley) 8/16/94";
+#endif /* LIBC_SCCS and not lint */
+
+/*
+ * PACKAGE:  hashing
+ *
+ * DESCRIPTION:
+ *	Page manipulation for hashing package.
+ *
+ * ROUTINES:
+ *
+ * External
+ *	__get_page
+ *	__add_ovflpage
+ * Internal
+ *	overflow_page
+ *	open_temp
+ */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#ifdef DEBUG
+#include <assert.h>
+#endif
+
+#include <db.h>
+#include "hash.h"
+#include "page.h"
+#include "extern.h"
+
+static u_int32_t	*fetch_bitmap __P((HTAB *, int));
+static u_int32_t	 first_free __P((u_int32_t));
+static int	 open_temp __P((HTAB *));
+static u_int16_t	 overflow_page __P((HTAB *));
+static void	 putpair __P((char *, const DBT *, const DBT *));
+static void	 squeeze_key __P((u_int16_t *, const DBT *, const DBT *));
+static int	 ugly_split
+		    __P((HTAB *, u_int32_t, BUFHEAD *, BUFHEAD *, int, int));
+
+#define	PAGE_INIT(P) { \
+	((u_int16_t *)(P))[0] = 0; \
+	((u_int16_t *)(P))[1] = hashp->BSIZE - 3 * sizeof(u_int16_t); \
+	((u_int16_t *)(P))[2] = hashp->BSIZE; \
+}
+
+/*
+ * Append a key/data pair to page p and update the page's entry count and
+ * the two bookkeeping words that follow the index.  No space check is made:
+ * the caller must already have verified that PAIRFITS returned true.
+ */
+static void
+putpair(p, key, val)
+	char *p;
+	const DBT *key, *val;
+{
+	register u_int16_t *pg, cnt;
+	register u_int16_t koff, doff;
+
+	pg = (u_int16_t *)p;
+	cnt = pg[0];
+
+	/* The key goes just below the page's current data offset... */
+	koff = OFFSET(pg) - key->size;
+	memmove(p + koff, key->data, key->size);
+	pg[++cnt] = koff;
+
+	/* ...and the data immediately below the key. */
+	doff = koff - val->size;
+	memmove(p + doff, val->data, val->size);
+	pg[++cnt] = doff;
+
+	/* Record the new entry count and rewrite the trailing words. */
+	pg[0] = cnt;
+	pg[cnt + 1] = doff - ((cnt + 3) * sizeof(u_int16_t));
+	pg[cnt + 2] = doff;
+}
+
+/*
+ * Delete the key/data pair whose key index is ndx from bufp's page.
+ * Returns:  0 OK, -1 error (this routine itself fails only by way of
+ * __big_delete, to which big pairs are handed off).
+ */
+extern int
+__delpair(hashp, bufp, ndx)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+	register int ndx;
+{
+	register u_int16_t *bp, newoff;
+	register int n;
+	u_int16_t pairlen;
+
+	bp = (u_int16_t *)bufp->page;
+	n = bp[0];
+	/* An entry below REAL_KEY is not an ordinary pair; hand it off. */
+	if (bp[ndx + 1] < REAL_KEY)
+		return (__big_delete(hashp, bufp));
+	if (ndx != 1)
+		newoff = bp[ndx - 1];
+	else
+		newoff = hashp->BSIZE;
+	pairlen = newoff - bp[ndx + 1];
+
+	if (ndx != (n - 1)) {
+		/* Hard case: not the last pair; slide the data below it up. */
+		register int i;
+		register char *src = bufp->page + (int)OFFSET(bp);
+		register char *dst = src + (int)pairlen;
+		memmove(dst, src, bp[ndx + 1] - OFFSET(bp));
+
+		/* Shift later index entries down two slots, rebasing offsets. */
+		for (i = ndx + 2; i <= n; i += 2) {
+			if (bp[i + 1] == OVFLPAGE) {
+				bp[i - 2] = bp[i];
+				bp[i - 1] = bp[i + 1];
+			} else {
+				bp[i - 2] = bp[i] + pairlen;
+				bp[i - 1] = bp[i + 1] + pairlen;
+			}
+		}
+	}
+	/* Finally adjust the page data: trailing words, then the count. */
+	bp[n] = OFFSET(bp) + pairlen;
+	bp[n - 1] = bp[n + 1] + pairlen + 2 * sizeof(u_int16_t);
+	bp[0] = n - 2;
+	hashp->NKEYS--;
+
+	bufp->flags |= BUF_MOD;
+	return (0);
+}
+/*
+ * Redistribute the pairs of bucket obucket between it and bucket nbucket
+ * according to __call_hash; overflow/big entries are left to ugly_split.
+ * Returns:  0 ==> OK, -1 ==> Error
+ */
+extern int
+__split_page(hashp, obucket, nbucket)
+	HTAB *hashp;
+	u_int32_t obucket, nbucket;
+{
+	register BUFHEAD *new_bufp, *old_bufp;
+	register u_int16_t *ino;
+	register char *np;
+	DBT key, val;
+	int n, ndx, retval;
+	u_int16_t copyto, diff, off, moved;
+	char *op;
+
+	copyto = (u_int16_t)hashp->BSIZE;
+	off = (u_int16_t)hashp->BSIZE;
+	old_bufp = __get_buf(hashp, obucket, NULL, 0);
+	if (old_bufp == NULL)
+		return (-1);
+	new_bufp = __get_buf(hashp, nbucket, NULL, 0);
+	if (new_bufp == NULL)
+		return (-1);
+	/* Mark both pages modified and pin them while they are rearranged. */
+	old_bufp->flags |= (BUF_MOD | BUF_PIN);
+	new_bufp->flags |= (BUF_MOD | BUF_PIN);
+
+	ino = (u_int16_t *)(op = old_bufp->page);
+	np = new_bufp->page;
+
+	moved = 0;
+
+	for (n = 1, ndx = 1; n < ino[0]; n += 2) {
+		if (ino[n + 1] < REAL_KEY) {
+			retval = ugly_split(hashp, obucket, old_bufp, new_bufp,
+			    (int)copyto, (int)moved);
+			old_bufp->flags &= ~BUF_PIN;
+			new_bufp->flags &= ~BUF_PIN;
+			return (retval);
+
+		}
+		key.data = (u_char *)op + ino[n];
+		key.size = off - ino[n];
+
+		if (__call_hash(hashp, key.data, key.size) == obucket) {
+			/* Don't switch page */
+			diff = copyto - off;
+			if (diff) {
+				copyto = ino[n + 1] + diff;
+				memmove(op + copyto, op + ino[n + 1],
+				    off - ino[n + 1]);
+				ino[ndx] = copyto + ino[n] - ino[n + 1];
+				ino[ndx + 1] = copyto;
+			} else
+				copyto = ino[n + 1];
+			ndx += 2;
+		} else {
+			/* Switch page */
+			val.data = (u_char *)op + ino[n + 1];
+			val.size = ino[n] - ino[n + 1];
+			putpair(np, &key, &val);
+			moved += 2;
+		}
+
+		off = ino[n + 1];
+	}
+
+	/* Now clean up the page */
+	ino[0] -= moved;
+	FREESPACE(ino) = copyto - sizeof(u_int16_t) * (ino[0] + 3);
+	OFFSET(ino) = copyto;
+
+#ifdef DEBUG3
+	(void)fprintf(stderr, "split %d/%d\n",
+	    ((u_int16_t *)np)[0] / 2,
+	    ((u_int16_t *)op)[0] / 2);
+#endif
+	/* unpin both pages */
+	old_bufp->flags &= ~BUF_PIN;
+	new_bufp->flags &= ~BUF_PIN;
+	return (0);
+}
+
+/*
+ * Called when we encounter an overflow or big key/data page during split
+ * handling.  This is special cased since we have to begin checking whether
+ * the key/data pairs fit on their respective pages and because we may need
+ * overflow pages for both the old and new pages.
+ *
+ * The first page might be a page with regular key/data pairs in which case
+ * we have a regular overflow condition and just need to go on to the next
+ * page or it might be a big key/data pair in which case we need to fix the
+ * big key/data pair.
+ *
+ * Returns:
+ *	 0 ==> success
+ *	-1 ==> failure
+ */
+static int
+ugly_split(hashp, obucket, old_bufp, new_bufp, copyto, moved)
+	HTAB *hashp;
+	u_int32_t obucket;	/* Same as __split_page. */
+	BUFHEAD *old_bufp, *new_bufp;
+	int copyto;	/* First byte on page which contains key/data values. */
+	int moved;	/* Number of pairs moved to new page. */
+{
+	register BUFHEAD *bufp;	/* Buffer header for ino */
+	register u_int16_t *ino;	/* Page keys come off of */
+	register u_int16_t *np;	/* New page */
+	register u_int16_t *op;	/* Page keys go on to if they aren't moving */
+
+	BUFHEAD *last_bfp;	/* Last buf header OVFL needing to be freed */
+	DBT key, val;
+	SPLIT_RETURN ret;
+	u_int16_t n, off, ov_addr, scopyto;
+	char *cino;		/* Character value of ino */
+
+	bufp = old_bufp;
+	ino = (u_int16_t *)old_bufp->page;
+	np = (u_int16_t *)new_bufp->page;
+	op = (u_int16_t *)old_bufp->page;
+	last_bfp = NULL;
+	scopyto = (u_int16_t)copyto;	/* ANSI */
+
+	n = ino[0] - 1;
+	while (n < ino[0]) {
+		if (ino[2] < REAL_KEY && ino[2] != OVFLPAGE) {
+			if (__big_split(hashp, old_bufp,
+			    new_bufp, bufp, bufp->addr, obucket, &ret))
+				return (-1);
+			old_bufp = ret.oldp;
+			if (!old_bufp)
+				return (-1);
+			op = (u_int16_t *)old_bufp->page;
+			new_bufp = ret.newp;
+			if (!new_bufp)
+				return (-1);
+			np = (u_int16_t *)new_bufp->page;
+			bufp = ret.nextp;
+			if (!bufp)
+				return (0);
+			cino = (char *)bufp->page;
+			ino = (u_int16_t *)cino;
+			last_bfp = ret.nextp;
+		} else if (ino[n + 1] == OVFLPAGE) {
+			ov_addr = ino[n];
+			/*
+			 * Fix up the old page -- the extra 2 are the fields
+			 * which contained the overflow information.
+			 */
+			ino[0] -= (moved + 2);
+			FREESPACE(ino) =
+			    scopyto - sizeof(u_int16_t) * (ino[0] + 3);
+			OFFSET(ino) = scopyto;
+
+			bufp = __get_buf(hashp, ov_addr, bufp, 0);
+			if (!bufp)
+				return (-1);
+
+			ino = (u_int16_t *)bufp->page;
+			n = 1;
+			scopyto = hashp->BSIZE;
+			moved = 0;
+
+			if (last_bfp)
+				__free_ovflpage(hashp, last_bfp);
+			last_bfp = bufp;
+		}
+		/* Move regular sized pairs if there are any */
+		off = hashp->BSIZE;
+		for (n = 1; (n < ino[0]) && (ino[n + 1] >= REAL_KEY); n += 2) {
+			cino = (char *)ino;
+			key.data = (u_char *)cino + ino[n];
+			key.size = off - ino[n];
+			val.data = (u_char *)cino + ino[n + 1];
+			val.size = ino[n] - ino[n + 1];
+			off = ino[n + 1];
+
+			if (__call_hash(hashp, key.data, key.size) == obucket) {
+				/* Keep on old page */
+				if (PAIRFITS(op, (&key), (&val)))
+					putpair((char *)op, &key, &val);
+				else {
+					old_bufp =
+					    __add_ovflpage(hashp, old_bufp);
+					if (!old_bufp)
+						return (-1);
+					op = (u_int16_t *)old_bufp->page;
+					putpair((char *)op, &key, &val);
+				}
+				old_bufp->flags |= BUF_MOD;
+			} else {
+				/* Move to new page */
+				if (PAIRFITS(np, (&key), (&val)))
+					putpair((char *)np, &key, &val);
+				else {
+					new_bufp =
+					    __add_ovflpage(hashp, new_bufp);
+					if (!new_bufp)
+						return (-1);
+					np = (u_int16_t *)new_bufp->page;
+					putpair((char *)np, &key, &val);
+				}
+				new_bufp->flags |= BUF_MOD;
+			}
+		}
+	}
+	if (last_bfp)
+		__free_ovflpage(hashp, last_bfp);
+	return (0);
+}
+
+/*
+ * Add the given pair to the page chain starting at bufp, then expand the
+ * table if an overflow page was added or the fill factor is exceeded.
+ * Returns:
+ *	 0 ==> OK (or the result of __expand_table when one is triggered)
+ *	-1 ==> failure
+ */
+extern int
+__addel(hashp, bufp, key, val)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+	const DBT *key, *val;
+{
+	register u_int16_t *bp, *sop;
+	int do_expand;
+
+	bp = (u_int16_t *)bufp->page;
+	do_expand = 0;
+	while (bp[0] && (bp[2] < REAL_KEY || bp[bp[0]] < REAL_KEY))
+		/* Exception case */
+		if (bp[2] == FULL_KEY_DATA && bp[0] == 2)
+			/* This is the last page of a big key/data pair
+			   and we need to add another page */
+			break;
+		else if (bp[2] < REAL_KEY && bp[bp[0]] != OVFLPAGE) {
+			bufp = __get_buf(hashp, bp[bp[0] - 1], bufp, 0);
+			if (!bufp)
+				return (-1);
+			bp = (u_int16_t *)bufp->page;
+		} else
+			/* Try to squeeze key on this page */
+			if (FREESPACE(bp) > PAIRSIZE(key, val)) {
+				squeeze_key(bp, key, val);
+				return (0);
+			} else {
+				bufp = __get_buf(hashp, bp[bp[0] - 1], bufp, 0);
+				if (!bufp)
+					return (-1);
+				bp = (u_int16_t *)bufp->page;
+			}
+
+	if (PAIRFITS(bp, key, val))
+		putpair(bufp->page, key, val);
+	else {
+		do_expand = 1;
+		bufp = __add_ovflpage(hashp, bufp);
+		if (!bufp)
+			return (-1);
+		sop = (u_int16_t *)bufp->page;
+
+		if (PAIRFITS(sop, key, val))
+			putpair((char *)sop, key, val);
+		else
+			if (__big_insert(hashp, bufp, key, val))
+				return (-1);
+	}
+	bufp->flags |= BUF_MOD;
+	/*
+	 * If the average number of keys per bucket exceeds the fill factor,
+	 * expand the table.
+	 */
+	hashp->NKEYS++;
+	if (do_expand ||
+	    (hashp->NKEYS / (hashp->MAX_BUCKET + 1) > hashp->FFACTOR))
+		return (__expand_table(hashp));
+	return (0);
+}
+
+/*
+ * Allocate a new overflow page and link it after bufp's page.
+ * Returns:
+ *	pointer to the new overflow page's buffer on success
+ *	NULL on error
+ */
+extern BUFHEAD *
+__add_ovflpage(hashp, bufp)
+	HTAB *hashp;
+	BUFHEAD *bufp;
+{
+	register u_int16_t *sp;
+	u_int16_t ndx, ovfl_num;
+#ifdef DEBUG1
+	int tmp1, tmp2;
+#endif
+	sp = (u_int16_t *)bufp->page;
+
+	/* Check if we are dynamically determining the fill factor */
+	if (hashp->FFACTOR == DEF_FFACTOR) {
+		hashp->FFACTOR = sp[0] >> 1;
+		if (hashp->FFACTOR < MIN_FFACTOR)
+			hashp->FFACTOR = MIN_FFACTOR;
+	}
+	bufp->flags |= BUF_MOD;
+	ovfl_num = overflow_page(hashp);
+#ifdef DEBUG1
+	tmp1 = bufp->addr;
+	tmp2 = bufp->ovfl ? bufp->ovfl->addr : 0;
+#endif
+	if (!ovfl_num || !(bufp->ovfl = __get_buf(hashp, ovfl_num, bufp, 1)))
+		return (NULL);
+	bufp->ovfl->flags |= BUF_MOD;
+#ifdef DEBUG1
+	(void)fprintf(stderr, "ADDOVFLPAGE: %d->ovfl was %d is now %d\n",
+	    tmp1, tmp2, bufp->ovfl->addr);
+#endif
+	ndx = sp[0];
+	/*
+	 * Since a pair is allocated on a page only if there's room to add
+	 * an overflow page, we know that the OVFL information will fit on
+	 * the page.
+	 */
+	sp[ndx + 4] = OFFSET(sp);
+	sp[ndx + 3] = FREESPACE(sp) - OVFLSIZE;
+	sp[ndx + 1] = ovfl_num;
+	sp[ndx + 2] = OVFLPAGE;
+	sp[0] = ndx + 2;
+#ifdef HASH_STATISTICS
+	hash_overflows++;
+#endif
+	return (bufp->ovfl);
+}
+
+/*
+ * Fill p with the page for "bucket": a freshly initialized page when there
+ * is no file or !is_disk, otherwise one read (byte-swapped if needed) from
+ * disk.  Returns:  0 indicates SUCCESS, -1 indicates FAILURE
+ */
+extern int
+__get_page(hashp, p, bucket, is_bucket, is_disk, is_bitmap)
+	HTAB *hashp;
+	char *p;
+	u_int32_t bucket;
+	int is_bucket, is_disk, is_bitmap;
+{
+	register int fd, page, size;
+	int rsize;
+	u_int16_t *bp;
+
+	fd = hashp->fp;
+	size = hashp->BSIZE;
+
+	if ((fd == -1) || !is_disk) {
+		PAGE_INIT(p);
+		return (0);
+	}
+	if (is_bucket)
+		page = BUCKET_TO_PAGE(bucket);
+	else
+		page = OADDR_TO_PAGE(bucket);
+	if ((lseek(fd, (off_t)page << hashp->BSHIFT, SEEK_SET) == -1) ||
+	    ((rsize = read(fd, p, size)) == -1))
+		return (-1);
+	bp = (u_int16_t *)p;
+	if (!rsize)
+		bp[0] = 0;	/* We hit the EOF, so initialize a new page */
+	else
+		if (rsize != size) {
+			errno = EFTYPE;
+			return (-1);
+		}
+	if (!is_bitmap && !bp[0]) {
+		PAGE_INIT(p);
+	} else
+		if (hashp->LORDER != BYTE_ORDER) {
+			register int i, max;
+
+			if (is_bitmap) {
+				max = hashp->BSIZE >> 2; /* divide by 4 */
+				for (i = 0; i < max; i++)
+					M_32_SWAP(((int *)p)[i]);
+			} else {
+				M_16_SWAP(bp[0]);
+				max = bp[0] + 2;
+				for (i = 1; i <= max; i++)
+					M_16_SWAP(bp[i]);
+			}
+		}
+	return (0);
+}
+
+/*
+ * Write page p to disk, opening a temporary file first if none is open.
+ * NOTE(review): p is byte-swapped in place and not swapped back -- confirm
+ * no caller keeps using p after a write when LORDER != BYTE_ORDER.
+ * Returns:
+ *	 0 ==> OK, -1 ==> failure (errno is set)
+ */
+extern int
+__put_page(hashp, p, bucket, is_bucket, is_bitmap)
+	HTAB *hashp;
+	char *p;
+	u_int32_t bucket;
+	int is_bucket, is_bitmap;
+{
+	register int fd, page, size;
+	int wsize;
+
+	size = hashp->BSIZE;
+	if ((hashp->fp == -1) && open_temp(hashp))
+		return (-1);
+	fd = hashp->fp;
+
+	if (hashp->LORDER != BYTE_ORDER) {	/* convert to the file's order */
+		register int i;
+		register int max;
+
+		if (is_bitmap) {
+			max = hashp->BSIZE >> 2;	/* divide by 4 */
+			for (i = 0; i < max; i++)
+				M_32_SWAP(((int *)p)[i]);
+		} else {
+			/* Read the count before the loop swaps slot 0. */
+			max = ((u_int16_t *)p)[0] + 2;
+			for (i = 0; i <= max; i++)
+				M_16_SWAP(((u_int16_t *)p)[i]);
+		}
+	}
+	if (is_bucket)		/* bucket holds a bucket number ... */
+		page = BUCKET_TO_PAGE(bucket);
+	else			/* ... or an overflow address */
+		page = OADDR_TO_PAGE(bucket);
+	if ((lseek(fd, (off_t)page << hashp->BSHIFT, SEEK_SET) == -1) ||
+	    ((wsize = write(fd, p, size)) == -1))
+		/* Errno is set */
+		return (-1);
+	if (wsize != size) {	/* short write */
+		errno = EFTYPE;
+		return (-1);
+	}
+	return (0);
+}
+
+#define BYTE_MASK	((1 << INT_BYTE_SHIFT) -1)
+/*
+ * Initialize a new bitmap page and install it as map ndx for page pnum.
+ * Bitmap pages are left in memory.  Returns 0 on success, 1 on error.
+ */
+extern int
+__ibitmap(hashp, pnum, nbits, ndx)
+	HTAB *hashp;
+	int pnum, nbits, ndx;
+{
+	u_int32_t *ip;
+	int clearbytes, clearints;
+
+	if ((ip = (u_int32_t *)malloc(hashp->BSIZE)) == NULL)
+		return (1);
+	hashp->nmaps++;
+	clearints = ((nbits - 1) >> INT_BYTE_SHIFT) + 1; /* words holding nbits */
+	clearbytes = clearints << INT_TO_BYTE;
+	(void)memset((char *)ip, 0, clearbytes);	/* leading words: all clear */
+	(void)memset(((char *)ip) + clearbytes, 0xFF,	/* the rest: all set */
+	    hashp->BSIZE - clearbytes);
+	ip[clearints - 1] = ALL_SET << (nbits & BYTE_MASK); /* set the tail bits */
+	SETBIT(ip, 0);		/* bit 0 is always marked in use */
+	hashp->BITMAPS[ndx] = (u_int16_t)pnum;
+	hashp->mapp[ndx] = ip;
+	return (0);
+}
+
+static u_int32_t
+first_free(map)
+	u_int32_t map;
+{
+	register u_int32_t pos, probe;
+
+	/* Scan upward from bit 0 for the lowest clear bit of map. */
+	pos = 0;
+	for (probe = 0x1; pos < BITS_PER_MAP && (map & probe) != 0; pos++)
+		probe <<= 1;
+
+	/* pos == BITS_PER_MAP here means every bit was set. */
+	return (pos);
+}
+
+static u_int16_t		/* new overflow page address, 0 on failure */
+overflow_page(hashp)
+	HTAB *hashp;
+{
+	register u_int32_t *freep;
+	register int max_free, offset, splitnum;
+	u_int16_t addr;
+	int bit, first_page, free_bit, free_page, i, in_use_bits, j;
+#ifdef DEBUG2
+	int tmp1, tmp2;
+#endif
+	splitnum = hashp->OVFL_POINT;
+	max_free = hashp->SPARES[splitnum];
+
+	/* Map page and bit within it of the last bit in use. */
+	free_page = (max_free - 1) >> (hashp->BSHIFT + BYTE_SHIFT);
+	free_bit = (max_free - 1) & ((hashp->BSIZE << BYTE_SHIFT) - 1);
+
+	/* Look through all the free maps to find the first free block */
+	first_page = hashp->LAST_FREED >>(hashp->BSHIFT + BYTE_SHIFT);
+	for ( i = first_page; i <= free_page; i++ ) {
+		if (!(freep = (u_int32_t *)hashp->mapp[i]) &&
+		    !(freep = fetch_bitmap(hashp, i)))
+			return (0);
+		if (i == free_page)
+			in_use_bits = free_bit;
+		else
+			in_use_bits = (hashp->BSIZE << BYTE_SHIFT) - 1;
+		
+		if (i == first_page) {
+			bit = hashp->LAST_FREED &
+			    ((hashp->BSIZE << BYTE_SHIFT) - 1);
+			j = bit / BITS_PER_MAP;
+			bit = bit & ~(BITS_PER_MAP - 1); /* round down to a word */
+		} else {
+			bit = 0;
+			j = 0;
+		}
+		for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP)
+			if (freep[j] != ALL_SET)	/* word has a clear bit */
+				goto found;
+	}
+
+	/* No Free Page Found */
+	hashp->LAST_FREED = hashp->SPARES[splitnum];
+	hashp->SPARES[splitnum]++;
+	offset = hashp->SPARES[splitnum] -
+	    (splitnum ? hashp->SPARES[splitnum - 1] : 0);
+
+#define	OVMSG	"HASH: Out of overflow pages.  Increase page size\n"
+	if (offset > SPLITMASK) {
+		if (++splitnum >= NCACHED) {
+			(void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1);
+			return (0);
+		}
+		hashp->OVFL_POINT = splitnum;
+		hashp->SPARES[splitnum] = hashp->SPARES[splitnum-1];
+		hashp->SPARES[splitnum-1]--;
+		offset = 1;
+	}
+
+	/* Check if we need to allocate a new bitmap page */
+	if (free_bit == (hashp->BSIZE << BYTE_SHIFT) - 1) {
+		free_page++;
+		if (free_page >= NCACHED) {
+			(void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1);
+			return (0);
+		}
+		/*
+		 * This is tricky.  The 1 indicates that you want the new page
+		 * allocated with 1 clear bit.  Actually, you are going to
+		 * allocate 2 pages from this map.  The first is going to be
+		 * the map page, the second is the overflow page we were
+		 * looking for.  The __ibitmap routine automatically sets
+		 * the first bit of itself to indicate that the bitmap itself
+		 * is in use.  We would explicitly set the second bit, but
+		 * don't have to if we tell __ibitmap not to leave it clear
+		 * in the first place.
+		 */
+		if (__ibitmap(hashp,
+		    (int)OADDR_OF(splitnum, offset), 1, free_page))
+			return (0);
+		hashp->SPARES[splitnum]++;
+#ifdef DEBUG2
+		free_bit = 2;
+#endif
+		offset++;
+		if (offset > SPLITMASK) {
+			if (++splitnum >= NCACHED) {
+				(void)write(STDERR_FILENO, OVMSG,
+				    sizeof(OVMSG) - 1);
+				return (0);
+			}
+			hashp->OVFL_POINT = splitnum;
+			hashp->SPARES[splitnum] = hashp->SPARES[splitnum-1];
+			hashp->SPARES[splitnum-1]--;
+			offset = 0;
+		}
+	} else {
+		/*
+		 * Free_bit addresses the last used bit.  Bump it to address
+		 * the first available bit.
+		 */
+		free_bit++;
+		SETBIT(freep, free_bit); /* NOTE(review): assumes the scan ran */
+	}
+
+	/* Calculate address of the new overflow page */
+	addr = OADDR_OF(splitnum, offset);
+#ifdef DEBUG2
+	(void)fprintf(stderr, "OVERFLOW_PAGE: ADDR: %d BIT: %d PAGE %d\n",
+	    addr, free_bit, free_page);
+#endif
+	return (addr);
+
+found:
+	bit = bit + first_free(freep[j]);	/* exact bit within map page i */
+	SETBIT(freep, bit);
+#ifdef DEBUG2
+	tmp1 = bit;
+	tmp2 = i;
+#endif
+	/*
+	 * Bits are addressed starting with 0, but overflow pages are addressed
+	 * beginning at 1. Bit is a bit address number, so we need to increment
+	 * it to convert it to a page number.
+	 */
+	bit = 1 + bit + (i * (hashp->BSIZE << BYTE_SHIFT));
+	if (bit >= hashp->LAST_FREED)
+		hashp->LAST_FREED = bit - 1;
+
+	/* Calculate the split number for this page */
+	for (i = 0; (i < splitnum) && (bit > hashp->SPARES[i]); i++);
+	offset = (i ? bit - hashp->SPARES[i - 1] : bit);
+	if (offset >= SPLITMASK)
+		return (0);	/* Out of overflow pages */
+	addr = OADDR_OF(i, offset);
+#ifdef DEBUG2
+	(void)fprintf(stderr, "OVERFLOW_PAGE: ADDR: %d BIT: %d PAGE %d\n",
+	    addr, tmp1, tmp2);
+#endif
+
+	/* Allocate and return the overflow page */
+	return (addr);
+}
+
+/*
+ * Mark this overflow page as free in its bitmap and reclaim its buffer.
+ */
+extern void
+__free_ovflpage(hashp, obufp)
+	HTAB *hashp;
+	BUFHEAD *obufp;
+{
+	register u_int16_t addr;
+	u_int32_t *freep;
+	int bit_address, free_page, free_bit;
+	u_int16_t ndx;
+
+	addr = obufp->addr;
+#ifdef DEBUG1
+	(void)fprintf(stderr, "Freeing %d\n", addr);
+#endif
+	ndx = (((u_int16_t)addr) >> SPLITSHIFT);	/* split point of addr */
+	bit_address =
+	    (ndx ? hashp->SPARES[ndx - 1] : 0) + (addr & SPLITMASK) - 1;
+	if (bit_address < hashp->LAST_FREED)
+		hashp->LAST_FREED = bit_address;
+	free_page = (bit_address >> (hashp->BSHIFT + BYTE_SHIFT));
+	free_bit = bit_address & ((hashp->BSIZE << BYTE_SHIFT) - 1);
+
+	if (!(freep = hashp->mapp[free_page]))
+		freep = fetch_bitmap(hashp, free_page);
+	/*
+	 * A NULL map should never happen (the bitmap already had pages
+	 * allocated off it); trap it under DEBUG, else skip the clear.
+	 */
+#ifdef DEBUG
+	assert(freep != NULL);
+#endif
+	if (freep != NULL) {
+		CLRBIT(freep, free_bit);
+	}
+#ifdef DEBUG2
+	(void)fprintf(stderr, "FREE_OVFLPAGE: ADDR: %d BIT: %d PAGE %d\n",
+	    obufp->addr, free_bit, free_page);
+#endif
+	__reclaim_buf(hashp, obufp);
+}
+
+/*
+ * Create an already-unlinked temporary file and store its fd in hashp->fp.
+ * Returns:
+ *	 0 success, -1 failure
+ */
+static int
+open_temp(hashp)
+	HTAB *hashp;
+{
+	sigset_t set, oset;
+	char namestr[] = "_hashXXXXXX";	/* mkstemp rewrites it: not static */
+
+	/* Block signals; make sure file goes away at process exit. */
+	(void)sigfillset(&set);
+	(void)sigprocmask(SIG_BLOCK, &set, &oset);
+	if ((hashp->fp = mkstemp(namestr)) != -1) {
+		(void)unlink(namestr);
+		(void)fcntl(hashp->fp, F_SETFD, 1);
+	}
+	(void)sigprocmask(SIG_SETMASK, &oset, (sigset_t *)NULL);
+	return (hashp->fp != -1 ? 0 : -1);
+}
+
+/*
+ * Add key/val to a page whose last entry is an overflow pair: the new pair
+ * takes that pair's slots and it is re-appended.  Caller ensures it fits.
+ */
+static void
+squeeze_key(sp, key, val)
+	u_int16_t *sp;
+	const DBT *key, *val;
+{
+	register char *p;
+	u_int16_t free_space, n, off, pageno;
+
+	p = (char *)sp;
+	n = sp[0];
+	free_space = FREESPACE(sp);	/* saved: sp[n + 1] is reused below */
+	off = OFFSET(sp);		/* saved: sp[n + 2] is reused below */
+
+	pageno = sp[n - 1];		/* page number of the overflow pair */
+	off -= key->size;
+	sp[n - 1] = off;		/* key offset replaces it */
+	memmove(p + off, key->data, key->size);
+	off -= val->size;
+	sp[n] = off;			/* data offset replaces OVFLPAGE */
+	memmove(p + off, val->data, val->size);
+	sp[0] = n + 2;
+	sp[n + 1] = pageno;		/* overflow pair moves to the end */
+	sp[n + 2] = OVFLPAGE;
+	FREESPACE(sp) = free_space - PAIRSIZE(key, val);  /* now sp[n + 3] */
+	OFFSET(sp) = off;				  /* now sp[n + 4] */
+}
+
+static u_int32_t *
+fetch_bitmap(hashp, ndx)
+	HTAB *hashp;
+	int ndx;
+{
+	u_int32_t *map;
+
+	/* Load bitmap ndx from disk; publish it in mapp[] only on success. */
+	if (ndx >= hashp->nmaps || (map = malloc(hashp->BSIZE)) == NULL)
+		return (NULL);
+	if (__get_page(hashp, (char *)map, hashp->BITMAPS[ndx], 0, 1, 1)) {
+		free(map);	/* mapp[ndx] never pointed here: no dangling */
+		return (NULL);
+	}
+	return (hashp->mapp[ndx] = map);
+}
+
+#ifdef DEBUG4
+int	/* NOTE(review): declared int but no value is returned */
+print_chain(addr)
+	int addr;
+{
+	BUFHEAD *bufp;
+	short *bp, oaddr;
+
+	(void)fprintf(stderr, "%d ", addr);
+	bufp = __get_buf(hashp, addr, NULL, 0);	/* NOTE(review): hashp undeclared here */
+	bp = (short *)bufp->page;
+	while (bp[0] && ((bp[bp[0]] == OVFLPAGE) ||	/* last entry is an overflow pair */
+		((bp[0] > 2) && bp[2] < REAL_KEY))) {
+		oaddr = bp[bp[0] - 1];		/* address stored before the marker */
+		(void)fprintf(stderr, "%d ", (int)oaddr);
+		bufp = __get_buf(hashp, (int)oaddr, bufp, 0);
+		bp = (short *)bufp->page;
+	}
+	(void)fprintf(stderr, "\n");
+}
+#endif
diff --git a/db/hash/ndbm.c b/db/hash/ndbm.c
new file mode 100644
index 0000000000..2cbbe91368
--- /dev/null
+++ b/db/hash/ndbm.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)ndbm.c	8.4 (Berkeley) 7/21/94";
+#endif /* LIBC_SCCS and not lint */
+
+/*
+ * This package provides a dbm compatible interface to the new hashing
+ * package described in db(3).
+ */
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#include <ndbm.h>
+#include "hash.h"
+
+/*
+ * Open the hash database FILE + DBM_SUFFIX.  Returns the DBM handle on
+ * success, NULL on failure or when the name would not fit in path[].
+ */
+extern DBM *
+dbm_open(file, flags, mode)
+	const char *file;
+	int flags, mode;
+{
+	HASHINFO info;
+	char path[MAXPATHLEN];
+
+	if (strlen(file) + strlen(DBM_SUFFIX) >= sizeof(path))
+		return (NULL);	/* would overflow path[] below */
+	info.bsize = 4096;
+	info.ffactor = 40;
+	info.nelem = 1;
+	info.cachesize = 0;
+	info.hash = NULL;
+	info.lorder = 0;
+	(void)strcat(strcpy(path, file), DBM_SUFFIX);
+	return ((DBM *)__hash_open(path, flags, mode, &info, 0));
+}
+
+extern void
+dbm_close(db)
+	DBM *db;
+{
+	(void)db->close(db);	/* the close method's status is dropped */
+}
+
+/*
+ * Returns:
+ *	the datum filled in by the get method when it returns 0
+ *	a datum with NULL dptr and zero dsize otherwise
+ */
+extern datum
+dbm_fetch(db, key)
+	DBM *db;
+	datum key;
+{
+	datum found;
+	int rc;
+
+	rc = (db->get)(db, (DBT *)&key, (DBT *)&found, 0);
+	if (rc != 0) {
+		found.dptr = NULL;
+		found.dsize = 0;
+	}
+	return (found);
+}
+
+/*
+ * Returns:
+ *	the first key when the seq method returns 0
+ *	a datum with NULL dptr and zero dsize otherwise
+ */
+extern datum
+dbm_firstkey(db)
+	DBM *db;
+{
+	datum retdata, retkey;
+
+	if ((db->seq)(db, (DBT *)&retkey, (DBT *)&retdata, R_FIRST)) {
+		retkey.dptr = NULL;
+		retkey.dsize = 0;	/* never hand back an unset size */
+	}
+	return (retkey);
+}
+
+/*
+ * Returns:
+ *	the next key when the seq method returns 0
+ *	a datum with NULL dptr and zero dsize otherwise
+ */
+extern datum
+dbm_nextkey(db)
+	DBM *db;
+{
+	datum retdata, retkey;
+
+	if ((db->seq)(db, (DBT *)&retkey, (DBT *)&retdata, R_NEXT)) {
+		retkey.dptr = NULL;
+		retkey.dsize = 0;	/* never hand back an unset size */
+	}
+	return (retkey);
+}
+/*
+ * Remove key from the database.
+ * Returns:
+ *	 0 on success
+ *	-1 when the underlying del method returns nonzero
+ */
+extern int
+dbm_delete(db, key)
+	DBM *db;
+	datum key;
+{
+	int failed;
+
+	/* Collapse every nonzero del status into a plain -1. */
+	failed = (db->del)(db, (DBT *)&key, 0) != 0;
+
+	return (failed ? -1 : 0);
+}
+
+/*
+ * Store content under key and return the put method's status: 0 on
+ * success, <0 on failure, 1 if DBM_INSERT and the entry already exists.
+ */
+extern int
+dbm_store(db, key, content, flags)
+	DBM *db;
+	datum key, content;
+	int flags;
+{
+	int how;
+
+	how = (flags == DBM_INSERT) ? R_NOOVERWRITE : 0;
+	return ((db->put)(db, (DBT *)&key, (DBT *)&content, how));
+}
+
+extern int
+dbm_error(db)
+	DBM *db;
+{
+	/*
+	 * Report the errno field kept in the HTAB behind this handle.
+	 */
+	return (((HTAB *)db->internal)->errno);
+}
+
+extern int
+dbm_clearerr(db)
+	DBM *db;
+{
+	/* Reset the errno field of the HTAB behind this handle. */
+	((HTAB *)db->internal)->errno = 0;
+
+	/* Always reports success. */
+	return (0);
+}
+
+extern int
+dbm_dirfno(db)
+	DBM *db;
+{
+	return(((HTAB *)db->internal)->fp);	/* fd of the hash file */
+}
diff --git a/db/hash/page.h b/db/hash/page.h
new file mode 100644
index 0000000000..0fc0d5a3e9
--- /dev/null
+++ b/db/hash/page.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)page.h	8.2 (Berkeley) 5/31/94
+ */
+
+/*
+ * Definitions for hashing page file format.
+ */
+
+/*
+ * routines dealing with a data page
+ *
+ * page format:
+ *	+------------------------------+
+ * p	| n | keyoff | datoff | keyoff |
+ * 	+------------+--------+--------+
+ *	| datoff | free  |  ptr  | --> |
+ *	+--------+---------------------+
+ *	|	 F R E E A R E A       |
+ *	+--------------+---------------+
+ *	|  <---- - - - | data          |
+ *	+--------+-----+----+----------+
+ *	|  key   | data     | key      |
+ *	+--------+----------+----------+
+ *
+ * Pointer to the free space is always:  p[p[0] + 2]
+ * Amount of free space on the page is:  p[p[0] + 1]
+ */
+
+/*
+ * How many bytes required for this pair?
+ *	2 shorts in the table at the top of the page + room for the
+ *	key and room for the data
+ *
+ * We prohibit entering a pair on a page unless there is also room to append
+ * an overflow page. The reason for this is that you can get in a situation
+ * where a single key/data pair fits on a page, but you can't append an
+ * overflow page and later you'd have to split the key/data and handle like
+ * a big pair.
+ * You might as well do this up front.
+ */
+
+#define	PAIRSIZE(K,D)	(2*sizeof(u_int16_t) + (K)->size + (D)->size)
+#define BIGOVERHEAD	(4*sizeof(u_int16_t))
+#define KEYSIZE(K)	(4*sizeof(u_int16_t) + (K)->size)
+#define OVFLSIZE	(2*sizeof(u_int16_t))	/* two index slots */
+#define FREESPACE(P)	((P)[(P)[0]+1])		/* free bytes on the page */
+#define	OFFSET(P)	((P)[(P)[0]+2])		/* start of the free space */
+#define PAIRFITS(P,K,D) \
+	(((P)[2] >= REAL_KEY) && \
+	    (PAIRSIZE((K),(D)) + OVFLSIZE) <= FREESPACE((P)))
+#define PAGE_META(N)	(((N)+3) * sizeof(u_int16_t))
+
+typedef struct {
+	BUFHEAD *newp;		/* presumably the new, old and next page */
+	BUFHEAD *oldp;		/* buffers of a bucket split -- TODO confirm */
+	BUFHEAD *nextp;		/* against the code that fills this in */
+	u_int16_t next_addr;	/* presumably nextp's page address */
+}       SPLIT_RETURN;
diff --git a/db/mpool.h b/db/mpool.h
new file mode 100644
index 0000000000..40d1022309
--- /dev/null
+++ b/db/mpool.h
@@ -0,0 +1,99 @@
+/*-
+ * Copyright (c) 1991, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)mpool.h	8.2 (Berkeley) 7/14/94
+ */
+
+#include <sys/queue.h>
+
+/*
+ * The memory pool scheme is a simple one.  Each in-memory page is referenced
+ * by a bucket which is threaded in up to two of three ways.  All active pages
+ * are threaded on a hash chain (hashed by page number) and an lru chain.
+ * Inactive pages are threaded on a free chain.  Each reference to a memory
+ * pool is handed an opaque MPOOL cookie which stores all of this information.
+ */
+#define	HASHSIZE	128		/* number of hash chains */
+#define	HASHKEY(pgno)	(((pgno) - 1) % HASHSIZE)
+
+/* The BKT structures are the elements of the queues. */
+typedef struct _bkt {
+	CIRCLEQ_ENTRY(_bkt) hq;		/* hash queue linkage */
+	CIRCLEQ_ENTRY(_bkt) q;		/* lru queue linkage */
+	void    *page;			/* page data */
+	pgno_t   pgno;			/* page number */
+
+#define	MPOOL_DIRTY	0x01		/* page needs to be written */
+#define	MPOOL_PINNED	0x02		/* page is pinned into memory */
+	u_int8_t flags;			/* MPOOL_DIRTY | MPOOL_PINNED */
+} BKT;
+
+typedef struct MPOOL {
+	CIRCLEQ_HEAD(_lqh, _bkt) lqh;	/* lru queue head */
+					/* hash queues, indexed by HASHKEY */
+	CIRCLEQ_HEAD(_hqh, _bkt) hqh[HASHSIZE];
+	pgno_t	curcache;		/* current number of cached pages */
+	pgno_t	maxcache;		/* max number of cached pages */
+	pgno_t	npages;			/* number of pages in the file */
+	u_long	pagesize;		/* file page size */
+	int	fd;			/* file descriptor */
+					/* page in conversion routine */
+	void    (*pgin) __P((void *, pgno_t, void *));
+					/* page out conversion routine */
+	void    (*pgout) __P((void *, pgno_t, void *));
+	void	*pgcookie;		/* cookie for page in/out routines */
+#ifdef STATISTICS
+	u_long	cachehit;		/* mpool_look found the page */
+	u_long	cachemiss;		/* mpool_look did not find it */
+	u_long	pagealloc;		/* BKTs malloc'd by mpool_bkt */
+	u_long	pageflush;		/* BKTs recycled by mpool_bkt */
+	u_long	pageget;		/* mpool_get requests */
+	u_long	pagenew;		/* mpool_new requests */
+	u_long	pageput;		/* mpool_put requests */
+	u_long	pageread;		/* page reads started by mpool_get */
+	u_long	pagewrite;		/* mpool_write calls */
+#endif
+} MPOOL;
+
+__BEGIN_DECLS
+MPOOL	*mpool_open __P((void *, int, pgno_t, pgno_t));
+void	 mpool_filter __P((MPOOL *, void (*)(void *, pgno_t, void *),
+	    void (*)(void *, pgno_t, void *), void *));
+void	*mpool_new __P((MPOOL *, pgno_t *));
+void	*mpool_get __P((MPOOL *, pgno_t, u_int));
+int	 mpool_put __P((MPOOL *, void *, u_int));
+int	 mpool_sync __P((MPOOL *));
+int	 mpool_close __P((MPOOL *));
+#ifdef STATISTICS
+void	 mpool_stat __P((MPOOL *));
+#endif
+__END_DECLS
diff --git a/db/mpool/mpool.c b/db/mpool/mpool.c
new file mode 100644
index 0000000000..a61041e091
--- /dev/null
+++ b/db/mpool/mpool.c
@@ -0,0 +1,463 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)mpool.c	8.5 (Berkeley) 7/26/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <db.h>
+
+#define	__MPOOLINTERFACE_PRIVATE
+#include <mpool.h>
+
+static BKT *mpool_bkt __P((MPOOL *));
+static BKT *mpool_look __P((MPOOL *, pgno_t));
+static int  mpool_write __P((MPOOL *, BKT *));
+
+/*
+ * mpool_open --
+ *	Create the MPOOL cookie for the regular file on fd; key is unused.
+ */
+MPOOL *
+mpool_open(key, fd, pagesize, maxcache)
+	void *key;
+	int fd;
+	pgno_t pagesize, maxcache;
+{
+	struct stat st;
+	MPOOL *pool;
+	int slot;
+
+	/*
+	 * Only regular files are supported (anything else gets ESPIPE).
+	 *
+	 * XXX
+	 * We don't currently handle pipes, although we should.
+	 */
+	if (fstat(fd, &st) != 0)
+		return (NULL);
+	if (!S_ISREG(st.st_mode)) {
+		errno = ESPIPE;
+		return (NULL);
+	}
+
+	/* calloc() zeroes the cookie; fill in the parts that are not zero. */
+	if ((pool = (MPOOL *)calloc(1, sizeof(MPOOL))) == NULL)
+		return (NULL);
+	pool->fd = fd;
+	pool->pagesize = pagesize;
+	pool->npages = st.st_size / pagesize;
+	pool->maxcache = maxcache;
+	CIRCLEQ_INIT(&pool->lqh);
+	for (slot = 0; slot < HASHSIZE; ++slot)
+		CIRCLEQ_INIT(&pool->hqh[slot]);
+	return (pool);
+}
+
+/*
+ * mpool_filter --
+ *	Register the page-in/page-out hooks and the cookie passed to them.
+ */
+void
+mpool_filter(mp, pgin, pgout, pgcookie)
+	MPOOL *mp;
+	void (*pgin) __P((void *, pgno_t, void *));
+	void (*pgout) __P((void *, pgno_t, void *));
+	void *pgcookie;
+{
+	mp->pgcookie = pgcookie;
+	mp->pgout = pgout;
+	mp->pgin = pgin;
+}
+	
+/*
+ * mpool_new --
+ *	Add a page at the end of the file; it is returned pinned.
+ */
+void *
+mpool_new(mp, pgnoaddr)
+	MPOOL *mp;
+	pgno_t *pgnoaddr;		/* out: number given to the new page */
+{
+	struct _hqh *head;
+	BKT *bp;
+
+	if (mp->npages == MAX_PAGE_NUMBER) {	/* page numbers exhausted */
+		(void)fprintf(stderr, "mpool_new: page allocation overflow.\n");
+		abort();
+	}
+#ifdef STATISTICS
+	++mp->pagenew;
+#endif
+	/*
+	 * Get a BKT from the cache.  Assign a new page number, attach
+	 * it to the head of the hash chain, the tail of the lru chain,
+	 * and return.
+	 */
+	if ((bp = mpool_bkt(mp)) == NULL)
+		return (NULL);
+	*pgnoaddr = bp->pgno = mp->npages++;	/* next number past the end */
+	bp->flags = MPOOL_PINNED;	/* page contents are not set up here */
+
+	head = &mp->hqh[HASHKEY(bp->pgno)];
+	CIRCLEQ_INSERT_HEAD(head, bp, hq);
+	CIRCLEQ_INSERT_TAIL(&mp->lqh, bp, q);
+	return (bp->page);
+}
+
+/*
+ * mpool_get
+ *	Return page pgno pinned, reading it in if not cached; NULL on error.
+ */
+void *
+mpool_get(mp, pgno, flags)
+	MPOOL *mp;
+	pgno_t pgno;
+	u_int flags;				/* XXX not used? */
+{
+	struct _hqh *head;
+	BKT *bp;
+	off_t off;
+	int nr, sverrno;
+
+	/* Check for attempt to retrieve a non-existent page. */
+	if (pgno >= mp->npages) {
+		errno = EINVAL;
+		return (NULL);
+	}
+
+#ifdef STATISTICS
+	++mp->pageget;
+#endif
+
+	/* Check for a page that is cached. */
+	if ((bp = mpool_look(mp, pgno)) != NULL) {
+#ifdef DEBUG
+		if (bp->flags & MPOOL_PINNED) {
+			(void)fprintf(stderr,
+			    "mpool_get: page %d already pinned\n", bp->pgno);
+			abort();
+		}
+#endif
+		/* Move to the head of its hash chain and the lru tail. */
+		head = &mp->hqh[HASHKEY(bp->pgno)];
+		CIRCLEQ_REMOVE(head, bp, hq);
+		CIRCLEQ_INSERT_HEAD(head, bp, hq);
+		CIRCLEQ_REMOVE(&mp->lqh, bp, q);
+		CIRCLEQ_INSERT_TAIL(&mp->lqh, bp, q);
+
+		/* Return a pinned page. */
+		bp->flags |= MPOOL_PINNED;
+		return (bp->page);
+	}
+
+	/* Get a page from the cache. */
+	if ((bp = mpool_bkt(mp)) == NULL)
+		return (NULL);
+
+	/* Read in the contents. */
+#ifdef STATISTICS
+	++mp->pageread;
+#endif
+	off = mp->pagesize * pgno;
+	nr = -1;
+	if (lseek(mp->fd, off, SEEK_SET) == off)
+		nr = read(mp->fd, bp->page, mp->pagesize);
+	if (nr != mp->pagesize) {
+		sverrno = nr >= 0 ? EFTYPE : errno;	/* short read or error */
+		free(bp);		/* on no queue yet: don't leak it */
+		--mp->curcache;
+		errno = sverrno;
+		return (NULL);
+	}
+
+	/* Set the page number, pin the page. */
+	bp->pgno = pgno;
+	bp->flags = MPOOL_PINNED;
+
+	/*
+	 * Add the page to the head of the hash chain and the tail
+	 * of the lru chain.
+	 */
+	head = &mp->hqh[HASHKEY(bp->pgno)];
+	CIRCLEQ_INSERT_HEAD(head, bp, hq);
+	CIRCLEQ_INSERT_TAIL(&mp->lqh, bp, q);
+
+	/* Run through the user's filter. */
+	if (mp->pgin != NULL)
+		(mp->pgin)(mp->pgcookie, bp->pgno, bp->page);
+
+	return (bp->page);
+}
+
+/*
+ * mpool_put
+ *	Unpin a page; MPOOL_DIRTY in flags also marks it as modified.
+ */
+int
+mpool_put(mp, page, flags)
+	MPOOL *mp;
+	void *page;
+	u_int flags;
+{
+	BKT *owner;
+
+#ifdef STATISTICS
+	++mp->pageput;
+#endif
+	/* Step back over the BKT header that sits just before the page. */
+	owner = (BKT *)page - 1;
+#ifdef DEBUG
+	if ((owner->flags & MPOOL_PINNED) == 0) {
+		(void)fprintf(stderr,
+		    "mpool_put: page %d not pinned\n", owner->pgno);
+		abort();
+	}
+#endif
+	owner->flags = (owner->flags & ~MPOOL_PINNED) | (flags & MPOOL_DIRTY);
+	return (RET_SUCCESS);
+}
+
+/*
+ * mpool_close
+ *	Free every cached page and the pool itself; nothing is written.
+ */
+int
+mpool_close(mp)
+	MPOOL *mp;
+{
+	BKT *victim;
+
+	/* Drain the lru queue; each BKT was allocated with its page. */
+	while (mp->lqh.cqh_first != (void *)&mp->lqh) {
+		victim = mp->lqh.cqh_first;
+		CIRCLEQ_REMOVE(&mp->lqh, victim, q);
+		free(victim);
+	}
+
+	free(mp);		/* finally the MPOOL cookie */
+	return (RET_SUCCESS);
+}
+
+/*
+ * mpool_sync
+ *	Write every dirty page, then fsync the file.
+ */
+int
+mpool_sync(mp)
+	MPOOL *mp;
+{
+	BKT *cur;
+
+	/* Stop at the first page that fails to write. */
+	cur = mp->lqh.cqh_first;
+	while (cur != (void *)&mp->lqh) {
+		if ((cur->flags & MPOOL_DIRTY) != 0 &&
+		    mpool_write(mp, cur) == RET_ERROR)
+			return (RET_ERROR);
+		cur = cur->q.cqe_next;
+	}
+	return (fsync(mp->fd) != 0 ? RET_ERROR : RET_SUCCESS);
+}
+
+/*
+ * mpool_bkt
+ *	Get a BKT that is on no queue: recycle an unpinned one or malloc.
+ */
+static BKT *
+mpool_bkt(mp)
+	MPOOL *mp;
+{
+	struct _hqh *head;
+	BKT *bp;
+
+	/* If under the max cached, always create a new page. */
+	if (mp->curcache < mp->maxcache)
+		goto new;
+
+	/*
+	 * If the cache is max'd out, walk the lru list for a buffer we
+	 * can flush.  If we find one, write it (if necessary) and take it
+	 * off any lists.  If we don't find anything we grow the cache anyway.
+	 * The cache never shrinks.
+	 */
+	for (bp = mp->lqh.cqh_first;
+	    bp != (void *)&mp->lqh; bp = bp->q.cqe_next)
+		if (!(bp->flags & MPOOL_PINNED)) {
+			/* Flush if dirty. */
+			if (bp->flags & MPOOL_DIRTY &&
+			    mpool_write(mp, bp) == RET_ERROR)
+				return (NULL);	/* bp stays on its queues */
+#ifdef STATISTICS
+			++mp->pageflush;
+#endif
+			/* Remove from the hash and lru queues. */
+			head = &mp->hqh[HASHKEY(bp->pgno)];
+			CIRCLEQ_REMOVE(head, bp, hq);
+			CIRCLEQ_REMOVE(&mp->lqh, bp, q);
+#ifdef DEBUG
+			{ void *spage;	/* poison the BKT but keep its page pointer */
+				spage = bp->page;
+				memset(bp, 0xff, sizeof(BKT) + mp->pagesize);
+				bp->page = spage;
+			}
+#endif
+			return (bp);	/* caller sets pgno and flags */
+		}
+
+new:	if ((bp = (BKT *)malloc(sizeof(BKT) + mp->pagesize)) == NULL)
+		return (NULL);
+#ifdef STATISTICS
+	++mp->pagealloc;
+#endif
+#if defined(DEBUG) || defined(PURIFY)
+	memset(bp, 0xff, sizeof(BKT) + mp->pagesize);
+#endif
+	bp->page = (char *)bp + sizeof(BKT);	/* page data follows the header */
+	++mp->curcache;
+	return (bp);
+}
+
+/*
+ * mpool_write
+ *	Write a page to disk; the cached copy is left in in-core format.
+ */
+static int
+mpool_write(mp, bp)
+	MPOOL *mp;
+	BKT *bp;
+{
+	off_t off;
+	int ok;
+
+#ifdef STATISTICS
+	++mp->pagewrite;
+#endif
+	/* Convert to on-disk format with the user's output filter. */
+	if (mp->pgout)
+		(mp->pgout)(mp->pgcookie, bp->pgno, bp->page);
+	off = mp->pagesize * bp->pgno;
+	ok = lseek(mp->fd, off, SEEK_SET) == off &&
+	    write(mp->fd, bp->page, mp->pagesize) == mp->pagesize;
+	/* Still cached: pgin (assumed inverse of pgout) restores the copy. */
+	if (mp->pgin)
+		(mp->pgin)(mp->pgcookie, bp->pgno, bp->page);
+	if (ok)
+		bp->flags &= ~MPOOL_DIRTY;
+	return (ok ? RET_SUCCESS : RET_ERROR);
+}
+
+/*
+ * mpool_look
+ *	Find pgno on its hash chain; NULL when it is not cached.
+ */
+static BKT *
+mpool_look(mp, pgno)
+	MPOOL *mp;
+	pgno_t pgno;
+{
+	struct _hqh *chain;
+	BKT *cur;
+
+	/* Walk the chain until pgno turns up or we are back at the head. */
+	chain = &mp->hqh[HASHKEY(pgno)];
+	cur = chain->cqh_first;
+	while (cur != (void *)chain && cur->pgno != pgno)
+		cur = cur->hq.cqe_next;
+#ifdef STATISTICS
+	if (cur != (void *)chain)
+		++mp->cachehit;
+	else
+		++mp->cachemiss;
+#endif
+	return (cur != (void *)chain ? cur : NULL);
+}
+
+#ifdef STATISTICS
+/*
+ * mpool_stat
+ *	Print the cache statistics and the lru page list to stderr.
+ */
+void
+mpool_stat(mp)
+	MPOOL *mp;
+{
+	BKT *bp;
+	int cnt;
+	char *sep;
+
+	/* pgno_t values are cast so they really match the %lu conversions. */
+	(void)fprintf(stderr, "%lu pages in the file\n", (u_long)mp->npages);
+	(void)fprintf(stderr,
+	    "page size %lu, cacheing %lu pages of %lu page max cache\n",
+	    mp->pagesize, (u_long)mp->curcache, (u_long)mp->maxcache);
+	(void)fprintf(stderr, "%lu page puts, %lu page gets, %lu page new\n",
+	    mp->pageput, mp->pageget, mp->pagenew);
+	(void)fprintf(stderr, "%lu page allocs, %lu page flushes\n",
+	    mp->pagealloc, mp->pageflush);
+	if (mp->cachehit + mp->cachemiss)
+		(void)fprintf(stderr,
+		    "%.0f%% cache hit rate (%lu hits, %lu misses)\n",
+		    ((double)mp->cachehit / (mp->cachehit + mp->cachemiss))
+		    * 100, mp->cachehit, mp->cachemiss);
+	(void)fprintf(stderr, "%lu page reads, %lu page writes\n",
+	    mp->pageread, mp->pagewrite);
+
+	sep = "";
+	cnt = 0;
+	for (bp = mp->lqh.cqh_first;
+	    bp != (void *)&mp->lqh; bp = bp->q.cqe_next) {
+		(void)fprintf(stderr, "%s%lu", sep, (u_long)bp->pgno);
+		if (bp->flags & MPOOL_DIRTY)
+			(void)fprintf(stderr, "d");
+		if (bp->flags & MPOOL_PINNED)
+			(void)fprintf(stderr, "P");
+		if (++cnt == 10) {	/* ten page numbers per output line */
+			sep = "\n";
+			cnt = 0;
+		} else
+			sep = ", ";
+	}
+	(void)fprintf(stderr, "\n");
+}
+#endif
diff --git a/db/ndbm.h b/db/ndbm.h
new file mode 100644
index 0000000000..a545bca132
--- /dev/null
+++ b/db/ndbm.h
@@ -0,0 +1,77 @@
+/*-
+ * Copyright (c) 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)ndbm.h	8.1 (Berkeley) 6/2/93
+ */
+
+#ifndef _NDBM_H_
+#define	_NDBM_H_
+
+#include <db.h>
+
+/* Map dbm interface onto db(3). */
+#define DBM_RDONLY	O_RDONLY
+
+/* Flags to dbm_store(). */
+#define DBM_INSERT      0
+#define DBM_REPLACE     1
+
+/*
+ * The db(3) support for ndbm(3) always appends this suffix to the
+ * file name to avoid overwriting the user's original database.
+ */
+#define	DBM_SUFFIX	".db"
+
+typedef struct {
+	char *dptr;		/* presumably the item's bytes -- TODO confirm */
+	int dsize;		/* presumably the byte count of dptr -- TODO confirm */
+} datum;			/* item type passed to and returned by dbm_*() */
+
+typedef DB DBM;			/* a DBM handle is a db(3) DB */
+#define	dbm_pagfno(a)	DBM_PAGFNO_NOT_AVAILABLE /* NOTE(review): name not defined in this header */
+
+__BEGIN_DECLS
+void	 dbm_close __P((DBM *));
+int	 dbm_delete __P((DBM *, datum));
+datum	 dbm_fetch __P((DBM *, datum));
+datum	 dbm_firstkey __P((DBM *));
+long	 dbm_forder __P((DBM *, datum));
+datum	 dbm_nextkey __P((DBM *));
+DBM	*dbm_open __P((const char *, int, int));
+int	 dbm_store __P((DBM *, datum, datum, int));
+int	 dbm_dirfno __P((DBM *));
+__END_DECLS
+
+#endif /* !_NDBM_H_ */
diff --git a/db/recno/extern.h b/db/recno/extern.h
new file mode 100644
index 0000000000..feed434453
--- /dev/null
+++ b/db/recno/extern.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)extern.h	8.3 (Berkeley) 6/4/94
+ */
+
+#include "../btree/extern.h"
+
+int	 __rec_close __P((DB *));				/* rec_close.c */
+int	 __rec_delete __P((const DB *, const DBT *, u_int));	/* rec_delete.c */
+int	 __rec_dleaf __P((BTREE *, PAGE *, u_int32_t));		/* rec_delete.c */
+int	 __rec_fd __P((const DB *));				/* rec_open.c */
+int	 __rec_fmap __P((BTREE *, recno_t));			/* rec_get.c */
+int	 __rec_fout __P((BTREE *));
+int	 __rec_fpipe __P((BTREE *, recno_t));			/* rec_get.c */
+int	 __rec_get __P((const DB *, const DBT *, DBT *, u_int)); /* rec_get.c */
+int	 __rec_iput __P((BTREE *, recno_t, const DBT *, u_int));
+int	 __rec_put __P((const DB *dbp, DBT *, const DBT *, u_int));
+int	 __rec_ret __P((BTREE *, EPG *, recno_t, DBT *, DBT *));
+EPG	*__rec_search __P((BTREE *, recno_t, enum SRCHOP));
+int	 __rec_seq __P((const DB *, DBT *, DBT *, u_int));
+int	 __rec_sync __P((const DB *, u_int));			/* rec_close.c */
+int	 __rec_vmap __P((BTREE *, recno_t));			/* rec_get.c */
+int	 __rec_vout __P((BTREE *));
+int	 __rec_vpipe __P((BTREE *, recno_t));			/* rec_get.c */
diff --git a/db/recno/rec_close.c b/db/recno/rec_close.c
new file mode 100644
index 0000000000..16fb0b4eca
--- /dev/null
+++ b/db/recno/rec_close.c
@@ -0,0 +1,182 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_close.c	8.6 (Berkeley) 8/18/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <db.h>
+#include "recno.h"
+
+/*
+ * __REC_CLOSE -- Sync a recno tree to its file, then release it.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *
+ * Returns:
+ *	RET_ERROR (also when the initial sync fails), RET_SUCCESS
+ */
+int
+__rec_close(dbp)
+	DB *dbp;
+{
+	BTREE *tree;
+	int rval;
+
+	tree = dbp->internal;
+
+	/* Release any page left pinned by an earlier call. */
+	if (tree->bt_pinned != NULL) {
+		mpool_put(tree->bt_mp, tree->bt_pinned, 0);
+		tree->bt_pinned = NULL;
+	}
+
+	if (__rec_sync(dbp, 0) == RET_ERROR)
+		return (RET_ERROR);
+
+	/* No turning back: release everything, remembering any failure. */
+	rval = RET_SUCCESS;
+	if (F_ISSET(tree, R_MEMMAPPED) &&
+	    munmap(tree->bt_smap, tree->bt_msize) != 0)
+		rval = RET_ERROR;
+
+	if (!F_ISSET(tree, R_INMEM)) {
+		if (F_ISSET(tree, R_CLOSEFP)) {
+			if (fclose(tree->bt_rfp) != 0)
+				rval = RET_ERROR;
+		} else if (close(tree->bt_rfd) != 0)
+			rval = RET_ERROR;
+	}
+
+	if (__bt_close(dbp) == RET_ERROR)
+		rval = RET_ERROR;
+	return (rval);
+}
+
+/*
+ * __REC_SYNC -- sync the recno tree to disk.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *	flags:	R_RECNOSYNC syncs the underlying btree via __bt_sync() instead
+ * Returns:
+ *	RET_SUCCESS, RET_ERROR.
+ */
+int
+__rec_sync(dbp, flags)
+	const DB *dbp;
+	u_int flags;
+{
+	struct iovec iov[2];
+	BTREE *t;
+	DBT data, key;
+	off_t off;
+	recno_t scursor, trec;
+	int status;
+
+	t = dbp->internal;
+
+	/* Toss any page pinned across calls. */
+	if (t->bt_pinned != NULL) {
+		mpool_put(t->bt_mp, t->bt_pinned, 0);
+		t->bt_pinned = NULL;
+	}
+
+	if (flags == R_RECNOSYNC)
+		return (__bt_sync(dbp, 0));
+
+	if (F_ISSET(t, R_RDONLY | R_INMEM) || !F_ISSET(t, R_MODIFIED))
+		return (RET_SUCCESS);
+
+	/* Read any remaining records into the tree. */
+	if (!F_ISSET(t, R_EOF) && t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
+		return (RET_ERROR);
+
+	/* Rewind the file descriptor. */
+	if (lseek(t->bt_rfd, (off_t)0, SEEK_SET) != 0)
+		return (RET_ERROR);
+
+	/* Save the cursor. */
+	scursor = t->bt_cursor.rcursor;
+
+	key.size = sizeof(recno_t);
+	key.data = &trec;
+
+	if (F_ISSET(t, R_FIXLEN)) {
+		/*
+		 * Fixed length records are assumed to all be fixed length;
+		 * any that aren't are EINVAL'd or corrected by the put code.
+		 */
+		status = (dbp->seq)(dbp, &key, &data, R_FIRST);
+		while (status == RET_SUCCESS) {
+			if (write(t->bt_rfd, data.data, data.size) != data.size)
+				status = RET_ERROR;
+			else
+				status = (dbp->seq)(dbp, &key, &data, R_NEXT);
+		}
+	} else {
+		iov[1].iov_base = &t->bt_bval;
+		iov[1].iov_len = 1;
+
+		status = (dbp->seq)(dbp, &key, &data, R_FIRST);
+		while (status == RET_SUCCESS) {
+			iov[0].iov_base = data.data;
+			iov[0].iov_len = data.size;
+			if (writev(t->bt_rfd, iov, 2) != data.size + 1)
+				status = RET_ERROR;
+			else
+				status = (dbp->seq)(dbp, &key, &data, R_NEXT);
+		}
+	}
+
+	/* Restore the cursor; a failed write falls through to here too. */
+	t->bt_cursor.rcursor = scursor;
+
+	if (status == RET_ERROR)
+		return (RET_ERROR);
+	if ((off = lseek(t->bt_rfd, (off_t)0, SEEK_CUR)) == -1 ||
+	    ftruncate(t->bt_rfd, off))
+		return (RET_ERROR);
+	F_CLR(t, R_MODIFIED);
+	return (RET_SUCCESS);
+}
diff --git a/db/recno/rec_delete.c b/db/recno/rec_delete.c
new file mode 100644
index 0000000000..a16593d4e6
--- /dev/null
+++ b/db/recno/rec_delete.c
@@ -0,0 +1,197 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_delete.c	8.7 (Berkeley) 7/14/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <db.h>
+#include "recno.h"
+
+static int rec_rdelete __P((BTREE *, recno_t));
+
+/*
+ * __REC_DELETE -- Delete the record named by a key or by the cursor.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *	key:	record number to delete (read only when flags is 0)
+ *	flags:	0 to delete by key, R_CURSOR to delete at the cursor
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if there is no such record.
+ */
+int
+__rec_delete(dbp, key, flags)
+	const DB *dbp;
+	const DBT *key;
+	u_int flags;
+{
+	BTREE *tree;
+	recno_t recno;
+	int rval;
+
+	tree = dbp->internal;
+
+	/* Release any page left pinned by an earlier call. */
+	if (tree->bt_pinned != NULL) {
+		mpool_put(tree->bt_mp, tree->bt_pinned, 0);
+		tree->bt_pinned = NULL;
+	}
+
+	if (flags == 0) {
+		/* By key: 0 is invalid, and the delete takes key - 1. */
+		recno = *(recno_t *)key->data;
+		if (recno == 0)
+			goto einval;
+		if (recno > tree->bt_nrecs)
+			return (RET_SPECIAL);
+		rval = rec_rdelete(tree, recno - 1);
+	} else if (flags == R_CURSOR) {
+		/* By cursor: it must be set and there must be a record. */
+		if (!F_ISSET(&tree->bt_cursor, CURS_INIT))
+			goto einval;
+		if (tree->bt_nrecs == 0)
+			return (RET_SPECIAL);
+		rval = rec_rdelete(tree, tree->bt_cursor.rcursor - 1);
+		if (rval == RET_SUCCESS)
+			--tree->bt_cursor.rcursor;
+	} else
+		goto einval;
+
+	if (rval == RET_SUCCESS)
+		F_SET(tree, B_MODIFIED | R_MODIFIED);
+	return (rval);
+
+einval:	errno = EINVAL;
+	return (RET_ERROR);
+}
+
+/*
+ * REC_RDELETE -- Find one record by number and delete it from its leaf.
+ *
+ * Parameters:
+ *	t:	tree
+ *	nrec:	record to delete
+ *
+ * Returns:
+ *	RET_ERROR (including a failed search), RET_SUCCESS.
+ */
+static int
+rec_rdelete(t, nrec)
+	BTREE *t;
+	recno_t nrec;
+{
+	EPG *hit;
+	PAGE *pg;
+	int rval;
+
+	/* Locate the record; __rec_search pins the page. */
+	hit = __rec_search(t, nrec, SDELETE);
+	if (hit == NULL)
+		return (RET_ERROR);
+	pg = hit->page;
+
+	/*
+	 * Remove it from the leaf, then unpin the page: marked dirty if
+	 * the delete went through, left as it was otherwise.
+	 */
+	rval = __rec_dleaf(t, pg, hit->index);
+	mpool_put(t->bt_mp, pg, rval == RET_SUCCESS ? MPOOL_DIRTY : 0);
+	return (rval);
+}
+
+/*
+ * __REC_DLEAF -- Delete a single record from a recno leaf page.
+ *
+ * Parameters:
+ *	t:	tree
+ *	h:	leaf page holding the record
+ *	index:	index on current page to delete
+ * Returns:
+ *	RET_SUCCESS, or RET_ERROR if __ovfl_delete() fails.
+ */
+int
+__rec_dleaf(t, h, index)
+	BTREE *t;
+	PAGE *h;
+	u_int32_t index;
+{
+	RLEAF *rl;
+	indx_t *ip, cnt, offset;
+	u_int32_t nbytes;
+	char *from;
+	void *to;
+
+	/*
+	 * Delete a record from a recno leaf page.  Internal records are never
+	 * deleted from internal pages, regardless of the records that caused
+	 * them to be added being deleted.  Pages made empty by deletion are
+	 * not reclaimed.  They are, however, made available for reuse.
+	 *
+	 * Pack the remaining entries at the end of the page, shift the indices
+	 * down, overwriting the deleted record and its index.  If the record
+	 * uses overflow pages, make them available for reuse.
+	 */
+	to = rl = GETRLEAF(h, index);
+	if (rl->flags & P_BIGDATA && __ovfl_delete(t, rl->bytes) == RET_ERROR)
+		return (RET_ERROR);
+	nbytes = NRLEAF(rl);		/* amount the page contents shift by */
+
+	/*
+	 * Slide the bytes between h->upper and the record up by nbytes, then
+	 * add nbytes to each offset below the record's and close its slot.
+	 */
+	from = (char *)h + h->upper;
+	memmove(from + nbytes, from, (char *)to - from);
+	h->upper += nbytes;
+
+	offset = h->linp[index];	/* linp entry of the deleted record */
+	for (cnt = &h->linp[index] - (ip = &h->linp[0]); cnt--; ++ip)
+		if (ip[0] < offset)	/* slots before index: adjust in place */
+			ip[0] += nbytes;
+	for (cnt = &h->linp[NEXTINDEX(h)] - ip; --cnt; ++ip)
+		ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; /* shift down */
+	h->lower -= sizeof(indx_t);	/* one index slot fewer */
+	--t->bt_nrecs;
+	return (RET_SUCCESS);
+}
diff --git a/db/recno/rec_get.c b/db/recno/rec_get.c
new file mode 100644
index 0000000000..47dd773fb9
--- /dev/null
+++ b/db/recno/rec_get.c
@@ -0,0 +1,311 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_get.c	8.9 (Berkeley) 8/18/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <db.h>
+#include "recno.h"
+
+/*
+ * __REC_GET -- Look up a record by its record number.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *	key:	record number to find; 0 is rejected with EINVAL
+ *	data:	data to return
+ *	flags:	must be 0, anything else is rejected with EINVAL
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
+ */
+int
+__rec_get(dbp, key, data, flags)
+	const DB *dbp;
+	const DBT *key;
+	DBT *data;
+	u_int flags;
+{
+	BTREE *tree;
+	EPG *hit;
+	recno_t recno;
+	int rval;
+
+	tree = dbp->internal;
+
+	/* Release any page left pinned by an earlier call. */
+	if (tree->bt_pinned != NULL) {
+		mpool_put(tree->bt_mp, tree->bt_pinned, 0);
+		tree->bt_pinned = NULL;
+	}
+
+	/* Flags are rejected before the key is read; so is record 0. */
+	if (flags != 0 || (recno = *(recno_t *)key->data) == 0) {
+		errno = EINVAL;
+		return (RET_ERROR);
+	}
+
+	/* A record not in the tree yet may still be waiting in the file. */
+	if (recno > tree->bt_nrecs) {
+		if (F_ISSET(tree, R_EOF | R_INMEM))
+			return (RET_SPECIAL);
+		rval = tree->bt_irec(tree, recno);
+		if (rval != RET_SUCCESS)
+			return (rval);
+	}
+
+	/* The search is done on the key minus one. */
+	hit = __rec_search(tree, recno - 1, SEARCH);
+	if (hit == NULL)
+		return (RET_ERROR);
+
+	/* Pass the hit to __rec_ret(), then unpin the page or keep it. */
+	rval = __rec_ret(tree, hit, 0, NULL, data);
+	if (!F_ISSET(tree, B_DB_LOCK))
+		tree->bt_pinned = hit->page;
+	else
+		mpool_put(tree->bt_mp, hit->page, 0);
+	return (rval);
+}
+
+/*
+ * __REC_FPIPE -- Get fixed length records from a pipe.  A short final
+ *	record is padded with bt_bval; EOF on a record boundary adds none.
+ *
+ * Parameters:
+ *	t:	tree
+ *	top:	record count to read up to, counting on from bt_nrecs
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS, or RET_SPECIAL if EOF came before top.
+ */
+int
+__rec_fpipe(t, top)
+	BTREE *t;
+	recno_t top;
+{
+	DBT data;
+	recno_t nrec;
+	size_t len;
+	int ch;
+	u_char *nbuf, *p;
+
+	if (t->bt_rdata.size < t->bt_reclen) {
+		/* Keep the old buffer reachable if the allocation fails. */
+		nbuf = t->bt_rdata.data == NULL ? malloc(t->bt_reclen) :
+		    realloc(t->bt_rdata.data, t->bt_reclen);
+		if (nbuf == NULL)
+			return (RET_ERROR);
+		t->bt_rdata.data = nbuf;
+		t->bt_rdata.size = t->bt_reclen;
+	}
+	data.data = t->bt_rdata.data;
+	data.size = t->bt_reclen;
+
+	for (nrec = t->bt_nrecs; nrec < top;) {
+		p = t->bt_rdata.data;
+		for (len = t->bt_reclen;
+		    len != 0 && (ch = getc(t->bt_rfp)) != EOF; --len)
+			*p++ = ch;
+		/* Nothing read: EOF fell on a record boundary, so stop. */
+		if (len == t->bt_reclen)
+			break;
+		/* Pad a short final record; len is 0 for a complete one. */
+		memset(p, t->bt_bval, len);
+		if (__rec_iput(t, nrec, &data, 0) != RET_SUCCESS)
+			return (RET_ERROR);
+		++nrec;
+		if (len != 0)
+			break;
+	}
+	if (nrec < top) {
+		F_SET(t, R_EOF);
+		return (RET_SPECIAL);
+	}
+	return (RET_SUCCESS);
+}
+
+/*
+ * __REC_VPIPE -- Get variable length records from a pipe.  Records end
+ *	at a bt_bval byte; the final one may be ended by EOF instead.
+ *
+ * Parameters:
+ *	t:	tree
+ *	top:	record count to read up to, counting on from bt_nrecs
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS, or RET_SPECIAL if EOF came before top.
+ */
+int
+__rec_vpipe(t, top)
+	BTREE *t;
+	recno_t top;
+{
+	DBT data;
+	recno_t nrec;
+	size_t len;
+	int bval, ch;
+	u_char *nbuf;
+
+	bval = t->bt_bval;
+	for (nrec = t->bt_nrecs; nrec < top; ++nrec) {
+		len = 0;
+		while ((ch = getc(t->bt_rfp)) != EOF && ch != bval) {
+			if (len == t->bt_rdata.size) {
+				/* Grow by 256; the old buffer survives failure. */
+				nbuf = t->bt_rdata.data == NULL ?
+				    malloc(len + 256) :
+				    realloc(t->bt_rdata.data, len + 256);
+				if (nbuf == NULL)
+					return (RET_ERROR);
+				t->bt_rdata.data = nbuf;
+				t->bt_rdata.size = len + 256;
+			}
+			((u_char *)t->bt_rdata.data)[len++] = ch;
+		}
+		/* EOF with nothing pending: there is no record to add. */
+		if (ch == EOF && len == 0)
+			break;
+		data.data = t->bt_rdata.data;
+		data.size = len;
+		if (__rec_iput(t, nrec, &data, 0) != RET_SUCCESS)
+			return (RET_ERROR);
+		/* Count a final record that EOF, rather than bval, ended. */
+		if (ch == EOF) {
+			++nrec;
+			break;
+		}
+	}
+	if (nrec < top) {
+		F_SET(t, R_EOF);
+		return (RET_SPECIAL);
+	}
+	return (RET_SUCCESS);
+}
+
+/*
+ * __REC_FMAP -- Get fixed length records from the bt_cmap..bt_emap range.
+ *
+ * Parameters:
+ *	t:	tree
+ *	top:	record count to read up to, counting on from bt_nrecs
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS, or RET_SPECIAL once the range is used up.
+ */
+int
+__rec_fmap(t, top)
+	BTREE *t;
+	recno_t top;
+{
+	DBT data;
+	recno_t nrec;
+	u_char *sp, *ep, *p, *nbuf;
+	size_t len;
+
+	if (t->bt_rdata.size < t->bt_reclen) {
+		/* Keep the old buffer reachable if the allocation fails. */
+		nbuf = t->bt_rdata.data == NULL ? malloc(t->bt_reclen) :
+		    realloc(t->bt_rdata.data, t->bt_reclen);
+		if (nbuf == NULL)
+			return (RET_ERROR);
+		t->bt_rdata.data = nbuf;
+		t->bt_rdata.size = t->bt_reclen;
+	}
+	data.data = t->bt_rdata.data;
+	data.size = t->bt_reclen;
+
+	sp = (u_char *)t->bt_cmap;
+	ep = (u_char *)t->bt_emap;
+	for (nrec = t->bt_nrecs; nrec < top; ++nrec) {
+		if (sp >= ep) {
+			F_SET(t, R_EOF);
+			return (RET_SPECIAL);
+		}
+		for (p = t->bt_rdata.data, len = t->bt_reclen;
+		    sp < ep && len > 0; *p++ = *sp++, --len);
+		if (len != 0)
+			memset(p, t->bt_bval, len);
+		if (__rec_iput(t, nrec, &data, 0) != RET_SUCCESS)
+			return (RET_ERROR);
+	}
+	t->bt_cmap = (caddr_t)sp;
+	return (RET_SUCCESS);
+}
+
+/*
+ * __REC_VMAP -- Get variable length records from the bt_cmap..bt_emap range.
+ *
+ * Parameters:
+ *	t:	tree
+ *	top:	record count to read up to, counting on from bt_nrecs
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS, or RET_SPECIAL once the range is used up.
+ */
+int
+__rec_vmap(t, top)
+	BTREE *t;
+	recno_t top;
+{
+	DBT rec;
+	u_char *cur, *end;
+	recno_t n;
+	int delim;
+
+	cur = (u_char *)t->bt_cmap;
+	end = (u_char *)t->bt_emap;
+	delim = t->bt_bval;
+	/* Each pass adds one record and steps over the byte that ended it. */
+	for (n = t->bt_nrecs; n < top; ++n, ++cur) {
+		if (cur >= end) {
+			F_SET(t, R_EOF);
+			return (RET_SPECIAL);
+		}
+		rec.data = cur;
+		while (cur < end && *cur != delim)
+			++cur;
+		rec.size = cur - (u_char *)rec.data;
+		if (__rec_iput(t, n, &rec, 0) != RET_SUCCESS)
+			return (RET_ERROR);
+	}
+	t->bt_cmap = (caddr_t)cur;
+	return (RET_SUCCESS);
+}
diff --git a/db/recno/rec_open.c b/db/recno/rec_open.c
new file mode 100644
index 0000000000..51d8a3c260
--- /dev/null
+++ b/db/recno/rec_open.c
@@ -0,0 +1,241 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_open.c	8.10 (Berkeley) 9/1/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <db.h>
+#include "recno.h"
+
+/* __REC_OPEN -- Open a recno tree over fname, or in memory if it is NULL. */
+DB *
+__rec_open(fname, flags, mode, openinfo, dflags)
+	const char *fname;
+	int flags, mode, dflags;
+	const RECNOINFO *openinfo;
+{
+	BTREE *t;
+	BTREEINFO btopeninfo;
+	DB *dbp;
+	PAGE *h;
+	struct stat sb;
+	int rfd, sverrno;
+
+	/* Open the user's file -- if this fails, we're done. */
+	if (fname != NULL && (rfd = open(fname, flags, mode)) < 0)
+		return (NULL);
+
+	/* Create a btree in memory (backed by disk). */
+	dbp = NULL;
+	if (openinfo) {
+		if (openinfo->flags & ~(R_FIXEDLEN | R_NOKEY | R_SNAPSHOT))
+			goto einval;
+		btopeninfo.flags = 0;
+		btopeninfo.cachesize = openinfo->cachesize;
+		btopeninfo.maxkeypage = 0;
+		btopeninfo.minkeypage = 0;
+		btopeninfo.psize = openinfo->psize;
+		btopeninfo.compare = NULL;
+		btopeninfo.prefix = NULL;
+		btopeninfo.lorder = openinfo->lorder;
+		dbp = __bt_open(openinfo->bfname,
+		    O_RDWR, S_IRUSR | S_IWUSR, &btopeninfo, dflags);
+	} else
+		dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, NULL, dflags);
+	if (dbp == NULL)
+		goto err;
+
+	/*
+	 * Some fields in the tree structure are recno specific.  Fill them
+	 * in and make the btree structure look like a recno structure.  We
+	 * don't change the bt_ovflsize value, it's close enough and slightly
+	 * bigger.
+	 */
+	t = dbp->internal;
+	if (openinfo) {
+		if (openinfo->flags & R_FIXEDLEN) {
+			F_SET(t, R_FIXLEN);
+			t->bt_reclen = openinfo->reclen;
+			if (t->bt_reclen == 0)
+				goto einval;
+		}
+		t->bt_bval = openinfo->bval;
+	} else
+		t->bt_bval = '\n';
+
+	F_SET(t, R_RECNO);
+	if (fname == NULL)
+		F_SET(t, R_EOF | R_INMEM);	/* no file to read from */
+	else
+		t->bt_rfd = rfd;
+
+	if (fname != NULL) {
+		/*
+		 * In 4.4BSD, stat(2) returns true for ISSOCK on pipes.
+		 * Unfortunately, that's not portable, so we use lseek
+		 * and check the errno values.
+		 */
+		errno = 0;
+		if (lseek(rfd, (off_t)0, SEEK_CUR) == -1 && errno == ESPIPE) {
+			switch (flags & O_ACCMODE) {
+			case O_RDONLY:
+				F_SET(t, R_RDONLY);
+				break;
+			default:
+				goto einval;
+			}
+slow:			if ((t->bt_rfp = fdopen(rfd, "r")) == NULL)
+				goto err;
+			F_SET(t, R_CLOSEFP);	/* bt_rfp now wraps rfd */
+			t->bt_irec =
+			    F_ISSET(t, R_FIXLEN) ? __rec_fpipe : __rec_vpipe;
+		} else {
+			switch (flags & O_ACCMODE) {
+			case O_RDONLY:
+				F_SET(t, R_RDONLY);
+				break;
+			case O_RDWR:
+				break;
+			default:
+				goto einval;
+			}
+
+			if (fstat(rfd, &sb))
+				goto err;
+			/*
+			 * Kluge -- we'd like to test whether the file is too
+			 * big to mmap, but without knowing the size or type of
+			 * off_t and size_t, or the largest unsigned integral
+			 * type, no portable comparison is possible.  Hope that
+			 * mmap fails if the file is too large.
+			 */
+			if (sb.st_size == 0)
+				F_SET(t, R_EOF);
+			else {
+#ifdef MMAP_NOT_AVAILABLE	/* NOTE(review): mmap code is built only if this IS defined */
+				/*
+				 * XXX
+				 * Mmap doesn't work correctly on many current
+				 * systems.  In particular, it can fail subtly,
+				 * with cache coherency problems.  Don't use it
+				 * for now.
+				 */
+				t->bt_msize = sb.st_size;
+				if ((t->bt_smap = mmap(NULL, t->bt_msize,
+				    PROT_READ, MAP_PRIVATE, rfd,
+				    (off_t)0)) == (caddr_t)-1)
+					goto slow;	/* fall back to stdio */
+				t->bt_cmap = t->bt_smap;
+				t->bt_emap = t->bt_smap + sb.st_size;
+				t->bt_irec = F_ISSET(t, R_FIXLEN) ?
+				    __rec_fmap : __rec_vmap;
+				F_SET(t, R_MEMMAPPED);
+#else
+				goto slow;	/* read through stdio instead */
+#endif
+			}
+		}
+	}
+
+	/* Use the recno routines. */
+	dbp->close = __rec_close;
+	dbp->del = __rec_delete;
+	dbp->fd = __rec_fd;
+	dbp->get = __rec_get;
+	dbp->put = __rec_put;
+	dbp->seq = __rec_seq;
+	dbp->sync = __rec_sync;
+
+	/* If the root page was created, reset the flags. */
+	if ((h = mpool_get(t->bt_mp, P_ROOT, 0)) == NULL)
+		goto err;
+	if ((h->flags & P_TYPE) == P_BLEAF) {
+		F_CLR(h, P_TYPE);
+		F_SET(h, P_RLEAF);
+		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+	} else
+		mpool_put(t->bt_mp, h, 0);
+
+	if (openinfo && openinfo->flags & R_SNAPSHOT &&
+	    !F_ISSET(t, R_EOF | R_INMEM) &&
+	    t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
+                goto err;
+	return (dbp);
+
+	/* NOTE(review): a bt_rfp or mapping set up above is not released here. */
+einval:	errno = EINVAL;
+err:	sverrno = errno;
+	if (dbp != NULL)
+		(void)__bt_close(dbp);
+	if (fname != NULL)
+		(void)close(rfd);
+	errno = sverrno;
+	return (NULL);
+}
+
+/* __REC_FD -- Return bt_rfd, or -1 with ENOENT for an in-memory tree. */
+int
+__rec_fd(dbp)
+	const DB *dbp;
+{
+	BTREE *tree;
+
+	tree = dbp->internal;
+
+	/* Release any page left pinned by an earlier call. */
+	if (tree->bt_pinned != NULL) {
+		mpool_put(tree->bt_mp, tree->bt_pinned, 0);
+		tree->bt_pinned = NULL;
+	}
+
+	/* Only a tree that is not in-memory has a descriptor to return. */
+	if (!F_ISSET(tree, R_INMEM))
+		return (tree->bt_rfd);
+	errno = ENOENT;
+	return (-1);
+}
diff --git a/db/recno/rec_put.c b/db/recno/rec_put.c
new file mode 100644
index 0000000000..1afae0d5a6
--- /dev/null
+++ b/db/recno/rec_put.c
@@ -0,0 +1,280 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_put.c	8.7 (Berkeley) 8/18/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <db.h>
+#include "recno.h"
+
+/*
+ * __REC_PUT -- Add a recno item to the tree.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *	key:	key
+ *	data:	data
+ *	flags:	0, R_CURSOR, R_IAFTER, R_IBEFORE, R_NOOVERWRITE, R_SETCURSOR
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is
+ *	already in the tree and R_NOOVERWRITE specified.
+ */
+int
+__rec_put(dbp, key, data, flags)
+	const DB *dbp;
+	DBT *key;
+	const DBT *data;
+	u_int flags;
+{
+	BTREE *t;
+	DBT fdata, tdata;
+	recno_t nrec;
+	void *p;
+	int status;
+
+	t = dbp->internal;
+
+	/* Toss any page pinned across calls. */
+	if (t->bt_pinned != NULL) {
+		mpool_put(t->bt_mp, t->bt_pinned, 0);
+		t->bt_pinned = NULL;
+	}
+
+	/*
+	 * If using fixed-length records, and the record is long, return
+	 * EINVAL.  If it's short, pad it out with bt_bval.  Use the record
+	 * data return memory, it's only short-term; grow it through a
+	 * temporary so a failed allocation leaves bt_rdata untouched.
+	 */
+	if (F_ISSET(t, R_FIXLEN) && data->size != t->bt_reclen) {
+		if (data->size > t->bt_reclen)
+			goto einval;
+		if (t->bt_rdata.size < t->bt_reclen) {
+			p = t->bt_rdata.data == NULL ?
+			    malloc(t->bt_reclen) :
+			    realloc(t->bt_rdata.data, t->bt_reclen);
+			if (p == NULL)
+				return (RET_ERROR);
+			t->bt_rdata.data = p;
+			t->bt_rdata.size = t->bt_reclen;
+		}
+		memmove(t->bt_rdata.data, data->data, data->size);
+		memset((char *)t->bt_rdata.data + data->size,
+		    t->bt_bval, t->bt_reclen - data->size);
+		fdata.data = t->bt_rdata.data;
+		fdata.size = t->bt_reclen;
+	} else {
+		fdata.data = data->data;
+		fdata.size = data->size;
+	}
+
+	switch (flags) {
+	case R_CURSOR:
+		if (!F_ISSET(&t->bt_cursor, CURS_INIT))
+			goto einval;
+		nrec = t->bt_cursor.rcursor;
+		break;
+	case R_IAFTER:
+		/* Inserting after record 0 means inserting before record 1. */
+		if ((nrec = *(recno_t *)key->data) == 0) {
+			nrec = 1;
+			flags = R_IBEFORE;
+		}
+		break;
+	case 0:
+	case R_IBEFORE:
+	case R_SETCURSOR:
+		if ((nrec = *(recno_t *)key->data) == 0)
+			goto einval;
+		break;
+	case R_NOOVERWRITE:
+		if ((nrec = *(recno_t *)key->data) == 0)
+			goto einval;
+		if (nrec <= t->bt_nrecs)
+			return (RET_SPECIAL);
+		break;
+	default:
+einval:		errno = EINVAL;
+		return (RET_ERROR);
+	}
+
+	/*
+	 * Records up to and including the put record must already exist; if
+	 * skipping records, create empty ones (filler freed on every path).
+	 */
+	if (nrec > t->bt_nrecs) {
+		if (!F_ISSET(t, R_EOF | R_INMEM) &&
+		    t->bt_irec(t, nrec) == RET_ERROR)
+			return (RET_ERROR);
+		if (nrec > t->bt_nrecs + 1) {
+			if (F_ISSET(t, R_FIXLEN)) {
+				if ((tdata.data =
+				    malloc(t->bt_reclen)) == NULL)
+					return (RET_ERROR);
+				tdata.size = t->bt_reclen;
+				memset(tdata.data, t->bt_bval, tdata.size);
+			} else {
+				tdata.data = NULL;
+				tdata.size = 0;
+			}
+			status = RET_SUCCESS;
+			while (status == RET_SUCCESS && nrec > t->bt_nrecs + 1)
+				status = __rec_iput(t, t->bt_nrecs, &tdata, 0);
+			free(tdata.data);	/* NULL unless R_FIXLEN. */
+			if (status != RET_SUCCESS)
+				return (RET_ERROR);
+		}
+	}
+
+	if ((status = __rec_iput(t, nrec - 1, &fdata, flags)) != RET_SUCCESS)
+		return (status);
+
+	if (flags == R_SETCURSOR)
+		t->bt_cursor.rcursor = nrec;
+
+	F_SET(t, R_MODIFIED);
+	return (__rec_ret(t, NULL, nrec, key, NULL));
+}
+
+/*
+ * __REC_IPUT -- Add a recno item to the tree.
+ *
+ * Parameters:
+ *	t:	tree
+ *	nrec:	record number
+ *	data:	data
+ *	flags:	put flags (R_IAFTER and R_IBEFORE insert around nrec)
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS
+ */
+int
+__rec_iput(t, nrec, data, flags)
+	BTREE *t;
+	recno_t nrec;
+	const DBT *data;
+	u_int flags;
+{
+	DBT tdata;
+	EPG *e;
+	PAGE *h;
+	indx_t index, nxtindex;
+	pgno_t pg;
+	u_int32_t nbytes, ovflsz;
+	int dflags, status;
+	char *dest, db[NOVFLSIZE];
+
+	/*
+	 * If the data won't fit on a page, store it on indirect pages and
+	 * insert a page-number/size stub, built with memmove (db is char[]).
+	 * XXX: If the insert fails later on, these pages aren't recovered.
+	 */
+	if (data->size > t->bt_ovflsize) {
+		if (__ovfl_put(t, data, &pg) == RET_ERROR)
+			return (RET_ERROR);
+		tdata.data = db;
+		tdata.size = NOVFLSIZE;
+		ovflsz = data->size;
+		memmove(db, &pg, sizeof(pgno_t));
+		memmove(db + sizeof(pgno_t), &ovflsz, sizeof(u_int32_t));
+		dflags = P_BIGDATA;
+		data = &tdata;
+	} else
+		dflags = 0;
+
+	/* __rec_search pins the returned page. */
+	if ((e = __rec_search(t, nrec,
+	    nrec > t->bt_nrecs || flags == R_IAFTER || flags == R_IBEFORE ?
+	    SINSERT : SEARCH)) == NULL)
+		return (RET_ERROR);
+
+	h = e->page;
+	index = e->index;
+
+	/*
+	 * Pick the slot: R_IAFTER inserts after the found index, R_IBEFORE
+	 * at it.  Otherwise, if the record exists (nrec < bt_nrecs),
+	 * __rec_dleaf is run on it first -- presumably removing it so the
+	 * insert acts as a replace.  Pages are split as required.
+	 */
+	switch (flags) {
+	case R_IAFTER:
+		++index;
+		break;
+	case R_IBEFORE:
+		break;
+	default:
+		if (nrec < t->bt_nrecs &&
+		    __rec_dleaf(t, h, index) == RET_ERROR) {
+			mpool_put(t->bt_mp, h, 0);
+			return (RET_ERROR);
+		}
+		break;
+	}
+
+	/*
+	 * If not enough room, split the page.  The split code will insert
+	 * the key and data and unpin the current page.  If inserting into
+	 * the offset array, shift the pointers up.
+	 */
+	nbytes = NRLEAFDBT(data->size);
+	if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
+		status = __bt_split(t, h, NULL, data, dflags, nbytes, index);
+		if (status == RET_SUCCESS)
+			++t->bt_nrecs;
+		return (status);
+	}
+
+	if (index < (nxtindex = NEXTINDEX(h)))
+		memmove(h->linp + index + 1, h->linp + index,
+		    (nxtindex - index) * sizeof(indx_t));
+	h->lower += sizeof(indx_t);
+
+	h->linp[index] = h->upper -= nbytes;
+	dest = (char *)h + h->upper;
+	WR_RLEAF(dest, data, dflags);
+
+	++t->bt_nrecs;
+	F_SET(t, B_MODIFIED);
+	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+
+	return (RET_SUCCESS);
+}
diff --git a/db/recno/rec_search.c b/db/recno/rec_search.c
new file mode 100644
index 0000000000..acc109e992
--- /dev/null
+++ b/db/recno/rec_search.c
@@ -0,0 +1,126 @@
+/*-
+ * Copyright (c) 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_search.c	8.4 (Berkeley) 7/14/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+
+#include <db.h>
+#include "recno.h"
+
+/*
+ * __REC_SEARCH -- Search a btree for a key.
+ *
+ * Parameters:
+ *	t:	tree to search
+ *	recno:	key to find
+ *	op: 	search operation
+ *
+ * Returns:
+ *	The EPG for the matching record, if any, or the EPG for the
+ *	location of the key, if it were inserted into the tree, is
+ *	entered into the bt_cur field of the tree, with its page left
+ *	pinned.  A pointer to that field is returned, or NULL if a page
+ *	could not be fetched (errno is preserved).  For SINSERT and
+ *	SDELETE the record counts of the internal pages on the search
+ *	path are incremented or decremented on the way down.
+ */
+EPG *
+__rec_search(t, recno, op)
+	BTREE *t;
+	recno_t recno;
+	enum SRCHOP op;
+{
+	EPGNO *up;
+	PAGE *h;
+	RINTERNAL *ent;
+	indx_t idx, nitems;
+	pgno_t pg;
+	recno_t skipped;
+	int saved_errno;
+
+	/* Walk down from the root, recording the path on the page stack. */
+	BT_CLR(t);
+	pg = P_ROOT;
+	skipped = 0;
+	while ((h = mpool_get(t->bt_mp, pg, 0)) != NULL) {
+		/* A leaf ends the descent; it is returned still pinned. */
+		if (h->flags & P_RLEAF) {
+			t->bt_cur.page = h;
+			t->bt_cur.index = recno - skipped;
+			return (&t->bt_cur);
+		}
+		/*
+		 * Take the first entry whose subtree holds the record,
+		 * or the last entry when none does.
+		 */
+		for (idx = 0, nitems = NEXTINDEX(h);; skipped += ent->nrecs) {
+			ent = GETRINTERNAL(h, idx);
+			if (++idx == nitems || skipped + ent->nrecs > recno)
+				break;
+		}
+		BT_PUSH(t, pg, idx - 1);
+		pg = ent->pgno;
+
+		/* Adjust the chosen entry's record count, then unpin. */
+		if (op == SDELETE) {
+			--ent->nrecs;
+			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+		} else if (op == SINSERT) {
+			++ent->nrecs;
+			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+		} else if (op == SEARCH)
+			mpool_put(t->bt_mp, h, 0);
+	}
+
+	/* A page could not be fetched: undo the count changes made so far. */
+	saved_errno = errno;
+	if (op != SEARCH)
+		while ((up = BT_POP(t)) != NULL) {
+			h = mpool_get(t->bt_mp, up->pgno, 0);
+			if (h == NULL)
+				break;
+			if (op == SINSERT)
+				--GETRINTERNAL(h, up->index)->nrecs;
+			else
+				++GETRINTERNAL(h, up->index)->nrecs;
+			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+		}
+	errno = saved_errno;
+	return (NULL);
+}
diff --git a/db/recno/rec_seq.c b/db/recno/rec_seq.c
new file mode 100644
index 0000000000..f80992c598
--- /dev/null
+++ b/db/recno/rec_seq.c
@@ -0,0 +1,131 @@
+/*-
+ * Copyright (c) 1991, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef lint
+static char sccsid[] = "@(#)rec_seq.c	8.3 (Berkeley) 7/14/94";
+#endif /* not lint */
+
+#include <sys/types.h>
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <db.h>
+#include "recno.h"
+
+/*
+ * __REC_SEQ -- Recno sequential scan interface.
+ *
+ * Parameters:
+ *	dbp:	pointer to access method
+ *	key:	key for positioning and return value
+ *	data:	data return value
+ *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
+ *
+ * Returns:
+ *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
+ */
+int
+__rec_seq(dbp, key, data, flags)
+	const DB *dbp;
+	DBT *key, *data;
+	u_int flags;
+{
+	BTREE *t = dbp->internal;
+	EPG *ep;
+	recno_t want;
+	int rval;
+
+	/* Unpin whatever page the previous call left pinned. */
+	if (t->bt_pinned != NULL) {
+		mpool_put(t->bt_mp, t->bt_pinned, 0);
+		t->bt_pinned = NULL;
+	}
+
+	/* With no cursor yet, R_NEXT acts as R_FIRST and R_PREV as R_LAST. */
+	if (!F_ISSET(&t->bt_cursor, CURS_INIT))
+		flags = flags == R_NEXT ? R_FIRST :
+		    flags == R_PREV ? R_LAST : flags;
+
+	switch (flags) {
+	case R_CURSOR:
+		if ((want = *(recno_t *)key->data) == 0)
+			goto einval;
+		break;
+	case R_FIRST:
+		want = 1;
+		break;
+	case R_NEXT:
+		want = t->bt_cursor.rcursor + 1;
+		break;
+	case R_PREV:
+		if ((want = t->bt_cursor.rcursor - 1) == 0)
+			return (RET_SPECIAL);
+		break;
+	case R_LAST:
+		if (!F_ISSET(t, R_EOF | R_INMEM) &&
+		    t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
+			return (RET_ERROR);
+		want = t->bt_nrecs;
+		break;
+	default:
+einval:		errno = EINVAL;
+		return (RET_ERROR);
+	}
+
+	/* Beyond the records held so far: ask bt_irec, then re-check. */
+	if (t->bt_nrecs == 0 || want > t->bt_nrecs) {
+		if (!F_ISSET(t, R_EOF | R_INMEM) &&
+		    (rval = t->bt_irec(t, want)) != RET_SUCCESS)
+			return (rval);
+		if (t->bt_nrecs == 0 || want > t->bt_nrecs)
+			return (RET_SPECIAL);
+	}
+
+	ep = __rec_search(t, want - 1, SEARCH);
+	if (ep == NULL)
+		return (RET_ERROR);
+
+	F_SET(&t->bt_cursor, CURS_INIT);
+	t->bt_cursor.rcursor = want;
+
+	/* Without B_DB_LOCK the page stays pinned until a later call. */
+	rval = __rec_ret(t, ep, want, key, data);
+	if (!F_ISSET(t, B_DB_LOCK))
+		t->bt_pinned = ep->page;
+	else
+		mpool_put(t->bt_mp, ep->page, 0);
+	return (rval);
+}
diff --git a/db/recno/rec_utils.c b/db/recno/rec_utils.c
new file mode 100644
index 0000000000..baea3fad50
--- /dev/null
+++ b/db/recno/rec_utils.c
@@ -0,0 +1,122 @@
+/*-
+ * Copyright (c) 1990, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_utils.c	8.6 (Berkeley) 7/16/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <db.h>
+#include "recno.h"
+
+/*
+ * __rec_ret --
+ *	Build return data.
+ *
+ * Parameters:
+ *	t:	tree
+ *	e:	key/data pair to be returned (unused when data is NULL)
+ *	nrec:	record number
+ *	key:	user's key structure, or NULL to skip the key
+ *	data:	user's data structure, or NULL to skip the data
+ *
+ * Returns:
+ *	RET_SUCCESS, RET_ERROR.
+ */
+int
+__rec_ret(t, e, nrec, key, data)
+	BTREE *t;
+	EPG *e;
+	recno_t nrec;
+	DBT *key, *data;
+{
+	RLEAF *leaf;
+	void *buf;
+	size_t need;
+
+	if (key != NULL) {
+		/* The key is not on the page: hand back a copy of nrec. */
+		if (t->bt_rkey.size < sizeof(recno_t)) {
+			buf = t->bt_rkey.data != NULL ?
+			    realloc(t->bt_rkey.data, sizeof(recno_t)) :
+			    malloc(sizeof(recno_t));
+			if (buf == NULL)
+				return (RET_ERROR);
+			t->bt_rkey.data = buf;
+			t->bt_rkey.size = sizeof(recno_t);
+		}
+		memmove(t->bt_rkey.data, &nrec, sizeof(recno_t));
+		key->data = t->bt_rkey.data;
+		key->size = sizeof(recno_t);
+	}
+	if (data == NULL)
+		return (RET_SUCCESS);
+
+	/*
+	 * Big data is fetched through __ovfl_get into bt_rdata.  Page data is
+	 * copied there only under B_DB_LOCK; else data points into the page.
+	 */
+	leaf = GETRLEAF(e->page, e->index);
+	if (leaf->flags & P_BIGDATA) {
+		if (__ovfl_get(t, leaf->bytes,
+		    &data->size, &t->bt_rdata.data, &t->bt_rdata.size))
+			return (RET_ERROR);
+		data->data = t->bt_rdata.data;
+		return (RET_SUCCESS);
+	}
+	if (!F_ISSET(t, B_DB_LOCK)) {
+		data->data = leaf->bytes;
+		data->size = leaf->dsize;
+		return (RET_SUCCESS);
+	}
+
+	/* Use +1 in case the first record retrieved is 0 length. */
+	need = leaf->dsize + 1;
+	if (need > t->bt_rdata.size) {
+		buf = t->bt_rdata.data != NULL ?
+		    realloc(t->bt_rdata.data, need) : malloc(need);
+		if (buf == NULL)
+			return (RET_ERROR);
+		t->bt_rdata.data = buf;
+		t->bt_rdata.size = need;
+	}
+	memmove(t->bt_rdata.data, leaf->bytes, leaf->dsize);
+	data->data = t->bt_rdata.data;
+	data->size = leaf->dsize;
+	return (RET_SUCCESS);
+}
diff --git a/db/recno/recno.h b/db/recno/recno.h
new file mode 100644
index 0000000000..bec772c2fa
--- /dev/null
+++ b/db/recno/recno.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)recno.h	8.1 (Berkeley) 6/4/93
+ */
+
+enum SRCHOP { SDELETE, SINSERT, SEARCH};	/* __rec_search op argument. */
+
+#include "../btree/btree.h"
+#include "extern.h"