/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 2015 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/db_page.h"
#include "dbinc/hash.h"

/*
 * File-local helpers used by __memp_fopen: __memp_mpf_alloc builds a new
 * MPOOLFILE in the cache region, __memp_mpf_find searches one file-table
 * hash bucket for an existing MPOOLFILE matching the handle.
 */
static int __memp_mpf_alloc __P((DB_MPOOL *,
    DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **));
static int __memp_mpf_find __P((ENV *,
    DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **));

/*
 * __memp_fopen_pp --
 *	DB_MPOOLFILE->open pre/post processing.
 *
 *	Validates the caller's flags, page size, clear length and flag
 *	combinations, then calls __memp_fopen (with no pre-selected
 *	MPOOLFILE and no directory list) inside the ENV_ENTER/ENV_LEAVE
 *	and REPLICATION_WRAP wrappers.
 *
 *	Returns 0 on success, EINVAL for any argument check that fails
 *	here, otherwise whatever __db_fchk or __memp_fopen returns.
 *
 * PUBLIC: int __memp_fopen_pp
 * PUBLIC:     __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t));
 */
int
__memp_fopen_pp(dbmfp, path, flags, mode, pagesize)
	DB_MPOOLFILE *dbmfp;
	const char *path;
	u_int32_t flags;
	int mode;
	size_t pagesize;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbmfp->env;

	/* Validate arguments. */
	if ((ret = __db_fchk(env, "DB_MPOOLFILE->open", flags,
	    DB_CREATE | DB_DIRECT | DB_EXTENT | DB_MULTIVERSION |
	    DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0)
		return (ret);

	/*
	 * Require a power-of-two pagesize, no smaller than the clear length.
	 * A zero page size is only allowed when opening an existing in-memory
	 * database, that is, when DB_CREATE is not specified and the handle
	 * is configured with DB_MPOOL_NOFILE.
	 */
	if (!POWER_OF_TWO(pagesize) ||
	    (pagesize == 0 && (LF_ISSET(DB_CREATE) ||
	    !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)))) {
		__db_errx(env, DB_STR("3033",
		    "DB_MPOOLFILE->open: page sizes must be a power-of-2"));
		return (EINVAL);
	}
	if (pagesize != 0 && dbmfp->clear_len > pagesize) {
		__db_errx(env, DB_STR("3034",
		    "DB_MPOOLFILE->open: clear length larger than page size"));
		return (EINVAL);
	}

	/* A NULL path is a temporary file, which cannot be read-only. */
	if (LF_ISSET(DB_RDONLY) && path == NULL) {
		__db_errx(env, DB_STR("3035",
		    "DB_MPOOLFILE->open: temporary files can't be readonly"));
		return (EINVAL);
	}

	/* Multiversion access is rejected unless TXN_ON(env) is true. */
	if (LF_ISSET(DB_MULTIVERSION) && !TXN_ON(env)) {
		__db_errx(env, DB_STR("3036",
	    "DB_MPOOLFILE->open: DB_MULTIVERSION requires transactions"));
		return (EINVAL);
	}

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env,
	    (__memp_fopen(dbmfp, NULL,
	    path, NULL, flags, mode, pagesize)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * Generate the number of user opens.  If there is no backing file
 * there is an extra open count to keep the in memory db around.
 *
 * The result is the total reference count (mpf_cnt) less the "neutral"
 * references (neutral_cnt, bumped for DB_FLUSH and DB_RDONLY opens) and
 * less the extra reference when no_backing_file is set.
 */
#define	MFP_OPEN_CNT(mfp)	((mfp)->mpf_cnt - ((mfp)->neutral_cnt +	\
				   (u_int32_t)(mfp)->no_backing_file))
/*
 * Number of times __memp_fopen re-reads the file size, yielding between
 * attempts, when the size is not a multiple of the page size.
 */
#define	MP_IOINFO_RETRIES	5
/*
 * __memp_fopen --
 *	DB_MPOOLFILE->open.
 *
 *	Attach the DB_MPOOLFILE handle to a shared MPOOLFILE, finding an
 *	existing one in the file table or allocating a new one, and open
 *	(or share) the underlying file handle.
 *
 *	dbmfp	handle to open; if MP_OPEN_CALLED is already set this is
 *		a no-op returning 0.
 *	mfp	MPOOLFILE to attach to when called internally, else NULL.
 *		When non-NULL, path must be NULL and the path is taken
 *		from the region.
 *	path	file (or named in-memory database) name; NULL with a NULL
 *		mfp means a temporary file.
 *	dirp	passed through to __db_appname.
 *	flags	DB_CREATE, DB_RDONLY, DB_MULTIVERSION, ... open flags.
 *	mode	file mode passed to __os_open.
 *	pgsize	page size; stored internally as a u_int32_t.
 *
 *	Returns 0 on success.  Failures detected here return EINVAL (path
 *	is a directory, dead file, DB_MULTIVERSION conflict, file size not
 *	a multiple of the page size, changed clear length/page size/LSN
 *	offset, durability mismatch) or ENOENT (in-memory database not
 *	found and DB_CREATE not specified); errors from called functions
 *	are passed through.  On error any reference taken on an existing
 *	MPOOLFILE is released again.
 *
 * PUBLIC: int __memp_fopen __P((DB_MPOOLFILE *, MPOOLFILE *,
 * PUBLIC:     const char *, const char **, u_int32_t, int, size_t));
 */
int
__memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize)
	DB_MPOOLFILE *dbmfp;
	MPOOLFILE *mfp;
	const char *path;
	const char **dirp;
	u_int32_t flags;
	int mode;
	size_t pgsize;
{
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *tmp_dbmfp;
	DB_MPOOL_HASH *hp;
	ENV *env;
	MPOOL *mp;
	MPOOLFILE *alloc_mfp;
	size_t maxmap;
	db_pgno_t last_pgno;
	u_int32_t bucket, mbytes, bytes, oflags, pagesize;
	int isdir, refinc, ret, tries;
	char *rpath;

	/* If this handle is already open, return. */
	if (F_ISSET(dbmfp, MP_OPEN_CALLED))
		return (0);

	env = dbmfp->env;
	dbmp = env->mp_handle;
	dbenv = env->dbenv;
	mp = dbmp->reginfo[0].primary;
	alloc_mfp = NULL;
	mbytes = bytes = 0;
	refinc = ret = isdir = 0;
	rpath = NULL;

	/*
	 * We're keeping the page size as a size_t in the public API, but
	 * it's a u_int32_t everywhere internally.
	 */
	pagesize = (u_int32_t)pgsize;

	/*
	 * We're called internally with a specified mfp, in which case the
	 * path is NULL, but we'll get the path from the underlying region
	 * information.  Otherwise, if the path is NULL, it's a temporary
	 * file -- we know we can't join any existing files, and we'll delay
	 * the open until we actually need to write the file. All temporary
	 * files will go into the first hash bucket.
	 */
	DB_ASSERT(env, mfp == NULL || path == NULL);

	bucket = 0;
	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	if (mfp == NULL) {
		if (path == NULL)
			goto alloc;

		/*
		 * If fileid is not set but the file exists on the disk,
		 * we try to use __os_fileid to set it. We do this
		 * because we want to use the fileid to check if we have
		 * opened the mpoolfile as early as possible.
		 *
		 * Note: DB layer always calls __memp_fopen with fileid set,
		 * so this is only for using mpool api to open a file.
		 */

		if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
		    !F_ISSET(dbmfp, MP_FILEID_SET)) {
			if ((ret = __db_appname(env,
			     DB_APP_DATA, path, dirp, &rpath)) != 0)
				goto err;
			/*
			 * A non-zero return here (the file does not exist
			 * yet) is not fatal: ret is reset before the file
			 * is opened below.
			 */
			ret = __os_exists(env, rpath, &isdir);
			if (ret == 0 && isdir) {
				ret = EINVAL;
				goto err;
			} else if (ret == 0) {
				if  ((ret = __os_fileid(env,
				    rpath, 0, dbmfp->fileid)) != 0)
					goto err;
				F_SET(dbmfp, MP_FILEID_SET);
			}
		}

		/*
		 * Hash to the proper file table entry and walk it.
		 *
		 * The fileID is a filesystem unique number (e.g., a
		 * UNIX dev/inode pair) plus a timestamp.  If files are
		 * removed and created in less than a second, the fileID
		 * can be repeated.  The problem with repetition happens
		 * when the file that previously had the fileID value still
		 * has pages in the pool, since we don't want to use them
		 * to satisfy requests for the new file. Because the
		 * DB_TRUNCATE flag reuses the dev/inode pair, repeated
		 * opens with that flag set guarantees matching fileIDs
		 * when the machine can open a file and then re-open
		 * with truncate within a second.  For this reason, we
		 * pass that flag down, and, if we find a matching entry,
		 * we ensure that it's never found again, and we create
		 * a new entry for the current request.
		 */
		if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) ||
		    F_ISSET(dbmfp, MP_FILEID_SET)) {
			if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
				bucket = FNBUCKET(path, strlen(path));
			else
				bucket = FNBUCKET(dbmfp->fileid,
				    DB_FILE_ID_LEN);

			hp += bucket;
			/*
			 * If we find the MPOOLFILE, the search increments its
			 * reference count.  That way it cannot go away while
			 * we open it.
			 */
			MUTEX_LOCK(env, hp->mtx_hash);
			ret =
			    __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp);
			if (ret == 0 && mfp != NULL) {
				refinc = 1;

				if (LF_ISSET(DB_MULTIVERSION)) {
					if (MFP_OPEN_CNT(mfp) > (u_int32_t)
					    (LF_ISSET(DB_RDONLY) ? 0 : 1) &&
					    atomic_read(
					    &mfp->multiversion) == 0) {
						MUTEX_UNLOCK(env, hp->mtx_hash);
						goto mvcc_err;
					}
					atomic_inc(env, &mfp->multiversion);
					F_SET(dbmfp, MP_MULTIVERSION);
				}
			}
			MUTEX_UNLOCK(env, hp->mtx_hash);
			if (ret != 0)
				goto err;
		}
	} else {
		/*
		 * Deadfile can only be set if mpf_cnt goes to zero (or if we
		 * failed creating the file DB_AM_DISCARD).  Increment the ref
		 * count so the file cannot become dead and be unlinked.
		 */
		MUTEX_LOCK(env, mfp->mutex);
		if (!mfp->deadfile) {
			if (LF_ISSET(DB_MULTIVERSION)) {
				if (MFP_OPEN_CNT(mfp) > 0 &&
				     atomic_read(&mfp->multiversion) == 0) {
					/*
					 * Release the MPOOLFILE mutex only on
					 * this error path, and do it ahead of
					 * the shared label: the other jumps
					 * to mvcc_err arrive without the
					 * mutex held.  On the success path
					 * the mutex stays held until the
					 * reference counts are updated.
					 */
					MUTEX_UNLOCK(env, mfp->mutex);
mvcc_err:				__db_errx(env, DB_STR("3041",
"DB_MULTIVERSION cannot be specified on a database file that is already open"));
					ret = EINVAL;
					goto err;
				}

				atomic_inc(env, &mfp->multiversion);
				F_SET(dbmfp, MP_MULTIVERSION);
			}
			/*
			 * Increment the reference count.  We also track
			 * those references that don't effect the ability
			 * to convert the handle to either NOT_DURABLE or
			 * MVCC.  These are readonly opens or threads that
			 * are using the handle just to flush a buffer.
			 */
			++mfp->mpf_cnt;
			if (LF_ISSET(DB_FLUSH | DB_RDONLY))
				++mfp->neutral_cnt;
			if (LF_ISSET(DB_FLUSH))
				F_SET(dbmfp, MP_FOR_FLUSH);
			refinc = 1;
		}
		MUTEX_UNLOCK(env, mfp->mutex);

		/*
		 * Test one last time to see if the file is dead -- it may have
		 * been removed.  This happens when a checkpoint trying to open
		 * the file to flush a buffer races with the Db::remove method.
		 * The error will be ignored, so don't output an error message.
		 */
		if (mfp->deadfile) {
			ret = EINVAL;
			goto err;
		}
	}

	if (LF_ISSET(DB_RDONLY))
		F_SET(dbmfp, MP_READONLY);
	if (LF_ISSET(DB_FLUSH))
		F_SET(dbmfp, MP_FLUSH);
	/*
	 * Share the underlying file descriptor if that's possible.  A
	 * read-only handle may share any descriptor; a writable handle
	 * may only share one that was not opened read-only.
	 */
	if (mfp != NULL && !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
		MUTEX_LOCK(env, dbmp->mutex);
		TAILQ_FOREACH(tmp_dbmfp, &dbmp->dbmfq, q)
			if (mfp == tmp_dbmfp->mfp &&
			    (F_ISSET(dbmfp, MP_READONLY) ||
			    !F_ISSET(tmp_dbmfp, MP_READONLY))) {
				++tmp_dbmfp->fhp->ref;
				dbmfp->fhp = tmp_dbmfp->fhp;
				dbmfp->addr = tmp_dbmfp->addr;
				dbmfp->len = tmp_dbmfp->len;
				break;
			}
		MUTEX_UNLOCK(env, dbmp->mutex);
		if (dbmfp->fhp != NULL)
			goto have_mfp;
	}

	/*
	 * If there's no backing file, we can join existing files in the cache,
	 * but there's nothing to read from disk.
	 */
	if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
		/* Convert MP open flags to DB OS-layer open flags. */
		oflags = 0;
		if (LF_ISSET(DB_CREATE))
			oflags |= DB_OSO_CREATE;
		if (LF_ISSET(DB_DIRECT))
			oflags |= DB_OSO_DIRECT;
		if (LF_ISSET(DB_RDONLY))
			oflags |= DB_OSO_RDONLY;

		/*
		 * XXX
		 * A grievous layering violation, the DB_DSYNC_DB flag
		 * was left in the ENV structure and not driven through
		 * the cache API.  This needs to be fixed when the general
		 * API configuration is fixed.
		 */
		if (F_ISSET(env->dbenv, DB_ENV_DSYNC_DB))
			oflags |= DB_OSO_DSYNC;

		/*
		 * Get the real name for this file and open it.
		 *
		 * Supply a page size so os_open can decide whether to
		 * turn buffering off if the DB_DIRECT_DB flag is set.
		 *
		 * Acquire the region lock if we're using a path from
		 * an underlying MPOOLFILE -- there's a race in accessing
		 * the path name stored in the region, __memp_nameop may
		 * be simultaneously renaming the file.
		 */

		ret = 0;
		if (mfp != NULL) {
			MPOOL_SYSTEM_LOCK(env);
			path = R_ADDR(dbmp->reginfo, mfp->path_off);
			/*
			 * Any real path built earlier came from the caller's
			 * name; rebuild it from the region's name.
			 */
			if (rpath != NULL) {
				__os_free(env, rpath);
				rpath = NULL;
			}
		}
		if (rpath == NULL)
			ret = __db_appname(env,
			    DB_APP_DATA, path, dirp, &rpath);
		if (ret == 0)
			ret = __os_open(env, rpath,
			     (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp);
		if (mfp != NULL)
			MPOOL_SYSTEM_UNLOCK(env);
		if (ret != 0)
			goto err;

		/*
		 * Cache file handles are shared, and have mutexes to
		 * protect the underlying file handle across seek and
		 * read/write calls.
		 */
		dbmfp->fhp->ref = 1;
		if ((ret = __mutex_alloc(env, MTX_MPOOL_FH,
		     DB_MUTEX_PROCESS_ONLY, &dbmfp->fhp->mtx_fh)) != 0)
			goto err;

		/* Figure out the file's size. */
		if ((ret = __os_ioinfo(
		    env, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
			__db_err(env, ret, "%s", rpath);
			goto err;
		}

		/*
		 * Don't permit files that aren't a multiple of the pagesize,
		 * and find the number of the last page in the file, all the
		 * time being careful not to overflow 32 bits.
		 *
		 * During verify or recovery, we might have to cope with a
		 * truncated file; if the file size is not a multiple of the
		 * page size, round down to a page, we'll take care of the
		 * partial page outside the mpool system.
		 *
		 * Pagesize of 0 is only allowed for in-mem dbs.
		 */
		DB_ASSERT(env, pagesize != 0);
		if (bytes % pagesize != 0) {
			if (LF_ISSET(DB_ODDFILESIZE))
				bytes -= (u_int32_t)(bytes % pagesize);
			else {
				/*
				 * If the file size is not a multiple of the
				 * pagesize, it is likely because the ioinfo
				 * call is racing with a write that is extending
				 * the file.  Many file systems will extend
				 * in fs block size units, and if the pagesize
				 * is larger than that, we can briefly see a
				 * file size that is not a multiple of pagesize.
				 *
				 * Yield the processor to allow that to finish
				 * and try again a few times.
				 */
				tries = 0;
				STAT((mp->stat.st_oddfsize_detect++));
				while (tries < MP_IOINFO_RETRIES) {
					if ((ret = __os_ioinfo(env, rpath,
					    dbmfp->fhp, &mbytes, &bytes,
					    NULL)) != 0) {
						__db_err(env, ret, "%s", rpath);
						goto err;
					}
					if (bytes % pagesize != 0) {
						__os_yield(env, 0, 50000);
						tries++;
					} else {
					    STAT((
					    mp->stat.st_oddfsize_resolve++));
					    break;
					}
				}
				if (tries == MP_IOINFO_RETRIES) {
					__db_errx(env, DB_STR_A("3043",
    "%s: file size (%lu %lu) not a multiple of the pagesize %lu",
    "%s %lu %lu %lu"),
    rpath, (u_long)mbytes, (u_long)bytes, (u_long)pagesize);
					ret = EINVAL;
					goto err;
				}
			}
		}

		/*
		 * Get the file id if we weren't given one.  Generated file id's
		 * don't use timestamps, otherwise there'd be no chance of any
		 * other process joining the party.  Don't bother looking for
		 * this id in the hash table, it's new.
		 */
		if (mfp == NULL && !F_ISSET(dbmfp, MP_FILEID_SET)) {
			if  ((ret =
			     __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0)
				goto err;
			F_SET(dbmfp, MP_FILEID_SET);
			bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN);
			hp += bucket;
			goto alloc;
		}
	}

	if (mfp != NULL)
		goto have_mfp;

	/*
	 * We can race with another process opening the same file when
	 * we allocate the mpoolfile structure.  We will come back
	 * here and check the hash table again to see if it has appeared.
	 * For most files this is not a problem, since the name is locked
	 * at a higher layer but QUEUE extent files are not locked.
	 */
check:	MUTEX_LOCK(env, hp->mtx_hash);
	/*
	 * Keep the comparison outside the assignment so ret holds the real
	 * error code, and don't leave the bucket locked on failure.
	 */
	if ((ret =
	    __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp)) != 0) {
		MUTEX_UNLOCK(env, hp->mtx_hash);
		goto err;
	}

	if (alloc_mfp != NULL && mfp == NULL) {
		/* Nobody beat us to it: publish the one we allocated. */
		mfp = alloc_mfp;
		alloc_mfp = NULL;
		SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile);
	} else if (mfp != NULL) {
		refinc = 1;
		/*
		 * Some things about a file cannot be changed: the clear length,
		 * page size, or LSN location.  However, if this is an attempt
		 * to open a named in-memory file, we may not yet have that
		 * information. so accept uninitialized entries.
		 *
		 * The file type can change if the application's pre- and post-
		 * processing needs change.  For example, an application that
		 * created a hash subdatabase in a database that was previously
		 * all btree.
		 *
		 * !!!
		 * We do not check to see if the pgcookie information changed,
		 * or update it if it is.
		 */
		if ((dbmfp->clear_len != DB_CLEARLEN_NOTSET &&
		    mfp->clear_len != DB_CLEARLEN_NOTSET &&
		    dbmfp->clear_len != mfp->clear_len) ||
		    (pagesize != 0 && pagesize != mfp->pagesize) ||
		    (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET &&
		    mfp->lsn_off != DB_LSN_OFF_NOTSET &&
		    dbmfp->lsn_offset != mfp->lsn_off)) {
			__db_errx(env, DB_STR_A("3038",
		    "%s: clear length, page size or LSN location changed",
			    "%s"), path);
			MUTEX_UNLOCK(env, hp->mtx_hash);
			ret = EINVAL;
			goto err;
		}
	}
	if (mfp != NULL && LF_ISSET(DB_MULTIVERSION)) {
		if (MFP_OPEN_CNT(mfp) > 1 &&
		    atomic_read(&mfp->multiversion) == 0) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			goto mvcc_err;
		}
		atomic_inc(env, &mfp->multiversion);
		F_SET(dbmfp, MP_MULTIVERSION);
	}

	MUTEX_UNLOCK(env, hp->mtx_hash);
	if (alloc_mfp != NULL) {
		/*
		 * Someone else created the entry while we were allocating
		 * ours; throw ours away.  __memp_mf_discard expects the
		 * MPOOLFILE mutex to be held and releases it.
		 */
		MUTEX_LOCK(env, alloc_mfp->mutex);
		if ((ret = __memp_mf_discard(dbmp, alloc_mfp, 0)) != 0)
			goto err;
	}

	if (mfp == NULL) {
		/*
		 * If we didn't find the file and this is an in-memory file,
		 * then the create flag should be set.
		 */
		if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
		    !LF_ISSET(DB_CREATE)) {
			ret = ENOENT;
			goto err;
		}

alloc:		if ((ret = __memp_mpf_alloc(dbmp,
		     dbmfp, path, pagesize, flags, &alloc_mfp)) != 0)
			goto err;

		/*
		 * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a
		 * page get, we have to increment the last page in the file.
		 * Figure it out and save it away.
		 *
		 * Note correction: page numbers are zero-based, not 1-based.
		 */
		DB_ASSERT(env, pagesize != 0);
		last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize));
		last_pgno += (db_pgno_t)(bytes / pagesize);
		if (last_pgno != 0)
			--last_pgno;

		alloc_mfp->last_flushed_pgno = alloc_mfp->orig_last_pgno =
		    alloc_mfp->last_pgno = last_pgno;

		alloc_mfp->bucket = bucket;

		/* Go back and see if someone else has opened the file. */
		if (path != NULL)
			goto check;

		mfp = alloc_mfp;

		if (LF_ISSET(DB_MULTIVERSION)) {
			atomic_inc(env, &mfp->multiversion);
			F_SET(dbmfp, MP_MULTIVERSION);
		}

		/* This is a temp, no one else can see it, put it at the end. */
		MUTEX_LOCK(env, hp->mtx_hash);
		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, mfp, q);
		MUTEX_UNLOCK(env, hp->mtx_hash);
	}
have_mfp:
	/*
	 * We need to verify that all handles open a file either durable or not
	 * durable.  This needs to be cross process and cross sub-databases, so
	 * mpool is the place to do it.
	 */
	if (!LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) {
		if (F_ISSET(mfp, MP_DURABLE_UNKNOWN)) {
			if (LF_ISSET(DB_TXN_NOT_DURABLE))
				F_SET(mfp, MP_NOT_DURABLE);
			F_CLR(mfp, MP_DURABLE_UNKNOWN);
		} else if (!LF_ISSET(DB_TXN_NOT_DURABLE) !=
		    !F_ISSET(mfp, MP_NOT_DURABLE)) {
			__db_errx(env, DB_STR("3039",
	     "Cannot open DURABLE and NOT DURABLE handles in the same file"));
			ret = EINVAL;
			goto err;
		}
	}

	/*
	 * All paths to here have initialized the mfp variable to reference
	 * the selected (or allocated) MPOOLFILE.
	 */
	dbmfp->mfp = mfp;

	/*
	 * Check to see if we can mmap the file.  If a file:
	 *	+ isn't temporary
	 *	+ is read-only
	 *	+ doesn't require any pgin/pgout support
	 *	+ the DB_NOMMAP flag wasn't set (in either the file open or
	 *	  the environment in which it was opened)
	 *	+ and is less than mp_mmapsize bytes in size
	 *
	 * we can mmap it instead of reading/writing buffers.  Don't do error
	 * checking based on the mmap call failure.  We want to do normal I/O
	 * on the file if the reason we failed was because the file was on an
	 * NFS mounted partition, and we can fail in buffer I/O just as easily
	 * as here.
	 *
	 * We'd like to test to see if the file is too big to mmap.  Since we
	 * don't know what size or type off_t's or size_t's are, or the largest
	 * unsigned integral type is, or what random insanity the local C
	 * compiler will perpetrate, doing the comparison in a portable way is
	 * flatly impossible.  Hope that mmap fails if the file is too large.
	 */
#define	DB_MAXMMAPSIZE	(10 * 1024 * 1024)	/* 10 MB. */
	if (F_ISSET(mfp, MP_CAN_MMAP) && dbmfp->addr == NULL) {
		maxmap = dbenv->mp_mmapsize == 0 ?
		    DB_MAXMMAPSIZE : dbenv->mp_mmapsize;
		if (path == NULL ||
		    FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
			F_CLR(mfp, MP_CAN_MMAP);
		else if (!F_ISSET(dbmfp, MP_READONLY))
			F_CLR(mfp, MP_CAN_MMAP);
		else if (dbmfp->ftype != 0)
			F_CLR(mfp, MP_CAN_MMAP);
		else if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP))
			F_CLR(mfp, MP_CAN_MMAP);
		else {
			/* The limit stored in the region takes precedence. */
			MPOOL_SYSTEM_LOCK(env);
			maxmap = mp->mp_mmapsize == 0 ?
			    DB_MAXMMAPSIZE : mp->mp_mmapsize;
			MPOOL_SYSTEM_UNLOCK(env);
			if (mbytes > maxmap / MEGABYTE ||
			    (mbytes == maxmap / MEGABYTE &&
			    bytes >= maxmap % MEGABYTE))
				F_CLR(mfp, MP_CAN_MMAP);
		}

		dbmfp->addr = NULL;
		if (F_ISSET(mfp, MP_CAN_MMAP)) {
			dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
			if (__os_mapfile(env, rpath,
			    dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) {
				dbmfp->addr = NULL;
				F_CLR(mfp, MP_CAN_MMAP);
			}
		}
	}

	F_SET(dbmfp, MP_OPEN_CALLED);

	/*
	 * Add the file to the process' list of DB_MPOOLFILEs.
	 */
	MUTEX_LOCK(env, dbmp->mutex);
	TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
	MUTEX_UNLOCK(env, dbmp->mutex);

	if (0) {
err:		if (refinc) {
			/*
			 * If mpf_cnt goes to zero here and unlink_on_close is
			 * set, then we missed the last close, but there was an
			 * error trying to open the file, so we probably cannot
			 * unlink it anyway.
			 */
			MUTEX_LOCK(env, mfp->mutex);
			--mfp->mpf_cnt;
			if (LF_ISSET(DB_FLUSH | DB_RDONLY)) {
				DB_ASSERT(env, mfp->neutral_cnt != 0);
				--mfp->neutral_cnt;
			}
			MUTEX_UNLOCK(env, mfp->mutex);
		}

	}
	if (rpath != NULL)
		__os_free(env, rpath);
	return (ret);
}

/*
 * __memp_mpf_find --
 *	Search a hash bucket for a MPOOLFILE.
 *
 *	Walks the bucket hp looking for a live, non-temporary MPOOLFILE
 *	matching the handle: by name for DB_MPOOL_NOFILE handles, by file
 *	id otherwise.  On a match the entry's reference count (and neutral
 *	count for DB_FLUSH/DB_RDONLY opens) is incremented under the
 *	MPOOLFILE mutex and attributes configured on the handle are copied
 *	into the entry.  With DB_TRUNCATE, matching entries are instead
 *	marked dead and the search continues.
 *
 *	The callers in this file hold hp->mtx_hash across the call.
 *
 *	*mfpp is set to the loop variable on exit: the matching entry, or
 *	whatever SH_TAILQ_FOREACH leaves behind when the list is exhausted
 *	(callers treat that as NULL).  Always returns 0.
 */
static int
__memp_mpf_find(env, dbmfp, hp, path, flags, mfpp)
	ENV *env;
	DB_MPOOLFILE *dbmfp;
	DB_MPOOL_HASH *hp;
	const char *path;
	u_int32_t flags;
	MPOOLFILE **mfpp;
{
	DB_MPOOL *dbmp;
	MPOOLFILE *mfp;

	dbmp = env->mp_handle;

	SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
		/* Skip dead files and temporary files. */
		if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
			continue;

		/*
		 * Any remaining DB_MPOOL_NOFILE databases are in-memory
		 * named databases and need only match other in-memory
		 * databases with the same name.
		 */
		if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
			if (!mfp->no_backing_file)
				continue;

			if (strcmp(path,
			    R_ADDR(dbmp->reginfo, mfp->path_off)) != 0)
				continue;

			/*
			 * We matched an in-memory file; grab the fileid if
			 * it is set in the region, but not in the dbmfp.
			 */
			if (!F_ISSET(dbmfp, MP_FILEID_SET))
				(void)__memp_set_fileid(dbmfp,
				    R_ADDR(dbmp->reginfo, mfp->fileid_off));
		} else
			if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
			    mfp->fileid_off), DB_FILE_ID_LEN) != 0)
				continue;

		/*
		 * If the file is being truncated, remove it from the system
		 * and create a new entry.
		 *
		 * !!!
		 * We should be able to set mfp to NULL and break out of the
		 * loop, but I like the idea of checking all the entries.
		 */
		if (LF_ISSET(DB_TRUNCATE)) {
			MUTEX_LOCK(env, mfp->mutex);
			mfp->deadfile = 1;
			MUTEX_UNLOCK(env, mfp->mutex);
			continue;
		}

		/*
		 * Check to see if this file has died while we waited.
		 *
		 * We normally don't lock the deadfile field when we read it as
		 * we only care if the field is zero or non-zero.  We do lock
		 * on read when searching for a matching MPOOLFILE so that two
		 * threads of control don't race between setting the deadfile
		 * bit and incrementing the reference count, that is, a thread
		 * of control decrementing the reference count and then setting
		 * deadfile because the reference count is 0 blocks us finding
		 * the file without knowing it's about to be marked dead.
		 */
		MUTEX_LOCK(env, mfp->mutex);
		if (mfp->deadfile) {
			MUTEX_UNLOCK(env, mfp->mutex);
			continue;
		}
		++mfp->mpf_cnt;
		if (LF_ISSET(DB_FLUSH | DB_RDONLY))
			++mfp->neutral_cnt;
		if (LF_ISSET(DB_FLUSH))
			F_SET(dbmfp, MP_FOR_FLUSH);
		MUTEX_UNLOCK(env, mfp->mutex);

		/*
		 * Copy every attribute that is configured on the handle into
		 * the shared entry; attributes the handle leaves unset keep
		 * the entry's current value.
		 */
		if (dbmfp->ftype != 0)
			mfp->ftype = dbmfp->ftype;
		if (dbmfp->clear_len != DB_CLEARLEN_NOTSET)
			mfp->clear_len = dbmfp->clear_len;
		if (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET)
			mfp->lsn_off = dbmfp->lsn_offset;

		break;
	}

	*mfpp = mfp;
	return (0);
}

static int
__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *dbmfp;
	const char *path;
	u_int32_t pagesize;
	u_int32_t flags;
	MPOOLFILE **retmfp;
{
	ENV *env;
	MPOOLFILE *mfp;
	int ret;
	void *p;

	env = dbmp->env;
	ret = 0;
	/* Allocate and initialize a new MPOOLFILE. */
	if ((ret = __memp_alloc(dbmp,
	     dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
		goto err;
	memset(mfp, 0, sizeof(MPOOLFILE));
	mfp->mpf_cnt = 1;
	if (LF_ISSET(DB_FLUSH | DB_RDONLY))
		mfp->neutral_cnt = 1;
	if (LF_ISSET(DB_FLUSH))
		F_SET(dbmfp, MP_FOR_FLUSH);
	mfp->ftype = dbmfp->ftype;
	mfp->pagesize = pagesize;
	mfp->lsn_off = dbmfp->lsn_offset;
	mfp->clear_len = dbmfp->clear_len;
	mfp->priority = dbmfp->priority;
	__memp_set_maxpgno(mfp, dbmfp->gbytes, dbmfp->bytes);
	if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
		mfp->no_backing_file = 1;
	if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK))
		mfp->unlink_on_close = 1;

	F_SET(mfp, MP_CAN_MMAP);
	if (F_ISSET(env->dbenv, DB_ENV_DATABASE_LOCKING))
		F_SET(mfp, MP_DATABASE_LOCKING);
	if (LF_ISSET(DB_DIRECT))
		F_SET(mfp, MP_DIRECT);
	if (LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY))
		F_SET(mfp, MP_DURABLE_UNKNOWN);
	if (LF_ISSET(DB_EXTENT))
		F_SET(mfp, MP_EXTENT);
	if (LF_ISSET(DB_TXN_NOT_DURABLE))
		F_SET(mfp, MP_NOT_DURABLE);

	/*
	 * An in-memory database with no name is a temp file.  Named
	 * in-memory databases get an artificially  bumped reference
	 * count so they don't disappear on close; they need a remove
	 * to make them disappear.
	 */
	if (path == NULL)
		F_SET(mfp, MP_TEMP);
	else if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
		mfp->mpf_cnt++;

	/* Copy the file identification string into shared memory. */
	if (F_ISSET(dbmfp, MP_FILEID_SET)) {
		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
		    NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
			goto err;
		memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN);
	}

	/* Copy the file path into shared memory. */
	if (path != NULL) {
		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
		    NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0)
			goto err;
		memcpy(p, path, strlen(path) + 1);
	}

	/* Copy the page cookie into shared memory. */
	if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) {
		mfp->pgcookie_len = 0;
		mfp->pgcookie_off = 0;
	} else {
		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
		    NULL, dbmfp->pgcookie->size,
		    &mfp->pgcookie_off, &p)) != 0)
			goto err;
		memcpy(p,
		     dbmfp->pgcookie->data, dbmfp->pgcookie->size);
		mfp->pgcookie_len = dbmfp->pgcookie->size;
	}

	if ((ret = __mutex_alloc(env,
	    MTX_MPOOLFILE_HANDLE, 0, &mfp->mutex)) != 0)
		goto err;
#ifndef HAVE_ATOMICFILEREAD
	if ((ret = __mutex_alloc(env,
	    MTX_MPOOLFILE_HANDLE, DB_MUTEX_SHARED, &mfp->mtx_write)) != 0)
		goto err;
#endif
	*retmfp = mfp;

err:	return (ret);
}

/*
 * __memp_fclose_pp --
 *	DB_MPOOLFILE->close pre/post processing.
 *
 *	Non-zero flags are reported through __db_ferr but do not stop the
 *	close; __memp_fclose is always called with flags of 0, wrapped in
 *	ENV_ENTER/ENV_LEAVE and REPLICATION_WRAP.  Returns the result of
 *	that call.
 *
 * PUBLIC: int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t));
 */
int
__memp_fclose_pp(dbmfp, flags)
	DB_MPOOLFILE *dbmfp;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbmfp->env;

	/*
	 * Validate arguments, but as a handle destructor, we can't fail.
	 */
	if (flags != 0)
		(void)__db_ferr(env, "DB_MPOOLFILE->close", 0);

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__memp_fclose(dbmfp, 0)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_fclose --
 *	DB_MPOOLFILE->close.
 *
 * PUBLIC: int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
 */
int
__memp_fclose(dbmfp, flags)
	DB_MPOOLFILE *dbmfp;
	u_int32_t flags;
{
	DB_MPOOL *dbmp;
	ENV *env;
	MPOOLFILE *mfp;
	char *rpath;
	u_int32_t ref;
	int deleted, ret, t_ret;

	env = dbmfp->env;
	dbmp = env->mp_handle;
	ret = 0;

	/*
	 * Remove the DB_MPOOLFILE from the process' list.
	 *
	 * It's possible the underlying mpool cache may never have been created.
	 * In that case, all we have is a structure, discard it.
	 *
	 * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE
	 * file list, check the MP_OPEN_CALLED flag to be sure.
	 */
	if (dbmp == NULL)
		goto done;

	MUTEX_LOCK(env, dbmp->mutex);

	DB_ASSERT(env, dbmfp->ref >= 1);
	if ((ref = --dbmfp->ref) == 0 && F_ISSET(dbmfp, MP_OPEN_CALLED))
		TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);

	/*
	 * Decrement the file descriptor's ref count -- if we're the last ref,
	 * we'll discard the file descriptor.
	 */
	if (ref == 0 && dbmfp->fhp != NULL && --dbmfp->fhp->ref > 0)
		dbmfp->fhp = NULL;
	MUTEX_UNLOCK(env, dbmp->mutex);
	if (ref != 0)
		return (0);

	/* Complain if pinned blocks never returned. */
	if (dbmfp->pinref != 0) {
		__db_errx(env, DB_STR_A("3040",
		    "%s: close: %lu blocks left pinned", "%s %lu"),
		    __memp_fn(dbmfp), (u_long)dbmfp->pinref);
		ret = __env_panic(env, DB_RUNRECOVERY);
	}

	/* Discard any mmap information. */
	if (dbmfp->addr != NULL && dbmfp->fhp != NULL &&
	    (ret = __os_unmapfile(env, dbmfp->addr, dbmfp->len)) != 0)
		__db_err(env, ret, "%s", __memp_fn(dbmfp));

	/*
	 * Close the file and discard the descriptor structure; temporary
	 * files may not yet have been created.
	 */
	if (dbmfp->fhp != NULL) {
		if ((t_ret =
		    __mutex_free(env, &dbmfp->fhp->mtx_fh)) != 0 && ret == 0)
			ret = t_ret;
		if ((t_ret = __os_closehandle(env, dbmfp->fhp)) != 0) {
			__db_err(env, t_ret, "%s", __memp_fn(dbmfp));
			if (ret == 0)
				ret = t_ret;
		}
		dbmfp->fhp = NULL;
	}

	/*
	 * Discard our reference on the underlying MPOOLFILE, and close it
	 * if it's no longer useful to anyone.  It possible the open of the
	 * file never happened or wasn't successful, in which case, mpf will
	 * be NULL and MP_OPEN_CALLED will not be set.
	 */
	mfp = dbmfp->mfp;
	DB_ASSERT(env,
	    (F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp != NULL) ||
	    (!F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp == NULL));
	if (!F_ISSET(dbmfp, MP_OPEN_CALLED))
		goto done;

	/*
	 * If it's a temp file, all outstanding references belong to unflushed
	 * buffers.  (A temp file can only be referenced by one DB_MPOOLFILE).
	 * We don't care about preserving any of those buffers, so mark the
	 * MPOOLFILE as dead so that even the dirty ones just get discarded
	 * when we try to flush them.
	 */
	deleted = 0;
	if (!LF_ISSET(DB_MPOOL_NOLOCK))
		MUTEX_LOCK(env, mfp->mutex);
	if (F_ISSET(dbmfp, MP_MULTIVERSION))
		atomic_dec(env, &mfp->multiversion);
	if (F_ISSET(dbmfp, MP_READONLY) ||
	    (LF_ISSET(DB_FLUSH) && F_ISSET(dbmfp, MP_FOR_FLUSH))) {
		DB_ASSERT(env, mfp->neutral_cnt != 0);
		--mfp->neutral_cnt;
	}
	DB_ASSERT(env, mfp->neutral_cnt < mfp->mpf_cnt);
	if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) {
		if (LF_ISSET(DB_MPOOL_DISCARD) ||
		    F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) {
			mfp->deadfile = 1;
		}
		if (mfp->unlink_on_close) {
			if ((t_ret = __db_appname(dbmp->env, DB_APP_DATA,
			    R_ADDR(dbmp->reginfo, mfp->path_off), NULL,
			    &rpath)) != 0 && ret == 0)
				ret = t_ret;
			if (t_ret == 0) {
				if ((t_ret = __os_unlink(
				    dbmp->env, rpath, 0)) != 0 && ret == 0)
					ret = t_ret;
				__os_free(env, rpath);
			}
			mfp->unlink_on_close = 0;
		}
		if (MFP_OPEN_CNT(mfp) == 0) {
			F_CLR(mfp, MP_NOT_DURABLE);
			F_SET(mfp, MP_DURABLE_UNKNOWN);
		}
		if (mfp->block_cnt == 0) {
			/*
			 * We should never discard this mp file if our caller
			 * is holding the lock on it.  See comment in
			 * __memp_sync_file.
			 */
			DB_ASSERT(env, !LF_ISSET(DB_MPOOL_NOLOCK));
			if ((t_ret =
			    __memp_mf_discard(dbmp, mfp, 0)) != 0 && ret == 0)
				ret = t_ret;
			deleted = 1;
		}
	}
	if (!deleted && !LF_ISSET(DB_MPOOL_NOLOCK))
		MUTEX_UNLOCK(env, mfp->mutex);

done:	/* Discard the DB_MPOOLFILE structure. */
	if (dbmfp->pgcookie != NULL) {
		__os_free(env, dbmfp->pgcookie->data);
		__os_free(env, dbmfp->pgcookie);
	}
	__os_free(env, dbmfp);

	return (ret);
}

/*
 * __memp_mf_discard --
 *	Discard an MPOOLFILE.
 *
 *	The caller must hold mfp->mutex; it is released and freed here.
 *	The entry is marked dead, its backing file is unlinked if that was
 *	requested and nothing references it, it is removed from its hash
 *	bucket, synced if it had been written, its statistics are folded
 *	into the region totals, and its region memory is freed.
 *
 *	hp_locked is non-zero when the caller already holds the hash
 *	bucket mutex, in which case it is not acquired here.
 *
 *	Returns 0 or the first error encountered; the discard always runs
 *	to completion.
 *
 * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *, int));
 */
int
__memp_mf_discard(dbmp, mfp, hp_locked)
	DB_MPOOL *dbmp;
	MPOOLFILE *mfp;
	int hp_locked;
{
	DB_MPOOL_HASH *hp;
	ENV *env;
#ifdef HAVE_STATISTICS
	DB_MPOOL_STAT *sp;
#endif
	MPOOL *mp;
	char *rpath;
	int need_sync, ret, t_ret;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	hp += mfp->bucket;
	ret = 0;

	/*
	 * Expects caller to be holding the MPOOLFILE mutex.
	 *
	 * When discarding a file, we have to flush writes from it to disk.
	 * The scenario is that dirty buffers from this file need to be
	 * flushed to satisfy a future checkpoint, but when the checkpoint
	 * calls mpool sync, the sync code won't know anything about them.
	 * Ignore files not written, discarded, or only temporary.
	 */
	need_sync = mfp->file_written && !mfp->deadfile &&
	    !F_ISSET(mfp, MP_TEMP) && !mfp->no_backing_file;

	/*
	 * We have to release the MPOOLFILE mutex before acquiring the region
	 * mutex so we don't deadlock.  Make sure nobody ever looks at this
	 * structure again.
	 */
	mfp->deadfile = 1;

	/* We should unlink the file if necessary. */
	if (mfp->block_cnt == 0 && mfp->mpf_cnt == 0 && mfp->unlink_on_close &&
	    !F_ISSET(mfp, MP_TEMP) && !mfp->no_backing_file) {
		if ((t_ret = __db_appname(env, DB_APP_DATA,
		    R_ADDR(dbmp->reginfo, mfp->path_off), NULL,
		    &rpath)) != 0 && ret == 0)
			ret = t_ret;
		if (t_ret == 0) {
			if ((t_ret = __os_unlink(
			    dbmp->env, rpath, 0)) != 0 && ret == 0)
				ret = t_ret;
			__os_free(env, rpath);
		}
		mfp->unlink_on_close = 0;
		/* The file is gone; there is nothing left to sync. */
		need_sync = 0;
	}

	/* Discard the mutex we're holding and return it to the pool. */
	MUTEX_UNLOCK(env, mfp->mutex);
	if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0)
		ret = t_ret;
#ifndef HAVE_ATOMICFILEREAD
	/*
	 * Collect the result in t_ret, like every other step here, so an
	 * earlier error is preserved and a failure is reported only when
	 * it is the first one.
	 */
	if ((t_ret = __mutex_free(env, &mfp->mtx_write)) != 0 && ret == 0)
		ret = t_ret;
#endif

	/*
	 * Lock the bucket and delete from the list of MPOOLFILEs.
	 * If this function is called by __memp_discard_all_mpfs,
	 * the MPOOLFILE hash bucket is already locked.
	 */
	if (!hp_locked)
		MUTEX_LOCK(env, hp->mtx_hash);
	SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
	if (!hp_locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	/* Lock the region and collect stats and free the space. */
	MPOOL_SYSTEM_LOCK(env);
	if (need_sync &&
	    (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0)
		ret = t_ret;

#ifdef HAVE_STATISTICS
	/* Copy the statistics into the region. */
	sp = &mp->stat;
	sp->st_cache_hit += mfp->stat.st_cache_hit;
	sp->st_cache_miss += mfp->stat.st_cache_miss;
	sp->st_map += mfp->stat.st_map;
	sp->st_page_create += mfp->stat.st_page_create;
	sp->st_page_in += mfp->stat.st_page_in;
	sp->st_page_out += mfp->stat.st_page_out;
#endif

	/* Free the space. */
	if (mfp->path_off != 0)
		__memp_free(&dbmp->reginfo[0],
		    R_ADDR(dbmp->reginfo, mfp->path_off));
	if (mfp->fileid_off != 0)
		__memp_free(&dbmp->reginfo[0],
		    R_ADDR(dbmp->reginfo, mfp->fileid_off));
	if (mfp->pgcookie_off != 0)
		__memp_free(&dbmp->reginfo[0],
		    R_ADDR(dbmp->reginfo, mfp->pgcookie_off));
	__memp_free(&dbmp->reginfo[0], mfp);

	MPOOL_SYSTEM_UNLOCK(env);

	return (ret);
}

/*
 * __memp_inmemlist --
 *	Return a list of the named in-memory databases.
 *
 *	Every file-table bucket is scanned under its hash mutex; the name
 *	of each live, non-temporary entry without a backing file is
 *	duplicated into a growing array.  On success *namesp and *cntp
 *	receive the array and its length (NULL and 0 when none exist) and
 *	0 is returned.  If an allocation fails, everything gathered so far
 *	is freed, *namesp and *cntp are cleared, and the error is returned.
 *
 * PUBLIC: int __memp_inmemlist __P((ENV *, char ***, int *));
 */
int
__memp_inmemlist(env, namesp, cntp)
	ENV *env;
	char ***namesp;
	int *cntp;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *mp;
	MPOOLFILE *mfp;
	char **list;
	int capacity, found, ndx, ret;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;

	list = NULL;
	capacity = 0;
	found = 0;

	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	for (ndx = 0; ndx < MPOOL_FILE_BUCKETS; ++ndx, ++hp) {
		MUTEX_LOCK(env, hp->mtx_hash);
		SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
			/*
			 * Only live, named entries with no backing file
			 * are of interest.
			 */
			if (mfp->deadfile ||
			    F_ISSET(mfp, MP_TEMP) || !mfp->no_backing_file)
				continue;

			/* Grow the array 100 slots at a time. */
			if (found >= capacity) {
				capacity += 100;
				ret = __os_realloc(env,
				    (u_int)capacity * sizeof(list[0]), &list);
				if (ret != 0)
					goto nomem;
			}

			ret = __os_strdup(env,
			    R_ADDR(dbmp->reginfo, mfp->path_off),
			    &list[found]);
			if (ret != 0)
				goto nomem;
			++found;
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
	}

	*namesp = list;
	*cntp = found;
	return (0);

nomem:	/* The bucket being scanned is still locked at this point. */
	MUTEX_UNLOCK(env, hp->mtx_hash);
	if (list != NULL) {
		for (ndx = found - 1; ndx >= 0; --ndx)
			__os_free(env, list[ndx]);
		__os_free(env, list);
	}

	/* Make sure we don't return any garbage. */
	*cntp = 0;
	*namesp = NULL;
	return (ret);
}