/*-
 * Copyright (c) 2004, 2018 Oracle and/or its affiliates.  All rights reserved.
 *
 * See the file LICENSE for license information.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"

/* Name of the registry file; it is opened via __db_appname() below. */
#define	REGISTER_FILE	"__db.register"

/*
 * Registry slots are fixed-length text records of PID_LEN bytes.  A used
 * slot is written with PID_FMT (the process ID followed by a newline); an
 * unused slot holds exactly the PID_EMPTY image.
 */
#define	PID_EMPTY	"X                      0\n"	/* Unused PID entry */
#define	PID_FMT		"%24lu\n"			/* PID entry format */
							/* Unused PID test */
#define	PID_ISEMPTY(p)	(memcmp(p, PID_EMPTY, PID_LEN) == 0)
#define	PID_LEN		(25)				/* PID entry length */

/*
 * File-region locking helpers, all built on __os_fdlock() against the
 * registry file handle stored in the DB_ENV.
 *
 * REGISTRY_LOCK requests a write lock at byte offset 'pos'; 'nowait' is
 * passed through to __os_fdlock().  REGISTRY_UNLOCK releases the lock at
 * that offset.
 *
 * The "exclusive" registry lock is the lock at byte offset 1.  Slot locks
 * in this file are taken at multiples of PID_LEN, so they never coincide
 * with the exclusive lock's offset.
 */
#define	REGISTRY_LOCK(env, pos, nowait)					\
	__os_fdlock(env, (env)->dbenv->registry, (off_t)(pos),		\
	    DB_LOCK_WRITE, nowait)
#define	REGISTRY_UNLOCK(env, pos)					\
	__os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), DB_LOCK_NG, 0)
#define	REGISTRY_EXCL_LOCK(env, nowait)					\
	REGISTRY_LOCK(env, 1, nowait)
#define	REGISTRY_EXCL_UNLOCK(env)					\
	REGISTRY_UNLOCK(env, 1)

/* File-local helpers; the definitions appear later in this file. */
static	int __envreg_add __P((ENV *, int *, u_int32_t));
static	int __envreg_pid_compare __P((const void *, const void *));
static	int __envreg_create_active_pid __P((ENV *, char *));
static	int __envreg_add_active_pid __P((ENV*, char *));

/*
 * Support for portable, multi-process database environment locking, based on
 * the Subversion SR (#11511).
 *
 * The registry feature is configured by specifying the DB_REGISTER flag to the
 * DbEnv.open method.  If DB_REGISTER is specified, DB opens the registry file
 * in the database environment home directory.  The registry file is formatted
 * as follows:
 *
 *	                    12345		# process ID slot 1
 *	X		# empty slot
 *	                    12346		# process ID slot 2
 *	X		# empty slot
 *	                    12347		# process ID slot 3
 *	                    12348		# process ID slot 4
 *	X                   12349		# empty slot
 *	X		# empty slot
 *
 * All lines are fixed-length.  All lines are process ID slots.  Empty slots
 * are marked with leading non-digit characters.
 *
 * To modify the file, you get an exclusive lock on the first byte of the file.
 *
 * While holding any DbEnv handle, each process has an exclusive lock on the
 * first byte of a process ID slot.  There is a restriction on having more
 * than one DbEnv handle open at a time, because Berkeley DB uses per-process
 * locking to implement this feature, that is, a process may never have more
 * than a single slot locked.
 *
 * This work requires that if a process dies or the system crashes, locks held
 * by the dying processes will be dropped.  (We can't use system shared
 * memory-backed or filesystem-backed locks because they're persistent when a
 * process dies.)  On POSIX systems, we use fcntl(2) locks; on Win32 we have
 * LockFileEx/UnlockFile, except for Win/9X and Win/ME which have to loop on
 * LockFile/UnlockFile.
 *
 * We could implement the same solution with flock locking instead of fcntl,
 * but flock would require a separate file for each process of control (and
 * probably each DbEnv handle) in the database environment, which is fairly
 * ugly.
 *
 * Whenever a process opens a new DbEnv handle, it walks the registry file and
 * verifies it CANNOT acquire the lock for any non-empty slot.  If a lock for
 * a non-empty slot is available, we know a process died holding an open handle,
 * and recovery needs to be run.
 *
 * It's possible to get corruption in the registry file.  If a write system
 * call fails after partially completing, there can be corrupted entries in
 * the registry file, or a partial entry at the end of the file.  This is OK.
 * A corrupted entry will be flagged as a non-empty line during the registry
 * file walk.  Since the line was corrupted by process failure, no process will
 * hold a lock on the slot, which will lead to recovery being run.
 *
 * There can still be processes running in the environment when we recover it,
 * and, in fact, there can still be processes running in the old environment
 * after we're up and running in a new one.  This is safe because performing
 * recovery panics (and removes) the existing environment, so the window of
 * vulnerability is small.  Further, we check the panic flag in the DB API
 * methods, when waking from spinning on a mutex, and whenever we're about to
 * write to disk.  The only window of corruption is if the write check of the
 * panic were to complete, the region subsequently be recovered, and then the
 * write continues.  That's very, very unlikely to happen.  This vulnerability
 * already exists in Berkeley DB, too, the registry code doesn't make it any
 * worse than it already is.
 *
 * The only way to avoid that window entirely is to ensure that all processes
 * in the Berkeley DB environment exit before we run recovery.   Applications
 * can do that if they maintain their own process registry outside of Berkeley
 * DB, but it's a little more difficult to do here.   The obvious approach is
 * to send signals to any process using the database environment as soon as we
 * decide to run recovery, but there are problems with that approach: we might
 * not have permission to send signals to the process, the process might have
 * signal handlers installed, the cookie stored might not be the same as kill's
 * argument, we may not be able to reliably tell if the process died, and there
 * are probably other problems.  However, if we can send a signal, it reduces
 * the window, and so we include the code here.  To configure it, turn on the
 * DB_ENVREG_KILL_ALL #define.
 */
#define	DB_ENVREG_KILL_ALL	0	/* Non-zero compiles in the kill pass. */

/*
 * __envreg_register --
 *	Register a ENV handle.
 *
 * PUBLIC: int __envreg_register __P((ENV *, int *, u_int32_t));
 */
int
__envreg_register(env, need_recoveryp, flags)
	ENV *env;
	int *need_recoveryp;
	u_int32_t flags;
{
	DB_ENV *dbenv;
	pid_t pid;
	u_int32_t bytes, mbytes;
	int ret;
	char *pp;

	*need_recoveryp = 0;

	dbenv = env->dbenv;
	dbenv->thread_id(dbenv, &pid, NULL);
	pp = NULL;

	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
		__db_msg(env, DB_STR_A("1524",
		    "%lu: register environment", "%lu"), (u_long)pid);

	if ((ret = __envreg_registry_open(env, &pp, DB_OSO_CREATE)) != 0)
		goto err;

	/*
	 * If the file size is 0, initialize the file.
	 *
	 * Run recovery if we create the file, that means we can clean up the
	 * system by removing the registry file and restarting the application.
	 */
	if ((ret = __os_ioinfo(
	    env, pp, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
		goto err;
	if (mbytes == 0 && bytes == 0) {
		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
			__db_msg(env, DB_STR_A("1525",
			    "%lu: creating %s", "%lu %s"), (u_long)pid, pp);
		*need_recoveryp = 1;
	}

	/* Register this process. */
	if ((ret = __envreg_add(env, need_recoveryp, flags)) != 0)
		goto err;
	/*
	 * Release our exclusive lock if we don't need to run recovery.  If
	 * we need to run recovery, ENV->open will call back into register
	 * code once recovery has completed.
	 */
	if (*need_recoveryp == 0 && (ret = REGISTRY_EXCL_UNLOCK(env)) != 0)
		goto err;

	if (0) {
err:
		/*
		 * !!!
		 * Closing the file handle must release all of our locks.
		 */
		(void)__envreg_registry_close(env);
	}
	if (pp != NULL)
		__os_free(env, pp);

	return (ret);
}

/*
 * __envreg_add --
 *	Add the process' pid to the register.
 *
 *	The caller is expected to hold the registry's exclusive lock and to
 *	have the registry file positioned at its start.  The work is done in
 *	these steps:
 *
 *	1. Walk every slot.  Empty slots are skipped.  A slot holding our own
 *	   pid is an error (EINVAL).  A non-empty slot we are able to lock, or
 *	   a partial trailing record, sets need_failchk.
 *	2. Attach to the environment and compare the region's envid with the
 *	   handle's; a mismatch sets *need_recoveryp and returns immediately.
 *	3. If need_failchk is set and the caller passed DB_FAILCHK or
 *	   DB_FAILCHK_ISALIVE, run failchk.  On success only the dead slot is
 *	   emptied.  Otherwise (or if attaching failed) mark the environment
 *	   panicked, wait, empty every slot, and wait again.
 *	4. Write our pid into the first empty slot we can lock and remember
 *	   its offset in dbenv->registry_off.
 *
 *	On return, *need_recoveryp is set to 1 if need_failchk is still set.
 *
 *	Returns 0 on success, EINVAL if this pid is already registered, or
 *	the error from a failing seek/read/write/lock/allocation call.
 */
static int
__envreg_add(env, need_recoveryp, flags)
	ENV *env;
	int *need_recoveryp;
	u_int32_t flags;
{
	DB_ENV *dbenv;
	DB_THREAD_INFO *ip;
	REGENV * renv;
	REGINFO *infop;
	pid_t pid;
	off_t end, pos, dead;
	size_t nr, nw;
	u_int lcnt;
	u_int32_t bytes, mbytes, orig_flags;
	int need_failchk, ret, t_ret;
	char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10];

	dbenv = env->dbenv;
	need_failchk = t_ret = 0;
	COMPQUIET(dead, 0);
	COMPQUIET(p, NULL);
	ip = NULL;

	/* pid_buf is the exact slot image that is written for this process. */
	pid = env->pid_cache;
	snprintf(pid_buf, sizeof(pid_buf), PID_FMT, (u_long)pid);

	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
		__db_msg(env, DB_STR_A("1526",
		    "%lu: adding self to registry", "%lu"), (u_long)pid);

#if DB_ENVREG_KILL_ALL
	if (0) {
kill_all:	/*
		 * A second pass through the file, this time killing any
		 * processes still running.
		 */
		if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
			return (ret);
	}
#endif

	/*
	 * Read the file.  Skip empty slots, and check that a lock is held
	 * for any allocated slots.  An allocated slot which we can lock
	 * indicates a process died holding a handle and recovery needs to
	 * be run.
	 */
	for (lcnt = 0;; ++lcnt) {
		if ((ret = __os_read(
		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
			return (ret);
		if (nr == 0)
			break;

		/*
		 * A partial record at the end of the file is possible if a
		 * previously un-registered process was interrupted while
		 * registering.
		 */
		if (nr != PID_LEN) {
			need_failchk = 1;
			break;
		}

		if (PID_ISEMPTY(buf)) {
			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1527",
				    "%02u: EMPTY", "%02u"), lcnt);
			continue;
		}

		/*
		 * !!!
		 * DB_REGISTER is implemented using per-process locking, only
		 * a single ENV handle may be open per process.  Enforce
		 * that restriction.
		 */
		if (memcmp(buf, pid_buf, PID_LEN) == 0) {
			__db_errx(env, DB_STR("1528",
"DB_REGISTER limits processes to one open DB_ENV handle per environment"));
			return (EINVAL);
		}

		/*
		 * For verbose output only: point p past the leading blanks
		 * and replace the record's last byte with a NUL so it can be
		 * printed with %s.  Every later use of p in this loop is
		 * inside a DB_VERB_REGISTER test.
		 */
		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) {
			for (p = buf; *p == ' ';)
				++p;
			buf[nr - 1] = '\0';
		}

#if DB_ENVREG_KILL_ALL
		if (need_failchk) {
			pid = (pid_t)strtoul(buf, NULL, 10);
			(void)kill(pid, SIGKILL);

			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1529",
				    "%02u: %s: KILLED", "%02u %s"), lcnt, p);
			continue;
		}
#endif
		/*
		 * Probe the slot with a no-wait lock on its first byte.
		 * Getting the lock means nobody holds it, that is, the
		 * process that wrote the slot is gone.
		 */
		pos = (off_t)lcnt * PID_LEN;
		if (REGISTRY_LOCK(env, pos, 1) == 0) {
			if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
				return (ret);

			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1530",
				    "%02u: %s: FAILED", "%02u %s"), lcnt, p);

			/* Remember the first dead slot and stop scanning. */
			need_failchk = 1;
			dead = pos;
#if DB_ENVREG_KILL_ALL
			goto kill_all;
#else
			break;
#endif
		} else
			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1531",
				    "%02u: %s: LOCKED", "%02u %s"), lcnt, p);
	}

	/*
	 * Check for a panic; if so there's no need to call failchk.
	 *
	 * If we cannot attach at all, jump straight to the panic/cleanup
	 * code (sig_proc), which lives inside the need_failchk block below.
	 * Otherwise a region envid that differs from this handle's envid
	 * means recovery is needed and we are done here.
	 */
	if ((t_ret = __env_attach(env, NULL, 0, 0)) != 0)
		goto sig_proc;
	infop = env->reginfo;
	renv = infop->primary;
	*need_recoveryp = renv->envid != env->envid;
	(void)__env_detach(env, 0);
	if (*need_recoveryp)
		return (0);

	/*
	 * If we have to perform failchk...
	 *
	 * Mark all slots empty.  Registry ignores empty slots we can't lock,
	 * so it doesn't matter if any of the processes are in the middle of
	 * exiting Berkeley DB -- they'll discard their lock when they exit.
	 */
	if (need_failchk) {
		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
			__db_msg(env,
			    "%lu: failchk recovery required", (u_long)pid);

		if (LF_ISSET(DB_FAILCHK) || LF_ISSET(DB_FAILCHK_ISALIVE)) {
			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env,
				    "%lu: performing failchk", (u_long)pid);

			/*
			 * With DB_FAILCHK_ISALIVE, build the sorted array of
			 * live pids that __envreg_isalive() searches.
			 */
			if (LF_ISSET(DB_FAILCHK_ISALIVE) && (ret =
			    __envreg_create_active_pid(env, pid_buf)) != 0)
				goto sig_proc;

			/*
			 * The environment will already exist, so we do not
			 * want DB_CREATE set, nor do we want any recovery at
			 * this point.  No need to put values back as flags is
			 * passed in by value.  Save original dbenv flags in
			 * case we need to recover/remove existing environment.
			 * Set DB_ENV_FAILCHK before attach to help ensure we
			 * don't block on a mutex held by the dead process.
			 */
			LF_CLR(DB_CREATE | DB_RECOVER | DB_RECOVER_FATAL);
			orig_flags = dbenv->flags;
			F_SET(dbenv, DB_ENV_FAILCHK);
			/* Attach to environment and subsystems. */
			if ((ret = __env_attach_regions(
			    dbenv, flags, orig_flags, 0)) != 0)
				goto sig_proc;
			if ((t_ret = __env_set_state(env,
			   &ip, THREAD_FAILCHK)) != 0 && ret == 0)
				ret = t_ret;
			if (ret == 0 && (t_ret = __env_failchk_int(dbenv)) != 0)
				ret = t_ret;
			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env,
				    "%lu: failchk returned %d, ret is %d",
				    (u_long)pid, t_ret, ret);

			/* Free active pid array if used. */
			if (LF_ISSET(DB_FAILCHK_ISALIVE)) {
				env->num_active_pids = 0;
				env->size_active_pids = 0;
				__os_free(env, env->active_pids);
				env->active_pids = NULL;
			}

			/* Detach from environment and deregister thread. */
			if ((t_ret = __env_refresh(dbenv,
			    orig_flags, 0)) != 0 && ret == 0)
				ret = t_ret;
			F_CLR(env, ENV_OPEN_CALLED);

			/*
			 * failchk succeeded: empty only the dead slot, clear
			 * need_failchk so no recovery is requested, and go
			 * register ourselves.
			 */
			if (ret == 0) {
				if ((ret = __os_seek(env, dbenv->registry,
				    0, 0, (u_int32_t)dead)) != 0 ||
				    (ret = __os_write(env, dbenv->registry,
				    PID_EMPTY, PID_LEN, &nw)) != 0)
					return (ret);
				need_failchk = 0;
				goto add;
			}

		}
		/* If we can't attach, then we cannot set DB_REGISTER panic. */
sig_proc:
		/*
		 * Reached by falling through (failchk not requested or it
		 * failed) and by the gotos above, including the one taken
		 * before this block when the initial attach failed.
		 */
		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
			__db_msg(env, "%lu: sig_proc attaching errs %s/ret %s",
			    (u_long)pid, db_strerror(t_ret), db_strerror(ret));
		if (__env_attach(env, NULL, 0, 0) == 0) {
			infop = env->reginfo;
			renv = infop->primary;
			/*
			 * Indicate DB_REGISTER panic.  Also, set (or re-set)
			 * environment panic as this is the panic trigger
			 * mechanism in the code that everything looks for.
			 */
			renv->reg_panic = 1;
			renv->envid = ENVID_PANIC;
			(void)__env_detach(env, 0);
		}

		/* Wait for processes to see the panic and leave. */
		__os_yield(env, 0, dbenv->envreg_timeout);

		/* FIGURE out how big the file is. */
		if ((ret = __os_ioinfo(
		    env, NULL, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
			return (ret);
		end = (off_t)mbytes * MEGABYTE + bytes;

		/*
		 * Seek to the beginning of the file and overwrite slots to
		 * the end of the file.
		 *
		 * It's possible for there to be a partial entry at the end of
		 * the file if a process died when trying to register.  If so,
		 * correct for it and overwrite it as well.
		 */
		if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
			return (ret);
		for (lcnt = 0; lcnt < ((u_int)end / PID_LEN +
		    ((u_int)end % PID_LEN == 0 ? 0 : 1)); ++lcnt) {

			if ((ret = __os_read(
			    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
				return (ret);

			pos = (off_t)lcnt * PID_LEN;
			/*
			 * do not notify on dead process
			 *
			 * NOTE(review): 'pid' is overwritten here, so the
			 * verbose "locking slot" message printed after the
			 * 'add' label reports the last value parsed in this
			 * loop rather than our own pid -- confirm whether
			 * that is intended.  The event is also raised for
			 * slots that are already empty, and 'dead' is only
			 * assigned when a dead slot was found above.
			 */
			if (pos != dead) {
				pid = (pid_t)strtoul(buf, NULL, 10);
				DB_EVENT(env, DB_EVENT_REG_ALIVE, &pid);
			}

			/* The read advanced the file; seek back to the slot. */
			if ((ret = __os_seek(env,
			    dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
			    (ret = __os_write(env,
			    dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
				return (ret);
		}
		/* wait one last time to get everyone out */
		__os_yield(env, 0, dbenv->envreg_timeout);
	}

	/*
	 * Seek to the first process slot and add ourselves to the first empty
	 * slot we can lock.
	 *
	 * Only full, non-empty records are skipped: a short or zero-length
	 * read is also treated as a usable position, so when no empty slot
	 * exists our record is written at the end of the file.
	 */
add:	if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
		return (ret);
	for (lcnt = 0;; ++lcnt) {
		if ((ret = __os_read(
		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
			return (ret);
		if (nr == PID_LEN && !PID_ISEMPTY(buf))
			continue;
		pos = (off_t)lcnt * PID_LEN;
		if (REGISTRY_LOCK(env, pos, 1) == 0) {
			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1532",
				    "%lu: locking slot %02u at offset %lu",
				    "%lu %02u %lu"), (u_long)pid, lcnt,
				    (u_long)pos);

			/* The slot lock is kept; only the contents change. */
			if ((ret = __os_seek(env,
			    dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
			    (ret = __os_write(env,
			    dbenv->registry, pid_buf, PID_LEN, &nw)) != 0)
				return (ret);
			/* Add the entry for this process. */
			if ((ret = __envreg_add_active_pid(env, pid_buf)) != 0)
				return (ret);
			/* Saved so unregister can find the slot directly. */
			dbenv->registry_off = (u_int32_t)pos;
			break;
		}
	}

	/*
	 * need_failchk is still set here only if a dead or partial slot was
	 * found and failchk did not clean up after it.
	 */
	if (need_failchk)
		*need_recoveryp = 1;

	return (ret);
}

/*
 * __envreg_unregister_pid --
 *	Unregister a process by pid, optionally with its offset in the registry.
 *
 * Parameters:
 *	env	Environment whose open registry file handle is used.
 *	pid	Process ID whose slot is to be marked empty.
 *	offset	If the caller knows the entry's location in the registry, they
 *		can pass it in via 'offset'. If not known, pass in 0; the file
 *		is then searched from the start (which also finds an entry
 *		that really is at offset 0).
 *
 * Returns:
 *	0 on success, DB_NOTFOUND if no slot holding 'pid' is found (or the
 *	slot at 'offset' does not hold it), otherwise the error from the
 *	failing I/O call.  The registry file handle is closed on every path.
 *
 * PUBLIC: int __envreg_unregister_pid __P((ENV *, pid_t, u_int32_t));
 */
int
__envreg_unregister_pid(env, pid, offset)
	ENV *env;
	pid_t  pid;
	u_int32_t offset;
{
	DB_FH *registry;
	size_t nbytes;
	int ret, t_ret;
	char buf[PID_LEN + 1];

	registry = env->dbenv->registry;

	/*
	 * Reads below fill at most PID_LEN bytes.  Keep a terminating NUL
	 * after the record so strtoul() cannot run off the end of the buffer
	 * if a corrupted slot contains no non-digit byte.
	 */
	buf[PID_LEN] = '\0';

	if (offset != 0) {
		/* Verify that the pid is at the specified offset. */
		if ((ret = __os_io(env, DB_IO_READ,
		    registry, 0, 0, offset, PID_LEN,
		    (u_int8_t *)buf, &nbytes)) != 0)
			goto err;
		if (nbytes != PID_LEN || pid != (pid_t)strtoul(buf, NULL, 10)) {
not_found:
			ret = USR_ERR(env, DB_NOTFOUND);
			__db_errx(env, "__envreg_unregister_pid: %lu not found",
				    (u_long)pid);
			goto err;
		}
	} else {
		/*
		 * The caller did not tell us where to find the process, so
		 * search for it.  'offset' tracks the start of the record
		 * most recently read.
		 */
		if ((ret = __os_seek(env, registry, 0, 0, 0)) != 0)
			goto err;
		for (;;) {
			if ((ret = __os_read(
			    env, registry, buf, PID_LEN, &nbytes)) != 0)
				goto err;
			/*
			 * A too-short record means that we reached EOF without
			 * finding the process.
			 */
			if (nbytes != PID_LEN)
				goto not_found;
			if (pid == (pid_t)strtoul(buf, NULL, 10))
				break;
			offset += PID_LEN;
		}
	}

	/* Overwrite the slot with the empty-slot image. */
	ret = __os_io(env, DB_IO_WRITE,
	    registry, 0, 0, offset, PID_LEN, (u_int8_t *)PID_EMPTY, &nbytes);
err:
	/* Closing the registry also gives up any locks held through it. */
	if ((t_ret = __envreg_registry_close(env)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * __envreg_unregister --
 *	Unregister an ENV handle: empty our registry slot (unless recovery
 *	failed) and close the registry file handle.
 *
 *	Returns the first error encountered, or 0.
 *
 * PUBLIC: int __envreg_unregister __P((ENV *, int));
 */
int
__envreg_unregister(env, recovery_failed)
	ENV *env;
	int recovery_failed;
{
	DB_ENV *dbenv;
	int ret, t_ret;

	dbenv = env->dbenv;
	ret = 0;

	/*
	 * When recovery failed we only drop our locks: the slot is left
	 * filled-in so that a subsequent process cannot conclude everything
	 * is fine and enter the database environment.
	 *
	 * Otherwise mark our slot empty.  No exclusive lock is needed for
	 * that: the slot is emptied before its lock is discarded, and threads
	 * of control reviewing the register file ignore any slots which they
	 * can't lock.
	 */
	if (!recovery_failed)
		ret = __envreg_unregister_pid(env,
		    env->pid_cache, dbenv->registry_off);

	/*
	 * !!!
	 * This code assumes that closing the file descriptor discards all
	 * held locks.
	 *
	 * !!!
	 * There is an ordering problem here -- in the case of a process that
	 * failed in recovery, we're unlocking both the exclusive lock and our
	 * slot lock.  If the OS unlocked the exclusive lock and then allowed
	 * another thread of control to acquire the exclusive lock before
	 * also releasing our slot lock, we could race.  That can't happen, I
	 * don't think.
	 */
	if (dbenv->registry != NULL) {
		t_ret = __os_closehandle(env, dbenv->registry);
		if (t_ret != 0 && ret == 0)
			ret = t_ret;
	}
	dbenv->registry = NULL;

	return (ret);
}

 /*
 * __envreg_registry_open --
 *	Open the registry file, possibly creating it if the open mode contains
 *	DB_OSO_CREATE. Obtain an exclusive lock on the registry.
 *
 * Parameters:
 *	env		Environment; the opened handle is stored in
 *			env->dbenv->registry.
 *	namep		Receives the allocated path of the registry file.  On
 *			success the caller frees it with __os_free(); on
 *			failure it is freed here and *namep is set to NULL.
 *	os_open_flags	Flags handed to __os_open().
 *
 * Returns 0 on success.  On failure the registry handle is closed.  An
 * ENOENT from the open is returned without printing a message.
 *
 * PUBLIC: int __envreg_registry_open __P((ENV *, char **, u_int32_t));
 */
int
__envreg_registry_open(env, namep, os_open_flags)
	ENV *env;
	char **namep;
	u_int32_t os_open_flags;
{
	int ret;

	ret = 0;

	/* Build the path name and open the registry file. */
	if ((ret = __db_appname(env,
	    DB_APP_NONE, REGISTER_FILE, NULL, namep)) != 0) {
		__db_err(env, ret,
		    "__envreg_registry_open: appname failed for %s",
		    REGISTER_FILE);
		goto err;
	}
	if ((ret = __os_open(env, *namep, 0,
	    os_open_flags, DB_MODE_660, &env->dbenv->registry)) != 0) {
		if (ret != ENOENT)
			__db_err(env, ret,
			    "__envreg_registry_open failed for %s", *namep);
		goto err;
	}

	/*
	 * Wait for an exclusive lock on the file.
	 *
	 * !!!
	 * We're locking bytes that don't yet exist, but that's OK as far as
	 * I know.
	 */
	if ((ret = REGISTRY_EXCL_LOCK(env, 0)) != 0)
		goto err;
	if (FLD_ISSET(env->dbenv->verbose, DB_VERB_REGISTER))
		__db_msg(env, "opened registry %s", *namep);
	if (0) {
err:
		/* Undo whatever was done: close the handle, free the path. */
		(void)__envreg_registry_close(env);
		if (*namep != NULL) {
			__os_free(env, *namep);
			*namep = NULL;
		}
	}
	return (ret);
}

/*
 * __envreg_registry_close --
 *	Close the registry file if it is open, which also releases any
 *	registry lock.  Calling it with no registry open is a no-op that
 *	returns 0; otherwise the result of the close is returned.
 *
 * PUBLIC: int __envreg_registry_close __P((ENV *));
 */
int
__envreg_registry_close(env)
	ENV *env;
{
	DB_ENV *dbenv;
	int ret;

	dbenv = env->dbenv;

	/* Nothing to do when the registry isn't open. */
	if (dbenv->registry == NULL)
		return (0);

	ret = __os_closehandle(env, dbenv->registry);
	dbenv->registry = NULL;

	return (ret);
}

/*
 * __envreg_xunlock --
 *	Discard the exclusive lock held by the ENV handle.
 *
 * PUBLIC: int __envreg_xunlock __P((ENV *));
 */
int
__envreg_xunlock(env)
	ENV *env;
{
	DB_ENV *dbenv;
	pid_t pid;
	int ret;

	dbenv = env->dbenv;
	dbenv->thread_id(dbenv, &pid, NULL);

	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
		__db_msg(env, DB_STR_A("1533",
		    "%lu: recovery completed, unlocking", "%lu"), (u_long)pid);

	if ((ret = REGISTRY_EXCL_UNLOCK(env)) == 0)
		return (ret);

	__db_err(env, ret, DB_STR_A("1534",
	    "%s: exclusive file unlock", "%s"), REGISTER_FILE);
	return (__env_panic(env, ret));
}

/*
 * __envreg_pid_compare --
 *	Compare routine for qsort and bsearch calls.
 *	Both arguments point at pid_t values.  Returns a negative value if
 *	key is less than membr, 0 if equal and a positive value if key is
 *	greater than membr.
 */
static int
__envreg_pid_compare(key, membr)
	const void *key;
	const void *membr;
{
	pid_t k, m;

	k = *(const pid_t *)key;
	m = *(const pid_t *)membr;

	/*
	 * Compare instead of subtracting: a difference can overflow, or be
	 * truncated when pid_t is wider than int, and then carries the
	 * wrong sign.
	 */
	return ((k > m) - (k < m));
}

/*
 * __envreg_isalive --
 *	Default isalive function.  Reports a process as alive when its pid
 *	is present in the environment's array of active pids, which is built
 *	from the db_register file.
 *
 *	Returns 1 if pid is in the array, 0 if it is not (or the array is
 *	missing or empty), and EINVAL if flags is neither 0 nor
 *	DB_MUTEX_PROCESS_ONLY.
 *
 * PUBLIC: int __envreg_isalive
 * PUBLIC:   __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
 */
int
__envreg_isalive(dbenv, pid, tid, flags )
	DB_ENV *dbenv;
	pid_t pid;
	db_threadid_t tid;
	u_int32_t flags;
{
	ENV *env;
	void *match;

	env = dbenv->env;

	/* The thread id plays no part in the answer; this is for lint. */
	DB_THREADID_INIT(tid);

	/* Only these two flag values are understood. */
	if (flags != 0 && flags != DB_MUTEX_PROCESS_ONLY)
		return (EINVAL);

	/* With no recorded pids, nothing can be reported as alive. */
	if (env->active_pids == NULL || env->num_active_pids == 0)
		return (0);

	/*
	 * Binary search of the array: bsearch hands back the matching
	 * element, or NULL when pid is not present.
	 */
	match = bsearch(&pid, env->active_pids, env->num_active_pids,
	    sizeof(pid_t), __envreg_pid_compare);

	return (match != NULL ? 1 : 0);
}

/*
 * __envreg_create_active_pid --
 *	Build the sorted array of active pids in env->active_pids.
 *
 *	my_pid (the caller's own registry record text) is added first if the
 *	array is empty.  The registry file is then read from the start and
 *	the pid of every non-empty slot whose lock cannot be obtained is
 *	appended; such slots belong to processes that are still alive.
 *	Finally the array is sorted so that __envreg_isalive() can use a
 *	binary search.
 *
 *	Returns 0 on success or the error from a failing seek, read, unlock
 *	or allocation.
 */
static int
__envreg_create_active_pid(env, my_pid)
	ENV *env;
	char *my_pid;
{
	DB_ENV *dbenv;
	char buf[PID_LEN + 10];
	int    ret;
	off_t  pos;
	size_t nr;
	u_int lcnt;

	dbenv = env->dbenv;
	ret = 0;

	/*
	 * Each read below fills exactly PID_LEN bytes before the record is
	 * parsed.  Terminate the buffer just past the record so strtoul()
	 * never examines uninitialized bytes if a slot is corrupted.
	 */
	buf[PID_LEN] = '\0';

	/*
	 * The process getting here has not been added to the DB_REGISTER
	 * file yet, so include it as the first item in array
	 */
	if (env->num_active_pids == 0) {
		if ((ret = __envreg_add_active_pid(env, my_pid)) != 0)
			return (ret);
	}

	/*
	 * Walk through DB_REGISTER file, we grab pid entries that are locked
	 * as those represent processes that are still alive.   Ignore empty
	 * slots, or those that are unlocked.
	 */
	if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
		return (ret);
	for (lcnt = 0;; ++lcnt) {
		if ((ret = __os_read(
		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
			return (ret);

		/* All done if we read nothing, or got a partial record. */
		if (nr != PID_LEN)
			break;
		if (PID_ISEMPTY(buf))
			continue;

		pos = (off_t)lcnt * PID_LEN;
		if (REGISTRY_LOCK(env, pos, 1) == 0) {
			/* got lock, so process died. Do not add to array */
			if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
				return (ret);
		} else {
			if ((ret = __envreg_add_active_pid(env, buf)) != 0)
				return (ret);
		}

	}

	/* Sort the array to allow for binary search in the isalive func. */
	qsort(env->active_pids, env->num_active_pids,
	    sizeof(pid_t), __envreg_pid_compare);
	return (ret);
}

/*
 * __envreg_add_active_pid --
 *	Append one pid, given as decimal text, to env->active_pids.  When
 *	the array is full it is reallocated first: 512 bytes to begin with,
 *	twice the current size in bytes after that.
 *
 *	Returns 0 on success or the error from __os_realloc().
 */
static int
__envreg_add_active_pid(env, pid)
	ENV *env;
	char *pid;
{
	size_t nbytes;
	int ret;

	/* Make room if the array cannot hold one more item. */
	if (env->num_active_pids >= env->size_active_pids) {
		nbytes = env->size_active_pids * sizeof(pid_t);
		if (nbytes > 0)
			nbytes = nbytes * 2;
		else
			nbytes = 512;

		ret = __os_realloc(env, nbytes, &env->active_pids);
		if (ret != 0)
			return (ret);

		env->size_active_pids = nbytes / sizeof(pid_t);
	}

	/* Store the new pid in the next free element. */
	env->active_pids[env->num_active_pids] =
	    (pid_t)strtoul(pid, NULL, 10);
	env->num_active_pids++;

	return (0);
}
