/*	$NetBSD$	*/

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * XXX NOTE NOTE NOTE XXX
 *
 * This code does not actually work.  It is a draft of an idea.  It
 * probably won't even compile, even if you make it include the right
 * header files.
 */

/*
 * Vnode life cycle
 *
 * Vnodes exist in one of seven states:
 *
 * - UNINITIALIZED
 * - INITIALIZING
 * - READY (inactive if usecount = 0, active if usecount > 0)
 * - REVOKED
 * - DEACTIVATING
 * - RECLAIMING
 * - RECLAIMED
 *
 * This is a lot of states, but users of the vnode abstraction don't
 * usually see most of them -- vget returns a READY or REVOKED vnode,
 * and until you vrele, you don't ever see states other than those.
 *
 * - The UNINITIALIZED state is not visible outside the vnode
 *   abstraction except that marker vnodes are always in it.
 *
 * - The INITIALIZING state is visible only from getnewvnode until
 *   vready.
 *
 * - The DEACTIVATING state is visible only in VOP_INACTIVE.
 *
 * - The REVOKED state is indistinguishable from the READY state of a
 *   deadfs vnode.
 *
 * - The RECLAIMING state is visible only in VOP_RECLAIM, and the
 *   decision to reclaim a vnode is final.
 *
 * - The RECLAIMED state is not visible: vget returns ENOENT in this
 *   case.
 *
 *                      (getnewvnode)
 *                            |
 *                            V
 *                    +--------------+   ungetnewvnode
 *                    | INITIALIZING |-------------------------> (vnfree)
 *                    +--------------+
 *                            |
 *                            | vready
 *                            V
 *              +------------------------------+
 *  vget +----> | active (READY, usecount > 0) | <----+
 *       /      +------------------------------+       \
 *      /           /                  \                |
 *     / last vrele /                    \ vrevoke      |
 *    | or vrecycle                       \             |
 *    |     |                              |            |
 *    |     V                              V            |
 *    |  +----------------+   +------------------------+
 *    |  | DEACTIVATING   |   | REVOKED [VOP_INACTIVE, |
 *    |  | [VOP_INACTIVE] |   |  VOP_RECLAIM]          |
 *    |  +----------------+   +------------------------+
 *    |     |          \___________
 *    |     | VOP_INACTIVE says    \  VOP_INACTIVE says reclaim
 *    |     | don't reclaim         |
 *    |     V                       V
 *    |  +---------------+      +---------------+
 *    |  | inactive      |vdrain| RECLAIMING    |
 *    ^--| (READY,       |----->| [VOP_RECLAIM] |
 *       | usecount = 0) |      +---------------+
 *       +---------------+             |
 *                                     |
 *                                     V
 *                               +-----------+
 *                               | RECLAIMED |
 *                               +-----------+
 *                                     |
 *                                     V
 *                                  (vnfree)
 *
 * Usecount is managed with atomics.  Dropping usecount to zero may
 * happen only under vp->v_interlock; all other transitions are allowed
 * in any context.
 */

/*
 * Changes:
 *
 * - Take the vnode lock around vrevoke.  (XXX Non-genfs vnode locks?)
 * - Before vget, do vpreget instead of mutex_enter(vp->v_interlock).
 * - Call vready when a newly published vnode is ready.
 * - If you want the vnode lock after vget, take it yourself.
 */

/*
 * Idea:
 *
 * - VOP_INTERRUPT delivers a signal to whoever holds a vnode's lock.
 *   (May be hairy for LK_SHARED...)  Then `umount -f' can actually
 *   work!  What we really want is to cause pending I/O to fail (with a
 *   negative acknowledgement), but a signal is probably the closest we
 *   can get.
 */

/*
 * Vnode allocation
 */

/*
 * vnalloc: Allocate a vnode.  If mp is nonnull, this is a marker vnode
 * for it; otherwise, it is a normal vnode.  Must be freed with vnfree.
 */
struct vnode *
vnalloc(struct mount *mp)
{
	static const struct vnode zero_vnode;
	struct vnode *vp;

	if (mp == NULL)		/* not a marker */
		vdrain_vnode_created();

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	*vp = zero_vnode;
	vp->v_state = VS_UNINITIALIZED;
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	LIST_INIT(&vp->v_nclist);
	LIST_INIT(&vp->v_dnclist);

	if (mp == NULL) {
		rw_init(&vp->v_lock);
	} else {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	}

	return vp;
}

/*
 * vnfree: Free a vnode allocated with vnalloc.
 *
 * - vp must be UNINITIALIZED.
 */
void
vnfree(struct vnode *vp)
{
	bool marker;

	KASSERT(vp->v_state == VS_UNINITIALIZED);
	KASSERT(vp->v_usecount == 0);

	marker = vismarker(vp);
	if (marker) {
		KASSERT(vp->v_type == VBAD);
	} else {
		KASSERT(vp->v_mount == NULL);
		rw_destroy(&vp->v_lock);
	}

	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	cv_destroy(&vp->v_cv);
	uvm_obj_destroy(&vp->v_uobj, true);

	pool_cache_put(vnode_cache, vp);

	if (!marker)
		vdrain_vnode_destroyed();
}

/*
 * Vnode creation
 */

/*
 * getnewvnode: Create a new vnode.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *interlock, struct vnode **vpp)
{
	struct vnode *vp = NULL;
	int error;

	if (mp != NULL) {
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = vnalloc(NULL);

	vp->v_state = VS_INITIALIZING;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;
	vp->v_writecount = 0;
	vp->v_holdcnt = 0;

	/* These should be set up by uvm_obj_init in vnalloc.  */
	KASSERT(vp->v_usecount == 0);
	KASSERT(vp->v_uobj.pgops == &uvm_vnodeops);
	KASSERT(vp->v_uobj.uo_npages == 0);
	KASSERT(TAILQ_FIRST(&vp->v_uobj.memq) == NULL);

	vp->v_size = vp->v_writesize = VSIZENOTSET;

	if (interlock) {
		mutex_obj_hold(interlock);
		uvm_obj_setlock(&vp->v_uobj, interlock);
		KASSERT(vp->v_interlock == interlock);
	}

	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if (ISSET(mp->mnt_iflag, IMNT_MPSAFE))
			vp->v_vflag |= VV_MPSAFE;
	}

	*vpp = vp;
	return 0;
}

/*
 * ungetnewvnode: Undo a getnewvnode before it is initialized.
 *
 * - vp must be INITIALIZING.
 */
void
ungetnewvnode(struct vnode *vp)
{

	KASSERT(vp->v_state == VS_INITIALIZING);
	KASSERT(vp->v_type == VNON);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_writecount == 0);
	KASSERT(vp->v_holdcnt == 0);
	KASSERT(!vismarker(vp));

	vfs_insmntque(vp, NULL);

	mutex_enter(vp->v_interlock);
	vp->v_state = VS_RECLAIMED;
	cv_broadcast(&vp->v_cv);
	while (0 < vp->v_usecount)
		cv_wait(&vp->v_cv, vp->v_interlock);
	KASSERT(vp->v_state == VS_RECLAIMED);
	KASSERT(vp->v_usecount == 0);
	vp->v_state = VS_UNINITIALIZED;
	mutex_exit(vp->v_interlock);

	vnfree(vp);
}

/*
 * vready: Mark a vnode initialized and ready to be used.
 *
 * - vp must be INITIALIZING.
 */
void
vready(struct vnode *vp)
{

	mutex_enter(vp->v_interlock);
	KASSERT(vp->v_state == VS_INITIALIZING);
	KASSERT(!vismarker(vp));
	if (__predict_false(atomic_inc_uint_nv(&vp->v_usecount) == 0))
		vnpanic(vp, "%s: usecount overflow", __func__);
	vp->v_state = VS_READY;
	cv_broadcast(&vp->v_cv);
	mutex_exit(vp->v_interlock);
}
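
/*
 * Example (sketch only, not part of this interface): how a file
 * system's vget-style routine might create and publish a vnode under
 * this scheme -- getnewvnode, fill in the node, publish it, then
 * vready; ungetnewvnode on the error path.  The foofs_* names, the
 * VT_FOOFS tag, and the hash-table helpers are hypothetical.
 *
 *	int
 *	foofs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
 *	{
 *		struct foofs_node *fnp;
 *		struct vnode *vp;
 *		int error;
 *
 *		error = getnewvnode(VT_FOOFS, mp, foofs_vnodeop_p, NULL, &vp);
 *		if (error)
 *			return error;
 *		error = foofs_loadnode(mp, ino, &fnp);
 *		if (error) {
 *			ungetnewvnode(vp);
 *			return error;
 *		}
 *		vp->v_type = foofs_vtype(fnp);
 *		vp->v_data = fnp;
 *		foofs_hash_insert(mp, ino, vp);		(publish)
 *		vready(vp);				(now visible to vget)
 *		*vpp = vp;
 *		return 0;
 *	}
 */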
/*
 * vpreget: Prepare for vget.  Safe under a lock or in a pserialized
 * reader.  Caller may drop locks or exit the pserialized reader only
 * after this, and must then call vget.
 */
void
vpreget(struct vnode *vp)
{

	if (__predict_false(atomic_inc_uint_nv(&vp->v_usecount) == 0))
		vnpanic(vp, "%s: usecount overflow", __func__);
}

/*
 * vget: Try to get a reference to vp.  If it's currently changing
 * state, wait until it's done.  Caller must have previously called
 * vpreget or vpreget_locked.
 */
int
vget(struct vnode *vp, int flags)
{
	unsigned int usecount;
	int error;

	mutex_enter(vp->v_interlock);
	KASSERT(!vismarker(vp));
	KASSERT(0 < vp->v_usecount);
	KASSERT(vp->v_state != VS_UNINITIALIZED);

	while ((vp->v_state == VS_INITIALIZING) ||
	    (vp->v_state == VS_DEACTIVATING) ||
	    (vp->v_state == VS_RECLAIMING)) {
		if (flags == VGET_NONBLOCK) {
			error = EWOULDBLOCK;
			goto fail;
		} else if (flags == VGET_INTR) {
			error = cv_wait_sig(&vp->v_cv, vp->v_interlock);
			if (error) {
				if ((vp->v_state == VS_READY) ||
				    (vp->v_state == VS_REVOKED) ||
				    (vp->v_state == VS_RECLAIMED)) {
					/*
					 * Interrupted, but we
					 * transitioned to a happy
					 * state at the same time.
					 *
					 * XXX Is it kosher to ignore
					 * the error?  If not, we'll
					 * have to vrele_async.
					 */
					break;
				}
				goto fail;
			}
		} else {
			cv_wait(&vp->v_cv, vp->v_interlock);
		}
	}

	if (vp->v_state == VS_RECLAIMED) {
		error = ENOENT;
		goto fail;
	}

	/* Success!  */
	KASSERT((vp->v_state == VS_READY) || (vp->v_state == VS_REVOKED));
	vremfree(vp);
	mutex_exit(vp->v_interlock);
	return 0;

fail:	KASSERT(vp->v_state != VS_READY);
	KASSERT(vp->v_state != VS_REVOKED);
	usecount = atomic_dec_uint_nv(&vp->v_usecount);
	KASSERT(usecount != UINT_MAX);
	if (vp->v_state == VS_RECLAIMED) {
		/*
		 * If the vnode has been reclaimed, and we held the
		 * last reference to it, signal whoever is responsible
		 * for freeing it that the last attempted reference has
		 * been dropped.
		 */
		if (usecount == 0)
			cv_broadcast(&vp->v_cv);
	}
	mutex_exit(vp->v_interlock);
	return error;
}
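
/*
 * Example (sketch only): looking up a cached vnode under the new
 * vpreget/vget protocol from the `Changes' list above.  vpreget is
 * issued while the data structure holding vp is still stabilized by
 * foofs_hash_lock (a hypothetical lock); vget may then sleep without
 * that lock held.
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	mutex_enter(&foofs_hash_lock);
 *	vp = foofs_hash_lookup(mp, ino);
 *	if (vp != NULL)
 *		vpreget(vp);
 *	mutex_exit(&foofs_hash_lock);
 *
 *	if (vp != NULL) {
 *		error = vget(vp, 0);
 *		if (error == ENOENT)
 *			vp = NULL;	(reclaimed -- create a new vnode)
 *	}
 */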
/*
 * vref: Bump vp's usecount.
 *
 * - vp must be active.  (Otherwise use vget.)
 */
void
vref(struct vnode *vp)
{
	unsigned int usecount;

#ifdef DIAGNOSTIC
	{
		mutex_enter(vp->v_interlock);
		KASSERT((vp->v_state == VS_READY) ||
		    (vp->v_state == VS_REVOKED));
		mutex_exit(vp->v_interlock);
	}
#endif

	usecount = atomic_inc_uint_nv(&vp->v_usecount);
	if (__predict_true(2 <= usecount))
		return;
	if (usecount == 0)
		vnpanic(vp, "%s: usecount overflow", __func__);
	if (usecount == 1)
		vnpanic(vp, "%s: inactive vnode", __func__);
}

/*
 * vrele: Drop vp's usecount.  If it drops to zero, call VOP_INACTIVE
 * and maybe reclaim it.  May take and drop vp->v_interlock.
 *
 * Because this may call VOP_INACTIVE and VOP_RECLAIM synchronously, it
 * may take vp's vnode lock, so caller must not hold any locks that are
 * out of order with vp's vnode lock.  If this is an issue, or if
 * calling VOP_INACTIVE or VOP_RECLAIM is otherwise an issue, use
 * vrele_async instead.
 *
 * - vp must be active.
 */
void
vrele(struct vnode *vp)
{

	vrele_with(vp, &vdeactivate_and_reclaim);
}

/*
 * vrele_async: Drop vp's usecount.  If it drops to zero, schedule the
 * actions of vrele asynchronously.  May take and drop vp->v_interlock.
 *
 * - vp must be active.
 */
void
vrele_async(struct vnode *vp)
{

	vrele_with(vp, &vdeactivate_async);
}

static inline void
vrele_with(struct vnode *vp, void (*deactivate)(struct vnode *))
{

	if (__predict_true(atomic_dec_uint_lock_if_zero(&vp->v_usecount,
		    vp->v_interlock)))
		return;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_state == VS_READY);
	KASSERT(!vismarker(vp));

	(*deactivate)(vp);
}
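
/*
 * Example (sketch only): a caller that holds a lock which must not be
 * taken after the vnode lock drops its references with vrele_async, so
 * that VOP_INACTIVE/VOP_RECLAIM never run under that lock.  The foofs
 * cache names are hypothetical.
 *
 *	struct foofs_cachent *fcp;
 *
 *	mutex_enter(&foofs_cache_lock);
 *	while ((fcp = TAILQ_FIRST(&foofs_cache)) != NULL) {
 *		TAILQ_REMOVE(&foofs_cache, fcp, fc_entry);
 *		vrele_async(fcp->fc_vp);	(no vnode locks taken here)
 *	}
 *	mutex_exit(&foofs_cache_lock);
 */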
static bool
vdeactivate(struct vnode *vp)
{
	bool reclaim;

	KASSERT(mutex_owned(vp->v_interlock));

	vp->v_state = VS_DEACTIVATING;
	mutex_exit(vp->v_interlock);

	VOP_LOCK(vp, LK_EXCLUSIVE);	/* XXX This is silly.  */
	VOP_INACTIVE(vp, &reclaim);

	mutex_enter(vp->v_interlock);
	KASSERT(vp->v_state == VS_DEACTIVATING);

	return reclaim;
}

static void
vdeactivate_and_reclaim(struct vnode *vp)
{
	bool reclaim;

	KASSERT(mutex_owned(vp->v_interlock));

	reclaim = vdeactivate(vp);

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_state == VS_DEACTIVATING);

	/*
	 * If the file system wants it reclaimed, reclaim it now.
	 * Otherwise, put it on a queue to be reclaimed when we want to
	 * chuck some vnodes.
	 */
	if (reclaim) {
		vdestroy(vp);
	} else {
		vp->v_state = VS_READY;
		vaddfree(vp);
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
	}
}

static void
vdeactivate_async(struct vnode *vp)
{
	unsigned int usecount;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_state == VS_READY);

	/*
	 * Bump the usecount to pretend it's active until the thread
	 * can get to it.
	 */
	do {
		usecount = vp->v_usecount;
		if (__predict_false(usecount != 0)) {
			if (__predict_false(usecount == UINT_MAX))
				vnpanic(vp, "%s: usecount overflow",
				    __func__);
			/*
			 * Someone else got a reference to it and will
			 * release it.
			 */
			mutex_exit(vp->v_interlock);
			return;
		}
	} while (atomic_cas_uint(&vp->v_usecount, 0, 1) != 0);

#if notyet
	workqueue_enqueue(&vrele_wq, &vp->v_rele_work, NULL);
#else
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_freelisthd != NULL)
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = &vrele_list;
	TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
	/* XXX Why delay?  */
	/* XXX Use a per-CPU workqueue instead?  */
	if ((vrele_pending == UINT_MAX) ||
	    (++vrele_pending > (desiredvnodes >> 8)))
		cv_signal(&vrele_cv);
	mutex_exit(&vnode_free_list_lock);
#endif

	mutex_exit(vp->v_interlock);
}

static void
vrele_thread(void *arg __unused)
{
	struct vnode *vp;

	for (;;) {
		mutex_enter(&vnode_free_list_lock);
		while (TAILQ_EMPTY(&vrele_list))
			cv_wait(&vrele_cv, &vnode_free_list_lock);
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		KASSERT(vp->v_freelisthd == &vrele_list);
		vp->v_freelisthd = NULL;
		/* May be zero if we overflowed, but that's OK.  */
		if (vrele_pending)
			vrele_pending--;
		mutex_exit(&vnode_free_list_lock);

		vrele(vp);
	}
}

/*
 * Revocation, reclamation, and destruction
 */

/*
 * vrevoke: Turn vp into a dead vnode, to implement VOP_REVOKE.  Will
 * take and drop vp->v_interlock.  Will drop the vnode lock when it
 * calls VOP_INACTIVE before it calls VOP_RECLAIM.
 *
 * - vp must be active.
 * - vp's vnode lock must be held.
 *
 * NOTE: You must use the same VOP_UNLOCK as deadfs!
 */
void
vrevoke(struct vnode *vp)
{
	enum vtype type;
	dev_t dev;
	struct vnode *vq;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);

	mutex_enter(vp->v_interlock);
	KASSERT(vp->v_state == VS_READY);
	KASSERT(!vismarker(vp));

	/*
	 * We must hold a reference, so even though the usecount may
	 * change without the lock, it can't become inactive.
	 */
	KASSERT(1 <= vp->v_usecount);

	if ((vp->v_type == VCHR) || (vp->v_type == VBLK)) {
		type = vp->v_type;
		dev = vp->v_rdev;

		/*
		 * vrevoke1 drops vp's vnode lock and interlock, so
		 * there is no lock order to worry about between vp and
		 * all other device vnodes vq.
		 */
		vrevoke1(vp);
		while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
			VOP_LOCK(vq, LK_EXCLUSIVE);
			mutex_enter(vq->v_interlock);
			vrevoke1(vq);
			vrele(vq);
		}
	} else {
		vrevoke1(vp);
	}
}

void
vrevoke1(struct vnode *vp)
{
	bool reclaim __unused;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_state == VS_READY);
	KASSERT(!vismarker(vp));

	/*
	 * Mark the vnode as being revoked, tell the file system it is
	 * inactive, and clean the vnode.  VOP_INACTIVE drops the vnode
	 * lock.
	 */
	vp->v_state = VS_REVOKED;
	mutex_exit(vp->v_interlock);
	VOP_INACTIVE(vp, &reclaim);
	mutex_enter(vp->v_interlock);
	KASSERT(vp->v_state == VS_REVOKED);
	vreclaim(vp);
	KASSERT(vp->v_state == VS_REVOKED);

	vp->v_vflag &= ~VV_ROOT;
	if ((vp->v_type == VBLK) && (spec_node_getmountedfs(vp) != NULL)) {
		vp->v_op = spec_vnodeop_p;
		vp->v_vflag &= ~VV_LOCKSWORK;
	} else {
		vp->v_op = dead_vnodeop_p;
		vp->v_vflag |= VV_LOCKSWORK;
		vp->v_tag = VT_NON;
		KNOTE(&vp->v_klist, NOTE_REVOKE);
	}

	/* Publish v_op before v_state.  */
	membar_producer();
	vp->v_state = VS_READY;
	cv_broadcast(&vp->v_cv);
	mutex_exit(vp->v_interlock);
}
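
/*
 * Example (sketch only): per the `Changes' list above, the caller of
 * vrevoke is now responsible for taking the vnode lock first.  A
 * revoke entry point might therefore look roughly like this; the vnode
 * lock is dropped inside vrevoke (via VOP_INACTIVE), so there is no
 * VOP_UNLOCK afterward.  This is a guess at the intended calling
 * convention, not the real genfs_revoke.
 *
 *	int
 *	foofs_revoke(void *v)
 *	{
 *		struct vop_revoke_args *ap = v;
 *		struct vnode *vp = ap->a_vp;
 *
 *		VOP_LOCK(vp, LK_EXCLUSIVE);
 *		vrevoke(vp);
 *		return 0;
 *	}
 */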
/*
 * vrecycle: Try to reclaim vp.  If we can, either because someone else
 * is revoking it and we can wait for them to finish, or because it is
 * inactive and we can destroy it, then return true.  If we can't
 * reclaim vp, return false.
 */
bool
vrecycle(struct vnode *vp)
{
	unsigned int usecount;

	mutex_enter(vp->v_interlock);
	KASSERT(!vismarker(vp));
	KASSERT(0 < vp->v_usecount);

	/*
	 * If someone revoked it, wait for the revocation to complete
	 * and inform caller vp has been reclaimed.
	 */
	if (vp->v_state == VS_REVOKED) {
		do
			cv_wait(&vp->v_cv, vp->v_interlock);
		while (vp->v_state == VS_REVOKED);
		mutex_exit(vp->v_interlock);
		return true;
	}

	KASSERT(vp->v_state == VS_READY);
	KASSERT(0 < vp->v_usecount);

	usecount = atomic_dec_uint_nv(&vp->v_usecount);
	if (__predict_false(usecount == UINT_MAX))
		vnpanic(vp, "%s: usecount underflow", __func__);
	if (0 < usecount) {
		mutex_exit(vp->v_interlock);
		return false;
	}

	(void)vdeactivate(vp);
	vdestroy(vp);
	return true;
}

/*
 * vdestroy: Reclaim vp, notify vget that this vnode is gone, wait for
 * stragglers, and vnfree vp.
 *
 * - vp must be DEACTIVATING or READY.
 * - vp->v_interlock must be held.
 */
static void
vdestroy(struct vnode *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_state == VS_DEACTIVATING) ||
	    (vp->v_state == VS_READY));

	/*
	 * Mark the vnode as preparing for destruction, and clean it.
	 * vreclaim drops and retakes the interlock.
	 */
	vp->v_state = VS_RECLAIMING;
	vreclaim(vp);
	KASSERT(vp->v_state == VS_RECLAIMING);

	/* Wait until anyone who tried vget is done.  */
	vp->v_state = VS_RECLAIMED;
	while (0 < vp->v_usecount)
		cv_wait(&vp->v_cv, vp->v_interlock);

	/* We now hold the last reference, so it is OK to free vp.  */
	KASSERT(vp->v_state == VS_RECLAIMED);
	KASSERT(vp->v_usecount == 0);
	vp->v_state = VS_UNINITIALIZED;
	mutex_exit(vp->v_interlock);

	vnfree(vp);
}

/*
 * vreclaim: Someone wants to destroy or revoke vp.  Flush buffers
 * associated with it and call VOP_RECLAIM.  Drops and retakes
 * vp->v_interlock.
 *
 * - vp must be RECLAIMING or REVOKED.
 * - vp->v_interlock must be held.
 */
static void
vreclaim(struct vnode *vp)
{
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_state == VS_RECLAIMING) ||
	    (vp->v_state == VS_REVOKED));

	/* XXX Begin crud cargo-culted from old vclean.  */
	if (ISSET(vp->v_iflag, VI_EXECMAP)) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	/* XXX Old vrelel cleared VI_WRMAP; old vclean didn't.  Hmm.  */
	vp->v_iflag &= ~(VI_TEXT | VI_EXECMAP | VI_WRMAP);
	mutex_exit(vp->v_interlock);

	VOP_LOCK(vp, LK_EXCLUSIVE);
	if ((vp->v_type == VBLK) && (spec_node_getmountedfs(vp) != NULL)) {
		/* XXX What now?  */
	} else {
		error = vinvalbuf(vp, V_SAVE, NOCRED, 1, 0, 0);
		if (error) {
			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, 1, 0, 0);
			KASSERT(error == 0);
		}
		KASSERT(!ISSET(vp->v_iflag, VI_ONWORKLST));

		if ((vp->v_type == VBLK) || (vp->v_type == VCHR))
			spec_node_revoke(vp);
	}
	VOP_UNLOCK(vp);
	/* XXX End crud cargo-culted from old vclean.  */

	VOP_RECLAIM(vp);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	/* XXX Begin crud cargo-culted from old vclean.  */
	if ((vp->v_type == VREG) && (vp->v_ractx != NULL)) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}
	cache_purge(vp);
	/* XXX End crud cargo-culted from old vclean.  */

	/*
	 * Must happen after VOP_RECLAIM.  We lose access to the mount
	 * point after this.
	 */
	vfs_insmntque(vp, NULL);

	mutex_enter(vp->v_interlock);
	KASSERT(!ISSET(vp->v_iflag, VI_ONWORKLST));
	vremfree(vp);
}

/*
 * Vnode lock
 */

/*
 * vn_lock: Lock vp, or fail with ENOENT if it has been revoked.
 */
int
vn_lock(struct vnode *vp, int flags)
{
	int error;

	error = VOP_LOCK(vp, flags);
	if (error)
		return error;

#if 0
	mutex_enter(vp->v_interlock);
	if ((vp->v_state == VS_REVOKED) || (vp->v_op == dead_vnodeop_p)) {
		mutex_exit(vp->v_interlock);
		VOP_UNLOCK(vp);
		return ENOENT;
	}
	mutex_exit(vp->v_interlock);
#else
	/* State transition to REVOKED is prevented by vnode lock.  */
	if (vp->v_state == VS_REVOKED) {
		VOP_UNLOCK(vp);
		return ENOENT;
	}

	/* Read v_state before v_op.  */
	membar_consumer();

	/*
	 * State transition from REVOKED happens after setting v_op, so
	 * if we did not observe the REVOKED state, then either
	 *
	 * (a) nobody has revoked it, and the vnode lock prevents
	 * anyone from doing so now; or
	 *
	 * (b) someone revoked it before we took the vnode lock, in
	 * which case they can transition it from REVOKED to READY with
	 * only the interlock.
	 *
	 * In case (b), they would have set v_op to dead_vnodeop_p
	 * before (with a membar_producer) transitioning from REVOKED
	 * to READY.  So if we read that from v_op now (after a
	 * membar_consumer), we are guaranteed to see the effects of
	 * option (b).
	 */
	if (vp->v_op == dead_vnodeop_p) {
		VOP_UNLOCK(vp);
		return ENOENT;
	}
#endif

	KASSERT(VOP_ISLOCKED(vp) == (flags & (LK_EXCLUSIVE | LK_SHARED)));
	return 0;
}
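
/*
 * Example (sketch only): a typical caller pattern for vn_lock, where
 * ENOENT from a revoked vnode becomes a failure of the whole
 * operation.  The foofs_* names are hypothetical.
 *
 *	int
 *	foofs_do_something(struct vnode *vp)
 *	{
 *		int error;
 *
 *		error = vn_lock(vp, LK_EXCLUSIVE);
 *		if (error)
 *			return error;	(ENOENT if vp was revoked)
 *		error = foofs_operate(vp);
 *		VOP_UNLOCK(vp);
 *		return error;
 *	}
 */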
/*
 * vn_lock_deadok: Like vn_lock, but instead of failing with ENOENT
 * when vp has been revoked, wait for any revocation in progress to
 * complete and then lock the (now dead) vnode.
 */
int
vn_lock_deadok(struct vnode *vp, int flags)
{
	int error;

	error = VOP_LOCK(vp, flags);
	if (error)
		return error;

	if (vp->v_state == VS_REVOKED) {
		mutex_enter(vp->v_interlock);
		VOP_UNLOCK(vp);
		KASSERT(vp->v_state == VS_REVOKED);
		do {
			if (ISSET(flags, LK_NOWAIT)) {
				error = EWOULDBLOCK;
				break;
			} else {
				cv_wait(&vp->v_cv, vp->v_interlock);
			}
		} while (vp->v_state == VS_REVOKED);
		mutex_exit(vp->v_interlock);
		if (error)
			return error;
		error = VOP_LOCK(vp, flags);
		if (error)
			return error;
	}

	KASSERT(VOP_ISLOCKED(vp) == (flags & (LK_EXCLUSIVE | LK_SHARED)));
	return 0;
}

/*
 * Hold counts.  When there are buffers in the kernel (buffer cache or
 * uvm) for a vnode, we would prefer to destroy that vnode later.  The
 * hold count records how many such buffers there are.
 */

/*
 * vholdl: Bump vp's hold count.
 *
 * - vp must be READY or REVOKED.
 * - vp->v_interlock must be held.
 */
void
vholdl(struct vnode *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_state == VS_READY) || (vp->v_state == VS_REVOKED));
	KASSERT(!vismarker(vp));

	if (vp->v_holdcnt++ == UINT_MAX)
		vnpanic(vp, "vnode hold count overflow");
	if (vp->v_holdcnt == 1)
		vswitchfree(vp, &vnode_free_list, &vnode_hold_list);
}

/*
 * holdrelel: Drop vp's hold count.
 *
 * - vp must be READY or REVOKED.
 * - vp->v_interlock must be held.
 * - vp must not be a marker vnode.
 */
void
holdrelel(struct vnode *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_state == VS_READY) || (vp->v_state == VS_REVOKED));
	KASSERT(!vismarker(vp));
	KASSERT(0 < vp->v_holdcnt);

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0)
		vswitchfree(vp, &vnode_hold_list, &vnode_free_list);
}
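
/*
 * Example (sketch only): code that attaches a buffer to a vnode bumps
 * the hold count under the interlock, and drops it again when the
 * buffer goes away, so that the vnode migrates between the free list
 * and the hold list (see below).  The buffer-attach context here is
 * hypothetical.
 *
 *	mutex_enter(vp->v_interlock);
 *	vholdl(vp);			(buffer now references vp)
 *	mutex_exit(vp->v_interlock);
 *	...
 *	mutex_enter(vp->v_interlock);
 *	holdrelel(vp);			(buffer gone)
 *	mutex_exit(vp->v_interlock);
 */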
/*
 * Freelists.  Vnodes that are not actively being used stay cached in
 * case someone wants them soon, but get queued up to be destroyed when
 * the number of vnodes in the system gets too high.  Vnodes not used
 * by buffers in the kernel are on the /free list/, and get destroyed
 * first; vnodes used by buffers in the kernel are on the /hold list/,
 * and get destroyed after everything in the free list.
 *
 * Destruction happens asynchronously, in the vdrain thread.  Each file
 * system's VOP_RECLAIM cannot allocate or otherwise wait for resources
 * that allocating vnodes in any file system may require.  (Yikes!)
 */

/*
 * vaddfree: Add vp to the free list or hold list as appropriate.
 *
 * - vp must be READY.
 * - vp->v_interlock must be held.
 * - vp must not be a marker vnode.
 * - vp must not have already been put on a free list, i.e. it is being
 *   deactivated.
 */
static void
vaddfree(struct vnode *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_state == VS_READY);
	KASSERT(!vismarker(vp));

	if (0 < vp->v_usecount)
		return;

	mutex_enter(&vnode_free_list_lock);
	KASSERT(vp->v_freelisthd == NULL);
	vp->v_freelisthd = (0 == vp->v_holdcnt?
	    &vnode_free_list : &vnode_hold_list);
	TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vswitchfree: Switch vp from the freelist old to the freelist new.
 *
 * - vp->v_interlock must be held.
 */
static void
vswitchfree(struct vnode *vp, struct vnodelst *old, struct vnodelst *new)
{

	KASSERT(mutex_owned(vp->v_interlock));

	/* Don't bother if someone has already snagged it.  */
	if (0 < vp->v_usecount)
		return;

	mutex_enter(&vnode_free_list_lock);
	KASSERT(vp->v_freelisthd == old);
	TAILQ_REMOVE(old, vp, v_freelist);
	vp->v_freelisthd = new;
	TAILQ_INSERT_TAIL(new, vp, v_freelist);
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vremfree: Remove vp from whichever freelist it is on.
 *
 * - vp must be READY, REVOKED, RECLAIMING, or RECLAIMED.
 * - vp->v_interlock must be held.
 * - vp must not be a marker vnode.
 */
static void
vremfree(struct vnode *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_state == VS_READY) ||
	    (vp->v_state == VS_REVOKED) ||
	    (vp->v_state == VS_RECLAIMING) ||
	    (vp->v_state == VS_RECLAIMED));
	KASSERT(!vismarker(vp));

	/* Don't bother if we never got put on at all.  */
	if (vp->v_freelisthd == NULL)
		return;

	mutex_enter(&vnode_free_list_lock);
	KASSERT(vp->v_freelisthd == (0 == vp->v_holdcnt?
		&vnode_free_list : &vnode_hold_list));
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

static void
vdrain_vnode_created(void)
{

	mutex_enter(&vnode_free_list_lock);
	if (numvnodes == UINT_MAX)
		panic("too many vnodes");
	numvnodes++;
	if ((desiredvnodes + (desiredvnodes/10)) < numvnodes)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);
}

static void
vdrain_vnode_destroyed(void)
{

	mutex_enter(&vnode_free_list_lock);
	numvnodes--;
	mutex_exit(&vnode_free_list_lock);
}

static void
vdrain_thread(void *arg __unused)
{

	for (;;) {
		mutex_enter(&vnode_free_list_lock);
		while (numvnodes < desiredvnodes)
			cv_wait(&vdrain_cv, &vnode_free_list_lock);
		/* vdrain_1 drops vnode_free_list_lock for us.  */
		if (vdrain_1() == EBUSY)
			kpause("vdrain", false, hz, NULL);
	}
}
static int
vdrain_1(void)
{
	static struct vnodelst *freelists[] = {
		&vnode_free_list,
		&vnode_hold_list,
	};
	size_t i;
	struct vnode *vp;
	struct mount *mp;
	int error = ENOENT;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	for (i = 0; i < __arraycount(freelists); i++) {
		if (TAILQ_EMPTY(freelists[i]))
			continue;
		TAILQ_FOREACH(vp, freelists[i], v_freelist) {
			/*
			 * XXX Lock order reversal!  We can get rid of
			 * this by removing vp from the queue before
			 * taking its interlock and putting it back on
			 * the queue if the fstrans can't start.
			 * However, that also requires changing
			 * everything else that manages vnodes on the
			 * freelists to handle the case that vdrain may
			 * have taken the vnode off the freelist and
			 * may be about to put it back on.  That is
			 * more trouble than it is worth to avoid a
			 * single speculative grab of this vnode.
			 */
			if (!mutex_tryenter(vp->v_interlock)) {
				error = EBUSY;
				continue;
			}

			KASSERT(vp->v_state == VS_READY);
			KASSERT(vp->v_usecount == 0);
			KASSERT(vp->v_freelisthd == freelists[i]);

			mp = vp->v_mount;
			if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
				mutex_exit(vp->v_interlock);
				error = EBUSY;
				continue;
			}

			goto found;
		}
	}

	mutex_exit(&vnode_free_list_lock);
	return error;

found:	TAILQ_REMOVE(freelists[i], vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	/*
	 * Act as though we had just finished vrele and VOP_INACTIVE by
	 * entering the DEACTIVATING state.
	 */
	vp->v_state = VS_DEACTIVATING;
	vdestroy(vp);
	fstrans_done(mp);
	return 0;
}