Index: src/sys/fs/cd9660/cd9660_lookup.c diff -u src/sys/fs/cd9660/cd9660_lookup.c:1.30 src/sys/fs/cd9660/cd9660_lookup.c:1.30.24.1 --- src/sys/fs/cd9660/cd9660_lookup.c:1.30 Sat Mar 28 19:24:05 2015 +++ src/sys/fs/cd9660/cd9660_lookup.c Sun Jan 19 21:21:54 2020 @@ -152,6 +152,9 @@ cd9660_lookup(void *v) cnp->cn_nameiop, cnp->cn_flags, NULL, vpp)) { return *vpp == NULLVP ? ENOENT : 0; } + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) + return ENOLCK; len = cnp->cn_namelen; name = cnp->cn_nameptr; Index: src/sys/fs/cd9660/cd9660_vfsops.c diff -u src/sys/fs/cd9660/cd9660_vfsops.c:1.94 src/sys/fs/cd9660/cd9660_vfsops.c:1.93.18.2 --- src/sys/fs/cd9660/cd9660_vfsops.c:1.94 Fri Jan 17 20:08:07 2020 +++ src/sys/fs/cd9660/cd9660_vfsops.c Sun Jan 19 21:21:54 2020 @@ -444,7 +444,7 @@ iso_mountfs(struct vnode *devvp, struct mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = ISO_MAXNAMLEN; mp->mnt_flag |= MNT_LOCAL; - mp->mnt_iflag |= IMNT_MPSAFE; + mp->mnt_iflag |= IMNT_MPSAFE | IMNT_SHRLOOKUP; mp->mnt_dev_bshift = iso_bsize; mp->mnt_fs_bshift = isomp->im_bshift; isomp->im_mountp = mp; Index: src/sys/fs/msdosfs/msdosfs_lookup.c diff -u src/sys/fs/msdosfs/msdosfs_lookup.c:1.35 src/sys/fs/msdosfs/msdosfs_lookup.c:1.35.24.1 --- src/sys/fs/msdosfs/msdosfs_lookup.c:1.35 Sat Jan 30 09:59:27 2016 +++ src/sys/fs/msdosfs/msdosfs_lookup.c Sun Jan 19 21:21:54 2020 @@ -161,6 +161,10 @@ msdosfs_lookup(void *v) return *vpp == NULLVP ? ENOENT: 0; } + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) + return ENOLCK; + /* * If they are going after the . or .. entry in the root directory, * they won't find it. DOS filesystems don't have them in the root Index: src/sys/fs/msdosfs/msdosfs_vfsops.c diff -u src/sys/fs/msdosfs/msdosfs_vfsops.c:1.132 src/sys/fs/msdosfs/msdosfs_vfsops.c:1.130.6.3 --- src/sys/fs/msdosfs/msdosfs_vfsops.c:1.132 Thu Feb 27 22:12:53 2020 +++ src/sys/fs/msdosfs/msdosfs_vfsops.c Sat Feb 29 20:21:01 2020 @@ -867,6 +867,7 @@ msdosfs_mountfs(struct vnode *devvp, str mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = MSDOSFS_NAMEMAX(pmp); mp->mnt_flag |= MNT_LOCAL; + mp->mnt_iflag |= IMNT_SHRLOOKUP; mp->mnt_dev_bshift = pmp->pm_bnshift; mp->mnt_fs_bshift = pmp->pm_cnshift; Index: src/sys/fs/tmpfs/tmpfs_subr.c diff -u src/sys/fs/tmpfs/tmpfs_subr.c:1.106 src/sys/fs/tmpfs/tmpfs_subr.c:1.105.2.3 --- src/sys/fs/tmpfs/tmpfs_subr.c:1.106 Sun Feb 23 15:46:40 2020 +++ src/sys/fs/tmpfs/tmpfs_subr.c Sat Feb 29 20:21:02 2020 @@ -147,6 +147,8 @@ tmpfs_init_vnode(struct vnode *vp, tmpfs vp->v_data = node; node->tn_vnode = vp; uvm_vnp_setsize(vp, node->tn_size); + KASSERT(node->tn_mode != VNOVAL); + cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid); } /* @@ -1035,6 +1037,7 @@ tmpfs_chmod(vnode_t *vp, mode_t mode, ka node->tn_mode = (mode & ALLPERMS); tmpfs_update(vp, TMPFS_UPDATE_CTIME); VN_KNOTE(vp, NOTE_ATTRIB); + cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid); return 0; } @@ -1079,6 +1082,7 @@ tmpfs_chown(vnode_t *vp, uid_t uid, gid_ node->tn_gid = gid; tmpfs_update(vp, TMPFS_UPDATE_CTIME); VN_KNOTE(vp, NOTE_ATTRIB); + cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid); return 0; } Index: src/sys/fs/tmpfs/tmpfs_vfsops.c diff -u src/sys/fs/tmpfs/tmpfs_vfsops.c:1.76 src/sys/fs/tmpfs/tmpfs_vfsops.c:1.75.2.3 --- src/sys/fs/tmpfs/tmpfs_vfsops.c:1.76 Fri Jan 17 20:08:08 2020 +++ src/sys/fs/tmpfs/tmpfs_vfsops.c 
Fri Jan 24 16:48:58 2020 @@ -182,7 +182,8 @@ tmpfs_mount(struct mount *mp, const char mp->mnt_stat.f_namemax = TMPFS_MAXNAMLEN; mp->mnt_fs_bshift = PAGE_SHIFT; mp->mnt_dev_bshift = DEV_BSHIFT; - mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO; + mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO | IMNT_SHRLOOKUP | + IMNT_NCLOOKUP; vfs_getnewfsid(mp); /* Allocate the tmpfs mount structure and fill it. */ Index: src/sys/kern/exec_script.c diff -u src/sys/kern/exec_script.c:1.80 src/sys/kern/exec_script.c:1.80.2.1 --- src/sys/kern/exec_script.c:1.80 Sun Sep 15 20:21:12 2019 +++ src/sys/kern/exec_script.c Fri Jan 17 21:53:01 2020 @@ -216,7 +216,7 @@ check_shell: * close all open fd's when the start. That kills this * method of implementing "safe" set-id and x-only scripts. */ - vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(epp->ep_vp, LK_SHARED | LK_RETRY); error = VOP_ACCESS(epp->ep_vp, VREAD, l->l_cred); VOP_UNLOCK(epp->ep_vp); if (error == EACCES Index: src/sys/kern/init_sysctl.c diff -u src/sys/kern/init_sysctl.c:1.224 src/sys/kern/init_sysctl.c:1.223.2.2 --- src/sys/kern/init_sysctl.c:1.224 Sat Jan 18 14:40:03 2020 +++ src/sys/kern/init_sysctl.c Sat Jan 25 22:38:50 2020 @@ -732,7 +732,6 @@ sysctl_kern_maxvnodes(SYSCTLFN_ARGS) return (error); } vfs_reinit(); - nchreinit(); return (0); } Index: src/sys/kern/vfs_cache.c diff -u src/sys/kern/vfs_cache.c:1.127 src/sys/kern/vfs_cache.c:1.126.2.12 --- src/sys/kern/vfs_cache.c:1.127 Wed Jan 8 12:04:56 2020 +++ src/sys/kern/vfs_cache.c Sun Feb 16 22:00:53 2020 @@ -1,9 +1,12 @@ /* $NetBSD$ */ /*- - * Copyright (c) 2008 The NetBSD Foundation, Inc. + * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. * + * This code is derived from software contributed to The NetBSD Foundation + * by Andrew Doran. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -57,6 +60,116 @@ * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94 */ +/* + * Name caching: + * + * Names found by directory scans are retained in a cache for future + * reference. It is managed LRU, so frequently used names will hang + * around. The cache is indexed by hash value obtained from the name. + * + * The name cache is the brainchild of Robert Elz and was introduced in + * 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk + * McKusick, May 21 1984. + * + * Data structures: + * + * Most Unix namecaches very sensibly use a global hash table to index + * names. The global hash table works well, but can cause concurrency + * headaches for the kernel hacker. In the NetBSD 10.0 implementation + * we are not sensible, and use a per-directory data structure to index + * names, but the cache otherwise functions the same. + * + * The index is a red-black tree. There are no special concurrency + * requirements placed on it, because it's per-directory and protected + * by the namecache's per-directory locks. It should therefore not be + * difficult to experiment with other types of index. + * + * Each cached name is stored in a struct namecache, along with a + * pointer to the associated vnode (nc_vp). Names longer than a + * maximum length of NCHNAMLEN are allocated with kmem_alloc(); they + * occur infrequently, and names shorter than this are stored directly + * in struct namecache. If it is a "negative" entry, (i.e. for a name + * that is known NOT to exist) the vnode pointer will be NULL. 
+ * + * For a directory with 3 cached names for 3 distinct vnodes, the + * various vnodes and namecache structs would be connected like this + * (the root is at the bottom of the diagram): + * + * ... + * ^ + * |- vi_nc_tree + * | + * +----o----+ +---------+ +---------+ + * | VDIR | | VCHR | | VREG | + * | vnode o-----+ | vnode o-----+ | vnode o------+ + * +---------+ | +---------+ | +---------+ | + * ^ | ^ | ^ | + * |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp | + * | | | | | | + * +----o----+ | +----o----+ | +----o----+ | + * +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+ + * | +---------+ | +---------+ | +---------+ + * | ^ | ^ | ^ + * | | | | | | + * | | +----------------------+ | | + * |-nc_dvp | +-------------------------------------------------+ + * | |/- vi_nc_tree | | + * | | |- nc_dvp |- nc_dvp + * | +----o----+ | | + * +-->| VDIR |<----------+ | + * | vnode |<------------------------------------+ + * +---------+ + * + * START HERE + * + * Replacement: + * + * As the cache becomes full, old and unused entries are purged as new + * entries are added. The synchronization overhead in maintaining a + * strict ordering would be prohibitive, so the VM system's "clock" or + * "second chance" page replacement algorithm is aped here. New + * entries go to the tail of the active list. After they age out and + * reach the head of the list, they are moved to the tail of the + * inactive list. Any use of the deactivated cache entry reactivates + * it, saving it from impending doom; if not reactivated, the entry + * eventually reaches the head of the inactive list and is purged. + * + * Concurrency: + * + * From a performance perspective, cache_lookup(nameiop == LOOKUP) is + * what really matters; insertion of new entries with cache_enter() is + * comparatively infrequent, and overshadowed by the cost of expensive + * file system metadata operations (which may involve disk I/O). We + * therefore want to make everything simplest in the lookup path. + * + * struct namecache is mostly stable except for list and tree related + * entries, changes to which don't affect the cached name or vnode. + * For changes to name+vnode, entries are purged in preference to + * modifying them. + * + * Read access to namecache entries is made via tree, list, or LRU + * list. A lock corresponding to the direction of access should be + * held. See definition of "struct namecache" in src/sys/namei.src, + * and the definition of "struct vnode" for the particulars. + * + * Per-CPU statistics, and LRU list totals are read unlocked, since + * an approximate value is OK. We maintain uintptr_t sized per-CPU + * counters and 64-bit global counters under the theory that uintptr_t + * sized counters are less likely to be hosed by nonatomic increment. 
+ * + * The lock order is: + * + * 1) vi->vi_nc_lock (tree or parent -> child direction, + * used during forward lookup) + * + * 2) vi->vi_nc_listlock (list or child -> parent direction, + * used during reverse lookup) + * + * 3) cache_lru_lock (LRU list direction, used during reclaim) + * + * 4) vp->v_interlock (what the cache entry points to) + */ + #include __KERNEL_RCSID(0, "$NetBSD$"); @@ -66,16 +179,15 @@ __KERNEL_RCSID(0, "$NetBSD$"); #include "opt_dtrace.h" #endif -#include -#include #include #include #include +#include #include -#include #include #include #include +#include #include #include #include @@ -83,244 +195,61 @@ __KERNEL_RCSID(0, "$NetBSD$"); #include #include -/* - * Name caching works as follows: - * - * Names found by directory scans are retained in a cache - * for future reference. It is managed LRU, so frequently - * used names will hang around. Cache is indexed by hash value - * obtained from (dvp, name) where dvp refers to the directory - * containing name. - * - * Upon reaching the last segment of a path, if the reference - * is for DELETE, or NOCACHE is set (rewrite), and the - * name is located in the cache, it will be dropped. - */ - -/* - * Cache entry lifetime: - * - * nonexistent - * ---create---> active - * ---invalidate---> queued - * ---reclaim---> nonexistent. - * - * States: - * - Nonexistent. Cache entry does not exist. - * - * - Active. cache_lookup, cache_lookup_raw, cache_revlookup can look - * up, acquire references, and hand off references to vnodes, - * e.g. via v_interlock. Marked by nonnull ncp->nc_dvp. - * - * - Queued. Pending desstruction by cache_reclaim. Cannot be used by - * cache_lookup, cache_lookup_raw, or cache_revlookup. May still be - * on lists. Marked by null ncp->nc_dvp. - * - * Transitions: - * - * - Create: nonexistent--->active - * - * Done by cache_enter(dvp, vp, name, namelen, cnflags), called by - * VOP_LOOKUP after the answer is found. Allocates a struct - * namecache object, initializes it with the above fields, and - * activates it by inserting it into the forward and reverse tables. - * - * - Invalidate: active--->queued - * - * Done by cache_invalidate. If not already invalidated, nullify - * ncp->nc_dvp and and add to cache_gcqueue. Called, - * among various other places, in cache_lookup(dvp, name, namelen, - * nameiop, cnflags, &iswht, &vp) when MAKEENTRY is missing from - * cnflags. - * - * - Reclaim: queued--->nonexistent - * - * Done by cache_reclaim. Disassociate ncp from any lists it is on - * and free memory. - */ - -/* - * Locking. - * - * L namecache_lock Global lock for namecache table and queues. - * C struct nchcpu::cpu_lock Per-CPU lock to reduce read contention. - * N struct namecache::nc_lock Per-entry lock. - * V struct vnode::v_interlock Vnode interlock. - * - * Lock order: L -> C -> N -> V - * - * Examples: - * . L->C: cache_reclaim - * . C->N->V: cache_lookup - * . 
L->N->V: cache_purge1, cache_revlookup - * - * All use serialized by namecache_lock: - * - * nclruhead / struct namecache::nc_lru - * struct vnode_impl::vi_dnclist / struct namecache::nc_dvlist - * struct vnode_impl::vi_nclist / struct namecache::nc_vlist - * nchstats - * - * - Insertion serialized by namecache_lock, - * - read protected by per-CPU lock, - * - insert/read ordering guaranteed by memory barriers, and - * - deletion allowed only under namecache_lock and *all* per-CPU locks - * in CPU_INFO_FOREACH order: - * - * nchashtbl / struct namecache::nc_hash - * - * The per-CPU locks exist only to reduce the probability of - * contention between readers. We do not bind to a CPU, so - * contention is still possible. - * - * All use serialized by struct namecache::nc_lock: - * - * struct namecache::nc_dvp - * struct namecache::nc_vp - * struct namecache::nc_gcqueue (*) - * struct namecache::nc_hittime (**) - * - * (*) Once on the queue, only cache_thread uses this nc_gcqueue, unlocked. - * (**) cache_prune reads nc_hittime unlocked, since approximate is OK. - * - * Unlocked because stable after initialization: - * - * struct namecache::nc_dvp - * struct namecache::nc_vp - * struct namecache::nc_flags - * struct namecache::nc_nlen - * struct namecache::nc_name - * - * Unlocked because approximation is OK: - * - * struct nchcpu::cpu_stats - * struct nchcpu::cpu_stats_last - * - * Updates under namecache_lock or any per-CPU lock are marked with - * COUNT, while updates outside those locks are marked with COUNT_UNL. - * - * - The theory seems to have been that you could replace COUNT_UNL by - * atomic operations -- except that doesn't help unless you also - * replace COUNT by atomic operations, because mixing atomics and - * nonatomics is a recipe for failure. - * - We use 32-bit per-CPU counters and 64-bit global counters under - * the theory that 32-bit counters are less likely to be hosed by - * nonatomic increment. - */ +#include -/* - * The comment below is preserved for posterity in case it is - * important, but it is clear that everywhere the namecache_count_*() - * functions are called, other cache_*() functions that take the same - * locks are also called, so I can't imagine how this could be a - * problem: - * - * N.B.: Attempting to protect COUNT_UNL() increments by taking - * a per-cpu lock in the namecache_count_*() functions causes - * a deadlock. Don't do that, use atomic increments instead if - * the imperfections here bug you. - */ - -/* - * struct nchstats_percpu: - * - * Per-CPU counters. - */ -struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t); - -/* - * struct nchcpu: - * - * Per-CPU namecache state: lock and per-CPU counters. - */ -struct nchcpu { - kmutex_t cpu_lock; - struct nchstats_percpu cpu_stats; - /* XXX maybe __cacheline_aligned would improve this? */ - struct nchstats_percpu cpu_stats_last; /* from last sample */ +static void cache_activate(struct namecache *); +static int cache_compare_key(void *, const void *, const void *); +static int cache_compare_nodes(void *, const void *, const void *); +static void cache_deactivate(void); +static void cache_reclaim(void); +static int cache_stat_sysctl(SYSCTLFN_ARGS); + +/* Per-CPU counters. */ +struct nchstats_percpu _NAMEI_CACHE_STATS(uintptr_t); + +/* Global pool cache. */ +static pool_cache_t cache_pool __read_mostly; + +/* LRU replacement. */ +enum cache_lru_id { + LRU_ACTIVE, + LRU_INACTIVE, + LRU_COUNT }; -/* - * The type for the hash code. 
While the hash function generates a - * u32, the hash code has historically been passed around as a u_long, - * and the value is modified by xor'ing a uintptr_t, so it's not - * entirely clear what the best type is. For now I'll leave it - * unchanged as u_long. - */ +static struct { + TAILQ_HEAD(, namecache) list[LRU_COUNT]; + u_int count[LRU_COUNT]; +} cache_lru __cacheline_aligned; -typedef u_long nchash_t; - -/* - * Structures associated with name cacheing. - */ - -static kmutex_t *namecache_lock __read_mostly; -static pool_cache_t namecache_cache __read_mostly; -static TAILQ_HEAD(, namecache) nclruhead __cacheline_aligned; - -static LIST_HEAD(nchashhead, namecache) *nchashtbl __read_mostly; -static u_long nchash __read_mostly; - -#define NCHASH2(hash, dvp) \ - (((hash) ^ ((uintptr_t)(dvp) >> 3)) & nchash) - -/* Number of cache entries allocated. */ -static long numcache __cacheline_aligned; - -/* Garbage collection queue and number of entries pending in it. */ -static void *cache_gcqueue; -static u_int cache_gcpend; +static kmutex_t cache_lru_lock __cacheline_aligned; /* Cache effectiveness statistics. This holds total from per-cpu stats */ struct nchstats nchstats __cacheline_aligned; -/* - * Macros to count an event, update the central stats with per-cpu - * values and add current per-cpu increments to the subsystem total - * last collected by cache_reclaim(). - */ -#define CACHE_STATS_CURRENT /* nothing */ - -#define COUNT(cpup, f) ((cpup)->cpu_stats.f++) - -#define UPDATE(cpup, f) do { \ - struct nchcpu *Xcpup = (cpup); \ - uint32_t Xcnt = (volatile uint32_t) Xcpup->cpu_stats.f; \ - nchstats.f += Xcnt - Xcpup->cpu_stats_last.f; \ - Xcpup->cpu_stats_last.f = Xcnt; \ -} while (/* CONSTCOND */ 0) - -#define ADD(stats, cpup, f) do { \ - struct nchcpu *Xcpup = (cpup); \ - stats.f += Xcpup->cpu_stats.f - Xcpup->cpu_stats_last.f; \ -} while (/* CONSTCOND */ 0) - -/* Do unlocked stats the same way. 
Use a different name to allow mind changes */ -#define COUNT_UNL(cpup, f) COUNT((cpup), f) - -static const int cache_lowat = 95; -static const int cache_hiwat = 98; -static const int cache_hottime = 5; /* number of seconds */ -static int doingcache = 1; /* 1 => enable the cache */ - -static struct evcnt cache_ev_scan; -static struct evcnt cache_ev_gc; -static struct evcnt cache_ev_over; -static struct evcnt cache_ev_under; -static struct evcnt cache_ev_forced; - -static struct namecache *cache_lookup_entry( - const struct vnode *, const char *, size_t); -static void cache_thread(void *); -static void cache_invalidate(struct namecache *); -static void cache_disassociate(struct namecache *); -static void cache_reclaim(void); -static int cache_ctor(void *, void *, int); -static void cache_dtor(void *, void *); - -static struct sysctllog *sysctllog; -static void sysctl_cache_stat_setup(void); +#define COUNT(f) do { \ + kpreempt_disable(); \ + ((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \ + kpreempt_enable(); \ +} while (/* CONSTCOND */ 0); + +/* Tunables */ +static const int cache_lru_maxdeact = 2; /* max # to deactivate */ +static const int cache_lru_maxscan = 64; /* max # to scan/reclaim */ +static int doingcache = 1; /* 1 => enable the cache */ + +/* sysctl */ +static struct sysctllog *cache_sysctllog; + +/* Read-black tree */ +static rb_tree_ops_t cache_rbtree_ops __read_mostly = { + .rbto_compare_nodes = cache_compare_nodes, + .rbto_compare_key = cache_compare_key, + .rbto_node_offset = offsetof(struct namecache, nc_tree), + .rbto_context = NULL +}; +/* dtrace hooks */ SDT_PROVIDER_DEFINE(vfs); SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *"); @@ -345,160 +274,172 @@ SDT_PROBE_DEFINE3(vfs, namecache, enter, "char *", "size_t"); /* - * Compute the hash for an entry. - * - * (This is for now a wrapper around namei_hash, whose interface is - * for the time being slightly inconvenient.) + * rbtree: compare two nodes. */ -static nchash_t -cache_hash(const char *name, size_t namelen) +static int +cache_compare_nodes(void *context, const void *n1, const void *n2) { - const char *endptr; + const struct namecache *nc1 = n1; + const struct namecache *nc2 = n2; - endptr = name + namelen; - return namei_hash(name, &endptr); + if (nc1->nc_key < nc2->nc_key) { + return -1; + } + if (nc1->nc_key > nc2->nc_key) { + return 1; + } + return 0; } /* - * Invalidate a cache entry and enqueue it for garbage collection. - * The caller needs to hold namecache_lock or a per-cpu lock to hold - * off cache_reclaim(). + * rbtree: compare a node and a key. */ -static void -cache_invalidate(struct namecache *ncp) +static int +cache_compare_key(void *context, const void *n, const void *k) { - void *head; - - KASSERT(mutex_owned(&ncp->nc_lock)); - - if (ncp->nc_dvp != NULL) { - SDT_PROBE(vfs, namecache, invalidate, done, ncp->nc_dvp, - 0, 0, 0, 0); + const struct namecache *ncp = n; + const int64_t key = *(const int64_t *)k; - ncp->nc_dvp = NULL; - do { - head = cache_gcqueue; - ncp->nc_gcqueue = head; - } while (atomic_cas_ptr(&cache_gcqueue, head, ncp) != head); - atomic_inc_uint(&cache_gcpend); + if (ncp->nc_key < key) { + return -1; } + if (ncp->nc_key > key) { + return 1; + } + return 0; } /* - * Disassociate a namecache entry from any vnodes it is attached to, - * and remove from the global LRU list. + * Compute a key value for the given name. 
The name length is encoded in + * the key value to try and improve uniqueness, and so that length doesn't + * need to be compared separately for string comparisons. */ -static void -cache_disassociate(struct namecache *ncp) +static int64_t +cache_key(const char *name, size_t nlen) { + int64_t key; - KASSERT(mutex_owned(namecache_lock)); - KASSERT(ncp->nc_dvp == NULL); + KASSERT(nlen <= USHRT_MAX); - if (ncp->nc_lru.tqe_prev != NULL) { - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - ncp->nc_lru.tqe_prev = NULL; - } - if (ncp->nc_vlist.tqe_prev != NULL) { - KASSERT(ncp->nc_vp != NULL); - TAILQ_REMOVE(&VNODE_TO_VIMPL(ncp->nc_vp)->vi_nclist, ncp, - nc_vlist); - ncp->nc_vlist.tqe_prev = NULL; - } - if (ncp->nc_dvlist.le_prev != NULL) { - LIST_REMOVE(ncp, nc_dvlist); - ncp->nc_dvlist.le_prev = NULL; - } + key = hash32_buf(name, nlen, HASH32_STR_INIT); + return (key << 32) | nlen; } /* - * Lock all CPUs to prevent any cache lookup activity. Conceptually, - * this locks out all "readers". + * Like bcmp() but tuned for the use case here which is: + * + * - always of equal length both sides + * - almost always the same string both sides + * - small strings */ -static void -cache_lock_cpus(void) +static inline int +cache_namecmp(struct namecache *ncp, const char *name, size_t namelen) { - CPU_INFO_ITERATOR cii; - struct cpu_info *ci; - struct nchcpu *cpup; + size_t i; + int d; - /* - * Lock out all CPUs first, then harvest per-cpu stats. This - * is probably not quite as cache-efficient as doing the lock - * and harvest at the same time, but allows cache_stat_sysctl() - * to make do with a per-cpu lock. - */ - for (CPU_INFO_FOREACH(cii, ci)) { - cpup = ci->ci_data.cpu_nch; - mutex_enter(&cpup->cpu_lock); - } - for (CPU_INFO_FOREACH(cii, ci)) { - cpup = ci->ci_data.cpu_nch; - UPDATE(cpup, ncs_goodhits); - UPDATE(cpup, ncs_neghits); - UPDATE(cpup, ncs_badhits); - UPDATE(cpup, ncs_falsehits); - UPDATE(cpup, ncs_miss); - UPDATE(cpup, ncs_long); - UPDATE(cpup, ncs_pass2); - UPDATE(cpup, ncs_2passes); - UPDATE(cpup, ncs_revhits); - UPDATE(cpup, ncs_revmiss); + KASSERT(ncp->nc_nlen == namelen); + for (d = 0, i = 0; i < namelen; i++) { + d |= (ncp->nc_name[i] ^ name[i]); } + return d; } /* - * Release all CPU locks. + * Remove an entry from the cache. vi_nc_lock must be held, and if dir2node + * is true, then we're locking in the conventional direction and the list + * lock will be acquired when removing the entry from the vnode list. */ static void -cache_unlock_cpus(void) +cache_remove(struct namecache *ncp, const bool dir2node) { - CPU_INFO_ITERATOR cii; - struct cpu_info *ci; - struct nchcpu *cpup; + struct vnode *vp, *dvp = ncp->nc_dvp; + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); - for (CPU_INFO_FOREACH(cii, ci)) { - cpup = ci->ci_data.cpu_nch; - mutex_exit(&cpup->cpu_lock); + KASSERT(rw_write_held(&dvi->vi_nc_lock)); + KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key); + KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, &ncp->nc_key) == ncp); + + SDT_PROBE(vfs, namecache, invalidate, done, ncp, + 0, 0, 0, 0); + + /* First remove from the directory's rbtree. */ + rb_tree_remove_node(&dvi->vi_nc_tree, ncp); + + /* Then remove from the LRU lists. */ + mutex_enter(&cache_lru_lock); + TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); + cache_lru.count[ncp->nc_lrulist]--; + mutex_exit(&cache_lru_lock); + + /* Then remove from the node's list. 
*/ + if ((vp = ncp->nc_vp) != NULL) { + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); + if (__predict_true(dir2node)) { + rw_enter(&vi->vi_nc_listlock, RW_WRITER); + TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); + rw_exit(&vi->vi_nc_listlock); + } else { + TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); + } + } + + /* Finally, free it. */ + if (ncp->nc_nlen > NCHNAMLEN) { + size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]); + kmem_free(ncp, sz); + } else { + pool_cache_put(cache_pool, ncp); } } /* - * Find a single cache entry and return it locked. - * The caller needs to hold namecache_lock or a per-cpu lock to hold - * off cache_reclaim(). + * Find a single cache entry and return it. vi_nc_lock must be held. */ -static struct namecache * -cache_lookup_entry(const struct vnode *dvp, const char *name, size_t namelen) +static struct namecache * __noinline +cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen, + int64_t key) { - struct nchashhead *ncpp; + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); + struct rb_node *node = dvi->vi_nc_tree.rbt_root; struct namecache *ncp; - nchash_t hash; - KASSERT(dvp != NULL); - hash = cache_hash(name, namelen); - ncpp = &nchashtbl[NCHASH2(hash, dvp)]; - - LIST_FOREACH(ncp, ncpp, nc_hash) { - membar_datadep_consumer(); /* for Alpha... */ - if (ncp->nc_dvp != dvp || - ncp->nc_nlen != namelen || - memcmp(ncp->nc_name, name, (u_int)ncp->nc_nlen)) - continue; - mutex_enter(&ncp->nc_lock); - if (__predict_true(ncp->nc_dvp == dvp)) { - ncp->nc_hittime = hardclock_ticks; - SDT_PROBE(vfs, namecache, lookup, hit, dvp, - name, namelen, 0, 0); - return ncp; - } - /* Raced: entry has been nullified. */ - mutex_exit(&ncp->nc_lock); - } - - SDT_PROBE(vfs, namecache, lookup, miss, dvp, - name, namelen, 0, 0); - return NULL; + KASSERT(rw_lock_held(&dvi->vi_nc_lock)); + + /* + * Search the RB tree for the key. This is an inlined lookup + * tailored for exactly what's needed here (64-bit key and so on) + * that is quite a bit faster than using rb_tree_find_node(). + * Elsewhere during entry/removal the usual functions are used as it + * doesn't matter there. + */ + for (;;) { + if (__predict_false(RB_SENTINEL_P(node))) { + return NULL; + } + KASSERT((void *)&ncp->nc_tree == (void *)ncp); + ncp = (struct namecache *)node; + KASSERT(ncp->nc_dvp == dvp); + if (ncp->nc_key == key) { + break; + } + node = node->rb_nodes[ncp->nc_key < key]; + } + + /* Exclude collisions. */ + if (__predict_false(cache_namecmp(ncp, name, namelen))) { + return NULL; + } + + /* + * If the entry is on the wrong LRU list, requeue it. This is an + * unlocked check, but it will rarely be wrong and even then there + * will be no harm caused. 
+ */ + if (__predict_false(ncp->nc_lrulist != LRU_ACTIVE)) { + cache_activate(ncp); + } + return ncp; } /* @@ -556,12 +497,13 @@ cache_lookup(struct vnode *dvp, const ch uint32_t nameiop, uint32_t cnflags, int *iswht_ret, struct vnode **vn_ret) { + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct namecache *ncp; struct vnode *vp; - struct nchcpu *cpup; + int64_t key; int error; bool hit; - + krw_t op; /* Establish default result values */ if (iswht_ret != NULL) { @@ -573,73 +515,77 @@ cache_lookup(struct vnode *dvp, const ch return false; } - cpup = curcpu()->ci_data.cpu_nch; - mutex_enter(&cpup->cpu_lock); if (__predict_false(namelen > USHRT_MAX)) { SDT_PROBE(vfs, namecache, lookup, toolong, dvp, name, namelen, 0, 0); - COUNT(cpup, ncs_long); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + COUNT(ncs_long); return false; } - ncp = cache_lookup_entry(dvp, name, namelen); + /* Could the entry be purged below? */ + if ((cnflags & ISLASTCN) != 0 && + ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) { + op = RW_WRITER; + } else { + op = RW_READER; + } + + /* Compute the key up front - don't need the lock. */ + key = cache_key(name, namelen); + + /* Now look for the name. */ + rw_enter(&dvi->vi_nc_lock, op); + ncp = cache_lookup_entry(dvp, name, namelen, key); if (__predict_false(ncp == NULL)) { - COUNT(cpup, ncs_miss); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + rw_exit(&dvi->vi_nc_lock); + COUNT(ncs_miss); + SDT_PROBE(vfs, namecache, lookup, miss, dvp, + name, namelen, 0, 0); return false; } - if ((cnflags & MAKEENTRY) == 0) { - COUNT(cpup, ncs_badhits); + if (__predict_false((cnflags & MAKEENTRY) == 0)) { /* * Last component and we are renaming or deleting, * the cache entry is invalid, or otherwise don't * want cache entry to exist. */ - cache_invalidate(ncp); - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + KASSERT((cnflags & ISLASTCN) != 0); + cache_remove(ncp, true); + rw_exit(&dvi->vi_nc_lock); + COUNT(ncs_badhits); return false; } if (ncp->nc_vp == NULL) { - if (iswht_ret != NULL) { + if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) { /* - * Restore the ISWHITEOUT flag saved earlier. + * Last component and we are preparing to create + * the named object, so flush the negative cache + * entry. */ - KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0); - *iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0; + COUNT(ncs_badhits); + cache_remove(ncp, true); + hit = false; } else { - KASSERT(ncp->nc_flags == 0); - } - - if (__predict_true(nameiop != CREATE || - (cnflags & ISLASTCN) == 0)) { - COUNT(cpup, ncs_neghits); + COUNT(ncs_neghits); + SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, + namelen, 0, 0); /* found neg entry; vn is already null from above */ hit = true; - } else { - COUNT(cpup, ncs_badhits); + } + if (iswht_ret != NULL) { /* - * Last component and we are preparing to create - * the named object, so flush the negative cache - * entry. + * Restore the ISWHITEOUT flag saved earlier. */ - cache_invalidate(ncp); - /* found nothing */ - hit = false; + *iswht_ret = ncp->nc_whiteout; + } else { + KASSERT(!ncp->nc_whiteout); } - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); + rw_exit(&dvi->vi_nc_lock); return hit; } - vp = ncp->nc_vp; mutex_enter(vp->v_interlock); - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); + rw_exit(&dvi->vi_nc_lock); /* * Unlocked except for the vnode interlock. Call vcache_tryvget(). @@ -651,100 +597,136 @@ cache_lookup(struct vnode *dvp, const ch * This vnode is being cleaned out. 
* XXX badhits? */ - COUNT_UNL(cpup, ncs_falsehits); - /* found nothing */ + COUNT(ncs_falsehits); return false; } - COUNT_UNL(cpup, ncs_goodhits); + COUNT(ncs_goodhits); + SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); /* found it */ *vn_ret = vp; return true; } - /* - * Cut-'n-pasted version of the above without the nameiop argument. + * Version of the above without the nameiop argument, for NFS. */ bool cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen, uint32_t cnflags, int *iswht_ret, struct vnode **vn_ret) { + + return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY, + iswht_ret, vn_ret); +} + +/* + * Used by namei() to walk down a path, component by component by looking up + * names in the cache. The node locks are chained along the way: a parent's + * lock is not dropped until the child's is acquired. + */ +bool +cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen, + struct vnode **vn_ret, krwlock_t **plock, + kauth_cred_t cred) +{ + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct namecache *ncp; - struct vnode *vp; - struct nchcpu *cpup; + int64_t key; int error; /* Establish default results. */ - if (iswht_ret != NULL) { - *iswht_ret = 0; - } *vn_ret = NULL; - if (__predict_false(!doingcache)) { - /* found nothing */ + /* If disabled, or file system doesn't support this, bail out. */ + if (__predict_false(!doingcache || + (dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) { return false; } - cpup = curcpu()->ci_data.cpu_nch; - mutex_enter(&cpup->cpu_lock); if (__predict_false(namelen > USHRT_MAX)) { - COUNT(cpup, ncs_long); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + COUNT(ncs_long); return false; } - ncp = cache_lookup_entry(dvp, name, namelen); + + /* Compute the key up front - don't need the lock. */ + key = cache_key(name, namelen); + + /* + * Acquire the directory lock. Once we have that, we can drop the + * previous one (if any). + * + * The two lock holds mean that the directory can't go away while + * here: the directory must be purged with cache_purge() before + * being freed, and both parent & child's vi_nc_lock must be taken + * before that point is passed. + * + * However if there's no previous lock, like at the root of the + * chain, then "dvp" must be referenced to prevent dvp going away + * before we get its lock. + * + * Note that the two locks can be the same if looking up a dot, for + * example: /usr/bin/. + */ + if (*plock != &dvi->vi_nc_lock) { + rw_enter(&dvi->vi_nc_lock, RW_READER); + if (*plock != NULL) { + rw_exit(*plock); + } + *plock = &dvi->vi_nc_lock; + } else if (*plock == NULL) { + KASSERT(dvp->v_usecount > 0); + } + + /* + * First up check if the user is allowed to look up files in this + * directory. + */ + KASSERT(dvi->vi_nc_mode != VNOVAL && dvi->vi_nc_uid != VNOVAL && + dvi->vi_nc_gid != VNOVAL); + error = kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(VEXEC, + dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL, + genfs_can_access(dvp->v_type, dvi->vi_nc_mode & ALLPERMS, + dvi->vi_nc_uid, dvi->vi_nc_gid, VEXEC, cred)); + if (error != 0) { + COUNT(ncs_denied); + return false; + } + + /* + * Now look for a matching cache entry. 
+ */ + ncp = cache_lookup_entry(dvp, name, namelen, key); if (__predict_false(ncp == NULL)) { - COUNT(cpup, ncs_miss); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + COUNT(ncs_miss); + SDT_PROBE(vfs, namecache, lookup, miss, dvp, + name, namelen, 0, 0); return false; } - vp = ncp->nc_vp; - if (vp == NULL) { - /* - * Restore the ISWHITEOUT flag saved earlier. - */ - if (iswht_ret != NULL) { - KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0); - /*cnp->cn_flags |= ncp->nc_flags;*/ - *iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0; - } - COUNT(cpup, ncs_neghits); - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); + if (ncp->nc_vp == NULL) { /* found negative entry; vn is already null from above */ + COUNT(ncs_neghits); + SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); return true; } - mutex_enter(vp->v_interlock); - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); + + COUNT(ncs_goodhits); /* XXX can be "badhits" */ + SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); /* - * Unlocked except for the vnode interlock. Call vcache_tryvget(). + * Return with the directory lock still held. It will either be + * returned to us with another call to cache_lookup_linked() when + * looking up the next component, or the caller will release it + * manually when finished. */ - error = vcache_tryvget(vp); - if (error) { - KASSERT(error == EBUSY); - /* - * This vnode is being cleaned out. - * XXX badhits? - */ - COUNT_UNL(cpup, ncs_falsehits); - /* found nothing */ - return false; - } - - COUNT_UNL(cpup, ncs_goodhits); /* XXX can be "badhits" */ - /* found it */ - *vn_ret = vp; + *vn_ret = ncp->nc_vp; return true; } /* * Scan cache looking for name of directory entry pointing at vp. + * Will not search for "." or "..". * * If the lookup succeeds the vnode is referenced and stored in dvpp. * @@ -756,11 +738,12 @@ cache_lookup_raw(struct vnode *dvp, cons * Returns 0 on success, -1 on cache miss, positive errno on failure. */ int -cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp) +cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp, + bool checkaccess, int perms) { + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); struct namecache *ncp; struct vnode *dvp; - struct nchcpu *cpup; char *bp; int error, nlen; @@ -769,44 +752,57 @@ cache_revlookup(struct vnode *vp, struct if (!doingcache) goto out; - /* - * We increment counters in the local CPU's per-cpu stats. - * We don't take the per-cpu lock, however, since this function - * is the only place these counters are incremented so no one - * will be racing with us to increment them. - */ - cpup = curcpu()->ci_data.cpu_nch; - mutex_enter(namecache_lock); - TAILQ_FOREACH(ncp, &VNODE_TO_VIMPL(vp)->vi_nclist, nc_vlist) { - mutex_enter(&ncp->nc_lock); - /* Ignore invalidated entries. */ - dvp = ncp->nc_dvp; - if (dvp == NULL) { - mutex_exit(&ncp->nc_lock); - continue; - } - + rw_enter(&vi->vi_nc_listlock, RW_READER); + if (checkaccess) { /* - * The list is partially sorted. Once we hit dot or dotdot - * it's only more dots from there on in. + * Check if the user is allowed to see. NOTE: this is + * checking for access on the "wrong" directory. getcwd() + * wants to see that there is access on every component + * along the way, not that there is access to any individual + * component. Don't use this to check you can look in vp. + * + * I don't like it, I didn't come up with it, don't blame me! 
*/ + KASSERT(vi->vi_nc_mode != VNOVAL && vi->vi_nc_uid != VNOVAL && + vi->vi_nc_gid != VNOVAL); + error = kauth_authorize_vnode(curlwp->l_cred, + KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode & + ALLPERMS), vp, NULL, genfs_can_access(vp->v_type, + vi->vi_nc_mode & ALLPERMS, vi->vi_nc_uid, vi->vi_nc_gid, + perms, curlwp->l_cred)); + if (error != 0) { + rw_exit(&vi->vi_nc_listlock); + COUNT(ncs_denied); + return EACCES; + } + } + TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) { + KASSERT(ncp->nc_vp == vp); + KASSERT(ncp->nc_dvp != NULL); nlen = ncp->nc_nlen; + + /* + * The queue is partially sorted. Once we hit dots, nothing + * else remains but dots and dotdots, so bail out. + */ if (ncp->nc_name[0] == '.') { if (nlen == 1 || (nlen == 2 && ncp->nc_name[1] == '.')) { - mutex_exit(&ncp->nc_lock); - break; + break; } } - COUNT(cpup, ncs_revhits); + + /* Record a hit on the entry. This is an unlocked read. */ + if (ncp->nc_lrulist != LRU_ACTIVE) { + cache_activate(ncp); + } if (bufp) { bp = *bpp; bp -= nlen; if (bp <= bufp) { *dvpp = NULL; - mutex_exit(&ncp->nc_lock); - mutex_exit(namecache_lock); + rw_exit(&vi->vi_nc_listlock); SDT_PROBE(vfs, namecache, revlookup, fail, vp, ERANGE, 0, 0, 0); return (ERANGE); @@ -815,9 +811,9 @@ cache_revlookup(struct vnode *vp, struct *bpp = bp; } + dvp = ncp->nc_dvp; mutex_enter(dvp->v_interlock); - mutex_exit(&ncp->nc_lock); - mutex_exit(namecache_lock); + rw_exit(&vi->vi_nc_listlock); error = vcache_tryvget(dvp); if (error) { KASSERT(error == EBUSY); @@ -831,26 +827,26 @@ cache_revlookup(struct vnode *vp, struct *dvpp = dvp; SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp, 0, 0, 0); + COUNT(ncs_revhits); return (0); } - COUNT(cpup, ncs_revmiss); - mutex_exit(namecache_lock); + rw_exit(&vi->vi_nc_listlock); + COUNT(ncs_revmiss); out: *dvpp = NULL; return (-1); } /* - * Add an entry to the cache + * Add an entry to the cache. */ void cache_enter(struct vnode *dvp, struct vnode *vp, const char *name, size_t namelen, uint32_t cnflags) { - struct namecache *ncp; - struct namecache *oncp; - struct nchashhead *ncpp; - nchash_t hash; + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); + struct namecache *ncp, *oncp; + int total; /* First, check whether we can/should add a cache entry. */ if ((cnflags & MAKEENTRY) == 0 || @@ -861,140 +857,151 @@ cache_enter(struct vnode *dvp, struct vn } SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0); - if (numcache > desiredvnodes) { - mutex_enter(namecache_lock); - cache_ev_forced.ev_count++; + + /* + * Reclaim some entries if over budget. This is an unlocked check, + * but it doesn't matter. Just need to catch up with things + * eventually: it doesn't matter if we go over temporarily. + */ + total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE]; + if (__predict_false(total > desiredvnodes)) { cache_reclaim(); - mutex_exit(namecache_lock); } - if (namelen > NCHNAMLEN) { - ncp = kmem_alloc(sizeof(*ncp) + namelen, KM_SLEEP); - cache_ctor(NULL, ncp, 0); - } else - ncp = pool_cache_get(namecache_cache, PR_WAITOK); + /* Now allocate a fresh entry. */ + if (__predict_true(namelen <= NCHNAMLEN)) { + ncp = pool_cache_get(cache_pool, PR_WAITOK); + } else { + size_t sz = offsetof(struct namecache, nc_name[namelen]); + ncp = kmem_alloc(sz, KM_SLEEP); + } - mutex_enter(namecache_lock); - numcache++; + /* Fill in cache info. 
*/ + ncp->nc_dvp = dvp; + ncp->nc_key = cache_key(name, namelen); + ncp->nc_nlen = namelen; + memcpy(ncp->nc_name, name, namelen); /* - * Concurrent lookups in the same directory may race for a - * cache entry. if there's a duplicated entry, free it. + * Insert to the directory. Concurrent lookups in the same + * directory may race for a cache entry. There can also be hash + * value collisions. If there's a entry there already, free it. */ - oncp = cache_lookup_entry(dvp, name, namelen); + rw_enter(&dvi->vi_nc_lock, RW_WRITER); + oncp = rb_tree_find_node(&dvi->vi_nc_tree, &ncp->nc_key); if (oncp) { - cache_invalidate(oncp); - mutex_exit(&oncp->nc_lock); + KASSERT(oncp->nc_nlen == ncp->nc_nlen); + if (cache_namecmp(oncp, name, namelen)) { + COUNT(ncs_collisions); + } + cache_remove(oncp, true); } + rb_tree_insert_node(&dvi->vi_nc_tree, ncp); - /* Grab the vnode we just found. */ - mutex_enter(&ncp->nc_lock); - ncp->nc_vp = vp; - ncp->nc_flags = 0; - ncp->nc_hittime = 0; - ncp->nc_gcqueue = NULL; + /* Then insert to the vnode. */ if (vp == NULL) { /* * For negative hits, save the ISWHITEOUT flag so we can * restore it later when the cache entry is used again. */ - ncp->nc_flags = cnflags & ISWHITEOUT; - } - - /* Fill in cache info. */ - ncp->nc_dvp = dvp; - LIST_INSERT_HEAD(&VNODE_TO_VIMPL(dvp)->vi_dnclist, ncp, nc_dvlist); - if (vp) { + ncp->nc_vp = NULL; + ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0); + } else { /* Partially sort the per-vnode list: dots go to back. */ + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); + rw_enter(&vi->vi_nc_listlock, RW_WRITER); if ((namelen == 1 && name[0] == '.') || (namelen == 2 && name[0] == '.' && name[1] == '.')) { - TAILQ_INSERT_TAIL(&VNODE_TO_VIMPL(vp)->vi_nclist, ncp, - nc_vlist); + TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list); } else { - TAILQ_INSERT_HEAD(&VNODE_TO_VIMPL(vp)->vi_nclist, ncp, - nc_vlist); + TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list); } - } else { - ncp->nc_vlist.tqe_prev = NULL; - ncp->nc_vlist.tqe_next = NULL; + rw_exit(&vi->vi_nc_listlock); + ncp->nc_vp = vp; + ncp->nc_whiteout = false; } - KASSERT(namelen <= USHRT_MAX); - ncp->nc_nlen = namelen; - memcpy(ncp->nc_name, name, (unsigned)ncp->nc_nlen); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - hash = cache_hash(name, namelen); - ncpp = &nchashtbl[NCHASH2(hash, dvp)]; /* - * Flush updates before making visible in table. No need for a - * memory barrier on the other side: to see modifications the - * list must be followed, meaning a dependent pointer load. - * The below is LIST_INSERT_HEAD() inlined, with the memory - * barrier included in the correct place. + * Finally, insert to the tail of the ACTIVE LRU list (new) and + * with the LRU lock held take the to opportunity to incrementally + * balance the lists. */ - if ((ncp->nc_hash.le_next = ncpp->lh_first) != NULL) - ncpp->lh_first->nc_hash.le_prev = &ncp->nc_hash.le_next; - ncp->nc_hash.le_prev = &ncpp->lh_first; - membar_producer(); - ncpp->lh_first = ncp; - mutex_exit(&ncp->nc_lock); - mutex_exit(namecache_lock); + mutex_enter(&cache_lru_lock); + ncp->nc_lrulist = LRU_ACTIVE; + cache_lru.count[LRU_ACTIVE]++; + TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); + cache_deactivate(); + mutex_exit(&cache_lru_lock); + rw_exit(&dvi->vi_nc_lock); } /* - * Name cache initialization, from vfs_init() when we are booting + * Set identity info in cache for a vnode. We only care about directories + * so ignore other updates. 
*/ void -nchinit(void) +cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid) { - int error; - - TAILQ_INIT(&nclruhead); - namecache_cache = pool_cache_init(sizeof(struct namecache) + NCHNAMLEN, - coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, cache_ctor, - cache_dtor, NULL); - KASSERT(namecache_cache != NULL); - - namecache_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); - nchashtbl = hashinit(desiredvnodes, HASH_LIST, true, &nchash); - - error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, cache_thread, - NULL, NULL, "cachegc"); - if (error != 0) - panic("nchinit %d", error); - - evcnt_attach_dynamic(&cache_ev_scan, EVCNT_TYPE_MISC, NULL, - "namecache", "entries scanned"); - evcnt_attach_dynamic(&cache_ev_gc, EVCNT_TYPE_MISC, NULL, - "namecache", "entries collected"); - evcnt_attach_dynamic(&cache_ev_over, EVCNT_TYPE_MISC, NULL, - "namecache", "over scan target"); - evcnt_attach_dynamic(&cache_ev_under, EVCNT_TYPE_MISC, NULL, - "namecache", "under scan target"); - evcnt_attach_dynamic(&cache_ev_forced, EVCNT_TYPE_MISC, NULL, - "namecache", "forced reclaims"); + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); - sysctl_cache_stat_setup(); + if (vp->v_type == VDIR) { + /* Grab both locks, for forward & reverse lookup. */ + rw_enter(&vi->vi_nc_lock, RW_WRITER); + rw_enter(&vi->vi_nc_listlock, RW_WRITER); + vi->vi_nc_mode = mode; + vi->vi_nc_uid = uid; + vi->vi_nc_gid = gid; + rw_exit(&vi->vi_nc_listlock); + rw_exit(&vi->vi_nc_lock); + } } -static int -cache_ctor(void *arg, void *obj, int flag) +/* + * Return true if we have identity for the given vnode, and use as an + * opportunity to confirm that everything squares up. + * + * Because of shared code, some file systems could provide partial + * information, missing some updates, so always check the mount flag + * instead of looking for !VNOVAL. + */ +bool +cache_have_id(struct vnode *vp) { - struct namecache *ncp; - - ncp = obj; - mutex_init(&ncp->nc_lock, MUTEX_DEFAULT, IPL_NONE); - return 0; + if (vp->v_type == VDIR && + (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0) { + KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_mode != VNOVAL); + KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_uid != VNOVAL); + KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_gid != VNOVAL); + return true; + } else { + return false; + } } -static void -cache_dtor(void *arg, void *obj) +/* + * Name cache initialization, from vfs_init() when the system is booting. 
+ */ +void +nchinit(void) { - struct namecache *ncp; - ncp = obj; - mutex_destroy(&ncp->nc_lock); + cache_pool = pool_cache_init(sizeof(struct namecache), + coherency_unit, 0, 0, "nchentry", NULL, IPL_NONE, NULL, + NULL, NULL); + KASSERT(cache_pool != NULL); + + mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE); + TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]); + TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]); + + KASSERT(cache_sysctllog == NULL); + sysctl_createv(&cache_sysctllog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_STRUCT, "namecache_stats", + SYSCTL_DESCR("namecache statistics"), + cache_stat_sysctl, 0, NULL, 0, + CTL_VFS, CTL_CREATE, CTL_EOL); } /* @@ -1003,87 +1010,176 @@ cache_dtor(void *arg, void *obj) void cache_cpu_init(struct cpu_info *ci) { - struct nchcpu *cpup; + void *p; size_t sz; - sz = roundup2(sizeof(*cpup), coherency_unit) + coherency_unit; - cpup = kmem_zalloc(sz, KM_SLEEP); - cpup = (void *)roundup2((uintptr_t)cpup, coherency_unit); - mutex_init(&cpup->cpu_lock, MUTEX_DEFAULT, IPL_NONE); - ci->ci_data.cpu_nch = cpup; + sz = roundup2(sizeof(struct nchstats_percpu), coherency_unit) + + coherency_unit; + p = kmem_zalloc(sz, KM_SLEEP); + ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit); } /* - * Name cache reinitialization, for when the maximum number of vnodes increases. + * A vnode is being allocated: set up cache structures. */ void -nchreinit(void) +cache_vnode_init(struct vnode *vp) { + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); + + rw_init(&vi->vi_nc_lock); + rw_init(&vi->vi_nc_listlock); + rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops); + TAILQ_INIT(&vi->vi_nc_list); + vi->vi_nc_mode = VNOVAL; + vi->vi_nc_uid = VNOVAL; + vi->vi_nc_gid = VNOVAL; +} + +/* + * A vnode is being freed: finish cache structures. + */ +void +cache_vnode_fini(struct vnode *vp) +{ + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); + + KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL); + KASSERT(TAILQ_EMPTY(&vi->vi_nc_list)); + rw_destroy(&vi->vi_nc_lock); + rw_destroy(&vi->vi_nc_listlock); +} + +/* + * Helper for cache_purge1(): purge cache entries for the given vnode from + * all directories that the vnode is cached in. + */ +static void +cache_purge_parents(struct vnode *vp) +{ + vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp); + struct vnode *dvp, *blocked; struct namecache *ncp; - struct nchashhead *oldhash, *hash; - u_long i, oldmask, mask; - hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); - mutex_enter(namecache_lock); - cache_lock_cpus(); - oldhash = nchashtbl; - oldmask = nchash; - nchashtbl = hash; - nchash = mask; - for (i = 0; i <= oldmask; i++) { - while ((ncp = LIST_FIRST(&oldhash[i])) != NULL) { - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - } - } - cache_unlock_cpus(); - mutex_exit(namecache_lock); - hashdone(oldhash, HASH_LIST, oldmask); + SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0); + + blocked = NULL; + + rw_enter(&vi->vi_nc_listlock, RW_WRITER); + while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) { + /* + * Locking in the wrong direction. Try for a hold on the + * directory node's lock, and if we get it then all good, + * nuke the entry and move on to the next. + */ + dvp = ncp->nc_dvp; + dvi = VNODE_TO_VIMPL(dvp); + if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { + cache_remove(ncp, false); + rw_exit(&dvi->vi_nc_lock); + blocked = NULL; + continue; + } + + /* + * We can't wait on the directory node's lock with our list + * lock held or the system could deadlock. 
+ * + * Take a hold on the directory vnode to prevent it from + * being freed (taking the vnode & lock with it). Then + * wait for the lock to become available with no other locks + * held, and retry. + * + * If this happens twice in a row, give the other side a + * breather; we can do nothing until it lets go. + */ + vhold(dvp); + rw_exit(&vi->vi_nc_listlock); + rw_enter(&dvi->vi_nc_lock, RW_WRITER); + /* Do nothing. */ + rw_exit(&dvi->vi_nc_lock); + holdrele(dvp); + if (blocked == dvp) { + kpause("ncpurge", false, 1, NULL); + } + rw_enter(&vi->vi_nc_listlock, RW_WRITER); + blocked = dvp; + } + rw_exit(&vi->vi_nc_listlock); +} + +/* + * Helper for cache_purge1(): purge all cache entries hanging off the given + * directory vnode. + */ +static void +cache_purge_children(struct vnode *dvp) +{ + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); + struct namecache *ncp; + + SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0); + + rw_enter(&dvi->vi_nc_lock, RW_WRITER); + for (;;) { + ncp = rb_tree_iterate(&dvi->vi_nc_tree, NULL, RB_DIR_RIGHT); + if (ncp == NULL) { + break; + } + cache_remove(ncp, true); + } + rw_exit(&dvi->vi_nc_lock); +} + +/* + * Helper for cache_purge1(): purge cache entry from the given vnode, + * finding it by name. + */ +static void +cache_purge_name(struct vnode *dvp, const char *name, size_t namelen) +{ + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); + struct namecache *ncp; + int64_t key; + + SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0); + + key = cache_key(name, namelen); + rw_enter(&dvi->vi_nc_lock, RW_WRITER); + ncp = cache_lookup_entry(dvp, name, namelen, key); + if (ncp) { + cache_remove(ncp, true); + } + rw_exit(&dvi->vi_nc_lock); } /* * Cache flush, a particular vnode; called when a vnode is renamed to - * hide entries that would now be invalid + * hide entries that would now be invalid. */ void cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags) { - struct namecache *ncp, *ncnext; - mutex_enter(namecache_lock); if (flags & PURGE_PARENTS) { - SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0); - - for (ncp = TAILQ_FIRST(&VNODE_TO_VIMPL(vp)->vi_nclist); - ncp != NULL; ncp = ncnext) { - ncnext = TAILQ_NEXT(ncp, nc_vlist); - mutex_enter(&ncp->nc_lock); - cache_invalidate(ncp); - mutex_exit(&ncp->nc_lock); - cache_disassociate(ncp); - } + cache_purge_parents(vp); } if (flags & PURGE_CHILDREN) { - SDT_PROBE(vfs, namecache, purge, children, vp, 0, 0, 0, 0); - for (ncp = LIST_FIRST(&VNODE_TO_VIMPL(vp)->vi_dnclist); - ncp != NULL; ncp = ncnext) { - ncnext = LIST_NEXT(ncp, nc_dvlist); - mutex_enter(&ncp->nc_lock); - cache_invalidate(ncp); - mutex_exit(&ncp->nc_lock); - cache_disassociate(ncp); - } + cache_purge_children(vp); } if (name != NULL) { - SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0); - ncp = cache_lookup_entry(vp, name, namelen); - if (ncp) { - cache_invalidate(ncp); - mutex_exit(&ncp->nc_lock); - cache_disassociate(ncp); - } + cache_purge_name(vp, name, namelen); } - mutex_exit(namecache_lock); +} + +/* + * vnode filter for cache_purgevfs(). 
+ */ +static bool +cache_vdir_filter(void *cookie, vnode_t *vp) +{ + + return vp->v_type == VDIR; } /* @@ -1093,186 +1189,171 @@ cache_purge1(struct vnode *vp, const cha void cache_purgevfs(struct mount *mp) { - struct namecache *ncp, *nxtcp; + struct vnode_iterator *iter; + vnode_t *dvp; - SDT_PROBE(vfs, namecache, purge, vfs, mp, 0, 0, 0, 0); - mutex_enter(namecache_lock); - for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) { - nxtcp = TAILQ_NEXT(ncp, nc_lru); - mutex_enter(&ncp->nc_lock); - if (ncp->nc_dvp != NULL && ncp->nc_dvp->v_mount == mp) { - /* Free the resources we had. */ - cache_invalidate(ncp); - cache_disassociate(ncp); + vfs_vnode_iterator_init(mp, &iter); + for (;;) { + dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL); + if (dvp == NULL) { + break; } - mutex_exit(&ncp->nc_lock); + cache_purge_children(dvp); + vrele(dvp); } - cache_reclaim(); - mutex_exit(namecache_lock); + vfs_vnode_iterator_destroy(iter); } /* - * Scan global list invalidating entries until we meet a preset target. - * Prefer to invalidate entries that have not scored a hit within - * cache_hottime seconds. We sort the LRU list only for this routine's - * benefit. + * Re-queue an entry onto the correct LRU list, after it has scored a hit. */ static void -cache_prune(int incache, int target) +cache_activate(struct namecache *ncp) { - struct namecache *ncp, *nxtcp, *sentinel; - int items, recent, tryharder; - - KASSERT(mutex_owned(namecache_lock)); - SDT_PROBE(vfs, namecache, prune, done, incache, target, 0, 0, 0); - items = 0; - tryharder = 0; - recent = hardclock_ticks - hz * cache_hottime; - sentinel = NULL; - for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) { - if (incache <= target) - break; - items++; - nxtcp = TAILQ_NEXT(ncp, nc_lru); - if (ncp == sentinel) { - /* - * If we looped back on ourself, then ignore - * recent entries and purge whatever we find. - */ - tryharder = 1; - } - if (ncp->nc_dvp == NULL) - continue; - if (!tryharder && (ncp->nc_hittime - recent) > 0) { - if (sentinel == NULL) - sentinel = ncp; - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - continue; - } - mutex_enter(&ncp->nc_lock); - if (ncp->nc_dvp != NULL) { - cache_invalidate(ncp); - cache_disassociate(ncp); - incache--; - } - mutex_exit(&ncp->nc_lock); - } - cache_ev_scan.ev_count += items; + mutex_enter(&cache_lru_lock); + /* Put on tail of ACTIVE list, since it just scored a hit. */ + TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); + TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); + cache_lru.count[ncp->nc_lrulist]--; + cache_lru.count[LRU_ACTIVE]++; + ncp->nc_lrulist = LRU_ACTIVE; + mutex_exit(&cache_lru_lock); } /* - * Collect dead cache entries from all CPUs and garbage collect. + * Try to balance the LRU lists. Pick some victim entries, and re-queue + * them from the head of the active list to the tail of the inactive list. */ static void -cache_reclaim(void) +cache_deactivate(void) { - struct namecache *ncp, *next; - int items; + struct namecache *ncp; + int total, i; - KASSERT(mutex_owned(namecache_lock)); + KASSERT(mutex_owned(&cache_lru_lock)); - /* - * If the number of extant entries not awaiting garbage collection - * exceeds the high water mark, then reclaim stale entries until we - * reach our low water mark. 
- */ - items = numcache - cache_gcpend; - if (items > (uint64_t)desiredvnodes * cache_hiwat / 100) { - cache_prune(items, (int)((uint64_t)desiredvnodes * - cache_lowat / 100)); - cache_ev_over.ev_count++; - } else - cache_ev_under.ev_count++; + /* If we're nowhere near budget yet, don't bother. */ + total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE]; + if (total < (desiredvnodes >> 1)) { + return; + } /* - * Stop forward lookup activity on all CPUs and garbage collect dead - * entries. + * Aim for a 1:1 ratio of active to inactive. This is to allow each + * potential victim a reasonable amount of time to cycle through the + * inactive list in order to score a hit and be reactivated, while + * trying not to cause reactivations too frequently. */ - cache_lock_cpus(); - ncp = cache_gcqueue; - cache_gcqueue = NULL; - items = cache_gcpend; - cache_gcpend = 0; - while (ncp != NULL) { - next = ncp->nc_gcqueue; - cache_disassociate(ncp); - KASSERT(ncp->nc_dvp == NULL); - if (ncp->nc_hash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - } - if (ncp->nc_nlen > NCHNAMLEN) { - cache_dtor(NULL, ncp); - kmem_free(ncp, sizeof(*ncp) + ncp->nc_nlen); - } else - pool_cache_put(namecache_cache, ncp); - ncp = next; - } - cache_unlock_cpus(); - numcache -= items; - cache_ev_gc.ev_count += items; + if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) { + return; + } + + /* Move only a few at a time; will catch up eventually. */ + for (i = 0; i < cache_lru_maxdeact; i++) { + ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]); + if (ncp == NULL) { + break; + } + KASSERT(ncp->nc_lrulist == LRU_ACTIVE); + ncp->nc_lrulist = LRU_INACTIVE; + TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); + TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru); + cache_lru.count[LRU_ACTIVE]--; + cache_lru.count[LRU_INACTIVE]++; + } } /* - * Cache maintainence thread, awakening once per second to: - * - * => keep number of entries below the high water mark - * => sort pseudo-LRU list - * => garbage collect dead entries + * Free some entries from the cache, when we have gone over budget. + * + * We don't want to cause too much work for any individual caller, and it + * doesn't matter if we temporarily go over budget. This is also "just a + * cache" so it's not a big deal if we screw up and throw out something we + * shouldn't. So we take a relaxed attitude to this process to reduce its + * impact. */ static void -cache_thread(void *arg) -{ - - mutex_enter(namecache_lock); - for (;;) { - cache_reclaim(); - kpause("cachegc", false, hz, namecache_lock); - } -} - -#ifdef DDB -void -namecache_print(struct vnode *vp, void (*pr)(const char *, ...)) +cache_reclaim(void) { - struct vnode *dvp = NULL; struct namecache *ncp; + vnode_impl_t *dvi; + int toscan; - TAILQ_FOREACH(ncp, &nclruhead, nc_lru) { - if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) { - (*pr)("name %.*s\n", ncp->nc_nlen, ncp->nc_name); - dvp = ncp->nc_dvp; + /* + * Scan up to a preset maxium number of entries, but no more than + * 0.8% of the total at once (to allow for very small systems). + * + * On bigger systems, do a larger chunk of work to reduce the number + * of times that cache_lru_lock is held for any length of time. 
+ */ + mutex_enter(&cache_lru_lock); + toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7); + toscan = MAX(toscan, 1); + SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] + + cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0); + while (toscan-- != 0) { + /* First try to balance the lists. */ + cache_deactivate(); + + /* Now look for a victim on head of inactive list (old). */ + ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]); + if (ncp == NULL) { + break; } - } - if (dvp == NULL) { - (*pr)("name not found\n"); - return; - } - vp = dvp; - TAILQ_FOREACH(ncp, &nclruhead, nc_lru) { - if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) { - (*pr)("parent %.*s\n", ncp->nc_nlen, ncp->nc_name); + dvi = VNODE_TO_VIMPL(ncp->nc_dvp); + KASSERT(ncp->nc_lrulist == LRU_INACTIVE); + KASSERT(dvi != NULL); + + /* + * Locking in the wrong direction. If we can't get the + * lock, the directory is actively busy, and it could also + * cause problems for the next guy in here, so send the + * entry to the back of the list. + */ + if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { + TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE], + ncp, nc_lru); + TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], + ncp, nc_lru); + continue; } + + /* + * Now have the victim entry locked. Drop the LRU list + * lock, purge the entry, and start over. The hold on + * vi_nc_lock will prevent the vnode from vanishing until + * finished (cache_purge() will be called on dvp before it + * disappears, and that will wait on vi_nc_lock). + */ + mutex_exit(&cache_lru_lock); + cache_remove(ncp, true); + rw_exit(&dvi->vi_nc_lock); + mutex_enter(&cache_lru_lock); } + mutex_exit(&cache_lru_lock); } -#endif +/* + * For file system code: count a lookup that required a full re-scan of + * directory metadata. + */ void namecache_count_pass2(void) { - struct nchcpu *cpup = curcpu()->ci_data.cpu_nch; - COUNT_UNL(cpup, ncs_pass2); + COUNT(ncs_pass2); } +/* + * For file system code: count a lookup that scored a hit in the directory + * metadata near the location of the last lookup. + */ void namecache_count_2passes(void) { - struct nchcpu *cpup = curcpu()->ci_data.cpu_nch; - COUNT_UNL(cpup, ncs_2passes); + COUNT(ncs_2passes); } /* @@ -1283,64 +1364,79 @@ namecache_count_2passes(void) static int cache_stat_sysctl(SYSCTLFN_ARGS) { - struct nchstats stats; - struct nchcpu *my_cpup; -#ifdef CACHE_STATS_CURRENT CPU_INFO_ITERATOR cii; + struct nchstats stats; struct cpu_info *ci; -#endif /* CACHE_STATS_CURRENT */ if (oldp == NULL) { - *oldlenp = sizeof(stats); + *oldlenp = sizeof(nchstats); return 0; } - if (*oldlenp < sizeof(stats)) { + if (*oldlenp <= 0) { *oldlenp = 0; return 0; } - /* - * Take this CPU's per-cpu lock to hold off cache_reclaim() - * from doing a stats update while doing minimal damage to - * concurrent operations. 
- */ sysctl_unlock(); - my_cpup = curcpu()->ci_data.cpu_nch; - mutex_enter(&my_cpup->cpu_lock); - stats = nchstats; -#ifdef CACHE_STATS_CURRENT + memset(&stats, 0, sizeof(nchstats)); for (CPU_INFO_FOREACH(cii, ci)) { - struct nchcpu *cpup = ci->ci_data.cpu_nch; + struct nchstats_percpu *np = ci->ci_data.cpu_nch; - ADD(stats, cpup, ncs_goodhits); - ADD(stats, cpup, ncs_neghits); - ADD(stats, cpup, ncs_badhits); - ADD(stats, cpup, ncs_falsehits); - ADD(stats, cpup, ncs_miss); - ADD(stats, cpup, ncs_long); - ADD(stats, cpup, ncs_pass2); - ADD(stats, cpup, ncs_2passes); - ADD(stats, cpup, ncs_revhits); - ADD(stats, cpup, ncs_revmiss); - } -#endif /* CACHE_STATS_CURRENT */ - mutex_exit(&my_cpup->cpu_lock); + stats.ncs_goodhits += np->ncs_goodhits; + stats.ncs_neghits += np->ncs_neghits; + stats.ncs_badhits += np->ncs_badhits; + stats.ncs_falsehits += np->ncs_falsehits; + stats.ncs_miss += np->ncs_miss; + stats.ncs_long += np->ncs_long; + stats.ncs_pass2 += np->ncs_pass2; + stats.ncs_2passes += np->ncs_2passes; + stats.ncs_revhits += np->ncs_revhits; + stats.ncs_revmiss += np->ncs_revmiss; + stats.ncs_collisions += np->ncs_collisions; + stats.ncs_denied += np->ncs_denied; + } + mutex_enter(&cache_lru_lock); + memcpy(&nchstats, &stats, sizeof(nchstats)); + mutex_exit(&cache_lru_lock); sysctl_relock(); - *oldlenp = sizeof(stats); - return sysctl_copyout(l, &stats, oldp, sizeof(stats)); + *oldlenp = MIN(sizeof(stats), *oldlenp); + return sysctl_copyout(l, &stats, oldp, *oldlenp); } -static void -sysctl_cache_stat_setup(void) +/* + * For the debugger, given the address of a vnode, print all associated + * names in the cache. + */ +#ifdef DDB +void +namecache_print(struct vnode *vp, void (*pr)(const char *, ...)) { + struct vnode *dvp = NULL; + struct namecache *ncp; + enum cache_lru_id id; - KASSERT(sysctllog == NULL); - sysctl_createv(&sysctllog, 0, NULL, NULL, - CTLFLAG_PERMANENT, - CTLTYPE_STRUCT, "namecache_stats", - SYSCTL_DESCR("namecache statistics"), - cache_stat_sysctl, 0, NULL, 0, - CTL_VFS, CTL_CREATE, CTL_EOL); + for (id = 0; id < LRU_COUNT; id++) { + TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { + if (ncp->nc_vp == vp) { + (*pr)("name %.*s\n", ncp->nc_nlen, + ncp->nc_name); + dvp = ncp->nc_dvp; + } + } + } + if (dvp == NULL) { + (*pr)("name not found\n"); + return; + } + for (id = 0; id < LRU_COUNT; id++) { + TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { + if (ncp->nc_vp == dvp) { + (*pr)("parent %.*s\n", ncp->nc_nlen, + ncp->nc_name); + } + } + } } +#endif Index: src/sys/kern/vfs_getcwd.c diff -u src/sys/kern/vfs_getcwd.c:1.55 src/sys/kern/vfs_getcwd.c:1.53.2.5 --- src/sys/kern/vfs_getcwd.c:1.55 Sun Feb 23 22:14:03 2020 +++ src/sys/kern/vfs_getcwd.c Sat Feb 29 20:21:03 2020 @@ -1,7 +1,7 @@ /* $NetBSD$ */ /*- - * Copyright (c) 1999 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -87,7 +87,7 @@ __KERNEL_RCSID(0, "$NetBSD$"); * On exit, *uvpp is either NULL or is a locked vnode reference. 
*/ static int -getcwd_scandir(struct vnode **lvpp, struct vnode **uvpp, char **bpp, +getcwd_scandir(struct vnode *lvp, struct vnode **uvpp, char **bpp, char *bufp, struct lwp *l) { int error = 0; @@ -101,12 +101,14 @@ getcwd_scandir(struct vnode **lvpp, stru ino_t fileno; struct vattr va; struct vnode *uvp = NULL; - struct vnode *lvp = *lvpp; kauth_cred_t cred = l->l_cred; struct componentname cn; int len, reclen; tries = 0; + /* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */ + KASSERT(VOP_ISLOCKED(lvp) == LK_EXCLUSIVE); + /* * If we want the filename, get some info we need while the * current directory is still locked. @@ -114,8 +116,7 @@ getcwd_scandir(struct vnode **lvpp, stru if (bufp != NULL) { error = VOP_GETATTR(lvp, &va, cred); if (error) { - vput(lvp); - *lvpp = NULL; + VOP_UNLOCK(lvp); *uvpp = NULL; return error; } @@ -134,24 +135,14 @@ getcwd_scandir(struct vnode **lvpp, stru /* At this point, lvp is locked */ error = VOP_LOOKUP(lvp, uvpp, &cn); - vput(lvp); + VOP_UNLOCK(lvp); if (error) { - *lvpp = NULL; *uvpp = NULL; return error; } uvp = *uvpp; - /* Now lvp is unlocked, try to lock uvp */ - error = vn_lock(uvp, LK_EXCLUSIVE); - if (error) { - *lvpp = NULL; - *uvpp = NULL; - return error; - } - /* If we don't care about the pathname, we're done */ if (bufp == NULL) { - *lvpp = NULL; return 0; } @@ -163,6 +154,14 @@ getcwd_scandir(struct vnode **lvpp, stru dirbuflen = va.va_blocksize; dirbuf = kmem_alloc(dirbuflen, KM_SLEEP); + /* Now lvp is unlocked, try to lock uvp */ + error = vn_lock(uvp, LK_SHARED); + if (error) { + vrele(uvp); + *uvpp = NULL; + return error; + } + #if 0 unionread: #endif @@ -254,73 +253,21 @@ unionread: vput(tvp); vref(uvp); *uvpp = uvp; - vn_lock(uvp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(uvp, LK_SHARED | LK_RETRY); goto unionread; } #endif error = ENOENT; out: - *lvpp = NULL; + VOP_UNLOCK(uvp); kmem_free(dirbuf, dirbuflen); return error; } /* - * Look in the vnode-to-name reverse cache to see if - * we can find things the easy way. - * - * XXX vget failure path is untested. - * - * On entry, *lvpp is a locked vnode reference. - * On exit, one of the following is the case: - * 0) Both *lvpp and *uvpp are NULL and failure is returned. - * 1) *uvpp is NULL, *lvpp remains locked and -1 is returned (cache miss) - * 2) *uvpp is a locked vnode reference, *lvpp is vput and NULL'ed - * and 0 is returned (cache hit) - */ - -static int -getcwd_getcache(struct vnode **lvpp, struct vnode **uvpp, char **bpp, - char *bufp) -{ - struct vnode *lvp, *uvp = NULL; - int error; - - lvp = *lvpp; - - /* - * This returns 0 on a cache hit, -1 on a clean cache miss, - * or an errno on other failure. - */ - error = cache_revlookup(lvp, uvpp, bpp, bufp); - if (error) { - if (error != -1) { - vput(lvp); - *lvpp = NULL; - *uvpp = NULL; - } - return error; - } - uvp = *uvpp; - - /* - * Since we're going up, we have to release the current lock - * before we take the parent lock. - */ - - VOP_UNLOCK(lvp); - vn_lock(uvp, LK_EXCLUSIVE | LK_RETRY); - vrele(lvp); - *lvpp = NULL; - - return error; -} - -/* * common routine shared by sys___getcwd() and vn_isunder() */ - int getcwd_common(struct vnode *lvp, struct vnode *rvp, char **bpp, char *bufp, int limit, int flags, struct lwp *l) @@ -345,11 +292,10 @@ getcwd_common(struct vnode *lvp, struct /* * Error handling invariant: * Before a `goto out': - * lvp is either NULL, or locked and held. - * uvp is either NULL, or locked and held. + * lvp is either NULL, or held. + * uvp is either NULL, or held. 
*/ - vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); if (bufp) bp = *bpp; @@ -369,63 +315,93 @@ getcwd_common(struct vnode *lvp, struct * access check here is optional, depending on * whether or not caller cares. */ - if (flags & GETCWD_CHECK_ACCESS) { - error = VOP_ACCESS(lvp, perms, cred); - if (error) - goto out; - perms = VEXEC|VREAD; - } + int chkaccess = (flags & GETCWD_CHECK_ACCESS); + bool locked = false; /* * step up if we're a covered vnode.. + * check access on the first vnode only. */ - while (lvp->v_vflag & VV_ROOT) { - struct vnode *tvp; + if (lvp->v_vflag & VV_ROOT) { + vn_lock(lvp, LK_SHARED | LK_RETRY); + if (chkaccess) { + error = VOP_ACCESS(lvp, perms, cred); + if (error) { + VOP_UNLOCK(lvp); + goto out; + } + chkaccess = 0; + } + while (lvp->v_vflag & VV_ROOT) { + struct vnode *tvp; - if (lvp == rvp) - goto out; + if (lvp == rvp) { + VOP_UNLOCK(lvp); + goto out; + } - tvp = lvp; - lvp = lvp->v_mount->mnt_vnodecovered; - vput(tvp); - /* - * hodie natus est radici frater - */ - if (lvp == NULL) { - error = ENOENT; - goto out; + tvp = lvp->v_mount->mnt_vnodecovered; + /* + * hodie natus est radici frater + */ + if (tvp == NULL) { + VOP_UNLOCK(lvp); + error = ENOENT; + goto out; + } + vref(tvp); + vput(lvp); + lvp = tvp; + if (lvp->v_vflag & VV_ROOT) + vn_lock(lvp, LK_SHARED | LK_RETRY); } - vref(lvp); - error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); - if (error != 0) { - vrele(lvp); - lvp = NULL; + } + + /* Do we need to check access to the directory? */ + if (chkaccess && !cache_have_id(lvp)) { + /* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */ + vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_ACCESS(lvp, perms, cred); + if (error) { + VOP_UNLOCK(lvp); goto out; } + chkaccess = 0; + locked = true; } + /* * Look in the name cache; if that fails, look in the * directory.. */ - error = getcwd_getcache(&lvp, &uvp, &bp, bufp); + error = cache_revlookup(lvp, &uvp, &bp, bufp, chkaccess, + perms); if (error == -1) { + if (!locked) { + locked = true; + vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); + } if (lvp->v_type != VDIR) { + VOP_UNLOCK(lvp); error = ENOTDIR; goto out; } - error = getcwd_scandir(&lvp, &uvp, &bp, bufp, l); + error = getcwd_scandir(lvp, &uvp, &bp, bufp, l); + /* lvp now unlocked */ + } else if (locked) { + VOP_UNLOCK(lvp); } if (error) goto out; #if DIAGNOSTIC - if (lvp != NULL) - panic("getcwd: oops, forgot to null lvp"); if (bufp && (bp <= bufp)) { panic("getcwd: oops, went back too far"); } #endif + perms = VEXEC | VREAD; if (bp) *(--bp) = '/'; + vrele(lvp); lvp = uvp; uvp = NULL; limit--; @@ -435,9 +411,9 @@ out: if (bpp) *bpp = bp; if (uvp) - vput(uvp); + vrele(uvp); if (lvp) - vput(lvp); + vrele(lvp); vrele(rvp); return error; } @@ -556,11 +532,7 @@ vnode_to_path(char *path, size_t len, st bp = bend = &path[len]; *(--bp) = '\0'; - error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if (error != 0) - return error; - error = cache_revlookup(vp, &dvp, &bp, path); - VOP_UNLOCK(vp); + error = cache_revlookup(vp, &dvp, &bp, path, false, 0); if (error != 0) return (error == -1 ? 
ENOENT : error); Index: src/sys/kern/vfs_lookup.c diff -u src/sys/kern/vfs_lookup.c:1.214 src/sys/kern/vfs_lookup.c:1.212.4.11 --- src/sys/kern/vfs_lookup.c:1.214 Sun Feb 23 22:14:03 2020 +++ src/sys/kern/vfs_lookup.c Tue Mar 3 22:30:57 2020 @@ -50,6 +50,7 @@ __KERNEL_RCSID(0, "$NetBSD$"); #include #include #include +#include #include #include #include @@ -709,8 +710,6 @@ namei_start(struct namei_state *state, i return ENOTDIR; } - vn_lock(startdir, LK_EXCLUSIVE | LK_RETRY); - *startdir_ret = startdir; return 0; } @@ -748,15 +747,17 @@ namei_follow(struct namei_state *state, size_t linklen; int error; - KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); - KASSERT(VOP_ISLOCKED(foundobj) == LK_EXCLUSIVE); if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { return ELOOP; } + + vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) { error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred); - if (error != 0) + if (error != 0) { + VOP_UNLOCK(foundobj); return error; + } } /* FUTURE: fix this to not use a second buffer */ @@ -770,6 +771,7 @@ namei_follow(struct namei_state *state, auio.uio_resid = MAXPATHLEN; UIO_SETUP_SYSSPACE(&auio); error = VOP_READLINK(foundobj, &auio, cnp->cn_cred); + VOP_UNLOCK(foundobj); if (error) { PNBUF_PUT(cp); return error; @@ -806,14 +808,11 @@ namei_follow(struct namei_state *state, /* we're now starting from the beginning of the buffer again */ cnp->cn_nameptr = ndp->ni_pnbuf; - /* must unlock this before relocking searchdir */ - VOP_UNLOCK(foundobj); - /* * Check if root directory should replace current directory. */ if (ndp->ni_pnbuf[0] == '/') { - vput(searchdir); + vrele(searchdir); /* Keep absolute symbolic links inside emulation root */ searchdir = ndp->ni_erootdir; if (searchdir == NULL || @@ -824,7 +823,6 @@ namei_follow(struct namei_state *state, searchdir = ndp->ni_rootdir; } vref(searchdir); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); while (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; @@ -832,7 +830,6 @@ namei_follow(struct namei_state *state, } *newsearchdir_ret = searchdir; - KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); return 0; } @@ -860,7 +857,7 @@ lookup_parsepath(struct namei_state *sta * responsibility for freeing the pathname buffer. * * At this point, our only vnode state is that the search dir - * is held and locked. + * is held. */ cnp->cn_consume = 0; cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr); @@ -917,6 +914,111 @@ lookup_parsepath(struct namei_state *sta } /* + * Take care of crossing a mounted-on vnode. On error, foundobj_ret will be + * vrele'd, but searchdir is left alone. + */ +static int +lookup_crossmount(struct namei_state *state, + struct vnode **searchdir_ret, + struct vnode **foundobj_ret, + bool *searchdir_locked) +{ + struct componentname *cnp = state->cnp; + struct vnode *foundobj; + struct vnode *searchdir; + struct mount *mp; + int error, lktype; + + searchdir = *searchdir_ret; + foundobj = *foundobj_ret; + error = 0; + + KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0); + KASSERT(searchdir != NULL); + + /* First, unlock searchdir (oof). */ + if (*searchdir_locked) { + lktype = VOP_ISLOCKED(searchdir); + VOP_UNLOCK(searchdir); + *searchdir_locked = false; + } else { + lktype = LK_NONE; + } + + /* + * Do an unlocked check to see if the vnode has been mounted on; if + * so find the root of the mounted file system. 
+ */ + while (foundobj->v_type == VDIR && + (mp = foundobj->v_mountedhere) != NULL && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + KASSERTMSG(searchdir != foundobj, "same vn %p", searchdir); + /* + * First get the vnode stable. LK_SHARED works brilliantly + * here because almost nothing else wants to lock the + * covered vnode. + */ + error = vn_lock(foundobj, LK_SHARED); + if (error != 0) { + vrele(foundobj); + *foundobj_ret = NULL; + break; + } + + /* Then check to see if something is still mounted on it. */ + if ((mp = foundobj->v_mountedhere) == NULL) { + VOP_UNLOCK(foundobj); + break; + } + + /* Get a reference to the mountpoint, and ditch foundobj. */ + error = vfs_busy(mp); + vput(foundobj); + if (error != 0) { + *foundobj_ret = NULL; + break; + } + + /* Now get a reference on the root vnode, and drop mount. */ + error = VFS_ROOT(mp, LK_NONE, &foundobj); + vfs_unbusy(mp); + if (error) { + *foundobj_ret = NULL; + break; + } + + /* + * Avoid locking vnodes from two filesystems because + * it's prone to deadlock, e.g. when using puffs. + * Also, it isn't a good idea to propagate slowness of + * a filesystem up to the root directory. For now, + * only handle the common case, where foundobj is + * VDIR. + * + * In this case set searchdir to null to avoid using + * it again. It is not correct to set searchdir == + * foundobj here as that will confuse the caller. + * (See PR 40740.) + */ + if (searchdir == NULL) { + /* already been here once; do nothing further */ + } else if (foundobj->v_type == VDIR) { + vrele(searchdir); + *searchdir_ret = searchdir = NULL; + *foundobj_ret = foundobj; + lktype = LK_NONE; + } + } + + /* If searchdir is still around, re-lock it. */ + if (error == 0 && lktype != LK_NONE) { + vn_lock(searchdir, lktype | LK_RETRY); + *searchdir_locked = true; + } + return error; +} + +/* * Call VOP_LOOKUP for a single lookup; return a new search directory * (used when crossing mountpoints up or searching union mounts down) and * the found object, which for create operations may be NULL on success. @@ -932,19 +1034,19 @@ static int lookup_once(struct namei_state *state, struct vnode *searchdir, struct vnode **newsearchdir_ret, - struct vnode **foundobj_ret) + struct vnode **foundobj_ret, + bool *newsearchdir_locked_ret) { struct vnode *tmpvn; /* scratch vnode */ struct vnode *foundobj; /* result */ - struct mount *mp; /* mount table entry */ struct lwp *l = curlwp; - int error; + bool searchdir_locked = false; + int error, lktype; struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; KASSERT(cnp == &ndp->ni_cnd); - KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); *newsearchdir_ret = searchdir; /* @@ -976,9 +1078,7 @@ lookup_once(struct namei_state *state, if (ndp->ni_rootdir != rootvnode) { int retval; - VOP_UNLOCK(searchdir); retval = vn_isunder(searchdir, ndp->ni_rootdir, l); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); if (!retval) { /* Oops! We got out of jail! */ log(LOG_WARNING, @@ -987,12 +1087,11 @@ lookup_once(struct namei_state *state, p->p_pid, kauth_cred_geteuid(l->l_cred), p->p_comm); /* Put us at the jail root. 
*/ - vput(searchdir); + vrele(searchdir); searchdir = NULL; foundobj = ndp->ni_rootdir; vref(foundobj); vref(foundobj); - vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); *newsearchdir_ret = foundobj; *foundobj_ret = foundobj; error = 0; @@ -1005,18 +1104,35 @@ lookup_once(struct namei_state *state, tmpvn = searchdir; searchdir = searchdir->v_mount->mnt_vnodecovered; vref(searchdir); - vput(tmpvn); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); + vrele(tmpvn); *newsearchdir_ret = searchdir; } } /* + * If the file system supports VOP_LOOKUP() with a shared lock, and + * we are not making any modifications (nameiop LOOKUP) or this is + * not the last component then get a shared lock. Where we can't do + * fast-forwarded lookups (for example with layered file systems) + * then this is the fallback for reducing lock contention. + */ + if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 && + (cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) { + lktype = LK_SHARED; + } else { + lktype = LK_EXCLUSIVE; + } + + /* * We now have a segment name to search for, and a directory to search. - * Our vnode state here is that "searchdir" is held and locked. + * Our vnode state here is that "searchdir" is held. */ unionlookup: foundobj = NULL; + if (!searchdir_locked) { + vn_lock(searchdir, lktype | LK_RETRY); + searchdir_locked = true; + } error = VOP_LOOKUP(searchdir, &foundobj, cnp); if (error != 0) { @@ -1026,6 +1142,23 @@ unionlookup: #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif /* NAMEI_DIAGNOSTIC */ + + /* + * If ENOLCK, the file system needs us to retry the lookup + * with an exclusive lock. It's likely nothing was found in + * cache and/or modifications need to be made. + */ + if (error == ENOLCK) { + KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED); + KASSERT(searchdir_locked); + if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) { + VOP_UNLOCK(searchdir); + searchdir_locked = false; + } + lktype = LK_EXCLUSIVE; + goto unionlookup; + } + if ((error == ENOENT) && (searchdir->v_vflag & VV_ROOT) && (searchdir->v_mount->mnt_flag & MNT_UNION)) { @@ -1033,7 +1166,7 @@ unionlookup: searchdir = searchdir->v_mount->mnt_vnodecovered; vref(searchdir); vput(tmpvn); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); + searchdir_locked = false; *newsearchdir_ret = searchdir; goto unionlookup; } @@ -1087,85 +1220,184 @@ unionlookup: cnp->cn_flags |= ISLASTCN; } - /* - * "searchdir" is locked and held, "foundobj" is held, - * they may be the same vnode. - */ - if (searchdir != foundobj) { - if (cnp->cn_flags & ISDOTDOT) - VOP_UNLOCK(searchdir); - error = vn_lock(foundobj, LK_EXCLUSIVE); - if (cnp->cn_flags & ISDOTDOT) - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); - if (error != 0) { - vrele(foundobj); - goto done; + /* Unlock, unless the caller needs the parent locked. */ + if (searchdir != NULL) { + KASSERT(searchdir_locked); + if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) != + (ISLASTCN | LOCKPARENT)) { + VOP_UNLOCK(searchdir); + searchdir_locked = false; } + } else { + KASSERT(!searchdir_locked); } - /* - * Check to see if the vnode has been mounted on; - * if so find the root of the mounted file system. - */ - KASSERT(searchdir != NULL); - while (foundobj->v_type == VDIR && - (mp = foundobj->v_mountedhere) != NULL && - (cnp->cn_flags & NOCROSSMOUNT) == 0) { + *foundobj_ret = foundobj; + error = 0; +done: + *newsearchdir_locked_ret = searchdir_locked; + return error; +} - KASSERT(searchdir != foundobj); +/* + * Parse out the first path name component that we need to to consider. 
+ * + * While doing this, attempt to use the name cache to fast-forward through + * as many "easy" to find components of the path as possible. + * + * We use the namecache's node locks to form a chain, and avoid as many + * vnode references and locks as possible. In the ideal case, only the + * final vnode will have its reference count adjusted and lock taken. + */ +static int +lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret, + struct vnode **foundobj_ret) +{ + struct componentname *cnp = state->cnp; + struct nameidata *ndp = state->ndp; + krwlock_t *plock; + struct vnode *foundobj, *searchdir; + int error, error2; + size_t oldpathlen; + const char *oldnameptr; - error = vfs_busy(mp); - if (error != 0) { - vput(foundobj); - goto done; + /* + * Eat as many path name components as possible before giving up and + * letting lookup_once() handle it. Remember the starting point in + * case we can't get vnode references and need to roll back. + */ + plock = NULL; + searchdir = *searchdir_ret; + oldnameptr = cnp->cn_nameptr; + oldpathlen = ndp->ni_pathlen; + for (;;) { + foundobj = NULL; + + /* + * Get the next component name. There should be no slashes + * here, and we shouldn't have looped around if we were + * done. + */ + KASSERT(cnp->cn_nameptr[0] != '/'); + KASSERT(cnp->cn_nameptr[0] != '\0'); + if ((error = lookup_parsepath(state)) != 0) { + break; } - if (searchdir != NULL) { - VOP_UNLOCK(searchdir); + + /* + * Can't deal with dotdot lookups, because it means lock + * order reversal, and there are checks in lookup_once() + * that need to be made. Also check for missing mountpoints. + */ + if ((cnp->cn_flags & ISDOTDOT) != 0 || + searchdir->v_mount == NULL) { + error = EOPNOTSUPP; + break; } - vput(foundobj); - error = VFS_ROOT(mp, LK_EXCLUSIVE, &foundobj); - vfs_unbusy(mp); - if (error) { - if (searchdir != NULL) { - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); + + /* + * Can't deal with last component when modifying; this needs + * searchdir locked and VOP_LOOKUP() called (which can and + * does modify state, despite the name). + */ + if ((cnp->cn_flags & ISLASTCN) != 0) { + if (cnp->cn_nameiop != LOOKUP || + (cnp->cn_flags & LOCKPARENT) != 0) { + error = EOPNOTSUPP; + break; } - goto done; } + + /* Can't deal with -o union lookups. */ + if ((searchdir->v_vflag & VV_ROOT) != 0 && + (searchdir->v_mount->mnt_flag & MNT_UNION) != 0) { + error = EOPNOTSUPP; + break; + } + /* - * Avoid locking vnodes from two filesystems because - * it's prone to deadlock, e.g. when using puffs. - * Also, it isn't a good idea to propagate slowness of - * a filesystem up to the root directory. For now, - * only handle the common case, where foundobj is - * VDIR. + * Good, now look for it in cache. cache_lookup_linked() + * will fail if there's nothing there, or if there's no + * ownership info for the directory, or if the user doesn't + * have permission to look up files in this directory. + */ + if (!cache_lookup_linked(searchdir, cnp->cn_nameptr, + cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) { + error = EOPNOTSUPP; + break; + } + KASSERT(plock != NULL && rw_lock_held(plock)); + + /* Scored a hit. Negative is good too (ENOENT). */ + if (foundobj == NULL) { + error = ENOENT; + break; + } + + /* + * Stop and get a hold on the vnode if there's something + * that can't be handled here: * - * In this case set searchdir to null to avoid using - * it again. It is not correct to set searchdir == - * foundobj here as that will confuse the caller. - * (See PR 40740.) 
+ * - we've reached the last component. + * - or encountered a mount point that needs to be crossed. + * - or encountered something other than a directory. */ - if (searchdir == NULL) { - /* already been here once; do nothing further */ - } else if (foundobj->v_type == VDIR) { - vrele(searchdir); - *newsearchdir_ret = searchdir = NULL; + if ((cnp->cn_flags & ISLASTCN) != 0 || + foundobj->v_type != VDIR || + (foundobj->v_type == VDIR && + foundobj->v_mountedhere != NULL)) { + mutex_enter(foundobj->v_interlock); + error = vcache_tryvget(foundobj); + /* v_interlock now released */ + if (error != 0) { + foundobj = NULL; + } + break; + } + + /* + * Otherwise, we're still in business. Set the found VDIR + * vnode as the search dir for the next component and + * continue on to it. + */ + cnp->cn_nameptr = ndp->ni_next; + searchdir = foundobj; + } + + /* + * If we ended up with a new search dir, ref it before dropping the + * namecache's lock. The lock prevents both searchdir and foundobj + * from disappearing. If we can't ref the new searchdir, we have a + * bit of a problem. Roll back the fastforward to the beginning and + * let lookup_once() take care of it. + */ + if (searchdir != *searchdir_ret) { + mutex_enter(searchdir->v_interlock); + error2 = vcache_tryvget(searchdir); + /* v_interlock now unheld */ + KASSERT(plock != NULL); + rw_exit(plock); + if (__predict_true(error2 == 0)) { + /* Returning new searchdir, and maybe new foundobj. */ + vrele(*searchdir_ret); + *searchdir_ret = searchdir; } else { - VOP_UNLOCK(foundobj); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); - vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); + /* Returning nothing. */ + if (foundobj != NULL) { + vrele(foundobj); + foundobj = NULL; + } + cnp->cn_nameptr = oldnameptr; + ndp->ni_pathlen = oldpathlen; + error = lookup_parsepath(state); } + } else if (plock != NULL) { + /* Drop any namecache lock still held. */ + rw_exit(plock); } + KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL); *foundobj_ret = foundobj; - error = 0; -done: - KASSERT(*newsearchdir_ret == NULL || - VOP_ISLOCKED(*newsearchdir_ret) == LK_EXCLUSIVE); - /* - * *foundobj_ret is valid only if error == 0. - */ - KASSERT(error != 0 || *foundobj_ret == NULL || - VOP_ISLOCKED(*foundobj_ret) == LK_EXCLUSIVE); return error; } @@ -1182,6 +1414,7 @@ namei_oneroot(struct namei_state *state, struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct vnode *searchdir, *foundobj; + bool searchdir_locked = false; int error; error = namei_start(state, isnfsd, &searchdir); @@ -1222,44 +1455,47 @@ namei_oneroot(struct namei_state *state, for (;;) { KASSERT(searchdir != NULL); - KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); + KASSERT(!searchdir_locked); /* - * If the directory we're on is unmounted, bail out. - * XXX: should this also check if it's unlinked? - * XXX: yes it should... but how? + * Parse out the first path name component that we need to + * to consider. While doing this, attempt to use the name + * cache to fast-forward through as many "easy" to find + * components of the path as possible. */ - if (searchdir->v_mount == NULL) { - vput(searchdir); - ndp->ni_dvp = NULL; - ndp->ni_vp = NULL; - return (ENOENT); - } + error = lookup_fastforward(state, &searchdir, &foundobj); /* - * Look up the next path component. - * (currently, this may consume more than one) + * If we didn't get a good answer from the namecache, then + * go directly to the file system. 
*/ + if (error != 0 && error != ENOENT) { + error = lookup_once(state, searchdir, &searchdir, + &foundobj, &searchdir_locked); + } - /* There should be no slashes here. */ - KASSERT(cnp->cn_nameptr[0] != '/'); - - /* and we shouldn't have looped around if we were done */ - KASSERT(cnp->cn_nameptr[0] != '\0'); - - error = lookup_parsepath(state); - if (error) { - vput(searchdir); - ndp->ni_dvp = NULL; - ndp->ni_vp = NULL; - state->attempt_retry = 1; - return (error); + /* + * If the vnode we found is mounted on, then cross the mount + * and get the root vnode in foundobj. If this encounters + * an error, it will dispose of foundobj, but searchdir is + * untouched. + */ + if (error == 0 && foundobj != NULL && + foundobj->v_type == VDIR && + foundobj->v_mountedhere != NULL && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + error = lookup_crossmount(state, &searchdir, + &foundobj, &searchdir_locked); } - error = lookup_once(state, searchdir, &searchdir, &foundobj); if (error) { if (searchdir != NULL) { - vput(searchdir); + if (searchdir_locked) { + searchdir_locked = false; + vput(searchdir); + } else { + vrele(searchdir); + } } ndp->ni_dvp = NULL; ndp->ni_vp = NULL; @@ -1296,6 +1532,11 @@ namei_oneroot(struct namei_state *state, * them again. */ if (namei_atsymlink(state, foundobj)) { + /* Don't need searchdir locked any more. */ + if (searchdir_locked) { + searchdir_locked = false; + VOP_UNLOCK(searchdir); + } ndp->ni_pathlen += state->slashes; ndp->ni_next -= state->slashes; if (neverfollow) { @@ -1337,14 +1578,13 @@ namei_oneroot(struct namei_state *state, if (error) { KASSERT(searchdir != foundobj); if (searchdir != NULL) { - vput(searchdir); + vrele(searchdir); } - vput(foundobj); + vrele(foundobj); ndp->ni_dvp = NULL; ndp->ni_vp = NULL; return error; } - /* namei_follow unlocks it (ugh) so rele, not put */ vrele(foundobj); foundobj = NULL; @@ -1375,9 +1615,16 @@ namei_oneroot(struct namei_state *state, (cnp->cn_flags & REQUIREDIR)) { KASSERT(foundobj != searchdir); if (searchdir) { - vput(searchdir); + if (searchdir_locked) { + searchdir_locked = false; + vput(searchdir); + } else { + vrele(searchdir); + } + } else { + KASSERT(!searchdir_locked); } - vput(foundobj); + vrele(foundobj); ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; @@ -1395,15 +1642,21 @@ namei_oneroot(struct namei_state *state, * Continue with the next component. */ cnp->cn_nameptr = ndp->ni_next; - if (searchdir == foundobj) { - vrele(searchdir); - } else if (searchdir != NULL) { - vput(searchdir); + if (searchdir != NULL) { + if (searchdir_locked) { + searchdir_locked = false; + vput(searchdir); + } else { + vrele(searchdir); + } } searchdir = foundobj; foundobj = NULL; } + KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL || + VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); + skiploop: if (foundobj != NULL) { @@ -1416,16 +1669,17 @@ namei_oneroot(struct namei_state *state, * forever. So convert it to the real root. 
*/ if (searchdir != NULL) { - if (searchdir == foundobj) - vrele(searchdir); - else + if (searchdir_locked) { vput(searchdir); + searchdir_locked = false; + } else { + vrele(searchdir); + } searchdir = NULL; } - vput(foundobj); + vrele(foundobj); foundobj = ndp->ni_rootdir; vref(foundobj); - vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); } /* @@ -1438,9 +1692,15 @@ namei_oneroot(struct namei_state *state, (searchdir == NULL || searchdir->v_mount != foundobj->v_mount)) { if (searchdir) { - vput(searchdir); + if (searchdir_locked) { + vput(searchdir); + searchdir_locked = false; + } else { + vrele(searchdir); + } + searchdir = NULL; } - vput(foundobj); + vrele(foundobj); foundobj = NULL; ndp->ni_dvp = NULL; ndp->ni_vp = NULL; @@ -1465,21 +1725,25 @@ namei_oneroot(struct namei_state *state, if (state->rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { if (searchdir) { - if (foundobj != searchdir) { + if (searchdir_locked) { vput(searchdir); + searchdir_locked = false; } else { vrele(searchdir); } searchdir = NULL; } - vput(foundobj); + vrele(foundobj); foundobj = NULL; ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; return EROFS; } - if ((cnp->cn_flags & LOCKLEAF) == 0) { + + /* Lock the leaf node if requested. */ + if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT && + searchdir == foundobj) { /* * Note: if LOCKPARENT but not LOCKLEAF is * set, and searchdir == foundobj, this code @@ -1491,7 +1755,15 @@ namei_oneroot(struct namei_state *state, * that uses this combination "knows" this, so * it can't be safely changed. Feh. XXX */ - VOP_UNLOCK(foundobj); + KASSERT(searchdir_locked); + VOP_UNLOCK(searchdir); + searchdir_locked = false; + } else if ((cnp->cn_flags & LOCKLEAF) != 0 && + (searchdir != foundobj || + (cnp->cn_flags & LOCKPARENT) == 0)) { + const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ? + LK_SHARED : LK_EXCLUSIVE; + vn_lock(foundobj, lktype | LK_RETRY); } } @@ -1503,11 +1775,7 @@ namei_oneroot(struct namei_state *state, * If LOCKPARENT is not set, the parent directory isn't returned. */ if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) { - if (searchdir == foundobj) { - vrele(searchdir); - } else { - vput(searchdir); - } + vrele(searchdir); searchdir = NULL; } @@ -1649,6 +1917,7 @@ do_lookup_for_nfsd_index(struct namei_st struct nameidata *ndp = state->ndp; struct vnode *startdir; struct vnode *foundobj; + bool startdir_locked; const char *cp; /* pointer into pathname argument */ KASSERT(cnp == &ndp->ni_cnd); @@ -1681,30 +1950,37 @@ do_lookup_for_nfsd_index(struct namei_st * own reference to it to avoid consuming the caller's. */ vref(startdir); - vn_lock(startdir, LK_EXCLUSIVE | LK_RETRY); - error = lookup_once(state, startdir, &startdir, &foundobj); - if (error == 0 && startdir == foundobj) { - vrele(startdir); - } else if (startdir != NULL) { - vput(startdir); - } - if (error) { - goto bad; - } - ndp->ni_vp = foundobj; + error = lookup_once(state, startdir, &startdir, &foundobj, + &startdir_locked); - if (foundobj == NULL) { - return 0; + KASSERT((cnp->cn_flags & LOCKPARENT) == 0); + if (startdir_locked) { + VOP_UNLOCK(startdir); + startdir_locked = false; } - KASSERT((cnp->cn_flags & LOCKPARENT) == 0); - if ((cnp->cn_flags & LOCKLEAF) == 0) { - VOP_UNLOCK(foundobj); + /* + * If the vnode we found is mounted on, then cross the mount and get + * the root vnode in foundobj. If this encounters an error, it will + * dispose of foundobj, but searchdir is untouched. 
+ */ + if (error == 0 && foundobj != NULL && + foundobj->v_type == VDIR && + foundobj->v_mountedhere != NULL && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + error = lookup_crossmount(state, &startdir, &foundobj, + &startdir_locked); } - return (0); -bad: - ndp->ni_vp = NULL; + /* Now toss startdir and see if we have an error. */ + if (startdir != NULL) + vrele(startdir); + if (error) + foundobj = NULL; + else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0) + vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); + + ndp->ni_vp = foundobj; return (error); } Index: src/sys/kern/vfs_syscalls.c diff -u src/sys/kern/vfs_syscalls.c:1.542 src/sys/kern/vfs_syscalls.c:1.539.2.4 --- src/sys/kern/vfs_syscalls.c:1.542 Sun Feb 23 22:14:04 2020 +++ src/sys/kern/vfs_syscalls.c Sat Feb 29 20:21:03 2020 @@ -1528,7 +1528,7 @@ chdir_lookup(const char *path, int where if (error) { return error; } - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb); if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); return error; @@ -2994,7 +2994,7 @@ do_sys_accessat(struct lwp *l, int fdat, return EINVAL; } - nd_flag = FOLLOW | LOCKLEAF | TRYEMULROOT; + nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT; if (flags & AT_SYMLINK_NOFOLLOW) nd_flag &= ~FOLLOW; @@ -3220,7 +3220,7 @@ do_sys_readlinkat(struct lwp *l, int fda if (error) { return error; } - NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb); + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb); if ((error = fd_nameiat(l, fdat, &nd)) != 0) { pathbuf_destroy(pb); return error; @@ -4691,7 +4691,7 @@ dorevoke(struct vnode *vp, kauth_cred_t struct vattr vattr; int error, fs_decision; - vn_lock(vp, LK_SHARED | LK_RETRY); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); if (error != 0) Index: src/sys/kern/vfs_vnode.c diff -u src/sys/kern/vfs_vnode.c:1.113 src/sys/kern/vfs_vnode.c:1.105.2.9 --- src/sys/kern/vfs_vnode.c:1.113 Thu Feb 27 22:12:54 2020 +++ src/sys/kern/vfs_vnode.c Sat Feb 29 20:21:03 2020 @@ -119,8 +119,7 @@ * Vnode finished disassociation from underlying file * system in vcache_reclaim(). * LOADED -> BLOCKED - * Either vcache_rekey*() is changing the vnode key or - * vrelel() is about to call VOP_INACTIVE(). + * vcache_rekey*() is changing the vnode key. * BLOCKED -> LOADED * The block condition is over. * LOADING -> RECLAIMED @@ -828,25 +827,23 @@ vrelel(vnode_t *vp, int flags, int lktyp if (VSTATE_GET(vp) == VS_RECLAIMED) { VOP_UNLOCK(vp); } else { - VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); - mutex_exit(vp->v_interlock); - /* - * The vnode must not gain another reference while being - * deactivated. If VOP_INACTIVE() indicates that - * the described file has been deleted, then recycle - * the vnode. + * If VOP_INACTIVE() indicates that the described file has + * been deleted, then recycle the vnode. Note that + * VOP_INACTIVE() will not drop the vnode lock. * - * Note that VOP_INACTIVE() will not drop the vnode lock. + * If the file has been deleted, this is a lingering + * reference and there is no need to worry about new + * references looking to do real work with the vnode (as it + * will have been purged from directories, caches, etc). 
*/ recycle = false; + mutex_exit(vp->v_interlock); VOP_INACTIVE(vp, &recycle); - if (!recycle) - VOP_UNLOCK(vp); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); mutex_enter(vp->v_interlock); - VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); if (!recycle) { + VOP_UNLOCK(vp); if (vtryrele(vp)) { mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); @@ -1228,12 +1225,9 @@ vcache_alloc(void) rw_init(&vip->vi_lock); vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); - /* SLIST_INIT(&vip->vi_hash); */ - TAILQ_INIT(&vip->vi_nclist); - /* LIST_INIT(&vip->vi_dnclist); */ - uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1); cv_init(&vp->v_cv, "vnode"); + cache_vnode_init(vp); vp->v_usecount = 1; vp->v_type = VNON; @@ -1294,6 +1288,7 @@ vcache_free(vnode_impl_t *vip) rw_destroy(&vip->vi_lock); uvm_obj_destroy(&vp->v_uobj, true); cv_destroy(&vp->v_cv); + cache_vnode_fini(vp); pool_cache_put(vcache_pool, vip); } @@ -1681,6 +1676,13 @@ vcache_reclaim(vnode_t *vp) mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); + /* + * With vnode state set to reclaiming, purge name cache immediately + * to prevent new handles on vnode, and wait for existing threads + * trying to get a handle to notice VS_RECLAIMED status and abort. + */ + cache_purge(vp); + /* Replace the vnode key with a temporary copy. */ if (vip->vi_key.vk_key_len > sizeof(temp_buf)) { temp_key = kmem_alloc(temp_key_len, KM_SLEEP); @@ -1734,9 +1736,6 @@ vcache_reclaim(vnode_t *vp) vp->v_ractx = NULL; } - /* Purge name cache. */ - cache_purge(vp); - if (vip->vi_key.vk_key_len > 0) { /* Remove from vnode cache. */ hash = vcache_hash(&vip->vi_key); Index: src/sys/miscfs/genfs/layer_vnops.c diff -u src/sys/miscfs/genfs/layer_vnops.c:1.68 src/sys/miscfs/genfs/layer_vnops.c:1.67.12.2 --- src/sys/miscfs/genfs/layer_vnops.c:1.68 Sun Feb 23 15:46:41 2020 +++ src/sys/miscfs/genfs/layer_vnops.c Sat Feb 29 20:21:04 2020 @@ -384,6 +384,7 @@ layer_lookup(void *v) vrele(lvp); } else if (lvp != NULL) { /* Note: dvp and ldvp are both locked. */ + KASSERT(error != ENOLCK); error = layer_node_create(dvp->v_mount, lvp, ap->a_vpp); if (error) { vrele(lvp); Index: src/sys/miscfs/nullfs/null_vfsops.c diff -u src/sys/miscfs/nullfs/null_vfsops.c:1.96 src/sys/miscfs/nullfs/null_vfsops.c:1.96.2.2 --- src/sys/miscfs/nullfs/null_vfsops.c:1.96 Sun Dec 15 20:30:56 2019 +++ src/sys/miscfs/nullfs/null_vfsops.c Wed Jan 22 12:04:36 2020 @@ -141,6 +141,7 @@ nullfs_mount(struct mount *mp, const cha nmp = kmem_zalloc(sizeof(struct null_mount), KM_SLEEP); mp->mnt_data = nmp; mp->mnt_iflag |= IMNT_MPSAFE; + mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_SHRLOOKUP; /* * Make sure that the mount point is sufficiently initialized Index: src/sys/miscfs/procfs/procfs_vfsops.c diff -u src/sys/miscfs/procfs/procfs_vfsops.c:1.102 src/sys/miscfs/procfs/procfs_vfsops.c:1.101.6.2 --- src/sys/miscfs/procfs/procfs_vfsops.c:1.102 Fri Jan 17 20:08:09 2020 +++ src/sys/miscfs/procfs/procfs_vfsops.c Sun Jan 19 21:21:55 2020 @@ -173,7 +173,7 @@ procfs_mount( else pmnt->pmnt_flags = 0; - mp->mnt_iflag |= IMNT_MPSAFE; + mp->mnt_iflag |= IMNT_MPSAFE | IMNT_SHRLOOKUP; return error; } Index: src/sys/sys/namei.src diff -u src/sys/sys/namei.src:1.48 src/sys/sys/namei.src:1.47.2.7 --- src/sys/sys/namei.src:1.48 Wed Jan 8 12:04:56 2020 +++ src/sys/sys/namei.src Wed Mar 4 20:21:05 2020 @@ -39,6 +39,7 @@ #ifdef _KERNEL #include +#include /* * Abstraction for a single pathname. 
@@ -151,13 +152,14 @@ NAMEIFL NOFOLLOW 0x00000000 /* do not fo (pseudo) */ NAMEIFL EMULROOTSET 0x00000080 /* emulation root already in ni_erootdir */ +NAMEIFL LOCKSHARED 0x00000100 /* want shared locks if possible */ NAMEIFL NOCHROOT 0x01000000 /* no chroot on abs path lookups */ -NAMEIFL MODMASK 0x010000fc /* mask of operational modifiers */ +NAMEIFL MODMASK 0x010001fc /* mask of operational modifiers */ /* * Namei parameter descriptors. */ -NAMEIFL NOCROSSMOUNT 0x0000100 /* do not cross mount points */ -NAMEIFL RDONLY 0x0000200 /* lookup with read-only semantics */ +NAMEIFL NOCROSSMOUNT 0x0000800 /* do not cross mount points */ +NAMEIFL RDONLY 0x0001000 /* lookup with read-only semantics */ NAMEIFL ISDOTDOT 0x0002000 /* current component name is .. */ NAMEIFL MAKEENTRY 0x0004000 /* entry is to be added to name cache */ NAMEIFL ISLASTCN 0x0008000 /* this is last component of pathname */ @@ -165,7 +167,7 @@ NAMEIFL ISWHITEOUT 0x0020000 /* found wh NAMEIFL DOWHITEOUT 0x0040000 /* do whiteouts */ NAMEIFL REQUIREDIR 0x0080000 /* must be a directory */ NAMEIFL CREATEDIR 0x0200000 /* trailing slashes are ok */ -NAMEIFL PARAMASK 0x02ee300 /* mask of parameter descriptors */ +NAMEIFL PARAMASK 0x02ef800 /* mask of parameter descriptors */ /* * Initialization of a nameidata structure. @@ -188,42 +190,42 @@ NAMEIFL PARAMASK 0x02ee300 /* mask of pa #endif #ifdef __NAMECACHE_PRIVATE +#include + /* * For simplicity (and economy of storage), names longer than * a maximum length of NCHNAMLEN are stored in non-pooled storage. */ -#define NCHNAMLEN 32 /* up to this size gets stored in pool */ +#define NCHNAMLEN sizeof(((struct namecache *)NULL)->nc_name) /* * Namecache entry. - * This structure describes the elements in the cache of recent - * names looked up by namei. * - * Locking rules: + * This structure describes the elements in the cache of recent names looked + * up by namei. It's carefully sized to take up 128 bytes on _LP64, to make + * good use of space and the CPU caches. 
+ * + * Field markings and their corresponding locks: * - * - stable after initialization - * L namecache_lock - * C struct nchcpu::cpu_lock - * L/C insert needs L, read needs L or any C, - * must hold L and all C after (or during) delete before free - * N struct namecache::nc_lock + * - stable throught the lifetime of the namecache entry + * d protected by nc_dvp->vi_ncdlock + * v protected by nc_dvp->vi_ncvlock + * l protected by cache_lru_lock + * u accesses are unlocked, no serialization applied */ +struct nchnode; struct namecache { - LIST_ENTRY(namecache) nc_hash; /* L/C hash chain */ - TAILQ_ENTRY(namecache) nc_lru; /* L pseudo-lru chain */ - LIST_ENTRY(namecache) nc_dvlist;/* L dvp's list of cache entries */ - TAILQ_ENTRY(namecache) nc_vlist;/* L vp's list of cache entries */ - struct vnode *nc_dvp; /* N vnode of parent of name */ - struct vnode *nc_vp; /* N vnode the name refers to */ - void *nc_gcqueue; /* N queue for garbage collection */ - kmutex_t nc_lock; /* lock on this entry */ - int nc_hittime; /* N last time scored a hit */ - int nc_flags; /* - copy of componentname ISWHITEOUT */ - u_short nc_nlen; /* - length of name */ - char nc_name[0]; /* - segment name */ + struct rb_node nc_tree; /* d red-black tree, must be first */ + TAILQ_ENTRY(namecache) nc_list; /* v vp's list of cache entries */ + TAILQ_ENTRY(namecache) nc_lru; /* l pseudo-lru chain */ + struct vnode *nc_dvp; /* - vnode of parent of name */ + struct vnode *nc_vp; /* - vnode the name refers to */ + int64_t nc_key; /* - hash key */ + int nc_lrulist; /* l which LRU list its on */ + short nc_nlen; /* - length of the name */ + char nc_whiteout; /* - true if a whiteout */ + char nc_name[41]; /* - segment name */ }; -__CTASSERT((sizeof(struct namecache) + NCHNAMLEN) - % __alignof(struct namecache) == 0); #endif #ifdef _KERNEL @@ -286,14 +288,22 @@ bool cache_lookup(struct vnode *, const int *, struct vnode **); bool cache_lookup_raw(struct vnode *, const char *, size_t, uint32_t, int *, struct vnode **); -int cache_revlookup(struct vnode *, struct vnode **, char **, char *); +bool cache_lookup_linked(struct vnode *, const char *, size_t, + struct vnode **, krwlock_t **, kauth_cred_t); +int cache_revlookup(struct vnode *, struct vnode **, char **, char *, + bool, int); +int cache_diraccess(struct vnode *, int); void cache_enter(struct vnode *, struct vnode *, const char *, size_t, uint32_t); +void cache_enter_id(struct vnode *, mode_t, uid_t, gid_t); +bool cache_have_id(struct vnode *); +void cache_vnode_init(struct vnode * ); +void cache_vnode_fini(struct vnode * ); +void cache_cpu_init(struct cpu_info *); + void nchinit(void); -void nchreinit(void); void namecache_count_pass2(void); void namecache_count_2passes(void); -void cache_cpu_init(struct cpu_info *); void cache_purgevfs(struct mount *); void namecache_print(struct vnode *, void (*)(const char *, ...) 
__printflike(1, 2)); @@ -318,6 +328,8 @@ void namecache_print(struct vnode *, voi type ncs_2passes; /* number of times we attempt it (U) */ \ type ncs_revhits; /* reverse-cache hits */ \ type ncs_revmiss; /* reverse-cache misses */ \ + type ncs_collisions; /* hash value collisions */ \ + type ncs_denied; /* access denied */ \ } /* Index: src/sys/sys/vnode_impl.h diff -u src/sys/sys/vnode_impl.h:1.21 src/sys/sys/vnode_impl.h:1.19.2.5 --- src/sys/sys/vnode_impl.h:1.21 Sun Feb 23 22:14:04 2020 +++ src/sys/sys/vnode_impl.h Fri Jan 24 16:05:23 2020 @@ -63,7 +63,8 @@ struct vcache_key { * i v_interlock * l vi_nc_listlock * m mnt_vnodelock - * n namecache_lock + * n vi_nc_lock + * n,l vi_nc_lock + vi_nc_listlock to modify * s syncer_data_lock */ struct vnode_impl { @@ -76,11 +77,15 @@ struct vnode_impl { /* * Namecache. Give it a separate line so activity doesn't impinge - * on the stable stuff (pending merge of ad-namecache branch). + * on the stable stuff. */ - LIST_HEAD(, namecache) vi_dnclist /* n: namecaches (children) */ + rb_tree_t vi_nc_tree /* n namecache tree */ __aligned(COHERENCY_UNIT); - TAILQ_HEAD(, namecache) vi_nclist; /* n: namecaches (parent) */ + TAILQ_HEAD(,namecache) vi_nc_list; /* l namecaches (parent) */ + mode_t vi_nc_mode; /* n,l cached mode or VNOVAL */ + uid_t vi_nc_uid; /* n,l cached UID or VNOVAL */ + gid_t vi_nc_gid; /* n,l cached GID or VNOVAL */ + uint32_t vi_nc_spare; /* - spare (padding) */ /* * vnode cache, LRU and syncer. This all changes with some Index: src/sys/ufs/chfs/chfs_vnops.c diff -u src/sys/ufs/chfs/chfs_vnops.c:1.36 src/sys/ufs/chfs/chfs_vnops.c:1.34.4.3 --- src/sys/ufs/chfs/chfs_vnops.c:1.36 Sun Feb 23 15:46:42 2020 +++ src/sys/ufs/chfs/chfs_vnops.c Sat Feb 29 20:21:10 2020 @@ -90,6 +90,10 @@ chfs_lookup(void *v) return (*vpp == NULLVP ? ENOENT : 0); } + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE) + return ENOLCK; + ip = VTOI(dvp); ump = VFSTOUFS(dvp->v_mount); chmp = ump->um_chfs; Index: src/sys/ufs/ext2fs/ext2fs_lookup.c diff -u src/sys/ufs/ext2fs/ext2fs_lookup.c:1.88 src/sys/ufs/ext2fs/ext2fs_lookup.c:1.88.22.1 --- src/sys/ufs/ext2fs/ext2fs_lookup.c:1.88 Tue Aug 23 06:40:25 2016 +++ src/sys/ufs/ext2fs/ext2fs_lookup.c Sun Jan 19 21:21:55 2020 @@ -313,14 +313,6 @@ ext2fs_lookup(void *v) *vpp = NULL; /* - * Produce the auxiliary lookup results into i_crap. Increment - * its serial number so elsewhere we can tell if we're using - * stale results. This should not be done this way. XXX. - */ - results = &dp->i_crap; - dp->i_crapcounter++; - - /* * Check accessiblity of directory. */ if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) @@ -342,6 +334,18 @@ ext2fs_lookup(void *v) return *vpp == NULLVP ? ENOENT : 0; } + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) + return ENOLCK; + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. 
+ */ + results = &dp->i_crap; + dp->i_crapcounter++; + /* * Suppress search for slots unless creating * file and at end of pathname, in which case Index: src/sys/ufs/ext2fs/ext2fs_vfsops.c diff -u src/sys/ufs/ext2fs/ext2fs_vfsops.c:1.216 src/sys/ufs/ext2fs/ext2fs_vfsops.c:1.214.4.3 --- src/sys/ufs/ext2fs/ext2fs_vfsops.c:1.216 Thu Feb 27 22:12:54 2020 +++ src/sys/ufs/ext2fs/ext2fs_vfsops.c Sat Feb 29 20:21:10 2020 @@ -736,7 +736,7 @@ ext2fs_mountfs(struct vnode *devvp, stru mp->mnt_flag |= MNT_LOCAL; mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ mp->mnt_fs_bshift = m_fs->e2fs_bshift; - mp->mnt_iflag |= IMNT_DTYPE; + mp->mnt_iflag |= IMNT_DTYPE | IMNT_SHRLOOKUP; ump->um_flags = 0; ump->um_mountp = mp; ump->um_dev = dev; Index: src/sys/ufs/ffs/ffs_vfsops.c diff -u src/sys/ufs/ffs/ffs_vfsops.c:1.365 src/sys/ufs/ffs/ffs_vfsops.c:1.362.4.5 --- src/sys/ufs/ffs/ffs_vfsops.c:1.365 Thu Feb 27 22:12:54 2020 +++ src/sys/ufs/ffs/ffs_vfsops.c Sat Feb 29 20:21:11 2020 @@ -1453,7 +1453,8 @@ ffs_mountfs(struct vnode *devvp, struct mp->mnt_fs_bshift = fs->fs_bshift; mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ mp->mnt_flag |= MNT_LOCAL; - mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO; + mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO | IMNT_SHRLOOKUP | + IMNT_NCLOOKUP; #ifdef FFS_EI if (needswap) ump->um_flags |= UFS_NEEDSWAP; @@ -2082,6 +2083,7 @@ ffs_loadvnode(struct mount *mp, struct v ip->i_gid = ip->i_ffs1_ogid; /* XXX */ } /* XXX */ uvm_vnp_setsize(vp, ip->i_size); + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); *new_key = &ip->i_number; return 0; } @@ -2203,6 +2205,7 @@ ffs_newvnode(struct mount *mp, struct vn } uvm_vnp_setsize(vp, ip->i_size); + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); *new_key = &ip->i_number; return 0; } Index: src/sys/ufs/lfs/lfs_vfsops.c diff -u src/sys/ufs/lfs/lfs_vfsops.c:1.374 src/sys/ufs/lfs/lfs_vfsops.c:1.367.2.3 --- src/sys/ufs/lfs/lfs_vfsops.c:1.374 Sun Feb 23 15:46:42 2020 +++ src/sys/ufs/lfs/lfs_vfsops.c Sat Feb 29 20:21:11 2020 @@ -1130,6 +1130,7 @@ lfs_mountfs(struct vnode *devvp, struct mp->mnt_stat.f_namemax = LFS_MAXNAMLEN; mp->mnt_stat.f_iosize = lfs_sb_getbsize(fs); mp->mnt_flag |= MNT_LOCAL; + mp->mnt_iflag |= IMNT_SHRLOOKUP; mp->mnt_fs_bshift = lfs_sb_getbshift(fs); mp->mnt_iflag |= IMNT_CAN_RWTORO; if (fs->um_maxsymlinklen > 0) Index: src/sys/ufs/lfs/ulfs_lookup.c diff -u src/sys/ufs/lfs/ulfs_lookup.c:1.41 src/sys/ufs/lfs/ulfs_lookup.c:1.41.12.1 --- src/sys/ufs/lfs/ulfs_lookup.c:1.41 Sat Jun 10 05:29:36 2017 +++ src/sys/ufs/lfs/ulfs_lookup.c Sun Jan 19 21:21:55 2020 @@ -162,14 +162,6 @@ ulfs_lookup(void *v) endsearch = 0; /* silence compiler warning */ /* - * Produce the auxiliary lookup results into i_crap. Increment - * its serial number so elsewhere we can tell if we're using - * stale results. This should not be done this way. XXX. - */ - results = &dp->i_crap; - dp->i_crapcounter++; - - /* * Check accessiblity of directory. */ if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) @@ -193,6 +185,19 @@ ulfs_lookup(void *v) } return *vpp == NULLVP ? ENOENT : 0; } + + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) + return ENOLCK; + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. 
+ */ + results = &dp->i_crap; + dp->i_crapcounter++; + if (iswhiteout) { /* * The namecache set iswhiteout without finding a Index: src/sys/ufs/ufs/ufs_lookup.c diff -u src/sys/ufs/ufs/ufs_lookup.c:1.150 src/sys/ufs/ufs/ufs_lookup.c:1.150.4.1 --- src/sys/ufs/ufs/ufs_lookup.c:1.150 Sun May 5 15:07:12 2019 +++ src/sys/ufs/ufs/ufs_lookup.c Sun Jan 19 21:21:55 2020 @@ -330,14 +330,6 @@ ufs_lookup(void *v) endsearch = 0; /* silence compiler warning */ /* - * Produce the auxiliary lookup results into i_crap. Increment - * its serial number so elsewhere we can tell if we're using - * stale results. This should not be done this way. XXX. - */ - results = &dp->i_crap; - dp->i_crapcounter++; - - /* * Check accessiblity of directory. */ if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) @@ -361,6 +353,20 @@ ufs_lookup(void *v) } return *vpp == NULLVP ? ENOENT : 0; } + + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) { + return ENOLCK; + } + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. + */ + results = &dp->i_crap; + dp->i_crapcounter++; + if (iswhiteout) { /* * The namecache set iswhiteout without finding a Index: src/sys/ufs/ufs/ufs_vnops.c diff -u src/sys/ufs/ufs/ufs_vnops.c:1.249 src/sys/ufs/ufs/ufs_vnops.c:1.248.2.3 --- src/sys/ufs/ufs/ufs_vnops.c:1.249 Wed Feb 26 18:00:12 2020 +++ src/sys/ufs/ufs/ufs_vnops.c Sat Feb 29 20:21:11 2020 @@ -1,7 +1,7 @@ /* $NetBSD$ */ /*- - * Copyright (c) 2008 The NetBSD Foundation, Inc. + * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -621,6 +621,7 @@ ufs_setattr(void *v) } VN_KNOTE(vp, NOTE_ATTRIB); out: + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); return (error); } @@ -648,6 +649,7 @@ ufs_chmod(struct vnode *vp, int mode, ka ip->i_flag |= IN_CHANGE; DIP_ASSIGN(ip, mode, ip->i_mode); UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); return (0); } @@ -708,6 +710,7 @@ ufs_chown(struct vnode *vp, uid_t uid, g #endif /* QUOTA || QUOTA2 */ ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); return (0); }