Index: src/sys/fs/cd9660/cd9660_lookup.c diff -u src/sys/fs/cd9660/cd9660_lookup.c:1.30 src/sys/fs/cd9660/cd9660_lookup.c:1.30.24.1 --- src/sys/fs/cd9660/cd9660_lookup.c:1.30 Sat Mar 28 19:24:05 2015 +++ src/sys/fs/cd9660/cd9660_lookup.c Sun Jan 19 21:21:54 2020 @@ -152,6 +152,9 @@ cd9660_lookup(void *v) cnp->cn_nameiop, cnp->cn_flags, NULL, vpp)) { return *vpp == NULLVP ? ENOENT : 0; } + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) + return ENOLCK; len = cnp->cn_namelen; name = cnp->cn_nameptr; Index: src/sys/fs/cd9660/cd9660_vfsops.c diff -u src/sys/fs/cd9660/cd9660_vfsops.c:1.94 src/sys/fs/cd9660/cd9660_vfsops.c:1.93.18.2 --- src/sys/fs/cd9660/cd9660_vfsops.c:1.94 Fri Jan 17 20:08:07 2020 +++ src/sys/fs/cd9660/cd9660_vfsops.c Sun Jan 19 21:21:54 2020 @@ -444,7 +444,7 @@ iso_mountfs(struct vnode *devvp, struct mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = ISO_MAXNAMLEN; mp->mnt_flag |= MNT_LOCAL; - mp->mnt_iflag |= IMNT_MPSAFE; + mp->mnt_iflag |= IMNT_MPSAFE | IMNT_SHRLOOKUP; mp->mnt_dev_bshift = iso_bsize; mp->mnt_fs_bshift = isomp->im_bshift; isomp->im_mountp = mp; Index: src/sys/fs/msdosfs/msdosfs_lookup.c diff -u src/sys/fs/msdosfs/msdosfs_lookup.c:1.35 src/sys/fs/msdosfs/msdosfs_lookup.c:1.35.24.1 --- src/sys/fs/msdosfs/msdosfs_lookup.c:1.35 Sat Jan 30 09:59:27 2016 +++ src/sys/fs/msdosfs/msdosfs_lookup.c Sun Jan 19 21:21:54 2020 @@ -161,6 +161,10 @@ msdosfs_lookup(void *v) return *vpp == NULLVP ? ENOENT: 0; } + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) + return ENOLCK; + /* * If they are going after the . or .. entry in the root directory, * they won't find it. DOS filesystems don't have them in the root Index: src/sys/fs/msdosfs/msdosfs_vfsops.c diff -u src/sys/fs/msdosfs/msdosfs_vfsops.c:1.132 src/sys/fs/msdosfs/msdosfs_vfsops.c:1.130.6.3 --- src/sys/fs/msdosfs/msdosfs_vfsops.c:1.132 Thu Feb 27 22:12:53 2020 +++ src/sys/fs/msdosfs/msdosfs_vfsops.c Sat Feb 29 20:21:01 2020 @@ -867,6 +867,7 @@ msdosfs_mountfs(struct vnode *devvp, str mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; mp->mnt_stat.f_namemax = MSDOSFS_NAMEMAX(pmp); mp->mnt_flag |= MNT_LOCAL; + mp->mnt_iflag |= IMNT_SHRLOOKUP; mp->mnt_dev_bshift = pmp->pm_bnshift; mp->mnt_fs_bshift = pmp->pm_cnshift; Index: src/sys/fs/tmpfs/tmpfs_subr.c diff -u src/sys/fs/tmpfs/tmpfs_subr.c:1.106 src/sys/fs/tmpfs/tmpfs_subr.c:1.105.2.3 --- src/sys/fs/tmpfs/tmpfs_subr.c:1.106 Sun Feb 23 15:46:40 2020 +++ src/sys/fs/tmpfs/tmpfs_subr.c Sat Feb 29 20:21:02 2020 @@ -147,6 +147,8 @@ tmpfs_init_vnode(struct vnode *vp, tmpfs vp->v_data = node; node->tn_vnode = vp; uvm_vnp_setsize(vp, node->tn_size); + KASSERT(node->tn_mode != VNOVAL); + cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid); } /* @@ -1035,6 +1037,7 @@ tmpfs_chmod(vnode_t *vp, mode_t mode, ka node->tn_mode = (mode & ALLPERMS); tmpfs_update(vp, TMPFS_UPDATE_CTIME); VN_KNOTE(vp, NOTE_ATTRIB); + cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid); return 0; } @@ -1079,6 +1082,7 @@ tmpfs_chown(vnode_t *vp, uid_t uid, gid_ node->tn_gid = gid; tmpfs_update(vp, TMPFS_UPDATE_CTIME); VN_KNOTE(vp, NOTE_ATTRIB); + cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid); return 0; } Index: src/sys/fs/tmpfs/tmpfs_vfsops.c diff -u src/sys/fs/tmpfs/tmpfs_vfsops.c:1.76 src/sys/fs/tmpfs/tmpfs_vfsops.c:1.75.2.3 --- src/sys/fs/tmpfs/tmpfs_vfsops.c:1.76 Fri Jan 17 20:08:08 2020 +++ src/sys/fs/tmpfs/tmpfs_vfsops.c 
Fri Jan 24 16:48:58 2020 @@ -182,7 +182,8 @@ tmpfs_mount(struct mount *mp, const char mp->mnt_stat.f_namemax = TMPFS_MAXNAMLEN; mp->mnt_fs_bshift = PAGE_SHIFT; mp->mnt_dev_bshift = DEV_BSHIFT; - mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO; + mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO | IMNT_SHRLOOKUP | + IMNT_NCLOOKUP; vfs_getnewfsid(mp); /* Allocate the tmpfs mount structure and fill it. */ Index: src/sys/kern/exec_script.c diff -u src/sys/kern/exec_script.c:1.80 src/sys/kern/exec_script.c:1.80.2.1 --- src/sys/kern/exec_script.c:1.80 Sun Sep 15 20:21:12 2019 +++ src/sys/kern/exec_script.c Fri Jan 17 21:53:01 2020 @@ -216,7 +216,7 @@ check_shell: * close all open fd's when the start. That kills this * method of implementing "safe" set-id and x-only scripts. */ - vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(epp->ep_vp, LK_SHARED | LK_RETRY); error = VOP_ACCESS(epp->ep_vp, VREAD, l->l_cred); VOP_UNLOCK(epp->ep_vp); if (error == EACCES Index: src/sys/kern/init_sysctl.c diff -u src/sys/kern/init_sysctl.c:1.224 src/sys/kern/init_sysctl.c:1.223.2.2 --- src/sys/kern/init_sysctl.c:1.224 Sat Jan 18 14:40:03 2020 +++ src/sys/kern/init_sysctl.c Sat Jan 25 22:38:50 2020 @@ -732,7 +732,6 @@ sysctl_kern_maxvnodes(SYSCTLFN_ARGS) return (error); } vfs_reinit(); - nchreinit(); return (0); } Index: src/sys/kern/vfs_cache.c diff -u src/sys/kern/vfs_cache.c:1.127 src/sys/kern/vfs_cache.c:1.126.2.12 --- src/sys/kern/vfs_cache.c:1.127 Wed Jan 8 12:04:56 2020 +++ src/sys/kern/vfs_cache.c Sun Feb 16 22:00:53 2020 @@ -1,9 +1,12 @@ /* $NetBSD$ */ /*- - * Copyright (c) 2008 The NetBSD Foundation, Inc. + * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. * + * This code is derived from software contributed to The NetBSD Foundation + * by Andrew Doran. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -57,6 +60,116 @@ * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94 */ +/* + * Name caching: + * + * Names found by directory scans are retained in a cache for future + * reference. It is managed LRU, so frequently used names will hang + * around. The cache is indexed by hash value obtained from the name. + * + * The name cache is the brainchild of Robert Elz and was introduced in + * 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk + * McKusick, May 21 1984. + * + * Data structures: + * + * Most Unix namecaches very sensibly use a global hash table to index + * names. The global hash table works well, but can cause concurrency + * headaches for the kernel hacker. In the NetBSD 10.0 implementation + * we are not sensible, and use a per-directory data structure to index + * names, but the cache otherwise functions the same. + * + * The index is a red-black tree. There are no special concurrency + * requirements placed on it, because it's per-directory and protected + * by the namecache's per-directory locks. It should therefore not be + * difficult to experiment with other types of index. + * + * Each cached name is stored in a struct namecache, along with a + * pointer to the associated vnode (nc_vp). Names longer than a + * maximum length of NCHNAMLEN are allocated with kmem_alloc(); they + * occur infrequently, and names shorter than this are stored directly + * in struct namecache. If it is a "negative" entry, (i.e. for a name + * that is known NOT to exist) the vnode pointer will be NULL. 
+ * + * For a directory with 3 cached names for 3 distinct vnodes, the + * various vnodes and namecache structs would be connected like this + * (the root is at the bottom of the diagram): + * + * ... + * ^ + * |- vi_nc_tree + * | + * +----o----+ +---------+ +---------+ + * | VDIR | | VCHR | | VREG | + * | vnode o-----+ | vnode o-----+ | vnode o------+ + * +---------+ | +---------+ | +---------+ | + * ^ | ^ | ^ | + * |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp | + * | | | | | | + * +----o----+ | +----o----+ | +----o----+ | + * +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+ + * | +---------+ | +---------+ | +---------+ + * | ^ | ^ | ^ + * | | | | | | + * | | +----------------------+ | | + * |-nc_dvp | +-------------------------------------------------+ + * | |/- vi_nc_tree | | + * | | |- nc_dvp |- nc_dvp + * | +----o----+ | | + * +-->| VDIR |<----------+ | + * | vnode |<------------------------------------+ + * +---------+ + * + * START HERE + * + * Replacement: + * + * As the cache becomes full, old and unused entries are purged as new + * entries are added. The synchronization overhead in maintaining a + * strict ordering would be prohibitive, so the VM system's "clock" or + * "second chance" page replacement algorithm is aped here. New + * entries go to the tail of the active list. After they age out and + * reach the head of the list, they are moved to the tail of the + * inactive list. Any use of the deactivated cache entry reactivates + * it, saving it from impending doom; if not reactivated, the entry + * eventually reaches the head of the inactive list and is purged. + * + * Concurrency: + * + * From a performance perspective, cache_lookup(nameiop == LOOKUP) is + * what really matters; insertion of new entries with cache_enter() is + * comparatively infrequent, and overshadowed by the cost of expensive + * file system metadata operations (which may involve disk I/O). We + * therefore want to make everything simplest in the lookup path. + * + * struct namecache is mostly stable except for list and tree related + * entries, changes to which don't affect the cached name or vnode. + * For changes to name+vnode, entries are purged in preference to + * modifying them. + * + * Read access to namecache entries is made via tree, list, or LRU + * list. A lock corresponding to the direction of access should be + * held. See definition of "struct namecache" in src/sys/namei.src, + * and the definition of "struct vnode" for the particulars. + * + * Per-CPU statistics, and LRU list totals are read unlocked, since + * an approximate value is OK. We maintain uintptr_t sized per-CPU + * counters and 64-bit global counters under the theory that uintptr_t + * sized counters are less likely to be hosed by nonatomic increment. 
+ * + * The lock order is: + * + * 1) vi->vi_nc_lock (tree or parent -> child direction, + * used during forward lookup) + * + * 2) vi->vi_nc_listlock (list or child -> parent direction, + * used during reverse lookup) + * + * 3) cache_lru_lock (LRU list direction, used during reclaim) + * + * 4) vp->v_interlock (what the cache entry points to) + */ + #include __KERNEL_RCSID(0, "$NetBSD$"); @@ -66,16 +179,15 @@ __KERNEL_RCSID(0, "$NetBSD$"); #include "opt_dtrace.h" #endif -#include -#include #include #include #include +#include #include -#include #include #include #include +#include #include #include #include @@ -83,244 +195,61 @@ __KERNEL_RCSID(0, "$NetBSD$"); #include #include -/* - * Name caching works as follows: - * - * Names found by directory scans are retained in a cache - * for future reference. It is managed LRU, so frequently - * used names will hang around. Cache is indexed by hash value - * obtained from (dvp, name) where dvp refers to the directory - * containing name. - * - * Upon reaching the last segment of a path, if the reference - * is for DELETE, or NOCACHE is set (rewrite), and the - * name is located in the cache, it will be dropped. - */ - -/* - * Cache entry lifetime: - * - * nonexistent - * ---create---> active - * ---invalidate---> queued - * ---reclaim---> nonexistent. - * - * States: - * - Nonexistent. Cache entry does not exist. - * - * - Active. cache_lookup, cache_lookup_raw, cache_revlookup can look - * up, acquire references, and hand off references to vnodes, - * e.g. via v_interlock. Marked by nonnull ncp->nc_dvp. - * - * - Queued. Pending desstruction by cache_reclaim. Cannot be used by - * cache_lookup, cache_lookup_raw, or cache_revlookup. May still be - * on lists. Marked by null ncp->nc_dvp. - * - * Transitions: - * - * - Create: nonexistent--->active - * - * Done by cache_enter(dvp, vp, name, namelen, cnflags), called by - * VOP_LOOKUP after the answer is found. Allocates a struct - * namecache object, initializes it with the above fields, and - * activates it by inserting it into the forward and reverse tables. - * - * - Invalidate: active--->queued - * - * Done by cache_invalidate. If not already invalidated, nullify - * ncp->nc_dvp and and add to cache_gcqueue. Called, - * among various other places, in cache_lookup(dvp, name, namelen, - * nameiop, cnflags, &iswht, &vp) when MAKEENTRY is missing from - * cnflags. - * - * - Reclaim: queued--->nonexistent - * - * Done by cache_reclaim. Disassociate ncp from any lists it is on - * and free memory. - */ - -/* - * Locking. - * - * L namecache_lock Global lock for namecache table and queues. - * C struct nchcpu::cpu_lock Per-CPU lock to reduce read contention. - * N struct namecache::nc_lock Per-entry lock. - * V struct vnode::v_interlock Vnode interlock. - * - * Lock order: L -> C -> N -> V - * - * Examples: - * . L->C: cache_reclaim - * . C->N->V: cache_lookup - * . 
L->N->V: cache_purge1, cache_revlookup - * - * All use serialized by namecache_lock: - * - * nclruhead / struct namecache::nc_lru - * struct vnode_impl::vi_dnclist / struct namecache::nc_dvlist - * struct vnode_impl::vi_nclist / struct namecache::nc_vlist - * nchstats - * - * - Insertion serialized by namecache_lock, - * - read protected by per-CPU lock, - * - insert/read ordering guaranteed by memory barriers, and - * - deletion allowed only under namecache_lock and *all* per-CPU locks - * in CPU_INFO_FOREACH order: - * - * nchashtbl / struct namecache::nc_hash - * - * The per-CPU locks exist only to reduce the probability of - * contention between readers. We do not bind to a CPU, so - * contention is still possible. - * - * All use serialized by struct namecache::nc_lock: - * - * struct namecache::nc_dvp - * struct namecache::nc_vp - * struct namecache::nc_gcqueue (*) - * struct namecache::nc_hittime (**) - * - * (*) Once on the queue, only cache_thread uses this nc_gcqueue, unlocked. - * (**) cache_prune reads nc_hittime unlocked, since approximate is OK. - * - * Unlocked because stable after initialization: - * - * struct namecache::nc_dvp - * struct namecache::nc_vp - * struct namecache::nc_flags - * struct namecache::nc_nlen - * struct namecache::nc_name - * - * Unlocked because approximation is OK: - * - * struct nchcpu::cpu_stats - * struct nchcpu::cpu_stats_last - * - * Updates under namecache_lock or any per-CPU lock are marked with - * COUNT, while updates outside those locks are marked with COUNT_UNL. - * - * - The theory seems to have been that you could replace COUNT_UNL by - * atomic operations -- except that doesn't help unless you also - * replace COUNT by atomic operations, because mixing atomics and - * nonatomics is a recipe for failure. - * - We use 32-bit per-CPU counters and 64-bit global counters under - * the theory that 32-bit counters are less likely to be hosed by - * nonatomic increment. - */ +#include -/* - * The comment below is preserved for posterity in case it is - * important, but it is clear that everywhere the namecache_count_*() - * functions are called, other cache_*() functions that take the same - * locks are also called, so I can't imagine how this could be a - * problem: - * - * N.B.: Attempting to protect COUNT_UNL() increments by taking - * a per-cpu lock in the namecache_count_*() functions causes - * a deadlock. Don't do that, use atomic increments instead if - * the imperfections here bug you. - */ - -/* - * struct nchstats_percpu: - * - * Per-CPU counters. - */ -struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t); - -/* - * struct nchcpu: - * - * Per-CPU namecache state: lock and per-CPU counters. - */ -struct nchcpu { - kmutex_t cpu_lock; - struct nchstats_percpu cpu_stats; - /* XXX maybe __cacheline_aligned would improve this? */ - struct nchstats_percpu cpu_stats_last; /* from last sample */ +static void cache_activate(struct namecache *); +static int cache_compare_key(void *, const void *, const void *); +static int cache_compare_nodes(void *, const void *, const void *); +static void cache_deactivate(void); +static void cache_reclaim(void); +static int cache_stat_sysctl(SYSCTLFN_ARGS); + +/* Per-CPU counters. */ +struct nchstats_percpu _NAMEI_CACHE_STATS(uintptr_t); + +/* Global pool cache. */ +static pool_cache_t cache_pool __read_mostly; + +/* LRU replacement. */ +enum cache_lru_id { + LRU_ACTIVE, + LRU_INACTIVE, + LRU_COUNT }; -/* - * The type for the hash code. 
While the hash function generates a - * u32, the hash code has historically been passed around as a u_long, - * and the value is modified by xor'ing a uintptr_t, so it's not - * entirely clear what the best type is. For now I'll leave it - * unchanged as u_long. - */ +static struct { + TAILQ_HEAD(, namecache) list[LRU_COUNT]; + u_int count[LRU_COUNT]; +} cache_lru __cacheline_aligned; -typedef u_long nchash_t; - -/* - * Structures associated with name cacheing. - */ - -static kmutex_t *namecache_lock __read_mostly; -static pool_cache_t namecache_cache __read_mostly; -static TAILQ_HEAD(, namecache) nclruhead __cacheline_aligned; - -static LIST_HEAD(nchashhead, namecache) *nchashtbl __read_mostly; -static u_long nchash __read_mostly; - -#define NCHASH2(hash, dvp) \ - (((hash) ^ ((uintptr_t)(dvp) >> 3)) & nchash) - -/* Number of cache entries allocated. */ -static long numcache __cacheline_aligned; - -/* Garbage collection queue and number of entries pending in it. */ -static void *cache_gcqueue; -static u_int cache_gcpend; +static kmutex_t cache_lru_lock __cacheline_aligned; /* Cache effectiveness statistics. This holds total from per-cpu stats */ struct nchstats nchstats __cacheline_aligned; -/* - * Macros to count an event, update the central stats with per-cpu - * values and add current per-cpu increments to the subsystem total - * last collected by cache_reclaim(). - */ -#define CACHE_STATS_CURRENT /* nothing */ - -#define COUNT(cpup, f) ((cpup)->cpu_stats.f++) - -#define UPDATE(cpup, f) do { \ - struct nchcpu *Xcpup = (cpup); \ - uint32_t Xcnt = (volatile uint32_t) Xcpup->cpu_stats.f; \ - nchstats.f += Xcnt - Xcpup->cpu_stats_last.f; \ - Xcpup->cpu_stats_last.f = Xcnt; \ -} while (/* CONSTCOND */ 0) - -#define ADD(stats, cpup, f) do { \ - struct nchcpu *Xcpup = (cpup); \ - stats.f += Xcpup->cpu_stats.f - Xcpup->cpu_stats_last.f; \ -} while (/* CONSTCOND */ 0) - -/* Do unlocked stats the same way. 
Use a different name to allow mind changes */ -#define COUNT_UNL(cpup, f) COUNT((cpup), f) - -static const int cache_lowat = 95; -static const int cache_hiwat = 98; -static const int cache_hottime = 5; /* number of seconds */ -static int doingcache = 1; /* 1 => enable the cache */ - -static struct evcnt cache_ev_scan; -static struct evcnt cache_ev_gc; -static struct evcnt cache_ev_over; -static struct evcnt cache_ev_under; -static struct evcnt cache_ev_forced; - -static struct namecache *cache_lookup_entry( - const struct vnode *, const char *, size_t); -static void cache_thread(void *); -static void cache_invalidate(struct namecache *); -static void cache_disassociate(struct namecache *); -static void cache_reclaim(void); -static int cache_ctor(void *, void *, int); -static void cache_dtor(void *, void *); - -static struct sysctllog *sysctllog; -static void sysctl_cache_stat_setup(void); +#define COUNT(f) do { \ + kpreempt_disable(); \ + ((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \ + kpreempt_enable(); \ +} while (/* CONSTCOND */ 0); + +/* Tunables */ +static const int cache_lru_maxdeact = 2; /* max # to deactivate */ +static const int cache_lru_maxscan = 64; /* max # to scan/reclaim */ +static int doingcache = 1; /* 1 => enable the cache */ + +/* sysctl */ +static struct sysctllog *cache_sysctllog; + +/* Read-black tree */ +static rb_tree_ops_t cache_rbtree_ops __read_mostly = { + .rbto_compare_nodes = cache_compare_nodes, + .rbto_compare_key = cache_compare_key, + .rbto_node_offset = offsetof(struct namecache, nc_tree), + .rbto_context = NULL +}; +/* dtrace hooks */ SDT_PROVIDER_DEFINE(vfs); SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *"); @@ -345,160 +274,172 @@ SDT_PROBE_DEFINE3(vfs, namecache, enter, "char *", "size_t"); /* - * Compute the hash for an entry. - * - * (This is for now a wrapper around namei_hash, whose interface is - * for the time being slightly inconvenient.) + * rbtree: compare two nodes. */ -static nchash_t -cache_hash(const char *name, size_t namelen) +static int +cache_compare_nodes(void *context, const void *n1, const void *n2) { - const char *endptr; + const struct namecache *nc1 = n1; + const struct namecache *nc2 = n2; - endptr = name + namelen; - return namei_hash(name, &endptr); + if (nc1->nc_key < nc2->nc_key) { + return -1; + } + if (nc1->nc_key > nc2->nc_key) { + return 1; + } + return 0; } /* - * Invalidate a cache entry and enqueue it for garbage collection. - * The caller needs to hold namecache_lock or a per-cpu lock to hold - * off cache_reclaim(). + * rbtree: compare a node and a key. */ -static void -cache_invalidate(struct namecache *ncp) +static int +cache_compare_key(void *context, const void *n, const void *k) { - void *head; - - KASSERT(mutex_owned(&ncp->nc_lock)); - - if (ncp->nc_dvp != NULL) { - SDT_PROBE(vfs, namecache, invalidate, done, ncp->nc_dvp, - 0, 0, 0, 0); + const struct namecache *ncp = n; + const int64_t key = *(const int64_t *)k; - ncp->nc_dvp = NULL; - do { - head = cache_gcqueue; - ncp->nc_gcqueue = head; - } while (atomic_cas_ptr(&cache_gcqueue, head, ncp) != head); - atomic_inc_uint(&cache_gcpend); + if (ncp->nc_key < key) { + return -1; } + if (ncp->nc_key > key) { + return 1; + } + return 0; } /* - * Disassociate a namecache entry from any vnodes it is attached to, - * and remove from the global LRU list. + * Compute a key value for the given name. 
The name length is encoded in + * the key value to try and improve uniqueness, and so that length doesn't + * need to be compared separately for string comparisons. */ -static void -cache_disassociate(struct namecache *ncp) +static int64_t +cache_key(const char *name, size_t nlen) { + int64_t key; - KASSERT(mutex_owned(namecache_lock)); - KASSERT(ncp->nc_dvp == NULL); + KASSERT(nlen <= USHRT_MAX); - if (ncp->nc_lru.tqe_prev != NULL) { - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - ncp->nc_lru.tqe_prev = NULL; - } - if (ncp->nc_vlist.tqe_prev != NULL) { - KASSERT(ncp->nc_vp != NULL); - TAILQ_REMOVE(&VNODE_TO_VIMPL(ncp->nc_vp)->vi_nclist, ncp, - nc_vlist); - ncp->nc_vlist.tqe_prev = NULL; - } - if (ncp->nc_dvlist.le_prev != NULL) { - LIST_REMOVE(ncp, nc_dvlist); - ncp->nc_dvlist.le_prev = NULL; - } + key = hash32_buf(name, nlen, HASH32_STR_INIT); + return (key << 32) | nlen; } /* - * Lock all CPUs to prevent any cache lookup activity. Conceptually, - * this locks out all "readers". + * Like bcmp() but tuned for the use case here which is: + * + * - always of equal length both sides + * - almost always the same string both sides + * - small strings */ -static void -cache_lock_cpus(void) +static inline int +cache_namecmp(struct namecache *ncp, const char *name, size_t namelen) { - CPU_INFO_ITERATOR cii; - struct cpu_info *ci; - struct nchcpu *cpup; + size_t i; + int d; - /* - * Lock out all CPUs first, then harvest per-cpu stats. This - * is probably not quite as cache-efficient as doing the lock - * and harvest at the same time, but allows cache_stat_sysctl() - * to make do with a per-cpu lock. - */ - for (CPU_INFO_FOREACH(cii, ci)) { - cpup = ci->ci_data.cpu_nch; - mutex_enter(&cpup->cpu_lock); - } - for (CPU_INFO_FOREACH(cii, ci)) { - cpup = ci->ci_data.cpu_nch; - UPDATE(cpup, ncs_goodhits); - UPDATE(cpup, ncs_neghits); - UPDATE(cpup, ncs_badhits); - UPDATE(cpup, ncs_falsehits); - UPDATE(cpup, ncs_miss); - UPDATE(cpup, ncs_long); - UPDATE(cpup, ncs_pass2); - UPDATE(cpup, ncs_2passes); - UPDATE(cpup, ncs_revhits); - UPDATE(cpup, ncs_revmiss); + KASSERT(ncp->nc_nlen == namelen); + for (d = 0, i = 0; i < namelen; i++) { + d |= (ncp->nc_name[i] ^ name[i]); } + return d; } /* - * Release all CPU locks. + * Remove an entry from the cache. vi_nc_lock must be held, and if dir2node + * is true, then we're locking in the conventional direction and the list + * lock will be acquired when removing the entry from the vnode list. */ static void -cache_unlock_cpus(void) +cache_remove(struct namecache *ncp, const bool dir2node) { - CPU_INFO_ITERATOR cii; - struct cpu_info *ci; - struct nchcpu *cpup; + struct vnode *vp, *dvp = ncp->nc_dvp; + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); - for (CPU_INFO_FOREACH(cii, ci)) { - cpup = ci->ci_data.cpu_nch; - mutex_exit(&cpup->cpu_lock); + KASSERT(rw_write_held(&dvi->vi_nc_lock)); + KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key); + KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, &ncp->nc_key) == ncp); + + SDT_PROBE(vfs, namecache, invalidate, done, ncp, + 0, 0, 0, 0); + + /* First remove from the directory's rbtree. */ + rb_tree_remove_node(&dvi->vi_nc_tree, ncp); + + /* Then remove from the LRU lists. */ + mutex_enter(&cache_lru_lock); + TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); + cache_lru.count[ncp->nc_lrulist]--; + mutex_exit(&cache_lru_lock); + + /* Then remove from the node's list. 
*/ + if ((vp = ncp->nc_vp) != NULL) { + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); + if (__predict_true(dir2node)) { + rw_enter(&vi->vi_nc_listlock, RW_WRITER); + TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); + rw_exit(&vi->vi_nc_listlock); + } else { + TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list); + } + } + + /* Finally, free it. */ + if (ncp->nc_nlen > NCHNAMLEN) { + size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]); + kmem_free(ncp, sz); + } else { + pool_cache_put(cache_pool, ncp); } } /* - * Find a single cache entry and return it locked. - * The caller needs to hold namecache_lock or a per-cpu lock to hold - * off cache_reclaim(). + * Find a single cache entry and return it. vi_nc_lock must be held. */ -static struct namecache * -cache_lookup_entry(const struct vnode *dvp, const char *name, size_t namelen) +static struct namecache * __noinline +cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen, + int64_t key) { - struct nchashhead *ncpp; + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); + struct rb_node *node = dvi->vi_nc_tree.rbt_root; struct namecache *ncp; - nchash_t hash; - KASSERT(dvp != NULL); - hash = cache_hash(name, namelen); - ncpp = &nchashtbl[NCHASH2(hash, dvp)]; - - LIST_FOREACH(ncp, ncpp, nc_hash) { - membar_datadep_consumer(); /* for Alpha... */ - if (ncp->nc_dvp != dvp || - ncp->nc_nlen != namelen || - memcmp(ncp->nc_name, name, (u_int)ncp->nc_nlen)) - continue; - mutex_enter(&ncp->nc_lock); - if (__predict_true(ncp->nc_dvp == dvp)) { - ncp->nc_hittime = hardclock_ticks; - SDT_PROBE(vfs, namecache, lookup, hit, dvp, - name, namelen, 0, 0); - return ncp; - } - /* Raced: entry has been nullified. */ - mutex_exit(&ncp->nc_lock); - } - - SDT_PROBE(vfs, namecache, lookup, miss, dvp, - name, namelen, 0, 0); - return NULL; + KASSERT(rw_lock_held(&dvi->vi_nc_lock)); + + /* + * Search the RB tree for the key. This is an inlined lookup + * tailored for exactly what's needed here (64-bit key and so on) + * that is quite a bit faster than using rb_tree_find_node(). + * Elsewhere during entry/removal the usual functions are used as it + * doesn't matter there. + */ + for (;;) { + if (__predict_false(RB_SENTINEL_P(node))) { + return NULL; + } + KASSERT((void *)&ncp->nc_tree == (void *)ncp); + ncp = (struct namecache *)node; + KASSERT(ncp->nc_dvp == dvp); + if (ncp->nc_key == key) { + break; + } + node = node->rb_nodes[ncp->nc_key < key]; + } + + /* Exclude collisions. */ + if (__predict_false(cache_namecmp(ncp, name, namelen))) { + return NULL; + } + + /* + * If the entry is on the wrong LRU list, requeue it. This is an + * unlocked check, but it will rarely be wrong and even then there + * will be no harm caused. 
+ */ + if (__predict_false(ncp->nc_lrulist != LRU_ACTIVE)) { + cache_activate(ncp); + } + return ncp; } /* @@ -556,12 +497,13 @@ cache_lookup(struct vnode *dvp, const ch uint32_t nameiop, uint32_t cnflags, int *iswht_ret, struct vnode **vn_ret) { + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct namecache *ncp; struct vnode *vp; - struct nchcpu *cpup; + int64_t key; int error; bool hit; - + krw_t op; /* Establish default result values */ if (iswht_ret != NULL) { @@ -573,73 +515,77 @@ cache_lookup(struct vnode *dvp, const ch return false; } - cpup = curcpu()->ci_data.cpu_nch; - mutex_enter(&cpup->cpu_lock); if (__predict_false(namelen > USHRT_MAX)) { SDT_PROBE(vfs, namecache, lookup, toolong, dvp, name, namelen, 0, 0); - COUNT(cpup, ncs_long); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + COUNT(ncs_long); return false; } - ncp = cache_lookup_entry(dvp, name, namelen); + /* Could the entry be purged below? */ + if ((cnflags & ISLASTCN) != 0 && + ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) { + op = RW_WRITER; + } else { + op = RW_READER; + } + + /* Compute the key up front - don't need the lock. */ + key = cache_key(name, namelen); + + /* Now look for the name. */ + rw_enter(&dvi->vi_nc_lock, op); + ncp = cache_lookup_entry(dvp, name, namelen, key); if (__predict_false(ncp == NULL)) { - COUNT(cpup, ncs_miss); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + rw_exit(&dvi->vi_nc_lock); + COUNT(ncs_miss); + SDT_PROBE(vfs, namecache, lookup, miss, dvp, + name, namelen, 0, 0); return false; } - if ((cnflags & MAKEENTRY) == 0) { - COUNT(cpup, ncs_badhits); + if (__predict_false((cnflags & MAKEENTRY) == 0)) { /* * Last component and we are renaming or deleting, * the cache entry is invalid, or otherwise don't * want cache entry to exist. */ - cache_invalidate(ncp); - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + KASSERT((cnflags & ISLASTCN) != 0); + cache_remove(ncp, true); + rw_exit(&dvi->vi_nc_lock); + COUNT(ncs_badhits); return false; } if (ncp->nc_vp == NULL) { - if (iswht_ret != NULL) { + if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) { /* - * Restore the ISWHITEOUT flag saved earlier. + * Last component and we are preparing to create + * the named object, so flush the negative cache + * entry. */ - KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0); - *iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0; + COUNT(ncs_badhits); + cache_remove(ncp, true); + hit = false; } else { - KASSERT(ncp->nc_flags == 0); - } - - if (__predict_true(nameiop != CREATE || - (cnflags & ISLASTCN) == 0)) { - COUNT(cpup, ncs_neghits); + COUNT(ncs_neghits); + SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, + namelen, 0, 0); /* found neg entry; vn is already null from above */ hit = true; - } else { - COUNT(cpup, ncs_badhits); + } + if (iswht_ret != NULL) { /* - * Last component and we are preparing to create - * the named object, so flush the negative cache - * entry. + * Restore the ISWHITEOUT flag saved earlier. */ - cache_invalidate(ncp); - /* found nothing */ - hit = false; + *iswht_ret = ncp->nc_whiteout; + } else { + KASSERT(!ncp->nc_whiteout); } - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); + rw_exit(&dvi->vi_nc_lock); return hit; } - vp = ncp->nc_vp; mutex_enter(vp->v_interlock); - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); + rw_exit(&dvi->vi_nc_lock); /* * Unlocked except for the vnode interlock. Call vcache_tryvget(). @@ -651,100 +597,136 @@ cache_lookup(struct vnode *dvp, const ch * This vnode is being cleaned out. 
* XXX badhits? */ - COUNT_UNL(cpup, ncs_falsehits); - /* found nothing */ + COUNT(ncs_falsehits); return false; } - COUNT_UNL(cpup, ncs_goodhits); + COUNT(ncs_goodhits); + SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); /* found it */ *vn_ret = vp; return true; } - /* - * Cut-'n-pasted version of the above without the nameiop argument. + * Version of the above without the nameiop argument, for NFS. */ bool cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen, uint32_t cnflags, int *iswht_ret, struct vnode **vn_ret) { + + return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY, + iswht_ret, vn_ret); +} + +/* + * Used by namei() to walk down a path, component by component by looking up + * names in the cache. The node locks are chained along the way: a parent's + * lock is not dropped until the child's is acquired. + */ +bool +cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen, + struct vnode **vn_ret, krwlock_t **plock, + kauth_cred_t cred) +{ + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); struct namecache *ncp; - struct vnode *vp; - struct nchcpu *cpup; + int64_t key; int error; /* Establish default results. */ - if (iswht_ret != NULL) { - *iswht_ret = 0; - } *vn_ret = NULL; - if (__predict_false(!doingcache)) { - /* found nothing */ + /* If disabled, or file system doesn't support this, bail out. */ + if (__predict_false(!doingcache || + (dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) { return false; } - cpup = curcpu()->ci_data.cpu_nch; - mutex_enter(&cpup->cpu_lock); if (__predict_false(namelen > USHRT_MAX)) { - COUNT(cpup, ncs_long); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + COUNT(ncs_long); return false; } - ncp = cache_lookup_entry(dvp, name, namelen); + + /* Compute the key up front - don't need the lock. */ + key = cache_key(name, namelen); + + /* + * Acquire the directory lock. Once we have that, we can drop the + * previous one (if any). + * + * The two lock holds mean that the directory can't go away while + * here: the directory must be purged with cache_purge() before + * being freed, and both parent & child's vi_nc_lock must be taken + * before that point is passed. + * + * However if there's no previous lock, like at the root of the + * chain, then "dvp" must be referenced to prevent dvp going away + * before we get its lock. + * + * Note that the two locks can be the same if looking up a dot, for + * example: /usr/bin/. + */ + if (*plock != &dvi->vi_nc_lock) { + rw_enter(&dvi->vi_nc_lock, RW_READER); + if (*plock != NULL) { + rw_exit(*plock); + } + *plock = &dvi->vi_nc_lock; + } else if (*plock == NULL) { + KASSERT(dvp->v_usecount > 0); + } + + /* + * First up check if the user is allowed to look up files in this + * directory. + */ + KASSERT(dvi->vi_nc_mode != VNOVAL && dvi->vi_nc_uid != VNOVAL && + dvi->vi_nc_gid != VNOVAL); + error = kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(VEXEC, + dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL, + genfs_can_access(dvp->v_type, dvi->vi_nc_mode & ALLPERMS, + dvi->vi_nc_uid, dvi->vi_nc_gid, VEXEC, cred)); + if (error != 0) { + COUNT(ncs_denied); + return false; + } + + /* + * Now look for a matching cache entry. 
+ */ + ncp = cache_lookup_entry(dvp, name, namelen, key); if (__predict_false(ncp == NULL)) { - COUNT(cpup, ncs_miss); - mutex_exit(&cpup->cpu_lock); - /* found nothing */ + COUNT(ncs_miss); + SDT_PROBE(vfs, namecache, lookup, miss, dvp, + name, namelen, 0, 0); return false; } - vp = ncp->nc_vp; - if (vp == NULL) { - /* - * Restore the ISWHITEOUT flag saved earlier. - */ - if (iswht_ret != NULL) { - KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0); - /*cnp->cn_flags |= ncp->nc_flags;*/ - *iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0; - } - COUNT(cpup, ncs_neghits); - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); + if (ncp->nc_vp == NULL) { /* found negative entry; vn is already null from above */ + COUNT(ncs_neghits); + SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); return true; } - mutex_enter(vp->v_interlock); - mutex_exit(&ncp->nc_lock); - mutex_exit(&cpup->cpu_lock); + + COUNT(ncs_goodhits); /* XXX can be "badhits" */ + SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0); /* - * Unlocked except for the vnode interlock. Call vcache_tryvget(). + * Return with the directory lock still held. It will either be + * returned to us with another call to cache_lookup_linked() when + * looking up the next component, or the caller will release it + * manually when finished. */ - error = vcache_tryvget(vp); - if (error) { - KASSERT(error == EBUSY); - /* - * This vnode is being cleaned out. - * XXX badhits? - */ - COUNT_UNL(cpup, ncs_falsehits); - /* found nothing */ - return false; - } - - COUNT_UNL(cpup, ncs_goodhits); /* XXX can be "badhits" */ - /* found it */ - *vn_ret = vp; + *vn_ret = ncp->nc_vp; return true; } /* * Scan cache looking for name of directory entry pointing at vp. + * Will not search for "." or "..". * * If the lookup succeeds the vnode is referenced and stored in dvpp. * @@ -756,11 +738,12 @@ cache_lookup_raw(struct vnode *dvp, cons * Returns 0 on success, -1 on cache miss, positive errno on failure. */ int -cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp) +cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp, + bool checkaccess, int perms) { + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); struct namecache *ncp; struct vnode *dvp; - struct nchcpu *cpup; char *bp; int error, nlen; @@ -769,44 +752,57 @@ cache_revlookup(struct vnode *vp, struct if (!doingcache) goto out; - /* - * We increment counters in the local CPU's per-cpu stats. - * We don't take the per-cpu lock, however, since this function - * is the only place these counters are incremented so no one - * will be racing with us to increment them. - */ - cpup = curcpu()->ci_data.cpu_nch; - mutex_enter(namecache_lock); - TAILQ_FOREACH(ncp, &VNODE_TO_VIMPL(vp)->vi_nclist, nc_vlist) { - mutex_enter(&ncp->nc_lock); - /* Ignore invalidated entries. */ - dvp = ncp->nc_dvp; - if (dvp == NULL) { - mutex_exit(&ncp->nc_lock); - continue; - } - + rw_enter(&vi->vi_nc_listlock, RW_READER); + if (checkaccess) { /* - * The list is partially sorted. Once we hit dot or dotdot - * it's only more dots from there on in. + * Check if the user is allowed to see. NOTE: this is + * checking for access on the "wrong" directory. getcwd() + * wants to see that there is access on every component + * along the way, not that there is access to any individual + * component. Don't use this to check you can look in vp. + * + * I don't like it, I didn't come up with it, don't blame me! 
*/ + KASSERT(vi->vi_nc_mode != VNOVAL && vi->vi_nc_uid != VNOVAL && + vi->vi_nc_gid != VNOVAL); + error = kauth_authorize_vnode(curlwp->l_cred, + KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode & + ALLPERMS), vp, NULL, genfs_can_access(vp->v_type, + vi->vi_nc_mode & ALLPERMS, vi->vi_nc_uid, vi->vi_nc_gid, + perms, curlwp->l_cred)); + if (error != 0) { + rw_exit(&vi->vi_nc_listlock); + COUNT(ncs_denied); + return EACCES; + } + } + TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) { + KASSERT(ncp->nc_vp == vp); + KASSERT(ncp->nc_dvp != NULL); nlen = ncp->nc_nlen; + + /* + * The queue is partially sorted. Once we hit dots, nothing + * else remains but dots and dotdots, so bail out. + */ if (ncp->nc_name[0] == '.') { if (nlen == 1 || (nlen == 2 && ncp->nc_name[1] == '.')) { - mutex_exit(&ncp->nc_lock); - break; + break; } } - COUNT(cpup, ncs_revhits); + + /* Record a hit on the entry. This is an unlocked read. */ + if (ncp->nc_lrulist != LRU_ACTIVE) { + cache_activate(ncp); + } if (bufp) { bp = *bpp; bp -= nlen; if (bp <= bufp) { *dvpp = NULL; - mutex_exit(&ncp->nc_lock); - mutex_exit(namecache_lock); + rw_exit(&vi->vi_nc_listlock); SDT_PROBE(vfs, namecache, revlookup, fail, vp, ERANGE, 0, 0, 0); return (ERANGE); @@ -815,9 +811,9 @@ cache_revlookup(struct vnode *vp, struct *bpp = bp; } + dvp = ncp->nc_dvp; mutex_enter(dvp->v_interlock); - mutex_exit(&ncp->nc_lock); - mutex_exit(namecache_lock); + rw_exit(&vi->vi_nc_listlock); error = vcache_tryvget(dvp); if (error) { KASSERT(error == EBUSY); @@ -831,26 +827,26 @@ cache_revlookup(struct vnode *vp, struct *dvpp = dvp; SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp, 0, 0, 0); + COUNT(ncs_revhits); return (0); } - COUNT(cpup, ncs_revmiss); - mutex_exit(namecache_lock); + rw_exit(&vi->vi_nc_listlock); + COUNT(ncs_revmiss); out: *dvpp = NULL; return (-1); } /* - * Add an entry to the cache + * Add an entry to the cache. */ void cache_enter(struct vnode *dvp, struct vnode *vp, const char *name, size_t namelen, uint32_t cnflags) { - struct namecache *ncp; - struct namecache *oncp; - struct nchashhead *ncpp; - nchash_t hash; + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); + struct namecache *ncp, *oncp; + int total; /* First, check whether we can/should add a cache entry. */ if ((cnflags & MAKEENTRY) == 0 || @@ -861,140 +857,151 @@ cache_enter(struct vnode *dvp, struct vn } SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0); - if (numcache > desiredvnodes) { - mutex_enter(namecache_lock); - cache_ev_forced.ev_count++; + + /* + * Reclaim some entries if over budget. This is an unlocked check, + * but it doesn't matter. Just need to catch up with things + * eventually: it doesn't matter if we go over temporarily. + */ + total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE]; + if (__predict_false(total > desiredvnodes)) { cache_reclaim(); - mutex_exit(namecache_lock); } - if (namelen > NCHNAMLEN) { - ncp = kmem_alloc(sizeof(*ncp) + namelen, KM_SLEEP); - cache_ctor(NULL, ncp, 0); - } else - ncp = pool_cache_get(namecache_cache, PR_WAITOK); + /* Now allocate a fresh entry. */ + if (__predict_true(namelen <= NCHNAMLEN)) { + ncp = pool_cache_get(cache_pool, PR_WAITOK); + } else { + size_t sz = offsetof(struct namecache, nc_name[namelen]); + ncp = kmem_alloc(sz, KM_SLEEP); + } - mutex_enter(namecache_lock); - numcache++; + /* Fill in cache info. 
*/ + ncp->nc_dvp = dvp; + ncp->nc_key = cache_key(name, namelen); + ncp->nc_nlen = namelen; + memcpy(ncp->nc_name, name, namelen); /* - * Concurrent lookups in the same directory may race for a - * cache entry. if there's a duplicated entry, free it. + * Insert to the directory. Concurrent lookups in the same + * directory may race for a cache entry. There can also be hash + * value collisions. If there's a entry there already, free it. */ - oncp = cache_lookup_entry(dvp, name, namelen); + rw_enter(&dvi->vi_nc_lock, RW_WRITER); + oncp = rb_tree_find_node(&dvi->vi_nc_tree, &ncp->nc_key); if (oncp) { - cache_invalidate(oncp); - mutex_exit(&oncp->nc_lock); + KASSERT(oncp->nc_nlen == ncp->nc_nlen); + if (cache_namecmp(oncp, name, namelen)) { + COUNT(ncs_collisions); + } + cache_remove(oncp, true); } + rb_tree_insert_node(&dvi->vi_nc_tree, ncp); - /* Grab the vnode we just found. */ - mutex_enter(&ncp->nc_lock); - ncp->nc_vp = vp; - ncp->nc_flags = 0; - ncp->nc_hittime = 0; - ncp->nc_gcqueue = NULL; + /* Then insert to the vnode. */ if (vp == NULL) { /* * For negative hits, save the ISWHITEOUT flag so we can * restore it later when the cache entry is used again. */ - ncp->nc_flags = cnflags & ISWHITEOUT; - } - - /* Fill in cache info. */ - ncp->nc_dvp = dvp; - LIST_INSERT_HEAD(&VNODE_TO_VIMPL(dvp)->vi_dnclist, ncp, nc_dvlist); - if (vp) { + ncp->nc_vp = NULL; + ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0); + } else { /* Partially sort the per-vnode list: dots go to back. */ + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); + rw_enter(&vi->vi_nc_listlock, RW_WRITER); if ((namelen == 1 && name[0] == '.') || (namelen == 2 && name[0] == '.' && name[1] == '.')) { - TAILQ_INSERT_TAIL(&VNODE_TO_VIMPL(vp)->vi_nclist, ncp, - nc_vlist); + TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list); } else { - TAILQ_INSERT_HEAD(&VNODE_TO_VIMPL(vp)->vi_nclist, ncp, - nc_vlist); + TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list); } - } else { - ncp->nc_vlist.tqe_prev = NULL; - ncp->nc_vlist.tqe_next = NULL; + rw_exit(&vi->vi_nc_listlock); + ncp->nc_vp = vp; + ncp->nc_whiteout = false; } - KASSERT(namelen <= USHRT_MAX); - ncp->nc_nlen = namelen; - memcpy(ncp->nc_name, name, (unsigned)ncp->nc_nlen); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - hash = cache_hash(name, namelen); - ncpp = &nchashtbl[NCHASH2(hash, dvp)]; /* - * Flush updates before making visible in table. No need for a - * memory barrier on the other side: to see modifications the - * list must be followed, meaning a dependent pointer load. - * The below is LIST_INSERT_HEAD() inlined, with the memory - * barrier included in the correct place. + * Finally, insert to the tail of the ACTIVE LRU list (new) and + * with the LRU lock held take the to opportunity to incrementally + * balance the lists. */ - if ((ncp->nc_hash.le_next = ncpp->lh_first) != NULL) - ncpp->lh_first->nc_hash.le_prev = &ncp->nc_hash.le_next; - ncp->nc_hash.le_prev = &ncpp->lh_first; - membar_producer(); - ncpp->lh_first = ncp; - mutex_exit(&ncp->nc_lock); - mutex_exit(namecache_lock); + mutex_enter(&cache_lru_lock); + ncp->nc_lrulist = LRU_ACTIVE; + cache_lru.count[LRU_ACTIVE]++; + TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); + cache_deactivate(); + mutex_exit(&cache_lru_lock); + rw_exit(&dvi->vi_nc_lock); } /* - * Name cache initialization, from vfs_init() when we are booting + * Set identity info in cache for a vnode. We only care about directories + * so ignore other updates. 
*/ void -nchinit(void) +cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid) { - int error; - - TAILQ_INIT(&nclruhead); - namecache_cache = pool_cache_init(sizeof(struct namecache) + NCHNAMLEN, - coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, cache_ctor, - cache_dtor, NULL); - KASSERT(namecache_cache != NULL); - - namecache_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); - nchashtbl = hashinit(desiredvnodes, HASH_LIST, true, &nchash); - - error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, cache_thread, - NULL, NULL, "cachegc"); - if (error != 0) - panic("nchinit %d", error); - - evcnt_attach_dynamic(&cache_ev_scan, EVCNT_TYPE_MISC, NULL, - "namecache", "entries scanned"); - evcnt_attach_dynamic(&cache_ev_gc, EVCNT_TYPE_MISC, NULL, - "namecache", "entries collected"); - evcnt_attach_dynamic(&cache_ev_over, EVCNT_TYPE_MISC, NULL, - "namecache", "over scan target"); - evcnt_attach_dynamic(&cache_ev_under, EVCNT_TYPE_MISC, NULL, - "namecache", "under scan target"); - evcnt_attach_dynamic(&cache_ev_forced, EVCNT_TYPE_MISC, NULL, - "namecache", "forced reclaims"); + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); - sysctl_cache_stat_setup(); + if (vp->v_type == VDIR) { + /* Grab both locks, for forward & reverse lookup. */ + rw_enter(&vi->vi_nc_lock, RW_WRITER); + rw_enter(&vi->vi_nc_listlock, RW_WRITER); + vi->vi_nc_mode = mode; + vi->vi_nc_uid = uid; + vi->vi_nc_gid = gid; + rw_exit(&vi->vi_nc_listlock); + rw_exit(&vi->vi_nc_lock); + } } -static int -cache_ctor(void *arg, void *obj, int flag) +/* + * Return true if we have identity for the given vnode, and use as an + * opportunity to confirm that everything squares up. + * + * Because of shared code, some file systems could provide partial + * information, missing some updates, so always check the mount flag + * instead of looking for !VNOVAL. + */ +bool +cache_have_id(struct vnode *vp) { - struct namecache *ncp; - - ncp = obj; - mutex_init(&ncp->nc_lock, MUTEX_DEFAULT, IPL_NONE); - return 0; + if (vp->v_type == VDIR && + (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0) { + KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_mode != VNOVAL); + KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_uid != VNOVAL); + KASSERT(VNODE_TO_VIMPL(vp)->vi_nc_gid != VNOVAL); + return true; + } else { + return false; + } } -static void -cache_dtor(void *arg, void *obj) +/* + * Name cache initialization, from vfs_init() when the system is booting. 
+ */ +void +nchinit(void) { - struct namecache *ncp; - ncp = obj; - mutex_destroy(&ncp->nc_lock); + cache_pool = pool_cache_init(sizeof(struct namecache), + coherency_unit, 0, 0, "nchentry", NULL, IPL_NONE, NULL, + NULL, NULL); + KASSERT(cache_pool != NULL); + + mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE); + TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]); + TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]); + + KASSERT(cache_sysctllog == NULL); + sysctl_createv(&cache_sysctllog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_STRUCT, "namecache_stats", + SYSCTL_DESCR("namecache statistics"), + cache_stat_sysctl, 0, NULL, 0, + CTL_VFS, CTL_CREATE, CTL_EOL); } /* @@ -1003,87 +1010,176 @@ cache_dtor(void *arg, void *obj) void cache_cpu_init(struct cpu_info *ci) { - struct nchcpu *cpup; + void *p; size_t sz; - sz = roundup2(sizeof(*cpup), coherency_unit) + coherency_unit; - cpup = kmem_zalloc(sz, KM_SLEEP); - cpup = (void *)roundup2((uintptr_t)cpup, coherency_unit); - mutex_init(&cpup->cpu_lock, MUTEX_DEFAULT, IPL_NONE); - ci->ci_data.cpu_nch = cpup; + sz = roundup2(sizeof(struct nchstats_percpu), coherency_unit) + + coherency_unit; + p = kmem_zalloc(sz, KM_SLEEP); + ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit); } /* - * Name cache reinitialization, for when the maximum number of vnodes increases. + * A vnode is being allocated: set up cache structures. */ void -nchreinit(void) +cache_vnode_init(struct vnode *vp) { + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); + + rw_init(&vi->vi_nc_lock); + rw_init(&vi->vi_nc_listlock); + rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops); + TAILQ_INIT(&vi->vi_nc_list); + vi->vi_nc_mode = VNOVAL; + vi->vi_nc_uid = VNOVAL; + vi->vi_nc_gid = VNOVAL; +} + +/* + * A vnode is being freed: finish cache structures. + */ +void +cache_vnode_fini(struct vnode *vp) +{ + vnode_impl_t *vi = VNODE_TO_VIMPL(vp); + + KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL); + KASSERT(TAILQ_EMPTY(&vi->vi_nc_list)); + rw_destroy(&vi->vi_nc_lock); + rw_destroy(&vi->vi_nc_listlock); +} + +/* + * Helper for cache_purge1(): purge cache entries for the given vnode from + * all directories that the vnode is cached in. + */ +static void +cache_purge_parents(struct vnode *vp) +{ + vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp); + struct vnode *dvp, *blocked; struct namecache *ncp; - struct nchashhead *oldhash, *hash; - u_long i, oldmask, mask; - hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); - mutex_enter(namecache_lock); - cache_lock_cpus(); - oldhash = nchashtbl; - oldmask = nchash; - nchashtbl = hash; - nchash = mask; - for (i = 0; i <= oldmask; i++) { - while ((ncp = LIST_FIRST(&oldhash[i])) != NULL) { - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - } - } - cache_unlock_cpus(); - mutex_exit(namecache_lock); - hashdone(oldhash, HASH_LIST, oldmask); + SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0); + + blocked = NULL; + + rw_enter(&vi->vi_nc_listlock, RW_WRITER); + while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) { + /* + * Locking in the wrong direction. Try for a hold on the + * directory node's lock, and if we get it then all good, + * nuke the entry and move on to the next. + */ + dvp = ncp->nc_dvp; + dvi = VNODE_TO_VIMPL(dvp); + if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { + cache_remove(ncp, false); + rw_exit(&dvi->vi_nc_lock); + blocked = NULL; + continue; + } + + /* + * We can't wait on the directory node's lock with our list + * lock held or the system could deadlock. 
+ * + * Take a hold on the directory vnode to prevent it from + * being freed (taking the vnode & lock with it). Then + * wait for the lock to become available with no other locks + * held, and retry. + * + * If this happens twice in a row, give the other side a + * breather; we can do nothing until it lets go. + */ + vhold(dvp); + rw_exit(&vi->vi_nc_listlock); + rw_enter(&dvi->vi_nc_lock, RW_WRITER); + /* Do nothing. */ + rw_exit(&dvi->vi_nc_lock); + holdrele(dvp); + if (blocked == dvp) { + kpause("ncpurge", false, 1, NULL); + } + rw_enter(&vi->vi_nc_listlock, RW_WRITER); + blocked = dvp; + } + rw_exit(&vi->vi_nc_listlock); +} + +/* + * Helper for cache_purge1(): purge all cache entries hanging off the given + * directory vnode. + */ +static void +cache_purge_children(struct vnode *dvp) +{ + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); + struct namecache *ncp; + + SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0); + + rw_enter(&dvi->vi_nc_lock, RW_WRITER); + for (;;) { + ncp = rb_tree_iterate(&dvi->vi_nc_tree, NULL, RB_DIR_RIGHT); + if (ncp == NULL) { + break; + } + cache_remove(ncp, true); + } + rw_exit(&dvi->vi_nc_lock); +} + +/* + * Helper for cache_purge1(): purge cache entry from the given vnode, + * finding it by name. + */ +static void +cache_purge_name(struct vnode *dvp, const char *name, size_t namelen) +{ + vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp); + struct namecache *ncp; + int64_t key; + + SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0); + + key = cache_key(name, namelen); + rw_enter(&dvi->vi_nc_lock, RW_WRITER); + ncp = cache_lookup_entry(dvp, name, namelen, key); + if (ncp) { + cache_remove(ncp, true); + } + rw_exit(&dvi->vi_nc_lock); } /* * Cache flush, a particular vnode; called when a vnode is renamed to - * hide entries that would now be invalid + * hide entries that would now be invalid. */ void cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags) { - struct namecache *ncp, *ncnext; - mutex_enter(namecache_lock); if (flags & PURGE_PARENTS) { - SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0); - - for (ncp = TAILQ_FIRST(&VNODE_TO_VIMPL(vp)->vi_nclist); - ncp != NULL; ncp = ncnext) { - ncnext = TAILQ_NEXT(ncp, nc_vlist); - mutex_enter(&ncp->nc_lock); - cache_invalidate(ncp); - mutex_exit(&ncp->nc_lock); - cache_disassociate(ncp); - } + cache_purge_parents(vp); } if (flags & PURGE_CHILDREN) { - SDT_PROBE(vfs, namecache, purge, children, vp, 0, 0, 0, 0); - for (ncp = LIST_FIRST(&VNODE_TO_VIMPL(vp)->vi_dnclist); - ncp != NULL; ncp = ncnext) { - ncnext = LIST_NEXT(ncp, nc_dvlist); - mutex_enter(&ncp->nc_lock); - cache_invalidate(ncp); - mutex_exit(&ncp->nc_lock); - cache_disassociate(ncp); - } + cache_purge_children(vp); } if (name != NULL) { - SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0); - ncp = cache_lookup_entry(vp, name, namelen); - if (ncp) { - cache_invalidate(ncp); - mutex_exit(&ncp->nc_lock); - cache_disassociate(ncp); - } + cache_purge_name(vp, name, namelen); } - mutex_exit(namecache_lock); +} + +/* + * vnode filter for cache_purgevfs(). 
+ */ +static bool +cache_vdir_filter(void *cookie, vnode_t *vp) +{ + + return vp->v_type == VDIR; } /* @@ -1093,186 +1189,171 @@ cache_purge1(struct vnode *vp, const cha void cache_purgevfs(struct mount *mp) { - struct namecache *ncp, *nxtcp; + struct vnode_iterator *iter; + vnode_t *dvp; - SDT_PROBE(vfs, namecache, purge, vfs, mp, 0, 0, 0, 0); - mutex_enter(namecache_lock); - for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) { - nxtcp = TAILQ_NEXT(ncp, nc_lru); - mutex_enter(&ncp->nc_lock); - if (ncp->nc_dvp != NULL && ncp->nc_dvp->v_mount == mp) { - /* Free the resources we had. */ - cache_invalidate(ncp); - cache_disassociate(ncp); + vfs_vnode_iterator_init(mp, &iter); + for (;;) { + dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL); + if (dvp == NULL) { + break; } - mutex_exit(&ncp->nc_lock); + cache_purge_children(dvp); + vrele(dvp); } - cache_reclaim(); - mutex_exit(namecache_lock); + vfs_vnode_iterator_destroy(iter); } /* - * Scan global list invalidating entries until we meet a preset target. - * Prefer to invalidate entries that have not scored a hit within - * cache_hottime seconds. We sort the LRU list only for this routine's - * benefit. + * Re-queue an entry onto the correct LRU list, after it has scored a hit. */ static void -cache_prune(int incache, int target) +cache_activate(struct namecache *ncp) { - struct namecache *ncp, *nxtcp, *sentinel; - int items, recent, tryharder; - - KASSERT(mutex_owned(namecache_lock)); - SDT_PROBE(vfs, namecache, prune, done, incache, target, 0, 0, 0); - items = 0; - tryharder = 0; - recent = hardclock_ticks - hz * cache_hottime; - sentinel = NULL; - for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) { - if (incache <= target) - break; - items++; - nxtcp = TAILQ_NEXT(ncp, nc_lru); - if (ncp == sentinel) { - /* - * If we looped back on ourself, then ignore - * recent entries and purge whatever we find. - */ - tryharder = 1; - } - if (ncp->nc_dvp == NULL) - continue; - if (!tryharder && (ncp->nc_hittime - recent) > 0) { - if (sentinel == NULL) - sentinel = ncp; - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - continue; - } - mutex_enter(&ncp->nc_lock); - if (ncp->nc_dvp != NULL) { - cache_invalidate(ncp); - cache_disassociate(ncp); - incache--; - } - mutex_exit(&ncp->nc_lock); - } - cache_ev_scan.ev_count += items; + mutex_enter(&cache_lru_lock); + /* Put on tail of ACTIVE list, since it just scored a hit. */ + TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru); + TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); + cache_lru.count[ncp->nc_lrulist]--; + cache_lru.count[LRU_ACTIVE]++; + ncp->nc_lrulist = LRU_ACTIVE; + mutex_exit(&cache_lru_lock); } /* - * Collect dead cache entries from all CPUs and garbage collect. + * Try to balance the LRU lists. Pick some victim entries, and re-queue + * them from the head of the active list to the tail of the inactive list. */ static void -cache_reclaim(void) +cache_deactivate(void) { - struct namecache *ncp, *next; - int items; + struct namecache *ncp; + int total, i; - KASSERT(mutex_owned(namecache_lock)); + KASSERT(mutex_owned(&cache_lru_lock)); - /* - * If the number of extant entries not awaiting garbage collection - * exceeds the high water mark, then reclaim stale entries until we - * reach our low water mark. 
- */ - items = numcache - cache_gcpend; - if (items > (uint64_t)desiredvnodes * cache_hiwat / 100) { - cache_prune(items, (int)((uint64_t)desiredvnodes * - cache_lowat / 100)); - cache_ev_over.ev_count++; - } else - cache_ev_under.ev_count++; + /* If we're nowhere near budget yet, don't bother. */ + total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE]; + if (total < (desiredvnodes >> 1)) { + return; + } /* - * Stop forward lookup activity on all CPUs and garbage collect dead - * entries. + * Aim for a 1:1 ratio of active to inactive. This is to allow each + * potential victim a reasonable amount of time to cycle through the + * inactive list in order to score a hit and be reactivated, while + * trying not to cause reactivations too frequently. */ - cache_lock_cpus(); - ncp = cache_gcqueue; - cache_gcqueue = NULL; - items = cache_gcpend; - cache_gcpend = 0; - while (ncp != NULL) { - next = ncp->nc_gcqueue; - cache_disassociate(ncp); - KASSERT(ncp->nc_dvp == NULL); - if (ncp->nc_hash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - } - if (ncp->nc_nlen > NCHNAMLEN) { - cache_dtor(NULL, ncp); - kmem_free(ncp, sizeof(*ncp) + ncp->nc_nlen); - } else - pool_cache_put(namecache_cache, ncp); - ncp = next; - } - cache_unlock_cpus(); - numcache -= items; - cache_ev_gc.ev_count += items; + if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) { + return; + } + + /* Move only a few at a time; will catch up eventually. */ + for (i = 0; i < cache_lru_maxdeact; i++) { + ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]); + if (ncp == NULL) { + break; + } + KASSERT(ncp->nc_lrulist == LRU_ACTIVE); + ncp->nc_lrulist = LRU_INACTIVE; + TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru); + TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru); + cache_lru.count[LRU_ACTIVE]--; + cache_lru.count[LRU_INACTIVE]++; + } } /* - * Cache maintainence thread, awakening once per second to: - * - * => keep number of entries below the high water mark - * => sort pseudo-LRU list - * => garbage collect dead entries + * Free some entries from the cache, when we have gone over budget. + * + * We don't want to cause too much work for any individual caller, and it + * doesn't matter if we temporarily go over budget. This is also "just a + * cache" so it's not a big deal if we screw up and throw out something we + * shouldn't. So we take a relaxed attitude to this process to reduce its + * impact. */ static void -cache_thread(void *arg) -{ - - mutex_enter(namecache_lock); - for (;;) { - cache_reclaim(); - kpause("cachegc", false, hz, namecache_lock); - } -} - -#ifdef DDB -void -namecache_print(struct vnode *vp, void (*pr)(const char *, ...)) +cache_reclaim(void) { - struct vnode *dvp = NULL; struct namecache *ncp; + vnode_impl_t *dvi; + int toscan; - TAILQ_FOREACH(ncp, &nclruhead, nc_lru) { - if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) { - (*pr)("name %.*s\n", ncp->nc_nlen, ncp->nc_name); - dvp = ncp->nc_dvp; + /* + * Scan up to a preset maxium number of entries, but no more than + * 0.8% of the total at once (to allow for very small systems). + * + * On bigger systems, do a larger chunk of work to reduce the number + * of times that cache_lru_lock is held for any length of time. 
+ */ + mutex_enter(&cache_lru_lock); + toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7); + toscan = MAX(toscan, 1); + SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] + + cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0); + while (toscan-- != 0) { + /* First try to balance the lists. */ + cache_deactivate(); + + /* Now look for a victim on head of inactive list (old). */ + ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]); + if (ncp == NULL) { + break; } - } - if (dvp == NULL) { - (*pr)("name not found\n"); - return; - } - vp = dvp; - TAILQ_FOREACH(ncp, &nclruhead, nc_lru) { - if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) { - (*pr)("parent %.*s\n", ncp->nc_nlen, ncp->nc_name); + dvi = VNODE_TO_VIMPL(ncp->nc_dvp); + KASSERT(ncp->nc_lrulist == LRU_INACTIVE); + KASSERT(dvi != NULL); + + /* + * Locking in the wrong direction. If we can't get the + * lock, the directory is actively busy, and it could also + * cause problems for the next guy in here, so send the + * entry to the back of the list. + */ + if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) { + TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE], + ncp, nc_lru); + TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], + ncp, nc_lru); + continue; } + + /* + * Now have the victim entry locked. Drop the LRU list + * lock, purge the entry, and start over. The hold on + * vi_nc_lock will prevent the vnode from vanishing until + * finished (cache_purge() will be called on dvp before it + * disappears, and that will wait on vi_nc_lock). + */ + mutex_exit(&cache_lru_lock); + cache_remove(ncp, true); + rw_exit(&dvi->vi_nc_lock); + mutex_enter(&cache_lru_lock); } + mutex_exit(&cache_lru_lock); } -#endif +/* + * For file system code: count a lookup that required a full re-scan of + * directory metadata. + */ void namecache_count_pass2(void) { - struct nchcpu *cpup = curcpu()->ci_data.cpu_nch; - COUNT_UNL(cpup, ncs_pass2); + COUNT(ncs_pass2); } +/* + * For file system code: count a lookup that scored a hit in the directory + * metadata near the location of the last lookup. + */ void namecache_count_2passes(void) { - struct nchcpu *cpup = curcpu()->ci_data.cpu_nch; - COUNT_UNL(cpup, ncs_2passes); + COUNT(ncs_2passes); } /* @@ -1283,64 +1364,79 @@ namecache_count_2passes(void) static int cache_stat_sysctl(SYSCTLFN_ARGS) { - struct nchstats stats; - struct nchcpu *my_cpup; -#ifdef CACHE_STATS_CURRENT CPU_INFO_ITERATOR cii; + struct nchstats stats; struct cpu_info *ci; -#endif /* CACHE_STATS_CURRENT */ if (oldp == NULL) { - *oldlenp = sizeof(stats); + *oldlenp = sizeof(nchstats); return 0; } - if (*oldlenp < sizeof(stats)) { + if (*oldlenp <= 0) { *oldlenp = 0; return 0; } - /* - * Take this CPU's per-cpu lock to hold off cache_reclaim() - * from doing a stats update while doing minimal damage to - * concurrent operations. 
- */ sysctl_unlock(); - my_cpup = curcpu()->ci_data.cpu_nch; - mutex_enter(&my_cpup->cpu_lock); - stats = nchstats; -#ifdef CACHE_STATS_CURRENT + memset(&stats, 0, sizeof(nchstats)); for (CPU_INFO_FOREACH(cii, ci)) { - struct nchcpu *cpup = ci->ci_data.cpu_nch; + struct nchstats_percpu *np = ci->ci_data.cpu_nch; - ADD(stats, cpup, ncs_goodhits); - ADD(stats, cpup, ncs_neghits); - ADD(stats, cpup, ncs_badhits); - ADD(stats, cpup, ncs_falsehits); - ADD(stats, cpup, ncs_miss); - ADD(stats, cpup, ncs_long); - ADD(stats, cpup, ncs_pass2); - ADD(stats, cpup, ncs_2passes); - ADD(stats, cpup, ncs_revhits); - ADD(stats, cpup, ncs_revmiss); - } -#endif /* CACHE_STATS_CURRENT */ - mutex_exit(&my_cpup->cpu_lock); + stats.ncs_goodhits += np->ncs_goodhits; + stats.ncs_neghits += np->ncs_neghits; + stats.ncs_badhits += np->ncs_badhits; + stats.ncs_falsehits += np->ncs_falsehits; + stats.ncs_miss += np->ncs_miss; + stats.ncs_long += np->ncs_long; + stats.ncs_pass2 += np->ncs_pass2; + stats.ncs_2passes += np->ncs_2passes; + stats.ncs_revhits += np->ncs_revhits; + stats.ncs_revmiss += np->ncs_revmiss; + stats.ncs_collisions += np->ncs_collisions; + stats.ncs_denied += np->ncs_denied; + } + mutex_enter(&cache_lru_lock); + memcpy(&nchstats, &stats, sizeof(nchstats)); + mutex_exit(&cache_lru_lock); sysctl_relock(); - *oldlenp = sizeof(stats); - return sysctl_copyout(l, &stats, oldp, sizeof(stats)); + *oldlenp = MIN(sizeof(stats), *oldlenp); + return sysctl_copyout(l, &stats, oldp, *oldlenp); } -static void -sysctl_cache_stat_setup(void) +/* + * For the debugger, given the address of a vnode, print all associated + * names in the cache. + */ +#ifdef DDB +void +namecache_print(struct vnode *vp, void (*pr)(const char *, ...)) { + struct vnode *dvp = NULL; + struct namecache *ncp; + enum cache_lru_id id; - KASSERT(sysctllog == NULL); - sysctl_createv(&sysctllog, 0, NULL, NULL, - CTLFLAG_PERMANENT, - CTLTYPE_STRUCT, "namecache_stats", - SYSCTL_DESCR("namecache statistics"), - cache_stat_sysctl, 0, NULL, 0, - CTL_VFS, CTL_CREATE, CTL_EOL); + for (id = 0; id < LRU_COUNT; id++) { + TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { + if (ncp->nc_vp == vp) { + (*pr)("name %.*s\n", ncp->nc_nlen, + ncp->nc_name); + dvp = ncp->nc_dvp; + } + } + } + if (dvp == NULL) { + (*pr)("name not found\n"); + return; + } + for (id = 0; id < LRU_COUNT; id++) { + TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) { + if (ncp->nc_vp == dvp) { + (*pr)("parent %.*s\n", ncp->nc_nlen, + ncp->nc_name); + } + } + } } +#endif Index: src/sys/kern/vfs_getcwd.c diff -u src/sys/kern/vfs_getcwd.c:1.55 src/sys/kern/vfs_getcwd.c:1.53.2.5 --- src/sys/kern/vfs_getcwd.c:1.55 Sun Feb 23 22:14:03 2020 +++ src/sys/kern/vfs_getcwd.c Sat Feb 29 20:21:03 2020 @@ -1,7 +1,7 @@ /* $NetBSD$ */ /*- - * Copyright (c) 1999 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -87,7 +87,7 @@ __KERNEL_RCSID(0, "$NetBSD$"); * On exit, *uvpp is either NULL or is a locked vnode reference. 
*/ static int -getcwd_scandir(struct vnode **lvpp, struct vnode **uvpp, char **bpp, +getcwd_scandir(struct vnode *lvp, struct vnode **uvpp, char **bpp, char *bufp, struct lwp *l) { int error = 0; @@ -101,12 +101,14 @@ getcwd_scandir(struct vnode **lvpp, stru ino_t fileno; struct vattr va; struct vnode *uvp = NULL; - struct vnode *lvp = *lvpp; kauth_cred_t cred = l->l_cred; struct componentname cn; int len, reclen; tries = 0; + /* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */ + KASSERT(VOP_ISLOCKED(lvp) == LK_EXCLUSIVE); + /* * If we want the filename, get some info we need while the * current directory is still locked. @@ -114,8 +116,7 @@ getcwd_scandir(struct vnode **lvpp, stru if (bufp != NULL) { error = VOP_GETATTR(lvp, &va, cred); if (error) { - vput(lvp); - *lvpp = NULL; + VOP_UNLOCK(lvp); *uvpp = NULL; return error; } @@ -134,24 +135,14 @@ getcwd_scandir(struct vnode **lvpp, stru /* At this point, lvp is locked */ error = VOP_LOOKUP(lvp, uvpp, &cn); - vput(lvp); + VOP_UNLOCK(lvp); if (error) { - *lvpp = NULL; *uvpp = NULL; return error; } uvp = *uvpp; - /* Now lvp is unlocked, try to lock uvp */ - error = vn_lock(uvp, LK_EXCLUSIVE); - if (error) { - *lvpp = NULL; - *uvpp = NULL; - return error; - } - /* If we don't care about the pathname, we're done */ if (bufp == NULL) { - *lvpp = NULL; return 0; } @@ -163,6 +154,14 @@ getcwd_scandir(struct vnode **lvpp, stru dirbuflen = va.va_blocksize; dirbuf = kmem_alloc(dirbuflen, KM_SLEEP); + /* Now lvp is unlocked, try to lock uvp */ + error = vn_lock(uvp, LK_SHARED); + if (error) { + vrele(uvp); + *uvpp = NULL; + return error; + } + #if 0 unionread: #endif @@ -254,73 +253,21 @@ unionread: vput(tvp); vref(uvp); *uvpp = uvp; - vn_lock(uvp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(uvp, LK_SHARED | LK_RETRY); goto unionread; } #endif error = ENOENT; out: - *lvpp = NULL; + VOP_UNLOCK(uvp); kmem_free(dirbuf, dirbuflen); return error; } /* - * Look in the vnode-to-name reverse cache to see if - * we can find things the easy way. - * - * XXX vget failure path is untested. - * - * On entry, *lvpp is a locked vnode reference. - * On exit, one of the following is the case: - * 0) Both *lvpp and *uvpp are NULL and failure is returned. - * 1) *uvpp is NULL, *lvpp remains locked and -1 is returned (cache miss) - * 2) *uvpp is a locked vnode reference, *lvpp is vput and NULL'ed - * and 0 is returned (cache hit) - */ - -static int -getcwd_getcache(struct vnode **lvpp, struct vnode **uvpp, char **bpp, - char *bufp) -{ - struct vnode *lvp, *uvp = NULL; - int error; - - lvp = *lvpp; - - /* - * This returns 0 on a cache hit, -1 on a clean cache miss, - * or an errno on other failure. - */ - error = cache_revlookup(lvp, uvpp, bpp, bufp); - if (error) { - if (error != -1) { - vput(lvp); - *lvpp = NULL; - *uvpp = NULL; - } - return error; - } - uvp = *uvpp; - - /* - * Since we're going up, we have to release the current lock - * before we take the parent lock. - */ - - VOP_UNLOCK(lvp); - vn_lock(uvp, LK_EXCLUSIVE | LK_RETRY); - vrele(lvp); - *lvpp = NULL; - - return error; -} - -/* * common routine shared by sys___getcwd() and vn_isunder() */ - int getcwd_common(struct vnode *lvp, struct vnode *rvp, char **bpp, char *bufp, int limit, int flags, struct lwp *l) @@ -345,11 +292,10 @@ getcwd_common(struct vnode *lvp, struct /* * Error handling invariant: * Before a `goto out': - * lvp is either NULL, or locked and held. - * uvp is either NULL, or locked and held. + * lvp is either NULL, or held. + * uvp is either NULL, or held. 
*/ - vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); if (bufp) bp = *bpp; @@ -369,63 +315,93 @@ getcwd_common(struct vnode *lvp, struct * access check here is optional, depending on * whether or not caller cares. */ - if (flags & GETCWD_CHECK_ACCESS) { - error = VOP_ACCESS(lvp, perms, cred); - if (error) - goto out; - perms = VEXEC|VREAD; - } + int chkaccess = (flags & GETCWD_CHECK_ACCESS); + bool locked = false; /* * step up if we're a covered vnode.. + * check access on the first vnode only. */ - while (lvp->v_vflag & VV_ROOT) { - struct vnode *tvp; + if (lvp->v_vflag & VV_ROOT) { + vn_lock(lvp, LK_SHARED | LK_RETRY); + if (chkaccess) { + error = VOP_ACCESS(lvp, perms, cred); + if (error) { + VOP_UNLOCK(lvp); + goto out; + } + chkaccess = 0; + } + while (lvp->v_vflag & VV_ROOT) { + struct vnode *tvp; - if (lvp == rvp) - goto out; + if (lvp == rvp) { + VOP_UNLOCK(lvp); + goto out; + } - tvp = lvp; - lvp = lvp->v_mount->mnt_vnodecovered; - vput(tvp); - /* - * hodie natus est radici frater - */ - if (lvp == NULL) { - error = ENOENT; - goto out; + tvp = lvp->v_mount->mnt_vnodecovered; + /* + * hodie natus est radici frater + */ + if (tvp == NULL) { + VOP_UNLOCK(lvp); + error = ENOENT; + goto out; + } + vref(tvp); + vput(lvp); + lvp = tvp; + if (lvp->v_vflag & VV_ROOT) + vn_lock(lvp, LK_SHARED | LK_RETRY); } - vref(lvp); - error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); - if (error != 0) { - vrele(lvp); - lvp = NULL; + } + + /* Do we need to check access to the directory? */ + if (chkaccess && !cache_have_id(lvp)) { + /* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */ + vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_ACCESS(lvp, perms, cred); + if (error) { + VOP_UNLOCK(lvp); goto out; } + chkaccess = 0; + locked = true; } + /* * Look in the name cache; if that fails, look in the * directory.. */ - error = getcwd_getcache(&lvp, &uvp, &bp, bufp); + error = cache_revlookup(lvp, &uvp, &bp, bufp, chkaccess, + perms); if (error == -1) { + if (!locked) { + locked = true; + vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); + } if (lvp->v_type != VDIR) { + VOP_UNLOCK(lvp); error = ENOTDIR; goto out; } - error = getcwd_scandir(&lvp, &uvp, &bp, bufp, l); + error = getcwd_scandir(lvp, &uvp, &bp, bufp, l); + /* lvp now unlocked */ + } else if (locked) { + VOP_UNLOCK(lvp); } if (error) goto out; #if DIAGNOSTIC - if (lvp != NULL) - panic("getcwd: oops, forgot to null lvp"); if (bufp && (bp <= bufp)) { panic("getcwd: oops, went back too far"); } #endif + perms = VEXEC | VREAD; if (bp) *(--bp) = '/'; + vrele(lvp); lvp = uvp; uvp = NULL; limit--; @@ -435,9 +411,9 @@ out: if (bpp) *bpp = bp; if (uvp) - vput(uvp); + vrele(uvp); if (lvp) - vput(lvp); + vrele(lvp); vrele(rvp); return error; } @@ -556,11 +532,7 @@ vnode_to_path(char *path, size_t len, st bp = bend = &path[len]; *(--bp) = '\0'; - error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if (error != 0) - return error; - error = cache_revlookup(vp, &dvp, &bp, path); - VOP_UNLOCK(vp); + error = cache_revlookup(vp, &dvp, &bp, path, false, 0); if (error != 0) return (error == -1 ? 
ENOENT : error); Index: src/sys/kern/vfs_lookup.c diff -u src/sys/kern/vfs_lookup.c:1.214 src/sys/kern/vfs_lookup.c:1.212.4.11 --- src/sys/kern/vfs_lookup.c:1.214 Sun Feb 23 22:14:03 2020 +++ src/sys/kern/vfs_lookup.c Tue Mar 3 22:30:57 2020 @@ -50,6 +50,7 @@ __KERNEL_RCSID(0, "$NetBSD$"); #include #include #include +#include #include #include #include @@ -709,8 +710,6 @@ namei_start(struct namei_state *state, i return ENOTDIR; } - vn_lock(startdir, LK_EXCLUSIVE | LK_RETRY); - *startdir_ret = startdir; return 0; } @@ -748,15 +747,17 @@ namei_follow(struct namei_state *state, size_t linklen; int error; - KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); - KASSERT(VOP_ISLOCKED(foundobj) == LK_EXCLUSIVE); if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { return ELOOP; } + + vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) { error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred); - if (error != 0) + if (error != 0) { + VOP_UNLOCK(foundobj); return error; + } } /* FUTURE: fix this to not use a second buffer */ @@ -770,6 +771,7 @@ namei_follow(struct namei_state *state, auio.uio_resid = MAXPATHLEN; UIO_SETUP_SYSSPACE(&auio); error = VOP_READLINK(foundobj, &auio, cnp->cn_cred); + VOP_UNLOCK(foundobj); if (error) { PNBUF_PUT(cp); return error; @@ -806,14 +808,11 @@ namei_follow(struct namei_state *state, /* we're now starting from the beginning of the buffer again */ cnp->cn_nameptr = ndp->ni_pnbuf; - /* must unlock this before relocking searchdir */ - VOP_UNLOCK(foundobj); - /* * Check if root directory should replace current directory. */ if (ndp->ni_pnbuf[0] == '/') { - vput(searchdir); + vrele(searchdir); /* Keep absolute symbolic links inside emulation root */ searchdir = ndp->ni_erootdir; if (searchdir == NULL || @@ -824,7 +823,6 @@ namei_follow(struct namei_state *state, searchdir = ndp->ni_rootdir; } vref(searchdir); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); while (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; @@ -832,7 +830,6 @@ namei_follow(struct namei_state *state, } *newsearchdir_ret = searchdir; - KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); return 0; } @@ -860,7 +857,7 @@ lookup_parsepath(struct namei_state *sta * responsibility for freeing the pathname buffer. * * At this point, our only vnode state is that the search dir - * is held and locked. + * is held. */ cnp->cn_consume = 0; cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr); @@ -917,6 +914,111 @@ lookup_parsepath(struct namei_state *sta } /* + * Take care of crossing a mounted-on vnode. On error, foundobj_ret will be + * vrele'd, but searchdir is left alone. + */ +static int +lookup_crossmount(struct namei_state *state, + struct vnode **searchdir_ret, + struct vnode **foundobj_ret, + bool *searchdir_locked) +{ + struct componentname *cnp = state->cnp; + struct vnode *foundobj; + struct vnode *searchdir; + struct mount *mp; + int error, lktype; + + searchdir = *searchdir_ret; + foundobj = *foundobj_ret; + error = 0; + + KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0); + KASSERT(searchdir != NULL); + + /* First, unlock searchdir (oof). */ + if (*searchdir_locked) { + lktype = VOP_ISLOCKED(searchdir); + VOP_UNLOCK(searchdir); + *searchdir_locked = false; + } else { + lktype = LK_NONE; + } + + /* + * Do an unlocked check to see if the vnode has been mounted on; if + * so find the root of the mounted file system. 
+ */ + while (foundobj->v_type == VDIR && + (mp = foundobj->v_mountedhere) != NULL && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + KASSERTMSG(searchdir != foundobj, "same vn %p", searchdir); + /* + * First get the vnode stable. LK_SHARED works brilliantly + * here because almost nothing else wants to lock the + * covered vnode. + */ + error = vn_lock(foundobj, LK_SHARED); + if (error != 0) { + vrele(foundobj); + *foundobj_ret = NULL; + break; + } + + /* Then check to see if something is still mounted on it. */ + if ((mp = foundobj->v_mountedhere) == NULL) { + VOP_UNLOCK(foundobj); + break; + } + + /* Get a reference to the mountpoint, and ditch foundobj. */ + error = vfs_busy(mp); + vput(foundobj); + if (error != 0) { + *foundobj_ret = NULL; + break; + } + + /* Now get a reference on the root vnode, and drop mount. */ + error = VFS_ROOT(mp, LK_NONE, &foundobj); + vfs_unbusy(mp); + if (error) { + *foundobj_ret = NULL; + break; + } + + /* + * Avoid locking vnodes from two filesystems because + * it's prone to deadlock, e.g. when using puffs. + * Also, it isn't a good idea to propagate slowness of + * a filesystem up to the root directory. For now, + * only handle the common case, where foundobj is + * VDIR. + * + * In this case set searchdir to null to avoid using + * it again. It is not correct to set searchdir == + * foundobj here as that will confuse the caller. + * (See PR 40740.) + */ + if (searchdir == NULL) { + /* already been here once; do nothing further */ + } else if (foundobj->v_type == VDIR) { + vrele(searchdir); + *searchdir_ret = searchdir = NULL; + *foundobj_ret = foundobj; + lktype = LK_NONE; + } + } + + /* If searchdir is still around, re-lock it. */ + if (error == 0 && lktype != LK_NONE) { + vn_lock(searchdir, lktype | LK_RETRY); + *searchdir_locked = true; + } + return error; +} + +/* * Call VOP_LOOKUP for a single lookup; return a new search directory * (used when crossing mountpoints up or searching union mounts down) and * the found object, which for create operations may be NULL on success. @@ -932,19 +1034,19 @@ static int lookup_once(struct namei_state *state, struct vnode *searchdir, struct vnode **newsearchdir_ret, - struct vnode **foundobj_ret) + struct vnode **foundobj_ret, + bool *newsearchdir_locked_ret) { struct vnode *tmpvn; /* scratch vnode */ struct vnode *foundobj; /* result */ - struct mount *mp; /* mount table entry */ struct lwp *l = curlwp; - int error; + bool searchdir_locked = false; + int error, lktype; struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; KASSERT(cnp == &ndp->ni_cnd); - KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); *newsearchdir_ret = searchdir; /* @@ -976,9 +1078,7 @@ lookup_once(struct namei_state *state, if (ndp->ni_rootdir != rootvnode) { int retval; - VOP_UNLOCK(searchdir); retval = vn_isunder(searchdir, ndp->ni_rootdir, l); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); if (!retval) { /* Oops! We got out of jail! */ log(LOG_WARNING, @@ -987,12 +1087,11 @@ lookup_once(struct namei_state *state, p->p_pid, kauth_cred_geteuid(l->l_cred), p->p_comm); /* Put us at the jail root. 
*/ - vput(searchdir); + vrele(searchdir); searchdir = NULL; foundobj = ndp->ni_rootdir; vref(foundobj); vref(foundobj); - vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); *newsearchdir_ret = foundobj; *foundobj_ret = foundobj; error = 0; @@ -1005,18 +1104,35 @@ lookup_once(struct namei_state *state, tmpvn = searchdir; searchdir = searchdir->v_mount->mnt_vnodecovered; vref(searchdir); - vput(tmpvn); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); + vrele(tmpvn); *newsearchdir_ret = searchdir; } } /* + * If the file system supports VOP_LOOKUP() with a shared lock, and + * we are not making any modifications (nameiop LOOKUP) or this is + * not the last component then get a shared lock. Where we can't do + * fast-forwarded lookups (for example with layered file systems) + * then this is the fallback for reducing lock contention. + */ + if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 && + (cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) { + lktype = LK_SHARED; + } else { + lktype = LK_EXCLUSIVE; + } + + /* * We now have a segment name to search for, and a directory to search. - * Our vnode state here is that "searchdir" is held and locked. + * Our vnode state here is that "searchdir" is held. */ unionlookup: foundobj = NULL; + if (!searchdir_locked) { + vn_lock(searchdir, lktype | LK_RETRY); + searchdir_locked = true; + } error = VOP_LOOKUP(searchdir, &foundobj, cnp); if (error != 0) { @@ -1026,6 +1142,23 @@ unionlookup: #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif /* NAMEI_DIAGNOSTIC */ + + /* + * If ENOLCK, the file system needs us to retry the lookup + * with an exclusive lock. It's likely nothing was found in + * cache and/or modifications need to be made. + */ + if (error == ENOLCK) { + KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED); + KASSERT(searchdir_locked); + if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) { + VOP_UNLOCK(searchdir); + searchdir_locked = false; + } + lktype = LK_EXCLUSIVE; + goto unionlookup; + } + if ((error == ENOENT) && (searchdir->v_vflag & VV_ROOT) && (searchdir->v_mount->mnt_flag & MNT_UNION)) { @@ -1033,7 +1166,7 @@ unionlookup: searchdir = searchdir->v_mount->mnt_vnodecovered; vref(searchdir); vput(tmpvn); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); + searchdir_locked = false; *newsearchdir_ret = searchdir; goto unionlookup; } @@ -1087,85 +1220,184 @@ unionlookup: cnp->cn_flags |= ISLASTCN; } - /* - * "searchdir" is locked and held, "foundobj" is held, - * they may be the same vnode. - */ - if (searchdir != foundobj) { - if (cnp->cn_flags & ISDOTDOT) - VOP_UNLOCK(searchdir); - error = vn_lock(foundobj, LK_EXCLUSIVE); - if (cnp->cn_flags & ISDOTDOT) - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); - if (error != 0) { - vrele(foundobj); - goto done; + /* Unlock, unless the caller needs the parent locked. */ + if (searchdir != NULL) { + KASSERT(searchdir_locked); + if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) != + (ISLASTCN | LOCKPARENT)) { + VOP_UNLOCK(searchdir); + searchdir_locked = false; } + } else { + KASSERT(!searchdir_locked); } - /* - * Check to see if the vnode has been mounted on; - * if so find the root of the mounted file system. - */ - KASSERT(searchdir != NULL); - while (foundobj->v_type == VDIR && - (mp = foundobj->v_mountedhere) != NULL && - (cnp->cn_flags & NOCROSSMOUNT) == 0) { + *foundobj_ret = foundobj; + error = 0; +done: + *newsearchdir_locked_ret = searchdir_locked; + return error; +} - KASSERT(searchdir != foundobj); +/* + * Parse out the first path name component that we need to to consider. 
+ * + * While doing this, attempt to use the name cache to fast-forward through + * as many "easy" to find components of the path as possible. + * + * We use the namecache's node locks to form a chain, and avoid as many + * vnode references and locks as possible. In the ideal case, only the + * final vnode will have its reference count adjusted and lock taken. + */ +static int +lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret, + struct vnode **foundobj_ret) +{ + struct componentname *cnp = state->cnp; + struct nameidata *ndp = state->ndp; + krwlock_t *plock; + struct vnode *foundobj, *searchdir; + int error, error2; + size_t oldpathlen; + const char *oldnameptr; - error = vfs_busy(mp); - if (error != 0) { - vput(foundobj); - goto done; + /* + * Eat as many path name components as possible before giving up and + * letting lookup_once() handle it. Remember the starting point in + * case we can't get vnode references and need to roll back. + */ + plock = NULL; + searchdir = *searchdir_ret; + oldnameptr = cnp->cn_nameptr; + oldpathlen = ndp->ni_pathlen; + for (;;) { + foundobj = NULL; + + /* + * Get the next component name. There should be no slashes + * here, and we shouldn't have looped around if we were + * done. + */ + KASSERT(cnp->cn_nameptr[0] != '/'); + KASSERT(cnp->cn_nameptr[0] != '\0'); + if ((error = lookup_parsepath(state)) != 0) { + break; } - if (searchdir != NULL) { - VOP_UNLOCK(searchdir); + + /* + * Can't deal with dotdot lookups, because it means lock + * order reversal, and there are checks in lookup_once() + * that need to be made. Also check for missing mountpoints. + */ + if ((cnp->cn_flags & ISDOTDOT) != 0 || + searchdir->v_mount == NULL) { + error = EOPNOTSUPP; + break; } - vput(foundobj); - error = VFS_ROOT(mp, LK_EXCLUSIVE, &foundobj); - vfs_unbusy(mp); - if (error) { - if (searchdir != NULL) { - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); + + /* + * Can't deal with last component when modifying; this needs + * searchdir locked and VOP_LOOKUP() called (which can and + * does modify state, despite the name). + */ + if ((cnp->cn_flags & ISLASTCN) != 0) { + if (cnp->cn_nameiop != LOOKUP || + (cnp->cn_flags & LOCKPARENT) != 0) { + error = EOPNOTSUPP; + break; } - goto done; } + + /* Can't deal with -o union lookups. */ + if ((searchdir->v_vflag & VV_ROOT) != 0 && + (searchdir->v_mount->mnt_flag & MNT_UNION) != 0) { + error = EOPNOTSUPP; + break; + } + /* - * Avoid locking vnodes from two filesystems because - * it's prone to deadlock, e.g. when using puffs. - * Also, it isn't a good idea to propagate slowness of - * a filesystem up to the root directory. For now, - * only handle the common case, where foundobj is - * VDIR. + * Good, now look for it in cache. cache_lookup_linked() + * will fail if there's nothing there, or if there's no + * ownership info for the directory, or if the user doesn't + * have permission to look up files in this directory. + */ + if (!cache_lookup_linked(searchdir, cnp->cn_nameptr, + cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) { + error = EOPNOTSUPP; + break; + } + KASSERT(plock != NULL && rw_lock_held(plock)); + + /* Scored a hit. Negative is good too (ENOENT). */ + if (foundobj == NULL) { + error = ENOENT; + break; + } + + /* + * Stop and get a hold on the vnode if there's something + * that can't be handled here: * - * In this case set searchdir to null to avoid using - * it again. It is not correct to set searchdir == - * foundobj here as that will confuse the caller. - * (See PR 40740.) 
+ * - we've reached the last component. + * - or encountered a mount point that needs to be crossed. + * - or encountered something other than a directory. */ - if (searchdir == NULL) { - /* already been here once; do nothing further */ - } else if (foundobj->v_type == VDIR) { - vrele(searchdir); - *newsearchdir_ret = searchdir = NULL; + if ((cnp->cn_flags & ISLASTCN) != 0 || + foundobj->v_type != VDIR || + (foundobj->v_type == VDIR && + foundobj->v_mountedhere != NULL)) { + mutex_enter(foundobj->v_interlock); + error = vcache_tryvget(foundobj); + /* v_interlock now released */ + if (error != 0) { + foundobj = NULL; + } + break; + } + + /* + * Otherwise, we're still in business. Set the found VDIR + * vnode as the search dir for the next component and + * continue on to it. + */ + cnp->cn_nameptr = ndp->ni_next; + searchdir = foundobj; + } + + /* + * If we ended up with a new search dir, ref it before dropping the + * namecache's lock. The lock prevents both searchdir and foundobj + * from disappearing. If we can't ref the new searchdir, we have a + * bit of a problem. Roll back the fastforward to the beginning and + * let lookup_once() take care of it. + */ + if (searchdir != *searchdir_ret) { + mutex_enter(searchdir->v_interlock); + error2 = vcache_tryvget(searchdir); + /* v_interlock now unheld */ + KASSERT(plock != NULL); + rw_exit(plock); + if (__predict_true(error2 == 0)) { + /* Returning new searchdir, and maybe new foundobj. */ + vrele(*searchdir_ret); + *searchdir_ret = searchdir; } else { - VOP_UNLOCK(foundobj); - vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY); - vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); + /* Returning nothing. */ + if (foundobj != NULL) { + vrele(foundobj); + foundobj = NULL; + } + cnp->cn_nameptr = oldnameptr; + ndp->ni_pathlen = oldpathlen; + error = lookup_parsepath(state); } + } else if (plock != NULL) { + /* Drop any namecache lock still held. */ + rw_exit(plock); } + KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL); *foundobj_ret = foundobj; - error = 0; -done: - KASSERT(*newsearchdir_ret == NULL || - VOP_ISLOCKED(*newsearchdir_ret) == LK_EXCLUSIVE); - /* - * *foundobj_ret is valid only if error == 0. - */ - KASSERT(error != 0 || *foundobj_ret == NULL || - VOP_ISLOCKED(*foundobj_ret) == LK_EXCLUSIVE); return error; } @@ -1182,6 +1414,7 @@ namei_oneroot(struct namei_state *state, struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct vnode *searchdir, *foundobj; + bool searchdir_locked = false; int error; error = namei_start(state, isnfsd, &searchdir); @@ -1222,44 +1455,47 @@ namei_oneroot(struct namei_state *state, for (;;) { KASSERT(searchdir != NULL); - KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); + KASSERT(!searchdir_locked); /* - * If the directory we're on is unmounted, bail out. - * XXX: should this also check if it's unlinked? - * XXX: yes it should... but how? + * Parse out the first path name component that we need to + * to consider. While doing this, attempt to use the name + * cache to fast-forward through as many "easy" to find + * components of the path as possible. */ - if (searchdir->v_mount == NULL) { - vput(searchdir); - ndp->ni_dvp = NULL; - ndp->ni_vp = NULL; - return (ENOENT); - } + error = lookup_fastforward(state, &searchdir, &foundobj); /* - * Look up the next path component. - * (currently, this may consume more than one) + * If we didn't get a good answer from the namecache, then + * go directly to the file system. 
*/ + if (error != 0 && error != ENOENT) { + error = lookup_once(state, searchdir, &searchdir, + &foundobj, &searchdir_locked); + } - /* There should be no slashes here. */ - KASSERT(cnp->cn_nameptr[0] != '/'); - - /* and we shouldn't have looped around if we were done */ - KASSERT(cnp->cn_nameptr[0] != '\0'); - - error = lookup_parsepath(state); - if (error) { - vput(searchdir); - ndp->ni_dvp = NULL; - ndp->ni_vp = NULL; - state->attempt_retry = 1; - return (error); + /* + * If the vnode we found is mounted on, then cross the mount + * and get the root vnode in foundobj. If this encounters + * an error, it will dispose of foundobj, but searchdir is + * untouched. + */ + if (error == 0 && foundobj != NULL && + foundobj->v_type == VDIR && + foundobj->v_mountedhere != NULL && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + error = lookup_crossmount(state, &searchdir, + &foundobj, &searchdir_locked); } - error = lookup_once(state, searchdir, &searchdir, &foundobj); if (error) { if (searchdir != NULL) { - vput(searchdir); + if (searchdir_locked) { + searchdir_locked = false; + vput(searchdir); + } else { + vrele(searchdir); + } } ndp->ni_dvp = NULL; ndp->ni_vp = NULL; @@ -1296,6 +1532,11 @@ namei_oneroot(struct namei_state *state, * them again. */ if (namei_atsymlink(state, foundobj)) { + /* Don't need searchdir locked any more. */ + if (searchdir_locked) { + searchdir_locked = false; + VOP_UNLOCK(searchdir); + } ndp->ni_pathlen += state->slashes; ndp->ni_next -= state->slashes; if (neverfollow) { @@ -1337,14 +1578,13 @@ namei_oneroot(struct namei_state *state, if (error) { KASSERT(searchdir != foundobj); if (searchdir != NULL) { - vput(searchdir); + vrele(searchdir); } - vput(foundobj); + vrele(foundobj); ndp->ni_dvp = NULL; ndp->ni_vp = NULL; return error; } - /* namei_follow unlocks it (ugh) so rele, not put */ vrele(foundobj); foundobj = NULL; @@ -1375,9 +1615,16 @@ namei_oneroot(struct namei_state *state, (cnp->cn_flags & REQUIREDIR)) { KASSERT(foundobj != searchdir); if (searchdir) { - vput(searchdir); + if (searchdir_locked) { + searchdir_locked = false; + vput(searchdir); + } else { + vrele(searchdir); + } + } else { + KASSERT(!searchdir_locked); } - vput(foundobj); + vrele(foundobj); ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; @@ -1395,15 +1642,21 @@ namei_oneroot(struct namei_state *state, * Continue with the next component. */ cnp->cn_nameptr = ndp->ni_next; - if (searchdir == foundobj) { - vrele(searchdir); - } else if (searchdir != NULL) { - vput(searchdir); + if (searchdir != NULL) { + if (searchdir_locked) { + searchdir_locked = false; + vput(searchdir); + } else { + vrele(searchdir); + } } searchdir = foundobj; foundobj = NULL; } + KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL || + VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); + skiploop: if (foundobj != NULL) { @@ -1416,16 +1669,17 @@ namei_oneroot(struct namei_state *state, * forever. So convert it to the real root. 
*/ if (searchdir != NULL) { - if (searchdir == foundobj) - vrele(searchdir); - else + if (searchdir_locked) { vput(searchdir); + searchdir_locked = false; + } else { + vrele(searchdir); + } searchdir = NULL; } - vput(foundobj); + vrele(foundobj); foundobj = ndp->ni_rootdir; vref(foundobj); - vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); } /* @@ -1438,9 +1692,15 @@ namei_oneroot(struct namei_state *state, (searchdir == NULL || searchdir->v_mount != foundobj->v_mount)) { if (searchdir) { - vput(searchdir); + if (searchdir_locked) { + vput(searchdir); + searchdir_locked = false; + } else { + vrele(searchdir); + } + searchdir = NULL; } - vput(foundobj); + vrele(foundobj); foundobj = NULL; ndp->ni_dvp = NULL; ndp->ni_vp = NULL; @@ -1465,21 +1725,25 @@ namei_oneroot(struct namei_state *state, if (state->rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { if (searchdir) { - if (foundobj != searchdir) { + if (searchdir_locked) { vput(searchdir); + searchdir_locked = false; } else { vrele(searchdir); } searchdir = NULL; } - vput(foundobj); + vrele(foundobj); foundobj = NULL; ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; return EROFS; } - if ((cnp->cn_flags & LOCKLEAF) == 0) { + + /* Lock the leaf node if requested. */ + if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT && + searchdir == foundobj) { /* * Note: if LOCKPARENT but not LOCKLEAF is * set, and searchdir == foundobj, this code @@ -1491,7 +1755,15 @@ namei_oneroot(struct namei_state *state, * that uses this combination "knows" this, so * it can't be safely changed. Feh. XXX */ - VOP_UNLOCK(foundobj); + KASSERT(searchdir_locked); + VOP_UNLOCK(searchdir); + searchdir_locked = false; + } else if ((cnp->cn_flags & LOCKLEAF) != 0 && + (searchdir != foundobj || + (cnp->cn_flags & LOCKPARENT) == 0)) { + const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ? + LK_SHARED : LK_EXCLUSIVE; + vn_lock(foundobj, lktype | LK_RETRY); } } @@ -1503,11 +1775,7 @@ namei_oneroot(struct namei_state *state, * If LOCKPARENT is not set, the parent directory isn't returned. */ if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) { - if (searchdir == foundobj) { - vrele(searchdir); - } else { - vput(searchdir); - } + vrele(searchdir); searchdir = NULL; } @@ -1649,6 +1917,7 @@ do_lookup_for_nfsd_index(struct namei_st struct nameidata *ndp = state->ndp; struct vnode *startdir; struct vnode *foundobj; + bool startdir_locked; const char *cp; /* pointer into pathname argument */ KASSERT(cnp == &ndp->ni_cnd); @@ -1681,30 +1950,37 @@ do_lookup_for_nfsd_index(struct namei_st * own reference to it to avoid consuming the caller's. */ vref(startdir); - vn_lock(startdir, LK_EXCLUSIVE | LK_RETRY); - error = lookup_once(state, startdir, &startdir, &foundobj); - if (error == 0 && startdir == foundobj) { - vrele(startdir); - } else if (startdir != NULL) { - vput(startdir); - } - if (error) { - goto bad; - } - ndp->ni_vp = foundobj; + error = lookup_once(state, startdir, &startdir, &foundobj, + &startdir_locked); - if (foundobj == NULL) { - return 0; + KASSERT((cnp->cn_flags & LOCKPARENT) == 0); + if (startdir_locked) { + VOP_UNLOCK(startdir); + startdir_locked = false; } - KASSERT((cnp->cn_flags & LOCKPARENT) == 0); - if ((cnp->cn_flags & LOCKLEAF) == 0) { - VOP_UNLOCK(foundobj); + /* + * If the vnode we found is mounted on, then cross the mount and get + * the root vnode in foundobj. If this encounters an error, it will + * dispose of foundobj, but searchdir is untouched. 
+ */ + if (error == 0 && foundobj != NULL && + foundobj->v_type == VDIR && + foundobj->v_mountedhere != NULL && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + error = lookup_crossmount(state, &startdir, &foundobj, + &startdir_locked); } - return (0); -bad: - ndp->ni_vp = NULL; + /* Now toss startdir and see if we have an error. */ + if (startdir != NULL) + vrele(startdir); + if (error) + foundobj = NULL; + else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0) + vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); + + ndp->ni_vp = foundobj; return (error); } Index: src/sys/kern/vfs_syscalls.c diff -u src/sys/kern/vfs_syscalls.c:1.542 src/sys/kern/vfs_syscalls.c:1.539.2.4 --- src/sys/kern/vfs_syscalls.c:1.542 Sun Feb 23 22:14:04 2020 +++ src/sys/kern/vfs_syscalls.c Sat Feb 29 20:21:03 2020 @@ -1528,7 +1528,7 @@ chdir_lookup(const char *path, int where if (error) { return error; } - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb); if ((error = namei(&nd)) != 0) { pathbuf_destroy(pb); return error; @@ -2994,7 +2994,7 @@ do_sys_accessat(struct lwp *l, int fdat, return EINVAL; } - nd_flag = FOLLOW | LOCKLEAF | TRYEMULROOT; + nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT; if (flags & AT_SYMLINK_NOFOLLOW) nd_flag &= ~FOLLOW; @@ -3220,7 +3220,7 @@ do_sys_readlinkat(struct lwp *l, int fda if (error) { return error; } - NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb); + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb); if ((error = fd_nameiat(l, fdat, &nd)) != 0) { pathbuf_destroy(pb); return error; @@ -4691,7 +4691,7 @@ dorevoke(struct vnode *vp, kauth_cred_t struct vattr vattr; int error, fs_decision; - vn_lock(vp, LK_SHARED | LK_RETRY); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); if (error != 0) Index: src/sys/kern/vfs_vnode.c diff -u src/sys/kern/vfs_vnode.c:1.113 src/sys/kern/vfs_vnode.c:1.105.2.9 --- src/sys/kern/vfs_vnode.c:1.113 Thu Feb 27 22:12:54 2020 +++ src/sys/kern/vfs_vnode.c Sat Feb 29 20:21:03 2020 @@ -119,8 +119,7 @@ * Vnode finished disassociation from underlying file * system in vcache_reclaim(). * LOADED -> BLOCKED - * Either vcache_rekey*() is changing the vnode key or - * vrelel() is about to call VOP_INACTIVE(). + * vcache_rekey*() is changing the vnode key. * BLOCKED -> LOADED * The block condition is over. * LOADING -> RECLAIMED @@ -828,25 +827,23 @@ vrelel(vnode_t *vp, int flags, int lktyp if (VSTATE_GET(vp) == VS_RECLAIMED) { VOP_UNLOCK(vp); } else { - VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); - mutex_exit(vp->v_interlock); - /* - * The vnode must not gain another reference while being - * deactivated. If VOP_INACTIVE() indicates that - * the described file has been deleted, then recycle - * the vnode. + * If VOP_INACTIVE() indicates that the described file has + * been deleted, then recycle the vnode. Note that + * VOP_INACTIVE() will not drop the vnode lock. * - * Note that VOP_INACTIVE() will not drop the vnode lock. + * If the file has been deleted, this is a lingering + * reference and there is no need to worry about new + * references looking to do real work with the vnode (as it + * will have been purged from directories, caches, etc). 
*/ recycle = false; + mutex_exit(vp->v_interlock); VOP_INACTIVE(vp, &recycle); - if (!recycle) - VOP_UNLOCK(vp); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); mutex_enter(vp->v_interlock); - VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); if (!recycle) { + VOP_UNLOCK(vp); if (vtryrele(vp)) { mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); @@ -1228,12 +1225,9 @@ vcache_alloc(void) rw_init(&vip->vi_lock); vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); - /* SLIST_INIT(&vip->vi_hash); */ - TAILQ_INIT(&vip->vi_nclist); - /* LIST_INIT(&vip->vi_dnclist); */ - uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1); cv_init(&vp->v_cv, "vnode"); + cache_vnode_init(vp); vp->v_usecount = 1; vp->v_type = VNON; @@ -1294,6 +1288,7 @@ vcache_free(vnode_impl_t *vip) rw_destroy(&vip->vi_lock); uvm_obj_destroy(&vp->v_uobj, true); cv_destroy(&vp->v_cv); + cache_vnode_fini(vp); pool_cache_put(vcache_pool, vip); } @@ -1681,6 +1676,13 @@ vcache_reclaim(vnode_t *vp) mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); + /* + * With vnode state set to reclaiming, purge name cache immediately + * to prevent new handles on vnode, and wait for existing threads + * trying to get a handle to notice VS_RECLAIMED status and abort. + */ + cache_purge(vp); + /* Replace the vnode key with a temporary copy. */ if (vip->vi_key.vk_key_len > sizeof(temp_buf)) { temp_key = kmem_alloc(temp_key_len, KM_SLEEP); @@ -1734,9 +1736,6 @@ vcache_reclaim(vnode_t *vp) vp->v_ractx = NULL; } - /* Purge name cache. */ - cache_purge(vp); - if (vip->vi_key.vk_key_len > 0) { /* Remove from vnode cache. */ hash = vcache_hash(&vip->vi_key); Index: src/sys/miscfs/genfs/layer_vnops.c diff -u src/sys/miscfs/genfs/layer_vnops.c:1.68 src/sys/miscfs/genfs/layer_vnops.c:1.67.12.2 --- src/sys/miscfs/genfs/layer_vnops.c:1.68 Sun Feb 23 15:46:41 2020 +++ src/sys/miscfs/genfs/layer_vnops.c Sat Feb 29 20:21:04 2020 @@ -384,6 +384,7 @@ layer_lookup(void *v) vrele(lvp); } else if (lvp != NULL) { /* Note: dvp and ldvp are both locked. */ + KASSERT(error != ENOLCK); error = layer_node_create(dvp->v_mount, lvp, ap->a_vpp); if (error) { vrele(lvp); Index: src/sys/miscfs/nullfs/null_vfsops.c diff -u src/sys/miscfs/nullfs/null_vfsops.c:1.96 src/sys/miscfs/nullfs/null_vfsops.c:1.96.2.2 --- src/sys/miscfs/nullfs/null_vfsops.c:1.96 Sun Dec 15 20:30:56 2019 +++ src/sys/miscfs/nullfs/null_vfsops.c Wed Jan 22 12:04:36 2020 @@ -141,6 +141,7 @@ nullfs_mount(struct mount *mp, const cha nmp = kmem_zalloc(sizeof(struct null_mount), KM_SLEEP); mp->mnt_data = nmp; mp->mnt_iflag |= IMNT_MPSAFE; + mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_SHRLOOKUP; /* * Make sure that the mount point is sufficiently initialized Index: src/sys/miscfs/procfs/procfs_vfsops.c diff -u src/sys/miscfs/procfs/procfs_vfsops.c:1.102 src/sys/miscfs/procfs/procfs_vfsops.c:1.101.6.2 --- src/sys/miscfs/procfs/procfs_vfsops.c:1.102 Fri Jan 17 20:08:09 2020 +++ src/sys/miscfs/procfs/procfs_vfsops.c Sun Jan 19 21:21:55 2020 @@ -173,7 +173,7 @@ procfs_mount( else pmnt->pmnt_flags = 0; - mp->mnt_iflag |= IMNT_MPSAFE; + mp->mnt_iflag |= IMNT_MPSAFE | IMNT_SHRLOOKUP; return error; } Index: src/sys/sys/namei.src diff -u src/sys/sys/namei.src:1.48 src/sys/sys/namei.src:1.47.2.7 --- src/sys/sys/namei.src:1.48 Wed Jan 8 12:04:56 2020 +++ src/sys/sys/namei.src Wed Mar 4 20:21:05 2020 @@ -39,6 +39,7 @@ #ifdef _KERNEL #include +#include /* * Abstraction for a single pathname. 
@@ -151,13 +152,14 @@ NAMEIFL NOFOLLOW 0x00000000 /* do not fo (pseudo) */ NAMEIFL EMULROOTSET 0x00000080 /* emulation root already in ni_erootdir */ +NAMEIFL LOCKSHARED 0x00000100 /* want shared locks if possible */ NAMEIFL NOCHROOT 0x01000000 /* no chroot on abs path lookups */ -NAMEIFL MODMASK 0x010000fc /* mask of operational modifiers */ +NAMEIFL MODMASK 0x010001fc /* mask of operational modifiers */ /* * Namei parameter descriptors. */ -NAMEIFL NOCROSSMOUNT 0x0000100 /* do not cross mount points */ -NAMEIFL RDONLY 0x0000200 /* lookup with read-only semantics */ +NAMEIFL NOCROSSMOUNT 0x0000800 /* do not cross mount points */ +NAMEIFL RDONLY 0x0001000 /* lookup with read-only semantics */ NAMEIFL ISDOTDOT 0x0002000 /* current component name is .. */ NAMEIFL MAKEENTRY 0x0004000 /* entry is to be added to name cache */ NAMEIFL ISLASTCN 0x0008000 /* this is last component of pathname */ @@ -165,7 +167,7 @@ NAMEIFL ISWHITEOUT 0x0020000 /* found wh NAMEIFL DOWHITEOUT 0x0040000 /* do whiteouts */ NAMEIFL REQUIREDIR 0x0080000 /* must be a directory */ NAMEIFL CREATEDIR 0x0200000 /* trailing slashes are ok */ -NAMEIFL PARAMASK 0x02ee300 /* mask of parameter descriptors */ +NAMEIFL PARAMASK 0x02ef800 /* mask of parameter descriptors */ /* * Initialization of a nameidata structure. @@ -188,42 +190,42 @@ NAMEIFL PARAMASK 0x02ee300 /* mask of pa #endif #ifdef __NAMECACHE_PRIVATE +#include + /* * For simplicity (and economy of storage), names longer than * a maximum length of NCHNAMLEN are stored in non-pooled storage. */ -#define NCHNAMLEN 32 /* up to this size gets stored in pool */ +#define NCHNAMLEN sizeof(((struct namecache *)NULL)->nc_name) /* * Namecache entry. - * This structure describes the elements in the cache of recent - * names looked up by namei. * - * Locking rules: + * This structure describes the elements in the cache of recent names looked + * up by namei. It's carefully sized to take up 128 bytes on _LP64, to make + * good use of space and the CPU caches. 
+ * + * Field markings and their corresponding locks: * - * - stable after initialization - * L namecache_lock - * C struct nchcpu::cpu_lock - * L/C insert needs L, read needs L or any C, - * must hold L and all C after (or during) delete before free - * N struct namecache::nc_lock + * - stable throught the lifetime of the namecache entry + * d protected by nc_dvp->vi_ncdlock + * v protected by nc_dvp->vi_ncvlock + * l protected by cache_lru_lock + * u accesses are unlocked, no serialization applied */ +struct nchnode; struct namecache { - LIST_ENTRY(namecache) nc_hash; /* L/C hash chain */ - TAILQ_ENTRY(namecache) nc_lru; /* L pseudo-lru chain */ - LIST_ENTRY(namecache) nc_dvlist;/* L dvp's list of cache entries */ - TAILQ_ENTRY(namecache) nc_vlist;/* L vp's list of cache entries */ - struct vnode *nc_dvp; /* N vnode of parent of name */ - struct vnode *nc_vp; /* N vnode the name refers to */ - void *nc_gcqueue; /* N queue for garbage collection */ - kmutex_t nc_lock; /* lock on this entry */ - int nc_hittime; /* N last time scored a hit */ - int nc_flags; /* - copy of componentname ISWHITEOUT */ - u_short nc_nlen; /* - length of name */ - char nc_name[0]; /* - segment name */ + struct rb_node nc_tree; /* d red-black tree, must be first */ + TAILQ_ENTRY(namecache) nc_list; /* v vp's list of cache entries */ + TAILQ_ENTRY(namecache) nc_lru; /* l pseudo-lru chain */ + struct vnode *nc_dvp; /* - vnode of parent of name */ + struct vnode *nc_vp; /* - vnode the name refers to */ + int64_t nc_key; /* - hash key */ + int nc_lrulist; /* l which LRU list its on */ + short nc_nlen; /* - length of the name */ + char nc_whiteout; /* - true if a whiteout */ + char nc_name[41]; /* - segment name */ }; -__CTASSERT((sizeof(struct namecache) + NCHNAMLEN) - % __alignof(struct namecache) == 0); #endif #ifdef _KERNEL @@ -286,14 +288,22 @@ bool cache_lookup(struct vnode *, const int *, struct vnode **); bool cache_lookup_raw(struct vnode *, const char *, size_t, uint32_t, int *, struct vnode **); -int cache_revlookup(struct vnode *, struct vnode **, char **, char *); +bool cache_lookup_linked(struct vnode *, const char *, size_t, + struct vnode **, krwlock_t **, kauth_cred_t); +int cache_revlookup(struct vnode *, struct vnode **, char **, char *, + bool, int); +int cache_diraccess(struct vnode *, int); void cache_enter(struct vnode *, struct vnode *, const char *, size_t, uint32_t); +void cache_enter_id(struct vnode *, mode_t, uid_t, gid_t); +bool cache_have_id(struct vnode *); +void cache_vnode_init(struct vnode * ); +void cache_vnode_fini(struct vnode * ); +void cache_cpu_init(struct cpu_info *); + void nchinit(void); -void nchreinit(void); void namecache_count_pass2(void); void namecache_count_2passes(void); -void cache_cpu_init(struct cpu_info *); void cache_purgevfs(struct mount *); void namecache_print(struct vnode *, void (*)(const char *, ...) 
__printflike(1, 2)); @@ -318,6 +328,8 @@ void namecache_print(struct vnode *, voi type ncs_2passes; /* number of times we attempt it (U) */ \ type ncs_revhits; /* reverse-cache hits */ \ type ncs_revmiss; /* reverse-cache misses */ \ + type ncs_collisions; /* hash value collisions */ \ + type ncs_denied; /* access denied */ \ } /* Index: src/sys/sys/vnode_impl.h diff -u src/sys/sys/vnode_impl.h:1.21 src/sys/sys/vnode_impl.h:1.19.2.5 --- src/sys/sys/vnode_impl.h:1.21 Sun Feb 23 22:14:04 2020 +++ src/sys/sys/vnode_impl.h Fri Jan 24 16:05:23 2020 @@ -63,7 +63,8 @@ struct vcache_key { * i v_interlock * l vi_nc_listlock * m mnt_vnodelock - * n namecache_lock + * n vi_nc_lock + * n,l vi_nc_lock + vi_nc_listlock to modify * s syncer_data_lock */ struct vnode_impl { @@ -76,11 +77,15 @@ struct vnode_impl { /* * Namecache. Give it a separate line so activity doesn't impinge - * on the stable stuff (pending merge of ad-namecache branch). + * on the stable stuff. */ - LIST_HEAD(, namecache) vi_dnclist /* n: namecaches (children) */ + rb_tree_t vi_nc_tree /* n namecache tree */ __aligned(COHERENCY_UNIT); - TAILQ_HEAD(, namecache) vi_nclist; /* n: namecaches (parent) */ + TAILQ_HEAD(,namecache) vi_nc_list; /* l namecaches (parent) */ + mode_t vi_nc_mode; /* n,l cached mode or VNOVAL */ + uid_t vi_nc_uid; /* n,l cached UID or VNOVAL */ + gid_t vi_nc_gid; /* n,l cached GID or VNOVAL */ + uint32_t vi_nc_spare; /* - spare (padding) */ /* * vnode cache, LRU and syncer. This all changes with some Index: src/sys/ufs/chfs/chfs_vnops.c diff -u src/sys/ufs/chfs/chfs_vnops.c:1.36 src/sys/ufs/chfs/chfs_vnops.c:1.34.4.3 --- src/sys/ufs/chfs/chfs_vnops.c:1.36 Sun Feb 23 15:46:42 2020 +++ src/sys/ufs/chfs/chfs_vnops.c Sat Feb 29 20:21:10 2020 @@ -90,6 +90,10 @@ chfs_lookup(void *v) return (*vpp == NULLVP ? ENOENT : 0); } + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE) + return ENOLCK; + ip = VTOI(dvp); ump = VFSTOUFS(dvp->v_mount); chmp = ump->um_chfs; Index: src/sys/ufs/ext2fs/ext2fs_lookup.c diff -u src/sys/ufs/ext2fs/ext2fs_lookup.c:1.88 src/sys/ufs/ext2fs/ext2fs_lookup.c:1.88.22.1 --- src/sys/ufs/ext2fs/ext2fs_lookup.c:1.88 Tue Aug 23 06:40:25 2016 +++ src/sys/ufs/ext2fs/ext2fs_lookup.c Sun Jan 19 21:21:55 2020 @@ -313,14 +313,6 @@ ext2fs_lookup(void *v) *vpp = NULL; /* - * Produce the auxiliary lookup results into i_crap. Increment - * its serial number so elsewhere we can tell if we're using - * stale results. This should not be done this way. XXX. - */ - results = &dp->i_crap; - dp->i_crapcounter++; - - /* * Check accessiblity of directory. */ if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) @@ -342,6 +334,18 @@ ext2fs_lookup(void *v) return *vpp == NULLVP ? ENOENT : 0; } + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) + return ENOLCK; + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. 
+ */ + results = &dp->i_crap; + dp->i_crapcounter++; + /* * Suppress search for slots unless creating * file and at end of pathname, in which case Index: src/sys/ufs/ext2fs/ext2fs_vfsops.c diff -u src/sys/ufs/ext2fs/ext2fs_vfsops.c:1.216 src/sys/ufs/ext2fs/ext2fs_vfsops.c:1.214.4.3 --- src/sys/ufs/ext2fs/ext2fs_vfsops.c:1.216 Thu Feb 27 22:12:54 2020 +++ src/sys/ufs/ext2fs/ext2fs_vfsops.c Sat Feb 29 20:21:10 2020 @@ -736,7 +736,7 @@ ext2fs_mountfs(struct vnode *devvp, stru mp->mnt_flag |= MNT_LOCAL; mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ mp->mnt_fs_bshift = m_fs->e2fs_bshift; - mp->mnt_iflag |= IMNT_DTYPE; + mp->mnt_iflag |= IMNT_DTYPE | IMNT_SHRLOOKUP; ump->um_flags = 0; ump->um_mountp = mp; ump->um_dev = dev; Index: src/sys/ufs/ffs/ffs_vfsops.c diff -u src/sys/ufs/ffs/ffs_vfsops.c:1.365 src/sys/ufs/ffs/ffs_vfsops.c:1.362.4.5 --- src/sys/ufs/ffs/ffs_vfsops.c:1.365 Thu Feb 27 22:12:54 2020 +++ src/sys/ufs/ffs/ffs_vfsops.c Sat Feb 29 20:21:11 2020 @@ -1453,7 +1453,8 @@ ffs_mountfs(struct vnode *devvp, struct mp->mnt_fs_bshift = fs->fs_bshift; mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ mp->mnt_flag |= MNT_LOCAL; - mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO; + mp->mnt_iflag |= IMNT_MPSAFE | IMNT_CAN_RWTORO | IMNT_SHRLOOKUP | + IMNT_NCLOOKUP; #ifdef FFS_EI if (needswap) ump->um_flags |= UFS_NEEDSWAP; @@ -2082,6 +2083,7 @@ ffs_loadvnode(struct mount *mp, struct v ip->i_gid = ip->i_ffs1_ogid; /* XXX */ } /* XXX */ uvm_vnp_setsize(vp, ip->i_size); + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); *new_key = &ip->i_number; return 0; } @@ -2203,6 +2205,7 @@ ffs_newvnode(struct mount *mp, struct vn } uvm_vnp_setsize(vp, ip->i_size); + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); *new_key = &ip->i_number; return 0; } Index: src/sys/ufs/lfs/lfs_vfsops.c diff -u src/sys/ufs/lfs/lfs_vfsops.c:1.374 src/sys/ufs/lfs/lfs_vfsops.c:1.367.2.3 --- src/sys/ufs/lfs/lfs_vfsops.c:1.374 Sun Feb 23 15:46:42 2020 +++ src/sys/ufs/lfs/lfs_vfsops.c Sat Feb 29 20:21:11 2020 @@ -1130,6 +1130,7 @@ lfs_mountfs(struct vnode *devvp, struct mp->mnt_stat.f_namemax = LFS_MAXNAMLEN; mp->mnt_stat.f_iosize = lfs_sb_getbsize(fs); mp->mnt_flag |= MNT_LOCAL; + mp->mnt_iflag |= IMNT_SHRLOOKUP; mp->mnt_fs_bshift = lfs_sb_getbshift(fs); mp->mnt_iflag |= IMNT_CAN_RWTORO; if (fs->um_maxsymlinklen > 0) Index: src/sys/ufs/lfs/ulfs_lookup.c diff -u src/sys/ufs/lfs/ulfs_lookup.c:1.41 src/sys/ufs/lfs/ulfs_lookup.c:1.41.12.1 --- src/sys/ufs/lfs/ulfs_lookup.c:1.41 Sat Jun 10 05:29:36 2017 +++ src/sys/ufs/lfs/ulfs_lookup.c Sun Jan 19 21:21:55 2020 @@ -162,14 +162,6 @@ ulfs_lookup(void *v) endsearch = 0; /* silence compiler warning */ /* - * Produce the auxiliary lookup results into i_crap. Increment - * its serial number so elsewhere we can tell if we're using - * stale results. This should not be done this way. XXX. - */ - results = &dp->i_crap; - dp->i_crapcounter++; - - /* * Check accessiblity of directory. */ if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) @@ -193,6 +185,19 @@ ulfs_lookup(void *v) } return *vpp == NULLVP ? ENOENT : 0; } + + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) + return ENOLCK; + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. 
+ */ + results = &dp->i_crap; + dp->i_crapcounter++; + if (iswhiteout) { /* * The namecache set iswhiteout without finding a Index: src/sys/ufs/ufs/ufs_lookup.c diff -u src/sys/ufs/ufs/ufs_lookup.c:1.150 src/sys/ufs/ufs/ufs_lookup.c:1.150.4.1 --- src/sys/ufs/ufs/ufs_lookup.c:1.150 Sun May 5 15:07:12 2019 +++ src/sys/ufs/ufs/ufs_lookup.c Sun Jan 19 21:21:55 2020 @@ -330,14 +330,6 @@ ufs_lookup(void *v) endsearch = 0; /* silence compiler warning */ /* - * Produce the auxiliary lookup results into i_crap. Increment - * its serial number so elsewhere we can tell if we're using - * stale results. This should not be done this way. XXX. - */ - results = &dp->i_crap; - dp->i_crapcounter++; - - /* * Check accessiblity of directory. */ if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) @@ -361,6 +353,20 @@ ufs_lookup(void *v) } return *vpp == NULLVP ? ENOENT : 0; } + + /* May need to restart the lookup with an exclusive lock. */ + if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) { + return ENOLCK; + } + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. + */ + results = &dp->i_crap; + dp->i_crapcounter++; + if (iswhiteout) { /* * The namecache set iswhiteout without finding a Index: src/sys/ufs/ufs/ufs_vnops.c diff -u src/sys/ufs/ufs/ufs_vnops.c:1.249 src/sys/ufs/ufs/ufs_vnops.c:1.248.2.3 --- src/sys/ufs/ufs/ufs_vnops.c:1.249 Wed Feb 26 18:00:12 2020 +++ src/sys/ufs/ufs/ufs_vnops.c Sat Feb 29 20:21:11 2020 @@ -1,7 +1,7 @@ /* $NetBSD$ */ /*- - * Copyright (c) 2008 The NetBSD Foundation, Inc. + * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -621,6 +621,7 @@ ufs_setattr(void *v) } VN_KNOTE(vp, NOTE_ATTRIB); out: + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); return (error); } @@ -648,6 +649,7 @@ ufs_chmod(struct vnode *vp, int mode, ka ip->i_flag |= IN_CHANGE; DIP_ASSIGN(ip, mode, ip->i_mode); UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); return (0); } @@ -708,6 +710,7 @@ ufs_chown(struct vnode *vp, uid_t uid, g #endif /* QUOTA || QUOTA2 */ ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid); return (0); }