Preliminary namespaces patch.

This adds:
   - namespace decls for kernel and userland in sys/sys/namespace.h
   - namespace code in sys/kern/vfs_namespace.c
   - system call entry points for the namespace operations
   - a namespace hook in struct proc
   - namespace operations in namei, readdir, exec, fork, and exit
   - system call definitions for the namespace syscalls
   - build glue for vfs_namespace.c
   - build glue and setlist updates to install sys/namespace.h
   - updates to secret rump process logic so rumpity stuff doesn't
     randomly explode

diff -r d581ced863ba -r ef920de0474a distrib/sets/lists/comp/mi
--- a/distrib/sets/lists/comp/mi	Sat Nov 28 10:13:45 2015 -0500
+++ b/distrib/sets/lists/comp/mi	Sat Nov 28 10:38:33 2015 -0500
@@ -2845,6 +2845,7 @@
 ./usr/include/sys/mtio.h			comp-c-include
 ./usr/include/sys/mutex.h			comp-c-include
 ./usr/include/sys/namei.h			comp-c-include
+./usr/include/sys/namespace.h			comp-c-include
 ./usr/include/sys/null.h			comp-c-include
 ./usr/include/sys/optstr.h			comp-obsolete		obsolete
 ./usr/include/sys/param.h			comp-c-include
diff -r d581ced863ba -r ef920de0474a sys/kern/files.kern
--- a/sys/kern/files.kern	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/files.kern	Sat Nov 28 10:38:33 2015 -0500
@@ -183,6 +183,7 @@ file	kern/vfs_init.c			vfs
 file	kern/vfs_lockf.c		vfs
 file	kern/vfs_lookup.c		vfs
 file	kern/vfs_mount.c		vfs
+file	kern/vfs_namespace.c		vfs
 file	kern/vfs_quotactl.c		vfs
 file	kern/vfs_subr.c			vfs
 file	kern/vfs_syscalls.c		vfs
diff -r d581ced863ba -r ef920de0474a sys/kern/kern_exec.c
--- a/sys/kern/kern_exec.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/kern_exec.c	Sat Nov 28 10:38:33 2015 -0500
@@ -76,6 +76,7 @@
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/kmem.h>
+#include <sys/namespace.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
@@ -1168,6 +1169,8 @@ execve_runproc(struct lwp *l, struct exe
 	if (error != 0)
 		goto exec_abort;
 
+	if (p->p_nsinfo != NULL)
+		ns_exec(p);
 	cwdexec(p);
 	fd_closeexec();		/* handle close on exec */
 
@@ -2439,6 +2442,12 @@ do_posix_spawn(struct lwp *l1, pid_t *pi
 
 	p2->p_cwdi = cwdinit();
 
+	if (p1->p_nsinfo != NULL) {
+		p2->p_nsinfo = nsinfo_clone(p1->p_nsinfo);
+	} else {
+		p2->p_nsinfo = NULL;
+	}
+
 	/*
 	 * Note: p_limit (rlimit stuff) is copy-on-write, so normally
 	 * we just need increase pl_refcnt.
diff -r d581ced863ba -r ef920de0474a sys/kern/kern_exit.c
--- a/sys/kern/kern_exit.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/kern_exit.c	Sat Nov 28 10:38:33 2015 -0500
@@ -99,6 +99,7 @@
 #include <sys/signalvar.h>
 #include <sys/sched.h>
 #include <sys/mount.h>
+#include <sys/namespace.h>
 #include <sys/syscallargs.h>
 #include <sys/kauth.h>
 #include <sys/sleepq.h>
@@ -282,6 +283,10 @@ exit1(struct lwp *l, int rv)
 	fd_free();
 	cwdfree(p->p_cwdi);
 	p->p_cwdi = NULL;
+	if (p->p_nsinfo != NULL) {
+		nsinfo_decref(p->p_nsinfo);
+		p->p_nsinfo = NULL;
+	}
 	doexithooks(p);
 	sigactsfree(p->p_sigacts);
 
diff -r d581ced863ba -r ef920de0474a sys/kern/kern_fork.c
--- a/sys/kern/kern_fork.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/kern_fork.c	Sat Nov 28 10:38:33 2015 -0500
@@ -78,6 +78,7 @@
 #include <sys/kernel.h>
 #include <sys/pool.h>
 #include <sys/mount.h>
+#include <sys/namespace.h>
 #include <sys/proc.h>
 #include <sys/ras.h>
 #include <sys/resourcevar.h>
@@ -362,6 +363,16 @@ fork1(struct lwp *l1, int flags, int exi
 	else
 		p2->p_cwdi = cwdinit();
 
+	/* reuse SHARECWD for the namespace */
+	if (p1->p_nsinfo != NULL) {
+		if (flags & FORK_SHARECWD)
+			p2->p_nsinfo = nsinfo_share(p1->p_nsinfo);
+		else
+			p2->p_nsinfo = nsinfo_clone(p1->p_nsinfo);
+	} else {
+		p2->p_nsinfo = NULL;
+	}
+
 	/*
 	 * Note: p_limit (rlimit stuff) is copy-on-write, so normally
 	 * we just need increase pl_refcnt.
diff -r d581ced863ba -r ef920de0474a sys/kern/syscalls.master
--- a/sys/kern/syscalls.master	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/syscalls.master	Sat Nov 28 10:38:33 2015 -0500
@@ -979,3 +979,9 @@ 479	NOERR	RUMP	{ int|sys||posix_fallocat
 			    off_t len); }
 480	STD  RUMP	{ int|sys||fdiscard(int fd, int PAD, off_t pos, \
 			    off_t len); }
+481	STD  RUMP	{ int|sys||ns_begin(void); }
+482	STD  RUMP	{ int|sys||ns_addrule(const char *dir, \
+			    const char *name, const char *replacement); }
+483	STD  RUMP	{ int|sys||ns_finish(void); }
+484	STD  RUMP	{ int|sys||ns_setns(int usepriv); }
+485	STD  RUMP	{ int|sys||ns_empower(void); }
diff -r d581ced863ba -r ef920de0474a sys/kern/vfs_lookup.c
--- a/sys/kern/vfs_lookup.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/vfs_lookup.c	Sat Nov 28 10:38:33 2015 -0500
@@ -51,6 +51,7 @@
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/namespace.h>
 #include <sys/errno.h>
 #include <sys/filedesc.h>
 #include <sys/hash.h>
@@ -528,14 +529,36 @@ namei_cleanup(struct namei_state *state)
  * Initializes the rootdir and erootdir state and returns a reference
  * to the starting dir.
  */
-static struct vnode *
-namei_getstartdir(struct namei_state *state)
+static int
+namei_getstartdir(struct namei_state *state, struct vnode **ret)
 {
 	struct nameidata *ndp = state->ndp;
 	struct componentname *cnp = state->cnp;
 	struct cwdinfo *cwdi;		/* pointer to cwd state */
 	struct lwp *self = curlwp;	/* thread doing namei() */
 	struct vnode *rootdir, *erootdir, *curdir, *startdir;
+	int error;
+
+	if (self->l_proc->p_nsinfo != NULL) {
+		error = ns_getroot(&startdir);
+		if (error) {
+			return error;
+		}
+		if (startdir != NULL) {
+			/*
+			 * XXX. for now, don't allow emulation roots and
+			 * namespaces at once.
+			 */
+			if (cnp->cn_flags & (TRYEMULROOT | EMULROOTSET)) {
+				vrele(startdir);
+				state->attempt_retry = 1;
+				return ENOENT;
+			}
+			*ret = startdir;
+			return 0;
+		}
+		/* the namespace didn't have a root dir, continue as usual */
+	}
 
 	cwdi = self->l_proc->p_cwdi;
 	rw_enter(&cwdi->cwdi_lock, RW_READER);
@@ -592,7 +615,8 @@ namei_getstartdir(struct namei_state *st
 	vref(startdir);
 
 	rw_exit(&cwdi->cwdi_lock);
-	return startdir;
+	*ret = startdir;
+	return 0;
 }
 
 /*
@@ -657,6 +681,7 @@ namei_start(struct namei_state *state, i
 {
 	struct nameidata *ndp = state->ndp;
 	struct vnode *startdir;
+	int error;
 
 	/* length includes null terminator (was originally from copyinstr) */
 	ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1;
@@ -675,7 +700,10 @@ namei_start(struct namei_state *state, i
 		startdir = namei_getstartdir_for_nfsd(state);
 		/* no ktrace */
 	} else {
-		startdir = namei_getstartdir(state);
+		error = namei_getstartdir(state, &startdir);
+		if (error) {
+			return error;
+		}
 		namei_ktrace(state);
 	}
 
@@ -915,6 +943,48 @@ lookup_once(struct namei_state *state,
 	*newsearchdir_ret = searchdir;
 
 	/*
+	 * I think it's ok to bypass the mount point crossing code
+	 * for namespaces as that should have been handled when the
+	 * namespace was prepared. But: XXX think about this more;
+	 * nothing in here is straightforward or obvious.
+	 */
+	if (l->l_proc->p_nsinfo != NULL) {
+		error = ns_lookup(searchdir, cnp->cn_nameptr, cnp->cn_namelen,
+				  &foundobj);
+		if (error) {
+			goto done;
+		}
+		if (foundobj != NULL) {
+			/*
+			 * If we found a dir, it might be either above
+			 * or below searchdir. In the former case, we
+			 * must unlock searchdir before locking it, so
+			 * do that unconditionally. Make searchdir be
+			 * another reference to the result; this is
+			 * harmless when we've found a directory as
+			 * the result will become the next searchdir
+			 * soon anyway, and (like other places in
+			 * here) avoids creating additional special
+			 * cases.
+			 *
+			 * If we found a non-dir, it's safe to lock
+			 * it while still holding searchdir's lock
+			 * (at least, relatively) as all dirs are
+			 * above and thus before all non-dirs.
+			 */
+			if (foundobj->v_type == VDIR) {
+				vput(searchdir);
+				vref(foundobj);
+				searchdir = foundobj;
+				*newsearchdir_ret = searchdir;
+			}
+			vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
+			*foundobj_ret = foundobj;
+			goto done;
+		}
+	}
+
+	/*
 	 * Handle "..": two special cases.
 	 * 1. If at root directory (e.g. after chroot)
 	 *    or at absolute root directory
diff -r d581ced863ba -r ef920de0474a sys/kern/vfs_namespace.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/kern/vfs_namespace.c	Sat Nov 28 10:38:33 2015 -0500
@@ -0,0 +1,1065 @@
+#include <sys/types.h>
+#include <sys/atomic.h>
+#include <sys/rbtree.h>
+#include <sys/kmem.h>
+#include <sys/uio.h> /* required by sys/kauth.h */
+#include <sys/kauth.h>
+#include <sys/proc.h>
+#include <sys/fstypes.h> /* for fsid_t */
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/namei.h>
+#include <sys/dirent.h>
+#include <sys/namespace.h>
+
+#define vn_incref(vn) vref(vn)
+#define vn_decref(vn) vrele(vn)
+#define vn_unlock(vn) VOP_UNLOCK(vn)
+
+////////////////////////////////////////////////////////////
+// rbtree and helpers (code that should exist elsewhere but doesn't)
+
+/*
+ * Three-way ordering on fsid_t: compares the words of __fsid_val in
+ * index order and returns -1, 0 or 1. XXX: relies on fsid_t internals.
+ */
+static int
+fsid_compare(fsid_t a, fsid_t b)
+{
+	unsigned idx;
+
+	for (idx = 0; idx < __arraycount(a.__fsid_val); idx++) {
+		const int32_t lhs = a.__fsid_val[idx];
+		const int32_t rhs = b.__fsid_val[idx];
+
+		if (lhs != rhs) {
+			/* first differing word decides the order */
+			return (lhs < rhs) ? -1 : 1;
+		}
+	}
+	/* every word matched */
+	return 0;
+}
+
+/*
+ * Detach one node from RBT and hand it back, so that a caller can
+ * drain a tree with a simple loop. The node taken is the first one in
+ * tree order; NULL is returned once the tree has no nodes left.
+ */
+static void *
+rb_tree_removeone(rb_tree_t *rbt)
+{
+	void *first;
+
+	/* same starting point that RB_TREE_FOREACH uses */
+	first = RB_TREE_MIN(rbt);
+	if (first == NULL) {
+		return NULL;
+	}
+	rb_tree_remove_node(rbt, first);
+	return first;
+}
+
+////////////////////////////////////////////////////////////
+// nsrule/nsrulekey methods
+
+static struct nsrule *
+nsrule_create(const char *name)
+{
+	/* Build a rule keyed on a private copy of NAME; no target yet. */
+	const size_t namelen = strlen(name);
+	struct nsrule *newrule;
+	char *copy;
+
+	newrule = kmem_zalloc(sizeof(*newrule), KM_SLEEP);
+	KASSERT(newrule != NULL);
+
+	copy = kmem_alloc(namelen + 1, KM_SLEEP);
+	strcpy(copy, name);
+	newrule->nsr_key.nsrk_name = copy;
+	newrule->nsr_key.nsrk_namelen = namelen;
+	newrule->nsr_type = DT_UNKNOWN;
+	newrule->nsr_ino = 0;
+	newrule->nsr_vn = NULL;
+	return newrule;
+}
+
+static void
+nsrule_destroy(struct nsrule *rule)
+{
+	/* a rule without a target vnode holds no reference to drop */
+	if (rule->nsr_vn != NULL)
+		vn_decref(rule->nsr_vn);
+	kmem_free(rule->nsr_key.nsrk_name, rule->nsr_key.nsrk_namelen + 1);
+	kmem_free(rule, sizeof(*rule));
+}
+
+static int
+nsrulekey_setdir(struct nsrulekey *key, struct vnode *vp)
+{
+	/*
+	 * Fill KEY's volume id and inode number from VP; returns 0 or the
+	 * VOP_GETATTR error. XXX: this locks VP itself, but ns_lookup seems
+	 * to pass lookup_once's still-locked searchdir -- TODO confirm that
+	 * cannot self-deadlock. A non-failing way to get these would help.
+	 */
+	struct vattr va;
+	int result;
+
+	vn_lock(vp, LK_SHARED | LK_RETRY);
+	result = VOP_GETATTR(vp, &va, curlwp->l_cred);
+	vn_unlock(vp);
+	if (result) {
+		return result;
+	}
+
+	key->nsrk_vol = vp->v_mount->mnt_stat.f_fsidx;
+	key->nsrk_ino = va.va_fileid;
+	return 0;
+}
+
+static bool
+nsrulekey_samevnode(const struct nsrulekey *k1, const struct nsrulekey *k2)
+{
+	/* same directory iff both the volume id and inode number agree */
+	const bool samevol = fsid_compare(k1->nsrk_vol, k2->nsrk_vol) == 0;
+	const bool sameino = k1->nsrk_ino == k2->nsrk_ino;
+	return samevol && sameino;
+}
+
+static int
+nsrulekey_compare(const struct nsrulekey *k1, const struct nsrulekey *k2)
+{
+	/*
+	 * Total order on rule keys: volume id first, then inode number,
+	 * then the name bytes, with a shorter name sorting before any
+	 * longer name it is a prefix of. Returns <0, 0 or >0.
+	 */
+	size_t common;
+	int order;
+
+	order = fsid_compare(k1->nsrk_vol, k2->nsrk_vol);
+	if (order != 0) {
+		return order;
+	}
+	if (k1->nsrk_ino != k2->nsrk_ino) {
+		return (k1->nsrk_ino < k2->nsrk_ino) ? -1 : 1;
+	}
+
+	/*
+	 * A name that came from namei is bounded by nsrk_namelen rather
+	 * than NUL-terminated, so compare only the shared prefix here.
+	 */
+	common = min(k1->nsrk_namelen, k2->nsrk_namelen);
+	order = memcmp(k1->nsrk_name, k2->nsrk_name, common);
+	if (order != 0) {
+		return order;
+	}
+	if (k1->nsrk_namelen != k2->nsrk_namelen) {
+		return (k1->nsrk_namelen < k2->nsrk_namelen) ? -1 : 1;
+	}
+	return 0;
+}
+
+static int
+ns_compare_nodes(void *ctx, const void *v1, const void *v2)
+{
+	const struct nsrule *r1 = v1;	/* both arguments are rules */
+	const struct nsrule *r2 = v2;
+
+	return nsrulekey_compare(&r1->nsr_key, &r2->nsr_key);
+}
+
+static int
+ns_compare_key(void *ctx, const void *v1, const void *v2)
+{
+	const struct nsrule *r1 = v1;		/* a rule ... */
+	const struct nsrulekey *k2 = v2;	/* ... against a bare key */
+
+	return nsrulekey_compare(&r1->nsr_key, k2);
+}
+
+static const rb_tree_ops_t nsrule_rbtreeops = {
+	.rbto_compare_nodes = ns_compare_nodes,	/* rule vs. rule */
+	.rbto_compare_key = ns_compare_key,	/* rule vs. bare key */
+	.rbto_node_offset = offsetof(struct nsrule, nsr_rbnode),
+	.rbto_context = NULL,	/* the comparators do not use ctx */
+};
+
+////////////////////////////////////////////////////////////
+// namespace methods
+
+static void
+namespace_incref(struct namespace *ns)
+{
+	atomic_inc_uint(&ns->ns_refcount);	/* NS must not be NULL */
+}
+
+static struct namespace *
+namespace_create(struct namespace *parent)
+{
+	/* New empty, unfinished namespace over PARENT (may be NULL). */
+	struct namespace *newns = kmem_alloc(sizeof(*newns), KM_SLEEP);
+
+	KASSERT(newns != NULL);
+	if (parent != NULL) {
+		/* only a completed namespace can be built upon */
+		KASSERT(parent->ns_finished == true);
+		namespace_incref(parent);
+	}
+	rb_tree_init(&newns->ns_rules, &nsrule_rbtreeops);
+	newns->ns_parent = parent;
+	newns->ns_refcount = 1;
+	newns->ns_numrules = 0;
+	newns->ns_rootvn = NULL;
+	newns->ns_finished = false;
+
+	return newns;
+}
+
+static void
+namespace_decref(struct namespace *ns)
+{
+	/*
+	 * Drop one reference on NS. When the last reference goes, free
+	 * its rules, root vnode and the namespace itself, then drop the
+	 * reference it held on its parent. Done as a loop so a long
+	 * parent chain cannot exhaust the kernel stack. NS may be NULL
+	 * (an nsinfo slot never set, or the end of the parent chain).
+	 */
+	struct nsrule *rule;
+	struct namespace *parent;
+
+	while (ns != NULL && atomic_dec_uint_nv(&ns->ns_refcount) == 0) {
+		while ((rule = rb_tree_removeone(&ns->ns_rules)) != NULL) {
+			nsrule_destroy(rule);
+			ns->ns_numrules--;
+		}
+		KASSERT(ns->ns_numrules == 0);
+
+		/* release everything NS points at before freeing NS */
+		if (ns->ns_rootvn != NULL) {
+			vn_decref(ns->ns_rootvn);
+		}
+		parent = ns->ns_parent;
+		kmem_free(ns, sizeof(*ns));
+		ns = parent;
+	}
+}
+
+////////////////////////////////////////////////////////////
+// nsinfo methods
+
+/*
+ * Allocate per-process namespace state with no namespaces attached.
+ */
+static struct nsinfo *
+nsinfo_create(void)
+{
+	struct nsinfo *newinfo = kmem_alloc(sizeof(*newinfo), KM_SLEEP);
+
+	KASSERT(newinfo != NULL);
+
+	/* one reference, owned by the caller */
+	newinfo->nsi_refcount = 1;
+	newinfo->nsi_nspaces = 0;
+	newinfo->nsi_usepriv = false;
+	newinfo->nsi_space = NULL;
+	newinfo->nsi_privspace = NULL;
+	mutex_init(&newinfo->nsi_lock, MUTEX_DEFAULT, IPL_NONE);
+
+	return newinfo;
+}
+
+/*
+ * Copy nsinfo (used at fork time). The copy takes its own reference on
+ * each namespace PREV has; a slot that is NULL in PREV stays NULL.
+ */
+struct nsinfo *
+nsinfo_clone(struct nsinfo *prev)
+{
+	struct namespace *spaces[2];
+	struct nsinfo *nsi;
+	unsigned i;
+
+	nsi = nsinfo_create();
+
+	mutex_enter(&prev->nsi_lock);
+	spaces[0] = nsi->nsi_space = prev->nsi_space;
+	spaces[1] = nsi->nsi_privspace = prev->nsi_privspace;
+	for (i = 0; i < __arraycount(spaces); i++) {
+		if (spaces[i] == NULL) {
+			continue;
+		}
+		/* An unfinished namespace becomes finished once shared. */
+		if (spaces[i]->ns_finished == false) {
+			KASSERT(spaces[i]->ns_refcount == 1);
+			spaces[i]->ns_finished = true;
+		}
+		namespace_incref(spaces[i]);
+	}
+	mutex_exit(&prev->nsi_lock);
+	return nsi;
+}
+
+/*
+ * Share PREV between processes (fork with FORK_SHARECWD): bump refcount.
+ */
+struct nsinfo *
+nsinfo_share(struct nsinfo *prev)
+{
+	atomic_inc_uint(&prev->nsi_refcount);
+	return prev;
+}
+
+/*
+ * Drop a reference on NSI (used at exit time); the last one frees it.
+ */
+void
+nsinfo_decref(struct nsinfo *nsi)
+{
+	KASSERT(nsi != NULL);
+	if (atomic_dec_uint_nv(&nsi->nsi_refcount) != 0) {
+		return;
+	}
+	/* Either slot may never have been filled in; skip those. */
+	if (nsi->nsi_space != NULL) {
+		namespace_decref(nsi->nsi_space);
+	}
+	if (nsi->nsi_privspace != NULL) {
+		namespace_decref(nsi->nsi_privspace);
+	}
+	mutex_destroy(&nsi->nsi_lock);
+	kmem_free(nsi, sizeof(*nsi));
+}
+
+/*
+ * Return the namespace slot selected by nsi_usepriv (may be NULL).
+ */
+static struct namespace *
+nsinfo_getspace(struct nsinfo *nsi)
+{
+	if (!nsi->nsi_usepriv) {
+		return nsi->nsi_space;
+	}
+
+#ifdef DIAGNOSTIC
+	/* the privileged slot is only meant for setugid processes */
+	mutex_enter(curproc->p_lock);
+	KASSERT((curproc->p_flag & PK_SUGID) != 0);
+	mutex_exit(curproc->p_lock);
+#endif
+	return nsi->nsi_privspace;
+}
+
+/*
+ * Store NS into the namespace slot selected by nsi_usepriv.
+ */
+static void
+nsinfo_setspace(struct nsinfo *nsi, struct namespace *ns)
+{
+	if (!nsi->nsi_usepriv) {
+		nsi->nsi_space = ns;
+		return;
+	}
+#ifdef DIAGNOSTIC
+	/* the privileged slot is only meant for setugid processes */
+	mutex_enter(curproc->p_lock);
+	KASSERT((curproc->p_flag & PK_SUGID) != 0);
+	mutex_exit(curproc->p_lock);
+#endif
+	nsi->nsi_privspace = ns;
+}
+
+////////////////////////////////////////////////////////////
+// exec-time interface
+
+/*
+ * Handle namespace work that is needed at exec time. This means:
+ *   - mark whichever namespaces exist finished if they aren't
+ *   - clear nsi_nspaces
+ *   - initialize nsi_usepriv
+ */
+void
+ns_exec(struct proc *p)
+{
+	struct namespace *spaces[2];
+	struct nsinfo *nsi;
+	unsigned i;
+
+	KASSERT(p->p_nsinfo != NULL);
+	nsi = p->p_nsinfo;
+	mutex_enter(&nsi->nsi_lock);
+	/* Either slot can still be NULL (never set up); skip those. */
+	spaces[0] = nsi->nsi_space;
+	spaces[1] = nsi->nsi_privspace;
+	for (i = 0; i < __arraycount(spaces); i++) {
+		if (spaces[i] != NULL && spaces[i]->ns_finished == false) {
+			KASSERT(spaces[i]->ns_refcount == 1);
+			spaces[i]->ns_finished = true;
+		}
+	}
+
+	/* reset the number-of-spaces counter */
+	nsi->nsi_nspaces = 0;
+
+	/* initialize nsi_usepriv based on setugid state */
+	mutex_enter(p->p_lock);
+	nsi->nsi_usepriv = (p->p_flag & PK_SUGID) != 0;
+	mutex_exit(p->p_lock);
+	mutex_exit(&nsi->nsi_lock);
+}
+
+////////////////////////////////////////////////////////////
+// lookup-time interface
+
+/*
+ * Return in RET a referenced root vnode from the current namespace or
+ * its nearest ancestor that sets one; NULL (with success) if none do.
+ *
+ * By getting here we know that curproc->p_nsinfo is non-null.
+ */
+int
+ns_getroot(struct vnode **ret)
+{
+	struct nsinfo *nsi = curproc->p_nsinfo;
+	struct namespace *cur;
+	struct vnode *rootvn = NULL;
+
+	KASSERT(nsi != NULL);
+
+	mutex_enter(&nsi->nsi_lock);
+	cur = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+
+	/* innermost namespace first, then outward through the parents */
+	for (; cur != NULL; cur = cur->ns_parent) {
+		if (cur->ns_rootvn != NULL) {
+			rootvn = cur->ns_rootvn;
+			vn_incref(rootvn);
+			break;
+		}
+	}
+
+	/* NULL here means: no root dir defined; use the real root */
+	*ret = rootvn;
+	return 0;
+}
+
+/*
+ * Apply the namespace rules to NAME (NAMELEN bytes) in directory DVP:
+ * RET gets a referenced replacement vnode, or NULL when no rule matches;
+ * a rule without a replacement denies access (EACCES).
+ *
+ * By getting here we know that curproc->p_nsinfo is non-null.
+ */
+int
+ns_lookup(struct vnode *dvp, const char *name, size_t namelen,
+	  struct vnode **ret)
+{
+	struct nsinfo *nsi = curproc->p_nsinfo;
+	struct namespace *cur;
+	struct nsrule *hit;
+	struct nsrulekey key;
+	int error;
+
+	KASSERT(nsi != NULL);
+
+	/*
+	 * The namespace fetched below can be used without holding the
+	 * lock. Even if other threads in the process are screwing
+	 * around it cannot disappear under us: starting a new one
+	 * chains onto the old one and doesn't discard it. Coalescing
+	 * of namespaces, if ever implemented, would need this
+	 * revisited.
+	 *
+	 * A namespace disappears only when the last process using it
+	 * exits (so no thread is left to be doing lookups in it), or
+	 * when root copies the unprivileged namespace over the
+	 * privileged one. For that case we just demand the process
+	 * has no other threads, as it should only happen in chroot(8)
+	 * or similar.
+	 */
+
+	mutex_enter(&nsi->nsi_lock);
+	cur = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+
+	error = nsrulekey_setdir(&key, dvp);
+	if (error != 0) {
+		return error;
+	}
+	key.nsrk_name = (char *)(uintptr_t)name; /* XXXconst */
+	key.nsrk_namelen = namelen;
+
+	for (; cur != NULL; cur = cur->ns_parent) {
+		hit = rb_tree_find_node(&cur->ns_rules, &key);
+		if (hit == NULL) {
+			/* no rule at this level; ask the parent */
+			continue;
+		}
+		if (hit->nsr_vn == NULL) {
+			/* Rule says to deny access. */
+			return EACCES;
+		}
+
+		/* Rule says to return this vnode. */
+		vn_incref(hit->nsr_vn);
+		*ret = hit->nsr_vn;
+		return 0;
+	}
+
+	/* Reached the canonical (real) namespace; we're done. */
+	*ret = NULL;
+	return 0;
+}
+
+////////////////////////////////////////////////////////////
+// readdir-time interface
+
+/*
+ * Check if the current namespace (or any of its ancestors) has any
+ * rules for names in the directory passed in.
+ */
+bool
+ns_hasentriesfor(struct vnode *dvp)
+{
+	struct nsinfo *nsi;
+	struct namespace *ns;
+	struct nsrule *rule;
+	struct nsrulekey key;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		return false;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	ns = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+
+	result = nsrulekey_setdir(&key, dvp);
+	if (result) {
+		/* XXX bah. shouldn't happen, but don't fail open */
+		printf("ns_hasentriesfor: warning: error %d looking up dir\n",
+		       result);
+		return true;
+	}
+	/* The empty name sorts before every real name in this directory. */
+	key.nsrk_name = (char *)(uintptr_t)""; /* XXXconst */
+	key.nsrk_namelen = 0;
+
+	for (; ns != NULL; ns = ns->ns_parent) {
+		rule = rb_tree_find_node_geq(&ns->ns_rules, &key);
+		if (rule != NULL && nsrulekey_samevnode(&rule->nsr_key, &key)) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Report whether KEY matches a rule in NS or in any of its ancestors.
+ */
+static bool
+ns_keyexists(struct namespace *ns, const struct nsrulekey *key)
+{
+	struct namespace *cur;
+
+	for (cur = ns; cur != NULL; cur = cur->ns_parent) {
+		if (rb_tree_find_node(&cur->ns_rules, key) != NULL) {
+			/* found at this level; no need to look further */
+			return true;
+		}
+	}
+	/* walked the whole chain without a match */
+	return false;
+}
+
+/*
+ * Take a buffer full of struct dirent and hide every entry whose name
+ * has a namespace rule for the passed-in directory. A hidden record is
+ * folded into a neighbour's d_reclen, so LEN itself never changes.
+ */
+int
+ns_filterdents(struct vnode *dvp, char *buf, size_t len)
+{
+	struct nsinfo *nsi;
+	struct namespace *ns;
+	struct nsrulekey key;
+	struct dirent *dp, *lastdp = NULL;
+	size_t pos, keptlen, frontsize = 0;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		return 0;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	ns = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+	if (ns == NULL) {
+		return 0;
+	}
+
+	result = nsrulekey_setdir(&key, dvp);
+	if (result) {
+		return result;
+	}
+
+	for (pos = 0; pos < len; pos += dp->d_reclen) {
+		dp = (struct dirent *)(buf + pos);
+		if (dp->d_reclen == 0) {
+			/* malformed record; stop rather than spin forever */
+			break;
+		}
+		key.nsrk_name = dp->d_name;
+		key.nsrk_namelen = dp->d_namlen;
+		/* ".." is always kept, and every kept record becomes lastdp */
+		if ((dp->d_namlen == 2 && dp->d_name[0] == '.' &&
+		    dp->d_name[1] == '.') || !ns_keyexists(ns, &key)) {
+			lastdp = dp;
+			continue;
+		}
+		/* need to zot this entry */
+		if (lastdp == NULL) {
+			frontsize += dp->d_reclen;
+			continue;
+		}
+		lastdp->d_reclen += dp->d_reclen;
+		dp->d_type = DT_UNKNOWN;
+		dp->d_fileno = 0;
+		bzero(dp->d_name, dp->d_namlen);
+		dp->d_namlen = 0;
+		pos += dp->d_reclen;	/* step by hand: reclen is zeroed next */
+		dp->d_reclen = 0;
+	}
+	if (frontsize >= len) {
+		/* All filtered. NOTE(review): the caller cannot be told the */
+		/* buffer is empty; clear it so no hidden name escapes.      */
+		memset(buf, 0, len);
+	} else if (frontsize > 0) {
+		/* slide the first kept record forward over the hidden ones */
+		dp = (struct dirent *)(buf + frontsize);
+		keptlen = dp->d_reclen;
+		dp->d_reclen += frontsize;
+		memmove(buf, dp, keptlen);
+		memset(buf + keptlen, 0, frontsize);	/* scrub leftovers */
+	}
+	return 0;
+}
+
+/*
+ * Do the equivalent of VOP_READDIR for the names that have namespace
+ * rules in DVP. Returns 0 or the error from nsrulekey_setdir/uiomove.
+ *
+ * Note: this has no ability to start in the middle.
+ */
+int
+ns_readdir(struct vnode *dvp, struct uio *uio)
+{
+	struct nsinfo *nsi;
+	struct namespace *ns;
+	struct nsrule *rule;
+	struct nsrulekey key;
+	struct dirent *dp;
+	size_t len;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		return 0;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	ns = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+
+	result = nsrulekey_setdir(&key, dvp);
+	if (result) {
+		return result;
+	}
+	/* The empty name sorts first among the keys for this directory. */
+	key.nsrk_name = (char *)(uintptr_t)""; /* XXXconst */
+	key.nsrk_namelen = 0;
+
+	dp = kmem_alloc(sizeof(*dp), KM_SLEEP);
+	for (; ns != NULL && uio->uio_resid > 0; ns = ns->ns_parent) {
+		/*
+		 * Step from the current rule (not the search key), and
+		 * on every pass including "..", or this never ends.
+		 */
+		for (rule = rb_tree_find_node_geq(&ns->ns_rules, &key);
+		     rule != NULL &&
+		     nsrulekey_samevnode(&rule->nsr_key, &key);
+		     rule = rb_tree_iterate(&ns->ns_rules, rule,
+					    RB_DIR_RIGHT)) {
+			if (!strcmp(rule->nsr_key.nsrk_name, "..")) {
+				/* don't need to issue another ".." */
+				continue;
+			}
+			/* start clean so padding leaks no kernel memory */
+			memset(dp, 0, sizeof(*dp));
+			len = strlen(rule->nsr_key.nsrk_name);
+			KASSERT(len + 1 < sizeof(dp->d_name));
+			memcpy(dp->d_name, rule->nsr_key.nsrk_name, len);
+			dp->d_fileno = rule->nsr_ino;
+			dp->d_type = rule->nsr_type;
+			dp->d_namlen = len;
+			dp->d_reclen = _DIRENT_RECLEN(dp, len);
+
+			/*
+			 * XXX: should we test uio_resid >= d_reclen
+			 * before calling uiomove? if not, uiomove
+			 * will happily send back a partial entry, but
+			 * that might make libc splode.
+			 */
+			result = uiomove(dp, dp->d_reclen, uio);
+			if (result) {
+				goto out;
+			}
+		}
+	}
+
+out:
+	kmem_free(dp, sizeof(*dp));
+	return result;
+}
+
+////////////////////////////////////////////////////////////
+// control interface
+
+/*
+ * Begin construction of a new namespace on top of the current one.
+ */
+int
+ns_begin(void)
+{
+	struct nsinfo *nsi, *foundnsi;
+	struct namespace *ns, *newns;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		nsi = nsinfo_create();
+		KASSERT(nsi != NULL);
+		/* set p_nsinfo atomically */
+		foundnsi = atomic_cas_ptr(&curproc->p_nsinfo, NULL, nsi);
+		if (foundnsi != NULL) {
+			/* Lost the race to another ns_begin. Ours was never */
+			/* published and is empty, so tear it down directly. */
+			mutex_destroy(&nsi->nsi_lock);
+			kmem_free(nsi, sizeof(*nsi));
+			nsi = foundnsi;
+		}
+		/* Technically we need a membar_store_any here, but */
+		/* the immediately following mutex_enter() has one. */
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	if (nsi->nsi_nspaces >= MAX_NS_PER_PROC) {
+		mutex_exit(&nsi->nsi_lock);
+		return EPERM;	/* too many */
+	}
+	ns = nsinfo_getspace(nsi);
+	if (ns != NULL && ns->ns_finished == false) {
+		mutex_exit(&nsi->nsi_lock);
+		return EALREADY;	/* already constructing a namespace */
+	}
+	newns = namespace_create(ns);
+	KASSERT(newns != NULL);
+	KASSERT(newns->ns_parent == ns);
+	nsinfo_setspace(nsi, newns);
+	nsi->nsi_nspaces++;	/* else the limit check above never trips */
+	mutex_exit(&nsi->nsi_lock);
+
+	/* newns holds its own reference on ns; drop the one nsi had */
+	if (ns != NULL) {
+		namespace_decref(ns);
+	}
+	return 0;
+}
+
+/*
+ * Add a rule to the namespace under construction.
+ */
+int
+ns_addrule(const char *dir, const char *name, const char *replacement)
+{
+	struct nsinfo *nsi;
+	struct namespace *newns, *prevns;
+	struct nsrule *rule = NULL, *oldrule;
+	struct vnode *dvp, *replvp = NULL;
+	struct vattr va;
+	int result;
+
+	/*
+	 * All three pointers come from userland and may be NULL:
+	 *   - DIR and NAME NULL: REPLACEMENT becomes the root directory
+	 *   - DIR, NAME, REPLACEMENT: NAME in DIR maps to REPLACEMENT
+	 *   - DIR, NAME, no REPLACEMENT: access to NAME in DIR is denied
+	 * Returns 0, EINVAL (nothing under construction, or only one of
+	 * DIR/NAME given), E2BIG (rule limit), EPERM (denying "/"),
+	 * EEXIST (duplicate), or the lookup/getattr error.
+	 */
+	if ((dir == NULL) != (name == NULL)) {
+		return EINVAL;
+	}
+	/* the syscall code enforces this */
+	KASSERT(name == NULL || strlen(name) <= NAME_MAX);
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		/* no namespace */
+		return EINVAL;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	newns = nsinfo_getspace(nsi);
+	if (newns == NULL || newns->ns_finished == true) {
+		/* not creating a namespace */
+		result = EINVAL;
+		goto out;
+	}
+	prevns = newns->ns_parent;
+	KASSERT(newns->ns_refcount == 1);
+
+	if (newns->ns_numrules >= MAX_NSRULES_PER_NS) {
+		/* not an especially good errno, but best we have I think */
+		result = E2BIG;
+		goto out;
+	}
+
+	if (dir == NULL) {
+		if (replacement == NULL) {
+			/* not sensible to deny access to / */
+			result = EPERM;
+			goto out;
+		}
+	} else {
+		/*
+		 * To keep the resulting namespace coherent, the
+		 * *source* dir is looked up in the parent namespace
+		 * and the *replacement* in the new one; rules loaded
+		 * last-to-first then behave as if applied
+		 * first-to-last. Rules whose key side ends up
+		 * unreachable are harmless but are the caller's
+		 * problem.
+		 *
+		 * The parent lookup briefly points the nsinfo slot at
+		 * the parent, with nsi_lock held so other threads
+		 * never see the namespace flap. NOTE(review): namei
+		 * reaches ns_getroot, which takes nsi_lock itself --
+		 * confirm this cannot self-deadlock.
+		 */
+		/* Look up the source dir, in the old namespace. */
+		nsinfo_setspace(nsi, prevns);
+		result = namei_simple_kernel(dir, NSM_FOLLOW_NOEMULROOT, &dvp);
+		nsinfo_setspace(nsi, newns);
+		if (result) {
+			goto out;
+		}
+
+		rule = nsrule_create(name);
+		KASSERT(rule != NULL);
+		result = nsrulekey_setdir(&rule->nsr_key, dvp);
+		/* only the key was needed; drop the dir on every path */
+		vn_decref(dvp);
+		if (result) {
+			goto out;
+		}
+	}
+
+	/* Look up the replacement, back in the new namespace. */
+	if (replacement != NULL) {
+		result = namei_simple_kernel(replacement,
+					     NSM_FOLLOW_NOEMULROOT,
+					     &replvp);
+		if (result) {
+			goto out;
+		}
+		vn_lock(replvp, LK_SHARED | LK_RETRY);
+		result = VOP_GETATTR(replvp, &va, curlwp->l_cred);
+		vn_unlock(replvp);
+		if (result) {
+			goto out;
+		}
+	}
+
+	if (dir == NULL) {
+		if (newns->ns_rootvn != NULL) {
+			result = EEXIST;
+			goto out;
+		}
+		/* the namespace takes over our reference */
+		newns->ns_rootvn = replvp;
+		replvp = NULL;
+	} else {
+		/* the rule takes over our reference, if there is one */
+		rule->nsr_vn = replvp;
+		replvp = NULL;
+		if (rule->nsr_vn != NULL) {
+			/* va was only filled in when there is a target */
+			rule->nsr_ino = va.va_fileid;
+			rule->nsr_type = IFTODT(va.va_mode);
+		}
+		/* Insert the rule */
+		oldrule = rb_tree_insert_node(&newns->ns_rules, rule);
+		if (oldrule != rule) {
+			result = EEXIST;
+			goto out;
+		}
+		newns->ns_numrules++;
+		rule = NULL;	/* now owned by the tree */
+	}
+	result = 0;
+
+out:
+	if (rule != NULL) {
+		nsrule_destroy(rule);
+	}
+	if (replvp != NULL) {
+		vn_decref(replvp);
+	}
+	mutex_exit(&nsi->nsi_lock);
+	return result;
+}
+
+/*
+ * Seal the namespace being built so that no further rules can be added.
+ */
+int
+ns_finish(void)
+{
+	struct nsinfo *nsi = curproc->p_nsinfo;
+	struct namespace *building;
+	int error = 0;
+
+	if (nsi == NULL) {
+		/* no namespace */
+		return EINVAL;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	building = nsinfo_getspace(nsi);
+	if (building != NULL && building->ns_finished == false) {
+		/* still under construction, so nobody else can hold it */
+		KASSERT(building->ns_refcount == 1);
+		building->ns_finished = true;
+	} else {
+		/* not creating a namespace */
+		error = EINVAL;
+	}
+
+	mutex_exit(&nsi->nsi_lock);
+
+	return error;
+}
+
+////////////////////////////////////////////////////////////
+// privileged control interface
+
+/*
+ * Select the privileged (USEPRIV) or unprivileged namespace. The caller
+ * must be setugid and must also pass the kauth chroot check below.
+ *
+ * XXX: is there a kauth way to check issetugid?
+ * XXX: is chroot the right permission check?
+ */
+int
+ns_setns(bool usepriv)
+{
+	struct nsinfo *nsi = curproc->p_nsinfo;
+	bool sugid;
+	int error;
+
+	if (nsi == NULL) {
+		/* no namespace */
+		return EINVAL;
+	}
+
+	/* p_flag is read under p_lock */
+	mutex_enter(curproc->p_lock);
+	sugid = (curproc->p_flag & PK_SUGID) != 0;
+	mutex_exit(curproc->p_lock);
+	if (!sugid) {
+		return EPERM;
+	}
+
+	error = kauth_authorize_system(curlwp->l_cred, KAUTH_SYSTEM_CHROOT,
+	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL);
+	if (error != 0) {
+		return error;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	nsi->nsi_usepriv = usepriv;
+	mutex_exit(&nsi->nsi_lock);
+
+	return 0;
+}
+
+/*
+ * Clone the unprivileged namespace as the privileged namespace.
+ *
+ * Only root may do this, because it's equivalent to a traditional
+ * chroot that affects setugid programs. Although it's ok for root to
+ * do it inside another namespace, as namespaces nest.
+ *
+ * Note that we require the current process has only one thread, to
+ * avoid potentially hazardous races in lookup while destroying the
+ * old privileged namespace.
+ *
+ * XXX: this name is stupid; let's find a better one.
+ */
+int
+ns_empower(void)
+{
+	struct nsinfo *nsi;
+	struct namespace *oldns;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		/* no namespace */
+		return EINVAL;
+	}
+
+	result = kauth_authorize_system(curlwp->l_cred, KAUTH_SYSTEM_CHROOT,
+	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL);
+	if (result) {
+		return result;
+	}
+
+	mutex_enter(curproc->p_lock);
+	if (curproc->p_nlwps > 1) {
+		/* multithreaded process */
+		mutex_exit(curproc->p_lock);
+		return EPERM;
+	}
+	mutex_exit(curproc->p_lock);
+
+	mutex_enter(&nsi->nsi_lock);
+	if (nsi->nsi_space == NULL || nsi->nsi_space->ns_finished == false) {
+		/* nothing to copy yet, or finish constructing it first! */
+		mutex_exit(&nsi->nsi_lock);
+		return EINVAL;
+	}
+	if (nsi->nsi_refcount > 1) {
+		/* different kind of multithreaded process */
+		mutex_exit(&nsi->nsi_lock);
+		return EPERM;
+	}
+
+	oldns = nsi->nsi_privspace;
+	namespace_incref(nsi->nsi_space);
+	nsi->nsi_privspace = nsi->nsi_space;
+	mutex_exit(&nsi->nsi_lock);
+	/* there may have been no privileged namespace to replace */
+	if (oldns != NULL) {
+		namespace_decref(oldns);
+	}
+	return 0;
+}
diff -r d581ced863ba -r ef920de0474a sys/kern/vfs_syscalls.c
--- a/sys/kern/vfs_syscalls.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/vfs_syscalls.c	Sat Nov 28 10:38:33 2015 -0500
@@ -87,6 +87,7 @@
 #include <sys/stat.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/namespace.h>
 #include <sys/proc.h>
 #include <sys/uio.h>
 #include <sys/kmem.h>
@@ -4800,3 +4801,119 @@ fail:
 	fd_putfile(fd);
 	return error;
 }
+
+/*
+ * System call: begin creating a private namespace; see ns_begin().
+ */
+/* ARGSUSED */
+int
+sys_ns_begin(struct lwp *l, const void *uap, register_t *retval)
+{
+	return ns_begin();
+}
+
+/*
+ * Copy in a user pathname for which a null pointer is a valid value.
+ */
+static int
+copyinpath_nullok(const char *ustr, char **kstr_ret, size_t max)
+{
+	char *pathbuf = NULL;
+	int rv = 0;
+
+	KASSERT(max <= PATH_MAX);
+
+	if (ustr != NULL) {
+		/* A non-null result is a PNBUF_GET buffer the caller owns. */
+		pathbuf = PNBUF_GET();
+		rv = copyinstr(ustr, pathbuf, max, NULL);
+		if (rv != 0) {
+			/* *kstr_ret is left untouched on failure */
+			PNBUF_PUT(pathbuf);
+			return rv;
+		}
+	}
+
+	*kstr_ret = pathbuf;
+	return 0;
+}
+
+/*
+ * System call: add one rule to the private namespace being constructed.
+ */
+/* ARGSUSED */
+int
+sys_ns_addrule(struct lwp *l, const struct sys_ns_addrule_args *uap,
+		register_t *retval)
+{
+	/* {
+		syscallarg(const char *) dir;
+		syscallarg(const char *) name;
+		syscallarg(const char *) replacement;
+	} */
+	char *kdir = NULL, *kname = NULL, *krepl = NULL;
+	int error;
+
+	/* Copy the strings in one at a time, stopping at the first failure. */
+	error = copyinpath_nullok(SCARG(uap, dir), &kdir, PATH_MAX);
+	if (error == 0) {
+		error = copyinpath_nullok(SCARG(uap, name), &kname,
+		    NAME_MAX+1);
+	}
+	if (error == 0) {
+		error = copyinpath_nullok(SCARG(uap, replacement), &krepl,
+		    PATH_MAX);
+	}
+	if (error == 0) {
+		error = ns_addrule(kdir, kname, krepl);
+	}
+
+	/* Release whichever buffers were obtained, success or not. */
+	if (krepl != NULL) {
+		PNBUF_PUT(krepl);
+	}
+	if (kname != NULL) {
+		PNBUF_PUT(kname);
+	}
+	if (kdir != NULL) {
+		PNBUF_PUT(kdir);
+	}
+	return error;
+}
+
+/*
+ * System call: finish constructing a private namespace; see ns_finish().
+ */
+/* ARGSUSED */
+int
+sys_ns_finish(struct lwp *l, const void *uap, register_t *retval)
+{
+	return ns_finish();
+}
+
+/*
+ * System call: select use of the privileged or unprivileged namespace.
+ * Any nonzero usepriv is passed on as true. (privileged operation)
+ */
+/* ARGSUSED */
+int
+sys_ns_setns(struct lwp *l, const struct sys_ns_setns_args *uap,
+		register_t *retval)
+{
+	/* {
+		syscallarg(int) usepriv;
+	} */
+
+	return ns_setns(SCARG(uap, usepriv) != 0);
+}
+
+/*
+ * System call: make the unprivileged namespace the privileged one as
+ * well; see ns_empower(). (privileged operation)
+ */
+/* ARGSUSED */
+int
+sys_ns_empower(struct lwp *l, const void *uap, register_t *retval)
+{
+	return ns_empower();
+}
diff -r d581ced863ba -r ef920de0474a sys/kern/vfs_vnops.c
--- a/sys/kern/vfs_vnops.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/vfs_vnops.c	Sat Nov 28 10:38:33 2015 -0500
@@ -90,6 +90,7 @@
 #include <sys/filedesc.h>
 #include <sys/wapbl.h>
 #include <sys/mman.h>
+#include <sys/namespace.h>
 
 #include <miscfs/specfs/specdev.h>
 #include <miscfs/fifofs/fifo.h>
@@ -467,21 +468,49 @@ vn_rdwr(enum uio_rw rw, struct vnode *vp
 }
 
+/*
+ * Read directory entries from fp's vnode into callerbuf; on success
+ * *done is set to the number of bytes produced.
+ *
+ * When ns_hasentriesfor() reports namespace entries for the directory,
+ * the VOP_READDIR output is passed through ns_filterdents(), using a
+ * temporary kernel buffer if the caller's buffer is not UIO_SYSSPACE.
+ * Once that buffer may exist, every exit must go through "out".
+ */
 int
-vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
+vn_readdir(file_t *fp, char *callerbuf, int segflg, u_int count, int *done,
     struct lwp *l, off_t **cookies, int *ncookies)
 {
 	struct vnode *vp = fp->f_vnode;
 	struct iovec aiov;
 	struct uio auio;
+	int do_ns;
+	char *nsbuf;
+	char *readdirbuf;
 	int error, eofflag;
 
 	/* Limit the size on any kernel buffers used by VOP_READDIR */
 	count = min(MAXBSIZE, count);
 
+	do_ns = ns_hasentriesfor(vp);
+	if (do_ns && segflg != UIO_SYSSPACE) {
+		/*
+		 * Need to VOP_READDIR into a kernel buffer so we can
+		 * munge it.
+		 */
+		readdirbuf = nsbuf = kmem_alloc(count, KM_SLEEP);
+		segflg = UIO_SYSSPACE;
+	} else {
+		nsbuf = NULL;
+		readdirbuf = callerbuf;
+	}
+
 unionread:
-	if (vp->v_type != VDIR)
-		return (EINVAL);
-	aiov.iov_base = bf;
+	if (vp->v_type != VDIR) {
+		/* nsbuf may already be allocated; "out" frees it */
+		error = EINVAL;
+		goto out;
+	}
+	aiov.iov_base = readdirbuf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
@@ -502,14 +531,20 @@ unionread:
 	mutex_exit(&fp->f_lock);
 	VOP_UNLOCK(vp);
 	if (error)
-		return (error);
+		goto out;
+
+	if (do_ns) {
+		error = ns_filterdents(vp, readdirbuf, count - auio.uio_resid);
+		if (error)
+			goto out;
+	}
 
 	if (count == auio.uio_resid && vn_union_readdir_hook) {
 		struct vnode *ovp = vp;
 
 		error = (*vn_union_readdir_hook)(&vp, fp, l);
 		if (error)
-			return (error);
+			goto out;
 		if (vp != ovp)
 			goto unionread;
 	}
@@ -526,7 +561,45 @@ unionread:
 		vrele(tvp);
 		goto unionread;
 	}
+
+	if (count == auio.uio_resid && do_ns) {
+		/*
+		 * XXX: if the names from the namespace rules don't
+		 * fit into the buffer you can't get the rest of them;
+		 * to handle that we'd have to create a fake vnode for
+		 * calling VOP_READDIR on and stuff that in fp->f_vnode.
+		 *
+		 * (Also note that in that case we'd have to arrange
+		 * to know to skip ns_filterdents, or it would all be
+		 * an expensive nop.)
+		 *
+		 * However, we expect namespace rules to be limited so
+		 * it's probably ok. And the right solution to this
+		 * problem is not to create such terrible hacks but to
+		 * restructure the way directory reading works so it
+		 * all happens cleanly.
+		 *
+		 * XXX: does this need to be passed cookies/ncookies?
+		 */
+		error = ns_readdir(vp, &auio);
+		if (error) {
+			goto out;
+		}
+	}
+
+	if (nsbuf != NULL) {
+		KASSERT(do_ns);
+		error = copyout(nsbuf, callerbuf, count - auio.uio_resid);
+		if (error) {
+			goto out;
+		}
+	}
+
 	*done = count - auio.uio_resid;
+out:
+	if (nsbuf != NULL) {
+		kmem_free(nsbuf, count);
+	}
 	return error;
 }
 
diff -r d581ced863ba -r ef920de0474a sys/rump/librump/rumpvfs/Makefile.rumpvfs
--- a/sys/rump/librump/rumpvfs/Makefile.rumpvfs	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/rump/librump/rumpvfs/Makefile.rumpvfs	Sat Nov 28 10:38:33 2015 -0500
@@ -33,6 +33,7 @@ SRCS+=	kern_physio.c
 # sys/kern vfs
 SRCS+=	vfs_bio.c vfs_cache.c vfs_cwd.c vfs_dirhash.c vfs_getcwd.c	\
 	vfs_hooks.c vfs_init.c vfs_lockf.c vfs_lookup.c vfs_mount.c	\
+	vfs_namespace.c \
 	vfs_subr.c vfs_syscalls.c vfs_trans.c vfs_vnode.c vfs_vnops.c	\
 	vfs_wapbl.c vfs_xattr.c
 
diff -r d581ced863ba -r ef920de0474a sys/rump/librump/rumpvfs/rump_vfs.c
--- a/sys/rump/librump/rumpvfs/rump_vfs.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/rump/librump/rumpvfs/rump_vfs.c	Sat Nov 28 10:38:33 2015 -0500
@@ -64,6 +64,7 @@ pvfs_init(struct proc *p)
 {
 
 	p->p_cwdi = cwdinit();
+	p->p_nsinfo = NULL;
 }
 
 static void
@@ -132,6 +133,7 @@ RUMP_COMPONENT(RUMP__FACTION_VFS)
 	/* bootstrap cwdi (rest done in vfs_mountroot() */
 	proc0.p_cwdi = &cwdi0;
 	proc0.p_cwdi = cwdinit();
+	proc0.p_nsinfo = NULL;
 
 	vfs_attach(&rumpfs_vfsops);
 	vfs_mountroot();
diff -r d581ced863ba -r ef920de0474a sys/sys/Makefile
--- a/sys/sys/Makefile	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/sys/Makefile	Sat Nov 28 10:38:33 2015 -0500
@@ -27,7 +27,7 @@ INCS=	acct.h agpio.h aio.h ansi.h aout_m
 	localedef.h lock.h lockf.h lua.h lwp.h lwpctl.h \
 	malloc.h mallocvar.h mbuf.h md4.h md5.h midiio.h \
 	mman.h module.h mount.h mqueue.h msg.h msgbuf.h mtio.h mutex.h \
-	namei.h null.h \
+	namei.h namespace.h null.h \
 	param.h pcu.h pipe.h pmc.h poll.h pool.h power.h proc.h \
 	protosw.h pset.h ptrace.h ptree.h \
 	queue.h quota.h quotactl.h \
diff -r d581ced863ba -r ef920de0474a sys/sys/namespace.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/sys/namespace.h	Sat Nov 28 10:38:33 2015 -0500
@@ -0,0 +1,125 @@
+/*
+ * per-process namespace stuff
+ */
+
+#ifndef _SYS_NAMESPACE_H_
+#define _SYS_NAMESPACE_H_
+
+#ifdef _KERNEL
+
+#include <sys/rbtree.h>
+
+struct proc;
+
+/*
+ * nsrulekey: the key half of nsrule (volume id, inode number, name)
+ *
+ * XXX: nsrk_name can't be const because in the allocated rules it's a
+ * pointer to string data that needs to be freed. But in places this
+ * requires casting away const, e.g. for the name one is looking up.
+ * Growl.
+ */
+struct nsrulekey {
+	fsid_t nsrk_vol;	/* volume id */
+	ino_t nsrk_ino;		/* inode number */
+	char *nsrk_name;	/* name; see XXX above about constness */
+	size_t nsrk_namelen;	/* length that goes with nsrk_name */
+};
+
+/*
+ * nsrule: one per namespace rule; pairs a key with the vnode it yields
+ */
+struct nsrule {
+	struct nsrulekey nsr_key;	/* key: see struct nsrulekey */
+
+	/* value: vnode to return */
+	struct vnode *nsr_vn;	/* may be null, which means EACCES */
+	ino_t nsr_ino;		/* inode number of nsr_vn */
+	unsigned nsr_type;	/* DT_* type of nsr_ino */
+
+	/* rbtree hook */
+	rb_node_t nsr_rbnode;
+};
+
+/*
+ * namespace: one globally per declared namespace
+ *
+ * The "fixed" elements are immutable when refcount > 1 so do not need
+ * locking.
+ *
+ * The rule table is a table of struct nsrule. The root vnode is a
+ * special case rule. Here NULL means there is no root vnode rule,
+ * rather than "generate EACCES", as the latter isn't sensible.
+ *
+ * XXX: what about emulation roots...?
+ */
+struct namespace {
+	unsigned ns_refcount;			/* Reference count (atomic) */
+	struct namespace *ns_parent;		/* parent namespace (fixed) */
+	rb_tree_t ns_rules;			/* rule table (fixed) */
+	unsigned ns_numrules;			/* # items in ns_rules */
+	struct vnode *ns_rootvn;		/* root vnode (fixed) */
+	bool ns_finished;			/* true -> immutable */
+};
+
+/*
+ * namespace info: one per process that is using namespaces
+ *
+ * Lock order: nsi_lock is taken before proc->p_lock and also before
+ * all the assorted locks used during namei.
+ *
+ * The refcount only comes into play when clone() results in multiple
+ * threads appearing as multiple procs. Otherwise the structure is
+ * copied at fork time.
+ */
+struct nsinfo {
+	unsigned nsi_refcount;		/* reference count (atomic) */
+	kmutex_t nsi_lock;		/* lock for rest of this structure */
+	struct namespace *nsi_space;	/* the main/unprivileged namespace */
+	struct namespace *nsi_privspace; /* the protected priv'd namespace */
+	unsigned nsi_nspaces;		/* number of namespaces created */
+	bool nsi_usepriv;		/* true -> use nsi_privspace */
+};
+
+/* Creating more than this many namespaces per process fails. */
+#define MAX_NS_PER_PROC		8
+/* Limit on the size of the rule table in each namespace */
+#define MAX_NSRULES_PER_NS	256
+
+/* fork-time interface */
+struct nsinfo *nsinfo_clone(struct nsinfo *prev);
+struct nsinfo *nsinfo_share(struct nsinfo *prev);
+
+/* exit-time interface */
+void nsinfo_decref(struct nsinfo *nsi);
+
+/* exec-time interface */
+void ns_exec(struct proc *p);
+
+/* lookup-time interface */
+int ns_getroot(struct vnode **ret);
+int ns_lookup(struct vnode *dvp, const char *name, size_t namelen,
+	      struct vnode **ret);
+
+/* readdir-time interface */
+bool ns_hasentriesfor(struct vnode *dvp);
+int ns_filterdents(struct vnode *dvp, char *buf, size_t len);
+int ns_readdir(struct vnode *dvp, struct uio *uio);
+
+#endif /* _KERNEL */
+
+/*
+ * These functions have the same signature in and out of the kernel.
+ */
+
+/* control interface */
+int ns_begin(void);
+int ns_addrule(const char *dir, const char *name, const char *replacement);
+int ns_finish(void);
+
+/* privileged control interface */
+int ns_setns(bool usepriv);
+int ns_empower(void);
+
+
+#endif /* _SYS_NAMESPACE_H_ */
diff -r d581ced863ba -r ef920de0474a sys/sys/proc.h
--- a/sys/sys/proc.h	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/sys/proc.h	Sat Nov 28 10:38:33 2015 -0500
@@ -225,6 +225,7 @@ struct proc {
 	/* Substructures: */
 	struct kauth_cred *p_cred;	/* p: Master copy of credentials */
 	struct filedesc	*p_fd;		/* :: Ptr to open files structure */
+	struct nsinfo   *p_nsinfo;	/* :: Per-process namespace info */
 	struct cwdinfo	*p_cwdi;	/* :: cdir/rdir/cmask info */
 	struct pstats	*p_stats;	/* :: Accounting/stats (PROC ONLY) */
 	struct plimit	*p_limit;	/* :: Process limits */