Preliminary namespaces patch. This adds: - namespace decls for kernel and userland in sys/sys/namespace.h - namespace code in sys/kern/vfs_namespace.c - system call entry points for the namespace operations - a namespace hook in struct proc - namespace operations in namei, readdir, exec, fork, and exit - system call definitions for the namespace syscalls - build glue for vfs_namespace.c - build glue and setlist updates to install sys/namespace.h - updates to secret rump process logic so rumpity stuff doesn't randomly explode diff -r d581ced863ba -r ef920de0474a distrib/sets/lists/comp/mi --- a/distrib/sets/lists/comp/mi Sat Nov 28 10:13:45 2015 -0500 +++ b/distrib/sets/lists/comp/mi Sat Nov 28 10:38:33 2015 -0500 @@ -2845,6 +2845,7 @@ ./usr/include/sys/mtio.h comp-c-include ./usr/include/sys/mutex.h comp-c-include ./usr/include/sys/namei.h comp-c-include +./usr/include/sys/namespace.h comp-c-include ./usr/include/sys/null.h comp-c-include ./usr/include/sys/optstr.h comp-obsolete obsolete ./usr/include/sys/param.h comp-c-include diff -r d581ced863ba -r ef920de0474a sys/kern/files.kern --- a/sys/kern/files.kern Sat Nov 28 10:13:45 2015 -0500 +++ b/sys/kern/files.kern Sat Nov 28 10:38:33 2015 -0500 @@ -183,6 +183,7 @@ file kern/vfs_init.c vfs file kern/vfs_lockf.c vfs file kern/vfs_lookup.c vfs file kern/vfs_mount.c vfs +file kern/vfs_namespace.c vfs file kern/vfs_quotactl.c vfs file kern/vfs_subr.c vfs file kern/vfs_syscalls.c vfs diff -r d581ced863ba -r ef920de0474a sys/kern/kern_exec.c --- a/sys/kern/kern_exec.c Sat Nov 28 10:13:45 2015 -0500 +++ b/sys/kern/kern_exec.c Sat Nov 28 10:38:33 2015 -0500 @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -1168,6 +1169,8 @@ execve_runproc(struct lwp *l, struct exe if (error != 0) goto exec_abort; + if (p->p_nsinfo != NULL) + ns_exec(p); cwdexec(p); fd_closeexec(); /* handle close on exec */ @@ -2439,6 +2442,12 @@ do_posix_spawn(struct lwp *l1, pid_t *pi p2->p_cwdi = cwdinit(); + if 
(p1->p_nsinfo != NULL) { + p2->p_nsinfo = nsinfo_clone(p1->p_nsinfo); + } else { + p2->p_nsinfo = NULL; + } + /* * Note: p_limit (rlimit stuff) is copy-on-write, so normally * we just need increase pl_refcnt. diff -r d581ced863ba -r ef920de0474a sys/kern/kern_exit.c --- a/sys/kern/kern_exit.c Sat Nov 28 10:13:45 2015 -0500 +++ b/sys/kern/kern_exit.c Sat Nov 28 10:38:33 2015 -0500 @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -282,6 +283,10 @@ exit1(struct lwp *l, int rv) fd_free(); cwdfree(p->p_cwdi); p->p_cwdi = NULL; + if (p->p_nsinfo != NULL) { + nsinfo_decref(p->p_nsinfo); + p->p_nsinfo = NULL; + } doexithooks(p); sigactsfree(p->p_sigacts); diff -r d581ced863ba -r ef920de0474a sys/kern/kern_fork.c --- a/sys/kern/kern_fork.c Sat Nov 28 10:13:45 2015 -0500 +++ b/sys/kern/kern_fork.c Sat Nov 28 10:38:33 2015 -0500 @@ -78,6 +78,7 @@ #include #include #include +#include #include #include #include @@ -362,6 +363,16 @@ fork1(struct lwp *l1, int flags, int exi else p2->p_cwdi = cwdinit(); + /* reuse SHARECWD for the namespace */ + if (p1->p_nsinfo != NULL) { + if (flags & FORK_SHARECWD) + p2->p_nsinfo = nsinfo_share(p1->p_nsinfo); + else + p2->p_nsinfo = nsinfo_clone(p1->p_nsinfo); + } else { + p2->p_nsinfo = NULL; + } + /* * Note: p_limit (rlimit stuff) is copy-on-write, so normally * we just need increase pl_refcnt. 
diff -r d581ced863ba -r ef920de0474a sys/kern/syscalls.master --- a/sys/kern/syscalls.master Sat Nov 28 10:13:45 2015 -0500 +++ b/sys/kern/syscalls.master Sat Nov 28 10:38:33 2015 -0500 @@ -979,3 +979,9 @@ 479 NOERR RUMP { int|sys||posix_fallocat off_t len); } 480 STD RUMP { int|sys||fdiscard(int fd, int PAD, off_t pos, \ off_t len); } +481 STD RUMP { int|sys||ns_begin(void); } +482 STD RUMP { int|sys||ns_addrule(const char *dir, \ + const char *name, const char *replacement); } +483 STD RUMP { int|sys||ns_finish(void); } +484 STD RUMP { int|sys||ns_setns(int usepriv); } +485 STD RUMP { int|sys||ns_empower(void); } diff -r d581ced863ba -r ef920de0474a sys/kern/vfs_lookup.c --- a/sys/kern/vfs_lookup.c Sat Nov 28 10:13:45 2015 -0500 +++ b/sys/kern/vfs_lookup.c Sat Nov 28 10:38:33 2015 -0500 @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -528,14 +529,36 @@ namei_cleanup(struct namei_state *state) * Initializes the rootdir and erootdir state and returns a reference * to the starting dir. */ -static struct vnode * -namei_getstartdir(struct namei_state *state) +static int +namei_getstartdir(struct namei_state *state, struct vnode **ret) { struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct cwdinfo *cwdi; /* pointer to cwd state */ struct lwp *self = curlwp; /* thread doing namei() */ struct vnode *rootdir, *erootdir, *curdir, *startdir; + int error; + + if (self->l_proc->p_nsinfo != NULL) { + error = ns_getroot(&startdir); + if (error) { + return error; + } + if (startdir != NULL) { + /* + * XXX. for now, don't allow emulation roots and + * namespaces at once. 
+ */ + if (cnp->cn_flags & (TRYEMULROOT | EMULROOTSET)) { + vrele(startdir); + state->attempt_retry = 1; + return ENOENT; + } + *ret = startdir; + return 0; + } + /* the namespace didn't have a root dir, continue as usual */ + } cwdi = self->l_proc->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_READER); @@ -592,7 +615,8 @@ namei_getstartdir(struct namei_state *st vref(startdir); rw_exit(&cwdi->cwdi_lock); - return startdir; + *ret = startdir; + return 0; } /* @@ -657,6 +681,7 @@ namei_start(struct namei_state *state, i { struct nameidata *ndp = state->ndp; struct vnode *startdir; + int error; /* length includes null terminator (was originally from copyinstr) */ ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1; @@ -675,7 +700,10 @@ namei_start(struct namei_state *state, i startdir = namei_getstartdir_for_nfsd(state); /* no ktrace */ } else { - startdir = namei_getstartdir(state); + error = namei_getstartdir(state, &startdir); + if (error) { + return error; + } namei_ktrace(state); } @@ -915,6 +943,48 @@ lookup_once(struct namei_state *state, *newsearchdir_ret = searchdir; /* + * I think it's ok to bypass the mount point crossing code + * for namespaces as that should have been handled when the + * namespace was prepared. But: XXX think about this more; + * nothing in here is straightforward or obvious. + */ + if (l->l_proc->p_nsinfo != NULL) { + error = ns_lookup(searchdir, cnp->cn_nameptr, cnp->cn_namelen, + &foundobj); + if (error) { + goto done; + } + if (foundobj != NULL) { + /* + * If we found a dir, it might be either above + * or below searchdir. In the former case, we + * must unlock searchdir before locking it, so + * do that unconditionally. Make searchdir be + * another reference to the result; this is + * harmless when we've found a directory as + * the result will become the next searchdir + * soon anyway, and (like other places in + * here) avoids creating additional special + * cases. 
+ * + * If we found a non-dir, it's safe to lock + * it while still holding searchdir's lock + * (at least, relatively) as all dirs are + * above and thus before all non-dirs. + */ + if (foundobj->v_type == VDIR) { + vput(searchdir); + vref(foundobj); + searchdir = foundobj; + *newsearchdir_ret = searchdir; + } + vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); + *foundobj_ret = foundobj; + goto done; + } + } + + /* * Handle "..": two special cases. * 1. If at root directory (e.g. after chroot) * or at absolute root directory diff -r d581ced863ba -r ef920de0474a sys/kern/vfs_namespace.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/kern/vfs_namespace.c Sat Nov 28 10:38:33 2015 -0500 @@ -0,0 +1,1065 @@ +#include +#include +#include +#include +#include /* required by sys/kauth.h */ +#include +#include +#include /* for fsid_t */ +#include +#include +#include +#include +#include + +#define vn_incref(vn) vref(vn) +#define vn_decref(vn) vrele(vn) +#define vn_unlock(vn) VOP_UNLOCK(vn) + +//////////////////////////////////////////////////////////// +// rbtree and helpers (code that should exist elsewhere but doesn't) + +/* + * Compare function for fsid_t. XXX: this knows the internal + * representation of fsid_t, which shouldn't be exposed here. + */ +static int +fsid_compare(fsid_t a, fsid_t b) +{ + unsigned i; + int32_t xa, xb; + + for (i=0; i<__arraycount(a.__fsid_val); i++) { + xa = a.__fsid_val[i]; + xb = b.__fsid_val[i]; + if (xa < xb) { + return -1; + } else if (xa > xb) { + return 1; + } + } + return 0; +} + +/* + * Remove operation for rbtrees that can be used in a loop. Removes + * and returns an arbitrary node from the rbtree; returns NULL when + * the tree is empty. 
+ */
+static void *
+rb_tree_removeone(rb_tree_t *rbt)
+{
+	void *elem;
+
+	RB_TREE_FOREACH(elem, rbt) {
+		/* tree not yet empty; remove the first node */
+		rb_tree_remove_node(rbt, elem);
+		return elem;
+	}
+	/* we didn't get anything; tree is empty */
+	return NULL;
+}
+
+////////////////////////////////////////////////////////////
+// nsrule/nsrulekey methods
+
+/*
+ * Allocate a rule whose key carries a private, null-terminated copy
+ * of NAME. The directory half of the key and the replacement vnode
+ * are filled in by the caller; until then the rule has no vnode,
+ * inode number 0 and type DT_UNKNOWN.
+ */
+static struct nsrule *
+nsrule_create(const char *name)
+{
+	struct nsrule *rule;
+	size_t len;
+
+	len = strlen(name);
+
+	rule = kmem_zalloc(sizeof(*rule), KM_SLEEP);
+	KASSERT(rule != NULL);
+
+	rule->nsr_key.nsrk_namelen = len;
+	rule->nsr_key.nsrk_name = kmem_alloc(len+1, KM_SLEEP);
+	strcpy(rule->nsr_key.nsrk_name, name);
+	rule->nsr_vn = NULL;
+	rule->nsr_ino = 0;
+	rule->nsr_type = DT_UNKNOWN;
+
+	return rule;
+}
+
+/*
+ * Release a rule: drop its vnode reference (if any) and free the name
+ * copy and the rule itself. The rule must already be out of any tree.
+ * Passing NULL is a no-op, so error paths that may or may not have
+ * allocated a rule (e.g. the root-directory case in ns_addrule) can
+ * call this unconditionally.
+ */
+static void
+nsrule_destroy(struct nsrule *rule)
+{
+	if (rule == NULL) {
+		return;
+	}
+	if (rule->nsr_vn != NULL) {
+		vn_decref(rule->nsr_vn);
+	}
+	kmem_free(rule->nsr_key.nsrk_name, rule->nsr_key.nsrk_namelen + 1);
+	kmem_free(rule, sizeof(*rule));
+}
+
+static int
+nsrulekey_setdir(struct nsrulekey *key, struct vnode *vp)
+{
+	/*
+	 * XXX, what a mess. There should be a vfs function to fetch
+	 * the fsid, and inode numbers should almost certainly just be
+	 * exposed through struct vnode, or failing that should be a
+	 * nonfailing vnode call.
+	 */
+	struct vattr va;
+	int result;
+
+	/* VOP_GETATTR is called with vp locked (shared is enough here) */
+	vn_lock(vp, LK_SHARED | LK_RETRY);
+	result = VOP_GETATTR(vp, &va, curlwp->l_cred);
+	vn_unlock(vp);
+	if (result) {
+		return result;
+	}
+
+	/* the directory is identified by (volume fsid, inode number) */
+	key->nsrk_vol = vp->v_mount->mnt_stat.f_fsidx;
+	key->nsrk_ino = va.va_fileid;
+	return 0;
+}
+
+/*
+ * True if both keys name the same directory, i.e. have equal volume
+ * and inode number. The name part of the keys is ignored.
+ */
+static bool
+nsrulekey_samevnode(const struct nsrulekey *k1, const struct nsrulekey *k2)
+{
+	if (fsid_compare(k1->nsrk_vol, k2->nsrk_vol)) {
+		return false;
+	}
+	return k1->nsrk_ino == k2->nsrk_ino;
+}
+
+/*
+ * Total order on rule keys: by volume, then inode number, then name
+ * (bytewise, with a shorter name sorting before any longer name it is
+ * a prefix of). Returns negative, zero or positive like memcmp.
+ */
+static int
+nsrulekey_compare(const struct nsrulekey *k1, const struct nsrulekey *k2)
+{
+	int ret;
+	size_t len;
+
+	ret = fsid_compare(k1->nsrk_vol, k2->nsrk_vol);
+	if (ret) {
+		return ret;
+	}
+
+	if (k1->nsrk_ino < k2->nsrk_ino) {
+		return -1;
+	} else if (k1->nsrk_ino > k2->nsrk_ino) {
+		return 1;
+	}
+
+	/*
+	 * Because one of the names will often come from namei, it
+	 * isn't null-terminated, only bounded by nsrk_namelen. Deal.
+	 */
+	len = min(k1->nsrk_namelen, k2->nsrk_namelen);
+	ret = memcmp(k1->nsrk_name, k2->nsrk_name, len);
+	if (ret) {
+		return ret;
+	}
+	/* common prefix is equal; the shorter name sorts first */
+	if (k1->nsrk_namelen < k2->nsrk_namelen) {
+		return -1;
+	} else if (k1->nsrk_namelen > k2->nsrk_namelen) {
+		return 1;
+	}
+	return 0;
+}
+
+/* rbtree callback: order two rules by their keys */
+static int
+ns_compare_nodes(void *ctx, const void *v1, const void *v2)
+{
+	const struct nsrule *r1 = v1;
+	const struct nsrule *r2 = v2;
+
+	return nsrulekey_compare(&r1->nsr_key, &r2->nsr_key);
+}
+
+/* rbtree callback: order a rule (v1) against a bare key (v2) */
+static int
+ns_compare_key(void *ctx, const void *v1, const void *v2)
+{
+	const struct nsrule *r1 = v1;
+	const struct nsrulekey *k2 = v2;
+
+	return nsrulekey_compare(&r1->nsr_key, k2);
+}
+
+/* rbtree glue for the per-namespace rule tree (ns_rules) */
+static const rb_tree_ops_t nsrule_rbtreeops = {
+	.rbto_compare_nodes = ns_compare_nodes,
+	.rbto_compare_key = ns_compare_key,
+	.rbto_node_offset = offsetof(struct nsrule, nsr_rbnode),
+	.rbto_context = NULL,
+};
+
+////////////////////////////////////////////////////////////
+// namespace methods
+
+/* Take an additional reference on NS (which must not be NULL). */
+static void
+namespace_incref(struct namespace *ns)
+{
+	atomic_inc_uint(&ns->ns_refcount);
+}
+
+/*
+ * Create a new, empty, unfinished namespace layered on top of PARENT
+ * (which may be NULL for a namespace layered directly on the real
+ * one). The new namespace holds a reference on its parent and is
+ * returned with a reference count of 1, owned by the caller.
+ */
+static struct namespace *
+namespace_create(struct namespace *parent)
+{
+	struct namespace *ns;
+
+	ns = kmem_alloc(sizeof(*ns), KM_SLEEP);
+	KASSERT(ns != NULL);
+
+	ns->ns_refcount = 1;
+	if (parent != NULL) {
+		KASSERT(parent->ns_finished == true);
+		namespace_incref(parent);
+	}
+	ns->ns_parent = parent;
+	rb_tree_init(&ns->ns_rules, &nsrule_rbtreeops);
+	ns->ns_numrules = 0;
+	ns->ns_rootvn = NULL;
+	ns->ns_finished = false;
+
+	return ns;
+}
+
+/*
+ * Drop a reference on NS; NULL is accepted and ignored. When the last
+ * reference goes away the namespace is torn down (rules, root vnode)
+ * and the reference it held on its parent is dropped in turn.
+ *
+ * The parent chain is walked iteratively rather than recursively so
+ * that a long chain cannot eat the kernel stack, and every field is
+ * read before the structure is freed.
+ */
+static void
+namespace_decref(struct namespace *ns)
+{
+	struct nsrule *rule;
+	struct namespace *parent;
+
+	while (ns != NULL) {
+		if (atomic_dec_uint_nv(&ns->ns_refcount) != 0) {
+			/* still referenced elsewhere */
+			return;
+		}
+
+		/* destroy */
+
+		while ((rule = rb_tree_removeone(&ns->ns_rules)) != NULL) {
+			nsrule_destroy(rule);
+			ns->ns_numrules--;
+		}
+		/* this doesn't seem to exist */
+		/*rb_tree_cleanup(&ns->ns_rules);*/
+		KASSERT(ns->ns_numrules == 0);
+
+		if (ns->ns_rootvn != NULL) {
+			vn_decref(ns->ns_rootvn);
+		}
+
+		/* fetch the parent before ns is gone */
+		parent = ns->ns_parent;
+		kmem_free(ns, sizeof(*ns));
+
+		/* now drop the reference we held on the parent */
+		ns = parent;
+	}
+}
+
+////////////////////////////////////////////////////////////
+// nsinfo methods
+
+/*
+ * Create an empty nsinfo (per-process data)
+ *
+ * Both namespace pointers start out NULL; code that uses them must
+ * cope with that.
+ */
+static struct nsinfo *
+nsinfo_create(void)
+{
+	struct nsinfo *nsi;
+
+	nsi = kmem_alloc(sizeof(*nsi), KM_SLEEP);
+	KASSERT(nsi != NULL);
+
+	nsi->nsi_refcount = 1;
+	mutex_init(&nsi->nsi_lock, MUTEX_DEFAULT, IPL_NONE);
+	nsi->nsi_space = NULL;
+	nsi->nsi_privspace = NULL;
+	nsi->nsi_nspaces = 0;
+	nsi->nsi_usepriv = false;
+
+	return nsi;
+}
+
+/*
+ * Copy nsinfo (used at fork time)
+ *
+ * The copy shares the namespaces themselves (taking a reference on
+ * each one that exists). Any namespace still under construction is
+ * marked finished first, since it is about to become shared.
+ */
+struct nsinfo *
+nsinfo_clone(struct nsinfo *prev)
+{
+	struct nsinfo *nsi;
+
+	nsi = nsinfo_create();
+
+	mutex_enter(&prev->nsi_lock);
+
+	/*
+	 * Mark both namespaces finished if they aren't. Do this
+	 * before taking the new references so the "sole owner"
+	 * assertions are checked against the pre-fork count.
+	 */
+	if (prev->nsi_space != NULL) {
+		if (prev->nsi_space->ns_finished == false) {
+			KASSERT(prev->nsi_space->ns_refcount == 1);
+			prev->nsi_space->ns_finished = true;
+		}
+		namespace_incref(prev->nsi_space);
+	}
+	if (prev->nsi_privspace != NULL) {
+		if (prev->nsi_privspace->ns_finished == false) {
+			KASSERT(prev->nsi_privspace->ns_refcount == 1);
+			prev->nsi_privspace->ns_finished = true;
+		}
+		namespace_incref(prev->nsi_privspace);
+	}
+	nsi->nsi_space = prev->nsi_space;
+	nsi->nsi_privspace = prev->nsi_privspace;
+
+	/*
+	 * Keep the namespace selection across fork; otherwise the
+	 * child of a process running in the privileged namespace
+	 * would silently fall back to the unprivileged one.
+	 */
+	nsi->nsi_usepriv = prev->nsi_usepriv;
+
+	mutex_exit(&prev->nsi_lock);
+	return nsi;
+}
+
+/*
+ * Share nsinfo (used at fork time when doing clone(2))
+ */
+struct nsinfo *
+nsinfo_share(struct nsinfo *prev)
+{
+	atomic_inc_uint(&prev->nsi_refcount);
+	return prev;
+}
+
+/*
+ * Destroy nsinfo (used at exit time)
+ *
+ * Drops one reference; the last one releases both namespaces (either
+ * of which may be NULL) and frees the nsinfo.
+ */
+void
+nsinfo_decref(struct nsinfo *nsi)
+{
+	unsigned newcount;
+
+	KASSERT(nsi != NULL);
+
+	newcount = atomic_dec_uint_nv(&nsi->nsi_refcount);
+	if (__predict_true(newcount == 0)) {
+
+		if (nsi->nsi_space != NULL) {
+			namespace_decref(nsi->nsi_space);
+		}
+		if (nsi->nsi_privspace != NULL) {
+			namespace_decref(nsi->nsi_privspace);
+		}
+		mutex_destroy(&nsi->nsi_lock);
+
+		kmem_free(nsi, sizeof(*nsi));
+	}
+}
+
+/*
+ * Fetch the current namespace based on the usepriv flag.
+ */
+static struct namespace *
+nsinfo_getspace(struct nsinfo *nsi)
+{
+	if (nsi->nsi_usepriv) {
+#ifdef DIAGNOSTIC
+		mutex_enter(curproc->p_lock);
+		KASSERT((curproc->p_flag & PK_SUGID) != 0);
+		mutex_exit(curproc->p_lock);
+#endif
+		return nsi->nsi_privspace;
+
+	} else {
+		return nsi->nsi_space;
+	}
+}
+
+/*
+ * Change the current namespace based on the usepriv flag.
+ *
+ * This only stores the pointer; reference counts are the caller's
+ * business.
+ */
+static void
+nsinfo_setspace(struct nsinfo *nsi, struct namespace *ns)
+{
+	if (nsi->nsi_usepriv) {
+#ifdef DIAGNOSTIC
+		mutex_enter(curproc->p_lock);
+		KASSERT((curproc->p_flag & PK_SUGID) != 0);
+		mutex_exit(curproc->p_lock);
+#endif
+		nsi->nsi_privspace = ns;
+
+	} else {
+		nsi->nsi_space = ns;
+	}
+}
+
+////////////////////////////////////////////////////////////
+// exec-time interface
+
+/*
+ * Handle namespace work that is needed at exec time.
This means:
+ *    - mark both namespaces finished if they aren't
+ *    - clear nsi_nspaces
+ *    - initialize nsi_usepriv
+ *
+ * Either namespace pointer may still be NULL (a fresh nsinfo has
+ * neither, and the privileged one stays NULL until ns_empower), so
+ * both are checked before use.
+ */
+void
+ns_exec(struct proc *p)
+{
+	struct nsinfo *nsi;
+
+	KASSERT(p->p_nsinfo != NULL);
+	nsi = p->p_nsinfo;
+	mutex_enter(&nsi->nsi_lock);
+
+	/* Mark both namespaces finished if they aren't. */
+	if (nsi->nsi_space != NULL &&
+	    nsi->nsi_space->ns_finished == false) {
+		KASSERT(nsi->nsi_space->ns_refcount == 1);
+		nsi->nsi_space->ns_finished = true;
+	}
+	if (nsi->nsi_privspace != NULL &&
+	    nsi->nsi_privspace->ns_finished == false) {
+		KASSERT(nsi->nsi_privspace->ns_refcount == 1);
+		nsi->nsi_privspace->ns_finished = true;
+	}
+
+	/* reset the number-of-spaces counter */
+	nsi->nsi_nspaces = 0;
+
+	/* initialize nsi_usepriv based on setugid state */
+	mutex_enter(p->p_lock);
+	nsi->nsi_usepriv = (p->p_flag & PK_SUGID) != 0;
+	mutex_exit(p->p_lock);
+
+	mutex_exit(&nsi->nsi_lock);
+}
+
+////////////////////////////////////////////////////////////
+// lookup-time interface
+
+/*
+ * Fetch a reference to the root directory as defined by the current
+ * namespace.
+ *
+ * The namespace chain is searched from the current namespace towards
+ * its ancestors; the first one that defines a root wins. If none
+ * does, *RET is set to NULL and the caller uses the real root.
+ * Always returns 0.
+ *
+ * By getting here we know that curproc->p_nsinfo is non-null.
+ */
+int
+ns_getroot(struct vnode **ret)
+{
+	struct nsinfo *nsi;
+	struct namespace *ns;
+
+	nsi = curproc->p_nsinfo;
+	KASSERT(nsi != NULL);
+
+	/* the lock only covers picking the starting namespace */
+	mutex_enter(&nsi->nsi_lock);
+	ns = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+
+	while (ns != NULL) {
+		if (ns->ns_rootvn != NULL) {
+			vn_incref(ns->ns_rootvn);
+			*ret = ns->ns_rootvn;
+			return 0;
+		}
+		ns = ns->ns_parent;
+	}
+
+	/* no root dir defined; use the real root */
+	*ret = NULL;
+	return 0;
+}
+
+/*
+ * Look up the namespace rule, if any, for NAME in directory DVP, and
+ * return a reference to the resulting object in RET. If there is no
+ * rule, succeed and set RET to NULL.
+ *
+ * By getting here we know that curproc->p_nsinfo is non-null.
+ *
+ * NAME need not be null-terminated; only the first NAMELEN bytes are
+ * used. A rule with no replacement vnode means "deny" and yields
+ * EACCES (RET is left untouched in that case). Errors from fetching
+ * DVP's attributes are passed through.
+ *
+ * NOTE(review): nsrulekey_setdir() takes DVP's vnode lock, while the
+ * lookup_once() caller appears to hold searchdir locked already --
+ * confirm this cannot self-deadlock.
+ */
+int
+ns_lookup(struct vnode *dvp, const char *name, size_t namelen,
+	  struct vnode **ret)
+{
+	struct nsinfo *nsi;
+	struct namespace *ns;
+	struct nsrule *rule;
+	struct nsrulekey key;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	KASSERT(nsi != NULL);
+
+	/*
+	 * Note that even if other threads in the process are screwing
+	 * around, the namespace cannot disappear under us: creating a
+	 * new one inherits from the old one and doesn't discard it.
+	 * If we get around to implementing coalescing of namespaces we
+	 * might need to revisit this.
+	 *
+	 * Namespaces disappear only when the last process referring
+	 * to them exits (in which case there are no other threads
+	 * doing lookup in them) or when root copies the unprivileged
+	 * namespace to the privileged namespace. In the latter case
+	 * we just demand that there are no other threads in the
+	 * process, as the only time that should be happening is in
+	 * chroot(8) or similar.
+	 */
+
+	mutex_enter(&nsi->nsi_lock);
+	ns = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+
+	/* build the search key: (directory identity, name) */
+	result = nsrulekey_setdir(&key, dvp);
+	if (result) {
+		return result;
+	}
+	key.nsrk_name = (char *)(uintptr_t)name; /* XXXconst */
+	key.nsrk_namelen = namelen;
+
+	/* innermost namespace first; the first matching rule wins */
+	while (ns != NULL) {
+		rule = rb_tree_find_node(&ns->ns_rules, &key);
+		if (rule != NULL) {
+			if (rule->nsr_vn == NULL) {
+				/* Rule says to deny access. */
+				return EACCES;
+			} else {
+				/* Rule says to return this vnode. */
+				vn_incref(rule->nsr_vn);
+				*ret = rule->nsr_vn;
+				return 0;
+			}
+		}
+		ns = ns->ns_parent;
+	}
+
+	/* Reached the canonical (real) namespace; we're done. */
+	*ret = NULL;
+	return 0;
+}
+
+////////////////////////////////////////////////////////////
+// readdir-time interface
+
+/*
+ * Check if the current namespace has any rules for names in the
+ * directory passed in.
+ *
+ * Returns false if the process has no namespace at all. If DVP's
+ * identity cannot be determined the answer is "true", so that the
+ * caller takes the filtering path rather than failing open.
+ */
+bool
+ns_hasentriesfor(struct vnode *dvp)
+{
+	struct nsinfo *nsi;
+	struct namespace *ns;
+	struct nsrule *rule;
+	struct nsrulekey key;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		return false;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	ns = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+
+	result = nsrulekey_setdir(&key, dvp);
+	if (result) {
+		/* XXX bah. shouldn't happen, but don't fail open */
+		printf("%s: warning: error %d looking up dir\n",
+		       __func__, result);
+		return true;
+	}
+	/*
+	 * The empty name sorts before every real name, so the first
+	 * rule at or after this key is the first rule for DVP, if
+	 * there is one.
+	 */
+	key.nsrk_name = (char *)(uintptr_t)""; /* XXXconst */
+	key.nsrk_namelen = 0;
+
+	while (ns != NULL) {
+		rule = rb_tree_find_node_geq(&ns->ns_rules, &key);
+		if (rule != NULL && nsrulekey_samevnode(&rule->nsr_key, &key)) {
+			return true;
+		}
+		ns = ns->ns_parent;
+	}
+
+	return false;
+}
+
+/*
+ * Check if a rule matching KEY exists in the current namespace.
+ *
+ * NS is the innermost namespace; its ancestors are searched too.
+ */
+static bool
+ns_keyexists(struct namespace *ns, const struct nsrulekey *key)
+{
+	struct nsrule *rule;
+
+	while (ns != NULL) {
+		rule = rb_tree_find_node(&ns->ns_rules, key);
+		if (rule != NULL) {
+			return true;
+		}
+		ns = ns->ns_parent;
+	}
+	return false;
+}
+
+/*
+ * Take a buffer full of struct dirent and overwrite any entries for
+ * names that are mentioned in the namespace rules for the passed-in
+ * directory.
+ *
+ * The total length of the buffer contents is preserved. A filtered
+ * entry is blanked and then hidden by folding its record length into
+ * the nearest preceding entry that is kept. Filtered entries at the
+ * very front have no predecessor, so the first kept entry is moved to
+ * the start of the buffer and stretched over them instead. If every
+ * entry is filtered there is nothing to stretch; the entries are then
+ * left in place, blanked, with d_fileno 0 (presumably skipped by
+ * readers as empty slots -- TODO confirm against libc readdir).
+ *
+ * Returns 0, an error from looking up DVP, or EINVAL if the buffer
+ * contains a zero-length record.
+ */
+int
+ns_filterdents(struct vnode *dvp, char *buf, size_t len)
+{
+	struct nsinfo *nsi;
+	struct namespace *ns;
+	struct nsrulekey key;
+	struct dirent *dp, *lastdp;
+	size_t pos, frontsize, reclen;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		/* nothing to do */
+		return 0;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	ns = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+
+	if (ns == NULL) {
+		/* shortcut */
+		return 0;
+	}
+
+	result = nsrulekey_setdir(&key, dvp);
+	if (result) {
+		return result;
+	}
+
+	lastdp = NULL;		/* last entry we decided to keep */
+	frontsize = 0;		/* bytes of filtered entries before it */
+	for (pos = 0; pos < len; pos += reclen) {
+		dp = (struct dirent *)(buf + pos);
+		reclen = dp->d_reclen;
+		if (reclen == 0) {
+			/* malformed record; don't spin here forever */
+			return EINVAL;
+		}
+		if (dp->d_namlen == 2 &&
+		    dp->d_name[0] == '.' && dp->d_name[1] == '.') {
+			/*
+			 * don't need to filter out "..", but it is
+			 * a kept entry like any other: forgetting it
+			 * here would let a later merge swallow it.
+			 */
+			lastdp = dp;
+			continue;
+		}
+		key.nsrk_name = dp->d_name;
+		key.nsrk_namelen = dp->d_namlen;
+		if (!ns_keyexists(ns, &key)) {
+			lastdp = dp;
+			continue;
+		}
+		/* need to zot this entry */
+		dp->d_type = DT_UNKNOWN;
+		dp->d_fileno = 0;
+		bzero(dp->d_name, dp->d_namlen);
+		dp->d_namlen = 0;
+		if (lastdp == NULL) {
+			/* nothing kept yet; deal with it after the loop */
+			frontsize += reclen;
+		} else {
+			/* fold it into the previous kept entry */
+			lastdp->d_reclen += reclen;
+			dp->d_reclen = 0;
+		}
+	}
+	if (frontsize > 0 && frontsize < len) {
+		/*
+		 * Slide the first kept entry down to the start of the
+		 * buffer and grow it to cover the gap it leaves.
+		 */
+		dp = (struct dirent *)(buf + frontsize);
+		dp->d_reclen += frontsize;
+		memmove(buf, dp, dp->d_reclen - frontsize);
+	}
+
+	return 0;
+}
+
+/*
+ * Do the equivalent of VOP_READDIR for all the names that are
+ * mentioned in the namespace rules for the passed-in directory.
+ *
+ * Note: this has no ability to start in the middle.
+ *
+ * Entries are produced innermost namespace first and copied out
+ * through UIO one at a time. Only whole entries are sent; output
+ * stops (successfully) when the next one would not fit. Returns 0 or
+ * an error from looking up DVP or from uiomove.
+ */
+int
+ns_readdir(struct vnode *dvp, struct uio *uio)
+{
+	struct nsinfo *nsi;
+	struct namespace *ns;
+	struct nsrule *rule;
+	struct nsrulekey key;
+	struct dirent *dp;
+	size_t len;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		/* no namespace, no entries */
+		return 0;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	ns = nsinfo_getspace(nsi);
+	mutex_exit(&nsi->nsi_lock);
+
+	result = nsrulekey_setdir(&key, dvp);
+	if (result) {
+		return result;
+	}
+	/* the empty name sorts before every real name in DVP */
+	key.nsrk_name = (char *)(uintptr_t)""; /* XXXconst */
+	key.nsrk_namelen = 0;
+
+	dp = kmem_zalloc(sizeof(*dp), KM_SLEEP);
+
+	while (ns != NULL && uio->uio_resid > 0) {
+		/*
+		 * Walk the rules for DVP in key order. Iteration must
+		 * continue from the current rule; the search key is
+		 * not a tree node.
+		 */
+		for (rule = rb_tree_find_node_geq(&ns->ns_rules, &key);
+		     rule != NULL &&
+			     nsrulekey_samevnode(&rule->nsr_key, &key);
+		     rule = rb_tree_iterate(&ns->ns_rules, rule,
+					    RB_DIR_RIGHT)) {
+
+			if (!strcmp(rule->nsr_key.nsrk_name, "..")) {
+				/* don't need to issue another ".." */
+				continue;
+			}
+
+			len = strlen(rule->nsr_key.nsrk_name);
+			KASSERT(len + 1 < sizeof(dp->d_name));
+			/*
+			 * Start from a clean slate each time so that
+			 * neither stale heap contents nor the tail of
+			 * a previous, longer name leak out in the
+			 * record padding.
+			 */
+			memset(dp, 0, sizeof(*dp));
+			memcpy(dp->d_name, rule->nsr_key.nsrk_name, len);
+			dp->d_name[len] = '\0';
+			dp->d_fileno = rule->nsr_ino;
+			dp->d_type = rule->nsr_type;
+			dp->d_namlen = len;
+			dp->d_reclen = _DIRENT_RECLEN(dp, len);
+
+			/*
+			 * uiomove will happily send back a partial
+			 * entry, which might make libc splode; stop
+			 * cleanly when the entry doesn't fit.
+			 */
+			if (uio->uio_resid < dp->d_reclen) {
+				result = 0;
+				goto out;
+			}
+
+			result = uiomove(dp, dp->d_reclen, uio);
+			if (result) {
+				goto out;
+			}
+		}
+		ns = ns->ns_parent;
+	}
+	result = 0;
+
+out:
+	kmem_free(dp, sizeof(*dp));
+	return result;
+}
+
+////////////////////////////////////////////////////////////
+// control interface
+
+/*
+ * Begin construction of a new namespace.
+ *
+ * The new namespace is layered on top of the process's current one
+ * and replaces it as the current namespace; it accepts rules until
+ * it is marked finished. The first call in a process also creates
+ * the process's nsinfo.
+ *
+ * Returns EPERM once MAX_NS_PER_PROC namespaces have been begun
+ * (the count is reset at exec time) and EALREADY if a namespace is
+ * already under construction.
+ */
+int
+ns_begin(void)
+{
+	struct nsinfo *nsi, *foundnsi;
+	struct namespace *ns, *newns;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		nsi = nsinfo_create();
+		KASSERT(nsi != NULL);
+
+		/* set p_nsinfo atomically */
+		foundnsi = atomic_cas_ptr(&curproc->p_nsinfo, NULL, nsi);
+		if (foundnsi != NULL) {
+			/* someone else also just did ns_begin */
+			nsinfo_decref(nsi);
+			/* maybe should just fail */
+			nsi = foundnsi;
+		}
+		/*
+		 * Technically we need a membar_store_any here, but
+		 * the immediately following mutex_enter() has one.
+		 */
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	if (nsi->nsi_nspaces >= MAX_NS_PER_PROC) {
+		/* too many */
+		mutex_exit(&nsi->nsi_lock);
+		return EPERM;
+	}
+
+	ns = nsinfo_getspace(nsi);
+	if (ns != NULL && ns->ns_finished == false) {
+		/* already constructing a namespace */
+		mutex_exit(&nsi->nsi_lock);
+		return EALREADY;
+	}
+	newns = namespace_create(ns);
+	KASSERT(newns != NULL);
+	KASSERT(newns->ns_parent == ns);
+	nsinfo_setspace(nsi, newns);
+	/* count it, or the limit above never triggers */
+	nsi->nsi_nspaces++;
+	mutex_exit(&nsi->nsi_lock);
+
+	/*
+	 * The nsinfo no longer points at the old namespace, so give
+	 * back the reference it held. The old namespace stays alive
+	 * through the reference newns took on it as its parent.
+	 */
+	if (ns != NULL) {
+		namespace_decref(ns);
+	}
+
+	return 0;
+}
+
+/*
+ * Add a rule to the namespace under construction.
+ *
+ * DIR and NAME identify the directory entry being overridden and
+ * REPLACEMENT is the path of the object to show there instead; a
+ * NULL REPLACEMENT makes the name inaccessible. DIR and NAME both
+ * NULL defines the root directory of the namespace, in which case
+ * REPLACEMENT is mandatory.
+ *
+ * Returns EINVAL if no namespace is under construction or DIR/NAME
+ * are not both present or both absent, E2BIG if the namespace is
+ * full, EPERM for an attempt to deny "/", EEXIST if the rule (or
+ * root) is already defined, or an error from the path lookups.
+ *
+ * NOTE(review): the path lookups below run with nsi_lock held, and
+ * the lookup path (ns_getroot/ns_lookup) takes nsi_lock itself; this
+ * looks like it self-deadlocks unless the mutex is recursive --
+ * confirm and rework if so.
+ */
+int
+ns_addrule(const char *dir, const char *name, const char *replacement)
+{
+	struct nsinfo *nsi;
+	struct namespace *newns, *prevns;
+	struct nsrule *rule, *oldrule;
+	struct vnode *dvp, *replvp;
+	struct vattr va;
+	int result;
+
+	/* these come straight from userland; don't assert, check */
+	if ((dir == NULL) != (name == NULL)) {
+		return EINVAL;
+	}
+
+	/* the syscall code enforces this */
+	KASSERT(name == NULL || strlen(name) <= NAME_MAX);
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		/* no namespace */
+		return EINVAL;
+	}
+
+	/* what the common exit path has to clean up, if anything */
+	rule = NULL;
+	replvp = NULL;
+
+	mutex_enter(&nsi->nsi_lock);
+	newns = nsinfo_getspace(nsi);
+	if (newns == NULL || newns->ns_finished == true) {
+		/* not creating a namespace */
+		result = EINVAL;
+		goto out;
+	}
+	prevns = newns->ns_parent;
+
+	KASSERT(newns->ns_refcount == 1);
+
+	if (newns->ns_numrules >= MAX_NSRULES_PER_NS) {
+		/* not an especially good errno, but best we have I think */
+		result = E2BIG;
+		goto out;
+	}
+
+	if (dir == NULL) {
+		/* rule for / */
+		if (replacement == NULL) {
+			/* not sensible to deny access to / */
+			result = EPERM;
+			goto out;
+		}
+	} else {
+		/*
+		 * To help make the resulting namespace coherent, the
+		 * *source* dir is looked up in the parent namespace
+		 * and the *replacement* in the new namespace. Then if
+		 * the rules are loaded last-to-first overlapping
+		 * entries will behave as if the rules are applied in
+		 * order first-to-last. Note that it's the caller's
+		 * responsibility to not insert rules where the key
+		 * side is unreachable once the whole namespace is
+		 * loaded. Such rules have no particular negative
+		 * impact but probably do not produce the results the
+		 * operator intended.
+		 *
+		 * Note that we do this by mucking with
+		 * nsi->nsi_space, so we must hold nsi_lock across the
+		 * lookups to prevent other threads in the process
+		 * from seeing the namespace flap. Of course it's
+		 * pretty silly to be building a namespace while other
+		 * threads are doing lookups, but that's the user's
+		 * lookout.
+		 */
+
+		/* Look up the source dir, in the old namespace. */
+		nsinfo_setspace(nsi, prevns);
+		result = namei_simple_kernel(dir, NSM_FOLLOW_NOEMULROOT, &dvp);
+		if (result) {
+			nsinfo_setspace(nsi, newns);
+			goto out;
+		}
+
+		rule = nsrule_create(name);
+		KASSERT(rule != NULL);
+		result = nsrulekey_setdir(&rule->nsr_key, dvp);
+		/* the key holds the dir's identity, not a reference */
+		vn_decref(dvp);
+		nsinfo_setspace(nsi, newns);
+		if (result) {
+			goto out;
+		}
+	}
+
+	/* Look up the replacement, back in the new namespace. */
+	if (replacement != NULL) {
+		result = namei_simple_kernel(replacement,
+					     NSM_FOLLOW_NOEMULROOT,
+					     &replvp);
+		if (result) {
+			/* no reference was handed to us */
+			replvp = NULL;
+			goto out;
+		}
+
+		vn_lock(replvp, LK_SHARED | LK_RETRY);
+		result = VOP_GETATTR(replvp, &va, curlwp->l_cred);
+		vn_unlock(replvp);
+		if (result) {
+			goto out;
+		}
+	}
+
+	if (dir == NULL) {
+		/* Rule for / */
+		if (newns->ns_rootvn != NULL) {
+			result = EEXIST;
+			goto out;
+		}
+		/* the namespace takes over our reference */
+		newns->ns_rootvn = replvp;
+		replvp = NULL;
+	} else {
+		if (replvp != NULL) {
+			rule->nsr_ino = va.va_fileid;
+			rule->nsr_type = IFTODT(va.va_mode);
+		}
+		/*
+		 * else: a deny rule; va was never filled in, so keep
+		 * the inode 0 / DT_UNKNOWN set by nsrule_create.
+		 */
+
+		/* the rule takes over our reference (if any) */
+		rule->nsr_vn = replvp;
+		replvp = NULL;
+
+		/* Insert the rule */
+
+		oldrule = rb_tree_insert_node(&newns->ns_rules, rule);
+		if (oldrule != rule) {
+			result = EEXIST;
+			goto out;
+		}
+		newns->ns_numrules++;
+		/* the tree owns it now */
+		rule = NULL;
+	}
+
+	/* Done */
+	result = 0;
+
+out:
+	/* release whatever was not handed over to the namespace */
+	if (rule != NULL) {
+		nsrule_destroy(rule);
+	}
+	if (replvp != NULL) {
+		vn_decref(replvp);
+	}
+	mutex_exit(&nsi->nsi_lock);
+	return result;
+}
+
+/*
+ * Mark the namespace under construction finished.
+ *
+ * After this no more rules can be added to it. Returns EINVAL if the
+ * process has no namespace or the current one is already finished.
+ */
+int
+ns_finish(void)
+{
+	struct nsinfo *nsi;
+	struct namespace *newns;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		/* no namespace */
+		return EINVAL;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	newns = nsinfo_getspace(nsi);
+	if (newns == NULL || newns->ns_finished == true) {
+		/* not creating a namespace */
+		mutex_exit(&nsi->nsi_lock);
+		return EINVAL;
+	}
+
+	/* an unfinished namespace is expected to have a single owner */
+	KASSERT(newns->ns_refcount == 1);
+
+	newns->ns_finished = true;
+
+	mutex_exit(&nsi->nsi_lock);
+	return 0;
+}
+
+////////////////////////////////////////////////////////////
+// privileged control interface
+
+/*
+ * Switch back and forth between the privileged and unprivileged
+ * namespaces. Only setugid processes (or root) may do this.
+ *
+ * NOTE(review): as written the process must be setugid *and* pass
+ * the chroot authorization check; a non-setugid root process gets
+ * EPERM. Confirm which of the two readings is intended.
+ *
+ * XXX: is there a kauth way to check issetugid?
+ * XXX: is chroot the right permission check?
+ */
+int
+ns_setns(bool usepriv)
+{
+	struct nsinfo *nsi;
+	bool ok;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		/* no namespace */
+		return EINVAL;
+	}
+
+	/* first gate: the process must be marked setugid */
+	mutex_enter(curproc->p_lock);
+	ok = (curproc->p_flag & PK_SUGID) != 0;
+	mutex_exit(curproc->p_lock);
+
+	if (!ok) {
+		return EPERM;
+	}
+
+	/* second gate: same authorization as fchroot */
+	result = kauth_authorize_system(curlwp->l_cred, KAUTH_SYSTEM_CHROOT,
+	     KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL);
+	if (result) {
+		return result;
+	}
+
+	mutex_enter(&nsi->nsi_lock);
+	nsi->nsi_usepriv = usepriv;
+	mutex_exit(&nsi->nsi_lock);
+	return 0;
+}
+
+/*
+ * Clone the unprivileged namespace as the privileged namespace.
+ *
+ * Only root may do this, because it's equivalent to a traditional
+ * chroot that affects setugid programs. Although it's ok for root to
+ * do it inside another namespace, as namespaces nest.
+ *
+ * Note that we require the current process has only one thread, to
+ * avoid potentially hazardous races in lookup while destroying the
+ * old privileged namespace.
+ *
+ * XXX: this name is stupid; let's find a better one.
+ *
+ * Returns EINVAL if there is no finished unprivileged namespace to
+ * copy, EPERM if the process is multithreaded or shares its nsinfo,
+ * or the error from the authorization check.
+ */
+int
+ns_empower(void)
+{
+	struct nsinfo *nsi;
+	struct namespace *oldns;
+	int result;
+
+	nsi = curproc->p_nsinfo;
+	if (nsi == NULL) {
+		/* no namespace */
+		return EINVAL;
+	}
+
+	result = kauth_authorize_system(curlwp->l_cred, KAUTH_SYSTEM_CHROOT,
+	     KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL);
+	if (result) {
+		return result;
+	}
+
+	mutex_enter(curproc->p_lock);
+	if (curproc->p_nlwps > 1) {
+		/* multithreaded process */
+		mutex_exit(curproc->p_lock);
+		return EPERM;
+	}
+	mutex_exit(curproc->p_lock);
+
+	mutex_enter(&nsi->nsi_lock);
+
+	if (nsi->nsi_space == NULL) {
+		/* there is no unprivileged namespace to copy */
+		mutex_exit(&nsi->nsi_lock);
+		return EINVAL;
+	}
+
+	if (nsi->nsi_space->ns_finished == false) {
+		/* finish constructing it first! */
+		mutex_exit(&nsi->nsi_lock);
+		return EINVAL;
+	}
+
+	if (nsi->nsi_refcount > 1) {
+		/* different kind of multithreaded process */
+		mutex_exit(&nsi->nsi_lock);
+		return EPERM;
+	}
+
+	oldns = nsi->nsi_privspace;
+	namespace_incref(nsi->nsi_space);
+	nsi->nsi_privspace = nsi->nsi_space;
+
+	mutex_exit(&nsi->nsi_lock);
+
+	/* there is no old privileged namespace the first time around */
+	if (oldns != NULL) {
+		namespace_decref(oldns);
+	}
+	return 0;
+}
diff -r d581ced863ba -r ef920de0474a sys/kern/vfs_syscalls.c
--- a/sys/kern/vfs_syscalls.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/vfs_syscalls.c	Sat Nov 28 10:38:33 2015 -0500
@@ -87,6 +87,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -4800,3 +4801,119 @@ fail:
 	fd_putfile(fd);
 	return error;
 }
+
+/*
+ * Begin creating a private namespace.
+ */
+/* ARGSUSED */
+int
+sys_ns_begin(struct lwp *l, const void *uap, register_t *retval)
+{
+	return ns_begin();
+}
+
+/*
+ * Grab a pathname that can legitimately be null.
+ */
+static int
+copyinpath_nullok(const char *ustr, char **kstr_ret, size_t max)
+{
+	char *kstr;
+	int error;
+
+	KASSERT(max <= PATH_MAX);
+
+	*kstr_ret = NULL;	/* defined result on every path, incl. errors */
+	if (ustr == NULL) {
+		return 0;
+	}
+
+	kstr = PNBUF_GET();
+	error = copyinstr(ustr, kstr, max, NULL);
+	if (error) {
+		PNBUF_PUT(kstr);
+		return error;
+	}
+	*kstr_ret = kstr;
+	return 0;
+}
+
+/*
+ * Add a rule to a private namespace under construction
+ */
+/* ARGSUSED */
+int
+sys_ns_addrule(struct lwp *l, const struct sys_ns_addrule_args *uap,
+	       register_t *retval)
+{
+	/* {
+		syscallarg(const char *) dir;
+		syscallarg(const char *) name;
+		syscallarg(const char *) replacement;
+	} */
+	char *dir = NULL;
+	char *name = NULL;
+	char *replacement = NULL;
+	int error;
+
+	error = copyinpath_nullok(SCARG(uap, dir), &dir, PATH_MAX);
+	if (error) {
+		goto out;
+	}
+	error = copyinpath_nullok(SCARG(uap, name), &name, NAME_MAX+1);
+	if (error) {
+		goto out;
+	}
+	error = copyinpath_nullok(SCARG(uap, replacement), &replacement,
+				  PATH_MAX);
+	if (error) {
+		goto out;
+	}
+
+	error = ns_addrule(dir, name, replacement);
+out:
+	if (dir != NULL)
+		PNBUF_PUT(dir);
+	if (name != NULL)
+		PNBUF_PUT(name);
+	if (replacement != NULL)
+		PNBUF_PUT(replacement);
+	return error;
+}
+
+/*
+ * Complete construction of a private namespace
+ */
+/* ARGSUSED */
+int
+sys_ns_finish(struct lwp *l, const void *uap, register_t *retval)
+{
+	return ns_finish();
+}
+
+/*
+ * Select use of privileged or unprivileged namespace
+ * (privileged operation)
+ */
+/* ARGSUSED */
+int
+sys_ns_setns(struct lwp *l, const struct sys_ns_setns_args *uap,
+	     register_t *retval)
+{
+	/* {
+		syscallarg(int) usepriv;
+	} */
+
+	return ns_setns(SCARG(uap, usepriv) != 0);
+}
+
+/*
+ * Copy unprivileged namespace to privileged namespace
+ * (privileged operation)
+ */
+/* ARGSUSED */
+int
+sys_ns_empower(struct lwp *l, const void *uap, register_t *retval)
+{
+	return ns_empower();
+}
diff -r d581ced863ba -r ef920de0474a sys/kern/vfs_vnops.c
---
a/sys/kern/vfs_vnops.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/kern/vfs_vnops.c	Sat Nov 28 10:38:33 2015 -0500
@@ -90,6 +90,7 @@
 #include
 #include
 #include
+#include <sys/namespace.h>
 
 #include
 #include
@@ -467,21 +468,40 @@ vn_rdwr(enum uio_rw rw, struct vnode *vp
 }
 
 int
-vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
+vn_readdir(file_t *fp, char *callerbuf, int segflg, u_int count, int *done,
     struct lwp *l, off_t **cookies, int *ncookies)
 {
 	struct vnode *vp = fp->f_vnode;
 	struct iovec aiov;
 	struct uio auio;
+	int do_ns;
+	char *nsbuf;
+	char *readdirbuf;
 	int error, eofflag;
 
 	/* Limit the size on any kernel buffers used by VOP_READDIR */
 	count = min(MAXBSIZE, count);
 
+	do_ns = ns_hasentriesfor(vp);
+	if (do_ns && segflg != UIO_SYSSPACE) {
+		/*
+		 * Need to VOP_READDIR into a kernel buffer so we can
+		 * munge it.
+		 */
+		readdirbuf = nsbuf = kmem_alloc(count, KM_SLEEP);
+		segflg = UIO_SYSSPACE;
+	} else {
+		nsbuf = NULL;
+		readdirbuf = callerbuf;
+	}
+
 unionread:
-	if (vp->v_type != VDIR)
-		return (EINVAL);
-	aiov.iov_base = bf;
+	if (vp->v_type != VDIR) {
+		/* Leave via "out" so that nsbuf is not leaked. */
+		error = EINVAL;
+		goto out;
+	}
+	aiov.iov_base = readdirbuf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
@@ -502,14 +522,20 @@ unionread:
 	mutex_exit(&fp->f_lock);
 	VOP_UNLOCK(vp);
 	if (error)
-		return (error);
+		goto out;
+
+	if (do_ns) {
+		error = ns_filterdents(vp, readdirbuf, count - auio.uio_resid);
+		if (error)
+			goto out;
+	}
 
 	if (count == auio.uio_resid && vn_union_readdir_hook) {
 		struct vnode *ovp = vp;
 
 		error = (*vn_union_readdir_hook)(&vp, fp, l);
 		if (error)
-			return (error);
+			goto out;
 		if (vp != ovp)
 			goto unionread;
 	}
@@ -526,7 +552,45 @@ unionread:
 		vrele(tvp);
 		goto unionread;
 	}
+
+	if (count == auio.uio_resid && do_ns) {
+		/*
+		 * XXX: if the names from the namespace rules don't
+		 * fit into the buffer you can't get the rest of them;
+		 * to handle that we'd have to create a fake vnode for
+		 * calling VOP_READDIR on and stuff that in fp->f_vnode.
+		 *
+		 * (Also note that in that case we'd have to arrange
+		 * to know to skip ns_filterdents, or it would all be
+		 * an expensive nop.)
+		 *
+		 * However, we expect namespace rules to be limited so
+		 * it's probably ok. And the right solution to this
+		 * problem is not to create such terrible hacks but to
+		 * restructure the way directory reading works so it
+		 * all happens cleanly.
+		 *
+		 * XXX: does this need to be passed cookies/ncookies?
+		 */
+		error = ns_readdir(vp, &auio);
+		if (error) {
+			goto out;
+		}
+	}
+
+	if (nsbuf != NULL) {
+		KASSERT(do_ns);
+		error = copyout(nsbuf, callerbuf, count - auio.uio_resid);
+		if (error) {
+			goto out;
+		}
+	}
+
 	*done = count - auio.uio_resid;
+out:
+	if (nsbuf != NULL) {
+		kmem_free(nsbuf, count);
+	}
 	return error;
 }
 
diff -r d581ced863ba -r ef920de0474a sys/rump/librump/rumpvfs/Makefile.rumpvfs
--- a/sys/rump/librump/rumpvfs/Makefile.rumpvfs	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/rump/librump/rumpvfs/Makefile.rumpvfs	Sat Nov 28 10:38:33 2015 -0500
@@ -33,6 +33,7 @@ SRCS+=	kern_physio.c
 # sys/kern vfs
 SRCS+=	vfs_bio.c vfs_cache.c vfs_cwd.c vfs_dirhash.c vfs_getcwd.c	\
 	vfs_hooks.c vfs_init.c vfs_lockf.c vfs_lookup.c vfs_mount.c	\
+	vfs_namespace.c							\
 	vfs_subr.c vfs_syscalls.c vfs_trans.c vfs_vnode.c vfs_vnops.c	\
 	vfs_wapbl.c vfs_xattr.c
 
diff -r d581ced863ba -r ef920de0474a sys/rump/librump/rumpvfs/rump_vfs.c
--- a/sys/rump/librump/rumpvfs/rump_vfs.c	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/rump/librump/rumpvfs/rump_vfs.c	Sat Nov 28 10:38:33 2015 -0500
@@ -64,6 +64,7 @@ pvfs_init(struct proc *p)
 {
 
 	p->p_cwdi = cwdinit();
+	p->p_nsinfo = NULL;
 }
 
 static void
@@ -132,6 +133,7 @@ RUMP_COMPONENT(RUMP__FACTION_VFS)
 	/* bootstrap cwdi (rest done in vfs_mountroot() */
 	proc0.p_cwdi = &cwdi0;
 	proc0.p_cwdi = cwdinit();
+	proc0.p_nsinfo = NULL;
 
 	vfs_attach(&rumpfs_vfsops);
 	vfs_mountroot();
diff -r d581ced863ba -r ef920de0474a sys/sys/Makefile
--- a/sys/sys/Makefile	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/sys/Makefile	Sat Nov 28 10:38:33 2015
-0500
@@ -27,7 +27,7 @@ INCS=	acct.h agpio.h aio.h ansi.h aout_m
 	localedef.h lock.h lockf.h lua.h lwp.h lwpctl.h \
 	malloc.h mallocvar.h mbuf.h md4.h md5.h midiio.h \
 	mman.h module.h mount.h mqueue.h msg.h msgbuf.h mtio.h mutex.h \
-	namei.h null.h \
+	namei.h namespace.h null.h \
 	param.h pcu.h pipe.h pmc.h poll.h pool.h power.h proc.h \
 	protosw.h pset.h ptrace.h ptree.h \
 	queue.h quota.h quotactl.h \
diff -r d581ced863ba -r ef920de0474a sys/sys/namespace.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/sys/namespace.h	Sat Nov 28 10:38:33 2015 -0500
@@ -0,0 +1,129 @@
+/*
+ * per-process namespace stuff
+ */
+
+#ifndef _SYS_NAMESPACE_H_
+#define _SYS_NAMESPACE_H_
+
+#ifndef _KERNEL
+#include <stdbool.h>	/* bool, used by the userland ns_setns() prototype */
+#endif
+
+#ifdef _KERNEL
+
+#include <sys/rbtree.h>
+
+struct proc;
+
+/*
+ * nsrulekey: the key half of nsrule (volume id, inode number, name)
+ *
+ * XXX: nsrk_name can't be const because in the allocated rules it's a
+ * pointer to string data that needs to be freed. But in places this
+ * requires casting away const, e.g. for the name one is looking up.
+ * Growl.
+ */
+struct nsrulekey {
+	fsid_t nsrk_vol;
+	ino_t nsrk_ino;
+	char *nsrk_name;
+	size_t nsrk_namelen;
+};
+
+/*
+ * nsrule: one per namespace rule
+ */
+struct nsrule {
+	struct nsrulekey nsr_key;
+
+	/* value: vnode to return */
+	struct vnode *nsr_vn;	/* may be null, which means EACCES */
+	ino_t nsr_ino;		/* inode number of nsr_vn */
+	unsigned nsr_type;	/* DT_* type of nsr_ino */
+
+	/* rbtree hook */
+	rb_node_t nsr_rbnode;
+};
+
+/*
+ * namespace: one globally per declared namespace
+ *
+ * The "fixed" elements are immutable when refcount > 1 so do not need
+ * locking.
+ *
+ * The rule table is a table of struct nsrule. The root vnode is a
+ * special case rule. Here NULL means there is no root vnode rule,
+ * rather than "generate EACCES", as the latter isn't sensible.
+ *
+ * XXX: what about emulation roots...?
+ */
+struct namespace {
+	unsigned ns_refcount;		/* reference count (atomic) */
+	struct namespace *ns_parent;	/* parent namespace (fixed) */
+	rb_tree_t ns_rules;		/* rule table (fixed) */
+	unsigned ns_numrules;		/* # items in ns_rules */
+	struct vnode *ns_rootvn;	/* root vnode (fixed) */
+	bool ns_finished;		/* true -> immutable */
+};
+
+/*
+ * namespace info: one per process that is using namespaces
+ *
+ * Note: nsi_lock comes before proc->p_lock and also before all the
+ * assorted locks used during namei.
+ *
+ * The refcount only comes into play when clone() results in multiple
+ * threads appearing as multiple procs. Otherwise the structure is
+ * copied at fork time.
+ */
+struct nsinfo {
+	unsigned nsi_refcount;		/* reference count (atomic) */
+	kmutex_t nsi_lock;		/* lock for rest of this structure */
+	struct namespace *nsi_space;	/* the main/unprivileged namespace */
+	struct namespace *nsi_privspace; /* the protected priv'd namespace */
+	unsigned nsi_nspaces;		/* number of namespaces created */
+	bool nsi_usepriv;		/* true -> use nsi_privspace */
+};
+
+/* Creating more than this many namespaces per process fails. */
+#define MAX_NS_PER_PROC		8
+/* Limit on the size of the rule table in each namespace */
+#define MAX_NSRULES_PER_NS	256
+
+/* fork-time interface */
+struct nsinfo *nsinfo_clone(struct nsinfo *prev);
+struct nsinfo *nsinfo_share(struct nsinfo *prev);
+
+/* exit-time interface */
+void nsinfo_decref(struct nsinfo *nsi);
+
+/* exec-time interface */
+void ns_exec(struct proc *p);
+
+/* lookup-time interface */
+int ns_getroot(struct vnode **ret);
+int ns_lookup(struct vnode *dvp, const char *name, size_t namelen,
+	      struct vnode **ret);
+
+/* readdir-time interface */
+bool ns_hasentriesfor(struct vnode *dvp);
+int ns_filterdents(struct vnode *dvp, char *buf, size_t len);
+int ns_readdir(struct vnode *dvp, struct uio *uio);
+
+#endif /* _KERNEL */
+
+/*
+ * These functions have the same signature in and out of the kernel.
+ */
+
+/* control interface */
+int ns_begin(void);
+int ns_addrule(const char *dir, const char *name, const char *replacement);
+int ns_finish(void);
+
+/* privileged control interface */
+int ns_setns(bool usepriv);
+int ns_empower(void);
+
+
+#endif /* _SYS_NAMESPACE_H_ */
diff -r d581ced863ba -r ef920de0474a sys/sys/proc.h
--- a/sys/sys/proc.h	Sat Nov 28 10:13:45 2015 -0500
+++ b/sys/sys/proc.h	Sat Nov 28 10:38:33 2015 -0500
@@ -225,6 +225,7 @@ struct proc {
 	/* Substructures: */
 	struct kauth_cred *p_cred;	/* p: Master copy of credentials */
 	struct filedesc	*p_fd;		/* :: Ptr to open files structure */
+	struct nsinfo	*p_nsinfo;	/* :: Per-process namespace info */
 	struct cwdinfo	*p_cwdi;	/* :: cdir/rdir/cmask info */
 	struct pstats	*p_stats;	/* :: Accounting/stats (PROC ONLY) */
 	struct plimit	*p_limit;	/* :: Process limits */