diff --git a/sys/arch/amd64/conf/ALL b/sys/arch/amd64/conf/ALL index 1bc3036..c79b457 100644 --- a/sys/arch/amd64/conf/ALL +++ b/sys/arch/amd64/conf/ALL @@ -125,6 +125,7 @@ options SYSCALL_STATS # per syscall counts options SYSCALL_TIMES # per syscall times options SYSCALL_TIMES_HASCOUNTER # use 'broken' rdtsc (soekris) options KDTRACE_HOOKS # kernel DTrace hooks +options PSREF_DEBUG # debug passive references # Compatibility options #options COMPAT_NOMID # NetBSD 0.8, 386BSD, and BSDI diff --git a/sys/arch/i386/conf/ALL b/sys/arch/i386/conf/ALL index 3524f8e..0faca2a 100644 --- a/sys/arch/i386/conf/ALL +++ b/sys/arch/i386/conf/ALL @@ -125,6 +125,7 @@ options SYSCALL_STATS # per syscall counts options SYSCALL_TIMES # per syscall times options SYSCALL_TIMES_HASCOUNTER # use 'broken' rdtsc (soekris) options KDTRACE_HOOKS # kernel DTrace hooks +options PSREF_DEBUG # debug passive references # Compatibility options options COMPAT_NOMID # NetBSD 0.8, 386BSD, and BSDI diff --git a/sys/conf/files b/sys/conf/files index ca3092f..d7c495c 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -284,6 +284,7 @@ defparam opt_kgdb.h KGDB_DEV KGDB_DEVNAME KGDB_DEVPORT defflag LOCKDEBUG defflag SYSCALL_DEBUG defflag opt_kstack.h KSTACK_CHECK_MAGIC +defflag opt_psref.h PSREF_DEBUG # memory (ram) disk options # diff --git a/sys/kern/files.kern b/sys/kern/files.kern index 4ff6a76..b21f43a 100644 --- a/sys/kern/files.kern +++ b/sys/kern/files.kern @@ -125,6 +125,7 @@ file kern/subr_pool.c kern file kern/subr_prf.c kern file kern/subr_prof.c kern file kern/subr_pserialize.c kern +file kern/subr_psref.c kern file kern/subr_specificdata.c kern file kern/subr_tftproot.c tftproot file kern/subr_time.c kern diff --git a/sys/kern/kern_softint.c b/sys/kern/kern_softint.c index 782540f..75f67d6 100644 --- a/sys/kern/kern_softint.c +++ b/sys/kern/kern_softint.c @@ -442,8 +442,8 @@ softint_disestablish(void *arg) KASSERT(sh->sh_func != NULL); flags |= sh->sh_flags; } - /* Neither pending nor active on all CPUs? */ - if ((flags & (SOFTINT_PENDING | SOFTINT_ACTIVE)) == 0) { + /* Inactive on all CPUs? */ + if ((flags & SOFTINT_ACTIVE) == 0) { break; } /* Oops, still active. Wait for it to clear. */ diff --git a/sys/kern/subr_psref.c b/sys/kern/subr_psref.c new file mode 100644 index 0000000..51c14eb --- /dev/null +++ b/sys/kern/subr_psref.c @@ -0,0 +1,513 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2016 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Taylor R. Campbell. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Passive references + * + * Passive references are references to objects that guarantee the + * object will not be destroyed until the reference is released. + * + * Passive references require no interprocessor synchronization to + * acquire or release. However, destroying the target of passive + * references requires expensive interprocessor synchronization -- + * xcalls to determine on which CPUs the object is still in use. + * + * Passive references may be held only on a single CPU and by a + * single LWP. They require the caller to allocate a little stack + * space, a struct psref object. Sleeping while a passive + * reference is held is allowed, provided that the owner's LWP is + * bound to a CPU -- e.g., the owner is a softint or a bound + * kthread. However, sleeping should be kept to a short duration, + * e.g. sleeping on an adaptive lock. + * + * Passive references serve as an intermediate stage between + * reference counting and passive serialization (pserialize(9)): + * + * - If you need references to transfer from CPU to CPU or LWP to + * LWP, or if you need long-term references, you must use + * reference counting, e.g. with atomic operations or locks, + * which incurs interprocessor synchronization for every use -- + * cheaper than an xcall, but not scalable. + * + * - If all users *guarantee* that they will not sleep, then it is + * not necessary to use passive references: you may as well just + * use the even cheaper pserialize(9), because you have + * satisfied the requirements of a pserialize read section. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef PSREF_DEBUG +#define __psref_debugused +#else +#define __psref_debugused __unused +#endif + +LIST_HEAD(psref_head, psref); + +/* + * struct psref_class + * + * Private global state for a class of passive reference targets. + * Opaque to callers. + */ +struct psref_class { + kmutex_t prc_lock; + kcondvar_t prc_cv; + struct percpu *prc_percpu; /* struct psref_cpu */ + ipl_cookie_t prc_iplcookie; +}; + +/* + * struct psref_cpu + * + * Private per-CPU state for a class of passive reference targets. + * Not exposed by the API. + */ +struct psref_cpu { + struct psref_head pcpu_head; +}; + +/* + * psref_class_create(name, ipl) + * + * Create a new passive reference class, with the given wchan name + * and ipl. 
+ */ +struct psref_class * +psref_class_create(const char *name, int ipl) +{ + struct psref_class *class; + + ASSERT_SLEEPABLE(); + + class = kmem_alloc(sizeof(*class), KM_SLEEP); + if (class == NULL) + goto fail0; + + class->prc_percpu = percpu_alloc(sizeof(struct psref_cpu)); + if (class->prc_percpu == NULL) + goto fail1; + + mutex_init(&class->prc_lock, MUTEX_DEFAULT, ipl); + cv_init(&class->prc_cv, name); + class->prc_iplcookie = makeiplcookie(ipl); + + return class; + +fail1: kmem_free(class, sizeof(*class)); +fail0: return NULL; +} + +#ifdef DIAGNOSTIC +static void +psref_cpu_drained_p(void *p, void *cookie, struct cpu_info *ci __unused) +{ + struct psref_cpu *pcpu = p; + bool *retp = cookie; + + if (!LIST_EMPTY(&pcpu->pcpu_head)) + *retp = false; +} + +static bool +psref_class_drained_p(struct psref_class *prc) +{ + bool ret = true; + + percpu_foreach(prc->prc_percpu, &psref_cpu_drained_p, &ret); + + return ret; +} +#endif /* DIAGNOSTIC */ + +/* + * psref_class_destroy(class) + * + * Destroy a passive reference class and free memory associated + * with it. All targets in this class must have been drained and + * destroyed already. + */ +void +psref_class_destroy(struct psref_class *class) +{ + + KASSERT(psref_class_drained_p(class)); + + cv_destroy(&class->prc_cv); + mutex_destroy(&class->prc_lock); + percpu_free(class->prc_percpu, sizeof(struct psref_cpu)); + kmem_free(class, sizeof(*class)); +} + +/* + * psref_target_init(target, class) + * + * Initialize a passive reference target in the specified class. + * The caller is responsible for issuing a membar_producer after + * psref_target_init and before exposing a pointer to the target + * to other CPUs. + */ +void +psref_target_init(struct psref_target *target, + struct psref_class *class __psref_debugused) +{ + +#ifdef PSREF_DEBUG + target->prt_class = class; +#endif + target->prt_draining = false; +} + +/* + * psref_acquire(psref, target, class) + * + * Acquire a passive reference to the specified target, which must + * be in the specified class. + * + * The caller must guarantee that the target will not be destroyed + * before psref_acquire returns. + * + * The caller must additionally guarantee that it will not switch + * CPUs before releasing the passive reference, either by + * disabling kpreemption and avoiding sleeps, or by being in a + * softint or in an LWP bound to a CPU. + */ +void +psref_acquire(struct psref *psref, struct psref_target *target, + struct psref_class *class) +{ + struct psref_cpu *pcpu; + int s; + + KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() || + ISSET(curlwp->l_pflag, LP_BOUND)), + "passive references are CPU-local," + " but preemption is enabled and the caller is not" + " in a softint or CPU-bound LWP"); + +#ifdef PSREF_DEBUG + KASSERTMSG((target->prt_class == class), + "mismatched psref target class: %p (ref) != %p (expected)", + target->prt_class, class); +#endif + KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p", + target); + + /* Block interrupts and acquire the current CPU's reference list. */ + s = splraiseipl(class->prc_iplcookie); + pcpu = percpu_getref(class->prc_percpu); + + /* Record our reference. */ + LIST_INSERT_HEAD(&pcpu->pcpu_head, psref, psref_entry); + psref->psref_target = target; +#ifdef PSREF_DEBUG + psref->psref_lwp = curlwp; + psref->psref_cpu = curcpu(); +#endif + + /* Release the CPU list and restore interrupts. 
*/ + percpu_putref(class->prc_percpu); + splx(s); +} + +/* + * psref_release(psref, target, class) + * + * Release a passive reference to the specified target, which must + * be in the specified class. + * + * The caller must not have switched CPUs or LWPs since acquiring + * the passive reference. + */ +void +psref_release(struct psref *psref, struct psref_target *target, + struct psref_class *class) +{ + int s; + + KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() || + ISSET(curlwp->l_pflag, LP_BOUND)), + "passive references are CPU-local," + " but preemption is enabled and the caller is not" + " in a softint or CPU-bound LWP"); + +#ifdef PSREF_DEBUG + KASSERTMSG((target->prt_class == class), + "mismatched psref target class: %p (ref) != %p (expected)", + target->prt_class, class); +#endif + + /* Make sure the psref looks sensible. */ + KASSERTMSG((psref->psref_target == target), + "passive reference target mismatch: %p (ref) != %p (expected)", + psref->psref_target, target); +#ifdef PSREF_DEBUG + KASSERTMSG((psref->psref_lwp == curlwp), + "passive reference transferred from lwp %p to lwp %p", + psref->psref_lwp, curlwp); + KASSERTMSG((psref->psref_cpu == curcpu()), + "passive reference transferred from CPU %u to CPU %u", + cpu_index(psref->psref_cpu), cpu_index(curcpu())); +#endif + + /* + * Block interrupts and remove the psref from the current CPU's + * list. No need to percpu_getref or get the head of the list, + * and the caller guarantees that we are bound to a CPU anyway + * (as does blocking interrupts). + */ + s = splraiseipl(class->prc_iplcookie); + LIST_REMOVE(psref, psref_entry); + splx(s); + + /* If someone is waiting for users to drain, notify 'em. */ + if (__predict_false(target->prt_draining)) + cv_broadcast(&class->prc_cv); +} + +/* + * psref_copy(pto, pfrom class) + * + * Copy a passive reference from pfrom, which must be in the + * specified class, to pto. Both pfrom and pto must later be + * released with psref_release. + * + * The caller must not have switched CPUs or LWPs since acquiring + * pfrom, and must not switch CPUs or LWPs before releasing both + * pfrom and pto. + */ +void +psref_copy(struct psref *pto, const struct psref *pfrom, + struct psref_class *class) +{ + struct psref_cpu *pcpu; + int s; + + KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() || + ISSET(curlwp->l_pflag, LP_BOUND)), + "passive references are CPU-local," + " but preemption is enabled and the caller is not" + " in a softint or CPU-bound LWP"); + KASSERTMSG((pto != pfrom), + "can't copy passive reference to itself: %p", + pto); + +#ifdef PSREF_DEBUG + /* Make sure the pfrom reference looks sensible. */ + KASSERTMSG((pfrom->psref_lwp == curlwp), + "passive reference transferred from lwp %p to lwp %p", + pfrom->psref_lwp, curlwp); + KASSERTMSG((pfrom->psref_cpu == curcpu()), + "passive reference transferred from CPU %u to CPU %u", + cpu_index(pfrom->psref_cpu), cpu_index(curcpu())); + KASSERTMSG((pfrom->psref_target->prt_class == class), + "mismatched psref target class: %p (ref) != %p (expected)", + pfrom->psref_target->prt_class, class); +#endif + + /* Block interrupts and acquire the current CPU's reference list. */ + s = splraiseipl(class->prc_iplcookie); + pcpu = percpu_getref(class->prc_percpu); + + /* Record the new reference. */ + LIST_INSERT_HEAD(&pcpu->pcpu_head, pto, psref_entry); + pto->psref_target = pfrom->psref_target; +#ifdef PSREF_DEBUG + pto->psref_lwp = curlwp; + pto->psref_cpu = curcpu(); +#endif + + /* Release the CPU list and restore interrupts. 
*/ + percpu_putref(class->prc_percpu); + splx(s); +} + +/* + * struct psreffed + * + * Global state for draining a psref target. + */ +struct psreffed { + struct psref_class *class; + struct psref_target *target; + bool ret; +}; + +static void +psreffed_p_xc(void *cookie0, void *cookie1 __unused) +{ + struct psreffed *P = cookie0; + + /* + * If we hold a psref to the target, then answer true. + * + * This is the only dynamic decision that may be made with + * psref_held. + * + * No need to lock anything here: every write transitions from + * false to true, so there can be no conflicting writes. No + * need for a memory barrier here because P->ret is read only + * after xc_wait, which has already issued any necessary memory + * barriers. + */ + if (psref_held(P->target, P->class)) + P->ret = true; +} + +static bool +psreffed_p(struct psref_target *target, struct psref_class *class) +{ + struct psreffed P = { + .class = class, + .target = target, + .ret = false, + }; + + /* Ask all CPUs to say whether they hold a psref to the target. */ + xc_wait(xc_broadcast(0, &psreffed_p_xc, &P, NULL)); + + return P.ret; +} + +/* + * psref_target_destroy(target, class) + * + * Destroy a passive reference target. Waits for all existing + * references to drain. Caller must guarantee no new references + * will be acquired once it calls psref_target_destroy, e.g. by + * removing the target from a global list first. May sleep. + */ +void +psref_target_destroy(struct psref_target *target, struct psref_class *class) +{ + + ASSERT_SLEEPABLE(); + +#ifdef PSREF_DEBUG + KASSERTMSG((target->prt_class == class), + "mismatched psref target class: %p (ref) != %p (expected)", + target->prt_class, class); +#endif + + /* Request psref_release to notify us when done. */ + KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p", + target); + target->prt_draining = true; + + /* Wait until there are no more references on any CPU. */ + while (psreffed_p(target, class)) { + /* + * This enter/wait/exit business looks wrong, but it is + * both necessary, because psreffed_p performs a + * low-priority xcall and hence cannot run while a + * mutex is locked, and OK, because the wait is timed + * -- explicit wakeups are only an optimization. + */ + mutex_enter(&class->prc_lock); + (void)cv_timedwait(&class->prc_cv, &class->prc_lock, 1); + mutex_exit(&class->prc_lock); + } + +#ifdef PSREF_DEBUG + /* No more references. Cause subsequent psref_acquire to kassert. */ + target->prt_class = NULL; +#endif +} + +/* + * psref_held(target, class) + * + * True if the current CPU holds a passive reference to target, + * false otherwise. May be used only inside assertions. + */ +bool +psref_held(struct psref_target *target, struct psref_class *class) +{ + struct psref_cpu *pcpu; + struct psref *psref; + int s; + bool held = false; + + KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() || + ISSET(curlwp->l_pflag, LP_BOUND)), + "passive references are CPU-local," + " but preemption is enabled and the caller is not" + " in a softint or CPU-bound LWP"); + +#ifdef PSREF_DEBUG + KASSERTMSG((target->prt_class == class), + "mismatched psref target class: %p (ref) != %p (expected)", + target->prt_class, class); +#endif + + /* Block interrupts and acquire the current CPU's reference list. */ + s = splraiseipl(class->prc_iplcookie); + pcpu = percpu_getref(class->prc_percpu); + + /* Search through all the references on this CPU. */ + LIST_FOREACH(psref, &pcpu->pcpu_head, psref_entry) { +#ifdef PSREF_DEBUG + /* Sanity-check the reference. 
*/ + KASSERTMSG((psref->psref_lwp == curlwp), + "passive reference transferred from lwp %p to lwp %p", + psref->psref_lwp, curlwp); + KASSERTMSG((psref->psref_cpu == curcpu()), + "passive reference transferred from CPU %u to CPU %u", + cpu_index(psref->psref_cpu), cpu_index(curcpu())); +#endif + + /* If it matches, stop here and answer yes. */ + if (psref->psref_target == target) { + held = true; + break; + } + } + + /* Release the CPU list and restore interrupts. */ + percpu_putref(class->prc_percpu); + splx(s); + + return held; +} diff --git a/sys/net/if.c b/sys/net/if.c index 2633c72..32c19fc 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -94,6 +94,7 @@ __KERNEL_RCSID(0, "$NetBSD: if.c,v 1.325 2016/02/19 20:05:43 roy Exp $"); #if defined(_KERNEL_OPT) #include "opt_inet.h" +#include "opt_ipsec.h" #include "opt_atalk.h" #include "opt_natm.h" @@ -138,6 +139,9 @@ __KERNEL_RCSID(0, "$NetBSD: if.c,v 1.325 2016/02/19 20:05:43 roy Exp $"); #include #include #include +#ifndef IPSEC +#include +#endif #ifdef INET6 #include @@ -251,6 +255,10 @@ ifinit(void) sysctl_net_pktq_setup(NULL, PF_INET6); #endif +#ifndef IPSEC + encapinit(); +#endif + if_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, if_listener_cb, NULL); diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c index 83afafe..1ce2332 100644 --- a/sys/net/if_gif.c +++ b/sys/net/if_gif.c @@ -53,6 +53,7 @@ __KERNEL_RCSID(0, "$NetBSD: if_gif.c,v 1.106 2016/02/26 07:35:17 knakahara Exp $ #include #include #include +#include #include #include @@ -100,6 +101,7 @@ static int gif_check_nesting(struct ifnet *, struct mbuf *); static int gif_encap_attach(struct gif_softc *); static int gif_encap_detach(struct gif_softc *); +static void gif_encap_pause(struct gif_softc *); static struct if_clone gif_cloner = IF_CLONE_INITIALIZER("gif", gif_clone_create, gif_clone_destroy); @@ -217,7 +219,8 @@ gif_encapcheck(struct mbuf *m, int off, int proto, void *arg) if (sc == NULL) return 0; - if ((sc->gif_if.if_flags & IFF_UP) == 0) + if ((sc->gif_if.if_flags & (IFF_UP|IFF_RUNNING)) + != (IFF_UP|IFF_RUNNING)) return 0; /* no physical address */ @@ -321,9 +324,8 @@ gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, } m->m_flags &= ~(M_BCAST|M_MCAST); - if (!(ifp->if_flags & IFF_UP) || - sc->gif_psrc == NULL || sc->gif_pdst == NULL || - sc->gif_si == NULL) { + if (!(ifp->if_flags & IFF_UP) || /* check IFF_RUNNING later */ + sc->gif_psrc == NULL || sc->gif_pdst == NULL) { m_freem(m); error = ENETDOWN; goto end; @@ -344,6 +346,17 @@ gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, m->m_pkthdr.csum_data = 0; s = splnet(); + /* + * This if_flags check, IFQ_ENQUEUE and softint_schedule() are required + * to be done atomically in the local CPU, because this local CPU must + * let gif_encap_pause() wait until softint_schedule() completion. + */ + if (!(ifp->if_flags & IFF_RUNNING)) { + splx(s); + m_freem(m); + error = ENETDOWN; + goto end; + } IFQ_ENQUEUE(&ifp->if_snd, m, &pktattr, error); if (error) { splx(s); @@ -376,15 +389,6 @@ gifintr(void *arg) sc = arg; ifp = &sc->gif_if; - /* - * other CPUs does {set,delete}_tunnel after curcpu have done - * softint_schedule(). 
- */ - if (sc->gif_pdst == NULL || sc->gif_psrc == NULL) { - IFQ_PURGE(&ifp->if_snd); - return; - } - /* output processing */ while (1) { s = splnet(); @@ -776,6 +780,46 @@ gif_encap_detach(struct gif_softc *sc) return error; } +static void +gif_encap_pause(struct gif_softc *sc) +{ + struct ifnet *ifp; + uint64_t where; + + if (sc == NULL || sc->gif_psrc == NULL) + return; + + ifp = &sc->gif_if; + if ((ifp->if_flags & IFF_RUNNING) == 0) + return; + + switch (sc->gif_psrc->sa_family) { +#ifdef INET + case AF_INET: + (void)in_gif_pause(sc); + break; +#endif +#ifdef INET6 + case AF_INET6: + (void)in6_gif_pause(sc); + break; +#endif + } + + ifp->if_flags &= ~IFF_RUNNING; + /* membar_sync() is done in xc_broadcast(). */ + + /* + * Wait for softint_schedule() completion done by other CPUs which + * already run over if_flags check in gif_output(). + * In addition, wait for softint_execute()(ipintr() or ip6intr()) + * completion done by other CPUs which already run over if_flags + * check in in_gif_input() or in6_gif_input(). + */ + where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL); + xc_wait(where); +} + int gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) { @@ -783,11 +827,13 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) struct gif_softc *sc2; struct sockaddr *osrc, *odst; struct sockaddr *nsrc, *ndst; - void *osi; int s; int error; s = splsoftnet(); + error = encap_lock_enter(); + if (error) + return error; LIST_FOREACH(sc2, &gif_softc_list, gif_list) { if (sc2 == sc) @@ -798,6 +844,7 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) if (sockaddr_cmp(sc2->gif_pdst, dst) == 0 && sockaddr_cmp(sc2->gif_psrc, src) == 0) { /* continue to use the old configureation. */ + encap_lock_exit(); splx(s); return EADDRNOTAVAIL; } @@ -806,42 +853,29 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) } if ((nsrc = sockaddr_dup(src, M_WAITOK)) == NULL) { + encap_lock_exit(); splx(s); return ENOMEM; } if ((ndst = sockaddr_dup(dst, M_WAITOK)) == NULL) { sockaddr_free(nsrc); + encap_lock_exit(); splx(s); return ENOMEM; } + gif_encap_pause(sc); + /* + * At this point, gif_output() does not softint_schedule() any more. + * Furthermore, all of gif_output() has completed. It promises not to + * call softint_schedule() anymore, so we can call + * softint_disestablish() now. + */ + /* Firstly, clear old configurations. */ if (sc->gif_si) { - osrc = sc->gif_psrc; - odst = sc->gif_pdst; - osi = sc->gif_si; - sc->gif_psrc = NULL; - sc->gif_pdst = NULL; + softint_disestablish(sc->gif_si); sc->gif_si = NULL; - /* - * At this point, gif_output() does not softint_schedule() - * any more. However, there are below 2 fears of other CPUs - * which would cause panic because of the race between - * softint_execute() and softint_disestablish(). 
- * (a) gif_output() has done softint_schedule(), and softint - * (gifintr()) is waiting for execution - * => This pattern is avoided by waiting SOFTINT_PENDING - * CPUs in softint_disestablish() - * (b) gifintr() is already running - * => This pattern is avoided by waiting SOFTINT_ACTIVE - * CPUs in softint_disestablish() - */ - - softint_disestablish(osi); - sc->gif_psrc = osrc; - sc->gif_pdst = odst; - osrc = NULL; - odst = NULL; } /* XXX we can detach from both, but be polite just in case */ if (sc->gif_psrc) @@ -900,6 +934,7 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) else ifp->if_flags &= ~IFF_RUNNING; + encap_lock_exit(); splx(s); return error; } @@ -908,24 +943,18 @@ void gif_delete_tunnel(struct ifnet *ifp) { struct gif_softc *sc = ifp->if_softc; - struct sockaddr *osrc, *odst; - void *osi; int s; + int error; s = splsoftnet(); + error = encap_lock_enter(); + if (error) + return; + gif_encap_pause(sc); if (sc->gif_si) { - osrc = sc->gif_psrc; - odst = sc->gif_pdst; - osi = sc->gif_si; - - sc->gif_psrc = NULL; - sc->gif_pdst = NULL; + softint_disestablish(sc->gif_si); sc->gif_si = NULL; - - softint_disestablish(osi); - sc->gif_psrc = osrc; - sc->gif_pdst = odst; } if (sc->gif_psrc) { sockaddr_free(sc->gif_psrc); @@ -947,5 +976,7 @@ gif_delete_tunnel(struct ifnet *ifp) ifp->if_flags |= IFF_RUNNING; else ifp->if_flags &= ~IFF_RUNNING; + + encap_lock_exit(); splx(s); } diff --git a/sys/net/if_stf.c b/sys/net/if_stf.c index 3ac7dd1..0139e76 100644 --- a/sys/net/if_stf.c +++ b/sys/net/if_stf.c @@ -190,18 +190,27 @@ static int stf_clone_create(struct if_clone *ifc, int unit) { struct stf_softc *sc; + int error; + + sc = malloc(sizeof(struct stf_softc), M_DEVBUF, M_WAIT|M_ZERO); + if_initname(&sc->sc_if, ifc->ifc_name, unit); + + error = encap_lock_enter(); + if (error) { + free(sc, M_DEVBUF); + return error; + } if (LIST_FIRST(&stf_softc_list) != NULL) { /* Only one stf interface is allowed. 
*/ + encap_lock_exit(); + free(sc, M_DEVBUF); return (EEXIST); } - sc = malloc(sizeof(struct stf_softc), M_DEVBUF, M_WAIT|M_ZERO); - - if_initname(&sc->sc_if, ifc->ifc_name, unit); - sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6, stf_encapcheck, &in_stf_encapsw, sc); + encap_lock_exit(); if (sc->encap_cookie == NULL) { printf("%s: unable to attach encap\n", if_name(&sc->sc_if)); free(sc, M_DEVBUF); @@ -226,8 +235,10 @@ stf_clone_destroy(struct ifnet *ifp) { struct stf_softc *sc = (void *) ifp; + encap_lock_enter(); LIST_REMOVE(sc, sc_list); encap_detach(sc->encap_cookie); + encap_lock_exit(); bpf_detach(ifp); if_detach(ifp); rtcache_free(&sc->sc_ro); diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c index 35f6116..526979f 100644 --- a/sys/netinet/in_gif.c +++ b/sys/netinet/in_gif.c @@ -204,7 +204,8 @@ in_gif_input(struct mbuf *m, int off, int proto) gifp = (struct ifnet *)encap_getarg(m); - if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { + if (gifp == NULL || (gifp->if_flags & (IFF_UP|IFF_RUNNING)) + != (IFF_UP|IFF_RUNNING)) { m_freem(m); ip_statinc(IP_STAT_NOGIF); return; @@ -384,11 +385,21 @@ in_gif_detach(struct gif_softc *sc) { int error; + error = in_gif_pause(sc); + + rtcache_free(&sc->gif_ro); + + return error; +} + +int +in_gif_pause(struct gif_softc *sc) +{ + int error; + error = encap_detach(sc->encap_cookie4); if (error == 0) sc->encap_cookie4 = NULL; - rtcache_free(&sc->gif_ro); - return error; } diff --git a/sys/netinet/in_gif.h b/sys/netinet/in_gif.h index 1107ee8..654b71c 100644 --- a/sys/netinet/in_gif.h +++ b/sys/netinet/in_gif.h @@ -45,5 +45,6 @@ int gif_encapcheck4(struct mbuf *, int, int, void *); #endif int in_gif_attach(struct gif_softc *); int in_gif_detach(struct gif_softc *); +int in_gif_pause(struct gif_softc *); #endif /* !_NETINET_IN_GIF_H_ */ diff --git a/sys/netinet/ip_encap.c b/sys/netinet/ip_encap.c index 128ee75..baa81a1 100644 --- a/sys/netinet/ip_encap.c +++ b/sys/netinet/ip_encap.c @@ -58,13 +58,18 @@ /* XXX is M_NETADDR correct? */ /* - * The code will use radix table for tunnel lookup, for + * With USE_RADIX the code will use radix table for tunnel lookup, for * tunnels registered with encap_attach() with a addr/mask pair. * Faster on machines with thousands of tunnel registerations (= interfaces). * * The code assumes that radix table code can handle non-continuous netmask, * as it will pass radix table memory region with (src + dst) sockaddr pair. */ +/* XXX future work + * eliminate linear search of encap interfaces. It must fix the many encap + * interface scaling issue without reducing computation by radix tree. 
+ */ +#undef USE_RADIX #include __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.52 2016/02/26 07:35:17 knakahara Exp $"); @@ -82,6 +87,9 @@ __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.52 2016/02/26 07:35:17 knakahara Exp #include #include #include +#include +#include +#include #include #include @@ -109,21 +117,65 @@ __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.52 2016/02/26 07:35:17 knakahara Exp enum direction { INBOUND, OUTBOUND }; #ifdef INET -static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction); +static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction, + struct psref *); #endif #ifdef INET6 -static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction); +static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction, + struct psref *); #endif static int encap_add(struct encaptab *); static int encap_remove(struct encaptab *); static int encap_afcheck(int, const struct sockaddr *, const struct sockaddr *); +#ifdef USE_RADIX static struct radix_node_head *encap_rnh(int); static int mask_matchlen(const struct sockaddr *); +#else +static int mask_match(const struct encaptab *, const struct sockaddr *, + const struct sockaddr *); +#endif static void encap_fillarg(struct mbuf *, const struct encaptab *); -LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(&encaptab); - +/* + * In encap[46]_lookup(), ep->func can sleep(e.g. rtalloc1) while walking + * encap_table. So, it cannot use pserialize_read_enter() + */ +static struct { + LIST_HEAD(, encaptab) list; + pserialize_t psz; + struct psref_class *elem_class; /* for the element of et_list */ +} encaptab __cacheline_aligned = { + .list = LIST_HEAD_INITIALIZER(&encap_table), +}; +#define encap_table encaptab.list + +static struct { + kmutex_t lock; + kcondvar_t cv; + struct lwp *busy; +} encap_whole __cacheline_aligned; + +#ifdef USE_RADIX struct radix_node_head *encap_head[2]; /* 0 for AF_INET, 1 for AF_INET6 */ +static bool encap_head_updating = false; +#endif + +/* + * must be done before other encap interfaces initialization. + */ +void +encapinit(void) +{ + + encaptab.psz = pserialize_create(); + encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET); + if (encaptab.elem_class == NULL) + panic("encaptab.elem_class cannot be allocated.\n"); + + mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE); + cv_init(&encap_whole.cv, "ip_encap cv"); + encap_whole.busy = NULL; +} void encap_init(void) @@ -141,9 +193,10 @@ encap_init(void) * initialization - using LIST_INIT() here can nuke encap_attach() * from drivers. */ - LIST_INIT(&encaptab); + LIST_INIT(&encap_table); #endif +#ifdef USE_RADIX /* * initialize radix lookup table when the radix subsystem is inited. */ @@ -153,18 +206,23 @@ encap_init(void) rn_delayedinit((void *)&encap_head[1], sizeof(struct sockaddr_pack) << 3); #endif +#endif } #ifdef INET static struct encaptab * -encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) +encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir, + struct psref *match_psref) { struct ip *ip; struct ip_pack4 pack; struct encaptab *ep, *match; int prio, matchprio; + int s; +#ifdef USE_RADIX struct radix_node_head *rnh = encap_rnh(AF_INET); struct radix_node *rn; +#endif KASSERT(m->m_len >= sizeof(*ip)); @@ -185,22 +243,54 @@ encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) match = NULL; matchprio = 0; + s = pserialize_read_enter(); +#ifdef USE_RADIX + /* Check whether there's and update in progress. 
*/ + if (encap_head_updating) { + /* + * Update in progress. Pretend there are no tunnels + */ + pserialize_read_exit(s); + return NULL; + } rn = rnh->rnh_matchaddr((void *)&pack, rnh); if (rn && (rn->rn_flags & RNF_ROOT) == 0) { - match = (struct encaptab *)rn; + struct encaptab *encapp = (struct encaptab *)rn; + + psref_acquire(match_psref, &encapp->psref, + encaptab.elem_class); + match = encapp; matchprio = mask_matchlen(match->srcmask) + - mask_matchlen(match->dstmask); + mask_matchlen(match->dstmask); } +#endif + LIST_FOREACH(ep, &encap_table, chain) { + struct psref elem_psref; + + membar_datadep_consumer(); - LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET) continue; if (ep->proto >= 0 && ep->proto != proto) continue; - if (ep->func) + + psref_acquire(&elem_psref, &ep->psref, + encaptab.elem_class); + if (ep->func) { + pserialize_read_exit(s); + /* XXXX ep->func is sleepable. */ prio = (*ep->func)(m, off, proto, ep->arg); - else + s = pserialize_read_enter(); + } else { +#ifdef USE_RADIX + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); continue; +#else + prio = mask_match(ep, (struct sockaddr *)&pack.mine, + (struct sockaddr *)&pack.yours); +#endif + } /* * We prioritize the matches by using bit length of the @@ -223,13 +313,30 @@ encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) * For radix-based lookup, I guess source takes precedence. * See rn_{refines,lexobetter} for the correct answer. */ - if (prio <= 0) + if (prio <= 0) { + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); continue; + } if (prio > matchprio) { + /* release last matched ep */ + if (match != NULL) + psref_release(match_psref, &match->psref, + encaptab.elem_class); + + psref_copy(match_psref, &elem_psref, + encaptab.elem_class); matchprio = prio; match = ep; } + KASSERTMSG((match == NULL) || psref_held(&match->psref, + encaptab.elem_class), + "current match = %p, but not hold its psref", match); + + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); } + pserialize_read_exit(s); return match; } @@ -241,22 +348,27 @@ encap4_input(struct mbuf *m, ...) va_list ap; const struct encapsw *esw; struct encaptab *match; + struct psref match_psref; va_start(ap, m); off = va_arg(ap, int); proto = va_arg(ap, int); va_end(ap); - match = encap4_lookup(m, off, proto, INBOUND); - + match = encap4_lookup(m, off, proto, INBOUND, &match_psref); if (match) { /* found a match, "match" has the best one */ esw = match->esw; if (esw && esw->encapsw4.pr_input) { encap_fillarg(m, match); (*esw->encapsw4.pr_input)(m, off, proto); - } else + psref_release(&match_psref, &match->psref, + encaptab.elem_class); + } else { + psref_release(&match_psref, &match->psref, + encaptab.elem_class); m_freem(m); + } return; } @@ -267,14 +379,18 @@ encap4_input(struct mbuf *m, ...) #ifdef INET6 static struct encaptab * -encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir) +encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir, + struct psref *match_psref) { struct ip6_hdr *ip6; struct ip_pack6 pack; int prio, matchprio; + int s; struct encaptab *ep, *match; +#ifdef USE_RADIX struct radix_node_head *rnh = encap_rnh(AF_INET6); struct radix_node *rn; +#endif KASSERT(m->m_len >= sizeof(*ip6)); @@ -295,31 +411,82 @@ encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir) match = NULL; matchprio = 0; + s = pserialize_read_enter(); +#ifdef USE_RADIX + /* Check whether there's and update in progress. 
*/ + if (encap_head_updating) { + /* + * Update in progress. Pretend there are no tunnels + */ + pserialize_read_exit(s); + return NULL; + } + rn = rnh->rnh_matchaddr((void *)&pack, rnh); if (rn && (rn->rn_flags & RNF_ROOT) == 0) { - match = (struct encaptab *)rn; + struct encaptab *encapp = (struct encaptab *)rn; + + psref_acquire(match_psref, &encapp->psref, + encaptab.elem_class); + match = encapp; matchprio = mask_matchlen(match->srcmask) + - mask_matchlen(match->dstmask); + mask_matchlen(match->dstmask); } +#endif + LIST_FOREACH(ep, &encap_table, chain) { + struct psref elem_psref; + + membar_datadep_consumer(); - LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET6) continue; if (ep->proto >= 0 && ep->proto != proto) continue; - if (ep->func) + + psref_acquire(&elem_psref, &ep->psref, + encaptab.elem_class); + + if (ep->func) { + pserialize_read_exit(s); + /* XXXX ep->func is sleepable. */ prio = (*ep->func)(m, off, proto, ep->arg); - else + s = pserialize_read_enter(); + } else { +#ifdef USE_RADIX + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); continue; +#else + prio = mask_match(ep, (struct sockaddr *)&pack.mine, + (struct sockaddr *)&pack.yours); +#endif + } /* see encap4_lookup() for issues here */ - if (prio <= 0) + if (prio <= 0) { + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); continue; + } if (prio > matchprio) { + /* release last matched ep */ + if (match != NULL) + psref_release(match_psref, &match->psref, + encaptab.elem_class); + + psref_copy(match_psref, &elem_psref, + encaptab.elem_class); matchprio = prio; match = ep; } + KASSERTMSG((match == NULL) || psref_held(&match->psref, + encaptab.elem_class), + "current match = %p, but not hold its psref", match); + + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); } + pserialize_read_exit(s); return match; } @@ -330,16 +497,23 @@ encap6_input(struct mbuf **mp, int *offp, int proto) struct mbuf *m = *mp; const struct encapsw *esw; struct encaptab *match; + struct psref match_psref; - match = encap6_lookup(m, *offp, proto, INBOUND); + match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref); if (match) { /* found a match */ esw = match->esw; if (esw && esw->encapsw6.pr_input) { + int ret; encap_fillarg(m, match); - return (*esw->encapsw6.pr_input)(mp, offp, proto); + ret = (*esw->encapsw6.pr_input)(mp, offp, proto); + psref_release(&match_psref, &match->psref, + encaptab.elem_class); + return ret; } else { + psref_release(&match_psref, &match->psref, + encaptab.elem_class); m_freem(m); return IPPROTO_DONE; } @@ -350,39 +524,111 @@ encap6_input(struct mbuf **mp, int *offp, int proto) } #endif +/* + * XXX + * The encaptab list and the rnh radix tree must be manipulated atomically. + */ static int encap_add(struct encaptab *ep) { +#ifdef USE_RADIX struct radix_node_head *rnh = encap_rnh(ep->af); - int error = 0; +#endif + + KASSERT(encap_lock_held()); - LIST_INSERT_HEAD(&encaptab, ep, chain); +#ifdef USE_RADIX if (!ep->func && rnh) { + /* Disable access to the radix tree for reader. */ + encap_head_updating = true; + /* Wait for all readers to drain. */ + pserialize_perform(encaptab.psz); + if (!rnh->rnh_addaddr((void *)ep->addrpack, (void *)ep->maskpack, rnh, ep->nodes)) { - error = EEXIST; - goto fail; + encap_head_updating = false; + return EEXIST; } + + /* + * The ep added to the radix tree must be skipped while + * encap[46]_lookup walks encaptab list. In other words, + * encap_add() does not need to care whether the ep has + * been added encaptab list or not yet. 
+ * So, we can re-enable access to the radix tree for now. + */ + encap_head_updating = false; } - return error; +#endif - fail: - LIST_REMOVE(ep, chain); - return error; +/* + * XXX + * need memory barrier to use queue(3) with pserialize(9). + * see https://mail-index.netbsd.org/tech-kern/2014/11/21/msg018055.html + */ +#define LIST_INSERT_HEAD_PSZ(_head, _elm, _field) do { \ + (_elm)->_field.le_next = (_head)->lh_first; \ + (_elm)->_field.le_prev = &(_head)->lh_first; \ + membar_producer(); \ + if ((_elm)->_field.le_next != LIST_END(_head)) \ + (_head)->lh_first->_field.le_prev = &(_elm)->_field.le_next; \ + (_head)->lh_first = (_elm); \ + } while (/*CONSTCOND*/0) + + LIST_INSERT_HEAD_PSZ(&encap_table, ep, chain); + +#undef LIST_INSERT_HEAD_PSZ + + return 0; } +/* + * XXX + * The encaptab list and the rnh radix tree must be manipulated atomically. + */ static int encap_remove(struct encaptab *ep) { +#ifdef USE_RADIX struct radix_node_head *rnh = encap_rnh(ep->af); +#endif int error = 0; - LIST_REMOVE(ep, chain); + KASSERT(encap_lock_held()); + +#ifdef USE_RADIX if (!ep->func && rnh) { + /* Disable access to the radix tree for reader. */ + encap_head_updating = true; + /* Wait for all readers to drain. */ + pserialize_perform(encaptab.psz); + if (!rnh->rnh_deladdr((void *)ep->addrpack, (void *)ep->maskpack, rnh)) error = ESRCH; + + /* + * The ep added to the radix tree must be skipped while + * encap[46]_lookup walks encaptab list. In other words, + * encap_add() does not need to care whether the ep has + * been added encaptab list or not yet. + * So, we can re-enable access to the radix tree for now. + */ + encap_head_updating = false; } +#endif + +/* + * XXX + * need memory barrier to use queue(3) with pserialize(9). + * see https://mail-index.netbsd.org/tech-kern/2014/11/21/msg018055.html + */ +#define LIST_REMOVE_PSZ LIST_REMOVE + + LIST_REMOVE(ep, chain); + +#undef LIST_REMOVE_PSZ + return error; } @@ -434,7 +680,7 @@ encap_attach(int af, int proto, { struct encaptab *ep; int error; - int s; + int s, pss; size_t l; struct ip_pack4 *pack4; #ifdef INET6 @@ -448,7 +694,10 @@ encap_attach(int af, int proto, goto fail; /* check if anyone have already attached with exactly same config */ - LIST_FOREACH(ep, &encaptab, chain) { + pss = pserialize_read_enter(); + LIST_FOREACH(ep, &encap_table, chain) { + membar_datadep_consumer(); + if (ep->af != af) continue; if (ep->proto != proto) @@ -471,8 +720,10 @@ encap_attach(int af, int proto, continue; error = EEXIST; + pserialize_read_exit(pss); goto fail; } + pserialize_read_exit(pss); switch (af) { case AF_INET: @@ -535,6 +786,7 @@ encap_attach(int af, int proto, memcpy(ep->dstmask, dm, dp->sa_len); ep->esw = esw; ep->arg = arg; + psref_target_init(&ep->psref, encaptab.elem_class); error = encap_add(ep); if (error) @@ -614,6 +866,7 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) int off; struct ip6ctlparam *ip6cp = NULL; int nxt; + int s; struct encaptab *ep; const struct encapsw *esw; @@ -641,13 +894,17 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) if (ip6 && cmd == PRC_MSGSIZE) { int valid = 0; struct encaptab *match; + struct psref elem_psref; /* * Check to see if we have a valid encap configuration. 
*/ - match = encap6_lookup(m, off, nxt, OUTBOUND); + match = encap6_lookup(m, off, nxt, OUTBOUND, + &elem_psref); if (match) valid++; + psref_release(&elem_psref, &match->psref, + encaptab.elem_class); /* * Depending on the value of "valid" and routing table @@ -665,7 +922,13 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) } /* inform all listeners */ - LIST_FOREACH(ep, &encaptab, chain) { + + s = pserialize_read_enter(); + LIST_FOREACH(ep, &encap_table, chain) { + struct psref elem_psref; + + membar_datadep_consumer(); + if (ep->af != AF_INET6) continue; if (ep->proto >= 0 && ep->proto != nxt) @@ -674,11 +937,16 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) /* should optimize by looking at address pairs */ /* XXX need to pass ep->arg or ep itself to listeners */ + psref_acquire(&elem_psref, &ep->psref, + encaptab.elem_class); esw = ep->esw; if (esw && esw->encapsw6.pr_ctlinput) { (*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg); } + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); } + pserialize_read_exit(s); rip6_ctlinput(cmd, sa, d0); return NULL; @@ -692,11 +960,20 @@ encap_detach(const struct encaptab *cookie) struct encaptab *p, *np; int error; - LIST_FOREACH_SAFE(p, &encaptab, chain, np) { + KASSERT(encap_lock_held()); + + LIST_FOREACH_SAFE(p, &encap_table, chain, np) { + membar_datadep_consumer(); + if (p == ep) { error = encap_remove(p); if (error) return error; + + pserialize_perform(encaptab.psz); + + psref_target_destroy(&p->psref, + encaptab.elem_class); if (!ep->func) { kmem_free(p->addrpack, ep->addrpack->sa_len); kmem_free(p->maskpack, ep->maskpack->sa_len); @@ -709,6 +986,7 @@ encap_detach(const struct encaptab *cookie) return ENOENT; } +#ifdef USE_RADIX static struct radix_node_head * encap_rnh(int af) { @@ -742,6 +1020,63 @@ mask_matchlen(const struct sockaddr *sa) } return l; } +#endif + +#ifndef USE_RADIX +static int +mask_match(const struct encaptab *ep, + const struct sockaddr *sp, + const struct sockaddr *dp) +{ + struct sockaddr_storage s; + struct sockaddr_storage d; + int i; + const u_int8_t *p, *q; + u_int8_t *r; + int matchlen; + + KASSERTMSG(ep->func == NULL, "wrong encaptab passed to mask_match"); + + if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) + return 0; + if (sp->sa_family != ep->af || dp->sa_family != ep->af) + return 0; + if (sp->sa_len != ep->src->sa_len || dp->sa_len != ep->dst->sa_len) + return 0; + + matchlen = 0; + + p = (const u_int8_t *)sp; + q = (const u_int8_t *)ep->srcmask; + r = (u_int8_t *)&s; + for (i = 0 ; i < sp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX estimate */ + matchlen += (q[i] ? 8 : 0); + } + + p = (const u_int8_t *)dp; + q = (const u_int8_t *)ep->dstmask; + r = (u_int8_t *)&d; + for (i = 0 ; i < dp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX rough estimate */ + matchlen += (q[i] ? 
8 : 0); + } + + /* need to overwrite len/family portion as we don't compare them */ + s.ss_len = sp->sa_len; + s.ss_family = sp->sa_family; + d.ss_len = dp->sa_len; + d.ss_family = dp->sa_family; + + if (memcmp(&s, ep->src, ep->src->sa_len) == 0 && + memcmp(&d, ep->dst, ep->dst->sa_len) == 0) { + return matchlen; + } else + return 0; +} +#endif static void encap_fillarg(struct mbuf *m, const struct encaptab *ep) @@ -769,3 +1104,41 @@ encap_getarg(struct mbuf *m) } return p; } + +int +encap_lock_enter(void) +{ + int error; + + mutex_enter(&encap_whole.lock); + while (encap_whole.busy != NULL) { + error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock); + if (error) { + mutex_exit(&encap_whole.lock); + return error; + } + } + KASSERT(encap_whole.busy == NULL); + encap_whole.busy = curlwp; + mutex_exit(&encap_whole.lock); + + return 0; +} + +void +encap_lock_exit(void) +{ + + mutex_enter(&encap_whole.lock); + KASSERT(encap_whole.busy == curlwp); + encap_whole.busy = NULL; + cv_broadcast(&encap_whole.cv); + mutex_exit(&encap_whole.lock); +} + +bool +encap_lock_held(void) +{ + + return (encap_whole.busy == curlwp); +} diff --git a/sys/netinet/ip_encap.h b/sys/netinet/ip_encap.h index 13b14b3..1013e17 100644 --- a/sys/netinet/ip_encap.h +++ b/sys/netinet/ip_encap.h @@ -39,6 +39,8 @@ #include #endif +#include + struct encapsw { union { struct encapsw4 { @@ -73,6 +75,7 @@ struct encaptab { int (*func) (struct mbuf *, int, int, void *); const struct encapsw *esw; void *arg; /* passed via PACKET_TAG_ENCAP */ + struct psref_target psref; }; /* to lookup a pair of address using radix tree */ @@ -93,6 +96,8 @@ struct ip_pack6 { struct sockaddr_in6 yours; }; +void encapinit(void); + void encap_init(void); void encap4_input(struct mbuf *, ...); int encap6_input(struct mbuf **, int *, int); @@ -106,6 +111,10 @@ void *encap6_ctlinput(int, const struct sockaddr *, void *); int encap_detach(const struct encaptab *); void *encap_getarg(struct mbuf *); +void encap_lock_enter(void); +void encap_lock_exit(void); +bool encap_lock_held(void); + #define ENCAP_PR_WRAP_CTLINPUT(name) \ static void * \ name##_wrapper(int a, const struct sockaddr *b, void *c, void *d) \ @@ -117,5 +126,4 @@ name##_wrapper(int a, const struct sockaddr *b, void *c, void *d) \ return rv; \ } #endif - #endif /* !_NETINET_IP_ENCAP_H_ */ diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c index 7108db5..ab533ff 100644 --- a/sys/netinet/ip_mroute.c +++ b/sys/netinet/ip_mroute.c @@ -833,8 +833,12 @@ add_vif(struct vifctl *vifcp) * this requires both radix tree lookup and then a * function to check, and this is not supported yet. 
*/ + error = encap_lock_enter(); + if (error) + return error; vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, vif_encapcheck, &vif_encapsw, vifp); + encap_lock_exit(); if (!vifp->v_encap_cookie) return (EINVAL); @@ -930,7 +934,9 @@ reset_vif(struct vif *vifp) callout_stop(&vifp->v_repq_ch); /* detach this vif from decapsulator dispatch table */ + encap_lock_enter(); encap_detach(vifp->v_encap_cookie); + encap_lock_exit(); vifp->v_encap_cookie = NULL; /* diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c index ecf6d02..290c8c1 100644 --- a/sys/netinet6/in6_gif.c +++ b/sys/netinet6/in6_gif.c @@ -215,7 +215,8 @@ in6_gif_input(struct mbuf **mp, int *offp, int proto) gifp = (struct ifnet *)encap_getarg(m); - if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { + if (gifp == NULL || (gifp->if_flags & (IFF_UP|IFF_RUNNING)) + != (IFF_UP|IFF_RUNNING)) { m_freem(m); IP6_STATINC(IP6_STAT_NOGIF); return IPPROTO_DONE; @@ -386,12 +387,22 @@ in6_gif_detach(struct gif_softc *sc) { int error; + error = in6_gif_pause(sc); + + rtcache_free(&sc->gif_ro); + + return error; +} + +int +in6_gif_pause(struct gif_softc *sc) +{ + int error; + error = encap_detach(sc->encap_cookie6); if (error == 0) sc->encap_cookie6 = NULL; - rtcache_free(&sc->gif_ro); - return error; } diff --git a/sys/netinet6/in6_gif.h b/sys/netinet6/in6_gif.h index e59985c..081a2fb 100644 --- a/sys/netinet6/in6_gif.h +++ b/sys/netinet6/in6_gif.h @@ -45,6 +45,7 @@ int gif_encapcheck6(struct mbuf *, int, int, void *); #endif int in6_gif_attach(struct gif_softc *); int in6_gif_detach(struct gif_softc *); +int in6_gif_pause(struct gif_softc *); void *in6_gif_ctlinput(int, const struct sockaddr *, void *, void *); #endif /* !_NETINET6_IN6_GIF_H_ */ diff --git a/sys/netipsec/xform_ipip.c b/sys/netipsec/xform_ipip.c index 21b9fd7..936b35b 100644 --- a/sys/netipsec/xform_ipip.c +++ b/sys/netipsec/xform_ipip.c @@ -721,6 +721,11 @@ ipe4_attach(void) xform_register(&ipe4_xformsw); /* attach to encapsulation framework */ /* XXX save return cookie for detach on module remove */ + + encapinit(); + /* This function is called before ifinit(). Who else gets lock? */ + (void)encap_lock_enter(); + /* ipe4_encapsw and ipe4_encapsw must be added atomically */ #ifdef INET (void) encap_attach_func(AF_INET, -1, ipe4_encapcheck, &ipe4_encapsw, NULL); @@ -729,6 +734,7 @@ ipe4_attach(void) (void) encap_attach_func(AF_INET6, -1, ipe4_encapcheck, &ipe4_encapsw6, NULL); #endif + encap_lock_exit(); } #ifdef SYSINIT diff --git a/sys/sys/psref.h b/sys/sys/psref.h new file mode 100644 index 0000000..73652e0 --- /dev/null +++ b/sys/sys/psref.h @@ -0,0 +1,111 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2016 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Taylor R. Campbell. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_PSREF_H +#define _SYS_PSREF_H + +#include +#include + +/* + * PSREF_DEBUG + * + * If nonzero, enable debugging of psrefs. WARNING: This changes + * the ABI by adding extra fields to struct psref_target and + * struct psref, which are exposed to callers and embedded in + * other structures. + */ +#ifdef _KERNEL_OPT +#include "opt_psref.h" +#endif + +struct cpu_info; +struct lwp; + +struct psref; +struct psref_class; +struct psref_target; + +/* + * struct psref_target + * + * Bookkeeping for an object to which users can acquire passive + * references. This is compact so that it can easily be embedded + * into many multitudes of objects, e.g. IP packet flows. + * + * prt_draining is false on initialization, and may be written + * only once, to make it true, when someone has prevented new + * references from being created and wants to drain the target in + * order to destroy it. + */ +struct psref_target { +#ifdef PSREF_DEBUG + struct psref_class *prt_class; +#endif + bool prt_draining; +}; + +/* + * struct psref + * + * Bookkeeping for a single passive reference. There should only + * be a few of these per CPU in the system at once, no matter how + * many targets are stored, so these are a bit larger than struct + * psref_target. The contents of struct psref may be read and + * written only on the local CPU. + */ +struct psref { + LIST_ENTRY(psref) psref_entry; + struct psref_target *psref_target; +#ifdef PSREF_DEBUG + struct lwp *psref_lwp; + struct cpu_info *psref_cpu; +#endif +}; + +struct psref_class * + psref_class_create(const char *, int); +void psref_class_destroy(struct psref_class *); + +void psref_target_init(struct psref_target *, struct psref_class *); +void psref_target_destroy(struct psref_target *, struct psref_class *); + +void psref_acquire(struct psref *, struct psref_target *, + struct psref_class *); +void psref_release(struct psref *, struct psref_target *, + struct psref_class *); +void psref_copy(struct psref *, const struct psref *, + struct psref_class *); + +/* For use only in assertions. */ +bool psref_held(struct psref_target *, struct psref_class *); + +#endif /* _SYS_PSREF_H */
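
Below is a minimal usage sketch, for illustration only and not part of the patch, of how the psref(9) API added in sys/kern/subr_psref.c and sys/sys/psref.h is meant to be combined with pserialize(9). It follows the same reader/writer pattern that encap[46]_lookup() and encap_detach() adopt above. All frob_* names are hypothetical; the psref class and the pserialize handle are assumed to have been created at initialization time with psref_class_create() and pserialize_create().

/*
 * Usage sketch for psref(9) + pserialize(9) -- illustration only, not
 * part of the patch.  The frob_* names are hypothetical.
 */
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/pserialize.h>
#include <sys/psref.h>
#include <sys/queue.h>

struct frob {
	LIST_ENTRY(frob)	f_entry;
	int			f_key;
	struct psref_target	f_psref;
};

static LIST_HEAD(, frob) frob_list = LIST_HEAD_INITIALIZER(frob_list);
static pserialize_t frob_psz;		/* from pserialize_create() at init */
static struct psref_class *frob_class;	/* from psref_class_create("frob", IPL_SOFTNET) */

/*
 * Reader: find an entry and pin it with a passive reference.  The caller
 * must be bound to its CPU (softint or LP_BOUND lwp) or have kpreemption
 * disabled, and must psref_release(psref, &f->f_psref, frob_class) on the
 * same CPU/LWP when done.
 */
static struct frob *
frob_lookup(int key, struct psref *psref)
{
	struct frob *f;
	int s;

	s = pserialize_read_enter();
	LIST_FOREACH(f, &frob_list, f_entry) {
		/* pairs with membar_producer() in the publishing path */
		membar_datadep_consumer();
		if (f->f_key == key) {
			/* Pin the object; cheap and CPU-local. */
			psref_acquire(psref, &f->f_psref, frob_class);
			break;
		}
	}
	pserialize_read_exit(s);

	return f;	/* NULL if not found */
}

/*
 * Writer: unpublish an entry, wait for readers and reference holders to
 * drain, then free it.  May sleep, so it must run in thread context, and
 * writers are assumed to be serialized against each other (cf. the encap
 * lock above).
 */
static void
frob_destroy(struct frob *f)
{
	LIST_REMOVE(f, f_entry);	/* new lookups can no longer find it */
	pserialize_perform(frob_psz);	/* wait out current list walkers */
	psref_target_destroy(&f->f_psref, frob_class);	/* wait for held psrefs */
	kmem_free(f, sizeof(*f));
}

The point of the split is that the common path (lookup plus acquire/release) stays CPU-local with no interprocessor synchronization, while the rare path (destruction) pays for the xcall inside psref_target_destroy().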
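
A second sketch, also illustrative only, condenses the writer-side discipline this patch imposes on the encap table: any path that attaches or detaches an encap entry must bracket the call with encap_lock_enter()/encap_lock_exit(), the single sleepable lock asserted by KASSERT(encap_lock_held()) in encap_add(), encap_remove() and encap_detach(). The my_* names are hypothetical.

/*
 * Writer-side sketch for the encap "whole" lock -- illustration only,
 * not part of the patch.  my_softc, my_encapcheck and my_encapsw are
 * hypothetical.
 */
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_encap.h>

struct my_softc {
	const struct encaptab	*sc_cookie;
};

static int
my_encapcheck(struct mbuf *m, int off, int proto, void *arg)
{

	return 32;	/* claim a match; real code validates the addresses */
}

static const struct encapsw my_encapsw;	/* fill in pr_input/pr_ctlinput */

static int
my_tunnel_attach(struct my_softc *sc)
{
	int error;

	error = encap_lock_enter();	/* sleepable and interruptible */
	if (error)
		return error;		/* e.g. EINTR */

	sc->sc_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
	    my_encapcheck, &my_encapsw, sc);

	encap_lock_exit();

	return (sc->sc_cookie == NULL) ? EINVAL : 0;
}

static int
my_tunnel_detach(struct my_softc *sc)
{
	int error;

	error = encap_lock_enter();
	if (error)
		return error;

	error = encap_detach(sc->sc_cookie);
	if (error == 0)
		sc->sc_cookie = NULL;

	encap_lock_exit();

	return error;
}

Because encap_lock_enter() sleeps with cv_wait_sig(), callers must be prepared for it to fail (for example when interrupted by a signal), which is why gif_set_tunnel() and add_vif() above check its return value before touching the encap table.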