diff --git a/sys/kern/kern_softint.c b/sys/kern/kern_softint.c index 782540f..75f67d6 100644 --- a/sys/kern/kern_softint.c +++ b/sys/kern/kern_softint.c @@ -442,8 +442,8 @@ softint_disestablish(void *arg) KASSERT(sh->sh_func != NULL); flags |= sh->sh_flags; } - /* Neither pending nor active on all CPUs? */ - if ((flags & (SOFTINT_PENDING | SOFTINT_ACTIVE)) == 0) { + /* Inactive on all CPUs? */ + if ((flags & SOFTINT_ACTIVE) == 0) { break; } /* Oops, still active. Wait for it to clear. */ diff --git a/sys/net/if.c b/sys/net/if.c index ca4ddda..ad776e6 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -94,6 +94,7 @@ __KERNEL_RCSID(0, "$NetBSD: if.c,v 1.336 2016/05/16 01:16:24 ozaki-r Exp $"); #if defined(_KERNEL_OPT) #include "opt_inet.h" +#include "opt_ipsec.h" #include "opt_atalk.h" #include "opt_natm.h" @@ -137,6 +138,9 @@ __KERNEL_RCSID(0, "$NetBSD: if.c,v 1.336 2016/05/16 01:16:24 ozaki-r Exp $"); #include #include #include +#ifndef IPSEC +#include +#endif #ifdef INET6 #include @@ -257,6 +261,10 @@ ifinit(void) sysctl_net_pktq_setup(NULL, PF_INET6); #endif +#ifndef IPSEC + encapinit(); +#endif + if_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, if_listener_cb, NULL); diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c index 7ae30e3..116dafe 100644 --- a/sys/net/if_gif.c +++ b/sys/net/if_gif.c @@ -35,6 +35,7 @@ __KERNEL_RCSID(0, "$NetBSD: if_gif.c,v 1.108 2016/04/28 00:16:56 ozaki-r Exp $") #ifdef _KERNEL_OPT #include "opt_inet.h" +#include "opt_net_mpsafe.h" #endif #include @@ -53,6 +54,7 @@ __KERNEL_RCSID(0, "$NetBSD: if_gif.c,v 1.108 2016/04/28 00:16:56 ozaki-r Exp $") #include #include #include +#include #include #include @@ -85,6 +87,10 @@ __KERNEL_RCSID(0, "$NetBSD: if_gif.c,v 1.108 2016/04/28 00:16:56 ozaki-r Exp $") #include "ioconf.h" +#ifdef NET_MPSAFE +#define GIF_MPSAFE 1 +#endif + static void gifintr(void *); /* @@ -100,6 +106,7 @@ static int gif_check_nesting(struct ifnet *, struct mbuf *); static int gif_encap_attach(struct gif_softc *); static int gif_encap_detach(struct gif_softc *); +static void gif_encap_pause(struct gif_softc *); static struct if_clone gif_cloner = IF_CLONE_INITIALIZER("gif", gif_clone_create, gif_clone_destroy); @@ -217,7 +224,8 @@ gif_encapcheck(struct mbuf *m, int off, int proto, void *arg) if (sc == NULL) return 0; - if ((sc->gif_if.if_flags & IFF_UP) == 0) + if ((sc->gif_if.if_flags & (IFF_UP|IFF_RUNNING)) + != (IFF_UP|IFF_RUNNING)) return 0; /* no physical address */ @@ -320,9 +328,8 @@ gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, } m->m_flags &= ~(M_BCAST|M_MCAST); - if (!(ifp->if_flags & IFF_UP) || - sc->gif_psrc == NULL || sc->gif_pdst == NULL || - sc->gif_si == NULL) { + if (!(ifp->if_flags & IFF_UP) || /* check IFF_RUNNING later */ + sc->gif_psrc == NULL || sc->gif_pdst == NULL) { m_freem(m); error = ENETDOWN; goto end; @@ -343,6 +350,17 @@ gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, m->m_pkthdr.csum_data = 0; s = splnet(); + /* + * This if_flags check, IFQ_ENQUEUE and softint_schedule() are required + * to be done atomically in the local CPU, because this local CPU must + * let gif_encap_pause() wait until softint_schedule() completion. 
+ */ + if (!(ifp->if_flags & IFF_RUNNING)) { + splx(s); + m_freem(m); + error = ENETDOWN; + goto end; + } IFQ_ENQUEUE(&ifp->if_snd, m, error); if (error) { splx(s); @@ -369,26 +387,23 @@ gifintr(void *arg) struct mbuf *m; int family; int len; +#ifndef GIF_MPSAFE int s; +#endif int error; sc = arg; ifp = &sc->gif_if; - /* - * other CPUs does {set,delete}_tunnel after curcpu have done - * softint_schedule(). - */ - if (sc->gif_pdst == NULL || sc->gif_psrc == NULL) { - IFQ_PURGE(&ifp->if_snd); - return; - } - /* output processing */ while (1) { +#ifndef GIF_MPSAFE s = splnet(); +#endif IFQ_DEQUEUE(&sc->gif_if.if_snd, m); +#ifndef GIF_MPSAFE splx(s); +#endif if (m == NULL) break; @@ -442,7 +457,9 @@ gif_input(struct mbuf *m, int af, struct ifnet *ifp) { pktqueue_t *pktq; size_t pktlen; +#ifndef GIF_MPSAFE int s; +#endif if (ifp == NULL) { /* just in case */ @@ -477,14 +494,18 @@ gif_input(struct mbuf *m, int af, struct ifnet *ifp) return; } +#ifndef GIF_MPSAFE s = splnet(); +#endif if (__predict_true(pktq_enqueue(pktq, m, 0))) { ifp->if_ibytes += pktlen; ifp->if_ipackets++; } else { m_freem(m); } +#ifndef GIF_MPSAFE splx(s); +#endif } /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */ @@ -775,6 +796,46 @@ gif_encap_detach(struct gif_softc *sc) return error; } +static void +gif_encap_pause(struct gif_softc *sc) +{ + struct ifnet *ifp; + uint64_t where; + + if (sc == NULL || sc->gif_psrc == NULL) + return; + + ifp = &sc->gif_if; + if ((ifp->if_flags & IFF_RUNNING) == 0) + return; + + switch (sc->gif_psrc->sa_family) { +#ifdef INET + case AF_INET: + (void)in_gif_pause(sc); + break; +#endif +#ifdef INET6 + case AF_INET6: + (void)in6_gif_pause(sc); + break; +#endif + } + + ifp->if_flags &= ~IFF_RUNNING; + /* membar_sync() is done in xc_broadcast(). */ + + /* + * Wait for softint_schedule() completion done by other CPUs which + * already run over if_flags check in gif_output(). + * In addition, wait for softint_execute()(ipintr() or ip6intr()) + * completion done by other CPUs which already run over if_flags + * check in in_gif_input() or in6_gif_input(). + */ + where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL); + xc_wait(where); +} + int gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) { @@ -782,11 +843,19 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) struct gif_softc *sc2; struct sockaddr *osrc, *odst; struct sockaddr *nsrc, *ndst; - void *osi; - int s; int error; +#ifndef GIF_MPSAFE + int s; s = splsoftnet(); +#endif + error = encap_lock_enter(); + if (error) { +#ifndef GIF_MPSAFE + splx(s); +#endif + return error; + } LIST_FOREACH(sc2, &gif_softc_list, gif_list) { if (sc2 == sc) @@ -797,50 +866,35 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) if (sockaddr_cmp(sc2->gif_pdst, dst) == 0 && sockaddr_cmp(sc2->gif_psrc, src) == 0) { /* continue to use the old configureation. */ - splx(s); - return EADDRNOTAVAIL; + error = EADDRNOTAVAIL; + goto out; } /* XXX both end must be valid? (I mean, not 0.0.0.0) */ } if ((nsrc = sockaddr_dup(src, M_WAITOK)) == NULL) { - splx(s); - return ENOMEM; + error = ENOMEM; + goto out; } if ((ndst = sockaddr_dup(dst, M_WAITOK)) == NULL) { sockaddr_free(nsrc); - splx(s); - return ENOMEM; + error = ENOMEM; + goto out; } + gif_encap_pause(sc); + /* + * At this point, gif_output() does not softint_schedule() any more. + * Furthermore, all of gif_output() has completed. 
It promises not to + * call softint_schedule() anymore, so we can call + * softint_disestablish() now. + */ + /* Firstly, clear old configurations. */ if (sc->gif_si) { - osrc = sc->gif_psrc; - odst = sc->gif_pdst; - osi = sc->gif_si; - sc->gif_psrc = NULL; - sc->gif_pdst = NULL; + softint_disestablish(sc->gif_si); sc->gif_si = NULL; - /* - * At this point, gif_output() does not softint_schedule() - * any more. However, there are below 2 fears of other CPUs - * which would cause panic because of the race between - * softint_execute() and softint_disestablish(). - * (a) gif_output() has done softint_schedule(), and softint - * (gifintr()) is waiting for execution - * => This pattern is avoided by waiting SOFTINT_PENDING - * CPUs in softint_disestablish() - * (b) gifintr() is already running - * => This pattern is avoided by waiting SOFTINT_ACTIVE - * CPUs in softint_disestablish() - */ - - softint_disestablish(osi); - sc->gif_psrc = osrc; - sc->gif_pdst = odst; - osrc = NULL; - odst = NULL; } /* XXX we can detach from both, but be polite just in case */ if (sc->gif_psrc) @@ -867,7 +921,11 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) continue; } - sc->gif_si = softint_establish(SOFTINT_NET, gifintr, sc); + sc->gif_si = softint_establish(SOFTINT_NET +#ifdef GIF_MPSAFE + | SOFTINT_MPSAFE +#endif + , gifintr, sc); if (sc->gif_si == NULL) { (void)gif_encap_detach(sc); @@ -899,7 +957,11 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) else ifp->if_flags &= ~IFF_RUNNING; + out: + encap_lock_exit(); +#ifndef GIF_MPSAFE splx(s); +#endif return error; } @@ -907,24 +969,24 @@ void gif_delete_tunnel(struct ifnet *ifp) { struct gif_softc *sc = ifp->if_softc; - struct sockaddr *osrc, *odst; - void *osi; + int error; +#ifndef GIF_MPSAFE int s; s = splsoftnet(); +#endif + error = encap_lock_enter(); + if (error) { +#ifndef GIF_MPSAFE + splx(s); +#endif + return; + } + gif_encap_pause(sc); if (sc->gif_si) { - osrc = sc->gif_psrc; - odst = sc->gif_pdst; - osi = sc->gif_si; - - sc->gif_psrc = NULL; - sc->gif_pdst = NULL; + softint_disestablish(sc->gif_si); sc->gif_si = NULL; - - softint_disestablish(osi); - sc->gif_psrc = osrc; - sc->gif_pdst = odst; } if (sc->gif_psrc) { sockaddr_free(sc->gif_psrc); @@ -946,5 +1008,9 @@ gif_delete_tunnel(struct ifnet *ifp) ifp->if_flags |= IFF_RUNNING; else ifp->if_flags &= ~IFF_RUNNING; + + encap_lock_exit(); +#ifndef GIF_MPSAFE splx(s); +#endif } diff --git a/sys/net/if_stf.c b/sys/net/if_stf.c index a4b7fb6..0c23123 100644 --- a/sys/net/if_stf.c +++ b/sys/net/if_stf.c @@ -190,18 +190,27 @@ static int stf_clone_create(struct if_clone *ifc, int unit) { struct stf_softc *sc; + int error; + + sc = malloc(sizeof(struct stf_softc), M_DEVBUF, M_WAIT|M_ZERO); + if_initname(&sc->sc_if, ifc->ifc_name, unit); + + error = encap_lock_enter(); + if (error) { + free(sc, M_DEVBUF); + return error; + } if (LIST_FIRST(&stf_softc_list) != NULL) { /* Only one stf interface is allowed. 
*/ + encap_lock_exit(); + free(sc, M_DEVBUF); return (EEXIST); } - sc = malloc(sizeof(struct stf_softc), M_DEVBUF, M_WAIT|M_ZERO); - - if_initname(&sc->sc_if, ifc->ifc_name, unit); - sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6, stf_encapcheck, &in_stf_encapsw, sc); + encap_lock_exit(); if (sc->encap_cookie == NULL) { printf("%s: unable to attach encap\n", if_name(&sc->sc_if)); free(sc, M_DEVBUF); @@ -226,8 +235,10 @@ stf_clone_destroy(struct ifnet *ifp) { struct stf_softc *sc = (void *) ifp; + encap_lock_enter(); LIST_REMOVE(sc, sc_list); encap_detach(sc->encap_cookie); + encap_lock_exit(); bpf_detach(ifp); if_detach(ifp); rtcache_free(&sc->sc_ro); diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c index 35f6116..526979f 100644 --- a/sys/netinet/in_gif.c +++ b/sys/netinet/in_gif.c @@ -204,7 +204,8 @@ in_gif_input(struct mbuf *m, int off, int proto) gifp = (struct ifnet *)encap_getarg(m); - if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { + if (gifp == NULL || (gifp->if_flags & (IFF_UP|IFF_RUNNING)) + != (IFF_UP|IFF_RUNNING)) { m_freem(m); ip_statinc(IP_STAT_NOGIF); return; @@ -384,11 +385,21 @@ in_gif_detach(struct gif_softc *sc) { int error; + error = in_gif_pause(sc); + + rtcache_free(&sc->gif_ro); + + return error; +} + +int +in_gif_pause(struct gif_softc *sc) +{ + int error; + error = encap_detach(sc->encap_cookie4); if (error == 0) sc->encap_cookie4 = NULL; - rtcache_free(&sc->gif_ro); - return error; } diff --git a/sys/netinet/in_gif.h b/sys/netinet/in_gif.h index 1107ee8..654b71c 100644 --- a/sys/netinet/in_gif.h +++ b/sys/netinet/in_gif.h @@ -45,5 +45,6 @@ int gif_encapcheck4(struct mbuf *, int, int, void *); #endif int in_gif_attach(struct gif_softc *); int in_gif_detach(struct gif_softc *); +int in_gif_pause(struct gif_softc *); #endif /* !_NETINET_IN_GIF_H_ */ diff --git a/sys/netinet/ip_encap.c b/sys/netinet/ip_encap.c index d743cca..60f2395 100644 --- a/sys/netinet/ip_encap.c +++ b/sys/netinet/ip_encap.c @@ -58,13 +58,18 @@ /* XXX is M_NETADDR correct? */ /* - * The code will use radix table for tunnel lookup, for + * With USE_RADIX the code will use radix table for tunnel lookup, for * tunnels registered with encap_attach() with a addr/mask pair. * Faster on machines with thousands of tunnel registerations (= interfaces). * * The code assumes that radix table code can handle non-continuous netmask, * as it will pass radix table memory region with (src + dst) sockaddr pair. */ +/* XXX future work + * eliminate linear search of encap interfaces. It must fix the many encap + * interface scaling issue without reducing computation by radix tree. 
+ */ +#undef USE_RADIX #include __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.53 2016/04/26 08:44:44 ozaki-r Exp $"); @@ -72,6 +77,7 @@ __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.53 2016/04/26 08:44:44 ozaki-r Exp $" #ifdef _KERNEL_OPT #include "opt_mrouting.h" #include "opt_inet.h" +#include "opt_net_mpsafe.h" #endif #include @@ -82,6 +88,10 @@ __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.53 2016/04/26 08:44:44 ozaki-r Exp $" #include #include #include +#include +#include +#include +#include #include @@ -105,24 +115,72 @@ __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.53 2016/04/26 08:44:44 ozaki-r Exp $" #include +#ifdef NET_MPSAFE +#define ENCAP_MPSAFE 1 +#endif + enum direction { INBOUND, OUTBOUND }; #ifdef INET -static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction); +static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction, + struct psref *); #endif #ifdef INET6 -static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction); +static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction, + struct psref *); #endif static int encap_add(struct encaptab *); static int encap_remove(struct encaptab *); static int encap_afcheck(int, const struct sockaddr *, const struct sockaddr *); +#ifdef USE_RADIX static struct radix_node_head *encap_rnh(int); static int mask_matchlen(const struct sockaddr *); +#else +static int mask_match(const struct encaptab *, const struct sockaddr *, + const struct sockaddr *); +#endif static void encap_fillarg(struct mbuf *, const struct encaptab *); -LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(&encaptab); - +/* + * In encap[46]_lookup(), ep->func can sleep(e.g. rtalloc1) while walking + * encap_table. So, it cannot use pserialize_read_enter() + */ +static struct { + struct pslist_head list; + pserialize_t psz; + struct psref_class *elem_class; /* for the element of et_list */ +} encaptab __cacheline_aligned = { + .list = PSLIST_INITIALIZER, +}; +#define encap_table encaptab.list + +static struct { + kmutex_t lock; + kcondvar_t cv; + struct lwp *busy; +} encap_whole __cacheline_aligned; + +#ifdef USE_RADIX struct radix_node_head *encap_head[2]; /* 0 for AF_INET, 1 for AF_INET6 */ +static bool encap_head_updating = false; +#endif + +/* + * must be done before other encap interfaces initialization. + */ +void +encapinit(void) +{ + + encaptab.psz = pserialize_create(); + encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET); + if (encaptab.elem_class == NULL) + panic("encaptab.elem_class cannot be allocated.\n"); + + mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE); + cv_init(&encap_whole.cv, "ip_encap cv"); + encap_whole.busy = NULL; +} void encap_init(void) @@ -140,9 +198,10 @@ encap_init(void) * initialization - using LIST_INIT() here can nuke encap_attach() * from drivers. */ - LIST_INIT(&encaptab); + PSLIST_INIT(&encap_table); #endif +#ifdef USE_RADIX /* * initialize radix lookup table when the radix subsystem is inited. 
*/ @@ -152,18 +211,23 @@ encap_init(void) rn_delayedinit((void *)&encap_head[1], sizeof(struct sockaddr_pack) << 3); #endif +#endif } #ifdef INET static struct encaptab * -encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) +encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir, + struct psref *match_psref) { struct ip *ip; struct ip_pack4 pack; struct encaptab *ep, *match; int prio, matchprio; + int s; +#ifdef USE_RADIX struct radix_node_head *rnh = encap_rnh(AF_INET); struct radix_node *rn; +#endif KASSERT(m->m_len >= sizeof(*ip)); @@ -184,22 +248,54 @@ encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) match = NULL; matchprio = 0; + s = pserialize_read_enter(); +#ifdef USE_RADIX + /* Check whether there's and update in progress. */ + if (encap_head_updating) { + /* + * Update in progress. Pretend there are no tunnels + */ + pserialize_read_exit(s); + return NULL; + } rn = rnh->rnh_matchaddr((void *)&pack, rnh); if (rn && (rn->rn_flags & RNF_ROOT) == 0) { - match = (struct encaptab *)rn; + struct encaptab *encapp = (struct encaptab *)rn; + + psref_acquire(match_psref, &encapp->psref, + encaptab.elem_class); + match = encapp; matchprio = mask_matchlen(match->srcmask) + - mask_matchlen(match->dstmask); + mask_matchlen(match->dstmask); } +#endif + PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { + struct psref elem_psref; + + membar_datadep_consumer(); - LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET) continue; if (ep->proto >= 0 && ep->proto != proto) continue; - if (ep->func) + + psref_acquire(&elem_psref, &ep->psref, + encaptab.elem_class); + if (ep->func) { + pserialize_read_exit(s); + /* XXXX ep->func is sleepable. */ prio = (*ep->func)(m, off, proto, ep->arg); - else + s = pserialize_read_enter(); + } else { +#ifdef USE_RADIX + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); continue; +#else + prio = mask_match(ep, (struct sockaddr *)&pack.mine, + (struct sockaddr *)&pack.yours); +#endif + } /* * We prioritize the matches by using bit length of the @@ -222,13 +318,30 @@ encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) * For radix-based lookup, I guess source takes precedence. * See rn_{refines,lexobetter} for the correct answer. */ - if (prio <= 0) + if (prio <= 0) { + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); continue; + } if (prio > matchprio) { + /* release last matched ep */ + if (match != NULL) + psref_release(match_psref, &match->psref, + encaptab.elem_class); + + psref_copy(match_psref, &elem_psref, + encaptab.elem_class); matchprio = prio; match = ep; } + KASSERTMSG((match == NULL) || psref_held(&match->psref, + encaptab.elem_class), + "current match = %p, but not hold its psref", match); + + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); } + pserialize_read_exit(s); return match; } @@ -240,22 +353,27 @@ encap4_input(struct mbuf *m, ...) 
va_list ap; const struct encapsw *esw; struct encaptab *match; + struct psref match_psref; va_start(ap, m); off = va_arg(ap, int); proto = va_arg(ap, int); va_end(ap); - match = encap4_lookup(m, off, proto, INBOUND); - + match = encap4_lookup(m, off, proto, INBOUND, &match_psref); if (match) { /* found a match, "match" has the best one */ esw = match->esw; if (esw && esw->encapsw4.pr_input) { encap_fillarg(m, match); (*esw->encapsw4.pr_input)(m, off, proto); - } else + psref_release(&match_psref, &match->psref, + encaptab.elem_class); + } else { + psref_release(&match_psref, &match->psref, + encaptab.elem_class); m_freem(m); + } return; } @@ -266,14 +384,18 @@ encap4_input(struct mbuf *m, ...) #ifdef INET6 static struct encaptab * -encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir) +encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir, + struct psref *match_psref) { struct ip6_hdr *ip6; struct ip_pack6 pack; int prio, matchprio; + int s; struct encaptab *ep, *match; +#ifdef USE_RADIX struct radix_node_head *rnh = encap_rnh(AF_INET6); struct radix_node *rn; +#endif KASSERT(m->m_len >= sizeof(*ip6)); @@ -294,31 +416,82 @@ encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir) match = NULL; matchprio = 0; + s = pserialize_read_enter(); +#ifdef USE_RADIX + /* Check whether there's and update in progress. */ + if (encap_head_updating) { + /* + * Update in progress. Pretend there are no tunnels + */ + pserialize_read_exit(s); + return NULL; + } + rn = rnh->rnh_matchaddr((void *)&pack, rnh); if (rn && (rn->rn_flags & RNF_ROOT) == 0) { - match = (struct encaptab *)rn; + struct encaptab *encapp = (struct encaptab *)rn; + + psref_acquire(match_psref, &encapp->psref, + encaptab.elem_class); + match = encapp; matchprio = mask_matchlen(match->srcmask) + - mask_matchlen(match->dstmask); + mask_matchlen(match->dstmask); } +#endif + PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { + struct psref elem_psref; + + membar_datadep_consumer(); - LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET6) continue; if (ep->proto >= 0 && ep->proto != proto) continue; - if (ep->func) + + psref_acquire(&elem_psref, &ep->psref, + encaptab.elem_class); + + if (ep->func) { + pserialize_read_exit(s); + /* XXXX ep->func is sleepable. 
*/ prio = (*ep->func)(m, off, proto, ep->arg); - else + s = pserialize_read_enter(); + } else { +#ifdef USE_RADIX + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); continue; +#else + prio = mask_match(ep, (struct sockaddr *)&pack.mine, + (struct sockaddr *)&pack.yours); +#endif + } /* see encap4_lookup() for issues here */ - if (prio <= 0) + if (prio <= 0) { + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); continue; + } if (prio > matchprio) { + /* release last matched ep */ + if (match != NULL) + psref_release(match_psref, &match->psref, + encaptab.elem_class); + + psref_copy(match_psref, &elem_psref, + encaptab.elem_class); matchprio = prio; match = ep; } + KASSERTMSG((match == NULL) || psref_held(&match->psref, + encaptab.elem_class), + "current match = %p, but not hold its psref", match); + + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); } + pserialize_read_exit(s); return match; } @@ -329,16 +502,23 @@ encap6_input(struct mbuf **mp, int *offp, int proto) struct mbuf *m = *mp; const struct encapsw *esw; struct encaptab *match; + struct psref match_psref; - match = encap6_lookup(m, *offp, proto, INBOUND); + match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref); if (match) { /* found a match */ esw = match->esw; if (esw && esw->encapsw6.pr_input) { + int ret; encap_fillarg(m, match); - return (*esw->encapsw6.pr_input)(mp, offp, proto); + ret = (*esw->encapsw6.pr_input)(mp, offp, proto); + psref_release(&match_psref, &match->psref, + encaptab.elem_class); + return ret; } else { + psref_release(&match_psref, &match->psref, + encaptab.elem_class); m_freem(m); return IPPROTO_DONE; } @@ -349,39 +529,86 @@ encap6_input(struct mbuf **mp, int *offp, int proto) } #endif +/* + * XXX + * The encaptab list and the rnh radix tree must be manipulated atomically. + */ static int encap_add(struct encaptab *ep) { +#ifdef USE_RADIX struct radix_node_head *rnh = encap_rnh(ep->af); - int error = 0; +#endif + + KASSERT(encap_lock_held()); - LIST_INSERT_HEAD(&encaptab, ep, chain); +#ifdef USE_RADIX if (!ep->func && rnh) { + /* Disable access to the radix tree for reader. */ + encap_head_updating = true; + /* Wait for all readers to drain. */ + pserialize_perform(encaptab.psz); + if (!rnh->rnh_addaddr((void *)ep->addrpack, (void *)ep->maskpack, rnh, ep->nodes)) { - error = EEXIST; - goto fail; + encap_head_updating = false; + return EEXIST; } + + /* + * The ep added to the radix tree must be skipped while + * encap[46]_lookup walks encaptab list. In other words, + * encap_add() does not need to care whether the ep has + * been added encaptab list or not yet. + * So, we can re-enable access to the radix tree for now. + */ + encap_head_updating = false; } - return error; +#endif - fail: - LIST_REMOVE(ep, chain); - return error; + PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain); + + return 0; } +/* + * XXX + * The encaptab list and the rnh radix tree must be manipulated atomically. + */ static int encap_remove(struct encaptab *ep) { +#ifdef USE_RADIX struct radix_node_head *rnh = encap_rnh(ep->af); +#endif int error = 0; - LIST_REMOVE(ep, chain); + KASSERT(encap_lock_held()); + +#ifdef USE_RADIX if (!ep->func && rnh) { + /* Disable access to the radix tree for reader. */ + encap_head_updating = true; + /* Wait for all readers to drain. 
*/ + pserialize_perform(encaptab.psz); + if (!rnh->rnh_deladdr((void *)ep->addrpack, (void *)ep->maskpack, rnh)) error = ESRCH; + + /* + * The ep added to the radix tree must be skipped while + * encap[46]_lookup walks encaptab list. In other words, + * encap_add() does not need to care whether the ep has + * been added encaptab list or not yet. + * So, we can re-enable access to the radix tree for now. + */ + encap_head_updating = false; } +#endif + + PSLIST_WRITER_REMOVE(ep, chain); + return error; } @@ -433,21 +660,27 @@ encap_attach(int af, int proto, { struct encaptab *ep; int error; - int s; + int pss; size_t l; struct ip_pack4 *pack4; #ifdef INET6 struct ip_pack6 *pack6; #endif +#ifndef ENCAP_MPSAFE + int s; s = splsoftnet(); +#endif /* sanity check on args */ error = encap_afcheck(af, sp, dp); if (error) goto fail; /* check if anyone have already attached with exactly same config */ - LIST_FOREACH(ep, &encaptab, chain) { + pss = pserialize_read_enter(); + PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { + membar_datadep_consumer(); + if (ep->af != af) continue; if (ep->proto != proto) @@ -470,8 +703,10 @@ encap_attach(int af, int proto, continue; error = EEXIST; + pserialize_read_exit(pss); goto fail; } + pserialize_read_exit(pss); switch (af) { case AF_INET: @@ -534,13 +769,16 @@ encap_attach(int af, int proto, memcpy(ep->dstmask, dm, dp->sa_len); ep->esw = esw; ep->arg = arg; + psref_target_init(&ep->psref, encaptab.elem_class); error = encap_add(ep); if (error) goto gc; error = 0; +#ifndef ENCAP_MPSAFE splx(s); +#endif return ep; gc: @@ -551,7 +789,9 @@ gc: if (ep) kmem_free(ep, sizeof(*ep)); fail: +#ifndef ENCAP_MPSAFE splx(s); +#endif return NULL; } @@ -562,9 +802,11 @@ encap_attach_func(int af, int proto, { struct encaptab *ep; int error; +#ifndef ENCAP_MPSAFE int s; s = splsoftnet(); +#endif /* sanity check on args */ if (!func) { error = EINVAL; @@ -587,17 +829,22 @@ encap_attach_func(int af, int proto, ep->func = func; ep->esw = esw; ep->arg = arg; + psref_target_init(&ep->psref, encaptab.elem_class); error = encap_add(ep); if (error) goto fail; error = 0; +#ifndef ENCAP_MPSAFE splx(s); +#endif return ep; fail: +#ifndef ENCAP_MPSAFE splx(s); +#endif return NULL; } @@ -613,6 +860,7 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) int off; struct ip6ctlparam *ip6cp = NULL; int nxt; + int s; struct encaptab *ep; const struct encapsw *esw; @@ -640,13 +888,17 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) if (ip6 && cmd == PRC_MSGSIZE) { int valid = 0; struct encaptab *match; + struct psref elem_psref; /* * Check to see if we have a valid encap configuration. 
*/ - match = encap6_lookup(m, off, nxt, OUTBOUND); + match = encap6_lookup(m, off, nxt, OUTBOUND, + &elem_psref); if (match) valid++; + psref_release(&elem_psref, &match->psref, + encaptab.elem_class); /* * Depending on the value of "valid" and routing table @@ -664,7 +916,13 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) } /* inform all listeners */ - LIST_FOREACH(ep, &encaptab, chain) { + + s = pserialize_read_enter(); + PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { + struct psref elem_psref; + + membar_datadep_consumer(); + if (ep->af != AF_INET6) continue; if (ep->proto >= 0 && ep->proto != nxt) @@ -673,11 +931,16 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) /* should optimize by looking at address pairs */ /* XXX need to pass ep->arg or ep itself to listeners */ + psref_acquire(&elem_psref, &ep->psref, + encaptab.elem_class); esw = ep->esw; if (esw && esw->encapsw6.pr_ctlinput) { (*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg); } + psref_release(&elem_psref, &ep->psref, + encaptab.elem_class); } + pserialize_read_exit(s); rip6_ctlinput(cmd, sa, d0); return NULL; @@ -688,26 +951,39 @@ int encap_detach(const struct encaptab *cookie) { const struct encaptab *ep = cookie; - struct encaptab *p, *np; + struct encaptab *p; int error; - LIST_FOREACH_SAFE(p, &encaptab, chain, np) { + KASSERT(encap_lock_held()); + + PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) { + membar_datadep_consumer(); + if (p == ep) { error = encap_remove(p); if (error) return error; - if (!ep->func) { - kmem_free(p->addrpack, ep->addrpack->sa_len); - kmem_free(p->maskpack, ep->maskpack->sa_len); - } - kmem_free(p, sizeof(*p)); /*XXX*/ - return 0; + else + break; } } + if (p == NULL) + return ENOENT; + + pserialize_perform(encaptab.psz); + + psref_target_destroy(&p->psref, + encaptab.elem_class); + if (!ep->func) { + kmem_free(p->addrpack, ep->addrpack->sa_len); + kmem_free(p->maskpack, ep->maskpack->sa_len); + } + kmem_free(p, sizeof(*p)); - return ENOENT; + return 0; } +#ifdef USE_RADIX static struct radix_node_head * encap_rnh(int af) { @@ -741,6 +1017,63 @@ mask_matchlen(const struct sockaddr *sa) } return l; } +#endif + +#ifndef USE_RADIX +static int +mask_match(const struct encaptab *ep, + const struct sockaddr *sp, + const struct sockaddr *dp) +{ + struct sockaddr_storage s; + struct sockaddr_storage d; + int i; + const u_int8_t *p, *q; + u_int8_t *r; + int matchlen; + + KASSERTMSG(ep->func == NULL, "wrong encaptab passed to mask_match"); + + if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) + return 0; + if (sp->sa_family != ep->af || dp->sa_family != ep->af) + return 0; + if (sp->sa_len != ep->src->sa_len || dp->sa_len != ep->dst->sa_len) + return 0; + + matchlen = 0; + + p = (const u_int8_t *)sp; + q = (const u_int8_t *)ep->srcmask; + r = (u_int8_t *)&s; + for (i = 0 ; i < sp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX estimate */ + matchlen += (q[i] ? 8 : 0); + } + + p = (const u_int8_t *)dp; + q = (const u_int8_t *)ep->dstmask; + r = (u_int8_t *)&d; + for (i = 0 ; i < dp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX rough estimate */ + matchlen += (q[i] ? 
8 : 0); + } + + /* need to overwrite len/family portion as we don't compare them */ + s.ss_len = sp->sa_len; + s.ss_family = sp->sa_family; + d.ss_len = dp->sa_len; + d.ss_family = dp->sa_family; + + if (memcmp(&s, ep->src, ep->src->sa_len) == 0 && + memcmp(&d, ep->dst, ep->dst->sa_len) == 0) { + return matchlen; + } else + return 0; +} +#endif static void encap_fillarg(struct mbuf *m, const struct encaptab *ep) @@ -768,3 +1101,41 @@ encap_getarg(struct mbuf *m) } return p; } + +int +encap_lock_enter(void) +{ + int error; + + mutex_enter(&encap_whole.lock); + while (encap_whole.busy != NULL) { + error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock); + if (error) { + mutex_exit(&encap_whole.lock); + return error; + } + } + KASSERT(encap_whole.busy == NULL); + encap_whole.busy = curlwp; + mutex_exit(&encap_whole.lock); + + return 0; +} + +void +encap_lock_exit(void) +{ + + mutex_enter(&encap_whole.lock); + KASSERT(encap_whole.busy == curlwp); + encap_whole.busy = NULL; + cv_broadcast(&encap_whole.cv); + mutex_exit(&encap_whole.lock); +} + +bool +encap_lock_held(void) +{ + + return (encap_whole.busy == curlwp); +} diff --git a/sys/netinet/ip_encap.h b/sys/netinet/ip_encap.h index 13b14b3..42085ed 100644 --- a/sys/netinet/ip_encap.h +++ b/sys/netinet/ip_encap.h @@ -39,6 +39,9 @@ #include #endif +#include +#include + struct encapsw { union { struct encapsw4 { @@ -61,7 +64,7 @@ struct encapsw { struct encaptab { struct radix_node nodes[2]; - LIST_ENTRY(encaptab) chain; + struct pslist_entry chain; int af; int proto; /* -1: don't care, I'll check myself */ struct sockaddr *addrpack; /* malloc'ed, for radix lookup */ @@ -73,6 +76,7 @@ struct encaptab { int (*func) (struct mbuf *, int, int, void *); const struct encapsw *esw; void *arg; /* passed via PACKET_TAG_ENCAP */ + struct psref_target psref; }; /* to lookup a pair of address using radix tree */ @@ -93,6 +97,8 @@ struct ip_pack6 { struct sockaddr_in6 yours; }; +void encapinit(void); + void encap_init(void); void encap4_input(struct mbuf *, ...); int encap6_input(struct mbuf **, int *, int); @@ -106,6 +112,10 @@ void *encap6_ctlinput(int, const struct sockaddr *, void *); int encap_detach(const struct encaptab *); void *encap_getarg(struct mbuf *); +int encap_lock_enter(void); +void encap_lock_exit(void); +bool encap_lock_held(void); + #define ENCAP_PR_WRAP_CTLINPUT(name) \ static void * \ name##_wrapper(int a, const struct sockaddr *b, void *c, void *d) \ @@ -117,5 +127,4 @@ name##_wrapper(int a, const struct sockaddr *b, void *c, void *d) \ return rv; \ } #endif - #endif /* !_NETINET_IP_ENCAP_H_ */ diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c index dbc9c2e..de9762d 100644 --- a/sys/netinet/ip_mroute.c +++ b/sys/netinet/ip_mroute.c @@ -832,8 +832,12 @@ add_vif(struct vifctl *vifcp) * this requires both radix tree lookup and then a * function to check, and this is not supported yet. 
*/ + error = encap_lock_enter(); + if (error) + return error; vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, vif_encapcheck, &vif_encapsw, vifp); + encap_lock_exit(); if (!vifp->v_encap_cookie) return (EINVAL); @@ -929,7 +933,9 @@ reset_vif(struct vif *vifp) callout_stop(&vifp->v_repq_ch); /* detach this vif from decapsulator dispatch table */ + encap_lock_enter(); encap_detach(vifp->v_encap_cookie); + encap_lock_exit(); vifp->v_encap_cookie = NULL; /* diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c index ecf6d02..290c8c1 100644 --- a/sys/netinet6/in6_gif.c +++ b/sys/netinet6/in6_gif.c @@ -215,7 +215,8 @@ in6_gif_input(struct mbuf **mp, int *offp, int proto) gifp = (struct ifnet *)encap_getarg(m); - if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { + if (gifp == NULL || (gifp->if_flags & (IFF_UP|IFF_RUNNING)) + != (IFF_UP|IFF_RUNNING)) { m_freem(m); IP6_STATINC(IP6_STAT_NOGIF); return IPPROTO_DONE; @@ -386,12 +387,22 @@ in6_gif_detach(struct gif_softc *sc) { int error; + error = in6_gif_pause(sc); + + rtcache_free(&sc->gif_ro); + + return error; +} + +int +in6_gif_pause(struct gif_softc *sc) +{ + int error; + error = encap_detach(sc->encap_cookie6); if (error == 0) sc->encap_cookie6 = NULL; - rtcache_free(&sc->gif_ro); - return error; } diff --git a/sys/netinet6/in6_gif.h b/sys/netinet6/in6_gif.h index e59985c..081a2fb 100644 --- a/sys/netinet6/in6_gif.h +++ b/sys/netinet6/in6_gif.h @@ -45,6 +45,7 @@ int gif_encapcheck6(struct mbuf *, int, int, void *); #endif int in6_gif_attach(struct gif_softc *); int in6_gif_detach(struct gif_softc *); +int in6_gif_pause(struct gif_softc *); void *in6_gif_ctlinput(int, const struct sockaddr *, void *, void *); #endif /* !_NETINET6_IN6_GIF_H_ */ diff --git a/sys/netipsec/xform_ipip.c b/sys/netipsec/xform_ipip.c index 2c2a0a4..8dade5e 100644 --- a/sys/netipsec/xform_ipip.c +++ b/sys/netipsec/xform_ipip.c @@ -725,6 +725,11 @@ ipe4_attach(void) xform_register(&ipe4_xformsw); /* attach to encapsulation framework */ /* XXX save return cookie for detach on module remove */ + + encapinit(); + /* This function is called before ifinit(). Who else gets lock? */ + (void)encap_lock_enter(); + /* ipe4_encapsw and ipe4_encapsw must be added atomically */ #ifdef INET (void) encap_attach_func(AF_INET, -1, ipe4_encapcheck, &ipe4_encapsw, NULL); @@ -733,6 +738,7 @@ ipe4_attach(void) (void) encap_attach_func(AF_INET6, -1, ipe4_encapcheck, &ipe4_encapsw6, NULL); #endif + encap_lock_exit(); } #ifdef SYSINIT
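
Usage note: with this patch, every writer of the encap table (encap_attach(), encap_attach_func(), encap_detach()) must be serialized by the new encap_lock_enter()/encap_lock_exit() API, as the ip_mroute.c and if_stf.c hunks above do; the data-path readers (encap[46]_lookup()) take no lock and rely on pserialize(9)/psref(9) instead. Below is a minimal caller-side sketch of that contract, not part of the patch itself; "example_softc", "example_encapcheck" and "example_encapsw" are hypothetical names used only for illustration.

/*
 * Caller-side sketch of the ip_encap attach/detach pattern introduced
 * by this patch.  The softc, encapcheck function and encapsw below are
 * hypothetical stand-ins, not part of the patch.
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_encap.h>

struct example_softc {
	const struct encaptab *sc_encap_cookie;
};

extern int example_encapcheck(struct mbuf *, int, int, void *);
extern const struct encapsw example_encapsw;

static int
example_encap_attach(struct example_softc *sc)
{
	int error;

	/* Serialize with other encap table writers; may sleep. */
	error = encap_lock_enter();
	if (error)
		return error;
	sc->sc_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
	    example_encapcheck, &example_encapsw, sc);
	encap_lock_exit();

	return (sc->sc_encap_cookie == NULL) ? EINVAL : 0;
}

static int
example_encap_detach(struct example_softc *sc)
{
	int error;

	error = encap_lock_enter();
	if (error)
		return error;
	/* encap_detach() asserts encap_lock_held(). */
	error = encap_detach(sc->sc_encap_cookie);
	if (error == 0)
		sc->sc_encap_cookie = NULL;
	encap_lock_exit();

	return error;
}

encap_lock_enter() can sleep and can fail if interrupted by a signal (it uses cv_wait_sig()), so callers check its return value before touching the table; on the read side, encap_detach() waits out concurrent lookups with pserialize_perform() and psref_target_destroy() before freeing the entry.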