diff --git a/sys/net/bpf.c b/sys/net/bpf.c
index f66e783..5767421 100644
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@@ -121,18 +121,41 @@ struct bpfjit_ops bpfjit_module_ops = {
 struct bpf_stat	bpf_gstats;
 
 /*
- * Use a mutex to avoid a race condition between gathering the stats/peers
- * and opening/closing the device.
- */
-static kmutex_t bpf_mtx;
-
-/*
  * bpf_iflist is the list of interfaces; each corresponds to an ifnet
 * bpf_dtab holds the descriptors, indexed by minor device #
 */
 struct bpf_if	*bpf_iflist;
 LIST_HEAD(, bpf_d) bpf_list;
 
+/* Global lock */
+static kmutex_t bpf_mtx __cacheline_aligned;
+
+/*
+ * Lock order: bpf_mtx (adaptive) => bpf_if#bif_lock (spin) => bpf_d#bd_lock (spin)
+ *
+ * - bpf_gstats and bpf_iflist are protected by bpf_mtx
+ * - bpf_if#bif_dlist (and bpf_d#bd_next) is protected by bif_lock
+ *   - The list may be accessed from HW interrupt
+ * - bpf_d itself and its members are protected by bd_lock
+ *   - bd_lock is a spin lock because it has to be used in HW interrupts
+ *   - bd_lock has to be released during operations that may block or sleep,
+ *     so we have to prevent bpf_d from being unexpectedly freed or modified
+ *     during such operations
+ *   - To do so, we have two reference counters (bpf_refcount): one for
+ *     bpf_d itself and the other for bd_hbuf
+ * - Some bpf_d members are also protected by bpf_mtx
+ *   - Modifying bpf_d#bd_bif and bpf_d#bd_promisc is protected by bpf_mtx
+ *     in addition to bd_lock
+ *   - So references to them are safe while bpf_mtx is held, just as they
+ *     are while d->bd_lock is held and d->bd_bif != NULL has been checked
+ * - bpf_d#bd_dying has to be checked whenever bpf_d is accessed without
+ *   holding bpf_mtx
+ * - Modifying the bpf_if list (bpf_if#bif_next) and the bpf_d list
+ *   (bpf_d#bd_next) must be done while holding bpf_mtx
+ * - Hot paths shouldn't hold bpf_mtx; for example, bpf_read, bpf_write
+ *   and bpf_deliver should hold only bif_lock and bd_lock.
+ */
+
 static int	bpf_allocbufs(struct bpf_d *);
 static void	bpf_deliver(struct bpf_if *,
 		    void *(*cpfn)(void *, const void *, size_t),
@@ -143,8 +166,9 @@ static void *bpf_mcpy(void *, const void *, size_t);
 static int	bpf_movein(struct uio *, int, uint64_t,
 		    struct mbuf **, struct sockaddr *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
-static void	bpf_detachd(struct bpf_d *);
+static void	bpf_detachd(struct bpf_d *, bool);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
+static int	bpf_setf(struct bpf_d *, struct bpf_program *);
 static void	bpf_timed_out(void *);
 static inline void
 		bpf_wakeup(struct bpf_d *);
@@ -152,9 +176,11 @@ static int bpf_hdrlen(struct bpf_d *);
 static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
     void *(*)(void *, const void *, size_t), struct timespec *);
 static void	reset_d(struct bpf_d *);
-static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
+static void	reset_d_locked(struct bpf_d *);
+static int	bpf_getdltlist(struct ifnet *, struct bpf_dltlist *);
 static int	bpf_setdlt(struct bpf_d *, u_int);
 
+
 static int	bpf_read(struct file *, off_t *, struct uio *, kauth_cred_t,
     int);
 static int	bpf_write(struct file *, off_t *, struct uio *, kauth_cred_t,
@@ -165,6 +191,7 @@ static int bpf_stat(struct file *, struct stat *);
 static int	bpf_close(struct file *);
 static int	bpf_kqfilter(struct file *, struct knote *);
 static void	bpf_softintr(void *);
+static int	bpf_ifpromisc(struct ifnet *, int);
 
 static const struct fileops bpf_fileops = {
 	.fo_read = bpf_read,
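The lock hierarchy spelled out above is easier to absorb as code. The fragment below is an illustrative sketch only, not part of the patch (example_lock_order is a hypothetical function); it shows the prescribed nesting, outermost to innermost:

/*
 * Sketch: acquire the three locks in the documented order.  The
 * adaptive bpf_mtx may sleep, so it must never be taken while either
 * spin lock is held.
 */
static void
example_lock_order(struct bpf_if *bp, struct bpf_d *d)
{
	mutex_enter(&bpf_mtx);		/* 1: global, adaptive */
	mutex_enter(bp->bif_lock);	/* 2: per-interface, spin */
	mutex_enter(d->bd_lock);	/* 3: per-descriptor, spin */
	/* ... safe to touch bif_dlist, bd_bif, bd_promisc here ... */
	mutex_exit(d->bd_lock);
	mutex_exit(bp->bif_lock);
	mutex_exit(&bpf_mtx);
}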
@@ -335,41 +362,96 @@ bad:
 
 /*
 * Attach file to the bpf interface, i.e. make d listen on bp.
- * Must be called at splnet.
 */
 static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
+	KASSERT(mutex_owned(&bpf_mtx));
 	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 */
+	mutex_enter(bp->bif_lock);
+	mutex_enter(d->bd_lock);
 	d->bd_bif = bp;
 	d->bd_next = bp->bif_dlist;
+	mutex_exit(d->bd_lock);
 	bp->bif_dlist = d;
+	/* Something like ifp->if_bpf = bp */
 	*bp->bif_driverp = bp;
+	mutex_exit(bp->bif_lock);
+}
+
+static int
+bpf_ifpromisc(struct ifnet *ifp, int pswitch)
+{
+	int s;
+	int error;
+
+	KASSERT(mutex_owned(&bpf_mtx));
+
+	KERNEL_LOCK(1, NULL);
+	s = splnet();
+	error = ifpromisc(ifp, pswitch);
+	splx(s);
+	KERNEL_UNLOCK_ONE(NULL);
+
+	return error;
 }
 
 /*
 * Detach a file from its interface.
 */
 static void
-bpf_detachd(struct bpf_d *d)
+bpf_detachd(struct bpf_d *d, bool dont_ifpromisc)
 {
 	struct bpf_d **p;
 	struct bpf_if *bp;
+	struct ifnet *ifp;
+	struct bpf_if **driverp;
+	bool turn_promisc_off = false;
 
+	KASSERT(mutex_owned(&bpf_mtx));
+
+	mutex_enter(d->bd_lock);
 	bp = d->bd_bif;
+	driverp = d->bd_bif->bif_driverp;
+	d->bd_bif = NULL;
+	if (d->bd_promisc && !dont_ifpromisc)
+		turn_promisc_off = true;
+	d->bd_promisc = 0;
+	mutex_exit(d->bd_lock);
+
+	KASSERT(bp != NULL);
+	KASSERT(driverp != NULL);
+
+	/* Remove d from the interface's descriptor list. */
+	mutex_enter(bp->bif_lock);
+	p = &bp->bif_dlist;
+	while (*p != d) {
+		p = &(*p)->bd_next;
+		if (*p == NULL)
+			panic("%s: descriptor not in list", __func__);
+	}
+	*p = (*p)->bd_next;
+	if (bp->bif_dlist == NULL)
+		/*
+		 * This is something like ifp->if_bpf = NULL.
+		 * Let the driver know that there are no more listeners.
+		 */
+		*driverp = NULL;
+	ifp = bp->bif_ifp;
+	mutex_exit(bp->bif_lock);
+
 	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.
	 */
-	if (d->bd_promisc) {
+	if (turn_promisc_off) {
 		int error __diagused;
 
-		d->bd_promisc = 0;
 		/*
		 * Take device out of promiscuous mode.  Since we were
		 * able to enter promiscuous mode, we should be able
@@ -377,26 +459,12 @@ bpf_detachd(struct bpf_d *d)
		 * the interface was configured down, so only panic
		 * if we don't get an unexpected error.
		 */
-		error = ifpromisc(bp->bif_ifp, 0);
+		error = bpf_ifpromisc(ifp, 0);
 #ifdef DIAGNOSTIC
 		if (error)
 			printf("%s: ifpromisc failed: %d", __func__, error);
 #endif
 	}
-	/* Remove d from the interface's descriptor list. */
-	p = &bp->bif_dlist;
-	while (*p != d) {
-		p = &(*p)->bd_next;
-		if (*p == NULL)
-			panic("%s: descriptor not in list", __func__);
-	}
-	*p = (*p)->bd_next;
-	if (bp->bif_dlist == NULL)
-		/*
-		 * Let the driver know that there are no more listeners.
-		 */
-		*d->bd_bif->bif_driverp = NULL;
-	d->bd_bif = NULL;
 }
 
 static int
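A note on the shape of bpf_detachd() above: ifpromisc() may call into the driver and sleep, so it cannot run under the spin locks. The sketch below (example_deferred_promisc is hypothetical, for illustration only) condenses the pattern: decide under bd_lock, drop it, then perform the sleeping call with only bpf_mtx held, as bpf_ifpromisc() asserts:

static void
example_deferred_promisc(struct bpf_d *d, struct ifnet *ifp)
{
	bool turn_off;

	KASSERT(mutex_owned(&bpf_mtx));

	mutex_enter(d->bd_lock);	/* spin lock: no sleeping here */
	turn_off = (d->bd_promisc != 0);
	d->bd_promisc = 0;
	mutex_exit(d->bd_lock);

	if (turn_off)
		(void)bpf_ifpromisc(ifp, 0);	/* may sleep */
}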
@@ -426,6 +494,51 @@ bpfilterattach(int n)
 	RUN_ONCE(&control, doinit);
 }
 
+static void
+bpf_refcount_init(struct bpf_refcount *br, const char *wmesg)
+{
+	cv_init(&br->br_cv, wmesg);
+	br->br_refs = 0;
+	br->br_waiting = false;
+}
+
+static void
+bpf_refcount_destroy(struct bpf_refcount *br)
+{
+	cv_destroy(&br->br_cv);
+}
+
+static void
+bpf_refcount_grab(struct bpf_refcount *br)
+{
+	br->br_refs++;
+}
+
+static void
+bpf_refcount_release(struct bpf_refcount *br)
+{
+	br->br_refs--;
+	if (br->br_refs == 0 && br->br_waiting) {
+		cv_broadcast(&br->br_cv);
+	}
+}
+
+static int
+bpf_refcount_refs(struct bpf_refcount *br)
+{
+	return br->br_refs;
+}
+
+static void
+bpf_refcount_wait(struct bpf_refcount *br, kmutex_t *lock)
+{
+	while (br->br_refs > 0) {
+		br->br_waiting = true;
+		cv_wait(&br->br_cv, lock);
+	}
+	br->br_waiting = false;
+}
+
 /*
 * Open ethernet device.  Clones.
 */
@@ -454,12 +567,15 @@ bpfopen(dev_t dev, int flag, int mode, struct lwp *l)
 	d->bd_atime = d->bd_mtime = d->bd_btime;
 	callout_init(&d->bd_callout, 0);
 	selinit(&d->bd_sel);
-	d->bd_sih = softint_establish(SOFTINT_CLOCK, bpf_softintr, d);
+	d->bd_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
+	    bpf_softintr, d);
 	d->bd_jitcode = NULL;
-
-	mutex_enter(&bpf_mtx);
-	LIST_INSERT_HEAD(&bpf_list, d, bd_list);
-	mutex_exit(&bpf_mtx);
+	d->bd_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
+	bpf_refcount_init(&d->bd_refcount, "bd_refcount");
+	bpf_refcount_init(&d->bd_hbuf_refcount, "bd_hbuf_refcount");
+	d->bd_dying = false;
+	cv_init(&d->bd_read_cv, "bpf_read_cv");
+	d->bd_read_waiting = false;
 
 	return fd_clone(fp, fd, flag, &bpf_fileops, d);
 }
@@ -473,33 +589,44 @@ static int
 bpf_close(struct file *fp)
 {
 	struct bpf_d *d = fp->f_bpf;
-	int s;
 
-	KERNEL_LOCK(1, NULL);
+	mutex_enter(&bpf_mtx);
+	mutex_enter(d->bd_lock);
 	/*
	 * Refresh the PID associated with this bpf file.
	 */
 	d->bd_pid = curproc->p_pid;
 
-	s = splnet();
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
+
+	bpf_refcount_wait(&d->bd_refcount, d->bd_lock);
+	d->bd_dying = true;
+	mutex_exit(d->bd_lock);
+
 	if (d->bd_bif)
-		bpf_detachd(d);
-	splx(s);
+		bpf_detachd(d, false);
+
+	mutex_enter(d->bd_lock);
+	bpf_refcount_wait(&d->bd_refcount, d->bd_lock);
 	bpf_freed(d);
-	mutex_enter(&bpf_mtx);
-	LIST_REMOVE(d, bd_list);
-	mutex_exit(&bpf_mtx);
+	mutex_exit(d->bd_lock);
+
 	callout_destroy(&d->bd_callout);
 	seldestroy(&d->bd_sel);
 	softint_disestablish(d->bd_sih);
+
+	bpf_refcount_destroy(&d->bd_refcount);
+	bpf_refcount_destroy(&d->bd_hbuf_refcount);
+	cv_destroy(&d->bd_read_cv);
+	mutex_obj_free(d->bd_lock);
+
 	free(d, M_DEVBUF);
 	fp->f_bpf = NULL;
-	KERNEL_UNLOCK_ONE(NULL);
+	mutex_exit(&bpf_mtx);
 
 	return (0);
 }
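The bpf_refcount helpers above are deliberately simple: br_refs is a plain int, so grab, release and wait must all run under the same mutex that the waiter passes to cv_wait(). A minimal usage sketch (example_blocking_op is hypothetical, not patch code); bpf_close() pairs with it by calling bpf_refcount_wait() on bd_refcount before tearing the descriptor down:

static void
example_blocking_op(struct bpf_d *d)
{
	mutex_enter(d->bd_lock);
	bpf_refcount_grab(&d->bd_refcount);	/* pin d across the sleep */
	mutex_exit(d->bd_lock);

	/* ... an operation that may sleep, e.g. uiomove() ... */

	mutex_enter(d->bd_lock);
	bpf_refcount_release(&d->bd_refcount);	/* wakes a waiting bpf_close() */
	mutex_exit(d->bd_lock);
}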
@@ -525,18 +652,25 @@ bpf_read(struct file *fp, off_t *offp, struct uio *uio,
 	struct bpf_d *d = fp->f_bpf;
 	int timed_out;
 	int error;
-	int s;
+
+	mutex_enter(d->bd_lock);
+
+	if (d->bd_dying) {
+		mutex_exit(d->bd_lock);
+		return ENXIO;
+	}
 
 	getnanotime(&d->bd_atime);
 	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
-	if (uio->uio_resid != d->bd_bufsize)
+	if (uio->uio_resid != d->bd_bufsize) {
+		mutex_exit(d->bd_lock);
 		return (EINVAL);
+	}
+	bpf_refcount_grab(&d->bd_refcount);
 
-	KERNEL_LOCK(1, NULL);
-	s = splnet();
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	timed_out = (d->bd_state == BPF_TIMED_OUT);
@@ -549,9 +683,8 @@ bpf_read(struct file *fp, off_t *offp, struct uio *uio,
 	while (d->bd_hbuf == NULL) {
 		if (fp->f_flag & FNONBLOCK) {
 			if (d->bd_slen == 0) {
-				splx(s);
-				KERNEL_UNLOCK_ONE(NULL);
-				return (EWOULDBLOCK);
+				error = EWOULDBLOCK;
+				goto done;
 			}
 			ROTATE_BUFFERS(d);
 			break;
@@ -566,13 +699,13 @@ bpf_read(struct file *fp, off_t *offp, struct uio *uio,
 			ROTATE_BUFFERS(d);
 			break;
 		}
-		error = tsleep(d, PRINET|PCATCH, "bpf",
-		    d->bd_rtout);
-		if (error == EINTR || error == ERESTART) {
-			splx(s);
-			KERNEL_UNLOCK_ONE(NULL);
-			return (error);
-		}
+
+		d->bd_read_waiting = true;
+		error = cv_timedwait_sig(&d->bd_read_cv, d->bd_lock, d->bd_rtout);
+		d->bd_read_waiting = false;
+		if (error == EINTR || error == ERESTART)
+			goto done;
+
 		if (error == EWOULDBLOCK) {
 			/*
			 * On a timeout, return what's in the buffer,
@@ -588,9 +721,8 @@ bpf_read(struct file *fp, off_t *offp, struct uio *uio,
 				break;
 
 			if (d->bd_slen == 0) {
-				splx(s);
-				KERNEL_UNLOCK_ONE(NULL);
-				return (0);
+				error = 0;
+				goto done;
 			}
 			ROTATE_BUFFERS(d);
 			break;
@@ -601,7 +733,8 @@ bpf_read(struct file *fp, off_t *offp, struct uio *uio,
 	/*
	 * At this point, we know we have something in the hold slot.
	 */
-	splx(s);
+	bpf_refcount_grab(&d->bd_hbuf_refcount);
+	mutex_exit(d->bd_lock);
 
 	/*
	 * Move data from hold buffer into user space.
@@ -610,14 +743,16 @@ bpf_read(struct file *fp, off_t *offp, struct uio *uio,
	 */
 	error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
 
-	s = splnet();
+	mutex_enter(d->bd_lock);
 	d->bd_fbuf = d->bd_hbuf;
 	d->bd_hbuf = NULL;
 	d->bd_hlen = 0;
+	bpf_refcount_release(&d->bd_hbuf_refcount);
 done:
-	splx(s);
-	KERNEL_UNLOCK_ONE(NULL);
-	return (error);
+	bpf_refcount_release(&d->bd_refcount);
+	mutex_exit(d->bd_lock);
+
+	return error;
 }
 
@@ -627,7 +762,10 @@ done:
 static inline void
 bpf_wakeup(struct bpf_d *d)
 {
-	wakeup(d);
+	KASSERT(mutex_owned(d->bd_lock));
+
+	if (d->bd_read_waiting)
+		cv_broadcast(&d->bd_read_cv);
 	if (d->bd_async)
 		softint_schedule(d->bd_sih);
 	selnotify(&d->bd_sel, 0, 0);
@@ -636,26 +774,30 @@ bpf_wakeup(struct bpf_d *d)
 static void
 bpf_softintr(void *cookie)
 {
-	struct bpf_d *d;
+	struct bpf_d *d = cookie;
+	pid_t pgid = -1;
 
-	d = cookie;
-	if (d->bd_async)
-		fownsignal(d->bd_pgid, SIGIO, 0, 0, NULL);
+	mutex_enter(d->bd_lock);
+	if (!d->bd_dying && d->bd_async)
+		pgid = d->bd_pgid;
+	mutex_exit(d->bd_lock);
+
+	if (pgid != -1)
+		fownsignal(pgid, SIGIO, 0, 0, NULL);
 }
 
 static void
 bpf_timed_out(void *arg)
 {
 	struct bpf_d *d = arg;
-	int s;
 
-	s = splnet();
-	if (d->bd_state == BPF_WAITING) {
+	mutex_enter(d->bd_lock);
+	if (!d->bd_dying && d->bd_state == BPF_WAITING) {
 		d->bd_state = BPF_TIMED_OUT;
 		if (d->bd_slen != 0)
 			bpf_wakeup(d);
 	}
-	splx(s);
+	mutex_exit(d->bd_lock);
 }
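For reference, the condvar idiom that replaces tsleep() in bpf_read() above, shown in isolation (a sketch, not patch code; example_wait_for_hold_buffer is hypothetical). cv_timedwait_sig() atomically drops bd_lock while sleeping, retakes it before returning, and returns 0 on wakeup, EWOULDBLOCK on timeout, or EINTR/ERESTART when interrupted by a signal:

static int
example_wait_for_hold_buffer(struct bpf_d *d)
{
	int error = 0;

	KASSERT(mutex_owned(d->bd_lock));

	while (d->bd_hbuf == NULL && error == 0) {
		d->bd_read_waiting = true;	/* lets bpf_wakeup() skip idle cvs */
		error = cv_timedwait_sig(&d->bd_read_cv, d->bd_lock,
		    d->bd_rtout);
		d->bd_read_waiting = false;
	}
	return error;
}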
@@ -668,35 +810,43 @@ bpf_write(struct file *fp, off_t *offp, struct uio *uio,
 	struct mbuf *m, *mc;
 	int error, s;
 	static struct sockaddr_storage dst;
+	u_int dlt;
 
 	m = NULL;	/* XXX gcc */
+	mc = NULL;	/* XXX gcc */
 
-	KERNEL_LOCK(1, NULL);
+	mutex_enter(d->bd_lock);
 
-	if (d->bd_bif == NULL) {
-		KERNEL_UNLOCK_ONE(NULL);
-		return (ENXIO);
+	if (d->bd_bif == NULL || d->bd_dying) {
+		mutex_exit(d->bd_lock);
+		return ENXIO;
 	}
+
 	getnanotime(&d->bd_mtime);
 
 	ifp = d->bd_bif->bif_ifp;
+	dlt = d->bd_bif->bif_dlt;
 
 	if (uio->uio_resid == 0) {
-		KERNEL_UNLOCK_ONE(NULL);
-		return (0);
+		mutex_exit(d->bd_lock);
+		return 0;
 	}
 
-	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp->if_mtu, &m,
+	bpf_refcount_grab(&d->bd_refcount);
+	mutex_exit(d->bd_lock);
+
+	error = bpf_movein(uio, (int)dlt, ifp->if_mtu, &m,
 	    (struct sockaddr *) &dst);
-	if (error) {
-		KERNEL_UNLOCK_ONE(NULL);
-		return (error);
-	}
+
+	mutex_enter(d->bd_lock);
+
+	if (error)
+		goto unlock;
 
 	if (m->m_pkthdr.len > ifp->if_mtu) {
-		KERNEL_UNLOCK_ONE(NULL);
 		m_freem(m);
-		return (EMSGSIZE);
+		error = EMSGSIZE;
+		goto unlock;
 	}
 
 	if (d->bd_hdrcmplt)
@@ -712,6 +862,15 @@ bpf_write(struct file *fp, off_t *offp, struct uio *uio,
 	} else
 		mc = NULL;
 
+	error = 0;
+unlock:
+	bpf_refcount_release(&d->bd_refcount);
+	mutex_exit(d->bd_lock);
+
+	if (error)
+		return error;
+
+	KERNEL_LOCK(1, NULL);
 	s = splsoftnet();
 	error = (*ifp->if_output)(ifp, m, (struct sockaddr *) &dst, NULL);
@@ -733,8 +892,12 @@ bpf_write(struct file *fp, off_t *offp, struct uio *uio,
 * receive and drop counts.  Should be called at splnet.
 */
 static void
-reset_d(struct bpf_d *d)
+reset_d_locked(struct bpf_d *d)
 {
+	KASSERT(mutex_owned(d->bd_lock));
+
+	bpf_refcount_wait(&d->bd_hbuf_refcount, d->bd_lock);
+
 	if (d->bd_hbuf) {
 		/* Free the hold buffer. */
 		d->bd_fbuf = d->bd_hbuf;
@@ -747,6 +910,16 @@ reset_d(struct bpf_d *d)
 	d->bd_ccount = 0;
 }
 
+static void
+reset_d(struct bpf_d *d)
+{
+	KASSERT(mutex_owned(&bpf_mtx));
+
+	mutex_enter(d->bd_lock);
+	reset_d_locked(d);
+	mutex_exit(d->bd_lock);
+}
+
 /*
 * FIONREAD		Check for read packet available.
 * BIOCGBLEN		Get buffer len [for read()].
@@ -773,12 +946,19 @@ static int
 bpf_ioctl(struct file *fp, u_long cmd, void *addr)
 {
 	struct bpf_d *d = fp->f_bpf;
-	int s, error = 0;
+	int error = 0;
+	bool need_unlock = true;
+
+	mutex_enter(d->bd_lock);
+
+	if (d->bd_dying) {
+		mutex_exit(d->bd_lock);
+		return ENXIO;
+	}
 
 	/*
	 * Refresh the PID associated with this bpf file.
	 */
-	KERNEL_LOCK(1, NULL);
 	d->bd_pid = curproc->p_pid;
 #ifdef _LP64
 	if (curproc->p_flag & PK_32)
@@ -787,11 +967,9 @@ bpf_ioctl(struct file *fp, u_long cmd, void *addr)
 		d->bd_compat32 = 0;
 #endif
 
-	s = splnet();
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
-	splx(s);
 
 	switch (cmd) {
 
@@ -802,19 +980,18 @@ bpf_ioctl(struct file *fp, u_long cmd, void *addr)
 	/*
	 * Check for read packet available.
	 */
-	case FIONREAD:
-	{
-		int n;
+	case FIONREAD: {
+		int n;
 
-		s = splnet();
-		n = d->bd_slen;
-		if (d->bd_hbuf)
-			n += d->bd_hlen;
-		splx(s);
+		bpf_refcount_wait(&d->bd_hbuf_refcount, d->bd_lock);
+		n = d->bd_slen;
+		if (d->bd_hbuf)
+			n += d->bd_hlen;
 
-		*(int *)addr = n;
-		break;
-	}
+		*(int *)addr = n;
+
+		break;
+	}
 
 	/*
	 * Get buffer len [for read()].
@@ -844,37 +1021,49 @@ bpf_ioctl(struct file *fp, u_long cmd, void *addr)
	 * Set link layer read filter.
	 */
 	case BIOCSETF:
+		bpf_refcount_grab(&d->bd_refcount);
+		mutex_exit(d->bd_lock);
+
+		mutex_enter(&bpf_mtx);
 		error = bpf_setf(d, addr);
+		mutex_exit(&bpf_mtx);
+
+		mutex_enter(d->bd_lock);
+		bpf_refcount_release(&d->bd_refcount);
 		break;
 
 	/*
	 * Flush read packet buffer.
	 */
 	case BIOCFLUSH:
-		s = splnet();
-		reset_d(d);
-		splx(s);
+		reset_d_locked(d);
 		break;
 
 	/*
	 * Put interface into promiscuous mode.
	 */
-	case BIOCPROMISC:
+	case BIOCPROMISC: {
+		struct ifnet *ifp;
 		if (d->bd_bif == NULL) {
-			/*
-			 * No interface attached yet.
-			 */
 			error = EINVAL;
 			break;
 		}
-		s = splnet();
+		ifp = d->bd_bif->bif_ifp;
+		bpf_refcount_grab(&d->bd_refcount);
+		mutex_exit(d->bd_lock);
+
+		mutex_enter(&bpf_mtx);
 		if (d->bd_promisc == 0) {
-			error = ifpromisc(d->bd_bif->bif_ifp, 1);
+			error = bpf_ifpromisc(ifp, 1);
 			if (error == 0)
 				d->bd_promisc = 1;
 		}
-		splx(s);
+		mutex_exit(&bpf_mtx);
+
+		mutex_enter(d->bd_lock);
+		bpf_refcount_release(&d->bd_refcount);
 		break;
+	}
 
 	/*
	 * Get device parameters.
@@ -889,22 +1078,37 @@ bpf_ioctl(struct file *fp, u_long cmd, void *addr)
 	/*
	 * Get a list of supported device parameters.
	 */
-	case BIOCGDLTLIST:
-		if (d->bd_bif == NULL)
+	case BIOCGDLTLIST: {
+		struct ifnet *ifp;
+		if (d->bd_bif == NULL) {
 			error = EINVAL;
-		else
-			error = bpf_getdltlist(d, addr);
+			break;
+		}
+		ifp = d->bd_bif->bif_ifp;
+		mutex_exit(d->bd_lock);
+		need_unlock = false;
+
+		mutex_enter(&bpf_mtx);
+		error = bpf_getdltlist(ifp, addr);
+		mutex_exit(&bpf_mtx);
 		break;
+	}
 
 	/*
	 * Set device parameters.
	 */
-	case BIOCSDLT:
-		if (d->bd_bif == NULL)
-			error = EINVAL;
-		else
-			error = bpf_setdlt(d, *(u_int *)addr);
+	case BIOCSDLT: {
+		bpf_refcount_grab(&d->bd_refcount);
+		mutex_exit(d->bd_lock);
+
+		mutex_enter(&bpf_mtx);
+		error = bpf_setdlt(d, *(u_int *)addr);
+		mutex_exit(&bpf_mtx);
+
+		mutex_enter(d->bd_lock);
+		bpf_refcount_release(&d->bd_refcount);
 		break;
+	}
 
 	/*
	 * Set interface name.
@@ -926,7 +1130,15 @@ bpf_ioctl(struct file *fp, u_long cmd, void *addr)
 	case OBIOCSETIF:
 #endif
 	case BIOCSETIF:
+		bpf_refcount_grab(&d->bd_refcount);
+		mutex_exit(d->bd_lock);
+
+		mutex_enter(&bpf_mtx);
 		error = bpf_setif(d, addr);
+		mutex_exit(&bpf_mtx);
+
+		mutex_enter(d->bd_lock);
+		bpf_refcount_release(&d->bd_refcount);
 		break;
 
 	/*
@@ -1072,29 +1284,43 @@ bpf_ioctl(struct file *fp, u_long cmd, void *addr)
 	case TIOCSPGRP:		/* Process or group to send signals to */
 	case FIOSETOWN:
+		mutex_exit(d->bd_lock);
+		need_unlock = false;
+
+		mutex_enter(&bpf_mtx);
 		error = fsetown(&d->bd_pgid, cmd, addr);
+		mutex_exit(&bpf_mtx);
 		break;
 
 	case TIOCGPGRP:
 	case FIOGETOWN:
+		mutex_exit(d->bd_lock);
+		need_unlock = false;
+
+		mutex_enter(&bpf_mtx);
 		error = fgetown(d->bd_pgid, cmd, addr);
+		mutex_exit(&bpf_mtx);
 		break;
 	}
-	KERNEL_UNLOCK_ONE(NULL);
-	return (error);
+
+	if (need_unlock)
+		mutex_exit(d->bd_lock);
+
+	return error;
 }
 
 /*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  Returns EINVAL for bogus requests.
 */
-int
+static int
 bpf_setf(struct bpf_d *d, struct bpf_program *fp)
 {
 	struct bpf_insn *fcode, *old;
 	bpfjit_func_t jcode, oldj;
 	size_t flen, size;
-	int s;
+
+	KASSERT(mutex_owned(&bpf_mtx));
 
 	jcode = NULL;
 	flen = fp->bf_len;
@@ -1122,13 +1348,13 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp)
 		fcode = NULL;
 	}
 
-	s = splnet();
+	mutex_enter(d->bd_lock);
 	old = d->bd_filter;
 	d->bd_filter = fcode;
 	oldj = d->bd_jitcode;
 	d->bd_jitcode = jcode;
+	mutex_exit(d->bd_lock);
 	reset_d(d);
-	splx(s);
 
 	if (old) {
 		free(old, M_DEVBUF);
@@ -1150,7 +1376,10 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 {
 	struct bpf_if *bp;
 	char *cp;
-	int unit_seen, i, s, error;
+	int unit_seen, i, error;
+
+	KASSERT(mutex_owned(&bpf_mtx));
+	KASSERT(bpf_refcount_refs(&d->bd_refcount) > 0);
 
 	/*
	 * Make sure the provided name has a unit number, and default
@@ -1198,18 +1427,17 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 			if (error != 0)
 				return (error);
 			}
-			s = splnet();
+
 			if (bp != d->bd_bif) {
 				if (d->bd_bif)
 					/*
					 * Detach if attached to something else.
					 */
-					bpf_detachd(d);
+					bpf_detachd(d, false);
 
 				bpf_attachd(d, bp);
 			}
 			reset_d(d);
-			splx(s);
 			return (0);
 		}
 		/* Not found. */
@@ -1230,8 +1458,12 @@ bpf_stat(struct file *fp, struct stat *st)
 {
 	struct bpf_d *d = fp->f_bpf;
 
+	mutex_enter(d->bd_lock);
+	if (d->bd_dying) {
+		mutex_exit(d->bd_lock);
+		return ENXIO;
+	}
 	(void)memset(st, 0, sizeof(*st));
-	KERNEL_LOCK(1, NULL);
 	st->st_dev = makedev(cdevsw_lookup_major(&bpf_cdevsw), d->bd_pid);
 	st->st_atimespec = d->bd_atime;
 	st->st_mtimespec = d->bd_mtime;
@@ -1239,7 +1471,7 @@ bpf_stat(struct file *fp, struct stat *st)
 	st->st_uid = kauth_cred_geteuid(fp->f_cred);
 	st->st_gid = kauth_cred_getegid(fp->f_cred);
 	st->st_mode = S_IFCHR;
-	KERNEL_UNLOCK_ONE(NULL);
+	mutex_exit(d->bd_lock);
 	return 0;
 }
@@ -1255,13 +1487,17 @@ static int
 bpf_poll(struct file *fp, int events)
 {
 	struct bpf_d *d = fp->f_bpf;
-	int s = splnet();
 	int revents;
 
+	mutex_enter(d->bd_lock);
+	if (d->bd_dying) {
+		mutex_exit(d->bd_lock);
+		return ENXIO;
+	}
+	bpf_refcount_wait(&d->bd_hbuf_refcount, d->bd_lock);
 	/*
	 * Refresh the PID associated with this bpf file.
	 */
-	KERNEL_LOCK(1, NULL);
 	d->bd_pid = curproc->p_pid;
 
 	revents = events & (POLLOUT | POLLWRNORM);
@@ -1284,22 +1520,22 @@ bpf_poll(struct file *fp, int events)
 		}
 	}
 
-	KERNEL_UNLOCK_ONE(NULL);
-	splx(s);
-	return (revents);
+	mutex_exit(d->bd_lock);
+	return revents;
 }
 
 static void
 filt_bpfrdetach(struct knote *kn)
 {
 	struct bpf_d *d = kn->kn_hook;
-	int s;
 
-	KERNEL_LOCK(1, NULL);
-	s = splnet();
+	mutex_enter(d->bd_lock);
+	if (d->bd_dying) {
+		mutex_exit(d->bd_lock);
+		return;
+	}
 	SLIST_REMOVE(&d->bd_sel.sel_klist, kn, knote, kn_selnext);
-	splx(s);
-	KERNEL_UNLOCK_ONE(NULL);
+	mutex_exit(d->bd_lock);
 }
 
 static int
@@ -1308,12 +1544,18 @@ filt_bpfread(struct knote *kn, long hint)
 	struct bpf_d *d = kn->kn_hook;
 	int rv;
 
-	KERNEL_LOCK(1, NULL);
+	mutex_enter(d->bd_lock);
+	if (d->bd_dying) {
+		mutex_exit(d->bd_lock);
+		return ENXIO;
+	}
+	bpf_refcount_wait(&d->bd_hbuf_refcount, d->bd_lock);
 	kn->kn_data = d->bd_hlen;
 	if (d->bd_immediate)
 		kn->kn_data += d->bd_slen;
 	rv = (kn->kn_data > 0);
-	KERNEL_UNLOCK_ONE(NULL);
+	mutex_exit(d->bd_lock);
+
 	return rv;
 }
 
@@ -1325,9 +1567,13 @@ bpf_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct bpf_d *d = fp->f_bpf;
 	struct klist *klist;
-	int s;
 
-	KERNEL_LOCK(1, NULL);
+	mutex_enter(d->bd_lock);
+
+	if (d->bd_dying) {
+		mutex_exit(d->bd_lock);
+		return ENXIO;
+	}
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
@@ -1336,18 +1582,16 @@ bpf_kqfilter(struct file *fp, struct knote *kn)
 		break;
 
 	default:
-		KERNEL_UNLOCK_ONE(NULL);
-		return (EINVAL);
+		mutex_exit(d->bd_lock);
+		return EINVAL;
 	}
 
 	kn->kn_hook = d;
 
-	s = splnet();
 	SLIST_INSERT_HEAD(klist, kn, kn_selnext);
-	splx(s);
-	KERNEL_UNLOCK_ONE(NULL);
 
-	return (0);
+	mutex_exit(d->bd_lock);
+	return 0;
 }
 
 /*
@@ -1405,12 +1649,15 @@ bpf_deliver(struct bpf_if *bp, void *(*cpfn)(void *, const void *, size_t),
	 * The only problem that could arise here is that if two different
	 * interfaces shared any data.  This is not the case.
	 */
+	mutex_enter(bp->bif_lock);
 	for (struct bpf_d *d = bp->bif_dlist; d != NULL; d = d->bd_next) {
 		u_int slen;
 
-		if (!d->bd_seesent && !rcv) {
-			continue;
-		}
+		mutex_enter(d->bd_lock);
+
+		if (d->bd_dying || (!d->bd_seesent && !rcv))
+			goto next;
+
 		d->bd_rcount++;
 		bpf_gstats.bs_recv++;
 
@@ -1419,15 +1666,18 @@ bpf_deliver(struct bpf_if *bp, void *(*cpfn)(void *, const void *, size_t),
 		else
 			slen = bpf_filter_ext(bc, d->bd_filter, &args);
 
-		if (!slen) {
-			continue;
-		}
+		if (!slen)
+			goto next;
 		if (!gottime) {
 			gottime = true;
 			nanotime(&ts);
 		}
 		catchpacket(d, pkt, pktlen, slen, cpfn, &ts);
+
+	next:
+		mutex_exit(d->bd_lock);
 	}
+	mutex_exit(bp->bif_lock);
 }
 
 /*
@@ -1534,7 +1784,6 @@ _bpf_mtap_af(struct bpf_if *bp, uint32_t af, struct mbuf *m)
 static void
 _bpf_mtap_sl_in(struct bpf_if *bp, u_char *chdr, struct mbuf **m)
 {
-	int s;
 	u_char *hp;
 
 	M_PREPEND(*m, SLIP_HDRLEN, M_DONTWAIT);
@@ -1545,9 +1794,7 @@ _bpf_mtap_sl_in(struct bpf_if *bp, u_char *chdr, struct mbuf **m)
 	hp[SLX_DIR] = SLIPDIR_IN;
 	(void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
 
-	s = splnet();
 	_bpf_mtap(bp, *m);
-	splx(s);
 
 	m_adj(*m, SLIP_HDRLEN);
 }
@@ -1562,7 +1809,6 @@ _bpf_mtap_sl_out(struct bpf_if *bp, u_char *chdr, struct mbuf *m)
 {
 	struct mbuf m0;
 	u_char *hp;
-	int s;
 
 	m0.m_flags = 0;
 	m0.m_next = m;
@@ -1574,9 +1820,7 @@ _bpf_mtap_sl_out(struct bpf_if *bp, u_char *chdr, struct mbuf *m)
 	hp[SLX_DIR] = SLIPDIR_OUT;
 	(void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
 
-	s = splnet();
 	_bpf_mtap(bp, &m0);
-	splx(s);
 
 	m_freem(m);
 }
@@ -1615,6 +1859,15 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
 	int hdrlen = bpf_hdrlen(d);
 	int do_wakeup = 0;
 
+	KASSERT(mutex_owned(d->bd_lock));
+
+	if (bpf_refcount_refs(&d->bd_hbuf_refcount) > 0) {
+		/* We cannot wait here, so we have to drop the packet. */
+		++d->bd_dcount;
+		++bpf_gstats.bs_drop;
+		return;
+	}
+
 	++d->bd_ccount;
 	++bpf_gstats.bs_capt;
 	/*
@@ -1718,10 +1971,10 @@ static int
 bpf_allocbufs(struct bpf_d *d)
 {
 
-	d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_WAITOK | M_CANFAIL);
+	d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
 	if (!d->bd_fbuf)
 		return (ENOBUFS);
-	d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_WAITOK | M_CANFAIL);
+	d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
 	if (!d->bd_sbuf) {
 		free(d->bd_fbuf, M_DEVBUF);
 		return (ENOBUFS);
@@ -1738,6 +1991,7 @@ bpf_allocbufs(struct bpf_d *d)
 static void
 bpf_freed(struct bpf_d *d)
 {
+	KASSERT(mutex_owned(&bpf_mtx));
 	/*
	 * We don't need to lock out interrupts since this descriptor has
	 * been detached from its interface and it yet hasn't been marked
@@ -1775,18 +2029,33 @@ _bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
 	bp->bif_driverp = driverp;
 	bp->bif_ifp = ifp;
 	bp->bif_dlt = dlt;
+	bp->bif_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
+
+	mutex_enter(&bpf_mtx);
+	*bp->bif_driverp = NULL;
+	bp->bif_hdrlen = hdrlen;
 	bp->bif_next = bpf_iflist;
 	bpf_iflist = bp;
-
-	*bp->bif_driverp = NULL;
-
-	bp->bif_hdrlen = hdrlen;
+	mutex_exit(&bpf_mtx);
 
 #if 0
 	printf("bpf: %s attached\n", ifp->if_xname);
 #endif
 }
 
+static void
+bpf_detachd_all(struct bpf_if *bp)
+{
+	struct bpf_d *d;
+
+	KASSERT(mutex_owned(&bpf_mtx));
+
+	for (d = bp->bif_dlist; d != NULL; d = d->bd_next) {
+		/* We can't touch the device. */
+		bpf_detachd(d, true);
+	}
+}
+
 /*
 * Remove an interface from bpf.
 */
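The delivery path above follows the header-comment rule that bif_dlist is walked under bif_lock alone, and that nothing taken here may sleep. A hypothetical helper in the same style (illustration only, not part of the patch):

static int
example_count_listeners(struct bpf_if *bp)
{
	struct bpf_d *d;
	int n = 0;

	mutex_enter(bp->bif_lock);
	for (d = bp->bif_dlist; d != NULL; d = d->bd_next)
		n++;
	mutex_exit(bp->bif_lock);

	return n;
}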
@@ -1794,32 +2063,20 @@ static void
 _bpfdetach(struct ifnet *ifp)
 {
 	struct bpf_if *bp, **pbp;
-	struct bpf_d *d;
-	int s;
-
-	/* Nuke the vnodes for any open instances */
-	LIST_FOREACH(d, &bpf_list, bd_list) {
-		if (d->bd_bif != NULL && d->bd_bif->bif_ifp == ifp) {
-			/*
-			 * Detach the descriptor from an interface now.
-			 * It will be free'ed later by close routine.
-			 */
-			s = splnet();
-			d->bd_promisc = 0;	/* we can't touch device. */
-			bpf_detachd(d);
-			splx(s);
-		}
-	}
 
+	mutex_enter(&bpf_mtx);
   again:
 	for (bp = bpf_iflist, pbp = &bpf_iflist;
 	     bp != NULL; pbp = &bp->bif_next, bp = bp->bif_next) {
 		if (bp->bif_ifp == ifp) {
+			/* Nuke the vnodes for any open instances */
+			bpf_detachd_all(bp);
 			*pbp = bp->bif_next;
 			free(bp, M_DEVBUF);
 			goto again;
 		}
 	}
+	mutex_exit(&bpf_mtx);
 }
 
 /*
@@ -1830,6 +2087,7 @@ _bpf_change_type(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 {
 	struct bpf_if *bp;
 
+	mutex_enter(&bpf_mtx);
 	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
 		if (bp->bif_driverp == &ifp->if_bpf)
 			break;
@@ -1840,21 +2098,23 @@ _bpf_change_type(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 
 	bp->bif_dlt = dlt;
 	bp->bif_hdrlen = hdrlen;
+	mutex_exit(&bpf_mtx);
 }
 
 /*
 * Get a list of available data link type of the interface.
 */
 static int
-bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
+bpf_getdltlist(struct ifnet *ifp, struct bpf_dltlist *bfl)
 {
 	int n, error;
-	struct ifnet *ifp;
 	struct bpf_if *bp;
 
-	ifp = d->bd_bif->bif_ifp;
+	KASSERT(mutex_owned(&bpf_mtx));
+
 	n = 0;
 	error = 0;
+
 	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
 		if (bp->bif_ifp != ifp)
 			continue;
@@ -1876,10 +2136,16 @@ bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
 static int
 bpf_setdlt(struct bpf_d *d, u_int dlt)
 {
-	int s, error, opromisc;
+	int error, opromisc;
 	struct ifnet *ifp;
 	struct bpf_if *bp;
 
+	KASSERT(mutex_owned(&bpf_mtx));
+	KASSERT(bpf_refcount_refs(&d->bd_refcount) > 0);
+
+	if (d->bd_bif == NULL)
+		return EINVAL;
+
 	if (d->bd_bif->bif_dlt == dlt)
 		return 0;
 	ifp = d->bd_bif->bif_ifp;
@@ -1889,20 +2155,21 @@ bpf_setdlt(struct bpf_d *d, u_int dlt)
 	}
 	if (bp == NULL)
 		return EINVAL;
-	s = splnet();
 	opromisc = d->bd_promisc;
-	bpf_detachd(d);
+	bpf_detachd(d, false);
 	bpf_attachd(d, bp);
 	reset_d(d);
+
+	ifp = bp->bif_ifp;
+
 	if (opromisc) {
-		error = ifpromisc(bp->bif_ifp, 1);
+		error = bpf_ifpromisc(ifp, 1);
 		if (error)
 			printf("%s: bpf_setdlt: ifpromisc failed (%d)\n",
-			    bp->bif_ifp->if_xname, error);
+			    ifp->if_xname, error);
 		else
 			d->bd_promisc = 1;
 	}
-	splx(s);
+
 	return 0;
 }
@@ -1963,7 +2230,7 @@ static int
 sysctl_net_bpf_peers(SYSCTLFN_ARGS)
 {
 	int error, elem_count;
-	struct bpf_d	 *dp;
+	struct bpf_if	 *bp;
 	struct bpf_d_ext	 dpe;
 	size_t len, needed, elem_size, out_size;
 	char *sp;
@@ -1991,36 +2258,39 @@ sysctl_net_bpf_peers(SYSCTLFN_ARGS)
 		return (EINVAL);
 
 	mutex_enter(&bpf_mtx);
-	LIST_FOREACH(dp, &bpf_list, bd_list) {
-		if (len >= elem_size && elem_count > 0) {
+	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
+		struct bpf_d *dp;
+		for (dp = bp->bif_dlist; dp != NULL; dp = dp->bd_next) {
+			if (len >= elem_size && elem_count > 0) {
 #define BPF_EXT(field) dpe.bde_ ## field = dp->bd_ ## field
-			BPF_EXT(bufsize);
-			BPF_EXT(promisc);
-			BPF_EXT(state);
-			BPF_EXT(immediate);
-			BPF_EXT(hdrcmplt);
-			BPF_EXT(seesent);
-			BPF_EXT(pid);
-			BPF_EXT(rcount);
-			BPF_EXT(dcount);
-			BPF_EXT(ccount);
+				BPF_EXT(bufsize);
+				BPF_EXT(promisc);
+				BPF_EXT(state);
+				BPF_EXT(immediate);
+				BPF_EXT(hdrcmplt);
+				BPF_EXT(seesent);
+				BPF_EXT(pid);
+				BPF_EXT(rcount);
+				BPF_EXT(dcount);
+				BPF_EXT(ccount);
 #undef BPF_EXT
-			if (dp->bd_bif)
-				(void)strlcpy(dpe.bde_ifname,
-				    dp->bd_bif->bif_ifp->if_xname,
-				    IFNAMSIZ - 1);
-			else
-				dpe.bde_ifname[0] = '\0';
+				if (dp->bd_bif)
+					(void)strlcpy(dpe.bde_ifname,
+					    dp->bd_bif->bif_ifp->if_xname,
+					    IFNAMSIZ - 1);
+				else
+					dpe.bde_ifname[0] = '\0';
 
-			error = copyout(&dpe, sp, out_size);
-			if (error)
-				break;
-			sp += elem_size;
-			len -= elem_size;
+				error = copyout(&dpe, sp, out_size);
+				if (error)
+					break;
+				sp += elem_size;
+				len -= elem_size;
+			}
+			needed += elem_size;
+			if (elem_count > 0 && elem_count != INT_MAX)
+				elem_count--;
 		}
-		needed += elem_size;
-		if (elem_count > 0 && elem_count != INT_MAX)
-			elem_count--;
 	}
 	mutex_exit(&bpf_mtx);
diff --git a/sys/net/bpfdesc.h b/sys/net/bpfdesc.h
index 4cf0ab7..2e84066 100644
--- a/sys/net/bpfdesc.h
+++ b/sys/net/bpfdesc.h
@@ -45,6 +45,14 @@
 #include <sys/select.h>		/* for struct selinfo */
 #include <net/if.h>		/* for IFNAMSIZ */
 #include <net/bpfjit.h>		/* for bpfjit_function_t */
+#include <sys/condvar.h>
+#include <sys/mutex.h>
+
+struct bpf_refcount {
+	kcondvar_t	br_cv;
+	int		br_refs;
+	bool		br_waiting;
+};
 
 /*
 * Descriptor associated with each open bpf file.
@@ -93,7 +101,6 @@ struct bpf_d {
 #endif
 	callout_t	bd_callout;	/* for BPF timeouts with select */
 	pid_t		bd_pid;		/* corresponding PID */
-	LIST_ENTRY(bpf_d) bd_list;	/* list of all BPF's */
 	void		*bd_sih;	/* soft interrupt handle */
 	struct timespec bd_atime;	/* access time */
 	struct timespec bd_mtime;	/* modification time */
@@ -102,6 +109,12 @@ struct bpf_d {
 	int		bd_compat32;	/* 32-bit stream on LP64 system */
 #endif
 	bpfjit_func_t	bd_jitcode;	/* compiled filter program */
+	kmutex_t	*bd_lock;	/* for the entry and the list */
+	struct bpf_refcount	bd_refcount;
+	struct bpf_refcount	bd_hbuf_refcount;
+	bool		bd_dying;
+	kcondvar_t	bd_read_cv;
+	bool		bd_read_waiting;
 };
 
@@ -139,10 +152,7 @@ struct bpf_if {
 	u_int bif_dlt;			/* link layer type */
 	u_int bif_hdrlen;		/* length of header (with padding) */
 	struct ifnet *bif_ifp;		/* corresponding interface */
+	kmutex_t *bif_lock;		/* to protect the entry */
 };
 
-#ifdef _KERNEL
-int	 bpf_setf(struct bpf_d *, struct bpf_program *);
-#endif
-
 #endif /* !_NET_BPFDESC_H_ */