diff --git a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c index cc0f8103fb98..f1338840125a 100644 --- a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c +++ b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c @@ -42,9 +42,7 @@ dtrace_modcmd(modcmd_t cmd, void *data) return error; case MODULE_CMD_FINI: - error = devsw_detach(NULL, &dtrace_cdevsw); - if (error != 0) - return error; + devsw_detach(NULL, &dtrace_cdevsw); error = dtrace_unload(); if (error != 0) { diff --git a/external/cddl/osnet/dev/fbt/fbt.c b/external/cddl/osnet/dev/fbt/fbt.c index b367c2155292..46dd7c1f7f06 100644 --- a/external/cddl/osnet/dev/fbt/fbt.c +++ b/external/cddl/osnet/dev/fbt/fbt.c @@ -1329,7 +1329,8 @@ dtrace_fbt_modcmd(modcmd_t cmd, void *data) error = fbt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &fbt_cdevsw); + devsw_detach(NULL, &fbt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dev/sdt/sdt.c b/external/cddl/osnet/dev/sdt/sdt.c index c3ad129f8284..5a41270a2917 100644 --- a/external/cddl/osnet/dev/sdt/sdt.c +++ b/external/cddl/osnet/dev/sdt/sdt.c @@ -562,7 +562,8 @@ dtrace_sdt_modcmd(modcmd_t cmd, void *data) error = sdt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &sdt_cdevsw); + devsw_detach(NULL, &sdt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c index 9e19cd1dc0c3..d74d8c71e54d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c @@ -7231,7 +7231,7 @@ zfs_modcmd(modcmd_t cmd, void *arg) if (error) return error; - (void) devsw_detach(&zfs_bdevsw, &zfs_cdevsw); + devsw_detach(&zfs_bdevsw, &zfs_cdevsw); attacherr: zfs_sysctl_fini(); diff --git a/share/man/man9/devsw_attach.9 b/share/man/man9/devsw_attach.9 index cf862be5846a..6ffc51957a3f 100644 --- a/share/man/man9/devsw_attach.9 +++ b/share/man/man9/devsw_attach.9 @@ -27,7 +27,7 @@ .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" -.Dd April 30, 2017 +.Dd January 11, 2022 .Dt DEVSW 9 .Os .Sh NAME @@ -49,7 +49,7 @@ .Fa "const struct cdevsw *cdev" .Fa "devmajor_t *cmajor" .Fc -.Ft int +.Ft void .Fo devsw_detach .Fa "const struct bdevsw *bdev" .Fa "const struct cdevsw *cdev" @@ -130,6 +130,11 @@ and structures. .Fn devsw_detach should be called before a loaded device driver is unloaded. +The caller must ensure that there are no open instances of the device, +and that the device's +.Fn d_open +function will fail, before calling. +Fn devsw_detach . .Pp The .Fn bdevsw_lookup @@ -155,10 +160,8 @@ or .Sh RETURN VALUES Upon successful completion, .Fn devsw_attach -and -.Fn devsw_detach -return 0. -Otherwise they return an error value. +returns 0. +Otherwise it returns an error value. .Pp In case of failure, .Fn bdevsw_lookup diff --git a/sys/coda/coda_psdev.c b/sys/coda/coda_psdev.c index cede16da3f53..7f531f03fe56 100644 --- a/sys/coda/coda_psdev.c +++ b/sys/coda/coda_psdev.c @@ -758,7 +758,7 @@ vcoda_modcmd(modcmd_t cmd, void *arg) if (VC_OPEN(vcp)) return EBUSY; } - return devsw_detach(NULL, &vcoda_cdevsw); + devsw_detach(NULL, &vcoda_cdevsw); } #endif break; diff --git a/sys/dev/ata/wd.c b/sys/dev/ata/wd.c index b5d7476d1e6c..672684a3b4aa 100644 --- a/sys/dev/ata/wd.c +++ b/sys/dev/ata/wd.c @@ -152,6 +152,8 @@ const struct bdevsw wd_bdevsw = { .d_dump = wddump, .d_psize = wdsize, .d_discard = wddiscard, + .d_cfdriver = &wd_cd, + .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK }; @@ -167,6 +169,8 @@ const struct cdevsw wd_cdevsw = { .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = wddiscard, + .d_cfdriver = &wd_cd, + .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK }; diff --git a/sys/dev/audio/audio.c b/sys/dev/audio/audio.c index d22ea989a180..176749ac449f 100644 --- a/sys/dev/audio/audio.c +++ b/sys/dev/audio/audio.c @@ -521,7 +521,6 @@ static int audio_exlock_mutex_enter(struct audio_softc *); static void audio_exlock_mutex_exit(struct audio_softc *); static int audio_exlock_enter(struct audio_softc *); static void audio_exlock_exit(struct audio_softc *); -static void audio_sc_acquire_foropen(struct audio_softc *, struct psref *); static struct audio_softc *audio_sc_acquire_fromfile(audio_file_t *, struct psref *); static void audio_sc_release(struct audio_softc *, struct psref *); @@ -732,6 +731,8 @@ const struct cdevsw audio_cdevsw = { .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, + .d_cfdriver = &audio_cd, + .d_devtounit = dev_minor_unit, .d_flag = D_OTHER | D_MPSAFE }; @@ -1543,31 +1544,6 @@ audio_exlock_exit(struct audio_softc *sc) audio_exlock_mutex_exit(sc); } -/* - * Increment reference counter for this sc. - * This is intended to be used for open. - */ -void -audio_sc_acquire_foropen(struct audio_softc *sc, struct psref *refp) -{ - int s; - - /* Block audiodetach while we acquire a reference */ - s = pserialize_read_enter(); - - /* - * We don't examine sc_dying here. However, all open methods - * call audio_exlock_enter() right after this, so we can examine - * sc_dying in it. - */ - - /* Acquire a reference */ - psref_acquire(refp, &sc->sc_psref, audio_psref_class); - - /* Now sc won't go away until we drop the reference count */ - pserialize_read_exit(s); -} - /* * Get sc from file, and increment reference counter for this sc. * This is intended to be used for methods other than open. @@ -1680,21 +1656,20 @@ static int audioopen(dev_t dev, int flags, int ifmt, struct lwp *l) { struct audio_softc *sc; - struct psref sc_ref; - int bound; int error; - /* Find the device */ + /* + * Find the device. Because we wired the cdevsw to the audio + * autoconf instance, the system ensures it will not go away + * until after we return. + */ sc = device_lookup_private(&audio_cd, AUDIOUNIT(dev)); if (sc == NULL || sc->hw_if == NULL) return ENXIO; - bound = curlwp_bind(); - audio_sc_acquire_foropen(sc, &sc_ref); - error = audio_exlock_enter(sc); if (error) - goto done; + return error; device_active(sc->sc_dev, DVA_SYSTEM); switch (AUDIODEV(dev)) { @@ -1714,9 +1689,6 @@ audioopen(dev_t dev, int flags, int ifmt, struct lwp *l) } audio_exlock_exit(sc); -done: - audio_sc_release(sc, &sc_ref); - curlwp_bindx(bound); return error; } @@ -2098,30 +2070,41 @@ done: int audiobellopen(dev_t dev, audio_file_t **filep) { + device_t audiodev = NULL; struct audio_softc *sc; - struct psref sc_ref; - int bound; + bool exlock = false; int error; - /* Find the device */ - sc = device_lookup_private(&audio_cd, AUDIOUNIT(dev)); - if (sc == NULL || sc->hw_if == NULL) - return ENXIO; + /* + * Find the autoconf instance and make sure it doesn't go away + * while we are opening it. + */ + audiodev = device_lookup_acquire(&audio_cd, AUDIOUNIT(dev)); + if (audiodev == NULL) { + error = ENXIO; + goto out; + } - bound = curlwp_bind(); - audio_sc_acquire_foropen(sc, &sc_ref); + /* If attach failed, it's hopeless -- give up. */ + sc = device_private(audiodev); + if (sc->hw_if == NULL) { + error = ENXIO; + goto out; + } + /* Take the exclusive configuration lock. */ error = audio_exlock_enter(sc); if (error) - goto done; + goto out; + /* Open the audio device. */ device_active(sc->sc_dev, DVA_SYSTEM); error = audio_open(dev, sc, FWRITE, 0, curlwp, filep); - audio_exlock_exit(sc); -done: - audio_sc_release(sc, &sc_ref); - curlwp_bindx(bound); +out: if (exlock) + audio_exlock_exit(sc); + if (audiodev) + device_release(audiodev); return error; } diff --git a/sys/dev/ccd.c b/sys/dev/ccd.c index 05945f9a67ba..2283bc0346da 100644 --- a/sys/dev/ccd.c +++ b/sys/dev/ccd.c @@ -1710,7 +1710,7 @@ ccd_modcmd(modcmd_t cmd, void *arg) error = EBUSY; } else { mutex_exit(&ccd_lock); - error = devsw_detach(&ccd_bdevsw, &ccd_cdevsw); + devsw_detach(&ccd_bdevsw, &ccd_cdevsw); ccddetach(); } #endif diff --git a/sys/dev/clockctl.c b/sys/dev/clockctl.c index 0da5e7765fe8..9685c0f129f6 100644 --- a/sys/dev/clockctl.c +++ b/sys/dev/clockctl.c @@ -182,14 +182,12 @@ clockctl_modcmd(modcmd_t cmd, void *data) return EBUSY; } #ifdef _MODULE - error = devsw_detach(NULL, &clockctl_cdevsw); + devsw_detach(NULL, &clockctl_cdevsw); #endif mutex_exit(&clockctl_mtx); - if (error == 0) { - kauth_unlisten_scope(clockctl_listener); - mutex_destroy(&clockctl_mtx); - } + kauth_unlisten_scope(clockctl_listener); + mutex_destroy(&clockctl_mtx); break; default: diff --git a/sys/dev/hdaudio/hdaudio.c b/sys/dev/hdaudio/hdaudio.c index d39ff2db6cde..5c7874778e22 100644 --- a/sys/dev/hdaudio/hdaudio.c +++ b/sys/dev/hdaudio/hdaudio.c @@ -1636,11 +1636,7 @@ hdaudio_modcmd(modcmd_t cmd, void *opaque) error = config_cfdriver_detach(&hdaudio_cd); if (error) break; - error = devsw_detach(NULL, &hdaudio_cdevsw); - if (error) { - config_cfdriver_attach(&hdaudio_cd); - break; - } + devsw_detach(NULL, &hdaudio_cdevsw); #endif break; default: diff --git a/sys/dev/i2c/i2c.c b/sys/dev/i2c/i2c.c index 322f6ad3f199..b56b3e235112 100644 --- a/sys/dev/i2c/i2c.c +++ b/sys/dev/i2c/i2c.c @@ -904,7 +904,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) if (error) { aprint_error("%s: unable to init component\n", iic_cd.cd_name); - (void)devsw_detach(NULL, &iic_cdevsw); + devsw_detach(NULL, &iic_cdevsw); } mutex_exit(&iic_mtx); #endif @@ -922,10 +922,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) mutex_exit(&iic_mtx); break; } - error = devsw_detach(NULL, &iic_cdevsw); - if (error != 0) - config_init_component(cfdriver_ioconf_iic, - cfattach_ioconf_iic, cfdata_ioconf_iic); + devsw_detach(NULL, &iic_cdevsw); #endif mutex_exit(&iic_mtx); break; diff --git a/sys/dev/pad/pad.c b/sys/dev/pad/pad.c index fe0b429cf386..a779f1f71b8d 100644 --- a/sys/dev/pad/pad.c +++ b/sys/dev/pad/pad.c @@ -777,9 +777,7 @@ pad_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #ifdef _MODULE - error = devsw_detach(NULL, &pad_cdevsw); - if (error) - break; + devsw_detach(NULL, &pad_cdevsw); error = config_fini_component(cfdriver_ioconf_pad, pad_cfattach, cfdata_ioconf_pad); diff --git a/sys/dev/raidframe/rf_netbsdkintf.c b/sys/dev/raidframe/rf_netbsdkintf.c index d1bda3553e03..87439aa70bfb 100644 --- a/sys/dev/raidframe/rf_netbsdkintf.c +++ b/sys/dev/raidframe/rf_netbsdkintf.c @@ -4088,16 +4088,7 @@ raid_modcmd_fini(void) return error; } #endif - error = devsw_detach(&raid_bdevsw, &raid_cdevsw); - if (error != 0) { - aprint_error("%s: cannot detach devsw\n",__func__); -#ifdef _MODULE - config_cfdriver_attach(&raid_cd); -#endif - config_cfattach_attach(raid_cd.cd_name, &raid_ca); - mutex_exit(&raid_lock); - return error; - } + devsw_detach(&raid_bdevsw, &raid_cdevsw); rf_BootRaidframe(false); #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) rf_destroy_mutex2(rf_sparet_wait_mutex); diff --git a/sys/dev/scsipi/sd.c b/sys/dev/scsipi/sd.c index 7bd0d12c070b..24b3b6c1aa9c 100644 --- a/sys/dev/scsipi/sd.c +++ b/sys/dev/scsipi/sd.c @@ -167,6 +167,8 @@ const struct bdevsw sd_bdevsw = { .d_dump = sddump, .d_psize = sdsize, .d_discard = nodiscard, + .d_cfdriver = &sd_cd, + .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK | D_MPSAFE }; @@ -182,6 +184,8 @@ const struct cdevsw sd_cdevsw = { .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, + .d_cfdriver = &sd_cd, + .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK | D_MPSAFE }; diff --git a/sys/dev/sysmon/sysmon.c b/sys/dev/sysmon/sysmon.c index 46aedaad7337..fd2993f0c180 100644 --- a/sys/dev/sysmon/sysmon.c +++ b/sys/dev/sysmon/sysmon.c @@ -356,7 +356,7 @@ sysmon_fini(void) if (error == 0) { mutex_enter(&sysmon_minor_mtx); sm_is_attached = false; - error = devsw_detach(NULL, &sysmon_cdevsw); + devsw_detach(NULL, &sysmon_cdevsw); mutex_exit(&sysmon_minor_mtx); } #endif diff --git a/sys/dev/tprof/tprof.c b/sys/dev/tprof/tprof.c index b069a5b7df5d..136fd190ad14 100644 --- a/sys/dev/tprof/tprof.c +++ b/sys/dev/tprof/tprof.c @@ -768,13 +768,7 @@ tprof_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #if defined(_MODULE) - { - int error; - error = devsw_detach(NULL, &tprof_cdevsw); - if (error) { - return error; - } - } + devsw_detach(NULL, &tprof_cdevsw); #endif /* defined(_MODULE) */ tprof_driver_fini(); return 0; diff --git a/sys/dev/usb/uhid.c b/sys/dev/usb/uhid.c index 81292fd5130e..1e6cd705e1a9 100644 --- a/sys/dev/usb/uhid.c +++ b/sys/dev/usb/uhid.c @@ -104,7 +104,6 @@ struct uhid_softc { volatile uint32_t sc_state; /* driver state */ #define UHID_IMMED 0x02 /* return read data immediately */ - int sc_refcnt; int sc_raw; u_char sc_open; u_char sc_dying; @@ -134,6 +133,8 @@ const struct cdevsw uhid_cdevsw = { .d_mmap = nommap, .d_kqfilter = uhidkqfilter, .d_discard = nodiscard, + .d_cfdriver = &uhid_cd, + .d_devtounit = dev_minor_unit, .d_flag = D_OTHER }; @@ -194,7 +195,6 @@ uhid_attach(device_t parent, device_t self, void *aux) mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_SOFTUSB); cv_init(&sc->sc_cv, "uhidrea"); - cv_init(&sc->sc_detach_cv, "uhiddet"); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); @@ -233,15 +233,6 @@ uhid_detach(device_t self, int flags) /* Interrupt any pending uhidev_write. */ uhidev_stop(&sc->sc_hdev); - /* Wait for I/O operations to complete. */ - mutex_enter(&sc->sc_lock); - while (sc->sc_refcnt) { - DPRINTF(("%s: open=%d refcnt=%d\n", __func__, - sc->sc_open, sc->sc_refcnt)); - cv_wait(&sc->sc_detach_cv, &sc->sc_lock); - } - mutex_exit(&sc->sc_lock); - pmf_device_deregister(self); /* locate the major number */ @@ -251,28 +242,9 @@ uhid_detach(device_t self, int flags) mn = device_unit(self); vdevgone(maj, mn, mn, VCHR); - /* - * Wait for close to finish. - * - * XXX I assumed that vdevgone would synchronously call close, - * and not return before it has completed, but empirically the - * assertion of sc->sc_open == 0 below fires if we don't wait - * here. Someone^TM should carefully examine vdevgone to - * ascertain what it guarantees, and audit all other users of - * it accordingly. - */ - mutex_enter(&sc->sc_lock); - while (sc->sc_open) { - DPRINTF(("%s: open=%d\n", __func__, sc->sc_open)); - cv_wait(&sc->sc_detach_cv, &sc->sc_lock); - } - mutex_exit(&sc->sc_lock); - KASSERT(sc->sc_open == 0); - KASSERT(sc->sc_refcnt == 0); cv_destroy(&sc->sc_cv); - cv_destroy(&sc->sc_detach_cv); mutex_destroy(&sc->sc_lock); seldestroy(&sc->sc_rsel); @@ -389,7 +361,6 @@ fail1: selnotify(&sc->sc_rsel, POLLHUP, 0); fail0: mutex_enter(&sc->sc_lock); KASSERT(sc->sc_open == 1); sc->sc_open = 0; - cv_broadcast(&sc->sc_detach_cv); atomic_store_relaxed(&sc->sc_state, 0); mutex_exit(&sc->sc_lock); return error; @@ -432,7 +403,6 @@ uhidclose(dev_t dev, int flag, int mode, struct lwp *l) mutex_enter(&sc->sc_lock); KASSERT(sc->sc_open == 1); sc->sc_open = 0; - cv_broadcast(&sc->sc_detach_cv); atomic_store_relaxed(&sc->sc_state, 0); mutex_exit(&sc->sc_lock); @@ -454,11 +424,8 @@ uhid_enter(dev_t dev, struct uhid_softc **scp) KASSERT(sc->sc_open == 2); if (sc->sc_dying) { error = ENXIO; - } else if (sc->sc_refcnt == INT_MAX) { - error = EBUSY; } else { *scp = sc; - sc->sc_refcnt++; error = 0; } mutex_exit(&sc->sc_lock); @@ -472,9 +439,6 @@ uhid_exit(struct uhid_softc *sc) mutex_enter(&sc->sc_lock); KASSERT(sc->sc_open == 2); - KASSERT(sc->sc_refcnt > 0); - if (--sc->sc_refcnt == 0) - cv_broadcast(&sc->sc_detach_cv); mutex_exit(&sc->sc_lock); } diff --git a/sys/dist/pf/net/pf_ioctl.c b/sys/dist/pf/net/pf_ioctl.c index 94bfb70a411d..e4c13be698f8 100644 --- a/sys/dist/pf/net/pf_ioctl.c +++ b/sys/dist/pf/net/pf_ioctl.c @@ -3459,7 +3459,8 @@ pf_modcmd(modcmd_t cmd, void *opaque) } else { pfdetach(); pflogdetach(); - return devsw_detach(NULL, &pf_cdevsw); + devsw_detach(NULL, &pf_cdevsw); + return 0; } default: return ENOTTY; diff --git a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c index d0c4ca95097c..bb4e0706cc9b 100644 --- a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c +++ b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c @@ -2256,7 +2256,7 @@ ipl_fini(void *opaque) { #ifdef _MODULE - (void)devsw_detach(NULL, &ipl_cdevsw); + devsw_detach(NULL, &ipl_cdevsw); #endif /* diff --git a/sys/fs/autofs/autofs_vfsops.c b/sys/fs/autofs/autofs_vfsops.c index fbd6eafe6532..1204d1f9b6d3 100644 --- a/sys/fs/autofs/autofs_vfsops.c +++ b/sys/fs/autofs/autofs_vfsops.c @@ -496,9 +496,7 @@ autofs_modcmd(modcmd_t cmd, void *arg) } mutex_exit(&autofs_softc->sc_lock); - error = devsw_detach(NULL, &autofs_cdevsw); - if (error) - break; + devsw_detach(NULL, &autofs_cdevsw); #endif error = vfs_detach(&autofs_vfsops); if (error) diff --git a/sys/kern/kern_drvctl.c b/sys/kern/kern_drvctl.c index 37f4730b2512..8a4156f8a0aa 100644 --- a/sys/kern/kern_drvctl.c +++ b/sys/kern/kern_drvctl.c @@ -665,15 +665,10 @@ drvctl_modcmd(modcmd_t cmd, void *arg) devmon_insert_vec = saved_insert_vec; saved_insert_vec = NULL; #ifdef _MODULE - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 0) { - saved_insert_vec = devmon_insert_vec; - devmon_insert_vec = devmon_insert; - } + devsw_detach(NULL, &drvctl_cdevsw); #endif mutex_exit(&drvctl_lock); - if (error == 0) - drvctl_fini(); + drvctl_fini(); break; default: diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index 8439ae8c6a00..073ce6f52059 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -108,6 +108,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.291 2021/12/31 14:19:57 riastrad #include <sys/cpu.h> #include <sys/sysctl.h> #include <sys/stdarg.h> +#include <sys/localcount.h> #include <sys/disk.h> @@ -1453,6 +1454,9 @@ config_devdelete(device_t dev) if (dg->dg_devs != NULL) kmem_free(dg->dg_devs, sizeof(device_t) * dg->dg_ndevs); + localcount_fini(dev->dv_localcount); + kmem_free(dev->dv_localcount, sizeof(*dev->dv_localcount)); + cv_destroy(&dvl->dvl_cv); mutex_destroy(&dvl->dvl_mtx); @@ -1556,6 +1560,7 @@ config_devalloc(const device_t parent, const cfdata_t cf, dev->dv_activity_handlers = NULL; dev->dv_private = dev_private; dev->dv_flags = ca->ca_flags; /* inherit flags from class */ + dev->dv_attaching = curlwp; myunit = config_unit_alloc(dev, cd, cf); if (myunit == -1) { @@ -1604,6 +1609,10 @@ config_devalloc(const device_t parent, const cfdata_t cf, "device-parent", device_xname(parent)); } + dev->dv_localcount = kmem_zalloc(sizeof(*dev->dv_localcount), + KM_SLEEP); + localcount_init(dev->dv_localcount); + if (dev->dv_cfdriver->cd_attrs != NULL) config_add_attrib_dict(dev); @@ -1755,8 +1764,29 @@ config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print, /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(parent, dev, aux); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. + */ config_pending_decr(dev); mutex_enter(&config_misc_lock); @@ -1822,8 +1852,29 @@ config_attach_pseudo(cfdata_t cf) /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(ROOT, dev, NULL); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. + */ config_pending_decr(dev); config_process_deferred(&deferred_config_queue, dev); @@ -1872,24 +1923,49 @@ config_dump_garbage(struct devicelist *garbage) static int config_detach_enter(device_t dev) { - int error; + int error = 0; mutex_enter(&config_misc_lock); - for (;;) { - if (dev->dv_pending == 0 && dev->dv_detaching == NULL) { - dev->dv_detaching = curlwp; - error = 0; - break; - } + + /* + * Wait until attach has fully completed, and until any + * concurrent detach (e.g., drvctl racing with USB event + * thread) has completed. + * + * Caller must hold alldevs_nread or alldevs_nwrite (e.g., via + * deviter) to ensure the winner of the race doesn't free the + * device leading the loser of the race into use-after-free. + * + * XXX Not all callers do this! + */ + while (dev->dv_pending || dev->dv_detaching) { KASSERTMSG(dev->dv_detaching != curlwp, "recursively detaching %s", device_xname(dev)); error = cv_wait_sig(&config_misc_cv, &config_misc_lock); if (error) - break; + goto out; } - KASSERT(error || dev->dv_detaching == curlwp); - mutex_exit(&config_misc_lock); + /* + * Attach has completed, and no other concurrent detach is + * running. Claim the device for detaching. This will cause + * all new attempts to acquire references to block. + */ + KASSERT(dev->dv_attaching == NULL); + KASSERT(dev->dv_detaching == NULL); + dev->dv_detaching = curlwp; + + /* + * Wait for all device_lookup_acquire references -- mostly, for + * all attempts to open the device -- to drain. There may + * still be open instances of the device after this point, but + * all new attempts to acquire references will block until + * dv_detaching clears. + */ + localcount_drain(dev->dv_localcount, + &config_misc_cv, &config_misc_lock); + +out: mutex_exit(&config_misc_lock); return error; } @@ -1980,9 +2056,18 @@ config_detach(device_t dev, int flags) */ if (rv == 0) dev->dv_flags &= ~DVF_ACTIVE; - else if ((flags & DETACH_FORCE) == 0) + else if ((flags & DETACH_FORCE) == 0) { + /* + * Detach failed -- likely EBUSY. Reset the + * localcount. At this point there can be no + * references held, and no new references acquired -- + * calls to device_lookup_acquire are held up on + * dv_detaching until config_detach_exit. + */ + localcount_fini(dev->dv_localcount); + localcount_init(dev->dv_localcount); goto out; - else { + } else { panic("config_detach: forced detach of %s failed (%d)", device_xname(dev), rv); } @@ -2498,6 +2583,14 @@ config_alldevs_exit(struct alldevs_foray *af) * device_lookup: * * Look up a device instance for a given driver. + * + * Caller is responsible for ensuring the device's state is + * stable, either by holding a reference already obtained with + * device_lookup_acquire or by otherwise ensuring the device is + * attached and can't be detached (e.g., holding an open device + * node and ensuring *_detach calls vdevgone). + * + * XXX Find a way to assert this. */ device_t device_lookup(cfdriver_t cd, int unit) @@ -2526,6 +2619,69 @@ device_lookup_private(cfdriver_t cd, int unit) return device_private(device_lookup(cd, unit)); } +/* + * device_lookup_acquire: + * + * Look up a device instance for a given driver, and return a + * reference to it that must be released by device_release. + * + * => If the device is still attaching, blocks until *_attach has + * returned. + * + * => If the device is detaching, blocks until *_detach has + * returned. May succeed or fail in that case, depending on + * whether *_detach has backed out (EBUSY) or committed to + * detaching. + */ +device_t +device_lookup_acquire(cfdriver_t cd, int unit) +{ + device_t dv; + + /* XXX This should have a pserialized fast path -- TBD. */ + mutex_enter(&config_misc_lock); + mutex_enter(&alldevs_lock); +retry: if (unit < 0 || unit >= cd->cd_ndevs || + (dv = cd->cd_devs[unit]) == NULL || + dv->dv_del_gen != 0) { + dv = NULL; + } else { + /* + * Wait for the device to stabilize, if attaching or + * detaching. Either way we must wait for *_attach or + * *_detach to complete, and either way we must retry: + * even if detaching, *_detach might fail (EBUSY) so + * the device may still be there. + */ + if ((dv->dv_attaching != NULL && dv->dv_attaching != curlwp) || + dv->dv_detaching != NULL) { + mutex_exit(&alldevs_lock); + cv_wait(&config_misc_cv, &config_misc_lock); + mutex_enter(&alldevs_lock); + goto retry; + } + localcount_acquire(dv->dv_localcount); + } + mutex_exit(&alldevs_lock); + mutex_exit(&config_misc_lock); + + return dv; +} + +/* + * device_release: + * + * Release a reference to a device acquired with + * device_lookup_acquire. + */ +void +device_release(device_t dv) +{ + + localcount_release(dv->dv_localcount, + &config_misc_cv, &config_misc_lock); +} + /* * device_find_by_xname: * diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 1a0f721fdd65..0618de9abdb4 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -85,6 +85,11 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #include <sys/buf.h> #include <sys/reboot.h> #include <sys/sdt.h> +#include <sys/atomic.h> +#include <sys/localcount.h> +#include <sys/pserialize.h> +#include <sys/xcall.h> +#include <sys/device.h> #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x @@ -97,12 +102,22 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #define CDEVSW_SIZE (sizeof(struct cdevsw *)) #define DEVSWCONV_SIZE (sizeof(struct devsw_conv)) +struct devswref { + struct localcount *dr_lc; + bool dr_dynamic; +}; + +/* XXX bdevsw, cdevsw, max_bdevsws, and max_cdevsws should be volatile */ extern const struct bdevsw **bdevsw, *bdevsw0[]; extern const struct cdevsw **cdevsw, *cdevsw0[]; extern struct devsw_conv *devsw_conv, devsw_conv0[]; extern const int sys_bdevsws, sys_cdevsws; extern int max_bdevsws, max_cdevsws, max_devsw_convs; +static struct devswref *cdevswref; +static struct devswref *bdevswref; +static kcondvar_t devsw_cv; + static int bdevsw_attach(const struct bdevsw *, devmajor_t *); static int cdevsw_attach(const struct cdevsw *, devmajor_t *); static void devsw_detach_locked(const struct bdevsw *, const struct cdevsw *); @@ -118,6 +133,8 @@ devsw_init(void) KASSERT(sys_bdevsws < MAXDEVSW - 1); KASSERT(sys_cdevsws < MAXDEVSW - 1); mutex_init(&device_lock, MUTEX_DEFAULT, IPL_NONE); + + cv_init(&devsw_cv, "devsw"); } int @@ -158,15 +175,12 @@ devsw_attach(const char *devname, error = EEXIST; goto fail; } - - if (bdev != NULL) - bdevsw[*bmajor] = bdev; - cdevsw[*cmajor] = cdev; - - mutex_exit(&device_lock); - return (0); } + /* + * XXX This should allocate what it needs up front so we never + * need to flail around trying to unwind. + */ error = bdevsw_attach(bdev, bmajor); if (error != 0) goto fail; @@ -176,6 +190,13 @@ devsw_attach(const char *devname, goto fail; } + /* + * If we already found a conv, we're done. Otherwise, find an + * empty slot or extend the table. + */ + if (i == max_devsw_convs) + goto out; + for (i = 0 ; i < max_devsw_convs ; i++) { if (devsw_conv[i].d_name == NULL) break; @@ -224,7 +245,9 @@ devsw_attach(const char *devname, static int bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) { - const struct bdevsw **newptr; + const struct bdevsw **newbdevsw = NULL; + struct devswref *newbdevswref = NULL; + struct localcount *lc; devmajor_t bmajor; int i; @@ -253,20 +276,35 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (bdevswref == NULL) { + newbdevswref = kmem_zalloc(MAXDEVSW * sizeof(newbdevswref[0]), + KM_NOSLEEP); + if (newbdevswref == NULL) + return ENOMEM; + atomic_store_release(&bdevswref, newbdevswref); + } + if (*devmajor >= max_bdevsws) { KASSERT(bdevsw == bdevsw0); - newptr = kmem_zalloc(MAXDEVSW * BDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, bdevsw, max_bdevsws * BDEVSW_SIZE); - bdevsw = newptr; - max_bdevsws = MAXDEVSW; + newbdevsw = kmem_zalloc(MAXDEVSW * sizeof(newbdevsw[0]), + KM_NOSLEEP); + if (newbdevsw == NULL) + return ENOMEM; + memcpy(newbdevsw, bdevsw, max_bdevsws * sizeof(bdevsw[0])); + atomic_store_release(&bdevsw, newbdevsw); + atomic_store_release(&max_bdevsws, MAXDEVSW); } if (bdevsw[*devmajor] != NULL) return (EEXIST); - bdevsw[*devmajor] = devsw; + KASSERT(bdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + bdevswref[*devmajor].dr_lc = lc; + bdevswref[*devmajor].dr_dynamic = true; + + atomic_store_release(&bdevsw[*devmajor], devsw); return (0); } @@ -274,7 +312,9 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) static int cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) { - const struct cdevsw **newptr; + const struct cdevsw **newcdevsw = NULL; + struct devswref *newcdevswref = NULL; + struct localcount *lc; devmajor_t cmajor; int i; @@ -300,20 +340,35 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (cdevswref == NULL) { + newcdevswref = kmem_zalloc(MAXDEVSW * sizeof(newcdevswref[0]), + KM_NOSLEEP); + if (newcdevswref == NULL) + return ENOMEM; + atomic_store_release(&cdevswref, newcdevswref); + } + if (*devmajor >= max_cdevsws) { KASSERT(cdevsw == cdevsw0); - newptr = kmem_zalloc(MAXDEVSW * CDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, cdevsw, max_cdevsws * CDEVSW_SIZE); - cdevsw = newptr; - max_cdevsws = MAXDEVSW; + newcdevsw = kmem_zalloc(MAXDEVSW * sizeof(newcdevsw[0]), + KM_NOSLEEP); + if (newcdevsw == NULL) + return ENOMEM; + memcpy(newcdevsw, cdevsw, max_cdevsws * sizeof(cdevsw[0])); + atomic_store_release(&cdevsw, newcdevsw); + atomic_store_release(&max_cdevsws, MAXDEVSW); } if (cdevsw[*devmajor] != NULL) return (EEXIST); - cdevsw[*devmajor] = devsw; + KASSERT(cdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + cdevswref[*devmajor].dr_lc = lc; + cdevswref[*devmajor].dr_dynamic = true; + + atomic_store_release(&cdevsw[*devmajor], devsw); return (0); } @@ -321,36 +376,75 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) static void devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev) { - int i; + int bi, ci; KASSERT(mutex_owned(&device_lock)); + /* Prevent new references. */ if (bdev != NULL) { - for (i = 0 ; i < max_bdevsws ; i++) { - if (bdevsw[i] != bdev) + for (bi = 0; bi < max_bdevsws; bi++) { + if (bdevsw[bi] != bdev) continue; - bdevsw[i] = NULL; + atomic_store_relaxed(&bdevsw[bi], NULL); break; } + KASSERT(bi < max_bdevsws); } if (cdev != NULL) { - for (i = 0 ; i < max_cdevsws ; i++) { - if (cdevsw[i] != cdev) + for (ci = 0; ci < max_cdevsws; ci++) { + if (cdevsw[ci] != cdev) continue; - cdevsw[i] = NULL; + atomic_store_relaxed(&cdevsw[ci], NULL); break; } + KASSERT(ci < max_cdevsws); + } + + if (bdev == NULL && cdev == NULL) /* XXX possible? */ + return; + + /* + * Wait for all bdevsw_lookup_acquire, cdevsw_lookup_acquire + * calls to notice that the devsw is gone. + * + * XXX Can't use pserialize_perform here because devsw_init is + * too early for pserialize_create(). + */ + xc_barrier(0); + + /* + * Wait for all references to drain. It is the caller's + * responsibility to ensure that at this point, there are no + * extant open instances and all new d_open calls will fail. + * + * Note that localcount_drain may release and reacquire + * device_lock. + */ + if (bdev != NULL) { + localcount_drain(bdevswref[bi].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(bdevswref[bi].dr_lc); + kmem_free(bdevswref[bi].dr_lc, sizeof(*bdevswref[bi].dr_lc)); + bdevswref[bi].dr_lc = NULL; + bdevswref[bi].dr_dynamic = false; + } + if (cdev != NULL) { + localcount_drain(cdevswref[ci].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(cdevswref[ci].dr_lc); + kmem_free(cdevswref[ci].dr_lc, sizeof(*cdevswref[ci].dr_lc)); + cdevswref[ci].dr_lc = NULL; + cdevswref[ci].dr_dynamic = false; } } -int +void devsw_detach(const struct bdevsw *bdev, const struct cdevsw *cdev) { mutex_enter(&device_lock); devsw_detach_locked(bdev, cdev); mutex_exit(&device_lock); - return 0; } /* @@ -366,10 +460,60 @@ bdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); bmajor = major(dev); - if (bmajor < 0 || bmajor >= max_bdevsws) + if (bmajor < 0 || bmajor >= atomic_load_relaxed(&max_bdevsws)) return (NULL); - return (bdevsw[bmajor]); + return atomic_load_consume(&bdevsw)[bmajor]; +} + +static const struct bdevsw * +bdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t bmajor; + const struct bdevsw *bdev = NULL, *const *curbdevsw; + struct devswref *curbdevswref; + int s; + + if (dev == NODEV) + return NULL; + bmajor = major(dev); + if (bmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_bdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_bdevsws first so we can avoid the memory barrier in + * that case. + */ + if (bmajor >= sys_bdevsws && + bmajor >= atomic_load_acquire(&max_bdevsws)) + goto out; + curbdevsw = atomic_load_consume(&bdevsw); + if ((bdev = atomic_load_consume(&curbdevsw[bmajor])) == NULL) + goto out; + + curbdevswref = atomic_load_consume(&bdevswref); + if (curbdevswref == NULL || !curbdevswref[bmajor].dr_dynamic) { + *lcp = NULL; + } else { + *lcp = curbdevswref[bmajor].dr_lc; + localcount_acquire(*lcp); + } + +out: pserialize_read_exit(s); + return bdev; +} + +static void +bdevsw_release(const struct bdevsw *bdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -385,10 +529,60 @@ cdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); cmajor = major(dev); - if (cmajor < 0 || cmajor >= max_cdevsws) + if (cmajor < 0 || cmajor >= atomic_load_relaxed(&max_cdevsws)) return (NULL); - return (cdevsw[cmajor]); + return atomic_load_consume(&cdevsw)[cmajor]; +} + +static const struct cdevsw * +cdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t cmajor; + const struct cdevsw *cdev = NULL, *const *curcdevsw; + struct devswref *curcdevswref; + int s; + + if (dev == NODEV) + return NULL; + cmajor = major(dev); + if (cmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_cdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_cdevsws first so we can avoid the memory barrier in + * that case. + */ + if (cmajor >= sys_cdevsws && + cmajor >= atomic_load_acquire(&max_cdevsws)) + goto out; + curcdevsw = atomic_load_consume(&cdevsw); + if ((cdev = atomic_load_consume(&curcdevsw[cmajor])) == NULL) + goto out; + + curcdevswref = atomic_load_consume(&cdevswref); + if (curcdevswref == NULL || !curcdevswref[cmajor].dr_dynamic) { + *lcp = NULL; + } else { + *lcp = curcdevswref[cmajor].dr_lc; + localcount_acquire(*lcp); + } + +out: pserialize_read_exit(s); + return cdev; +} + +static void +cdevsw_release(const struct cdevsw *cdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -400,10 +594,13 @@ cdevsw_lookup(dev_t dev) devmajor_t bdevsw_lookup_major(const struct bdevsw *bdev) { - devmajor_t bmajor; + const struct bdevsw *const *curbdevsw; + devmajor_t bmajor, bmax; - for (bmajor = 0 ; bmajor < max_bdevsws ; bmajor++) { - if (bdevsw[bmajor] == bdev) + bmax = atomic_load_acquire(&max_bdevsws); + curbdevsw = atomic_load_consume(&bdevsw); + for (bmajor = 0; bmajor < bmax; bmajor++) { + if (atomic_load_relaxed(&curbdevsw[bmajor]) == bdev) return (bmajor); } @@ -419,10 +616,13 @@ bdevsw_lookup_major(const struct bdevsw *bdev) devmajor_t cdevsw_lookup_major(const struct cdevsw *cdev) { - devmajor_t cmajor; + const struct cdevsw *const *curcdevsw; + devmajor_t cmajor, cmax; - for (cmajor = 0 ; cmajor < max_cdevsws ; cmajor++) { - if (cdevsw[cmajor] == cdev) + cmax = atomic_load_acquire(&max_cdevsws); + curcdevsw = atomic_load_consume(&cdevsw); + for (cmajor = 0; cmajor < cmax; cmajor++) { + if (atomic_load_relaxed(&curcdevsw[cmajor]) == cdev) return (cmajor); } @@ -697,22 +897,41 @@ int bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct bdevsw *d; - int rv, mpflag; + struct localcount *lc; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. - */ - mutex_enter(&device_lock); - d = bdevsw_lookup(dev); - mutex_exit(&device_lock); + d = bdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. + */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + + bdevsw_release(d, lc); + return rv; } @@ -855,22 +1074,41 @@ int cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct cdevsw *d; - int rv, mpflag; + struct localcount *lc; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. - */ - mutex_enter(&device_lock); - d = cdevsw_lookup(dev); - mutex_exit(&device_lock); + d = cdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. + */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + + cdevsw_release(d, lc); + return rv; } @@ -1063,3 +1301,18 @@ nommap(dev_t dev, off_t off, int prot) return (paddr_t)-1; } + +/* + * dev_minor_unit(dev) + * + * Returns minor(dev) as an int. Intended for use with struct + * bdevsw, cdevsw::d_devtounit for drivers whose /dev nodes are + * implemented by reference to an autoconf instance with the minor + * number. + */ +int +dev_minor_unit(dev_t dev) +{ + + return minor(dev); +} diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c index da664f920382..41218421db57 100644 --- a/sys/kern/subr_disk.c +++ b/sys/kern/subr_disk.c @@ -728,3 +728,10 @@ disk_set_info(device_t dev, struct disk *dk, const char *type) if (odisk_info) prop_object_release(odisk_info); } + +int +disklabel_dev_unit(dev_t dev) +{ + + return DISKUNIT(dev); +} diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index b4bc4c34ab03..bfc791cef39b 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -81,6 +81,7 @@ __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.183 2021/07/18 23:57:14 dholland Ex #include <sys/kauth.h> #include <sys/fstrans.h> #include <sys/module.h> +#include <sys/atomic.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> @@ -103,6 +104,7 @@ const char devcls[] = "devcls"; static vnode_t *specfs_hash[SPECHSZ]; extern struct mount *dead_rootmount; +static struct kcondvar specfs_iocv; /* * This vnode operations vector is used for special device nodes @@ -210,6 +212,100 @@ spec_init(void) rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE, rawio_listener_cb, NULL); + cv_init(&specfs_iocv, "specio"); +} + +/* + * spec_io_enter(vp, &sn, &dev) + * + * Enter an operation that may not hold vp's vnode lock or an + * fstrans on vp's mount. Until spec_io_exit, the vnode will not + * be revoked. + * + * On success, set sn to the specnode pointer and dev to the dev_t + * number and return zero. Caller must later call spec_io_exit + * when done. + * + * On failure, return ENXIO -- the device has been revoked and no + * longer exists. + */ +static int +spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp) +{ + dev_t dev; + struct specnode *sn; + unsigned iocnt; + int error = 0; + + mutex_enter(vp->v_interlock); + + /* + * Extract all the info we need from the vnode, unless the + * vnode has already been reclaimed. This can happen if the + * underlying device has been removed and all the device nodes + * for it have been revoked. The caller may not hold a vnode + * lock or fstrans to prevent this from happening before it has + * had an opportunity to notice the vnode is dead. + */ + if (vdead_check(vp, VDEAD_NOWAIT) != 0 || + (sn = vp->v_specnode) == NULL || + (dev = vp->v_rdev) == NODEV) { + error = ENXIO; + goto out; + } + + /* + * Notify spec_node_revoke that we are doing an I/O operation + * which may not be not bracketed by fstrans(9) and thus is not + * blocked by vfs suspension. + * + * We could hold this reference with psref(9) instead, but we + * already have to take the interlock for vdead_check, so + * there's not much more cost here to another atomic operation. + */ + iocnt = atomic_inc_uint_nv(&sn->sn_iocnt); + CTASSERT(MAXLWP < UINT_MAX); + KASSERT(iocnt < UINT_MAX); + + /* Success! */ + *snp = sn; + *devp = dev; + error = 0; + +out: mutex_exit(vp->v_interlock); + return error; +} + +/* + * spec_io_exit(vp, sn) + * + * Exit an operation entered with a successful spec_io_enter -- + * allow concurrent spec_node_revoke to proceed. The argument sn + * must match the struct specnode pointer returned by spec_io_exit + * for vp. + */ +static void +spec_io_exit(struct vnode *vp, struct specnode *sn) +{ + unsigned iocnt; + + KASSERT(vp->v_specnode == sn); + + /* + * We are done. Notify spec_node_revoke if appropriate. The + * transition of 1 -> 0 must happen under device_lock so + * spec_node_revoke doesn't miss a wakeup. + */ + do { + iocnt = atomic_load_relaxed(&sn->sn_iocnt); + if (iocnt == 1) { + mutex_enter(&device_lock); + if (atomic_dec_uint_nv(&sn->sn_iocnt) == 0) + cv_broadcast(&specfs_iocv); + mutex_exit(&device_lock); + break; + } + } while (atomic_cas_uint(&sn->sn_iocnt, iocnt, iocnt - 1) != iocnt); } /* @@ -261,6 +357,7 @@ spec_node_init(vnode_t *vp, dev_t rdev) sn->sn_opencnt = 0; sn->sn_rdev = rdev; sn->sn_gone = false; + sn->sn_iocnt = 0; vp->v_specnode = sn; vp->v_specnext = *vpp; *vpp = vp; @@ -406,7 +503,22 @@ spec_node_revoke(vnode_t *vp) if (sn->sn_opencnt != 0) { sd->sd_opencnt -= (sn->sn_opencnt - 1); sn->sn_opencnt = 1; + + /* Prevent new opens. */ sn->sn_gone = true; + + /* + * Wait for all I/O operations in spec_io_enter/exit to + * complete. The caller holds the file system + * suspended, but file system suspension doesn't block + * all device special operations -- only those that run + * to completion holding the vnode lock and a handful + * of others. It is the driver's responsibility to + * ensure that new opens fail gracefully at this point. + */ + while (atomic_load_relaxed(&sn->sn_iocnt) != 0) + cv_wait(&specfs_iocv, &device_lock); + mutex_exit(&device_lock); VOP_CLOSE(vp, FNONBLOCK, NOCRED); @@ -435,6 +547,7 @@ spec_node_destroy(vnode_t *vp) KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); KASSERT(vp->v_specnode != NULL); KASSERT(sn->sn_opencnt == 0); + KASSERT(sn->sn_iocnt == 0); mutex_enter(&device_lock); /* Remove from the hash and destroy the node. */ @@ -690,6 +803,8 @@ spec_read(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -701,6 +816,7 @@ spec_read(void *v) int *rasizes; int nrablks, ratogo; + KASSERT(VOP_ISLOCKED(vp) || (vp->v_vflag & VV_LOCKSWORK) == 0); KASSERT(uio->uio_rw == UIO_READ); KASSERTMSG(VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_vmspace == curproc->p_vmspace, @@ -712,9 +828,27 @@ spec_read(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_read, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the read, take an I/O reference + * to the specnode so revoke will know when we're done + * writing. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. + */ VOP_UNLOCK(vp); - error = cdev_read(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_SHARED | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_read(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_SHARED | LK_RETRY); return (error); case VBLK: @@ -790,6 +924,8 @@ spec_write(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; + struct specnode *sn; + dev_t dev; struct lwp *l = curlwp; struct buf *bp; daddr_t bn; @@ -798,6 +934,8 @@ spec_write(void *v) int n, on; int error = 0; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); KASSERT(uio->uio_rw == UIO_WRITE); KASSERTMSG(VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_vmspace == curproc->p_vmspace, @@ -806,9 +944,27 @@ spec_write(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_write, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the write, take an I/O reference + * to the specnode so revoke will know when we're done + * writing. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. + */ VOP_UNLOCK(vp); - error = cdev_write(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_write(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (error); case VBLK: @@ -866,31 +1022,23 @@ spec_fdiscard(void *v) off_t a_pos; off_t a_len; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; dev_t dev; - vp = ap->a_vp; - dev = NODEV; - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); - if (dev == NODEV) { - return ENXIO; - } + dev = vp->v_rdev; switch (vp->v_type) { - case VCHR: + case VCHR: // this is not stored for character devices //KASSERT(vp == vp->v_specnode->sn_dev->sd_cdevvp); return cdev_discard(dev, ap->a_pos, ap->a_len); - case VBLK: + case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); return bdev_discard(dev, ap->a_pos, ap->a_len); - default: + default: panic("spec_fdiscard: not a device\n"); } } @@ -909,40 +1057,32 @@ spec_ioctl(void *v) int a_fflag; kauth_cred_t a_cred; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { - return ENXIO; - } + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; switch (vp->v_type) { - case VCHR: - return cdev_ioctl(dev, ap->a_command, ap->a_data, + error = cdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - return bdev_ioctl(dev, ap->a_command, ap->a_data, + error = bdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; default: panic("spec_ioctl"); /* NOTREACHED */ } + + spec_io_exit(vp, sn); + return error; } /* ARGSUSED */ @@ -953,33 +1093,25 @@ spec_poll(void *v) struct vnode *a_vp; int a_events; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int revents; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { + if (spec_io_enter(vp, &sn, &dev) != 0) return POLLERR; - } switch (vp->v_type) { - case VCHR: - return cdev_poll(dev, ap->a_events, curlwp); - + revents = cdev_poll(dev, ap->a_events, curlwp); + break; default: - return (genfs_poll(v)); + revents = genfs_poll(v); + break; } + + spec_io_exit(vp, sn); + return revents; } /* ARGSUSED */ @@ -990,20 +1122,30 @@ spec_kqfilter(void *v) struct vnode *a_vp; struct proc *a_kn; } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - switch (ap->a_vp->v_type) { + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + switch (vp->v_type) { case VCHR: - dev = ap->a_vp->v_rdev; - return cdev_kqfilter(dev, ap->a_kn); + error = cdev_kqfilter(dev, ap->a_kn); + break; default: /* * Block devices don't support kqfilter, and refuse it * for any other files (like those vflush()ed) too. */ - return (EOPNOTSUPP); + error = EOPNOTSUPP; + break; } + + spec_io_exit(vp, sn); + return error; } /* @@ -1018,11 +1160,19 @@ spec_mmap(void *v) kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; + struct specnode *sn; + dev_t dev; + int error; + + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; KASSERT(vp->v_type == VBLK); - if (bdev_type(vp->v_rdev) != D_DISK) + if (bdev_type(dev) != D_DISK) return EINVAL; + spec_io_exit(vp, sn); return 0; } @@ -1067,27 +1217,15 @@ spec_strategy(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; + struct specnode *sn; dev_t dev; + bool ioentered = false; int error; - dev = NODEV; - - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - - if (dev == NODEV) { - error = ENXIO; + error = spec_io_enter(vp, &sn, &dev); + if (error) goto out; - } + ioentered = true; bp->b_dev = dev; if (!(bp->b_flags & B_READ)) { @@ -1106,14 +1244,15 @@ spec_strategy(void *v) goto out; } bdev_strategy(bp); - - return 0; - -out: - bp->b_error = error; - bp->b_resid = bp->b_bcount; - biodone(bp); - + error = 0; + +out: if (ioentered) + spec_io_exit(vp, sn); + if (error) { + bp->b_error = error; + bp->b_resid = bp->b_bcount; + biodone(bp); + } return error; } @@ -1284,18 +1423,22 @@ spec_close(void *v) * might end up sleeping for someone else who wants our queues. They * won't get them if we hold the vnode locked. */ - if (!(flags & FNONBLOCK)) + if (!(flags & FNONBLOCK)) { + fstrans_start(vp->v_mount); VOP_UNLOCK(vp); + } if (vp->v_type == VBLK) error = bdev_close(dev, flags, mode, curlwp); else error = cdev_close(dev, flags, mode, curlwp); - if (!(flags & FNONBLOCK)) + if (!(flags & FNONBLOCK)) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + fstrans_done(vp->v_mount); + } - return (error); + return error; } /* diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 621c103e80b1..e8afc0606db9 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -69,6 +69,7 @@ typedef struct specnode { u_int sn_opencnt; dev_t sn_rdev; bool sn_gone; + volatile u_int sn_iocnt; /* # active I/O operations */ } specnode_t; typedef struct specdev { diff --git a/sys/modules/examples/pollpal/pollpal.c b/sys/modules/examples/pollpal/pollpal.c index b76e0733699b..d8ddc73450e0 100644 --- a/sys/modules/examples/pollpal/pollpal.c +++ b/sys/modules/examples/pollpal/pollpal.c @@ -311,7 +311,8 @@ pollpal_modcmd(modcmd_t cmd, void *arg __unused) case MODULE_CMD_FINI: if (pollpal_nopen != 0) return EBUSY; - return devsw_detach(NULL, &pollpal_cdevsw); + devsw_detach(NULL, &pollpal_cdevsw); + return 0; default: return ENOTTY; } diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c index 0b57ad4a711c..314f4647707c 100644 --- a/sys/net/if_tap.c +++ b/sys/net/if_tap.c @@ -256,9 +256,7 @@ tapdetach(void) if_clone_detach(&tap_cloners); #ifdef _MODULE - error = devsw_detach(NULL, &tap_cdevsw); - if (error != 0) - goto out2; + devsw_detach(NULL, &tap_cdevsw); #endif if (tap_count != 0) { @@ -277,7 +275,6 @@ tapdetach(void) out1: #ifdef _MODULE devsw_attach("tap", NULL, &tap_bmajor, &tap_cdevsw, &tap_cmajor); - out2: #endif if_clone_attach(&tap_cloners); diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index 4f533a8f08d1..f4e5b6d86d43 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -142,17 +142,10 @@ tuninit(void) static int tundetach(void) { -#ifdef _MODULE - int error; -#endif if_clone_detach(&tun_cloner); #ifdef _MODULE - error = devsw_detach(NULL, &tun_cdevsw); - if (error != 0) { - if_clone_attach(&tun_cloner); - return error; - } + devsw_detach(NULL, &tun_cdevsw); #endif if (!LIST_EMPTY(&tun_softc_list) || !LIST_EMPTY(&tunz_softc_list)) { diff --git a/sys/rump/dev/lib/libbpf/bpf_component.c b/sys/rump/dev/lib/libbpf/bpf_component.c index 05807d371d40..d41d1987afe8 100644 --- a/sys/rump/dev/lib/libbpf/bpf_component.c +++ b/sys/rump/dev/lib/libbpf/bpf_component.c @@ -50,6 +50,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_NET) panic("bpf devsw attach failed: %d", error); if ((error = rump_vfs_makeonedevnode(S_IFCHR, "/dev/bpf", cmaj, 0)) !=0) panic("cannot create bpf device nodes: %d", error); - if ((error = devsw_detach(NULL, &bpf_cdevsw)) != 0) - panic("cannot detach bpf devsw: %d", error); + devsw_detach(NULL, &bpf_cdevsw); } diff --git a/sys/rump/dev/lib/libdrvctl/drvctl_component.c b/sys/rump/dev/lib/libdrvctl/drvctl_component.c index e2e79f45f9de..ac4e103fdb9c 100644 --- a/sys/rump/dev/lib/libdrvctl/drvctl_component.c +++ b/sys/rump/dev/lib/libdrvctl/drvctl_component.c @@ -51,7 +51,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_DEV) if ( error !=0) panic("cannot create drvctl device node: %d", error); - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 0) - panic("cannot detach drvctl devsw: %d", error); + devsw_detach(NULL, &drvctl_cdevsw); } diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 081631d2111f..cb4c287d1982 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -76,6 +76,8 @@ struct bdevsw { int (*d_dump)(dev_t, daddr_t, void *, size_t); int (*d_psize)(dev_t); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; @@ -94,6 +96,8 @@ struct cdevsw { paddr_t (*d_mmap)(dev_t, off_t, int); int (*d_kqfilter)(dev_t, struct knote *); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; @@ -104,7 +108,7 @@ extern kmutex_t device_lock; int devsw_attach(const char *, const struct bdevsw *, devmajor_t *, const struct cdevsw *, devmajor_t *); -int devsw_detach(const struct bdevsw *, const struct cdevsw *); +void devsw_detach(const struct bdevsw *, const struct cdevsw *); const struct bdevsw *bdevsw_lookup(dev_t); const struct cdevsw *cdevsw_lookup(dev_t); devmajor_t bdevsw_lookup_major(const struct bdevsw *); @@ -276,6 +280,7 @@ devmajor_t devsw_name2blk(const char *, char *, size_t); devmajor_t devsw_name2chr(const char *, char *, size_t); dev_t devsw_chr2blk(dev_t); dev_t devsw_blk2chr(dev_t); +int dev_minor_unit(dev_t); void mm_init(void); #endif /* _KERNEL */ diff --git a/sys/sys/device.h b/sys/sys/device.h index 99940ab1563e..76d9f8be5ad6 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -274,10 +274,12 @@ struct device { void *dv_private; /* this device's private storage */ int *dv_locators; /* our actual locators (optional) */ prop_dictionary_t dv_properties;/* properties dictionary */ + struct localcount *dv_localcount;/* reference count */ int dv_pending; /* config_pending count */ TAILQ_ENTRY(device) dv_pending_list; + struct lwp *dv_attaching; /* thread not yet finished in attach */ struct lwp *dv_detaching; /* detach lock (config_misc_lock/cv) */ size_t dv_activity_count; @@ -651,6 +653,10 @@ void null_childdetached(device_t, device_t); device_t device_lookup(cfdriver_t, int); void *device_lookup_private(cfdriver_t, int); + +device_t device_lookup_acquire(cfdriver_t, int); +void device_release(device_t); + void device_register(device_t, void *); void device_register_post_config(device_t, void *); diff --git a/sys/sys/disklabel.h b/sys/sys/disklabel.h index 4e94b8671332..853cdbe668a3 100644 --- a/sys/sys/disklabel.h +++ b/sys/sys/disklabel.h @@ -509,6 +509,7 @@ const char *convertdisklabel(struct disklabel *, void (*)(struct buf *), int bounds_check_with_label(struct disk *, struct buf *, int); int bounds_check_with_mediasize(struct buf *, int, uint64_t); const char *getfstypename(int); +int disklabel_dev_unit(dev_t); #endif #endif /* _LOCORE */