diff --git a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c index cc0f8103fb98..f1338840125a 100644 --- a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c +++ b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c @@ -42,9 +42,7 @@ dtrace_modcmd(modcmd_t cmd, void *data) return error; case MODULE_CMD_FINI: - error = devsw_detach(NULL, &dtrace_cdevsw); - if (error != 0) - return error; + devsw_detach(NULL, &dtrace_cdevsw); error = dtrace_unload(); if (error != 0) { diff --git a/external/cddl/osnet/dev/fbt/fbt.c b/external/cddl/osnet/dev/fbt/fbt.c index b367c2155292..46dd7c1f7f06 100644 --- a/external/cddl/osnet/dev/fbt/fbt.c +++ b/external/cddl/osnet/dev/fbt/fbt.c @@ -1329,7 +1329,8 @@ dtrace_fbt_modcmd(modcmd_t cmd, void *data) error = fbt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &fbt_cdevsw); + devsw_detach(NULL, &fbt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dev/sdt/sdt.c b/external/cddl/osnet/dev/sdt/sdt.c index c3ad129f8284..5a41270a2917 100644 --- a/external/cddl/osnet/dev/sdt/sdt.c +++ b/external/cddl/osnet/dev/sdt/sdt.c @@ -562,7 +562,8 @@ dtrace_sdt_modcmd(modcmd_t cmd, void *data) error = sdt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &sdt_cdevsw); + devsw_detach(NULL, &sdt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c index 9e19cd1dc0c3..d74d8c71e54d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c @@ -7231,7 +7231,7 @@ zfs_modcmd(modcmd_t cmd, void *arg) if (error) return error; - (void) devsw_detach(&zfs_bdevsw, &zfs_cdevsw); + devsw_detach(&zfs_bdevsw, &zfs_cdevsw); attacherr: zfs_sysctl_fini(); diff --git a/share/man/man9/devsw_attach.9 b/share/man/man9/devsw_attach.9 index cf862be5846a..6ffc51957a3f 100644 --- a/share/man/man9/devsw_attach.9 +++ b/share/man/man9/devsw_attach.9 @@ -27,7 +27,7 @@ .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" -.Dd April 30, 2017 +.Dd January 11, 2022 .Dt DEVSW 9 .Os .Sh NAME @@ -49,7 +49,7 @@ .Fa "const struct cdevsw *cdev" .Fa "devmajor_t *cmajor" .Fc -.Ft int +.Ft void .Fo devsw_detach .Fa "const struct bdevsw *bdev" .Fa "const struct cdevsw *cdev" @@ -130,6 +130,11 @@ and structures. .Fn devsw_detach should be called before a loaded device driver is unloaded. +The caller must ensure that there are no open instances of the device, +and that the device's +.Fn d_open +function will fail, before calling +.Fn devsw_detach . .Pp The .Fn bdevsw_lookup @@ -155,10 +160,8 @@ or .Sh RETURN VALUES Upon successful completion, .Fn devsw_attach -and -.Fn devsw_detach -return 0. -Otherwise they return an error value. +returns 0. +Otherwise it returns an error value.
.Pp In case of failure, .Fn bdevsw_lookup diff --git a/sys/coda/coda_psdev.c b/sys/coda/coda_psdev.c index cede16da3f53..7f531f03fe56 100644 --- a/sys/coda/coda_psdev.c +++ b/sys/coda/coda_psdev.c @@ -758,7 +758,7 @@ vcoda_modcmd(modcmd_t cmd, void *arg) if (VC_OPEN(vcp)) return EBUSY; } - return devsw_detach(NULL, &vcoda_cdevsw); + devsw_detach(NULL, &vcoda_cdevsw); } #endif break; diff --git a/sys/dev/ccd.c b/sys/dev/ccd.c index 05945f9a67ba..2283bc0346da 100644 --- a/sys/dev/ccd.c +++ b/sys/dev/ccd.c @@ -1710,7 +1710,7 @@ ccd_modcmd(modcmd_t cmd, void *arg) error = EBUSY; } else { mutex_exit(&ccd_lock); - error = devsw_detach(&ccd_bdevsw, &ccd_cdevsw); + devsw_detach(&ccd_bdevsw, &ccd_cdevsw); ccddetach(); } #endif diff --git a/sys/dev/clockctl.c b/sys/dev/clockctl.c index 0da5e7765fe8..9685c0f129f6 100644 --- a/sys/dev/clockctl.c +++ b/sys/dev/clockctl.c @@ -182,14 +182,12 @@ clockctl_modcmd(modcmd_t cmd, void *data) return EBUSY; } #ifdef _MODULE - error = devsw_detach(NULL, &clockctl_cdevsw); + devsw_detach(NULL, &clockctl_cdevsw); #endif mutex_exit(&clockctl_mtx); - if (error == 0) { - kauth_unlisten_scope(clockctl_listener); - mutex_destroy(&clockctl_mtx); - } + kauth_unlisten_scope(clockctl_listener); + mutex_destroy(&clockctl_mtx); break; default: diff --git a/sys/dev/hdaudio/hdaudio.c b/sys/dev/hdaudio/hdaudio.c index d39ff2db6cde..5c7874778e22 100644 --- a/sys/dev/hdaudio/hdaudio.c +++ b/sys/dev/hdaudio/hdaudio.c @@ -1636,11 +1636,7 @@ hdaudio_modcmd(modcmd_t cmd, void *opaque) error = config_cfdriver_detach(&hdaudio_cd); if (error) break; - error = devsw_detach(NULL, &hdaudio_cdevsw); - if (error) { - config_cfdriver_attach(&hdaudio_cd); - break; - } + devsw_detach(NULL, &hdaudio_cdevsw); #endif break; default: diff --git a/sys/dev/i2c/i2c.c b/sys/dev/i2c/i2c.c index 6f2c0c6a9698..36a3e87d5316 100644 --- a/sys/dev/i2c/i2c.c +++ b/sys/dev/i2c/i2c.c @@ -942,7 +942,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) if (error) { aprint_error("%s: unable to init component\n", iic_cd.cd_name); - (void)devsw_detach(NULL, &iic_cdevsw); + devsw_detach(NULL, &iic_cdevsw); } mutex_exit(&iic_mtx); #endif @@ -960,10 +960,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) mutex_exit(&iic_mtx); break; } - error = devsw_detach(NULL, &iic_cdevsw); - if (error != 0) - config_init_component(cfdriver_ioconf_iic, - cfattach_ioconf_iic, cfdata_ioconf_iic); + devsw_detach(NULL, &iic_cdevsw); #endif mutex_exit(&iic_mtx); break; diff --git a/sys/dev/pad/pad.c b/sys/dev/pad/pad.c index fe0b429cf386..a779f1f71b8d 100644 --- a/sys/dev/pad/pad.c +++ b/sys/dev/pad/pad.c @@ -777,9 +777,7 @@ pad_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #ifdef _MODULE - error = devsw_detach(NULL, &pad_cdevsw); - if (error) - break; + devsw_detach(NULL, &pad_cdevsw); error = config_fini_component(cfdriver_ioconf_pad, pad_cfattach, cfdata_ioconf_pad); diff --git a/sys/dev/raidframe/rf_netbsdkintf.c b/sys/dev/raidframe/rf_netbsdkintf.c index d1bda3553e03..87439aa70bfb 100644 --- a/sys/dev/raidframe/rf_netbsdkintf.c +++ b/sys/dev/raidframe/rf_netbsdkintf.c @@ -4088,16 +4088,7 @@ raid_modcmd_fini(void) return error; } #endif - error = devsw_detach(&raid_bdevsw, &raid_cdevsw); - if (error != 0) { - aprint_error("%s: cannot detach devsw\n",__func__); -#ifdef _MODULE - config_cfdriver_attach(&raid_cd); -#endif - config_cfattach_attach(raid_cd.cd_name, &raid_ca); - mutex_exit(&raid_lock); - return error; - } + devsw_detach(&raid_bdevsw, &raid_cdevsw); rf_BootRaidframe(false); #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 
rf_destroy_mutex2(rf_sparet_wait_mutex); diff --git a/sys/dev/sysmon/sysmon.c b/sys/dev/sysmon/sysmon.c index 46aedaad7337..fd2993f0c180 100644 --- a/sys/dev/sysmon/sysmon.c +++ b/sys/dev/sysmon/sysmon.c @@ -356,7 +356,7 @@ sysmon_fini(void) if (error == 0) { mutex_enter(&sysmon_minor_mtx); sm_is_attached = false; - error = devsw_detach(NULL, &sysmon_cdevsw); + devsw_detach(NULL, &sysmon_cdevsw); mutex_exit(&sysmon_minor_mtx); } #endif diff --git a/sys/dev/tprof/tprof.c b/sys/dev/tprof/tprof.c index b069a5b7df5d..136fd190ad14 100644 --- a/sys/dev/tprof/tprof.c +++ b/sys/dev/tprof/tprof.c @@ -768,13 +768,7 @@ tprof_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #if defined(_MODULE) - { - int error; - error = devsw_detach(NULL, &tprof_cdevsw); - if (error) { - return error; - } - } + devsw_detach(NULL, &tprof_cdevsw); #endif /* defined(_MODULE) */ tprof_driver_fini(); return 0; diff --git a/sys/dist/pf/net/pf_ioctl.c b/sys/dist/pf/net/pf_ioctl.c index 94bfb70a411d..e4c13be698f8 100644 --- a/sys/dist/pf/net/pf_ioctl.c +++ b/sys/dist/pf/net/pf_ioctl.c @@ -3459,7 +3459,8 @@ pf_modcmd(modcmd_t cmd, void *opaque) } else { pfdetach(); pflogdetach(); - return devsw_detach(NULL, &pf_cdevsw); + devsw_detach(NULL, &pf_cdevsw); + return 0; } default: return ENOTTY; diff --git a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c index d0c4ca95097c..bb4e0706cc9b 100644 --- a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c +++ b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c @@ -2256,7 +2256,7 @@ ipl_fini(void *opaque) { #ifdef _MODULE - (void)devsw_detach(NULL, &ipl_cdevsw); + devsw_detach(NULL, &ipl_cdevsw); #endif /* diff --git a/sys/fs/autofs/autofs_vfsops.c b/sys/fs/autofs/autofs_vfsops.c index fbd6eafe6532..1204d1f9b6d3 100644 --- a/sys/fs/autofs/autofs_vfsops.c +++ b/sys/fs/autofs/autofs_vfsops.c @@ -496,9 +496,7 @@ autofs_modcmd(modcmd_t cmd, void *arg) } mutex_exit(&autofs_softc->sc_lock); - error = devsw_detach(NULL, &autofs_cdevsw); - if (error) - break; + devsw_detach(NULL, &autofs_cdevsw); #endif error = vfs_detach(&autofs_vfsops); if (error) diff --git a/sys/kern/kern_drvctl.c b/sys/kern/kern_drvctl.c index 37f4730b2512..8a4156f8a0aa 100644 --- a/sys/kern/kern_drvctl.c +++ b/sys/kern/kern_drvctl.c @@ -665,15 +665,10 @@ drvctl_modcmd(modcmd_t cmd, void *arg) devmon_insert_vec = saved_insert_vec; saved_insert_vec = NULL; #ifdef _MODULE - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 0) { - saved_insert_vec = devmon_insert_vec; - devmon_insert_vec = devmon_insert; - } + devsw_detach(NULL, &drvctl_cdevsw); #endif mutex_exit(&drvctl_lock); - if (error == 0) - drvctl_fini(); + drvctl_fini(); break; default: diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index 8439ae8c6a00..8c0b66556e0f 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -108,6 +108,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.291 2021/12/31 14:19:57 riastrad #include #include #include +#include <sys/localcount.h> #include @@ -1453,6 +1454,9 @@ config_devdelete(device_t dev) if (dg->dg_devs != NULL) kmem_free(dg->dg_devs, sizeof(device_t) * dg->dg_ndevs); + localcount_fini(dev->dv_localcount); + kmem_free(dev->dv_localcount, sizeof(*dev->dv_localcount)); + cv_destroy(&dvl->dvl_cv); mutex_destroy(&dvl->dvl_mtx); @@ -1556,6 +1560,7 @@ config_devalloc(const device_t parent, const cfdata_t cf, dev->dv_activity_handlers = NULL; dev->dv_private = dev_private; dev->dv_flags = ca->ca_flags; /* inherit flags from class */ + dev->dv_attaching = curlwp; myunit
= config_unit_alloc(dev, cd, cf); if (myunit == -1) { @@ -1604,6 +1609,10 @@ config_devalloc(const device_t parent, const cfdata_t cf, "device-parent", device_xname(parent)); } + dev->dv_localcount = kmem_zalloc(sizeof(*dev->dv_localcount), + KM_SLEEP); + localcount_init(dev->dv_localcount); + if (dev->dv_cfdriver->cd_attrs != NULL) config_add_attrib_dict(dev); @@ -1755,8 +1764,29 @@ config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print, /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(parent, dev, aux); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. + */ config_pending_decr(dev); mutex_enter(&config_misc_lock); @@ -1822,8 +1852,29 @@ config_attach_pseudo(cfdata_t cf) /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(ROOT, dev, NULL); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. + */ config_pending_decr(dev); config_process_deferred(&deferred_config_queue, dev); @@ -1872,24 +1923,39 @@ config_dump_garbage(struct devicelist *garbage) static int config_detach_enter(device_t dev) { - int error; + int error = 0; mutex_enter(&config_misc_lock); - for (;;) { - if (dev->dv_pending == 0 && dev->dv_detaching == NULL) { - dev->dv_detaching = curlwp; - error = 0; - break; - } + + /* + * Wait until attach has fully completed, and until any + * concurrent detach (e.g., drvctl racing with USB event + * thread) has completed. + * + * Caller must hold alldevs_nread or alldevs_nwrite (e.g., via + * deviter) to ensure the winner of the race doesn't free the + * device leading the loser of the race into use-after-free. + * + * XXX Not all callers do this! + */ + while (dev->dv_pending || dev->dv_detaching) { KASSERTMSG(dev->dv_detaching != curlwp, "recursively detaching %s", device_xname(dev)); error = cv_wait_sig(&config_misc_cv, &config_misc_lock); if (error) - break; + goto out; } - KASSERT(error || dev->dv_detaching == curlwp); - mutex_exit(&config_misc_lock); + /* + * Attach has completed, and no other concurrent detach is + * running. Claim the device for detaching. This will cause + * all new attempts to acquire references to block. 
+ */ + KASSERT(dev->dv_attaching == NULL); + KASSERT(dev->dv_detaching == NULL); + dev->dv_detaching = curlwp; + +out: mutex_exit(&config_misc_lock); return error; } @@ -1980,9 +2046,10 @@ config_detach(device_t dev, int flags) */ if (rv == 0) dev->dv_flags &= ~DVF_ACTIVE; - else if ((flags & DETACH_FORCE) == 0) + else if ((flags & DETACH_FORCE) == 0) { + /* Detach failed -- likely EBUSY. */ goto out; - else { + } else { panic("config_detach: forced detach of %s failed (%d)", device_xname(dev), rv); } @@ -1991,6 +2058,19 @@ config_detach(device_t dev, int flags) * The device has now been successfully detached. */ + /* + * Wait for all device_lookup_acquire references -- mostly, for + * all attempts to open the device -- to drain. It is the + * responsibility of .ca_detach to ensure anything with open + * references will be interrupted and release them promptly, + * not block indefinitely. All new attempts to acquire + * references will block until dv_detaching clears. + */ + mutex_enter(&config_misc_lock); + localcount_drain(dev->dv_localcount, + &config_misc_cv, &config_misc_lock); + mutex_exit(&config_misc_lock); + /* Let userland know */ devmon_report_device(dev, false); @@ -2498,6 +2578,14 @@ config_alldevs_exit(struct alldevs_foray *af) * device_lookup: * * Look up a device instance for a given driver. + * + * Caller is responsible for ensuring the device's state is + * stable, either by holding a reference already obtained with + * device_lookup_acquire or by otherwise ensuring the device is + * attached and can't be detached (e.g., holding an open device + * node and ensuring *_detach calls vdevgone). + * + * XXX Find a way to assert this. */ device_t device_lookup(cfdriver_t cd, int unit) @@ -2526,6 +2614,69 @@ device_lookup_private(cfdriver_t cd, int unit) return device_private(device_lookup(cd, unit)); } +/* + * device_lookup_acquire: + * + * Look up a device instance for a given driver, and return a + * reference to it that must be released by device_release. + * + * => If the device is still attaching, blocks until *_attach has + * returned. + * + * => If the device is detaching, blocks until *_detach has + * returned. May succeed or fail in that case, depending on + * whether *_detach has backed out (EBUSY) or committed to + * detaching. + */ +device_t +device_lookup_acquire(cfdriver_t cd, int unit) +{ + device_t dv; + + /* XXX This should have a pserialized fast path -- TBD. */ + mutex_enter(&config_misc_lock); + mutex_enter(&alldevs_lock); +retry: if (unit < 0 || unit >= cd->cd_ndevs || + (dv = cd->cd_devs[unit]) == NULL || + dv->dv_del_gen != 0) { + dv = NULL; + } else { + /* + * Wait for the device to stabilize, if attaching or + * detaching. Either way we must wait for *_attach or + * *_detach to complete, and either way we must retry: + * even if detaching, *_detach might fail (EBUSY) so + * the device may still be there. + */ + if ((dv->dv_attaching != NULL && dv->dv_attaching != curlwp) || + dv->dv_detaching != NULL) { + mutex_exit(&alldevs_lock); + cv_wait(&config_misc_cv, &config_misc_lock); + mutex_enter(&alldevs_lock); + goto retry; + } + localcount_acquire(dv->dv_localcount); + } + mutex_exit(&alldevs_lock); + mutex_exit(&config_misc_lock); + + return dv; +} + +/* + * device_release: + * + * Release a reference to a device acquired with + * device_lookup_acquire. 
*/ +void +device_release(device_t dv) +{ + + localcount_release(dv->dv_localcount, + &config_misc_cv, &config_misc_lock); +} + /* * device_find_by_xname: * diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 1a0f721fdd65..8b55187b32c1 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -85,6 +85,11 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #include #include #include +#include <sys/atomic.h> +#include <sys/device.h> +#include <sys/localcount.h> +#include <sys/pserialize.h> +#include <sys/xcall.h> #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x @@ -97,12 +102,22 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #define CDEVSW_SIZE (sizeof(struct cdevsw *)) #define DEVSWCONV_SIZE (sizeof(struct devsw_conv)) +struct devswref { + struct localcount *dr_lc; + bool dr_dynamic; +}; + +/* XXX bdevsw, cdevsw, max_bdevsws, and max_cdevsws should be volatile */ extern const struct bdevsw **bdevsw, *bdevsw0[]; extern const struct cdevsw **cdevsw, *cdevsw0[]; extern struct devsw_conv *devsw_conv, devsw_conv0[]; extern const int sys_bdevsws, sys_cdevsws; extern int max_bdevsws, max_cdevsws, max_devsw_convs; +static struct devswref *cdevswref; +static struct devswref *bdevswref; +static kcondvar_t devsw_cv; + static int bdevsw_attach(const struct bdevsw *, devmajor_t *); static int cdevsw_attach(const struct cdevsw *, devmajor_t *); static void devsw_detach_locked(const struct bdevsw *, const struct cdevsw *); @@ -118,6 +133,8 @@ devsw_init(void) KASSERT(sys_bdevsws < MAXDEVSW - 1); KASSERT(sys_cdevsws < MAXDEVSW - 1); mutex_init(&device_lock, MUTEX_DEFAULT, IPL_NONE); + + cv_init(&devsw_cv, "devsw"); } int @@ -158,15 +175,12 @@ devsw_attach(const char *devname, error = EEXIST; goto fail; } - - if (bdev != NULL) - bdevsw[*bmajor] = bdev; - cdevsw[*cmajor] = cdev; - - mutex_exit(&device_lock); - return (0); } + /* + * XXX This should allocate what it needs up front so we never + * need to flail around trying to unwind. + */ error = bdevsw_attach(bdev, bmajor); if (error != 0) goto fail; @@ -176,6 +190,13 @@ goto fail; } + /* + * If we already found a conv, we're done. Otherwise, find an + * empty slot or extend the table.
+ */ + if (i < max_devsw_convs) { + mutex_exit(&device_lock); + return 0; + } + for (i = 0 ; i < max_devsw_convs ; i++) { if (devsw_conv[i].d_name == NULL) break; } @@ -224,7 +245,9 @@ devsw_attach(const char *devname, static int bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) { - const struct bdevsw **newptr; + const struct bdevsw **newbdevsw = NULL; + struct devswref *newbdevswref = NULL; + struct localcount *lc; devmajor_t bmajor; int i; @@ -253,20 +276,35 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (bdevswref == NULL) { + newbdevswref = kmem_zalloc(MAXDEVSW * sizeof(newbdevswref[0]), + KM_NOSLEEP); + if (newbdevswref == NULL) + return ENOMEM; + atomic_store_release(&bdevswref, newbdevswref); + } + if (*devmajor >= max_bdevsws) { KASSERT(bdevsw == bdevsw0); - newptr = kmem_zalloc(MAXDEVSW * BDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, bdevsw, max_bdevsws * BDEVSW_SIZE); - bdevsw = newptr; - max_bdevsws = MAXDEVSW; + newbdevsw = kmem_zalloc(MAXDEVSW * sizeof(newbdevsw[0]), + KM_NOSLEEP); + if (newbdevsw == NULL) + return ENOMEM; + memcpy(newbdevsw, bdevsw, max_bdevsws * sizeof(bdevsw[0])); + atomic_store_release(&bdevsw, newbdevsw); + atomic_store_release(&max_bdevsws, MAXDEVSW); } if (bdevsw[*devmajor] != NULL) return (EEXIST); - bdevsw[*devmajor] = devsw; + KASSERT(bdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + bdevswref[*devmajor].dr_lc = lc; + bdevswref[*devmajor].dr_dynamic = true; + + atomic_store_release(&bdevsw[*devmajor], devsw); return (0); } @@ -274,7 +312,9 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) static int cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) { - const struct cdevsw **newptr; + const struct cdevsw **newcdevsw = NULL; + struct devswref *newcdevswref = NULL; + struct localcount *lc; devmajor_t cmajor; int i; @@ -300,20 +340,35 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (cdevswref == NULL) { + newcdevswref = kmem_zalloc(MAXDEVSW * sizeof(newcdevswref[0]), + KM_NOSLEEP); + if (newcdevswref == NULL) + return ENOMEM; + atomic_store_release(&cdevswref, newcdevswref); + } + if (*devmajor >= max_cdevsws) { KASSERT(cdevsw == cdevsw0); - newptr = kmem_zalloc(MAXDEVSW * CDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, cdevsw, max_cdevsws * CDEVSW_SIZE); - cdevsw = newptr; - max_cdevsws = MAXDEVSW; + newcdevsw = kmem_zalloc(MAXDEVSW * sizeof(newcdevsw[0]), + KM_NOSLEEP); + if (newcdevsw == NULL) + return ENOMEM; + memcpy(newcdevsw, cdevsw, max_cdevsws * sizeof(cdevsw[0])); + atomic_store_release(&cdevsw, newcdevsw); + atomic_store_release(&max_cdevsws, MAXDEVSW); } if (cdevsw[*devmajor] != NULL) return (EEXIST); - cdevsw[*devmajor] = devsw; + KASSERT(cdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + cdevswref[*devmajor].dr_lc = lc; + cdevswref[*devmajor].dr_dynamic = true; + + atomic_store_release(&cdevsw[*devmajor], devsw); return (0); } @@ -321,36 +376,75 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) static void devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev) { - int i; + int bi, ci = -1/*XXXGCC*/; KASSERT(mutex_owned(&device_lock)); + /* Prevent new references.
*/ if (bdev != NULL) { - for (i = 0 ; i < max_bdevsws ; i++) { - if (bdevsw[i] != bdev) + for (bi = 0; bi < max_bdevsws; bi++) { + if (bdevsw[bi] != bdev) continue; - bdevsw[i] = NULL; + atomic_store_relaxed(&bdevsw[bi], NULL); break; } + KASSERT(bi < max_bdevsws); } if (cdev != NULL) { - for (i = 0 ; i < max_cdevsws ; i++) { - if (cdevsw[i] != cdev) + for (ci = 0; ci < max_cdevsws; ci++) { + if (cdevsw[ci] != cdev) continue; - cdevsw[i] = NULL; + atomic_store_relaxed(&cdevsw[ci], NULL); break; } + KASSERT(ci < max_cdevsws); + } + + if (bdev == NULL && cdev == NULL) /* XXX possible? */ + return; + + /* + * Wait for all bdevsw_lookup_acquire, cdevsw_lookup_acquire + * calls to notice that the devsw is gone. + * + * XXX Can't use pserialize_perform here because devsw_init is + * too early for pserialize_create(). + */ + xc_barrier(0); + + /* + * Wait for all references to drain. It is the caller's + * responsibility to ensure that at this point, there are no + * extant open instances and all new d_open calls will fail. + * + * Note that localcount_drain may release and reacquire + * device_lock. + */ + if (bdev != NULL) { + localcount_drain(bdevswref[bi].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(bdevswref[bi].dr_lc); + kmem_free(bdevswref[bi].dr_lc, sizeof(*bdevswref[bi].dr_lc)); + bdevswref[bi].dr_lc = NULL; + bdevswref[bi].dr_dynamic = false; + } + if (cdev != NULL) { + localcount_drain(cdevswref[ci].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(cdevswref[ci].dr_lc); + kmem_free(cdevswref[ci].dr_lc, sizeof(*cdevswref[ci].dr_lc)); + cdevswref[ci].dr_lc = NULL; + cdevswref[ci].dr_dynamic = false; } } -int +void devsw_detach(const struct bdevsw *bdev, const struct cdevsw *cdev) { mutex_enter(&device_lock); devsw_detach_locked(bdev, cdev); mutex_exit(&device_lock); - return 0; } /* @@ -366,10 +460,60 @@ bdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); bmajor = major(dev); - if (bmajor < 0 || bmajor >= max_bdevsws) + if (bmajor < 0 || bmajor >= atomic_load_relaxed(&max_bdevsws)) return (NULL); - return (bdevsw[bmajor]); + return atomic_load_consume(&bdevsw)[bmajor]; +} + +static const struct bdevsw * +bdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t bmajor; + const struct bdevsw *bdev = NULL, *const *curbdevsw; + struct devswref *curbdevswref; + int s; + + if (dev == NODEV) + return NULL; + bmajor = major(dev); + if (bmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_bdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_bdevsws first so we can avoid the memory barrier in + * that case. 
+ */ + if (bmajor >= sys_bdevsws && + bmajor >= atomic_load_acquire(&max_bdevsws)) + goto out; + curbdevsw = atomic_load_consume(&bdevsw); + if ((bdev = atomic_load_consume(&curbdevsw[bmajor])) == NULL) + goto out; + + curbdevswref = atomic_load_consume(&bdevswref); + if (curbdevswref == NULL || !curbdevswref[bmajor].dr_dynamic) { + *lcp = NULL; + } else { + *lcp = curbdevswref[bmajor].dr_lc; + localcount_acquire(*lcp); + } + +out: pserialize_read_exit(s); + return bdev; +} + +static void +bdevsw_release(const struct bdevsw *bdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -385,10 +529,60 @@ cdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); cmajor = major(dev); - if (cmajor < 0 || cmajor >= max_cdevsws) + if (cmajor < 0 || cmajor >= atomic_load_relaxed(&max_cdevsws)) return (NULL); - return (cdevsw[cmajor]); + return atomic_load_consume(&cdevsw)[cmajor]; +} + +static const struct cdevsw * +cdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t cmajor; + const struct cdevsw *cdev = NULL, *const *curcdevsw; + struct devswref *curcdevswref; + int s; + + if (dev == NODEV) + return NULL; + cmajor = major(dev); + if (cmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_cdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_cdevsws first so we can avoid the memory barrier in + * that case. + */ + if (cmajor >= sys_cdevsws && + cmajor >= atomic_load_acquire(&max_cdevsws)) + goto out; + curcdevsw = atomic_load_consume(&cdevsw); + if ((cdev = atomic_load_consume(&curcdevsw[cmajor])) == NULL) + goto out; + + curcdevswref = atomic_load_consume(&cdevswref); + if (curcdevswref == NULL || !curcdevswref[cmajor].dr_dynamic) { + *lcp = NULL; + } else { + *lcp = curcdevswref[cmajor].dr_lc; + localcount_acquire(*lcp); + } + +out: pserialize_read_exit(s); + return cdev; +} + +static void +cdevsw_release(const struct cdevsw *cdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -400,10 +594,13 @@ cdevsw_lookup(dev_t dev) devmajor_t bdevsw_lookup_major(const struct bdevsw *bdev) { - devmajor_t bmajor; + const struct bdevsw *const *curbdevsw; + devmajor_t bmajor, bmax; - for (bmajor = 0 ; bmajor < max_bdevsws ; bmajor++) { - if (bdevsw[bmajor] == bdev) + bmax = atomic_load_acquire(&max_bdevsws); + curbdevsw = atomic_load_consume(&bdevsw); + for (bmajor = 0; bmajor < bmax; bmajor++) { + if (atomic_load_relaxed(&curbdevsw[bmajor]) == bdev) return (bmajor); } @@ -419,10 +616,13 @@ bdevsw_lookup_major(const struct bdevsw *bdev) devmajor_t cdevsw_lookup_major(const struct cdevsw *cdev) { - devmajor_t cmajor; + const struct cdevsw *const *curcdevsw; + devmajor_t cmajor, cmax; - for (cmajor = 0 ; cmajor < max_cdevsws ; cmajor++) { - if (cdevsw[cmajor] == cdev) + cmax = atomic_load_acquire(&max_cdevsws); + curcdevsw = atomic_load_consume(&cdevsw); + for (cmajor = 0; cmajor < cmax; cmajor++) { + if (atomic_load_relaxed(&curcdevsw[cmajor]) == cdev) return (cmajor); } @@ -697,22 +897,41 @@ int bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct bdevsw *d; - int rv, mpflag; + struct localcount *lc; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. 
- */ - mutex_enter(&device_lock); - d = bdevsw_lookup(dev); - mutex_exit(&device_lock); + d = bdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. + */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + + bdevsw_release(d, lc); + return rv; } @@ -855,22 +1074,41 @@ int cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct cdevsw *d; - int rv, mpflag; + struct localcount *lc; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. - */ - mutex_enter(&device_lock); - d = cdevsw_lookup(dev); - mutex_exit(&device_lock); + d = cdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. + */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + + cdevsw_release(d, lc); + return rv; } @@ -1063,3 +1301,18 @@ nommap(dev_t dev, off_t off, int prot) return (paddr_t)-1; } + +/* + * dev_minor_unit(dev) + * + * Returns minor(dev) as an int. Intended for use with struct + * bdevsw, cdevsw::d_devtounit for drivers whose /dev nodes are + * implemented by reference to an autoconf instance with the minor + * number. + */ +int +dev_minor_unit(dev_t dev) +{ + + return minor(dev); +} diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c index da664f920382..41218421db57 100644 --- a/sys/kern/subr_disk.c +++ b/sys/kern/subr_disk.c @@ -728,3 +728,10 @@ disk_set_info(device_t dev, struct disk *dk, const char *type) if (odisk_info) prop_object_release(odisk_info); } + +int +disklabel_dev_unit(dev_t dev) +{ + + return DISKUNIT(dev); +} diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index b4bc4c34ab03..987cd5969d95 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -81,10 +81,19 @@ __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.183 2021/07/18 23:57:14 dholland Ex #include #include #include +#include <sys/atomic.h> #include #include +/* + * Lock order: + * + * vnode lock + * -> device_lock + * -> struct vnode::v_interlock + */ + /* symbolic sleep message strings for devices */ const char devopn[] = "devopn"; const char devio[] = "devio"; @@ -165,6 +174,7 @@ const struct vnodeopv_desc spec_vnodeop_opv_desc = { &spec_vnodeop_p, spec_vnodeop_entries }; static kauth_listener_t rawio_listener; +static struct kcondvar specfs_iocv; /* Returns true if vnode is /dev/mem or /dev/kmem. */
bool @@ -210,6 +220,123 @@ spec_init(void) rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE, rawio_listener_cb, NULL); + cv_init(&specfs_iocv, "specio"); +} + +/* + * spec_io_enter(vp, &sn, &dev) + * + * Enter an operation that may not hold vp's vnode lock or an + * fstrans on vp's mount. Until spec_io_exit, the vnode will not + * be revoked. + * + * On success, set sn to the specnode pointer and dev to the dev_t + * number and return zero. Caller must later call spec_io_exit + * when done. + * + * On failure, return ENXIO -- the device has been revoked and no + * longer exists. + */ +static int +spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp) +{ + dev_t dev; + struct specnode *sn; + unsigned iocnt; + int error = 0; + + mutex_enter(vp->v_interlock); + + /* + * Extract all the info we need from the vnode, unless the + * vnode has already been reclaimed. This can happen if the + * underlying device has been removed and all the device nodes + * for it have been revoked. The caller may not hold a vnode + * lock or fstrans to prevent this from happening before it has + * had an opportunity to notice the vnode is dead. + */ + if (vdead_check(vp, VDEAD_NOWAIT) != 0 || + (sn = vp->v_specnode) == NULL || + (dev = vp->v_rdev) == NODEV) { + error = ENXIO; + goto out; + } + + /* + * Notify spec_node_revoke that we are doing an I/O operation + * which may not be bracketed by fstrans(9) and thus is not + * blocked by vfs suspension. + * + * We could hold this reference with psref(9) instead, but we + * already have to take the interlock for vdead_check, so + * there's not much more cost here to another atomic operation. + */ + iocnt = atomic_inc_uint_nv(&sn->sn_iocnt); + CTASSERT(MAXLWP < UINT_MAX); + KASSERT(iocnt < UINT_MAX); + + /* Success! */ + *snp = sn; + *devp = dev; + error = 0; + +out: mutex_exit(vp->v_interlock); + return error; +} + +/* + * spec_io_exit(vp, sn) + * + * Exit an operation entered with a successful spec_io_enter -- + * allow concurrent spec_node_revoke to proceed. The argument sn + * must match the struct specnode pointer returned by spec_io_enter + * for vp. + */ +static void +spec_io_exit(struct vnode *vp, struct specnode *sn) +{ + unsigned iocnt; + + KASSERT(vp->v_specnode == sn); + + /* + * We are done. Notify spec_node_revoke if appropriate. The + * transition of 1 -> 0 must happen under device_lock so + * spec_node_revoke doesn't miss a wakeup. + */ + do { + iocnt = atomic_load_relaxed(&sn->sn_iocnt); + if (iocnt == 1) { + mutex_enter(&device_lock); + if (atomic_dec_uint_nv(&sn->sn_iocnt) == 0) + cv_broadcast(&specfs_iocv); + mutex_exit(&device_lock); + break; + } + } while (atomic_cas_uint(&sn->sn_iocnt, iocnt, iocnt - 1) != iocnt); +} + +/* + * spec_io_drain(vp, sn) + * + * Wait for all existing spec_io_enter/exit sections to complete. + * Caller must ensure spec_io_enter will fail at this point. + */ +static void +spec_io_drain(struct vnode *vp, struct specnode *sn) +{ + + /* + * I/O at the same time as closing is unlikely -- it often + * indicates an application bug.
+ */ + if (__predict_true(atomic_load_relaxed(&sn->sn_iocnt) == 0)) + return; + + mutex_enter(&device_lock); + while (atomic_load_relaxed(&sn->sn_iocnt) != 0) + cv_wait(&specfs_iocv, &device_lock); + mutex_exit(&device_lock); } /* @@ -250,6 +377,7 @@ spec_node_init(vnode_t *vp, dev_t rdev) sd->sd_refcnt = 1; sd->sd_opencnt = 0; sd->sd_bdevvp = NULL; + sd->sd_opened = false; sn->sn_dev = sd; sd = NULL; } else { @@ -261,6 +389,7 @@ spec_node_init(vnode_t *vp, dev_t rdev) sn->sn_opencnt = 0; sn->sn_rdev = rdev; sn->sn_gone = false; + sn->sn_iocnt = 0; vp->v_specnode = sn; vp->v_specnext = *vpp; *vpp = vp; @@ -373,6 +502,8 @@ spec_node_setmountedfs(vnode_t *devvp, struct mount *mp) KASSERT(devvp->v_type == VBLK); KASSERT(devvp->v_specnode->sn_dev->sd_mountpoint == NULL || mp == NULL); + KASSERT(devvp->v_specnode->sn_opencnt); + devvp->v_specnode->sn_dev->sd_mountpoint = mp; if (mp == NULL) return; @@ -415,6 +546,14 @@ spec_node_revoke(vnode_t *vp) KASSERT(sn->sn_opencnt == 0); } mutex_exit(&device_lock); + + /* + * Wait for all other devsw operations to drain. After this + * point, no bdev/cdev_* can be active for this specnode. + * Note: We drain even if we witnessed sn->sn_opencnt == 0, + * because there may have been a concurrent VOP_CLOSE. + */ + spec_io_drain(vp, sn); } /* @@ -435,6 +574,7 @@ spec_node_destroy(vnode_t *vp) KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); KASSERT(vp->v_specnode != NULL); KASSERT(sn->sn_opencnt == 0); + KASSERT(sn->sn_iocnt == 0); mutex_enter(&device_lock); /* Remove from the hash and destroy the node. */ @@ -498,26 +638,28 @@ spec_open(void *v) int a_mode; kauth_cred_t a_cred; } */ *ap = v; - struct lwp *l; - struct vnode *vp; - dev_t dev; + struct lwp *l = curlwp; + struct vnode *vp = ap->a_vp; + dev_t dev, dev1; int error; enum kauth_device_req req; - specnode_t *sn; + specnode_t *sn, *sn1; specdev_t *sd; spec_ioctl_t ioctl; - u_int gen; - const char *name; + u_int gen = 0; + const char *name = NULL; + bool needclose = false; struct partinfo pi; - - l = curlwp; - vp = ap->a_vp; + + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", + vp->v_type); + dev = vp->v_rdev; sn = vp->v_specnode; sd = sn->sn_dev; - name = NULL; - gen = 0; - + /* * Don't allow open if fs is mounted -nodev. */ @@ -535,28 +677,101 @@ spec_open(void *v) req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ; break; } + error = kauth_authorize_device_spec(ap->a_cred, req, vp); + if (error != 0) + return (error); + /* + * Acquire an open reference -- as long as we hold onto it, and + * the vnode isn't revoked, it can't be closed. + */ + mutex_enter(&device_lock); + KASSERT(!sn->sn_gone); switch (vp->v_type) { case VCHR: - error = kauth_authorize_device_spec(ap->a_cred, req, vp); - if (error != 0) - return (error); - /* * Character devices can accept opens from multiple * vnodes. */ - mutex_enter(&device_lock); - if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); - } sd->sd_opencnt++; sn->sn_opencnt++; - mutex_exit(&device_lock); + break; + case VBLK: + /* + * For block devices, permit only one open. The buffer + * cache cannot remain self-consistent with multiple + * vnodes holding a block device open. + * + * Treat zero opencnt with non-NULL mountpoint as open. + * This may happen after forced detach of a mounted device. 
+ */ + if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { + error = EBUSY; + break; + } + KASSERTMSG(sn->sn_opencnt == 0, "%u", sn->sn_opencnt); + sn->sn_opencnt = 1; + sd->sd_opencnt = 1; + sd->sd_bdevvp = vp; + break; + default: + panic("invalid specfs vnode type: %d", vp->v_type); + } + mutex_exit(&device_lock); + if (error) + return error; + + /* + * Set VV_ISTTY if this is a tty cdev. + * + * XXX This does the wrong thing if the module has to be + * autoloaded. We should maybe set this after autoloading + * modules and calling .d_open successfully, except (a) we need + * the vnode lock to touch it, and (b) once we acquire the + * vnode lock again, the vnode may have been revoked, and + * deadfs's dead_read needs VV_ISTTY to be already set in order + * to return the right answer. So this needs some additional + * synchronization to be made to work correctly with tty driver + * module autoload. For now, let's just hope it doesn't cause + * too much trouble for a tty from an autoloaded driver module + * to fail with EIO instead of returning EOF. + */ + if (vp->v_type == VCHR) { if (cdev_type(dev) == D_TTY) vp->v_vflag |= VV_ISTTY; - VOP_UNLOCK(vp); + } + + /* + * Because the opening the device may block indefinitely, + * e.g. when opening a tty, and loading a module may cross into + * many other subsystems, we must not hold the vnode lock while + * calling .d_open, so release it now and reacquire it when + * done. + * + * Take an I/O reference so that any concurrent + * spec_node_revoke will wait for us to finish calling .d_open. + * The vnode can't be dead at this point because we have it + * locked. Note that if revoked, the driver will interrupt + * .d_open before spec_node_revoke starts waiting for I/O to + * drain so this doesn't deadlock. + */ + VOP_UNLOCK(vp); + error = spec_io_enter(vp, &sn1, &dev1); + if (error) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + return error; + } + KASSERT(sn1 == sn); + KASSERT(dev1 == dev); + + /* + * Open the device. If .d_open returns ENXIO (device not + * configured), the driver may not be loaded, so try + * autoloading a module and then try .d_open again if anything + * got loaded. + */ + switch (vp->v_type) { + case VCHR: do { const struct cdevsw *cdev; @@ -579,36 +794,9 @@ spec_open(void *v) /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); } while (gen != module_gen); - - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); break; case VBLK: - error = kauth_authorize_device_spec(ap->a_cred, req, vp); - if (error != 0) - return (error); - - /* - * For block devices, permit only one open. The buffer - * cache cannot remain self-consistent with multiple - * vnodes holding a block device open. - * - * Treat zero opencnt with non-NULL mountpoint as open. - * This may happen after forced detach of a mounted device. 
- */ - mutex_enter(&device_lock); - if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); - } - if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { - mutex_exit(&device_lock); - return EBUSY; - } - sn->sn_opencnt = 1; - sd->sd_opencnt = 1; - sd->sd_bdevvp = vp; - mutex_exit(&device_lock); do { const struct bdevsw *bdev; @@ -628,49 +816,118 @@ spec_open(void *v) if ((name = bdevsw_getname(major(dev))) == NULL) break; - VOP_UNLOCK(vp); - /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); - - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } while (gen != module_gen); - break; - case VNON: - case VLNK: - case VDIR: - case VREG: - case VBAD: - case VFIFO: - case VSOCK: default: - return 0; + __unreachable(); } + /* + * Release the I/O reference now that we have called .d_open, + * and reacquire the vnode lock. At this point, the device may + * have been revoked, so we must tread carefully -- can't touch + * sn or sd until we verify the vnode is not dead. + */ + spec_io_exit(vp, sn); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + + /* + * If it has been revoked since we released the vnode lock and + * reacquired it, then spec_node_revoke has closed it, and we + * must fail with EBADF without touching sn or sd which may be + * freed at this point. + * + * Otherwise, if opening it failed, back out and release the + * open reference. If it was ever successfully opened and we + * got the last reference this way, it's now our job to close + * it. This might happen in the following scenario: + * + * Thread 1 Thread 2 + * VOP_OPEN + * ... + * .d_open -> 0 (success) + * acquire vnode lock + * do stuff VOP_OPEN + * release vnode lock ... + * .d_open -> EBUSY + * VOP_CLOSE + * acquire vnode lock + * --sd_opencnt != 0 + * => no .d_close + * release vnode lock + * acquire vnode lock + * --sd_opencnt == 0 + * + * We can't resolve this by making spec_close wait for .d_open + * to complete before examining sd_opencnt, because .d_open can + * hang indefinitely, e.g. for a tty. + */ mutex_enter(&device_lock); - if (sn->sn_gone) { + mutex_enter(vp->v_interlock); + if (vdead_check(vp, VDEAD_NOWAIT) != 0) { if (error == 0) error = EBADF; - } else if (error != 0) { - sd->sd_opencnt--; - sn->sn_opencnt--; - if (vp->v_type == VBLK) - sd->sd_bdevvp = NULL; - + } else { + KASSERT(!sn->sn_gone); + if (error != 0) { + if (--sd->sd_opencnt == 0 && sd->sd_opened) { + needclose = true; + sd->sd_opened = false; + } + sn->sn_opencnt--; + if (vp->v_type == VBLK) + sd->sd_bdevvp = NULL; + } else { + sd->sd_opened = true; + } } + mutex_exit(vp->v_interlock); mutex_exit(&device_lock); - if (cdev_type(dev) != D_DISK || error != 0) + /* + * If this open failed, but the device was previously opened, + * and another thread concurrently closed the vnode while we + * were in the middle of reopening it, the other thread will + * see sd_opencnt > 0 and thus decide not to call .d_close -- + * it is now our responsibility to do so. + * + * XXX The flags passed to VOP_CLOSE here are wrong, but + * drivers can't rely on FREAD|FWRITE anyway -- e.g., consider + * a device opened by thread 0 with O_READ, then opened by + * thread 1 with O_WRITE, then closed by thread 0, and finally + * closed by thread 1; the last .d_close call will have FWRITE + * but not FREAD. We should just eliminate the FREAD/FWRITE + * parameter to .d_close altogether. + */ + if (needclose) { + KASSERT(error); + VOP_CLOSE(vp, FNONBLOCK, NOCRED); + } + + /* If anything went wrong, we're done. 
*/ + if (error) return error; - - ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; - error = (*ioctl)(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, curlwp); - if (error == 0) - uvm_vnp_setsize(vp, (voff_t)pi.pi_secsize * pi.pi_size); + /* + * For disk devices, automagically set the vnode size to the + * partition size, if we can. This applies to block devices + * and character devices alike -- every block device must have + * a corresponding character device. And if the module is + * loaded it will remain loaded until we're done here (it is + * forbidden to devsw_detach until closed). So it is safe to + * query cdev_type unconditionally here. + */ + if (cdev_type(dev) == D_DISK) { + ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; + if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0) + uvm_vnp_setsize(vp, + (voff_t)pi.pi_secsize * pi.pi_size); + } + /* Success! */ return 0; } @@ -690,6 +947,8 @@ spec_read(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -712,9 +971,27 @@ spec_read(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_read, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the read, take an I/O reference + * to the specnode so close will know when we're done + * writing. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. + */ VOP_UNLOCK(vp); - error = cdev_read(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_SHARED | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_read(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_SHARED | LK_RETRY); return (error); case VBLK: @@ -791,6 +1068,8 @@ spec_write(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -806,9 +1085,27 @@ spec_write(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_write, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the write, take an I/O reference + * to the specnode so close will know when we're done + * writing. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. 
+ */ VOP_UNLOCK(vp); - error = cdev_write(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_write(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (error); case VBLK: @@ -866,21 +1163,13 @@ spec_fdiscard(void *v) off_t a_pos; off_t a_len; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; dev_t dev; - vp = ap->a_vp; - dev = NODEV; - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); - if (dev == NODEV) { - return ENXIO; - } + dev = vp->v_rdev; switch (vp->v_type) { case VCHR: @@ -909,40 +1198,32 @@ spec_ioctl(void *v) int a_fflag; kauth_cred_t a_cred; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { - return ENXIO; - } + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; switch (vp->v_type) { - case VCHR: - return cdev_ioctl(dev, ap->a_command, ap->a_data, + error = cdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - return bdev_ioctl(dev, ap->a_command, ap->a_data, + error = bdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; default: panic("spec_ioctl"); /* NOTREACHED */ } + + spec_io_exit(vp, sn); + return error; } /* ARGSUSED */ @@ -953,33 +1234,25 @@ spec_poll(void *v) struct vnode *a_vp; int a_events; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int revents; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { + if (spec_io_enter(vp, &sn, &dev) != 0) return POLLERR; - } switch (vp->v_type) { - case VCHR: - return cdev_poll(dev, ap->a_events, curlwp); - + revents = cdev_poll(dev, ap->a_events, curlwp); + break; default: - return (genfs_poll(v)); + revents = genfs_poll(v); + break; } + + spec_io_exit(vp, sn); + return revents; } /* ARGSUSED */ @@ -990,20 +1263,30 @@ spec_kqfilter(void *v) struct vnode *a_vp; struct proc *a_kn; } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - switch (ap->a_vp->v_type) { + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + switch (vp->v_type) { case VCHR: - dev = ap->a_vp->v_rdev; - return cdev_kqfilter(dev, ap->a_kn); + error = cdev_kqfilter(dev, ap->a_kn); + break; default: /* * Block devices don't support kqfilter, and refuse it * for any other files (like those vflush()ed) too. 
*/ - return (EOPNOTSUPP); + error = EOPNOTSUPP; + break; } + + spec_io_exit(vp, sn); + return error; } /* @@ -1018,11 +1301,19 @@ kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; + struct specnode *sn; + dev_t dev; + int error; KASSERT(vp->v_type == VBLK); - if (bdev_type(vp->v_rdev) != D_DISK) - return EINVAL; + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + + error = bdev_type(dev) == D_DISK ? 0 : EINVAL; + + spec_io_exit(vp, sn); - return 0; + return error; } @@ -1067,27 +1358,14 @@ spec_strategy(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; + struct specnode *sn = NULL; dev_t dev; int error; - dev = NODEV; - - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - - if (dev == NODEV) { - error = ENXIO; + error = spec_io_enter(vp, &sn, &dev); + if (error) goto out; - } + bp->b_dev = dev; if (!(bp->b_flags & B_READ)) { @@ -1107,13 +1385,15 @@ } bdev_strategy(bp); - return 0; - -out: - bp->b_error = error; - bp->b_resid = bp->b_bcount; - biodone(bp); + error = 0; +out: if (sn) + spec_io_exit(vp, sn); + if (error) { + bp->b_error = error; + bp->b_resid = bp->b_bcount; + biodone(bp); + } return error; } @@ -1139,6 +1419,9 @@ spec_reclaim(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; + KASSERT(vp->v_specnode->sn_opencnt == 0); + KASSERT(vp->v_specnode->sn_iocnt == 0); + VOP_UNLOCK(vp); KASSERT(vp->v_mount == dead_rootmount); @@ -1182,14 +1465,18 @@ spec_close(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct session *sess; - dev_t dev = vp->v_rdev; + dev_t dev, dev1; int flags = ap->a_fflag; int mode, error, count; - specnode_t *sn; + specnode_t *sn, *sn1; specdev_t *sd; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + mutex_enter(vp->v_interlock); sn = vp->v_specnode; + dev = vp->v_rdev; sd = sn->sn_dev; /* * If we're going away soon, make this non-blocking. @@ -1269,14 +1556,44 @@ panic("spec_close: not special"); } + /* + * Decrement the open reference count of this node and the + * device. For block devices, the open reference count must be + * 1 at this point. If the device's open reference count goes + * to zero, we're the last one out so get the lights. + * + * We may find --sd->sd_opencnt gives zero, and yet + * sd->sd_opened is false. This happens if the vnode is + * revoked at the same time as it is being opened, which can + * happen when opening a tty blocks indefinitely. In that + * case, we still must call close -- it is the job of close to + * interrupt the open. Either way, the device will be no + * longer opened, so we have to clear sd->sd_opened; subsequent + * opens will have responsibility for issuing close. + * + * This has the side effect that the sequence of opens might + * happen out of order -- we might end up doing open, open, + * close, close, instead of open, close, open, close. This is + * unavoidable with the current devsw API, where open is + * allowed to block and close must be able to run concurrently + * to interrupt it. It is the driver's responsibility to + * ensure that close is idempotent so that this works. Drivers + * requiring per-open state and exact 1:1 correspondence + * between open and close can use fd_clone.
+ */ mutex_enter(&device_lock); sn->sn_opencnt--; count = --sd->sd_opencnt; - if (vp->v_type == VBLK) + if (vp->v_type == VBLK) { + KASSERTMSG(count == 0, "block device with %u opens", + count + 1); sd->sd_bdevvp = NULL; + } + if (count == 0) + sd->sd_opened = false; mutex_exit(&device_lock); - if (count != 0 && (vp->v_type != VCHR || !(cdev_flags(dev) & D_MCLOSE))) + if (count != 0) return 0; /* @@ -1284,16 +1601,28 @@ spec_close(void *v) * might end up sleeping for someone else who wants our queues. They * won't get them if we hold the vnode locked. */ - if (!(flags & FNONBLOCK)) + if (!(flags & FNONBLOCK)) { + /* + * Take an I/O reference while we hold the vnode lock. + * This ensures that spec_node_revoke waits for the + * last close. + */ + error = spec_io_enter(vp, &sn1, &dev1); + KASSERTMSG(error == 0, "error=%d", error); + KASSERT(sn1 == sn); + KASSERT(dev1 == dev); VOP_UNLOCK(vp); + } if (vp->v_type == VBLK) error = bdev_close(dev, flags, mode, curlwp); else error = cdev_close(dev, flags, mode, curlwp); - if (!(flags & FNONBLOCK)) + if (!(flags & FNONBLOCK)) { + spec_io_exit(vp, sn1); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + } return (error); } diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 621c103e80b1..1abb52964b50 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -69,6 +69,7 @@ typedef struct specnode { u_int sn_opencnt; dev_t sn_rdev; bool sn_gone; + volatile u_int sn_iocnt; } specnode_t; typedef struct specdev { @@ -78,6 +79,7 @@ typedef struct specdev { u_int sd_opencnt; u_int sd_refcnt; dev_t sd_rdev; + bool sd_opened; } specdev_t; /* diff --git a/sys/modules/examples/pollpal/pollpal.c b/sys/modules/examples/pollpal/pollpal.c index b76e0733699b..d8ddc73450e0 100644 --- a/sys/modules/examples/pollpal/pollpal.c +++ b/sys/modules/examples/pollpal/pollpal.c @@ -311,7 +311,8 @@ pollpal_modcmd(modcmd_t cmd, void *arg __unused) case MODULE_CMD_FINI: if (pollpal_nopen != 0) return EBUSY; - return devsw_detach(NULL, &pollpal_cdevsw); + devsw_detach(NULL, &pollpal_cdevsw); + return 0; default: return ENOTTY; } diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c index 0b57ad4a711c..314f4647707c 100644 --- a/sys/net/if_tap.c +++ b/sys/net/if_tap.c @@ -256,9 +256,7 @@ tapdetach(void) if_clone_detach(&tap_cloners); #ifdef _MODULE - error = devsw_detach(NULL, &tap_cdevsw); - if (error != 0) - goto out2; + devsw_detach(NULL, &tap_cdevsw); #endif if (tap_count != 0) { @@ -277,7 +275,6 @@ tapdetach(void) out1: #ifdef _MODULE devsw_attach("tap", NULL, &tap_bmajor, &tap_cdevsw, &tap_cmajor); - out2: #endif if_clone_attach(&tap_cloners); diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index 4f533a8f08d1..f4e5b6d86d43 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -142,17 +142,10 @@ tuninit(void) static int tundetach(void) { -#ifdef _MODULE - int error; -#endif if_clone_detach(&tun_cloner); #ifdef _MODULE - error = devsw_detach(NULL, &tun_cdevsw); - if (error != 0) { - if_clone_attach(&tun_cloner); - return error; - } + devsw_detach(NULL, &tun_cdevsw); #endif if (!LIST_EMPTY(&tun_softc_list) || !LIST_EMPTY(&tunz_softc_list)) { diff --git a/sys/rump/dev/lib/libbpf/bpf_component.c b/sys/rump/dev/lib/libbpf/bpf_component.c index 05807d371d40..d41d1987afe8 100644 --- a/sys/rump/dev/lib/libbpf/bpf_component.c +++ b/sys/rump/dev/lib/libbpf/bpf_component.c @@ -50,6 +50,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_NET) panic("bpf devsw attach failed: %d", error); if ((error = rump_vfs_makeonedevnode(S_IFCHR, "/dev/bpf", cmaj, 0)) !=0) panic("cannot 
create bpf device nodes: %d", error); - if ((error = devsw_detach(NULL, &bpf_cdevsw)) != 0) - panic("cannot detach bpf devsw: %d", error); + devsw_detach(NULL, &bpf_cdevsw); } diff --git a/sys/rump/dev/lib/libdrvctl/drvctl_component.c b/sys/rump/dev/lib/libdrvctl/drvctl_component.c index e2e79f45f9de..ac4e103fdb9c 100644 --- a/sys/rump/dev/lib/libdrvctl/drvctl_component.c +++ b/sys/rump/dev/lib/libdrvctl/drvctl_component.c @@ -51,7 +51,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_DEV) if ( error !=0) panic("cannot create drvctl device node: %d", error); - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 0) - panic("cannot detach drvctl devsw: %d", error); + devsw_detach(NULL, &drvctl_cdevsw); } diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 081631d2111f..16dd87e5480c 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -63,7 +63,7 @@ struct vnode; #define D_TYPEMASK 0x00ff #define D_MPSAFE 0x0100 #define D_NEGOFFSAFE 0x0200 -#define D_MCLOSE 0x0400 +#define D_UNUSED0 0x0400 /* was D_MCLOSE */ /* * Block device switch table @@ -76,6 +76,8 @@ struct bdevsw { int (*d_dump)(dev_t, daddr_t, void *, size_t); int (*d_psize)(dev_t); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; @@ -94,6 +96,8 @@ struct cdevsw { paddr_t (*d_mmap)(dev_t, off_t, int); int (*d_kqfilter)(dev_t, struct knote *); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; @@ -104,7 +108,7 @@ extern kmutex_t device_lock; int devsw_attach(const char *, const struct bdevsw *, devmajor_t *, const struct cdevsw *, devmajor_t *); -int devsw_detach(const struct bdevsw *, const struct cdevsw *); +void devsw_detach(const struct bdevsw *, const struct cdevsw *); const struct bdevsw *bdevsw_lookup(dev_t); const struct cdevsw *cdevsw_lookup(dev_t); devmajor_t bdevsw_lookup_major(const struct bdevsw *); @@ -276,6 +280,7 @@ devmajor_t devsw_name2blk(const char *, char *, size_t); devmajor_t devsw_name2chr(const char *, char *, size_t); dev_t devsw_chr2blk(dev_t); dev_t devsw_blk2chr(dev_t); +int dev_minor_unit(dev_t); void mm_init(void); #endif /* _KERNEL */ diff --git a/sys/sys/device.h b/sys/sys/device.h index 3bd4a6c3abf7..e685419d4925 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -274,10 +274,12 @@ struct device { void *dv_private; /* this device's private storage */ int *dv_locators; /* our actual locators (optional) */ prop_dictionary_t dv_properties;/* properties dictionary */ + struct localcount *dv_localcount;/* reference count */ int dv_pending; /* config_pending count */ TAILQ_ENTRY(device) dv_pending_list; + struct lwp *dv_attaching; /* thread not yet finished in attach */ struct lwp *dv_detaching; /* detach lock (config_misc_lock/cv) */ size_t dv_activity_count; @@ -651,6 +653,10 @@ void null_childdetached(device_t, device_t); device_t device_lookup(cfdriver_t, int); void *device_lookup_private(cfdriver_t, int); + +device_t device_lookup_acquire(cfdriver_t, int); +void device_release(device_t); + void device_register(device_t, void *); void device_register_post_config(device_t, void *); diff --git a/sys/sys/disklabel.h b/sys/sys/disklabel.h index 4e94b8671332..853cdbe668a3 100644 --- a/sys/sys/disklabel.h +++ b/sys/sys/disklabel.h @@ -509,6 +509,7 @@ const char *convertdisklabel(struct disklabel *, void (*)(struct buf *), int bounds_check_with_label(struct disk *, struct buf *, int); int bounds_check_with_mediasize(struct buf *, int, uint64_t); const char *getfstypename(int); +int 
disklabel_dev_unit(dev_t); #endif #endif /* _LOCORE */
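
Reviewer's note, not part of the patch: a minimal sketch of how a modular driver adopts the interface changed above. The driver name "mydev", its cfdriver mydev_cd, and its entry points are invented for illustration; dev_minor_unit(), the d_devtounit/d_cfdriver members of struct cdevsw, and the void-returning devsw_detach() contract are the ones this patch introduces.

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/module.h>

MODULE(MODULE_CLASS_DRIVER, mydev, NULL);

extern struct cfdriver mydev_cd;	/* hypothetical autoconf driver */

dev_type_open(mydev_open);		/* hypothetical; defined elsewhere */
dev_type_close(mydev_close);
dev_type_ioctl(mydev_ioctl);

static const struct cdevsw mydev_cdevsw = {
	.d_open = mydev_open,
	.d_close = mydev_close,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = mydev_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	/* New members: let cdev_open pin the autoconf instance across d_open. */
	.d_devtounit = dev_minor_unit,	/* minor number is the autoconf unit */
	.d_cfdriver = &mydev_cd,
	.d_flag = D_OTHER | D_MPSAFE,
};

static int
mydev_modcmd(modcmd_t cmd, void *arg)
{
	devmajor_t bmajor = -1, cmajor = -1;

	switch (cmd) {
	case MODULE_CMD_INIT:
		return devsw_attach("mydev", NULL, &bmajor,
		    &mydev_cdevsw, &cmajor);
	case MODULE_CMD_FINI:
		/*
		 * Caller's obligation per the updated devsw(9): no open
		 * instances remain and any new d_open call fails.  Given
		 * that, devsw_detach waits for in-flight lookups to drain
		 * and cannot fail, so it now returns void.
		 */
		devsw_detach(NULL, &mydev_cdevsw);
		return 0;
	default:
		return ENOTTY;
	}
}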
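A companion sketch, under the same hypothetical names, of the new autoconf API added in subr_autoconf.c: device_lookup_acquire/device_release bracket a section during which the instance cannot finish detaching, because config_detach drains dv_localcount before completing.

#include <sys/param.h>
#include <sys/device.h>
#include <sys/errno.h>

extern struct cfdriver mydev_cd;	/* hypothetical, as above */

struct mydev_softc {
	device_t	sc_dev;
	int		sc_state;	/* illustrative per-unit state */
};

static int
mydev_poke(int unit)
{
	device_t dv;
	struct mydev_softc *sc;

	/*
	 * Blocks until a concurrent attach or detach of this unit has
	 * settled; returns NULL if the unit does not exist or has been
	 * detached.
	 */
	dv = device_lookup_acquire(&mydev_cd, unit);
	if (dv == NULL)
		return ENXIO;
	sc = device_private(dv);

	sc->sc_state++;		/* safe: our reference holds off detach */

	/* Drop the reference so config_detach's localcount_drain can proceed. */
	device_release(dv);
	return 0;
}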