diff -r 14228ccdcddb -r 2d0f42cbc645 share/man/man9/workqueue.9
--- a/share/man/man9/workqueue.9	Fri Jul 31 02:02:15 2020 +0000
+++ b/share/man/man9/workqueue.9	Fri Jul 31 18:29:28 2020 +0000
@@ -83,6 +83,11 @@ The highest IPL at which this workqueue 
 The value of 0 indicates a standard create operation, however the following
 flags may be bitwise ORed together:
 .Bl -tag -width WQ_MPSAFE
+.It Dv WQ_FPU
+Specifies that the kthread must be allowed to use any machine-dependent
+per-CPU floating-point units or SIMD vector units, as in
+.Xr kthread_fpu_enter 9 Ns / Ns Xr kthread_fpu_exit 9 ,
+when it executes the worker function.
 .It Dv WQ_MPSAFE
 Specifies that the workqueue is multiprocessor safe and does its own locking;
 otherwise the kernel lock will be held while processing work.
diff -r 14228ccdcddb -r 2d0f42cbc645 sys/arch/aarch64/aarch64/fpu.c
--- a/sys/arch/aarch64/aarch64/fpu.c	Fri Jul 31 02:02:15 2020 +0000
+++ b/sys/arch/aarch64/aarch64/fpu.c	Fri Jul 31 18:29:28 2020 +0000
@@ -35,6 +35,8 @@
 
 #include <sys/param.h>
 #include <sys/types.h>
+#include <sys/cpu.h>
+#include <sys/kthread.h>
 #include <sys/lwp.h>
 #include <sys/evcnt.h>
 
@@ -176,12 +178,30 @@ fpu_state_release(lwp_t *l)
 	__asm __volatile ("isb");
 }
 
+static const struct fpreg zero_fpreg;
+
+/*
+ * True if this is a system thread with its own private FPU state.
+ */
+static inline bool
+lwp_system_fpu_p(struct lwp *l)
+{
+
+	return (l->l_flag & (LW_SYSTEM|LW_SYSTEM_FPU)) ==
+	    (LW_SYSTEM|LW_SYSTEM_FPU);
+}
+
 void
 fpu_kern_enter(void)
 {
 	struct cpu_info *ci;
 	int s;
 
+	if (lwp_system_fpu_p(curlwp) && !cpu_intr_p()) {
+		KASSERT(!cpu_softintr_p());
+		return;
+	}
+
 	/*
 	 * Block interrupts up to IPL_VM.  We must block preemption
 	 * since -- if this is a user thread -- there is nowhere to
@@ -209,10 +229,16 @@ fpu_kern_enter(void)
 void
 fpu_kern_leave(void)
 {
-	static const struct fpreg zero_fpreg;
-	struct cpu_info *ci = curcpu();
+	struct cpu_info *ci;
 	int s;
 
+	if (lwp_system_fpu_p(curlwp) && !cpu_intr_p()) {
+		KASSERT(!cpu_softintr_p());
+		return;
+	}
+
+	ci = curcpu();
+
 	KASSERT(ci->ci_cpl == IPL_VM);
 	KASSERT(ci->ci_kfpu_spl != -1);
 
@@ -234,3 +260,19 @@ fpu_kern_leave(void)
 	ci->ci_kfpu_spl = -1;
 	splx(s);
 }
+
+void
+kthread_fpu_enter_md(void)
+{
+
+	fpu_load(curlwp);
+}
+
+void
+kthread_fpu_exit_md(void)
+{
+
+	/* XXX Should fpu_state_release zero the registers itself?  */
+	load_fpregs(&zero_fpreg);
+	fpu_discard(curlwp, 0);
+}
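For context: the kthread_fpu_enter_md()/kthread_fpu_exit_md() hooks above are
the machine-dependent halves of kthread_fpu_enter(9)/kthread_fpu_exit(9),
which workqueue_worker() calls below.  A sketch of the expected pattern in a
long-running kernel thread, assuming the MI wrappers set and clear
LW_SYSTEM_FPU around these hooks (the thread body itself is hypothetical):

	static void
	example_fpu_thread(void *arg)
	{
		bool *done = arg;
		int s;

		s = kthread_fpu_enter();
		while (!*done) {
			/*
			 * FP/SIMD computation is allowed here; this
			 * thread's FPU state is preserved across context
			 * switches like a user thread's.
			 */
		}
		kthread_fpu_exit(s);
		kthread_exit(0);
	}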
diff -r 14228ccdcddb -r 2d0f42cbc645 sys/arch/aarch64/aarch64/trap.c
--- a/sys/arch/aarch64/aarch64/trap.c	Fri Jul 31 02:02:15 2020 +0000
+++ b/sys/arch/aarch64/aarch64/trap.c	Fri Jul 31 18:29:28 2020 +0000
@@ -242,6 +242,12 @@ trap_el1h_sync(struct trapframe *tf)
 		break;
 
 	case ESR_EC_FP_ACCESS:
+		if ((curlwp->l_flag & (LW_SYSTEM|LW_SYSTEM_FPU)) ==
+		    (LW_SYSTEM|LW_SYSTEM_FPU)) {
+			fpu_load(curlwp);
+			break;
+		}
+		/*FALLTHROUGH*/
 	case ESR_EC_FP_TRAP_A64:
 	case ESR_EC_PC_ALIGNMENT:
 	case ESR_EC_SP_ALIGNMENT:
diff -r 14228ccdcddb -r 2d0f42cbc645 sys/arch/arm/vfp/vfp_init.c
--- a/sys/arch/arm/vfp/vfp_init.c	Fri Jul 31 02:02:15 2020 +0000
+++ b/sys/arch/arm/vfp/vfp_init.c	Fri Jul 31 18:29:28 2020 +0000
@@ -38,6 +38,7 @@
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/device.h>
+#include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/cpu.h>
 
@@ -504,7 +505,8 @@ neon_handler(u_int address, u_int insn, 
 		return 1;
 
 	/* This shouldn't ever happen.  */
-	if (fault_code != FAULT_USER)
+	if (fault_code != FAULT_USER &&
+	    (curlwp->l_flag & (LW_SYSTEM|LW_SYSTEM_FPU)) == LW_SYSTEM)
 		panic("NEON fault in non-user mode");
 
 	/* if we already own the FPU and it's enabled, raise SIGILL */
@@ -668,6 +670,19 @@ vfp_setcontext(struct lwp *l, const mcon
 	    sizeof(mcp->__fpu.__vfpregs.__vfp_fstmx));
 }
 
+/*
+ * True if this is a system thread with its own private FPU state.
+ */
+static inline bool
+lwp_system_fpu_p(struct lwp *l)
+{
+
+	return (l->l_flag & (LW_SYSTEM|LW_SYSTEM_FPU)) ==
+	    (LW_SYSTEM|LW_SYSTEM_FPU);
+}
+
+static const struct vfpreg zero_vfpreg;
+
 void
 fpu_kern_enter(void)
 {
@@ -675,6 +690,11 @@ fpu_kern_enter(void)
 	uint32_t fpexc;
 	int s;
 
+	if (lwp_system_fpu_p(curlwp) && !cpu_intr_p()) {
+		KASSERT(!cpu_softintr_p());
+		return;
+	}
+
 	/*
 	 * Block interrupts up to IPL_VM.  We must block preemption
 	 * since -- if this is a user thread -- there is nowhere to
@@ -701,11 +721,15 @@ fpu_kern_enter(void)
 void
 fpu_kern_leave(void)
 {
-	static const struct vfpreg zero_vfpreg;
 	struct cpu_info *ci = curcpu();
 	int s;
 	uint32_t fpexc;
 
+	if (lwp_system_fpu_p(curlwp) && !cpu_intr_p()) {
+		KASSERT(!cpu_softintr_p());
+		return;
+	}
+
 	KASSERT(ci->ci_cpl == IPL_VM);
 	KASSERT(ci->ci_kfpu_spl != -1);
 
@@ -730,4 +754,20 @@ fpu_kern_leave(void)
 	splx(s);
 }
 
+void
+kthread_fpu_enter_md(void)
+{
+
+	pcu_load(&arm_vfp_ops);
+}
+
+void
+kthread_fpu_exit_md(void)
+{
+
+	/* XXX Should vfp_state_release zero the registers itself?  */
+	load_vfpregs(&zero_vfpreg);
+	vfp_discardcontext(curlwp, 0);
+}
+
 #endif /* FPU_VFP */
diff -r 14228ccdcddb -r 2d0f42cbc645 sys/arch/x86/x86/fpu.c
--- a/sys/arch/x86/x86/fpu.c	Fri Jul 31 02:02:15 2020 +0000
+++ b/sys/arch/x86/x86/fpu.c	Fri Jul 31 18:29:28 2020 +0000
@@ -107,6 +107,7 @@
 #include <sys/file.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
+#include <sys/kthread.h>
 #include <sys/sysctl.h>
 #include <sys/xcall.h>
 
@@ -131,13 +132,35 @@ void fpu_switch(struct lwp *, struct lwp
 
 uint32_t x86_fpu_mxcsr_mask __read_mostly = 0;
 
+/*
+ * True if this is a thread that is allowed to use the FPU -- either a
+ * user thread, or a system thread with LW_SYSTEM_FPU enabled.
+ */
+static inline bool
+lwp_can_haz_fpu(struct lwp *l)
+{
+
+	return (l->l_flag & (LW_SYSTEM|LW_SYSTEM_FPU)) != LW_SYSTEM;
+}
+
+/*
+ * True if this is a system thread with its own private FPU state.
+ */
+static inline bool
+lwp_system_fpu_p(struct lwp *l)
+{
+
+	return (l->l_flag & (LW_SYSTEM|LW_SYSTEM_FPU)) ==
+	    (LW_SYSTEM|LW_SYSTEM_FPU);
+}
+
 static inline union savefpu *
 fpu_lwp_area(struct lwp *l)
 {
 	struct pcb *pcb = lwp_getpcb(l);
 	union savefpu *area = &pcb->pcb_savefpu;
 
-	KASSERT((l->l_flag & LW_SYSTEM) == 0);
+	KASSERT(lwp_can_haz_fpu(l));
 	if (l == curlwp) {
 		fpu_save();
 	}
@@ -155,7 +178,7 @@ fpu_save_lwp(struct lwp *l)
 
 	s = splvm();
 	if (l->l_md.md_flags & MDL_FPU_IN_CPU) {
-		KASSERT((l->l_flag & LW_SYSTEM) == 0);
+		KASSERT(lwp_can_haz_fpu(l));
 		fpu_area_save(area, x86_xsave_features);
 		l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
 	}
@@ -307,7 +330,7 @@ fpu_switch(struct lwp *oldlwp, struct lw
 	    cpu_index(ci), ci->ci_ilevel);
 
 	if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) {
-		KASSERT(!(oldlwp->l_flag & LW_SYSTEM));
+		KASSERT(lwp_can_haz_fpu(oldlwp));
 		pcb = lwp_getpcb(oldlwp);
 		fpu_area_save(&pcb->pcb_savefpu, x86_xsave_features);
 		oldlwp->l_md.md_flags &= ~MDL_FPU_IN_CPU;
@@ -322,11 +345,11 @@ fpu_lwp_fork(struct lwp *l1, struct lwp 
 	union savefpu *fpu_save;
 
 	/* Kernel threads have no FPU. */
-	if (__predict_false(l2->l_flag & LW_SYSTEM)) {
+	if (__predict_false(!lwp_can_haz_fpu(l2))) {
 		return;
 	}
 	/* For init(8). */
-	if (__predict_false(l1->l_flag & LW_SYSTEM)) {
+	if (__predict_false(!lwp_can_haz_fpu(l1))) {
 		memset(&pcb2->pcb_savefpu, 0, x86_fpu_save_size);
 		return;
 	}
@@ -350,6 +373,8 @@ fpu_lwp_abandon(struct lwp *l)
 
 /* -------------------------------------------------------------------------- */
 
+static const union savefpu zero_fpu __aligned(64);
+
 /*
  * fpu_kern_enter()
  *
@@ -369,6 +394,11 @@ fpu_kern_enter(void)
 	struct cpu_info *ci;
 	int s;
 
+	if (lwp_system_fpu_p(l) && !cpu_intr_p()) {
+		KASSERT(!cpu_softintr_p());
+		return;
+	}
+
 	s = splvm();
 
 	ci = curcpu();
@@ -401,10 +431,16 @@ fpu_kern_enter(void)
 void
 fpu_kern_leave(void)
 {
-	static const union savefpu zero_fpu __aligned(64);
-	struct cpu_info *ci = curcpu();
+	struct cpu_info *ci;
 	int s;
 
+	if (lwp_system_fpu_p(curlwp) && !cpu_intr_p()) {
+		KASSERT(!cpu_softintr_p());
+		return;
+	}
+
+	ci = curcpu();
+
 	KASSERT(ci->ci_ilevel == IPL_VM);
 	KASSERT(ci->ci_kfpu_spl != -1);
 
@@ -426,6 +462,23 @@ fpu_kern_leave(void)
 	splx(s);
 }
 
+void
+kthread_fpu_enter_md(void)
+{
+
+	/* Enable the FPU by clearing CR0_TS.  */
+	clts();
+}
+
+void
+kthread_fpu_exit_md(void)
+{
+
+	/* Zero the FPU state and disable the FPU by setting CR0_TS.  */
+	fpu_area_restore(&zero_fpu, x86_xsave_features);
+	stts();
+}
+
 /* -------------------------------------------------------------------------- */
 
 /*
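One consequence of the early returns above: code that already brackets FP/SIMD
use with fpu_kern_enter()/fpu_kern_leave() keeps working unchanged when it
runs in a kthread_fpu_enter(9) region such as a WQ_FPU worker; at thread
context the pair simply becomes a no-op, while hard- and soft-interrupt
context still takes the normal save/restore path.  A sketch (helper name
hypothetical):

	static void
	example_simd_op(void *buf, size_t len)
	{

		/*
		 * Safe from any context: a LW_SYSTEM_FPU thread returns
		 * early from this pair (see lwp_system_fpu_p() above);
		 * any other caller gets the usual kernel-FPU bracketing.
		 */
		fpu_kern_enter();
		/* ... FP/SIMD work on buf[0..len) ... */
		fpu_kern_leave();
	}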
diff -r 14228ccdcddb -r 2d0f42cbc645 sys/dev/cgd.c
--- a/sys/dev/cgd.c	Fri Jul 31 02:02:15 2020 +0000
+++ b/sys/dev/cgd.c	Fri Jul 31 18:29:28 2020 +0000
@@ -673,7 +673,7 @@ cgd_create_worker(void)
 	cp = kmem_alloc(sizeof(struct pool), KM_SLEEP);
 
 	error = workqueue_create(&wq, "cgd", cgd_process, NULL,
-	                         PRI_BIO, IPL_BIO, WQ_MPSAFE | WQ_PERCPU);
+	    PRI_BIO, IPL_BIO, WQ_FPU|WQ_MPSAFE|WQ_PERCPU);
 	if (error) {
 		kmem_free(cp, sizeof(struct pool));
 		kmem_free(cw, sizeof(struct cgd_worker));
@@ -684,9 +684,8 @@ cgd_create_worker(void)
 	cw->cw_wq = wq;
 	pool_init(cw->cw_cpool, sizeof(struct cgd_xfer), 0,
 	    0, 0, "cgdcpl", NULL, IPL_BIO);
+	mutex_init(&cw->cw_lock, MUTEX_DEFAULT, IPL_BIO);
 
-	mutex_init(&cw->cw_lock, MUTEX_DEFAULT, IPL_BIO);
-	
 	return cw;
 }
 
diff -r 14228ccdcddb -r 2d0f42cbc645 sys/kern/subr_pcu.c
--- a/sys/kern/subr_pcu.c	Fri Jul 31 02:02:15 2020 +0000
+++ b/sys/kern/subr_pcu.c	Fri Jul 31 18:29:28 2020 +0000
@@ -89,6 +89,17 @@ typedef struct {
 extern const pcu_ops_t * const pcu_ops_md_defs[];
 
 /*
+ * pcu_available_p: true if lwp is allowed to use PCU state.
+ */
+static inline bool
+pcu_available_p(struct lwp *l)
+{
+
+	/* XXX Not sure this is safe unless l is locked!  */
+	return (l->l_flag & (LW_SYSTEM|LW_SYSTEM_FPU)) != LW_SYSTEM;
+}
+
+/*
  * pcu_switchpoint: release PCU state if the LWP is being run on another CPU.
 * This routine is called on each context switch by mi_switch().
  */
@@ -135,7 +146,7 @@ pcu_discard_all(lwp_t *l)
 	 * due to an error in the LWP creation path before it ever runs.
 	 */
 	KASSERT(l == curlwp || l->l_stat == LSIDL ||
-		((l->l_flag & LW_SYSTEM) && pcu_valid == 0));
+		(!pcu_available_p(l) && pcu_valid == 0));
 
 	if (__predict_true(pcu_valid == 0)) {
 		/* PCUs are not in use. */
@@ -174,7 +185,7 @@ pcu_save_all(lwp_t *l)
 	 * with a different LWP (forking a system LWP or doing a coredump of
 	 * a process with multiple threads) and we need to deal with that.
 	 */
-	KASSERT(l == curlwp || (((l->l_flag & LW_SYSTEM) ||
+	KASSERT(l == curlwp || ((!pcu_available_p(l) ||
 	    (curlwp->l_proc == l->l_proc && l->l_stat == LSSUSPENDED)) &&
 	    pcu_valid == 0));
 
diff -r 14228ccdcddb -r 2d0f42cbc645 sys/kern/subr_workqueue.c
--- a/sys/kern/subr_workqueue.c	Fri Jul 31 02:02:15 2020 +0000
+++ b/sys/kern/subr_workqueue.c	Fri Jul 31 18:29:28 2020 +0000
@@ -112,10 +112,13 @@ workqueue_worker(void *cookie)
 {
 	struct workqueue *wq = cookie;
 	struct workqueue_queue *q;
+	int s;
 
 	/* find the workqueue of this kthread */
 	q = workqueue_queue_lookup(wq, curlwp->l_cpu);
 
+	if (wq->wq_flags & WQ_FPU)
+		s = kthread_fpu_enter();
 	for (;;) {
 		/*
 		 * we violate abstraction of SIMPLEQ.
@@ -141,6 +144,8 @@ workqueue_worker(void *cookie)
 		}
 		mutex_exit(&q->q_mutex);
 	}
+	if (wq->wq_flags & WQ_FPU)
+		kthread_fpu_exit(s);
 }
 
 static void
diff -r 14228ccdcddb -r 2d0f42cbc645 sys/sys/workqueue.h
--- a/sys/sys/workqueue.h	Fri Jul 31 02:02:15 2020 +0000
+++ b/sys/sys/workqueue.h	Fri Jul 31 18:29:28 2020 +0000
@@ -47,6 +47,7 @@ struct workqueue;
 
 #define	WQ_MPSAFE	0x01
 #define	WQ_PERCPU	0x02
+#define	WQ_FPU		0x04
 
 int workqueue_create(struct workqueue **, const char *,
     void (*)(struct work *, void *), void *, pri_t, int, int);