diff -r 969aab925f8a common/lib/libc/arch/i386/atomic/atomic.S --- a/common/lib/libc/arch/i386/atomic/atomic.S Mon Sep 01 04:47:03 2025 +0000 +++ b/common/lib/libc/arch/i386/atomic/atomic.S Fri Sep 05 02:22:11 2025 +0000 @@ -212,7 +212,8 @@ ENTRY(_membar_sync) * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ * https://www.agner.org/optimize/instruction_tables.pdf * - * Sync with xen_mb in sys/arch/i386/i386/cpufunc.S. + * Sync with paravirt_membar_sync in + * sys/arch/i386/i386/cpufunc.S. */ LOCK addl $0, -4(%esp) diff -r 969aab925f8a common/lib/libc/arch/sparc64/atomic/membar_ops.S --- a/common/lib/libc/arch/sparc64/atomic/membar_ops.S Mon Sep 01 04:47:03 2025 +0000 +++ b/common/lib/libc/arch/sparc64/atomic/membar_ops.S Fri Sep 05 02:22:11 2025 +0000 @@ -72,6 +72,9 @@ ENTRY(_membar_sync) * https://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/sparc-2i-usersmanual-2516677.pdf#page=518 * * So let's avoid doing that. + * + * Sync with paravirt_membar_sync in + * sys/arch/sparc64/sparc64/locore.s. */ membar #StoreLoad retl diff -r 969aab925f8a common/lib/libc/arch/x86_64/atomic/atomic.S --- a/common/lib/libc/arch/x86_64/atomic/atomic.S Mon Sep 01 04:47:03 2025 +0000 +++ b/common/lib/libc/arch/x86_64/atomic/atomic.S Fri Sep 05 02:22:11 2025 +0000 @@ -287,7 +287,8 @@ ENTRY(_membar_sync) * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ * https://www.agner.org/optimize/instruction_tables.pdf * - * Sync with xen_mb in sys/arch/amd64/amd64/cpufunc.S. + * Sync with paravirt_membar_sync in + * sys/arch/amd64/amd64/cpufunc.S. */ LOCK addq $0, -8(%rsp) diff -r 969aab925f8a distrib/sets/lists/comp/mi --- a/distrib/sets/lists/comp/mi Mon Sep 01 04:47:03 2025 +0000 +++ b/distrib/sets/lists/comp/mi Fri Sep 05 02:22:11 2025 +0000 @@ -12925,6 +12925,7 @@ ./usr/share/man/cat9/optstr_get.0 comp-sys-catman .cat ./usr/share/man/cat9/p_find.0 comp-obsolete obsolete ./usr/share/man/cat9/panic.0 comp-sys-catman .cat +./usr/share/man/cat9/paravirt_membar_sync.0 comp-sys-catman .cat ./usr/share/man/cat9/pathbuf.0 comp-sys-catman .cat ./usr/share/man/cat9/pci.0 comp-sys-catman .cat ./usr/share/man/cat9/pci_conf_hook.0 comp-sys-catman .cat @@ -21789,6 +21790,7 @@ ./usr/share/man/html9/optstr_get.html comp-sys-htmlman html ./usr/share/man/html9/p_find.html comp-obsolete obsolete ./usr/share/man/html9/panic.html comp-sys-htmlman html +./usr/share/man/html9/paravirt_membar_sync.html comp-sys-htmlman html ./usr/share/man/html9/pathbuf.html comp-sys-htmlman html ./usr/share/man/html9/pci.html comp-sys-htmlman html ./usr/share/man/html9/pci_conf_hook.html comp-sys-htmlman html @@ -30810,6 +30812,7 @@ ./usr/share/man/man9/optstr_get.9 comp-sys-man .man ./usr/share/man/man9/p_find.9 comp-obsolete obsolete ./usr/share/man/man9/panic.9 comp-sys-man .man +./usr/share/man/man9/paravirt_membar_sync.9 comp-sys-man .man ./usr/share/man/man9/pathbuf.9 comp-sys-man .man ./usr/share/man/man9/pci.9 comp-sys-man .man ./usr/share/man/man9/pci_conf_hook.9 comp-sys-man .man diff -r 969aab925f8a share/man/man9/Makefile --- a/share/man/man9/Makefile Mon Sep 01 04:47:03 2025 +0000 +++ b/share/man/man9/Makefile Fri Sep 05 02:22:11 2025 +0000 @@ -41,7 +41,12 @@ MAN= accept_filter.9 accf_data.9 accf_ht microseq.9 microtime.9 microuptime.9 mi_switch.9 module.9 \ mstohz.9 mutex.9 m_tag.9 namecache.9 \ namei.9 nullop.9 opencrypto.9 optstr.9 \ - panic.9 pathbuf.9 pci.9 pci_configure_bus.9 pci_intr.9 \ + panic.9 \ + paravirt_membar_sync.9 \ + pathbuf.9 \ + pci.9 \ + pci_configure_bus.9 \ + 
pci_intr.9 \ pci_msi.9 pckbport.9 pcmcia.9 pcq.9 pcu.9 \ percpu.9 pfil.9 physio.9 pktqueue.9 pmap.9 pmatch.9 pmf.9 pool.9 \ pool_cache.9 portfeatures.9 powerhook_establish.9 ppsratecheck.9 \ diff -r 969aab925f8a share/man/man9/paravirt_membar_sync.9 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/share/man/man9/paravirt_membar_sync.9 Fri Sep 05 02:22:11 2025 +0000 @@ -0,0 +1,148 @@ +.\" $NetBSD$ +.\" +.\" Copyright (c) 2025 The NetBSD Foundation +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. +.\" +.Dd August 31, 2025 +.Dt PARAVIRT_MEMBAR_SYNC 9 +.Os +.Sh NAME +.Nm paravirt_membar_sync +.Nd memory barrier for paravirtualized device drivers +.Sh SYNOPSIS +.In sys/paravirt_membar.h +.Ft void +.Fn paravirt_membar_sync "void" +.Sh DESCRIPTION +The +.Nm +function issues a store-before-load barrier for coordination with a +paravirtualized device. +.Pp +This function has the same ordering semantics as +.Xr membar_sync 3 , +but +.Xr membar_sync 3 +can only coordinate with other CPUs that +.Nx +is running on. +In a virtual machine, +.Nx +may be running on a single +.Em virtual +CPU, and patch +.Xr membar_sync 3 +to be a no-op, while the host side of a paravirtualized device may be +running on a different +.Em physical +CPU requiring a barrier that +.Xr membar_sync 3 +does not issue. +.Sh EXAMPLES +Submit a request to the host device, and notify the host to process +it\(embut elide the notification, which is expensive, if the host is +already reading requests anyway: +.Bd -literal + /* + * Write the request into the ring buffer. + */ + memcpy(cputodev_ring->buffer[sc->sc_cputodev_idx], request, + sizeof(*request)); + + /* + * Publish the request to the host device side. + */ + cputodev_ring->header->producer_tail = ++sc->sc_cputodev_idx; + + /* + * Ensure we have published it _before_ we check whether the + * host needs notification. + */ + paravirt_membar_sync(); + + /* + * Notify the host, if needed. Notifying the host is usually + * expensive (trap to hypervisor), so we try to avoid it if not + * needed. 
+ */
+	if (cputodev_ring->header->needs_notification)
+		notify_host();
+.Ed
+.Pp
+Enable interrupts from the host and check whether any were pending
+while interrupts were disabled:
+.Bd -literal
+	/*
+	 * Tell the host device to deliver interrupts after this
+	 * point.
+	 */
+restart:
+	devtocpu_ring->header->needs_notification = true;
+
+	/*
+	 * Ensure we have requested interrupts _before_ we check
+	 * whether we missed any notifications.
+	 */
+	paravirt_membar_sync();
+
+	/*
+	 * Check whether there were any pending notifications while
+	 * interrupts were blocked.  If not, stop here.
+	 */
+	idx = devtocpu_ring->header->producer_idx;
+	if (sc->sc_devtocpu_idx == idx)
+		return;
+
+	/*
+	 * Process the notifications.
+	 */
+	devtocpu_ring->header->needs_notification = false;
+	while (sc->sc_devtocpu_idx != idx) {
+		struct buffer *buf =
+		    devtocpu_ring->buffer[sc->sc_devtocpu_idx];
+		process_notification(buf);
+		sc->sc_devtocpu_idx++;
+		sc->sc_devtocpu_idx %= ringlen;
+	}
+	goto restart;
+.Ed
+.Pp
+.Sy "N.B.:"
+Other ordering or bouncing may be required with
+.Xr bus_dmamap_sync 9 ;
+this is independent of
+.Nm ,
+which is needed
+.Em in addition to
+.Xr bus_dmamap_sync 9
+to guarantee store-before-load ordering when there is no intervening
+I/O doorbell trigger for a DMA operation, nor interrupt delivery for a
+DMA completion.
+.Sh SEE ALSO
+.Xr membar_ops 3 ,
+.Xr bus_dma 9 ,
+.Xr bus_space 9
+.Sh HISTORY
+This function first appeared in
+.Nx 12.0 .
diff -r 969aab925f8a sys/arch/alpha/alpha/locore.s
--- a/sys/arch/alpha/alpha/locore.s	Mon Sep 01 04:47:03 2025 +0000
+++ b/sys/arch/alpha/alpha/locore.s	Fri Sep 05 02:22:11 2025 +0000
@@ -1524,3 +1524,18 @@ LEAF(alpha_write_fpcr, 1); f30save = 0;
 	lda	sp, framesz(sp)
 	RET
 	END(alpha_write_fpcr)
+
+LEAF(paravirt_membar_sync, 0)
+	/*
+	 * Store-before-load ordering with respect to matching logic
+	 * on the hypervisor side.
+	 *
+	 * This is the same as membar_sync, but without hotpatching
+	 * away the MB instruction on uniprocessor boots -- because
+	 * under virtualization, we still have to coordinate with a
+	 * `device' backed by a hypervisor that is potentially on
+	 * another physical CPU even if we observe only one virtual CPU
+	 * as the guest.
+	 */
+	mb
+END(paravirt_membar_sync)
diff -r 969aab925f8a sys/arch/amd64/amd64/cpufunc.S
--- a/sys/arch/amd64/amd64/cpufunc.S	Mon Sep 01 04:47:03 2025 +0000
+++ b/sys/arch/amd64/amd64/cpufunc.S	Fri Sep 05 02:22:11 2025 +0000
@@ -61,17 +61,17 @@ ENTRY(x86_mfence)
 	ret
 END(x86_mfence)
 
-#ifdef XEN
-ENTRY(xen_mb)
+ENTRY(paravirt_membar_sync)
 	/*
 	 * Store-before-load ordering with respect to matching logic
 	 * on the hypervisor side.
 	 *
 	 * This is the same as membar_sync, but without hotpatching
 	 * away the LOCK prefix on uniprocessor boots -- because under
-	 * Xen, we still have to coordinate with a `device' backed by a
-	 * hypervisor that is potentially on another physical CPU even
-	 * if we observe only one virtual CPU as the guest.
+	 * virtualization, we still have to coordinate with a `device'
+	 * backed by a hypervisor that is potentially on another
+	 * physical CPU even if we observe only one virtual CPU as the
+	 * guest.
* * See common/lib/libc/arch/x86_64/atomic/atomic.S for * rationale and keep this in sync with the implementation @@ -80,7 +80,10 @@ ENTRY(xen_mb) lock addq $0,-8(%rsp) ret -END(xen_mb) +END(paravirt_membar_sync) + +#ifdef XEN +STRONG_ALIAS(xen_mb,paravirt_membar_sync) #endif /* XEN */ #ifdef KDTRACE_HOOKS diff -r 969aab925f8a sys/arch/arm/arm/cpu_subr.c --- a/sys/arch/arm/arm/cpu_subr.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/arm/arm/cpu_subr.c Fri Sep 05 02:22:11 2025 +0000 @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -145,3 +146,33 @@ cpu_clr_mbox(int cpuindex) } #endif + +#if defined _ARM_ARCH_6 || defined _ARM_ARCH_7 /* see below regarding armv<6 */ +void +paravirt_membar_sync(void) +{ + + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + * + * Prior to armv6, there was no data memory barrier + * instruction. Such CPUs presumably don't exist in + * multiprocessor configurations. But what if we're running a + * _kernel_ built for a uniprocessor armv5 CPU, as a virtual + * machine guest of a _host_ with a newer multiprocessor CPU? + * How do we enforce store-before-load ordering for a + * paravirtualized device driver, coordinating with host + * software `device' potentially on another CPU? You'll have + * to answer that before you can use virtio drivers! + */ + dmb(ish); +} +#endif /* defined _ARM_ARCH_6 || defined _ARM_ARCH_7 */ diff -r 969aab925f8a sys/arch/hppa/hppa/support.S --- a/sys/arch/hppa/hppa/support.S Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/hppa/hppa/support.S Fri Sep 05 02:22:11 2025 +0000 @@ -304,3 +304,18 @@ LEAF_ENTRY(longjmp) ldi 1, %ret0 EXIT(longjmp) +LEAF_ENTRY(paravirt_membar_sync) + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + */ + bv %r0(%rp) + sync +EXIT(paravirt_membar_sync) diff -r 969aab925f8a sys/arch/i386/i386/cpufunc.S --- a/sys/arch/i386/i386/cpufunc.S Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/i386/i386/cpufunc.S Fri Sep 05 02:22:11 2025 +0000 @@ -67,17 +67,17 @@ ENTRY(x86_mfence) ret END(x86_mfence) -#ifdef XEN -ENTRY(xen_mb) +ENTRY(paravirt_membar_sync) /* * Store-before-load ordering with respect to matching logic * on the hypervisor side. * * This is the same as membar_sync, but without hotpatching * away the LOCK prefix on uniprocessor boots -- because under - * Xen, we still have to coordinate with a `device' backed by a - * hypervisor that is potentially on another physical CPU even - * if we observe only one virtual CPU as the guest. + * virtualization, we still have to coordinate with a `device' + * backed by a hypervisor that is potentially on another + * physical CPU even if we observe only one virtual CPU as the + * guest. 
* * See common/lib/libc/arch/i386/atomic/atomic.S for * rationale and keep this in sync with the implementation @@ -86,7 +86,10 @@ ENTRY(xen_mb) lock addl $0,-4(%esp) ret -END(xen_mb) +END(paravirt_membar_sync) + +#ifdef XEN +STRONG_ALIAS(xen_mb,paravirt_membar_sync) #endif /* XEN */ #ifdef KDTRACE_HOOKS diff -r 969aab925f8a sys/arch/mips/mips/cpu_subr.c --- a/sys/arch/mips/mips/cpu_subr.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/mips/mips/cpu_subr.c Fri Sep 05 02:22:11 2025 +0000 @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -1195,3 +1196,37 @@ cpuwatch_clr(cpu_watchpoint_t *cwp) } #endif /* (MIPS32 + MIPS32R2 + MIPS64 + MIPS64R2) > 0 */ + +#if (MIPS2 + MIPS3 + MIPS4 + MIPS5 + MIPS32 + MIPS32R2 + MIPS64 + MIPS64R2) > 0 +void +paravirt_membar_sync(void) +{ + + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + * + * Prior to MIPS-II, there was no SYNC instruction.[1] CPUs + * with only MIPS-I presumably don't exist in multiprocessor + * configurations. But what if we're running a _kernel_ built + * for a uniprocessor MIPS-I CPU, as a virtual machine guest of + * a _host_ with a newer multiprocessor CPU? How do we enforce + * store-before-load ordering for a paravirtualized device + * driver, coordinating with host software `device' potentially + * on another CPU? You'll have to answer that before you can + * use virtio drivers! + * + * [1] MIPS32 Architecture For Programmers, Volume II: The + * MIPS32 Instruction Set, Document Number: MD00086, + * Revision 0.95, March 12, 2001, MIPS Technologies, p. 215 + */ + __asm volatile("sync"); +} +#endif /* !MIPS1 */ diff -r 969aab925f8a sys/arch/riscv/riscv/cpu_subr.c --- a/sys/arch/riscv/riscv/cpu_subr.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/riscv/riscv/cpu_subr.c Fri Sep 05 02:22:11 2025 +0000 @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -428,3 +429,21 @@ cpu_ipi(struct cpu_info *ci) } #endif + +void +paravirt_membar_sync(void) +{ + + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + */ + __asm volatile("fence rw,rw"); +} diff -r 969aab925f8a sys/arch/sparc/sparc/locore.s --- a/sys/arch/sparc/sparc/locore.s Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/sparc/sparc/locore.s Fri Sep 05 02:22:11 2025 +0000 @@ -6001,6 +6001,21 @@ Lpanic_spunout: .asciz "cpu%d: stuck on lock@%x" _ALIGN +ENTRY(paravirt_membar_sync) + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. 
+ * + * This is the same as membar_sync, but without + * conditionalizing away the LDSTUB instruction on uniprocessor + * builds -- because under virtualization, we still have to + * coordinate with a `device' backed by a hypervisor that is + * potentially on another physical CPU even if we observe only + * one virtual CPU as the guest. + */ + ldstub [%sp - 4], %g0 /* makeshift store-before-load barrier */ +END(paravirt_membar_sync) + #if defined(KGDB) || defined(DDB) || defined(DIAGNOSTIC) /* * Write all windows (user or otherwise), except the current one. diff -r 969aab925f8a sys/arch/sparc64/sparc64/locore.s --- a/sys/arch/sparc64/sparc64/locore.s Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/sparc64/sparc64/locore.s Fri Sep 05 02:22:11 2025 +0000 @@ -7948,6 +7948,26 @@ ENTRY(sparc64_ipi_ccall) #endif +ENTRY(paravirt_membar_sync) + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but without patching or + * conditionalizing away the MEMBAR instruction on uniprocessor + * builds or boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + * + * See common/lib/libc/arch/sparc64/atomic/membar_ops.S for why + * we avoid using the delay slot and keep this in sync with the + * implementation of membar_sync there. + */ + membar #StoreLoad + retl + nop +END(paravirt_membar_sync) .data _ALIGN diff -r 969aab925f8a sys/arch/virt68k/virt68k/locore.s --- a/sys/arch/virt68k/virt68k/locore.s Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/virt68k/virt68k/locore.s Fri Sep 05 02:22:11 2025 +0000 @@ -598,6 +598,27 @@ ENTRY(ecacheon) ENTRY(ecacheoff) rts +ENTRY(paravirt_membar_sync) + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + * + * I don't see an obvious ordering-only instruction in the m68k + * instruction set, but qemu implements CAS with + * store-before-load ordering, so this should work for virtio. + */ + clrl %d0 + casl %d0,%d0,%sp@ + rts +END(paravirt_membar_sync) + /* * Misc. global variables. 
*/ diff -r 969aab925f8a sys/dev/hyperv/vmbus.c --- a/sys/dev/hyperv/vmbus.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/dev/hyperv/vmbus.c Fri Sep 05 02:22:11 2025 +0000 @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -791,6 +792,7 @@ vmbus_message_proc(void *arg, struct cpu msg = (struct vmbus_message *)sc->sc_percpu[cpu_index(ci)].simp + VMBUS_SINT_MESSAGE; + /* XXX bus_dmamap_sync(POSTREAD|POSTWRITE) on msg_type */ if (__predict_false(msg->msg_type != HYPERV_MSGTYPE_NONE)) { if (__predict_true(!cold)) softint_schedule_cpu(sc->sc_msg_sih, ci); @@ -813,9 +815,12 @@ vmbus_message_softintr(void *arg) for (;;) { msg = (struct vmbus_message *)sc->sc_percpu[cpu].simp + VMBUS_SINT_MESSAGE; + /* XXX bus_dmamap_sync(POSTREAD|POSTWRITE) on msg_type */ if (msg->msg_type == HYPERV_MSGTYPE_NONE) break; + /* XXX bus_dmamap_sync(POSTREAD) on msg_data */ + hdr = (struct vmbus_chanmsg_hdr *)msg->msg_data; type = hdr->chm_type; if (type >= VMBUS_CHANMSG_COUNT) { @@ -831,10 +836,22 @@ vmbus_message_softintr(void *arg) } } + /* XXX bus_dmamap_sync(PREREAD) on msg_data */ + msg->msg_type = HYPERV_MSGTYPE_NONE; - membar_sync(); + /* XXX bus_dmamap_sync(PREWRITE|PREREAD) on msg_type */ + + /* + * Ensure we tell the host that this message is done + * before we check whether the host told us there are + * more pending. + */ + paravirt_membar_sync(); + + /* XXX bus_dmamap_sync(POSTREAD) on msg_flags */ if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) hyperv_send_eom(); + /* XXX bus_dmamap_sync(PREREAD) on msg_flags */ } } @@ -1655,8 +1672,10 @@ static __inline void vmbus_ring_avail(struct vmbus_ring_data *rd, uint32_t *towrite, uint32_t *toread) { + /* XXX bus_dmamap_sync(POSTREAD) on br_rindex/br_windex */ uint32_t ridx = rd->rd_ring->br_rindex; uint32_t widx = rd->rd_ring->br_windex; + /* XXX bus_dmamap_sync(PREREAD) on br_rindex/br_windex */ uint32_t r, w; if (widx >= ridx) @@ -1674,7 +1693,9 @@ static bool vmbus_ring_is_empty(struct vmbus_ring_data *rd) { + /* XXX bus_dmamap_sync(POSTREAD) on br_rindex/br_windex */ return rd->rd_ring->br_rindex == rd->rd_ring->br_windex; + /* XXX bus_dmamap_sync(PREREAD) on br_rindex/br_windex */ } static int @@ -1698,15 +1719,27 @@ vmbus_ring_write(struct vmbus_ring_data oprod = wrd->rd_prod; + /* XXX bus_dmamap_sync(POSTWRITE) on ring data */ + for (i = 0; i < iov_cnt; i++) vmbus_ring_put(wrd, iov[i].iov_base, iov[i].iov_len); indices = (uint64_t)oprod << 32; vmbus_ring_put(wrd, (uint8_t *)&indices, sizeof(indices)); - membar_sync(); + /* XXX bus_dmamap_sync(PREWRITE) on ring data */ + + membar_sync(); /* XXX bus_dmamap_sync(POSTWRITE) on br_windex */ wrd->rd_ring->br_windex = wrd->rd_prod; - membar_sync(); + /* XXX bus_dmamap_sync(PREWRITE) on br_windex */ + + /* + * Ensure we publish the producer index _before_ we check + * whether the host needs to be notified. 
+ */ + paravirt_membar_sync(); + + /* XXX bus_dmamap_sync(POSTREAD) on br_rindex */ /* Signal when the ring transitions from being empty to non-empty */ if (wrd->rd_ring->br_imask == 0 && @@ -1715,6 +1748,8 @@ vmbus_ring_write(struct vmbus_ring_data else *needsig = 0; + /* XXX bus_dmamap_sync(PREREAD) on br_rindex */ + return 0; } @@ -1874,6 +1909,8 @@ vmbus_ring_read(struct vmbus_ring_data * return EAGAIN; } + /* XXX bus_dmamap_sync(POSTREAD) on ring data */ + if (offset) { rrd->rd_cons += offset; if (rrd->rd_cons >= rrd->rd_dsize) @@ -1883,8 +1920,11 @@ vmbus_ring_read(struct vmbus_ring_data * vmbus_ring_get(rrd, (uint8_t *)data, datalen, 0); vmbus_ring_get(rrd, (uint8_t *)&indices, sizeof(indices), 0); - membar_sync(); + /* XXX bus_dmamap_sync(PREREAD) on ring data */ + + membar_sync(); /* XXX bus_dmamap_sync(POSTWRITE) on br_rindex */ rrd->rd_ring->br_rindex = rrd->rd_cons; + /* XXX bus_dmamap_sync(PREWRITE) on br_rindex */ return 0; } @@ -1931,18 +1971,18 @@ static inline void vmbus_ring_mask(struct vmbus_ring_data *rd) { - membar_sync(); + membar_sync(); /* XXX bus_dmamap_sync(POSTWRITE) on br_imask */ rd->rd_ring->br_imask = 1; - membar_sync(); + membar_sync(); /* XXX bus_dmamap_sync(PREWRITE) on br_imask */ } static inline void vmbus_ring_unmask(struct vmbus_ring_data *rd) { - membar_sync(); + membar_sync(); /* XXX bus_dmamap_sync(POSTWRITE) on br_imask */ rd->rd_ring->br_imask = 0; - membar_sync(); + membar_sync(); /* XXX bus_dmamap_sync(PREWRITE) on br_imask */ } void @@ -1962,6 +2002,14 @@ vmbus_channel_unpause(struct vmbus_chann atomic_and_ulong(&ch->ch_sc->sc_evtmask[ch->ch_id / VMBUS_EVTFLAG_LEN], ~__BIT(ch->ch_id % VMBUS_EVTFLAG_LEN)); vmbus_ring_unmask(&ch->ch_rrd); + + /* + * Ensure we announce to the host side that we are accepting + * interrupts _before_ we check whether any pending events had + * come over the ring while we weren't accepting interrupts. + */ + paravirt_membar_sync(); + vmbus_ring_avail(&ch->ch_rrd, NULL, &avail); return avail; diff -r 969aab925f8a sys/dev/pci/pvscsi.c --- a/sys/dev/pci/pvscsi.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/dev/pci/pvscsi.c Fri Sep 05 02:22:11 2025 +0000 @@ -63,13 +63,13 @@ in the file called LICENSE.GPL. 
#include -#include #include #include #include #include #include #include +#include #include #include #include @@ -320,6 +320,18 @@ CFATTACH_DECL3_NEW(pvscsi, sizeof(struct pvscsi_probe, pvscsi_attach, pvscsi_detach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); +#define PVSCSI_DMA_SYNC_STATE(sc, dma, structptr, member, ops) \ + bus_dmamap_sync((sc)->sc_dmat, (dma)->map, \ + /*offset*/offsetof(__typeof__(*(structptr)), member), \ + /*length*/sizeof((structptr)->member), \ + (ops)) + +#define PVSCSI_DMA_SYNC_RING(sc, dma, ring, idx, ops) \ + bus_dmamap_sync((sc)->sc_dmat, (dma)->map, \ + /*offset*/sizeof(*(ring)) * (idx), \ + /*length*/sizeof(*(ring)), \ + (ops)) + static inline uint32_t pvscsi_reg_read(struct pvscsi_softc *sc, uint32_t offset) { @@ -371,6 +383,7 @@ pvscsi_intr_disable(struct pvscsi_softc static void pvscsi_kick_io(struct pvscsi_softc *sc, uint8_t cdb0) { + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; DEBUG_PRINTF(2, sc->dev, "%s: cdb0 %#x\n", __func__, cdb0); @@ -378,8 +391,18 @@ pvscsi_kick_io(struct pvscsi_softc *sc, cdb0 == READ_12 || cdb0 == READ_16 || cdb0 == SCSI_WRITE_6_COMMAND || cdb0 == WRITE_10 || cdb0 == WRITE_12 || cdb0 == WRITE_16) { + s_dma = &sc->rings_state_dma; s = sc->rings_state; + /* + * Ensure the command has been published before we read + * req_cons_idx to test whether we need to kick the + * host. + */ + paravirt_membar_sync(); + + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_cons_idx, + BUS_DMASYNC_POSTREAD); DEBUG_PRINTF(2, sc->dev, "%s req prod %d cons %d\n", __func__, s->req_prod_idx, s->req_cons_idx); if (!sc->use_req_call_threshold || @@ -390,8 +413,14 @@ pvscsi_kick_io(struct pvscsi_softc *sc, } else { DEBUG_PRINTF(2, sc->dev, "wtf\n"); } + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_cons_idx, + BUS_DMASYNC_PREREAD); } else { s = sc->rings_state; + /* + * XXX req_cons_idx in debug log might be stale, but no + * need for DMA sync otherwise in this branch + */ DEBUG_PRINTF(1, sc->dev, "%s req prod %d cons %d not checked\n", __func__, s->req_prod_idx, s->req_cons_idx); @@ -497,6 +526,15 @@ static int pvscsi_setup_req_call(struct &cmd, sizeof(cmd)); status = pvscsi_reg_read(sc, PVSCSI_REG_OFFSET_COMMAND_STATUS); + /* + * After setup, sync req_call_threshold before use. + * After this point it should be stable, so no need to + * sync again during use. 
+ */ + PVSCSI_DMA_SYNC_STATE(sc, &sc->rings_state_dma, + sc->rings_state, req_call_threshold, + BUS_DMASYNC_POSTREAD); + return (status != 0); } else { return (0); @@ -585,6 +623,10 @@ pvscsi_dma_alloc_ppns(struct pvscsi_soft return (error); } + memset(dma->vaddr, 0, num_pages * PAGE_SIZE); + bus_dmamap_sync(sc->sc_dmat, dma->map, 0, num_pages * PAGE_SIZE, + BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE); + ppn = dma->paddr >> PAGE_SHIFT; for (i = 0; i < num_pages; i++) { ppn_list[i] = ppn + i; @@ -681,6 +723,16 @@ static void pvscsi_free_rings(struct pvscsi_softc *sc) { + bus_dmamap_sync(sc->sc_dmat, sc->rings_state_dma.map, + 0, sc->rings_state_dma.size, + BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); + bus_dmamap_sync(sc->sc_dmat, sc->req_ring_dma.map, + 0, sc->req_ring_dma.size, + BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); + bus_dmamap_sync(sc->sc_dmat, sc->cmp_ring_dma.map, + 0, sc->cmp_ring_dma.size, + BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); + pvscsi_dma_free(sc, &sc->rings_state_dma); pvscsi_dma_free(sc, &sc->req_ring_dma); pvscsi_dma_free(sc, &sc->cmp_ring_dma); @@ -762,6 +814,18 @@ pvscsi_setup_rings(struct pvscsi_softc * } pvscsi_write_cmd(sc, PVSCSI_CMD_SETUP_RINGS, &cmd, sizeof(cmd)); + + /* + * After setup, sync *_num_entries_log2 before use. After this + * point they should be stable, so no need to sync again during + * use. + */ + PVSCSI_DMA_SYNC_STATE(sc, &sc->rings_state_dma, + sc->rings_state, req_num_entries_log2, + BUS_DMASYNC_POSTREAD); + PVSCSI_DMA_SYNC_STATE(sc, &sc->rings_state_dma, + sc->rings_state, cmp_num_entries_log2, + BUS_DMASYNC_POSTREAD); } static int @@ -792,6 +856,15 @@ pvscsi_setup_msg_ring(struct pvscsi_soft } pvscsi_write_cmd(sc, PVSCSI_CMD_SETUP_MSG_RING, &cmd, sizeof(cmd)); + + /* + * After setup, sync msg_num_entries_log2 before use. After + * this point it should be stable, so no need to sync again + * during use. + */ + PVSCSI_DMA_SYNC_STATE(sc, &sc->rings_state_dma, + sc->rings_state, msg_num_entries_log2, + BUS_DMASYNC_POSTREAD); } static void @@ -1078,26 +1151,36 @@ pvscsi_process_completion(struct pvscsi_ static void pvscsi_process_cmp_ring(struct pvscsi_softc *sc) { + struct pvscsi_dma *ring_dma; struct pvscsi_ring_cmp_desc *ring; + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; struct pvscsi_ring_cmp_desc *e; uint32_t mask; KASSERT(mutex_owned(&sc->lock)); + s_dma = &sc->rings_state_dma; s = sc->rings_state; + ring_dma = &sc->cmp_ring_dma; ring = sc->cmp_ring; mask = MASK(s->cmp_num_entries_log2); - while (true) { + for (;;) { + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx, + BUS_DMASYNC_POSTREAD); size_t crpidx = s->cmp_prod_idx; - membar_acquire(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx, + BUS_DMASYNC_PREREAD); if (s->cmp_cons_idx == crpidx) break; size_t crcidx = s->cmp_cons_idx & mask; + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, crcidx, + BUS_DMASYNC_POSTREAD); + e = ring + crcidx; pvscsi_process_completion(sc, e); @@ -1106,8 +1189,19 @@ pvscsi_process_cmp_ring(struct pvscsi_so * ensure completion processing reads happen before write to * (increment of) cmp_cons_idx */ - membar_release(); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, crcidx, + BUS_DMASYNC_PREREAD); + + /* + * XXX Not actually sure the `device' does DMA for + * s->cmp_cons_idx at all -- qemu doesn't. If not, we + * can skip these DMA syncs. 
+ */ + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx, + BUS_DMASYNC_POSTWRITE); s->cmp_cons_idx++; + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx, + BUS_DMASYNC_PREWRITE); } } @@ -1152,26 +1246,36 @@ pvscsi_process_msg(struct pvscsi_softc * static void pvscsi_process_msg_ring(struct pvscsi_softc *sc) { + struct pvscsi_dma *ring_dma; struct pvscsi_ring_msg_desc *ring; + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; struct pvscsi_ring_msg_desc *e; uint32_t mask; KASSERT(mutex_owned(&sc->lock)); + s_dma = &sc->rings_state_dma; s = sc->rings_state; + ring_dma = &sc->msg_ring_dma; ring = sc->msg_ring; mask = MASK(s->msg_num_entries_log2); - while (true) { + for (;;) { + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_prod_idx, + BUS_DMASYNC_POSTREAD); size_t mpidx = s->msg_prod_idx; // dma read (device -> cpu) - membar_acquire(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_prod_idx, + BUS_DMASYNC_PREREAD); if (s->msg_cons_idx == mpidx) break; size_t mcidx = s->msg_cons_idx & mask; + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, mcidx, + BUS_DMASYNC_POSTREAD); + e = ring + mcidx; pvscsi_process_msg(sc, e); @@ -1180,8 +1284,14 @@ pvscsi_process_msg_ring(struct pvscsi_so * ensure message processing reads happen before write to * (increment of) msg_cons_idx */ - membar_release(); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, mcidx, + BUS_DMASYNC_PREREAD); + + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_cons_idx, + BUS_DMASYNC_POSTWRITE); s->msg_cons_idx++; + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_cons_idx, + BUS_DMASYNC_PREWRITE); } } @@ -1244,8 +1354,10 @@ pvscsi_scsipi_request(struct scsipi_chan #endif uint32_t req_num_entries_log2; + struct pvscsi_dma *ring_dma; struct pvscsi_ring_req_desc *ring; struct pvscsi_ring_req_desc *e; + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; struct pvscsi_hcb *hcb; @@ -1258,7 +1370,9 @@ pvscsi_scsipi_request(struct scsipi_chan return; } + ring_dma = &sc->req_ring_dma; ring = sc->req_ring; + s_dma = &sc->rings_state_dma; s = sc->rings_state; hcb = NULL; @@ -1292,6 +1406,7 @@ pvscsi_scsipi_request(struct scsipi_chan hcb->xs = xs; const size_t rridx = s->req_prod_idx & MASK(req_num_entries_log2); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, rridx, BUS_DMASYNC_POSTWRITE); e = ring + rridx; memset(e, 0, sizeof(*e)); @@ -1391,7 +1506,7 @@ pvscsi_scsipi_request(struct scsipi_chan * Ensure request record writes happen before write to (increment of) * req_prod_idx. */ - membar_producer(); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, rridx, BUS_DMASYNC_PREWRITE); uint8_t cdb0 = e->cdb[0]; @@ -1404,13 +1519,16 @@ pvscsi_scsipi_request(struct scsipi_chan callout_reset(&xs->xs_callout, timeout, pvscsi_timeout, hcb); } + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_prod_idx, + BUS_DMASYNC_POSTWRITE); s->req_prod_idx++; /* * Ensure req_prod_idx write (increment) happens before * IO is kicked (via a write). */ - membar_producer(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_prod_idx, + BUS_DMASYNC_PREWRITE); pvscsi_kick_io(sc, cdb0); mutex_exit(&sc->lock); diff -r 969aab925f8a sys/dev/pci/virtio.c --- a/sys/dev/pci/virtio.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/dev/pci/virtio.c Fri Sep 05 02:22:11 2025 +0000 @@ -38,6 +38,7 @@ #include #include #include +#include #define VIRTIO_PRIVATE @@ -708,6 +709,13 @@ virtio_start_vq_intr(struct virtio_softc } vq->vq_queued++; + /* + * Ensure we announce to the host side that we are accepting + * interrupts _before_ we check whether any pending events had + * come over the queue while we weren't accepting interrupts. 
+	 */
+	paravirt_membar_sync();
+
 	vq_sync_uring_header(sc, vq, BUS_DMASYNC_POSTREAD);
 	if (vq->vq_used_idx == virtio_rw16(sc, vq->vq_used->idx))
 		return 0;
@@ -1252,6 +1260,12 @@ notify:
 		vq_sync_aring_header(sc, vq, BUS_DMASYNC_PREWRITE);
 		vq->vq_queued++;
 
+		/*
+		 * Ensure we publish the avail idx _before_ we check whether
+		 * the host needs to be notified.
+		 */
+		paravirt_membar_sync();
+
 		if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) {
 			vq_sync_uring_avail(sc, vq, BUS_DMASYNC_POSTREAD);
 			t = virtio_rw16(sc, *vq->vq_avail_event) + 1;
diff -r 969aab925f8a sys/sys/paravirt_membar.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/sys/paravirt_membar.h	Fri Sep 05 02:22:11 2025 +0000
@@ -0,0 +1,34 @@
+/*	$NetBSD$	*/
+
+/*-
+ * Copyright (c) 2025 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_PARAVIRT_MEMBAR_H_
+#define	_SYS_PARAVIRT_MEMBAR_H_
+
+void	paravirt_membar_sync(void);
+
+#endif	/* _SYS_PARAVIRT_MEMBAR_H_ */
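
Usage note: the protocol that the virtio(4) and vmbus(4) hunks above implement -- publish the producer index, issue paravirt_membar_sync(), then read the host's notification-suppression flag -- is sketched below.  This is an illustrative sketch only, not part of the changeset: the ring layout, the pv_softc fields, and pv_notify_doorbell() are hypothetical stand-ins for whatever a real paravirtualized device defines; only bus_dmamap_sync(9) and paravirt_membar_sync() are real interfaces, and the bus_dmamap_sync() calls assume a single DMA map covering the whole ring.

/*
 * Illustrative sketch only; ring layout, softc fields, and
 * pv_notify_doorbell() are hypothetical.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/paravirt_membar.h>

struct pv_ring {
	volatile uint32_t	pr_prod_idx;	/* guest writes, host reads */
	volatile uint32_t	pr_no_notify;	/* host writes: 1 = skip doorbell */
	/* ... descriptors follow ... */
};

struct pv_softc {
	bus_dma_tag_t		sc_dmat;
	bus_dmamap_t		sc_ring_map;
	struct pv_ring		*sc_ring;
	uint32_t		sc_prod;
};

void	pv_notify_doorbell(struct pv_softc *);	/* hypothetical doorbell write */

static void
pv_submit(struct pv_softc *sc)
{
	struct pv_ring *r = sc->sc_ring;

	/* Publish the new producer index to the host. */
	r->pr_prod_idx = ++sc->sc_prod;
	bus_dmamap_sync(sc->sc_dmat, sc->sc_ring_map,
	    offsetof(struct pv_ring, pr_prod_idx), sizeof(r->pr_prod_idx),
	    BUS_DMASYNC_PREWRITE);

	/*
	 * Store-before-load: make the producer index visible to the
	 * host before reading its notification-suppression flag.
	 * membar_sync(3) may be patched to a no-op on a uniprocessor
	 * guest, so use paravirt_membar_sync() instead.
	 */
	paravirt_membar_sync();

	bus_dmamap_sync(sc->sc_dmat, sc->sc_ring_map,
	    offsetof(struct pv_ring, pr_no_notify), sizeof(r->pr_no_notify),
	    BUS_DMASYNC_POSTREAD);
	if (r->pr_no_notify == 0)
		pv_notify_doorbell(sc);		/* expensive trap to host */
}

On the consumer side the same barrier sits between re-enabling host interrupts and re-checking the producer index, as in virtio_start_vq_intr() and vmbus_channel_unpause() above.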