diff -r 969aab925f8a common/lib/libc/arch/i386/atomic/atomic.S --- a/common/lib/libc/arch/i386/atomic/atomic.S Mon Sep 01 04:47:03 2025 +0000 +++ b/common/lib/libc/arch/i386/atomic/atomic.S Fri Sep 05 02:22:11 2025 +0000 @@ -212,7 +212,8 @@ ENTRY(_membar_sync) * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ * https://www.agner.org/optimize/instruction_tables.pdf * - * Sync with xen_mb in sys/arch/i386/i386/cpufunc.S. + * Sync with paravirt_membar_sync in + * sys/arch/i386/i386/cpufunc.S. */ LOCK addl $0, -4(%esp) diff -r 969aab925f8a common/lib/libc/arch/sparc64/atomic/membar_ops.S --- a/common/lib/libc/arch/sparc64/atomic/membar_ops.S Mon Sep 01 04:47:03 2025 +0000 +++ b/common/lib/libc/arch/sparc64/atomic/membar_ops.S Fri Sep 05 02:22:11 2025 +0000 @@ -72,6 +72,9 @@ ENTRY(_membar_sync) * https://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/sparc-2i-usersmanual-2516677.pdf#page=518 * * So let's avoid doing that. + * + * Sync with paravirt_membar_sync in + * sys/arch/sparc64/sparc64/locore.s. */ membar #StoreLoad retl diff -r 969aab925f8a common/lib/libc/arch/x86_64/atomic/atomic.S --- a/common/lib/libc/arch/x86_64/atomic/atomic.S Mon Sep 01 04:47:03 2025 +0000 +++ b/common/lib/libc/arch/x86_64/atomic/atomic.S Fri Sep 05 02:22:11 2025 +0000 @@ -287,7 +287,8 @@ ENTRY(_membar_sync) * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ * https://www.agner.org/optimize/instruction_tables.pdf * - * Sync with xen_mb in sys/arch/amd64/amd64/cpufunc.S. + * Sync with paravirt_membar_sync in + * sys/arch/amd64/amd64/cpufunc.S. */ LOCK addq $0, -8(%rsp) diff -r 969aab925f8a distrib/sets/lists/comp/mi --- a/distrib/sets/lists/comp/mi Mon Sep 01 04:47:03 2025 +0000 +++ b/distrib/sets/lists/comp/mi Fri Sep 05 02:22:11 2025 +0000 @@ -12925,6 +12925,7 @@ ./usr/share/man/cat9/optstr_get.0 comp-sys-catman .cat ./usr/share/man/cat9/p_find.0 comp-obsolete obsolete ./usr/share/man/cat9/panic.0 comp-sys-catman .cat +./usr/share/man/cat9/paravirt_membar_sync.0 comp-sys-catman .cat ./usr/share/man/cat9/pathbuf.0 comp-sys-catman .cat ./usr/share/man/cat9/pci.0 comp-sys-catman .cat ./usr/share/man/cat9/pci_conf_hook.0 comp-sys-catman .cat @@ -21789,6 +21790,7 @@ ./usr/share/man/html9/optstr_get.html comp-sys-htmlman html ./usr/share/man/html9/p_find.html comp-obsolete obsolete ./usr/share/man/html9/panic.html comp-sys-htmlman html +./usr/share/man/html9/paravirt_membar_sync.html comp-sys-htmlman html ./usr/share/man/html9/pathbuf.html comp-sys-htmlman html ./usr/share/man/html9/pci.html comp-sys-htmlman html ./usr/share/man/html9/pci_conf_hook.html comp-sys-htmlman html @@ -30810,6 +30812,7 @@ ./usr/share/man/man9/optstr_get.9 comp-sys-man .man ./usr/share/man/man9/p_find.9 comp-obsolete obsolete ./usr/share/man/man9/panic.9 comp-sys-man .man +./usr/share/man/man9/paravirt_membar_sync.9 comp-sys-man .man ./usr/share/man/man9/pathbuf.9 comp-sys-man .man ./usr/share/man/man9/pci.9 comp-sys-man .man ./usr/share/man/man9/pci_conf_hook.9 comp-sys-man .man diff -r 969aab925f8a share/man/man9/Makefile --- a/share/man/man9/Makefile Mon Sep 01 04:47:03 2025 +0000 +++ b/share/man/man9/Makefile Fri Sep 05 02:22:11 2025 +0000 @@ -41,7 +41,12 @@ MAN= accept_filter.9 accf_data.9 accf_ht microseq.9 microtime.9 microuptime.9 mi_switch.9 module.9 \ mstohz.9 mutex.9 m_tag.9 namecache.9 \ namei.9 nullop.9 opencrypto.9 optstr.9 \ - panic.9 pathbuf.9 pci.9 pci_configure_bus.9 pci_intr.9 \ + panic.9 \ + paravirt_membar_sync.9 \ + pathbuf.9 \ + pci.9 \ + pci_configure_bus.9 \ + 
pci_intr.9 \ pci_msi.9 pckbport.9 pcmcia.9 pcq.9 pcu.9 \ percpu.9 pfil.9 physio.9 pktqueue.9 pmap.9 pmatch.9 pmf.9 pool.9 \ pool_cache.9 portfeatures.9 powerhook_establish.9 ppsratecheck.9 \ diff -r 969aab925f8a share/man/man9/paravirt_membar_sync.9 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/share/man/man9/paravirt_membar_sync.9 Fri Sep 05 02:22:11 2025 +0000 @@ -0,0 +1,148 @@ +.\" $NetBSD$ +.\" +.\" Copyright (c) 2025 The NetBSD Foundation +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. +.\" +.Dd August 31, 2025 +.Dt PARAVIRT_MEMBAR_SYNC 9 +.Os +.Sh NAME +.Nm paravirt_membar_sync +.Nd memory barrier for paravirtualized device drivers +.Sh SYNOPSIS +.In sys/paravirt_membar.h +.Ft void +.Fn paravirt_membar_sync "void" +.Sh DESCRIPTION +The +.Nm +function issues a store-before-load barrier for coordination with a +paravirtualized device. +.Pp +This function has the same ordering semantics as +.Xr membar_sync 3 , +but +.Xr membar_sync 3 +can only coordinate with other CPUs that +.Nx +is running on. +In a virtual machine, +.Nx +may be running on a single +.Em virtual +CPU, and patch +.Xr membar_sync 3 +to be a no-op, while the host side of a paravirtualized device may be +running on a different +.Em physical +CPU requiring a barrier that +.Xr membar_sync 3 +does not issue. +.Sh EXAMPLES +Submit a request to the host device, and notify the host to process +it\(embut elide the notification, which is expensive, if the host is +already reading requests anyway: +.Bd -literal + /* + * Write the request into the ring buffer. + */ + memcpy(cputodev_ring->buffer[sc->sc_cputodev_idx], request, + sizeof(*request)); + + /* + * Publish the request to the host device side. + */ + cputodev_ring->header->producer_tail = ++sc->sc_cputodev_idx; + + /* + * Ensure we have published it _before_ we check whether the + * host needs notification. + */ + paravirt_membar_sync(); + + /* + * Notify the host, if needed. Notifying the host is usually + * expensive (trap to hypervisor), so we try to avoid it if not + * needed. 
+ */
+	if (cputodev_ring->header->needs_notification)
+		notify_host();
+.Ed
+.Pp
+Enable interrupts from the host and check whether any were pending
+while interrupts were disabled:
+.Bd -literal
+	/*
+	 * Tell the host device to deliver interrupts after this
+	 * point.
+	 */
+restart:
+	devtocpu_ring->header->needs_notification = true;
+
+	/*
+	 * Ensure we have requested interrupts _before_ we check
+	 * whether we missed any notifications.
+	 */
+	paravirt_membar_sync();
+
+	/*
+	 * Check whether there were any pending notifications while
+	 * interrupts were blocked.  If not, stop here.
+	 */
+	idx = devtocpu_ring->header->producer_idx;
+	if (sc->sc_devtocpu_idx == idx)
+		return;
+
+	/*
+	 * Process the notifications.
+	 */
+	devtocpu_ring->header->needs_notification = false;
+	while (sc->sc_devtocpu_idx != idx) {
+		struct buffer *buf =
+		    devtocpu_ring->buffer[sc->sc_devtocpu_idx];
+		process_notification(buf);
+		sc->sc_devtocpu_idx++;
+		sc->sc_devtocpu_idx %= ringlen;
+	}
+	goto restart;
+.Ed
+.Pp
+.Sy "N.B.:"
+Other ordering or bouncing may be required with
+.Xr bus_dmamap_sync 9 ;
+this is independent of
+.Nm ,
+which is needed
+.Em in addition to
+.Xr bus_dmamap_sync 9
+to guarantee store-before-load ordering when there is no intervening
+I/O doorbell trigger for a DMA operation, nor interrupt delivery for a
+DMA completion.
+.Sh SEE ALSO
+.Xr membar_ops 3 ,
+.Xr bus_dma 9 ,
+.Xr bus_space 9
+.Sh HISTORY
+This function first appeared in
+.Nx 12.0 .
diff -r 969aab925f8a sys/arch/alpha/alpha/locore.s
--- a/sys/arch/alpha/alpha/locore.s	Mon Sep 01 04:47:03 2025 +0000
+++ b/sys/arch/alpha/alpha/locore.s	Fri Sep 05 02:22:11 2025 +0000
@@ -1524,3 +1524,18 @@ LEAF(alpha_write_fpcr, 1); f30save = 0;
 	lda	sp, framesz(sp)
 	RET
 	END(alpha_write_fpcr)
+
+LEAF(paravirt_membar_sync, 0)
+	/*
+	 * Store-before-load ordering with respect to matching logic
+	 * on the hypervisor side.
+	 *
+	 * This is the same as membar_sync, but without hotpatching
+	 * away the MB instruction on uniprocessor boots -- because
+	 * under virtualization, we still have to coordinate with a
+	 * `device' backed by a hypervisor that is potentially on
+	 * another physical CPU even if we observe only one virtual CPU
+	 * as the guest.
+	 */
+	mb
+END(paravirt_membar_sync)
diff -r 969aab925f8a sys/arch/amd64/amd64/cpufunc.S
--- a/sys/arch/amd64/amd64/cpufunc.S	Mon Sep 01 04:47:03 2025 +0000
+++ b/sys/arch/amd64/amd64/cpufunc.S	Fri Sep 05 02:22:11 2025 +0000
@@ -61,17 +61,17 @@ ENTRY(x86_mfence)
 	ret
 END(x86_mfence)
 
-#ifdef XEN
-ENTRY(xen_mb)
+ENTRY(paravirt_membar_sync)
 	/*
 	 * Store-before-load ordering with respect to matching logic
 	 * on the hypervisor side.
 	 *
 	 * This is the same as membar_sync, but without hotpatching
 	 * away the LOCK prefix on uniprocessor boots -- because under
-	 * Xen, we still have to coordinate with a `device' backed by a
-	 * hypervisor that is potentially on another physical CPU even
-	 * if we observe only one virtual CPU as the guest.
+	 * virtualization, we still have to coordinate with a `device'
+	 * backed by a hypervisor that is potentially on another
+	 * physical CPU even if we observe only one virtual CPU as the
+	 * guest.
* * See common/lib/libc/arch/x86_64/atomic/atomic.S for * rationale and keep this in sync with the implementation @@ -80,7 +80,10 @@ ENTRY(xen_mb) lock addq $0,-8(%rsp) ret -END(xen_mb) +END(paravirt_membar_sync) + +#ifdef XEN +STRONG_ALIAS(xen_mb,paravirt_membar_sync) #endif /* XEN */ #ifdef KDTRACE_HOOKS diff -r 969aab925f8a sys/arch/arm/arm/cpu_subr.c --- a/sys/arch/arm/arm/cpu_subr.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/arm/arm/cpu_subr.c Fri Sep 05 02:22:11 2025 +0000 @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -145,3 +146,33 @@ cpu_clr_mbox(int cpuindex) } #endif + +#if defined _ARM_ARCH_6 || defined _ARM_ARCH_7 /* see below regarding armv<6 */ +void +paravirt_membar_sync(void) +{ + + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + * + * Prior to armv6, there was no data memory barrier + * instruction. Such CPUs presumably don't exist in + * multiprocessor configurations. But what if we're running a + * _kernel_ built for a uniprocessor armv5 CPU, as a virtual + * machine guest of a _host_ with a newer multiprocessor CPU? + * How do we enforce store-before-load ordering for a + * paravirtualized device driver, coordinating with host + * software `device' potentially on another CPU? You'll have + * to answer that before you can use virtio drivers! + */ + dmb(ish); +} +#endif /* defined _ARM_ARCH_6 || defined _ARM_ARCH_7 */ diff -r 969aab925f8a sys/arch/hppa/hppa/support.S --- a/sys/arch/hppa/hppa/support.S Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/hppa/hppa/support.S Fri Sep 05 02:22:11 2025 +0000 @@ -304,3 +304,18 @@ LEAF_ENTRY(longjmp) ldi 1, %ret0 EXIT(longjmp) +LEAF_ENTRY(paravirt_membar_sync) + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + */ + bv %r0(%rp) + sync +EXIT(paravirt_membar_sync) diff -r 969aab925f8a sys/arch/i386/i386/cpufunc.S --- a/sys/arch/i386/i386/cpufunc.S Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/i386/i386/cpufunc.S Fri Sep 05 02:22:11 2025 +0000 @@ -67,17 +67,17 @@ ENTRY(x86_mfence) ret END(x86_mfence) -#ifdef XEN -ENTRY(xen_mb) +ENTRY(paravirt_membar_sync) /* * Store-before-load ordering with respect to matching logic * on the hypervisor side. * * This is the same as membar_sync, but without hotpatching * away the LOCK prefix on uniprocessor boots -- because under - * Xen, we still have to coordinate with a `device' backed by a - * hypervisor that is potentially on another physical CPU even - * if we observe only one virtual CPU as the guest. + * virtualization, we still have to coordinate with a `device' + * backed by a hypervisor that is potentially on another + * physical CPU even if we observe only one virtual CPU as the + * guest. 
* * See common/lib/libc/arch/i386/atomic/atomic.S for * rationale and keep this in sync with the implementation @@ -86,7 +86,10 @@ ENTRY(xen_mb) lock addl $0,-4(%esp) ret -END(xen_mb) +END(paravirt_membar_sync) + +#ifdef XEN +STRONG_ALIAS(xen_mb,paravirt_membar_sync) #endif /* XEN */ #ifdef KDTRACE_HOOKS diff -r 969aab925f8a sys/arch/mips/mips/cpu_subr.c --- a/sys/arch/mips/mips/cpu_subr.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/mips/mips/cpu_subr.c Fri Sep 05 02:22:11 2025 +0000 @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -1195,3 +1196,37 @@ cpuwatch_clr(cpu_watchpoint_t *cwp) } #endif /* (MIPS32 + MIPS32R2 + MIPS64 + MIPS64R2) > 0 */ + +#if (MIPS2 + MIPS3 + MIPS4 + MIPS5 + MIPS32 + MIPS32R2 + MIPS64 + MIPS64R2) > 0 +void +paravirt_membar_sync(void) +{ + + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + * + * Prior to MIPS-II, there was no SYNC instruction.[1] CPUs + * with only MIPS-I presumably don't exist in multiprocessor + * configurations. But what if we're running a _kernel_ built + * for a uniprocessor MIPS-I CPU, as a virtual machine guest of + * a _host_ with a newer multiprocessor CPU? How do we enforce + * store-before-load ordering for a paravirtualized device + * driver, coordinating with host software `device' potentially + * on another CPU? You'll have to answer that before you can + * use virtio drivers! + * + * [1] MIPS32 Architecture For Programmers, Volume II: The + * MIPS32 Instruction Set, Document Number: MD00086, + * Revision 0.95, March 12, 2001, MIPS Technologies, p. 215 + */ + __asm volatile("sync"); +} +#endif /* !MIPS1 */ diff -r 969aab925f8a sys/arch/riscv/riscv/cpu_subr.c --- a/sys/arch/riscv/riscv/cpu_subr.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/riscv/riscv/cpu_subr.c Fri Sep 05 02:22:11 2025 +0000 @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -428,3 +429,21 @@ cpu_ipi(struct cpu_info *ci) } #endif + +void +paravirt_membar_sync(void) +{ + + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + */ + __asm volatile("fence rw,rw"); +} diff -r 969aab925f8a sys/arch/sparc/sparc/locore.s --- a/sys/arch/sparc/sparc/locore.s Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/sparc/sparc/locore.s Fri Sep 05 02:22:11 2025 +0000 @@ -6001,6 +6001,21 @@ Lpanic_spunout: .asciz "cpu%d: stuck on lock@%x" _ALIGN +ENTRY(paravirt_membar_sync) + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. 
+ * + * This is the same as membar_sync, but without + * conditionalizing away the LDSTUB instruction on uniprocessor + * builds -- because under virtualization, we still have to + * coordinate with a `device' backed by a hypervisor that is + * potentially on another physical CPU even if we observe only + * one virtual CPU as the guest. + */ + ldstub [%sp - 4], %g0 /* makeshift store-before-load barrier */ +END(paravirt_membar_sync) + #if defined(KGDB) || defined(DDB) || defined(DIAGNOSTIC) /* * Write all windows (user or otherwise), except the current one. diff -r 969aab925f8a sys/arch/sparc64/sparc64/locore.s --- a/sys/arch/sparc64/sparc64/locore.s Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/sparc64/sparc64/locore.s Fri Sep 05 02:22:11 2025 +0000 @@ -7948,6 +7948,26 @@ ENTRY(sparc64_ipi_ccall) #endif +ENTRY(paravirt_membar_sync) + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but without patching or + * conditionalizing away the MEMBAR instruction on uniprocessor + * builds or boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + * + * See common/lib/libc/arch/sparc64/atomic/membar_ops.S for why + * we avoid using the delay slot and keep this in sync with the + * implementation of membar_sync there. + */ + membar #StoreLoad + retl + nop +END(paravirt_membar_sync) .data _ALIGN diff -r 969aab925f8a sys/arch/virt68k/virt68k/locore.s --- a/sys/arch/virt68k/virt68k/locore.s Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/arch/virt68k/virt68k/locore.s Fri Sep 05 02:22:11 2025 +0000 @@ -598,6 +598,27 @@ ENTRY(ecacheon) ENTRY(ecacheoff) rts +ENTRY(paravirt_membar_sync) + /* + * Store-before-load ordering with respect to matching logic + * on the hypervisor side. + * + * This is the same as membar_sync, but guaranteed never to be + * conditionalized or hotpatched away even on uniprocessor + * builds and boots -- because under virtualization, we still + * have to coordinate with a `device' backed by a hypervisor + * that is potentially on another physical CPU even if we + * observe only one virtual CPU as the guest. + * + * I don't see an obvious ordering-only instruction in the m68k + * instruction set, but qemu implements CAS with + * store-before-load ordering, so this should work for virtio. + */ + clrl %d0 + casl %d0,%d0,%sp@ + rts +END(paravirt_membar_sync) + /* * Misc. global variables. 
*/ diff -r 969aab925f8a sys/dev/hyperv/vmbus.c --- a/sys/dev/hyperv/vmbus.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/dev/hyperv/vmbus.c Fri Sep 05 02:22:11 2025 +0000 @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -791,6 +792,7 @@ vmbus_message_proc(void *arg, struct cpu msg = (struct vmbus_message *)sc->sc_percpu[cpu_index(ci)].simp + VMBUS_SINT_MESSAGE; + /* XXX bus_dmamap_sync(POSTREAD|POSTWRITE) on msg_type */ if (__predict_false(msg->msg_type != HYPERV_MSGTYPE_NONE)) { if (__predict_true(!cold)) softint_schedule_cpu(sc->sc_msg_sih, ci); @@ -813,9 +815,12 @@ vmbus_message_softintr(void *arg) for (;;) { msg = (struct vmbus_message *)sc->sc_percpu[cpu].simp + VMBUS_SINT_MESSAGE; + /* XXX bus_dmamap_sync(POSTREAD|POSTWRITE) on msg_type */ if (msg->msg_type == HYPERV_MSGTYPE_NONE) break; + /* XXX bus_dmamap_sync(POSTREAD) on msg_data */ + hdr = (struct vmbus_chanmsg_hdr *)msg->msg_data; type = hdr->chm_type; if (type >= VMBUS_CHANMSG_COUNT) { @@ -831,10 +836,22 @@ vmbus_message_softintr(void *arg) } } + /* XXX bus_dmamap_sync(PREREAD) on msg_data */ + msg->msg_type = HYPERV_MSGTYPE_NONE; - membar_sync(); + /* XXX bus_dmamap_sync(PREWRITE|PREREAD) on msg_type */ + + /* + * Ensure we tell the host that this message is done + * before we check whether the host told us there are + * more pending. + */ + paravirt_membar_sync(); + + /* XXX bus_dmamap_sync(POSTREAD) on msg_flags */ if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) hyperv_send_eom(); + /* XXX bus_dmamap_sync(PREREAD) on msg_flags */ } } @@ -1655,8 +1672,10 @@ static __inline void vmbus_ring_avail(struct vmbus_ring_data *rd, uint32_t *towrite, uint32_t *toread) { + /* XXX bus_dmamap_sync(POSTREAD) on br_rindex/br_windex */ uint32_t ridx = rd->rd_ring->br_rindex; uint32_t widx = rd->rd_ring->br_windex; + /* XXX bus_dmamap_sync(PREREAD) on br_rindex/br_windex */ uint32_t r, w; if (widx >= ridx) @@ -1674,7 +1693,9 @@ static bool vmbus_ring_is_empty(struct vmbus_ring_data *rd) { + /* XXX bus_dmamap_sync(POSTREAD) on br_rindex/br_windex */ return rd->rd_ring->br_rindex == rd->rd_ring->br_windex; + /* XXX bus_dmamap_sync(PREREAD) on br_rindex/br_windex */ } static int @@ -1698,15 +1719,27 @@ vmbus_ring_write(struct vmbus_ring_data oprod = wrd->rd_prod; + /* XXX bus_dmamap_sync(POSTWRITE) on ring data */ + for (i = 0; i < iov_cnt; i++) vmbus_ring_put(wrd, iov[i].iov_base, iov[i].iov_len); indices = (uint64_t)oprod << 32; vmbus_ring_put(wrd, (uint8_t *)&indices, sizeof(indices)); - membar_sync(); + /* XXX bus_dmamap_sync(PREWRITE) on ring data */ + + membar_sync(); /* XXX bus_dmamap_sync(POSTWRITE) on br_windex */ wrd->rd_ring->br_windex = wrd->rd_prod; - membar_sync(); + /* XXX bus_dmamap_sync(PREWRITE) on br_windex */ + + /* + * Ensure we publish the producer index _before_ we check + * whether the host needs to be notified. 
+ */ + paravirt_membar_sync(); + + /* XXX bus_dmamap_sync(POSTREAD) on br_rindex */ /* Signal when the ring transitions from being empty to non-empty */ if (wrd->rd_ring->br_imask == 0 && @@ -1715,6 +1748,8 @@ vmbus_ring_write(struct vmbus_ring_data else *needsig = 0; + /* XXX bus_dmamap_sync(PREREAD) on br_rindex */ + return 0; } @@ -1874,6 +1909,8 @@ vmbus_ring_read(struct vmbus_ring_data * return EAGAIN; } + /* XXX bus_dmamap_sync(POSTREAD) on ring data */ + if (offset) { rrd->rd_cons += offset; if (rrd->rd_cons >= rrd->rd_dsize) @@ -1883,8 +1920,11 @@ vmbus_ring_read(struct vmbus_ring_data * vmbus_ring_get(rrd, (uint8_t *)data, datalen, 0); vmbus_ring_get(rrd, (uint8_t *)&indices, sizeof(indices), 0); - membar_sync(); + /* XXX bus_dmamap_sync(PREREAD) on ring data */ + + membar_sync(); /* XXX bus_dmamap_sync(POSTWRITE) on br_rindex */ rrd->rd_ring->br_rindex = rrd->rd_cons; + /* XXX bus_dmamap_sync(PREWRITE) on br_rindex */ return 0; } @@ -1931,18 +1971,18 @@ static inline void vmbus_ring_mask(struct vmbus_ring_data *rd) { - membar_sync(); + membar_sync(); /* XXX bus_dmamap_sync(POSTWRITE) on br_imask */ rd->rd_ring->br_imask = 1; - membar_sync(); + membar_sync(); /* XXX bus_dmamap_sync(PREWRITE) on br_imask */ } static inline void vmbus_ring_unmask(struct vmbus_ring_data *rd) { - membar_sync(); + membar_sync(); /* XXX bus_dmamap_sync(POSTWRITE) on br_imask */ rd->rd_ring->br_imask = 0; - membar_sync(); + membar_sync(); /* XXX bus_dmamap_sync(PREWRITE) on br_imask */ } void @@ -1962,6 +2002,14 @@ vmbus_channel_unpause(struct vmbus_chann atomic_and_ulong(&ch->ch_sc->sc_evtmask[ch->ch_id / VMBUS_EVTFLAG_LEN], ~__BIT(ch->ch_id % VMBUS_EVTFLAG_LEN)); vmbus_ring_unmask(&ch->ch_rrd); + + /* + * Ensure we announce to the host side that we are accepting + * interrupts _before_ we check whether any pending events had + * come over the ring while we weren't accepting interrupts. + */ + paravirt_membar_sync(); + vmbus_ring_avail(&ch->ch_rrd, NULL, &avail); return avail; diff -r 969aab925f8a sys/dev/pci/pvscsi.c --- a/sys/dev/pci/pvscsi.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/dev/pci/pvscsi.c Fri Sep 05 02:22:11 2025 +0000 @@ -63,13 +63,13 @@ in the file called LICENSE.GPL. 
#include -#include #include #include #include #include #include #include +#include #include #include #include @@ -320,6 +320,18 @@ CFATTACH_DECL3_NEW(pvscsi, sizeof(struct pvscsi_probe, pvscsi_attach, pvscsi_detach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); +#define PVSCSI_DMA_SYNC_STATE(sc, dma, structptr, member, ops) \ + bus_dmamap_sync((sc)->sc_dmat, (dma)->map, \ + /*offset*/offsetof(__typeof__(*(structptr)), member), \ + /*length*/sizeof((structptr)->member), \ + (ops)) + +#define PVSCSI_DMA_SYNC_RING(sc, dma, ring, idx, ops) \ + bus_dmamap_sync((sc)->sc_dmat, (dma)->map, \ + /*offset*/sizeof(*(ring)) * (idx), \ + /*length*/sizeof(*(ring)), \ + (ops)) + static inline uint32_t pvscsi_reg_read(struct pvscsi_softc *sc, uint32_t offset) { @@ -371,6 +383,7 @@ pvscsi_intr_disable(struct pvscsi_softc static void pvscsi_kick_io(struct pvscsi_softc *sc, uint8_t cdb0) { + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; DEBUG_PRINTF(2, sc->dev, "%s: cdb0 %#x\n", __func__, cdb0); @@ -378,8 +391,18 @@ pvscsi_kick_io(struct pvscsi_softc *sc, cdb0 == READ_12 || cdb0 == READ_16 || cdb0 == SCSI_WRITE_6_COMMAND || cdb0 == WRITE_10 || cdb0 == WRITE_12 || cdb0 == WRITE_16) { + s_dma = &sc->rings_state_dma; s = sc->rings_state; + /* + * Ensure the command has been published before we read + * req_cons_idx to test whether we need to kick the + * host. + */ + paravirt_membar_sync(); + + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_cons_idx, + BUS_DMASYNC_POSTREAD); DEBUG_PRINTF(2, sc->dev, "%s req prod %d cons %d\n", __func__, s->req_prod_idx, s->req_cons_idx); if (!sc->use_req_call_threshold || @@ -390,8 +413,14 @@ pvscsi_kick_io(struct pvscsi_softc *sc, } else { DEBUG_PRINTF(2, sc->dev, "wtf\n"); } + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_cons_idx, + BUS_DMASYNC_PREREAD); } else { s = sc->rings_state; + /* + * XXX req_cons_idx in debug log might be stale, but no + * need for DMA sync otherwise in this branch + */ DEBUG_PRINTF(1, sc->dev, "%s req prod %d cons %d not checked\n", __func__, s->req_prod_idx, s->req_cons_idx); @@ -497,6 +526,15 @@ static int pvscsi_setup_req_call(struct &cmd, sizeof(cmd)); status = pvscsi_reg_read(sc, PVSCSI_REG_OFFSET_COMMAND_STATUS); + /* + * After setup, sync req_call_threshold before use. + * After this point it should be stable, so no need to + * sync again during use. 
+ */ + PVSCSI_DMA_SYNC_STATE(sc, &sc->rings_state_dma, + sc->rings_state, req_call_threshold, + BUS_DMASYNC_POSTREAD); + return (status != 0); } else { return (0); @@ -585,6 +623,10 @@ pvscsi_dma_alloc_ppns(struct pvscsi_soft return (error); } + memset(dma->vaddr, 0, num_pages * PAGE_SIZE); + bus_dmamap_sync(sc->sc_dmat, dma->map, 0, num_pages * PAGE_SIZE, + BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE); + ppn = dma->paddr >> PAGE_SHIFT; for (i = 0; i < num_pages; i++) { ppn_list[i] = ppn + i; @@ -681,6 +723,16 @@ static void pvscsi_free_rings(struct pvscsi_softc *sc) { + bus_dmamap_sync(sc->sc_dmat, sc->rings_state_dma.map, + 0, sc->rings_state_dma.size, + BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); + bus_dmamap_sync(sc->sc_dmat, sc->req_ring_dma.map, + 0, sc->req_ring_dma.size, + BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); + bus_dmamap_sync(sc->sc_dmat, sc->cmp_ring_dma.map, + 0, sc->cmp_ring_dma.size, + BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); + pvscsi_dma_free(sc, &sc->rings_state_dma); pvscsi_dma_free(sc, &sc->req_ring_dma); pvscsi_dma_free(sc, &sc->cmp_ring_dma); @@ -762,6 +814,18 @@ pvscsi_setup_rings(struct pvscsi_softc * } pvscsi_write_cmd(sc, PVSCSI_CMD_SETUP_RINGS, &cmd, sizeof(cmd)); + + /* + * After setup, sync *_num_entries_log2 before use. After this + * point they should be stable, so no need to sync again during + * use. + */ + PVSCSI_DMA_SYNC_STATE(sc, &sc->rings_state_dma, + sc->rings_state, req_num_entries_log2, + BUS_DMASYNC_POSTREAD); + PVSCSI_DMA_SYNC_STATE(sc, &sc->rings_state_dma, + sc->rings_state, cmp_num_entries_log2, + BUS_DMASYNC_POSTREAD); } static int @@ -792,6 +856,15 @@ pvscsi_setup_msg_ring(struct pvscsi_soft } pvscsi_write_cmd(sc, PVSCSI_CMD_SETUP_MSG_RING, &cmd, sizeof(cmd)); + + /* + * After setup, sync msg_num_entries_log2 before use. After + * this point it should be stable, so no need to sync again + * during use. + */ + PVSCSI_DMA_SYNC_STATE(sc, &sc->rings_state_dma, + sc->rings_state, msg_num_entries_log2, + BUS_DMASYNC_POSTREAD); } static void @@ -1078,26 +1151,36 @@ pvscsi_process_completion(struct pvscsi_ static void pvscsi_process_cmp_ring(struct pvscsi_softc *sc) { + struct pvscsi_dma *ring_dma; struct pvscsi_ring_cmp_desc *ring; + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; struct pvscsi_ring_cmp_desc *e; uint32_t mask; KASSERT(mutex_owned(&sc->lock)); + s_dma = &sc->rings_state_dma; s = sc->rings_state; + ring_dma = &sc->cmp_ring_dma; ring = sc->cmp_ring; mask = MASK(s->cmp_num_entries_log2); - while (true) { + for (;;) { + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx, + BUS_DMASYNC_POSTREAD); size_t crpidx = s->cmp_prod_idx; - membar_acquire(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx, + BUS_DMASYNC_PREREAD); if (s->cmp_cons_idx == crpidx) break; size_t crcidx = s->cmp_cons_idx & mask; + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, crcidx, + BUS_DMASYNC_POSTREAD); + e = ring + crcidx; pvscsi_process_completion(sc, e); @@ -1106,8 +1189,19 @@ pvscsi_process_cmp_ring(struct pvscsi_so * ensure completion processing reads happen before write to * (increment of) cmp_cons_idx */ - membar_release(); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, crcidx, + BUS_DMASYNC_PREREAD); + + /* + * XXX Not actually sure the `device' does DMA for + * s->cmp_cons_idx at all -- qemu doesn't. If not, we + * can skip these DMA syncs. 
+ */ + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx, + BUS_DMASYNC_POSTWRITE); s->cmp_cons_idx++; + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx, + BUS_DMASYNC_PREWRITE); } } @@ -1152,26 +1246,36 @@ pvscsi_process_msg(struct pvscsi_softc * static void pvscsi_process_msg_ring(struct pvscsi_softc *sc) { + struct pvscsi_dma *ring_dma; struct pvscsi_ring_msg_desc *ring; + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; struct pvscsi_ring_msg_desc *e; uint32_t mask; KASSERT(mutex_owned(&sc->lock)); + s_dma = &sc->rings_state_dma; s = sc->rings_state; + ring_dma = &sc->msg_ring_dma; ring = sc->msg_ring; mask = MASK(s->msg_num_entries_log2); - while (true) { + for (;;) { + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_prod_idx, + BUS_DMASYNC_POSTREAD); size_t mpidx = s->msg_prod_idx; // dma read (device -> cpu) - membar_acquire(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_prod_idx, + BUS_DMASYNC_PREREAD); if (s->msg_cons_idx == mpidx) break; size_t mcidx = s->msg_cons_idx & mask; + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, mcidx, + BUS_DMASYNC_POSTREAD); + e = ring + mcidx; pvscsi_process_msg(sc, e); @@ -1180,8 +1284,14 @@ pvscsi_process_msg_ring(struct pvscsi_so * ensure message processing reads happen before write to * (increment of) msg_cons_idx */ - membar_release(); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, mcidx, + BUS_DMASYNC_PREREAD); + + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_cons_idx, + BUS_DMASYNC_POSTWRITE); s->msg_cons_idx++; + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_cons_idx, + BUS_DMASYNC_PREWRITE); } } @@ -1244,8 +1354,10 @@ pvscsi_scsipi_request(struct scsipi_chan #endif uint32_t req_num_entries_log2; + struct pvscsi_dma *ring_dma; struct pvscsi_ring_req_desc *ring; struct pvscsi_ring_req_desc *e; + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; struct pvscsi_hcb *hcb; @@ -1258,7 +1370,9 @@ pvscsi_scsipi_request(struct scsipi_chan return; } + ring_dma = &sc->req_ring_dma; ring = sc->req_ring; + s_dma = &sc->rings_state_dma; s = sc->rings_state; hcb = NULL; @@ -1292,6 +1406,7 @@ pvscsi_scsipi_request(struct scsipi_chan hcb->xs = xs; const size_t rridx = s->req_prod_idx & MASK(req_num_entries_log2); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, rridx, BUS_DMASYNC_POSTWRITE); e = ring + rridx; memset(e, 0, sizeof(*e)); @@ -1391,7 +1506,7 @@ pvscsi_scsipi_request(struct scsipi_chan * Ensure request record writes happen before write to (increment of) * req_prod_idx. */ - membar_producer(); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, rridx, BUS_DMASYNC_PREWRITE); uint8_t cdb0 = e->cdb[0]; @@ -1404,13 +1519,16 @@ pvscsi_scsipi_request(struct scsipi_chan callout_reset(&xs->xs_callout, timeout, pvscsi_timeout, hcb); } + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_prod_idx, + BUS_DMASYNC_POSTWRITE); s->req_prod_idx++; /* * Ensure req_prod_idx write (increment) happens before * IO is kicked (via a write). */ - membar_producer(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_prod_idx, + BUS_DMASYNC_PREWRITE); pvscsi_kick_io(sc, cdb0); mutex_exit(&sc->lock); diff -r 969aab925f8a sys/dev/pci/virtio.c --- a/sys/dev/pci/virtio.c Mon Sep 01 04:47:03 2025 +0000 +++ b/sys/dev/pci/virtio.c Fri Sep 05 02:22:11 2025 +0000 @@ -38,6 +38,7 @@ #include #include #include +#include #define VIRTIO_PRIVATE @@ -708,6 +709,13 @@ virtio_start_vq_intr(struct virtio_softc } vq->vq_queued++; + /* + * Ensure we announce to the host side that we are accepting + * interrupts _before_ we check whether any pending events had + * come over the queue while we weren't accepting interrupts. 
+	 */
+	paravirt_membar_sync();
+
 	vq_sync_uring_header(sc, vq, BUS_DMASYNC_POSTREAD);
 	if (vq->vq_used_idx == virtio_rw16(sc, vq->vq_used->idx))
 		return 0;
@@ -1252,6 +1260,12 @@ notify:
 		vq_sync_aring_header(sc, vq, BUS_DMASYNC_PREWRITE);
 		vq->vq_queued++;
 
+		/*
+		 * Ensure we publish the avail idx _before_ we check whether
+		 * the host needs to be notified.
+		 */
+		paravirt_membar_sync();
+
 		if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) {
 			vq_sync_uring_avail(sc, vq, BUS_DMASYNC_POSTREAD);
 			t = virtio_rw16(sc, *vq->vq_avail_event) + 1;
diff -r 969aab925f8a sys/sys/paravirt_membar.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/sys/paravirt_membar.h	Fri Sep 05 02:22:11 2025 +0000
@@ -0,0 +1,34 @@
+/*	$NetBSD$	*/
+
+/*-
+ * Copyright (c) 2025 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_SYS_PARAVIRT_MEMBAR_H_
+#define	_SYS_PARAVIRT_MEMBAR_H_
+
+void	paravirt_membar_sync(void);
+
+#endif	/* _SYS_PARAVIRT_MEMBAR_H_ */
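
Usage note: the protocol that the virtio(4) and vmbus(4) hunks above implement -- publish the producer index, issue paravirt_membar_sync(), then read the host's notification-suppression flag -- is sketched below.  This is an illustrative sketch only, not part of the changeset: the ring layout, the pv_softc fields, and pv_notify_doorbell() are hypothetical stand-ins for whatever a real paravirtualized device defines; only bus_dmamap_sync(9) and paravirt_membar_sync() are real interfaces, and the bus_dmamap_sync() calls assume a single DMA map covering the whole ring.

/*
 * Illustrative sketch only; ring layout, softc fields, and
 * pv_notify_doorbell() are hypothetical.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/paravirt_membar.h>

struct pv_ring {
	volatile uint32_t	pr_prod_idx;	/* guest writes, host reads */
	volatile uint32_t	pr_no_notify;	/* host writes: 1 = skip doorbell */
	/* ... descriptors follow ... */
};

struct pv_softc {
	bus_dma_tag_t		sc_dmat;
	bus_dmamap_t		sc_ring_map;
	struct pv_ring		*sc_ring;
	uint32_t		sc_prod;
};

void	pv_notify_doorbell(struct pv_softc *);	/* hypothetical doorbell write */

static void
pv_submit(struct pv_softc *sc)
{
	struct pv_ring *r = sc->sc_ring;

	/* Publish the new producer index to the host. */
	r->pr_prod_idx = ++sc->sc_prod;
	bus_dmamap_sync(sc->sc_dmat, sc->sc_ring_map,
	    offsetof(struct pv_ring, pr_prod_idx), sizeof(r->pr_prod_idx),
	    BUS_DMASYNC_PREWRITE);

	/*
	 * Store-before-load: make the producer index visible to the
	 * host before reading its notification-suppression flag.
	 * membar_sync(3) may be patched to a no-op on a uniprocessor
	 * guest, so use paravirt_membar_sync() instead.
	 */
	paravirt_membar_sync();

	bus_dmamap_sync(sc->sc_dmat, sc->sc_ring_map,
	    offsetof(struct pv_ring, pr_no_notify), sizeof(r->pr_no_notify),
	    BUS_DMASYNC_POSTREAD);
	if (r->pr_no_notify == 0)
		pv_notify_doorbell(sc);		/* expensive trap to host */
}

On the consumer side the same barrier sits between re-enabling host interrupts and re-checking the producer index, as in virtio_start_vq_intr() and vmbus_channel_unpause() above.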