# HG changeset patch # User Taylor R Campbell # Date 1756726485 0 # Mon Sep 01 11:34:45 2025 +0000 # Branch trunk # Node ID 7a1a303e95d5f18e803aed54e836ea88574cba55 # Parent f039b59231796487b407c54ce79bb83301339033 # EXP-Topic riastradh-pr59618-paravirtmbsync pvscsi(4): Use paravirt_membar(9) as needed. PR kern/59618: occasional virtio block device lock ups/hangs diff -r f039b5923179 -r 7a1a303e95d5 sys/dev/pci/pvscsi.c --- a/sys/dev/pci/pvscsi.c Sun Aug 31 21:35:14 2025 +0000 +++ b/sys/dev/pci/pvscsi.c Mon Sep 01 11:34:45 2025 +0000 @@ -70,6 +70,7 @@ in the file called LICENSE.GPL. #include #include #include +#include #include #include #include @@ -380,6 +381,12 @@ pvscsi_kick_io(struct pvscsi_softc *sc, cdb0 == WRITE_12 || cdb0 == WRITE_16) { s = sc->rings_state; + /* + * Ensure the command has been published before we test + * whether we need to kick the host. + */ + paravirt_membar_sync(); + DEBUG_PRINTF(2, sc->dev, "%s req prod %d cons %d\n", __func__, s->req_prod_idx, s->req_cons_idx); if (!sc->use_req_call_threshold || # HG changeset patch # User Taylor R Campbell # Date 1756730607 0 # Mon Sep 01 12:43:27 2025 +0000 # Branch trunk # Node ID b17e8da5094454def7b1084baf52d5bc367bf625 # Parent 7a1a303e95d5f18e803aed54e836ea88574cba55 # EXP-Topic riastradh-pr59618-paravirtmbsync pvscsi(4): Use bus_dmamap_sync, not membar_*, for DMA. membar_* may be a noop if we're booting on a single _virtual_ CPU, but the barriers are still needed in case the host behind pvscsi(4) is running on another _physical_ CPU. Prompted by (and related to but not the same issue as): PR kern/59618: occasional virtio block device lock ups/hangs diff -r 7a1a303e95d5 -r b17e8da50944 sys/dev/pci/pvscsi.c --- a/sys/dev/pci/pvscsi.c Mon Sep 01 11:34:45 2025 +0000 +++ b/sys/dev/pci/pvscsi.c Mon Sep 01 12:43:27 2025 +0000 @@ -63,7 +63,6 @@ in the file called LICENSE.GPL. 
#include -#include #include #include #include @@ -684,6 +683,18 @@ fail: return (error); } +#define PVSCSI_DMA_SYNC_STATE(sc, dma, structptr, member, ops) \ + bus_dmamap_sync((sc)->sc_dmat, (dma)->map, \ + /*offset*/offsetof(__typeof__(*(structptr)), member), \ + /*length*/sizeof((structptr)->member), \ + (ops)) + +#define PVSCSI_DMA_SYNC_RING(sc, dma, ring, idx, ops) \ + bus_dmamap_sync((sc)->sc_dmat, (dma)->map, \ + /*offset*/sizeof(*(ring)) * (idx), \ + /*length*/sizeof(*(ring)), \ + (ops)) + static void pvscsi_free_rings(struct pvscsi_softc *sc) { @@ -1085,36 +1096,52 @@ pvscsi_process_completion(struct pvscsi_ static void pvscsi_process_cmp_ring(struct pvscsi_softc *sc) { + struct pvscsi_dma *ring_dma; struct pvscsi_ring_cmp_desc *ring; + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; struct pvscsi_ring_cmp_desc *e; uint32_t mask; KASSERT(mutex_owned(&sc->lock)); + s_dma = &sc->rings_state_dma; s = sc->rings_state; + ring_dma = &sc->cmp_ring_dma; ring = sc->cmp_ring; mask = MASK(s->cmp_num_entries_log2); - while (true) { + for (;;) { + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx, + BUS_DMASYNC_POSTREAD); size_t crpidx = s->cmp_prod_idx; - membar_acquire(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx, + BUS_DMASYNC_PREREAD); if (s->cmp_cons_idx == crpidx) break; size_t crcidx = s->cmp_cons_idx & mask; + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, crcidx, + BUS_DMASYNC_POSTREAD); + e = ring + crcidx; pvscsi_process_completion(sc, e); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, crcidx, + BUS_DMASYNC_PREREAD); + /* * ensure completion processing reads happen before write to * (increment of) cmp_cons_idx */ - membar_release(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx, + BUS_DMASYNC_POSTWRITE); s->cmp_cons_idx++; + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx, + BUS_DMASYNC_PREWRITE); } } @@ -1159,36 +1186,52 @@ pvscsi_process_msg(struct pvscsi_softc * static void pvscsi_process_msg_ring(struct pvscsi_softc *sc) { + struct pvscsi_dma 
*ring_dma; struct pvscsi_ring_msg_desc *ring; + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; struct pvscsi_ring_msg_desc *e; uint32_t mask; KASSERT(mutex_owned(&sc->lock)); + s_dma = &sc->rings_state_dma; s = sc->rings_state; + ring_dma = &sc->msg_ring_dma; ring = sc->msg_ring; mask = MASK(s->msg_num_entries_log2); - while (true) { + for (;;) { + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_prod_idx, + BUS_DMASYNC_POSTREAD); size_t mpidx = s->msg_prod_idx; // dma read (device -> cpu) - membar_acquire(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_prod_idx, + BUS_DMASYNC_PREREAD); if (s->msg_cons_idx == mpidx) break; size_t mcidx = s->msg_cons_idx & mask; + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, mcidx, + BUS_DMASYNC_POSTREAD); + e = ring + mcidx; pvscsi_process_msg(sc, e); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, mcidx, + BUS_DMASYNC_PREREAD); + /* * ensure message processing reads happen before write to * (increment of) msg_cons_idx */ - membar_release(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_cons_idx, + BUS_DMASYNC_POSTWRITE); s->msg_cons_idx++; + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_cons_idx, + BUS_DMASYNC_PREWRITE); } } @@ -1251,8 +1294,10 @@ pvscsi_scsipi_request(struct scsipi_chan #endif uint32_t req_num_entries_log2; + struct pvscsi_dma *ring_dma; struct pvscsi_ring_req_desc *ring; struct pvscsi_ring_req_desc *e; + struct pvscsi_dma *s_dma; struct pvscsi_rings_state *s; struct pvscsi_hcb *hcb; @@ -1265,7 +1310,9 @@ pvscsi_scsipi_request(struct scsipi_chan return; } + ring_dma = &sc->req_ring_dma; ring = sc->req_ring; + s_dma = &sc->rings_state_dma; s = sc->rings_state; hcb = NULL; @@ -1299,6 +1346,7 @@ pvscsi_scsipi_request(struct scsipi_chan hcb->xs = xs; const size_t rridx = s->req_prod_idx & MASK(req_num_entries_log2); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, rridx, BUS_DMASYNC_POSTWRITE); e = ring + rridx; memset(e, 0, sizeof(*e)); @@ -1398,7 +1446,7 @@ pvscsi_scsipi_request(struct scsipi_chan * Ensure request record writes happen before 
write to (increment of) * req_prod_idx. */ - membar_producer(); + PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, rridx, BUS_DMASYNC_PREWRITE); uint8_t cdb0 = e->cdb[0]; @@ -1411,13 +1459,16 @@ pvscsi_scsipi_request(struct scsipi_chan callout_reset(&xs->xs_callout, timeout, pvscsi_timeout, hcb); } + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_prod_idx, + BUS_DMASYNC_POSTWRITE); s->req_prod_idx++; /* * Ensure req_prod_idx write (increment) happens before * IO is kicked (via a write). */ - membar_producer(); + PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_prod_idx, + BUS_DMASYNC_PREWRITE); pvscsi_kick_io(sc, cdb0); mutex_exit(&sc->lock); # HG changeset patch # User Taylor R Campbell # Date 1756730719 0 # Mon Sep 01 12:45:19 2025 +0000 # Branch trunk # Node ID dc6327910059a766c03d5b376fe720009bc7f1e9 # Parent b17e8da5094454def7b1084baf52d5bc367bf625 # EXP-Topic riastradh-pr59618-paravirtmbsync pvscsi(4): Zero rings before using them. bus_dmamem_alloc(9) doesn't guarantee zeroing, as far as I can tell, and who knows what might happen if the ring header and contents have anything nonzero initially. Insert an initial preread/prewrite sync between zeroing and first use, and, out of paranoia, a final postread/postwrite sync between last use and unload/free. 
Prompted by (but not really related to): PR kern/59618: occasional virtio block device lock ups/hangs diff -r b17e8da50944 -r dc6327910059 sys/dev/pci/pvscsi.c --- a/sys/dev/pci/pvscsi.c Mon Sep 01 12:43:27 2025 +0000 +++ b/sys/dev/pci/pvscsi.c Mon Sep 01 12:45:19 2025 +0000 @@ -591,6 +591,10 @@ pvscsi_dma_alloc_ppns(struct pvscsi_soft return (error); } + memset(dma->vaddr, 0, num_pages * PAGE_SIZE); + bus_dmamap_sync(sc->sc_dmat, dma->map, 0, num_pages * PAGE_SIZE, + BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE); + ppn = dma->paddr >> PAGE_SHIFT; for (i = 0; i < num_pages; i++) { ppn_list[i] = ppn + i; @@ -699,6 +703,16 @@ static void pvscsi_free_rings(struct pvscsi_softc *sc) { + bus_dmamap_sync(sc->sc_dmat, sc->rings_state_dma.map, + 0, sc->rings_state_dma.size, + BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); + bus_dmamap_sync(sc->sc_dmat, sc->req_ring_dma.map, + 0, sc->req_ring_dma.size, + BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); + bus_dmamap_sync(sc->sc_dmat, sc->cmp_ring_dma.map, + 0, sc->cmp_ring_dma.size, + BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); + pvscsi_dma_free(sc, &sc->rings_state_dma); pvscsi_dma_free(sc, &sc->req_ring_dma); pvscsi_dma_free(sc, &sc->cmp_ring_dma);