# HG changeset patch
# User Taylor R Campbell <riastradh@NetBSD.org>
# Date 1756726485 0
#      Mon Sep 01 11:34:45 2025 +0000
# Branch trunk
# Node ID 7a1a303e95d5f18e803aed54e836ea88574cba55
# Parent  f039b59231796487b407c54ce79bb83301339033
# EXP-Topic riastradh-pr59618-paravirtmbsync
pvscsi(4): Use paravirt_membar(9) as needed.

PR kern/59618: occasional virtio block device lock ups/hangs

diff -r f039b5923179 -r 7a1a303e95d5 sys/dev/pci/pvscsi.c
--- a/sys/dev/pci/pvscsi.c	Sun Aug 31 21:35:14 2025 +0000
+++ b/sys/dev/pci/pvscsi.c	Mon Sep 01 11:34:45 2025 +0000
@@ -70,6 +70,7 @@ in the file called LICENSE.GPL.
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -380,6 +381,12 @@ pvscsi_kick_io(struct pvscsi_softc *sc,
 	    cdb0 == WRITE_12 || cdb0 == WRITE_16) {
 		s = sc->rings_state;
 
+		/*
+		 * Ensure the command has been published before we test
+		 * whether we need to kick the host.
+		 */
+		paravirt_membar_sync();
+
 		DEBUG_PRINTF(2, sc->dev, "%s req prod %d cons %d\n", __func__,
 		    s->req_prod_idx, s->req_cons_idx);
 		if (!sc->use_req_call_threshold ||
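Note on the change above: the race it closes is a store-before-load
reordering between guest and host.  A rough sketch of the pattern, not the
driver's actual code (kick() is a hypothetical stand-in for the doorbell
register write, and the threshold test is simplified; the rings-state fields
and paravirt_membar_sync() are as in the patch):

	/* Guest publishes a request, then decides whether to notify. */
	s->req_prod_idx++;			/* store: publish request */

	/*
	 * Without a full store-before-load barrier here, the loads
	 * below can be satisfied before the store above is visible to
	 * the host: the guest sees a stale consumer index, concludes
	 * the host is still polling, and skips the kick while the host
	 * has already gone idle; the ring then stalls.
	 */
	paravirt_membar_sync();

	if (s->req_prod_idx - s->req_cons_idx >= s->req_call_threshold)
		kick();			/* hypothetical doorbell write */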
# HG changeset patch
# User Taylor R Campbell <riastradh@NetBSD.org>
# Date 1756730607 0
#      Mon Sep 01 12:43:27 2025 +0000
# Branch trunk
# Node ID 29dccdef1d779245c79a5086dfaa6a38c2ec2a15
# Parent  7a1a303e95d5f18e803aed54e836ea88574cba55
# EXP-Topic riastradh-pr59618-paravirtmbsync
pvscsi(4): Use bus_dmamap_sync, not membar_*, for DMA.

membar_* may be a noop if we're booting on a single _virtual_ CPU, but
the barriers are still needed in case the host behind pvscsi(4) is
running on another _physical_ CPU.

Prompted by (and related to but not the same issue as):

PR kern/59618: occasional virtio block device lock ups/hangs

diff -r 7a1a303e95d5 -r 29dccdef1d77 sys/dev/pci/pvscsi.c
--- a/sys/dev/pci/pvscsi.c	Mon Sep 01 11:34:45 2025 +0000
+++ b/sys/dev/pci/pvscsi.c	Mon Sep 01 12:43:27 2025 +0000
@@ -63,7 +63,6 @@ in the file called LICENSE.GPL.
 
 #include
-#include
 #include
 #include
 #include
 
@@ -321,6 +320,18 @@ CFATTACH_DECL3_NEW(pvscsi, sizeof(struct
     pvscsi_probe, pvscsi_attach, pvscsi_detach, NULL, NULL, NULL,
     DVF_DETACH_SHUTDOWN);
 
+#define	PVSCSI_DMA_SYNC_STATE(sc, dma, structptr, member, ops)		\
+	bus_dmamap_sync((sc)->sc_dmat, (dma)->map,			\
+	    /*offset*/offsetof(__typeof__(*(structptr)), member),	\
+	    /*length*/sizeof((structptr)->member),			\
+	    (ops))
+
+#define	PVSCSI_DMA_SYNC_RING(sc, dma, ring, idx, ops)			\
+	bus_dmamap_sync((sc)->sc_dmat, (dma)->map,			\
+	    /*offset*/sizeof(*(ring)) * (idx),				\
+	    /*length*/sizeof(*(ring)),					\
+	    (ops))
+
 static inline uint32_t
 pvscsi_reg_read(struct pvscsi_softc *sc, uint32_t offset)
 {
@@ -372,6 +383,7 @@ pvscsi_intr_disable(struct pvscsi_softc
 static void
 pvscsi_kick_io(struct pvscsi_softc *sc, uint8_t cdb0)
 {
+	struct pvscsi_dma *s_dma;
 	struct pvscsi_rings_state *s;
 
 	DEBUG_PRINTF(2, sc->dev, "%s: cdb0 %#x\n", __func__, cdb0);
@@ -379,6 +391,7 @@ pvscsi_kick_io(struct pvscsi_softc *sc,
 	    cdb0 == READ_12 || cdb0 == READ_16 ||
 	    cdb0 == SCSI_WRITE_6_COMMAND || cdb0 == WRITE_10 ||
 	    cdb0 == WRITE_12 || cdb0 == WRITE_16) {
+		s_dma = &sc->rings_state_dma;
 		s = sc->rings_state;
 
 		/*
@@ -387,6 +400,10 @@ pvscsi_kick_io(struct pvscsi_softc *sc,
 		 */
 		paravirt_membar_sync();
 
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_cons_idx,
+		    BUS_DMASYNC_POSTREAD);
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_call_threshold,
+		    BUS_DMASYNC_POSTREAD);
 		DEBUG_PRINTF(2, sc->dev, "%s req prod %d cons %d\n", __func__,
 		    s->req_prod_idx, s->req_cons_idx);
 		if (!sc->use_req_call_threshold ||
@@ -397,6 +414,10 @@ pvscsi_kick_io(struct pvscsi_softc *sc,
 		} else {
 			DEBUG_PRINTF(2, sc->dev, "wtf\n");
 		}
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_cons_idx,
+		    BUS_DMASYNC_PREREAD);
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_call_threshold,
+		    BUS_DMASYNC_PREREAD);
 	} else {
 		s = sc->rings_state;
 		DEBUG_PRINTF(1, sc->dev, "%s req prod %d cons %d not checked\n", __func__,
@@ -1085,36 +1106,56 @@ pvscsi_process_completion(struct pvscsi_
 static void
 pvscsi_process_cmp_ring(struct pvscsi_softc *sc)
 {
+	struct pvscsi_dma *ring_dma;
 	struct pvscsi_ring_cmp_desc *ring;
+	struct pvscsi_dma *s_dma;
 	struct pvscsi_rings_state *s;
 	struct pvscsi_ring_cmp_desc *e;
 	uint32_t mask;
 
 	KASSERT(mutex_owned(&sc->lock));
 
+	s_dma = &sc->rings_state_dma;
 	s = sc->rings_state;
+	ring_dma = &sc->cmp_ring_dma;
 	ring = sc->cmp_ring;
 
+	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_num_entries_log2,
+	    BUS_DMASYNC_POSTREAD);
 	mask = MASK(s->cmp_num_entries_log2);
+	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_num_entries_log2,
+	    BUS_DMASYNC_PREREAD);
 
-	while (true) {
+	for (;;) {
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx,
+		    BUS_DMASYNC_POSTREAD);
 		size_t crpidx = s->cmp_prod_idx;
-		membar_acquire();
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx,
+		    BUS_DMASYNC_PREREAD);
 
 		if (s->cmp_cons_idx == crpidx)
 			break;
 
 		size_t crcidx = s->cmp_cons_idx & mask;
 
+		PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, crcidx,
+		    BUS_DMASYNC_POSTREAD);
+
 		e = ring + crcidx;
 
 		pvscsi_process_completion(sc, e);
 
+		PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, crcidx,
+		    BUS_DMASYNC_PREREAD);
+
 		/*
 		 * ensure completion processing reads happen before write to
 		 * (increment of) cmp_cons_idx
 		 */
-		membar_release();
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx,
+		    BUS_DMASYNC_POSTWRITE);
 		s->cmp_cons_idx++;
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx,
+		    BUS_DMASYNC_PREWRITE);
 	}
 }
@@ -1159,36 +1200,56 @@ pvscsi_process_msg(struct pvscsi_softc *
 static void
 pvscsi_process_msg_ring(struct pvscsi_softc *sc)
 {
+	struct pvscsi_dma *ring_dma;
 	struct pvscsi_ring_msg_desc *ring;
+	struct pvscsi_dma *s_dma;
 	struct pvscsi_rings_state *s;
 	struct pvscsi_ring_msg_desc *e;
 	uint32_t mask;
 
 	KASSERT(mutex_owned(&sc->lock));
 
+	s_dma = &sc->rings_state_dma;
 	s = sc->rings_state;
+	ring_dma = &sc->msg_ring_dma;
 	ring = sc->msg_ring;
 
+	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_num_entries_log2,
+	    BUS_DMASYNC_POSTREAD);
 	mask = MASK(s->msg_num_entries_log2);
+	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_num_entries_log2,
+	    BUS_DMASYNC_PREREAD);
 
-	while (true) {
+	for (;;) {
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_prod_idx,
+		    BUS_DMASYNC_POSTREAD);
 		size_t mpidx = s->msg_prod_idx;	// dma read (device -> cpu)
-		membar_acquire();
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_prod_idx,
+		    BUS_DMASYNC_PREREAD);
 
 		if (s->msg_cons_idx == mpidx)
 			break;
 
 		size_t mcidx = s->msg_cons_idx & mask;
 
+		PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, mcidx,
+		    BUS_DMASYNC_POSTREAD);
+
 		e = ring + mcidx;
 
 		pvscsi_process_msg(sc, e);
 
+		PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, mcidx,
+		    BUS_DMASYNC_PREREAD);
+
 		/*
 		 * ensure message processing reads happen before write to
 		 * (increment of) msg_cons_idx
 		 */
-		membar_release();
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_cons_idx,
+		    BUS_DMASYNC_POSTWRITE);
 		s->msg_cons_idx++;
+		PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, msg_cons_idx,
+		    BUS_DMASYNC_PREWRITE);
 	}
 }
@@ -1251,8 +1312,10 @@ pvscsi_scsipi_request(struct scsipi_chan
 #endif
 
 	uint32_t req_num_entries_log2;
+	struct pvscsi_dma *ring_dma;
 	struct pvscsi_ring_req_desc *ring;
 	struct pvscsi_ring_req_desc *e;
+	struct pvscsi_dma *s_dma;
 	struct pvscsi_rings_state *s;
 	struct pvscsi_hcb *hcb;
 
@@ -1265,11 +1328,17 @@ pvscsi_scsipi_request(struct scsipi_chan
 		return;
 	}
 
+	ring_dma = &sc->req_ring_dma;
 	ring = sc->req_ring;
+	s_dma = &sc->rings_state_dma;
 	s = sc->rings_state;
 	hcb = NULL;
 
+	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_num_entries_log2,
+	    BUS_DMASYNC_POSTREAD);
 	req_num_entries_log2 = s->req_num_entries_log2;
+	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_num_entries_log2,
+	    BUS_DMASYNC_PREREAD);
 
 	/* Protect against multiple senders */
 	mutex_enter(&sc->lock);
@@ -1299,6 +1368,7 @@ pvscsi_scsipi_request(struct scsipi_chan
 	hcb->xs = xs;
 
 	const size_t rridx = s->req_prod_idx & MASK(req_num_entries_log2);
+	PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, rridx, BUS_DMASYNC_POSTWRITE);
 	e = ring + rridx;
 
 	memset(e, 0, sizeof(*e));
@@ -1398,7 +1468,7 @@ pvscsi_scsipi_request(struct scsipi_chan
 	 * Ensure request record writes happen before write to (increment of)
 	 * req_prod_idx.
 	 */
-	membar_producer();
+	PVSCSI_DMA_SYNC_RING(sc, ring_dma, ring, rridx, BUS_DMASYNC_PREWRITE);
 
 	uint8_t cdb0 = e->cdb[0];
 
@@ -1411,13 +1481,16 @@ pvscsi_scsipi_request(struct scsipi_chan
 		callout_reset(&xs->xs_callout, timeout, pvscsi_timeout, hcb);
 	}
 
+	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_prod_idx,
+	    BUS_DMASYNC_POSTWRITE);
 	s->req_prod_idx++;
 
 	/*
 	 * Ensure req_prod_idx write (increment) happens before
 	 * IO is kicked (via a write).
 	 */
-	membar_producer();
+	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, req_prod_idx,
+	    BUS_DMASYNC_PREWRITE);
 	pvscsi_kick_io(sc, cdb0);
 
 	mutex_exit(&sc->lock);
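Note on the pattern above: every access to a shared rings-state field is
bracketed, a POST* sync before the access and a PRE* sync after it, so
ownership of the memory is explicitly handed back and forth between driver
and device.  Summarized in both directions (field names and macros as in
the patch; surrounding declarations elided):

	/* Reading a device-written field, e.g. the producer index: */
	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx,
	    BUS_DMASYNC_POSTREAD);	/* make the device's write visible */
	uint32_t prod = s->cmp_prod_idx;
	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_prod_idx,
	    BUS_DMASYNC_PREREAD);	/* re-arm for the next read */

	/* Writing a device-read field, e.g. the consumer index: */
	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx,
	    BUS_DMASYNC_POSTWRITE);	/* retire our previous write */
	s->cmp_cons_idx++;
	PVSCSI_DMA_SYNC_STATE(sc, s_dma, s, cmp_cons_idx,
	    BUS_DMASYNC_PREWRITE);	/* push the update to the device */

Unlike membar_*, which the kernel may turn into a no-op on a uniprocessor
boot, bus_dmamap_sync(9) is specified against the device's view of memory,
so the ordering survives when the "device" is a host on another physical
CPU.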
# HG changeset patch
# User Taylor R Campbell <riastradh@NetBSD.org>
# Date 1756730719 0
#      Mon Sep 01 12:45:19 2025 +0000
# Branch trunk
# Node ID 7dc5ed92014dd2f8485f69217a569545907ae089
# Parent  29dccdef1d779245c79a5086dfaa6a38c2ec2a15
# EXP-Topic riastradh-pr59618-paravirtmbsync
pvscsi(4): Zero rings before using them.

bus_dmamem_alloc(9) doesn't guarantee zeroing, as far as I can tell,
and who knows what might happen if the ring header and contents have
anything nonzero initially.

Insert an initial preread/prewrite sync between zeroing and first use,
and, out of paranoia, a final postread/postwrite sync between last use
and unload/free.

Prompted by (but not really related to):

PR kern/59618: occasional virtio block device lock ups/hangs

diff -r 29dccdef1d77 -r 7dc5ed92014d sys/dev/pci/pvscsi.c
--- a/sys/dev/pci/pvscsi.c	Mon Sep 01 12:43:27 2025 +0000
+++ b/sys/dev/pci/pvscsi.c	Mon Sep 01 12:45:19 2025 +0000
@@ -613,6 +613,10 @@ pvscsi_dma_alloc_ppns(struct pvscsi_soft
 		return (error);
 	}
 
+	memset(dma->vaddr, 0, num_pages * PAGE_SIZE);
+	bus_dmamap_sync(sc->sc_dmat, dma->map, 0, num_pages * PAGE_SIZE,
+	    BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE);
+
 	ppn = dma->paddr >> PAGE_SHIFT;
 	for (i = 0; i < num_pages; i++) {
 		ppn_list[i] = ppn + i;
@@ -709,6 +713,16 @@ static void
 pvscsi_free_rings(struct pvscsi_softc *sc)
 {
 
+	bus_dmamap_sync(sc->sc_dmat, sc->rings_state_dma.map,
+	    0, sc->rings_state_dma.size,
+	    BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
+	bus_dmamap_sync(sc->sc_dmat, sc->req_ring_dma.map,
+	    0, sc->req_ring_dma.size,
+	    BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
+	bus_dmamap_sync(sc->sc_dmat, sc->cmp_ring_dma.map,
+	    0, sc->cmp_ring_dma.size,
+	    BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
+
 	pvscsi_dma_free(sc, &sc->rings_state_dma);
 	pvscsi_dma_free(sc, &sc->req_ring_dma);
 	pvscsi_dma_free(sc, &sc->cmp_ring_dma);
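Note: with the whole series applied, a ring's DMA lifecycle is symmetric
end to end.  A condensed sketch (dma and size stand for the pvscsi_dma map
and its length as used in the patch; error handling elided):

	/* Attach: bus_dmamem_alloc(9) is not guaranteed to zero. */
	memset(dma->vaddr, 0, size);
	bus_dmamap_sync(sc->sc_dmat, dma->map, 0, size,
	    BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE);

	/* ... operation, with per-field PVSCSI_DMA_SYNC_* bracketing ... */

	/* Detach: complete all DMA before unmapping and freeing. */
	bus_dmamap_sync(sc->sc_dmat, dma->map, 0, size,
	    BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
	pvscsi_dma_free(sc, dma);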