tech-kern archive


virtio(4) bus_dmamap_sync audit



I reviewed the bus_dmamap_sync calls in sys/dev/pci/virtio.c and found
several likely bugs by code inspection, including misuse of membar_*.
The attached patch fixes all the issues I saw, and removes the
membar_* calls in favour of bus_dmamap_sync.  Details are in the
attached commit message.
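
To make the membar_* point concrete: membar_producer/membar_consumer
only promise ordering between CPUs, and do nothing for a bus_dma(9)
map that bounces, whereas bus_dmamap_sync both orders the access and
copies through the bounce buffer where needed.  Roughly (a
hand-written sketch, not lifted verbatim from the patch):

    /* old pattern: CPU/CPU barrier only, invisible to a bounced map */
    membar_producer();
    vq->vq_avail->idx = virtio_rw16(sc, vq->vq_avail_idx);

    /* new pattern: hand the written range to the device explicitly */
    vq->vq_avail->idx = virtio_rw16(sc, vq->vq_avail_idx);
    bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, vq->vq_availoffset,
        offsetof(struct vring_avail, ring), BUS_DMASYNC_PREWRITE);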

qemu virtio-blk and virtio-net still seem to work in cursory testing.
Are there any automated tests, ideally stress tests, of virtio(4)?

Review or testing welcome!

From 39a2d4f14e7b7fe6d2ddb8bd621b459eb343f780 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <riastradh@NetBSD.org>
Date: Mon, 8 Aug 2022 10:47:45 +0000
Subject: [PATCH] virtio(4): Membar and bus_dmamap_sync audit.

- Don't use membar_* for DMA.

- Sync only the header and payload of the rings separately as needed.
  If we bounce, this avoids large memcpy when we only care about the
  header.
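
  For reference, the avail ring layout and which helper covers each
  piece (layout per the virtio spec; helper names as added below):

    struct vring_avail {
            uint16_t flags;         /* vq_sync_aring_header */
            uint16_t idx;           /* vq_sync_aring_header */
            uint16_t ring[];        /* vq_sync_aring_payload */
            /* with EVENT_IDX, used_event follows ring[]:
               vq_sync_aring_used */
    };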

- Sync uring with PREREAD before triggering anything that will return
  data in it.

  => Move uring PREREAD in virtio_enqueue_commit to _before_ updating
     vq->vq_avail->idx, which is the point at which the DMA read is
     triggered in the `device' (host).

  => Omit needless membar_consumer in virtio_enqueue_commit -- may not
     work with DMA memory, and even if it does, redundant with
     bus_dmamap_sync uring PREREAD here.

  => XXX Does the device/host ever return unsolicited entries in the
     queue, or only solicited ones?  If only solicited ones, the
     PREREAD in virtio_init_vq is redundant.
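
  Condensed, the notify path in virtio_enqueue_commit now issues its
  syncs in this order (details elided; see the diff below):

    /* device may DMA into the used ring once it sees the new idx */
    vq_sync_uring_all(sc, vq, BUS_DMASYNC_PREREAD);

    /* publish the new avail entries, then the index covering them */
    vq_sync_aring_payload(sc, vq, BUS_DMASYNC_PREWRITE);
    vq->vq_avail->idx = virtio_rw16(sc, vq->vq_avail_idx);
    vq_sync_aring_header(sc, vq, BUS_DMASYNC_PREWRITE);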

- Sync uring with POSTREAD before we read from it.  This way the DMA
  read into our buffer has finished before we read from the buffer.

  => Add missing uring POSTREAD in virtio_vq_is_enqueued, between read of
     vq->vq_used_idx and return to caller, so that the caller can
     safely use virtio_dequeue.

  => Add missing uring POSTREADs in virtio_start_vq_intr:
     . between entry from caller and the read of vq->vq_used_idx
     . between the read of vq->vq_used_idx and return to caller,
       so that the caller can safely use virtio_dequeue, just like
       virtio_vq_is_enqueued

  => Move uring POSTREADs in virtio_enqueue_commit to _before_ reading
     vq->vq_used->flags or *vq->vq_avail_event, not after.
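
  The consumer side thus ends up with the shape now used in both
  virtio_vq_is_enqueued and virtio_start_vq_intr (condensed from the
  diff below):

    vq_sync_uring_header(sc, vq, BUS_DMASYNC_POSTREAD);
    if (vq->vq_used_idx == virtio_rw16(sc, vq->vq_used->idx))
            return 0;       /* nothing new from the device */
    vq_sync_uring_payload(sc, vq, BUS_DMASYNC_POSTREAD);
    return 1;               /* caller may now virtio_dequeue safely */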

- After we write to aring, sync it with PREWRITE.  This way we finish
  writing to our buffer before the DMA write from it.

  => Omit needless PREWRITE in virtio_init_vq -- we do the appropriate
     PREWRITE in virtio_enqueue_commit now.

  => Convert membar_producer to bus_dmamap_sync PREWRITE in
     virtio_enqueue_commit.

  => Omit incorrect aring POSTWRITE in virtio_enqueue_commit -- no need
     because the DMA write may not have completed yet at this point,
     and we already do a POSTWRITE in virtio_vq_is_enqueued.

  => Omit needless membar_producer in virtio_postpone_intr -- may not
     work with DMA memory, and even if it does, redundant with
     bus_dmamap_sync PREWRITE here.
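
  E.g. in virtio_postpone_intr the membar_producer simply becomes the
  PREWRITE that covers the used_event field (before/after condensed
  from the diff below):

    /* before */
    *vq->vq_used_event = virtio_rw16(sc, idx);
    membar_producer();
    vq_sync_aring(vq->vq_owner, vq, BUS_DMASYNC_PREWRITE);

    /* after */
    *vq->vq_used_event = virtio_rw16(sc, idx);
    vq_sync_aring_used(vq->vq_owner, vq, BUS_DMASYNC_PREWRITE);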

- After xfers to aring have completed, sync it with POSTWRITE.

  => Add missing aring POSTWRITE in virtio_free_vq, in case there are
     paths from virtio_enqueue_commit to here that don't go through
     virtio_vq_is_enqueued.  (If there are no such paths, then maybe we
     should KASSERT(vq->vq_queued == 0) in virtio_free_vq.)
---
 sys/dev/pci/virtio.c | 160 +++++++++++++++++++++++++++++++++----------
 1 file changed, 122 insertions(+), 38 deletions(-)

diff --git a/sys/dev/pci/virtio.c b/sys/dev/pci/virtio.c
index 4c2d80c28e9a..f7189903a0e3 100644
--- a/sys/dev/pci/virtio.c
+++ b/sys/dev/pci/virtio.c
@@ -422,47 +422,111 @@ virtio_soft_intr(void *arg)
 static inline void
 vq_sync_descs(struct virtio_softc *sc, struct virtqueue *vq, int ops)
 {
+
 	/* availoffset == sizeof(vring_desc)*vq_num */
 	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap, 0, vq->vq_availoffset,
-			ops);
+	    ops);
 }
 
 static inline void
-vq_sync_aring(struct virtio_softc *sc, struct virtqueue *vq, int ops)
+vq_sync_aring_all(struct virtio_softc *sc, struct virtqueue *vq, int ops)
 {
 	uint16_t hdrlen = offsetof(struct vring_avail, ring);
+	size_t payloadlen = sc->sc_nvqs * sizeof(uint16_t);
+	size_t usedlen = 0;
+
 	if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX)
-		hdrlen += sizeof(uint16_t);
+		usedlen = sizeof(uint16_t);
+	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap,
+	    vq->vq_availoffset, hdrlen + payloadlen + usedlen, ops);
+}
+
+static inline void
+vq_sync_aring_header(struct virtio_softc *sc, struct virtqueue *vq, int ops)
+{
+	uint16_t hdrlen = offsetof(struct vring_avail, ring);
+
+	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap,
+	    vq->vq_availoffset, hdrlen, ops);
+}
+
+static inline void
+vq_sync_aring_payload(struct virtio_softc *sc, struct virtqueue *vq, int ops)
+{
+	uint16_t hdrlen = offsetof(struct vring_avail, ring);
+	size_t payloadlen = sc->sc_nvqs * sizeof(uint16_t);
 
 	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap,
-			vq->vq_availoffset,
-			hdrlen + sc->sc_nvqs * sizeof(uint16_t),
-			ops);
+	    vq->vq_availoffset + hdrlen, payloadlen, ops);
 }
 
 static inline void
-vq_sync_uring(struct virtio_softc *sc, struct virtqueue *vq, int ops)
+vq_sync_aring_used(struct virtio_softc *sc, struct virtqueue *vq, int ops)
+{
+	uint16_t hdrlen = offsetof(struct vring_avail, ring);
+	size_t payloadlen = sc->sc_nvqs * sizeof(uint16_t);
+	size_t usedlen = sizeof(uint16_t);
+
+	if ((sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) == 0)
+		return;
+	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap,
+	    vq->vq_availoffset + hdrlen + payloadlen, usedlen, ops);
+}
+
+static inline void
+vq_sync_uring_all(struct virtio_softc *sc, struct virtqueue *vq, int ops)
 {
 	uint16_t hdrlen = offsetof(struct vring_used, ring);
+	size_t payloadlen = sc->sc_nvqs * sizeof(struct vring_used_elem);
+	size_t availlen = 0;
+
 	if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX)
-		hdrlen += sizeof(uint16_t);
+		availlen = sizeof(uint16_t);
+	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap,
+	    vq->vq_usedoffset, hdrlen + payloadlen + availlen, ops);
+}
+
+static inline void
+vq_sync_uring_header(struct virtio_softc *sc, struct virtqueue *vq, int ops)
+{
+	uint16_t hdrlen = offsetof(struct vring_used, ring);
 
 	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap,
-			vq->vq_usedoffset,
-			hdrlen + sc->sc_nvqs * sizeof(struct vring_used_elem),
-			ops);
+	    vq->vq_usedoffset, hdrlen, ops);
+}
+
+static inline void
+vq_sync_uring_payload(struct virtio_softc *sc, struct virtqueue *vq, int ops)
+{
+	uint16_t hdrlen = offsetof(struct vring_used, ring);
+	size_t payloadlen = sc->sc_nvqs * sizeof(struct vring_used_elem);
+
+	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap,
+	    vq->vq_usedoffset + hdrlen, payloadlen, ops);
+}
+
+static inline void
+vq_sync_uring_avail(struct virtio_softc *sc, struct virtqueue *vq, int ops)
+{
+	uint16_t hdrlen = offsetof(struct vring_used, ring);
+	size_t payloadlen = sc->sc_nvqs * sizeof(struct vring_used_elem);
+	size_t availlen = sizeof(uint16_t);
+
+	if ((sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) == 0)
+		return;
+	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap,
+	    vq->vq_usedoffset + hdrlen + payloadlen, availlen, ops);
 }
 
 static inline void
 vq_sync_indirect(struct virtio_softc *sc, struct virtqueue *vq, int slot,
-		     int ops)
+    int ops)
 {
-	int offset = vq->vq_indirectoffset
-		      + sizeof(struct vring_desc) * vq->vq_maxnsegs * slot;
+	int offset = vq->vq_indirectoffset +
+	    sizeof(struct vring_desc) * vq->vq_maxnsegs * slot;
 
 	bus_dmamap_sync(sc->sc_dmat, vq->vq_dmamap,
-			offset, sizeof(struct vring_desc) * vq->vq_maxnsegs,
-			ops);
+	    offset, sizeof(struct vring_desc) * vq->vq_maxnsegs, ops);
 }
 
 bool
@@ -471,12 +535,14 @@ virtio_vq_is_enqueued(struct virtio_softc *sc, struct virtqueue *vq)
 
 	if (vq->vq_queued) {
 		vq->vq_queued = 0;
-		vq_sync_aring(sc, vq, BUS_DMASYNC_POSTWRITE);
+		vq_sync_aring_all(sc, vq, BUS_DMASYNC_POSTWRITE);
 	}
-	vq_sync_uring(sc, vq, BUS_DMASYNC_POSTREAD);
-	membar_consumer();
 
-	return (vq->vq_used_idx != virtio_rw16(sc, vq->vq_used->idx)) ? 1 : 0;
+	vq_sync_uring_header(sc, vq, BUS_DMASYNC_POSTREAD);
+	if (vq->vq_used_idx == virtio_rw16(sc, vq->vq_used->idx))
+		return 0;
+	vq_sync_uring_payload(sc, vq, BUS_DMASYNC_POSTREAD);
+	return 1;
 }
 
 /*
@@ -530,9 +596,7 @@ virtio_postpone_intr(struct virtio_softc *sc, struct virtqueue *vq,
 
 	/* set the new event index: avail_ring->used_event = idx */
 	*vq->vq_used_event = virtio_rw16(sc, idx);
-	membar_producer();
-
-	vq_sync_aring(vq->vq_owner, vq, BUS_DMASYNC_PREWRITE);
+	vq_sync_aring_used(vq->vq_owner, vq, BUS_DMASYNC_PREWRITE);
 	vq->vq_queued++;
 
 	nused = (uint16_t)
@@ -578,6 +642,7 @@ virtio_postpone_intr_far(struct virtio_softc *sc, struct virtqueue *vq)
 void
 virtio_stop_vq_intr(struct virtio_softc *sc, struct virtqueue *vq)
 {
+
 	if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) {
 		/*
 		 * No way to disable the interrupt completely with
@@ -586,16 +651,19 @@ virtio_stop_vq_intr(struct virtio_softc *sc, struct virtqueue *vq)
 		 * the past to not trigger a spurios interrupt.
 		 */
 		*vq->vq_used_event = virtio_rw16(sc, vq->vq_used_idx + 0x8000);
+		vq_sync_aring_used(sc, vq, BUS_DMASYNC_PREWRITE);
 	} else {
-		vq->vq_avail->flags |= virtio_rw16(sc, VRING_AVAIL_F_NO_INTERRUPT);
+		vq->vq_avail->flags |=
+		    virtio_rw16(sc, VRING_AVAIL_F_NO_INTERRUPT);
+		vq_sync_aring_header(sc, vq, BUS_DMASYNC_PREWRITE);
 	}
-	vq_sync_aring(sc, vq, BUS_DMASYNC_PREWRITE);
 	vq->vq_queued++;
 }
 
 int
 virtio_start_vq_intr(struct virtio_softc *sc, struct virtqueue *vq)
 {
+
 	if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) {
 		/*
 		 * If event index feature is negotiated, enabling interrupts
@@ -603,13 +671,19 @@ virtio_start_vq_intr(struct virtio_softc *sc, struct virtqueue *vq)
 		 * used_event field
 		 */
 		*vq->vq_used_event = virtio_rw16(sc, vq->vq_used_idx);
+		vq_sync_aring_used(sc, vq, BUS_DMASYNC_PREWRITE);
 	} else {
-		vq->vq_avail->flags &= ~virtio_rw16(sc, VRING_AVAIL_F_NO_INTERRUPT);
+		vq->vq_avail->flags &=
+		    ~virtio_rw16(sc, VRING_AVAIL_F_NO_INTERRUPT);
+		vq_sync_aring_header(sc, vq, BUS_DMASYNC_PREWRITE);
 	}
-	vq_sync_aring(sc, vq, BUS_DMASYNC_PREWRITE);
 	vq->vq_queued++;
 
-	return vq->vq_used_idx != virtio_rw16(sc, vq->vq_used->idx);
+	vq_sync_uring_header(sc, vq, BUS_DMASYNC_POSTREAD);
+	if (vq->vq_used_idx == virtio_rw16(sc, vq->vq_used->idx))
+		return 0;
+	vq_sync_uring_payload(sc, vq, BUS_DMASYNC_POSTREAD);
+	return 1;
 }
 
 /*
@@ -655,8 +729,7 @@ virtio_init_vq(struct virtio_softc *sc, struct virtqueue *vq,
 		mutex_init(&vq->vq_aring_lock, MUTEX_SPIN, sc->sc_ipl);
 		mutex_init(&vq->vq_uring_lock, MUTEX_SPIN, sc->sc_ipl);
 	}
-	vq_sync_aring(sc, vq, BUS_DMASYNC_PREWRITE);
-	vq_sync_uring(sc, vq, BUS_DMASYNC_PREREAD);
+	vq_sync_uring_all(sc, vq, BUS_DMASYNC_PREREAD);
 	vq->vq_queued++;
 }
 
@@ -812,6 +885,8 @@ virtio_free_vq(struct virtio_softc *sc, struct virtqueue *vq)
 	/* tell device that there's no virtqueue any longer */
 	sc->sc_ops->setup_queue(sc, vq->vq_index, 0);
 
+	vq_sync_aring_all(sc, vq, BUS_DMASYNC_POSTWRITE);
+
 	kmem_free(vq->vq_entries, sizeof(*vq->vq_entries) * vq->vq_num);
 	bus_dmamap_unload(sc->sc_dmat, vq->vq_dmamap);
 	bus_dmamap_destroy(sc->sc_dmat, vq->vq_dmamap);
@@ -1050,34 +1125,43 @@ virtio_enqueue_commit(struct virtio_softc *sc, struct virtqueue *vq, int slot,
 		vq_sync_indirect(sc, vq, slot, BUS_DMASYNC_PREWRITE);
 	mutex_enter(&vq->vq_aring_lock);
 	vq->vq_avail->ring[(vq->vq_avail_idx++) % vq->vq_num] =
-		virtio_rw16(sc, slot);
+	    virtio_rw16(sc, slot);
 
 notify:
 	if (notifynow) {
 		uint16_t o, n, t;
 		uint16_t flags;
+
 		o = virtio_rw16(sc, vq->vq_avail->idx);
 		n = vq->vq_avail_idx;
 
-		/* publish avail idx */
-		membar_producer();
+		/*
+		 * Prepare for `device->CPU' (host->guest) transfer
+		 * into the buffer.  This must happen before we commit
+		 * the vq->vq_avail->idx update to ensure we're not
+		 * still using the buffer in case program-prior loads
+		 * or stores in it get delayed past the store to
+		 * vq->vq_avail->idx.
+		 */
+		vq_sync_uring_all(sc, vq, BUS_DMASYNC_PREREAD);
+
+		/* ensure payload is published, then avail idx */
+		vq_sync_aring_payload(sc, vq, BUS_DMASYNC_PREWRITE);
 		vq->vq_avail->idx = virtio_rw16(sc, vq->vq_avail_idx);
-		vq_sync_aring(sc, vq, BUS_DMASYNC_PREWRITE);
+		vq_sync_aring_header(sc, vq, BUS_DMASYNC_PREWRITE);
 		vq->vq_queued++;
 
-		membar_consumer();
-		vq_sync_uring(sc, vq, BUS_DMASYNC_PREREAD);
 		if (sc->sc_active_features & VIRTIO_F_RING_EVENT_IDX) {
+			vq_sync_uring_avail(sc, vq, BUS_DMASYNC_POSTREAD);
 			t = virtio_rw16(sc, *vq->vq_avail_event) + 1;
 			if ((uint16_t) (n - t) < (uint16_t) (n - o))
 				sc->sc_ops->kick(sc, vq->vq_index);
 		} else {
+			vq_sync_uring_header(sc, vq, BUS_DMASYNC_POSTREAD);
 			flags = virtio_rw16(sc, vq->vq_used->flags);
 			if (!(flags & VRING_USED_F_NO_NOTIFY))
 				sc->sc_ops->kick(sc, vq->vq_index);
 		}
-		vq_sync_uring(sc, vq, BUS_DMASYNC_POSTREAD);
-		vq_sync_aring(sc, vq, BUS_DMASYNC_POSTWRITE);
 	}
 	mutex_exit(&vq->vq_aring_lock);
 

