Subject: xbd backend disconnection
To: None <port-xen@NetBSD.org>
From: Jed Davis <jdev@panix.com>
List: port-xen
Date: 09/16/2005 23:14:23
--=-=-=


While testing live migration, I discovered that the xbd(4) backend
processes disconnect requests immediately, even if I/O is still
pending.  In that case, the kernel panics when the response message is
written to the then-unmapped ring.

I was also able to reproduce the panic by destroying a domain
performing I/O; in my case it was by configuring a swap device and
causing the domain to use it heavily.

The attached patch adds a reference count on the backend instance,
much like the one Linux has, and defers deallocating resources and
responding to the disconnect request until the I/O is done.
uvm_km_free can't be called from interrupt context, since it takes a
lock, so that I deferred to the destroy message handling -- and, if
the device is reconnected instead, reuses the still-allocated page.
(I don't know if that case would normally be reached.)

It's not perfect, though: The hopefully-atomic decrement clearly
belongs somewhere MD (<machine/atomic.h>?), and in any case it's just
temporary until things are properly simplelocked (since MP will become
relevant at some point with Xen 3).


--=-=-=
Content-Type: text/x-patch
Content-Disposition: attachment;
  filename=xbdback-async-disconnect.patch
Content-Description: Respond to xbdback disconnect request only when all I/O is done.

Index: sys/arch/xen/xen/xbdback.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/xen/xbdback.c,v
retrieving revision 1.4.2.9
diff -u -p -r1.4.2.9 xbdback.c
--- sys/arch/xen/xen/xbdback.c	21 Aug 2005 22:12:21 -0000	1.4.2.9
+++ sys/arch/xen/xen/xbdback.c	17 Sep 2005 01:18:34 -0000
@@ -75,7 +75,7 @@ struct xbdback_fragment;
 struct xbdback_instance;
 
 /* state of a xbdback instance */
-typedef enum {CONNECTED, DISCONNECTED} xbdback_state_t;
+typedef enum {CONNECTED, DISCONNECTING, DISCONNECTED} xbdback_state_t;
 
 /*
  * Since there are a variety of conditions that can block our I/O
@@ -94,7 +94,7 @@ struct xbdback_instance {
 	SLIST_ENTRY(xbdback_instance) next;
 	domid_t domid;		/* attached to this domain */
 	uint32_t handle;	/* domain-specific handle */
-	xbdback_state_t status;
+	volatile xbdback_state_t status;
 	/* parameters for the communication */
 	unsigned int evtchn;
 	paddr_t ma_ring;
@@ -102,6 +102,9 @@ struct xbdback_instance {
 	blkif_ring_t *blk_ring;
 	BLKIF_RING_IDX resp_prod; /* our current reply index */
 	BLKIF_RING_IDX req_cons; /* our current request index */
+	/* disconnection must be postponed until all I/O is done */
+	volatile unsigned refcnt;
+	uint8_t disconnect_rspid; /* request id of the disconnect request */
 	/* 
 	 * State for I/O processing/coalescing follows; this has to
 	 * live here instead of on the stack because of the
@@ -124,6 +127,16 @@ struct xbdback_instance {
 
 	SLIST_HEAD(, xbd_vbd) vbds; /* list of virtual block devices */
 };
+/* Manipulation of the above reference count. */
+/* XXXjld@panix.com: not MP-safe, and move the i386 asm elsewhere. */
+#define xbdi_get(xbdip) (++(xbdip)->refcnt)
+#define xbdi_put(xbdip)                                      \
+do {                                                         \
+	__asm __volatile("decl %0"                           \
+	    : "=m"((xbdip)->refcnt) : "m"((xbdip)->refcnt)); \
+	if (0 == (xbdip)->refcnt)                            \
+               xbdback_finish_disconnect(xbdip);             \
+} while (/* CONSTCOND */ 0)
 
 /* each xbdback instance has a list of vbds associated with it */
 struct xbd_vbd {
@@ -202,6 +215,7 @@ struct timeval xbdback_fragio_intvl = { 
 
 static void xbdback_ctrlif_rx(ctrl_msg_t *, unsigned long);
 static int  xbdback_evthandler(void *);
+static void xbdback_finish_disconnect(struct xbdback_instance *);
 
 static struct xbdback_instance *xbdif_lookup(domid_t, uint32_t);
 static struct xbd_vbd * vbd_lookup(struct xbdback_instance *, blkif_vdev_t);
@@ -317,6 +331,8 @@ xbdback_ctrlif_rx(ctrl_msg_t *msg, unsig
 		xbdi->domid = req->domid;
 		xbdi->handle = req->blkif_handle;
 		xbdi->status = DISCONNECTED;
+		xbdi->refcnt = 1;
+		xbdi->blk_ring = NULL;
 		SLIST_INIT(&xbdi->vbds);
 		SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
 		
@@ -333,10 +349,13 @@ xbdback_ctrlif_rx(ctrl_msg_t *msg, unsig
 			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
 			goto end;
 		}
-		if (xbdi->status == CONNECTED) {
+		if (xbdi->status != DISCONNECTED) {
 			req->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
 			goto end;
 		}
+		if (xbdi->blk_ring != NULL)
+			uvm_km_free(kernel_map, (vaddr_t)xbdi->blk_ring,
+			    PAGE_SIZE);
 		SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
 		free(xbdi, M_DEVBUF);
 		req->status = BLKIF_BE_STATUS_OKAY;
@@ -355,14 +374,19 @@ xbdback_ctrlif_rx(ctrl_msg_t *msg, unsig
 			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
 			goto end;
 		}
-		if (xbdi->status == CONNECTED) {
+		if (xbdi->status != DISCONNECTED) {
 			req->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
 			goto end;
 		}
-		ring_addr = uvm_km_alloc(kernel_map, PAGE_SIZE);
-		if (ring_addr == 0) {
-			req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-			goto end;
+		if (xbdi->blk_ring == NULL) {
+			ring_addr = uvm_km_alloc(kernel_map, PAGE_SIZE);
+			if (ring_addr == 0) {
+				req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+				goto end;
+			}
+		} else {
+			ring_addr = (vaddr_t)xbdi->blk_ring;
+			xbdi->blk_ring = NULL;
 		}
 
 		xbdi->ma_ring = req->shmem_frame << PAGE_SHIFT;
@@ -392,7 +416,6 @@ xbdback_ctrlif_rx(ctrl_msg_t *msg, unsig
 	{
 		blkif_be_disconnect_t *req =
 		    (blkif_be_disconnect_t *)&msg->msg[0];
-		vaddr_t ring_addr;
 
 		if (msg->length != sizeof(blkif_be_disconnect_t))
 			goto error;
@@ -401,13 +424,10 @@ xbdback_ctrlif_rx(ctrl_msg_t *msg, unsig
 			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
 			goto end;
 		}
-		hypervisor_mask_event(xbdi->evtchn);
-		event_remove_handler(xbdi->evtchn, xbdback_evthandler, xbdi);
-		ring_addr = (vaddr_t)xbdi->blk_ring;
-		pmap_remove(pmap_kernel(), ring_addr, ring_addr + PAGE_SIZE);
-		uvm_km_free(kernel_map, ring_addr, PAGE_SIZE);
-		req->status = BLKIF_BE_STATUS_OKAY;
-		break;
+		xbdi->status = DISCONNECTING;
+		/* XXXjld@panix.com: could this race with the iodone? */
+		xbdi_put(xbdi);
+		return;
 	}
 	case CMSG_BLKIF_BE_VBD_CREATE:
 	{
@@ -586,6 +606,33 @@ end:
 	return;
 }
 
+static void xbdback_finish_disconnect(struct xbdback_instance *xbdi)
+{
+	ctrl_msg_t cmsg;
+	blkif_be_disconnect_t *pdisc;
+	vaddr_t ring_addr;
+	
+	pdisc = (blkif_be_disconnect_t *)&cmsg.msg;
+
+	KASSERT(xbdi->status == DISCONNECTING);
+	hypervisor_mask_event(xbdi->evtchn);
+	event_remove_handler(xbdi->evtchn, xbdback_evthandler, xbdi);
+
+	cmsg.type = CMSG_BLKIF_BE;
+	cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT;
+	cmsg.id = xbdi->disconnect_rspid;
+	cmsg.length = sizeof(*pdisc);
+	pdisc->domid = xbdi->domid;
+	pdisc->blkif_handle = xbdi->handle;
+	pdisc->status = BLKIF_BE_STATUS_OKAY;
+
+       	ring_addr = (vaddr_t)xbdi->blk_ring;
+	pmap_remove(pmap_kernel(), ring_addr, ring_addr + PAGE_SIZE);
+	
+	xbdi->status = DISCONNECTED;
+	ctrl_if_send_response(&cmsg);
+}
+
 static struct xbdback_instance *
 xbdif_lookup(domid_t dom , uint32_t handle)
 {
@@ -614,6 +661,7 @@ static int
 xbdback_evthandler(void *arg)
 {
 	struct xbdback_instance *xbdi = arg;
+
 	if (xbdi->cont == NULL) {
 		xbdi->cont = xbdback_co_main;
 		xbdback_trampoline(xbdi, xbdi);
@@ -724,7 +772,7 @@ xbdback_co_io(struct xbdback_instance *x
 		printf("xbdback_io domain %d: unknown vbd %d\n",
 		       xbdi->domid, xbdi->xen_req->device);
 		error = EINVAL;
-		goto end;	
+		goto end;
 	}
 	if (xbdi->xen_req->operation == BLKIF_OP_WRITE) {
 		if (xbdi->req_vbd->flags & VBD_F_RO) {
@@ -850,6 +898,8 @@ xbdback_co_io_gotio(struct xbdback_insta
 	vaddr_t start_offset; /* start offset in vm area */
 	int buf_flags;
 
+	xbdi_get(xbdi);
+	
 	xbd_io = xbdi->io = obj;
 	xbd_io->xio_xbdi = xbdi;
 	SLIST_INIT(&xbd_io->xio_rq);
@@ -1063,6 +1113,7 @@ xbdback_iodone(struct buf *bp)
 		    xbd_req->rq_operation, error);
 		xbdback_pool_put(&xbdback_request_pool, xbd_req);
 	}
+	xbdi_put(xbdi);
 	xbdback_pool_put(&xbdback_io_pool, xbd_io);
 }
 

--=-=-=

-- 
(let ((C call-with-current-continuation)) (apply (lambda (x y) (x y)) (map
((lambda (r) ((C C) (lambda (s) (r (lambda l (apply (s s) l))))))  (lambda
(f) (lambda (l) (if (null? l) C (lambda (k) (display (car l)) ((f (cdr l))
(C k)))))))    '((#\J #\d #\D #\v #\s) (#\e #\space #\a #\i #\newline)))))

--=-=-=--