Subject: Re: Netbsd DomU daily-08.01.2007-crashes
To: Anzi <anzi@dnainternet.net>
From: Manuel Bouyer <bouyer@antioche.eu.org>
List: port-xen
Date: 01/10/2007 12:57:14
--bp/iNruPH9dso1Pn
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Wed, Jan 10, 2007 at 11:10:11AM +0200, Anzi wrote:
> Manuel Bouyer wrote:
> 
> >On Tue, Jan 09, 2007 at 02:01:36PM +0200, Anzi wrote:
> > 
> >
> >>OK,
> >>
> >>I thought that if I download kernel from current I get the fixed version 
> >>from Manuel (in xbd_xenbus.c 1.16)
> >>Apparently it is not so?
> >>   
> >>
> >
> >your kernel is from 20061025, long before I commited the fix
> >
> > 
> >
> 
> oh, I thought that fix was for the DOMU kernel. Is the current DOM0 / 
> DOMU kernel good for production use (I think server will not be under 
> high load)?
> This far it has worked pretty well for with linux domU.

Yes, it's for the domU. Sorry, I misread your first mail.
Hum, there's still a possible race condition at attach time. Can you try
the attached patch ?

-- 
Manuel Bouyer <bouyer@antioche.eu.org>
     NetBSD: 26 ans d'experience feront toujours la difference
--

--bp/iNruPH9dso1Pn
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=diff

Index: xbd_xenbus.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/xen/xbd_xenbus.c,v
retrieving revision 1.15
diff -u -r1.15 xbd_xenbus.c
--- xbd_xenbus.c	1 Jan 2007 22:54:14 -0000	1.15
+++ xbd_xenbus.c	10 Jan 2007 11:56:46 -0000
@@ -388,12 +388,15 @@
 		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosed);
 		break;
 	case XenbusStateConnected:
-		s = splbio();
+		/*
+		 * note that xbd_backend_changed() can only be called by
+		 * the xenbus thread.
+		 */
+
 		if (sc->sc_backend_status == BLKIF_STATE_CONNECTED)
 			/* already connected */
 			return;
-		sc->sc_backend_status = BLKIF_STATE_CONNECTED;
-		splx(s);
+
 		xbd_connect(sc);
 		sc->sc_shutdown = 0;
 		hypervisor_enable_event(sc->sc_evtchn);
@@ -409,8 +412,10 @@
 
 		bufq_alloc(&sc->sc_dksc.sc_bufq, "fcfs", 0);
 		sc->sc_dksc.sc_flags |= DKF_INITED;
-
 		disk_attach(&sc->sc_dksc.sc_dkdev);
+
+		sc->sc_backend_status = BLKIF_STATE_CONNECTED;
+
 		/* try to read the disklabel */
 		dk_getdisklabel(sc->sc_di, &sc->sc_dksc, 0 /* XXX ? */);
 		format_bytes(buf, sizeof(buf), (uint64_t)sc->sc_dksc.sc_size *
@@ -469,6 +474,8 @@
 
 	DPRINTF(("xbd_handler(%s)\n", sc->sc_dev.dv_xname));
 
+	if (__predict_false(sc->sc_backend_status != BLKIF_STATE_CONNECTED))
+		return 0;
 again:
 	resp_prod = sc->sc_ring.sring->rsp_prod;
 	x86_lfence(); /* ensure we see replies up to resp_prod */
Index: xencons.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/xen/xencons.c,v
retrieving revision 1.19
diff -u -r1.19 xencons.c
--- xencons.c	1 Oct 2006 19:28:43 -0000	1.19
+++ xencons.c	10 Jan 2007 11:56:46 -0000
@@ -387,6 +387,7 @@
 #define XNC_OUT (xencons_interface->out)
 		cons = xencons_interface->out_cons;
 		prod = xencons_interface->out_prod;
+		x86_lfence();
 		while (prod != cons + sizeof(xencons_interface->out)) {
 			if (MASK_XENCONS_IDX(prod, XNC_OUT) <
 			    MASK_XENCONS_IDX(cons, XNC_OUT)) {
@@ -402,9 +403,9 @@
 				break;
 			prod = prod + len;
 		}
-		x86_lfence();
+		x86_sfence();
 		xencons_interface->out_prod = prod;
-		x86_lfence();
+		x86_sfence();
 		hypervisor_notify_via_evtchn(xen_start_info.console_evtchn);
 #undef XNC_OUT
 #else /* XEN3 */
@@ -482,12 +483,13 @@
 			cons = xencons_interface->in_cons;
 			prod = xencons_interface->in_prod;
 			x86_lfence();
-		} else
+		} else {
 			cons += len;
+			x86_sfence();
+			xencons_interface->in_cons = cons;
+			x86_sfence();
+		}
 	}
-	x86_lfence();
-	xencons_interface->in_cons = cons;
-	x86_lfence();
 	hypervisor_notify_via_evtchn(xen_start_info.console_evtchn);
 	splx(s);
 	return 1;

--bp/iNruPH9dso1Pn--