Subject: Re: Netbsd DomU daily-08.01.2007-crashes
To: Manuel Bouyer <bouyer@antioche.eu.org>
From: Anzi <anzi@dnainternet.net>
List: port-xen
Date: 01/10/2007 14:09:11
Manuel Bouyer wrote:

>On Wed, Jan 10, 2007 at 11:10:11AM +0200, Anzi wrote:
>  
>
>>Manuel Bouyer wrote:
>>
>>    
>>
>>>On Tue, Jan 09, 2007 at 02:01:36PM +0200, Anzi wrote:
>>>
>>>
>>>      
>>>
>>>>OK,
>>>>
>>>>I thought that if I download kernel from current I get the fixed version 
>>>>        
>>>>
>>>>from Manuel (in xbd_xenbus.c 1.16)
>>>      
>>>
>>>>Apparently it is not so?
>>>>  
>>>>
>>>>        
>>>>
>>>your kernel is from 20061025, long before I commited the fix
>>>
>>>
>>>
>>>      
>>>
>>oh, I thought that fix was for the DOMU kernel. Is the current DOM0 / 
>>DOMU kernel good for production use (I think server will not be under 
>>high load)?
>>This far it has worked pretty well for with linux domU.
>>    
>>
>
>Yes, it's for the domU. Sorry, I misread your first mail.
>Hum, there's still a possible race condition at attach time. Can you try
>the attached patch ?
>
>  
>

I can, after I have downloaded kernel source. You have commited version 
1.20 of xencons.c and version 1.16 of xbd_xenbus.c. Are they the same 
version as these diffs?

-anzi-


>------------------------------------------------------------------------
>
>Index: xbd_xenbus.c
>===================================================================
>RCS file: /cvsroot/src/sys/arch/xen/xen/xbd_xenbus.c,v
>retrieving revision 1.15
>diff -u -r1.15 xbd_xenbus.c
>--- xbd_xenbus.c	1 Jan 2007 22:54:14 -0000	1.15
>+++ xbd_xenbus.c	10 Jan 2007 11:56:46 -0000
>@@ -388,12 +388,15 @@
> 		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosed);
> 		break;
> 	case XenbusStateConnected:
>-		s = splbio();
>+		/*
>+		 * note that xbd_backend_changed() can only be called by
>+		 * the xenbus thread.
>+		 */
>+
> 		if (sc->sc_backend_status == BLKIF_STATE_CONNECTED)
> 			/* already connected */
> 			return;
>-		sc->sc_backend_status = BLKIF_STATE_CONNECTED;
>-		splx(s);
>+
> 		xbd_connect(sc);
> 		sc->sc_shutdown = 0;
> 		hypervisor_enable_event(sc->sc_evtchn);
>@@ -409,8 +412,10 @@
> 
> 		bufq_alloc(&sc->sc_dksc.sc_bufq, "fcfs", 0);
> 		sc->sc_dksc.sc_flags |= DKF_INITED;
>-
> 		disk_attach(&sc->sc_dksc.sc_dkdev);
>+
>+		sc->sc_backend_status = BLKIF_STATE_CONNECTED;
>+
> 		/* try to read the disklabel */
> 		dk_getdisklabel(sc->sc_di, &sc->sc_dksc, 0 /* XXX ? */);
> 		format_bytes(buf, sizeof(buf), (uint64_t)sc->sc_dksc.sc_size *
>@@ -469,6 +474,8 @@
> 
> 	DPRINTF(("xbd_handler(%s)\n", sc->sc_dev.dv_xname));
> 
>+	if (__predict_false(sc->sc_backend_status != BLKIF_STATE_CONNECTED))
>+		return 0;
> again:
> 	resp_prod = sc->sc_ring.sring->rsp_prod;
> 	x86_lfence(); /* ensure we see replies up to resp_prod */
>Index: xencons.c
>===================================================================
>RCS file: /cvsroot/src/sys/arch/xen/xen/xencons.c,v
>retrieving revision 1.19
>diff -u -r1.19 xencons.c
>--- xencons.c	1 Oct 2006 19:28:43 -0000	1.19
>+++ xencons.c	10 Jan 2007 11:56:46 -0000
>@@ -387,6 +387,7 @@
> #define XNC_OUT (xencons_interface->out)
> 		cons = xencons_interface->out_cons;
> 		prod = xencons_interface->out_prod;
>+		x86_lfence();
> 		while (prod != cons + sizeof(xencons_interface->out)) {
> 			if (MASK_XENCONS_IDX(prod, XNC_OUT) <
> 			    MASK_XENCONS_IDX(cons, XNC_OUT)) {
>@@ -402,9 +403,9 @@
> 				break;
> 			prod = prod + len;
> 		}
>-		x86_lfence();
>+		x86_sfence();
> 		xencons_interface->out_prod = prod;
>-		x86_lfence();
>+		x86_sfence();
> 		hypervisor_notify_via_evtchn(xen_start_info.console_evtchn);
> #undef XNC_OUT
> #else /* XEN3 */
>@@ -482,12 +483,13 @@
> 			cons = xencons_interface->in_cons;
> 			prod = xencons_interface->in_prod;
> 			x86_lfence();
>-		} else
>+		} else {
> 			cons += len;
>+			x86_sfence();
>+			xencons_interface->in_cons = cons;
>+			x86_sfence();
>+		}
> 	}
>-	x86_lfence();
>-	xencons_interface->in_cons = cons;
>-	x86_lfence();
> 	hypervisor_notify_via_evtchn(xen_start_info.console_evtchn);
> 	splx(s);
> 	return 1;
>  
>