Re: pae MP on cherry-xenmp

On Tue, 30 Aug 2011, Jeff Rizzo wrote:

I did fairly quickly have the same panic Michael did (note that this is in a kernel including the per-cpu MMU queue commit you made about 45m ago):

panic: kernel diagnostic assertion "gnt_entries[last_gnt_entry] == XENGNT_NO_ENTRY" failed: file "/Users/riz/Documents/code/netbsd/xenmpsrc/sys/arch/xen/xen/xengnt.c", line 208

This appeared to be related to missing MP locking, so I added a mutex for this code and was able to run for about an hour before the VM hung. The hang was preceded by a message from sys/arch/xen/xen/evtchn.c. The message was something like "sys/arch/xen/xen/evtchn.c: handler xen_timer_handler didn't lower ipl 8 7". I couldn't spot anything that might change the ipl in that function when I looked last night, but looking at it now, there are a couple of things that might change the ipl. One is the mutex used there, since nested mutexes may not lower the ipl when the inner mutex_exit() is called. The other may be that the hardclock() call does something with the ipl, but I haven't looked at that yet. And looking at the mutex, it's using IPL_HIGH, which looks to be 8, and the event function is supposed to be scheduled at IPL_CLOCK, which is IPL_SCHED and appears to be 7. So it looks like that mutex may be the culprit.

The patch I was using (in addition to the previous patch for the CR3 fix):

Index: sys/arch/xen/xen/xengnt.c
RCS file: /cvsroot/src/sys/arch/xen/xen/xengnt.c,v
retrieving revision 1.18.2.1
diff -u -p -r1.18.2.1 xengnt.c
--- sys/arch/xen/xen/xengnt.c   23 Jun 2011 14:19:50 -0000
+++ sys/arch/xen/xen/xengnt.c   31 Aug 2011 01:49:53 -0000
@@ -35,6 +35,7 @@ __KERNEL_RCSID(0, "$NetBSD: xengnt.c,v 1
 #include <sys/queue.h>
 #include <sys/extent.h>
 #include <sys/kernel.h>
+#include <sys/mutex.h>
 #include <uvm/uvm.h>

 #include <xen/hypervisor.h>
@@ -62,6 +63,8 @@ int last_gnt_entry;
 /* empty entry in the list */
 #define XENGNT_NO_ENTRY 0xffffffff

+kmutex_t       gnt_mutex;
 /* VM address of the grant table */
 grant_entry_t *grant_table;

@@ -101,6 +104,7 @@ xengnt_init(void)
                gnt_entries[i] = XENGNT_NO_ENTRY;

        last_gnt_entry = 0;
+       mutex_init(&gnt_mutex, MUTEX_DEFAULT, IPL_VM);

@@ -192,13 +196,13 @@ static grant_ref_t
        grant_ref_t entry;
-       int s = splvm();
        static struct timeval xengnt_nonmemtime;
        static const struct timeval xengnt_nonmemintvl = {5,0};

+       mutex_enter(&gnt_mutex);
        if (last_gnt_entry == 0) {
                if (xengnt_more_entries()) {
-                       splx(s);
+                       mutex_exit(&gnt_mutex);
                        if (ratecheck(&xengnt_nonmemtime, &xengnt_nonmemintvl))
                                printf("xengnt_get_entry: out of grant "
                                    "table entries\n");
@@ -209,7 +213,7 @@ xengnt_get_entry(void)
        entry = gnt_entries[last_gnt_entry];
        gnt_entries[last_gnt_entry] = XENGNT_NO_ENTRY;
-       splx(s);
+       mutex_exit(&gnt_mutex);
        KASSERT(entry != XENGNT_NO_ENTRY);
        KASSERT(last_gnt_entry >= 0);
        KASSERT(last_gnt_entry <= gnt_max_grant_frames * 
@@ -222,13 +226,13 @@ xengnt_get_entry(void)
 static void
 xengnt_free_entry(grant_ref_t entry)
-       int s = splvm();
+       mutex_enter(&gnt_mutex);
        KASSERT(gnt_entries[last_gnt_entry] == XENGNT_NO_ENTRY);
        KASSERT(last_gnt_entry >= 0);
        KASSERT(last_gnt_entry <= gnt_max_grant_frames * 
        gnt_entries[last_gnt_entry] = entry;
-       splx(s);
+       mutex_exit(&gnt_mutex);


