tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: nfs server lockup (again)



On Fri, Jun 17, 2011 at 11:45:11AM +0200, Manuel Bouyer wrote:
>[...]
> And now that our softclk are sleeping waiting for the lock held by the nfsd
> in kpause, the kpause will never be woken up and the kernel is locked.
> 
> Now the question is why this doesn't happen with the feb 2010 5.0_STABLE
> kernel. nfsd_use_loan is also set to 1 in this kernel.
> One thing that maybe is relevant because it defers more things
> to a softnet_lock callout is tcp_input.c 1.291.4.3.
> 
> I guess this can happen with any TCP application using page loan, not only
> nfsd.
> Any idea on how to properly fix this ?
> a workaround could be to use yield() in uvm_loan.c because it would not
> require a clock tick to wake up.
> I'm not sure if it's possible to drop the socket lock before unloaning the
> pages.
> But I wonder if this could be a more general issue with callouts.
> Maybe we should have one thread per callout, a la fast_softint,
> which is used when a callout needs to sleep ?

I opened kern/45093 for this.

Remplacing kpause() with yield() fixes the problem, but this has performances
issues (as one could expect); I notice the daily & security scripts are
running slower on my NFS server now.

After more analisys, I found the only path which shows this problem is
through soclose(). Other places looks like calling sodopendfreel()
without the socket lock held.

I'm not sure it's possible to drop the lock in the soclose() path to call
sodopendfree(), so I went with a kernel thread dedicated to handle the
so_pendfree list. This also has the win that we can call cv_wakeup()
when enqueing more work the thread, instead of looking for entry points where
it is safe to process so_pendfree, so the code is more clear.

I've tested the attached patch on a Xen (so UP) platform with NFS clients
and servers, running a LOCKDEBUG kernel.
I'm also now running a patched kernel on ftp.fr.netbsd.org (also with
LOCKDEBUG).
Please ignore the KASSERT(!mutex_owned()) in the patch; I added them there
so I could track more closely what's going on with locks but they
won't work on a !LOCKDEBUG kernel so they will have to go away.

Comments ?

-- 
Manuel Bouyer <bouyer%antioche.eu.org@localhost>
     NetBSD: 26 ans d'experience feront toujours la difference
--
Index: sys/socketvar.h
===================================================================
RCS file: /cvsroot/src/sys/sys/socketvar.h,v
retrieving revision 1.116.4.2
diff -u -r1.116.4.2 socketvar.h
--- sys/socketvar.h     4 Apr 2009 23:36:28 -0000       1.116.4.2
+++ sys/socketvar.h     22 Jun 2011 20:12:12 -0000
@@ -281,6 +281,7 @@
 int    sbwait(struct sockbuf *);
 int    sb_max_set(u_long);
 void   soinit(void);
+void   soinit1(void);
 void   soinit2(void);
 int    soabort(struct socket *);
 int    soaccept(struct socket *, struct mbuf *);
Index: kern/init_main.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_main.c,v
retrieving revision 1.371.2.5
diff -u -r1.371.2.5 init_main.c
--- kern/init_main.c    21 Dec 2010 22:21:42 -0000      1.371.2.5
+++ kern/init_main.c    22 Jun 2011 20:12:12 -0000
@@ -463,6 +463,10 @@
        /* Initialize interfaces. */
        ifinit1();
 
+       /* Initialize sockets thread(s) */
+       soinit1();
+
+
        /* Configure the system hardware.  This will enable interrupts. */
        configure();
 
Index: kern/uipc_socket.c
===================================================================
RCS file: /cvsroot/src/sys/kern/uipc_socket.c,v
retrieving revision 1.177.4.3
diff -u -r1.177.4.3 uipc_socket.c
--- kern/uipc_socket.c  3 May 2009 13:18:55 -0000       1.177.4.3
+++ kern/uipc_socket.c  22 Jun 2011 20:12:12 -0000
@@ -91,6 +91,7 @@
 #include <sys/kauth.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
+#include <sys/kthread.h>
 
 #include <uvm/uvm.h>
 
@@ -136,7 +137,7 @@
 #endif
 
 static kmutex_t so_pendfree_lock;
-static struct mbuf *so_pendfree;
+static struct mbuf *so_pendfree = NULL;
 
 #ifndef SOMAXKVA
 #define        SOMAXKVA (16 * 1024 * 1024)
@@ -147,31 +148,19 @@
 
 #define        SOCK_LOAN_CHUNK         65536
 
-static size_t sodopendfree(void);
-static size_t sodopendfreel(void);
+static void sopendfree_thread(void *);
+static kcondvar_t pendfree_thread_cv;
+static lwp_t *sopendfree_lwp;
 
 static vsize_t
 sokvareserve(struct socket *so, vsize_t len)
 {
        int error;
 
+       KASSERT(!mutex_owned(softnet_lock));
+       KASSERT(!mutex_owned(so->so_lock));
        mutex_enter(&so_pendfree_lock);
        while (socurkva + len > somaxkva) {
-               size_t freed;
-
-               /*
-                * try to do pendfree.
-                */
-
-               freed = sodopendfreel();
-
-               /*
-                * if some kva was freed, try again.
-                */
-
-               if (freed)
-                       continue;
-
                SOSEND_COUNTER_INCR(&sosend_kvalimit);
                error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
                if (error) {
@@ -202,6 +191,8 @@
 sokvaalloc(vsize_t len, struct socket *so)
 {
        vaddr_t lva;
+       KASSERT(!mutex_owned(softnet_lock));
+       KASSERT(!mutex_owned(so->so_lock));
 
        /*
         * reserve kva.
@@ -264,56 +255,45 @@
        sokvafree(sva, len);
 }
 
-static size_t
-sodopendfree(void)
-{
-       size_t rv;
-
-       if (__predict_true(so_pendfree == NULL))
-               return 0;
-
-       mutex_enter(&so_pendfree_lock);
-       rv = sodopendfreel();
-       mutex_exit(&so_pendfree_lock);
-
-       return rv;
-}
-
 /*
- * sodopendfreel: free mbufs on "pendfree" list.
+ * sopendfree_thread: free mbufs on "pendfree" list.
  * unlock and relock so_pendfree_lock when freeing mbufs.
- *
- * => called with so_pendfree_lock held.
  */
 
-static size_t
-sodopendfreel(void)
+static void
+sopendfree_thread(void *v)
 {
        struct mbuf *m, *next;
-       size_t rv = 0;
-
-       KASSERT(mutex_owned(&so_pendfree_lock));
+       size_t rv;
 
-       while (so_pendfree != NULL) {
-               m = so_pendfree;
-               so_pendfree = NULL;
-               mutex_exit(&so_pendfree_lock);
+       mutex_enter(&so_pendfree_lock);
 
-               for (; m != NULL; m = next) {
-                       next = m->m_next;
-                       KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0);
-                       KASSERT(m->m_ext.ext_refcnt == 0);
+       for (;;) {
+               rv = 0;
+               while (so_pendfree != NULL) {
+                       m = so_pendfree;
+                       so_pendfree = NULL;
+                       mutex_exit(&so_pendfree_lock);
+
+                       for (; m != NULL; m = next) {
+                               next = m->m_next;
+                               KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 
0);
+                               KASSERT(m->m_ext.ext_refcnt == 0);
+
+                               rv += m->m_ext.ext_size;
+                               sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
+                                   m->m_ext.ext_size);
+                               pool_cache_put(mb_cache, m);
+                       }
 
-                       rv += m->m_ext.ext_size;
-                       sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
-                           m->m_ext.ext_size);
-                       pool_cache_put(mb_cache, m);
+                       mutex_enter(&so_pendfree_lock);
                }
-
-               mutex_enter(&so_pendfree_lock);
+               if (rv)
+                       cv_broadcast(&socurkva_cv);
+               cv_wait(&pendfree_thread_cv, &so_pendfree_lock);
        }
-
-       return (rv);
+       panic("sopendfree_thread");
+       /* NOTREACHED */
 }
 
 void
@@ -332,7 +312,7 @@
        mutex_enter(&so_pendfree_lock);
        m->m_next = so_pendfree;
        so_pendfree = m;
-       cv_broadcast(&socurkva_cv);
+       cv_signal(&pendfree_thread_cv);
        mutex_exit(&so_pendfree_lock);
 }
 
@@ -402,7 +382,6 @@
        KASSERT(ce == &sokva_reclaimerentry);
        KASSERT(obj == NULL);
 
-       sodopendfree();
        if (!vm_map_starved_p(kernel_map)) {
                return CALLBACK_CHAIN_ABORT;
        }
@@ -414,20 +393,23 @@
 {
        struct mbuf *m;
 
+       KASSERT(!mutex_owned(softnet_lock));
+       KASSERT(!mutex_owned(so->so_lock));
        m = m_get(M_WAIT, type);
        MCLAIM(m, so->so_mowner);
        return m;
 }
 
 void
-soinit(void)
+soinit()
 {
-
        mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
        softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
        cv_init(&socurkva_cv, "sokva");
+       cv_init(&pendfree_thread_cv, "sopendfr");
        soinit2();
 
+
        /* Set the initial adjusted socket buffer size. */
        if (sb_max_set(sb_max))
                panic("bad initial sb_max value: %lu", sb_max);
@@ -436,6 +418,15 @@
            &sokva_reclaimerentry, NULL, sokva_reclaim_callback);
 }
 
+void
+soinit1(void)
+{
+       int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
+           sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree");
+       if (error)
+               panic("soinit1 %d", error);
+}
+
 /*
  * Socket operation routines.
  * These routines are called by the routines in
@@ -787,7 +778,6 @@
                error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
                    NULL, NULL, NULL, NULL);
        }
-       sodopendfree();
        return (error);
 }
 
@@ -819,7 +809,6 @@
        int             error, s, dontroute, atomic;
 
        p = l->l_proc;
-       sodopendfree();
        clen = 0;
 
        /*
@@ -1088,10 +1077,9 @@
        else
                flags = 0;
 
-       if ((flags & MSG_DONTWAIT) == 0)
-               sodopendfree();
-
        if (flags & MSG_OOB) {
+               KASSERT(!mutex_owned(softnet_lock));
+               KASSERT(!mutex_owned(so->so_lock));
                m = m_get(M_WAIT, MT_DATA);
                solock(so);
                error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,


Home | Main Index | Thread Index | Old Index