Subject: kern/19288: patches for softdep problems
To: None <gnats-bugs@gnats.netbsd.org>
From: None <stephen@degler.net>
List: netbsd-bugs
Date: 12/04/2002 23:10:45
>Number:         19288
>Category:       kern
>Synopsis:       patches for softdep problems
>Confidential:   no
>Severity:       serious
>Priority:       high
>Responsible:    kern-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Wed Dec 04 20:11:00 PST 2002
>Closed-Date:
>Last-Modified:
>Originator:     Stephen Degler
>Release:        NetBSD 1.6K
>Organization:
Very little, at best.
	
>Environment:
	
	
System: NetBSD kashmir.degler.net 1.6K NetBSD 1.6K (KASHMIR-$Revision: 1.500 $) #1: Fri Sep 27 00:21:32 EDT 2002 sdegler@bauhaus.degler.net:/vol1/NetBSD/kernels/KASHMIR i386
Architecture: i386
Machine: i386
>Description:

While stress testing softdeps on ffs with dbench, I encountered a
series of problems.  Dbench can be found at ftp://samba.org/pub/tridge/dbench
Problems encountered:

1) After running dbench with 64 children or more, sub processes are
hung in disk wait state, deadlocked on busy buffers.  This is due to
softdep_flush_indir.  I'm not exactly sure what's wrong with this
routine, but trivial efforts to fix it did not change the problem.  I
ended up just removing this routine and arranging for
softdep_setup_allocindir_page to sleep until a reasonable amount of
bufcache was available.

2) After fixing #1 I got a panic from softdep_allocindir_page, because
it called softdep_setup_pagecache with the lock held, but
softdep_setup_pagecache called pool_get with PR_WAITOK and then was
put to sleep due to memory exhaustion.  The next process that came in
found the softdep lock held and panicked.  Reordering the code fixed this.

3) Similar to #2 but in softdep_setup_allocdirect.

Finally, the code still seems dangerous, calling pool_get(xxx,PR_WAITOK)
with splbio held is going to result in bad things when memory is low.

>How-To-Repeat:
	build a current kernel with SOFTDEPS

	Running the following script on a filesystem mounted with softdeps
	should trigger the problems:
	#!/bin/ksh
	ulimit -p 1024
	ulimit -n 1024

	for i in 256 1 2 4 8 16 32 64 128
	do
	  dbench $i
	done

	I was using a pc with 1Gb ram.
	
>Fix:

Index: ffs_softdep.c
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.35
diff -u -r1.35 ffs_softdep.c
--- ffs_softdep.c	2002/09/27 15:38:04	1.35
+++ ffs_softdep.c	2002/12/05 03:29:58
@@ -57,7 +57,7 @@
 
 #include <uvm/uvm.h>
 struct pool sdpcpool;
-u_int softdep_lockedbufs;
+volatile u_int softdep_lockedbufs;
 
 /*
  * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
@@ -192,7 +192,6 @@
 #endif
 void softdep_pageiodone __P((struct buf *));
 void softdep_flush_vnode __P((struct vnode *, ufs_lbn_t));
-static void softdep_flush_indir __P((struct vnode *));
 
 /*
  * Exported softdep operations.
@@ -244,6 +243,8 @@
 static struct lockit {
 	int	lkt_spl;
 	volatile pid_t	lkt_held;
+        void    *lkt_addr;
+        void    *lkt_addr2;
 } lk = { 0, -1 };
 static int lockcnt;
 
@@ -263,12 +264,15 @@
 {
 	if (lk->lkt_held != -1) {
 		if (lk->lkt_held == CURPROC_PID)
-			panic("softdep_lock: locking against myself");
+		  panic("softdep_lock: locking against myself: prev caller: %p %p",lk->lkt_addr,lk->lkt_addr2);
 		else
-			panic("softdep_lock: lock held by %d", lk->lkt_held);
+		  panic("softdep_lock: lock held by %d: prev caller: %p %p",
+			lk->lkt_held,lk->lkt_addr,lk->lkt_addr2);
 	}
 	lk->lkt_spl = splbio();
 	lk->lkt_held = CURPROC_PID;
+	lk->lkt_addr = __builtin_return_address(0);
+	lk->lkt_addr2 = __builtin_return_address(1);
 	lockcnt++;
 }
 
@@ -290,13 +294,15 @@
 {
 	if (lk->lkt_held != -1) {
 		if (lk->lkt_held == CURPROC_PID)
-			panic("softdep_lock_interlocked: locking against self");
+		        panic("softdep_lock_interlocked: locking against myself: prev caller: %p %p",lk->lkt_addr,lk->lkt_addr2);
 		else
-			panic("softdep_lock_interlocked: lock held by %d",
-			    lk->lkt_held);
+		        panic("softdep_lock_interlocked: lock held by %d: prev caller: %p %p",
+			    lk->lkt_held,lk->lkt_addr,lk->lkt_addr2);
 	}
 	lk->lkt_spl = s;
 	lk->lkt_held = CURPROC_PID;
+	lk->lkt_addr = __builtin_return_address(0);
+	lk->lkt_addr2 = __builtin_return_address(1);
 	lockcnt++;
 }
 
@@ -1408,6 +1414,7 @@
 	}
 	LIST_REMOVE(newblk, nb_hash);
 	pool_put(&newblk_pool, newblk);
+	FREE_LOCK(&lk);
 
 	/*
 	 * If we were not passed a bp to attach the dep to,
@@ -1423,6 +1430,7 @@
 		UVMHIST_LOG(ubchist, "bp = %p, size = %d -> %d",
 		    bp, (int)oldsize, (int)newsize, 0);
 	}
+	ACQUIRE_LOCK(&lk);
 	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 	if (lbn >= NDADDR) {
 		/* allocating an indirect block */
@@ -1661,17 +1669,18 @@
 
 	/*
 	 * If we are already holding "many" buffers busy (as the safe copies
-	 * of indirect blocks) flush the dependency for one of those before
-	 * potentially tying up more.  otherwise we could fill the
-	 * buffer cache with busy buffers and deadlock.
-	 * XXXUBC I'm sure there's a better way to deal with this.
+	 * of indirect blocks) wait for them to complete.  This avoids a
+	 * buffer cache deadlock
 	 */
 
-	while (softdep_lockedbufs > nbuf >> 2) {
-		softdep_flush_indir(ITOV(ip));
+	while (softdep_lockedbufs > (nbuf >> 2) ) {
+	  (void) tsleep((void *)&softdep_lockedbufs, PRIBIO+1, "softdbufs", 0);		
 	}
 
 	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
+	if (nbp == NULL) {
+		nbp = softdep_setup_pagecache(ip, lbn, ip->i_fs->fs_bsize);
+	}
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * If we are allocating a directory page, then we must
@@ -1681,9 +1690,6 @@
 	if ((ip->i_ffs_mode & IFMT) == IFDIR &&
 	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
 		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
-	if (nbp == NULL) {
-		nbp = softdep_setup_pagecache(ip, lbn, ip->i_fs->fs_bsize);
-	}
 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
 	FREE_LOCK(&lk);
 	setup_allocindir_phase2(bp, ip, aip);
@@ -1796,6 +1802,9 @@
 				brelse(newindirdep->ir_savebp);
 				KDASSERT(softdep_lockedbufs != 0);
 				softdep_lockedbufs--;
+				if ( softdep_lockedbufs == nbuf >> 2 ) {
+				  wakeup((void *)&softdep_lockedbufs);
+				}
 			}
 			WORKITEM_FREE(newindirdep, D_INDIRDEP);
 		}
@@ -2455,6 +2464,9 @@
 	brelse(bp);
 	KDASSERT(softdep_lockedbufs != 0);
 	softdep_lockedbufs--;
+	if ( softdep_lockedbufs == nbuf >> 2 ) {
+	  wakeup((void *)&softdep_lockedbufs);
+	}
 	return (allerror);
 }
 
@@ -3288,7 +3300,9 @@
 				brelse(indirdep->ir_savebp);
 				KDASSERT(softdep_lockedbufs != 0);
 				softdep_lockedbufs--;
-
+				if ( softdep_lockedbufs == nbuf >> 2 ) {
+				  wakeup((void *)&softdep_lockedbufs);
+				}
 				/* inline expand WORKLIST_REMOVE(wk); */
 				wk->wk_state &= ~ONWORKLIST;
 				LIST_REMOVE(wk, wk_list);
@@ -5283,7 +5297,7 @@
 {
 	struct vnode *vp = ITOV(ip);
 	struct buf *bp;
-	int s;
+	/* int s; */
 	UVMHIST_FUNC("softdep_setup_pagecache"); UVMHIST_CALLED(ubchist);
 
 	/*
@@ -5294,9 +5308,9 @@
 
 	bp = softdep_lookup_pcbp(vp, lbn);
 	if (bp == NULL) {
-		s = splbio();
+	        /* s = splbio(); */
 		bp = pool_get(&sdpcpool, PR_WAITOK);
-		splx(s);
+	        /* splx(s); */
 
 		bp->b_vp = vp;
 		bp->b_lblkno = lbn;
@@ -5358,36 +5372,6 @@
 
 	return (NULL);
 }
-
-/*
- * Flush some dependent page cache data for any vnode *except*
- * the one specified.
- * XXXUBC this is a horrible hack and it's probably not too hard to deadlock
- * even with this, but it's better than nothing.
- */
-
-static void
-softdep_flush_indir(vp)
-	struct vnode *vp;
-{
-	struct buf *bp;
-	int i;
-
-	for (i = 0; i < PCBPHASHSIZE; i++) {
-		LIST_FOREACH(bp, &pcbphashhead[i], b_hash) {
-			if (bp->b_vp == vp ||
-			    LIST_FIRST(&bp->b_dep)->wk_type != D_ALLOCINDIR) {
-				continue;
-			}
-
-			VOP_FSYNC(bp->b_vp, curproc->p_ucred, FSYNC_WAIT, 0, 0,
-				  curproc);
-			return;
-		}
-	}
-	printf("softdep_flush_indir: nothing to flush?\n");
-}
-
 
 static struct buf *
 softdep_lookup_pcbp(vp, lbn)

>Release-Note:
>Audit-Trail:
>Unformatted: