Subject: kern/17961: softdep block allocation failure of nearly full disk
To: None <gnats-bugs@gnats.netbsd.org>
From: Gregory McGarry <g.mcgarry@netbsd.org>
List: netbsd-bugs
Date: 08/17/2002 12:09:25
>Number:         17961
>Category:       kern
>Synopsis:       softdep block allocation failure on nearly full disk
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    kern-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Fri Aug 16 17:10:00 PDT 2002
>Closed-Date:
>Last-Modified:
>Originator:     Gregory McGarry
>Release:        NetBSD-current
>Organization:
	
>Environment:
	

NetBSD 1.6 and -current

>Description:

Here is the commit log that appeared in FreeBSD:

"This patch fixes a long standing complaint with soft updates in
which small and/or nearly full filesystems would fail with `file
system full' messages when trying to replace a number of existing
files (for example during a system installation). When the allocation
routines are about to fail with a file system full condition, they
make a call to softdep_request_cleanup() which attempts to accelerate
the flushing of pending deletion requests in an effort to free up
space. In the face of filesystem I/O requests that exceed the
available disk transfer capacity, the cleanup request could take
an unbounded amount of time. Thus, the softdep_request_cleanup()
routine will only try for tickdelay seconds (default 2 seconds)
before giving up and returning a filesystem full error. Under typical
conditions, the softdep_request_cleanup() routine is able to free
up space in under fifty milliseconds."

Revisions in the FreeBSD CVS repository are (2002/01/22):

ffs_alloc.c	1.86
ffs_extern.h	1.44
ffs_softdep.c	1.105

>How-To-Repeat:

Fill the disk.  Delete some files.  Write to disk.  Still fails.

>Fix:

Fair bit of beating required to get working, since the NetBSD
syncer is different to the FreeBSD one.  Mostly works, but still
fails in some pathological cases.

Index: ffs_alloc.c
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.55
diff -u -b -p -r1.55 ffs_alloc.c
--- ffs_alloc.c	2002/05/14 02:46:22	1.55
+++ ffs_alloc.c	2002/08/16 23:55:41
@@ -116,7 +116,7 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
 {
 	struct fs *fs = ip->i_fs;
 	ufs_daddr_t bno;
-	int cg;
+	int cg, reclaimed;
 #ifdef QUOTA
 	int error;
 #endif
@@ -151,6 +151,8 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
 	if (cred == NOCRED)
 		panic("ffs_alloc: missing credential\n");
 #endif /* DIAGNOSTIC */
+	reclaimed = 0;
+retry:
 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 		goto nospace;
 	if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
@@ -180,6 +182,12 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
 	(void) chkdq(ip, (long)-btodb(size), cred, FORCE);
 #endif
 nospace:
+
+	if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
+		reclaimed = 1;
+		softdep_request_cleanup(fs, ITOV(ip));
+		goto retry;
+	}	
 	ffs_fserr(fs, cred->cr_uid, "file system full");
 	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
 	return (ENOSPC);
@@ -205,7 +213,7 @@ ffs_realloccg(ip, lbprev, bpref, osize, 
 {
 	struct fs *fs = ip->i_fs;
 	struct buf *bp;
-	int cg, request, error;
+	int cg, request, error, reclaimed;
 	ufs_daddr_t bprev, bno;
 
 #ifdef UVM_PAGE_TRKOWN
@@ -238,6 +246,8 @@ ffs_realloccg(ip, lbprev, bpref, osize, 
 	if (cred == NOCRED)
 		panic("ffs_realloccg: missing credential\n");
 #endif /* DIAGNOSTIC */
+	reclaimed = 0;
+retry:
 	if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
 		goto nospace;
 	if ((bprev = ufs_rw32(ip->i_ffs_db[lbprev], UFS_FSNEEDSWAP(fs))) == 0) {
@@ -376,6 +386,12 @@ nospace:
 	/*
 	 * no space available
 	 */
+	if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
+		reclaimed = 1;
+		softdep_request_cleanup(fs, ITOV(ip));
+		goto retry;
+	}
+
 	ffs_fserr(fs, cred->cr_uid, "file system full");
 	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
 	return (ENOSPC);
Index: ffs_extern.h
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_extern.h,v
retrieving revision 1.22
diff -u -b -p -r1.22 ffs_extern.h
--- ffs_extern.h	2002/05/05 17:00:06	1.22
+++ ffs_extern.h	2002/08/16 23:55:42
@@ -156,6 +156,7 @@ int	softdep_flushfiles __P((struct mount
 void	softdep_update_inodeblock __P((struct inode *, struct buf *, int));
 void	softdep_load_inodeblock __P((struct inode *));
 void	softdep_freefile __P((void *));
+int	softdep_request_cleanup(struct fs *, struct vnode *);
 void	softdep_setup_freeblocks __P((struct inode *, off_t));
 void	softdep_setup_inomapdep __P((struct buf *, struct inode *, ino_t));
 void	softdep_setup_blkmapdep __P((struct buf *, struct fs *, ufs_daddr_t));
Index: ffs_softdep.c
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.33
diff -u -b -p -r1.33 ffs_softdep.c
--- ffs_softdep.c	2002/07/05 13:49:26	1.33
+++ ffs_softdep.c	2002/08/16 23:55:43
@@ -179,7 +179,9 @@ static	int inodedep_lookup __P((struct f
 static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
 	    struct pagedep **));
 static	void pause_timer __P((void *));
+static	void cleanup_timer __P((void *));
 static	int request_cleanup __P((int, int));
+static  int process_worklist_item __P((struct mount *, int));
 static	void add_to_worklist __P((struct worklist *));
 static	struct buf *softdep_setup_pagecache __P((struct inode *, ufs_lbn_t,
 						 long));
@@ -553,11 +555,15 @@ static int max_softdeps;	/* maximum numb
 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
 static int proc_waiting;	/* tracks whether we have a timeout posted */
 static struct callout pause_timer_ch = CALLOUT_INITIALIZER;
+static struct callout cleanup_timer_ch = CALLOUT_INITIALIZER;
+static int cleanup_timeout = 0;
 static struct proc *filesys_syncer; /* proc of filesystem syncer process */
 static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
 #define FLUSH_INODES	1
 static int req_clear_remove;	/* syncer process flush some freeblks */
 #define FLUSH_REMOVE	2
+#define FLUSH_REMOVE_WAIT	3
+
 /*
  * runtime statistics
  */
@@ -622,9 +628,9 @@ softdep_process_worklist(matchmnt)
 {
 	struct proc *p = CURPROC;
 	struct worklist *wk;
-	struct fs *matchfs;
 	int matchcnt;
 
+#if 0
 	/*
 	 * First process any items on the delayed-free queue.
 	 */
@@ -632,6 +638,7 @@ softdep_process_worklist(matchmnt)
 	ACQUIRE_LOCK(&lk);
 	softdep_freequeue_process();
 	FREE_LOCK(&lk);
+#endif
 
 	/*
 	 * Record the process identifier of our caller so that we can give
@@ -639,17 +646,19 @@ softdep_process_worklist(matchmnt)
 	 */
 	filesys_syncer = p;
 	matchcnt = 0;
-	matchfs = NULL;
-	if (matchmnt != NULL)
-		matchfs = VFSTOUFS(matchmnt)->um_fs;
+
 	/*
 	 * There is no danger of having multiple processes run this
 	 * code. It is single threaded solely so that softdep_flushfiles
 	 * (below) can get an accurate count of the number of items
 	 * related to its mount point that are in the list.
 	 */
-	if (softdep_worklist_busy && matchmnt == NULL)
+	if (matchmnt == NULL) {
+		if (softdep_worklist_busy < 0)
 		return (-1);
+		softdep_worklist_busy += 1;
+	}
+
 	/*
 	 * If requested, try removing inode or removal dependencies.
 	 */
@@ -663,8 +672,81 @@ softdep_process_worklist(matchmnt)
 		req_clear_remove = 0;
 		wakeup(&proc_waiting);
 	}
-	ACQUIRE_LOCK(&lk);
+
 	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
+		matchcnt += process_worklist_item(matchmnt, 0);
+
+		if (softdep_worklist_busy && matchmnt == NULL) {
+			matchcnt = -1;
+			break;
+		}
+
+		/*
+		 * If requested, try removing inode or removal dependencies.
+		 */
+		if (req_clear_inodedeps) {
+			clear_inodedeps(p);
+			req_clear_inodedeps = 0;
+			wakeup(&proc_waiting);
+		}
+		if (req_clear_remove) {
+			clear_remove(p);
+			req_clear_remove = 0;
+			wakeup(&proc_waiting);
+		}
+
+		/*
+		 * Process any new items on the delayed-free queue.
+		 */
+		ACQUIRE_LOCK(&lk);
+		softdep_freequeue_process();
+		FREE_LOCK(&lk);
+	}
+	if (matchmnt == NULL)
+		softdep_worklist_busy -= 1;
+
+	return (matchcnt);
+}
+
+
+/*
+ * Process one item on the worklist.
+ */
+static int
+process_worklist_item(matchmnt, flags)
+	struct mount *matchmnt;
+	int flags;
+{
+	struct worklist *wk;
+	struct dirrem *dirrem;
+	struct vnode *vp;
+	struct fs *matchfs;
+	int matchcnt = -1;
+
+	matchfs = NULL;
+	if (matchmnt != NULL)
+		matchfs = VFSTOUFS(matchmnt)->um_fs;
+
+	ACQUIRE_LOCK(&lk);
+       /*
+         * Normally we just process each item on the worklist in order.
+         * However, if we are in a situation where we cannot lock any
+         * inodes, we have to skip over any dirrem requests whose
+         * vnodes are resident and locked.
+         */
+        LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
+                if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
+                        break;
+                dirrem = WK_DIRREM(wk);
+                vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
+                    dirrem->dm_oldinum);
+                if (vp == NULL || !VOP_ISLOCKED(vp))
+                        break;
+        }
+	if (wk == 0) {
+		FREE_LOCK(&lk);
+		return (-1);
+	}
 		WORKLIST_REMOVE(wk);
 		FREE_LOCK(&lk);
 		switch (wk->wk_type) {
@@ -702,33 +784,10 @@ softdep_process_worklist(matchmnt)
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
-		if (softdep_worklist_busy && matchmnt == NULL)
-			return (-1);
-		/*
-		 * If requested, try removing inode or removal dependencies.
-		 */
-		if (req_clear_inodedeps) {
-			clear_inodedeps(p);
-			req_clear_inodedeps = 0;
-			wakeup(&proc_waiting);
-		}
-		if (req_clear_remove) {
-			clear_remove(p);
-			req_clear_remove = 0;
-			wakeup(&proc_waiting);
-		}
-
-		/*
-		 * Process any new items on the delayed-free queue.
-		 */
-
-		ACQUIRE_LOCK(&lk);
-		softdep_freequeue_process();
-	}
-	FREE_LOCK(&lk);
 	return (matchcnt);
 }
 
+
 /*
  * Move dependencies from one buffer to another.
  */
@@ -771,7 +830,7 @@ softdep_flushfiles(oldmnt, flags, p)
 	 */
 	while (softdep_worklist_busy)
 		tsleep(&lbolt, PRIBIO, "softflush", 0);
-	softdep_worklist_busy = 1;
+	softdep_worklist_busy = -1;
 	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
 		softdep_worklist_busy = 0;
 		return (error);
@@ -4891,6 +4950,50 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
 }
 
 /*
+ * Called by the allocation routines when they are about to fail
+ * in the hope that we can free up some disk space.
+ * 
+ * First check to see if the work list has anything on it. If it has,
+ * clean up entries until we successfully free some space. Because this
+ * process holds inodes locked, we cannot handle any remove requests
+ * that might block on a locked inode as that could lead to deadlock.
+ * If the worklist yields no free space, encourage the syncer daemon
+ * to help us. In no event will we try for longer than tickdelay seconds.
+ */
+int
+softdep_request_cleanup(fs, vp)
+	struct fs *fs;
+	struct vnode *vp;
+{
+	long needed;
+
+	cleanup_timeout = 0;
+	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
+	if (VOP_UPDATE(vp, NULL, NULL, UPDATE_WAIT) != 0)
+		return (0);
+	callout_reset(&cleanup_timer_ch, tickdelay, cleanup_timer, NULL);
+	while (fs->fs_pendingblocks > 0 &&
+	    fs->fs_cstotal.cs_nbfree <= needed) {
+		if (cleanup_timeout != 0)
+			return (0);
+		if ((LIST_FIRST(&softdep_workitem_pending) != NULL) &&
+		    (softdep_process_worklist(NULL) != -1))
+			continue;
+		request_cleanup(FLUSH_REMOVE_WAIT, 0);
+	}
+	callout_stop(&cleanup_timer_ch);
+	return (1);
+}
+
+void
+cleanup_timer(arg)
+	void *arg;
+{
+
+	cleanup_timeout = 1;
+}
+
+/*
  * A large burst of file addition or deletion activity can drive the
  * memory load excessively high. Therefore we deliberately slow things
  * down and speed up the I/O processing if we find ourselves with too
@@ -4910,6 +5013,13 @@ request_cleanup(resource, islocked)
 	if (p == filesys_syncer)
 		return (0);
 	/*
+	 * Next, we attempt to speed up the syncer process. If that
+	 * is successful, then we allow the process to continue.
+	 */
+	if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT)
+		return (0);
+
+	/*
 	 * If we are resource constrained on inode dependencies, try
 	 * flushing some dirty inodes. Otherwise, we are constrained
 	 * by file deletions, so try accelerating flushes of directories
@@ -4927,11 +5037,14 @@ request_cleanup(resource, islocked)
 		break;
 
 	case FLUSH_REMOVE:
+	case FLUSH_REMOVE_WAIT:
 		stat_blk_limit_push += 1;
 		req_clear_remove = 1;
 		break;
 
 	default:
+		if (islocked)
+			FREE_LOCK(&lk);
 		panic("request_cleanup: unknown type");
 	}
 	/*
>Release-Note:
>Audit-Trail:
>Unformatted: