Subject: Re: More on sorted I/O requests
To: None <tech-kern@netbsd.org>
From: Gregory McGarry <g.mcgarry@ieee.org>
List: tech-kern
Date: 06/29/2002 18:23:20
John Franklin wrote:
 
> On Friday, June 28, 2002, at 01:26 AM, Sean Davis wrote:
> 
> > I noticed this yesterday as well, when removing a core dump so that I 
> > could
> > savecore another without filling up the partition. I had to sync and 
> > then
> > savecore to make it work. Are the performance improvements from this
> > signifigant enough to justify the annoyance of situations like that?
> >
> > On Fri, Jun 28, 2002 at 05:01:42PM +1200, Gregory McGarry wrote:
> >> More to consider wrt sorted I/O requests.  If I fill a partition
> >> by creating a large file, delete the file and attempt to create
> >> another large file in its place, the delayed deletion operation
> >> causes the second creation to fail.
> 
> I've noticed this before.  It's not an I/O order issue.  I had a root 
> partition that was close to full and copying a new kernel into it would 
> fail the first time but succeed the second (since the old kernel had 
> been deleted by the previous failed copy.)  Turning off softdeps "fixed" 
> the problem.

Here's a patch I cooked up to address this problem.  It is based on the
same change done to the FreeBSD softdep code.  I also noticed there
is code in FreeBSD to stop softdeps killing the buffer cache.

Those familiar with softdeps can review these changes.  However,
if the syncer is already running flat out, nothing seems to help.

	-- Gregory McGarry <g.mcgarry@ieee.org>

Index: ffs_alloc.c
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.55
diff -u -b -p -r1.55 ffs_alloc.c
--- ffs_alloc.c	2002/05/14 02:46:22	1.55
+++ ffs_alloc.c	2002/06/29 06:16:15
@@ -116,7 +116,7 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
 {
 	struct fs *fs = ip->i_fs;
 	ufs_daddr_t bno;
-	int cg;
+	int cg, reclaimed;
 #ifdef QUOTA
 	int error;
 #endif
@@ -151,6 +151,8 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
 	if (cred == NOCRED)
 		panic("ffs_alloc: missing credential\n");
 #endif /* DIAGNOSTIC */
+	reclaimed = 0;
+retry:
 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 		goto nospace;
 	if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
@@ -180,6 +182,12 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
 	(void) chkdq(ip, (long)-btodb(size), cred, FORCE);
 #endif
 nospace:
+
+	if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
+		reclaimed = 1;
+		softdep_request_cleanup(fs, ITOV(ip));
+		goto retry;
+	}	
 	ffs_fserr(fs, cred->cr_uid, "file system full");
 	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
 	return (ENOSPC);
@@ -205,7 +213,7 @@ ffs_realloccg(ip, lbprev, bpref, osize, 
 {
 	struct fs *fs = ip->i_fs;
 	struct buf *bp;
-	int cg, request, error;
+	int cg, request, error, reclaimed;
 	ufs_daddr_t bprev, bno;
 
 #ifdef UVM_PAGE_TRKOWN
@@ -238,6 +246,8 @@ ffs_realloccg(ip, lbprev, bpref, osize, 
 	if (cred == NOCRED)
 		panic("ffs_realloccg: missing credential\n");
 #endif /* DIAGNOSTIC */
+	reclaimed = 0;
+retry:
 	if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
 		goto nospace;
 	if ((bprev = ufs_rw32(ip->i_ffs_db[lbprev], UFS_FSNEEDSWAP(fs))) == 0) {
@@ -376,6 +386,12 @@ nospace:
 	/*
 	 * no space available
 	 */
+	if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
+		reclaimed = 1;
+		softdep_request_cleanup(fs, ITOV(ip));
+		goto retry;
+	}
+
 	ffs_fserr(fs, cred->cr_uid, "file system full");
 	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
 	return (ENOSPC);
Index: ffs_extern.h
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_extern.h,v
retrieving revision 1.22
diff -u -b -p -r1.22 ffs_extern.h
--- ffs_extern.h	2002/05/05 17:00:06	1.22
+++ ffs_extern.h	2002/06/29 06:16:15
@@ -156,6 +156,7 @@ int	softdep_flushfiles __P((struct mount
 void	softdep_update_inodeblock __P((struct inode *, struct buf *, int));
 void	softdep_load_inodeblock __P((struct inode *));
 void	softdep_freefile __P((void *));
+int	softdep_request_cleanup(struct fs *, struct vnode *);
 void	softdep_setup_freeblocks __P((struct inode *, off_t));
 void	softdep_setup_inomapdep __P((struct buf *, struct inode *, ino_t));
 void	softdep_setup_blkmapdep __P((struct buf *, struct fs *, ufs_daddr_t));
Index: ffs_softdep.c
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.32
diff -u -b -p -r1.32 ffs_softdep.c
--- ffs_softdep.c	2002/06/18 20:24:31	1.32
+++ ffs_softdep.c	2002/06/29 06:16:16
@@ -179,7 +179,9 @@ static	int inodedep_lookup __P((struct f
 static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
 	    struct pagedep **));
 static	void pause_timer __P((void *));
+static	void cleanup_timer __P((void *));
 static	int request_cleanup __P((int, int));
+static  int process_worklist_item __P((struct mount *, int));
 static	void add_to_worklist __P((struct worklist *));
 static	struct buf *softdep_setup_pagecache __P((struct inode *, ufs_lbn_t,
 						 long));
@@ -553,11 +555,15 @@ static int max_softdeps;	/* maximum numb
 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
 static int proc_waiting;	/* tracks whether we have a timeout posted */
 static struct callout pause_timer_ch = CALLOUT_INITIALIZER;
+static struct callout cleanup_timer_ch = CALLOUT_INITIALIZER;
+static int cleanup_timeout = 0;
 static struct proc *filesys_syncer; /* proc of filesystem syncer process */
 static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
 #define FLUSH_INODES	1
 static int req_clear_remove;	/* syncer process flush some freeblks */
 #define FLUSH_REMOVE	2
+#define FLUSH_REMOVE_WAIT	3
+
 /*
  * runtime statistics
  */
@@ -622,9 +628,9 @@ softdep_process_worklist(matchmnt)
 {
 	struct proc *p = CURPROC;
 	struct worklist *wk;
-	struct fs *matchfs;
 	int matchcnt;
 
+#if 0
 	/*
 	 * First process any items on the delayed-free queue.
 	 */
@@ -632,6 +638,7 @@ softdep_process_worklist(matchmnt)
 	ACQUIRE_LOCK(&lk);
 	softdep_freequeue_process();
 	FREE_LOCK(&lk);
+#endif
 
 	/*
 	 * Record the process identifier of our caller so that we can give
@@ -639,17 +646,19 @@ softdep_process_worklist(matchmnt)
 	 */
 	filesys_syncer = p;
 	matchcnt = 0;
-	matchfs = NULL;
-	if (matchmnt != NULL)
-		matchfs = VFSTOUFS(matchmnt)->um_fs;
+
 	/*
 	 * There is no danger of having multiple processes run this
 	 * code. It is single threaded solely so that softdep_flushfiles
 	 * (below) can get an accurate count of the number of items
 	 * related to its mount point that are in the list.
 	 */
-	if (softdep_worklist_busy && matchmnt == NULL)
+	if (matchmnt == NULL) {
+		if (softdep_worklist_busy < 0)
 		return (-1);
+		softdep_worklist_busy += 1;
+	}
+
 	/*
 	 * If requested, try removing inode or removal dependencies.
 	 */
@@ -663,8 +672,81 @@ softdep_process_worklist(matchmnt)
 		req_clear_remove = 0;
 		wakeup(&proc_waiting);
 	}
-	ACQUIRE_LOCK(&lk);
+
 	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
+		matchcnt += process_worklist_item(matchmnt, 0);
+
+		if (softdep_worklist_busy && matchmnt == NULL) {
+			matchcnt = -1;
+			break;
+		}
+
+		/*
+		 * If requested, try removing inode or removal dependencies.
+		 */
+		if (req_clear_inodedeps) {
+			clear_inodedeps(p);
+			req_clear_inodedeps = 0;
+			wakeup(&proc_waiting);
+		}
+		if (req_clear_remove) {
+			clear_remove(p);
+			req_clear_remove = 0;
+			wakeup(&proc_waiting);
+		}
+
+		/*
+		 * Process any new items on the delayed-free queue.
+		 */
+		ACQUIRE_LOCK(&lk);
+		softdep_freequeue_process();
+		FREE_LOCK(&lk);
+	}
+	if (matchmnt == NULL)
+		softdep_worklist_busy -= 1;
+
+	return (matchcnt);
+}
+
+
+/*
+ * Process one item on the worklist.
+ */
+static int
+process_worklist_item(matchmnt, flags)
+	struct mount *matchmnt;
+	int flags;
+{
+	struct worklist *wk;
+	struct dirrem *dirrem;
+	struct vnode *vp;
+	struct fs *matchfs;
+	int matchcnt = -1;
+
+	matchfs = NULL;
+	if (matchmnt != NULL)
+		matchfs = VFSTOUFS(matchmnt)->um_fs;
+
+	ACQUIRE_LOCK(&lk);
+       /*
+         * Normally we just process each item on the worklist in order.
+         * However, if we are in a situation where we cannot lock any
+         * inodes, we have to skip over any dirrem requests whose
+         * vnodes are resident and locked.
+         */
+        LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
+                if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
+                        break;
+                dirrem = WK_DIRREM(wk);
+                vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
+                    dirrem->dm_oldinum);
+                if (vp == NULL || !VOP_ISLOCKED(vp))
+                        break;
+        }
+	if (wk == 0) {
+		FREE_LOCK(&lk);
+		return (-1);
+	}
 		WORKLIST_REMOVE(wk);
 		FREE_LOCK(&lk);
 		switch (wk->wk_type) {
@@ -702,33 +784,10 @@ softdep_process_worklist(matchmnt)
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
-		if (softdep_worklist_busy && matchmnt == NULL)
-			return (-1);
-		/*
-		 * If requested, try removing inode or removal dependencies.
-		 */
-		if (req_clear_inodedeps) {
-			clear_inodedeps(p);
-			req_clear_inodedeps = 0;
-			wakeup(&proc_waiting);
-		}
-		if (req_clear_remove) {
-			clear_remove(p);
-			req_clear_remove = 0;
-			wakeup(&proc_waiting);
-		}
-
-		/*
-		 * Process any new items on the delayed-free queue.
-		 */
-
-		ACQUIRE_LOCK(&lk);
-		softdep_freequeue_process();
-	}
-	FREE_LOCK(&lk);
 	return (matchcnt);
 }
 
+
 /*
  * Move dependencies from one buffer to another.
  */
@@ -771,7 +830,7 @@ softdep_flushfiles(oldmnt, flags, p)
 	 */
 	while (softdep_worklist_busy)
 		tsleep(&lbolt, PRIBIO, "softflush", 0);
-	softdep_worklist_busy = 1;
+	softdep_worklist_busy = -1;
 	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
 		softdep_worklist_busy = 0;
 		return (error);
@@ -4889,6 +4948,50 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
 }
 
 /*
+ * Called by the allocation routines when they are about to fail
+ * in the hope that we can free up some disk space.
+ * 
+ * First check to see if the work list has anything on it. If it has,
+ * clean up entries until we successfully free some space. Because this
+ * process holds inodes locked, we cannot handle any remove requests
+ * that might block on a locked inode as that could lead to deadlock.
+ * If the worklist yields no free space, encourage the syncer daemon
+ * to help us. In no event will we try for longer than tickdelay seconds.
+ */
+int
+softdep_request_cleanup(fs, vp)
+	struct fs *fs;
+	struct vnode *vp;
+{
+	long needed;
+
+	cleanup_timeout = 0;
+	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
+	if (VOP_UPDATE(vp, NULL, NULL, UPDATE_WAIT) != 0)
+		return (0);
+	callout_reset(&cleanup_timer_ch, tickdelay, cleanup_timer, NULL);
+	while (fs->fs_pendingblocks > 0 &&
+	    fs->fs_cstotal.cs_nbfree <= needed) {
+		if (cleanup_timeout != 0)
+			return (0);
+		if ((LIST_FIRST(&softdep_workitem_pending) != NULL) &&
+		    (softdep_process_worklist(NULL) != -1))
+			continue;
+		request_cleanup(FLUSH_REMOVE_WAIT, 0);
+	}
+	callout_stop(&cleanup_timer_ch);
+	return (1);
+}
+
+void
+cleanup_timer(arg)
+	void *arg;
+{
+
+	cleanup_timeout = 1;
+}
+
+/*
  * A large burst of file addition or deletion activity can drive the
  * memory load excessively high. Therefore we deliberately slow things
  * down and speed up the I/O processing if we find ourselves with too
@@ -4908,6 +5011,13 @@ request_cleanup(resource, islocked)
 	if (p == filesys_syncer)
 		return (0);
 	/*
+	 * Next, we attempt to speed up the syncer process. If that
+	 * is successful, then we allow the process to continue.
+	 */
+	if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT)
+		return (0);
+
+	/*
 	 * If we are resource constrained on inode dependencies, try
 	 * flushing some dirty inodes. Otherwise, we are constrained
 	 * by file deletions, so try accelerating flushes of directories
@@ -4925,11 +5035,14 @@ request_cleanup(resource, islocked)
 		break;
 
 	case FLUSH_REMOVE:
+	case FLUSH_REMOVE_WAIT:
 		stat_blk_limit_push += 1;
 		req_clear_remove = 1;
 		break;
 
 	default:
+		if (islocked)
+			FREE_LOCK(&lk);
 		panic("request_cleanup: unknown type");
 	}
 	/*