Subject: kern/17961: softdep block allocation failure on nearly full disk
To: None <gnats-bugs@gnats.netbsd.org>
From: Gregory McGarry <g.mcgarry@netbsd.org>
List: netbsd-bugs
Date: 08/17/2002 12:09:25
>Number: 17961
>Category: kern
>Synopsis: softdep block allocation failure on nearly full disk
>Confidential: no
>Severity: non-critical
>Priority: low
>Responsible: kern-bug-people
>State: open
>Class: sw-bug
>Submitter-Id: net
>Arrival-Date: Fri Aug 16 17:10:00 PDT 2002
>Closed-Date:
>Last-Modified:
>Originator: Gregory McGarry
>Release: NetBSD-current
>Organization:
>Environment:
NetBSD 1.6 and -current
>Description:
Here is the commit log that appeared in FreeBSD:
"This patch fixes a long standing complaint with soft updates in
which small and/or nearly full filesystems would fail with `file
system full' messages when trying to replace a number of existing
files (for example during a system installation). When the allocation
routines are about to fail with a file system full condition, they
make a call to softdep_request_cleanup() which attempts to accelerate
the flushing of pending deletion requests in an effort to free up
space. In the face of filesystem I/O requests that exceed the
available disk transfer capacity, the cleanup request could take
an unbounded amount of time. Thus, the softdep_request_cleanup()
routine will only try for tickdelay seconds (default 2 seconds)
before giving up and returning a filesystem full error. Under typical
conditions, the softdep_request_cleanup() routine is able to free
up space in under fifty milliseconds."
Revisions in the FreeBSD CVS repository are (2002/01/22):
ffs_alloc.c 1.86
ffs_extern.h 1.44
ffs_softdep.c 1.105
>How-To-Repeat:
Fill the disk. Delete some files. Write to the disk again. The write
still fails with `file system full'.
>Fix:
Porting the FreeBSD change required a fair amount of rework, since the
NetBSD syncer is different from the FreeBSD one. The patch below mostly
works, but still fails in some pathological cases.
Index: ffs_alloc.c
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.55
diff -u -b -p -r1.55 ffs_alloc.c
--- ffs_alloc.c 2002/05/14 02:46:22 1.55
+++ ffs_alloc.c 2002/08/16 23:55:41
@@ -116,7 +116,7 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
{
struct fs *fs = ip->i_fs;
ufs_daddr_t bno;
- int cg;
+ int cg, reclaimed;
#ifdef QUOTA
int error;
#endif
@@ -151,6 +151,8 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
if (cred == NOCRED)
panic("ffs_alloc: missing credential\n");
#endif /* DIAGNOSTIC */
+ reclaimed = 0;
+retry:
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
goto nospace;
if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
@@ -180,6 +182,12 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
(void) chkdq(ip, (long)-btodb(size), cred, FORCE);
#endif
nospace:
+
+ if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
+ reclaimed = 1;
+ softdep_request_cleanup(fs, ITOV(ip));
+ goto retry;
+ }
ffs_fserr(fs, cred->cr_uid, "file system full");
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
return (ENOSPC);
@@ -205,7 +213,7 @@ ffs_realloccg(ip, lbprev, bpref, osize,
{
struct fs *fs = ip->i_fs;
struct buf *bp;
- int cg, request, error;
+ int cg, request, error, reclaimed;
ufs_daddr_t bprev, bno;
#ifdef UVM_PAGE_TRKOWN
@@ -238,6 +246,8 @@ ffs_realloccg(ip, lbprev, bpref, osize,
if (cred == NOCRED)
panic("ffs_realloccg: missing credential\n");
#endif /* DIAGNOSTIC */
+ reclaimed = 0;
+retry:
if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
goto nospace;
if ((bprev = ufs_rw32(ip->i_ffs_db[lbprev], UFS_FSNEEDSWAP(fs))) == 0) {
@@ -376,6 +386,12 @@ nospace:
/*
* no space available
*/
+ if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
+ reclaimed = 1;
+ softdep_request_cleanup(fs, ITOV(ip));
+ goto retry;
+ }
+
ffs_fserr(fs, cred->cr_uid, "file system full");
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
return (ENOSPC);
Index: ffs_extern.h
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_extern.h,v
retrieving revision 1.22
diff -u -b -p -r1.22 ffs_extern.h
--- ffs_extern.h 2002/05/05 17:00:06 1.22
+++ ffs_extern.h 2002/08/16 23:55:42
@@ -156,6 +156,7 @@ int softdep_flushfiles __P((struct mount
void softdep_update_inodeblock __P((struct inode *, struct buf *, int));
void softdep_load_inodeblock __P((struct inode *));
void softdep_freefile __P((void *));
+int softdep_request_cleanup(struct fs *, struct vnode *);
void softdep_setup_freeblocks __P((struct inode *, off_t));
void softdep_setup_inomapdep __P((struct buf *, struct inode *, ino_t));
void softdep_setup_blkmapdep __P((struct buf *, struct fs *, ufs_daddr_t));
Index: ffs_softdep.c
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.33
diff -u -b -p -r1.33 ffs_softdep.c
--- ffs_softdep.c 2002/07/05 13:49:26 1.33
+++ ffs_softdep.c 2002/08/16 23:55:43
@@ -179,7 +179,9 @@ static int inodedep_lookup __P((struct f
static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
struct pagedep **));
static void pause_timer __P((void *));
+static void cleanup_timer __P((void *));
static int request_cleanup __P((int, int));
+static int process_worklist_item __P((struct mount *, int));
static void add_to_worklist __P((struct worklist *));
static struct buf *softdep_setup_pagecache __P((struct inode *, ufs_lbn_t,
long));
@@ -553,11 +555,15 @@ static int max_softdeps; /* maximum numb
static int tickdelay = 2; /* number of ticks to pause during slowdown */
static int proc_waiting; /* tracks whether we have a timeout posted */
static struct callout pause_timer_ch = CALLOUT_INITIALIZER;
+static struct callout cleanup_timer_ch = CALLOUT_INITIALIZER;
+static int cleanup_timeout = 0;
static struct proc *filesys_syncer; /* proc of filesystem syncer process */
static int req_clear_inodedeps; /* syncer process flush some inodedeps */
#define FLUSH_INODES 1
static int req_clear_remove; /* syncer process flush some freeblks */
#define FLUSH_REMOVE 2
+#define FLUSH_REMOVE_WAIT 3
+
/*
* runtime statistics
*/
@@ -622,9 +628,9 @@ softdep_process_worklist(matchmnt)
{
struct proc *p = CURPROC;
struct worklist *wk;
- struct fs *matchfs;
int matchcnt;
+#if 0
/*
* First process any items on the delayed-free queue.
*/
@@ -632,6 +638,7 @@ softdep_process_worklist(matchmnt)
ACQUIRE_LOCK(&lk);
softdep_freequeue_process();
FREE_LOCK(&lk);
+#endif
/*
* Record the process identifier of our caller so that we can give
@@ -639,17 +646,19 @@ softdep_process_worklist(matchmnt)
*/
filesys_syncer = p;
matchcnt = 0;
- matchfs = NULL;
- if (matchmnt != NULL)
- matchfs = VFSTOUFS(matchmnt)->um_fs;
+
/*
* There is no danger of having multiple processes run this
* code. It is single threaded solely so that softdep_flushfiles
* (below) can get an accurate count of the number of items
* related to its mount point that are in the list.
*/
- if (softdep_worklist_busy && matchmnt == NULL)
+ if (matchmnt == NULL) {
+ if (softdep_worklist_busy < 0)
return (-1);
+ softdep_worklist_busy += 1;
+ }
+
/*
* If requested, try removing inode or removal dependencies.
*/
@@ -663,8 +672,81 @@ softdep_process_worklist(matchmnt)
req_clear_remove = 0;
wakeup(&proc_waiting);
}
- ACQUIRE_LOCK(&lk);
+
while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
+ matchcnt += process_worklist_item(matchmnt, 0);
+
+ if (softdep_worklist_busy && matchmnt == NULL) {
+ matchcnt = -1;
+ break;
+ }
+
+ /*
+ * If requested, try removing inode or removal dependencies.
+ */
+ if (req_clear_inodedeps) {
+ clear_inodedeps(p);
+ req_clear_inodedeps = 0;
+ wakeup(&proc_waiting);
+ }
+ if (req_clear_remove) {
+ clear_remove(p);
+ req_clear_remove = 0;
+ wakeup(&proc_waiting);
+ }
+
+ /*
+ * Process any new items on the delayed-free queue.
+ */
+ ACQUIRE_LOCK(&lk);
+ softdep_freequeue_process();
+ FREE_LOCK(&lk);
+ }
+ if (matchmnt == NULL)
+ softdep_worklist_busy -= 1;
+
+ return (matchcnt);
+}
+
+
+/*
+ * Process one item on the worklist.
+ */
+static int
+process_worklist_item(matchmnt, flags)
+ struct mount *matchmnt;
+ int flags;
+{
+ struct worklist *wk;
+ struct dirrem *dirrem;
+ struct vnode *vp;
+ struct fs *matchfs;
+ int matchcnt = -1;
+
+ matchfs = NULL;
+ if (matchmnt != NULL)
+ matchfs = VFSTOUFS(matchmnt)->um_fs;
+
+ ACQUIRE_LOCK(&lk);
+ /*
+ * Normally we just process each item on the worklist in order.
+ * However, if we are in a situation where we cannot lock any
+ * inodes, we have to skip over any dirrem requests whose
+ * vnodes are resident and locked.
+ */
+ LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
+ if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
+ break;
+ dirrem = WK_DIRREM(wk);
+ vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
+ dirrem->dm_oldinum);
+ if (vp == NULL || !VOP_ISLOCKED(vp))
+ break;
+ }
+ if (wk == 0) {
+ FREE_LOCK(&lk);
+ return (-1);
+ }
WORKLIST_REMOVE(wk);
FREE_LOCK(&lk);
switch (wk->wk_type) {
@@ -702,33 +784,10 @@ softdep_process_worklist(matchmnt)
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
- if (softdep_worklist_busy && matchmnt == NULL)
- return (-1);
- /*
- * If requested, try removing inode or removal dependencies.
- */
- if (req_clear_inodedeps) {
- clear_inodedeps(p);
- req_clear_inodedeps = 0;
- wakeup(&proc_waiting);
- }
- if (req_clear_remove) {
- clear_remove(p);
- req_clear_remove = 0;
- wakeup(&proc_waiting);
- }
-
- /*
- * Process any new items on the delayed-free queue.
- */
-
- ACQUIRE_LOCK(&lk);
- softdep_freequeue_process();
- }
- FREE_LOCK(&lk);
return (matchcnt);
}
+
/*
* Move dependencies from one buffer to another.
*/
@@ -771,7 +830,7 @@ softdep_flushfiles(oldmnt, flags, p)
*/
while (softdep_worklist_busy)
tsleep(&lbolt, PRIBIO, "softflush", 0);
- softdep_worklist_busy = 1;
+ softdep_worklist_busy = -1;
if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
softdep_worklist_busy = 0;
return (error);
@@ -4891,6 +4950,50 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
}
/*
+ * Called by the allocation routines when they are about to fail
+ * in the hope that we can free up some disk space.
+ *
+ * First check to see if the work list has anything on it. If it has,
+ * clean up entries until we successfully free some space. Because this
+ * process holds inodes locked, we cannot handle any remove requests
+ * that might block on a locked inode as that could lead to deadlock.
+ * If the worklist yields no free space, encourage the syncer daemon
+ * to help us. In no event will we try for longer than tickdelay seconds.
+ */
+int
+softdep_request_cleanup(fs, vp)
+ struct fs *fs;
+ struct vnode *vp;
+{
+ long needed;
+
+ cleanup_timeout = 0;
+ needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
+ if (VOP_UPDATE(vp, NULL, NULL, UPDATE_WAIT) != 0)
+ return (0);
+ callout_reset(&cleanup_timer_ch, tickdelay, cleanup_timer, NULL);
+ while (fs->fs_pendingblocks > 0 &&
+ fs->fs_cstotal.cs_nbfree <= needed) {
+ if (cleanup_timeout != 0)
+ return (0);
+ if ((LIST_FIRST(&softdep_workitem_pending) != NULL) &&
+ (softdep_process_worklist(NULL) != -1))
+ continue;
+ request_cleanup(FLUSH_REMOVE_WAIT, 0);
+ }
+ callout_stop(&cleanup_timer_ch);
+ return (1);
+}
+
+void
+cleanup_timer(arg)
+ void *arg;
+{
+
+ cleanup_timeout = 1;
+}
+
+/*
* A large burst of file addition or deletion activity can drive the
* memory load excessively high. Therefore we deliberately slow things
* down and speed up the I/O processing if we find ourselves with too
@@ -4910,6 +5013,13 @@ request_cleanup(resource, islocked)
if (p == filesys_syncer)
return (0);
/*
+ * Next, we attempt to speed up the syncer process. If that
+ * is successful, then we allow the process to continue.
+ */
+ if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT)
+ return (0);
+
+ /*
* If we are resource constrained on inode dependencies, try
* flushing some dirty inodes. Otherwise, we are constrained
* by file deletions, so try accelerating flushes of directories
@@ -4927,11 +5037,14 @@ request_cleanup(resource, islocked)
break;
case FLUSH_REMOVE:
+ case FLUSH_REMOVE_WAIT:
stat_blk_limit_push += 1;
req_clear_remove = 1;
break;
default:
+ if (islocked)
+ FREE_LOCK(&lk);
panic("request_cleanup: unknown type");
}
/*
>Release-Note:
>Audit-Trail:
>Unformatted: