tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: very bad behavior on overquota writes



On Thu, Nov 22, 2012 at 12:46:54PM +0100, Manuel Bouyer wrote:
> Here's another patch, after comments from chs@ in a private mail.
> 
> There really are 2 parts here:
> - avoid unecessery list walk at truncate time. This is the change to
>   uvm_vnp_setsize() (truncate between pgend and oldsize instead of 0, which
>   always triggers a list walk - we know there is nothing beyond oldsize)
>   and ffs_truncate() (vtruncbuf() is needed only for non-VREG files,
>   and will always trigger a list walk).
> 
> This help for the local write case, where on my test system syscalls/s rise
> from 170 (when write succeed) to 570 (when EDQUOT is returned). Without
> this patch, the syscalls/s would fall to less than 20.
> 
> But this doesn't help much for nfsd, which can get write requests way beyond
> the real end of file (the client's idea of end of file is wrong).
> In such a case, the distance between pgend and oldsize in uvm_vnp_setsize()
> is large enough to trigger a list walk, even through there is really only
> a few pages to free. I guess fixing this would require an interface change
> to pass to uvm_vnp_setsize() for which the allocation did really take place.
> But I found a workaround: when a write error occurs, free all the
> pages associated with the vnode in ffs_write(). This way, on subsequent writes
> only new pages should be in the list, and freeing them is fast.
> 
> With this change, the nfsd performances don't change when the quota limit
> is hit.

In fact, with this patch alone, the nfsd performances drops
from about 1250 writes/s to 700 when the quota limit is hit (and we keep
a disk activity of 140MB/s when no data write occurs). To preserve the
performance, the attached patch is also needed. What this does is detect early
that we'll have an allocation failure in ffs_balloc(), and bail out
before we allocated any indirect block that we would need to freed in the
error path. 
This needs an additionnal flag to to chkdq(), to test a quota allocation
without updating the quota values.

-- 
Manuel Bouyer <bouyer%antioche.eu.org@localhost>
     NetBSD: 26 ans d'experience feront toujours la difference
--
Index: ffs/ffs_alloc.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.130
diff -u -p -r1.130 ffs_alloc.c
--- ffs/ffs_alloc.c     28 Nov 2011 08:05:07 -0000      1.130
+++ ffs/ffs_alloc.c     22 Nov 2012 17:27:28 -0000
@@ -110,7 +110,6 @@ static daddr_t ffs_alloccg(struct inode 
 static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int);
 static ino_t ffs_dirpref(struct inode *);
 static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int);
-static void ffs_fserr(struct fs *, u_int, const char *);
 static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int,
     daddr_t (*)(struct inode *, int, daddr_t, int, int));
 static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int);
@@ -2014,17 +2013,3 @@ ffs_mapsearch(struct fs *fs, struct cg *
        panic("ffs_alloccg: block not in map");
        /* return (-1); */
 }
-
-/*
- * Fserr prints the name of a file system with an error diagnostic.
- *
- * The form of the error message is:
- *     fs: error message
- */
-static void
-ffs_fserr(struct fs *fs, u_int uid, const char *cp)
-{
-
-       log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
-           uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp);
-}
Index: ffs/ffs_balloc.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_balloc.c,v
retrieving revision 1.54
diff -u -p -r1.54 ffs_balloc.c
--- ffs/ffs_balloc.c    23 Apr 2011 07:36:02 -0000      1.54
+++ ffs/ffs_balloc.c    22 Nov 2012 17:27:28 -0000
@@ -111,6 +111,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t 
        int32_t *blkp, *allocblk, allociblk[NIADDR + 1];
        int32_t *allocib;
        int unwindidx = -1;
+       off_t wantedblks;
 #ifdef FFS_EI
        const int needswap = UFS_FSNEEDSWAP(fs);
 #endif
@@ -263,9 +264,43 @@ ffs_balloc_ufs1(struct vnode *vp, off_t 
                return (error);
 
        /*
+        * before doing any allocations, check if we have enough space.
+        * This avoids going through allocations/deallocations if 
+        * the filesystem or quota is short in space, and a badly-written
+        * software keeps retrying the write.
+        * Here we may check a size larger than what's really needed
+        * is indirect blocks are already there, but it's too large
+        * by at most 3 blocks. Not a big deal.
+        */
+       wantedblks = (flags & B_METAONLY) ? num : (num + 1);
+       if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0,
+           NULL, NULL, NULL) != 0) {
+               if (blkstofrags(fs, wantedblks) > freespace(fs, 
fs->fs_minfree)) {
+                       ffs_fserr(fs, kauth_cred_geteuid(cred),
+                           "file system full");
+                       uprintf("\n%s: write failed, file system is full\n",
+                           fs->fs_fsmnt);
+                       return (ENOSPC);
+               }
+       } else {
+               if (wantedblks > fs->fs_cstotal.cs_nbfree) {
+                       ffs_fserr(fs, kauth_cred_geteuid(cred),
+                           "file system full");
+                       uprintf("\n%s: write failed, file system is full\n",
+                           fs->fs_fsmnt);
+                       return (ENOSPC);
+               }
+       }
+
+#if defined(QUOTA) || defined(QUOTA2)
+       if ((error = chkdq(ip, btodb(lblktosize(fs, wantedblks)), cred,
+           TRYONLY)) != 0)
+               return error;
+#endif
+
+       /*
         * Fetch the first indirect block allocating if necessary.
         */
-
        --num;
        nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap);
        allocib = NULL;
@@ -533,6 +571,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t 
        daddr_t *blkp, *allocblk, allociblk[NIADDR + 1];
        int64_t *allocib;
        int unwindidx = -1;
+       off_t wantedblks;
 #ifdef FFS_EI
        const int needswap = UFS_FSNEEDSWAP(fs);
 #endif
@@ -793,6 +832,40 @@ ffs_balloc_ufs2(struct vnode *vp, off_t 
                return (error);
 
        /*
+        * before doing any allocations, check if we have enough space.
+        * This avoids going through allocations/deallocations if 
+        * the filesystem or quota is short in space, and a badly-written
+        * software keeps retrying the write.
+        * Here we may check a size larger than what's really needed
+        * is indirect blocks are already there, but it's too large
+        * by at most 3 blocks. Not a big deal.
+        */
+       wantedblks = (flags & B_METAONLY) ? num : (num + 1);
+       if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0,
+           NULL, NULL, NULL) != 0) {
+               if (blkstofrags(fs, wantedblks) > freespace(fs, 
fs->fs_minfree)) {
+                       ffs_fserr(fs, kauth_cred_geteuid(cred),
+                           "file system full");
+                       uprintf("\n%s: write failed, file system is full\n",
+                           fs->fs_fsmnt);
+                       return (ENOSPC);
+               }
+       } else {
+               if (wantedblks > fs->fs_cstotal.cs_nbfree) {
+                       ffs_fserr(fs, kauth_cred_geteuid(cred),
+                           "file system full");
+                       uprintf("\n%s: write failed, file system is full\n",
+                           fs->fs_fsmnt);
+                       return (ENOSPC);
+               }
+       }
+
+#if defined(QUOTA) || defined(QUOTA2)
+       if ((error = chkdq(ip, btodb(lblktosize(fs, wantedblks)), cred,
+           TRYONLY)) != 0)
+               return error;
+#endif
+       /*
         * Fetch the first indirect block allocating if necessary.
         */
 
Index: ffs/ffs_extern.h
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_extern.h,v
retrieving revision 1.78
diff -u -p -r1.78 ffs_extern.h
--- ffs/ffs_extern.h    17 Jun 2011 14:23:52 -0000      1.78
+++ ffs/ffs_extern.h    22 Nov 2012 17:27:28 -0000
@@ -193,6 +193,7 @@ void        ffs_cg_swap(struct cg *, struct cg 
 #if defined(_KERNEL)
 void   ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
 int    ffs_getblk(struct vnode *, daddr_t, daddr_t, int, bool, buf_t **);
+void   ffs_fserr(struct fs *, u_int, const char *);
 #endif /* defined(_KERNEL) */
 void   ffs_fragacct(struct fs *, int, int32_t[], int, int);
 int    ffs_isblock(struct fs *, u_char *, int32_t);
Index: ffs/ffs_subr.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_subr.c,v
retrieving revision 1.47
diff -u -p -r1.47 ffs_subr.c
--- ffs/ffs_subr.c      14 Aug 2011 12:37:09 -0000      1.47
+++ ffs/ffs_subr.c      22 Nov 2012 17:27:28 -0000
@@ -64,6 +64,7 @@ void    panic(const char *, ...)
 #include <sys/inttypes.h>
 #include <sys/pool.h>
 #include <sys/fstrans.h>
+#include <sys/syslog.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
@@ -132,6 +133,20 @@ ffs_getblk(struct vnode *vp, daddr_t lbl
        return error;
 }
 
+
+/*
+ * Fserr prints the name of a file system with an error diagnostic.
+ *
+ * The form of the error message is:
+ *     fs: error message
+ */
+void
+ffs_fserr(struct fs *fs, u_int uid, const char *cp)
+{
+
+       log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
+           uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp);
+}
 #endif /* _KERNEL */
 
 /*
Index: ufs/ufs_extern.h
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_extern.h,v
retrieving revision 1.71
diff -u -p -r1.71 ufs_extern.h
--- ufs/ufs_extern.h    1 Feb 2012 05:34:43 -0000       1.71
+++ ufs/ufs_extern.h    22 Nov 2012 17:27:28 -0000
@@ -144,6 +144,7 @@ int ufs_blkatoff(struct vnode *, off_t, 
  * Flags to chkdq() and chkiq()
  */
 #define        FORCE   0x01    /* force usage changes independent of limits */
+#define        TRYONLY 0x02    /* check limits but don't update usage */
 void   ufsquota_init(struct inode *);
 void   ufsquota_free(struct inode *);
 int    chkdq(struct inode *, int64_t, kauth_cred_t, int);
Index: ufs/ufs_quota.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_quota.c,v
retrieving revision 1.108.2.2
diff -u -p -r1.108.2.2 ufs_quota.c
--- ufs/ufs_quota.c     13 Sep 2012 22:24:27 -0000      1.108.2.2
+++ ufs/ufs_quota.c     22 Nov 2012 17:27:28 -0000
@@ -154,6 +154,7 @@ chkdq(struct inode *ip, int64_t change, 
 int
 chkiq(struct inode *ip, int32_t change, kauth_cred_t cred, int flags)
 {
+       KASSERT((flags & TRYONLY) == 0);
        /* do not track snapshot usage, or we will deadlock */
        if ((ip->i_flags & SF_SNAPSHOT) != 0)
                return 0;
Index: ufs/ufs_quota1.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_quota1.c,v
retrieving revision 1.18
diff -u -p -r1.18 ufs_quota1.c
--- ufs/ufs_quota1.c    2 Feb 2012 03:00:48 -0000       1.18
+++ ufs/ufs_quota1.c    22 Nov 2012 17:27:28 -0000
@@ -71,6 +71,8 @@ chkdq1(struct inode *ip, int64_t change,
        if (change == 0)
                return (0);
        if (change < 0) {
+               if (flags & TRYONLY)
+                       return (0);
                for (i = 0; i < MAXQUOTAS; i++) {
                        if ((dq = ip->i_dquot[i]) == NODQUOT)
                                continue;
@@ -100,6 +102,8 @@ chkdq1(struct inode *ip, int64_t change,
                                return (error);
                }
        }
+       if (flags & TRYONLY)
+               return (0);
        for (i = 0; i < MAXQUOTAS; i++) {
                if ((dq = ip->i_dquot[i]) == NODQUOT)
                        continue;
Index: ufs/ufs_quota2.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_quota2.c,v
retrieving revision 1.34.2.1
diff -u -p -r1.34.2.1 ufs_quota2.c
--- ufs/ufs_quota2.c    1 Oct 2012 19:55:22 -0000       1.34.2.1
+++ ufs/ufs_quota2.c    22 Nov 2012 17:27:28 -0000
@@ -465,6 +465,8 @@ quota2_check(struct inode *ip, int vtype
                return 0;
        }
        if (change < 0) {
+               if (flags & TRYONLY)
+                       return (0);
                for (i = 0; i < MAXQUOTAS; i++) {
                        dq = ip->i_dquot[i];
                        if (dq == NODQUOT)
@@ -551,7 +553,7 @@ quota2_check(struct inode *ip, int vtype
                if (dq == NODQUOT)
                        continue;
                KASSERT(q2e[i] != NULL);
-               if (error == 0) {
+               if (error == 0 && (flags & TRYONLY) == 0)  {
                        q2vp = &q2e[i]->q2e_val[vtype];
                        ncurblks = ufs_rw64(q2vp->q2v_cur, needswap);
                        q2vp->q2v_cur = ufs_rw64(ncurblks + change, needswap);


Home | Main Index | Thread Index | Old Index