tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

ffs fsync patch - block devices and wapbl



Block devices have two different properties.  First, they are device nodes
that have access times etc. and reside on a file system.  Second, they may
have a file system mounted on them.  With WAPBL it is important to handle
these properties separately.

Relevant PRs are 41189, 41192, 41977, 42149, 42551, 44377 and 44746 at least.

The attached diff should solve these problems by:

- Replace the ugly sync loop in ffs_full_fsync() with vflushbuf().  This
  loop is a leftover from softdeps and is no longer needed.

- Merge ffs_vfs_fsync() with ffs_full_fsync() so we have only one operation,
  whether the request comes from ffs or from another file system via
  VFS_FSYNC().

- Take care which mount to test for WAPBL -- v_mount to update the times and
  wapbl_vptomp() to update the dirty blocks.  Never update times when called
  by VFS_FSYNC().


Comments or objections?

-- 
Juergen Hannken-Illjes - hannken%eis.cs.tu-bs.de@localhost - TU Braunschweig 
(Germany)
Index: sys/ufs/ffs/ffs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_vnops.c,v
retrieving revision 1.117
diff -p -u -4 -r1.117 ffs_vnops.c
--- sys/ufs/ffs/ffs_vnops.c     15 Apr 2011 15:54:11 -0000      1.117
+++ sys/ufs/ffs/ffs_vnops.c     19 Apr 2011 09:33:28 -0000
@@ -392,30 +392,33 @@ out:
        return error;
 }
 
 /*
- * Synch an open file.  Called for VOP_FSYNC().
+ * Synch an open file.
+ * Called for VOP_FSYNC() for nodes on an ffs file system or with
+ * flag FSYNC_VFS for VFS_FSYNC() for VBLK nodes with mounted ffs file system.
  */
 /* ARGSUSED */
 int
 ffs_full_fsync(struct vnode *vp, int flags)
 {
-       struct buf *bp, *nbp;
-       int error, passes, skipmeta, waitfor, i;
+       int error, waitfor, i;
+#ifdef WAPBL
        struct mount *mp;
+#endif
 
-       KASSERT(VTOI(vp) != NULL);
-       KASSERT(vp->v_tag == VT_UFS);
-
-       error = 0;
-
-       mp = vp->v_mount;
-       if (vp->v_type == VBLK && vp->v_specmountpoint != NULL) {
-               mp = vp->v_specmountpoint;
+       if ((flags & FSYNC_VFS) != 0) {
+               KASSERT(vp->v_type == VBLK);
+               KASSERT(vp->v_specmountpoint != NULL);
        } else {
-               mp = vp->v_mount;
+               KASSERT(VTOI(vp) != NULL);
+               KASSERT(vp->v_tag == VT_UFS);
+               if (vp->v_type == VBLK && vp->v_specmountpoint == NULL)
+                       KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
        }
 
+       error = 0;
+
        /*
         * Flush all dirty data associated with the vnode.
         */
        if (vp->v_type == VREG || vp->v_type == VBLK) {
@@ -423,19 +426,18 @@ ffs_full_fsync(struct vnode *vp, int fla
 
                if ((flags & FSYNC_WAIT))
                        pflags |= PGO_SYNCIO;
                if (vp->v_type == VREG &&
-                   fstrans_getstate(mp) == FSTRANS_SUSPENDING)
+                   fstrans_getstate(vp->v_mount) == FSTRANS_SUSPENDING)
                        pflags |= PGO_FREE;
                mutex_enter(&vp->v_interlock);
                error = VOP_PUTPAGES(vp, 0, 0, pflags);
                if (error)
                        return error;
        }
 
 #ifdef WAPBL
-       mp = wapbl_vptomp(vp);
-       if (mp && mp->mnt_wapbl) {
+       if ((flags & FSYNC_VFS) == 0 && vp->v_mount && vp->v_mount->mnt_wapbl) {
                /*
                 * Don't bother writing out metadata if the syncer is
                 * making the request.  We will let the sync vnode
                 * write it out in a single burst through a call to
@@ -445,22 +447,26 @@ ffs_full_fsync(struct vnode *vp, int fla
                        return 0;
 
                if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE
                    | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) {
-                       error = UFS_WAPBL_BEGIN(mp);
+                       error = UFS_WAPBL_BEGIN(vp->v_mount);
                        if (error)
                                return error;
                        error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE |
                            ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0));
-                       UFS_WAPBL_END(mp);
+                       UFS_WAPBL_END(vp->v_mount);
                }
-               if (error || (flags & FSYNC_NOLOG) != 0)
+               if (error)
                        return error;
-
+       }
+       mp = wapbl_vptomp(vp);
+       if (mp && mp->mnt_wapbl) {
                /*
                 * Don't flush the log if the vnode being flushed
                 * contains no dirty buffers that could be in the log.
                 */
+               if ((flags & FSYNC_NOLOG) != 0)
+                       return 0;
                if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
                        error = wapbl_flush(mp->mnt_wapbl, 0);
                        if (error)
                                return error;
@@ -476,90 +482,22 @@ ffs_full_fsync(struct vnode *vp, int fla
                return error;
        }
 #endif /* WAPBL */
 
-       /*
-        * Write out metadata for non-logging file systems. XXX This block
-        * should be simplified now that softdep is gone.
-        */
-       passes = NIADDR + 1;
-       skipmeta = 0;
-       if (flags & FSYNC_WAIT)
-               skipmeta = 1;
-
-loop:
-       mutex_enter(&bufcache_lock);
-       LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
-               bp->b_cflags &= ~BC_SCANNED;
-       }
-       for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-               nbp = LIST_NEXT(bp, b_vnbufs);
-               if (bp->b_cflags & (BC_BUSY | BC_SCANNED))
-                       continue;
-               if ((bp->b_oflags & BO_DELWRI) == 0)
-                       panic("ffs_fsync: not dirty");
-               if (skipmeta && bp->b_lblkno < 0)
-                       continue;
-               bp->b_cflags |= BC_BUSY | BC_VFLUSH | BC_SCANNED;
-               mutex_exit(&bufcache_lock);
-               /*
-                * On our final pass through, do all I/O synchronously
-                * so that we can find out if our flush is failing
-                * because of write errors.
-                */
-               if (passes > 0 || !(flags & FSYNC_WAIT))
-                       (void) bawrite(bp);
-               else if ((error = bwrite(bp)) != 0)
-                       return (error);
-               /*
-                * Since we unlocked during the I/O, we need
-                * to start from a known point.
-                */
-               mutex_enter(&bufcache_lock);
-               nbp = LIST_FIRST(&vp->v_dirtyblkhd);
-       }
-       mutex_exit(&bufcache_lock);
-       if (skipmeta) {
-               skipmeta = 0;
-               goto loop;
-       }
-
-       if ((flags & FSYNC_WAIT) != 0) {
-               mutex_enter(&vp->v_interlock);
-               while (vp->v_numoutput) {
-                       cv_wait(&vp->v_cv, &vp->v_interlock);
-               }
-               mutex_exit(&vp->v_interlock);
-
-               /*
-                * Ensure that any filesystem metadata associated
-                * with the vnode has been written.
-                */
-               if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
-                       /*
-                       * Block devices associated with filesystems may
-                       * have new I/O requests posted for them even if
-                       * the vnode is locked, so no amount of trying will
-                       * get them clean. Thus we give block devices a
-                       * good effort, then just give up. For all other file
-                       * types, go around and try again until it is clean.
-                       */
-                       if (passes > 0) {
-                               passes--;
-                               goto loop;
-                       }
-#ifdef DIAGNOSTIC
-                       if (vp->v_type != VBLK)
-                               vprint("ffs_fsync: dirty", vp);
-#endif
-               }
-       }
+       waitfor = ((flags & FSYNC_WAIT) != 0);
+       error = vflushbuf(vp, waitfor);
+       if (error)
+               return error;
 
        waitfor = (flags & FSYNC_WAIT) ? UPDATE_WAIT : 0;
-       error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE | waitfor);
+       if ((flags & FSYNC_VFS) == 0)
+               error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE | waitfor);
 
        if (error == 0 && (flags & FSYNC_CACHE) != 0) {
-               (void)VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE,
+               if ((flags & FSYNC_VFS) == 0)
+                       vp = VTOI(vp)->i_devvp;
+               i = 1;
+               (void)VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE,
                    kauth_cred_get());
        }
 
        return error;
Index: sys/ufs/ffs/ffs_vfsops.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_vfsops.c,v
retrieving revision 1.265
diff -p -u -4 -r1.265 ffs_vfsops.c
--- sys/ufs/ffs/ffs_vfsops.c    27 Mar 2011 08:04:50 -0000      1.265
+++ sys/ufs/ffs/ffs_vfsops.c    19 Apr 2011 09:33:28 -0000
@@ -109,10 +109,8 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c
 #include <ufs/ffs/ffs_extern.h>
 
 MODULE(MODULE_CLASS_VFS, ffs, NULL);
 
-static int     ffs_vfs_fsync(vnode_t *, int);
-
 static struct sysctllog *ffs_sysctl_log;
 
 /* how many times ffs_init() was called */
 int ffs_initcount = 0;
@@ -149,9 +147,9 @@ struct vfsops ffs_vfsops = {
        ffs_extattrctl,
        ffs_suspendctl,
        genfs_renamelock_enter,
        genfs_renamelock_exit,
-       ffs_vfs_fsync,
+       ffs_full_fsync,
        ffs_vnodeopv_descs,
        0,
        { NULL, NULL },
 };
@@ -2068,145 +2066,4 @@ ffs_suspendctl(struct mount *mp, int cmd
        default:
                return EINVAL;
        }
 }
-
-/*
- * Synch vnode for a mounted file system.  This is called for foreign
- * vnodes, i.e. non-ffs.
- */
-static int
-ffs_vfs_fsync(vnode_t *vp, int flags)
-{
-       int error, passes, skipmeta, i, pflags;
-       buf_t *bp, *nbp;
-#ifdef WAPBL
-       struct mount *mp;
-#endif
-
-       KASSERT(vp->v_type == VBLK);
-       KASSERT(vp->v_specmountpoint != NULL);
-
-       /*
-        * Flush all dirty data associated with the vnode.
-        */
-       pflags = PGO_ALLPAGES | PGO_CLEANIT;
-       if ((flags & FSYNC_WAIT) != 0)
-               pflags |= PGO_SYNCIO;
-       mutex_enter(&vp->v_interlock);
-       error = VOP_PUTPAGES(vp, 0, 0, pflags);
-       if (error)
-               return error;
-
-#ifdef WAPBL
-       mp = vp->v_specmountpoint;
-       if (mp && mp->mnt_wapbl) {
-               /*
-                * Don't bother writing out metadata if the syncer is
-                * making the request.  We will let the sync vnode
-                * write it out in a single burst through a call to
-                * VFS_SYNC().
-                */
-               if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY | FSYNC_NOLOG)) != 0)
-                       return 0;
-
-               /*
-                * Don't flush the log if the vnode being flushed
-                * contains no dirty buffers that could be in the log.
-                */
-               if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
-                       error = wapbl_flush(mp->mnt_wapbl, 0);
-                       if (error)
-                               return error;
-               }
-
-               if ((flags & FSYNC_WAIT) != 0) {
-                       mutex_enter(&vp->v_interlock);
-                       while (vp->v_numoutput)
-                               cv_wait(&vp->v_cv, &vp->v_interlock);
-                       mutex_exit(&vp->v_interlock);
-               }
-
-               return 0;
-       }
-#endif /* WAPBL */
-
-       /*
-        * Write out metadata for non-logging file systems. XXX This block
-        * should be simplified now that softdep is gone.
-        */
-       passes = NIADDR + 1;
-       skipmeta = 0;
-       if (flags & FSYNC_WAIT)
-               skipmeta = 1;
-
-loop:
-       mutex_enter(&bufcache_lock);
-       LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
-               bp->b_cflags &= ~BC_SCANNED;
-       }
-       for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-               nbp = LIST_NEXT(bp, b_vnbufs);
-               if (bp->b_cflags & (BC_BUSY | BC_SCANNED))
-                       continue;
-               if ((bp->b_oflags & BO_DELWRI) == 0)
-                       panic("ffs_fsync: not dirty");
-               if (skipmeta && bp->b_lblkno < 0)
-                       continue;
-               bp->b_cflags |= BC_BUSY | BC_VFLUSH | BC_SCANNED;
-               mutex_exit(&bufcache_lock);
-               /*
-                * On our final pass through, do all I/O synchronously
-                * so that we can find out if our flush is failing
-                * because of write errors.
-                */
-               if (passes > 0 || !(flags & FSYNC_WAIT))
-                       (void) bawrite(bp);
-               else if ((error = bwrite(bp)) != 0)
-                       return (error);
-               /*
-                * Since we unlocked during the I/O, we need
-                * to start from a known point.
-                */
-               mutex_enter(&bufcache_lock);
-               nbp = LIST_FIRST(&vp->v_dirtyblkhd);
-       }
-       mutex_exit(&bufcache_lock);
-       if (skipmeta) {
-               skipmeta = 0;
-               goto loop;
-       }
-
-       if ((flags & FSYNC_WAIT) != 0) {
-               mutex_enter(&vp->v_interlock);
-               while (vp->v_numoutput) {
-                       cv_wait(&vp->v_cv, &vp->v_interlock);
-               }
-               mutex_exit(&vp->v_interlock);
-
-               if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
-                       /*
-                       * Block devices associated with filesystems may
-                       * have new I/O requests posted for them even if
-                       * the vnode is locked, so no amount of trying will
-                       * get them clean. Thus we give block devices a
-                       * good effort, then just give up. For all other file
-                       * types, go around and try again until it is clean.
-                       */
-                       if (passes > 0) {
-                               passes--;
-                               goto loop;
-                       }
-#ifdef DIAGNOSTIC
-                       if (vp->v_type != VBLK)
-                               vprint("ffs_fsync: dirty", vp);
-#endif
-               }
-       }
-
-       if (error == 0 && (flags & FSYNC_CACHE) != 0) {
-               (void)VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE,
-                   kauth_cred_get());
-       }
-
-       return error;
-}
Index: sys/kern/vfs_subr.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
retrieving revision 1.421
diff -p -u -4 -r1.421 vfs_subr.c
--- sys/kern/vfs_subr.c 2 Apr 2011 04:28:56 -0000       1.421
+++ sys/kern/vfs_subr.c 19 Apr 2011 09:33:27 -0000
@@ -271,13 +271,13 @@ restart:
  * Flush all dirty buffers from a vnode.
  * Called with the underlying vnode locked, which should prevent new dirty
  * buffers from being queued.
  */
-void
+int
 vflushbuf(struct vnode *vp, int sync)
 {
        struct buf *bp, *nbp;
-       int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
+       int error, flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
        bool dirty;
 
        mutex_enter(&vp->v_interlock);
        (void) VOP_PUTPAGES(vp, 0, 0, flags);
@@ -297,16 +297,19 @@ loop:
                 * since there is no way to quickly wait for them below.
                 */
                if (bp->b_vp == vp || sync == 0)
                        (void) bawrite(bp);
-               else
-                       (void) bwrite(bp);
+               else {
+                       error = bwrite(bp);
+                       if (error)
+                               return error;
+               }
                goto loop;
        }
        mutex_exit(&bufcache_lock);
 
        if (sync == 0)
-               return;
+               return 0;
 
        mutex_enter(&vp->v_interlock);
        while (vp->v_numoutput != 0)
                cv_wait(&vp->v_cv, &vp->v_interlock);
@@ -316,8 +319,10 @@ loop:
        if (dirty) {
                vprint("vflushbuf: dirty", vp);
                goto loop;
        }
+
+       return 0;
 }
 
 /*
  * Create a vnode for a block device.
Index: sys/sys/vnode.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.228
diff -p -u -4 -r1.228 vnode.h
--- sys/sys/vnode.h     18 Apr 2011 15:53:04 -0000      1.228
+++ sys/sys/vnode.h     19 Apr 2011 09:33:28 -0000
@@ -551,9 +551,9 @@ int vaccess(enum vtype, mode_t, uid_t, g
 void   vattr_null(struct vattr *);
 void   vdevgone(int, int, int, enum vtype);
 int    vfinddev(dev_t, enum vtype, struct vnode **);
 int    vflush(struct mount *, struct vnode *, int);
-void   vflushbuf(struct vnode *, int);
+int    vflushbuf(struct vnode *, int);
 int    vget(struct vnode *, int);
 bool   vtryget(struct vnode *);
 void   vgone(struct vnode *);
 int    vinvalbuf(struct vnode *, int, kauth_cred_t, struct lwp *, bool, int);


Home | Main Index | Thread Index | Old Index