Subject: Extension of fsync_range() to permit forcing disk cache flushing
To: None <tech-kern@netbsd.org>
From: Bill Studenmund <wrstuden@netbsd.org>
List: tech-kern
Date: 12/16/2004 12:16:09
--TD8GDToEDw0WLGOL
Content-Type: multipart/mixed; boundary="k4f25fnPtRuIRUb3"
Content-Disposition: inline


--k4f25fnPtRuIRUb3
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

I have an application that wants to be able to know that certain writes=20
have been forced to permanent storage - that they aren't still sitting in=
=20
the disk's write cache. This idea is similar to the current thread about=20
wd and write caches, except here we're talking about userland wanting to=20
request the flushing.

After discussing this with some developers, the best solution seems to be=
=20
to add a flag to fsync_range() to force this behavior. Then pass a flag=20
down to VOP_FSYNC() to trigger a disk cache flush after the other update=20
steps happen.

The current implementation just flushes the whole disk cache. Future=20
implementations may be selective and issue finer-grained cache flushes.

Thoughts?

Take care,

Bill

--k4f25fnPtRuIRUb3
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="fsync.diffie"
Content-Transfer-Encoding: quoted-printable

Index: lib/libc/sys/fsync.2
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/lib/libc/sys/fsync.2,v
retrieving revision 1.15
diff -u -u -r1.15 fsync.2
--- lib/libc/sys/fsync.2	18 Nov 2003 08:49:18 -0000	1.15
+++ lib/libc/sys/fsync.2	16 Dec 2004 19:58:30 -0000
@@ -79,6 +79,8 @@
 data for the specified range.
 .It Dv FFILESYNC
 Synchronize all modified file data and meta-data for the specified range.
+.It Dv FDISKSYNC
+Force the requested synchronization to permanent storage.
 .El
 .Pp
 If the
@@ -86,6 +88,36 @@
 parameter is zero,
 .Fn fsync_range
 will synchronize all of the file data.
+.Pp
+Passing the
+.Dv FDISKSYNC
+flag to
+.Fn fsync_range
+will trigger synchronization of the data, both file data and relevant
+meta-data, to permanent storage.
+Without this flag (and for all cases for
+.Fn fsync ) ,
+the call may return when the write commands have completed.
+The difference arises if the backing disk has an enabled write-back
+cache.
+In the presence of a write-back disk cache, the=20
+.Dv FDISKSYNC
+flag permits an application to enforce its own policy regarding which
+writes must have reached permanent storage and for which it is enough
+that the disk reports the write as completed.
+.Pp
+The exact write cache synchronization policy triggered by the=20
+.Dv FDISKSYNC=20
+flag is implementation dependent.
+An implementation may issue write cache synchronization commands to flush
+only the file data and relevant meta-data from the disk cache, or an
+implementation may flush the write cache in its entirety.
+An implementation may support both behaviors, and use its own policy to
+choose between them.
+Different file systems on the same host may implement this flag differentl=
y.
+Flushing the cache in its entirety may have a performance impact as
+data unrelated to the file being synchronized will be written to permanent
+storage.
 .Sh RETURN VALUES
 A 0 value is returned on success.
 A \-1 value indicates an error.
Index: sys/kern/vfs_syscalls.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.213
diff -u -u -r1.213 vfs_syscalls.c
--- sys/kern/vfs_syscalls.c	30 Nov 2004 04:25:44 -0000	1.213
+++ sys/kern/vfs_syscalls.c	16 Dec 2004 19:59:00 -0000
@@ -3022,7 +3022,7 @@
 	error =3D VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0, p);
 	if (error =3D=3D 0 && bioops.io_fsync !=3D NULL &&
 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
-		(*bioops.io_fsync)(vp);
+		(*bioops.io_fsync)(vp, 0);
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp, 0);
 	FILE_UNUSE(fp, p);
@@ -3075,6 +3075,8 @@
 		nflags =3D FSYNC_DATAONLY | FSYNC_WAIT;
 	else
 		nflags =3D FSYNC_WAIT;
+	if (flags & FDISKSYNC)
+		nflags |=3D FSYNC_CACHE;
=20
 	len =3D SCARG(uap, length);
 	/* If length =3D=3D 0, we do the whole file, and s =3D l =3D 0 will do th=
at */
@@ -3096,7 +3098,7 @@
=20
 	if (error =3D=3D 0 && bioops.io_fsync !=3D NULL &&
 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
-		(*bioops.io_fsync)(vp);
+		(*bioops.io_fsync)(vp, nflags);
=20
 	VOP_UNLOCK(vp, 0);
 	FILE_UNUSE(fp, p);
Index: sys/miscfs/genfs/genfs_vnops.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/miscfs/genfs/genfs_vnops.c,v
retrieving revision 1.91
diff -u -u -r1.91 genfs_vnops.c
--- sys/miscfs/genfs/genfs_vnops.c	4 Oct 2004 00:28:30 -0000	1.91
+++ sys/miscfs/genfs/genfs_vnops.c	16 Dec 2004 19:59:02 -0000
@@ -97,15 +97,26 @@
 		off_t offhi;
 		struct proc *a_p;
 	} */ *ap =3D v;
-	struct vnode *vp =3D ap->a_vp;
+	struct vnode *vp =3D ap->a_vp, *dvp;
 	int wait;
+	int error;
=20
 	wait =3D (ap->a_flags & FSYNC_WAIT) !=3D 0;
 	vflushbuf(vp, wait);
 	if ((ap->a_flags & FSYNC_DATAONLY) !=3D 0)
-		return (0);
+		error =3D 0;
 	else
-		return (VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0));
+		error =3D VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
+	if (error =3D=3D 0 && ap->a_flags & FSYNC_CACHE) {
+		long l =3D 0;
+		if (VOP_BMAP(vp, 0, &dvp, NULL, NULL))
+			error =3D ENXIO;
+		else
+			error =3D VOP_IOCTL(dvp, DIOCCACHESYNC, &l, FWRITE,
+					  ap->a_p->p_ucred, ap->a_p);
+	}
+
+	return (error);
 }
=20
 int
Index: sys/sys/buf.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.77
diff -u -u -r1.77 buf.h
--- sys/sys/buf.h	28 Oct 2004 07:07:46 -0000	1.77
+++ sys/sys/buf.h	16 Dec 2004 19:59:03 -0000
@@ -102,7 +102,7 @@
  	void	(*io_start)(struct buf *);
  	void	(*io_complete)(struct buf *);
  	void	(*io_deallocate)(struct buf *);
- 	int	(*io_fsync)(struct vnode *);
+ 	int	(*io_fsync)(struct vnode *, int);
  	int	(*io_sync)(struct mount *);
 	void	(*io_movedeps)(struct buf *, struct buf *);
 	int	(*io_countdeps)(struct buf *, int);
Index: sys/sys/unistd.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/sys/unistd.h,v
retrieving revision 1.31
diff -u -u -r1.31 unistd.h
--- sys/sys/unistd.h	10 Nov 2004 04:02:52 -0000	1.31
+++ sys/sys/unistd.h	16 Dec 2004 19:59:03 -0000
@@ -123,6 +123,7 @@
  */
 #define	FDATASYNC	0x0010	/* sync data and minimal metadata */
 #define	FFILESYNC	0x0020	/* sync data and metadata */
+#define	FDISKSYNC	0x0040	/* flush disk caches after sync */
 #endif
=20
 /* configurable pathname variables; use as argument to pathconf(3) */
Index: sys/sys/vnode.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.127
diff -u -u -r1.127 vnode.h
--- sys/sys/vnode.h	10 Nov 2004 17:30:56 -0000	1.127
+++ sys/sys/vnode.h	16 Dec 2004 19:59:04 -0000
@@ -272,6 +272,7 @@
 #define	FSYNC_DATAONLY	0x0002		/* fsync: hint: sync file data only */
 #define	FSYNC_RECLAIM	0x0004		/* fsync: hint: vnode is being reclaimed */
 #define	FSYNC_LAZY	0x0008		/* fsync: lazy sync (trickle) */
+#define	FSYNC_CACHE	0x0100		/* fsync: flush disk caches too */
=20
 #define	UPDATE_WAIT	0x0001		/* update: wait for completion */
 #define	UPDATE_DIROP	0x0002		/* update: hint to fs to wait or not */
Index: sys/ufs/ffs/ffs_softdep.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.60
diff -u -u -r1.60 ffs_softdep.c
--- sys/ufs/ffs/ffs_softdep.c	29 Aug 2004 10:13:48 -0000	1.60
+++ sys/ufs/ffs/ffs_softdep.c	16 Dec 2004 19:59:09 -0000
@@ -38,6 +38,7 @@
 #include <sys/param.h>
 #include <sys/buf.h>
 #include <sys/callout.h>
+#include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
@@ -202,7 +203,7 @@
 static	void softdep_disk_io_initiation __P((struct buf *));
 static	void softdep_disk_write_complete __P((struct buf *));
 static	void softdep_deallocate_dependencies __P((struct buf *));
-static	int softdep_fsync __P((struct vnode *));
+static	int softdep_fsync __P((struct vnode *, int));
 static	int softdep_process_worklist __P((struct mount *));
 static	void softdep_move_dependencies __P((struct buf *, struct buf *));
 static	int softdep_count_dependencies __P((struct buf *bp, int));
@@ -4663,8 +4664,9 @@
  * entries for the inode have been written after the inode gets to disk.
  */
 static int
-softdep_fsync(vp)
+softdep_fsync(vp, f)
 	struct vnode *vp;	/* the "in_core" copy of the inode */
+	int f;			/* Flags */
 {
 	struct diradd *dap;
 	struct inodedep *inodedep;
@@ -4679,6 +4681,7 @@
 	int error, flushparent;
 	ino_t parentino;
 	daddr_t lbn;
+	long l;
=20
 	ip =3D VTOI(vp);
 	fs =3D ip->i_fs;
@@ -4777,6 +4780,14 @@
 			break;
 	}
 	FREE_LOCK(&lk);
+	if (f & FSYNC_CACHE) {
+		/*
+		 * If requested, make sure all of these changes don't
+		 * linger in disk caches
+		 */
+		l =3D 0;
+		VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE, p->p_ucred, p);
+	}
 	return (0);
 }
=20
Index: sys/ufs/ffs/ffs_vnops.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_vnops.c,v
retrieving revision 1.66
diff -u -u -r1.66 ffs_vnops.c
--- sys/ufs/ffs/ffs_vnops.c	15 Nov 2003 01:19:38 -0000	1.66
+++ sys/ufs/ffs/ffs_vnops.c	16 Dec 2004 19:59:09 -0000
@@ -317,9 +317,17 @@
 	}
 	splx(s);
=20
-	return (VOP_UPDATE(vp, NULL, NULL,
+	error =3D VOP_UPDATE(vp, NULL, NULL,
 	    ((ap->a_flags & (FSYNC_WAIT | FSYNC_DATAONLY)) =3D=3D FSYNC_WAIT)
-	    ? UPDATE_WAIT : 0));
+	    ? UPDATE_WAIT : 0);
+
+	if (error =3D=3D 0 && ap->a_flags & FSYNC_CACHE) {
+		long l =3D 0;
+		VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+			ap->a_p->p_ucred, ap->a_p);
+	}
+
+	return error;
 }
=20
 /*
@@ -451,7 +459,15 @@
 		waitfor =3D 0;
 	else
 		waitfor =3D (ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0;
-	return (VOP_UPDATE(vp, NULL, NULL, waitfor));
+	error =3D VOP_UPDATE(vp, NULL, NULL, waitfor);
+
+	if (error =3D=3D 0 && ap->a_flags & FSYNC_WAIT) {
+		long i =3D 0;
+		error =3D VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE,
+			ap->a_p->p_ucred, ap->a_p);
+	}
+
+	return error;
 }
=20
 /*
Index: sys/ufs/lfs/lfs_vnops.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/ufs/lfs/lfs_vnops.c,v
retrieving revision 1.132
diff -u -u -r1.132 lfs_vnops.c
--- sys/ufs/lfs/lfs_vnops.c	22 Apr 2004 10:45:00 -0000	1.132
+++ sys/ufs/lfs/lfs_vnops.c	16 Dec 2004 19:59:11 -0000
@@ -317,6 +317,11 @@
 	if (error)
 		return error;
 	error =3D VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
+	if (error =3D=3D 0 && ap->a_flags & FSYNC_CACHE) {
+		long l =3D 0;
+		error =3D VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+				  ap->a_p->p_ucred, ap->a_p);
+	}
 	if (wait && !VPISEMPTY(vp))
 		LFS_SET_UINO(VTOI(vp), IN_MODIFIED);
=20

--k4f25fnPtRuIRUb3--

--TD8GDToEDw0WLGOL
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (NetBSD)

iD8DBQFBwe0JWz+3JHUci9cRAvRwAJ4jrOwPFNSyVlT+zcJA0tplLAnqmgCeOjtO
rDbTYAEDA1XWUo+SXcjYyzo=
=L5wj
-----END PGP SIGNATURE-----

--TD8GDToEDw0WLGOL--