Subject: Extension of fsync_range() to permit forcing disk cache flushing
To: None <tech-kern@netbsd.org>
From: Bill Studenmund <wrstuden@netbsd.org>
List: tech-kern
Date: 12/16/2004 12:16:09
--TD8GDToEDw0WLGOL
Content-Type: multipart/mixed; boundary="k4f25fnPtRuIRUb3"
Content-Disposition: inline
--k4f25fnPtRuIRUb3
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable
I have an application that wants to be able to know that certain writes=20
have been forced to permanent storage - that they aren't still sitting in=
=20
the disk's write cache. This idea is similar to the current thread about=20
wd and write caches, except here we're talking about userland wanting to=20
request the flushing.
After discussing this with some developers, the best solution seems to be=
=20
to add a flag to fsync_range() to force this behavior. Then pass a flag=20
down to VOP_FSYNC() to trigger a disk cache flush after the other update=20
steps happen.
The current implementation just flushes the whole disk cache. Future=20
implementations may be selective and issue finer-grained cache flushes.
Thoughts?
Take care,
Bill
--k4f25fnPtRuIRUb3
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="fsync.diffie"
Content-Transfer-Encoding: quoted-printable
Index: lib/libc/sys/fsync.2
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/lib/libc/sys/fsync.2,v
retrieving revision 1.15
diff -u -u -r1.15 fsync.2
--- lib/libc/sys/fsync.2 18 Nov 2003 08:49:18 -0000 1.15
+++ lib/libc/sys/fsync.2 16 Dec 2004 19:58:30 -0000
@@ -79,6 +79,8 @@
data for the specified range.
.It Dv FFILESYNC
Synchronize all modified file data and meta-data for the specified range.
+.It Dv FDISKSYNC
+Force the requested synchronization to permanent storage.
.El
.Pp
If the
@@ -86,6 +88,36 @@
parameter is zero,
.Fn fsync_range
will synchronize all of the file data.
+.Pp
+Passing the
+.Dv FDISKSYNC
+flag to
+.Fn fsync_range
+will trigger synchronization of the data, both file data and relevant
+meta-data, to permanent storage.
+Without this flag (and for all cases for
+.Fn fsync ) ,
+the call may return when the write commands have completed.
+The difference arises if the backing disk has an enabled write-back
+cache.
+In the presence of a write-back disk cache, the=20
+.Dv FDISKSYNC
+flag permits an application to enforce its own policy regarding which
+writes need to have reached permanent storage, and for which it is
+sufficient for the disk to report that the write has been completed.
+.Pp
+The exact write cache synchronization policy triggered by the=20
+.Dv FDISKSYNC=20
+flag is implementation dependent.
+An implementation may issue write cache synchronization commands to flush
+only the file data and relevant meta-data from the disk cache, or an
+implementation may flush the write cache in its entirety.
+An implementation may support both behaviors, and use its own policy to
+choose between them.
+Different file systems on the same host may implement this flag differentl=
y.
+Flushing the cache in its entirety may have a performance impact as
+data unrelated to the file being synchronized will be written to permanent
+storage.
.Sh RETURN VALUES
A 0 value is returned on success.
A \-1 value indicates an error.
Index: sys/kern/vfs_syscalls.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.213
diff -u -u -r1.213 vfs_syscalls.c
--- sys/kern/vfs_syscalls.c 30 Nov 2004 04:25:44 -0000 1.213
+++ sys/kern/vfs_syscalls.c 16 Dec 2004 19:59:00 -0000
@@ -3022,7 +3022,7 @@
error =3D VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0, p);
if (error =3D=3D 0 && bioops.io_fsync !=3D NULL &&
vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
- (*bioops.io_fsync)(vp);
+ (*bioops.io_fsync)(vp, 0);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp, 0);
FILE_UNUSE(fp, p);
@@ -3075,6 +3075,8 @@
nflags =3D FSYNC_DATAONLY | FSYNC_WAIT;
else
nflags =3D FSYNC_WAIT;
+ if (flags & FDISKSYNC)
+ nflags |=3D FSYNC_CACHE;
=20
len =3D SCARG(uap, length);
/* If length =3D=3D 0, we do the whole file, and s =3D l =3D 0 will do th=
at */
@@ -3096,7 +3098,7 @@
=20
if (error =3D=3D 0 && bioops.io_fsync !=3D NULL &&
vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
- (*bioops.io_fsync)(vp);
+ (*bioops.io_fsync)(vp, nflags);
=20
VOP_UNLOCK(vp, 0);
FILE_UNUSE(fp, p);
Index: sys/miscfs/genfs/genfs_vnops.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/miscfs/genfs/genfs_vnops.c,v
retrieving revision 1.91
diff -u -u -r1.91 genfs_vnops.c
--- sys/miscfs/genfs/genfs_vnops.c 4 Oct 2004 00:28:30 -0000 1.91
+++ sys/miscfs/genfs/genfs_vnops.c 16 Dec 2004 19:59:02 -0000
@@ -97,15 +97,26 @@
off_t offhi;
struct proc *a_p;
} */ *ap =3D v;
- struct vnode *vp =3D ap->a_vp;
+ struct vnode *vp =3D ap->a_vp, *dvp;
int wait;
+ int error;
=20
wait =3D (ap->a_flags & FSYNC_WAIT) !=3D 0;
vflushbuf(vp, wait);
if ((ap->a_flags & FSYNC_DATAONLY) !=3D 0)
- return (0);
+ error =3D 0;
else
- return (VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0));
+ error =3D VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
+ if (error =3D=3D 0 && ap->a_flags & FSYNC_CACHE) {
+ long l =3D 0;
+ if (VOP_BMAP(vp, 0, &dvp, NULL, NULL))
+ error =3D ENXIO;
+ else
+ error =3D VOP_IOCTL(dvp, DIOCCACHESYNC, &l, FWRITE,
+ ap->a_p->p_ucred, ap->a_p);
+ }
+
+ return (error);
}
=20
int
Index: sys/sys/buf.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.77
diff -u -u -r1.77 buf.h
--- sys/sys/buf.h 28 Oct 2004 07:07:46 -0000 1.77
+++ sys/sys/buf.h 16 Dec 2004 19:59:03 -0000
@@ -102,7 +102,7 @@
void (*io_start)(struct buf *);
void (*io_complete)(struct buf *);
void (*io_deallocate)(struct buf *);
- int (*io_fsync)(struct vnode *);
+ int (*io_fsync)(struct vnode *, int);
int (*io_sync)(struct mount *);
void (*io_movedeps)(struct buf *, struct buf *);
int (*io_countdeps)(struct buf *, int);
Index: sys/sys/unistd.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/sys/unistd.h,v
retrieving revision 1.31
diff -u -u -r1.31 unistd.h
--- sys/sys/unistd.h 10 Nov 2004 04:02:52 -0000 1.31
+++ sys/sys/unistd.h 16 Dec 2004 19:59:03 -0000
@@ -123,6 +123,7 @@
*/
#define FDATASYNC 0x0010 /* sync data and minimal metadata */
#define FFILESYNC 0x0020 /* sync data and metadata */
+#define FDISKSYNC 0x0040 /* flush disk caches after sync */
#endif
=20
/* configurable pathname variables; use as argument to pathconf(3) */
Index: sys/sys/vnode.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.127
diff -u -u -r1.127 vnode.h
--- sys/sys/vnode.h 10 Nov 2004 17:30:56 -0000 1.127
+++ sys/sys/vnode.h 16 Dec 2004 19:59:04 -0000
@@ -272,6 +272,7 @@
#define FSYNC_DATAONLY 0x0002 /* fsync: hint: sync file data only */
#define FSYNC_RECLAIM 0x0004 /* fsync: hint: vnode is being reclaimed */
#define FSYNC_LAZY 0x0008 /* fsync: lazy sync (trickle) */
+#define FSYNC_CACHE 0x0100 /* fsync: flush disk caches too */
=20
#define UPDATE_WAIT 0x0001 /* update: wait for completion */
#define UPDATE_DIROP 0x0002 /* update: hint to fs to wait or not */
Index: sys/ufs/ffs/ffs_softdep.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.60
diff -u -u -r1.60 ffs_softdep.c
--- sys/ufs/ffs/ffs_softdep.c 29 Aug 2004 10:13:48 -0000 1.60
+++ sys/ufs/ffs/ffs_softdep.c 16 Dec 2004 19:59:09 -0000
@@ -38,6 +38,7 @@
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/callout.h>
+#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
@@ -202,7 +203,7 @@
static void softdep_disk_io_initiation __P((struct buf *));
static void softdep_disk_write_complete __P((struct buf *));
static void softdep_deallocate_dependencies __P((struct buf *));
-static int softdep_fsync __P((struct vnode *));
+static int softdep_fsync __P((struct vnode *, int));
static int softdep_process_worklist __P((struct mount *));
static void softdep_move_dependencies __P((struct buf *, struct buf *));
static int softdep_count_dependencies __P((struct buf *bp, int));
@@ -4663,8 +4664,9 @@
* entries for the inode have been written after the inode gets to disk.
*/
static int
-softdep_fsync(vp)
+softdep_fsync(vp, f)
struct vnode *vp; /* the "in_core" copy of the inode */
+ int f; /* Flags */
{
struct diradd *dap;
struct inodedep *inodedep;
@@ -4679,6 +4681,7 @@
int error, flushparent;
ino_t parentino;
daddr_t lbn;
+ long l;
=20
ip =3D VTOI(vp);
fs =3D ip->i_fs;
@@ -4777,6 +4780,14 @@
break;
}
FREE_LOCK(&lk);
+ if (f & FSYNC_CACHE) {
+ /*
+ * If requested, make sure all of these changes don't
+ * linger in disk caches
+ */
+ l =3D 0;
+ VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE, p->p_ucred, p);
+ }
return (0);
}
=20
Index: sys/ufs/ffs/ffs_vnops.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_vnops.c,v
retrieving revision 1.66
diff -u -u -r1.66 ffs_vnops.c
--- sys/ufs/ffs/ffs_vnops.c 15 Nov 2003 01:19:38 -0000 1.66
+++ sys/ufs/ffs/ffs_vnops.c 16 Dec 2004 19:59:09 -0000
@@ -317,9 +317,17 @@
}
splx(s);
=20
- return (VOP_UPDATE(vp, NULL, NULL,
+ error =3D VOP_UPDATE(vp, NULL, NULL,
((ap->a_flags & (FSYNC_WAIT | FSYNC_DATAONLY)) =3D=3D FSYNC_WAIT)
- ? UPDATE_WAIT : 0));
+ ? UPDATE_WAIT : 0);
+
+ if (error =3D=3D 0 && ap->a_flags & FSYNC_CACHE) {
+ long l =3D 0;
+ VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+ ap->a_p->p_ucred, ap->a_p);
+ }
+
+ return error;
}
=20
/*
@@ -451,7 +459,15 @@
waitfor =3D 0;
else
waitfor =3D (ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0;
- return (VOP_UPDATE(vp, NULL, NULL, waitfor));
+ error =3D VOP_UPDATE(vp, NULL, NULL, waitfor);
+
+ if (error =3D=3D 0 && ap->a_flags & FSYNC_WAIT) {
+ long i =3D 0;
+ error =3D VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE,
+ ap->a_p->p_ucred, ap->a_p);
+ }
+
+ return error;
}
=20
/*
Index: sys/ufs/lfs/lfs_vnops.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/ufs/lfs/lfs_vnops.c,v
retrieving revision 1.132
diff -u -u -r1.132 lfs_vnops.c
--- sys/ufs/lfs/lfs_vnops.c 22 Apr 2004 10:45:00 -0000 1.132
+++ sys/ufs/lfs/lfs_vnops.c 16 Dec 2004 19:59:11 -0000
@@ -317,6 +317,11 @@
if (error)
return error;
error =3D VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
+ if (error =3D=3D 0 && ap->a_flags & FSYNC_CACHE) {
+ long l =3D 0;
+ error =3D VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+ ap->a_p->p_ucred, ap->a_p);
+ }
if (wait && !VPISEMPTY(vp))
LFS_SET_UINO(VTOI(vp), IN_MODIFIED);
=20
--k4f25fnPtRuIRUb3--
--TD8GDToEDw0WLGOL
Content-Type: application/pgp-signature
Content-Disposition: inline
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (NetBSD)
iD8DBQFBwe0JWz+3JHUci9cRAvRwAJ4jrOwPFNSyVlT+zcJA0tplLAnqmgCeOjtO
rDbTYAEDA1XWUo+SXcjYyzo=
=L5wj
-----END PGP SIGNATURE-----
--TD8GDToEDw0WLGOL--