Subject: Re: Is O_DIRECT useless on NetBSD?
To: Roland Illig <rillig@NetBSD.org>
From: Antti Kantee <pooka@cs.hut.fi>
List: tech-kern
Date: 11/23/2007 17:51:14
--9zSXsLTf0vkW971A
Content-Type: text/plain; charset=iso-8859-1
Content-Disposition: inline
Content-Transfer-Encoding: 8bit

On Thu Nov 22 2007 at 23:53:34 +0100, Roland Illig wrote:
> Pavel Cahyna wrote:
> >>On Wed Nov 21 2007 at 16:08:59 +0100, Roland Illig wrote:
> >>
> >>>So am I right that currently there is no way for a program to say "I 
> >>>will write some data to that file, and nobody is going to use it in the 
> >>>next time, so please don't buffer it"?
> >
> >Or posix_fadvise?
> 
> Nice idea, but ...
> 
> BUGS
>      POSIX_FADV_WILLNEED, POSIX_FADV_DONTNEED, and POSIX_FADV_NOREUSE
>      are not implemented.

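This quick patch/hack adds support for POSIX_FADV_NOREUSE.  Be aware that
I/O will be quite slow if you use small I/O request sizes, since the patch
cleans and frees all the pages in a UBC window whenever the last reference
to that window is released.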

-- 
Antti Kantee <pooka@iki.fi>                     Of course he runs NetBSD
http://www.iki.fi/pooka/                          http://www.NetBSD.org/
    "la qualité la plus indispensable du cuisinier est l'exactitude"

--9zSXsLTf0vkW971A
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="noreuse.diff"

Index: sys/vnode.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.174
diff -p -u -r1.174 vnode.h
--- sys/vnode.h	23 Nov 2007 11:53:45 -0000	1.174
+++ sys/vnode.h	23 Nov 2007 15:49:54 -0000
@@ -254,7 +254,7 @@ extern struct simplelock global_v_numout
 #define	IO_NORMAL	0x00800		/* operate on regular data */
 #define	IO_EXT		0x01000		/* operate on extended attributes */
 #define	IO_DIRECT	0x02000		/* direct I/O hint */
-#define	IO_ADV_MASK	0x00003		/* access pattern hint */
+#define	IO_ADV_MASK	0x00007		/* access pattern hint */
 
 #define	IO_ADV_SHIFT	0
 #define	IO_ADV_ENCODE(adv)	(((adv) << IO_ADV_SHIFT) & IO_ADV_MASK)
Index: kern/kern_descrip.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_descrip.c,v
retrieving revision 1.162
diff -p -u -r1.162 kern_descrip.c
--- kern/kern_descrip.c	7 Nov 2007 00:23:20 -0000	1.162
+++ kern/kern_descrip.c	23 Nov 2007 15:49:55 -0000
@@ -1671,9 +1671,11 @@ sys_posix_fadvise(struct lwp *l, void *v
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_RANDOM:
 	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_NOREUSE:
 		KASSERT(POSIX_FADV_NORMAL == UVM_ADV_NORMAL);
 		KASSERT(POSIX_FADV_RANDOM == UVM_ADV_RANDOM);
 		KASSERT(POSIX_FADV_SEQUENTIAL == UVM_ADV_SEQUENTIAL);
+		KASSERT(POSIX_FADV_NOREUSE == UVM_ADV_NOREUSE);
 
 		/*
 		 * we ignore offset and size.
@@ -1684,7 +1686,6 @@ sys_posix_fadvise(struct lwp *l, void *v
 
 	case POSIX_FADV_WILLNEED:
 	case POSIX_FADV_DONTNEED:
-	case POSIX_FADV_NOREUSE:
 
 		/*
 		 * not implemented yet.
Index: kern/vfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.143
diff -p -u -r1.143 vfs_vnops.c
--- kern/vfs_vnops.c	10 Oct 2007 20:42:27 -0000	1.143
+++ kern/vfs_vnops.c	23 Nov 2007 15:49:55 -0000
@@ -444,10 +444,11 @@ vn_write(struct file *fp, off_t *offset,
     int flags)
 {
 	struct vnode *vp = (struct vnode *)fp->f_data;
-	int count, error, ioflag = IO_UNIT;
+	int count, error, ioflag;
 	struct lwp *l = curlwp;
 
 	mutex_enter(&fp->f_lock);
+	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
 		ioflag |= IO_APPEND;
 	if (fp->f_flag & FNONBLOCK)
Index: uvm/uvm_bio.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_bio.c,v
retrieving revision 1.62
diff -p -u -r1.62 uvm_bio.c
--- uvm/uvm_bio.c	27 Jul 2007 09:50:37 -0000	1.62
+++ uvm/uvm_bio.c	23 Nov 2007 15:49:55 -0000
@@ -575,12 +575,14 @@ ubc_release(void *va, int flags)
 	struct uvm_object *uobj;
 	vaddr_t umapva;
 	bool unmapped;
+	int advice;
 	UVMHIST_FUNC("ubc_release"); UVMHIST_CALLED(ubchist);
 
 	UVMHIST_LOG(ubchist, "va %p", va, 0, 0, 0);
 	umap = &ubc_object.umap[((char *)va - ubc_object.kva) >> ubc_winshift];
 	umapva = UBC_UMAP_ADDR(umap);
 	uobj = umap->uobj;
+	advice = umap->advice;
 	KASSERT(uobj != NULL);
 
 	if (umap->flags & UMAP_PAGES_LOCKED) {
@@ -625,7 +627,19 @@ ubc_release(void *va, int flags)
 	umap->writelen = 0;
 	umap->refcount--;
 	if (umap->refcount == 0) {
-		if (flags & UBC_UNMAP) {
+		if ((flags & UBC_UNMAP) || (advice == UVM_ADV_NOREUSE)) {
+
+			/*
+			 * Clean & free all the pages in the window if the
+			 * advice says they will not be needed any longer.
+			 */
+			if (advice == UVM_ADV_NOREUSE) {
+				simple_lock(&uobj->vmobjlock);
+				(void) uobj->pgops->pgo_put(uobj,
+				    umap->offset & ~(ubc_winsize-1),
+				    (umap->offset+ubc_winsize)&~(ubc_winsize-1),
+				    PGO_CLEANIT | PGO_FREE);
+			}
 
 			/*
 			 * Invalidate any cached mappings if requested.
Index: uvm/uvm_extern.h
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.136
diff -p -u -r1.136 uvm_extern.h
--- uvm/uvm_extern.h	6 Nov 2007 00:42:46 -0000	1.136
+++ uvm/uvm_extern.h	23 Nov 2007 15:49:56 -0000
@@ -132,7 +132,9 @@ typedef voff_t pgoff_t;		/* XXX: number 
 #define UVM_ADV_NORMAL	0x0	/* 'normal' */
 #define UVM_ADV_RANDOM	0x1	/* 'random' */
 #define UVM_ADV_SEQUENTIAL 0x2	/* 'sequential' */
-/* 0x3: will need, 0x4: dontneed */
+#define UVM_ADV_WILLNEED 0x3	/* pages will be needed */
+#define UVM_ADV_DONTNEED 0x4	/* pages won't be needed */
+#define UVM_ADV_NOREUSE	0x5	/* pages will be used only once */
 #define UVM_ADV_MASK	0x7	/* mask */
 
 /* bits 0xffff0000: mapping flags */
Index: uvm/uvm_readahead.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_readahead.c,v
retrieving revision 1.4
diff -p -u -r1.4 uvm_readahead.c
--- uvm/uvm_readahead.c	11 May 2007 12:11:09 -0000	1.4
+++ uvm/uvm_readahead.c	23 Nov 2007 15:49:56 -0000
@@ -195,7 +195,8 @@ uvm_ra_request(struct uvm_ractx *ra, int
     off_t reqoff, size_t reqsize)
 {
 
-	if (ra == NULL || advice == UVM_ADV_RANDOM) {
+	if (ra == NULL
+	    || advice == UVM_ADV_RANDOM || advice == UVM_ADV_NOREUSE) {
 		return;
 	}
 
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.84
diff -p -u -r1.84 ufs_readwrite.c
--- ufs/ufs/ufs_readwrite.c	10 Oct 2007 20:42:40 -0000	1.84
+++ ufs/ufs/ufs_readwrite.c	23 Nov 2007 15:49:56 -0000
@@ -380,8 +380,8 @@ WRITE(void *v)
 		 */
 
 		ubc_flags |= UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
-		error = ubc_uiomove(&vp->v_uobj, uio, bytelen, UVM_ADV_RANDOM,
-		    ubc_flags);
+		error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
+		    IO_ADV_DECODE(ioflag), ubc_flags);
 
 		/*
 		 * update UVM's notion of the size now that we've

--9zSXsLTf0vkW971A--