tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

RFC: lseek() extensions: SEEK_HOLE / SEEK_DATA with patch



Dear folks,

I've implemented the SEEK_HOLE / SEEK_DATA additions to lseek() as introduced
by Solaris for ZFS. It ought to perform as in the Solaris implementation. For
a test program and trace on Solaris 10 see https://lkml.org/lkml/2011/4/22/79

Test runs have been made on NFS (genfs_seek) and FFS (ufs_seek) and are
attached to this message next to the patch. The runs indicate that it behaves
like it ought to do. I've also added a simple demo program that shows how it
could be used inside cp(1).

The only issue this patch is not addressing is puffs. The puffs module won't
build since the protocol for VOP_SEEK() has been changed and I don't know yet
how to adequately fix that together with the needed compat code. Assistance
with this issue, even if just pointers, would be most welcome.

With regards,
Reinoud

luiaard# rm file
luiaard# 
luiaard# ./sparse-test2
creating file
Undefined error: 0
fpathconf gives -1, ENXIO is 6

testing at start
CUR at offset 0, errno 0
HOLE gives offset 1048578, errno 0
CUR at offset 1048578, errno 0
DATA gives offset 0, errno 0
CUR at offset 0, errno 0
HOLE gives offset 1048578, errno 0
CUR at offset 1048578, errno 0

testing at end
end at offset 1048578, errno 0
HOLE gives offset -1, errno 6
CUR at offset 1048578, errno 0
DATA gives offset -1, errno 6
CUR at offset 1048578, errno 0
HOLE gives offset 1048578, errno 0
CUR at offset 1048578, errno 0
DATA gives offset 1048577, errno 0
CUR at offset 1048577, errno 0

testing at offset 1
HOLE gives offset 1048578, errno 0
CUR at offset 1048578, errno 0
DATA gives offset 1, errno 0
CUR at offset 1, errno 0

testing at offset 200000
HOLE gives offset 1048578, errno 0
CUR at offset 1048578, errno 0
DATA gives offset 200000, errno 0
CUR at offset 200000, errno 0
luiaard# 
luiaard# ./print-allocs file
st.st_size: 1048578
st.st_blocks: 96
st.st_blksize: 32768
nlks: 33 * 32768 = 1081344
               0 ->               33
luiaard# 

luiaard# mount_ffs -o log /dev/wd0a /wd0a
luiaard# 
luiaard# cd /wd0a
luiaard# rm file
rm: file: No such file or directory
luiaard# 
luiaard# ~/sparse-test2
creating file
Undefined error: 0
fpathconf gives -1, ENXIO is 6

testing at start
CUR at offset 0, errno 0
HOLE gives offset 16384, errno 0
CUR at offset 16384, errno 0
DATA gives offset 0, errno 0
CUR at offset 0, errno 0
HOLE gives offset 16384, errno 0
CUR at offset 16384, errno 0

testing at end
end at offset 1048578, errno 0
HOLE gives offset -1, errno 6
CUR at offset 1048578, errno 0
DATA gives offset -1, errno 6
CUR at offset 1048578, errno 0
HOLE gives offset 1048578, errno 0
CUR at offset 1048578, errno 0
DATA gives offset 1048577, errno 0
CUR at offset 1048577, errno 0

testing at offset 1
HOLE gives offset 16384, errno 0
CUR at offset 16384, errno 0
DATA gives offset 1, errno 0
CUR at offset 1, errno 0

testing at offset 200000
HOLE gives offset 200000, errno 0
CUR at offset 200000, errno 0
DATA gives offset 1048576, errno 0
CUR at offset 1048576, errno 0
luiaard# 
luiaard# 
luiaard# ~/print-allocs file
st.st_size: 1048578
st.st_blocks: 96
st.st_blksize: 16384
nlks: 65 * 16384 = 1064960
               0 ->                0
              64 ->               65
luiaard# 
luiaard# cd
luiaard# umount /wd0a

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#include <inttypes.h>

/*
 * Size in bytes of the transfer buffer.  Parenthesised so the macro expands
 * as a single value inside larger expressions (e.g. x / BLOB_SIZE).
 */
#define BLOB_SIZE (32*1024)

/* Transfer buffer of BLOB_SIZE bytes used when copying file data. */
char blob[BLOB_SIZE];
/* Input and output file descriptors; assigned by main(). */
int fdi, fdo;

/* Smaller of the two arguments; an argument may be evaluated twice. */
#define MIN(a,b) ((a) < (b) ? (a) : (b))

/*
 * Copy the byte range [start, start + len) from the input descriptor fdi to
 * the same offsets of the output descriptor fdo, going through the global
 * blob buffer in chunks of at most BLOB_SIZE bytes.
 *
 * A len that is zero or negative copies nothing.  A read or write error is
 * reported on stderr and terminates the program, since the function has no
 * way to return a status.  Hitting end-of-file on the input before len bytes
 * were transferred stops the copy early.
 */
void filedatacopy(off_t start, off_t len)
{
        ssize_t nread, nwritten, done;
        size_t chunk;

        /* off_t is signed; cast so the argument matches the conversion */
        printf("copy from %"PRIu64" to %"PRIu64"\n",
            (uint64_t)start, (uint64_t)(start + len));

        while (len > 0) {
                chunk = (size_t)MIN(BLOB_SIZE, len);

                nread = pread(fdi, blob, chunk, start);
                if (nread == -1) {
                        perror("pread");
                        exit(EXIT_FAILURE);
                }
                if (nread == 0)
                        break;  /* input ended before the range did */

                /* pwrite() may write less than asked; push out the rest */
                for (done = 0; done < nread; done += nwritten) {
                        nwritten = pwrite(fdo, (const char *)blob + done,
                            (size_t)(nread - done), start + done);
                        if (nwritten <= 0) {
                                if (nwritten == -1)
                                        perror("pwrite");
                                else
                                        fprintf(stderr,
                                            "pwrite: no progress\n");
                                exit(EXIT_FAILURE);
                        }
                }

                /* advance by what was really transferred */
                start += nread;
                len   -= nread;
        }
}


/*
 * Sparse-copy demo: copy the file named by argv[1] to "file-out" in the
 * current directory.  Only the data regions reported by lseek(SEEK_DATA) and
 * lseek(SEEK_HOLE) are transferred, so regions that are never written remain
 * holes in the output.
 *
 * Returns EXIT_SUCCESS when the copy completed, EXIT_FAILURE on a usage
 * error or when opening, stat'ing, seeking in or finalising a file fails.
 */
int main(int argc, char **argv)
{
        off_t extent, data, hole, pos;
        struct stat st;
        int error;
        char *filen;

        if (argc != 2) {
                printf("Use %s fname\n", argv[0]);
                return EXIT_FAILURE;
        }
        filen = argv[1];

        if ((fdi = open(filen, O_RDONLY, 0)) == -1) {
                printf("Can't open my input file\n");
                return EXIT_FAILURE;
        }
        if (fstat(fdi, &st)) {
                printf("can't stat file\n");
                return EXIT_FAILURE;
        }

        if ((fdo = open("file-out", O_WRONLY | O_TRUNC | O_CREAT, 0666)) == -1) {
                printf("Can't open output file\n");
                return EXIT_FAILURE;
        }

        /* sparse copy routine */
        extent = st.st_size;

        error = 0;
        pos   = 0;

        while (pos < extent) {
                /*
                 * Save errno directly after each lseek(); the printf() and
                 * the following lseek() may overwrite it.
                 */
                data = lseek(fdi, pos, SEEK_DATA);
                error = (data == -1) ? errno : 0;
                printf("DATA = %"PRIi64"\n", (int64_t)data);
                if (data == -1)
                        break;

                hole = lseek(fdi, data, SEEK_HOLE);
                error = (hole == -1) ? errno : 0;
                printf("HOLE = %"PRIi64"\n", (int64_t)hole);
                if (hole == -1)
                        break;

                filedatacopy(data, hole - data);
                pos = hole;
        }

        /* ENXIO only means there is no further data region: not an error */
        if (error && (error != ENXIO)) {
                printf("error occured in transfer\n");
                return EXIT_FAILURE;
        }

        /*
         * When the input ends in a hole nothing was written up to its last
         * byte; extend the output to the input size so the trailing hole is
         * kept and both files have the same length.
         */
        if (ftruncate(fdo, extent) == -1) {
                printf("Can't set size of output file\n");
                return EXIT_FAILURE;
        }

        close(fdi);
        if (close(fdo) == -1) {
                printf("Can't close output file\n");
                return EXIT_FAILURE;
        }

        return EXIT_SUCCESS;
}

Index: lib/libc/sys/lseek.2
===================================================================
RCS file: /cvsroot/src/lib/libc/sys/lseek.2,v
retrieving revision 1.24
diff -u -p -r1.24 lseek.2
--- lib/libc/sys/lseek.2        5 Apr 2010 07:53:47 -0000       1.24
+++ lib/libc/sys/lseek.2        7 Aug 2011 07:34:25 -0000
@@ -29,7 +29,7 @@
 .\"
 .\"     @(#)lseek.2    8.3 (Berkeley) 4/19/94
 .\"
-.Dd April 3, 2010
+.Dd August 7, 2011
 .Dt LSEEK 2
 .Os
 .Sh NAME
@@ -86,6 +86,24 @@ the offset is set to the size of the
 file plus
 .Fa offset
 bytes.
+.It
+If
+.Fa whence
+is
+.Dv SEEK_HOLE ,
+the offset is set to the "hole" file region,
+either pointed to or to the start of the first one following the supplied
+.Fa offset
+in bytes.
+.It
+If
+.Fa whence
+is
+.Dv SEEK_DATA ,
+the offset is set to the first non-"hole" file region,
+either pointed to or to the start of the first one following the supplied
+.Fa offset
+in bytes.
 .El
 .Pp
 The
@@ -98,6 +116,17 @@ bytes of zeros (until data is actually w
 .Pp
 Some devices are incapable of seeking.
 The value of the pointer associated with such a device is undefined.
+.Pp
+In this function, a "hole" is defined as a gap in a file, a contiguous range of bytes in a file
+that are, when read in, all zero. Note that NOT all zero ranges are guaranteed to be
+returned as holes by
+.Dv SEEK_HOLE
+though filesystems are allowed to not only look at the file's allocation to find
+holes but also to look at the recorded contents to find holes. The offsets
+returned thus do not have to be an integer multiple of the filesystem block size.
+.Pp
+To ease implementation, a hole is guaranteed after each data region. This
+implies that there is a virtual hole at the end of the file.
 .Sh RETURN VALUES
 Upon successful completion,
 .Fn lseek
@@ -142,6 +171,12 @@ for
 due to a larger
 .Fa offset
 argument type.
+.Pp
+The
+.Dv SEEK_HOLE
+and
+.Dv SEEK_DATA
+extensions were added by Solaris for their ZFS.
 .Sh BUGS
 This document's use of
 .Fa whence
Index: rump/include/rump/rumpvnode_if.h
===================================================================
RCS file: /cvsroot/src/sys/rump/include/rump/rumpvnode_if.h,v
retrieving revision 1.11
diff -u -p -r1.11 rumpvnode_if.h
--- rump/include/rump/rumpvnode_if.h    11 Jul 2011 08:27:39 -0000      1.11
+++ rump/include/rump/rumpvnode_if.h    7 Aug 2011 07:34:26 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rumpvnode_if.h,v 1.11 2011/07/11 08:27:39 hannken Exp $        
*/
+/*     $NetBSD$        */
 
 /*
  * Warning: DO NOT EDIT! This file is automatically generated!
@@ -71,7 +71,8 @@ int RUMP_VOP_KQFILTER(struct vnode *, st
 int RUMP_VOP_REVOKE(struct vnode *, int);
 int RUMP_VOP_MMAP(struct vnode *, int, struct kauth_cred *);
 int RUMP_VOP_FSYNC(struct vnode *, struct kauth_cred *, int, off_t, off_t);
-int RUMP_VOP_SEEK(struct vnode *, off_t, off_t, struct kauth_cred *);
+int RUMP_VOP_SEEK(struct vnode *, off_t, int, off_t, off_t *, 
+    struct kauth_cred *);
 int RUMP_VOP_REMOVE(struct vnode *, struct vnode *, struct componentname *);
 int RUMP_VOP_LINK(struct vnode *, struct vnode *, struct componentname *);
 int RUMP_VOP_RENAME(struct vnode *, struct vnode *, struct componentname *, 
Index: rump/librump/rumpvfs/rumpvnode_if.c
===================================================================
RCS file: /cvsroot/src/sys/rump/librump/rumpvfs/rumpvnode_if.c,v
retrieving revision 1.10
diff -u -p -r1.10 rumpvnode_if.c
--- rump/librump/rumpvfs/rumpvnode_if.c 11 Jul 2011 08:27:39 -0000      1.10
+++ rump/librump/rumpvfs/rumpvnode_if.c 7 Aug 2011 07:34:26 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rumpvnode_if.c,v 1.10 2011/07/11 08:27:39 hannken Exp $        
*/
+/*     $NetBSD$        */
 
 /*
  * Warning: DO NOT EDIT! This file is automatically generated!
@@ -40,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rumpvnode_if.c,v 1.10 2011/07/11 08:27:39 hannken 
Exp $");
+__KERNEL_RCSID(0, "$NetBSD$");
 
 #include <sys/param.h>
 #include <sys/mount.h>
@@ -310,14 +310,16 @@ RUMP_VOP_FSYNC(struct vnode *vp,
 
 int
 RUMP_VOP_SEEK(struct vnode *vp,
-    off_t oldoff,
-    off_t newoff,
+    off_t oldoffset,
+    int type,
+    off_t givenoffset,
+    off_t *newoffset,
     struct kauth_cred *cred)
 {
        int error;
 
        rump_schedule();
-       error = VOP_SEEK(vp, oldoff, newoff, cred);
+       error = VOP_SEEK(vp, oldoffset, type, givenoffset, newoffset, cred);
        rump_unschedule();
 
        return error;
Index: sys/fs/union/union_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/fs/union/union_vnops.c,v
retrieving revision 1.41
diff -u -p -r1.41 union_vnops.c
--- sys/fs/union/union_vnops.c  5 Aug 2011 08:17:47 -0000       1.41
+++ sys/fs/union/union_vnops.c  7 Aug 2011 07:34:37 -0000
@@ -1130,8 +1130,10 @@ union_seek(void *v)
 {
        struct vop_seek_args /* {
                struct vnode *a_vp;
-               off_t  a_oldoff;
-               off_t  a_newoff;
+               off_t a_oldoffset;
+               int a_whence;
+               off_t a_givenoffset;
+               off_t *a_newoffset;
                kauth_cred_t a_cred;
        } */ *ap = v;
        struct vnode *ovp = OTHERVP(ap->a_vp);
Index: sys/kern/sys_generic.c
===================================================================
RCS file: /cvsroot/src/sys/kern/sys_generic.c,v
retrieving revision 1.127
diff -u -p -r1.127 sys_generic.c
--- sys/kern/sys_generic.c      27 Jul 2011 14:35:34 -0000      1.127
+++ sys/kern/sys_generic.c      7 Aug 2011 07:34:38 -0000
@@ -218,7 +218,8 @@ do_filereadv(int fd, const struct iovec 
                 * XXX This works because no file systems actually
                 * XXX take any action on the seek operation.
                 */
-               error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
+               error = VOP_SEEK(vp, fp->f_offset, SEEK_CUR, 0, offset,
+                               fp->f_cred);
                if (error != 0)
                        goto out;
        }
@@ -423,7 +424,8 @@ do_filewritev(int fd, const struct iovec
                 * XXX This works because no file systems actually
                 * XXX take any action on the seek operation.
                 */
-               error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
+               error = VOP_SEEK(vp, fp->f_offset, SEEK_CUR, 0, offset,
+                               fp->f_cred);
                if (error != 0)
                        goto out;
        }
Index: sys/kern/vfs_syscalls.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.432
diff -u -p -r1.432 vfs_syscalls.c
--- sys/kern/vfs_syscalls.c     24 Jul 2011 09:40:10 -0000      1.432
+++ sys/kern/vfs_syscalls.c     7 Aug 2011 07:34:38 -0000
@@ -2031,43 +2031,27 @@ sys_lseek(struct lwp *l, const struct sy
        kauth_cred_t cred = l->l_cred;
        file_t *fp;
        struct vnode *vp;
-       struct vattr vattr;
-       off_t newoff;
-       int error, fd;
+       off_t offset, newoff;
+       int error, whence, fd;
 
        fd = SCARG(uap, fd);
+       whence = SCARG(uap, whence);
+       offset = SCARG(uap, offset);
 
        if ((fp = fd_getfile(fd)) == NULL)
                return (EBADF);
 
        vp = fp->f_data;
        if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
-               error = ESPIPE;
-               goto out;
+               fd_putfile(fd);
+               return (ESPIPE);
        }
 
-       switch (SCARG(uap, whence)) {
-       case SEEK_CUR:
-               newoff = fp->f_offset + SCARG(uap, offset);
-               break;
-       case SEEK_END:
-               error = VOP_GETATTR(vp, &vattr, cred);
-               if (error) {
-                       goto out;
-               }
-               newoff = SCARG(uap, offset) + vattr.va_size;
-               break;
-       case SEEK_SET:
-               newoff = SCARG(uap, offset);
-               break;
-       default:
-               error = EINVAL;
-               goto out;
-       }
-       if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
+       if ((error = VOP_SEEK(vp, fp->f_offset,
+                       whence, offset, &newoff, cred)) == 0) {
                *(off_t *)retval = fp->f_offset = newoff;
        }
- out:
+
        fd_putfile(fd);
        return (error);
 }
@@ -2109,7 +2093,8 @@ sys_pread(struct lwp *l, const struct sy
         * XXX This works because no file systems actually
         * XXX take any action on the seek operation.
         */
-       if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
+       if ((error = VOP_SEEK(vp, fp->f_offset,
+                       SEEK_SET, offset, NULL, fp->f_cred)) != 0)
                goto out;
 
        /* dofileread() will unuse the descriptor for us */
@@ -2176,7 +2161,8 @@ sys_pwrite(struct lwp *l, const struct s
         * XXX This works because no file systems actually
         * XXX take any action on the seek operation.
         */
-       if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
+       if ((error = VOP_SEEK(vp, fp->f_offset,
+                       SEEK_SET, offset, NULL, fp->f_cred)) != 0)
                goto out;
 
        /* dofilewrite() will unuse the descriptor for us */
Index: sys/kern/vnode_if.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vnode_if.c,v
retrieving revision 1.88
diff -u -p -r1.88 vnode_if.c
--- sys/kern/vnode_if.c 11 Jul 2011 08:27:38 -0000      1.88
+++ sys/kern/vnode_if.c 7 Aug 2011 07:34:38 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: vnode_if.c,v 1.88 2011/07/11 08:27:38 hannken Exp $    */
+/*     $NetBSD$        */
 
 /*
  * Warning: DO NOT EDIT! This file is automatically generated!
@@ -40,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vnode_if.c,v 1.88 2011/07/11 08:27:38 hannken Exp 
$");
+__KERNEL_RCSID(0, "$NetBSD$");
 
 #include <sys/param.h>
 #include <sys/mount.h>
@@ -678,7 +678,9 @@ const struct vnodeop_desc vop_seek_desc 
 int
 VOP_SEEK(struct vnode *vp,
     off_t oldoff,
-    off_t newoff,
+    int whence,
+    off_t givenoff,
+    off_t *newoff,
     kauth_cred_t cred)
 {
        int error;
@@ -687,6 +689,8 @@ VOP_SEEK(struct vnode *vp,
        a.a_desc = VDESC(vop_seek);
        a.a_vp = vp;
        a.a_oldoff = oldoff;
+       a.a_whence = whence;
+       a.a_givenoff = givenoff;
        a.a_newoff = newoff;
        a.a_cred = cred;
        mpsafe = (vp->v_vflag & VV_MPSAFE);
Index: sys/kern/vnode_if.src
===================================================================
RCS file: /cvsroot/src/sys/kern/vnode_if.src,v
retrieving revision 1.62
diff -u -p -r1.62 vnode_if.src
--- sys/kern/vnode_if.src       11 Jul 2011 08:23:00 -0000      1.62
+++ sys/kern/vnode_if.src       7 Aug 2011 07:34:38 -0000
@@ -245,13 +245,14 @@ vop_fsync {
 };
 
 #
-# Needs work: Is newoff right?  What's it mean?
 # XXX Locking protocol?
 #
 vop_seek {
        IN struct vnode *vp;
        IN off_t oldoff;
-       IN off_t newoff;
+       IN int whence;
+       IN off_t givenoff;
+       OUT off_t *newoff;
        IN kauth_cred_t cred;
 };
 
Index: sys/miscfs/genfs/genfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/miscfs/genfs/genfs_vnops.c,v
retrieving revision 1.187
diff -u -p -r1.187 genfs_vnops.c
--- sys/miscfs/genfs/genfs_vnops.c      12 Jun 2011 03:35:58 -0000      1.187
+++ sys/miscfs/genfs/genfs_vnops.c      7 Aug 2011 07:34:45 -0000
@@ -74,6 +74,7 @@ __KERNEL_RCSID(0, "$NetBSD: genfs_vnops.
 #include <sys/file.h>
 #include <sys/kauth.h>
 #include <sys/stat.h>
+#include <sys/buf.h>
 
 #include <miscfs/genfs/genfs.h>
 #include <miscfs/genfs/genfs_node.h>
@@ -104,14 +105,54 @@ genfs_seek(void *v)
        struct vop_seek_args /* {
                struct vnode *a_vp;
                off_t a_oldoff;
-               off_t a_newoff;
-               kauth_cred_t cred;
-       } */ *ap = v;
+               int a_whence;
+               off_t a_givenoff;
+               off_t *a_newoff;
+               kauth_cred_t a_cred;
+       }; */ *ap = v;
+       off_t newoffset;
+       struct vattr vattr;
+       int error;
 
-       if (ap->a_newoff < 0)
-               return (EINVAL);
+       error = VOP_GETATTR(ap->a_vp, &vattr, ap->a_cred);
+       if (error)
+               return error;
 
-       return (0);
+       /* initialise return value with old offset */
+       newoffset = ap->a_oldoff;
+       switch (ap->a_whence) {
+       case SEEK_CUR:
+               newoffset = ap->a_oldoff + ap->a_givenoff;
+               break;
+       case SEEK_END:
+               newoffset = ap->a_givenoff + vattr.va_size;
+               break;
+       case SEEK_SET:
+               newoffset = ap->a_givenoff;
+               break;
+       case SEEK_DATA:
+               /* if in the file space, there is one data block */
+               if (ap->a_givenoff >= vattr.va_size)
+                       return ENXIO;
+               newoffset = ap->a_givenoff;
+               break;
+       case SEEK_HOLE:
+               /* there exists one virtual hole at the end of the file */
+               if (ap->a_givenoff >= vattr.va_size)
+                       return ENXIO;
+               newoffset = vattr.va_size;
+               break;
+       default:
+               return EINVAL;
+       }
+
+       if (newoffset < 0)
+               return EINVAL;
+
+       if (ap->a_newoff)
+               *(ap->a_newoff) = newoffset;
+
+       return 0;
 }
 
 int
Index: sys/modules/ffs/Makefile
===================================================================
RCS file: /cvsroot/src/sys/modules/ffs/Makefile,v
retrieving revision 1.7
diff -u -p -r1.7 Makefile
--- sys/modules/ffs/Makefile    26 May 2011 12:56:31 -0000      1.7
+++ sys/modules/ffs/Makefile    7 Aug 2011 07:34:45 -0000
@@ -17,6 +17,7 @@ SRCS= ufs_bmap.c ufs_dirhash.c ufs_extat
 .PATH: ${S}/ufs/ffs
 SRCS+= ffs_alloc.c ffs_balloc.c ffs_inode.c ffs_subr.c ffs_tables.c \
        ffs_vfsops.c ffs_vnops.c ffs_snapshot.c \
-       ffs_bswap.c ffs_wapbl.c ffs_appleufs.c ffs_quota2.c
+       ffs_bswap.c ffs_wapbl.c ffs_appleufs.c ffs_quota2.c \
+       ufs_seek.c
 
 .include <bsd.kmodule.mk>
Index: sys/rump/include/rump/rumpvnode_if.h
===================================================================
RCS file: /cvsroot/src/sys/rump/include/rump/rumpvnode_if.h,v
retrieving revision 1.11
diff -u -p -r1.11 rumpvnode_if.h
--- sys/rump/include/rump/rumpvnode_if.h        11 Jul 2011 08:27:39 -0000      
1.11
+++ sys/rump/include/rump/rumpvnode_if.h        7 Aug 2011 07:34:45 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rumpvnode_if.h,v 1.11 2011/07/11 08:27:39 hannken Exp $        
*/
+/*     $NetBSD$        */
 
 /*
  * Warning: DO NOT EDIT! This file is automatically generated!
@@ -71,7 +71,8 @@ int RUMP_VOP_KQFILTER(struct vnode *, st
 int RUMP_VOP_REVOKE(struct vnode *, int);
 int RUMP_VOP_MMAP(struct vnode *, int, struct kauth_cred *);
 int RUMP_VOP_FSYNC(struct vnode *, struct kauth_cred *, int, off_t, off_t);
-int RUMP_VOP_SEEK(struct vnode *, off_t, off_t, struct kauth_cred *);
+int RUMP_VOP_SEEK(struct vnode *, off_t, int, off_t, off_t *, 
+    struct kauth_cred *);
 int RUMP_VOP_REMOVE(struct vnode *, struct vnode *, struct componentname *);
 int RUMP_VOP_LINK(struct vnode *, struct vnode *, struct componentname *);
 int RUMP_VOP_RENAME(struct vnode *, struct vnode *, struct componentname *, 
Index: sys/rump/librump/rumpvfs/rumpvnode_if.c
===================================================================
RCS file: /cvsroot/src/sys/rump/librump/rumpvfs/rumpvnode_if.c,v
retrieving revision 1.10
diff -u -p -r1.10 rumpvnode_if.c
--- sys/rump/librump/rumpvfs/rumpvnode_if.c     11 Jul 2011 08:27:39 -0000      
1.10
+++ sys/rump/librump/rumpvfs/rumpvnode_if.c     7 Aug 2011 07:34:45 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rumpvnode_if.c,v 1.10 2011/07/11 08:27:39 hannken Exp $        
*/
+/*     $NetBSD$        */
 
 /*
  * Warning: DO NOT EDIT! This file is automatically generated!
@@ -40,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rumpvnode_if.c,v 1.10 2011/07/11 08:27:39 hannken 
Exp $");
+__KERNEL_RCSID(0, "$NetBSD$");
 
 #include <sys/param.h>
 #include <sys/mount.h>
@@ -311,13 +311,15 @@ RUMP_VOP_FSYNC(struct vnode *vp,
 int
 RUMP_VOP_SEEK(struct vnode *vp,
     off_t oldoff,
-    off_t newoff,
+    int whence,
+    off_t givenoff,
+    off_t *newoff,
     struct kauth_cred *cred)
 {
        int error;
 
        rump_schedule();
-       error = VOP_SEEK(vp, oldoff, newoff, cred);
+       error = VOP_SEEK(vp, oldoff, whence, givenoff, newoff, cred);
        rump_unschedule();
 
        return error;
Index: sys/sys/unistd.h
===================================================================
RCS file: /cvsroot/src/sys/sys/unistd.h,v
retrieving revision 1.52
diff -u -p -r1.52 unistd.h
--- sys/sys/unistd.h    30 Aug 2009 16:38:48 -0000      1.52
+++ sys/sys/unistd.h    7 Aug 2011 07:34:45 -0000
@@ -167,6 +167,8 @@
 #define        SEEK_SET        0       /* set file offset to offset */
 #define        SEEK_CUR        1       /* set file offset to current plus offset */
 #define        SEEK_END        2       /* set file offset to EOF plus offset */
+#define        SEEK_DATA       3       /* Set file pointer to next data past offset */
+#define        SEEK_HOLE       4       /* Set file pointer to next hole past offset */
 
 #if defined(_NETBSD_SOURCE)
 /* whence values for lseek(2); renamed by POSIX 1003.1 */
Index: sys/sys/vnode_if.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode_if.h,v
retrieving revision 1.82
diff -u -p -r1.82 vnode_if.h
--- sys/sys/vnode_if.h  11 Jul 2011 08:27:39 -0000      1.82
+++ sys/sys/vnode_if.h  7 Aug 2011 07:34:45 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: vnode_if.h,v 1.82 2011/07/11 08:27:39 hannken Exp $    */
+/*     $NetBSD$        */
 
 /*
  * Warning: DO NOT EDIT! This file is automatically generated!
@@ -243,11 +243,13 @@ struct vop_seek_args {
        const struct vnodeop_desc *a_desc;
        struct vnode *a_vp;
        off_t a_oldoff;
-       off_t a_newoff;
+       int a_whence;
+       off_t a_givenoff;
+       off_t *a_newoff;
        kauth_cred_t a_cred;
 };
 extern const struct vnodeop_desc vop_seek_desc;
-int VOP_SEEK(struct vnode *, off_t, off_t, kauth_cred_t);
+int VOP_SEEK(struct vnode *, off_t, int, off_t, off_t *, kauth_cred_t);
 
 #define VOP_REMOVE_DESCOFFSET 20
 struct vop_remove_args {
Index: sys/ufs/files.ufs
===================================================================
RCS file: /cvsroot/src/sys/ufs/files.ufs,v
retrieving revision 1.26
diff -u -p -r1.26 files.ufs
--- sys/ufs/files.ufs   24 Mar 2011 17:05:45 -0000      1.26
+++ sys/ufs/files.ufs   7 Aug 2011 07:34:45 -0000
@@ -66,4 +66,5 @@ file  ufs/ufs/quota1_subr.c
 file   ufs/ufs/quota2_subr.c           quota2 & (ffs | lfs | mfs | ext2fs)
 file   ufs/ufs/ufs_vfsops.c            ffs | lfs | mfs | ext2fs
 file   ufs/ufs/ufs_vnops.c             ffs | lfs | mfs | ext2fs
+file   ufs/ufs/ufs_seek.c              ffs | lfs | mfs | ext2fs
 file   ufs/ufs/ufs_wapbl.c             ffs & wapbl
Index: sys/ufs/mfs/mfsnode.h
===================================================================
RCS file: /cvsroot/src/sys/ufs/mfs/mfsnode.h,v
retrieving revision 1.21
diff -u -p -r1.21 mfsnode.h
--- sys/ufs/mfs/mfsnode.h       26 Mar 2008 14:19:43 -0000      1.21
+++ sys/ufs/mfs/mfsnode.h       7 Aug 2011 07:34:45 -0000
@@ -69,7 +69,7 @@ struct mfsnode {
 #define        mfs_write       genfs_badop
 #define        mfs_poll        genfs_badop
 #define        mfs_mmap        genfs_badop
-#define        mfs_seek        genfs_badop
+#define        mfs_seek        genfs_seek
 #define        mfs_remove      genfs_badop
 #define        mfs_link        genfs_badop
 #define        mfs_rename      genfs_badop
Index: sys/ufs/ufs/ufs_bmap.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_bmap.c,v
retrieving revision 1.49
diff -u -p -r1.49 ufs_bmap.c
--- sys/ufs/ufs/ufs_bmap.c      6 Mar 2011 17:08:39 -0000       1.49
+++ sys/ufs/ufs/ufs_bmap.c      7 Aug 2011 07:34:45 -0000
@@ -146,7 +146,9 @@ ufs_bmaparray(struct vnode *vp, daddr_t 
                maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1;
        }
 
+       /* direct entry? */
        if (bn >= 0 && bn < NDADDR) {
+               /* direct entry so sectornrs are stored in the inode itself */
                if (nump != NULL)
                        *nump = 0;
                if (ump->um_fstype == UFS1)
@@ -176,6 +178,7 @@ ufs_bmaparray(struct vnode *vp, daddr_t 
                                *bnp = -1;
                        }
                } else if (runp) {
+                       /* get run length */
                        if (ump->um_fstype == UFS1) {
                                for (++bn; bn < NDADDR && *runp < maxrun &&
                                    is_sequential(ump,
@@ -192,20 +195,32 @@ ufs_bmaparray(struct vnode *vp, daddr_t 
                                        ufs_rw64(ip->i_ffs2_db[bn],
                                            UFS_MPNEEDSWAP(ump)));
                                    ++bn, ++*runp);
-                       }
+                       } /* if UFS1/UFS2 */
                }
                return (0);
        }
 
+       /* use passed indirect block trace or otherwise use temp. */
        xap = ap == NULL ? a : ap;
+
+       /* if number of indirections not requested back use own temp */
        if (!nump)
                nump = &num;
+
+       /*
+        * Get the array of logical block number/offset pairs which represent
+        * the path of indirect blocks required to access a data block. In the
+        * trace the offsets to where to look in those indirect blocks are
+        * recorded in xap->in_off, the lbn of the indirect blocks is recorded
+        * in xap->in_lbn.
+        */
+
        if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
                return (error);
 
        num = *nump;
 
-       /* Get disk address out of indirect block array */
+       /* get disk address out of indirect block array */
        if (ump->um_fstype == UFS1)
                daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off],
                    UFS_MPNEEDSWAP(ump));
@@ -213,11 +228,15 @@ ufs_bmaparray(struct vnode *vp, daddr_t 
                daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off],
                    UFS_MPNEEDSWAP(ump));
 
+       /* loop trough indirect blocks until we are at the low level */
        for (bp = NULL, ++xap; --num; ++xap) {
                /*
                 * Exit the loop if there is no disk address assigned yet and
                 * the indirect block isn't in the cache, or if we were
                 * looking for an indirect block and we've found it.
+                *
+                * Note that meta-lbn is the negative logical buffer number
+                * represting this indirect block.
                 */
 
                metalbn = xap->in_lbn;
@@ -258,6 +277,7 @@ ufs_bmaparray(struct vnode *vp, daddr_t 
                        panic("ufs_bmaparray: indirect block not in cache");
 #endif
                else {
+                       /* not found in the cache, read in indirect block */
                        trace(TR_BREADMISS, pack(vp, size), metalbn);
                        bp->b_blkno = blkptrtodb(ump, daddr);
                        bp->b_flags |= B_READ;
@@ -265,14 +285,23 @@ ufs_bmaparray(struct vnode *vp, daddr_t 
                        VOP_STRATEGY(vp, bp);
                        curlwp->l_ru.ru_inblock++;      /* XXX */
                        if ((error = biowait(bp)) != 0) {
+                               /* something went wrong reading in, bomb out */
                                brelse(bp, 0);
                                return (error);
                        }
                }
+
+               /*
+                * we have a valid indirect block now pointed by bp containing
+                * block numbers to either the next level of indirect blocks
+                * or to the actual data blocks itself
+                */
+
                if (ump->um_fstype == UFS1) {
                        daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off],
                            UFS_MPNEEDSWAP(ump));
                        if (num == 1 && daddr && runp) {
+                               /* we're at the low level, calc. run length */
                                for (bn = xap->in_off + 1;
                                    bn < MNINDIR(ump) && *runp < maxrun &&
                                    is_sequential(ump,
@@ -286,6 +315,7 @@ ufs_bmaparray(struct vnode *vp, daddr_t 
                        daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off],
                            UFS_MPNEEDSWAP(ump));
                        if (num == 1 && daddr && runp) {
+                               /* we're at the low level, calc. run length */
                                for (bn = xap->in_off + 1;
                                    bn < MNINDIR(ump) && *runp < maxrun &&
                                    is_sequential(ump,
@@ -295,8 +325,8 @@ ufs_bmaparray(struct vnode *vp, daddr_t 
                                            UFS_MPNEEDSWAP(ump)));
                                    ++bn, ++*runp);
                        }
-               }
-       }
+               } /* if UFS1/UFS2 */
+       } /* end for */
        if (bp)
                brelse(bp, 0);
 
Index: sys/ufs/ufs/ufs_extern.h
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_extern.h,v
retrieving revision 1.66
diff -u -p -r1.66 ufs_extern.h
--- sys/ufs/ufs/ufs_extern.h    17 Jul 2011 22:07:59 -0000      1.66
+++ sys/ufs/ufs/ufs_extern.h    7 Aug 2011 07:34:45 -0000
@@ -86,7 +86,7 @@ int   ufs_readlink(void *);
 int    ufs_remove(void *);
 int    ufs_rename(void *);
 int    ufs_rmdir(void *);
-#define        ufs_seek        genfs_seek
+int    ufs_seek(void *);
 #define        ufs_poll        genfs_poll
 int    ufs_setattr(void *);
 int    ufs_strategy(void *);
===================================================================
--- /dev/null   2011-08-07 03:51:18.000000000 +0200
+++ sys/ufs/ufs/ufs_seek.c      2011-08-06 12:13:03.000000000 +0200
@@ -0,0 +1,330 @@
+/*     $NetBSD$        */
+
+/*
+ * Copyright (c) 2011 Reinoud Zandijk
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *          This product includes software developed for the
+ *          NetBSD Project.  See http://www.NetBSD.org/ for
+ *          information about NetBSD.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * 
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/trace.h>
+#include <sys/unistd.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+
+/*
+ * Accessors for block pointers.  Each one selects the UFS1 field (read
+ * with ufs_rw32()) when ump->um_fstype == UFS1 and the UFS2 field (read
+ * with ufs_rw64()) otherwise, passing UFS_MPNEEDSWAP(ump) along.  They
+ * expand to expressions that use the variables `ump' and, for the INODE_
+ * variants, `ip' of the function they are used in.
+ *
+ * INODE_DADDR(bn): entry bn of the inode's direct block array.
+ */
+#define INODE_DADDR(bn) \
+       ((ump->um_fstype == UFS1) ? \
+               ufs_rw32(ip->i_ffs1_db[bn], UFS_MPNEEDSWAP(ump)) : \
+               ufs_rw64(ip->i_ffs2_db[bn], UFS_MPNEEDSWAP(ump)))
+/* INODE_IADDR(indir): entry indir of the inode's indirect block array. */
+#define INODE_IADDR(indir) \
+       ((ump->um_fstype == UFS1) ? \
+               ufs_rw32(ip->i_ffs1_ib[indir], UFS_MPNEEDSWAP(ump)) : \
+               ufs_rw64(ip->i_ffs2_ib[indir], UFS_MPNEEDSWAP(ump)))
+/*
+ * INDIR_IADDR(blk, offset): entry offset of the in-memory indirect block
+ * blk, which is treated as an array of uint32_t (UFS1) or uint64_t.
+ */
+#define INDIR_IADDR(blk, offset) \
+       ((ump->um_fstype == UFS1) ? \
+               ufs_rw32(((uint32_t *) (blk))[offset], UFS_MPNEEDSWAP(ump)) : \
+               ufs_rw64(((uint64_t *) (blk))[offset], UFS_MPNEEDSWAP(ump)))
+/*
+ * SEEK_CONDITION(addr, what): non-zero when block address addr matches
+ * the search; for SEEK_DATA that is a non-zero address, for any other
+ * value of what it is a zero address.
+ */
+#define SEEK_CONDITION(addr, what) \
+        (((what) == SEEK_DATA) ? ((addr) != 0) : ((addr) == 0))
+/* change to `#if 1' to enable the printf() tracing under SEEK_DEBUG */
+#if 0
+#define SEEK_DEBUG
+#endif
+
+
+/*
+ * Scan one indirect block, recursing into lower levels, for the first
+ * file block that satisfies SEEK_CONDITION() for `what'.
+ *
+ *     vp              vnode of the file being searched
+ *     what            SEEK_DATA or SEEK_HOLE
+ *     skip_bn         in/out: number of file blocks between *walking_bn
+ *                     and the position the search has to start at
+ *     walking_bn      in/out: logical file block currently looked at; on
+ *                     entry the first block covered by this indirect block
+ *     bsize           number of file blocks covered by one entry of this
+ *                     indirect block (1 on the lowest level)
+ *     level           levels below this one; 0 means the entries point
+ *                     at data blocks
+ *     indir_idx       index into the path filled in by ufs_getlbns()
+ *                     that holds the meta block number of this block
+ *     indir           disc address of this indirect block, 0 if it is
+ *                     not allocated
+ *     found           out: set non-zero when a matching block is found,
+ *                     *walking_bn is then its logical block number
+ *
+ * Returns 0 or an errno; when nothing was found *walking_bn has been
+ * advanced past the range covered by this indirect block.
+ */
+/* XXX could check SPCF_SHOULDYIELD, preempt(9) */
+static int
+ufs_seekilevel(struct vnode *vp, int what,
+       daddr_t *skip_bn, daddr_t *walking_bn,
+       uint32_t bsize, int level, int indir_idx, daddr_t indir, int *found)
+{
+       struct mount *mp;
+       struct inode *ip;
+       struct ufsmount *ump;
+       struct buf *bp;
+       struct indir indir_path[NIADDR + 1];
+       daddr_t daddr, span, d;
+       daddr_t metablkno;
+       uint32_t subbsize;
+       int entry, error;
+
+       ip = VTOI(vp);
+       mp = vp->v_mount;
+       ump = ip->i_ump;
+
+#ifdef SEEK_DEBUG
+       printf( "ufs_seek: check indirect entries, level %d, "
+               "skip = %"PRIi64", walking %"PRIi64", "
+               "daddr %"PRIi64"\n", level, *skip_bn, *walking_bn, indir);
+#endif
+
+       /*
+        * Number of file blocks covered by this indirect block.  Calculated
+        * in daddr_t since the product doesn't fit in 32 bits on the
+        * highest indirection level.
+        */
+       span = (daddr_t)MNINDIR(ump) * bsize;
+
+       /* check if it would be in this block */
+       if (*skip_bn >= span) {
+               *skip_bn    -= span;
+               *walking_bn += span;
+               return 0;
+       }
+
+       /*
+        * An unallocated indirect block is one big hole; there is nothing
+        * on disc to read for it.
+        */
+       if (indir == 0) {
+               if (SEEK_CONDITION(indir, what)) {
+                       /* the start position itself lies in the hole */
+                       *walking_bn += *skip_bn;
+                       *found = 1;
+               } else {
+                       /* no data in here, step over all of it */
+                       *walking_bn += span;
+               }
+               *skip_bn = 0;
+               return 0;
+       }
+
+       /* skip_bn < span, so the entry number fits an int */
+       entry = *skip_bn / bsize;
+
+       /* retrieve indirect block at disc address indir */
+       /* calculate metablockno */
+       memset(indir_path, 0, sizeof(indir_path));
+       error = ufs_getlbns(vp, *walking_bn, indir_path, NULL);
+       if (error)
+               return error;
+
+       metablkno = indir_path[indir_idx].in_lbn;
+       if (metablkno >= 0) {
+               printf("SEEK_DATA/SEEK_HOLE internal error; not a metablk\n");
+               return EIO;
+       }
+
+       /* this (negative) metablkno is used for caching */
+       bp = getblk(vp, metablkno, mp->mnt_stat.f_iosize, 0, 0);
+       if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0) {
+               /* not valid in the buffer cache, read it in */
+               bp->b_blkno = blkptrtodb(ump, indir);
+               bp->b_flags |= B_READ;
+               BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+               VOP_STRATEGY(vp, bp);
+               curproc->p_stats->p_ru.ru_inblock++;    /* XXX */
+               if ((error = biowait(bp)) != 0) {
+                       /* something went wrong reading in, bomb out */
+                       brelse(bp, 0);
+                       return error;
+               }
+       }
+
+       /* indirect block found, now go over its entries */
+       error = 0;
+       *skip_bn    -= (daddr_t)entry * bsize;
+       *walking_bn += (daddr_t)entry * bsize;
+
+       subbsize = bsize >> ump->um_lognindir;
+       for (; entry < MNINDIR(ump); entry++) {
+               daddr = INDIR_IADDR(bp->b_data, entry);
+               if (daddr && (level > 0)) {
+                       /* allocated indirect block, descend into it */
+                       error = ufs_seekilevel(vp, what, skip_bn, walking_bn,
+                               subbsize, level-1, indir_idx+1, daddr, found);
+                       if (error || *found)
+                               break;
+               } else {
+                       /*
+                        * Either a data block pointer (level 0) or a null
+                        * pointer on a higher level, i.e. a hole that covers
+                        * all bsize blocks of this entry.
+                        */
+                       d = MIN(*skip_bn, bsize);
+                       *skip_bn -= d;
+                       *walking_bn += d;
+                       if (*skip_bn == 0)
+                               *found = SEEK_CONDITION(daddr, what);
+                       if (*found)
+                               break;
+                       /* no match, advance to the start of the next entry */
+                       *walking_bn += bsize - d;
+               }
+       }
+       /* error condition, not in this level or we found it */
+       brelse(bp, 0);
+       return error;
+}
+
+
+/*
+ * Search the block pointers of the file behind vp for the first logical
+ * block at or after from_bn that is allocated (what == SEEK_DATA) or
+ * unallocated (otherwise).  Negative from_bn values are treated as 0.
+ *
+ * Returns 0 with the block number found in *walking_bn, ENXIO when
+ * SEEK_DATA found no allocated block, EFBIG when no hole was found at
+ * all, or the error passed up by ufs_seekilevel().
+ */
+static int
+ufs_seeksparse(struct vnode *vp, int what, daddr_t from_bn,
+       daddr_t *walking_bn)
+{
+       struct inode *ip;
+       struct ufsmount *ump;
+       daddr_t skip_bn, daddr, span;
+       uint32_t bsize;
+       int level, error, found;
+
+       ip = VTOI(vp);
+       ump = ip->i_ump;
+
+       /* we can't start before block zero */
+       if (from_bn < 0)
+               from_bn = 0;
+
+       skip_bn = from_bn;
+       *walking_bn = 0;
+
+       /*
+        * First check direct entries.  The bound is tested before the
+        * pointer is fetched so we never read past the direct block array.
+        */
+       if (skip_bn < NDADDR) {
+               for (*walking_bn = skip_bn; *walking_bn < NDADDR;
+                   *walking_bn += 1) {
+                       daddr = INODE_DADDR(*walking_bn);
+                       if (SEEK_CONDITION(daddr, what))
+                               return 0;
+               }
+       } else {
+               *walking_bn = NDADDR;
+       }
+       skip_bn = MAX(0, skip_bn - NDADDR);
+
+       /*
+        * Next the indirect levels; bsize is the number of file blocks
+        * covered by one entry of the indirect block of this level.
+        */
+       bsize = 1;
+       for (level = 0; level < NIADDR;
+           level++, bsize = bsize << ump->um_lognindir) {
+               daddr = INODE_IADDR(level);
+
+               if (daddr == 0) {
+                       /*
+                        * No indirect block allocated: the complete range
+                        * of this level is a hole and there is nothing to
+                        * read from disc.
+                        */
+                       span = (daddr_t)MNINDIR(ump) * bsize;
+                       if (skip_bn >= span) {
+                               /* start position is beyond this level */
+                               skip_bn -= span;
+                               *walking_bn += span;
+                       } else if (SEEK_CONDITION(daddr, what)) {
+                               /* start position lies in this hole */
+                               *walking_bn += skip_bn;
+                               return 0;
+                       } else {
+                               /* no data in here, step over all of it */
+                               skip_bn = 0;
+                               *walking_bn += span;
+                       }
+                       continue;
+               }
+
+               found = 0;
+               error = ufs_seekilevel(vp, what, &skip_bn, walking_bn, bsize,
+                               level, 1, daddr, &found);
+
+#ifdef SEEK_DEBUG
+               printf( "premature state at level %d, found = %d, "
+                       "skip = %"PRIi64", walking = %"PRIi64", "
+                       "error = %d, bsize = %d\n", level, found,
+                       skip_bn, *walking_bn, error, bsize);
+#endif
+
+               if (error)
+                       return error;
+               if (found)
+                       return 0;
+       }
+
+       /* all block pointers checked without finding data */
+       if (what == SEEK_DATA)
+               return ENXIO;
+
+       /* can't reach */
+       printf("File to big in ufs_seek? level 3 didn't find a hole!\n");
+       return EFBIG;
+}
+
+
+/*
+ * VOP_SEEK() implementation for UFS: calculate the new file offset for
+ * a_whence / a_givenoff and store it in *a_newoff (when not NULL).
+ *
+ * SEEK_SET, SEEK_CUR and SEEK_END are plain arithmetic.  SEEK_DATA and
+ * SEEK_HOLE search the block pointers of the file, starting at the block
+ * holding a_givenoff, for the next allocated resp. unallocated block.
+ *
+ * Returns 0 on success, ENXIO when a_givenoff is negative or not inside
+ * the file for SEEK_DATA/SEEK_HOLE or when there is no data left, EINVAL
+ * for an unknown a_whence or a negative resulting offset, or the error
+ * of VOP_GETATTR(), VOP_FSYNC() or the block search.
+ */
+int
+ufs_seek(void *v)
+{
+       struct vop_seek_args /* {
+               const struct vnodeop_desc *a_desc;
+               struct vnode *a_vp;
+               off_t a_oldoff;
+               int a_whence;
+               off_t a_givenoff;
+               off_t *a_newoff;
+               kauth_cred_t a_cred;
+       }; */ *ap = v;
+       off_t newoff;
+       struct mount *mp;
+       struct vattr vattr;
+       daddr_t start_bn, bn;
+       int blksize, error;
+
+       mp = ap->a_vp->v_mount;
+       blksize = mp->mnt_stat.f_iosize;
+
+       /* we need the file size */
+       error = VOP_GETATTR(ap->a_vp, &vattr, ap->a_cred);
+       if (error)
+               return error;
+
+       /* initialise return value with old offset */
+       newoff = ap->a_oldoff;
+       switch (ap->a_whence) {
+       case SEEK_CUR:
+               newoff = ap->a_oldoff + ap->a_givenoff;
+               break;
+       case SEEK_END:
+               newoff = ap->a_givenoff + vattr.va_size;
+               break;
+       case SEEK_SET:
+               newoff = ap->a_givenoff;
+               break;
+       case SEEK_DATA:
+               /* if outside the file space, there is no data */
+               if (ap->a_givenoff < 0 || ap->a_givenoff >= vattr.va_size)
+                       return ENXIO;
+               /*
+                * Last block is always data, no sense checking.  Compare
+                * the distance to the end of the file rather than
+                * va_size - blksize, which wraps around (va_size is
+                * unsigned) for files smaller than one block.
+                */
+               if (vattr.va_size - ap->a_givenoff <= (u_quad_t)blksize) {
+                       newoff = ap->a_givenoff;
+                       break;
+               }
+               /* FALLTHROUGH */
+       case SEEK_HOLE:
+               /* there exists one virtual hole at the end of the file */
+               if (ap->a_givenoff < 0 || ap->a_givenoff >= vattr.va_size)
+                       return ENXIO;
+
+               /* protect against changes */
+               fstrans_start(mp, FSTRANS_SHARED);
+
+               /*
+                * We need to FSYNC first or the disc administration is
+                * wrong; without a successful sync the search result
+                * can't be trusted, so give up on failure.
+                */
+               error = VOP_FSYNC(ap->a_vp, ap->a_cred, FSYNC_WAIT, 0, 0);
+               if (error) {
+                       fstrans_done(mp);
+                       return error;
+               }
+
+               /* search inside file */
+               start_bn = ap->a_givenoff / blksize;
+               bn = 0;
+               error = ufs_seeksparse(ap->a_vp, ap->a_whence,
+                   start_bn, &bn);
+               newoff = bn * blksize;
+
+               /* release and exit when we got an error */
+               fstrans_done(mp);
+               if (error)
+                       return error;
+
+               /*
+                * If we haven't changed block, return the original offset
+                * and not the start of the block it lies in.
+                */
+               if (bn == start_bn)
+                       newoff = ap->a_givenoff;
+
+               /* if we've passed the file size, we give the filesize */
+               if (newoff > vattr.va_size)
+                       newoff = vattr.va_size;
+               break;
+       default:
+               return EINVAL;
+       }
+
+       if (newoff < 0)
+               return EINVAL;
+
+       if (ap->a_newoff)
+               *(ap->a_newoff) = newoff;
+
+       return 0;
+}
+
===================================================================



Home | Main Index | Thread Index | Old Index