tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

RFC: SEEK_DATA/SEEK_HOLE implementation version 2



Hi folks,

after getting stuck in the 1st implementation in the rump/puffs/refuse jungle
i started a new version that is more in line with the Solaris implementation
and is far less invasive.

Basically the system call forwards the requests using ioctls, just like Solaris
and, as it turns out, also FreeBSD with their ZFS import. For simplicity and
to reduce compat stuff I've used the same ioctls FreeBSD defines. FreeBSD's
support is limited though; only ZFS handles them. The ioctl names are not
documented yet.

The new implementation presents the default one-blob for file systems that
don't implement it. For NetBSD it's currently implemented for UFS and is tested
for FFS with/without WAPBL, ext2fs and lfs. It is present in our ZFS import
but apparently still disabled and I don't have a ZFS partition to play with. I
might be tempted to try it later on my scratch machine :) UDF is next but
shouldn't be that difficult.

What remains is making the userland tools aware of it and using it, but that's
phase 2.

With regards,
Reinoud

Index: lib/libc/sys/lseek.2
===================================================================
RCS file: /cvsroot/src/lib/libc/sys/lseek.2,v
retrieving revision 1.24
diff -u -p -r1.24 lseek.2
--- lib/libc/sys/lseek.2        5 Apr 2010 07:53:47 -0000       1.24
+++ lib/libc/sys/lseek.2        16 Aug 2011 19:34:43 -0000
@@ -29,7 +29,7 @@
 .\"
 .\"     @(#)lseek.2    8.3 (Berkeley) 4/19/94
 .\"
-.Dd April 3, 2010
+.Dd August 7, 2011
 .Dt LSEEK 2
 .Os
 .Sh NAME
@@ -86,6 +86,24 @@ the offset is set to the size of the
 file plus
 .Fa offset
 bytes.
+.It
+If
+.Fa whence
+is
+.Dv SEEK_HOLE ,
+the offset is set to the "hole" file region,
+either pointed to or to the start of the first one following the supplied
+.Fa offset
+in bytes.
+.It
+If
+.Fa whence
+is
+.Dv SEEK_DATA ,
+the offset is set to the first non-"hole" file region,
+either pointed to or to the start of the first one following the supplied
+.Fa offset
+in bytes.
 .El
 .Pp
 The
@@ -98,6 +116,17 @@ bytes of zeros (until data is actually w
 .Pp
 Some devices are incapable of seeking.
 The value of the pointer associated with such a device is undefined.
+.Pp
+In this function, a "hole" is defined as a gap in a file, a contiguous range of bytes in a file
+that are, when read in, all zero. Note that NOT all zero ranges are guaranteed to be
+returned as holes by
+.Dv SEEK_HOLE
+though filesystems are allowed to not only look at the file's allocation to find
+holes but also to look at the recorded contents to find holes. The offsets
+returned thus do not have to be an integer multiple of the filesystem block size.
+.Pp
+To ease implementation, a hole is guaranteed after each data region. This
+implies that there is a virtual hole at the end of the file.
 .Sh RETURN VALUES
 Upon successful completion,
 .Fn lseek
@@ -142,6 +171,12 @@ for
 due to a larger
 .Fa offset
 argument type.
+.Pp
+The
+.Dv SEEK_HOLE
+and
+.Dv SEEK_DATA
+extensions were added by Solaris for their ZFS.
 .Sh BUGS
 This document's use of
 .Fa whence
Index: sys/kern/vfs_syscalls.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.433
diff -u -p -r1.433 vfs_syscalls.c
--- sys/kern/vfs_syscalls.c     8 Aug 2011 12:08:53 -0000       1.433
+++ sys/kern/vfs_syscalls.c     16 Aug 2011 19:35:02 -0000
@@ -2141,7 +2141,7 @@ sys_lseek(struct lwp *l, const struct sy
        file_t *fp;
        struct vnode *vp;
        struct vattr vattr;
-       off_t newoff;
+       off_t offset, newoff;
        int error, fd;
 
        fd = SCARG(uap, fd);
@@ -2155,19 +2155,53 @@ sys_lseek(struct lwp *l, const struct sy
                goto out;
        }
 
+       error = VOP_GETATTR(vp, &vattr, cred);
+       if (error)
+               goto out;
+
+       offset = SCARG(uap, offset);
        switch (SCARG(uap, whence)) {
        case SEEK_CUR:
-               newoff = fp->f_offset + SCARG(uap, offset);
+               newoff = fp->f_offset + offset;
                break;
        case SEEK_END:
-               error = VOP_GETATTR(vp, &vattr, cred);
+               newoff = vattr.va_size + offset;
+               break;
+       case SEEK_SET:
+               newoff = offset;
+               break;
+       case SEEK_DATA:
+               newoff = offset;
+               error = VOP_IOCTL(vp, FIOSEEKDATA, &newoff,  FKIOCTL, NOCRED);
                if (error) {
-                       goto out;
+                       if (error != EPASSTHROUGH)
+                               goto out;
+                       /*
+                        * Unimplemented in the filesystem backing the vp;
+                        * emulate a single blob
+                        */
+                       if (newoff >= vattr.va_size) {
+                               error = ENXIO;
+                               goto out;
+                       }
                }
-               newoff = SCARG(uap, offset) + vattr.va_size;
                break;
-       case SEEK_SET:
-               newoff = SCARG(uap, offset);
+       case SEEK_HOLE:
+               newoff = offset;
+               error = VOP_IOCTL(vp, FIOSEEKHOLE, &newoff,  FKIOCTL, NOCRED);
+               if (error) {
+                       if (error != EPASSTHROUGH)
+                               goto out;
+                       /*
+                        * Unimplemented in the filesystem backing the vp;
+                        * emulate the "virtual hole" at the end of the file
+                        */
+                       if (newoff >= vattr.va_size) {
+                               error = ENXIO;
+                               goto out;
+                       }
+                       newoff = vattr.va_size;
+               }
                break;
        default:
                error = EINVAL;
Index: sys/sys/fcntl.h
===================================================================
RCS file: /cvsroot/src/sys/sys/fcntl.h,v
retrieving revision 1.41
diff -u -p -r1.41 fcntl.h
--- sys/sys/fcntl.h     9 Aug 2011 04:19:17 -0000       1.41
+++ sys/sys/fcntl.h     16 Aug 2011 19:35:04 -0000
@@ -268,6 +268,12 @@ struct flock {
 #ifndef        SEEK_END
 #define        SEEK_END        2       /* set file offset to EOF plus offset */
 #endif
+#ifndef SEEK_DATA
+#define        SEEK_DATA       3       /* Set file pointer to next data past 
offset */
+#endif
+#ifndef SEEK_HOLE
+#define        SEEK_HOLE       4       /* Set file pointer to next hole past 
offset */
+#endif
 
 /*
  * posix_advise advisories.
Index: sys/sys/filio.h
===================================================================
RCS file: /cvsroot/src/sys/sys/filio.h,v
retrieving revision 1.10
diff -u -p -r1.10 filio.h
--- sys/sys/filio.h     11 Dec 2005 12:25:20 -0000      1.10
+++ sys/sys/filio.h     16 Aug 2011 19:35:04 -0000
@@ -44,6 +44,8 @@
 /* Generic file-descriptor ioctl's. */
 #define        FIOCLEX          _IO('f', 1)            /* set close on exec on 
fd */
 #define        FIONCLEX         _IO('f', 2)            /* remove close on exec 
*/
+#define        FIOSEEKDATA     _IOWR('f', 97, off_t)   /* lseek's SEEK_DATA 
helper ioctl */
+#define        FIOSEEKHOLE     _IOWR('f', 98, off_t)   /* lseek's SEEK_HOLE 
helper ioctl */
 #define        FIONREAD        _IOR('f', 127, int)     /* get # bytes to read 
*/
 #define        FIONBIO         _IOW('f', 126, int)     /* set/clear 
non-blocking i/o */
 #define        FIOASYNC        _IOW('f', 125, int)     /* set/clear async i/o 
*/
Index: sys/sys/unistd.h
===================================================================
RCS file: /cvsroot/src/sys/sys/unistd.h,v
retrieving revision 1.52
diff -u -p -r1.52 unistd.h
--- sys/sys/unistd.h    30 Aug 2009 16:38:48 -0000      1.52
+++ sys/sys/unistd.h    16 Aug 2011 19:35:04 -0000
@@ -167,6 +167,8 @@
 #define        SEEK_SET        0       /* set file offset to offset */
 #define        SEEK_CUR        1       /* set file offset to current plus 
offset */
 #define        SEEK_END        2       /* set file offset to EOF plus offset */
+#define        SEEK_DATA       3       /* Set file pointer to next data past 
offset */
+#define        SEEK_HOLE       4       /* Set file pointer to next hole past 
offset */
 
 #if defined(_NETBSD_SOURCE)
 /* whence values for lseek(2); renamed by POSIX 1003.1 */
Index: sys/ufs/files.ufs
===================================================================
RCS file: /cvsroot/src/sys/ufs/files.ufs,v
retrieving revision 1.26
diff -u -p -r1.26 files.ufs
--- sys/ufs/files.ufs   24 Mar 2011 17:05:45 -0000      1.26
+++ sys/ufs/files.ufs   16 Aug 2011 19:35:04 -0000
@@ -66,4 +66,5 @@ file  ufs/ufs/quota1_subr.c
 file   ufs/ufs/quota2_subr.c           quota2 & (ffs | lfs | mfs | ext2fs)
 file   ufs/ufs/ufs_vfsops.c            ffs | lfs | mfs | ext2fs
 file   ufs/ufs/ufs_vnops.c             ffs | lfs | mfs | ext2fs
+file   ufs/ufs/ufs_ioctl.c             ffs | lfs | mfs | ext2fs
 file   ufs/ufs/ufs_wapbl.c             ffs & wapbl
Index: sys/ufs/ufs/ufs_extern.h
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_extern.h,v
retrieving revision 1.66
diff -u -p -r1.66 ufs_extern.h
--- sys/ufs/ufs/ufs_extern.h    17 Jul 2011 22:07:59 -0000      1.66
+++ sys/ufs/ufs/ufs_extern.h    16 Aug 2011 19:35:04 -0000
@@ -69,7 +69,7 @@ int   ufs_create(void *);
 int    ufs_getattr(void *);
 int    ufs_inactive(void *);
 #define        ufs_fcntl       genfs_fcntl
-#define        ufs_ioctl       genfs_enoioctl
+int    ufs_ioctl(void *);
 #define        ufs_islocked    genfs_islocked
 int    ufs_link(void *);
 #define        ufs_lock        genfs_lock
===================================================================
--- /dev/null   2011-08-16 16:37:02.000000000 +0200
+++ sys/ufs/ufs/ufs_ioctl.c     2011-08-16 21:40:47.000000000 +0200
@@ -0,0 +1,323 @@
+/*     $NetBSD$        */
+
+/*
+ * Copyright (c) 2011 Reinoud Zandijk
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *          This product includes software developed for the
+ *          NetBSD Project.  See http://www.NetBSD.org/ for
+ *          information about NetBSD.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * 
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/trace.h>
+#include <sys/unistd.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+
+/*
+ * Block pointer accessors.
+ *
+ * These macros are not self-contained: they expand to expressions that
+ * reference the variable `ump' (and, for the INODE_* ones, `ip') from the
+ * scope of the function using them.  Each one selects the 32-bit layout
+ * when ump->um_fstype == UFS1 and the 64-bit layout otherwise, and passes
+ * the stored value through ufs_rw32()/ufs_rw64() together with
+ * UFS_MPNEEDSWAP(ump).
+ *
+ * INODE_DADDR(bn): entry `bn' of the inode's direct block array
+ * (i_ffs1_db / i_ffs2_db).  No range check is done on `bn'.
+ */
+#define INODE_DADDR(bn) \
+       ((ump->um_fstype == UFS1) ? \
+               ufs_rw32(ip->i_ffs1_db[bn], UFS_MPNEEDSWAP(ump)) : \
+               ufs_rw64(ip->i_ffs2_db[bn], UFS_MPNEEDSWAP(ump)))
+/*
+ * INODE_IADDR(indir): entry `indir' of the inode's indirect block array
+ * (i_ffs1_ib / i_ffs2_ib).  No range check is done on `indir'.
+ */
+#define INODE_IADDR(indir) \
+       ((ump->um_fstype == UFS1) ? \
+               ufs_rw32(ip->i_ffs1_ib[indir], UFS_MPNEEDSWAP(ump)) : \
+               ufs_rw64(ip->i_ffs2_ib[indir], UFS_MPNEEDSWAP(ump)))
+/*
+ * INDIR_IADDR(blk, offset): entry `offset' of the memory at `blk', which is
+ * read as an array of uint32_t (UFS1) or uint64_t (otherwise) values.
+ */
+#define INDIR_IADDR(blk, offset) \
+       ((ump->um_fstype == UFS1) ? \
+               ufs_rw32(((uint32_t *) (blk))[offset], UFS_MPNEEDSWAP(ump)) : \
+               ufs_rw64(((uint64_t *) (blk))[offset], UFS_MPNEEDSWAP(ump)))
+/*
+ * SEEK_CONDITION(addr, what): true when block address `addr' is what the
+ * search is looking for: a non-zero address when `what' is SEEK_DATA, a
+ * zero address for any other `what'.
+ */
+#define SEEK_CONDITION(addr, what) \
+        (((what) == SEEK_DATA) ? ((addr) != 0) : ((addr) == 0))
+
+#if 0
+#define SEEK_DEBUG
+#endif
+
+
+/*
+ * Examine one indirect block of vp while searching for data or a hole,
+ * descending recursively into lower-level indirect blocks.
+ *
+ *   what        SEEK_DATA or SEEK_HOLE; only used through SEEK_CONDITION().
+ *   skip_bn     in/out: number of blocks that still have to be passed
+ *               before the search position is reached.
+ *   walking_bn  in/out: block number the walk has advanced to; it is
+ *               increased by the amounts *skip_bn is decreased by, and by
+ *               the size of every range that was examined without a match.
+ *   bsize       number of blocks covered by a single entry of this
+ *               indirect block (ufs_seeksparse() passes 1 for the first
+ *               level; the recursion passes bsize >> um_lognindir).
+ *   level       remaining depth; only entries of a level 0 block are
+ *               tested with SEEK_CONDITION().
+ *   indir_idx   index into the path filled in by ufs_getlbns(); the in_lbn
+ *               found there is used as the block number for getblk().
+ *   indir       address of the indirect block, run through blkptrtodb()
+ *               when the block has to be read.
+ *   found       out: receives the SEEK_CONDITION() result for a level 0
+ *               entry once *skip_bn has dropped to 0; left untouched
+ *               otherwise.
+ *
+ * Returns 0, or the error of ufs_getlbns() or biowait(), or EIO when the
+ * looked-up in_lbn is not negative.  A return of 0 with *found == 0 means
+ * nothing matched within this block.
+ *
+ * NOTE(review): `indir' is never compared against 0 before the block is
+ * read in; confirm how an unallocated indirect block is meant to be
+ * handled by the callers.
+ *
+ * XXX could check SPCF_SHOULDYIELD, preempt(9)
+ */
+static int
+ufs_seekilevel(struct vnode *vp, int what,
+       daddr_t *skip_bn, daddr_t *walking_bn,
+       uint32_t bsize, int level, int indir_idx, daddr_t indir, int *found)
+{
+       struct mount *mp;
+       struct inode *ip;
+       struct ufsmount *ump;
+       struct buf *bp;
+       struct indir indir_path[NIADDR + 1];
+       daddr_t daddr;
+       uint32_t subbsize;
+       int d, entry, error;
+       daddr_t metablkno;
+
+       ip = VTOI(vp);
+       mp = vp->v_mount;
+       ump = ip->i_ump;
+
+#ifdef SEEK_DEBUG
+       printf( "ufs_seek: check indirect entries, level %d, "
+               "skip = %"PRIi64", walking %"PRIi64", "
+               "daddr %"PRIi64"\n", level, *skip_bn, *walking_bn, indir);
+#endif
+
+       /*
+        * check if it would be in this block; if the entry index lies
+        * beyond the MNINDIR(ump) entries of this block, account for the
+        * whole block in both counters and report "nothing found here"
+        */
+       entry = *skip_bn / bsize;
+       if (entry >= MNINDIR(ump)) {
+               *skip_bn    -= MNINDIR(ump) * bsize;
+               *walking_bn += MNINDIR(ump) * bsize;
+               return 0;
+       }
+       /* retrieve indirect block at disc address indir */
+       /* calculate metablockno */
+       memset(indir_path, 0, sizeof(indir_path));
+       error = ufs_getlbns(vp, *walking_bn, indir_path, NULL);
+       if (error)
+               return error;
+
+       metablkno = indir_path[indir_idx].in_lbn;
+       if (metablkno >= 0) {
+               printf("SEEK_DATA/SEEK_HOLE internal error; not a metablk\n");
+               return EIO;
+       }
+
+       /*
+        * this (negative) metablkno is used for caching; when the buffer
+        * is neither BO_DONE nor BO_DELWRI the block is read in by hand
+        * from address indir
+        */
+       bp = getblk(vp, metablkno, mp->mnt_stat.f_iosize, 0, 0);
+       if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0) {
+               bp->b_blkno = blkptrtodb(ump, indir);
+               bp->b_flags |= B_READ;
+               BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+               VOP_STRATEGY(vp, bp);
+               curproc->p_stats->p_ru.ru_inblock++;    /* XXX */
+               if ((error = biowait(bp)) != 0) {
+                       /* something went wrong reading in, bomb out */
+                       brelse(bp, 0);
+                       return error;
+               }
+       }
+
+       /*
+        * indirect block found, now go over its entries, starting at the
+        * entry computed above; the entries before it are accounted for in
+        * both counters first
+        */
+       error = 0;
+       *skip_bn    -= entry * bsize;
+       *walking_bn += entry * bsize;
+
+       subbsize = bsize >> ump->um_lognindir; /* bsize for one level down */
+       for (; entry < MNINDIR(ump); entry++) {
+               daddr = INDIR_IADDR(bp->b_data, entry);
+               if (daddr && (level > 0)) {
+                       error = ufs_seekilevel(vp, what, skip_bn, walking_bn,
+                               subbsize, level-1, indir_idx+1, daddr, found);
+                       if (error || *found)
+                               break;
+               } else {
+                       d = MIN(*skip_bn, bsize); /* part of this entry still to skip */
+                       *skip_bn -= d;
+                       *walking_bn += d;
+                       if ((*skip_bn == 0) && (level == 0))
+                               *found = SEEK_CONDITION(daddr, what);
+                       if (*found)
+                               break;
+                       *walking_bn += bsize - d; /* no match: step past the rest */
+               }
+       }
+       /* error condition, not in this level or we found it */
+       brelse(bp, 0);
+       return error;
+}
+
+
+/*
+ * Search the block map of vp, starting at logical block from_bn, for the
+ * first block that satisfies `what': a block with a non-zero address for
+ * SEEK_DATA, a block with a zero address for SEEK_HOLE.
+ *
+ *   vp          vnode to search; its inode supplies the block pointers.
+ *   what        SEEK_DATA or SEEK_HOLE.
+ *   from_bn     block number to start at; negative values are treated as 0.
+ *   walking_bn  out: block number the search ended on.
+ *
+ * The direct block pointers are checked first, then each indirect level in
+ * turn through ufs_seekilevel().
+ *
+ * Returns 0 when a matching block was found, ENXIO when SEEK_DATA ran out
+ * of blocks, EFBIG when SEEK_HOLE did, or the error of ufs_seekilevel().
+ */
+static int
+ufs_seeksparse(struct vnode *vp, int what, daddr_t from_bn, daddr_t *walking_bn)
+{
+       struct inode *ip;
+       struct ufsmount *ump;
+       daddr_t skip_bn, daddr;
+       uint32_t bsize;
+       int level, error, found;
+
+       /* ip and ump are referenced by the INODE_DADDR/INODE_IADDR macros */
+       ip = VTOI(vp);
+       ump = ip->i_ump;
+
+       /* we can't start before block zero */
+       if (from_bn < 0)
+               from_bn = 0;
+
+       skip_bn = from_bn;
+       *walking_bn = 0;
+
+       /* first check direct entries */
+       if (skip_bn < NDADDR) {
+               /*
+                * Fetch the pointer only after the bounds check, so the
+                * direct block array is never indexed with NDADDR.
+                */
+               *walking_bn = skip_bn;
+               while (*walking_bn < NDADDR) {
+                       daddr = INODE_DADDR(*walking_bn);
+                       if (SEEK_CONDITION(daddr, what))
+                               return 0;
+                       *walking_bn += 1;
+               }
+       } else {
+               *walking_bn = NDADDR;
+       }
+       /* what is left to skip is counted from the first indirect block */
+       skip_bn = MAX(0, skip_bn - NDADDR);
+
+       /*
+        * next the indirect levels; bsize is the number of blocks one
+        * entry of the top block of that level covers
+        */
+       bsize = 1;
+       for (level = 0; level < NIADDR; level++) {
+               daddr = INODE_IADDR(level);
+               found = 0;
+               error = ufs_seekilevel(vp, what, &skip_bn, walking_bn, bsize,
+                               level, 1, daddr, &found);
+
+#ifdef SEEK_DEBUG
+               printf( "premature state at level %d, found = %d, "
+                       "skip = %"PRIi64", walking = %"PRIi64", "
+                       "error = %d, bsize = %d\n", level, found,
+                       skip_bn, *walking_bn, error, bsize);
+#endif
+
+               if (error)
+                       return error;
+               if (found)
+                       return 0;
+
+               bsize = bsize << ump->um_lognindir;
+       }
+
+       /* every block pointer was visited without finding data */
+       if (what == SEEK_DATA)
+               return ENXIO;
+
+       /* can't reach */
+       printf("File to big in ufs_seek? level 3 didn't find a hole!\n");
+       return EFBIG;
+}
+
+/*
+ * VOP_IOCTL implementation for the UFS based file systems; handles the
+ * lseek(2) helper ioctls FIOSEEKDATA and FIOSEEKHOLE.
+ *
+ * For both commands a_data points to an off_t that holds the offset to
+ * search from on entry and receives the resulting offset on success.
+ *
+ * Returns 0 on success, ENXIO when the given offset is at or beyond the
+ * end of the file (or no data follows it), EINVAL when the result would
+ * be negative, EPASSTHROUGH for any other command, or the error of
+ * VOP_GETATTR(), VOP_FSYNC() or ufs_seeksparse().
+ */
+int
+ufs_ioctl(void *v)
+{
+       struct vop_ioctl_args /* {
+               struct vnode *a_vp;
+               u_long a_command;
+               void *a_data;
+               int  a_fflag;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+
+       struct vattr vattr;
+       daddr_t start_bn, bn;
+       off_t a_givenoff;
+       int a_whence;
+       off_t newoff;
+       int blksize, error;
+
+       /*
+        * Commands other than the two seek helpers are not handled here.
+        * Answer them the way genfs_enoioctl, which this function replaces,
+        * did, so callers that test for EPASSTHROUGH keep working, and do
+        * so before spending a VOP_GETATTR() on them.
+        */
+       if (ap->a_command != FIOSEEKDATA && ap->a_command != FIOSEEKHOLE)
+               return EPASSTHROUGH;
+
+       blksize = ap->a_vp->v_mount->mnt_stat.f_iosize;
+
+       error = VOP_GETATTR(ap->a_vp, &vattr, ap->a_cred);
+       if (error)
+               return error;
+
+       a_givenoff = *((off_t *) ap->a_data);
+
+       /*
+        * Outside the file space there is no data, and the one virtual
+        * hole at the end of the file can't be searched from beyond it.
+        */
+       if (a_givenoff >= vattr.va_size)
+               return ENXIO;
+
+       /*
+        * last block is always data, no sense checking.  va_size is
+        * unsigned, so files shorter than one block are tested separately
+        * instead of letting the subtraction wrap around.
+        */
+       if (ap->a_command == FIOSEEKDATA &&
+           (vattr.va_size <= blksize ||
+            a_givenoff >= vattr.va_size - blksize)) {
+               *((off_t *) ap->a_data) = a_givenoff;
+               return 0;
+       }
+
+       a_whence = (ap->a_command == FIOSEEKDATA) ? SEEK_DATA : SEEK_HOLE;
+
+       /* protect against changes */
+       fstrans_start(ap->a_vp->v_mount, FSTRANS_SHARED);
+
+       /*
+        * we need to FSYNC first or the disc administration is wrong; when
+        * the sync fails the block pointers can't be trusted, so give up
+        */
+       error = VOP_FSYNC(ap->a_vp, ap->a_cred, FSYNC_WAIT, 0, 0);
+       if (error) {
+               fstrans_done(ap->a_vp->v_mount);
+               return error;
+       }
+
+       /* search inside file */
+       start_bn = a_givenoff / blksize;
+       bn = 0;
+       error = ufs_seeksparse(ap->a_vp, a_whence, start_bn, &bn);
+
+       /* release and exit when we got an error */
+       fstrans_done(ap->a_vp->v_mount);
+       if (error)
+               return error;
+
+       /* if we haven't changed block, return the original offset */
+       if (bn == start_bn)
+               newoff = a_givenoff;
+       else
+               newoff = bn * blksize;
+
+       /* if we've passed the file size, we give the filesize */
+       if (newoff > vattr.va_size)
+               newoff = vattr.va_size;
+
+       if (newoff < 0)
+               return EINVAL;
+
+       *((off_t *) ap->a_data) = newoff;
+
+       return 0;
+}
+


Home | Main Index | Thread Index | Old Index