Subject: Re: new kpi proposal, sysdisk(9)
To: None <elad@NetBSD.org>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 01/07/2007 15:36:14
--NextPart-20070107152408-1761200
Content-Type: Text/Plain; charset=us-ascii

> > i meant, the framework should be designed so that it can track
> > which parts of disks are used by who, rather than hardcoding your policy.
> 
> I won't comment on that, because that is a rather hypothetical
> statement... when it's finished, we'll get back to it. :)

how about something like the attached patch?
the "query" part is not implemented yet, but it shouldn't be too hard.

YAMAMOTO Takashi

--NextPart-20070107152408-1761200
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="dk_use.diff"

Index: sys/disk.h
===================================================================
--- sys/disk.h	(revision 1922)
+++ sys/disk.h	(working copy)
@@ -444,9 +444,14 @@ struct disk {
 	struct cpu_disklabel *dk_cpulabel;
 };
 
+struct diskuser;
+typedef struct diskuser diskuser_t;
+
 struct dkdriver {
 	void	(*d_strategy)(struct buf *);
 	void	(*d_minphys)(struct buf *);
+	int	(*d_use)(struct disk *, int, diskuser_t *);
+	int	(*d_unuse)(struct disk *, int, diskuser_t *);
 #ifdef notyet
 	int	(*d_open)(dev_t, int, int, struct proc *);
 	int	(*d_close)(dev_t, int, int, struct proc *);
@@ -466,6 +471,39 @@ struct dkdriver {
 #define	DK_OPENRAW	5		/* open without label */
 
 /*
+ * disk usage tracking
+ */
+
+int diskuser_create(const char *, diskuser_t **);
+void diskuser_destroy(diskuser_t *);
+
+int disk_use(struct vnode *, diskuser_t *);
+int disk_unuse(struct vnode *, diskuser_t *);
+
+int disk_open(struct vnode *, int, kauth_cred_t cred, struct lwp *,
+    diskuser_t *);
+int disk_close(struct vnode *, int, kauth_cred_t cred, struct lwp *,
+    diskuser_t *);
+
+/*
+ * diskrange_t: represent a range in a disk.
+ */
+
+typedef struct {
+	uint64_t r_start;	/* in bytes */
+	uint64_t r_size;	/* in bytes */
+} diskrange_t;
+
+/*
+ * helper functions for drivers
+ */
+
+int diskrange_use(struct disk *, const diskrange_t *, diskuser_t *);
+int diskrange_unuse(struct disk *, const diskrange_t *, diskuser_t *);
+int diskpartition_use(struct disk *, int, diskuser_t *);
+int diskpartition_unuse(struct disk *, int, diskuser_t *);
+
+/*
  * Bad sector lists per fixed disk
  */
 struct disk_badsectors {
Index: kern/subr_disk.c
===================================================================
--- kern/subr_disk.c	(revision 1922)
+++ kern/subr_disk.c	(working copy)
@@ -79,11 +79,15 @@ __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
+#include <sys/kmem.h>
 #include <sys/buf.h>
+#include <sys/conf.h>
 #include <sys/syslog.h>
 #include <sys/disklabel.h>
 #include <sys/disk.h>
 #include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <miscfs/specfs/specdev.h>
 #include <lib/libkern/libkern.h>
 
 /*
@@ -453,3 +457,190 @@ disk_ioctl(struct disk *diskp, u_long cm
 
 	return (error);
 }
+
+static int
+disk_find_by_dev(dev_t dev, enum vtype type, struct disk **dkp)
+{
+	dev_t bdev;
+	const char *name;
+	struct disk *dk;
+	char devname[16];
+
+	if (type == VCHR) {
+		bdev = devsw_chr2blk(dev);
+		if (bdev == NODEV) {
+			return ENOENT;
+		}
+	} else if (type == VBLK) {
+		bdev = dev;
+	} else {
+		panic("%s: type=%d", __func__, (int)type);
+	}
+	name = devsw_blk2name(major(bdev));
+	snprintf(devname, sizeof(devname), "%s%d", name, DISKUNIT(bdev));
+	dk = disk_find(devname);
+	if (dk == NULL) {
+		return ENOENT;
+	}
+	*dkp = dk;
+	return 0;
+}
+
+struct diskuser {
+	/*
+	 * XXX probably we want to put more info here.
+	 * XXX use prop list?
+	 */
+	const char *du_name;	/* caller's string; stored, not copied */
+};
+
+int
+diskrange_use(struct disk *dk, const diskrange_t *r,
+    diskuser_t *du)
+{
+
+	printf("%s: %s (start=%" PRIu64 ", size=%" PRIu64 ") by %s\n",
+	    __func__, dk->dk_name, r->r_start, r->r_size, du->du_name);
+
+	return 0;
+}
+
+int
+diskrange_unuse(struct disk *dk, const diskrange_t *r,
+    diskuser_t *du)
+{
+
+	printf("%s: %s (start=%" PRIu64 ", size=%" PRIu64 ") by %s\n",
+	    __func__, dk->dk_name, r->r_start, r->r_size, du->du_name);
+
+	return 0;
+}
+
+static void
+diskpartition_getrange(struct disk *dk, int par, diskrange_t *range)
+{
+	const struct disklabel *lp = dk->dk_label;
+	const struct partition *p = lp->d_partitions + par;
+	const int bshift = dk->dk_blkshift + DEV_BSHIFT;
+
+	range->r_start = (uint64_t)p->p_offset << bshift;
+	range->r_size = (uint64_t)p->p_size << bshift;
+}
+
+int
+diskpartition_use(struct disk *dk, int par, diskuser_t *du)
+{
+	diskrange_t r;
+
+	diskpartition_getrange(dk, par, &r);
+	return diskrange_use(dk, &r, du);
+}
+
+int
+diskpartition_unuse(struct disk *dk, int par, diskuser_t *du)
+{
+	diskrange_t r;
+
+	diskpartition_getrange(dk, par, &r);
+	return diskrange_unuse(dk, &r, du);
+}
+
+int
+disk_use(struct vnode *vp, diskuser_t *du)
+{
+	struct disk *dk;
+	dev_t dev = vp->v_rdev;
+	int par, error;
+
+	error = disk_find_by_dev(dev, vp->v_type, &dk);
+	if (error) {
+		return error;
+	}
+	par = DISKPART(dev);
+	/* Prefer the driver's d_use hook; hand its result back to the caller. */
+	if (dk->dk_driver != NULL && dk->dk_driver->d_use != NULL) {
+		error = (*dk->dk_driver->d_use)(dk, par, du);
+	} else {
+		error = diskpartition_use(dk, par, du);
+	}
+	return error;
+}
+
+int
+disk_unuse(struct vnode *vp, diskuser_t *du)
+{
+	struct disk *dk;
+	dev_t dev = vp->v_rdev;
+	int par, error;
+
+	error = disk_find_by_dev(dev, vp->v_type, &dk);
+	if (error) {
+		return error;
+	}
+	par = DISKPART(dev);
+	/* Prefer the driver's d_unuse hook; hand its result back to the caller. */
+	if (dk->dk_driver != NULL && dk->dk_driver->d_unuse != NULL) {
+		error = (*dk->dk_driver->d_unuse)(dk, par, du);
+	} else {
+		error = diskpartition_unuse(dk, par, du);
+	}
+	return error;
+}
+
+int
+diskuser_create(const char *name, diskuser_t **dup)
+{
+	diskuser_t *du;
+
+	du = kmem_alloc(sizeof(*du), KM_SLEEP);
+	du->du_name = name;
+
+	*dup = du;
+	return 0;
+}
+
+void
+diskuser_destroy(diskuser_t *du)
+{
+
+	kmem_free(du, sizeof(*du));
+}
+
+int
+disk_open(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l,
+    diskuser_t *du)
+{
+	int error;
+
+	error = VOP_OPEN(vp, mode, cred, l);
+	if (error) {
+		return error;
+	}
+	if (vp->v_type == VBLK || vp->v_type == VCHR) {
+		error = disk_use(vp, du);
+		if (error) {
+			VOP_CLOSE(vp, mode, cred, l);
+			return error;
+		}
+	}
+	return 0;
+}
+
+int
+disk_close(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l,
+    diskuser_t *du)
+{
+	int error;
+	int error2;
+
+	if (vp->v_type == VBLK || vp->v_type == VCHR) {
+		error = disk_unuse(vp, du);
+	} else {
+		error = 0;
+	}
+	error2 = VOP_CLOSE(vp, mode, cred, l);
+	if (error2) {
+		return error2;
+	}
+	return error;
+}
Index: uvm/uvm_swap.c
===================================================================
--- uvm/uvm_swap.c	(revision 1934)
+++ uvm/uvm_swap.c	(working copy)
@@ -44,6 +44,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v
 #include <sys/buf.h>
 #include <sys/bufq.h>
 #include <sys/conf.h>
+#include <sys/disk.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/disklabel.h>
@@ -145,6 +146,7 @@ struct swapdev {
 	int			swd_maxactive;	/* max active i/o reqs */
 	struct bufq_state	*swd_tab;	/* buffer list */
 	int			swd_active;	/* number of active buffers */
+	diskuser_t		*swd_diskuser;
 };
 
 /*
@@ -775,6 +777,7 @@ swap_on(struct lwp *l, struct swapdev *s
 #endif /* NFS */
 	const struct bdevsw *bdev;
 	dev_t dev;
+	diskuser_t *du;
 	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
 
 	/*
@@ -793,10 +796,25 @@ swap_on(struct lwp *l, struct swapdev *s
 	 * we skip the open/close for root on swap because the root
 	 * has already been opened when root was mounted (mountroot).
 	 */
+
+	error = diskuser_create("swap", &du);
+	if (error) {
+		return error;
+	}
 	if (vp != rootvp) {
-		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred, l)))
-			return (error);
+		error = disk_open(vp, FREAD|FWRITE, l->l_cred, l, du);
+		if (error) {
+			diskuser_destroy(du);
+			return error;
+		}
+	} else {
+		error = disk_use(vp, du);
+		if (error) {
+			diskuser_destroy(du);
+			return error;
+		}
 	}
+	sdp->swd_diskuser = du;
 
 	/* XXX this only works for block devices */
 	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);
@@ -964,8 +982,11 @@ bad:
 		blist_destroy(sdp->swd_blist);
 	}
 	if (vp != rootvp) {
-		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred, l);
+		(void)disk_close(vp, FREAD|FWRITE, l->l_cred, l, du);
+	} else {
+		disk_unuse(vp, du);
 	}
+	diskuser_destroy(du);
 	return (error);
 }
 
@@ -977,6 +998,7 @@ bad:
 static int
 swap_off(struct lwp *l, struct swapdev *sdp)
 {
+	diskuser_t *du;
 	int npages = sdp->swd_npages;
 	int error = 0;
 
@@ -1019,9 +1041,13 @@ swap_off(struct lwp *l, struct swapdev *
 	 * so that spec_close() can tell if this is the last close.
 	 */
 	vrele(sdp->swd_vp);
+	du = sdp->swd_diskuser;
 	if (sdp->swd_vp != rootvp) {
-		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred, l);
+		(void) disk_close(sdp->swd_vp, FREAD|FWRITE, l->l_cred, l, du);
+	} else {
+		disk_unuse(sdp->swd_vp, du);
 	}
+	diskuser_destroy(du);
 
 	simple_lock(&uvm.swap_data_lock);
 	uvmexp.swpages -= npages;

--NextPart-20070107152408-1761200--