Subject: support for mmap'ing disk block devices
To: None <tech-kern@NetBSD.ORG>
From: Jason Thorpe <thorpej@nas.nasa.gov>
List: tech-kern
Date: 06/28/1998 21:53:11
Hi folks...

So, I bit the bullet, and added support for mmap'ing disk block devices
today.  This is a cool feature that Solaris, for example, already has,
and that database apps (and, apparently, INN) like to use.

I added it to both Mach VM and UVM, since the changes were so trivial.
I've tested it under UVM with a SCSI disk.  For some reason, it does
NOT work with vnd(4) devices; I don't know why yet, but I managed to
jump through what looked like a NULL function pointer when I tried it.
Deep Evil somewhere else, I suspect.

Anyhow, I'd like to commit these RSN (real soon now), i.e. within the
next couple of days.

Objections?

Jason R. Thorpe                                       thorpej@nas.nasa.gov
NASA Ames Research Center                            Home: +1 408 866 1912
NAS: M/S 258-5                                       Work: +1 650 604 0935
Moffett Field, CA 94035                             Pager: +1 650 428 6939

Index: uvm/uvm_mmap.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_mmap.c,v
retrieving revision 1.10
diff -c -r1.10 uvm_mmap.c
*** uvm_mmap.c	1998/05/30 22:21:03	1.10
--- uvm_mmap.c	1998/06/29 05:00:36
***************
*** 302,309 ****
  			return (ENODEV);		/* only mmap vnodes! */
  		vp = (struct vnode *)fp->f_data;	/* convert to vnode */
  
! 		if (vp->v_type != VREG && vp->v_type != VCHR)
! 			return (ENODEV);	/* only REG/CHR support mmap */
  
  		/* special case: catch SunOS style /dev/zero */
  		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
--- 302,310 ----
  			return (ENODEV);		/* only mmap vnodes! */
  		vp = (struct vnode *)fp->f_data;	/* convert to vnode */
  
! 		if (vp->v_type != VREG && vp->v_type != VCHR &&
! 		    vp->v_type != VBLK)
! 			return (ENODEV);  /* only REG/CHR/BLK support mmap */
  
  		/* special case: catch SunOS style /dev/zero */
  		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
***************
*** 907,913 ****
  		}
  		
  		if (uobj == NULL)
! 			return((vp->v_type == VCHR) ? EINVAL : ENOMEM);
  
  		if ((flags & MAP_SHARED) == 0)
  			uvmflag |= UVM_FLAG_COPYONW;
--- 908,914 ----
  		}
  		
  		if (uobj == NULL)
! 			return((vp->v_type == VREG) ? ENOMEM : EINVAL);
  
  		if ((flags & MAP_SHARED) == 0)
  			uvmflag |= UVM_FLAG_COPYONW;
Index: uvm/uvm_vnode.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.12
diff -c -r1.12 uvm_vnode.c
*** uvm_vnode.c	1998/06/24 20:58:49	1.12
--- uvm_vnode.c	1998/06/29 05:00:38
***************
*** 61,66 ****
--- 61,72 ----
  #include <sys/proc.h>
  #include <sys/malloc.h>
  #include <sys/vnode.h>
+ #include <sys/disklabel.h>
+ #include <sys/ioctl.h>
+ #include <sys/fcntl.h>
+ #include <sys/conf.h>
+ 
+ #include <miscfs/specfs/specdev.h>
  
  #include <vm/vm.h>
  #include <vm/vm_page.h>
***************
*** 173,183 ****
--- 179,192 ----
  	struct uvm_vnode *uvn = &vp->v_uvm;
  	struct vattr vattr;
  	int oldflags, result;
+ 	struct partinfo pi;
  	u_quad_t used_vnode_size;
  	UVMHIST_FUNC("uvn_attach"); UVMHIST_CALLED(maphist);
  
  	UVMHIST_LOG(maphist, "(vn=0x%x)", arg,0,0,0);
  
+ 	used_vnode_size = (u_quad_t)0;	/* XXX gcc -Wuninitialized */
+ 
  	/*
  	 * first get a lock on the uvn.
  	 */
***************
*** 192,197 ****
--- 201,215 ----
  	}
  
  	/*
+ 	 * if we're mapping a BLK device, make sure it is a disk.
+ 	 */
+ 	if (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type != D_DISK) {
+ 		simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock */
+ 		UVMHIST_LOG(maphist,"<- done (VBLK not D_DISK!)", 0,0,0,0);
+ 		return(NULL);
+ 	}
+ 
+ 	/*
  	 * now we have lock and uvn must not be in a blocked state.
  	 * first check to see if it is already active, in which case
  	 * we can bump the reference count, check to see if we need to
***************
*** 235,253 ****
  	uvn->u_flags = UVM_VNODE_ALOCK;
  	simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock in case we sleep */
  		/* XXX: curproc? */
- 	result = VOP_GETATTR(vp, &vattr, curproc->p_ucred, curproc);
  
! 	/*
! 	 * make sure that the newsize fits within a vm_offset_t
! 	 * XXX: need to revise addressing data types
! 	 */
! 	used_vnode_size = vattr.va_size;
! 	if (used_vnode_size > (vm_offset_t) -PAGE_SIZE) {
! #ifdef DEBUG
! 		printf("uvn_attach: vn %p size truncated %qx->%x\n", vp,
! 		    used_vnode_size, -PAGE_SIZE);
! #endif    
! 		used_vnode_size = (vm_offset_t) -PAGE_SIZE;
  	}
  
  	/* relock object */
--- 253,279 ----
  	uvn->u_flags = UVM_VNODE_ALOCK;
  	simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock in case we sleep */
  		/* XXX: curproc? */
  
! 	if (vp->v_type == VBLK) {
! 		/*
! 		 * We could implement this as a specfs getattr call, but:
! 		 *
! 		 *	(1) VOP_GETATTR() would get the file system
! 		 *	    vnode operation, not the specfs operation.
! 		 *
! 		 *	(2) All we want is the size, anyhow.
! 		 */
! 		result = (*bdevsw[major(vp->v_rdev)].d_ioctl)(vp->v_rdev,
! 		    DIOCGPART, (caddr_t)&pi, FREAD, curproc);
! 		if (result == 0) {
! 			/* XXX should remember blocksize */
! 			used_vnode_size = (u_quad_t)pi.disklab->d_secsize *
! 			    (u_quad_t)pi.part->p_size;
! 		}
! 	} else {
! 		result = VOP_GETATTR(vp, &vattr, curproc->p_ucred, curproc);
! 		if (result == 0)
! 			used_vnode_size = vattr.va_size;
  	}
  
  	/* relock object */
***************
*** 261,267 ****
  		UVMHIST_LOG(maphist,"<- done (VOP_GETATTR FAILED!)", 0,0,0,0);
  		return(NULL);
  	}
! 	
  	/*
  	 * now set up the uvn.
  	 */
--- 287,306 ----
  		UVMHIST_LOG(maphist,"<- done (VOP_GETATTR FAILED!)", 0,0,0,0);
  		return(NULL);
  	}
! 
! 	/*
! 	 * make sure that the newsize fits within a vm_offset_t
! 	 * XXX: need to revise addressing data types
! 	 */
! if (vp->v_type == VBLK) printf("used_vnode_size = %qu\n", used_vnode_size);
! 	if (used_vnode_size > (vm_offset_t) -PAGE_SIZE) {
! #ifdef DEBUG
! 		printf("uvn_attach: vn %p size truncated %qx->%x\n", vp,
! 		    used_vnode_size, -PAGE_SIZE);
! #endif    
! 		used_vnode_size = (vm_offset_t) -PAGE_SIZE;
! 	}
! 
  	/*
  	 * now set up the uvn.
  	 */
Index: vm/vm_mmap.c
===================================================================
RCS file: /cvsroot/src/sys/vm/vm_mmap.c,v
retrieving revision 1.58
diff -c -r1.58 vm_mmap.c
*** vm_mmap.c	1998/05/30 22:21:03	1.58
--- vm_mmap.c	1998/06/29 05:00:38
***************
*** 223,229 ****
  		if (fp->f_type != DTYPE_VNODE)
  			return (ENODEV);
  		vp = (struct vnode *)fp->f_data;
! 		if (vp->v_type != VREG && vp->v_type != VCHR)
  			return (ENODEV);
  		/*
  		 * XXX hack to handle use of /dev/zero to map anon
--- 223,230 ----
  		if (fp->f_type != DTYPE_VNODE)
  			return (ENODEV);
  		vp = (struct vnode *)fp->f_data;
! 		if (vp->v_type != VREG && vp->v_type != VCHR &&
! 		    vp->v_type != VBLK)
  			return (ENODEV);
  		/*
  		 * XXX hack to handle use of /dev/zero to map anon
***************
*** 824,830 ****
  			goto out;
  	}
  	/*
! 	 * A regular file
  	 */
  	else {
  #ifdef DEBUG
--- 825,831 ----
  			goto out;
  	}
  	/*
! 	 * A regular file or block special file
  	 */
  	else {
  #ifdef DEBUG
Index: vm/vnode_pager.c
===================================================================
RCS file: /cvsroot/src/sys/vm/vnode_pager.c,v
retrieving revision 1.37
diff -c -r1.37 vnode_pager.c
*** vnode_pager.c	1998/06/24 20:58:49	1.37
--- vnode_pager.c	1998/06/29 05:00:38
***************
*** 55,63 ****
--- 55,69 ----
  #include <sys/proc.h>
  #include <sys/malloc.h>
  #include <sys/vnode.h>
+ #include <sys/disklabel.h>
+ #include <sys/ioctl.h>
+ #include <sys/fcntl.h>
+ #include <sys/conf.h>
  #include <sys/uio.h>
  #include <sys/mount.h>
  
+ #include <miscfs/specfs/specdev.h>
+ 
  #include <vm/vm.h>
  #include <vm/vm_page.h>
  #include <vm/vnode_pager.h>
***************
*** 130,135 ****
--- 136,142 ----
  	vm_object_t object;
  	struct vattr vattr;
  	struct vnode *vp;
+ 	struct partinfo pi;
  	u_quad_t used_vnode_size;
  	struct proc *p = curproc;	/* XXX */
  
***************
*** 137,142 ****
--- 144,150 ----
  	if (vpagerdebug & (VDB_FOLLOW|VDB_ALLOC))
  		printf("vnode_pager_alloc(%p, %lx, %x)\n", handle, size, prot);
  #endif
+ 
  	/*
  	 * Pageout to vnode, no can do yet.
  	 */
***************
*** 144,154 ****
  		return(NULL);
  
  	/*
  	 * Vnodes keep a pointer to any associated pager so no need to
  	 * lookup with vm_pager_lookup.
  	 */
- 	vp = (struct vnode *)handle;
  	pager = (vm_pager_t)vp->v_vmdata;
  	if (pager == NULL) {
  		/*
  		 * Allocate pager structures
--- 152,169 ----
  		return(NULL);
  
  	/*
+ 	 * If we're mapping a BLK device, make sure it's a disk.
+ 	 */
+ 	vp = (struct vnode *)handle;
+ 	if (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type != D_DISK)
+ 		return (NULL);
+ 
+ 	/*
  	 * Vnodes keep a pointer to any associated pager so no need to
  	 * lookup with vm_pager_lookup.
  	 */
  	pager = (vm_pager_t)vp->v_vmdata;
+ 
  	if (pager == NULL) {
  		/*
  		 * Allocate pager structures
***************
*** 162,177 ****
  			return(NULL);
  		}
  		/*
! 		 * And an object of the appropriate size
  		 */
! 		if (VOP_GETATTR(vp, &vattr, p->p_ucred, p) != 0) {
! 			free((caddr_t)vnp, M_VMPGDATA);
! 			free((caddr_t)pager, M_VMPAGER);
! 			return(NULL);
  		}
  		/* make sure mapping fits into numeric range,
  		 truncate if necessary */
- 		used_vnode_size = vattr.va_size;
  		if (used_vnode_size > (vm_offset_t)-PAGE_SIZE) {
  #ifdef DEBUG
  			printf("vnode_pager_alloc: vn %p size truncated %qx->%lx\n",
--- 177,213 ----
  			return(NULL);
  		}
  		/*
! 		 * And an object of the appropriate size.
  		 */
! 		if (vp->v_type == VBLK) {
! 			/*
! 			 * We could implement this as a specfs getattr
! 			 * call, but:
! 			 *
! 			 *	(1) VOP_GETATTR() would get the file system
! 			 *	    vnode operation, not the specfs operation.
! 			 *
! 			 *	(2) All we want is the size, anyhow.
! 			 */
! 			if ((*bdevsw[major(vp->v_rdev)].d_ioctl)(vp->v_rdev,
! 			    DIOCGPART, (caddr_t)&pi, FREAD, p) != 0) {
! 				free((caddr_t)vnp, M_VMPGDATA);
! 				free((caddr_t)pager, M_VMPAGER);
! 				return(NULL);
! 			}
! 			/* XXX should remember blocksize */
! 			used_vnode_size = (u_quad_t)pi.disklab->d_secsize *
! 			    (u_quad_t)pi.part->p_size;
! 		} else {
! 			if (VOP_GETATTR(vp, &vattr, p->p_ucred, p) != 0) {
! 				free((caddr_t)vnp, M_VMPGDATA);
! 				free((caddr_t)pager, M_VMPAGER);
! 				return(NULL);
! 			}
! 			used_vnode_size = vattr.va_size;
  		}
  		/* make sure mapping fits into numeric range,
  		 truncate if necessary */
  		if (used_vnode_size > (vm_offset_t)-PAGE_SIZE) {
  #ifdef DEBUG
  			printf("vnode_pager_alloc: vn %p size truncated %qx->%lx\n",