Subject: Re: replacing page mappings behind uvm
To: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
From: Manuel Bouyer <bouyer@antioche.eu.org>
List: tech-kern
Date: 12/01/2007 01:51:09
--17pEHd4RhPHOinZp
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Fri, Nov 30, 2007 at 05:59:52PM +0100, Manuel Bouyer wrote:
> Now that I'm thinking about it, it should be possible to create a
> uvm_object backed mapping in the ioctl calls, isn't it ?
> 

I played with this, and it seems to work; see the attached privcmd.c.
There is still a wired-entry accounting issue that I need to solve,
which I think is related to MMAPBATCH, but it looks promising.

-- 
Manuel Bouyer <bouyer@antioche.eu.org>
     NetBSD: 26 ans d'experience feront toujours la difference
--

--17pEHd4RhPHOinZp
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="privcmd.c"

/* $NetBSD: privcmd.c,v 1.21 2007/11/27 11:37:27 pooka Exp $ */

/*-
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Christian Limpach.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: privcmd.c,v 1.21 2007/11/27 11:37:27 pooka Exp $");

#include "opt_compat_netbsd.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/dirent.h>
#include <sys/stat.h>
#include <sys/proc.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/kernfs/kernfs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_fault.h>
#include <uvm/uvm_fault_i.h>

#include <xen/kernfs_machdep.h>
#include <xen/xenio.h>

#define	PRIVCMD_MODE	(S_IRUSR)

/*
 * UVM object backing one range mapped through the privcmd ioctls.
 * The uvm_object must stay the first member: the pager operations in
 * this file cast a struct uvm_object * straight to a
 * struct privcmd_object *.
 */
struct privcmd_object {
	struct uvm_object uobj;	/* embedded UVM object (lock, refcount, pgops) */
	paddr_t *maddr; /* array of machine addresses to map, one per page;
			 * allocated by the ioctl, freed by privpgop_detach() */
	int	npages;		/* number of entries in maddr[] */
	int	domid;		/* domain id passed to pmap_enter_ma() */
};

/*
 * Forward declarations: the three pager operations installed in
 * privpgops, and the helper that replaces a range of a vm_map with a
 * mapping of a freshly allocated privcmd_object.
 */
static void privpgop_reference(struct uvm_object *);
static void privpgop_detach(struct uvm_object *);
static int privpgop_fault(struct uvm_faultinfo *, vaddr_t , struct vm_page **,
			 int, int, vm_prot_t, int);
static int privcmd_map_obj(struct vm_map *, vaddr_t, off_t,
			   struct privcmd_object **);

/*
 * privcmd_ioctl: ioctl vnode operation of the kernfs "privcmd" node.
 *
 * v is a struct vop_ioctl_args *; a_command selects the operation and
 * a_data points at its argument structure:
 *
 *  IOCTL_PRIVCMD_HYPERCALL  issue the hypercall described by a
 *	privcmd_hypercall_t and return its negated result.
 *  IOCTL_PRIVCMD_MMAP       for each privcmd_mmap_entry_t, replace the
 *	user range at entry.va with a privcmd_object covering npages
 *	contiguous machine pages starting at entry.mfn.
 *  IOCTL_PRIVCMD_MMAPBATCH  replace the user range at pmb->addr with a
 *	privcmd_object covering the pmb->num machine frames listed in
 *	pmb->arr, force the mapping to be loaded, and write back the
 *	frames that could not be entered.
 *  (non-XEN3 only) INITDOMAIN_EVTCHN[_OLD], GET_MACH2PHYS_START_MFN.
 *
 * Returns 0 or an errno value; unknown commands yield EINVAL.
 *
 * Ownership: the maddr[] array allocated here belongs to this function
 * until it has been stored in the privcmd_object; after that it is
 * released by privpgop_detach().  Every error return before the hand-over
 * therefore frees it.
 */
static int
privcmd_ioctl(void *v)
{
	struct vop_ioctl_args /* {
		const struct vnodeop_desc *a_desc;
		struct vnode *a_vp;
		u_long a_command;
		void *a_data;
		int a_fflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	int error = 0;
	struct privcmd_object *obj;
	paddr_t *maddr;

	switch (ap->a_command) {
	case IOCTL_PRIVCMD_HYPERCALL: {
		privcmd_hypercall_t *hc = ap->a_data;
		/*
		 * The hypercall number is turned into an offset into
		 * hypercall_page by shifting it left by 5 (see the asm
		 * below), so it must stay below PAGE_SIZE >> 5.
		 */
		if (hc->op >= (PAGE_SIZE >> 5))
			return EINVAL;
		/* result if neither architecture block below is compiled in */
		error = -EOPNOTSUPP;
#if defined(i386)
		/*
		 * %eax holds the address of the privcmd_hypercall_t: the
		 * five arguments are loaded from offsets 4..20 and the
		 * hypercall number from offset 0.
		 */
		__asm volatile (
			"pushl %%ebx; pushl %%ecx; pushl %%edx;"
			"pushl %%esi; pushl %%edi; "
			"movl  4(%%eax),%%ebx ;"
			"movl  8(%%eax),%%ecx ;"
			"movl 12(%%eax),%%edx ;"
			"movl 16(%%eax),%%esi ;"
			"movl 20(%%eax),%%edi ;"
			"movl   (%%eax),%%eax ;"
#if defined(XEN3) && !defined(XEN_COMPAT_030001)
			"shll $5,%%eax ;"
			"addl $hypercall_page,%%eax ;"
			"call *%%eax ;"
#else
			TRAP_INSTR "; "
#endif
			"popl %%edi; popl %%esi; popl %%edx;"
			"popl %%ecx; popl %%ebx"
			: "=a" (error) : "0" (ap->a_data) : "memory" );
#endif /* i386 */
#if defined(x86_64)
		{
		long i1, i2, i3;
		__asm volatile (
			"movq %8,%%r10; movq %9,%%r8;"
			"shll $5,%%eax ;"
			"addq $hypercall_page,%%rax ;"
			"call *%%rax"
			: "=a" (error), "=D" (i1),
			  "=S" (i2), "=d" (i3)
			: "0" ((unsigned int)hc->op),
			  "1" (hc->arg[0]),
			  "2" (hc->arg[1]),
			  "3" (hc->arg[2]),
			  "g" (hc->arg[3]),
			  "g" (hc->arg[4])
			: "r8", "r10", "memory" );
		}
#endif /* x86_64 */
		/* presumably the hypervisor returns -errno; flip the sign */
		error = -error;
		break;
	}
#ifndef XEN3
#if defined(COMPAT_30)
	case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN_OLD:
		{
		extern int initdom_ctrlif_domcontroller_port;
		/* the old interface hands the port back as the return value */
		error = initdom_ctrlif_domcontroller_port;
		}
		break;
#endif /* defined(COMPAT_30) */
	case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN:
		{
		extern int initdom_ctrlif_domcontroller_port;
		*(int *)ap->a_data = initdom_ctrlif_domcontroller_port;
		}
		error = 0;
		break;
#endif /* XEN3 */
	case IOCTL_PRIVCMD_MMAP:
	{
		int i;
		u_long j;
		privcmd_mmap_t *mcmd = ap->a_data;
		privcmd_mmap_entry_t mentry;
		vaddr_t va;
		u_long ma;
		struct vm_map *vmm = &curlwp->l_proc->p_vmspace->vm_map;

		for (i = 0; i < mcmd->num; i++) {
			error = copyin(&mcmd->entry[i], &mentry, sizeof(mentry));
			if (error)
				return error;
			if (mentry.va > VM_MAXUSER_ADDRESS)
				return EINVAL;
			va = mentry.va & ~PAGE_MASK;
			/*
			 * Reject empty ranges and ranges running past the end
			 * of user space.  This also bounds npages so that
			 * neither the allocation size nor npages << PGSHIFT
			 * below can overflow.
			 */
			if (mentry.npages == 0 ||
			    ((VM_MAXUSER_ADDRESS - va) >> PGSHIFT) <
			    mentry.npages)
				return EINVAL;

			maddr = malloc(sizeof(*maddr) * mentry.npages,
			    M_TEMP, M_WAITOK);
			if (maddr == NULL)
				return ENOMEM;

			/* machine pages are contiguous, starting at mentry.mfn */
			ma = mentry.mfn <<  PGSHIFT; /* XXX ??? */
			for (j = 0; j < mentry.npages; j++) {
				maddr[j] = ma;
				ma += PAGE_SIZE;
			}

			error  = privcmd_map_obj(vmm, va,
			    (mentry.npages << PGSHIFT), &obj);
			if (error) {
				/* the object never took ownership of maddr */
				free(maddr, M_TEMP);
				return error;
			}

			/* hand the array over; privpgop_detach() frees it */
			simple_lock(&obj->uobj.vmobjlock);
			obj->maddr = maddr;
			obj->npages = mentry.npages;
			obj->domid = mcmd->dom;
			simple_unlock(&obj->uobj.vmobjlock);
		}
		break;
	}
	case IOCTL_PRIVCMD_MMAPBATCH:
	{
		int i;
		privcmd_mmapbatch_t* pmb = ap->a_data;
		vaddr_t va0;
		vsize_t len;
		u_long mfn;
		struct vm_map *vmm;

		vmm = &curlwp->l_proc->p_vmspace->vm_map;
		va0 = pmb->addr & ~PAGE_MASK;

		if (va0 > VM_MAXUSER_ADDRESS)
			return EINVAL;
		if (pmb->num <= 0)
			return EINVAL;
		if (((VM_MAXUSER_ADDRESS - va0) >> PGSHIFT) < pmb->num)
			return EINVAL;
		/* widen before shifting: num << PGSHIFT may not fit an int */
		len = (vsize_t)pmb->num << PGSHIFT;

		maddr = malloc(sizeof(*maddr) * pmb->num, M_TEMP, M_WAITOK);
		if (maddr == NULL)
			return ENOMEM;

		/*
		 * Fetch the whole frame list before touching the address
		 * space, so that a bad user pointer neither leaks maddr nor
		 * leaves a mapped object without its address array.
		 */
		for(i = 0; i < pmb->num; ++i) {
			error = copyin(&pmb->arr[i], &mfn, sizeof(mfn));
			if (error != 0) {
				free(maddr, M_TEMP);
				return error;
			}
			maddr[i] = mfn << PGSHIFT;
		}

		error  = privcmd_map_obj(vmm, va0, len, &obj);
		if (error) {
			/* the object never took ownership of maddr */
			free(maddr, M_TEMP);
			return error;
		}

		/* hand the array over; privpgop_detach() frees it */
		simple_lock(&obj->uobj.vmobjlock);
		obj->maddr = maddr;
		obj->npages = pmb->num;
		obj->domid = pmb->dom;
		simple_unlock(&obj->uobj.vmobjlock);
		/* force mapping to be loaded */
		if ((error = uvm_map_pageable(vmm, va0,
		    va0 + len, false, 0)) != 0) {
			printf("uvm_map_pageable error %d\n", error);
			return(error);
		}
		/*
		 * check if mappings did work: privpgop_fault() sets
		 * 0xF0000000000 in the entries it could not enter.  Report
		 * those back to the caller as (flagged address >> PGSHIFT);
		 * with 4KB pages that sets 0xF0000000 in the returned mfn.
		 */
		for(i = 0; i < pmb->num; ++i) {
			if ((obj->maddr[i] & 0xF0000000000) == 0xF0000000000) {
				mfn = obj->maddr[i] >> PGSHIFT;
				copyout(&mfn, &pmb->arr[i], sizeof(mfn));
			}
		}
		error = 0;
		break;
	}
#ifndef XEN3
	case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN:
		{
		unsigned long *mfn_start = ap->a_data;
		*mfn_start = HYPERVISOR_shared_info->arch.mfn_to_pfn_start;
		error = 0;
		}
		break;
#endif /* !XEN3 */
	default:
		error = EINVAL;
	}
	
	return error;
}

/*
 * Pager operations attached to every privcmd_object by privcmd_map_obj().
 * Only reference, detach and fault are provided; the remaining members
 * are left zero-initialized.
 */
static struct uvm_pagerops privpgops = {
  .pgo_reference = privpgop_reference,
  .pgo_detach = privpgop_detach,
  .pgo_fault = privpgop_fault,
};

/*
 * privpgop_reference: pgo_reference operation.
 * Takes one more reference on the object, under the object lock.
 */
static void
privpgop_reference(struct uvm_object *uobj)
{

	simple_lock(&uobj->vmobjlock);
	uobj->uo_refs += 1;
	simple_unlock(&uobj->vmobjlock);
}

static void
privpgop_detach(struct uvm_object *uobj)
{
	struct privcmd_object *pobj = (struct privcmd_object *)uobj;
	simple_lock(&uobj->vmobjlock);
	if (uobj->uo_refs > 1) {
		uobj->uo_refs--;
		simple_unlock(&uobj->vmobjlock);
		return;
	}
	simple_unlock(&uobj->vmobjlock);
	free(pobj->maddr, M_TEMP);
	free(pobj, M_TEMP);
}

/*
 * privpgop_fault: pgo_fault operation.
 *
 * Enters the machine pages recorded in the privcmd_object into the pmap
 * of the faulting map, for the npages pages starting at vaddr.
 *
 *  ufi         fault info; its entry maps the privcmd_object
 *  vaddr       virtual address of the first page of the range
 *  pps         per-page slots; PGO_DONTCARE slots are skipped
 *  npages      number of pages in the range
 *  centeridx   index of the faulting page within the range
 *  access_type unused here
 *  flags       without PGO_ALLPAGES only the center page is entered
 *
 * Returns 0 on success, EINVAL when the range falls outside the object's
 * address array, and ERESTART after waiting for memory when
 * pmap_enter_ma() reports ENOMEM.  Any other pmap_enter_ma() failure is
 * not returned: the entry is flagged by or-ing 0xF0000000000 into its
 * machine address so that the ioctl code can report it later.
 *
 * Every exit path calls uvmfault_unlockall() before returning.
 */
static int
privpgop_fault(struct uvm_faultinfo *ufi, vaddr_t vaddr, struct vm_page **pps,
    int npages, int centeridx, vm_prot_t access_type, int flags)
{
	struct vm_map_entry *entry = ufi->entry;
	struct uvm_object *uobj = entry->object.uvm_obj;
	struct privcmd_object *pobj = (struct privcmd_object*)uobj;
	int maddr_i;
	int i, error = 0;

	/* compute offset from start of map */
	maddr_i = (entry->offset + (vaddr - entry->start)) >> PAGE_SHIFT;
	if (maddr_i < 0 || maddr_i + npages > pobj->npages) {
		/* unlock on this exit too, like every other return below */
		uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj, NULL);
		return EINVAL;
	}
	for (i = 0; i < npages; i++, maddr_i++, vaddr+= PAGE_SIZE) {
		if ((flags & PGO_ALLPAGES) == 0 && i != centeridx)
			continue;
		if (pps[i] == PGO_DONTCARE)
			continue;
		if ((pobj->maddr[maddr_i] & 0xF0000000000) == 0xF0000000000)
			continue; /* this has already been flagged as error */
		error = pmap_enter_ma(ufi->orig_map->pmap, vaddr,
		    pobj->maddr[maddr_i], 0, ufi->entry->protection,
		    PMAP_CANFAIL | ufi->entry->protection,
		    pobj->domid);
		if (error == ENOMEM) {
			/* drop the locks, wait for memory, have the fault retried */
			uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap,
			    uobj, NULL);
			pmap_update(ufi->orig_map->pmap);
			uvm_wait("udv_fault");
			return (ERESTART);
		}
		if (error) {
			/* XXX for proper ptp accountings */
			pmap_remove(ufi->orig_map->pmap, vaddr, 
			    vaddr + PAGE_SIZE);
			/* remember the failure instead of failing the fault */
			pobj->maddr[maddr_i] |= 0xF0000000000;
			error = 0;
		}
	}
	uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj, NULL);
	pmap_update(ufi->orig_map->pmap);
	return (error);
}

/*
 * privcmd_map_obj: replace [start, start + size) of map with a mapping of
 * a newly allocated privcmd_object.
 *
 *  map    the vm_map to modify
 *  start  first address of the range
 *  size   length of the range in bytes
 *  nobj   on success, receives the new object
 *
 * The new mapping gets read/write protection when the existing range is
 * writable, read-only protection when it is only readable; a range that
 * is neither is rejected with EINVAL.  The existing entries are removed
 * before the object is mapped at the same address.
 *
 * The object is returned with maddr == NULL and npages == 0; the caller
 * fills them in.  Until then privpgop_fault() rejects every fault on the
 * range, because any page index is beyond npages.
 *
 * Returns 0 or an errno value.  On failure *nobj is not written and the
 * object, if it was allocated, has been freed.
 */
static int
privcmd_map_obj(struct vm_map *map, vaddr_t start, off_t size,
	struct privcmd_object **nobj)
{
	struct vm_map_entry *dead_entries;
	struct privcmd_object *pobj;
	struct uvm_object *uobj;
	int error;
	uvm_flag_t uvmflag;
	vaddr_t newstart = start;
	vm_prot_t prot;

	vm_map_lock(map);
	/* get protections. This also check for validity of mapping */
	if (uvm_map_checkprot(map, start, start + size - 1, VM_PROT_WRITE))
		prot = VM_PROT_READ | VM_PROT_WRITE;
	else if (uvm_map_checkprot(map, start, start + size - 1, VM_PROT_READ))
		prot = VM_PROT_READ;
	else {
		/* cast: the sum has off_t type, which need not match %lx */
		printf("uvm_map_checkprot 0x%lx -> 0x%lx "
		    "failed\n",
		    (u_long)start, (u_long)(start + size - 1));
		vm_map_unlock(map);
		return EINVAL;
	}
	/* remove current entries */
	uvm_unmap_remove(map, start, start + size, &dead_entries, NULL, 0);
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);

	vm_map_unlock(map);
	pobj = malloc(sizeof(*pobj), M_TEMP, M_WAITOK);
	if (pobj == NULL)
		return ENOMEM;

	/*
	 * Start from a well-defined empty state: the mapping becomes
	 * visible as soon as uvm_map() returns, before the caller has
	 * stored the address array.
	 */
	pobj->maddr = NULL;
	pobj->npages = 0;
	pobj->domid = 0;
	uobj = &pobj->uobj;

	UVM_OBJ_INIT(uobj, &privpgops, 1);
	uvmflag = UVM_MAPFLAG(prot, prot, UVM_INH_NONE, UVM_ADV_NORMAL,
	    UVM_FLAG_FIXED | UVM_FLAG_NOMERGE);
	error = uvm_map(map, &newstart, size, uobj, 0, 0, uvmflag);

	if (error) {
		/*
		 * The object was never mapped and has no address array yet,
		 * so release it directly; privpgop_detach() would also try
		 * to free pobj->maddr.
		 */
		free(pobj, M_TEMP);
		return error;
	}
	if (newstart != start) {
		/*
		 * NOTE(review): not expected with UVM_FLAG_FIXED; if it ever
		 * happens the object stays mapped at newstart.
		 */
		printf("uvm_map didn't give us back our vm space\n");
		return EINVAL;
	}
	*nobj = pobj;
	return 0;
}

/*
 * kernfs file operations of the "privcmd" node: only the ioctl vnode
 * operation is overridden, by privcmd_ioctl().
 */
static const struct kernfs_fileop privcmd_fileops[] = {
  { .kf_fileop = KERNFS_FILEOP_IOCTL, .kf_vop = privcmd_ioctl },
};

/*
 * xenprivcmd_init: register the "privcmd" kernfs node.
 *
 * Does nothing unless the SIF_PRIVILEGED flag is set in xen_start_info.
 * Otherwise allocates a kernfs file type bound to privcmd_fileops and
 * adds a regular-file entry named "privcmd", with mode PRIVCMD_MODE,
 * under kernxen_pkt.
 */
void
xenprivcmd_init(void)
{
	kernfs_entry_t *dkt;
	kfstype kfst;

	if ((xen_start_info.flags & SIF_PRIVILEGED) == 0)
		return;

	kfst = KERNFS_ALLOCTYPE(privcmd_fileops);

	KERNFS_ALLOCENTRY(dkt, M_TEMP, M_WAITOK);
	KERNFS_INITENTRY(dkt, DT_REG, "privcmd", NULL, kfst, VREG,
	    PRIVCMD_MODE);
	kernfs_addentry(kernxen_pkt, dkt);
}

--17pEHd4RhPHOinZp--