Subject: Re: replacing page mappings behind uvm
To: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
From: Manuel Bouyer <bouyer@antioche.eu.org>
List: tech-kern
Date: 12/01/2007 01:51:09
On Fri, Nov 30, 2007 at 05:59:52PM +0100, Manuel Bouyer wrote:
> Now that I'm thinking about it, it should be possible to create a
> uvm_object-backed mapping in the ioctl calls, shouldn't it?
>

I played with this, and it seems to work; see the attached privcmd.c.
There is a wired-entry accounting issue I still need to solve, which I
think is related to MMAPBATCH, but it looks promising.
--
Manuel Bouyer <bouyer@antioche.eu.org>
NetBSD: 26 years of experience will always make the difference
--
[attachment: privcmd.c]
/* $NetBSD: privcmd.c,v 1.21 2007/11/27 11:37:27 pooka Exp $ */
/*-
* Copyright (c) 2004 Christian Limpach.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christian Limpach.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: privcmd.c,v 1.21 2007/11/27 11:37:27 pooka Exp $");
#include "opt_compat_netbsd.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/dirent.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/kernfs/kernfs.h>
#include <uvm/uvm.h>
#include <uvm/uvm_fault.h>
#include <uvm/uvm_fault_i.h>
#include <xen/kernfs_machdep.h>
#include <xen/xenio.h>
#define PRIVCMD_MODE (S_IRUSR)
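/*
 * A privcmd uvm_object describes a range of foreign machine pages;
 * one is created for each PRIVCMD_MMAP entry or MMAPBATCH request.
 */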
struct privcmd_object {
struct uvm_object uobj;
paddr_t *maddr; /* array of machine address to map */
int npages;
int domid;
};
static void privpgop_reference(struct uvm_object *);
static void privpgop_detach(struct uvm_object *);
static int privpgop_fault(struct uvm_faultinfo *, vaddr_t, struct vm_page **,
int, int, vm_prot_t, int);
static int privcmd_map_obj(struct vm_map *, vaddr_t, off_t,
struct privcmd_object **);
static int
privcmd_ioctl(void *v)
{
struct vop_ioctl_args /* {
const struct vnodeop_desc *a_desc;
struct vnode *a_vp;
u_long a_command;
void *a_data;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
int error = 0;
struct privcmd_object *obj;
paddr_t *maddr;
switch (ap->a_command) {
case IOCTL_PRIVCMD_HYPERCALL: {
privcmd_hypercall_t *hc = ap->a_data;
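		/*
		 * The hypercall page holds one 32-byte stub per operation,
		 * so reject op numbers past the end of the page.
		 */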
if (hc->op >= (PAGE_SIZE >> 5))
return EINVAL;
error = -EOPNOTSUPP;
#if defined(i386)
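		/*
		 * Load hc->op and hc->arg[0..4] into %eax/%ebx/%ecx/%edx/
		 * %esi/%edi, then enter the hypervisor (through the
		 * hypercall page on Xen 3, or TRAP_INSTR otherwise).
		 */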
__asm volatile (
"pushl %%ebx; pushl %%ecx; pushl %%edx;"
"pushl %%esi; pushl %%edi; "
"movl 4(%%eax),%%ebx ;"
"movl 8(%%eax),%%ecx ;"
"movl 12(%%eax),%%edx ;"
"movl 16(%%eax),%%esi ;"
"movl 20(%%eax),%%edi ;"
"movl (%%eax),%%eax ;"
#if defined(XEN3) && !defined(XEN_COMPAT_030001)
"shll $5,%%eax ;"
"addl $hypercall_page,%%eax ;"
"call *%%eax ;"
#else
TRAP_INSTR "; "
#endif
"popl %%edi; popl %%esi; popl %%edx;"
"popl %%ecx; popl %%ebx"
: "=a" (error) : "0" (ap->a_data) : "memory" );
#endif /* i386 */
#if defined(x86_64)
{
long i1, i2, i3;
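			/*
			 * Per the Xen x86_64 ABI the arguments go in
			 * %rdi, %rsi, %rdx, %r10 and %r8.
			 */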
__asm volatile (
"movq %8,%%r10; movq %9,%%r8;"
"shll $5,%%eax ;"
"addq $hypercall_page,%%rax ;"
"call *%%rax"
: "=a" (error), "=D" (i1),
"=S" (i2), "=d" (i3)
: "0" ((unsigned int)hc->op),
"1" (hc->arg[0]),
"2" (hc->arg[1]),
"3" (hc->arg[2]),
"g" (hc->arg[3]),
"g" (hc->arg[4])
: "r8", "r10", "memory" );
}
#endif /* x86_64 */
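		/* the hypervisor returns a negative errno; flip the sign */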
error = -error;
break;
}
#ifndef XEN3
#if defined(COMPAT_30)
case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN_OLD:
{
extern int initdom_ctrlif_domcontroller_port;
error = initdom_ctrlif_domcontroller_port;
}
break;
#endif /* defined(COMPAT_30) */
case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN:
{
extern int initdom_ctrlif_domcontroller_port;
*(int *)ap->a_data = initdom_ctrlif_domcontroller_port;
}
error = 0;
break;
#endif /* XEN3 */
case IOCTL_PRIVCMD_MMAP:
{
int i, j;
privcmd_mmap_t *mcmd = ap->a_data;
privcmd_mmap_entry_t mentry;
vaddr_t va;
u_long ma;
struct vm_map *vmm = &curlwp->l_proc->p_vmspace->vm_map;
//printf("IOCTL_PRIVCMD_MMAP: %d entries\n", mcmd->num);
//pmap_t pmap = vm_map_pmap(vmm);
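		/*
		 * For each entry, replace the existing user mapping with a
		 * privcmd object; the machine addresses recorded below are
		 * entered lazily by the fault handler.
		 */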
for (i = 0; i < mcmd->num; i++) {
error = copyin(&mcmd->entry[i], &mentry, sizeof(mentry));
if (error)
return error;
//printf("entry %d va 0x%lx npages %lu mfm 0x%lx\n", i, mentry.va, mentry.npages, mentry.mfn);
if (mentry.va > VM_MAXUSER_ADDRESS)
return EINVAL;
#if 0
if (mentry.va + (mentry.npages << PGSHIFT) >
mrentry->vm_end)
return EINVAL;
#endif
maddr = malloc(sizeof(paddr_t) * mentry.npages,
M_TEMP, M_WAITOK);
if (maddr == NULL)
return ENOMEM;
va = mentry.va & ~PAGE_MASK;
ma = mentry.mfn << PGSHIFT; /* XXX ??? */
error = privcmd_map_obj(vmm, va,
(mentry.npages << PGSHIFT), &obj);
if (error)
return error;
for (j = 0; j < mentry.npages; j++) {
//printf("remap va 0x%lx to 0x%lx\n", va, ma);
maddr[j] = ma;
ma += PAGE_SIZE;
}
simple_lock(&obj->uobj.vmobjlock);
obj->maddr = maddr;
obj->npages = mentry.npages;
obj->domid = mcmd->dom;
simple_unlock(&obj->uobj.vmobjlock);
}
break;
}
case IOCTL_PRIVCMD_MMAPBATCH:
{
int i;
privcmd_mmapbatch_t* pmb = ap->a_data;
vaddr_t va0, va;
u_long mfn, ma;
struct vm_map *vmm;
pmap_t pmap;
vmm = &curlwp->l_proc->p_vmspace->vm_map;
pmap = vm_map_pmap(vmm);
va0 = pmb->addr & ~PAGE_MASK;
if (va0 > VM_MAXUSER_ADDRESS)
return EINVAL;
if (((VM_MAXUSER_ADDRESS - va0) >> PGSHIFT) < pmb->num)
return EINVAL;
//printf("mmapbatch: va0=%lx num=%d dom=%d\n", va0, pmb->num, pmb->dom);
maddr = malloc(sizeof(paddr_t) * pmb->num, M_TEMP, M_WAITOK);
if (maddr == NULL)
return ENOMEM;
error = privcmd_map_obj(vmm, va0, (pmb->num << PGSHIFT), &obj);
if (error)
return error;
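		/*
		 * Record the machine address of each frame in the object;
		 * the uvm_map_pageable() call below faults them all in, and
		 * the fault handler flags any page that could not be mapped.
		 */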
		for (i = 0; i < pmb->num; ++i) {
va = va0 + (i * PAGE_SIZE);
error = copyin(&pmb->arr[i], &mfn, sizeof(mfn));
if (error != 0)
return error;
ma = mfn << PGSHIFT;
//printf("mmapbatch: va=%lx ma=%lx dom=%d\n", va, ma, pmb->dom);
maddr[i] = ma;
#if 0
error = pmap_enter_ma(pmap, va, ma, 0, prot,
PMAP_WIRED | PMAP_CANFAIL, pmb->dom);
if (error != 0) {
//printf("mmapbatch: remap error %d!\n", error);
mfn |= 0xF0000000;
copyout(&mfn, &pmb->arr[i], sizeof(mfn));
}
#endif
}
simple_lock(&obj->uobj.vmobjlock);
obj->maddr = maddr;
obj->npages = pmb->num;
obj->domid = pmb->dom;
simple_unlock(&obj->uobj.vmobjlock);
/* force mapping to be loaded */
if ((error = uvm_map_pageable(vmm, va0,
va0 + (pmb->num << PGSHIFT), false, 0)) != 0) {
printf("uvm_map_pageable error %d\n", error);
return(error);
}
		/* check whether the mappings worked */
		for (i = 0; i < pmb->num; ++i) {
if ((obj->maddr[i] & 0xF0000000000) == 0xF0000000000) {
//printf("error entry %d ma 0x%lx\n", i, obj->maddr[i] &~0xF0000000000);
mfn = obj->maddr[i] >> PGSHIFT;
copyout(&mfn, &pmb->arr[i], sizeof(mfn));
}
}
error = 0;
break;
}
#ifndef XEN3
case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN:
{
unsigned long *mfn_start = ap->a_data;
*mfn_start = HYPERVISOR_shared_info->arch.mfn_to_pfn_start;
error = 0;
}
break;
#endif /* !XEN3 */
default:
error = EINVAL;
}
return error;
}
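/*
 * Pager operations for privcmd objects: plain reference counting plus a
 * custom fault handler that enters foreign machine pages with
 * pmap_enter_ma() instead of using struct vm_page.
 */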
static struct uvm_pagerops privpgops = {
.pgo_reference = privpgop_reference,
.pgo_detach = privpgop_detach,
.pgo_fault = privpgop_fault,
};
static void
privpgop_reference(struct uvm_object *uobj)
{
simple_lock(&uobj->vmobjlock);
uobj->uo_refs++;
simple_unlock(&uobj->vmobjlock);
}
static void
privpgop_detach(struct uvm_object *uobj)
{
struct privcmd_object *pobj = (struct privcmd_object *)uobj;
simple_lock(&uobj->vmobjlock);
if (uobj->uo_refs > 1) {
uobj->uo_refs--;
simple_unlock(&uobj->vmobjlock);
return;
}
simple_unlock(&uobj->vmobjlock);
free(pobj->maddr, M_TEMP);
free(pobj, M_TEMP);
}
static int
privpgop_fault(struct uvm_faultinfo *ufi, vaddr_t vaddr, struct vm_page **pps,
int npages, int centeridx, vm_prot_t access_type, int flags)
{
struct vm_map_entry *entry = ufi->entry;
struct uvm_object *uobj = entry->object.uvm_obj;
struct privcmd_object *pobj = (struct privcmd_object*)uobj;
int maddr_i;
int i, error = 0;
//printf("privpgop_fault va 0x%lx npages %d flags 0x%x\n", vaddr, npages, flags);
/* compute offset from start of map */
maddr_i = (entry->offset + (vaddr - entry->start)) >> PAGE_SHIFT;
//printf("maddr_i %d\n", maddr_i);
if (maddr_i + npages > pobj->npages)
return EINVAL;
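	/*
	 * No vm_pages back these mappings; enter the machine address of
	 * each requested page into the faulting pmap directly.
	 */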
	for (i = 0; i < npages; i++, maddr_i++, vaddr += PAGE_SIZE) {
if ((flags & PGO_ALLPAGES) == 0 && i != centeridx)
continue;
if (pps[i] == PGO_DONTCARE)
continue;
//printf("privpgop_fault pmap_enter ma 0x%lx -> 0x%lx %d\n", vaddr, pobj->maddr[i], pobj->domid);
if ((pobj->maddr[maddr_i] & 0xF0000000000) == 0xF0000000000)
continue; /* this has already been flagged as error */
error = pmap_enter_ma(ufi->orig_map->pmap, vaddr,
pobj->maddr[maddr_i], 0, ufi->entry->protection,
PMAP_CANFAIL | ufi->entry->protection,
pobj->domid);
if (error == ENOMEM) {
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap,
uobj, NULL);
pmap_update(ufi->orig_map->pmap);
uvm_wait("udv_fault");
return (ERESTART);
}
if (error) {
//printf("privcmd_fault: pmap_enter failed %d entry %d ma 0x%lx\n", error, maddr_i, pobj->maddr[maddr_i]);
			/* XXX for proper ptp accounting */
pmap_remove(ufi->orig_map->pmap, vaddr,
vaddr + PAGE_SIZE);
pobj->maddr[maddr_i] |= 0xF0000000000;
error = 0;
}
}
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj, NULL);
pmap_update(ufi->orig_map->pmap);
return (error);
}
static int
privcmd_map_obj(struct vm_map *map, vaddr_t start, off_t size,
struct privcmd_object **nobj)
{
struct vm_map_entry *dead_entries;
struct uvm_object *uobj;
int error;
uvm_flag_t uvmflag;
vaddr_t newstart = start;
vm_prot_t prot;
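	/*
	 * Replace whatever is currently mapped at [start, start + size)
	 * with a mapping backed by a fresh privcmd object.  The map is
	 * unlocked before uvm_map() runs, hence the newstart check below.
	 */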
vm_map_lock(map);
	/* get protections; this also checks that the mapping is valid */
if (uvm_map_checkprot(map, start, start + size - 1, VM_PROT_WRITE))
prot = VM_PROT_READ | VM_PROT_WRITE;
else if (uvm_map_checkprot(map, start, start + size - 1, VM_PROT_READ))
prot = VM_PROT_READ;
else {
printf("uvm_map_checkprot 0x%lx -> 0x%lx "
"failed\n",
start, start + size - 1);
vm_map_unlock(map);
return EINVAL;
}
/* remove current entries */
uvm_unmap_remove(map, start, start + size, &dead_entries, NULL, 0);
if (dead_entries != NULL)
uvm_unmap_detach(dead_entries, 0);
vm_map_unlock(map);
uobj = malloc(sizeof(struct privcmd_object), M_TEMP, M_WAITOK);
if (uobj == NULL)
return ENOMEM;
UVM_OBJ_INIT(uobj, &privpgops, 1);
uvmflag = UVM_MAPFLAG(prot, prot, UVM_INH_NONE, UVM_ADV_NORMAL,
UVM_FLAG_FIXED | UVM_FLAG_NOMERGE);
error = uvm_map(map, &newstart, size, uobj, 0, 0, uvmflag);
if (error) {
if (uobj)
uobj->pgops->pgo_detach(uobj);
return error;
}
if (newstart != start) {
printf("uvm_map didn't give us back our vm space\n");
return EINVAL;
}
*nobj = (struct privcmd_object *)uobj;
return 0;
}
static const struct kernfs_fileop privcmd_fileops[] = {
{ .kf_fileop = KERNFS_FILEOP_IOCTL, .kf_vop = privcmd_ioctl },
};
void
xenprivcmd_init(void)
{
kernfs_entry_t *dkt;
kfstype kfst;
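	/* only a privileged domain (dom0) gets the privcmd kernfs node */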
if ((xen_start_info.flags & SIF_PRIVILEGED) == 0)
return;
kfst = KERNFS_ALLOCTYPE(privcmd_fileops);
KERNFS_ALLOCENTRY(dkt, M_TEMP, M_WAITOK);
KERNFS_INITENTRY(dkt, DT_REG, "privcmd", NULL, kfst, VREG,
PRIVCMD_MODE);
kernfs_addentry(kernxen_pkt, dkt);
}