Port-xen archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[PATCH] xen: add gntdev



This is a basic (and experimental) gntdev implementation for NetBSD.

The gnt device allows usermode applications to map grant references in
userspace. It is mainly used by Qemu to implement a Xen backend (that
runs in userspace).

Due to the fact that qemu-upstream is not yet functional in NetBSD,
the only way to try this gntdev is to use the old qemu
(qemu-traditional).

Performance is not that bad, given that we are using qemu-traditional
and running a backend in userspace: write throughput inside the DomU
is 64.7 MB/s, while in the Dom0 it is 104.6 MB/s. For read operations,
the throughput inside the DomU is 76.0 MB/s, while in the Dom0 it is
108.8 MB/s.

Patches to libxc and libxl are also coming soon.

Cc: xen-devel%lists.xen.org@localhost
---
 etc/MAKEDEV.tmpl                   |    5 +
 etc/etc.amd64/MAKEDEV.conf         |    2 +-
 etc/etc.i386/MAKEDEV.conf          |    2 +-
 sys/arch/amd64/conf/XEN3_DOM0      |    1 +
 sys/arch/amd64/conf/majors.amd64   |    1 +
 sys/arch/i386/conf/XEN3_DOM0       |    1 +
 sys/arch/i386/conf/majors.i386     |    1 +
 sys/arch/xen/conf/files.xen        |    2 +
 sys/arch/xen/include/xen_shm.h     |    2 +
 sys/arch/xen/include/xenio.h       |   76 +++++
 sys/arch/xen/x86/xen_shm_machdep.c |   18 +-
 sys/arch/xen/xen/gntdev.c          |  618 ++++++++++++++++++++++++++++++++++++
 sys/dev/DEVNAMES                   |    1 +
 sys/rump/librump/rumpkern/devsw.c  |    1 +
 14 files changed, 728 insertions(+), 3 deletions(-)
 create mode 100644 sys/arch/xen/xen/gntdev.c

diff --git a/etc/MAKEDEV.tmpl b/etc/MAKEDEV.tmpl
index 21b0568..00029c6 100644
--- a/etc/MAKEDEV.tmpl
+++ b/etc/MAKEDEV.tmpl
@@ -289,6 +289,7 @@
 #      wsfont* console font control
 #      wsmux*  wscons event multiplexor
 #      xenevt  Xen event interface
+#      gntdev  Xen grant table interface
 #
 # iSCSI communication devices
 #      iscsi*  iSCSI driver and /sbin/iscsid communication
@@ -1020,6 +1021,10 @@ xsd_kva)
        mkdev xsd_kva c %xenevt_chr% 1
        ;;
 
+gntdev)
+       mkdev gntdev c %gntdev_chr% 0
+       ;;
+
 xencons)
        mkdev xencons c %xencons_chr% 0
        ;;
diff --git a/etc/etc.amd64/MAKEDEV.conf b/etc/etc.amd64/MAKEDEV.conf
index a4a831c..5e2098c 100644
--- a/etc/etc.amd64/MAKEDEV.conf
+++ b/etc/etc.amd64/MAKEDEV.conf
@@ -44,5 +44,5 @@ all_md)
        ;;
 
 xen)
-       makedev xenevt xencons xsd_kva
+       makedev xenevt xencons xsd_kva gntdev
        ;;
diff --git a/etc/etc.i386/MAKEDEV.conf b/etc/etc.i386/MAKEDEV.conf
index ba3e2cc..bd38673 100644
--- a/etc/etc.i386/MAKEDEV.conf
+++ b/etc/etc.i386/MAKEDEV.conf
@@ -48,7 +48,7 @@ all_md)
        ;;
 
 xen)
-       makedev xenevt xencons xsd_kva
+       makedev xenevt xencons xsd_kva gntdev
        ;;
 
 floppy)
diff --git a/sys/arch/amd64/conf/XEN3_DOM0 b/sys/arch/amd64/conf/XEN3_DOM0
index e5f9f1f..1807dd2 100644
--- a/sys/arch/amd64/conf/XEN3_DOM0
+++ b/sys/arch/amd64/conf/XEN3_DOM0
@@ -838,6 +838,7 @@ pseudo-device       wsfont
 pseudo-device  drvctl
 
 # xen pseudo-devices
+pseudo-device  gntdev
 pseudo-device  xenevt
 pseudo-device  xvif
 pseudo-device  xbdback
diff --git a/sys/arch/amd64/conf/majors.amd64 b/sys/arch/amd64/conf/majors.amd64
index 9e6b1ac..cf15f7d 100644
--- a/sys/arch/amd64/conf/majors.amd64
+++ b/sys/arch/amd64/conf/majors.amd64
@@ -96,6 +96,7 @@ device-major  nsmb            char 98                 nsmb
 # - they appear in the i386 MAKEDEV
 #
 
+device-major   gntdev          char 140                gntdev
 device-major   xenevt          char 141                xenevt
 device-major   xbd             char 142 block 142      xbd
 device-major   xencons         char 143                xencons
diff --git a/sys/arch/i386/conf/XEN3_DOM0 b/sys/arch/i386/conf/XEN3_DOM0
index 8b5cf99..be28bbc 100644
--- a/sys/arch/i386/conf/XEN3_DOM0
+++ b/sys/arch/i386/conf/XEN3_DOM0
@@ -820,6 +820,7 @@ pseudo-device       wsfont
 pseudo-device  drvctl
 
 # xen pseudo-devices
+pseudo-device  gntdev
 pseudo-device  xenevt
 pseudo-device  xvif
 pseudo-device  xbdback
diff --git a/sys/arch/i386/conf/majors.i386 b/sys/arch/i386/conf/majors.i386
index 38c043f..9aab728 100644
--- a/sys/arch/i386/conf/majors.i386
+++ b/sys/arch/i386/conf/majors.i386
@@ -111,6 +111,7 @@ device-major        mt              char 107 block 24       
mt
 # - they appear in the i386 MAKEDEV
 #
 
+device-major   gntdev          char 140                gntdev
 device-major   xenevt          char 141                xenevt
 device-major   xbd             char 142 block 142      xbd
 device-major   xencons         char 143                xencons
diff --git a/sys/arch/xen/conf/files.xen b/sys/arch/xen/conf/files.xen
index e022db5..91ff858 100644
--- a/sys/arch/xen/conf/files.xen
+++ b/sys/arch/xen/conf/files.xen
@@ -198,6 +198,7 @@ attach      xencons at xendevbus
 file   arch/xen/xen/xencons.c          xencons needs-flag
 
 # Xen event peudo-device
+defpseudo gntdev
 defpseudo xenevt
 defpseudo xvif
 defpseudo xbdback
@@ -390,6 +391,7 @@ include     "dev/pcmcia/files.pcmcia"
 # Domain-0 operations
 defflag        opt_xen.h                       DOM0OPS
 file   arch/xen/xen/privcmd.c          dom0ops
+file   arch/xen/xen/gntdev.c           dom0ops
 file   arch/xen/x86/xen_shm_machdep.c  dom0ops
 file   arch/x86/pci/pci_machdep.c      hypervisor & pci & dom0ops
 file   arch/xen/xen/pci_intr_machdep.c hypervisor & pci
diff --git a/sys/arch/xen/include/xen_shm.h b/sys/arch/xen/include/xen_shm.h
index e2d89d0..a796572 100644
--- a/sys/arch/xen/include/xen_shm.h
+++ b/sys/arch/xen/include/xen_shm.h
@@ -37,6 +37,8 @@
  */
 
 int  xen_shm_map(int, int, grant_ref_t *, vaddr_t *, grant_handle_t *, int);
+int xen_shm_map_multidom(int , int *, grant_ref_t *, vaddr_t *,
+                            grant_handle_t *, int);
 void xen_shm_unmap(vaddr_t, int, grant_handle_t *);
 int xen_shm_callback(int (*)(void *), void *);
 
diff --git a/sys/arch/xen/include/xenio.h b/sys/arch/xen/include/xenio.h
index 6b25733..87cd376 100644
--- a/sys/arch/xen/include/xenio.h
+++ b/sys/arch/xen/include/xenio.h
@@ -122,4 +122,80 @@ typedef struct oprivcmd_hypercall
 /* EVTCHN_UNBIND: Unbind from the specified event-channel port. */
 #define EVTCHN_UNBIND _IOW('E', 3, unsigned long)
 
+/* Interface to /dev/gntdev */
+
+typedef struct ioctl_gntdev_grant_ref {
+    /* The domain ID of the grant to be mapped. */
+    uint32_t domid;
+    /* The grant reference of the grant to be mapped. */
+    uint32_t ref;
+} ioctl_gntdev_grant_ref;
+
+typedef struct ioctl_gntdev_map_grant_ref {
+    /* IN parameters */
+    /* The number of grants to be mapped. */
+    uint32_t count;
+    uint32_t pad; /* explicit padding; not read by the driver in this patch */
+    uint64_t vaddr; /* user virtual address at which the grants get mapped */
+    /* OUT parameters */
+    /* The offset to be used on a subsequent call to mmap(). */
+    uint64_t index;
+    /* Variable IN parameter. */
+    /* Array of grant references, of size @count. */
+    ioctl_gntdev_grant_ref *refs;
+} ioctl_gntdev_map_grant_ref;
+
+typedef struct ioctl_gntdev_unmap_grant_ref {
+    /* IN parameters */
+    /* The offset was returned by the corresponding map operation. */
+    uint64_t index;
+    /* The number of pages to be unmapped. */
+    uint32_t count;
+    uint32_t pad;
+} ioctl_gntdev_unmap_grant_ref;
+
+typedef struct ioctl_gntdev_get_offset_for_vaddr {
+    /* IN parameters */
+    /* The virtual address of the first mapped page in a range. */
+    uint64_t vaddr;
+    /* OUT parameters */
+    /* The offset that was used in the initial mmap() operation. */
+    uint64_t offset;
+    /* The number of pages mapped in the VM area that begins at @vaddr. */
+    uint32_t count;
+    uint32_t pad;
+} ioctl_gntdev_get_offset_for_vaddr;
+
+/*
+ * Inserts the grant references into the mapping table of an instance
+ * of gntdev. N.B. In this implementation the ioctl itself installs the
+ * mapping over the already-mapped range at @vaddr; @index only identifies
+ * the mapping in later ioctls.
+ */
+#define IOCTL_GNTDEV_MAP_GRANT_REF \
+    _IOWR('G', 0, ioctl_gntdev_map_grant_ref)
+
+/*
+ * Removes the grant references from the mapping table of an instance
+ * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
+ * before this ioctl is called, or an error will result.
+ */
+#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
+    _IOW('G', 1, ioctl_gntdev_unmap_grant_ref)
+
+/*
+ * Returns the offset in the driver's address space that corresponds
+ * to @vaddr. This can be used to perform a munmap(), followed by an
+ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
+ * the caller. The number of pages that were allocated at the same time as
+ * @vaddr is returned in @count.
+ *
+ * N.B. Where more than one page has been mapped into a contiguous range, the
+ *      supplied @vaddr must correspond to the start of the range; otherwise
+ *      an error will result. It is only possible to munmap() the entire
+ *      contiguously-allocated range at once, and not any subrange thereof.
+ */
+#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
+    _IOWR('G', 2, ioctl_gntdev_get_offset_for_vaddr)
+
+
 #endif /* __XEN_XENIO_H__ */
diff --git a/sys/arch/xen/x86/xen_shm_machdep.c 
b/sys/arch/xen/x86/xen_shm_machdep.c
index d47745c..b47cc54 100644
--- a/sys/arch/xen/x86/xen_shm_machdep.c
+++ b/sys/arch/xen/x86/xen_shm_machdep.c
@@ -35,6 +35,7 @@ __KERNEL_RCSID(0, "$NetBSD: xen_shm_machdep.c,v 1.10 
2011/09/02 22:25:08 dyoung
 #include <sys/queue.h>
 #include <sys/vmem.h>
 #include <sys/kernel.h>
+#include <sys/malloc.h>
 #include <uvm/uvm.h>
 
 #include <machine/pmap.h>
@@ -120,6 +121,21 @@ int
 xen_shm_map(int nentries, int domid, grant_ref_t *grefp, vaddr_t *vap,
     grant_handle_t *handlep, int flags)
 {
+       int i, rc;
+       int *domids = malloc(sizeof(domid) * nentries, M_DEVBUF,
+                            M_WAITOK | M_ZERO);
+       for(i = 0; i < nentries; i++) {
+               domids[i] = domid;
+       }
+       rc = xen_shm_map_multidom(nentries, domids, grefp, vap, handlep, flags);
+       free(domids, M_DEVBUF);
+       return rc;
+}
+
+int
+xen_shm_map_multidom(int nentries, int *domid, grant_ref_t *grefp,
+       vaddr_t *vap, grant_handle_t *handlep, int flags)
+{
        int s, i;
        vaddr_t new_va;
        vmem_addr_t new_va_pg;
@@ -168,7 +184,7 @@ xen_shm_map(int nentries, int domid, grant_ref_t *grefp, 
vaddr_t *vap,
        new_va = new_va_pg << PAGE_SHIFT;
        for (i = 0; i < nentries; i++) {
                op[i].host_addr = new_va + i * PAGE_SIZE;
-               op[i].dom = domid;
+               op[i].dom = domid[i];
                op[i].ref = grefp[i];
                op[i].flags = GNTMAP_host_map |
                    ((flags & XSHM_RO) ? GNTMAP_readonly : 0);
diff --git a/sys/arch/xen/xen/gntdev.c b/sys/arch/xen/xen/gntdev.c
new file mode 100644
index 0000000..85dd8ec
--- /dev/null
+++ b/sys/arch/xen/xen/gntdev.c
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2012 Roger Pau Monné.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/conf.h>
+
+#include <uvm/uvm.h>
+
+#include <xen/xen_shm.h>
+#include <xen/xenio.h>
+
+void gntdevattach(int n);
+
+/*
+ * Release an M_DEVBUF allocation, ignoring NULL pointers.  Wrapped in
+ * do/while (0) so the macro expands to exactly one statement and cannot
+ * capture an `else' that follows it.
+ */
+#define freem(va) \
+       do { if ((va) != NULL) free((va), M_DEVBUF); } while (0)
+
+#undef GNTDEBUG
+#ifdef GNTDEBUG
+       #define debug(M, ...) \
+               printf("gntdev:%d: " M "\n", __LINE__, ##__VA_ARGS__)
+#else
+       #define debug(M, ...)
+#endif
+
+#define VA_FREE 0
+
+static int gntdev_fioctl(struct file *, u_long, void *);
+static int gntdev_fclose(struct file *);
+
+static const struct fileops gntdev_fileops = { /* passed to fd_clone() in gntdev_open() */
+       .fo_read = fbadop_read,
+       .fo_write = fbadop_write,
+       .fo_ioctl = gntdev_fioctl,      /* dispatches the IOCTL_GNTDEV_* commands */
+       .fo_fcntl = fnullop_fcntl,
+       .fo_poll = fnullop_poll,
+       .fo_stat = fbadop_stat,
+       .fo_close = gntdev_fclose,      /* removes every remaining mapping */
+       .fo_kqfilter = fnullop_kqfilter,
+       .fo_restart = fnullop_restart,
+};
+
+dev_type_open(gntdev_open);
+
+const struct cdevsw gntdev_cdevsw = { /* only open is implemented by this driver */
+       gntdev_open, nullclose, noread, nowrite, noioctl,
+       nostop, notty, nopoll, nommap, nokqfilter, D_OTHER
+};
+
+struct gntmap { /* one set of grant references mapped for a process */
+       struct uvm_object uobj; /* must stay first: pager ops cast uvm_object * to gntmap * */
+       pmap_t pmap;            /* pmap of the vm_map the handler was installed in */
+       LIST_ENTRY(gntmap) next_map;    /* link in gntproc.maps */
+       int index;              /* page index assigned by insert_map() */
+       int count;              /* number of grant pages in this mapping */
+       grant_ref_t *grants;    /* grant references, count entries */
+       int *domids;            /* granting domain of each reference, count entries */
+       vaddr_t va;             /* user virtual address of the mapping */
+       vaddr_t kernel_va;      /* kernel address from xen_shm_map_multidom(), or VA_FREE */
+       grant_handle_t *handles;        /* handles filled in by xen_shm_map_multidom() */
+       bool ro;                /* true when the grants are mapped read-only */
+};
+
+struct gntproc { /* per-open state, stored as the file's f_data */
+       LIST_HEAD(,gntmap) maps;        /* mappings created through this descriptor */
+       kmutex_t lock;                  /* held while walking or changing maps/num_maps */
+       struct lwp *lwp;                /* lwp that opened the device */
+       unsigned int num_maps;          /* number of entries on maps */
+};
+
+struct gntdev { /* driver-wide state */
+       kcondvar_t wait_mem;    /* waited on in map_grant_ref(), broadcast by gntdev_mem_callback() */
+       kmutex_t lock;          /* held around wait_mem and callback_set accesses */
+       bool callback_set;      /* true once xen_shm_callback() is registered, until it fires */
+};
+
+struct gntdev priv; /* single instance, initialised in gntdevattach() */
+
+/* --- UVM handlers prototypes --- */
+
+static int
+gntmap_fault(struct uvm_faultinfo *, vaddr_t, struct vm_page **,
+    int, int, vm_prot_t, int);
+static void
+gntmap_reference(struct uvm_object *);
+static void
+gntmap_detach(struct uvm_object *);
+static int
+map_grant_ref(struct gntmap *map);
+
+static struct uvm_pagerops gntops = { /* pager ops given to uvm_obj_init() for gntmap.uobj */
+  .pgo_reference = gntmap_reference,
+  .pgo_detach = gntmap_detach,
+  .pgo_fault = gntmap_fault,
+};
+
+/* --- Helper prototypes --- */
+
+static int
+gntdev_mem_callback(void *arg);
+static void
+insert_map(struct gntproc *proc, struct gntmap *map);
+static struct gntmap *
+find_map(struct gntproc *proc, int index, int count);
+static struct gntmap *
+find_vaddr(struct gntproc *proc, vaddr_t va);
+static void
+remove_map(struct gntproc *proc, struct gntmap *map);
+
+/* --- UVM handlers --- */
+
+/*
+ * Replace whatever is mapped at [start, start + map->count pages) in vmap
+ * with a fixed mapping backed by map->uobj, so that faults on the range are
+ * served by gntmap_fault().
+ *
+ * Returns 0 on success or an errno value.
+ */
+static int
+gntdev_install_handler(struct vm_map *vmap, vaddr_t start,
+       struct gntmap *map)
+{
+       int rc;
+       uvm_flag_t uvmflag;
+       vaddr_t newstart = start;
+       vm_prot_t prot;
+       off_t size = ((off_t)map->count << PGSHIFT);
+
+       /* remove current entries */
+       uvm_unmap(vmap, start, start + size);
+
+       /*
+        * Record the pmap before mapping: gntmap_detach() dereferences
+        * map->pmap and is invoked on the uvm_map() failure path below.
+        */
+       map->pmap = vm_map_pmap(vmap);
+
+       uvm_obj_init(&map->uobj, &gntops, true, 1);
+       prot = map->ro ? VM_PROT_READ : VM_PROT_READ | VM_PROT_WRITE;
+       uvmflag = UVM_MAPFLAG(prot, prot, UVM_INH_NONE, UVM_ADV_NORMAL,
+                             UVM_FLAG_FIXED | UVM_FLAG_NOMERGE);
+       rc = uvm_map(vmap, &newstart, size, &map->uobj, 0, 0, uvmflag);
+       if (rc) {
+               debug("uvm_map failed");
+               /* drop the reference taken by uvm_obj_init() */
+               map->uobj.pgops->pgo_detach(&map->uobj);
+               return rc;
+       }
+       if (newstart != start) {
+               debug("uvm_map didn't give us back our vm space");
+               /* do not leave an entry behind that points at map->uobj */
+               uvm_unmap(vmap, newstart, newstart + size);
+               return EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * pgo_fault handler for gntmap objects.
+ *
+ * For every requested page, looks up the machine address behind the
+ * kernel mapping of the grant (map->kernel_va) and enters it into the
+ * faulting pmap at the matching user address.
+ *
+ * The fault state is unlocked with uvmfault_unlockall() on every return
+ * path.  Returns 0, ERESTART when pmap_enter_ma() ran out of memory (after
+ * waiting in uvm_wait()), or another errno value.
+ */
+static int
+gntmap_fault(struct uvm_faultinfo *ufi, vaddr_t vaddr, struct vm_page **pps,
+    int npages, int centeridx, vm_prot_t access_type, int flags)
+{
+       struct vm_map_entry *entry = ufi->entry;
+       struct uvm_object *uobj = entry->object.uvm_obj;
+       struct gntmap *map = (struct gntmap*)entry->object.uvm_obj;
+       pmap_t pmap = ufi->orig_map->pmap;
+       int index, i, rc = 0;
+       vaddr_t u_va, k_va;
+       vm_prot_t prot;
+       paddr_t ma;
+
+       /* compute offset from start of map */
+       index = (entry->offset + (vaddr - entry->start)) >> PAGE_SHIFT;
+       if (index + npages > map->count) {
+               rc = EINVAL;
+               goto out;
+       }
+
+       prot = map->ro ? VM_PROT_READ : (VM_PROT_READ | VM_PROT_WRITE);
+       for (i = 0; i < npages; i++, index++) {
+               if ((flags & PGO_ALLPAGES) == 0 && i != centeridx)
+                       continue;
+               if (pps[i] == PGO_DONTCARE)
+                       continue;
+
+               u_va = map->va + (index * PAGE_SIZE);
+               k_va = map->kernel_va + (index * PAGE_SIZE);
+               if (pmap_extract_ma(pmap_kernel(), k_va, &ma) == false) {
+                       debug("unable to extract kernel MA");
+                       /* leave through the common unlock path */
+                       rc = EFAULT;
+                       break;
+               }
+               rc = pmap_enter_ma(pmap, u_va, ma, 0, prot, PMAP_CANFAIL,
+                                  map->domids[index]);
+               if (rc == ENOMEM) {
+                       debug("pmap_enter_ma: ENOMEM");
+                       rc = ERESTART;
+                       break;
+               }
+               if (rc) {
+                       /* XXX for proper ptp accountings */
+                       debug("pmap_enter_ma: failed");
+                       pmap_remove(pmap, u_va, u_va + PAGE_SIZE);
+               }
+       }
+out:
+       pmap_update(pmap);
+       uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj);
+
+       if (rc == ERESTART) {
+               uvm_wait("privpgop_fault");
+       }
+       return rc;
+}
+
+/* pgo_reference: add one reference to the object while holding its lock. */
+static void
+gntmap_reference(struct uvm_object *uobj)
+{
+       kmutex_t *objlock = uobj->vmobjlock;
+
+       mutex_enter(objlock);
+       uobj->uo_refs += 1;
+       mutex_exit(objlock);
+}
+
+/*
+ * pgo_detach: drop one reference.  When no reference is left, remove the
+ * user mappings of the whole range from the recorded pmap.
+ */
+static void
+gntmap_detach(struct uvm_object *uobj)
+{
+       struct gntmap *gm = (struct gntmap *)uobj;
+       kmutex_t *objlock = uobj->vmobjlock;
+       bool still_referenced;
+
+       mutex_enter(objlock);
+       uobj->uo_refs -= 1;
+       still_referenced = (uobj->uo_refs > 0);
+       mutex_exit(objlock);
+
+       if (still_referenced)
+               return;
+
+       pmap_remove(gm->pmap, gm->va, gm->va + (gm->count * PAGE_SIZE));
+       pmap_update(gm->pmap);
+}
+
+/* --- Internal Helpers --- */
+
+/*
+ * Callback registered through xen_shm_callback(): clears the "callback
+ * registered" flag and wakes every thread sleeping on priv.wait_mem.
+ * Always returns 0; arg is unused.
+ */
+static int
+gntdev_mem_callback(void *arg)
+{
+       mutex_enter(&priv.lock);
+       priv.callback_set = false;
+       cv_broadcast(&priv.wait_mem);
+       mutex_exit(&priv.lock);
+
+       return 0;
+}
+
+/*
+ * Link map into proc->maps and pick its index.  The list is walked from
+ * the head; the new entry goes in front of the first element that leaves
+ * room for map->count pages after the current candidate index, and every
+ * element that is skipped moves the candidate index to just past it.
+ */
+static void
+insert_map(struct gntproc *proc, struct gntmap *map)
+{
+       struct gntmap *cur, *last = NULL;
+
+       mutex_enter(&proc->lock);
+       proc->num_maps++;
+
+       for (cur = LIST_FIRST(&proc->maps); cur != NULL;
+           cur = LIST_NEXT(cur, next_map)) {
+               if (map->index + map->count < cur->index)
+                       break;
+               map->index = cur->index + cur->count;
+               last = cur;
+       }
+
+       if (cur != NULL) {
+               /* found a gap in front of cur */
+               LIST_INSERT_BEFORE(cur, map, next_map);
+       } else if (last != NULL) {
+               /* walked off the end: append after the last element */
+               LIST_INSERT_AFTER(last, map, next_map);
+       } else {
+               /* the list was empty */
+               LIST_INSERT_HEAD(&proc->maps, map, next_map);
+       }
+
+       mutex_exit(&proc->lock);
+}
+
+/*
+ * Return the mapping of proc whose index equals index, or NULL.  A
+ * non-zero count must additionally match the mapping's page count; a count
+ * of zero matches any size.
+ */
+static struct gntmap *
+find_map(struct gntproc *proc, int index, int count)
+{
+       struct gntmap *cur, *found = NULL;
+
+       mutex_enter(&proc->lock);
+       LIST_FOREACH(cur, &proc->maps, next_map) {
+               if (cur->index == index &&
+                   (count == 0 || cur->count == count)) {
+                       found = cur;
+                       break;
+               }
+       }
+       mutex_exit(&proc->lock);
+
+       return found;
+}
+
+/*
+ * Return the mapping of proc whose user range [va, va + count pages)
+ * contains the given address, or NULL when there is none.
+ */
+static struct gntmap *
+find_vaddr(struct gntproc *proc, vaddr_t va)
+{
+       struct gntmap *cur, *found = NULL;
+
+       mutex_enter(&proc->lock);
+       LIST_FOREACH(cur, &proc->maps, next_map) {
+               if (va < cur->va)
+                       continue;
+               if (va < (cur->va + (cur->count * PAGE_SIZE))) {
+                       found = cur;
+                       break;
+               }
+       }
+       mutex_exit(&proc->lock);
+
+       return found;
+}
+
+/*
+ * Unlink map from proc and release everything it owns: the kernel-side
+ * grant mapping and the uvm object (when the grants are still mapped), the
+ * three per-grant arrays, and the map structure itself.
+ */
+static void
+remove_map(struct gntproc *proc, struct gntmap *map)
+{
+       bool mapped;
+
+       mutex_enter(&proc->lock);
+       proc->num_maps--;
+       LIST_REMOVE(map, next_map);
+       mutex_exit(&proc->lock);
+
+       mapped = (map->kernel_va != VA_FREE);
+       if (mapped) {
+               xen_shm_unmap(map->kernel_va, map->count, map->handles);
+               map->kernel_va = VA_FREE;
+               uvm_obj_destroy(&map->uobj, true);
+       }
+
+       free(map->domids, M_DEVBUF);
+       free(map->handles, M_DEVBUF);
+       free(map->grants, M_DEVBUF);
+       free(map, M_DEVBUF);
+}
+
+/*
+ * Map the grants described by map into kernel virtual address space and
+ * store the resulting address in map->kernel_va.
+ *
+ * When xen_shm_map_multidom() reports ENOMEM, a wake-up callback is
+ * registered (once) with xen_shm_callback(), the caller sleeps on
+ * priv.wait_mem, and the mapping is then attempted again.
+ *
+ * Returns 0 on success, or the error from xen_shm_map_multidom() for any
+ * failure other than ENOMEM.
+ */
+static int
+map_grant_ref(struct gntmap *map)
+{
+       int rc;
+       vaddr_t k_va;
+
+       do {
+               debug("mapping grefs with index %d", map->index);
+               rc = xen_shm_map_multidom(map->count, map->domids,
+                   map->grants, &k_va, map->handles,
+                   map->ro ? XSHM_RO : 0);
+               switch (rc) {
+               case 0:
+                       /* All good */
+                       map->kernel_va = k_va;
+                       break;
+               case ENOMEM:
+                       mutex_enter(&priv.lock);
+                       debug("xen_shm_map_multidom out of memory");
+                       if (!priv.callback_set) {
+                               /* Register callback */
+                               if (xen_shm_callback(gntdev_mem_callback,
+                                   NULL) != 0) {
+                                       mutex_exit(&priv.lock);
+                                       panic("ioctl_map_grant_ref: xen_shm_callback failed");
+                               }
+                               priv.callback_set = true;
+                       }
+                       cv_wait(&priv.wait_mem, &priv.lock);
+                       mutex_exit(&priv.lock);
+                       /* go round the loop again instead of failing */
+                       break;
+               default:
+                       debug("xen_shm_map_multidom failed");
+                       return rc;
+               }
+       } while (rc == ENOMEM);
+
+       return 0;
+}
+
+/* --- ioctl handlers --- */
+
+/*
+ * IOCTL_GNTDEV_MAP_GRANT_REF handler.
+ *
+ * Copies in map_grants->count (domid, ref) pairs from the user array
+ * map_grants->refs, maps them into the kernel with map_grant_ref(),
+ * installs the fault handler over the user range that starts at the page
+ * containing map_grants->vaddr, and records the mapping in proc.  The user
+ * range must already be mapped readable; the grants are mapped read-only
+ * unless the range is also writable.
+ *
+ * On success returns 0 and stores the mapping's index, shifted left by
+ * PAGE_SHIFT, in map_grants->index.  Otherwise returns an errno value and
+ * leaves no state behind.
+ */
+static int
+ioctl_map_grant_ref(struct gntproc *proc,
+       ioctl_gntdev_map_grant_ref *map_grants)
+{
+       grant_ref_t *refs = NULL;
+       grant_handle_t *handles = NULL;
+       int *domids = NULL;
+       struct gntmap *map = NULL;
+       struct vm_map *vmm;
+       ioctl_gntdev_grant_ref ioctl_map;
+       int i, count, rc;
+       vaddr_t va0, va_last;
+
+       /*
+        * count comes straight from userland.  Reject zero, and anything so
+        * large that the size in bytes (count << PGSHIFT) would not fit in
+        * the int fields and int arithmetic used for it in this file.
+        */
+       if (map_grants->count == 0 ||
+           map_grants->count > (0x7fffffffU >> PGSHIFT)) {
+               debug("invalid number of grant refs");
+               return EINVAL;
+       }
+       count = (int)map_grants->count;
+
+       if (find_vaddr(proc, map_grants->vaddr)) {
+               debug("memory area already in use");
+               rc = EINVAL;
+               goto error;
+       }
+
+       debug("mapping %d refs", map_grants->count);
+
+       refs = malloc(sizeof(*refs) * count, M_DEVBUF, M_WAITOK | M_ZERO);
+       handles = malloc(sizeof(*handles) * count, M_DEVBUF,
+                    M_WAITOK | M_ZERO);
+       domids = malloc(sizeof(*domids) * count, M_DEVBUF,
+                    M_WAITOK | M_ZERO);
+
+       for (i = 0; i < count; i++) {
+               rc = copyin(&map_grants->refs[i], &ioctl_map,
+                   sizeof(ioctl_map));
+               if (rc != 0) {
+                       debug("unable to copyin grant ref info %d", i);
+                       goto error;
+               }
+               debug("mapping ref: %u Dom: %u", ioctl_map.ref,
+                   ioctl_map.domid);
+               refs[i] = ioctl_map.ref;
+               domids[i] = ioctl_map.domid;
+       }
+       map = malloc(sizeof(*map), M_DEVBUF, M_WAITOK | M_ZERO);
+       vmm = &proc->lwp->l_proc->p_vmspace->vm_map;
+       va0 = map_grants->vaddr & ~PAGE_MASK;
+       va_last = va0 + ((vaddr_t)count << PGSHIFT) - 1;
+       vm_map_lock_read(vmm);
+       if (uvm_map_checkprot(vmm, va0, va_last, VM_PROT_WRITE)) {
+               map->ro = false;
+               debug("map RW");
+       } else if (uvm_map_checkprot(vmm, va0, va_last, VM_PROT_READ)) {
+               map->ro = true;
+               debug("map RO");
+       } else {
+               debug("unable check protection");
+               vm_map_unlock_read(vmm);
+               /* rc is still 0 from the copyin loop: report the failure */
+               rc = EFAULT;
+               goto error;
+       }
+       vm_map_unlock_read(vmm);
+       map->grants = refs;
+       map->handles = handles;
+       map->domids = domids;
+       /* install at the same page-aligned range that was just checked */
+       map->va = va0;
+       map->kernel_va = VA_FREE;
+       map->count = count;
+       map->index = 0;
+
+       rc = map_grant_ref(map);
+       if (rc) {
+               debug("map_grant_ref failed");
+               goto error;
+       }
+
+       rc = gntdev_install_handler(vmm, map->va, map);
+       if (rc) {
+               debug("gntdev_install_handler failed");
+               xen_shm_unmap(map->kernel_va, map->count, map->handles);
+               map->kernel_va = VA_FREE;
+               /* the object was set up by gntdev_install_handler() */
+               uvm_obj_destroy(&map->uobj, true);
+               goto error;
+       }
+       insert_map(proc, map);
+       map_grants->index = map->index << PAGE_SHIFT;
+       debug("gntrefs mapped at index %" PRIu64 "", map_grants->index);
+       return 0;
+
+error:
+       freem(refs);
+       freem(handles);
+       freem(domids);
+       freem(map);
+       debug("unable to map grant refs");
+       return rc;
+}
+
+/*
+ * IOCTL_GNTDEV_UNMAP_GRANT_REF handler: look the mapping up by the index
+ * and count supplied by userland and remove it.  Returns EINVAL when no
+ * such mapping exists or when its uvm object still has references;
+ * returns 0 otherwise.
+ */
+static int
+ioctl_unmap_grant_ref(struct gntproc *proc,
+       ioctl_gntdev_unmap_grant_ref *unmap_grants)
+{
+       struct gntmap *map;
+       uint64_t index = unmap_grants->index >> PAGE_SHIFT;
+       bool referenced;
+
+       debug("unmapping index %" PRIu64, index);
+
+       map = find_map(proc, index, unmap_grants->count);
+       if (map == NULL) {
+               debug("unable to find index %" PRIu64, index);
+               return EINVAL;
+       }
+
+       mutex_enter(map->uobj.vmobjlock);
+       referenced = (map->uobj.uo_refs != 0);
+       mutex_exit(map->uobj.vmobjlock);
+
+       if (referenced) {
+               debug("trying to remove a referenced map");
+               return EINVAL;
+       }
+
+       remove_map(proc, map);
+       return 0;
+}
+
+/*
+ * IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR handler: find the mapping that
+ * contains offset_vaddr->vaddr and report its offset (index shifted by
+ * PAGE_SHIFT) and page count.  Returns EINVAL when no mapping matches.
+ */
+static int
+ioctl_get_offset_vaddr(struct gntproc *proc,
+       ioctl_gntdev_get_offset_for_vaddr *offset_vaddr)
+{
+       struct gntmap *hit;
+
+       debug("find offset va: %p", (void *)offset_vaddr->vaddr);
+
+       hit = find_vaddr(proc, offset_vaddr->vaddr);
+       if (hit == NULL) {
+               debug("unable to find vaddr");
+               return EINVAL;
+       }
+
+       offset_vaddr->count = hit->count;
+       offset_vaddr->offset = hit->index << PAGE_SHIFT;
+       return 0;
+}
+
+/* --- Device ops handlers --- */
+
+/*
+ * fo_ioctl entry point: hand the argument buffer to the handler for the
+ * given command.  Unknown commands return EINVAL.
+ */
+static int
+gntdev_fioctl(struct file *fp, u_long cmd, void *addr)
+{
+       struct gntproc *proc = fp->f_data;
+
+       switch (cmd) {
+       case IOCTL_GNTDEV_MAP_GRANT_REF:
+               return ioctl_map_grant_ref(proc, addr);
+       case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+               return ioctl_unmap_grant_ref(proc, addr);
+       case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+               return ioctl_get_offset_vaddr(proc, addr);
+       default:
+               return EINVAL;
+       }
+}
+
+/*
+ * Device open: allocate a file descriptor, create the per-open gntproc
+ * state and attach it to the descriptor through fd_clone().
+ */
+int
+gntdev_open(dev_t dev, int flags, int mode, struct lwp *l)
+{
+       struct gntproc *state;
+       struct file *fp;
+       int fd, error;
+
+       error = fd_allocfile(&fp, &fd);
+       if (error != 0)
+               return error;
+
+       state = malloc(sizeof(*state), M_DEVBUF, M_WAITOK | M_ZERO);
+       state->lwp = l;
+       state->num_maps = 0;
+       LIST_INIT(&state->maps);
+       mutex_init(&state->lock, MUTEX_DEFAULT, IPL_NONE);
+       debug("opened for proc %p", l);
+
+       return fd_clone(fp, fd, flags, &gntdev_fileops, state);
+}
+
+/*
+ * fo_close entry point: remove every mapping still on the list, then
+ * destroy the lock and free the per-open state.  The list lock is dropped
+ * around each remove_map() call because remove_map() takes it itself.
+ */
+static int
+gntdev_fclose(struct file *fp)
+{
+       struct gntproc *proc = fp->f_data;
+       struct gntmap *head;
+
+       mutex_enter(&proc->lock);
+       while ((head = LIST_FIRST(&proc->maps)) != NULL) {
+               mutex_exit(&proc->lock);
+               remove_map(proc, head);
+               mutex_enter(&proc->lock);
+       }
+       KASSERT(proc->num_maps == 0);
+       mutex_exit(&proc->lock);
+
+       mutex_destroy(&proc->lock);
+       debug("closed device for proc %p", proc->lwp);
+       free(proc, M_DEVBUF);
+       return 0;
+}
+
+/*
+ * Pseudo-device attach hook: set up the driver-wide lock, condition
+ * variable and callback flag held in priv.  The argument n is unused.
+ */
+void
+gntdevattach(int n)
+{
+       priv.callback_set = false;
+       mutex_init(&priv.lock, MUTEX_DEFAULT, IPL_VM);
+       cv_init(&priv.wait_mem, "gntdev");
+       debug("attached");
+}
diff --git a/sys/dev/DEVNAMES b/sys/dev/DEVNAMES
index 45cf018..765fe45 100644
--- a/sys/dev/DEVNAMES
+++ b/sys/dev/DEVNAMES
@@ -1517,6 +1517,7 @@ xdc                       MI
 xdc                    sun3
 xe                     next68k
 xel                    x68k
+gntdev                 xen
 xencons                        xen
 xenevt                 xen
 xennet                 xen
diff --git a/sys/rump/librump/rumpkern/devsw.c 
b/sys/rump/librump/rumpkern/devsw.c
index 5a1af01..e513885 100644
--- a/sys/rump/librump/rumpkern/devsw.c
+++ b/sys/rump/librump/rumpkern/devsw.c
@@ -134,6 +134,7 @@ struct devsw_conv devsw_conv0[] = {
        { "rd", 22, 105, DEVNODE_DONTBOTHER, 0, { 0, 0 }},
        { "ct", 23, 106, DEVNODE_DONTBOTHER, 0, { 0, 0 }},
        { "mt", 24, 107, DEVNODE_DONTBOTHER, 0, { 0, 0 }},
+       { "gntdev", -1, 140, DEVNODE_DONTBOTHER, 0, { 0, 0 }},
        { "xenevt", -1, 141, DEVNODE_DONTBOTHER, 0, { 0, 0 }},
        { "xbd", 142, 142, DEVNODE_DONTBOTHER, 0, { 0, 0 }},
        { "xencons", -1, 143, DEVNODE_DONTBOTHER, 0, { 0, 0 }},
-- 
1.7.7.5 (Apple Git-26)



Home | Main Index | Thread Index | Old Index