[src/trunk]: src Improvements and fixes in NVMM.

To: source-changes-hg%NetBSD.org@localhost
Subject: [src/trunk]: src Improvements and fixes in NVMM.
From: maxv <maxv%NetBSD.org@localhost>
Date: Sun, 06 Jan 2019 18:54:26 +0000
details:   https://anonhg.NetBSD.org/src/rev/68cee890c922
branches:  trunk
changeset: 447283:68cee890c922
user:      maxv <maxv%NetBSD.org@localhost>
date:      Sun Jan 06 16:10:51 2019 +0000

description:
Improvements and fixes in NVMM.

Kernel driver:

 * Don't take an extra (unneeded) reference to the UAO.

 * Provide npc for HLT. I'm not really happy with it right now; it will
   likely be revisited.

 * Add the INT_SHADOW, INT_WINDOW_EXIT and NMI_WINDOW_EXIT states. Provide
   them in the exitstate too.

 * Don't take the TPR into account when processing INTs. The virtualizer
   can do that itself (Qemu already does).

 * Provide a hypervisor signature in CPUID, and hide SVM.

 * Ignore certain MSRs. One special case is MSR_NB_CFG, in which we set
   NB_CFG_INITAPICCPUIDLO. Allow reads of MSR_TSC.

 * If the LWP has pending signals or softints, leave, rather than waiting
   for a rescheduling to happen later. This reduces interrupt processing
   time in the guest (Qemu sends a signal to the thread, and now we leave
   right away). This could be improved even more by sending an actual IPI
   to the CPU, but I'll see later.

Libnvmm:

 * Fix the MMU translation of large pages: the lower bits of the GVA
   must be added to the frame address too.

 * Change the IO and Mem structures to take a pointer rather than a
   static array. This provides more flexibility.

 * Batch together the str+rep IO transactions. We do one big memory
   read/write, and then send the IO commands to the hypervisor all at
   once. This considerably increases performance.

 * Decode MOVZX.

With these changes in place, Qemu+NVMM works. I can install NetBSD 8.0
in a VM with multiple VCPUs, connect to the network, etc.

diffstat:

 lib/libnvmm/libnvmm.3           |   12 +-
 lib/libnvmm/libnvmm_x86.c       |  206 ++++++++++++++++++++++++++++++++++++++-
 lib/libnvmm/nvmm.h              |    6 +-
 sys/dev/nvmm/nvmm.c             |    7 +-
 sys/dev/nvmm/nvmm.h             |    7 +-
 sys/dev/nvmm/x86/nvmm_x86.h     |   14 +-
 sys/dev/nvmm/x86/nvmm_x86_svm.c |  146 +++++++++++++++++++++++----
 7 files changed, 344 insertions(+), 54 deletions(-)

diffs (truncated from 877 to 300 lines):

diff -r b2f389174e42 -r 68cee890c922 lib/libnvmm/libnvmm.3
--- a/lib/libnvmm/libnvmm.3     Sun Jan 06 15:37:17 2019 +0000
+++ b/lib/libnvmm/libnvmm.3     Sun Jan 06 16:10:51 2019 +0000
@@ -1,4 +1,4 @@
-.\"    $NetBSD: libnvmm.3,v 1.6 2018/12/27 07:22:31 maxv Exp $
+.\"    $NetBSD: libnvmm.3,v 1.7 2019/01/06 16:10:51 maxv Exp $
 .\"
 .\" Copyright (c) 2018 The NetBSD Foundation, Inc.
 .\" All rights reserved.
@@ -27,7 +27,7 @@
 .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
-.Dd December 26, 2018
+.Dd January 06, 2019
 .Dt LIBNVMM 3
 .Os
 .Sh NAME
@@ -242,8 +242,6 @@
 .Fa cpuid
 from machine
 .Fa mach .
-.Fa cb
-will be called to handle the transaction.
 See
 .Sx I/O Assist
 below for details.
@@ -255,8 +253,6 @@
 .Fa cpuid
 from machine
 .Fa mach .
-.Fa cb
-will be called to handle the transaction.
 See
 .Sx Mem Assist
 below for details.
@@ -415,7 +411,7 @@
        uint64_t port;
        bool in;
        size_t size;
-       uint8_t data[8];
+       uint8_t *data;
 };
 .Ed
 .Pp
@@ -463,7 +459,7 @@
        gpaddr_t gpa;
        bool write;
        size_t size;
-       uint8_t data[8];
+       uint8_t *data;
 };
 .Ed
 .Pp
diff -r b2f389174e42 -r 68cee890c922 lib/libnvmm/libnvmm_x86.c
--- a/lib/libnvmm/libnvmm_x86.c Sun Jan 06 15:37:17 2019 +0000
+++ b/lib/libnvmm/libnvmm_x86.c Sun Jan 06 16:10:51 2019 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: libnvmm_x86.c,v 1.9 2019/01/04 10:25:39 maxv Exp $     */
+/*     $NetBSD: libnvmm_x86.c,v 1.10 2019/01/06 16:10:51 maxv Exp $    */
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -45,6 +45,8 @@
 
 #include "nvmm.h"
 
+#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
+
 #include <x86/specialreg.h>
 
 extern struct nvmm_callbacks __callbacks;
@@ -83,6 +85,11 @@
                    (void *)state.segs[i].limit,
                    state.segs[i].attrib.p, state.segs[i].attrib.def32);
        }
+       printf("| -> MSR_EFER=%p\n", (void *)state.msrs[NVMM_X64_MSR_EFER]);
+       printf("| -> CR0=%p\n", (void *)state.crs[NVMM_X64_CR_CR0]);
+       printf("| -> CR3=%p\n", (void *)state.crs[NVMM_X64_CR_CR3]);
+       printf("| -> CR4=%p\n", (void *)state.crs[NVMM_X64_CR_CR4]);
+       printf("| -> CR8=%p\n", (void *)state.crs[NVMM_X64_CR_CR8]);
        printf("| -> CPL=%p\n", (void *)state.misc[NVMM_X64_MISC_CPL]);
 
        return 0;
@@ -131,6 +138,7 @@
                return -1;
        if (pte & PG_PS) {
                *gpa = (pte & PTE32_L2_FRAME);
+               *gpa = *gpa + (gva & PTE32_L1_MASK);
                return 0;
        }
 
@@ -215,6 +223,7 @@
                return -1;
        if (pte & PG_PS) {
                *gpa = (pte & PTE32_PAE_L2_FRAME);
+               *gpa = *gpa + (gva & PTE32_PAE_L1_MASK);
                return 0;
        }
 
@@ -320,6 +329,7 @@
                return -1;
        if (pte & PG_PS) {
                *gpa = (pte & PTE64_L3_FRAME);
+               *gpa = *gpa + (gva & (PTE64_L2_MASK|PTE64_L1_MASK));
                return 0;
        }
 
@@ -341,6 +351,7 @@
                return -1;
        if (pte & PG_PS) {
                *gpa = (pte & PTE64_L2_FRAME);
+               *gpa = *gpa + (gva & PTE64_L1_MASK);
                return 0;
        }
 
@@ -500,13 +511,34 @@
 }
 
 static uint64_t
+rep_get_cnt(struct nvmm_x64_state *state, size_t adsize)
+{
+       uint64_t mask, cnt;
+
+       mask = mask_from_adsize(adsize);
+       cnt = state->gprs[NVMM_X64_GPR_RCX] & mask;
+
+       return cnt;
+}
+
+static void
+rep_set_cnt(struct nvmm_x64_state *state, size_t adsize, uint64_t cnt)
+{
+       uint64_t mask;
+
+       mask = mask_from_adsize(adsize);
+       state->gprs[NVMM_X64_GPR_RCX] &= ~mask;
+       state->gprs[NVMM_X64_GPR_RCX] |= cnt;
+}
+
+static uint64_t
 rep_dec_apply(struct nvmm_x64_state *state, size_t adsize)
 {
        uint64_t mask, cnt;
 
        mask = mask_from_adsize(adsize);
 
-       cnt = state->gprs[NVMM_X64_GPR_RCX] & mask; 
+       cnt = state->gprs[NVMM_X64_GPR_RCX] & mask;
        cnt -= 1;
        cnt &= mask;
 
@@ -521,6 +553,7 @@
     gvaddr_t gva, uint8_t *data, size_t size)
 {
        struct nvmm_mem mem;
+       uint8_t membuf[8];
        nvmm_prot_t prot;
        gpaddr_t gpa;
        uintptr_t hva;
@@ -547,6 +580,7 @@
        is_mmio = (ret == -1);
 
        if (is_mmio) {
+               mem.data = membuf;
                mem.gva = gva;
                mem.gpa = gpa;
                mem.write = false;
@@ -572,6 +606,7 @@
     gvaddr_t gva, uint8_t *data, size_t size)
 {
        struct nvmm_mem mem;
+       uint8_t membuf[8];
        nvmm_prot_t prot;
        gpaddr_t gpa;
        uintptr_t hva;
@@ -598,6 +633,7 @@
        is_mmio = (ret == -1);
 
        if (is_mmio) {
+               mem.data = membuf;
                mem.gva = gva;
                mem.gpa = gpa;
                mem.write = true;
@@ -622,16 +658,55 @@
 
 static int fetch_segment(struct nvmm_machine *, struct nvmm_x64_state *);
 
+#define NVMM_IO_BATCH_SIZE     32
+
+static int
+assist_io_batch(struct nvmm_machine *mach, struct nvmm_x64_state *state,
+    struct nvmm_io *io, gvaddr_t gva, uint64_t cnt)
+{
+       uint8_t iobuf[NVMM_IO_BATCH_SIZE];
+       size_t i, iosize, iocnt;
+       int ret;
+
+       cnt = MIN(cnt, NVMM_IO_BATCH_SIZE);
+       iosize = MIN(io->size * cnt, NVMM_IO_BATCH_SIZE);
+       iocnt = iosize / io->size;
+
+       io->data = iobuf;
+
+       if (!io->in) {
+               ret = read_guest_memory(mach, state, gva, iobuf, iosize);
+               if (ret == -1)
+                       return -1;
+       }
+
+       for (i = 0; i < iocnt; i++) {
+               (*__callbacks.io)(io);
+               io->data += io->size;
+       }
+
+       if (io->in) {
+               ret = write_guest_memory(mach, state, gva, iobuf, iosize);
+               if (ret == -1)
+                       return -1;
+       }
+
+       return iocnt;
+}
+
 int
 nvmm_assist_io(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
     struct nvmm_exit *exit)
 {
        struct nvmm_x64_state state;
        struct nvmm_io io;
-       uint64_t cnt;
+       uint64_t cnt = 0; /* GCC */
+       uint8_t iobuf[8];
+       int iocnt = 1;
        gvaddr_t gva;
        int reg = 0; /* GCC */
        int ret, seg;
+       bool psld = false;
 
        if (__predict_false(exit->reason != NVMM_EXIT_IO)) {
                errno = EINVAL;
@@ -641,6 +716,7 @@
        io.port = exit->u.io.port;
        io.in = (exit->u.io.type == NVMM_EXIT_IO_IN);
        io.size = exit->u.io.operand_size;
+       io.data = iobuf;
 
        ret = nvmm_vcpu_getstate(mach, cpuid, &state,
            NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
@@ -648,6 +724,17 @@
        if (ret == -1)
                return -1;
 
+       if (exit->u.io.rep) {
+               cnt = rep_get_cnt(&state, exit->u.io.address_size);
+               if (__predict_false(cnt == 0)) {
+                       return 0;
+               }
+       }
+
+       if (__predict_false(state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D)) {
+               psld = true;
+       }
+
        /*
         * Determine GVA.
         */
@@ -678,6 +765,13 @@
                        if (ret == -1)
                                return -1;
                }
+
+               if (exit->u.io.rep && !psld) {
+                       iocnt = assist_io_batch(mach, &state, &io, gva, cnt);
+                       if (iocnt == -1)
+                               return -1;
+                       goto done;
+               }
        }
 
        if (!io.in) {
@@ -704,16 +798,18 @@
                }
        }
 
+done:
        if (exit->u.io.str) {
-               if (state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) {
-                       state.gprs[reg] -= io.size;
+               if (__predict_false(psld)) {
+                       state.gprs[reg] -= iocnt * io.size;
                } else {
-                       state.gprs[reg] += io.size;
+                       state.gprs[reg] += iocnt * io.size;
                }
        }
 
        if (exit->u.io.rep) {
-               cnt = rep_dec_apply(&state, exit->u.io.address_size);
+               cnt -= iocnt;
Prev by Date: [src/trunk]: src/sys/arch/x86/x86 restore original now that weak symbols are ...
Next by Date: [src/trunk]: src Handle the NVMM signature.
Previous by Thread: [src/trunk]: src/sys/arch/x86/x86 restore original now that weak symbols are ...
Next by Thread: [src/trunk]: src Handle the NVMM signature.
Indexes:
Home | Main Index | Thread Index | Old Index