Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/arch Unmap the kernel heap from the user page tables (SVS).



details:   https://anonhg.NetBSD.org/src/rev/8a2235ae0c86
branches:  trunk
changeset: 829123:8a2235ae0c86
user:      maxv <maxv%NetBSD.org@localhost>
date:      Thu Jan 18 07:25:34 2018 +0000

description:
Unmap the kernel heap from the user page tables (SVS).

This implementation is optimized and organized in such a way that we
don't need to copy the kernel stack to a safe place during user<->kernel
transitions. We create two VAs that point to the same physical page: one
is mapped in userland and is offset so that it contains only the
trapframe; the other is mapped in the kernel and maps the entire stack.

Sent on tech-kern@ a week ago.

diffstat:

 sys/arch/amd64/amd64/amd64_trap.S |    8 +-
 sys/arch/amd64/amd64/genassym.cf  |    5 +-
 sys/arch/amd64/amd64/locore.S     |   29 +++++-
 sys/arch/amd64/amd64/machdep.c    |  164 ++++++++++++++++++++++++++++++++-----
 sys/arch/amd64/include/frameasm.h |   55 +++++++----
 sys/arch/x86/include/cpu.h        |    7 +-
 sys/arch/x86/include/pmap.h       |    6 +-
 sys/arch/x86/x86/vm_machdep.c     |   10 +-
 8 files changed, 224 insertions(+), 60 deletions(-)

diffs (truncated from 527 to 300 lines):

diff -r feceb311455d -r 8a2235ae0c86 sys/arch/amd64/amd64/amd64_trap.S
--- a/sys/arch/amd64/amd64/amd64_trap.S Thu Jan 18 00:34:05 2018 +0000
+++ b/sys/arch/amd64/amd64/amd64_trap.S Thu Jan 18 07:25:34 2018 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: amd64_trap.S,v 1.17 2018/01/07 16:10:16 maxv Exp $     */
+/*     $NetBSD: amd64_trap.S,v 1.18 2018/01/18 07:25:34 maxv Exp $     */
 
 /*
  * Copyright (c) 1998, 2007, 2008, 2017 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
 
 #if 0
 #include <machine/asm.h>
-__KERNEL_RCSID(0, "$NetBSD: amd64_trap.S,v 1.17 2018/01/07 16:10:16 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: amd64_trap.S,v 1.18 2018/01/18 07:25:34 maxv Exp $");
 #endif
 
 /*
@@ -120,8 +120,8 @@
 #else
        ZTRAP_NJ(T_NMI)
        subq    $TF_REGSIZE,%rsp
-       SVS_ENTER
        INTR_SAVE_GPRS
+       SVS_ENTER_ALTSTACK
        cld
        SMAP_ENABLE
        movw    %gs,TF_GS(%rsp)
@@ -138,7 +138,7 @@
        movq    %rsp,%rdi
        incq    CPUVAR(NTRAP)
        call    _C_LABEL(nmitrap)
-       SVS_LEAVE
+       SVS_LEAVE_ALTSTACK
        swapgs
        jmp     .Lnmileave
 
diff -r feceb311455d -r 8a2235ae0c86 sys/arch/amd64/amd64/genassym.cf
--- a/sys/arch/amd64/amd64/genassym.cf  Thu Jan 18 00:34:05 2018 +0000
+++ b/sys/arch/amd64/amd64/genassym.cf  Thu Jan 18 07:25:34 2018 +0000
@@ -1,4 +1,4 @@
-#      $NetBSD: genassym.cf,v 1.66 2018/01/07 16:47:22 christos Exp $
+#      $NetBSD: genassym.cf,v 1.67 2018/01/18 07:25:34 maxv Exp $
 
 #
 # Copyright (c) 1998, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -239,6 +239,9 @@
 ifdef SVS
 define CPU_INFO_UPDIRPA        offsetof(struct cpu_info, ci_svs_updirpa)
 define CPU_INFO_KPDIRPA        offsetof(struct cpu_info, ci_svs_kpdirpa)
+define CPU_INFO_RSP0           offsetof(struct cpu_info, ci_svs_rsp0)
+define CPU_INFO_URSP0          offsetof(struct cpu_info, ci_svs_ursp0)
+define CPU_INFO_KRSP0          offsetof(struct cpu_info, ci_svs_krsp0)
 endif
 define CPU_INFO_NSYSCALL       offsetof(struct cpu_info, ci_data.cpu_nsyscall)
 define CPU_INFO_NTRAP          offsetof(struct cpu_info, ci_data.cpu_ntrap)
diff -r feceb311455d -r 8a2235ae0c86 sys/arch/amd64/amd64/locore.S
--- a/sys/arch/amd64/amd64/locore.S     Thu Jan 18 00:34:05 2018 +0000
+++ b/sys/arch/amd64/amd64/locore.S     Thu Jan 18 07:25:34 2018 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: locore.S,v 1.146 2018/01/11 09:00:04 maxv Exp $        */
+/*     $NetBSD: locore.S,v 1.147 2018/01/18 07:25:34 maxv Exp $        */
 
 /*
  * Copyright-o-rama!
@@ -1112,7 +1112,11 @@
        jnz     .Lswitch_return
 
        /* Switch ring0 stack */
-#ifndef XEN
+#ifdef SVS
+       movq    CPUVAR(RSP0),%rax
+       movq    CPUVAR(TSS),%rdi
+       movq    %rax,TSS_RSP0(%rdi)
+#elif !defined(XEN)
        movq    PCB_RSP0(%r14),%rax
        movq    CPUVAR(TSS),%rdi
        movq    %rax,TSS_RSP0(%rdi)
@@ -1268,14 +1272,20 @@
         * is ignored as well.
         */
        swapgs
-       SVS_ENTER_NOSTACK
+
+#ifdef SVS
+       movq    %rax,SVS_UTLS+UTLS_SCRATCH
+       movq    SVS_UTLS+UTLS_RSP0,%rax
+#define SP(x)  (x)-(TF_SS+8)(%rax)
+#else
        movq    %r15,CPUVAR(SCRATCH)
        movq    CPUVAR(CURLWP),%r15
        movq    L_PCB(%r15),%r15
        movq    PCB_RSP0(%r15),%r15     /* LWP's kernel stack pointer */
+#define SP(x)  (x)-(TF_SS+8)(%r15)
+#endif
 
        /* Make stack look like an 'int nn' frame */
-#define SP(x)  (x)-(TF_SS+8)(%r15)
        movq    $(LSEL(LUDATA_SEL, SEL_UPL)),SP(TF_SS)  /* user %ss */
        movq    %rsp,SP(TF_RSP)                         /* user %rsp */
        movq    %r11,SP(TF_RFLAGS)                      /* user %rflags */
@@ -1283,8 +1293,11 @@
        movq    %rcx,SP(TF_RIP)                         /* user %rip */
 
        leaq    SP(0),%rsp              /* %rsp now valid after frame */
+#ifdef SVS
+       movq    SVS_UTLS+UTLS_SCRATCH,%rax
+#else
        movq    CPUVAR(SCRATCH),%r15
-#undef SP
+#endif
 
        movq    $2,TF_ERR(%rsp)         /* syscall instruction size */
        movq    $T_ASTFLT,TF_TRAPNO(%rsp)
@@ -1301,6 +1314,7 @@
        movw    $GSEL(GUDATA_SEL, SEL_UPL),TF_ES(%rsp)
        movw    $0,TF_FS(%rsp)
        movw    $0,TF_GS(%rsp)
+       SVS_ENTER
        STI(si)
 
 .Ldo_syscall:
@@ -1339,8 +1353,8 @@
        testl   $(MDL_IRET|MDL_COMPAT32),L_MD_FLAGS(%r14)
        jnz     intrfastexit
 
+       SVS_LEAVE
        INTR_RESTORE_GPRS
-       SVS_LEAVE
        SWAPGS
 #ifndef XEN
        movq    TF_RIP(%rsp),%rcx       /* %rip for sysret */
@@ -1483,6 +1497,8 @@
 END(pagezero)
 
 ENTRY(intrfastexit)
+       NOT_XEN(cli;)
+       SVS_LEAVE
        INTR_RESTORE_GPRS
        testw   $SEL_UPL,TF_CS(%rsp)    /* interrupted %cs */
        jz      .Lkexit
@@ -1513,7 +1529,6 @@
 
 .Luexit64:
        NOT_XEN(cli;)
-       SVS_LEAVE
        SWAPGS
 
 .Lkexit:
diff -r feceb311455d -r 8a2235ae0c86 sys/arch/amd64/amd64/machdep.c
--- a/sys/arch/amd64/amd64/machdep.c    Thu Jan 18 00:34:05 2018 +0000
+++ b/sys/arch/amd64/amd64/machdep.c    Thu Jan 18 07:25:34 2018 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: machdep.c,v 1.290 2018/01/12 09:12:01 maxv Exp $       */
+/*     $NetBSD: machdep.c,v 1.291 2018/01/18 07:25:34 maxv Exp $       */
 
 /*
  * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
@@ -110,7 +110,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.290 2018/01/12 09:12:01 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.291 2018/01/18 07:25:34 maxv Exp $");
 
 /* #define XENDEBUG_LOW  */
 
@@ -2265,10 +2265,16 @@
  *     PTE Space         [OK]
  *     Direct Map        [OK]
  *     Remote PCPU Areas [OK]
- *     Kernel Heap       [TODO]
+ *     Kernel Heap       [OK]
  *     Kernel Image      [TODO]
  */
 
+struct svs_utls {
+       paddr_t kpdirpa;
+       uint64_t scratch;
+       vaddr_t rsp0;
+};
+
 static pd_entry_t *
 svs_tree_add(struct cpu_info *ci, vaddr_t va)
 {
@@ -2334,6 +2340,84 @@
 }
 
 static void
+svs_rsp0_init(struct cpu_info *ci)
+{
+       const cpuid_t cid = cpu_index(ci);
+       vaddr_t va, rsp0;
+       pd_entry_t *pd;
+       size_t pidx;
+
+       rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0;
+
+       /* The first page is a redzone. */
+       va = rsp0 + PAGE_SIZE;
+
+       /* Create levels L4, L3 and L2. */
+       pd = svs_tree_add(ci, va);
+
+       /* Get the info for L1. */
+       pidx = pl1_i(va % NBPD_L2);
+       if (pmap_valid_entry(pd[pidx])) {
+               panic("%s: rsp0 page already mapped", __func__);
+       }
+
+       ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx];
+       ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe);
+       ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
+       ci->ci_svs_krsp0 = 0;
+}
+
+static void
+svs_utls_init(struct cpu_info *ci)
+{
+       const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
+       struct svs_utls *utls;
+       struct vm_page *pg;
+       pd_entry_t *pd;
+       size_t pidx;
+       paddr_t pa;
+       vaddr_t va;
+
+       /* Create levels L4, L3 and L2. */
+       pd = svs_tree_add(ci, utlsva);
+
+       /* Allocate L1. */
+       pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+       if (pg == 0)
+               panic("%s: failed to allocate PA for CPU %d\n", __func__,
+                   cpu_index(ci));
+       pa = VM_PAGE_TO_PHYS(pg);
+
+       /* Enter L1. */
+       if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
+               panic("%s: local page already mapped", __func__);
+       }
+       pidx = pl1_i(utlsva % NBPD_L2);
+       if (pmap_valid_entry(pd[pidx])) {
+               panic("%s: L1 page already mapped", __func__);
+       }
+       pd[pidx] = PG_V | PG_RW | pmap_pg_nx | pa;
+
+       /*
+        * Now, allocate a VA in the kernel map, that points to the UTLS
+        * page.
+        */
+       va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
+           UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
+       if (va == 0) {
+               panic("%s: unable to allocate VA\n", __func__);
+       }
+       pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
+       pmap_update(pmap_kernel());
+
+       ci->ci_svs_utls = va;
+
+       /* Initialize the constant fields of the UTLS page */
+       utls = (struct svs_utls *)ci->ci_svs_utls;
+       utls->rsp0 = ci->ci_svs_rsp0;
+}
+
+static void
 svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size)
 {
        size_t i, n;
@@ -2377,7 +2461,10 @@
        svs_page_add(ci, (vaddr_t)&pcpuarea->idt);
        svs_page_add(ci, (vaddr_t)&pcpuarea->ldt);
        svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
-           sizeof(struct pcpu_entry));
+           offsetof(struct pcpu_entry, rsp0));
+
+       svs_rsp0_init(ci);
+       svs_utls_init(ci);
 }
 
 void
@@ -2412,7 +2499,43 @@
 void
 svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
 {
-       /* Switch rsp0 */
+       struct cpu_info *ci = curcpu();
+       struct pcb *pcb;
+       pt_entry_t *pte;
+       uintptr_t rsp0;
+       vaddr_t va;
+
+       if (newlwp->l_flag & LW_SYSTEM) {
+               return;
+       }
+
+#ifdef DIAGNOSTIC
+       if (oldlwp != NULL && !(oldlwp->l_flag & LW_SYSTEM)) {
+               pcb = lwp_getpcb(oldlwp);



Home | Main Index | Thread Index | Old Index