Source-Changes-HG archive


[src/trunk]: src/sys/arch/amd64 Unmap the kernel from userland in SVS, and le...



details:   https://anonhg.NetBSD.org/src/rev/ade1009cc83a
branches:  trunk
changeset: 358887:ade1009cc83a
user:      maxv <maxv%NetBSD.org@localhost>
date:      Sun Jan 21 11:21:40 2018 +0000

description:
Unmap the kernel from userland in SVS, and leave only the needed
trampolines. As explained below, SVS should now completely mitigate
Meltdown on GENERIC kernels, even though it needs some more tweaking
for GENERIC_KASLR.

Until now the kernel entry points looked like:

        FUNC(intr)
                pushq   $ERR
                pushq   $TRAPNO
                INTRENTRY
                ... handle interrupt ...
                INTRFASTEXIT
        END(intr)

With this change they are split and become:

        FUNC(handle)
                ... handle interrupt ...
                INTRFASTEXIT
        END(handle)

                TEXT_USER_BEGIN
        FUNC(intr)
                pushq   $ERR
                pushq   $TRAPNO
                INTRENTRY
                jmp     handle
        END(intr)
                TEXT_USER_END

A new section, .text.user, is introduced; it contains only the minimal
kernel entry/exit points. Two macros, TEXT_USER_BEGIN and TEXT_USER_END,
are introduced to choose what gets put in this section.
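
As a rough sketch, these macros can be implemented with assembler
section directives; the definitions below are illustrative (the actual
frameasm.h hunk is truncated from the diff):

        /* Illustrative definitions, not necessarily the exact ones. */
        #define TEXT_USER_BEGIN .pushsection    .text.user, "ax"
        #define TEXT_USER_END   .popsection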

The section is mapped in userland with normal 4K pages.
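
A minimal sketch of how that per-page mapping could look, assuming
linker-provided boundary symbols and an SVS helper that enters one 4K
page into the per-CPU user page tables (the symbol, helper and function
names here are assumptions, not necessarily those of this change):

        #include <sys/param.h>          /* PAGE_SIZE */
        #include <sys/types.h>          /* vaddr_t */
        #include <machine/cpu.h>        /* struct cpu_info */

        extern char __text_user_start, __text_user_end;

        static void
        svs_map_text_user(struct cpu_info *ci)
        {
                vaddr_t va = (vaddr_t)&__text_user_start;
                vaddr_t end = (vaddr_t)&__text_user_end;

                /*
                 * Enter each 4K page of .text.user into the user page
                 * tables of this CPU; svs_page_add() is an assumed
                 * helper name.
                 */
                for (; va < end; va += PAGE_SIZE) {
                        svs_page_add(ci, va);
                }
        }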

In GENERIC, the section is 4K-page-aligned and embedded in .text, which
is mapped with large pages. That is to say, when an interrupt comes in,
the CPU has the user page tables loaded and executes the 'intr' functions
through 4K mappings; once SVS_ENTER (in INTRENTRY) switches to the kernel
page tables, the same code is covered by 2MB large pages, and remains so
while executing in kernel mode.
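
In kern.ldscript this boils down to aligning the section on a 4K
boundary inside the .text output section and bracketing it with
start/end symbols, along these lines (illustrative fragment, symbol
names assumed; the real hunk is truncated from the diff):

        .text :
        {
                . = ALIGN(0x1000);      /* 4K-align the user section */
                __text_user_start = . ;
                *(.text.user)
                . = ALIGN(0x1000);
                __text_user_end = . ;

                *(.text)
                *(.text.*)
        }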

In GENERIC_KASLR, the section is 4K-page-aligned and independent from the
other kernel texts. The prekern just picks it up and maps it at a random
address.

In GENERIC, SVS should now completely mitigate Meltdown: what we put in
.text.user is not secret.

In GENERIC_KASLR, SVS would have to be improved a bit more: the
'jmp handle' instruction is itself a secret, since its target leaks the
address of the section we are jumping into. By exploiting Meltdown on
Intel, this theoretically allows a local user to reconstruct the address
of the first text section. But given that our KASLR produces several
text sections, and that each section is not correlated with the others,
the level of protection KASLR provides is still good.

diffstat:

 sys/arch/amd64/amd64/amd64_trap.S       |   51 +++--
 sys/arch/amd64/amd64/locore.S           |  242 +++++++++++++++++--------------
 sys/arch/amd64/amd64/machdep.c          |   14 +-
 sys/arch/amd64/amd64/vector.S           |  135 +++++++++++------
 sys/arch/amd64/conf/kern.ldscript       |    8 +-
 sys/arch/amd64/conf/kern.ldscript.kaslr |   10 +-
 sys/arch/amd64/include/frameasm.h       |    5 +-
 7 files changed, 276 insertions(+), 189 deletions(-)

diffs (truncated from 736 to 300 lines):

diff -r 193ad00c03e7 -r ade1009cc83a sys/arch/amd64/amd64/amd64_trap.S
--- a/sys/arch/amd64/amd64/amd64_trap.S Sun Jan 21 10:59:21 2018 +0000
+++ b/sys/arch/amd64/amd64/amd64_trap.S Sun Jan 21 11:21:40 2018 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: amd64_trap.S,v 1.22 2018/01/20 14:27:15 maxv Exp $     */
+/*     $NetBSD: amd64_trap.S,v 1.23 2018/01/21 11:21:40 maxv Exp $     */
 
 /*
  * Copyright (c) 1998, 2007, 2008, 2017 The NetBSD Foundation, Inc.
@@ -95,13 +95,19 @@
 #define        PRE_TRAP
 #endif
 
+#define TRAPENTRY                      \
+       INTRENTRY                       ; \
+       jmp     .Lalltraps_noentry
+
 #define        TRAP_NJ(a)      PRE_TRAP ; pushq $(a)
 #define        ZTRAP_NJ(a)     PRE_TRAP ; pushq $0 ; pushq $(a)
-#define        TRAP(a)         TRAP_NJ(a) ; jmp _C_LABEL(alltraps)
-#define        ZTRAP(a)        ZTRAP_NJ(a) ; jmp _C_LABEL(alltraps)
+#define        TRAP(a)         TRAP_NJ(a) ; TRAPENTRY
+#define        ZTRAP(a)        ZTRAP_NJ(a) ; TRAPENTRY
 
        .text
 
+       TEXT_USER_BEGIN
+
 IDTVEC(trap00)
        ZTRAP(T_DIVIDE)
 IDTVEC_END(trap00)
@@ -361,24 +367,6 @@
        jmp     .Lalltraps_checkusr
 IDTVEC_END(intrspurious)
 
-/*
- * trap() calls here when it detects a fault in INTRFASTEXIT (loading the
- * segment registers or during the iret itself). The address of the (possibly
- * reconstructed) user trap frame is passed as an argument.
- *
- * Typically the code will have raised a SIGSEGV which will be actioned
- * by the code below.
- */
-       .type   _C_LABEL(trap_return_fault_return), @function
-LABEL(trap_return_fault_return)
-       mov     %rdi,%rsp               /* frame for user return */
-#ifdef DIAGNOSTIC
-       /* We can't recover the saved %rbx, so suppress warning */
-       movl    CPUVAR(ILEVEL),%ebx
-#endif
-       jmp     .Lalltraps_checkusr
-END(trap_return_fault_return)
-
 #ifndef check_swapgs
 /*
  * We need to worry about traps in kernel mode while the kernel %gs isn't
@@ -423,12 +411,33 @@
 END(check_swapgs)
 #endif
 
+       TEXT_USER_END
+
+/*
+ * trap() calls here when it detects a fault in INTRFASTEXIT (loading the
+ * segment registers or during the iret itself). The address of the (possibly
+ * reconstructed) user trap frame is passed as an argument.
+ *
+ * Typically the code will have raised a SIGSEGV which will be actioned
+ * by the code below.
+ */
+       .type   _C_LABEL(trap_return_fault_return), @function
+LABEL(trap_return_fault_return)
+       mov     %rdi,%rsp               /* frame for user return */
+#ifdef DIAGNOSTIC
+       /* We can't recover the saved %rbx, so suppress warning */
+       movl    CPUVAR(ILEVEL),%ebx
+#endif
+       jmp     .Lalltraps_checkusr
+END(trap_return_fault_return)
+
 /*
  * All traps go through here. Call the generic trap handler, and
  * check for ASTs afterwards.
  */
 NENTRY(alltraps)
        INTRENTRY
+.Lalltraps_noentry:
        STI(si)
 
 calltrap:
diff -r 193ad00c03e7 -r ade1009cc83a sys/arch/amd64/amd64/locore.S
--- a/sys/arch/amd64/amd64/locore.S     Sun Jan 21 10:59:21 2018 +0000
+++ b/sys/arch/amd64/amd64/locore.S     Sun Jan 21 11:21:40 2018 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: locore.S,v 1.147 2018/01/18 07:25:34 maxv Exp $        */
+/*     $NetBSD: locore.S,v 1.148 2018/01/21 11:21:40 maxv Exp $        */
 
 /*
  * Copyright-o-rama!
@@ -1248,15 +1248,103 @@
        ret
 END(savectx)
 
-IDTVEC(syscall32)
-       sysret          /* go away please */
-IDTVEC_END(syscall32)
+/*
+ * Syscall handler.
+ */
+NENTRY(handle_syscall)
+       STI(si)
+
+       movq    CPUVAR(CURLWP),%r14
+       incq    CPUVAR(NSYSCALL)        /* count it atomically */
+       movq    %rsp,L_MD_REGS(%r14)    /* save pointer to frame */
+       movq    L_PROC(%r14),%r15
+       andl    $~MDL_IRET,L_MD_FLAGS(%r14)   /* Allow sysret return */
+       movq    %rsp,%rdi               /* Pass frame as arg0 */
+       call    *P_MD_SYSCALL(%r15)
+.Lsyscall_checkast:
+       /*
+        * Disable interrupts to avoid new ASTs (etc) being added and
+        * to ensure we don't take an interrupt with some of the user
+        * registers loaded.
+        */
+       CLI(si)
+       /* Check for ASTs on exit to user mode. */
+       movl    L_MD_ASTPENDING(%r14),%eax
+       orl     CPUVAR(WANT_PMAPLOAD),%eax
+       jnz     9f
+
+#ifdef DIAGNOSTIC
+       cmpl    $IPL_NONE,CPUVAR(ILEVEL)
+       jne     .Lspl_error
+#endif
+
+       /*
+        * Decide if we need to take a slow path. That's the case when we
+        * want to reload %cs and %ss on a 64bit LWP (MDL_IRET set), or when
+        * we're returning to a 32bit LWP (MDL_COMPAT32 set).
+        *
+        * In either case, we jump into intrfastexit and return to userland
+        * with the iret instruction.
+        */
+       testl   $(MDL_IRET|MDL_COMPAT32),L_MD_FLAGS(%r14)
+       jnz     intrfastexit
+
+       jmp     syscall_sysret
+
+#ifdef DIAGNOSTIC
+       /* Report SPL error */
+.Lspl_error:
+       movabsq $4f,%rdi
+       movl    TF_RAX(%rsp),%esi
+       movl    TF_RDI(%rsp),%edx
+       movl    %ebx,%ecx
+       movl    CPUVAR(ILEVEL),%r8d
+       xorq    %rax,%rax
+       call    _C_LABEL(printf)
+       movl    $IPL_NONE,%edi
+       call    _C_LABEL(spllower)
+       jmp     .Lsyscall_checkast
+4:     .asciz  "WARNING: SPL NOT LOWERED ON SYSCALL %d %d EXIT %x %x\n"
+#endif
+
+/* AST pending or pmap load needed */
+9:
+       cmpl    $0,CPUVAR(WANT_PMAPLOAD)
+       jz      10f
+       STI(si)
+       call    _C_LABEL(do_pmap_load)
+       jmp     .Lsyscall_checkast      /* re-check ASTs */
+10:
+       CLEAR_ASTPENDING(%r14)
+       STI(si)
+       /* Pushed T_ASTFLT into tf_trapno on entry. */
+       movq    %rsp,%rdi
+       call    _C_LABEL(trap)
+       jmp     .Lsyscall_checkast      /* re-check ASTs */
+END(handle_syscall)
 
 /*
- * syscall()
+ * void lwp_trampoline(void);
  *
- * syscall insn entry.
+ * This is a trampoline function run by newly created LWPs
+ * in order to do additional setup in their context.
  */
+NENTRY(lwp_trampoline)
+       movq    %rbp,%rsi
+       movq    %rbp,%r14       /* for .Lsyscall_checkast */
+       movq    %rax,%rdi
+       xorq    %rbp,%rbp
+       call    _C_LABEL(lwp_startup)
+       movq    %r13,%rdi
+       call    *%r12
+       jmp     .Lsyscall_checkast
+END(lwp_trampoline)
+
+/*
+ * Entry points of the 'syscall' instruction, 64bit and 32bit mode.
+ */
+       TEXT_USER_BEGIN
+
 IDTVEC(syscall)
 #ifndef XEN
        /*
@@ -1315,44 +1403,40 @@
        movw    $0,TF_FS(%rsp)
        movw    $0,TF_GS(%rsp)
        SVS_ENTER
-       STI(si)
+       jmp     handle_syscall
+IDTVEC_END(syscall)
+
+IDTVEC(syscall32)
+       sysret          /* go away please */
+IDTVEC_END(syscall32)
+
+       TEXT_USER_END
 
-.Ldo_syscall:
-       movq    CPUVAR(CURLWP),%r14
-       incq    CPUVAR(NSYSCALL)        /* count it atomically */
-       movq    %rsp,L_MD_REGS(%r14)    /* save pointer to frame */
-       movq    L_PROC(%r14),%r15
-       andl    $~MDL_IRET,L_MD_FLAGS(%r14)   /* Allow sysret return */
-       movq    %rsp,%rdi               /* Pass frame as arg0 */
-       call    *P_MD_SYSCALL(%r15)
-.Lsyscall_checkast:
-       /*
-        * Disable interrupts to avoid new ASTs (etc) being added and
-        * to ensure we don't take an interrupt with some of the user
-        * registers loaded.
-        */
-       CLI(si)
-       /* Check for ASTs on exit to user mode. */
-       movl    L_MD_ASTPENDING(%r14),%eax
-       orl     CPUVAR(WANT_PMAPLOAD),%eax
-       jnz     9f
+/*
+ * osyscall()
+ *
+ * Trap gate entry for int $80 syscall, also used by sigreturn.
+ */
+       TEXT_USER_BEGIN
+IDTVEC(osyscall)
+#ifdef XEN
+       movq (%rsp),%rcx
+       movq 8(%rsp),%r11
+       addq $0x10,%rsp
+#endif
+       pushq   $2              /* size of instruction for restart */
+       pushq   $T_ASTFLT       /* trap # for doing ASTs */
+       INTRENTRY
+       jmp     handle_syscall
+IDTVEC_END(osyscall)
+       TEXT_USER_END
 
-#ifdef DIAGNOSTIC
-       cmpl    $IPL_NONE,CPUVAR(ILEVEL)
-       jne     .Lspl_error
-#endif
-
-       /*
-        * Decide if we need to take a slow path. That's the case when we
-        * want to reload %cs and %ss on a 64bit LWP (MDL_IRET set), or when
-        * we're returning to a 32bit LWP (MDL_COMPAT32 set).
-        *
-        * In either case, we jump into intrfastexit and return to userland
-        * with the iret instruction.
-        */
-       testl   $(MDL_IRET|MDL_COMPAT32),L_MD_FLAGS(%r14)
-       jnz     intrfastexit
-
+/*
+ * Return to userland via 'sysret'.
+ */
+       TEXT_USER_BEGIN
+       _ALIGN_TEXT
+LABEL(syscall_sysret)
        SVS_LEAVE
        INTR_RESTORE_GPRS
        SWAPGS
@@ -1367,73 +1451,8 @@
        pushq   $256    /* VGCF_IN_SYSCALL */
        jmp     HYPERVISOR_iret
 #endif
-
-#ifdef DIAGNOSTIC
-       /* Report SPL error */
-.Lspl_error:
-       movabsq $4f,%rdi
-       movl    TF_RAX(%rsp),%esi
-       movl    TF_RDI(%rsp),%edx
-       movl    %ebx,%ecx
-       movl    CPUVAR(ILEVEL),%r8d
-       xorq    %rax,%rax
-       call    _C_LABEL(printf)


