Source-Changes-HG archive


[src/trunk]: src/sys/arch/arm/cortex Improve MP startup code. We now use a t...



details:   https://anonhg.NetBSD.org/src/rev/819a41768f19
branches:  trunk
changeset: 328298:819a41768f19
user:      matt <matt%NetBSD.org@localhost>
date:      Sun Mar 30 15:20:53 2014 +0000

description:
Improve MP startup code.  We now use a two-stage startup: after creating
the initial L1PT and turning on the MMU/caches, we spin up the secondary CPUs
and wait for them to reach the same state as the boot processor.  Once the
real L1PT is initialized and in use, the secondary CPUs are kicked so they can
use it (and the initial L1PT is discarded).  Finally, each secondary CPU waits
until NetBSD kicks it, loads its stack from the idlelwp, hatches the CPU, and
jumps to idle_loop.
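
Sketched in C, that handshake looks roughly like the following.  Everything
here is illustrative: cortex_mmuinfo appears in the diff below, but the flag
and helper names (l1pt_ready, cpu_kicked, and friends) are hypothetical
stand-ins for the kernel's actual mechanisms, not its real API.

        /*
         * Illustrative two-stage secondary-CPU startup; only
         * cortex_mmuinfo is taken from the diff below.
         */
        #include <stdbool.h>

        extern void *volatile cortex_mmuinfo;  /* initial L1PT addr (diff) */
        extern volatile bool l1pt_ready;       /* hypothetical: real L1PT up */
        extern volatile bool cpu_kicked[];     /* hypothetical: per-CPU "go" */
        extern void *real_l1pt;                /* hypothetical */

        extern void enable_mmu(void *l1pt);    /* hypothetical helpers */
        extern void switch_ttbr(void *l1pt);
        extern void set_stack_from_idlelwp(int cpu);
        extern void cpu_hatch_sketch(int cpu);
        extern void idle_loop_sketch(void) __attribute__((noreturn));

        void
        secondary_cpu_start(int cpu)
        {
                /* Stage 1: MMU/caches on, using the boot CPU's initial L1PT. */
                enable_mmu(cortex_mmuinfo);

                /* Spin until the boot CPU has initialized the real L1PT. */
                while (!l1pt_ready)
                        continue;
                switch_ttbr(real_l1pt); /* initial L1PT is now discardable */

                /* Stage 2: wait for NetBSD proper to kick this CPU. */
                while (!cpu_kicked[cpu])
                        continue;
                set_stack_from_idlelwp(cpu); /* stack from the idlelwp */
                cpu_hatch_sketch(cpu);       /* hatch the cpu */
                idle_loop_sketch();          /* jump to idle_loop; no return */
        }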

diffstat:

 sys/arch/arm/cortex/a9_mpsubr.S |  757 ++++++++++++++++++++++++---------------
 1 files changed, 458 insertions(+), 299 deletions(-)

diffs (truncated from 950 to 300 lines):

diff -r ad3b45b85353 -r 819a41768f19 sys/arch/arm/cortex/a9_mpsubr.S
--- a/sys/arch/arm/cortex/a9_mpsubr.S   Sun Mar 30 13:14:40 2014 +0000
+++ b/sys/arch/arm/cortex/a9_mpsubr.S   Sun Mar 30 15:20:53 2014 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: a9_mpsubr.S,v 1.13 2014/02/21 22:22:48 matt Exp $      */
+/*     $NetBSD: a9_mpsubr.S,v 1.14 2014/03/30 15:20:54 matt Exp $      */
 /*-
  * Copyright (c) 2012 The NetBSD Foundation, Inc.
  * All rights reserved.
@@ -37,40 +37,48 @@
 #include <arm/cortex/scu_reg.h>
 #include "assym.h"
 
+//#define MPDEBUG
 
-/* We'll modify va and pa at run time so we can use relocatable addresses. */
+// We'll modify va and pa at run time so we can use relocatable addresses.
 #define MMU_INIT(va,pa,n_sec,attr) \
-       .word   va                                          ; \
-       .word   pa                                          ; \
-       .word   n_sec                                       ; \
-       .word   attr                                        ;
+       .word   (va)|(n_sec)                                ; \
+       .word   (pa)|(attr)                                 ; \
 
-/*
- * Set up a preliminary mapping in the MMU to allow us to run
- * at KERNEL_BASE with caches on.
- */
+// Set up a preliminary mapping in the MMU to allow us to run at KERNEL_BASE
+// with caches on.  If we are MULTIPROCESSOR, save the TTB address.
+//
 arm_boot_l1pt_init:
-       mov     ip, r1                  @ save mmu table addr
-       /* Build page table from scratch */
-       mov     r1, r0                  /* Start address to clear memory. */
-       /* Zero the entire table so all virtual addresses are invalid. */
-       mov     r2, #L1_TABLE_SIZE      /* in bytes */
-       mov     r3, #0
-       mov     r4, r3
-       mov     r5, r3
-       mov     r6, r3
-       mov     r7, r3
-       mov     r8, r3
-       mov     r10, r3
-       mov     r11, r3
-1:     stmia   r1!, {r3-r8,r10-r11}
-       stmia   r1!, {r3-r8,r10-r11}
-       stmia   r1!, {r3-r8,r10-r11}
-       stmia   r1!, {r3-r8,r10-r11}
-       subs    r2, r2, #(4 * 4 * 8)    /* bytes per loop */
-       bne     1b
+#if defined(MULTIPROCESSOR)
+#if defined(KERNEL_BASES_EQUAL)
+       movw    r3, #:lower16:cortex_mmuinfo
+       movt    r3, #:upper16:cortex_mmuinfo
+#else
+       adr     r3, arm_boot_l1pt_init
+       movw    r2, #:lower16:cortex_mmuinfo
+       movt    r2, #:upper16:cortex_mmuinfo
+       bfi     r3, r2, #0, #28
+#endif
+       str     r0, [r3]
 
-       /* Now create our entries per the mmu_init_table. */
+       // Make sure the info makes it into memory
+       mcr     p15, 0, r3, c7, c10, 1          // writeback the cache line
+       dsb
+#endif
+
+       mov     ip, r1                  // save mmu table addr
+       // Build page table from scratch
+       mov     r1, r0                  // Start address to clear memory.
+       // Zero the entire table so all virtual addresses are invalid.
+       add     r2, r1, #L1_TABLE_SIZE  // Ending address
+       mov     r4, #0
+       mov     r5, #0
+       mov     r6, #0
+       mov     r7, #0
+1:     stmia   r1!, {r4-r7}            // 16 bytes at a time
+       cmp     r1, r2
+       blt     1b
+
+       // Now create our entries per the mmu_init_table.
        l1table .req r0
        va      .req r1
        pa      .req r2
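
The MMU_INIT change in the hunk above packs each table entry into two words
instead of four: because section mappings are 1 MiB aligned (the low
L1_S_SHIFT bits of both va and pa are zero), the section count and attribute
bits can ride in those low bits.  A minimal C view of the packing, assuming
the standard 1 MiB section size; the struct and function names are invented
for illustration:

        #include <stdint.h>

        #define L1_S_SHIFT      20              /* 1 MiB sections */
        #define L1_S_OFFSET     ((1u << L1_S_SHIFT) - 1)

        struct mmu_init_entry {                 /* hypothetical C view */
                uint32_t va_nsec;               /* .word (va)|(n_sec) */
                uint32_t pa_attr;               /* .word (pa)|(attr)  */
        };

        static inline void
        mmu_init_unpack(const struct mmu_init_entry *e,
            uint32_t *l1_index, uint32_t *n_sec, uint32_t *l1_entry)
        {
                *n_sec = e->va_nsec & L1_S_OFFSET; /* ubfx n_sec, va, #0, #20 */
                *l1_index = e->va_nsec >> L1_S_SHIFT; /* lsr va, #L1_S_SHIFT */
                *l1_entry = e->pa_attr; /* already frame|attr, stored as-is */
        }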
@@ -78,7 +86,11 @@
        attr    .req r4
        itable  .req r5
 
-       mov     itable, ip              @ reclaim table address
+       mov     attr, #0
+       mrc     p15, 0, r3, c0, c0, 5   // MPIDR read
+       cmp     r3, #0                  // not zero?
+       movne   attr, #L1_S_V6_S        //    yes, shareable attribute
+       mov     itable, ip              // reclaim table address
        b       3f
 
 2:     str     pa, [l1table, va, lsl #2]
@@ -87,20 +99,18 @@
        subs    n_sec, n_sec, #1
        bhi     2b
 
-3:     ldmia   itable!, {va,pa,n_sec,attr}
-       /* Convert va to l1 offset:     va = 4 * (va >> L1_S_SHIFT)     */
+3:     ldmia   itable!, {va, pa}
+       // Convert va to l1 offset:     va = 4 * (va >> L1_S_SHIFT)
+       ubfx    n_sec, va, #0, #L1_S_SHIFT
        lsr     va, va, #L1_S_SHIFT
-       /* Convert pa to l1 entry:      pa = (pa & L1_S_FRAME) | attr   */
-#ifdef _ARM_ARCH_7
-       bfc     pa, #0, #L1_S_SHIFT
-#else
-       lsr     pa, pa, #L1_S_SHIFT
-       lsl     pa, pa, #L1_S_SHIFT
-#endif
-       orr     pa, pa, attr
-       cmp     n_sec, #0
+
+       // Do we need to add sharing for this?
+       tst     pa, #(L1_S_C|L1_S_B)    // is this entry cacheable?
+       orrne   pa, pa, attr            // add sharing
+       
+4:     cmp     n_sec, #0
        bne     2b
-       bx      lr                      @ return
+       bx      lr                      // return
 
        .unreq  va
        .unreq  pa
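
The two hunks above also fold in shareability: on an MP-capable core the
MPIDR register reads nonzero, so the table loop ORs L1_S_V6_S into entries
that are cacheable (C or B set) and leaves device/strongly-ordered mappings
alone.  A sketch of that decision, using the architectural section-descriptor
bit positions (B = bit 2, C = bit 3, S = bit 16); the function name is
invented:

        #include <stdint.h>

        #define L1_S_B          0x00000004      /* bufferable */
        #define L1_S_C          0x00000008      /* cacheable */
        #define L1_S_V6_S       0x00010000      /* shareable (ARMv6+) */

        static inline uint32_t
        add_sharing_sketch(uint32_t l1_entry, uint32_t mpidr)
        {
                /* MPIDR reads nonzero on MP-capable cores. */
                uint32_t attr = (mpidr != 0) ? L1_S_V6_S : 0;

                /* Only cacheable entries pick up the shareable attribute. */
                if (l1_entry & (L1_S_C | L1_S_B))
                        l1_entry |= attr;
                return l1_entry;
        }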
@@ -109,6 +119,9 @@
        .unreq  itable
        .unreq  l1table
 
+//
+// Coprocessor register initialization values
+//
 #if defined(CPU_CORTEXA8)
 #undef CPU_CONTROL_SWP_ENABLE          // not present on A8
 #define CPU_CONTROL_SWP_ENABLE         0
@@ -126,6 +139,8 @@
 #define CPU_CONTROL_AFLT_ENABLE_SET    CPU_CONTROL_AFLT_ENABLE
 #endif
 
+// bits to set in the Control Register 
+//
 #define CPU_CONTROL_SET \
        (CPU_CONTROL_MMU_ENABLE         |       \
         CPU_CONTROL_AFLT_ENABLE_SET    |       \
@@ -136,124 +151,120 @@
         CPU_CONTROL_EX_BEND_SET        |       \
         CPU_CONTROL_UNAL_ENABLE)
 
+// bits to clear in the Control Register 
+//
 #define CPU_CONTROL_CLR \
        (CPU_CONTROL_AFLT_ENABLE_CLR)
 
 arm_cpuinit:
-       /*
-        * In theory, because the MMU is off, we shouldn't need all of this,
-        * but let's not take any chances and do a typical sequence to set
-        * the Translation Table Base.
-        */
+       // Because the MMU may already be on do a typical sequence to set
+       // the Translation Table Base(s).
        mov     ip, lr
-       mov     r10, r0
+       mov     r10, r0                 // save TTBR 
        mov     r1, #0
 
        mcr     p15, 0, r1, c7, c5, 0   // invalidate I cache
 
-       mrc     p15, 0, r2, c1, c0, 0   // read SCTRL
+       mrc     p15, 0, r2, c1, c0, 0   // SCTRL read
        movw    r1, #(CPU_CONTROL_DC_ENABLE|CPU_CONTROL_IC_ENABLE)
        bic     r2, r2, r1              // clear I+D cache enable
 
 #ifdef __ARMEB__
-       /*
-        * SCTRL.EE determines the endianness of translation table lookups.
-        * So we need to make sure it's set before starting to use the new
-        * translation tables (which are big endian).
-        */
+       // SCTRL.EE determines the endianness of translation table lookups.
+       // So we need to make sure it's set before starting to use the new
+       // translation tables (which are big endian).
+       //
        orr     r2, r2, #CPU_CONTROL_EX_BEND
        bic     r2, r2, #CPU_CONTROL_MMU_ENABLE
-       pli     [pc, #32]               /* preload the next few cachelines */
+       pli     [pc, #32]               // preload the next few cachelines
        pli     [pc, #64]
        pli     [pc, #96]
        pli     [pc, #128]
 #endif
 
-       mcr     p15, 0, r2, c1, c0, 0   /* write SCTRL */
+       mcr     p15, 0, r2, c1, c0, 0   // SCTRL write
 
        XPUTC(#70)
-       dsb                             /* Drain the write buffers. */
+       dsb                             // Drain the write buffers.
 1:
        XPUTC(#71)
-       mrc     p15, 0, r1, c0, c0, 5   /* get MPIDR */
+       mrc     p15, 0, r1, c0, c0, 5   // MPIDR read
        cmp     r1, #0
-       orrlt   r10, r10, #0x5b         /* MP, cachable (Normal WB) */
-       orrge   r10, r10, #0x1b         /* Non-MP, cacheable, normal WB */
-       mcr     p15, 0, r10, c2, c0, 0  /* Set Translation Table Base */
+       orrlt   r10, r10, #0x5b         // MP, cachable (Normal WB)
+       orrge   r10, r10, #0x1b         // Non-MP, cacheable, normal WB
+       XPUTC(#48)
+       mcr     p15, 0, r10, c2, c0, 0  // TTBR0 write
+#if defined(ARM_MMU_EXTENDED)
+       // When using split TTBRs, we need to set both since the physical
+       // addresses we were/are using might be in either.
+       XPUTC(#49)
+       mcr     p15, 0, r10, c2, c0, 1  // TTBR1 write
+#endif
 
        XPUTC(#72)
-       mov     r1, #0
-       mcr     p15, 0, r1, c2, c0, 2   /* Set Translation Table Control */
+#if defined(ARM_MMU_EXTENDED)
+       XPUTC(#49)            
+       mov     r1, #TTBCR_S_N_1        // make sure TTBCR_S_N is 1
+#else
+       XPUTC(#48)
+       mov     r1, #0                  // make sure TTBCR is 0
+#endif
+       mcr     p15, 0, r1, c2, c0, 2   // TTBCR write
 
        XPUTC(#73)
        mov     r1, #0
-       mcr     p15, 0, r1, c8, c7, 0   /* Invalidate TLBs */
+       mcr     p15, 0, r1, c8, c7, 0   // TLBIALL (just this core)
 
-       /* Set the Domain Access register.  Very important! */
        XPUTC(#74)
+       mov     r1, #0                  // get KERNEL_PID
+       mcr     p15, 0, r1, c13, c0, 1  // CONTEXTIDR write
+
+       // Set the Domain Access register.  Very important!
+       XPUTC(#75)
        mov     r1, #((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT)
-       mcr     p15, 0, r1, c3, c0, 0
+       mcr     p15, 0, r1, c3, c0, 0   // DACR write
 
-       /*
-        * Enable the MMU, etc.
-        */
-       XPUTC(#75)
-       mrc     p15, 0, r0, c1, c0, 0
+       //
+       // Enable the MMU, etc.
+       //
+       XPUTC(#76)
+       mrc     p15, 0, r1, c1, c0, 0   // SCTRL read
 
        movw    r3, #:lower16:CPU_CONTROL_SET
 #if (CPU_CONTROL_SET & 0xffff0000)
        movt    r3, #:upper16:CPU_CONTROL_SET
 #endif
-       orr     r0, r0, r3
+       orr     r0, r1, r3
 #if defined(CPU_CONTROL_CLR) && (CPU_CONTROL_CLR != 0)
        bic     r0, r0, #CPU_CONTROL_CLR
 #endif
+       //cmp   r0, r1                  // any changes to SCTRL?
+       //bxeq  ip                      //    no, then return.
+
        pli     1f
-       
        dsb
-       @ turn mmu on!
-       mov     r0, r0                  /* fetch instruction cacheline */
-1:     mcr     p15, 0, r0, c1, c0, 0
 
-       /*
-        * Ensure that the coprocessor has finished turning on the MMU.
-        */
-       mrc     p15, 0, r0, c0, c0, 0   /* Read an arbitrary value. */
-       mov     r0, r0                  /* Stall until read completes. */
-1:     XPUTC(#76)
+       // turn mmu on!
+       //
+       mov     r0, r0                  // fetch instruction cacheline
+1:     mcr     p15, 0, r0, c1, c0, 0   // SCTRL write
 
-       bx      ip                      /* return */
+       // Ensure that the coprocessor has finished turning on the MMU.
+       //
+       mrc     p15, 0, r0, c0, c0, 0   // Read an arbitrary value.
+       mov     r0, r0                  // Stall until read completes.
+       XPUTC(#77)
 
-/*


