Source-Changes-HG archive


[src/trunk]: src/sys/arch Instead of using a global array with per-cpu indexe...



details:   https://anonhg.NetBSD.org/src/rev/abbaacdc6772
branches:  trunk
changeset: 351379:abbaacdc6772
user:      maxv <maxv%NetBSD.org@localhost>
date:      Sat Feb 11 14:11:24 2017 +0000

description:
Instead of using a global array with per-cpu indexes, embed the tmp VAs
into cpu_info directly. This concerns only {i386, Xen-i386, Xen-amd64},
because amd64 already has a direct map that is much faster than these
temporary mappings.

There are two major issues with the global array: maxcpus entries are
allocated even though common i386 machines are unlikely to have that many
cpus, and the base VA of these entries is not cache-line-aligned, which
all but guarantees cache line thrashing each time mappings are entered at
these VAs.

Now the number of tmp VAs allocated is proportional to the number of CPUs
attached (which reduces memory consumption), and the base is properly
aligned.

On my 3-core AMD, the number of DC_refills_L2 events triggered when
performing 5x10^6 calls to pmap_zero_page on two dedicated cores is on
average halved with this patch.

Discussed on tech-kern a little.
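
[Editor's note: the following is a minimal standalone sketch of the
alignment arithmetic behind this change, not part of the commit. It
assumes CACHE_LINE_SIZE = 64, PAGE_SIZE = 4096 and an 8-byte pt_entry_t,
mirroring the CTASSERTs and the pmap_vpage_cpualloc() logic added in the
diff below.]

/*
 * Illustrative sketch only (assumed values, see note above): with 8-byte
 * PTEs and 64-byte cache lines, each CPU gets its own cache-line-aligned
 * run of 8 PTE slots (VPAGE_MAX = 4 of them are used), so two CPUs never
 * update PTEs that share a cache line.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE_SIZE	64		/* assumed x86 cache line size */
#define PAGE_SIZE	4096
#define VPAGE_MAX	4		/* SRC, DST, ZER, PTP */

typedef uint64_t pt_entry_t;		/* assumed PAE/amd64-style PTE width */

int
main(void)
{
	size_t npages = CACHE_LINE_SIZE / sizeof(pt_entry_t);	/* 8 pages */
	size_t vrange = npages * PAGE_SIZE;			/* 32 KiB of VA per CPU */

	assert(npages >= VPAGE_MAX);

	/*
	 * If the per-CPU VA base is vrange-aligned, the corresponding PTE
	 * index (va / PAGE_SIZE) lands on a cache-line-aligned byte offset
	 * in the page table, which is what pmap_vpage_cpualloc() asserts.
	 */
	uintptr_t vabase = 3 * vrange;	/* any multiple of vrange works */
	uintptr_t pteoff = (vabase / PAGE_SIZE) * sizeof(pt_entry_t);

	printf("PTEs per cache line : %zu\n", npages);
	printf("VA reserved per CPU : %zu bytes\n", vrange);
	printf("PTE offset %% line   : %ju\n", (uintmax_t)(pteoff % CACHE_LINE_SIZE));
	return 0;
}

With these assumed values, each attached CPU reserves 8 pages of VA
(32 KiB) and a full cache line of PTEs to itself, rather than sharing
lines out of one maxcpus-sized array.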

diffstat:

 sys/arch/x86/include/cpu.h  |   12 ++-
 sys/arch/x86/include/pmap.h |    6 +-
 sys/arch/x86/x86/cpu.c      |    7 +-
 sys/arch/x86/x86/pmap.c     |  206 +++++++++++++++++++++++--------------------
 sys/arch/xen/x86/cpu.c      |    7 +-
 5 files changed, 134 insertions(+), 104 deletions(-)

diffs (truncated from 442 to 300 lines):

diff -r 8a370ebee22f -r abbaacdc6772 sys/arch/x86/include/cpu.h
--- a/sys/arch/x86/include/cpu.h        Sat Feb 11 13:22:58 2017 +0000
+++ b/sys/arch/x86/include/cpu.h        Sat Feb 11 14:11:24 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: cpu.h,v 1.67 2015/12/13 15:02:19 maxv Exp $    */
+/*     $NetBSD: cpu.h,v 1.68 2017/02/11 14:11:24 maxv Exp $    */
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
@@ -133,6 +133,16 @@
        volatile int    ci_mtx_count;   /* Negative count of spin mutexes */
        volatile int    ci_mtx_oldspl;  /* Old SPL at this ci_idepth */
 
+#ifndef __HAVE_DIRECT_MAP
+#define VPAGE_SRC 0
+#define VPAGE_DST 1
+#define VPAGE_ZER 2
+#define VPAGE_PTP 3
+#define VPAGE_MAX 4
+       vaddr_t         vpage[VPAGE_MAX];
+       pt_entry_t      *vpage_pte[VPAGE_MAX];
+#endif
+
        /* The following must be aligned for cmpxchg8b. */
        struct {
                uint32_t        ipending;
diff -r 8a370ebee22f -r abbaacdc6772 sys/arch/x86/include/pmap.h
--- a/sys/arch/x86/include/pmap.h       Sat Feb 11 13:22:58 2017 +0000
+++ b/sys/arch/x86/include/pmap.h       Sat Feb 11 14:11:24 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: pmap.h,v 1.61 2016/11/08 03:05:36 christos Exp $       */
+/*     $NetBSD: pmap.h,v 1.62 2017/02/11 14:11:24 maxv Exp $   */
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -278,6 +278,10 @@
 
 bool           pmap_is_curpmap(struct pmap *);
 
+#ifndef __HAVE_DIRECT_MAP
+void           pmap_vpage_cpu_init(struct cpu_info *);
+#endif
+
 vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
 
 typedef enum tlbwhy {
diff -r 8a370ebee22f -r abbaacdc6772 sys/arch/x86/x86/cpu.c
--- a/sys/arch/x86/x86/cpu.c    Sat Feb 11 13:22:58 2017 +0000
+++ b/sys/arch/x86/x86/cpu.c    Sat Feb 11 14:11:24 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: cpu.c,v 1.122 2017/02/02 08:57:04 maxv Exp $   */
+/*     $NetBSD: cpu.c,v 1.123 2017/02/11 14:11:24 maxv Exp $   */
 
 /*-
  * Copyright (c) 2000-2012 NetBSD Foundation, Inc.
@@ -62,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.122 2017/02/02 08:57:04 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.123 2017/02/11 14:11:24 maxv Exp $");
 
 #include "opt_ddb.h"
 #include "opt_mpbios.h"                /* for MPDEBUG */
@@ -286,6 +286,9 @@
        uvm_page_recolor(ncolors);
 
        pmap_tlb_cpu_init(ci);
+#ifndef __HAVE_DIRECT_MAP
+       pmap_vpage_cpu_init(ci);
+#endif
 }
 
 static void
diff -r 8a370ebee22f -r abbaacdc6772 sys/arch/x86/x86/pmap.c
--- a/sys/arch/x86/x86/pmap.c   Sat Feb 11 13:22:58 2017 +0000
+++ b/sys/arch/x86/x86/pmap.c   Sat Feb 11 14:11:24 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: pmap.c,v 1.239 2017/02/02 17:37:49 maxv Exp $  */
+/*     $NetBSD: pmap.c,v 1.240 2017/02/11 14:11:24 maxv Exp $  */
 
 /*-
  * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc.
@@ -171,7 +171,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.239 2017/02/02 17:37:49 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.240 2017/02/11 14:11:24 maxv Exp $");
 
 #include "opt_user_ldt.h"
 #include "opt_lockdebug.h"
@@ -496,29 +496,15 @@
 
 #ifndef __HAVE_DIRECT_MAP
 /*
- * MULTIPROCESSOR: special VAs and PTEs are actually allocated inside a
- * (maxcpus * NPTECL) array of PTE, to avoid cache line thrashing due to
- * false sharing.
- */
-#ifdef MULTIPROCESSOR
-#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
-#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
-#else
-#define PTESLEW(pte, id) ((void)id, pte)
-#define VASLEW(va,id) ((void)id, va)
-#endif
-
-/*
  * Special VAs and the PTEs that map them
  */
-static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
-static char *csrcp, *cdstp, *zerop, *ptpp;
+static pt_entry_t *early_zero_pte;
+static void pmap_vpage_cpualloc(struct cpu_info *);
 #ifdef XEN
 char *early_zerop; /* also referenced from xen_locore() */
 #else
 static char *early_zerop;
 #endif
-
 #endif
 
 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
@@ -1328,11 +1314,15 @@
        pmap_init_lapic();
 #endif /* !XEN */
 
-
 #ifdef __HAVE_DIRECT_MAP
        pmap_init_directmap(kpm);
 #else
-       if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
+       pmap_vpage_cpualloc(&cpu_info_primary);
+
+       if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
+               early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
+               early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
+       } else { /* amd64 */
                /*
                 * zero_pte is stuck at the end of mapped space for the kernel
                 * image (disjunct from kva space). This is done so that it
@@ -1347,41 +1337,6 @@
 #endif
                early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
        }
-
-       /*
-        * Now we allocate the "special" VAs which are used for tmp mappings
-        * by pmap. We allocate the VAs, and find the PTE that maps them via
-        * the linear PTE mapping.
-        */
-       vaddr_t cpuva_base;
-       pt_entry_t *pte;
-
-#ifdef MULTIPROCESSOR
-       /*
-        * Waste some VA space to avoid false sharing of cache lines for page
-        * table pages: give each possible CPU a cache line of 8 PTEs to play
-        * with, though we only need 4.
-        */
-       cpuva_base = pmap_bootstrap_valloc(maxcpus * NPTECL);
-#else
-       cpuva_base = pmap_bootstrap_valloc(4);
-#endif
-       pte = PTE_BASE + pl1_i(cpuva_base);
-
-       /* Values used to index the array */
-       csrcp = (char *)cpuva_base;
-       csrc_pte = pte;
-       cdstp = (char *)cpuva_base + PAGE_SIZE;
-       cdst_pte = pte + 1;
-       zerop = (char *)cpuva_base + PAGE_SIZE * 2;
-       zero_pte = pte + 2;
-       ptpp = (char *)cpuva_base + PAGE_SIZE * 3;
-       ptp_pte = pte + 3;
-
-       if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
-               early_zerop = zerop;
-               early_zero_pte = zero_pte;
-       }
 #endif
 
 #if defined(XEN) && defined(__x86_64__)
@@ -1715,6 +1670,57 @@
 }
 #endif
 
+#ifndef __HAVE_DIRECT_MAP
+CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
+CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
+
+static void
+pmap_vpage_cpualloc(struct cpu_info *ci)
+{
+       bool primary = (ci == &cpu_info_primary);
+       size_t i, npages;
+       vaddr_t vabase;
+       vsize_t vrange;
+
+       npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
+       KASSERT(npages >= VPAGE_MAX);
+       vrange = npages * PAGE_SIZE;
+
+       if (primary) {
+               while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
+                       /* Waste some pages to align properly */
+               }
+               /* The base is aligned, allocate the rest (contiguous) */
+               pmap_bootstrap_valloc(npages - 1);
+       } else {
+               vabase = uvm_km_alloc(kernel_map, vrange, vrange,
+                   UVM_KMF_VAONLY);
+               if (vabase == 0) {
+                       panic("%s: failed to allocate tmp VA for CPU %d\n",
+                           __func__, cpu_index(ci));
+               }
+       }
+
+       KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
+
+       for (i = 0; i < VPAGE_MAX; i++) {
+               ci->vpage[i] = vabase + i * PAGE_SIZE;
+               ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
+       }
+}
+
+void
+pmap_vpage_cpu_init(struct cpu_info *ci)
+{
+       if (ci == &cpu_info_primary) {
+               /* cpu0 already taken care of in pmap_bootstrap */
+               return;
+       }
+
+       pmap_vpage_cpualloc(ci);
+}
+#endif
+
 /*
  * p v _ e n t r y   f u n c t i o n s
  */
@@ -3039,17 +3045,18 @@
        if (XEN_VERSION_SUPPORTED(3, 4))
                xen_pagezero(pa);
 #endif
+       struct cpu_info *ci;
        pt_entry_t *zpte;
-       void *zerova;
-       int id;
+       vaddr_t zerova;
 
        const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
            PG_k;
 
        kpreempt_disable();
-       id = cpu_number();
-       zpte = PTESLEW(zero_pte, id);
-       zerova = VASLEW(zerop, id);
+
+       ci = curcpu();
+       zerova = ci->vpage[VPAGE_ZER];
+       zpte = ci->vpage_pte[VPAGE_ZER];
 
 #ifdef DIAGNOSTIC
        if (*zpte)
@@ -3058,9 +3065,9 @@
 
        pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
        pmap_pte_flush();
-       pmap_update_pg((vaddr_t)zerova);                /* flush TLB */
-
-       memset(zerova, 0, PAGE_SIZE);
+       pmap_update_pg(zerova);         /* flush TLB */
+
+       memset((void *)zerova, 0, PAGE_SIZE);
 
 #if defined(DIAGNOSTIC) || defined(XEN)
        pmap_pte_set(zpte, 0);                          /* zap ! */
@@ -3084,26 +3091,26 @@
        KASSERT(cpu_feature[0] & CPUID_SSE2);
        return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
 #else
+       struct cpu_info *ci;
        pt_entry_t *zpte;
-       void *zerova;
+       vaddr_t zerova;
        bool rv;
-       int id;
 
        const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
            PG_k;
 
-       id = cpu_number();
-       zpte = PTESLEW(zero_pte, id);
-       zerova = VASLEW(zerop, id);
+       ci = curcpu();
+       zerova = ci->vpage[VPAGE_ZER];
+       zpte = ci->vpage_pte[VPAGE_ZER];
 


