Port-amd64 archive


Re: support for more than 32 CPUs



On Mon, Apr 16, 2012 at 11:05:04PM +0200, Manuel Bouyer wrote:
> Hello,
> the attached patch,

Which I forgot to attach, as pointed out by several of you. Here it is.

> based on a patch sent by Mindaugas Rasiukevicius
> on tech-kern@ some time ago, bumps the max number of CPUs to 256 for
> amd64, and should easily allow up to 64 for Xen/amd64. I tested it
> on an x86 box with 64 AMD cores (lightly, as this box has no known drive yet -
> some driver hacking is needed), on a box with 8 Intel cores, and with a Xen
> domU with 4 cores. I haven't noticed any regressions so far.
> 
> Comments before I commit?
> 
> -- 
> Manuel Bouyer <bouyer%antioche.eu.org@localhost>
>      NetBSD: 26 years of experience will always make the difference
> --

-- 
Manuel Bouyer <bouyer%antioche.eu.org@localhost>
     NetBSD: 26 years of experience will always make the difference
--
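
For readers skimming the diff below: the conversion is largely mechanical. The
per-CPU bit kept in ci_cpumask and the uint32_t masks built from it (pm_cpus,
pm_kernel_cpus, the TLB shootdown and MTRR masks, cpus_running) are replaced
either by dynamically allocated kcpuset_t sets indexed with cpu_index(), or by
the existing kcpuset_running set, so nothing is tied to the width of a 32-bit
word any more. A minimal sketch of the before/after pattern, for illustration
only (not part of the patch; the example_* names are made up):

    #include <sys/types.h>
    #include <sys/cpu.h>
    #include <sys/kcpuset.h>

    /* Illustrative sketch only -- replaces a uint32_t bitmask of CPUs. */
    static kcpuset_t *example_cpus;

    void
    example_init(void)
    {
            /* Allocate a zeroed set sized for MAXCPUS at run time. */
            kcpuset_create(&example_cpus, true);
    }

    void
    example_set_running(struct cpu_info *ci, bool running)
    {
            const cpuid_t cid = cpu_index(ci);

            /* Was: atomic_or_32(&mask, 1 << cid) resp. atomic_and_32(&mask, ~(1 << cid)). */
            if (running)
                    kcpuset_atomic_set(example_cpus, cid);
            else
                    kcpuset_atomic_clear(example_cpus, cid);
    }

    bool
    example_is_running(struct cpu_info *ci)
    {
            /* Was: (mask & (1 << cpu_index(ci))) != 0. */
            return kcpuset_isset(example_cpus, cpu_index(ci));
    }

With the masks gone, amd64 simply sets MAXCPUS to 256 in param.h; i386 stays
at 32 because of its limited KVA space (see the comment added there).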
Index: arch/amd64/amd64/genassym.cf
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/genassym.cf,v
retrieving revision 1.49
diff -u -p -u -r1.49 genassym.cf
--- arch/amd64/amd64/genassym.cf        7 Dec 2011 15:47:41 -0000       1.49
+++ arch/amd64/amd64/genassym.cf        16 Apr 2012 20:44:24 -0000
@@ -228,12 +228,10 @@ define    CPU_INFO_RESCHED        offsetof(struct 
 define CPU_INFO_WANT_PMAPLOAD  offsetof(struct cpu_info, ci_want_pmapload)
 define CPU_INFO_TLBSTATE       offsetof(struct cpu_info, ci_tlbstate)
 define TLBSTATE_VALID          TLBSTATE_VALID
-define CPU_INFO_TLB_EVCNT      offsetof(struct cpu_info, ci_tlb_evcnt)
 define CPU_INFO_CURLWP         offsetof(struct cpu_info, ci_curlwp)
 define CPU_INFO_CURLDT         offsetof(struct cpu_info, ci_curldt)
 define CPU_INFO_IDLELWP        offsetof(struct cpu_info, ci_data.cpu_idlelwp)
 define CPU_INFO_PMAP           offsetof(struct cpu_info, ci_pmap)
-define CPU_INFO_CPUMASK        offsetof(struct cpu_info, ci_cpumask)
 define CPU_INFO_RSP0           offsetof(struct cpu_info, ci_tss.tss_rsp0)
 define CPU_INFO_NSYSCALL       offsetof(struct cpu_info, ci_data.cpu_nsyscall)
 define CPU_INFO_NTRAP          offsetof(struct cpu_info, ci_data.cpu_ntrap)
Index: arch/amd64/amd64/mptramp.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/mptramp.S,v
retrieving revision 1.12
diff -u -p -u -r1.12 mptramp.S
--- arch/amd64/amd64/mptramp.S  20 Apr 2010 15:42:21 -0000      1.12
+++ arch/amd64/amd64/mptramp.S  16 Apr 2012 20:44:24 -0000
@@ -108,7 +108,6 @@
 #define HALTT(x,y) /**/
 #endif
 
-       .globl  _C_LABEL(idle_loop)
        .global _C_LABEL(cpu_spinup_trampoline)
        .global _C_LABEL(cpu_spinup_trampoline_end)
        .global _C_LABEL(cpu_hatch)
@@ -252,7 +251,6 @@ _C_LABEL(cpu_spinup_trampoline_end):        #en
        movl    PCB_CR0(%rsi),%eax
        movq    %rax,%cr0
        call    _C_LABEL(cpu_hatch)
-       jmp     _C_LABEL(idle_loop)
        
        .data
 _C_LABEL(mp_pdirpa):
Index: arch/amd64/include/param.h
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/include/param.h,v
retrieving revision 1.17
diff -u -p -u -r1.17 param.h
--- arch/amd64/include/param.h  4 Feb 2012 17:56:16 -0000       1.17
+++ arch/amd64/include/param.h  16 Apr 2012 20:44:24 -0000
@@ -2,6 +2,11 @@
 
 #ifdef __x86_64__
 
+#ifndef XEN
+/* Must be defined before cpu.h */
+#define        MAXCPUS         256
+#endif
+
 #ifdef _KERNEL
 #include <machine/cpu.h>
 #endif
Index: arch/i386/i386/genassym.cf
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/i386/genassym.cf,v
retrieving revision 1.91
diff -u -p -u -r1.91 genassym.cf
--- arch/i386/i386/genassym.cf  7 Dec 2011 15:47:42 -0000       1.91
+++ arch/i386/i386/genassym.cf  16 Apr 2012 20:44:24 -0000
@@ -287,13 +287,11 @@ define    CPU_INFO_RESCHED        offsetof(struct 
 define CPU_INFO_WANT_PMAPLOAD  offsetof(struct cpu_info, ci_want_pmapload)
 define CPU_INFO_TLBSTATE       offsetof(struct cpu_info, ci_tlbstate)
 define TLBSTATE_VALID          TLBSTATE_VALID
-define CPU_INFO_TLB_EVCNT      offsetof(struct cpu_info, ci_tlb_evcnt)
 define CPU_INFO_CURLWP         offsetof(struct cpu_info, ci_curlwp)
 define CPU_INFO_FPCURLWP       offsetof(struct cpu_info, ci_fpcurlwp)
 define CPU_INFO_CURLDT         offsetof(struct cpu_info, ci_curldt)
 define CPU_INFO_IDLELWP        offsetof(struct cpu_info, ci_data.cpu_idlelwp)
 define CPU_INFO_PMAP           offsetof(struct cpu_info, ci_pmap)
-define CPU_INFO_CPUMASK        offsetof(struct cpu_info, ci_cpumask)
 define CPU_INFO_TSS            offsetof(struct cpu_info, ci_tss)
 define CPU_INFO_TSS_SEL        offsetof(struct cpu_info, ci_tss_sel)
 define CPU_INFO_ESP0           offsetof(struct cpu_info, ci_tss.tss_esp0)
Index: arch/i386/i386/mptramp.S
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/i386/mptramp.S,v
retrieving revision 1.22
diff -u -p -u -r1.22 mptramp.S
--- arch/i386/i386/mptramp.S    28 Jul 2010 17:05:51 -0000      1.22
+++ arch/i386/i386/mptramp.S    16 Apr 2012 20:44:24 -0000
@@ -271,8 +271,6 @@ mp_cont:
        HALTT(0x30,%ecx)        
        pushl   %ecx
        call    _C_LABEL(cpu_hatch)
-       HALT(0x33)
-       jmp     _C_LABEL(idle_loop)
        
        .data
 _C_LABEL(mp_pdirpa):
Index: arch/i386/include/param.h
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/include/param.h,v
retrieving revision 1.76
diff -u -p -u -r1.76 param.h
--- arch/i386/include/param.h   10 Feb 2012 17:35:49 -0000      1.76
+++ arch/i386/include/param.h   16 Apr 2012 20:44:24 -0000
@@ -41,6 +41,13 @@
  * Machine dependent constants for Intel 386.
  */
 
+/*
+ * MAXCPUS must be defined before cpu.h inclusion.  Note: i386 might
+ * support more CPUs, but due to the limited KVA space available on
+ * i386, such support would be inefficient.  Use amd64 instead.
+ */
+#define        MAXCPUS         32
+
 #ifdef _KERNEL
 #include <machine/cpu.h>
 #endif
Index: arch/x86/acpi/acpi_wakeup.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/acpi/acpi_wakeup.c,v
retrieving revision 1.30
diff -u -p -u -r1.30 acpi_wakeup.c
--- arch/x86/acpi/acpi_wakeup.c 10 Apr 2012 13:48:24 -0000      1.30
+++ arch/x86/acpi/acpi_wakeup.c 16 Apr 2012 20:44:25 -0000
@@ -1,7 +1,7 @@
 /*     $NetBSD: acpi_wakeup.c,v 1.30 2012/04/10 13:48:24 jruoho Exp $  */
 
 /*-
- * Copyright (c) 2002 The NetBSD Foundation, Inc.
+ * Copyright (c) 2002, 2011 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -61,11 +61,15 @@ __KERNEL_RCSID(0, "$NetBSD: acpi_wakeup.
  *      FreeBSD: src/sys/i386/acpica/acpi_wakeup.c,v 1.9 2002/01/10 03:26:46 wes Exp
  */
 
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
-#include <sys/proc.h>
+#include <sys/cpu.h>
+#include <sys/kcpuset.h>
 #include <sys/sysctl.h>
 
 #include <uvm/uvm_extern.h>
@@ -209,7 +213,7 @@ acpi_md_sleep_enter(int state)
 #ifdef MULTIPROCESSOR
        if (!CPU_IS_PRIMARY(ci)) {
                atomic_and_32(&ci->ci_flags, ~CPUF_RUNNING);
-               atomic_and_32(&cpus_running, ~ci->ci_cpumask);
+               kcpuset_atomic_clear(kcpuset_running, cpu_index(ci));
 
                ACPI_FLUSH_CPU_CACHE();
 
@@ -277,7 +281,7 @@ acpi_cpu_sleep(struct cpu_info *ci)
 #endif
 
        atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
-       atomic_or_32(&cpus_running, ci->ci_cpumask);
+       kcpuset_atomic_set(kcpuset_running, cpu_index(ci));
        tsc_sync_ap(ci);
 
        x86_enable_intr();
@@ -291,6 +295,7 @@ acpi_md_sleep(int state)
 #ifdef MULTIPROCESSOR
        struct cpu_info *ci;
        CPU_INFO_ITERATOR cii;
+       cpuid_t cid;
 #endif
 
        KASSERT(acpi_wakeup_paddr != 0);
@@ -312,10 +317,12 @@ acpi_md_sleep(int state)
        x86_disable_intr();
 
 #ifdef MULTIPROCESSOR
-       /* Save and suspend Application Processors */
+       /* Save and suspend Application Processors. */
        x86_broadcast_ipi(X86_IPI_ACPI_CPU_SLEEP);
-       while (cpus_running != curcpu()->ci_cpumask)
+       cid = cpu_index(curcpu());
+       while (!kcpuset_isotherset(kcpuset_running, cid)) {
                delay(1);
+       }
 #endif
 
        if (acpi_md_sleep_prepare(state))
Index: arch/x86/include/cpu.h
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/include/cpu.h,v
retrieving revision 1.49
diff -u -p -u -r1.49 cpu.h
--- arch/x86/include/cpu.h      2 Mar 2012 16:43:31 -0000       1.49
+++ arch/x86/include/cpu.h      16 Apr 2012 20:44:25 -0000
@@ -105,7 +105,7 @@ struct cpu_info {
        int     ci_fpsaving;            /* save in progress */
        int     ci_fpused;              /* XEN: FPU was used by curlwp */
        cpuid_t ci_cpuid;               /* our CPU ID */
-       int     ci_cpumask;             /* (1 << CPU ID) */
+       int     _unused;
        uint32_t ci_acpiid;             /* our ACPI/MADT ID */
        uint32_t ci_initapicid;         /* our intitial APIC ID */
 
@@ -323,8 +323,6 @@ void cpu_load_pmap(struct pmap *, struct
 void cpu_broadcast_halt(void);
 void cpu_kick(struct cpu_info *);
 
-extern uint32_t cpus_attached;
-
 #define        curcpu()                x86_curcpu()
 #define        curlwp                  x86_curlwp()
 #define        curpcb                  ((struct pcb *)lwp_getpcb(curlwp))
Index: arch/x86/include/cpuvar.h
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/include/cpuvar.h,v
retrieving revision 1.45
diff -u -p -u -r1.45 cpuvar.h
--- arch/x86/include/cpuvar.h   13 Aug 2011 12:37:30 -0000      1.45
+++ arch/x86/include/cpuvar.h   16 Apr 2012 20:44:25 -0000
@@ -95,13 +95,11 @@ struct cpufeature_attach_args {
 };
 
 #ifdef _KERNEL
-
+#include <sys/kcpuset.h>
 #if defined(_KERNEL_OPT)
 #include "opt_multiprocessor.h"
 #endif /* defined(_KERNEL_OPT) */
 
-extern uint32_t cpus_running;
-
 int x86_ipi(int, int, int);
 void x86_self_ipi(int);
 int x86_ipi_init(int);
Index: arch/x86/include/pmap.h
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/include/pmap.h,v
retrieving revision 1.51
diff -u -p -u -r1.51 pmap.h
--- arch/x86/include/pmap.h     11 Mar 2012 16:28:02 -0000      1.51
+++ arch/x86/include/pmap.h     16 Apr 2012 20:44:25 -0000
@@ -108,6 +108,8 @@
 
 
 #if defined(_KERNEL)
+#include <sys/kcpuset.h>
+
 /*
  * pmap data structures: see pmap.c for details of locking.
  */
@@ -162,10 +164,10 @@ struct pmap {
        union descriptor *pm_ldt;       /* user-set LDT */
        size_t pm_ldt_len;              /* size of LDT in bytes */
        int pm_ldt_sel;                 /* LDT selector */
-       uint32_t pm_cpus;               /* mask of CPUs using pmap */
-       uint32_t pm_kernel_cpus;        /* mask of CPUs using kernel part
+       kcpuset_t *pm_cpus;             /* mask of CPUs using pmap */
+       kcpuset_t *pm_kernel_cpus;      /* mask of CPUs using kernel part
                                         of pmap */
-       uint32_t pm_xen_ptp_cpus;       /* mask of CPUs which have this pmap's
+       kcpuset_t *pm_xen_ptp_cpus;     /* mask of CPUs which have this pmap's
                                         ptp mapped */
        uint64_t pm_ncsw;               /* for assertions */
        struct vm_page *pm_gc_ptp;      /* pages from pmap g/c */
@@ -289,6 +291,7 @@ typedef enum tlbwhy {
 } tlbwhy_t;
 
 void           pmap_tlb_init(void);
+void           pmap_tlb_cpu_init(struct cpu_info *);
 void           pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, tlbwhy_t);
 void           pmap_tlb_shootnow(void);
 void           pmap_tlb_intr(void);
Index: arch/x86/x86/cpu.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/cpu.c,v
retrieving revision 1.97
diff -u -p -u -r1.97 cpu.c
--- arch/x86/x86/cpu.c  17 Feb 2012 18:40:19 -0000      1.97
+++ arch/x86/x86/cpu.c  16 Apr 2012 20:44:25 -0000
@@ -1,7 +1,7 @@
 /*     $NetBSD: cpu.c,v 1.97 2012/02/17 18:40:19 bouyer Exp $  */
 
 /*-
- * Copyright (c) 2000, 2006, 2007, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2000-2012 NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -82,6 +82,7 @@ __KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.97
 #include <sys/kmem.h>
 #include <sys/cpu.h>
 #include <sys/cpufreq.h>
+#include <sys/idle.h>
 #include <sys/atomic.h>
 #include <sys/reboot.h>
 
@@ -114,10 +115,6 @@ __KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.97
 
 #include "tsc.h"
 
-#if MAXCPUS > 32
-#error cpu_info contains 32bit bitmasks
-#endif
-
 static int     cpu_match(device_t, cfdata_t, void *);
 static void    cpu_attach(device_t, device_t, void *);
 static void    cpu_defer(device_t);
@@ -157,7 +154,6 @@ struct cpu_info cpu_info_primary __align
        .ci_idepth = -1,
        .ci_curlwp = &lwp0,
        .ci_curldt = -1,
-       .ci_cpumask = 1,
 #ifdef TRAPLOG
        .ci_tlog_base = &tlog_primary,
 #endif /* !TRAPLOG */
@@ -173,9 +169,6 @@ static void tss_init(struct i386tss *, v
 
 static void    cpu_init_idle_lwp(struct cpu_info *);
 
-uint32_t cpus_attached = 0;
-uint32_t cpus_running = 1;
-
 uint32_t cpu_feature[5]; /* X86 CPUID feature bits
                          *     [0] basic features %edx
                          *     [1] basic features %ecx
@@ -271,8 +264,9 @@ cpu_vm_init(struct cpu_info *ci)
         */
        aprint_debug_dev(ci->ci_dev, "%d page colors\n", ncolors);
        uvm_page_recolor(ncolors);
-}
 
+       pmap_tlb_cpu_init(ci);
+}
 
 static void
 cpu_attach(device_t parent, device_t self, void *aux)
@@ -286,8 +280,12 @@ cpu_attach(device_t parent, device_t sel
 
        sc->sc_dev = self;
 
-       if (cpus_attached == ~0) {
-               aprint_error(": increase MAXCPUS\n");
+       if (ncpu == maxcpus) {
+#ifndef _LP64
+               aprint_error(": too many CPUs, please use NetBSD/amd64\n");
+#else
+               aprint_error(": too many CPUs\n");
+#endif
                return;
        }
 
@@ -356,7 +354,6 @@ cpu_attach(device_t parent, device_t sel
                KASSERT(ci->ci_data.cpu_idlelwp != NULL);
        }
 
-       ci->ci_cpumask = (1 << cpu_index(ci));
        pmap_reference(pmap_kernel());
        ci->ci_pmap = pmap_kernel();
        ci->ci_tlbstate = TLBSTATE_STALE;
@@ -428,7 +425,6 @@ cpu_attach(device_t parent, device_t sel
        }
 
        pat_init(ci);
-       atomic_or_32(&cpus_attached, ci->ci_cpumask);
 
        if (!pmf_device_register1(self, cpu_suspend, cpu_resume, cpu_shutdown))
                aprint_error_dev(self, "couldn't establish power handler\n");
@@ -579,8 +575,6 @@ cpu_init(struct cpu_info *ci)
 #endif /* i386 */
 #endif /* MTRR */
 
-       atomic_or_32(&cpus_running, ci->ci_cpumask);
-
        if (ci != &cpu_info_primary) {
                /* Synchronize TSC again, and check for drift. */
                wbinvd();
@@ -839,6 +833,9 @@ cpu_hatch(void *v)
        x86_errata();
 
        aprint_debug_dev(ci->ci_dev, "running\n");
+
+       idle_loop(NULL);
+       KASSERT(false);
 }
 
 #if defined(DDB)
Index: arch/x86/x86/mtrr_i686.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/mtrr_i686.c,v
retrieving revision 1.25
diff -u -p -u -r1.25 mtrr_i686.c
--- arch/x86/x86/mtrr_i686.c    15 Dec 2011 09:38:21 -0000      1.25
+++ arch/x86/x86/mtrr_i686.c    16 Apr 2012 20:44:25 -0000
@@ -1,7 +1,7 @@
 /*     $NetBSD: mtrr_i686.c,v 1.25 2011/12/15 09:38:21 abs Exp $ */
 
 /*-
- * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 2000, 2011 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -36,10 +36,11 @@ __KERNEL_RCSID(0, "$NetBSD: mtrr_i686.c,
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/malloc.h>
+
 #include <sys/atomic.h>
 #include <sys/cpu.h>
+#include <sys/kmem.h>
+#include <sys/proc.h>
 
 #include <uvm/uvm_extern.h>
 
@@ -133,11 +134,9 @@ struct mtrr_funcs i686_mtrr_funcs = {
        i686_mtrr_dump
 };
 
-#ifdef MULTIPROCESSOR
-static volatile uint32_t mtrr_waiting;
-#endif
+static kcpuset_t *             mtrr_waiting;
 
-static uint64_t i686_mtrr_cap;
+static uint64_t                        i686_mtrr_cap;
 
 static void
 i686_mtrr_dump(const char *tag)
@@ -174,14 +173,10 @@ i686_mtrr_reload(int synch)
        vaddr_t cr3, cr4;
        uint32_t origcr0;
        vaddr_t origcr4;
-#ifdef MULTIPROCESSOR
-       uint32_t mymask = 1 << cpu_number();
-#endif
 
        /*
         * 2. Disable interrupts
         */
-
        x86_disable_intr();
 
 #ifdef MULTIPROCESSOR
@@ -189,11 +184,10 @@ i686_mtrr_reload(int synch)
                /*
                 * 3. Wait for all processors to reach this point.
                 */
-
-               atomic_or_32(&mtrr_waiting, mymask);
-
-               while (mtrr_waiting != cpus_running)
+               kcpuset_atomic_set(mtrr_waiting, cpu_index(curcpu()));
+               while (!kcpuset_match(mtrr_waiting, kcpuset_running)) {
                        DELAY(10);
+               }
        }
 #endif
 
@@ -289,10 +283,10 @@ i686_mtrr_reload(int synch)
                /*
                 * 14. Wait for all processors to reach this point.
                 */
-               atomic_and_32(&mtrr_waiting, ~mymask);
-
-               while (mtrr_waiting != 0)
+               kcpuset_atomic_clear(mtrr_waiting, cpu_index(curcpu()));
+               while (!kcpuset_iszero(mtrr_waiting)) {
                        DELAY(10);
+               }
        }
 #endif
 
@@ -326,25 +320,25 @@ i686_mtrr_init_first(void)
                }
        }
 
-       for (i = 0; i < nmtrr_raw; i++)
+       for (i = 0; i < nmtrr_raw; i++) {
                if (mtrr_raw[i].msraddr)
                        mtrr_raw[i].msrval = rdmsr(mtrr_raw[i].msraddr);
                else
                        mtrr_raw[i].msrval = 0;
+       }
 #if 0
        mtrr_dump("init mtrr");
 #endif
 
-       mtrr_fixed = (struct mtrr *)
-           malloc(MTRR_I686_NFIXED_SOFT * sizeof (struct mtrr), M_TEMP,
-                  M_NOWAIT);
-       if (mtrr_fixed == NULL)
-               panic("can't allocate fixed MTRR array");
-
-       mtrr_var = (struct mtrr *)
-           malloc(i686_mtrr_vcnt * sizeof (struct mtrr), M_TEMP, M_NOWAIT);
-       if (mtrr_var == NULL)
-               panic("can't allocate variable MTRR array");
+       kcpuset_create(&mtrr_waiting, true);
+
+       mtrr_fixed =
+           kmem_zalloc(MTRR_I686_NFIXED_SOFT * sizeof(struct mtrr), KM_SLEEP);
+       KASSERT(mtrr_fixed != NULL);
+
+       mtrr_var =
+           kmem_zalloc(i686_mtrr_vcnt * sizeof(struct mtrr), KM_SLEEP);
+       KASSERT(mtrr_var != NULL);
 
        mtrr_var_raw = &mtrr_raw[0];
        mtrr_fixed_raw = &mtrr_raw[MTRR_I686_NVAR_MAX * 2];
@@ -767,9 +761,12 @@ i686_mtrr_get(struct mtrr *mtrrp, int *n
 static void
 i686_mtrr_commit(void)
 {
+
        i686_soft2raw();
+       kpreempt_disable();
 #ifdef MULTIPROCESSOR
        x86_broadcast_ipi(X86_IPI_MTRR);
 #endif
        i686_mtrr_reload(1);
+       kpreempt_enable();
 }
Index: arch/x86/x86/pmap.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/pmap.c,v
retrieving revision 1.176
diff -u -p -u -r1.176 pmap.c
--- arch/x86/x86/pmap.c 25 Feb 2012 20:03:58 -0000      1.176
+++ arch/x86/x86/pmap.c 16 Apr 2012 20:44:25 -0000
@@ -723,7 +723,6 @@ pmap_map_ptes(struct pmap *pmap, struct 
 {
        struct pmap *curpmap;
        struct cpu_info *ci;
-       uint32_t cpumask;
        lwp_t *l;
 
        /* The kernel's pmap is always accessible. */
@@ -765,13 +764,14 @@ pmap_map_ptes(struct pmap *pmap, struct 
                 * The reference will be dropped by pmap_unmap_ptes().
                 * Can happen if we block during exit().
                 */
-               cpumask = ci->ci_cpumask;
-               atomic_and_32(&curpmap->pm_cpus, ~cpumask);
-               atomic_and_32(&curpmap->pm_kernel_cpus, ~cpumask);
+               const cpuid_t cid = cpu_index(ci);
+
+               kcpuset_atomic_clear(curpmap->pm_cpus, cid);
+               kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
                ci->ci_pmap = pmap;
                ci->ci_tlbstate = TLBSTATE_VALID;
-               atomic_or_32(&pmap->pm_cpus, cpumask);
-               atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
+               kcpuset_atomic_set(pmap->pm_cpus, cid);
+               kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
                cpu_load_pmap(pmap, curpmap);
        }
        pmap->pm_ncsw = l->l_ncsw;
@@ -1048,8 +1048,7 @@ pmap_emap_sync(bool canload)
                 */
                pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
                if (__predict_false(pmap == ci->ci_pmap)) {
-                       const uint32_t cpumask = ci->ci_cpumask;
-                       atomic_and_32(&pmap->pm_cpus, ~cpumask);
+                       kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
                }
                pmap_load();
                KASSERT(ci->ci_want_pmapload == 0);
@@ -1234,6 +1233,9 @@ pmap_bootstrap(vaddr_t kva_start)
        kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
                x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
 
+       kcpuset_create(&kpm->pm_cpus, true);
+       kcpuset_create(&kpm->pm_kernel_cpus, true);
+
        /*
         * the above is just a rough estimate and not critical to the proper
         * operation of the system.
@@ -1651,6 +1653,9 @@ pmap_init(void)
 
        pmap_tlb_init();
 
+       /* XXX: Since cpu_hatch() is only for secondary CPUs. */
+       pmap_tlb_cpu_init(curcpu());
+
        evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
            NULL, "x86", "io bitmap copy");
        evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
@@ -1896,9 +1901,8 @@ pmap_free_ptp(struct pmap *pmap, struct 
                        /*
                         * Update the per-cpu PD on all cpus the current
                         * pmap is active on 
-                        */ 
+                        */
                        xen_kpm_sync(pmap, index);
-
                }
 #  endif /*__x86_64__ */
                invaladdr = level == 1 ? (vaddr_t)ptes :
@@ -1988,7 +1992,7 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t 
                        /*
                         * Update the per-cpu PD on all cpus the current
                         * pmap is active on 
-                        */ 
+                        */
                        xen_kpm_sync(pmap, index);
                }
 #endif /* XEN && __x86_64__ */
@@ -2010,7 +2014,7 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t 
        }
 
        /*
-        * ptp is not NULL if we just allocated a new ptp. If it's
+        * PTP is not NULL if we just allocated a new PTP.  If it is
         * still NULL, we must look up the existing one.
         */
        if (ptp == NULL) {
@@ -2022,10 +2026,11 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t 
                        panic("pmap_get_ptp: unmanaged user PTP");
                }
 #endif
+               KASSERT(ptp != NULL);
        }
 
        pmap->pm_ptphint[0] = ptp;
-       return(ptp);
+       return ptp;
 }
 
 /*
@@ -2200,12 +2205,8 @@ pmap_pdp_free(struct pool *pp, void *v)
 #endif /* PAE */
 
 /*
- * pmap_create: create a pmap
- *
- * => note: old pmap interface took a "size" args which allowed for
- *     the creation of "software only" pmaps (not in bsd).
+ * pmap_create: create a pmap object.
  */
-
 struct pmap *
 pmap_create(void)
 {
@@ -2228,11 +2229,13 @@ pmap_create(void)
        pmap->pm_hiexec = 0;
 #endif /* !defined(__x86_64__) */
        pmap->pm_flags = 0;
-       pmap->pm_cpus = 0;
-       pmap->pm_kernel_cpus = 0;
-       pmap->pm_xen_ptp_cpus = 0;
        pmap->pm_gc_ptp = NULL;
 
+       kcpuset_create(&pmap->pm_cpus, true);
+       kcpuset_create(&pmap->pm_kernel_cpus, true);
+#ifdef XEN
+       kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
+#endif
        /* init the LDT */
        pmap->pm_ldt = NULL;
        pmap->pm_ldt_len = 0;
@@ -2287,12 +2290,8 @@ pmap_free_ptps(struct vm_page *empty_ptp
 void
 pmap_destroy(struct pmap *pmap)
 {
-       int i;
-#ifdef DIAGNOSTIC
-       struct cpu_info *ci;
-       CPU_INFO_ITERATOR cii;
-#endif /* DIAGNOSTIC */
        lwp_t *l;
+       int i;
 
        /*
         * If we have torn down this pmap, process deferred frees and
@@ -2321,6 +2320,9 @@ pmap_destroy(struct pmap *pmap)
        }
 
 #ifdef DIAGNOSTIC
+       CPU_INFO_ITERATOR cii;
+       struct cpu_info *ci;
+
        for (CPU_INFO_FOREACH(cii, ci)) {
                if (ci->ci_pmap == pmap)
                        panic("destroying pmap being used");
@@ -2344,11 +2346,8 @@ pmap_destroy(struct pmap *pmap)
 #endif /* DIAGNOSTIC */
 
        /*
-        * reference count is zero, free pmap resources and then free pmap.
-        */
-
-       /*
-        * remove it from global list of pmaps
+        * Reference count is zero, free pmap resources and then free pmap.
+        * First, remove it from global list of pmaps.
         */
 
        mutex_enter(&pmaps_lock);
@@ -2394,6 +2393,11 @@ pmap_destroy(struct pmap *pmap)
                uvm_obj_destroy(&pmap->pm_obj[i], false);
                mutex_destroy(&pmap->pm_obj_lock[i]);
        }
+       kcpuset_destroy(pmap->pm_cpus);
+       kcpuset_destroy(pmap->pm_kernel_cpus);
+#ifdef XEN
+       kcpuset_destroy(pmap->pm_xen_ptp_cpus);
+#endif
        pool_cache_put(&pmap_cache, pmap);
 }
 
@@ -2596,19 +2600,15 @@ pmap_activate(struct lwp *l)
 /*
  * pmap_reactivate: try to regain reference to the pmap.
  *
- * => must be called with kernel preemption disabled
+ * => Must be called with kernel preemption disabled.
  */
 
 static bool
 pmap_reactivate(struct pmap *pmap)
 {
-       struct cpu_info *ci;
-       uint32_t cpumask;
-       bool result;    
-       uint32_t oldcpus;
-
-       ci = curcpu();
-       cpumask = ci->ci_cpumask;
+       struct cpu_info * const ci = curcpu();
+       const cpuid_t cid = cpu_index(ci);
+       bool result;
 
        KASSERT(kpreempt_disabled());
 #if defined(XEN) && defined(__x86_64__)
@@ -2620,53 +2620,48 @@ pmap_reactivate(struct pmap *pmap)
 #endif
 
        /*
-        * if we still have a lazy reference to this pmap,
-        * we can assume that there was no tlb shootdown
-        * for this pmap in the meantime.
+        * If we still have a lazy reference to this pmap, we can assume
+        * that there was no TLB shootdown for this pmap in the meantime.
         *
-        * the order of events here is important as we must
-        * synchronize with TLB shootdown interrupts.  declare
-        * interest in invalidations (TLBSTATE_VALID) and then
-        * check the cpumask, which the IPIs can change only
-        * when the state is TLBSTATE_LAZY.
+        * The order of events here is important as we must synchronize
+        * with TLB shootdown interrupts.  Declare interest in invalidations
+        * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
+        * change only when the state is TLBSTATE_LAZY.
         */
 
        ci->ci_tlbstate = TLBSTATE_VALID;
-       oldcpus = pmap->pm_cpus;
-       KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
-       if (oldcpus & cpumask) {
-               /* got it */
+       KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
+
+       if (kcpuset_isset(pmap->pm_cpus, cid)) {
+               /* We have the reference, state is valid. */
                result = true;
        } else {
-               /* must reload */
-               atomic_or_32(&pmap->pm_cpus, cpumask);
+               /* Must reload the TLB. */
+               kcpuset_atomic_set(pmap->pm_cpus, cid);
                result = false;
        }
-
        return result;
 }
 
 /*
- * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
- *
- * ensures that the current process' pmap is loaded on the current cpu's MMU
- * and there's no stale TLB entries.
+ * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
+ * and relevant LDT info.
  *
- * the caller should disable preemption or do check-and-retry to prevent
- * a preemption from undoing our efforts.
+ * Ensures that the current process' pmap is loaded on the current CPU's
+ * MMU and that there are no stale TLB entries.
  *
- * this function can block.
+ * => The caller should disable kernel preemption or do check-and-retry
+ *    to prevent a preemption from undoing our efforts.
+ * => This function may block.
  */
-
 void
 pmap_load(void)
 {
        struct cpu_info *ci;
-       uint32_t cpumask;
-       struct pmap *pmap;
-       struct pmap *oldpmap;
+       struct pmap *pmap, *oldpmap;
        struct lwp *l;
        struct pcb *pcb;
+       cpuid_t cid;
        uint64_t ncsw;
 
        kpreempt_disable();
@@ -2676,7 +2671,6 @@ pmap_load(void)
                kpreempt_enable();
                return;
        }
-       cpumask = ci->ci_cpumask;
        l = ci->ci_curlwp;
        ncsw = l->l_ncsw;
 
@@ -2714,17 +2708,14 @@ pmap_load(void)
        }
 
        /*
-        * grab a reference to the new pmap.
+        * Acquire a reference to the new pmap and perform the switch.
         */
 
        pmap_reference(pmap);
 
-       /*
-        * actually switch pmap.
-        */
-
-       atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
-       atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
+       cid = cpu_index(ci);
+       kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
+       kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
 
 #if defined(XEN) && defined(__x86_64__)
        KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
@@ -2734,19 +2725,17 @@ pmap_load(void)
 #elif !defined(XEN)
        KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
 #endif
-       KASSERT((pmap->pm_cpus & cpumask) == 0);
-       KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
+       KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
+       KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
 
        /*
-        * mark the pmap in use by this processor.  again we must
-        * synchronize with TLB shootdown interrupts, so set the
-        * state VALID first, then register us for shootdown events
-        * on this pmap.
+        * Mark the pmap in use by this CPU.  Again, we must synchronize
+        * with TLB shootdown interrupts, so set the state VALID first,
+        * then register us for shootdown events on this pmap.
         */
-
        ci->ci_tlbstate = TLBSTATE_VALID;
-       atomic_or_32(&pmap->pm_cpus, cpumask);
-       atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
+       kcpuset_atomic_set(pmap->pm_cpus, cid);
+       kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
        ci->ci_pmap = pmap;
 
        /*
Index: arch/x86/x86/pmap_tlb.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/pmap_tlb.c,v
retrieving revision 1.4
diff -u -p -u -r1.4 pmap_tlb.c
--- arch/x86/x86/pmap_tlb.c     4 Dec 2011 04:28:41 -0000       1.4
+++ arch/x86/x86/pmap_tlb.c     16 Apr 2012 20:44:25 -0000
@@ -1,7 +1,7 @@
 /*     $NetBSD: pmap_tlb.c,v 1.4 2011/12/04 04:28:41 cherry Exp $      */
 
 /*-
- * Copyright (c) 2008-2011 The NetBSD Foundation, Inc.
+ * Copyright (c) 2008-2012 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -32,10 +32,10 @@
 /*
  * x86 pmap(9) module: TLB shootdowns.
  *
- * TLB shootdowns are hard interrupts that operate outside the SPL framework:
+ * TLB shootdowns are hard interrupts that operate outside the SPL framework.
  * They do not need to be blocked, provided that the pmap module gets the
  * order of events correct.  The calls are made by poking the LAPIC directly.
- * The interrupt handler is short and does one of the following:  invalidate
+ * The interrupt handler is short and does one of the following: invalidate
  * a set of pages, all user TLB entries or the entire TLB.
  */
 
@@ -70,19 +70,17 @@ typedef struct {
 #endif
        uint16_t                tp_count;
        uint16_t                tp_pte;
-       uint32_t                tp_cpumask;
-       uint32_t                tp_usermask;
+       int                     tp_userpmap;
+       kcpuset_t *             tp_cpumask;
 } pmap_tlb_packet_t;
 
 /* No more than N seperate invlpg. */
 #define        TP_MAXVA                6
 
 typedef struct {
-       volatile uint32_t       tm_pending;
-       volatile uint32_t       tm_gen;
-       uint32_t                tm_usergen;
-       uint32_t                tm_globalgen;
-       char                    tm_pad[64 - sizeof(uintptr_t) * 4];
+       kcpuset_t *             tm_pending;
+       volatile u_int          tm_pendcount;
+       volatile u_int          tm_gen;
 } pmap_tlb_mailbox_t;
 
 /*
@@ -126,6 +124,8 @@ pmap_tlb_init(void)
        memset(&pmap_tlb_packet, 0, sizeof(pmap_tlb_packet_t));
        memset(&pmap_tlb_mailbox, 0, sizeof(pmap_tlb_mailbox_t));
 
+       kcpuset_create(&pmap_tlb_mailbox.tm_pending, true);
+
        evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
            NULL, "TLB", "shootdown");
 
@@ -151,11 +151,21 @@ pmap_tlb_init(void)
 #endif
 }
 
+void
+pmap_tlb_cpu_init(struct cpu_info *ci)
+{
+       pmap_tlb_packet_t *tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;
+
+       memset(tp, 0, sizeof(pmap_tlb_packet_t));
+       kcpuset_create(&tp->tp_cpumask, true);
+}
+
 static inline void
 pmap_tlbstat_count(struct pmap *pm, vaddr_t va, tlbwhy_t why)
 {
 #ifdef TLBSTATS
-       uint32_t mask;
+       const cpuid_t cid = cpu_index(curcpu());
+       bool local = false, remote = false;
 
        if (va != (vaddr_t)-1LL) {
                atomic_inc_64(&tlbstat_single_req.ev_count);
@@ -164,15 +174,18 @@ pmap_tlbstat_count(struct pmap *pm, vadd
                atomic_inc_64(&tlbstat_kernel[why].ev_count);
                return;
        }
+
        if (va >= VM_MAXUSER_ADDRESS) {
-               mask = pm->pm_cpus | pm->pm_kernel_cpus;
-       } else {
-               mask = pm->pm_cpus;
+               remote = kcpuset_isotherset(pm->pm_kernel_cpus, cid);
+               local = kcpuset_isset(pm->pm_kernel_cpus, cid);
        }
-       if ((mask & curcpu()->ci_cpumask) != 0) {
+       remote |= kcpuset_isotherset(pm->pm_cpus, cid);
+       local |= kcpuset_isset(pm->pm_cpus, cid);
+
+       if (local) {
                atomic_inc_64(&tlbstat_local[why].ev_count);
        }
-       if ((mask & ~curcpu()->ci_cpumask) != 0) {
+       if (remote) {
                atomic_inc_64(&tlbstat_remote[why].ev_count);
        }
 #endif
@@ -203,7 +216,7 @@ pmap_tlb_invalidate(pmap_tlb_packet_t *t
 }
 
 /*
- * pmap_tlb_shootdown: invalidate a page on all CPUs using pmap 'pm'
+ * pmap_tlb_shootdown: invalidate a page on all CPUs using pmap 'pm'.
  */
 void
 pmap_tlb_shootdown(struct pmap *pm, vaddr_t va, pt_entry_t pte, tlbwhy_t why)
@@ -229,7 +242,7 @@ pmap_tlb_shootdown(struct pmap *pm, vadd
 
        /*
         * Add the shootdown operation to our pending set.
-        */ 
+        */
        s = splvm();
        tp = (pmap_tlb_packet_t *)curcpu()->ci_pmap_data;
 
@@ -250,14 +263,16 @@ pmap_tlb_shootdown(struct pmap *pm, vadd
                tp->tp_count = (uint16_t)-1;
        }
 
-       if (pm == pmap_kernel()) {
-               tp->tp_cpumask = cpus_running;
-       } else if (va >= VM_MAXUSER_ADDRESS) {
-               tp->tp_cpumask |= (pm->pm_cpus | pm->pm_kernel_cpus);
-               tp->tp_usermask |= (pm->pm_cpus | pm->pm_kernel_cpus);
+       if (pm != pmap_kernel()) {
+               kcpuset_copy(tp->tp_cpumask, pm->pm_cpus);
+               if (va >= VM_MAXUSER_ADDRESS) {
+                       kcpuset_merge(tp->tp_cpumask, pm->pm_kernel_cpus);
+               }
+               kcpuset_intersect(tp->tp_cpumask, kcpuset_running);
+               tp->tp_userpmap = 1;
        } else {
-               tp->tp_cpumask |= pm->pm_cpus;
-               tp->tp_usermask |= pm->pm_cpus;
+               kcpuset_copy(tp->tp_cpumask, kcpuset_running);
+               tp->tp_userpmap = 0;
        }
        pmap_tlbstat_count(pm, va, why);
        splx(s);
@@ -265,59 +280,54 @@ pmap_tlb_shootdown(struct pmap *pm, vadd
 
 #ifdef MULTIPROCESSOR
 #ifdef XEN
-static inline
-void pmap_tlb_processpacket(pmap_tlb_packet_t *tp)
+
+static inline void
+pmap_tlb_processpacket(pmap_tlb_packet_t *tp, kcpuset_t *target)
 {
-       struct cpu_info *self = curcpu();
-       if (tp->tp_count == (uint16_t)-1) {
-               xen_mcast_tlbflush(tp->tp_cpumask &
-                                  cpus_running &
-                                  ~self->ci_cpumask);
-       } else {
+       pmap_tlb_mailbox_t *tm = &pmap_tlb_mailbox;
+
+       if (tp->tp_count != (uint16_t)-1) {
                /* Invalidating a single page or a range of pages. */
-               int i;
-               for (i = tp->tp_count - 1; i >= 0; i--) {
-                       xen_mcast_invlpg(tp->tp_va[i],
-                                        tp->tp_cpumask & 
-                                        cpus_running &
-                                        ~self->ci_cpumask);
+               for (int i = tp->tp_count - 1; i >= 0; i--) {
+                       xen_mcast_invlpg(tp->tp_va[i], target);
                }
+       } else {
+               xen_mcast_tlbflush(target);
        }
 
-       /* Ack the request */
-       atomic_and_32(&pmap_tlb_mailbox.tm_pending, ~tp->tp_cpumask);
+       /* Remote CPUs have been synchronously flushed. */
+       tm->tm_pendcount = 0;
 }
-#else /* XEN */
-static inline 
-void pmap_tlb_processpacket(pmap_tlb_packet_t *tp)
+
+#else
+
+static inline void
+pmap_tlb_processpacket(pmap_tlb_packet_t *tp, kcpuset_t *target)
 {
        int err = 0;
-       CPU_INFO_ITERATOR cii;
-       struct cpu_info *lci;
 
-       if (tp->tp_cpumask == cpus_running) {
-               err = x86_ipi(LAPIC_TLB_VECTOR, LAPIC_DEST_ALLEXCL,
-                   LAPIC_DLMODE_FIXED);
-       } else {
-               struct cpu_info *self = curcpu();
+       if (!kcpuset_match(target, kcpuset_running)) {
+               const struct cpu_info * const self = curcpu();
+               CPU_INFO_ITERATOR cii;
+               struct cpu_info *lci;
+
                for (CPU_INFO_FOREACH(cii, lci)) {
-                       if (__predict_false(lci == self)) {
-                               continue;
-                       }
-                       if ((lci->ci_cpumask & pmap_tlb_mailbox.tm_pending) == 0) {
+                       const cpuid_t lcid = cpu_index(lci);
+
+                       if (__predict_false(lci == self) ||
+                           !kcpuset_isset(target, lcid)) {
                                continue;
                        }
-                       KASSERT(lci->ci_flags & CPUF_RUNNING);
-
                        err |= x86_ipi(LAPIC_TLB_VECTOR,
-                                      lci->ci_cpuid, LAPIC_DLMODE_FIXED);
+                           lci->ci_cpuid, LAPIC_DLMODE_FIXED);
                }
+       } else {
+               err = x86_ipi(LAPIC_TLB_VECTOR, LAPIC_DEST_ALLEXCL,
+                   LAPIC_DLMODE_FIXED);
        }
-
-       if (__predict_false(err != 0)) {
-               panic("pmap_tlb_shootdown: IPI failed");
-       }
+       KASSERT(err == 0);
 }
+
 #endif /* XEN */
 #endif /* MULTIPROCESSOR */
 
@@ -332,8 +342,9 @@ pmap_tlb_shootnow(void)
        pmap_tlb_packet_t *tp;
        pmap_tlb_mailbox_t *tm;
        struct cpu_info *ci;
-       uint32_t remote;
-       uintptr_t gen;
+       kcpuset_t *target;
+       u_int local, gen, rcpucount;
+       cpuid_t cid;
        int s;
 
        KASSERT(kpreempt_disabled());
@@ -351,22 +362,30 @@ pmap_tlb_shootnow(void)
                splx(s);
                return;
        }
-       gen = 0; /* XXXgcc */
        tm = &pmap_tlb_mailbox;
-       remote = tp->tp_cpumask & ~ci->ci_cpumask;
+       cid = cpu_index(ci);
+
+       target = tp->tp_cpumask;
+       local = kcpuset_isset(target, cid) ? 1 : 0;
+       rcpucount = kcpuset_countset(target) - local;
+       gen = 0;
 
 #ifdef MULTIPROCESSOR
-       if (remote != 0) {
+       if (rcpucount) {
                int count;
+
                /*
                 * Gain ownership of the shootdown mailbox.  We must stay
                 * at IPL_VM once we own it or could deadlock against an
                 * interrupt on this CPU trying to do the same.
                 */
-               while (atomic_cas_32(&tm->tm_pending, 0, remote) != 0) {
+               KASSERT(rcpucount < ncpu);
+
+               while (atomic_cas_uint(&tm->tm_pendcount, 0, rcpucount) != 0) {
                        splx(s);
                        count = SPINLOCK_BACKOFF_MIN;
-                       while (tm->tm_pending != 0) {
+                       while (tm->tm_pendcount != 0) {
+                               KASSERT(tm->tm_pendcount < ncpu);
                                SPINLOCK_BACKOFF(count);
                        }
                        s = splvm();
@@ -383,29 +402,21 @@ pmap_tlb_shootnow(void)
                 */
                gen = ++tm->tm_gen;
                memcpy(&pmap_tlb_packet, tp, sizeof(*tp));
+               kcpuset_copy(tm->tm_pending, target);
                pmap_tlb_evcnt.ev_count++;
 
                /*
                 * Initiate shootdowns on remote CPUs.
                 */
-               /* Trim mailbox wait to only for CPUF_RUNNING cpus */
-               atomic_and_32(&tm->tm_pending, cpus_running);
-
-               pmap_tlb_processpacket(tp);
-#ifdef XEN
-               /* 
-                * remote CPUs have been synchronously flushed
-                */
-               remote = 0; 
-#endif /* XEN */
+               pmap_tlb_processpacket(tp, target);
        }
-#endif /* MULTIPROCESSOR */
+#endif
 
        /*
         * Shootdowns on remote CPUs are now in flight.  In the meantime,
-        * perform local shootdowns and do not forget to update emap gen.
+        * perform local shootdown if needed.
         */
-       if ((tp->tp_cpumask & ci->ci_cpumask) != 0) {
+       if (local) {
                pmap_tlb_invalidate(tp);
        }
 
@@ -417,26 +428,28 @@ pmap_tlb_shootnow(void)
                atomic_add_64(&tlbstat_single_issue.ev_count, tp->tp_count);
        }
 #endif
+       kcpuset_zero(tp->tp_cpumask);
+       tp->tp_userpmap = 0;
        tp->tp_count = 0;
        tp->tp_pte = 0;
-       tp->tp_cpumask = 0;
-       tp->tp_usermask = 0;
        splx(s);
 
        /*
         * Now wait for the current generation of updates to be
         * processed by remote CPUs.
         */
-       if (remote != 0 && tm->tm_pending != 0) {
+       if (rcpucount && tm->tm_pendcount) {
                int count = SPINLOCK_BACKOFF_MIN;
-               while (tm->tm_pending != 0 && tm->tm_gen == gen) {
+
+               while (tm->tm_pendcount && tm->tm_gen == gen) {
+                       KASSERT(tm->tm_pendcount < ncpu);
                        SPINLOCK_BACKOFF(count);
                }
        }
 }
 
 /*
- * pmap_tlb_ipi: pmap shootdown interrupt handler to invalidate TLB entries.
+ * pmap_tlb_intr: pmap shootdown interrupt handler to invalidate TLB entries.
  *
  * => Called from IPI only.
  */
@@ -444,10 +457,16 @@ void
 pmap_tlb_intr(void)
 {
        pmap_tlb_packet_t *tp = &pmap_tlb_packet;
-       pmap_tlb_mailbox_t *tm;
-       struct cpu_info *ci;
-       uint32_t cm;
+       pmap_tlb_mailbox_t *tm = &pmap_tlb_mailbox;
+       struct cpu_info *ci = curcpu();
+       cpuid_t cid = cpu_index(ci);
 
+       if (!kcpuset_isset(tm->tm_pending, cid)) {
+               return;
+       }
+       KASSERT(tm->tm_pendcount > 0);
+
+       /* First, TLB flush. */
        pmap_tlb_invalidate(tp);
 
        /*
@@ -455,16 +474,17 @@ pmap_tlb_intr(void)
         * invalidations for this pmap, then take the CPU out of
         * the pmap's bitmask.
         */
-       ci = curcpu();
-       cm = ci->ci_cpumask;
-       if (ci->ci_tlbstate == TLBSTATE_LAZY && (tp->tp_usermask & cm) != 0) {
+       if (ci->ci_tlbstate == TLBSTATE_LAZY && tp->tp_userpmap) {
                struct pmap *pm = ci->ci_pmap;
 
-               atomic_and_32(&pm->pm_cpus, ~cm);
+               kcpuset_atomic_clear(pm->pm_cpus, cid);
                ci->ci_tlbstate = TLBSTATE_STALE;
        }
 
-       /* Ack the request. */
-       tm = &pmap_tlb_mailbox;
-       atomic_and_32(&tm->tm_pending, ~cm);
+       /*
+        * Ack the request.  Order is important: must remove CPU
+        * from the set first, then decrement pending count.
+        */
+       kcpuset_atomic_clear(tm->tm_pending, cid);
+       atomic_dec_uint(&tm->tm_pendcount);
 }
Index: arch/xen/include/xenpmap.h
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/include/xenpmap.h,v
retrieving revision 1.33
diff -u -p -u -r1.33 xenpmap.h
--- arch/xen/include/xenpmap.h  30 Dec 2011 16:55:21 -0000      1.33
+++ arch/xen/include/xenpmap.h  16 Apr 2012 20:44:25 -0000
@@ -34,6 +34,9 @@
 #include "opt_xen.h"
 #endif
 
+#include <sys/types.h>
+#include <sys/kcpuset.h>
+
 #define        INVALID_P2M_ENTRY       (~0UL)
 
 void xpq_queue_machphys_update(paddr_t, paddr_t);
@@ -46,11 +49,11 @@ void xpq_queue_tlb_flush(void);
 void xpq_queue_pin_table(paddr_t, int);
 void xpq_queue_unpin_table(paddr_t);
 int  xpq_update_foreign(paddr_t, pt_entry_t, int);
-void xen_vcpu_mcast_invlpg(vaddr_t, vaddr_t, uint32_t);
+void xen_vcpu_mcast_invlpg(vaddr_t, vaddr_t, kcpuset_t *);
 void xen_vcpu_bcast_invlpg(vaddr_t, vaddr_t);
-void xen_mcast_tlbflush(uint32_t);
+void xen_mcast_tlbflush(kcpuset_t *);
 void xen_bcast_tlbflush(void);
-void xen_mcast_invlpg(vaddr_t, uint32_t);
+void xen_mcast_invlpg(vaddr_t, kcpuset_t *);
 void xen_bcast_invlpg(vaddr_t);
 
 void pmap_xen_resume(void);
Index: arch/xen/x86/cpu.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/x86/cpu.c,v
retrieving revision 1.90
diff -u -p -u -r1.90 cpu.c
--- arch/xen/x86/cpu.c  11 Mar 2012 16:16:44 -0000      1.90
+++ arch/xen/x86/cpu.c  16 Apr 2012 20:44:25 -0000
@@ -121,10 +121,6 @@ __KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.90
 #include <dev/ic/mc146818reg.h>
 #include <dev/isa/isareg.h>
 
-#if MAXCPUS > 32
-#error cpu_info contains 32bit bitmasks
-#endif
-
 static int     cpu_match(device_t, cfdata_t, void *);
 static void    cpu_attach(device_t, device_t, void *);
 static void    cpu_defer(device_t);
@@ -167,7 +163,6 @@ struct cpu_info cpu_info_primary __align
        .ci_idepth = -1,
        .ci_curlwp = &lwp0,
        .ci_curldt = -1,
-       .ci_cpumask = 1,
 #ifdef TRAPLOG
        .ci_tlog = &tlog_primary,
 #endif
@@ -181,9 +176,6 @@ struct cpu_info phycpu_info_primary __al
 struct cpu_info *cpu_info_list = &cpu_info_primary;
 struct cpu_info *phycpu_info_list = &phycpu_info_primary;
 
-uint32_t cpus_attached = 1;
-uint32_t cpus_running = 1;
-
 uint32_t cpu_feature[5]; /* X86 CPUID feature bits
                          *     [0] basic features %edx
                          *     [1] basic features %ecx
@@ -370,6 +362,7 @@ cpu_vm_init(struct cpu_info *ci)
         */
        aprint_debug_dev(ci->ci_dev, "%d page colors\n", ncolors);
        uvm_page_recolor(ncolors);
+       pmap_tlb_cpu_init(ci);
 }
 
 static void
@@ -436,7 +429,6 @@ cpu_attach_common(device_t parent, devic
        }
 
        KASSERT(ci->ci_cpuid == ci->ci_index);
-       ci->ci_cpumask = (1 << cpu_index(ci));
        pmap_reference(pmap_kernel());
        ci->ci_pmap = pmap_kernel();
        ci->ci_tlbstate = TLBSTATE_STALE;
@@ -515,8 +507,6 @@ cpu_attach_common(device_t parent, devic
                panic("unknown processor type??\n");
        }
 
-       atomic_or_32(&cpus_attached, ci->ci_cpumask);
-
 #ifdef MPVERBOSE
        if (mp_verbose) {
                struct lwp *l = ci->ci_data.cpu_idlelwp;
@@ -565,7 +555,6 @@ cpu_init(struct cpu_info *ci)
        mutex_init(&ci->ci_kpm_mtx, MUTEX_DEFAULT, IPL_VM);
 #endif
 
-       atomic_or_32(&cpus_running, ci->ci_cpumask);
        atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
 }
 
@@ -728,8 +717,8 @@ cpu_hatch(void *v)
 
        cpu_switchto(NULL, ci->ci_data.cpu_idlelwp, true);
 
-       panic("switch to idle_loop context returned!\n");
-       /* NOTREACHED */
+       idle_loop(NULL);
+       KASSERT(false);
 }
 
 #if defined(DDB)
@@ -1118,14 +1107,13 @@ void
 cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap)
 {
        KASSERT(pmap != pmap_kernel());
-       
+
 #if defined(__x86_64__) || defined(PAE)
        struct cpu_info *ci = curcpu();
-       uint32_t cpumask = ci->ci_cpumask;
 
        mutex_enter(&ci->ci_kpm_mtx);
        /* make new pmap visible to pmap_kpm_sync_xcall() */
-       atomic_or_32(&pmap->pm_xen_ptp_cpus, cpumask);
+       kcpuset_set(pmap->pm_xen_ptp_cpus, cpu_index(ci));
 #endif
 #ifdef i386
 #ifdef PAE
@@ -1178,7 +1166,9 @@ cpu_load_pmap(struct pmap *pmap, struct 
 #endif /* __x86_64__ */
 #if defined(__x86_64__) || defined(PAE)
        /* old pmap no longer visible to pmap_kpm_sync_xcall() */
-       atomic_and_32(&oldpmap->pm_xen_ptp_cpus, ~cpumask);
+       if (oldpmap != pmap_kernel())
+               kcpuset_clear(oldpmap->pm_xen_ptp_cpus, cpu_index(ci));
+
        mutex_exit(&ci->ci_kpm_mtx);
 #endif
 }
Index: arch/xen/x86/x86_xpmap.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/x86/x86_xpmap.c,v
retrieving revision 1.42
diff -u -p -u -r1.42 x86_xpmap.c
--- arch/xen/x86/x86_xpmap.c    2 Mar 2012 16:37:38 -0000       1.42
+++ arch/xen/x86/x86_xpmap.c    16 Apr 2012 20:44:25 -0000
@@ -361,11 +361,31 @@ xpq_queue_invlpg(vaddr_t va)
                panic("xpq_queue_invlpg");
 }
 
+#if defined(_LP64) &&  MAXCPUS > 64
+#error "XEN/amd64 uses 64 bit masks"
+#elif !defined(_LP64) && MAXCPUS > 32
+#error "XEN/i386 uses 32 bit masks"
+#else
+/* XXX: Inefficient. */
+static u_long
+xen_kcpuset2bits(kcpuset_t *kc)
+{
+       u_long bits = 0;
+
+       for (cpuid_t i = 0; i < ncpu; i++) {
+               if (kcpuset_isset(kc, i)) {
+                       bits |= 1 << i;
+               }
+       }
+       return bits;
+}
+#endif
+
 void
-xen_mcast_invlpg(vaddr_t va, uint32_t cpumask)
+xen_mcast_invlpg(vaddr_t va, kcpuset_t *kc)
 {
+       u_long xcpumask = xen_kcpuset2bits(kc);
        mmuext_op_t op;
-       u_long xcpumask = cpumask;
 
        /* Flush pending page updates */
        xpq_flush_queue();
@@ -401,10 +421,10 @@ xen_bcast_invlpg(vaddr_t va)
 
 /* This is a synchronous call. */
 void
-xen_mcast_tlbflush(uint32_t cpumask)
+xen_mcast_tlbflush(kcpuset_t *kc)
 {
+       u_long xcpumask = xen_kcpuset2bits(kc);
        mmuext_op_t op;
-       u_long xcpumask = cpumask;
 
        /* Flush pending page updates */
        xpq_flush_queue();
@@ -439,7 +459,7 @@ xen_bcast_tlbflush(void)
 
 /* This is a synchronous call. */
 void
-xen_vcpu_mcast_invlpg(vaddr_t sva, vaddr_t eva, uint32_t cpumask)
+xen_vcpu_mcast_invlpg(vaddr_t sva, vaddr_t eva, kcpuset_t *kc)
 {
        KASSERT(eva > sva);
 
@@ -451,7 +471,7 @@ xen_vcpu_mcast_invlpg(vaddr_t sva, vaddr
        eva &= ~PAGE_MASK;
 
        for ( ; sva <= eva; sva += PAGE_SIZE) {
-               xen_mcast_invlpg(sva, cpumask);
+               xen_mcast_invlpg(sva, kc);
        }
 
        return;
Index: arch/xen/x86/xen_pmap.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/x86/xen_pmap.c,v
retrieving revision 1.20
diff -u -p -u -r1.20 xen_pmap.c
--- arch/xen/x86/xen_pmap.c     11 Mar 2012 17:14:30 -0000      1.20
+++ arch/xen/x86/xen_pmap.c     16 Apr 2012 20:44:25 -0000
@@ -360,7 +360,7 @@ xen_kpm_sync(struct pmap *pmap, int inde
 {
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;
-       
+
        KASSERT(pmap != NULL);
        KASSERT(kpreempt_disabled());
 
@@ -370,19 +370,19 @@ xen_kpm_sync(struct pmap *pmap, int inde
                if (ci == NULL) {
                        continue;
                }
+               cpuid_t cid = cpu_index(ci);
                if (pmap != pmap_kernel() &&
-                   (ci->ci_cpumask & pmap->pm_xen_ptp_cpus) == 0)
+                   !kcpuset_isset(pmap->pm_xen_ptp_cpus, cid))
                        continue;
 
                /* take the lock and check again */
                mutex_enter(&ci->ci_kpm_mtx);
                if (pmap == pmap_kernel() ||
-                   (ci->ci_cpumask & pmap->pm_xen_ptp_cpus) != 0) {
+                   kcpuset_isset(pmap->pm_xen_ptp_cpus, cid)) {
                        pmap_kpm_setpte(ci, pmap, index);
                }
                mutex_exit(&ci->ci_kpm_mtx);
        }
-       return;
 }
 
 #endif /* PAE || __x86_64__ */
Index: kern/subr_kcpuset.c
===================================================================
RCS file: /cvsroot/src/sys/kern/subr_kcpuset.c,v
retrieving revision 1.4
diff -u -p -u -r1.4 subr_kcpuset.c
--- kern/subr_kcpuset.c 29 Jan 2012 19:08:26 -0000      1.4
+++ kern/subr_kcpuset.c 16 Apr 2012 20:44:25 -0000
@@ -169,8 +169,9 @@ kcpuset_early_ptr(kcpuset_t **kcptr)
                 * Save the pointer, return pointer to static early field.
                 * Need to zero it out.
                 */
-               kc_noted_early[kc_last_idx++] = kcptr;
+               kc_noted_early[kc_last_idx] = kcptr;
                kcp = (kcpuset_t *)&kc_bits_early[kc_last_idx];
+               kc_last_idx++;
                memset(kcp, 0, KC_BITSIZE_EARLY);
                KASSERT(kc_bitsize == KC_BITSIZE_EARLY);
        } else {
@@ -208,7 +209,6 @@ kcpuset_create_raw(bool zero)
 void
 kcpuset_create(kcpuset_t **retkcp, bool zero)
 {
-
        if (__predict_false(!kc_initialised)) {
                /* Early boot use - special case. */
                *retkcp = kcpuset_early_ptr(retkcp);
@@ -412,6 +412,15 @@ kcpuset_merge(kcpuset_t *kcp1, kcpuset_t
        }
 }
 
+void
+kcpuset_intersect(kcpuset_t *kcp1, kcpuset_t *kcp2)
+{
+
+       for (size_t j = 0; j < kc_nfields; j++) {
+               kcp1->bits[j] &= kcp2->bits[j];
+       }
+}
+
 int
 kcpuset_countset(kcpuset_t *kcp)
 {
Index: sys/kcpuset.h
===================================================================
RCS file: /cvsroot/src/sys/sys/kcpuset.h,v
retrieving revision 1.4
diff -u -p -u -r1.4 kcpuset.h
--- sys/kcpuset.h       29 Jan 2012 19:08:26 -0000      1.4
+++ sys/kcpuset.h       16 Apr 2012 20:44:26 -0000
@@ -61,6 +61,7 @@ bool          kcpuset_isotherset(kcpuset_t *, cp
 bool           kcpuset_iszero(kcpuset_t *);
 bool           kcpuset_match(const kcpuset_t *, const kcpuset_t *);
 void           kcpuset_merge(kcpuset_t *, kcpuset_t *);
+void           kcpuset_intersect(kcpuset_t *, kcpuset_t *);
 int            kcpuset_countset(kcpuset_t *);
 
 void           kcpuset_atomic_set(kcpuset_t *, cpuid_t);

