
PCID support for amd64 - initial patch.



Hello,

here is a patch implementing initial PCID support for amd64. This feature is available on recent Intel processors, starting with Haswell, and optimises TLB use across address space switches.
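For context on the mechanics: with CR4_PCIDE enabled, the low 12 bits of %cr3 select a process-context identifier and TLB entries are tagged with it, so entries for several address spaces can coexist. Setting bit 63 of the value loaded into %cr3 suppresses the implicit TLB flush. A minimal sketch of what the patched cpu_load_pmap() below boils down to on a PCID-capable CPU:

	/*
	 * Sketch only (see cpu_load_pmap() in the patch): switch page
	 * tables without flushing the TLB; entries tagged with other
	 * PCIDs survive the switch.
	 */
	lcr3(pmap_pdirpa(pmap, 0) | ci->ci_current_ctx | PCID_NO_TLB_FLUSH);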

The patch compiles and seems to work fine in an emulator, but I haven't tested it on real hardware just yet. I should be able to do so in a week or so; by then I should also see whether there is any performance improvement. I'm sharing this now to gather some early feedback.

It only activates when both PCID and INVPCID are supported by the boot processor. I don't think it's worth the effort any more to try to support the early machines that have PCID but lack INVPCID.
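Once the patch is in, whether PCID was actually enabled can be checked at runtime via the new machdep.pcid sysctl it adds; I'd expect "sysctl machdep.pcid" to report 1 on supported hardware and 0 elsewhere.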

I considered using the MI pmap_tlb.c ASID code and evaluated the sparc64 CTX and alpha ASN counterparts. I initially opted for the sparc64 pmap approach for its simplicity and similar features (12-bit context space, versus 13+ bits on sparc64); in the end, however, I actually used pretty much the alpha approach, which uses just a generation number for invalidation on ASID wraparound instead of an explicit pmap list like sparc64. Roughly, the allocation works as sketched below.
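A condensed (and slightly simplified) version of the allocation logic, from ctx_alloc() in the patch — each CPU hands out contexts from a simple counter; on wraparound it flushes non-global TLB entries and bumps a per-CPU generation number, which instantly invalidates every previously handed-out context on that CPU:

	if (pmap_ctx(ci, pm) == 0 ||
	    pmap_ctxgen(ci, pm) != ci->ci_pmap_ctxgen) {
		tlb_asid_t ctx = ci->ci_pmap_next_ctx++;
		if (ctx == pmap_max_pcid) {
			/* Out of contexts: flush non-global TLB entries */
			invpcid(INVPCID_ALLCTX, 0, 0);
			/* New generation invalidates all stale contexts */
			ci->ci_pmap_ctxgen++;
			ctx = 1;
			ci->ci_pmap_next_ctx = 2;
		}
		pmap_ctx(ci, pm) = ctx;
		pmap_ctxgen(ci, pm) = ci->ci_pmap_ctxgen;
	}
	ci->ci_current_ctx = pmap_ctx(ci, pm);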

There is no support for SVS yet, even though SVS was actually the primary driver for this effort. The SVS code will need to be modified not to force TLB flushes on address space switches. That is now the next step for me.

Thoughts welcome. I plan to integrate this into the -current tree within a couple of weeks.

Jaromir
? arch/amd64/compile/QEMU-VIRT
? arch/amd64/conf/BEAST
? arch/amd64/conf/GENERIC.BARE
? arch/amd64/conf/QEMU-VIRT
Index: arch/x86/include/cpu.h
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/include/cpu.h,v
retrieving revision 1.89
diff -u -p -r1.89 cpu.h
--- arch/x86/include/cpu.h	18 Jan 2018 07:25:34 -0000	1.89
+++ arch/x86/include/cpu.h	3 Mar 2018 20:16:16 -0000
@@ -200,6 +200,21 @@ struct cpu_info {
 	vaddr_t		ci_svs_utls;
 #endif
 
+#if defined(__HAVE_PCID)
+	/*
+	 * ASID (or PCID, as Intel calls it) related structures.
+	 *
+	 * A context is simply a small number that differentiates multiple
+	 * mappings of the same address.
+	 *
+	 * There is no lock for allocation, since there is no mutual
+	 * exclusion necessary.
+	 */
+	tlb_asid_t		ci_pmap_next_ctx; /* Next available PCID */
+	tlb_asid_t		ci_current_ctx;	  /* Current PCID */
+	uint32_t		ci_pmap_ctxgen;	/* Current generation number */
+#endif /* __HAVE_PCID */
+
 #if defined(XEN) && (defined(PAE) || defined(__x86_64__))
 	/* Currently active user PGD (can't use rcr3() with Xen) */
 	pd_entry_t *	ci_kpm_pdir;	/* per-cpu PMD (va) */
@@ -342,6 +357,7 @@ void cpu_boot_secondary_processors(void)
 void cpu_init_idle_lwps(void);
 void cpu_init_msrs(struct cpu_info *, bool);
 void cpu_load_pmap(struct pmap *, struct pmap *);
+void cpu_pmap_init(struct cpu_info *);
 void cpu_broadcast_halt(void);
 void cpu_kick(struct cpu_info *);
 
@@ -416,6 +432,11 @@ extern int x86_fpu_save;
 extern unsigned int x86_fpu_save_size;
 extern uint64_t x86_xsave_features;
 
+#ifdef __HAVE_PCID
+#define PCID_NO_TLB_FLUSH	(1ULL << 63) /* No TLB flush on %cr3 change */
+extern bool x86_use_pcid;
+#endif /* __HAVE_PCID */
+
 extern void (*x86_cpu_idle)(void);
 #define	cpu_idle() (*x86_cpu_idle)()
 
@@ -536,8 +557,9 @@ void x86_bus_space_mallocok(void);
 					 */
 #define	CPU_FPU_SAVE_SIZE	16	/* int: FPU Instruction layout size */
 #define	CPU_XSAVE_FEATURES	17	/* quad: XSAVE features */
+#define	CPU_PCID		18	/* int: OS/CPU supports PCID+INVPCID */
 
-#define	CPU_MAXID		18	/* number of valid machdep ids */
+#define	CPU_MAXID		19	/* number of valid machdep ids */
 
 /*
  * Structure for CPU_DISKINFO sysctl call.
Index: arch/x86/include/pmap.h
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/include/pmap.h,v
retrieving revision 1.75
diff -u -p -r1.75 pmap.h
--- arch/x86/include/pmap.h	18 Jan 2018 07:25:34 -0000	1.75
+++ arch/x86/include/pmap.h	3 Mar 2018 20:16:16 -0000
@@ -235,6 +235,31 @@ struct pmap {
 					 ptp mapped */
 	uint64_t pm_ncsw;		/* for assertions */
 	struct vm_page *pm_gc_ptp;	/* pages from pmap g/c */
+
+#ifdef __HAVE_PCID
+	/*
+	 * We record the context used on each CPU here. If the context
+	 * is actually present in the TLB, this holds the plain context
+	 * number. The kernel pmap doesn't have this array allocated,
+	 * so it must never be used for the kernel pmap.
+	 *
+	 * If this pmap has no context allocated on that CPU, the entry
+	 * will be 0.
+	 */
+	struct {
+		tlb_asid_t pc_ctx;	/* Current context per cpu */
+		uint32_t pc_ctxgen;	/* Context generation per cpu */
+	} pm_ctx[];
+	/* Variable length */
+
+/* Compute the size of a pmap structure. */
+#define	PMAP_SIZEOF(x)	(ALIGN(offsetof(struct pmap, pm_ctx[(x)])))
+
+#else /* ! __HAVE_PCID */
+
+#define PMAP_SIZEOF(x)	sizeof(struct pmap)
+
+#endif /* __HAVE_PCID */
 };
 
 /* macro to access pm_pdirpa slots */
@@ -315,6 +340,7 @@ void		pmap_remove_all(struct pmap *);
 void		pmap_ldt_cleanup(struct lwp *);
 void		pmap_ldt_sync(struct pmap *);
 void		pmap_kremove_local(vaddr_t, vsize_t);
+void		pmap_update_pg_shootdown(vaddr_t, struct pmap *);
 
 void		pmap_emap_enter(vaddr_t, paddr_t, vm_prot_t);
 void		pmap_emap_remove(vaddr_t, vsize_t);
@@ -380,32 +406,51 @@ bool	pmap_pageidlezero(paddr_t);
  * inline functions
  */
 
-__inline static bool __unused
-pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
-{
-	return pmap_pdes_invalid(va, pdes, lastpde) == 0;
-}
-
+#ifdef __HAVE_PCID
 /*
- * pmap_update_pg: flush one page from the TLB (or flush the whole thing
- *	if hardware doesn't support one-page flushing)
+ * PCID support functions.
  */
+#define	INVPCID_ADDR	0
+#define	INVPCID_CTX	1
+#define	INVPCID_CTXGLOB	2
+#define	INVPCID_ALLCTX	3
+
+struct invpcid_descr {
+	uint64_t	pcid:12 __packed;
+	uint64_t	pad:52 __packed;
+	uint64_t	addr;
+} __packed;
 
-__inline static void __unused
-pmap_update_pg(vaddr_t va)
+static __inline void __unused
+invpcid(int type, tlb_asid_t asid, vaddr_t addr)
 {
-	invlpg(va);
+	struct invpcid_descr d;
+
+	memset(&d, 0, sizeof(d));
+	d.pcid = asid;
+	d.addr = addr;
+
+	__asm __volatile("invpcid (%0),%1"
+	    : : "r" (&d), "r" ((u_long)type) : "memory");
+}
+#endif /* __HAVE_PCID */
+
+__inline static bool __unused
+pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
+{
+	return pmap_pdes_invalid(va, pdes, lastpde) == 0;
 }
 
 /*
- * pmap_update_2pg: flush two pages from the TLB
+ * pmap_update_pg_current: flush one page from the TLB (or flush the whole
+ *	thing if hardware doesn't support one-page flushing) in the
+ *	currently active pmap
  */
 
 __inline static void __unused
-pmap_update_2pg(vaddr_t va, vaddr_t vb)
+pmap_update_pg_current(vaddr_t va)
 {
 	invlpg(va);
-	invlpg(vb);
 }
 
 /*
Index: arch/x86/x86/cpu.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/cpu.c,v
retrieving revision 1.149
diff -u -p -r1.149 cpu.c
--- arch/x86/x86/cpu.c	22 Feb 2018 13:27:18 -0000	1.149
+++ arch/x86/x86/cpu.c	3 Mar 2018 20:16:16 -0000
@@ -300,10 +300,7 @@ cpu_vm_init(struct cpu_info *ci)
 	aprint_debug_dev(ci->ci_dev, "%d page colors\n", ncolors);
 	uvm_page_recolor(ncolors);
 
-	pmap_tlb_cpu_init(ci);
-#ifndef __HAVE_DIRECT_MAP
-	pmap_vpage_cpu_init(ci);
-#endif
+	cpu_pmap_init(ci);
 }
 
 static void
@@ -619,6 +616,12 @@ cpu_init(struct cpu_info *ci)
 	if (cpu_feature[5] & CPUID_SEF_SMAP)
 		cr4 |= CR4_SMAP;
 
+#ifdef __HAVE_PCID
+	/* If PCID is supported, enable it */
+	if (x86_use_pcid)
+		cr4 |= CR4_PCIDE;
+#endif /* __HAVE_PCID */
+
 	if (cr4) {
 		cr4 |= rcr4();
 		lcr4(cr4);
@@ -900,9 +903,9 @@ cpu_hatch(void *v)
 	for (i = 0 ; i < PDP_SIZE; i++) {
 		l3_pd[i] = pmap_kernel()->pm_pdirpa[i] | PG_V;
 	}
-	lcr3(ci->ci_pae_l3_pdirpa);
+	lcr3(ci->ci_pae_l3_pdirpa);		/* hatch, PCID not yet up */
 #else
-	lcr3(pmap_pdirpa(pmap_kernel(), 0));
+	lcr3(pmap_pdirpa(pmap_kernel(), 0));	/* hatch, PCID not yet up */
 #endif
 
 	pcb = lwp_getpcb(curlwp);
@@ -1299,7 +1302,23 @@ cpu_load_pmap(struct pmap *pmap, struct 
 		x86_enable_intr();
 	tlbflush();
 #else /* PAE */
-	lcr3(pmap_pdirpa(pmap, 0));
+
+#ifdef __HAVE_PCID
+	uintptr_t pcid = 0;
+
+	if (x86_use_pcid) {
+		const struct cpu_info *ci = curcpu();
+		KASSERTMSG(
+		    (ci->ci_current_ctx != 0 && pmap != pmap_kernel())
+		    ||(ci->ci_current_ctx == 0 && pmap == pmap_kernel()),
+		    "pmap %p (kernel %p) ctx %u unexpected",
+		    pmap, pmap_kernel(), ci->ci_current_ctx);
+		pcid = ci->ci_current_ctx | PCID_NO_TLB_FLUSH;
+	}
+#else
+	const uintptr_t pcid = 0;
+#endif
+	lcr3(pmap_pdirpa(pmap, 0) | pcid);
 #endif /* PAE */
 }
 
Index: arch/x86/x86/db_memrw.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/db_memrw.c,v
retrieving revision 1.4
diff -u -p -r1.4 db_memrw.c
--- arch/x86/x86/db_memrw.c	11 Nov 2017 12:51:05 -0000	1.4
+++ arch/x86/x86/db_memrw.c	3 Mar 2018 20:16:16 -0000
@@ -139,7 +139,7 @@ db_write_text(vaddr_t addr, size_t size,
 		 */
 		pmap_pte_clearbits(ppte, PG_KR);
 		pmap_pte_setbits(ppte, PG_KW);
-		pmap_update_pg(addr);
+		pmap_update_pg_current(addr);
 
 		/*
 		 * MULTIPROCESSOR: no shootdown required as the PTE continues to
@@ -158,7 +158,7 @@ db_write_text(vaddr_t addr, size_t size,
 		 */
 		pmap_pte_clearbits(ppte, PG_KW);
 		pmap_pte_setbits(ppte, PG_KR);
-		pmap_update_pg(addr);
+		pmap_update_pg_current(addr);
 
 		/*
 		 * MULTIPROCESSOR: no shootdown required as all other CPUs
Index: arch/x86/x86/identcpu.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/identcpu.c,v
retrieving revision 1.69
diff -u -p -r1.69 identcpu.c
--- arch/x86/x86/identcpu.c	9 Feb 2018 18:45:55 -0000	1.69
+++ arch/x86/x86/identcpu.c	3 Mar 2018 20:16:17 -0000
@@ -779,6 +779,18 @@ cpu_probe_fpu(struct cpu_info *ci)
 #endif
 }
 
+#ifdef __HAVE_PCID
+static void
+cpu_probe_pcid(struct cpu_info *ci)
+{
+	/* If both PCID and INVPCID are supported, enable it */
+	if (ci->ci_feat_val[1] & CPUID2_PCID
+	    && ci->ci_feat_val[5] & CPUID_SEF_INVPCID) {
+		x86_use_pcid = true;
+	}
+}
+#endif /* __HAVE_PCID */
+
 void
 cpu_probe(struct cpu_info *ci)
 {
@@ -895,6 +907,9 @@ cpu_probe(struct cpu_info *ci)
 	cpu_probe_vortex86(ci);
 
 	cpu_probe_fpu(ci);
+#ifdef __HAVE_PCID
+	cpu_probe_pcid(ci);
+#endif
 
 	x86_cpu_topology(ci);
 
Index: arch/x86/x86/mtrr_i686.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/mtrr_i686.c,v
retrieving revision 1.29
diff -u -p -r1.29 mtrr_i686.c
--- arch/x86/x86/mtrr_i686.c	1 Jun 2017 02:45:08 -0000	1.29
+++ arch/x86/x86/mtrr_i686.c	3 Mar 2018 20:16:17 -0000
@@ -170,7 +170,7 @@ i686_mtrr_reload(int synch)
 	 * much. Need to change the prototypes of l/rcr0 too if you
 	 * want to correct it. */
 	uint32_t cr0;
-	vaddr_t cr3, cr4;
+	vaddr_t cr4;
 	uint32_t origcr0;
 	vaddr_t origcr4;
 
@@ -221,8 +221,7 @@ i686_mtrr_reload(int synch)
 	 * to CR3)
 	 */
 
-	cr3 = rcr3();
-	lcr3(cr3);
+	tlbflush();
 
 	/*
 	 * 8. Disable all range registers (by clearing the E flag in
@@ -262,7 +261,7 @@ i686_mtrr_reload(int synch)
 	 */
 
 	wbinvd();
-	lcr3(cr3);
+	tlbflush();
 
 	/*
 	 * 12. Enter the normal cache mode to reenable caching (set the CD and
Index: arch/x86/x86/pmap.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/pmap.c,v
retrieving revision 1.281
diff -u -p -r1.281 pmap.c
--- arch/x86/x86/pmap.c	18 Feb 2018 14:07:29 -0000	1.281
+++ arch/x86/x86/pmap.c	3 Mar 2018 20:16:18 -0000
@@ -188,6 +188,7 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.2
 #include <sys/intr.h>
 #include <sys/xcall.h>
 #include <sys/kcore.h>
+#include <sys/once.h>
 
 #include <uvm/uvm.h>
 #include <uvm/pmap/pmap_pvt.h>
@@ -365,6 +366,26 @@ struct evcnt pmap_ldt_evcnt;
 static bool cpu_pat_enabled __read_mostly = false;
 
 /*
+ * PCID
+ */
+#define PCID_NUM	0x1000		/* Number of supported PCIDs */
+#define PCID_MASK	(PCID_NUM - 1)
+
+#ifdef __HAVE_PCID
+bool x86_use_pcid __read_mostly = false;
+static tlb_asid_t pmap_max_pcid __read_mostly = PCID_NUM;
+
+#define	pmap_ctx(ci, pm)	((pm)->pm_ctx[(ci)->ci_index]).pc_ctx
+#define	pmap_ctxgen(ci, pm)	((pm)->pm_ctx[(ci)->ci_index]).pc_ctxgen
+
+/* Initialize the pmap's pm_ctx. It's variable size, depending on ncpu */
+#define PMAP_CTX_INIT(pm)	\
+	memset(pm->pm_ctx, 0, ncpu * sizeof(pm->pm_ctx[0]))
+
+static void ctx_alloc(struct pmap *, struct cpu_info *);
+#endif /* __HAVE_PCID */
+
+/*
  * Global data structures
  */
 
@@ -470,6 +491,7 @@ pvhash_remove(struct pv_hash_head *hh, s
 static pt_entry_t protection_codes[8] __read_mostly;
 
 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
+static ONCE_DECL(pmap_cache_initialized);
 
 /*
  * The following two vaddr_t's are used during system startup to keep track of
@@ -581,7 +603,7 @@ static void pmap_remove_ptes(struct pmap
 static paddr_t pmap_get_physpage(void);
 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
 
-static bool pmap_reactivate(struct pmap *);
+static void pmap_reactivate(struct pmap *);
 
 /*
  * p m a p   h e l p e r   f u n c t i o n s
@@ -757,11 +779,7 @@ pmap_map_ptes(struct pmap *pmap, struct 
 		 * often the case during exit(), when we have switched
 		 * to the kernel pmap in order to destroy a user pmap.
 		 */
-		if (!pmap_reactivate(pmap)) {
-			u_int gen = uvm_emap_gen_return();
-			tlbflush();
-			uvm_emap_update(gen);
-		}
+		pmap_reactivate(pmap);
 	} else {
 		/*
 		 * Toss current pmap from CPU, but keep a reference to it.
@@ -776,6 +794,10 @@ pmap_map_ptes(struct pmap *pmap, struct 
 		ci->ci_tlbstate = TLBSTATE_VALID;
 		kcpuset_atomic_set(pmap->pm_cpus, cid);
 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
+#ifdef __HAVE_PCID
+		if (x86_use_pcid)
+			ctx_alloc(pmap, ci);
+#endif
 		cpu_load_pmap(pmap, curpmap);
 	}
 	pmap->pm_ncsw = l->l_ncsw;
@@ -856,7 +878,7 @@ pmap_exec_account(struct pmap *pm, vaddr
 		return;
 
 	if ((opte ^ npte) & PG_X)
-		pmap_update_pg(va);
+		pmap_update_pg_current(va);
 
 	/*
 	 * Executability was removed on the last executable change.
@@ -1284,6 +1306,10 @@ pmap_bootstrap(vaddr_t kva_start)
 	kpm->pm_ldt_len = 0;
 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
 
+#ifdef __HAVE_PCID
+	PMAP_CTX_INIT(kpm);
+#endif /* __HAVE_PCID */
+
 	/*
 	 * the above is just a rough estimate and not critical to the proper
 	 * operation of the system.
@@ -1492,7 +1518,7 @@ pmap_init_pcpu(void)
 
 		pa = pmap_bootstrap_palloc(1);
 		*pte = (pa & PG_FRAME) | pteflags;
-		pmap_update_pg(tmpva);
+		pmap_update_pg_current(tmpva);
 		memset((void *)tmpva, 0, PAGE_SIZE);
 
 		L4_BASE[L4e_idx+i] = pa | pteflags | PG_U;
@@ -1506,7 +1532,7 @@ pmap_init_pcpu(void)
 
 		pa = pmap_bootstrap_palloc(1);
 		*pte = (pa & PG_FRAME) | pteflags;
-		pmap_update_pg(tmpva);
+		pmap_update_pg_current(tmpva);
 		memset((void *)tmpva, 0, PAGE_SIZE);
 
 		L3_BASE[L3e_idx+i] = pa | pteflags | PG_U;
@@ -1521,7 +1547,7 @@ pmap_init_pcpu(void)
 
 		pa = pmap_bootstrap_palloc(1);
 		*pte = (pa & PG_FRAME) | pteflags;
-		pmap_update_pg(tmpva);
+		pmap_update_pg_current(tmpva);
 		memset((void *)tmpva, 0, PAGE_SIZE);
 
 		L2_BASE[L2e_idx+i] = pa | pteflags | PG_U;
@@ -1539,7 +1565,7 @@ pmap_init_pcpu(void)
 	}
 
 	*pte = 0;
-	pmap_update_pg(tmpva);
+	pmap_update_pg_current(tmpva);
 
 	pcpuarea = (struct pcpu_area *)startva;
 
@@ -1606,7 +1632,7 @@ pmap_init_directmap(struct pmap *kpm)
 
 		pa = pmap_bootstrap_palloc(1);
 		*pte = (pa & PG_FRAME) | pteflags;
-		pmap_update_pg(tmpva);
+		pmap_update_pg_current(tmpva);
 		memset((void *)tmpva, 0, PAGE_SIZE);
 
 		L4_BASE[L4e_idx+i] = pa | pteflags | PG_U;
@@ -1620,7 +1646,7 @@ pmap_init_directmap(struct pmap *kpm)
 
 		pa = pmap_bootstrap_palloc(1);
 		*pte = (pa & PG_FRAME) | pteflags;
-		pmap_update_pg(tmpva);
+		pmap_update_pg_current(tmpva);
 		memset((void *)tmpva, 0, PAGE_SIZE);
 
 		L3_BASE[L3e_idx+i] = pa | pteflags | PG_U;
@@ -1644,7 +1670,7 @@ pmap_init_directmap(struct pmap *kpm)
 	}
 
 	*pte = 0;
-	pmap_update_pg(tmpva);
+	pmap_update_pg_current(tmpva);
 
 	pmap_direct_base = startva;
 	pmap_direct_end = endva;
@@ -1773,6 +1799,40 @@ pmap_remap_largepages(void)
 #endif /* !XEN */
 
 /*
+ * Initialize the per CPU parts for the cpu running this code.
+ */
+void
+cpu_pmap_init(struct cpu_info *ci)
+{
+#ifdef __HAVE_PCID
+	ci->ci_pmap_next_ctx = 1;
+	ci->ci_pmap_ctxgen = 1;
+	ci->ci_current_ctx = 0;
+
+#ifdef SVS
+#error "Adjust user<->kernel address space switch to take advantage of PCID"
+#endif
+
+#endif /* __HAVE_PCID */
+
+	pmap_tlb_cpu_init(ci);
+#ifndef __HAVE_DIRECT_MAP
+	pmap_vpage_cpu_init(ci);
+#endif
+}
+
+static int
+pmap_cache_init(void)
+{
+	KASSERTMSG(!cold && ncpu > 0, "unexpected cold %d ncpu %d", cold, ncpu);
+
+	pool_cache_bootstrap(&pmap_cache, PMAP_SIZEOF(ncpu), 0, 0, 0,
+	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
+
+	return 0;
+}
+
+/*
  * pmap_init: called from uvm_init, our job is to get the pmap
  * system ready to manage mappings...
  */
@@ -1793,8 +1853,7 @@ pmap_init(void)
 	 * initialize caches.
 	 */
 
-	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
-	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
+	/* pmap_cache initialized on first pmap_create() call */
 
 #ifdef XEN
 	/*
@@ -1820,7 +1879,7 @@ pmap_init(void)
 	pmap_tlb_init();
 
 	/* XXX: Since cpu_hatch() is only for secondary CPUs. */
-	pmap_tlb_cpu_init(curcpu());
+	cpu_pmap_init(curcpu());
 
 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
 	    NULL, "x86", "io bitmap copy");
@@ -2463,6 +2522,8 @@ pmap_create(void)
 	struct pmap *pmap;
 	int i;
 
+	RUN_ONCE(&pmap_cache_initialized, pmap_cache_init);
+
 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
 
 	/* init uvm_object */
@@ -2491,6 +2552,10 @@ pmap_create(void)
 	pmap->pm_ldt_len = 0;
 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
 
+#ifdef __HAVE_PCID
+	PMAP_CTX_INIT(pmap);
+#endif
+
 	/* allocate PDP */
  try_again:
 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
@@ -2548,6 +2613,36 @@ pmap_check_ptps(struct pmap *pmap)
 	}
 }
 
+static inline void
+pmap_check_inuse(struct pmap *pmap)
+{
+#ifdef DIAGNOSTIC
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		if (ci->ci_pmap == pmap)
+			panic("destroying pmap being used");
+#if defined(XEN) && defined(__x86_64__)
+		for (int i = 0; i < PDIR_SLOT_PTE; i++) {
+			if (pmap->pm_pdir[i] != 0 &&
+			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
+				printf("pmap_destroy(%p) pmap_kernel %p "
+				    "curcpu %d cpu %d ci_pmap %p "
+				    "ci->ci_kpm_pdir[%d]=%" PRIx64
+				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
+				    pmap, pmap_kernel(), curcpu()->ci_index,
+				    ci->ci_index, ci->ci_pmap,
+				    i, ci->ci_kpm_pdir[i],
+				    i, pmap->pm_pdir[i]);
+				panic("%s: used pmap", __func__);
+			}
+		}
+#endif
+	}
+#endif /* DIAGNOSTIC */
+}
+
 /*
  * pmap_destroy: drop reference count on pmap.   free pmap if
  *	reference count goes to zero.
@@ -2586,31 +2681,7 @@ pmap_destroy(struct pmap *pmap)
 		return;
 	}
 
-#ifdef DIAGNOSTIC
-	CPU_INFO_ITERATOR cii;
-	struct cpu_info *ci;
-
-	for (CPU_INFO_FOREACH(cii, ci)) {
-		if (ci->ci_pmap == pmap)
-			panic("destroying pmap being used");
-#if defined(XEN) && defined(__x86_64__)
-		for (i = 0; i < PDIR_SLOT_PTE; i++) {
-			if (pmap->pm_pdir[i] != 0 &&
-			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
-				printf("pmap_destroy(%p) pmap_kernel %p "
-				    "curcpu %d cpu %d ci_pmap %p "
-				    "ci->ci_kpm_pdir[%d]=%" PRIx64
-				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
-				    pmap, pmap_kernel(), curcpu()->ci_index,
-				    ci->ci_index, ci->ci_pmap,
-				    i, ci->ci_kpm_pdir[i],
-				    i, pmap->pm_pdir[i]);
-				panic("%s: used pmap", __func__);
-			}
-		}
-#endif
-	}
-#endif /* DIAGNOSTIC */
+	pmap_check_inuse(pmap);
 
 	/*
 	 * Reference count is zero, free pmap resources and then free pmap.
@@ -2658,6 +2729,7 @@ pmap_destroy(struct pmap *pmap)
 #endif
 
 	pmap_check_ptps(pmap);
+
 	pool_cache_put(&pmap_cache, pmap);
 }
 
@@ -2853,45 +2925,54 @@ pmap_activate(struct lwp *l)
 
 	ci = curcpu();
 
-	if (l == ci->ci_curlwp) {
-		KASSERT(ci->ci_want_pmapload == 0);
-		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
+	if (l != ci->ci_curlwp)
+		return;
 
-		/*
-		 * no need to switch to kernel vmspace because
-		 * it's a subset of any vmspace.
-		 */
+	KASSERT(ci->ci_want_pmapload == 0);
+	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
 
-		if (pmap == pmap_kernel()) {
-			ci->ci_want_pmapload = 0;
-			return;
-		}
+	/*
+	 * no need to switch to kernel vmspace because
+	 * it's a subset of any vmspace.
+	 */
 
-		ci->ci_want_pmapload = 1;
+	if (pmap == pmap_kernel()) {
+		ci->ci_want_pmapload = 0;
+#ifdef __HAVE_PCID
+		ci->ci_current_ctx = 0;
+#endif
+		return;
 	}
+
+	ci->ci_want_pmapload = 1;
 }
 
+#if defined(XEN) && defined(__x86_64__)
+#define	KASSERT_PDIRPA(pmap) \
+	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
+	    pmap == pmap_kernel())
+#elif defined(PAE)
+#define	KASSERT_PDIRPA(pmap) \
+	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
+#elif !defined(XEN)
+#define	KASSERT_PDIRPA(pmap) \
+	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3() & ~PCID_MASK))
+#endif
+
 /*
  * pmap_reactivate: try to regain reference to the pmap.
  *
  * => Must be called with kernel preemption disabled.
  */
 
-static bool
+static void
 pmap_reactivate(struct pmap *pmap)
 {
 	struct cpu_info * const ci = curcpu();
 	const cpuid_t cid = cpu_index(ci);
-	bool result;
 
 	KASSERT(kpreempt_disabled());
-#if defined(XEN) && defined(__x86_64__)
-	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
-#elif defined(PAE)
-	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
-#elif !defined(XEN)
-	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
-#endif
+	KASSERT_PDIRPA(pmap);
 
 	/*
 	 * If we still have a lazy reference to this pmap, we can assume
@@ -2908,13 +2989,17 @@ pmap_reactivate(struct pmap *pmap)
 
 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
 		/* We have the reference, state is valid. */
-		result = true;
 	} else {
-		/* Must reload the TLB. */
+		/*
+		 * Must reload the TLB: the pmap may have been changed
+		 * while it was deactivated.
+		 */
 		kcpuset_atomic_set(pmap->pm_cpus, cid);
-		result = false;
+
+		u_int gen = uvm_emap_gen_return();
+		tlbflush();
+		uvm_emap_update(gen);
 	}
-	return result;
 }
 
 /*
@@ -2964,18 +3049,7 @@ pmap_load(void)
 	pcb = lwp_getpcb(l);
 
 	if (pmap == oldpmap) {
-		if (!pmap_reactivate(pmap)) {
-			u_int gen = uvm_emap_gen_return();
-
-			/*
-			 * pmap has been changed during deactivated.
-			 * our tlb may be stale.
-			 */
-
-			tlbflush();
-			uvm_emap_update(gen);
-		}
-
+		pmap_reactivate(pmap);
 		ci->ci_want_pmapload = 0;
 		kpreempt_enable();
 		return;
@@ -2991,14 +3065,7 @@ pmap_load(void)
 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
 
-#if defined(XEN) && defined(__x86_64__)
-	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
-	    oldpmap == pmap_kernel());
-#elif defined(PAE)
-	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
-#elif !defined(XEN)
-	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
-#endif
+	KASSERT_PDIRPA(oldpmap);
 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
 
@@ -3029,6 +3096,11 @@ pmap_load(void)
 #endif /* !XEN */
 #endif /* i386 */
 
+#ifdef __HAVE_PCID
+	if (x86_use_pcid)
+		ctx_alloc(pmap, ci);
+#endif
+
 	lldt(pmap->pm_ldt_sel);
 
 	u_int gen = uvm_emap_gen_return();
@@ -3103,13 +3175,7 @@ pmap_deactivate(struct lwp *l)
 		return;
 	}
 
-#if defined(XEN) && defined(__x86_64__)
-	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
-#elif defined(PAE)
-	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
-#elif !defined(XEN)
-	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
-#endif
+	KASSERT_PDIRPA(pmap);
 	KASSERT(ci->ci_pmap == pmap);
 
 	/*
@@ -3295,7 +3361,7 @@ pmap_zero_page(paddr_t pa)
 
 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
 	pmap_pte_flush();
-	pmap_update_pg(zerova);		/* flush TLB */
+	pmap_update_pg_current(zerova);		/* flush TLB */
 
 	memset((void *)zerova, 0, PAGE_SIZE);
 
@@ -3337,7 +3403,7 @@ pmap_pageidlezero(paddr_t pa)
 
 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
 	pmap_pte_flush();
-	pmap_update_pg(zerova);		/* flush TLB */
+	pmap_update_pg_current(zerova);		/* flush TLB */
 
 	rv = sse2_idlezero_page((void *)zerova);
 
@@ -3388,7 +3454,8 @@ pmap_copy_page(paddr_t srcpa, paddr_t ds
 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PG_M);
 	pmap_pte_flush();
-	pmap_update_2pg(srcva, dstva);
+	pmap_update_pg_current(srcva);
+	pmap_update_pg_current(dstva);
 
 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
 
@@ -3403,7 +3470,7 @@ pmap_copy_page(paddr_t srcpa, paddr_t ds
 }
 
 static pt_entry_t *
-pmap_map_ptp(struct vm_page *ptp)
+pmap_map_ptp(struct vm_page *ptp, struct pmap *pmap)
 {
 #ifdef __HAVE_DIRECT_MAP
 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
@@ -3427,7 +3494,7 @@ pmap_map_ptp(struct vm_page *ptp)
 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
 
 	pmap_pte_flush();
-	pmap_update_pg(ptpva);
+	pmap_update_pg_current(ptpva);
 
 	return (pt_entry_t *)ptpva;
 #endif
@@ -3463,7 +3530,7 @@ pmap_map_pte(struct pmap *pmap, struct v
 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
 	}
 	KASSERT(ptp != NULL);
-	return pmap_map_ptp(ptp) + pl1_pi(va);
+	return pmap_map_ptp(ptp, pmap) + pl1_pi(va);
 }
 
 static void
@@ -4493,7 +4560,7 @@ pmap_get_physpage(void)
 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V |
 		    PG_RW | pmap_pg_nx);
 		pmap_pte_flush();
-		pmap_update_pg((vaddr_t)early_zerop);
+		pmap_update_pg_current((vaddr_t)early_zerop);
 		memset(early_zerop, 0, PAGE_SIZE);
 #if defined(DIAGNOSTIC) || defined(XEN)
 		pmap_pte_set(early_zero_pte, 0);
@@ -4908,3 +4975,100 @@ x86_mmap_flags(paddr_t mdpgno)
 
 	return pflag;
 }
+
+/*
+ * pmap_update_pg_shootdown: flush one page from the TLB (or flush the whole
+ *	thing if hardware doesn't support one-page flushing), executed
+ *	from the TLB shootdown callback.
+ */
+void
+pmap_update_pg_shootdown(vaddr_t va, struct pmap *pmap)
+{
+	KASSERT(pmap != NULL);
+
+#ifdef __HAVE_PCID
+	/*
+	 * INVLPG invalidates TLB entries for a VA only in the current
+	 * PCID. We need INVPCID to shoot down entries for other PCIDs.
+	 */
+	if (x86_use_pcid) {
+		if (pmap == pmap_kernel()) {
+			/*
+			 * If we are updating the kernel pmap, we need to
+			 * flush the mappings for all PCIDs. INVPCID can't
+			 * flush a single address across all PCIDs, so we
+			 * just flush all TLB mappings, including globals.
+			 */
+			invpcid(INVPCID_CTXGLOB, 0, 0);
+		} else {
+			KASSERT(pmap != pmap_kernel());
+			invpcid(INVPCID_ADDR, pmap_ctx(curcpu(), pmap), va);
+		}
+		return;
+	}
+#endif /* __HAVE_PCID */
+
+	pmap_update_pg_current(va);
+}
+
+#ifdef __HAVE_PCID
+
+/*
+ * Allocate a hardware context for the given pmap on the given CPU.
+ */
+static void
+ctx_alloc(struct pmap *pm, struct cpu_info *ci)
+{
+	KASSERT(pm != pmap_kernel());
+	KASSERT(ci->ci_pmap_ctxgen > 0);
+
+	if (pmap_ctx(ci, pm) != 0
+	    && pmap_ctxgen(ci, pm) == ci->ci_pmap_ctxgen) {
+		/* Already has context */
+		goto out;
+	}
+
+	tlb_asid_t ctx = ci->ci_pmap_next_ctx++;
+
+	/*
+	 * if we have run out of contexts, remove all user entries from
+	 * the TLB and start over with context 1 again.
+	 */
+
+	if (__predict_false(ctx == pmap_max_pcid)) {
+		/* Flush TLB for all PCIDs and all VAs, excluding globals */
+		invpcid(INVPCID_ALLCTX, 0, 0);
+
+		ci->ci_pmap_ctxgen++;
+
+#ifdef DIAGNOSTIC
+		if (ci->ci_pmap_ctxgen == 0) {
+			/*
+			 * The generation number has wrapped.  We could
+			 * handle this scenario by traversing all of
+			 * the pmaps, and invalidating the generation
+			 * number on those which are not currently
+			 * in use by this processor.
+			 *
+			 * However... considering that we're using
+			 * an unsigned 32-bit integer for generation
+			 * numbers, with a 12-bit PCID space we wrap in
+			 * around 550 years (half that with SVS) if 1000
+			 * new processes run on the processor every second.
+			 *
+			 * So, we don't bother.
+			 */
+			panic("%s: too much uptime", __func__);
+		}
+#endif
+
+		ctx = 1;
+		ci->ci_pmap_next_ctx = 2;
+	}
+	pmap_ctx(ci, pm) = ctx;
+	pmap_ctxgen(ci, pm) = ci->ci_pmap_ctxgen;
+
+out:
+	ci->ci_current_ctx = pmap_ctx(ci, pm);
+}
+#endif /* __HAVE_PCID */
Index: arch/x86/x86/x86_machdep.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/x86_machdep.c,v
retrieving revision 1.108
diff -u -p -r1.108 x86_machdep.c
--- arch/x86/x86/x86_machdep.c	23 Feb 2018 09:57:20 -0000	1.108
+++ arch/x86/x86/x86_machdep.c	3 Mar 2018 20:16:18 -0000
@@ -1276,6 +1276,10 @@ SYSCTL_SETUP(sysctl_machdep_setup, "sysc
 	    CPU_FPU_PRESENT);
 	const_sysctl(clog, "osfxsr", CTLTYPE_INT, i386_use_fxsave,
 	    CPU_OSFXSR);
+#ifdef __HAVE_PCID
+	const_sysctl(clog, "pcid", CTLTYPE_INT, x86_use_pcid,
+	    CPU_PCID);
+#endif /* __HAVE_PCID */
 	const_sysctl(clog, "sse", CTLTYPE_INT, i386_has_sse,
 	    CPU_SSE);
 	const_sysctl(clog, "sse2", CTLTYPE_INT, i386_has_sse2,
Index: arch/x86/x86/x86_tlb.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/x86_tlb.c,v
retrieving revision 1.1
diff -u -p -r1.1 x86_tlb.c
--- arch/x86/x86/x86_tlb.c	22 Jan 2018 19:37:45 -0000	1.1
+++ arch/x86/x86/x86_tlb.c	3 Mar 2018 20:16:18 -0000
@@ -61,16 +61,21 @@ __KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 
 /*
  * TLB shootdown structures.
  */
+struct pmap_tlb_ctx_t {
+	uintptr_t tc_va;
+	struct pmap *tc_pmap;
+};
 
 typedef struct {
 #ifdef _LP64
-	uintptr_t		tp_va[14];	/* whole struct: 128 bytes */
+	struct pmap_tlb_ctx_t	tp_ctx[7];	/* whole struct: 128 bytes */
+	uint16_t		_tp_pad;
 #else
-	uintptr_t		tp_va[13];	/* whole struct: 64 bytes */
+	struct pmap_tlb_ctx_t	tp_ctx[7];	/* whole struct: 64 bytes */
 #endif
-	uint16_t		tp_count;
+	uint8_t			tp_count;	/* <= TP_MAXVA or 0xff */
+	uint8_t			tp_userpmap;
 	uint16_t		tp_pte;
-	int			tp_userpmap;
 	kcpuset_t *		tp_cpumask;
 } pmap_tlb_packet_t;
 
@@ -198,20 +203,32 @@ pmap_tlb_invalidate(const pmap_tlb_packe
 	int i;
 
 	/* Find out what we need to invalidate. */
-	if (tp->tp_count == (uint16_t)-1) {
+	if (tp->tp_count == (uint8_t)-1) {
 		u_int egen = uvm_emap_gen_return();
 		if (tp->tp_pte & PG_G) {
-			/* Invalidating user and kernel TLB entries. */
-			tlbflushg();
+			/* Invalidating all user and kernel TLB entries. */
+#ifdef __HAVE_PCID
+			if (x86_use_pcid)
+				invpcid(INVPCID_CTXGLOB, 0, 0);
+			else
+#endif /* __HAVE_PCID */
+				tlbflushg();
 		} else {
-			/* Invalidating user TLB entries only. */
-			tlbflush();
+			/* Invalidating all user TLB entries only. */
+#ifdef __HAVE_PCID
+			if (x86_use_pcid)
+				invpcid(INVPCID_ALLCTX, 0, 0);
+			else
+#endif /* __HAVE_PCID */
+				tlbflush();
 		}
 		uvm_emap_update(egen);
 	} else {
 		/* Invalidating a single page or a range of pages. */
 		for (i = tp->tp_count - 1; i >= 0; i--) {
-			pmap_update_pg(tp->tp_va[i]);
+			KASSERT(tp->tp_ctx[i].tc_pmap != NULL);
+			pmap_update_pg_shootdown(tp->tp_ctx[i].tc_va,
+			    tp->tp_ctx[i].tc_pmap);
 		}
 	}
 }
@@ -249,19 +266,22 @@ pmap_tlb_shootdown(struct pmap *pm, vadd
 
 	/* Whole address flush will be needed if PG_G is set. */
 	CTASSERT(PG_G == (uint16_t)PG_G);
+	CTASSERT(TP_MAXVA < __arraycount(tp->tp_ctx));
 	tp->tp_pte |= (uint16_t)pte;
 
-	if (tp->tp_count == (uint16_t)-1) {
+	if (tp->tp_count == (uint8_t)-1) {
 		/*
 		 * Already flushing everything.
 		 */
 	} else if (tp->tp_count < TP_MAXVA && va != (vaddr_t)-1LL) {
 		/* Flush a single page. */
-		tp->tp_va[tp->tp_count++] = va;
+		tp->tp_ctx[tp->tp_count].tc_va = va;
+		tp->tp_ctx[tp->tp_count].tc_pmap = pm;
+		tp->tp_count++;
 		KASSERT(tp->tp_count > 0);
 	} else {
 		/* Flush everything. */
-		tp->tp_count = (uint16_t)-1;
+		tp->tp_count = (uint8_t)-1;
 	}
 
 	if (pm != pmap_kernel()) {
@@ -284,10 +304,10 @@ static inline void
 pmap_tlb_processpacket(pmap_tlb_packet_t *tp, kcpuset_t *target)
 {
 
-	if (tp->tp_count != (uint16_t)-1) {
+	if (tp->tp_count != (uint8_t)-1) {
 		/* Invalidating a single page or a range of pages. */
 		for (int i = tp->tp_count - 1; i >= 0; i--) {
-			xen_mcast_invlpg(tp->tp_va[i], target);
+			xen_mcast_invlpg(tp->tp_ctx[i].tc_va, target);
 		}
 	} else {
 		xen_mcast_tlbflush(target);
@@ -420,7 +440,7 @@ pmap_tlb_shootnow(void)
 	 * Clear out our local buffer.
 	 */
 #ifdef TLBSTATS
-	if (tp->tp_count != (uint16_t)-1) {
+	if (tp->tp_count != (uint8_t)-1) {
 		atomic_add_64(&tlbstat_single_issue.ev_count, tp->tp_count);
 	}
 #endif
Index: arch/amd64/include/types.h
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/include/types.h,v
retrieving revision 1.54
diff -u -p -r1.54 types.h
--- arch/amd64/include/types.h	11 Jan 2018 09:00:04 -0000	1.54
+++ arch/amd64/include/types.h	3 Mar 2018 20:16:18 -0000
@@ -107,12 +107,17 @@ typedef	unsigned char		__cpu_simple_lock
 #define	__HAVE_MM_MD_DIRECT_MAPPED_IO
 #define	__HAVE_MM_MD_DIRECT_MAPPED_PHYS
 /* #define	__HAVE_CPU_UAREA_ROUTINES */
+#define	__HAVE_PCID
 #if !defined(NO_PCI_MSI_MSIX)
 #define	__HAVE_PCI_MSI_MSIX
 #endif
 #endif
 #endif
 
+#ifdef __HAVE_PCID
+typedef unsigned short	tlb_asid_t;
+#endif /* __HAVE_PCID */
+
 #else	/*	!__x86_64__	*/
 
 #include <i386/types.h>

