Source-Changes-HG archive


[src/trunk]: src/sys/uvm - Start trying to reduce the high cache miss rate ob...



details:   https://anonhg.NetBSD.org/src/rev/9654f720a3cd
branches:  trunk
changeset: 466767:9654f720a3cd
user:      ad <ad%NetBSD.org@localhost>
date:      Wed Jan 01 22:01:13 2020 +0000

description:
- Start trying to reduce the high cache miss rate observed around vm_amap.
  On _LP64, pad struct vm_amap to 128 bytes and use the additional space to
  hold the arrays for tiny amaps, which are common.  Carefully size the array
  allocations to avoid false sharing, and for smaller amaps try to share
  allocated cache lines.
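
  A minimal userland sketch of the layout idea (not the actual uvm_amap.h
  definitions; CACHE_LINE, TINY_SLOTS, struct tiny_amap and both functions
  below are illustrative assumptions): pad the structure to a cache-line
  multiple and let tiny arrays live in the tail space, falling back to a
  single heap allocation rounded to a cache line for bigger amaps.

#include <stdlib.h>

#define CACHE_LINE  64          /* assumed coherency unit */
#define TINY_SLOTS  4           /* slot count that fits in the padding */

struct tiny_amap {
        int     *slots;         /* points into pad[] or at heap memory */
        int     *bckptr;        /* always slots + maxslot */
        int      nslot;
        int      maxslot;
        int      pad[TINY_SLOTS * 2];   /* tail space for tiny arrays */
} __attribute__((aligned(CACHE_LINE)));

static int
tiny_amap_init(struct tiny_amap *am, int nslot)
{

        if (nslot <= TINY_SLOTS) {
                /* Tiny amap: reuse the padding, no extra allocation. */
                am->maxslot = TINY_SLOTS;
                am->slots = am->pad;
        } else {
                /* Bigger amap: one allocation for both arrays, rounded
                   to a cache line so neighbours do not share lines. */
                size_t sz = (size_t)nslot * sizeof(int) * 2;
                sz = (sz + CACHE_LINE - 1) & ~(size_t)(CACHE_LINE - 1);
                am->slots = malloc(sz);
                if (am->slots == NULL)
                        return -1;
                am->maxslot = (int)(sz / (sizeof(int) * 2));
        }
        am->bckptr = am->slots + am->maxslot;
        am->nslot = nslot;
        return 0;
}

static void
tiny_amap_fini(struct tiny_amap *am)
{

        /* Only the non-tiny case allocated from the heap. */
        if (am->slots != am->pad)
                free(am->slots);
}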

- Eliminate most contention due to amap_list: maintain the list in the pool
  cache constructor / destructor like we do for struct file.  Cache the
  mutexes we allocate here.
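
  The list change can be pictured with a rough userland sketch, using
  pthreads and <sys/queue.h> in place of the kernel's mutex_obj_* and
  pool_cache machinery (struct obj, obj_ctor and obj_dtor are made-up
  names): the object goes onto the global list once, when the cache
  constructs it, and comes off only when the cache finally destroys it,
  so the hot allocate/free path never takes the list lock, and the
  per-object mutex survives reuse the way am_lock now does.

#include <pthread.h>
#include <stdlib.h>
#include <sys/queue.h>

static pthread_mutex_t obj_list_lock = PTHREAD_MUTEX_INITIALIZER;
static LIST_HEAD(, obj) obj_list = LIST_HEAD_INITIALIZER(obj_list);

struct obj {
        LIST_ENTRY(obj)  o_list;
        pthread_mutex_t *o_lock;        /* cached across reuse */
};

/* Runs once, when the cache constructs a brand new object. */
static int
obj_ctor(struct obj *o)
{

        o->o_lock = malloc(sizeof(*o->o_lock));
        if (o->o_lock == NULL)
                return -1;
        pthread_mutex_init(o->o_lock, NULL);

        pthread_mutex_lock(&obj_list_lock);
        LIST_INSERT_HEAD(&obj_list, o, o_list);
        pthread_mutex_unlock(&obj_list_lock);
        return 0;
}

/* Runs only when the cache finally releases the object's memory. */
static void
obj_dtor(struct obj *o)
{

        pthread_mutex_lock(&obj_list_lock);
        LIST_REMOVE(o, o_list);
        pthread_mutex_unlock(&obj_list_lock);

        pthread_mutex_destroy(o->o_lock);
        free(o->o_lock);
}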

- Don't do PR_WAITOK mutex allocations when NOWAIT has been specified.
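
  In the diff this means picking mutex_obj_tryalloc() over
  mutex_obj_alloc() when the constructor runs under PR_NOWAIT.  A toy
  userland model of the two allocation semantics (lock_obj_alloc() is a
  hypothetical name, and sleep() merely stands in for blocking until
  memory is freed):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <unistd.h>

static pthread_mutex_t *
lock_obj_alloc(bool nowait)
{
        pthread_mutex_t *l;

        /* A sleeping (KM_SLEEP-style) allocation retries until memory
           turns up; a no-wait caller gets NULL on the first failure. */
        while ((l = malloc(sizeof(*l))) == NULL) {
                if (nowait)
                        return NULL;
                sleep(1);       /* stand-in for waiting on free memory */
        }
        pthread_mutex_init(l, NULL);
        return l;
}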

diffstat:

 sys/uvm/uvm_amap.c |  261 +++++++++++++++++++++++++++++++++-------------------
 sys/uvm/uvm_amap.h |   24 ++++-
 2 files changed, 190 insertions(+), 95 deletions(-)

diffs (truncated from 488 to 300 lines):

diff -r 84119e6675ce -r 9654f720a3cd sys/uvm/uvm_amap.c
--- a/sys/uvm/uvm_amap.c        Wed Jan 01 21:34:39 2020 +0000
+++ b/sys/uvm/uvm_amap.c        Wed Jan 01 22:01:13 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: uvm_amap.c,v 1.112 2020/01/01 13:11:51 ad Exp $        */
+/*     $NetBSD: uvm_amap.c,v 1.113 2020/01/01 22:01:13 ad Exp $        */
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_amap.c,v 1.112 2020/01/01 13:11:51 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_amap.c,v 1.113 2020/01/01 22:01:13 ad Exp $");
 
 #include "opt_uvmhist.h"
 
@@ -63,29 +63,18 @@
  * local functions
  */
 
-static inline void
-amap_list_insert(struct vm_amap *amap)
-{
-
-       mutex_enter(&amap_list_lock);
-       LIST_INSERT_HEAD(&amap_list, amap, am_list);
-       mutex_exit(&amap_list_lock);
-}
-
-static inline void
-amap_list_remove(struct vm_amap *amap)
-{
-
-       mutex_enter(&amap_list_lock);
-       LIST_REMOVE(amap, am_list);
-       mutex_exit(&amap_list_lock);
-}
-
 static int
 amap_roundup_slots(int slots)
 {
 
+#ifdef _LP64
+       /* Align to cacheline boundary for best performance. */
+       return roundup2((slots * sizeof(struct vm_amap *)),
+           COHERENCY_UNIT) / sizeof(struct vm_amap *);
+#else
+       /* On 32-bit, KVA shortage is a concern. */
        return kmem_roundup_size(slots * sizeof(int)) / sizeof(int);
+#endif
 }
 
 #ifdef UVM_AMAP_PPREF
@@ -170,46 +159,79 @@
        const bool nowait = (flags & UVM_FLAG_NOWAIT) != 0;
        const km_flag_t kmflags = nowait ? KM_NOSLEEP : KM_SLEEP;
        struct vm_amap *amap;
+       kmutex_t *newlock, *oldlock;
        int totalslots;
+       size_t sz;
 
        amap = pool_cache_get(&uvm_amap_cache, nowait ? PR_NOWAIT : PR_WAITOK);
        if (amap == NULL) {
                return NULL;
        }
-       totalslots = amap_roundup_slots(slots + padslots);
-       amap->am_lock = NULL;
+       KASSERT(amap->am_lock != NULL);
+       KASSERT(amap->am_nused == 0);
+
+       /* Try to privatize the lock if currently shared. */
+       if (mutex_obj_refcnt(amap->am_lock) > 1) {
+               newlock = mutex_obj_tryalloc(MUTEX_DEFAULT, IPL_NONE);
+               if (newlock != NULL) {
+                       oldlock = amap->am_lock;
+                       mutex_enter(&amap_list_lock);
+                       amap->am_lock = newlock;
+                       mutex_exit(&amap_list_lock);
+                       mutex_obj_free(oldlock);
+               }
+       }
+
+       totalslots = slots + padslots;
        amap->am_ref = 1;
        amap->am_flags = 0;
 #ifdef UVM_AMAP_PPREF
        amap->am_ppref = NULL;
 #endif
-       amap->am_maxslot = totalslots;
        amap->am_nslot = slots;
-       amap->am_nused = 0;
 
        /*
-        * Note: since allocations are likely big, we expect to reduce the
-        * memory fragmentation by allocating them in separate blocks.
+        * For small amaps use the storage in the amap structure.  Otherwise
+        * go to the heap.  Note: since allocations are likely big, we
+        * expect to reduce the memory fragmentation by allocating them in
+        * separate blocks.
         */
-       amap->am_slots = kmem_alloc(totalslots * sizeof(int), kmflags);
-       if (amap->am_slots == NULL)
-               goto fail1;
+       if (totalslots <= UVM_AMAP_TINY) {
+               amap->am_maxslot = UVM_AMAP_TINY;
+               amap->am_anon = AMAP_TINY_ANON(amap);
+               amap->am_slots = AMAP_TINY_SLOTS(amap);
+               amap->am_bckptr = amap->am_slots + UVM_AMAP_TINY;
+       } else if (totalslots <= UVM_AMAP_SMALL) {
+               amap->am_maxslot = UVM_AMAP_SMALL;
+               amap->am_anon = AMAP_TINY_ANON(amap);
 
-       amap->am_bckptr = kmem_alloc(totalslots * sizeof(int), kmflags);
-       if (amap->am_bckptr == NULL)
-               goto fail2;
+               sz = UVM_AMAP_SMALL * sizeof(int) * 2;
+               sz = roundup2(sz, COHERENCY_UNIT);
+               amap->am_slots = kmem_alloc(sz, kmflags);
+               if (amap->am_slots == NULL)
+                       goto fail1;
 
-       amap->am_anon = kmem_alloc(totalslots * sizeof(struct vm_anon *),
-           kmflags);
-       if (amap->am_anon == NULL)
-               goto fail3;
+               amap->am_bckptr = amap->am_slots + amap->am_maxslot;
+       } else {
+               amap->am_maxslot = amap_roundup_slots(totalslots);
+               sz = amap->am_maxslot * sizeof(int) * 2;
+               KASSERT((sz & (COHERENCY_UNIT - 1)) == 0);
+               amap->am_slots = kmem_alloc(sz, kmflags);
+               if (amap->am_slots == NULL)
+                       goto fail1;
+
+               amap->am_bckptr = amap->am_slots + amap->am_maxslot;
+
+               amap->am_anon = kmem_alloc(amap->am_maxslot *
+                   sizeof(struct vm_anon *), kmflags);
+               if (amap->am_anon == NULL)
+                       goto fail2;
+       }
 
        return amap;
 
-fail3:
-       kmem_free(amap->am_bckptr, totalslots * sizeof(int));
 fail2:
-       kmem_free(amap->am_slots, totalslots * sizeof(int));
+       kmem_free(amap->am_slots, amap->am_maxslot * sizeof(int));
 fail1:
        pool_cache_put(&uvm_amap_cache, amap);
 
@@ -248,8 +270,6 @@
        if (amap) {
                memset(amap->am_anon, 0,
                    amap->am_maxslot * sizeof(struct vm_anon *));
-               amap->am_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
-               amap_list_insert(amap);
        }
 
        UVMHIST_LOG(maphist,"<- done, amap = 0x%#jx, sz=%jd", (uintptr_t)amap,
@@ -258,16 +278,70 @@
 }
 
 /*
+ * amap_ctor: pool_cache constructor for new amaps
+ *
+ * => carefully synchronize with amap_swap_off()
+ */
+static int
+amap_ctor(void *arg, void *obj, int flags)
+{
+       struct vm_amap *amap = obj;
+
+       if ((flags & PR_NOWAIT) != 0) {
+               amap->am_lock = mutex_obj_tryalloc(MUTEX_DEFAULT, IPL_NONE);
+               if (amap->am_lock == NULL) {
+                       return ENOMEM;
+               }
+       } else {
+               amap->am_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
+       }
+       amap->am_nused = 0;
+       amap->am_flags = 0;
+
+       mutex_enter(&amap_list_lock);
+       LIST_INSERT_HEAD(&amap_list, amap, am_list);
+       mutex_exit(&amap_list_lock);
+       return 0;
+}
+
+/*
+ * amap_dtor: pool_cache destructor for amaps
+ *
+ * => carefully synchronize with amap_swap_off()
+ */
+static void
+amap_dtor(void *arg, void *obj)
+{
+       struct vm_amap *amap = obj;
+
+       KASSERT(amap->am_nused == 0);
+
+       mutex_enter(&amap_list_lock);
+       LIST_REMOVE(amap, am_list);
+       mutex_exit(&amap_list_lock);
+       mutex_obj_free(amap->am_lock);
+}
+
+/*
  * uvm_amap_init: initialize the amap system.
  */
 void
 uvm_amap_init(void)
 {
 
+#if defined(_LP64)
+       /*
+        * Correct alignment helps performance.  For 32-bit platforms, KVA
+        * availability is a concern so leave them be.
+        */
+       KASSERT((sizeof(struct vm_amap) & (COHERENCY_UNIT - 1)) == 0);
+#endif
+
        mutex_init(&amap_list_lock, MUTEX_DEFAULT, IPL_NONE);
 
-       pool_cache_bootstrap(&uvm_amap_cache, sizeof(struct vm_amap), 0, 0, 0,
-           "amappl", NULL, IPL_NONE, NULL, NULL, NULL);
+       pool_cache_bootstrap(&uvm_amap_cache, sizeof(struct vm_amap),
+           COHERENCY_UNIT, 0, 0, "amappl", NULL, IPL_NONE, amap_ctor,
+           amap_dtor, NULL);
 }
 
 /*
@@ -285,17 +359,19 @@
 
        KASSERT(amap->am_ref == 0 && amap->am_nused == 0);
        KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
-       if (amap->am_lock != NULL) {
-               KASSERT(!mutex_owned(amap->am_lock));
-               mutex_obj_free(amap->am_lock);
+       slots = amap->am_maxslot;
+       if (amap->am_slots != AMAP_TINY_SLOTS(amap)) {
+               kmem_free(amap->am_slots, roundup2(slots * sizeof(int) * 2,
+                   COHERENCY_UNIT));
        }
-       slots = amap->am_maxslot;
-       kmem_free(amap->am_slots, slots * sizeof(*amap->am_slots));
-       kmem_free(amap->am_bckptr, slots * sizeof(*amap->am_bckptr));
-       kmem_free(amap->am_anon, slots * sizeof(*amap->am_anon));
+       if (amap->am_anon != AMAP_TINY_ANON(amap)) {
+               kmem_free(amap->am_anon, slots * sizeof(*amap->am_anon));
+       }
 #ifdef UVM_AMAP_PPREF
-       if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
-               kmem_free(amap->am_ppref, slots * sizeof(*amap->am_ppref));
+       if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
+               kmem_free(amap->am_ppref, roundup2(slots * sizeof(int),
+                   COHERENCY_UNIT));
+       }
 #endif
        pool_cache_put(&uvm_amap_cache, amap);
        UVMHIST_LOG(maphist,"<- done, freed amap = 0x%#jx", (uintptr_t)amap,
@@ -346,8 +422,7 @@
                slotneed = slotoff + slotmapped + slotadd;
                slotadj = 0;
                slotarea = 0;
-       }
-       else {
+       } else {
                slotneed = slotadd + slotmapped;
                slotadj = slotadd - slotoff;
                slotarea = amap->am_maxslot - slotmapped;
@@ -502,23 +577,22 @@
        newppref = NULL;
        if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
                /* Will be handled later if fails. */
-               newppref = kmem_alloc(slotalloc * sizeof(*newppref), kmflags);
+               newppref = kmem_alloc(roundup2(slotalloc * sizeof(int),
+                   COHERENCY_UNIT), kmflags);
        }
 #endif
-       newsl = kmem_alloc(slotalloc * sizeof(*newsl), kmflags);
-       newbck = kmem_alloc(slotalloc * sizeof(*newbck), kmflags);
+       newsl = kmem_alloc(slotalloc * sizeof(*newsl) * 2, kmflags);
+       newbck = newsl + slotalloc;
        newover = kmem_alloc(slotalloc * sizeof(*newover), kmflags);
        if (newsl == NULL || newbck == NULL || newover == NULL) {
 #ifdef UVM_AMAP_PPREF
                if (newppref != NULL) {
-                       kmem_free(newppref, slotalloc * sizeof(*newppref));
+                       kmem_free(newppref, roundup2(slotalloc * sizeof(int),
+                           COHERENCY_UNIT));
                }
 #endif
                if (newsl != NULL) {
-                       kmem_free(newsl, slotalloc * sizeof(*newsl));
-               }


