Source-Changes-HG archive


[src/trunk]: src/sys A final set of scheduler tweaks:



details:   https://anonhg.NetBSD.org/src/rev/0df2d21ed7ce
branches:  trunk
changeset: 467082:0df2d21ed7ce
user:      ad <ad%NetBSD.org@localhost>
date:      Sun Jan 12 22:03:22 2020 +0000

description:
A final set of scheduler tweaks:

- Try hard to keep the vfork() parent and child on the same CPU until
  execve(), failing that on the same core.  In all other cases scatter new
  LWPs among the CPU packages, round-robin, to get the best out of the
  available cache and bus bandwidth.

- Remove the attempts at balancing.  Replace them with a rate-limited skim
  of other CPUs' run queues in sched_idle(), starting in the current package
  and moving outwards (see the sketch after this list).  Add a sysctl
  tunable to change the interval.

- Make the cacheht_time tunable take a value in milliseconds (see the note
  after the diff).

- It's possible to configure things such that no CPU is allowed to run an
  LWP.  Defeat this by always having a default.
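
The round-robin scatter and the rate-limited skim can be pictured with a
small userland sketch.  This is a minimal model under stated assumptions,
not the kernel code: the ring of first-in-package CPUs stands in for
ci_sibling[CPUREL_PACKAGE1ST], the "ticks" counter for hardclock_ticks, and
the names pkg1st_next, may_skim and last_skim are hypothetical.  The real
rate-limit check lives in sched_idle() in kern_runq.c, which the truncated
diff below does not show.

#include <stdbool.h>
#include <stdio.h>

struct cpu {
	int		id;		/* package leader's id */
	struct cpu	*pkg1st_next;	/* ring of first-in-package CPUs */
	struct cpu	*nextpkg;	/* per-CPU round-robin cursor */
	unsigned	last_skim;	/* tick of the last remote skim */
};

static unsigned ticks;			/* stand-in for hardclock_ticks */
static unsigned skim_interval = 10;	/* rate limit (ticks, in this toy) */

/* Advance the cursor and return the next package, like sched_nextpkg(). */
static struct cpu *
next_package(struct cpu *ci)
{
	ci->nextpkg = ci->nextpkg->pkg1st_next;
	return ci->nextpkg;
}

/* Rate-limited gate: may this idle CPU skim remote run queues yet? */
static bool
may_skim(struct cpu *ci)
{
	if (ticks - ci->last_skim < skim_interval)
		return false;
	ci->last_skim = ticks;
	return true;
}

int
main(void)
{
	/* Two packages, one leader CPU each, linked in a ring. */
	struct cpu c0 = { .id = 0 }, c1 = { .id = 1 };
	c0.pkg1st_next = &c1;
	c1.pkg1st_next = &c0;
	c0.nextpkg = &c0;

	for (ticks = 1; ticks <= 30; ticks++) {
		if (may_skim(&c0))
			printf("tick %2u: skim allowed; next package for a "
			    "new LWP is %d\n", ticks, next_package(&c0)->id);
	}
	return 0;
}

Run, this prints a line at ticks 10, 20 and 30, alternating between the two
packages: the skim is throttled by the interval, while new-LWP placement
walks the packages round-robin, the two behaviours described above.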

Reported-by: syzbot+46968944dd9359ab93bc%syzkaller.appspotmail.com@localhost
Reported-by: syzbot+7f750a4cc230d1e831f9%syzkaller.appspotmail.com@localhost
Reported-by: syzbot+88d7675158f5cb4684db%syzkaller.appspotmail.com@localhost
Reported-by: syzbot+d409c2338150e9a8ae1e%syzkaller.appspotmail.com@localhost
Reported-by: syzbot+e152dc5bff188f67358a%syzkaller.appspotmail.com@localhost

diffstat:

 sys/kern/kern_exec.c |   76 ++++--
 sys/kern/kern_runq.c |  500 +++++++++++++++++++++++++-------------------------
 sys/sys/lwp.h        |    3 +-
 sys/sys/sched.h      |    6 +-
 4 files changed, 302 insertions(+), 283 deletions(-)

diffs (truncated from 925 to 300 lines):

diff -r 876a6f8fa937 -r 0df2d21ed7ce sys/kern/kern_exec.c
--- a/sys/kern/kern_exec.c      Sun Jan 12 21:52:36 2020 +0000
+++ b/sys/kern/kern_exec.c      Sun Jan 12 22:03:22 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: kern_exec.c,v 1.487 2020/01/12 18:30:58 ad Exp $       */
+/*     $NetBSD: kern_exec.c,v 1.488 2020/01/12 22:03:22 ad Exp $       */
 
 /*-
  * Copyright (c) 2008, 2019 The NetBSD Foundation, Inc.
@@ -62,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.487 2020/01/12 18:30:58 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.488 2020/01/12 22:03:22 ad Exp $");
 
 #include "opt_exec.h"
 #include "opt_execfmt.h"
@@ -1175,6 +1175,7 @@
        struct exec_package     * const epp = &data->ed_pack;
        int error = 0;
        struct proc             *p;
+       struct vmspace          *vm;
 
        /*
         * In case of a posix_spawn operation, the child doing the exec
@@ -1209,6 +1210,10 @@
         * Do whatever is necessary to prepare the address space
         * for remapping.  Note that this might replace the current
         * vmspace with another!
+        *
+        * vfork(): do not touch any user space data in the new child
+        * until we have awoken the parent below, or it will defeat
+        * lazy pmap switching (on x86).
         */
        if (is_spawn)
                uvmspace_spawn(l, epp->ep_vm_minaddr,
@@ -1218,9 +1223,8 @@
                uvmspace_exec(l, epp->ep_vm_minaddr,
                    epp->ep_vm_maxaddr,
                    epp->ep_flags & EXEC_TOPDOWN_VM);
-
-       struct vmspace          *vm;
        vm = p->p_vmspace;
+
        vm->vm_taddr = (void *)epp->ep_taddr;
        vm->vm_tsize = btoc(epp->ep_tsize);
        vm->vm_daddr = (void*)epp->ep_daddr;
@@ -1232,19 +1236,6 @@
 
        pax_aslr_init_vm(l, vm, epp);
 
-       /* Now map address space. */
-       error = execve_dovmcmds(l, data);
-       if (error != 0)
-               goto exec_abort;
-
-       pathexec(p, epp->ep_resolvedname);
-
-       char * const newstack = STACK_GROW(vm->vm_minsaddr, epp->ep_ssize);
-
-       error = copyoutargs(data, l, newstack);
-       if (error != 0)
-               goto exec_abort;
-
        cwdexec(p);
        fd_closeexec();         /* handle close on exec */
 
@@ -1259,6 +1250,17 @@
        p->p_flag |= PK_EXEC;
        mutex_exit(p->p_lock);
 
+       error = credexec(l, &data->ed_attr);
+       if (error)
+               goto exec_abort;
+
+#if defined(__HAVE_RAS)
+       /*
+        * Remove all RASs from the address space.
+        */
+       ras_purgeall();
+#endif
+
        /*
         * Stop profiling.
         */
@@ -1271,32 +1273,46 @@
        /*
         * It's OK to test PL_PPWAIT unlocked here, as other LWPs have
         * exited and exec()/exit() are the only places it will be cleared.
+        *
+        * Once the parent has been awoken, curlwp may teleport to a new CPU
+        * in sched_vforkexec(), and it's then OK to start messing with user
+        * data.  See comment above.
         */
        if ((p->p_lflag & PL_PPWAIT) != 0) {
+               bool samecpu;
                lwp_t *lp;
 
                mutex_enter(proc_lock);
                lp = p->p_vforklwp;
                p->p_vforklwp = NULL;
-
                l->l_lwpctl = NULL; /* was on loan from blocked parent */
+               cv_broadcast(&lp->l_waitcv);
+
+               /* Clear flags after cv_broadcast() (scheduler needs them). */
                p->p_lflag &= ~PL_PPWAIT;
                lp->l_vforkwaiting = false;
 
-               cv_broadcast(&lp->l_waitcv);
+               /* If parent is still on same CPU, teleport curlwp elsewhere. */
+               samecpu = (lp->l_cpu == curlwp->l_cpu);
                mutex_exit(proc_lock);
+
+               /* Give the parent its CPU back - find a new home. */
+               KASSERT(!is_spawn);
+               sched_vforkexec(l, samecpu);
        }
 
-       error = credexec(l, &data->ed_attr);
-       if (error)
+       /* Now map address space. */
+       error = execve_dovmcmds(l, data);
+       if (error != 0)
                goto exec_abort;
 
-#if defined(__HAVE_RAS)
-       /*
-        * Remove all RASs from the address space.
-        */
-       ras_purgeall();
-#endif
+       pathexec(p, epp->ep_resolvedname);
+
+       char * const newstack = STACK_GROW(vm->vm_minsaddr, epp->ep_ssize);
+
+       error = copyoutargs(data, l, newstack);
+       if (error != 0)
+               goto exec_abort;
 
        doexechooks(p);
 
@@ -1393,8 +1409,10 @@
         * get rid of the (new) address space we have created, if any, get rid
         * of our namei data and vnode, and exit noting failure
         */
-       uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
-               VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
+       if (vm != NULL) {
+               uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
+                       VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
+       }
 
        exec_free_emul_arg(epp);
        pool_put(&exec_pool, data->ed_argp);
diff -r 876a6f8fa937 -r 0df2d21ed7ce sys/kern/kern_runq.c
--- a/sys/kern/kern_runq.c      Sun Jan 12 21:52:36 2020 +0000
+++ b/sys/kern/kern_runq.c      Sun Jan 12 22:03:22 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: kern_runq.c,v 1.57 2020/01/09 16:35:03 ad Exp $        */
+/*     $NetBSD: kern_runq.c,v 1.58 2020/01/12 22:03:22 ad Exp $        */
 
 /*-
  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
@@ -56,7 +56,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.57 2020/01/09 16:35:03 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.58 2020/01/12 22:03:22 ad Exp $");
 
 #include "opt_dtrace.h"
 
@@ -92,7 +92,6 @@
 static void    *sched_getrq(struct schedstate_percpu *, const pri_t);
 #ifdef MULTIPROCESSOR
 static lwp_t * sched_catchlwp(struct cpu_info *);
-static void    sched_balance(void *);
 #endif
 
 /*
@@ -111,14 +110,9 @@
 /*
  * Migration and balancing.
  */
-static u_int   cacheht_time;           /* Cache hotness time */
-static u_int   min_catch;              /* Minimal LWP count for catching */
-static u_int   balance_period;         /* Balance period */
-static u_int   average_weight;         /* Weight old thread count average */
-static struct cpu_info *worker_ci;     /* Victim CPU */
-#ifdef MULTIPROCESSOR
-static struct callout balance_ch;      /* Callout of balancer */
-#endif
+static u_int   cacheht_time;   /* Cache hotness time */
+static u_int   min_catch;      /* Minimal LWP count for catching */
+static u_int   skim_interval;  /* Rate limit for stealing LWPs */
 
 #ifdef KDTRACE_HOOKS
 struct lwp *curthread;
@@ -128,22 +122,14 @@
 runq_init(void)
 {
 
-       /* Balancing */
-       worker_ci = curcpu();
-       cacheht_time = mstohz(3);               /*   ~3 ms */
-       balance_period = mstohz(300);           /* ~300 ms */
+       /* Pulling from remote packages, LWP must not have run for 10ms. */
+       cacheht_time = 10;
 
        /* Minimal count of LWPs for catching */
        min_catch = 1;
-       /* Weight of historical average */
-       average_weight = 50;                    /*   0.5   */
 
-       /* Initialize balancing callout and run it */
-#ifdef MULTIPROCESSOR
-       callout_init(&balance_ch, CALLOUT_MPSAFE);
-       callout_setfunc(&balance_ch, sched_balance, NULL);
-       callout_schedule(&balance_ch, balance_period);
-#endif
+       /* Steal from other CPUs at most every 10ms. */
+       skim_interval = 10;
 }
 
 void
@@ -155,6 +141,7 @@
        u_int i;
 
        spc = &ci->ci_schedstate;
+       spc->spc_nextpkg = ci;
 
        if (spc->spc_lwplock == NULL) {
                spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
@@ -183,7 +170,6 @@
 /*
  * Control of the runqueue.
  */
-
 static inline void *
 sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
 {
@@ -415,18 +401,26 @@
 
 #ifdef MULTIPROCESSOR
 
-/* Estimate if LWP is cache-hot */
+/*
+ * Estimate if LWP is cache-hot.
+ */
 static inline bool
 lwp_cache_hot(const struct lwp *l)
 {
 
-       if (__predict_false(l->l_slptime || l->l_rticks == 0))
+       /* Leave new LWPs in peace, determination has already been made. */
+       if (l->l_stat == LSIDL)
+               return true;
+
+       if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
                return false;
 
-       return (hardclock_ticks - l->l_rticks <= cacheht_time);
+       return (hardclock_ticks - l->l_rticks < mstohz(cacheht_time));
 }
 
-/* Check if LWP can migrate to the chosen CPU */
+/*
+ * Check if LWP can migrate to the chosen CPU.
+ */
 static inline bool
 sched_migratable(const struct lwp *l, struct cpu_info *ci)
 {
@@ -446,75 +440,88 @@
 }
 
 /*
+ * A small helper to do round robin through CPU packages.
+ */
+static struct cpu_info *
+sched_nextpkg(void)
+{
+       struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
+
+       spc->spc_nextpkg = 
+           spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];
+
+       return spc->spc_nextpkg;
+}
+
+/*
  * Find a CPU to run LWP "l".  Look for the CPU with the lowest priority
  * thread.  In case of equal priority, prefer first class CPUs, and amongst
  * the remainder choose the CPU with the fewest runqueue entries.
+ *
+ * Begin the search in the CPU package which "pivot" is a member of.
  */
 static struct cpu_info * __noinline 
-sched_bestcpu(struct lwp *l)
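
A note on the cacheht_time change: the tunable is now stored in
milliseconds and converted to ticks at the point of use, as in the
lwp_cache_hot() hunk above (mstohz(cacheht_time)).  A minimal sketch of
that conversion, assuming the usual approximation of mstohz() as
ms * hz / 1000:

#include <stdio.h>

static unsigned hz = 100;	/* HZ is kernel-configuration dependent */

/* Simplified model of the kernel's mstohz() macro. */
static unsigned
mstohz(unsigned ms)
{
	return ms * hz / 1000u;
}

int
main(void)
{
	/* cacheht_time = 10 ms, as set in runq_init() above. */
	printf("10 ms = %u ticks at hz=%u\n", mstohz(10), hz);
	hz = 1000;
	printf("10 ms = %u ticks at hz=%u\n", mstohz(10), hz);
	return 0;
}

So the same 10 ms threshold is one tick at HZ=100 but ten ticks at HZ=1000,
which is why storing the value in milliseconds makes the tunable independent
of the kernel's HZ setting.  If the new skim tunable is exported under
kern.sched (the node name is an assumption; the sysctl registration is not
in the truncated diff), it would be adjusted with something like
"sysctl -w kern.sched.skim_interval=20".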


