Source-Changes-HG archive


[src/trunk]: src/sys/arch/mips/include mips: Brush up __cpu_simple_lock.



details:   https://anonhg.NetBSD.org/src/rev/7771dbdf76bb
branches:  trunk
changeset: 361158:7771dbdf76bb
user:      riastradh <riastradh@NetBSD.org>
date:      Sat Feb 12 17:10:02 2022 +0000

description:
mips: Brush up __cpu_simple_lock.

- Eradicate last vestiges of mb_* barriers.

- In __cpu_simple_lock_init, omit the needless barrier.  It is the
  caller's responsibility to ensure __cpu_simple_lock_init happens
  before any other operations on the lock, so there was never any
  need for a barrier here.

- In __cpu_simple_lock_try, leave comments about the memory ordering
  guarantees of the kernel's _atomic_cas_uint, which are inexplicably
  different from those of the non-underscored atomic_cas_uint.

- In __cpu_simple_unlock, use membar_exit instead of mb_memory, and do
  it unconditionally.

  This ensures that in __cpu_simple_lock/.../__cpu_simple_unlock, all
  memory operations in the ellipsis happen before the store that
  releases the lock.  (A portable C11 sketch of this pairing follows
  the list.)

  - On Octeon, the barrier was omitted altogether, which is a bug --
    it needs to be there, or else there is no happens-before relation
    and whoever takes the lock next might see stale stored values or
    even stomp over the unlocking CPU's delayed loads.

  - On non-Octeon, the mb_memory was sync.  Using membar_exit
    preserves this.

  XXX On Octeon, membar_exit issues only syncw -- this seems wrong:
  that is only store-before-store, not load/store-before-store --
  unless the CNMIPS architecture, like SPARCv8/v9 PSO (`Partial Store
  Order'), guarantees it is sufficient here.  (A litmus-style sketch
  of the hazard follows the list.)

- Leave an essay with citations about why we have an apparently
  pointless syncw _after_ releasing a lock: it works around a design
  bug^W^Wquirk in cnmips, which sometimes buffers stores for hundreds
  of thousands of cycles for fun unless you issue syncw.  (A
  hypothetical store-release shape folding this in follows the list.)
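
To make the pairing concrete, here is a minimal sketch of the same
discipline in portable C11 atomics.  This is an illustration only,
not the committed MIPS code: the sketch_* names are invented here,
and the real kernel primitives are _atomic_cas_uint and membar_exit.

#include <stdatomic.h>
#include <stdbool.h>

typedef atomic_uint sketch_lock_t;	/* hypothetical stand-in */

static inline void
sketch_lock_init(sketch_lock_t *lp)
{

	/* No barrier: the caller must publish the lock safely anyway. */
	atomic_init(lp, 0);
}

static inline bool
sketch_lock_try(sketch_lock_t *lp)
{
	unsigned unlocked = 0;

	/*
	 * Acquire on success, like the kernel's _atomic_cas_uint,
	 * which issues sync after the LL/SC CAS succeeds.
	 */
	return atomic_compare_exchange_strong_explicit(lp, &unlocked, 1,
	    memory_order_acquire, memory_order_relaxed);
}

static inline void
sketch_unlock(sketch_lock_t *lp)
{

	/*
	 * Release, like membar_exit followed by a plain store: all
	 * memory operations in the critical section happen-before
	 * the store that passes the lock to the next owner.
	 */
	atomic_store_explicit(lp, 0, memory_order_release);
}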
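
The following litmus-style sketch (hypothetical, pthreads plus C11
atomics) shows the invariant at stake in the XXX: with full release
ordering, the load at (1) cannot be satisfied after the unlock at
(2), so the assertion holds.  A barrier that is only
store-before-store, like a bare syncw, would let (1) drift past (2),
after which the writer could stomp `shared' while the load is still
in flight.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

static atomic_uint lock;	/* 0 = unlocked, 1 = locked */
static unsigned shared;

static void
acquire(atomic_uint *lp)
{
	unsigned unlocked;

	do {
		unlocked = 0;
	} while (!atomic_compare_exchange_weak_explicit(lp, &unlocked, 1,
	    memory_order_acquire, memory_order_relaxed));
}

static void *
writer(void *arg)
{

	(void)arg;
	acquire(&lock);		/* spins until main unlocks */
	shared = 42;
	atomic_store_explicit(&lock, 0, memory_order_release);
	return NULL;
}

int
main(void)
{
	pthread_t t;
	unsigned r;

	acquire(&lock);		/* main holds the lock first */
	pthread_create(&t, NULL, writer, NULL);
	r = shared;		/* (1) load inside the critical section */
	atomic_store_explicit(&lock, 0, memory_order_release);	/* (2) */
	pthread_join(t, NULL);
	assert(r == 0);		/* release ordering forbids seeing 42 */
	return 0;
}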
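
And here is the hypothetical shape the XXX in the new comment muses
about, folding the Octeon workaround into a store-release helper: a
full sync before the store (load/store-before-store, matching
membar_exit on non-Octeon), plus Cavium's recommended trailing syncw
so the store does not linger in the write buffer.  A sketch under
those assumptions, not committed code; sketch_store_release is an
invented name.

/* MIPS-only illustration (GCC inline asm). */
static inline void
sketch_store_release(volatile unsigned *p, unsigned v)
{

	__asm volatile("sync" ::: "memory");	/* release barrier */
	*p = v;					/* the releasing store */
#ifdef _MIPS_ARCH_OCTEONP
	__asm volatile("syncw" ::: "memory");	/* flush store buffer */
#endif
}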

diffstat:

 common/lib/libc/arch/mips/atomic/membar_ops.S |   12 +--
 sys/arch/mips/include/lock.h                  |  116 ++++++++++++++-----------
 2 files changed, 64 insertions(+), 64 deletions(-)

diffs (189 lines):

diff -r 40e67b2ca44d -r 7771dbdf76bb common/lib/libc/arch/mips/atomic/membar_ops.S
--- a/common/lib/libc/arch/mips/atomic/membar_ops.S     Sat Feb 12 17:09:43 2022 +0000
+++ b/common/lib/libc/arch/mips/atomic/membar_ops.S     Sat Feb 12 17:10:02 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: membar_ops.S,v 1.10 2020/08/10 14:37:38 skrll Exp $    */
+/*     $NetBSD: membar_ops.S,v 1.11 2022/02/12 17:10:02 riastradh Exp $        */
 
 /*-
  * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc.
@@ -46,16 +46,6 @@
 END(_membar_producer)
 #endif
 
-#ifdef _KERNEL
-STRONG_ALIAS(mb_read, _membar_sync)
-#ifdef __OCTEON__
-STRONG_ALIAS(mb_write, _membar_producer)
-#else
-STRONG_ALIAS(mb_write, _membar_sync)
-#endif
-STRONG_ALIAS(mb_memory, _membar_sync)
-#endif
-
 ATOMIC_OP_ALIAS(membar_sync,_membar_sync)
 ATOMIC_OP_ALIAS(membar_enter,_membar_sync)
 STRONG_ALIAS(_membar_enter,_membar_sync)
diff -r 40e67b2ca44d -r 7771dbdf76bb sys/arch/mips/include/lock.h
--- a/sys/arch/mips/include/lock.h      Sat Feb 12 17:09:43 2022 +0000
+++ b/sys/arch/mips/include/lock.h      Sat Feb 12 17:10:02 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: lock.h,v 1.21 2020/08/05 05:24:44 simonb Exp $ */
+/*     $NetBSD: lock.h,v 1.22 2022/02/12 17:10:02 riastradh Exp $      */
 
 /*-
  * Copyright (c) 2001, 2007 The NetBSD Foundation, Inc.
@@ -41,6 +41,8 @@
 
 #include <sys/param.h>
 
+#include <sys/atomic.h>
+
 static __inline int
 __SIMPLELOCK_LOCKED_P(const __cpu_simple_lock_t *__ptr)
 {
@@ -98,63 +100,30 @@
        return (v0 != 0);
 }
 
-#ifdef MIPS1
-static __inline void
-mb_read(void)
-{
-       __insn_barrier();
-}
-
-static __inline void
-mb_write(void)
-{
-       __insn_barrier();
-}
-
-static __inline void
-mb_memory(void)
-{
-       __insn_barrier();
-}
-#else  /* MIPS1*/
-static __inline void
-mb_read(void)
-{
-       __asm volatile(
-               "       .set push               \n"
-               "       .set mips2              \n"
-               "       sync                    \n"
-               "       .set pop"
-               ::: "memory"
-       );
-}
-
-static __inline void
-mb_write(void)
-{
-       mb_read();
-}
-
-static __inline void
-mb_memory(void)
-{
-       mb_read();
-}
-#endif /* MIPS1 */
-
 #else  /* !_HARDKERNEL */
 
 u_int  _atomic_cas_uint(volatile u_int *, u_int, u_int);
 u_long _atomic_cas_ulong(volatile u_long *, u_long, u_long);
 void * _atomic_cas_ptr(volatile void *, void *, void *);
-void   mb_read(void);
-void   mb_write(void);
-void   mb_memory(void);
 
 static __inline int
 __cpu_simple_lock_try(__cpu_simple_lock_t *lp)
 {
 
+       /*
+        * Successful _atomic_cas_uint functions as a load-acquire --
+        * on MP systems, it issues sync after the LL/SC CAS succeeds;
+        * on non-MP systems every load is a load-acquire so it's moot.
+        * This pairs with the membar_exit and store sequence in
+        * __cpu_simple_unlock that functions as a store-release
+        * operation.
+        *
+        * NOTE: This applies only to _atomic_cas_uint (with the
+        * underscore), in sys/arch/mips/mips/lock_stubs_*.S.  Not true
+        * for atomic_cas_uint (without the underscore), from
+        * common/lib/libc/arch/mips/atomic/atomic_cas.S which does not
+        * imply a load-acquire.  It is unclear why these disagree.
+        */
        return _atomic_cas_uint(lp,
            __SIMPLELOCK_UNLOCKED, __SIMPLELOCK_LOCKED) ==
            __SIMPLELOCK_UNLOCKED;
@@ -167,7 +136,6 @@
 {
 
        *lp = __SIMPLELOCK_UNLOCKED;
-       mb_memory();
 }
 
 static __inline void
@@ -184,12 +152,54 @@
 __cpu_simple_unlock(__cpu_simple_lock_t *lp)
 {
 
-#ifndef _MIPS_ARCH_OCTEONP
-       mb_memory();
-#endif
+       /*
+        * The membar_exit and then store functions as a store-release
+        * operation that pairs with the load-acquire operation in
+        * successful __cpu_simple_lock_try.
+        *
+        * Can't use atomic_store_release here because that's not
+        * available in userland at the moment.
+        */
+       membar_exit();
        *lp = __SIMPLELOCK_UNLOCKED;
+
 #ifdef _MIPS_ARCH_OCTEONP
-       mb_write();
+       /*
+        * On Cavium's recommendation, we issue an extra SYNCW that is
+        * not necessary for correct ordering because apparently stores
+        * can get stuck in Octeon store buffers for hundreds of
+        * thousands of cycles, according to the following note:
+        *
+        *      Programming Notes:
+        *      [...]
+        *      Core A (writer)
+        *      SW R1, DATA
+        *      LI R2, 1
+        *      SYNCW
+        *      SW R2, FLAG
+        *      SYNCW
+        *      [...]
+        *
+        *      The second SYNCW instruction executed by core A is not
+        *      necessary for correctness, but has very important
+        *      performance effects on OCTEON.  Without it, the store
+        *      to FLAG may linger in core A's write buffer before it
+        *      becomes visible to other cores.  (If core A is not
+        *      performing many stores, this may add hundreds of
+        *      thousands of cycles to the flag release time since the
+        *      OCTEON core normally retains stores to attempt to merge
+        *      them before sending the store on the CMB.)
+        *      Applications should include this second SYNCW
+        *      instruction after flag or lock releases.
+        *
+        * Cavium Networks OCTEON Plus CN50XX Hardware Reference
+        * Manual, July 2008, Appendix A, p. 943.
+        * https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hactive/CN50XX-HRM-V0.99E.pdf
+        *
+        * XXX It might be prudent to put this into
+        * atomic_store_release itself.
+        */
+       __asm volatile("syncw" ::: "memory");
 #endif
 }
 


