Source-Changes-HG archive


[src/trunk]: src/sys/arch/mips/include mips/cavium: Take advantage of Octeon's guaranteed r/rw ordering.



details:   https://anonhg.NetBSD.org/src/rev/28edddf0fd2c
branches:  trunk
changeset: 365728:28edddf0fd2c
user:      riastradh <riastradh@NetBSD.org>
date:      Thu Apr 21 12:06:31 2022 +0000

description:
mips/cavium: Take advantage of Octeon's guaranteed r/rw ordering.

diffstat:

 common/lib/libc/arch/mips/atomic/membar_ops.S |  96 ++++++++++++++++++--------
 sys/arch/mips/include/asm.h                   |  21 ++++-
 2 files changed, 81 insertions(+), 36 deletions(-)
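
Illustrative C sketch (not part of the commit): the classic message-passing pattern these barriers serve.  On Octeon, membar_acquire() can reduce to a plain return because a load is never reordered with later loads or stores, while membar_release() still needs syncw so the payload store stays ahead of the flag store.  The struct and function names below are hypothetical; only membar_release()/membar_acquire() are the real <sys/atomic.h> interfaces touched by this change.

#include <sys/atomic.h>
#include <stdbool.h>

struct msg {
        int             payload;
        volatile bool   ready;
};

/* Producer: write the payload, then publish it by setting the flag. */
static void
publish(struct msg *m, int value)
{
        m->payload = value;
        membar_release();       /* needs store-before-store: syncw on Octeon */
        m->ready = true;
}

/* Consumer: wait for the flag, then read the payload. */
static int
consume(struct msg *m)
{
        while (!m->ready)
                continue;
        membar_acquire();       /* load-before-load/store is free on cnMIPS: no-op */
        return m->payload;
}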

diffs (155 lines):

diff -r 1c456eed4846 -r 28edddf0fd2c common/lib/libc/arch/mips/atomic/membar_ops.S
--- a/common/lib/libc/arch/mips/atomic/membar_ops.S     Thu Apr 21 12:05:13 2022 +0000
+++ b/common/lib/libc/arch/mips/atomic/membar_ops.S     Thu Apr 21 12:06:31 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: membar_ops.S,v 1.12 2022/04/09 23:32:51 riastradh Exp $        */
+/*     $NetBSD: membar_ops.S,v 1.13 2022/04/21 12:06:31 riastradh Exp $        */
 
 /*-
  * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc.
@@ -38,44 +38,80 @@
        j       ra
         BDSYNC
 END(_membar_sync)
+ATOMIC_OP_ALIAS(membar_sync,_membar_sync)
+
+STRONG_ALIAS(_membar_enter,_membar_sync)
+ATOMIC_OP_ALIAS(membar_enter,_membar_sync)
 
 #ifdef __OCTEON__
+
+/*
+ * cnMIPS guarantees load-before-load/store ordering without any
+ * barriers.  So the only barriers we need are store-before-load (sync)
+ * and store-before-store (syncw, i.e., sync 4).  See Table 2-32
+ * `Execution Ordering Rules' on p. 104 of Cavium OCTEON III CN78XX
+ * Hardware Reference Manual, CN78XX-HM-0.99E, September 2014:
+ *
+ *     First Operation         DLD [load instruction to a physical
+ *                             address that is L2/DRAM]
+ *     Second Operation        Any
+ *     Execution Ordering Comments
+ *
+ *             The second operation cannot appear to execute before
+ *             the first (DLD) operation, regardless of the presence
+ *             or absence of SYNC* instructions.
+ *
+ * Note: I'm not sure if this applies to earlier cnMIPS -- can't find
+ * it in the Cavium Networks OCTEON Plus CN50XX Hardware Reference
+ * Manual CN50XX-HM-0.99E, July 2008.  Experimentally, on an erlite3
+ * (Cavium Octeon CN5020-500), I can easily detect reordering of
+ * store-before-store and store-before-load, but I haven't been able to
+ * detect any reordering of load-before-load or load-before-store.
+ *
+ * Note: On early cnMIPS (CN3xxx), there is an erratum which sometimes
+ * requires issuing two syncw's in a row.  I don't know the details --
+ * don't have documentation -- and in Linux it is only used for I/O
+ * purposes.
+ *
+ * Currently we don't build kernels that work on both Octeon and
+ * non-Octeon MIPS CPUs, so none of this is done with binary patching.
+ * For userlands we could use a separate shared library on Octeon with
+ * ld.so.conf to override the symbols with cheaper definitions, but we
+ * don't do that now.
+ */
+
+LEAF(_membar_acquire)
+       j       ra
+        nop
+END(_membar_acquire)
+ATOMIC_OP_ALIAS(membar_acquire,_membar_acquire)
+
+STRONG_ALIAS(_membar_consumer,_membar_acquire)
+ATOMIC_OP_ALIAS(membar_consumer,_membar_acquire)
+
 LEAF(_membar_release)
-       /*
-        * syncw is documented as ordering store-before-store in
-        *
-        *      Cavium OCTEON III CN78XX Hardware Reference Manual,
-        *      CN78XX-HM-0.99E, September 2014.
-        *
-        * It's unclear from the documentation the architecture
-        * guarantees load-before-store ordering without barriers, but
-        * this code assumes it does.  If that assumption is wrong, we
-        * can only use syncw for membar_producer -- membar_release has
-        * to use the full sync.
-        */
        j       ra
         syncw
 END(_membar_release)
-#endif
+ATOMIC_OP_ALIAS(membar_release,_membar_release)
 
-ATOMIC_OP_ALIAS(membar_sync,_membar_sync)
-ATOMIC_OP_ALIAS(membar_acquire,_membar_sync)
-STRONG_ALIAS(_membar_acquire,_membar_sync)
-ATOMIC_OP_ALIAS(membar_enter,_membar_sync)
-STRONG_ALIAS(_membar_enter,_membar_sync)
-#ifdef __OCTEON__
+STRONG_ALIAS(_membar_exit,_membar_release)
 ATOMIC_OP_ALIAS(membar_exit,_membar_release)
-STRONG_ALIAS(_membar_exit,_membar_release)
-ATOMIC_OP_ALIAS(membar_release,_membar_release)
+
+STRONG_ALIAS(_membar_producer,_membar_release)
 ATOMIC_OP_ALIAS(membar_producer,_membar_release)
-STRONG_ALIAS(_membar_producer,_membar_release)
-#else
-ATOMIC_OP_ALIAS(membar_exit,_membar_sync)
-STRONG_ALIAS(_membar_exit,_membar_sync)
-ATOMIC_OP_ALIAS(membar_release,_membar_sync)
+
+#else  /* !__OCTEON__ */
+
+STRONG_ALIAS(_membar_acquire,_membar_sync)
+ATOMIC_OP_ALIAS(membar_acquire,_membar_sync)
 STRONG_ALIAS(_membar_release,_membar_sync)
+ATOMIC_OP_ALIAS(membar_release,_membar_sync)
+STRONG_ALIAS(_membar_exit,_membar_sync)
+ATOMIC_OP_ALIAS(membar_exit,_membar_sync)
+STRONG_ALIAS(_membar_consumer,_membar_sync)
+ATOMIC_OP_ALIAS(membar_consumer,_membar_sync)
+STRONG_ALIAS(_membar_producer,_membar_sync)
 ATOMIC_OP_ALIAS(membar_producer,_membar_sync)
-STRONG_ALIAS(_membar_producer,_membar_sync)
+
 #endif
-ATOMIC_OP_ALIAS(membar_consumer,_membar_sync)
-STRONG_ALIAS(_membar_consumer,_membar_sync)
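
A rough sketch of the kind of store-buffering litmus test the comment above alludes to for the erlite3 experiment.  Everything here (names, structure) is hypothetical and not in the tree; a real run would pin the two routines to different cores and repeat many rounds.

#include <sys/atomic.h>

volatile int x, y;
volatile int r0, r1;

void
thread0(void)
{
        x = 1;
        /* adding membar_sync() here should make the anomaly disappear */
        r0 = y;
}

void
thread1(void)
{
        y = 1;
        r1 = x;
}

/*
 * Starting from x == y == 0, observing r0 == 0 && r1 == 0 after both
 * routines have run demonstrates store-before-load reordering, which
 * is why membar_sync keeps the full `sync' even on Octeon.
 */
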
diff -r 1c456eed4846 -r 28edddf0fd2c sys/arch/mips/include/asm.h
--- a/sys/arch/mips/include/asm.h       Thu Apr 21 12:05:13 2022 +0000
+++ b/sys/arch/mips/include/asm.h       Thu Apr 21 12:06:31 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: asm.h,v 1.70 2022/04/09 14:09:32 riastradh Exp $       */
+/*     $NetBSD: asm.h,v 1.71 2022/04/21 12:06:31 riastradh Exp $       */
 
 /*
  * Copyright (c) 1992, 1993
@@ -574,12 +574,21 @@
 
 /* compiler define */
 #if defined(__OCTEON__)
-                               /* early cnMIPS have erratum which means 2 */
-#define        LLSCSYNC        sync 4; sync 4
+/*
+ * See common/lib/libc/arch/mips/atomic/membar_ops.S for notes on
+ * Octeon memory ordering guarantees and barriers.
+ *
+ * cnMIPS also has a quirk where the store buffer can get clogged and
+ * we need to apply a plunger to it _after_ releasing a lock or else
+ * other CPUs may spin for hundreds of thousands of cycles before they
+ * see the lock is released.  So we also have the quirky SYNC_PLUNGER
+ * barrier as syncw.
+ */
+#define        LLSCSYNC        /* nothing */
 #define        BDSYNC          sync
-#define        BDSYNC_ACQ      sync
-#define        SYNC_ACQ        sync
-#define        SYNC_REL        sync
+#define        BDSYNC_ACQ      nop
+#define        SYNC_ACQ        /* nothing */
+#define        SYNC_REL        sync 4
 #define        BDSYNC_PLUNGER  sync 4
 #define        SYNC_PLUNGER    sync 4
 #elif __mips >= 3 || !defined(__mips_o32)
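
On the asm.h side, SYNC_PLUNGER/BDSYNC_PLUNGER stay as syncw (sync 4).  A hypothetical C rendering of the quirk described in the new comment follows; the lock routine is made up for illustration and none of this code is in the tree.

#include <sys/atomic.h>

/*
 * Release a toy spin lock.  The point is the trailing syncw
 * ("plunger"), which pushes the release store out of the cnMIPS
 * write buffer so other CPUs spinning on the lock word see it
 * promptly instead of stalling for many cycles.
 */
static inline void
toy_spin_unlock(volatile unsigned int *lockp)
{
        membar_release();                       /* store-before-store: sync 4 on Octeon */
        *lockp = 0;                             /* the release store */
#ifdef __OCTEON__
        __asm __volatile("syncw" ::: "memory"); /* the store-buffer plunger */
#endif
}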


