Source-Changes-HG archive


[src/trunk]: src/sys/arch/x86/x86 x86: Eliminate mfence hotpatch for membar_sync



details:   https://anonhg.NetBSD.org/src/rev/4a740dddfcb3
branches:  trunk
changeset: 368723:4a740dddfcb3
user:      riastradh <riastradh@NetBSD.org>
date:      Sat Jul 30 14:11:00 2022 +0000

description:
x86: Eliminate mfence hotpatch for membar_sync.

The more-compatible  LOCK ADD $0,-N(%rsp)  turns out to be cheaper
than MFENCE anyway.  Let's save some space and maintenance and rip
out the hotpatching for it.
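
For illustration only (not part of the commit), the two approaches compare roughly as follows when written as GCC/Clang-style inline assembly on amd64; the function names here are hypothetical, and the real implementations are the assembly routines in atomic.S shown in the diff below.

        /* A minimal sketch, assuming GCC/Clang inline asm on amd64. */
        static inline void
        membar_sync_mfence(void)
        {
                /* SSE2 MFENCE: a full fence, but only on SSE2-capable CPUs. */
                __asm __volatile("mfence" ::: "memory");
        }

        static inline void
        membar_sync_lockadd(void)
        {
                /*
                 * Any LOCK-prefixed read-modify-write drains the store
                 * buffer, so adding 0 to a dummy stack slot gives the same
                 * store-before-load guarantee as MFENCE on every x86 CPU.
                 * Adding 0 leaves the slot's contents unchanged, and the
                 * offset below the stack pointer keeps the return address
                 * out of the dependency chain.
                 */
                __asm __volatile("lock; addq $0, -8(%%rsp)" ::: "memory", "cc");
        }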

diffstat:

 common/lib/libc/arch/i386/atomic/atomic.S   |  29 ++++++++++------------
 common/lib/libc/arch/x86_64/atomic/atomic.S |  36 ++++++++++------------------
 sys/arch/amd64/include/frameasm.h           |   3 +-
 sys/arch/i386/include/frameasm.h            |   9 +++----
 sys/arch/x86/x86/patch.c                    |  31 +-----------------------
 5 files changed, 33 insertions(+), 75 deletions(-)

diffs (225 lines):

diff -r 96595355b94f -r 4a740dddfcb3 common/lib/libc/arch/i386/atomic/atomic.S
--- a/common/lib/libc/arch/i386/atomic/atomic.S Sat Jul 30 13:09:19 2022 +0000
+++ b/common/lib/libc/arch/i386/atomic/atomic.S Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: atomic.S,v 1.35 2022/04/09 23:32:51 riastradh Exp $    */
+/*     $NetBSD: atomic.S,v 1.36 2022/07/30 14:11:00 riastradh Exp $    */
 
 /*-
  * Copyright (c) 2007 The NetBSD Foundation, Inc.
@@ -46,11 +46,9 @@
 #include "opt_xen.h"
 #include <machine/frameasm.h>
 #define LOCK                   HOTPATCH(HP_NAME_NOLOCK, 1); lock
-#define HOTPATCH_SSE2_MFENCE   HOTPATCH(HP_NAME_SSE2_MFENCE, 7);
 #define HOTPATCH_CAS_64                HOTPATCH(HP_NAME_CAS_64, 49);
 #else
 #define LOCK                   lock
-#define HOTPATCH_SSE2_MFENCE   /* nothing */
 #define HOTPATCH_CAS_64                /* nothing */
 #endif
 
@@ -198,13 +196,22 @@
 
 ENTRY(_membar_sync)
        /*
-        * MFENCE, or a serializing instruction like a locked addq,
+        * MFENCE, or a serializing instruction like a locked ADDL,
         * is necessary to order store-before-load.  Every other
         * ordering -- load-before-anything, anything-before-store --
         * is already guaranteed without explicit barriers.
+        *
+        * Empirically it turns out locked ADDL is cheaper than MFENCE,
+        * so we use that, with an offset below the return address on
+        * the stack to avoid a false dependency with RET.  (It might
+        * even be better to use a much lower offset, say -128, to
+        * avoid false dependencies for subsequent callees of the
+        * caller.)
+        *
+        * https://pvk.ca/Blog/2014/10/19/performance-optimisation-~-writing-an-essay/
+        * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+        * https://www.agner.org/optimize/instruction_tables.pdf
         */
-       HOTPATCH_SSE2_MFENCE
-       /* 7 bytes of instructions */
        LOCK
        addl    $0, -4(%esp)
        ret
@@ -406,13 +413,3 @@
 STRONG_ALIAS(_membar_producer,_membar_release)
 STRONG_ALIAS(_membar_enter,_membar_sync)
 STRONG_ALIAS(_membar_exit,_membar_release)
-
-#ifdef _HARDKERNEL
-       .section .rodata
-
-LABEL(sse2_mfence)
-       mfence
-       ret
-       nop; nop; nop;
-LABEL(sse2_mfence_end)
-#endif /* _HARDKERNEL */
diff -r 96595355b94f -r 4a740dddfcb3 common/lib/libc/arch/x86_64/atomic/atomic.S
--- a/common/lib/libc/arch/x86_64/atomic/atomic.S       Sat Jul 30 13:09:19 2022 +0000
+++ b/common/lib/libc/arch/x86_64/atomic/atomic.S       Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: atomic.S,v 1.28 2022/04/09 23:32:52 riastradh Exp $    */
+/*     $NetBSD: atomic.S,v 1.29 2022/07/30 14:11:00 riastradh Exp $    */
 
 /*-
  * Copyright (c) 2007 The NetBSD Foundation, Inc.
@@ -38,15 +38,6 @@
 #define        ALIAS(f, t)     WEAK_ALIAS(f,t)
 #endif
 
-#ifdef _HARDKERNEL
-#include <machine/frameasm.h>
-#define LOCK                   HOTPATCH(HP_NAME_NOLOCK, 1); lock
-#define HOTPATCH_SSE2_MFENCE   HOTPATCH(HP_NAME_SSE2_MFENCE, 8);
-#else
-#define LOCK                   lock
-#define HOTPATCH_SSE2_MFENCE   /* nothing */
-#endif
-
        .text
 
 /* 32-bit */
@@ -273,13 +264,22 @@
 
 ENTRY(_membar_sync)
        /*
-        * MFENCE, or a serializing instruction like a locked addq,
+        * MFENCE, or a serializing instruction like a locked ADDQ,
         * is necessary to order store-before-load.  Every other
         * ordering -- load-before-anything, anything-before-store --
         * is already guaranteed without explicit barriers.
+        *
+        * Empirically it turns out locked ADDQ is cheaper than MFENCE,
+        * so we use that, with an offset below the return address on
+        * the stack to avoid a false dependency with RET.  (It might
+        * even be better to use a much lower offset, say -128, to
+        * avoid false dependencies for subsequent callees of the
+        * caller.)
+        *
+        * https://pvk.ca/Blog/2014/10/19/performance-optimisation-~-writing-an-essay/
+        * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+        * https://www.agner.org/optimize/instruction_tables.pdf
         */
-       HOTPATCH_SSE2_MFENCE
-       /* 8 bytes of instructions */
        LOCK
        addq    $0, -8(%rsp)
        ret
@@ -429,13 +429,3 @@
 STRONG_ALIAS(_membar_producer,_membar_release)
 STRONG_ALIAS(_membar_enter,_membar_sync)
 STRONG_ALIAS(_membar_exit,_membar_release)
-
-#ifdef _HARDKERNEL
-       .section .rodata
-
-LABEL(sse2_mfence)
-       mfence
-       ret
-       nop; nop; nop; nop;
-LABEL(sse2_mfence_end)
-#endif /* _HARDKERNEL */
diff -r 96595355b94f -r 4a740dddfcb3 sys/arch/amd64/include/frameasm.h
--- a/sys/arch/amd64/include/frameasm.h Sat Jul 30 13:09:19 2022 +0000
+++ b/sys/arch/amd64/include/frameasm.h Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: frameasm.h,v 1.54 2022/04/09 12:07:00 riastradh Exp $  */
+/*     $NetBSD: frameasm.h,v 1.55 2022/07/30 14:11:00 riastradh Exp $  */
 
 #ifndef _AMD64_MACHINE_FRAMEASM_H
 #define _AMD64_MACHINE_FRAMEASM_H
@@ -63,7 +63,6 @@
 #define HP_NAME_SVS_ENTER_NMI  11
 #define HP_NAME_SVS_LEAVE_NMI  12
 #define HP_NAME_MDS_LEAVE      13
-#define HP_NAME_SSE2_MFENCE    14
 
 #define HOTPATCH(name, size) \
 123:                                           ; \
diff -r 96595355b94f -r 4a740dddfcb3 sys/arch/i386/include/frameasm.h
--- a/sys/arch/i386/include/frameasm.h  Sat Jul 30 13:09:19 2022 +0000
+++ b/sys/arch/i386/include/frameasm.h  Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: frameasm.h,v 1.34 2022/04/09 12:07:00 riastradh Exp $  */
+/*     $NetBSD: frameasm.h,v 1.35 2022/07/30 14:11:00 riastradh Exp $  */
 
 #ifndef _I386_FRAMEASM_H_
 #define _I386_FRAMEASM_H_
@@ -48,10 +48,9 @@
 #define HP_NAME_STAC           2
 #define HP_NAME_NOLOCK         3
 #define HP_NAME_RETFENCE       4
-#define HP_NAME_SSE2_MFENCE    5
-#define HP_NAME_CAS_64         6
-#define HP_NAME_SPLLOWER       7
-#define HP_NAME_MUTEX_EXIT     8
+#define HP_NAME_CAS_64         5
+#define HP_NAME_SPLLOWER       6
+#define HP_NAME_MUTEX_EXIT     7
 
 #define HOTPATCH(name, size) \
 123:                                           ; \
diff -r 96595355b94f -r 4a740dddfcb3 sys/arch/x86/x86/patch.c
--- a/sys/arch/x86/x86/patch.c  Sat Jul 30 13:09:19 2022 +0000
+++ b/sys/arch/x86/x86/patch.c  Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: patch.c,v 1.50 2022/04/09 12:07:00 riastradh Exp $     */
+/*     $NetBSD: patch.c,v 1.51 2022/07/30 14:11:00 riastradh Exp $     */
 
 /*-
  * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: patch.c,v 1.50 2022/04/09 12:07:00 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: patch.c,v 1.51 2022/07/30 14:11:00 riastradh Exp $");
 
 #include "opt_lockdebug.h"
 #ifdef i386
@@ -117,19 +117,6 @@
 };
 __link_set_add_rodata(x86_hotpatch_descriptors, hp_nolock_desc);
 
-/* Use MFENCE if available, part of SSE2. */
-extern uint8_t sse2_mfence, sse2_mfence_end;
-static const struct x86_hotpatch_source hp_sse2_mfence_source = {
-       .saddr = &sse2_mfence,
-       .eaddr = &sse2_mfence_end
-};
-static const struct x86_hotpatch_descriptor hp_sse2_mfence_desc = {
-       .name = HP_NAME_SSE2_MFENCE,
-       .nsrc = 1,
-       .srcs = { &hp_sse2_mfence_source }
-};
-__link_set_add_rodata(x86_hotpatch_descriptors, hp_sse2_mfence_desc);
-
 #ifdef i386
 /* CAS_64. */
 extern uint8_t _atomic_cas_cx8, _atomic_cas_cx8_end;
@@ -327,20 +314,6 @@
 #endif
        }
 
-       if (!early && (cpu_feature[0] & CPUID_SSE2) != 0) {
-               /*
-                * Faster memory barriers.  The only barrier x86 ever
-                * requires for MI synchronization between CPUs is
-                * MFENCE for store-before-load ordering; all other
-                * ordering is guaranteed already -- every load is a
-                * load-acquire and every store is a store-release.
-                *
-                * LFENCE and SFENCE are relevant only for MD logic
-                * involving I/O devices or non-temporal stores.
-                */
-               x86_hotpatch(HP_NAME_SSE2_MFENCE, 0);
-       }
-
 #ifdef i386
        /*
         * Patch early and late.  Second time around the 'lock' prefix
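
As a side note on why store-before-load is the one ordering x86 does not give for free: the classic flag-handshake pattern below (illustrative only, written with C11 atomics rather than the NetBSD membar API) needs a full barrier between the store and the load, which is exactly what membar_sync() provides; on x86 the seq_cst fence compiles to MFENCE or a locked ADD like the one introduced by this change.

        #include <stdatomic.h>
        #include <stdbool.h>

        static atomic_bool flag[2];

        /* Returns true if this side may proceed into the critical section. */
        static bool
        try_enter(int self)
        {
                /* Announce intent... */
                atomic_store_explicit(&flag[self], true, memory_order_relaxed);
                /* ...full barrier: order the store above before the load below. */
                atomic_thread_fence(memory_order_seq_cst);
                /* ...then check whether the other side also wants in. */
                return !atomic_load_explicit(&flag[1 - self], memory_order_relaxed);
        }

Without the fence, each store can still sit in its CPU's store buffer while the subsequent load executes, so both threads can read false and both proceed; with the full barrier, at most one of them does.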


