Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/arch/x86/x86 x86: Eliminate mfence hotpatch for membar_sync.
details: https://anonhg.NetBSD.org/src/rev/4a740dddfcb3
branches: trunk
changeset: 368723:4a740dddfcb3
user: riastradh <riastradh%NetBSD.org@localhost>
date: Sat Jul 30 14:11:00 2022 +0000
description:
x86: Eliminate mfence hotpatch for membar_sync.
The more-compatible LOCK ADD $0,-N(%rsp) turns out to be cheaper
than MFENCE anyway. Let's save some space and maintenance and rip
out the hotpatching for it.
diffstat:
common/lib/libc/arch/i386/atomic/atomic.S | 29 ++++++++++------------
common/lib/libc/arch/x86_64/atomic/atomic.S | 36 ++++++++++------------------
sys/arch/amd64/include/frameasm.h | 3 +-
sys/arch/i386/include/frameasm.h | 9 +++----
sys/arch/x86/x86/patch.c | 31 +-----------------------
5 files changed, 33 insertions(+), 75 deletions(-)
diffs (225 lines):
diff -r 96595355b94f -r 4a740dddfcb3 common/lib/libc/arch/i386/atomic/atomic.S
--- a/common/lib/libc/arch/i386/atomic/atomic.S Sat Jul 30 13:09:19 2022 +0000
+++ b/common/lib/libc/arch/i386/atomic/atomic.S Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: atomic.S,v 1.35 2022/04/09 23:32:51 riastradh Exp $ */
+/* $NetBSD: atomic.S,v 1.36 2022/07/30 14:11:00 riastradh Exp $ */
/*-
* Copyright (c) 2007 The NetBSD Foundation, Inc.
@@ -46,11 +46,9 @@
#include "opt_xen.h"
#include <machine/frameasm.h>
#define LOCK HOTPATCH(HP_NAME_NOLOCK, 1); lock
-#define HOTPATCH_SSE2_MFENCE HOTPATCH(HP_NAME_SSE2_MFENCE, 7);
#define HOTPATCH_CAS_64 HOTPATCH(HP_NAME_CAS_64, 49);
#else
#define LOCK lock
-#define HOTPATCH_SSE2_MFENCE /* nothing */
#define HOTPATCH_CAS_64 /* nothing */
#endif
@@ -198,13 +196,22 @@
ENTRY(_membar_sync)
/*
- * MFENCE, or a serializing instruction like a locked addq,
+ * MFENCE, or a serializing instruction like a locked ADDL,
* is necessary to order store-before-load. Every other
* ordering -- load-before-anything, anything-before-store --
* is already guaranteed without explicit barriers.
+ *
+ * Empirically it turns out locked ADDL is cheaper than MFENCE,
+ * so we use that, with an offset below the return address on
+ * the stack to avoid a false dependency with RET. (It might
+ * even be better to use a much lower offset, say -128, to
+ * avoid false dependencies for subsequent callees of the
+ * caller.)
+ *
+ * https://pvk.ca/Blog/2014/10/19/performance-optimisation-~-writing-an-essay/
+ * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ * https://www.agner.org/optimize/instruction_tables.pdf
*/
- HOTPATCH_SSE2_MFENCE
- /* 7 bytes of instructions */
LOCK
addl $0, -4(%esp)
ret
@@ -406,13 +413,3 @@
STRONG_ALIAS(_membar_producer,_membar_release)
STRONG_ALIAS(_membar_enter,_membar_sync)
STRONG_ALIAS(_membar_exit,_membar_release)
-
-#ifdef _HARDKERNEL
- .section .rodata
-
-LABEL(sse2_mfence)
- mfence
- ret
- nop; nop; nop;
-LABEL(sse2_mfence_end)
-#endif /* _HARDKERNEL */
diff -r 96595355b94f -r 4a740dddfcb3 common/lib/libc/arch/x86_64/atomic/atomic.S
--- a/common/lib/libc/arch/x86_64/atomic/atomic.S Sat Jul 30 13:09:19 2022 +0000
+++ b/common/lib/libc/arch/x86_64/atomic/atomic.S Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: atomic.S,v 1.28 2022/04/09 23:32:52 riastradh Exp $ */
+/* $NetBSD: atomic.S,v 1.29 2022/07/30 14:11:00 riastradh Exp $ */
/*-
* Copyright (c) 2007 The NetBSD Foundation, Inc.
@@ -38,15 +38,6 @@
#define ALIAS(f, t) WEAK_ALIAS(f,t)
#endif
-#ifdef _HARDKERNEL
-#include <machine/frameasm.h>
-#define LOCK HOTPATCH(HP_NAME_NOLOCK, 1); lock
-#define HOTPATCH_SSE2_MFENCE HOTPATCH(HP_NAME_SSE2_MFENCE, 8);
-#else
-#define LOCK lock
-#define HOTPATCH_SSE2_MFENCE /* nothing */
-#endif
-
.text
/* 32-bit */
@@ -273,13 +264,22 @@
ENTRY(_membar_sync)
/*
- * MFENCE, or a serializing instruction like a locked addq,
+ * MFENCE, or a serializing instruction like a locked ADDQ,
* is necessary to order store-before-load. Every other
* ordering -- load-before-anything, anything-before-store --
* is already guaranteed without explicit barriers.
+ *
+ * Empirically it turns out locked ADDQ is cheaper than MFENCE,
+ * so we use that, with an offset below the return address on
+ * the stack to avoid a false dependency with RET. (It might
+ * even be better to use a much lower offset, say -128, to
+ * avoid false dependencies for subsequent callees of the
+ * caller.)
+ *
+ * https://pvk.ca/Blog/2014/10/19/performance-optimisation-~-writing-an-essay/
+ * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ * https://www.agner.org/optimize/instruction_tables.pdf
*/
- HOTPATCH_SSE2_MFENCE
- /* 8 bytes of instructions */
LOCK
addq $0, -8(%rsp)
ret
@@ -429,13 +429,3 @@
STRONG_ALIAS(_membar_producer,_membar_release)
STRONG_ALIAS(_membar_enter,_membar_sync)
STRONG_ALIAS(_membar_exit,_membar_release)
-
-#ifdef _HARDKERNEL
- .section .rodata
-
-LABEL(sse2_mfence)
- mfence
- ret
- nop; nop; nop; nop;
-LABEL(sse2_mfence_end)
-#endif /* _HARDKERNEL */
diff -r 96595355b94f -r 4a740dddfcb3 sys/arch/amd64/include/frameasm.h
--- a/sys/arch/amd64/include/frameasm.h Sat Jul 30 13:09:19 2022 +0000
+++ b/sys/arch/amd64/include/frameasm.h Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: frameasm.h,v 1.54 2022/04/09 12:07:00 riastradh Exp $ */
+/* $NetBSD: frameasm.h,v 1.55 2022/07/30 14:11:00 riastradh Exp $ */
#ifndef _AMD64_MACHINE_FRAMEASM_H
#define _AMD64_MACHINE_FRAMEASM_H
@@ -63,7 +63,6 @@
#define HP_NAME_SVS_ENTER_NMI 11
#define HP_NAME_SVS_LEAVE_NMI 12
#define HP_NAME_MDS_LEAVE 13
-#define HP_NAME_SSE2_MFENCE 14
#define HOTPATCH(name, size) \
123: ; \
diff -r 96595355b94f -r 4a740dddfcb3 sys/arch/i386/include/frameasm.h
--- a/sys/arch/i386/include/frameasm.h Sat Jul 30 13:09:19 2022 +0000
+++ b/sys/arch/i386/include/frameasm.h Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: frameasm.h,v 1.34 2022/04/09 12:07:00 riastradh Exp $ */
+/* $NetBSD: frameasm.h,v 1.35 2022/07/30 14:11:00 riastradh Exp $ */
#ifndef _I386_FRAMEASM_H_
#define _I386_FRAMEASM_H_
@@ -48,10 +48,9 @@
#define HP_NAME_STAC 2
#define HP_NAME_NOLOCK 3
#define HP_NAME_RETFENCE 4
-#define HP_NAME_SSE2_MFENCE 5
-#define HP_NAME_CAS_64 6
-#define HP_NAME_SPLLOWER 7
-#define HP_NAME_MUTEX_EXIT 8
+#define HP_NAME_CAS_64 5
+#define HP_NAME_SPLLOWER 6
+#define HP_NAME_MUTEX_EXIT 7
#define HOTPATCH(name, size) \
123: ; \
diff -r 96595355b94f -r 4a740dddfcb3 sys/arch/x86/x86/patch.c
--- a/sys/arch/x86/x86/patch.c Sat Jul 30 13:09:19 2022 +0000
+++ b/sys/arch/x86/x86/patch.c Sat Jul 30 14:11:00 2022 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: patch.c,v 1.50 2022/04/09 12:07:00 riastradh Exp $ */
+/* $NetBSD: patch.c,v 1.51 2022/07/30 14:11:00 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: patch.c,v 1.50 2022/04/09 12:07:00 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: patch.c,v 1.51 2022/07/30 14:11:00 riastradh Exp $");
#include "opt_lockdebug.h"
#ifdef i386
@@ -117,19 +117,6 @@
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_nolock_desc);
-/* Use MFENCE if available, part of SSE2. */
-extern uint8_t sse2_mfence, sse2_mfence_end;
-static const struct x86_hotpatch_source hp_sse2_mfence_source = {
- .saddr = &sse2_mfence,
- .eaddr = &sse2_mfence_end
-};
-static const struct x86_hotpatch_descriptor hp_sse2_mfence_desc = {
- .name = HP_NAME_SSE2_MFENCE,
- .nsrc = 1,
- .srcs = { &hp_sse2_mfence_source }
-};
-__link_set_add_rodata(x86_hotpatch_descriptors, hp_sse2_mfence_desc);
-
#ifdef i386
/* CAS_64. */
extern uint8_t _atomic_cas_cx8, _atomic_cas_cx8_end;
@@ -327,20 +314,6 @@
#endif
}
- if (!early && (cpu_feature[0] & CPUID_SSE2) != 0) {
- /*
- * Faster memory barriers. The only barrier x86 ever
- * requires for MI synchronization between CPUs is
- * MFENCE for store-before-load ordering; all other
- * ordering is guaranteed already -- every load is a
- * load-acquire and every store is a store-release.
- *
- * LFENCE and SFENCE are relevant only for MD logic
- * involving I/O devices or non-temporal stores.
- */
- x86_hotpatch(HP_NAME_SSE2_MFENCE, 0);
- }
-
#ifdef i386
/*
* Patch early and late. Second time around the 'lock' prefix
Home |
Main Index |
Thread Index |
Old Index