From a4c4afccb1529233366e267998093c3adaf7439d Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sun, 19 Feb 2023 16:17:04 +0000
Subject: [PATCH 1/8] mips: Add missing barrier in cpu_switchto.

Otherwise adaptive mutexes don't work -- not reliably, anyway.
Details in comments.

XXX pullup-8
XXX pullup-9
XXX pullup-10
---
 sys/arch/mips/include/asm.h | 12 ++++++++++++
 sys/arch/mips/mips/locore.S | 27 +++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/sys/arch/mips/include/asm.h b/sys/arch/mips/include/asm.h
index 8673c5063e1e..0fcb3230aac8 100644
--- a/sys/arch/mips/include/asm.h
+++ b/sys/arch/mips/include/asm.h
@@ -609,6 +609,18 @@ _C_LABEL(x):
 #define	SYNC_PLUNGER	/* nothing */
 #endif
 
+/*
+ * Store-before-store and load-before-load barriers.  These could be
+ * made weaker than release (load/store-before-store) and acquire
+ * (load-before-load/store) barriers, and newer MIPS does have
+ * instruction encodings for finer-grained barriers like this, but I
+ * don't know how to appropriately conditionalize their use or get
+ * the assembler to accept them, so we'll use these definitions for
+ * now.
+ */
+#define	SYNC_PRODUCER	SYNC_REL
+#define	SYNC_CONSUMER	SYNC_ACQ
+
 /* CPU dependent hook for cp0 load delays */
 #if defined(MIPS1) || defined(MIPS2) || defined(MIPS3)
 #define	MFC0_HAZARD	sll $0,$0,1	/* super scalar nop */
diff --git a/sys/arch/mips/mips/locore.S b/sys/arch/mips/mips/locore.S
index 87b8e6ed0f73..c9e0b35282af 100644
--- a/sys/arch/mips/mips/locore.S
+++ b/sys/arch/mips/mips/locore.S
@@ -286,6 +286,33 @@ NESTED(cpu_switchto, CALLFRAME_SIZ, ra)
 
 	PTR_L	t2, L_CPU(MIPS_CURLWP)
 	nop				# patchable load delay slot
+
+	/*
+	 * Issue a store-before-store barrier so that any prior
+	 * mutex_exit is visible to any thread that witnesses this
+	 * store.  See kern_mutex.c for details -- this is necessary
+	 * for adaptive mutexes to detect whether the lwp is on the
+	 * CPU in order to safely block without requiring atomic CAS
+	 * in mutex_exit.
+	 *
+	 * This serves as a membar_producer that matches the
+	 * membar_consumer between owner = mtx->mtx_owner and
+	 * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner)
+	 * in mutex_vector_enter.
+	 *
+	 * Note that we don't need to issue a second membar_producer
+	 * to match the membar_consumer between mutex_oncpu(owner)
+	 * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is
+	 * issued by mutex_exit itself before or as part of the store
+	 * that actually releases the mutex.
+	 *
+	 * A store-before-store barrier is sufficient -- no need for
+	 * load/store-before-store or store-release -- because the
+	 * other side, in mutex_vector_enter, is read-only for this
+	 * protocol, allowing us to use membar_producer/consumer
+	 * semantics and not the stronger membar_release/acquire.
+	 */
+	SYNC_PRODUCER
 	PTR_S	MIPS_CURLWP, CPU_INFO_CURLWP(t2)
 
 	/* Check for restartable atomic sequences (RAS) */
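[Editorial note: the protocol this barrier participates in can be sketched
in C.  This is not the actual kern_mutex.c or cpu_switchto code -- the
struct layouts and function names are simplified stand-ins -- and C11
fences are used as the closest portable analogue of the store-before-store
and load-before-load barriers (release/acquire fences are slightly
stronger than strictly required, as the comment above notes).]

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct lwp;

	struct cpu_info {
		_Atomic(struct lwp *) ci_curlwp;  /* lwp running on this cpu */
	};

	struct lwp {
		struct cpu_info *l_cpu;           /* cpu this lwp last ran on */
	};

	struct kmutex {
		_Atomic(struct lwp *) mtx_owner;  /* NULL when released */
	};

	/* Producer side: cpu_switchto, possibly after a mutex_exit by
	 * the outgoing lwp. */
	void
	sketch_switch_curlwp(struct cpu_info *ci, struct lwp *newlwp)
	{
		/*
		 * SYNC_PRODUCER / dmb ishst / membar #StoreStore / wmb /
		 * sync / fence w,w: order any prior mtx_owner store
		 * before the ci_curlwp store.
		 */
		atomic_thread_fence(memory_order_release);
		atomic_store_explicit(&ci->ci_curlwp, newlwp,
		    memory_order_relaxed);
	}

	/* Consumer side: mutex_vector_enter's "is the owner still on a
	 * cpu" check, which is read-only for this protocol. */
	bool
	sketch_owner_running(struct kmutex *mtx)
	{
		struct lwp *owner;

		owner = atomic_load_explicit(&mtx->mtx_owner,
		    memory_order_relaxed);
		if (owner == NULL)
			return false;	/* released -- no need to block */

		/* membar_consumer: load mtx_owner before ci_curlwp */
		atomic_thread_fence(memory_order_acquire);

		return atomic_load_explicit(&owner->l_cpu->ci_curlwp,
		    memory_order_relaxed) == owner;
	}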
From efc1c3fb0c3e615404c05768e1111d639ad9e3f4 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sun, 19 Feb 2023 21:03:19 +0000
Subject: [PATCH 2/8] aarch64: Fix barrier in cpu_switchto.

The two relevant orderings, which mutex_vector_enter relies on, are:

1.	/* potential mutex_exit */
	...
	mtx->mtx_owner = 0;		// (a)
	...
	/* cpu_switchto */
	...
	ci->ci_curlwp = otherlwp;	// (b)

2.	/* cpu_switchto */
	...
	ci->ci_curlwp = owner;		// (a)
	...
	/* potential mutex_exit */
	...
	mtx->mtx_owner = 0;		// (b)

In scenario 2, mutex_exit (whether via MUTEX_RELEASE or via the MD
lock stubs) already issues a load/store-before-store barrier before
2(b), so cpu_switchto need not issue anything after 2(a) -- the
existing DMB ISHST after the store is unnecessary, and this change
removes it.

In scenario 1, a DMB ISHST or stronger is required between 1(a) and
1(b), but none is currently issued.  This change inserts one, just
before the store ci->ci_curlwp = otherlwp.

XXX pullup-8
XXX pullup-9
XXX pullup-10
---
 sys/arch/aarch64/aarch64/cpuswitch.S | 29 +++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/sys/arch/aarch64/aarch64/cpuswitch.S b/sys/arch/aarch64/aarch64/cpuswitch.S
index 6fd1ab26bf57..992044680043 100644
--- a/sys/arch/aarch64/aarch64/cpuswitch.S
+++ b/sys/arch/aarch64/aarch64/cpuswitch.S
@@ -125,8 +125,35 @@ ENTRY_NP(cpu_switchto)
 
 	msr	tpidr_el1, x1		/* switch curlwp to new lwp */
 	ldr	x3, [x1, #L_CPU]
+
+	/*
+	 * Issue a store-before-store barrier so that any prior
+	 * mutex_exit is visible to any thread that witnesses this
+	 * store.  See kern_mutex.c for details -- this is necessary
+	 * for adaptive mutexes to detect whether the lwp is on the
+	 * CPU in order to safely block without requiring atomic CAS
+	 * in mutex_exit.
+	 *
+	 * This serves as a membar_producer that matches the
+	 * membar_consumer between owner = mtx->mtx_owner and
+	 * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner)
+	 * in mutex_vector_enter.
+	 *
+	 * Note that we don't need to issue a second membar_producer
+	 * to match the membar_consumer between mutex_oncpu(owner)
+	 * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is
+	 * issued by mutex_exit itself before or as part of the store
+	 * that actually releases the mutex.
+	 *
+	 * A store-before-store barrier is sufficient -- no need for
+	 * load/store-before-store or store-release -- because the
+	 * other side, in mutex_vector_enter, is read-only for this
+	 * protocol, allowing us to use membar_producer/consumer
+	 * semantics and not the stronger membar_release/acquire.
+	 */
+	dmb	ishst
 	str	x1, [x3, #CI_CURLWP]	/* switch curlwp to new lwp */
-	dmb	ishst			/* see comments in kern_mutex.c */
+
 	ENABLE_INTERRUPT
 
 	/*
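[Editorial note: scenario 1 above has the classic message-passing shape,
which can be written as a two-thread litmus sketch.  Again a hypothetical
C11 rendering, not kernel code: the point is only that once the observer
sees store 1(b), the barrier guarantees it also sees store 1(a), so a
waiter that finds the owner off-CPU cannot miss the mutex release.]

	#include <assert.h>
	#include <stdatomic.h>

	struct lwp { int id; };
	static struct lwp owner_lwp, other_lwp;

	/* Initially: mutex held by owner_lwp, which is on the cpu. */
	static _Atomic(struct lwp *) mtx_owner = &owner_lwp;
	static _Atomic(struct lwp *) ci_curlwp = &owner_lwp;

	/* CPU 0: mutex_exit followed by cpu_switchto */
	static void
	cpu0(void)
	{
		atomic_store_explicit(&mtx_owner, NULL,		/* 1(a) */
		    memory_order_relaxed);
		atomic_thread_fence(memory_order_release);	/* new DMB ISHST */
		atomic_store_explicit(&ci_curlwp, &other_lwp,	/* 1(b) */
		    memory_order_relaxed);
	}

	/* CPU 1: mutex_vector_enter's on-cpu check */
	static void
	cpu1(void)
	{
		if (atomic_load_explicit(&ci_curlwp,
		    memory_order_relaxed) == &other_lwp) {
			atomic_thread_fence(memory_order_acquire);
			/* Having seen 1(b), we must also see 1(a). */
			assert(atomic_load_explicit(&mtx_owner,
			    memory_order_relaxed) == NULL);
		}
	}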
From a0c1609550e209f277a6ee5e9bbd05f696edfd49 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sun, 19 Feb 2023 21:13:32 +0000
Subject: [PATCH 3/8] sparc64: Add missing barrier in cpu_switchto.

Details in comments.

XXX pullup-8
XXX pullup-9
XXX pullup-10
---
 sys/arch/sparc64/sparc64/locore.s | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/sys/arch/sparc64/sparc64/locore.s b/sys/arch/sparc64/sparc64/locore.s
index 17bf12f8b2cd..a8f2ee54c30b 100644
--- a/sys/arch/sparc64/sparc64/locore.s
+++ b/sys/arch/sparc64/sparc64/locore.s
@@ -6731,8 +6731,33 @@ ENTRY(cpu_switchto)
 	 * Load the new lwp.  To load, we must change stacks and
 	 * alter cpcb and the window control registers, hence we must
 	 * keep interrupts disabled.
+	 *
+	 * Issue a store-before-store barrier so that any prior
+	 * mutex_exit is visible to any thread that witnesses this
+	 * store.  See kern_mutex.c for details -- this is necessary
+	 * for adaptive mutexes to detect whether the lwp is on the
+	 * CPU in order to safely block without requiring atomic CAS
+	 * in mutex_exit.
+	 *
+	 * This serves as a membar_producer that matches the
+	 * membar_consumer between owner = mtx->mtx_owner and
+	 * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner)
+	 * in mutex_vector_enter.
+	 *
+	 * Note that we don't need to issue a second membar_producer
+	 * to match the membar_consumer between mutex_oncpu(owner)
+	 * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is
+	 * issued by mutex_exit itself before or as part of the store
+	 * that actually releases the mutex.
+	 *
+	 * A store-before-store barrier is sufficient -- no need for
+	 * load/store-before-store or store-release -- because the
+	 * other side, in mutex_vector_enter, is read-only for this
+	 * protocol, allowing us to use membar_producer/consumer
+	 * semantics and not the stronger membar_release/acquire.
 	 */
+	membar	#StoreStore
 	STPTR	%i1, [%l7 + %lo(CURLWP)]	! curlwp = l;
 	STPTR	%l1, [%l6 + %lo(CPCB)]		! cpcb = newpcb;

From 030ad3530efe97c2171223f9f086170adc4d61e8 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sun, 19 Feb 2023 21:34:18 +0000
Subject: [PATCH 4/8] alpha: Add missing barrier in cpu_switchto.

Details in comments.

XXX pullup-8
XXX pullup-9
XXX pullup-10
---
 sys/arch/alpha/alpha/locore.s | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/sys/arch/alpha/alpha/locore.s b/sys/arch/alpha/alpha/locore.s
index fec4ddc5b37e..bf0f0b6272ac 100644
--- a/sys/arch/alpha/alpha/locore.s
+++ b/sys/arch/alpha/alpha/locore.s
@@ -825,7 +825,33 @@ LEAF(cpu_switchto, 0)
 	ldq	a0, L_MD_PCBPADDR(s2)
 	call_pal PAL_OSF1_swpctx	/* clobbers a0, t0, t8-t11, v0 */
 
-1:	SET_CURLWP(s2)			/* curlwp = l */
+1:	/*
+	 * Issue a store-before-store barrier so that any prior
+	 * mutex_exit is visible to any thread that witnesses this
+	 * store.  See kern_mutex.c for details -- this is necessary
+	 * for adaptive mutexes to detect whether the lwp is on the
+	 * CPU in order to safely block without requiring atomic CAS
+	 * in mutex_exit.
+	 *
+	 * This serves as a membar_producer that matches the
+	 * membar_consumer between owner = mtx->mtx_owner and
+	 * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner)
+	 * in mutex_vector_enter.
+	 *
+	 * Note that we don't need to issue a second membar_producer
+	 * to match the membar_consumer between mutex_oncpu(owner)
+	 * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is
+	 * issued by mutex_exit itself before or as part of the store
+	 * that actually releases the mutex.
+	 *
+	 * A store-before-store barrier is sufficient -- no need for
+	 * load/store-before-store or store-release -- because the
+	 * other side, in mutex_vector_enter, is read-only for this
+	 * protocol, allowing us to use membar_producer/consumer
+	 * semantics and not the stronger membar_release/acquire.
+	 */
+	wmb				/* XXX patch out if !MP? */
+	SET_CURLWP(s2)			/* curlwp = l */
 
 	/*
 	 * Now running on the new PCB.
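[Editorial note: every comment above asserts that no second
membar_producer is needed between the curlwp store and the waiters check,
because mutex_exit already issues a release barrier before, or as part
of, the owner-clearing store.  A hypothetical sketch of that side, again
in C11 terms rather than the real MUTEX_RELEASE or MD lock-stub code:]

	#include <stdatomic.h>

	struct lwp;
	struct kmutex {
		_Atomic(struct lwp *) mtx_owner;
	};

	/* Sketch of the release side of mutex_exit; illustrative only. */
	void
	sketch_mutex_exit(struct kmutex *mtx)
	{
		/*
		 * A release store is a load/store-before-store barrier
		 * plus the store: everything this lwp did while holding
		 * the mutex -- including any waiters/turnstile
		 * manipulation -- is ordered before mtx_owner becomes
		 * observably NULL.  This is the "stronger barrier issued
		 * by mutex_exit itself" that makes a second producer
		 * barrier in cpu_switchto redundant.
		 */
		atomic_store_explicit(&mtx->mtx_owner, NULL,
		    memory_order_release);
	}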
From be2cfdd554b10fbe1f5a41b11046c62597c31522 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sun, 19 Feb 2023 21:36:20 +0000
Subject: [PATCH 5/8] powerpc: Add missing barrier in cpu_switchto.

Details in comments.

XXX pullup-8
XXX pullup-9
XXX pullup-10
---
 sys/arch/powerpc/powerpc/locore_subr.S | 30 ++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/sys/arch/powerpc/powerpc/locore_subr.S b/sys/arch/powerpc/powerpc/locore_subr.S
index c910c10a38e5..264cc0e120a8 100644
--- a/sys/arch/powerpc/powerpc/locore_subr.S
+++ b/sys/arch/powerpc/powerpc/locore_subr.S
@@ -215,6 +215,36 @@ ENTRY(cpu_switchto)
 	 */
 	GET_CPUINFO(%r7)
+
+	/*
+	 * Issue a store-before-store barrier so that any prior
+	 * mutex_exit is visible to any thread that witnesses this
+	 * store.  See kern_mutex.c for details -- this is necessary
+	 * for adaptive mutexes to detect whether the lwp is on the
+	 * CPU in order to safely block without requiring atomic CAS
+	 * in mutex_exit.
+	 *
+	 * This serves as a membar_producer that matches the
+	 * membar_consumer between owner = mtx->mtx_owner and
+	 * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner)
+	 * in mutex_vector_enter.
+	 *
+	 * Note that we don't need to issue a second membar_producer
+	 * to match the membar_consumer between mutex_oncpu(owner)
+	 * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is
+	 * issued by mutex_exit itself before or as part of the store
+	 * that actually releases the mutex.
+	 *
+	 * A store-before-store barrier is sufficient -- no need for
+	 * load/store-before-store or store-release -- because the
+	 * other side, in mutex_vector_enter, is read-only for this
+	 * protocol, allowing us to use membar_producer/consumer
+	 * semantics and not the stronger membar_release/acquire.
+	 */
+#ifdef MULTIPROCESSOR
+	sync	/* XXX use eieio if available -- cheaper */
+#endif
+
 	stptr	%r31,CI_CURLWP(%r7)
 	mr	%r13,%r31
 #ifdef PPC_BOOKE

From be3f272f4483b94ac9ddb7eee309b71714402322 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sun, 19 Feb 2023 21:40:32 +0000
Subject: [PATCH 6/8] arm32: Fix barrier in cpu_switchto.

Details in comments.  Change is as for aarch64.

XXX pullup-8
XXX pullup-9
XXX pullup-10
---
 sys/arch/arm/arm32/cpuswitch.S | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/sys/arch/arm/arm32/cpuswitch.S b/sys/arch/arm/arm32/cpuswitch.S
index 1ffda42dbd86..e15c3a6ef550 100644
--- a/sys/arch/arm/arm32/cpuswitch.S
+++ b/sys/arch/arm/arm32/cpuswitch.S
@@ -189,13 +189,38 @@ ENTRY(cpu_switchto)
 	mcr	p15, 0, r6, c13, c0, 4	/* set current lwp */
 #endif
 
-	/* We have a new curlwp now so make a note of it */
-	str	r6, [r5, #(CI_CURLWP)]
-
+	/*
+	 * Issue a store-before-store barrier so that any prior
+	 * mutex_exit is visible to any thread that witnesses this
+	 * store.  See kern_mutex.c for details -- this is necessary
+	 * for adaptive mutexes to detect whether the lwp is on the
+	 * CPU in order to safely block without requiring atomic CAS
+	 * in mutex_exit.
+	 *
+	 * This serves as a membar_producer that matches the
+	 * membar_consumer between owner = mtx->mtx_owner and
+	 * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner)
+	 * in mutex_vector_enter.
+	 *
+	 * Note that we don't need to issue a second membar_producer
+	 * to match the membar_consumer between mutex_oncpu(owner)
+	 * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is
+	 * issued by mutex_exit itself before or as part of the store
+	 * that actually releases the mutex.
+	 *
+	 * A store-before-store barrier is sufficient -- no need for
+	 * load/store-before-store or store-release -- because the
+	 * other side, in mutex_vector_enter, is read-only for this
+	 * protocol, allowing us to use membar_producer/consumer
+	 * semantics and not the stronger membar_release/acquire.
+	 */
 #ifdef _ARM_ARCH_7
-	dmb				/* see comments in kern_mutex.c */
+	dmb
 #endif
 
+	/* We have a new curlwp now so make a note of it */
+	str	r6, [r5, #(CI_CURLWP)]
+
 	/* Get the new pcb */
 	ldr	r7, [r6, #(L_PCB)]

From e41d5c4a9213d4fc961216deb4791a3ee339a369 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sun, 19 Feb 2023 21:37:53 +0000
Subject: [PATCH 7/8] riscv: Add missing barrier in cpu_switchto.

Details in comments.
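[Editorial note: taken together, the patches so far map the same logical
store-before-store barrier onto each ISA.  The sketch below collects
those mnemonics in one place.  It is illustrative only -- the
preprocessor tests are common compiler-defined macros, not NetBSD's
actual configuration, and the mips/powerpc cases conservatively use a
full sync just as the patches do.]

	/*
	 * Illustrative per-ISA spellings of membar_producer; not
	 * NetBSD's actual implementation.
	 */
	static inline void
	sketch_membar_producer(void)
	{
	#if defined(__mips__)
		__asm__ __volatile__("sync" ::: "memory");	/* SYNC_PRODUCER */
	#elif defined(__aarch64__)
		__asm__ __volatile__("dmb ishst" ::: "memory");
	#elif defined(__arm__)
		__asm__ __volatile__("dmb" ::: "memory");	/* _ARM_ARCH_7 */
	#elif defined(__sparc__)
		__asm__ __volatile__("membar #StoreStore" ::: "memory"); /* v9 */
	#elif defined(__alpha__)
		__asm__ __volatile__("wmb" ::: "memory");
	#elif defined(__powerpc__)
		__asm__ __volatile__("sync" ::: "memory");	/* XXX eieio? */
	#elif defined(__riscv)
		__asm__ __volatile__("fence w,w" ::: "memory");
	#else
		__atomic_thread_fence(__ATOMIC_RELEASE);	/* fallback */
	#endif
	}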
---
 sys/arch/riscv/riscv/cpu_switch.S | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/sys/arch/riscv/riscv/cpu_switch.S b/sys/arch/riscv/riscv/cpu_switch.S
index acb023b3b9d9..3a49c2274742 100644
--- a/sys/arch/riscv/riscv/cpu_switch.S
+++ b/sys/arch/riscv/riscv/cpu_switch.S
@@ -62,6 +62,34 @@ ENTRY_NP(cpu_switchto)
 
 	mv	tp, a1			// # put the new lwp in thread pointer
 	PTR_L	t1, L_CPU(tp)		// # get curcpu
+
+	/*
+	 * Issue a store-before-store barrier so that any prior
+	 * mutex_exit is visible to any thread that witnesses this
+	 * store.  See kern_mutex.c for details -- this is necessary
+	 * for adaptive mutexes to detect whether the lwp is on the
+	 * CPU in order to safely block without requiring atomic CAS
+	 * in mutex_exit.
+	 *
+	 * This serves as a membar_producer that matches the
+	 * membar_consumer between owner = mtx->mtx_owner and
+	 * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner)
+	 * in mutex_vector_enter.
+	 *
+	 * Note that we don't need to issue a second membar_producer
+	 * to match the membar_consumer between mutex_oncpu(owner)
+	 * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is
+	 * issued by mutex_exit itself before or as part of the store
+	 * that actually releases the mutex.
+	 *
+	 * A store-before-store barrier is sufficient -- no need for
+	 * load/store-before-store or store-release -- because the
+	 * other side, in mutex_vector_enter, is read-only for this
+	 * protocol, allowing us to use membar_producer/consumer
+	 * semantics and not the stronger membar_release/acquire.
+	 */
+	fence	w,w
+
 	PTR_S	tp, CI_CURLWP(t1)	// # update curcpu with the new curlwp
 
 	REG_L	sp, L_MD_KTF(tp)	// # load its kernel stack pointer

From ca796d10952126055caeccc3fbdc60afca31476e Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Mon, 20 Feb 2023 00:14:39 +0000
Subject: [PATCH 8/8] sparc: Note where STBAR would go in cpu_switchto for
 non-TSO.

No functional change.
---
 sys/arch/sparc/sparc/locore.s | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sys/arch/sparc/sparc/locore.s b/sys/arch/sparc/sparc/locore.s
index 5d7edbec3461..825ea3488c25 100644
--- a/sys/arch/sparc/sparc/locore.s
+++ b/sys/arch/sparc/sparc/locore.s
@@ -4889,6 +4889,10 @@ Lwb1:	SAVE; SAVE; SAVE; SAVE; SAVE; SAVE;	/* 6 of each: */
 	/* set new cpcb, and curlwp */
 	sethi	%hi(curlwp), %l7
 	st	%g5, [%l6 + %lo(cpcb)]		! cpcb = newpcb;
+	/*
+	 * No membar needed because we run in TSO.  But if we didn't,
+	 * we would need STBAR here; see kern_mutex.c for details.
+	 */
 	st	%g3, [%l7 + %lo(curlwp)]	! curlwp = l;
 
 	/* compute new wim */
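[Editorial note: the final patch relies on the SPARC v8 port running in
TSO, where a store is never reordered with an earlier store and
membar_producer therefore needs no instruction at all.  A hypothetical
sketch of what the note describes -- SPARC_PSO here is an invented
marker for illustration, not a real kernel option:]

	/* Hypothetical sketch of the sparc note; SPARC_PSO is invented. */
	static inline void
	sketch_membar_producer_sparc(void)
	{
	#ifdef SPARC_PSO
		/*
		 * Under PSO, stores may be reordered: an STBAR is
		 * required between the mutex-release store and the
		 * curlwp store.
		 */
		__asm__ __volatile__("stbar" ::: "memory");
	#else
		/*
		 * Under TSO, store-store ordering is architectural;
		 * only the compiler must be kept from reordering.
		 */
		__asm__ __volatile__("" ::: "memory");
	#endif
	}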