- Use store-release/load-acquire to guarantee a full happens-before relation
  in TLB shootdown synchronization.  Under the hood, this changes wmb to mb,
  but only on the very last CPU to process the shootdown -- and it's not a
  priori clear that wmb is sufficient here; stack activity on another CPU
  could overwrite the tlb_context contents out from under this CPU's prior
  loads of them.  (A minimal sketch of the release/acquire pairing follows
  the patch.)

- Omit needless membar_enter in pmap_kenter_pa -- it is the caller's
  responsibility to ensure pmap_kenter_pa happens before use of the newly
  entered VA on another CPU.

diff -r e4f263f866a7 sys/arch/alpha/alpha/pmap.c
--- a/sys/arch/alpha/alpha/pmap.c	Thu Sep 03 09:41:21 2020 +0000
+++ b/sys/arch/alpha/alpha/pmap.c	Thu Sep 03 18:01:33 2020 +0000
@@ -979,7 +979,7 @@ pmap_tlb_shootnow(const struct pmap_tlb_
 		int backoff = SPINLOCK_BACKOFF_MIN;
 		u_int spins = 0;
 
-		while (atomic_load_relaxed(&tlb_context) != NULL) {
+		while (atomic_load_acquire(&tlb_context) != NULL) {
 			SPINLOCK_BACKOFF(backoff);
 			if (spins++ > 0x0fffffff) {
 				printf("TLB LOCAL MASK = 0x%016lx\n",
@@ -994,7 +994,6 @@ pmap_tlb_shootnow(const struct pmap_tlb_
 				panic("pmap_tlb_shootnow");
 			}
 		}
-		membar_consumer();
 	}
 	KASSERT(tlb_context == NULL);
 #endif /* MULTIPROCESSOR */
@@ -1025,8 +1024,7 @@ pmap_tlb_shootdown_ipi(struct cpu_info *
 	KASSERT(tlb_context != NULL);
 	pmap_tlb_invalidate(tlb_context, ci);
 	if (atomic_and_ulong_nv(&tlb_pending, ~(1UL << ci->ci_cpuid)) == 0) {
-		membar_producer();
-		atomic_store_relaxed(&tlb_context, NULL);
+		atomic_store_release(&tlb_context, NULL);
 	}
 }
 #endif /* MULTIPROCESSOR */
@@ -2275,7 +2273,6 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, v
 	/* Set the new PTE. */
 	const pt_entry_t opte = atomic_load_relaxed(pte);
 	atomic_store_relaxed(pte, npte);
-	PMAP_MP(membar_enter());
 
 	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
 	PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
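
Below is a minimal stand-alone sketch of the release/acquire hand-off the
first change depends on.  It is not the NetBSD code: it uses C11
<stdatomic.h> and pthreads instead of the kernel's atomic_load_acquire/
atomic_store_release primitives and IPIs, and the names (shoot_ctx, ctx,
pending, responder, NRESPONDERS) are invented for illustration.  What it
demonstrates is the happens-before chain the message above relies on: the
last responder to finish clears the shared pointer with a store-release, so
the initiator's load-acquire of NULL orders all of the responders' accesses
to the pointed-to storage before the initiator reuses it.

/*
 * release/acquire hand-off sketch (hypothetical names, C11 atomics).
 * Compile with: cc -pthread handoff.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NRESPONDERS 4

struct shoot_ctx {
	unsigned long va;		/* stand-in for the shootdown payload */
};

static struct shoot_ctx ctx_storage;	/* plays the role of the stack storage */
static _Atomic(struct shoot_ctx *) ctx;	/* analogous to tlb_context */
static atomic_ulong pending;		/* analogous to tlb_pending */

static void *
responder(void *arg)
{
	const unsigned long bit = 1UL << (uintptr_t)arg;
	struct shoot_ctx *c;

	/* Wait for the initiator to publish the context. */
	while ((c = atomic_load_explicit(&ctx, memory_order_acquire)) == NULL)
		continue;

	/* ... consume c->va, e.g. invalidate a TLB entry ... */
	(void)c->va;

	/*
	 * Clear our pending bit; the (seq_cst) RMW keeps every responder's
	 * reads of *c in the happens-before chain.  The last responder to
	 * finish clears the pointer with a store-release.
	 */
	if ((atomic_fetch_and(&pending, ~bit) & ~bit) == 0)
		atomic_store_explicit(&ctx, NULL, memory_order_release);

	return NULL;
}

int
main(void)
{
	pthread_t t[NRESPONDERS];

	ctx_storage.va = 0xdeadbeef;
	atomic_init(&pending, (1UL << NRESPONDERS) - 1);

	for (uintptr_t i = 0; i < NRESPONDERS; i++)
		pthread_create(&t[i], NULL, responder, (void *)i);

	/* Publish the context; pairs with the responders' load-acquire. */
	atomic_store_explicit(&ctx, &ctx_storage, memory_order_release);

	/* Spin until the last responder clears it; pairs with its store-release. */
	while (atomic_load_explicit(&ctx, memory_order_acquire) != NULL)
		continue;

	/*
	 * Everything the responders did with ctx_storage happens-before this
	 * point, so reusing (or, in the pmap case, popping off the stack and
	 * overwriting) the storage is safe.
	 */
	ctx_storage.va = 0;

	for (int i = 0; i < NRESPONDERS; i++)
		pthread_join(t[i], NULL);

	printf("shootdown round complete\n");
	return 0;
}

If the final clear were only a relaxed store (roughly the analogue of the old
wmb-backed membar_producer on the last CPU), nothing would order the
responders' reads of the context against the initiator's subsequent reuse of
that memory -- which is the hazard the first bullet describes.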