- Use store-release/load-acquire to guarantee a full happens-before relation
  in TLB shootdown synchronization.  Under the hood, this changes wmb to mb,
  but only on the very last CPU to process the shootdown -- and it's not a
  priori clear that wmb is sufficient here; stack activity on another CPU
  could overwrite the tlb_context contents out from under this CPU's prior
  loads of them.  (A minimal sketch of the release/acquire pairing follows
  the patch.)

- Omit needless membar_enter in pmap_kenter_pa -- it is the caller's
  responsibility to ensure pmap_kenter_pa happens before use of the newly
  entered VA on another CPU.

diff -r e4f263f866a7 sys/arch/alpha/alpha/pmap.c
--- a/sys/arch/alpha/alpha/pmap.c	Thu Sep 03 09:41:21 2020 +0000
+++ b/sys/arch/alpha/alpha/pmap.c	Thu Sep 03 18:01:33 2020 +0000
@@ -979,7 +979,7 @@ pmap_tlb_shootnow(const struct pmap_tlb_
 		int backoff = SPINLOCK_BACKOFF_MIN;
 		u_int spins = 0;
 
-		while (atomic_load_relaxed(&tlb_context) != NULL) {
+		while (atomic_load_acquire(&tlb_context) != NULL) {
 			SPINLOCK_BACKOFF(backoff);
 			if (spins++ > 0x0fffffff) {
 				printf("TLB LOCAL MASK = 0x%016lx\n",
@@ -994,7 +994,6 @@ pmap_tlb_shootnow(const struct pmap_tlb_
 				panic("pmap_tlb_shootnow");
 			}
 		}
-		membar_consumer();
 	}
 	KASSERT(tlb_context == NULL);
 #endif /* MULTIPROCESSOR */
@@ -1025,8 +1024,7 @@ pmap_tlb_shootdown_ipi(struct cpu_info *
 	KASSERT(tlb_context != NULL);
 	pmap_tlb_invalidate(tlb_context, ci);
 	if (atomic_and_ulong_nv(&tlb_pending, ~(1UL << ci->ci_cpuid)) == 0) {
-		membar_producer();
-		atomic_store_relaxed(&tlb_context, NULL);
+		atomic_store_release(&tlb_context, NULL);
 	}
 }
 #endif /* MULTIPROCESSOR */
@@ -2275,7 +2273,6 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, v
 	/* Set the new PTE. */
 	const pt_entry_t opte = atomic_load_relaxed(pte);
 	atomic_store_relaxed(pte, npte);
-	PMAP_MP(membar_enter());
 
 	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
 	PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
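
Below is a minimal stand-alone sketch of the release/acquire hand-off the
first change depends on.  It is not the NetBSD code: it uses C11
<stdatomic.h> and pthreads instead of the kernel's atomic_load_acquire/
atomic_store_release primitives and IPIs, and the names (shoot_ctx, ctx,
pending, responder, NRESPONDERS) are invented for illustration.  What it
demonstrates is the happens-before chain the message above relies on: the
last responder to finish clears the shared pointer with a store-release, so
the initiator's load-acquire of NULL orders all of the responders' accesses
to the pointed-to storage before the initiator reuses it.

/*
 * release/acquire hand-off sketch (hypothetical names, C11 atomics).
 * Compile with: cc -pthread handoff.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NRESPONDERS 4

struct shoot_ctx {
	unsigned long va;		/* stand-in for the shootdown payload */
};

static struct shoot_ctx ctx_storage;	/* plays the role of the stack storage */
static _Atomic(struct shoot_ctx *) ctx;	/* analogous to tlb_context */
static atomic_ulong pending;		/* analogous to tlb_pending */

static void *
responder(void *arg)
{
	const unsigned long bit = 1UL << (uintptr_t)arg;
	struct shoot_ctx *c;

	/* Wait for the initiator to publish the context. */
	while ((c = atomic_load_explicit(&ctx, memory_order_acquire)) == NULL)
		continue;

	/* ... consume c->va, e.g. invalidate a TLB entry ... */
	(void)c->va;

	/*
	 * Clear our pending bit; the (seq_cst) RMW keeps every responder's
	 * reads of *c in the happens-before chain.  The last responder to
	 * finish clears the pointer with a store-release.
	 */
	if ((atomic_fetch_and(&pending, ~bit) & ~bit) == 0)
		atomic_store_explicit(&ctx, NULL, memory_order_release);

	return NULL;
}

int
main(void)
{
	pthread_t t[NRESPONDERS];

	ctx_storage.va = 0xdeadbeef;
	atomic_init(&pending, (1UL << NRESPONDERS) - 1);

	for (uintptr_t i = 0; i < NRESPONDERS; i++)
		pthread_create(&t[i], NULL, responder, (void *)i);

	/* Publish the context; pairs with the responders' load-acquire. */
	atomic_store_explicit(&ctx, &ctx_storage, memory_order_release);

	/* Spin until the last responder clears it; pairs with its store-release. */
	while (atomic_load_explicit(&ctx, memory_order_acquire) != NULL)
		continue;

	/*
	 * Everything the responders did with ctx_storage happens-before this
	 * point, so reusing (or, in the pmap case, popping off the stack and
	 * overwriting) the storage is safe.
	 */
	ctx_storage.va = 0;

	for (int i = 0; i < NRESPONDERS; i++)
		pthread_join(t[i], NULL);

	printf("shootdown round complete\n");
	return 0;
}

If the final clear were only a relaxed store (roughly the analogue of the old
wmb-backed membar_producer on the last CPU), nothing would order the
responders' reads of the context against the initiator's subsequent reuse of
that memory -- which is the hazard the first bullet describes.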