diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index 700bbd7b9b..e3fd4dd0fd 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -17,6 +17,12 @@ TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
 	MOVQ	AX, DI	// DI = ptr
 	XORQ	AX, AX
 
+#ifdef hack
+	CLD			// DF=0: STOSB advances DI upward.
+	MOVQ	BX, CX		// CX = n, the byte count for REP.
+	REP
+	STOSB			// Store AL (zero) at [DI], CX times.
+#else
 	// MOVOU seems always faster than REP STOSQ.
 tail:
 	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
@@ -176,4 +182,5 @@ _129through256:
 	MOVOU	X15, -48(DI)(BX*1)
 	MOVOU	X15, -32(DI)(BX*1)
 	MOVOU	X15, -16(DI)(BX*1)
+#endif
 	RET
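
A minimal sketch of how this path could be measured from Go, assuming a
runtime built with the hack enabled (e.g. by passing -D=hack to the
assembler); the package name, helper, and sizes below are illustrative,
not part of the patch. The compiler lowers the zeroing loop in zero() to
a call to runtime.memclrNoHeapPointers, so the benchmark exercises
whichever variant the runtime was built with:

	package memhack_test

	import (
		"fmt"
		"testing"
	)

	// zero clears buf. The compiler recognizes this loop pattern and
	// replaces it with a call to runtime.memclrNoHeapPointers.
	func zero(buf []byte) {
		for i := range buf {
			buf[i] = 0
		}
	}

	func BenchmarkMemclr(b *testing.B) {
		for _, size := range []int{16, 256, 4096, 1 << 20} {
			buf := make([]byte, size)
			b.Run(fmt.Sprint(size), func(b *testing.B) {
				b.SetBytes(int64(size))
				for i := 0; i < b.N; i++ {
					zero(buf)
				}
			})
		}
	}
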
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index eeb5033fd9..05dd64412a 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -38,6 +38,23 @@ TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
 	// CX = n
 	MOVQ	AX, DI
 	MOVQ	BX, SI
+#ifdef hack
+	CMPQ	SI, DI	// Copy right-to-left if src < dst, else left-to-right.
+	JB	backward	// Unsigned compare, since SI and DI hold pointers.
+	CLD			// Copy left-to-right.
+	JMP	copy
+backward:
+	STD			// Copy right-to-left.
+	ADDQ	CX, DI		// Point DI at the last byte of dst.
+	ADDQ	$-1, DI
+	ADDQ	CX, SI		// Point SI at the last byte of src.
+	ADDQ	$-1, SI
+copy:
+	REP
+	MOVSB		// Copy CX bytes from [SI] to [DI].
+
+	CLD		// Either way, leave DF clear: Go code assumes the forward direction.
+#else
 	MOVQ	CX, BX
 
 	// REP instructions have a high startup cost, so we handle small sizes
@@ -529,4 +546,5 @@ gobble_big_mem_bwd_loop:
 	MOVOU	X10, 0x50(AX)
 	MOVOU	X11, 0x60(AX)
 	MOVOU	X12, 0x70(AX)
+#endif
 	RET
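
And a corresponding sketch for the memmove side, again with illustrative
names: copy on byte slices compiles to a call to runtime.memmove, and an
overlapping pair of slices with dst above src forces the backward (STD)
path of the patched code. The runtime's own Memmove/Memclr benchmarks
(go test -bench='Memmove|Memclr' runtime) cover the non-overlapping cases.

	package memhack_test

	import "testing"

	// With dst starting 64 bytes past src the regions overlap and
	// src < dst, so a correct memmove must copy right-to-left; in the
	// patched code that is the STD + REP MOVSB path.
	func BenchmarkMemmoveOverlap(b *testing.B) {
		const n = 1 << 20
		buf := make([]byte, n+64)
		b.SetBytes(n)
		for i := 0; i < b.N; i++ {
			copy(buf[64:], buf[:n])
		}
	}
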