diff -r b9bd12fb9564 sys/arch/aarch64/aarch64/cpu.c
--- a/sys/arch/aarch64/aarch64/cpu.c	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/arch/aarch64/aarch64/cpu.c	Wed Aug 05 19:42:16 2020 +0000
@@ -621,6 +621,7 @@ cpu_setup_aes(device_t dv, struct cpu_in
 	switch (__SHIFTOUT(id->ac_aa64isar0, ID_AA64ISAR0_EL1_AES)) {
 	case ID_AA64ISAR0_EL1_AES_AES:
 	case ID_AA64ISAR0_EL1_AES_PMUL:
+		break;
 		aes_md_init(&aes_armv8_impl);
 		return;
 	default:
diff -r b9bd12fb9564 sys/crypto/aes/arch/arm/aes_armv8_64.S
--- a/sys/crypto/aes/arch/arm/aes_armv8_64.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_armv8_64.S	Wed Aug 05 19:42:16 2020 +0000
@@ -26,8 +26,6 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */

-#include <sys/endian.h>
-
 #include <aarch64/asm.h>

 RCSID("$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $")
@@ -921,19 +919,13 @@ ENTRY(aesarmv8_ccm_enc1)
 	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b	/* q2 := ctr (host-endian) */
-#endif
 	_ALIGN_TEXT
 1:	ldr	q3, [x1], #0x10		/* q3 := plaintext block */
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v1.16b, v2.16b	/* q1 := ctr (big-endian) */
-#else
-	mov	v1.16b, v2.16b	/* q1 := ctr (big-endian) */
-#endif
 	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
 	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
 					 * trash x0/x3/q16 */
@@ -941,9 +933,7 @@ 1:	ldr	q3, [x1], #0x10		/* q3 := plainte
 	subs	x10, x10, #0x10		/* count down bytes */
 	str	q3, [x2], #0x10		/* store ciphertext block */
 	b.ne	1b			/* repeat if more blocks */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b	/* q2 := ctr (big-endian) */
-#endif
 	stp	q0, q2, [x4]		/* store updated auth/ctr */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
@@ -968,18 +958,12 @@ ENTRY(aesarmv8_ccm_dec1)
 	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b	/* q2 := ctr (host-endian) */
-#endif

 	/* Decrypt the first block. */
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x3, x5			/* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
-#else
-	mov	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
-#endif
 	ldr	q3, [x1], #0x10		/* q3 := ctxt */
 	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
 	b	2f
@@ -995,11 +979,7 @@ 1:	/*
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
-#else
-	mov	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
-#endif
 	ldr	q3, [x1], #0x10		/* q3 := ctxt */
 	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
 					 * trash x0/x3/q16 */
@@ -1009,15 +989,14 @@ 2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := p
 	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
 	b.ne	1b
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b	/* q2 := ctr (big-endian) */
-#endif

 	/* Authenticate the last block. */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
 	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */

+	stp	q0, q2, [x4]		/* store updated auth/ctr */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
diff -r b9bd12fb9564 sys/crypto/aes/arch/arm/aes_neon_32.S
--- a/sys/crypto/aes/arch/arm/aes_neon_32.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_32.S	Wed Aug 05 19:42:16 2020 +0000
@@ -270,7 +270,7 @@ ENTRY(aes_neon_enc1)
 	ldr	r12, .Lconstants_addr
 	adr	r11, .Lconstants_addr

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 	movw	r3, #0
 	vmov.i8	q1, #0x0f

@@ -280,8 +280,8 @@ ENTRY(aes_neon_enc1)
 	/* (q4, q5) := (iptlo, ipthi) */
 	add	r6, r12, #(iptlo - .Lconstants)
 	add	r7, r12, #(ipthi - .Lconstants)
-	vld1.64	{d8-d9}, [r6 :128]
-	vld1.64	{d10-d11}, [r7 :128]
+	vld1.8	{d8-d9}, [r6 :128]
+	vld1.8	{d10-d11}, [r7 :128]

 	/* load the rest of the constants */
 	add	r4, r12, #(sb1_0 - .Lconstants)
@@ -290,12 +290,12 @@ ENTRY(aes_neon_enc1)
 	add	r7, r12, #(sb2_1 - .Lconstants)
 	add	r8, r12, #(inv - .Lconstants)
 	add	r10, r12, #(inva - .Lconstants)
-	vld1.64	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
-	vld1.64	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
-	vld1.64	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
-	vld1.64	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
-	vld1.64	{d20-d21}, [r8 :128]	/* q10 = inv */
-	vld1.64	{d22-d23}, [r10 :128]	/* q11 = inva */
+	vld1.8	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
+	vld1.8	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
+	vld1.8	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
+	vld1.8	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
+	vld1.8	{d20-d21}, [r8 :128]	/* q10 = inv */
+	vld1.8	{d22-d23}, [r10 :128]	/* q11 = inva */

 	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
 	add	r4, r12, #(mc_forward - .Lconstants)
@@ -319,7 +319,7 @@ ENTRY(aes_neon_enc1)
 	b	2f

 	_ALIGN_TEXT
-1:	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+1:	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

 	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
 	vtbl.8	d24, {d12-d13}, d4
@@ -339,8 +339,8 @@ 1:	vld1.64	{d28-d29}, [r0 :128]!	/* q14
 	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
 	add	r6, r4, r3, lsl #4
 	add	r7, r5, r3, lsl #4
-	vld1.64	{d24-d25}, [r6]
-	vld1.64	{d26-d27}, [r7]
+	vld1.8	{d24-d25}, [r6]
+	vld1.8	{d26-d27}, [r7]

 	/* q15 := A2_B = A2 + A(mcf) */
 	vtbl.8	d30, {d0-d1}, d24
@@ -412,11 +412,11 @@ 2:	/*
 	add	r6, r12, #(sbo_0 - .Lconstants)
 	add	r7, r12, #(sbo_1 - .Lconstants)
 	add	r8, r8, r3, lsl #4
-	vld1.64	{d12-d13}, [r6 :128]
-	vld1.64	{d14-d15}, [r7 :128]
-	vld1.64	{d30-d31}, [r8 :128]
+	vld1.8	{d12-d13}, [r6 :128]
+	vld1.8	{d14-d15}, [r7 :128]
+	vld1.8	{d30-d31}, [r8 :128]

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

 	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
 	vtbl.8	d4, {d12-d13}, d4
@@ -489,7 +489,7 @@ ENTRY(aes_neon_dec1)
 	ldr	r12, .Lconstants_addr
 	adr	r11, .Lconstants_addr

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 	rsb	r3, r1, #0	/* r3 := ~(x - 1) = -x */
 	vmov.i8	q1, #0x0f
 	and	r3, r3, #3	/* r3 := 3 & ~(x - 1) */
@@ -500,8 +500,8 @@ ENTRY(aes_neon_dec1)
 	/* (q4, q5) := (diptlo, dipthi) */
 	add	r6, r12, #(diptlo - .Lconstants)
 	add	r7, r12, #(dipthi - .Lconstants)
-	vld1.64	{d8-d9}, [r6 :128]
-	vld1.64	{d10-d11}, [r7 :128]
+	vld1.8	{d8-d9}, [r6 :128]
+	vld1.8	{d10-d11}, [r7 :128]

 	/* load the rest of the constants */
 	add	r4, r12, #(dsbb_0 - .Lconstants)
@@ -509,11 +509,11 @@ ENTRY(aes_neon_dec1)
 	add	r6, r12, #(inv - .Lconstants)
 	add	r7, r12, #(inva - .Lconstants)
 	add	r8, r12, #(.Lmc_forward_3 - .Lconstants)
-	vld1.64	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
-	vld1.64	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
-	vld1.64	{d20-d21}, [r6 :128]	/* q10 := inv */
-	vld1.64	{d22-d23}, [r7 :128]	/* q11 := inva */
-	vld1.64	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */
+	vld1.8	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
+	vld1.8	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
+	vld1.8	{d20-d21}, [r6 :128]	/* q10 := inv */
+	vld1.8	{d22-d23}, [r7 :128]	/* q11 := inva */
+	vld1.8	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */

 	/* (q2, q3) := (lo, hi) */
 	vshr.u8	q3, q0, #4
@@ -529,8 +529,8 @@ ENTRY(aes_neon_dec1)
 	/* load dsb9 */
 	add	r4, r12, #(dsb9_0 - .Lconstants)
 	add	r5, r12, #(dsb9_1 - .Lconstants)
-	vld1.64	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
-	vld1.64	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */
+	vld1.8	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
+	vld1.8	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */

 	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
 	veor	q0, q14, q2
@@ -541,10 +541,10 @@ ENTRY(aes_neon_dec1)
 	_ALIGN_TEXT
 1:	/* load dsbd */
 	add	r4, r12, #(dsbd_0 - .Lconstants)
-	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
-	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */
+	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
+	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

 	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
 	vtbl.8	d24, {d8-d9}, d4
@@ -568,8 +568,8 @@ 1:	/* load dsbd */

 	/* load dsbe */
 	add	r4, r12, #(dsbe_0 - .Lconstants)
-	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
-	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */
+	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
+	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */

 	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
 	vtbl.8	d28, {d0-d1}, d30
@@ -647,11 +647,11 @@ 2:	/*
 	add	r6, r12, #(dsbo_0 - .Lconstants)
 	add	r7, r12, #(dsbo_1 - .Lconstants)
 	add	r8, r8, r3, lsl #4
-	vld1.64	{d12-d13}, [r6 :128]
-	vld1.64	{d14-d15}, [r7 :128]
-	vld1.64	{d30-d31}, [r8 :128]
+	vld1.8	{d12-d13}, [r6 :128]
+	vld1.8	{d14-d15}, [r7 :128]
+	vld1.8	{d30-d31}, [r8 :128]

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

 	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
 	vtbl.8	d4, {d12-d13}, d4
diff -r b9bd12fb9564 sys/crypto/aes/arch/arm/aes_neon_subr.c
--- a/sys/crypto/aes/arch/arm/aes_neon_subr.c	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_subr.c	Wed Aug 05 19:42:16 2020 +0000
@@ -161,32 +161,35 @@ static int
 aes_neon_xts_update_selftest(void)
 {
 	static const struct {
-		uint32_t in[4], out[4];
+		uint8_t in[16], out[16];
 	} cases[] = {
 		[0] = { {1}, {2} },
-		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
-		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
-		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
-		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
-		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
+		[1] = { {0,0,0,0x80, 0,0,0,0, 0,0,0,0, 0,0,0,0},
+			{0,0,0,0, 1,0,0,0, 0,0,0,0, 0,0,0,0} },
+		[2] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0},
+			{0,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} },
+		[3] = { {0,0,0,0, 0,0,0,0, 0,0,0,0x80, 0,0,0,0},
+			{0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0} },
+		[4] = { {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80},
+			{0x87,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0} },
+		[5] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0x80},
+			{0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} },
 	};
 	unsigned i;
-	uint32_t t[4];
+	uint8_t t[16];
 	int result = 0;

 	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
-		t[0] = cases[i].in[0];
-		t[1] = cases[i].in[1];
-		t[2] = cases[i].in[2];
-		t[3] = cases[i].in[3];
-		storeblock(t, aes_neon_xts_update(loadblock(t)));
-		if (t[0] != cases[i].out[0] ||
-		    t[1] != cases[i].out[1] ||
-		    t[2] != cases[i].out[2] ||
-		    t[3] != cases[i].out[3]) {
-			printf("%s %u:"
-			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
-			    __func__, i, t[0], t[1], t[2], t[3]);
+		storeblock(t, aes_neon_xts_update(loadblock(cases[i].in)));
+		if (memcmp(t, cases[i].out, 16)) {
+			char buf[33];
+			unsigned j;
+
+			for (j = 0; j < 16; j++) {
+				snprintf(buf + 2*j, sizeof(buf) - 2*j,
+				    " %02hhx", t[j]);
+			}
+			printf("%s %u: %s\n", __func__, i, buf);
 			result = -1;
 		}
 	}
@@ -289,16 +292,6 @@ aes_neon_cbcmac_update1(const struct aes
  * function, which should substantially improve CCM throughput.
  */

-#if _BYTE_ORDER == _LITTLE_ENDIAN
-#define	vbetoh32q_u8	vrev32q_u8
-#define	vhtobe32q_u8	vrev32q_u8
-#elif _BYTE_ORDER == _BIG_ENDIAN
-#define	vbetoh32q_u8(x)	(x)
-#define	vhtobe32q_u8(x)	(x)
-#else
-#error what kind of endian are you anyway
-#endif
-
 void
 aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
@@ -313,12 +306,12 @@ aes_neon_ccm_enc1(const struct aesenc *e
 	auth = loadblock(authctr);
 	ctr_be = loadblock(authctr + 16);
-	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	ctr = vreinterpretq_u32_u8(vrev32q_u8(ctr_be));
 	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 		uint8x16x2_t b2;

 		ptxt = loadblock(in);
 		ctr = vaddq_u32(ctr, ctr32_inc);
-		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr));
 		b2.val[0] = auth ^ ptxt;
 		b2.val[1] = ctr_be;
@@ -343,9 +336,9 @@ aes_neon_ccm_dec1(const struct aesenc *e
 	KASSERT(nbytes % 16 == 0);

 	ctr_be = loadblock(authctr + 16);
-	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	ctr = vreinterpretq_u32_u8(vrev32q_u8(ctr_be));
 	ctr = vaddq_u32(ctr, ctr32_inc);
-	ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+	ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr));
 	pad = aes_neon_enc1(enc, ctr_be, nrounds);
 	auth = loadblock(authctr);
 	for (;; in += 16, out += 16) {
@@ -359,7 +352,7 @@ aes_neon_ccm_dec1(const struct aesenc *e
 			break;

 		ctr = vaddq_u32(ctr, ctr32_inc);
-		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr));
 		b2.val[0] = auth;
 		b2.val[1] = ctr_be;
 		b2 = aes_neon_enc2(enc, b2, nrounds);
diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon.c
--- a/sys/crypto/chacha/arch/arm/chacha_neon.c	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon.c	Wed Aug 05 19:42:16 2020 +0000
@@ -46,26 +46,6 @@ vrolq_n_u32(uint32x4_t x, uint8_t n)
 	return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
 #endif
 }
-
-static inline uint32x4_t
-vhtole_u32(uint32x4_t x)
-{
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-	return x;
-#elif _BYTE_ORDER == _BIG_ENDIAN
-	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
-#endif
-}
-
-static inline uint32x4_t
-vletoh_u32(uint32x4_t x)
-{
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-	return x;
-#elif _BYTE_ORDER == _BIG_ENDIAN
-	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
-#endif
-}

 static inline uint32x4_t
 rol16(uint32x4_t x)
@@ -180,17 +160,17 @@ chacha_core_neon(uint8_t out[restrict st
 	uint32x4_t in0, in1, in2, in3;
 	uint32x4_t r0, r1, r2, r3;

-	r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
-	r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
-	r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
+	r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c));
+	r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
+	r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in));

 	chacha_permute(&r0, &r1, &r2, &r3, nr);

-	vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
-	vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
-	vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
-	vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
+	vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0)));
+	vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1)));
+	vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2)));
+	vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3)));
 }

 void
@@ -202,15 +182,15 @@ hchacha_neon(uint8_t out[restrict static
 {
 	uint32x4_t r0, r1, r2, r3;

-	r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
-	r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
-	r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
+	r0 = vreinterpretq_u32_u8(vld1q_u8(c));
+	r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
+	r3 = vreinterpretq_u32_u8(vld1q_u8(in));

 	chacha_permute(&r0, &r1, &r2, &r3, nr);

-	vst1q_u32((uint32_t *)out + 0, r0);
-	vst1q_u32((uint32_t *)out + 4, r3);
+	vst1q_u8(out + 0, vreinterpretq_u8_u32(r0));
+	vst1q_u8(out + 16, vreinterpretq_u8_u32(r3));
 }

 void
@@ -229,9 +209,9 @@ chacha_stream_neon(uint8_t *restrict s,
 	uint32x4_t in0, in1, in2, in3;
 	uint32x4_t r0, r1, r2, r3;

-	in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
-	in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
+	in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
+	in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
 	in3 = (uint32x4_t) {
 		blkno,
 		le32dec(nonce),
@@ -245,27 +225,27 @@ chacha_stream_neon(uint8_t *restrict s,
 		r2 = in2;
 		r3 = in3;
 		chacha_permute(&r0, &r1, &r2, &r3, nr);
-		r0 = vhtole_u32(vaddq_u32(r0, in0));
-		r1 = vhtole_u32(vaddq_u32(r1, in1));
-		r2 = vhtole_u32(vaddq_u32(r2, in2));
-		r3 = vhtole_u32(vaddq_u32(r3, in3));
+		r0 = vaddq_u32(r0, in0);
+		r1 = vaddq_u32(r1, in1);
+		r2 = vaddq_u32(r2, in2);
+		r3 = vaddq_u32(r3, in3);

 		if (n < 64) {
 			uint8_t buf[64] __aligned(16);

-			vst1q_u32((uint32_t *)buf + 4*0, r0);
-			vst1q_u32((uint32_t *)buf + 4*1, r1);
-			vst1q_u32((uint32_t *)buf + 4*2, r2);
-			vst1q_u32((uint32_t *)buf + 4*3, r3);
+			vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
+			vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
+			vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
+			vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));

 			memcpy(s, buf, n);
 			break;
 		}

-		vst1q_u32((uint32_t *)s + 4*0, r0);
-		vst1q_u32((uint32_t *)s + 4*1, r1);
-		vst1q_u32((uint32_t *)s + 4*2, r2);
-		vst1q_u32((uint32_t *)s + 4*3, r3);
+		vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
+		vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
+		vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
+		vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
 		in3 = vaddq_u32(in3, blkno_inc);
 	}
 }
@@ -288,9 +268,9 @@ chacha_stream_xor_neon(uint8_t *s, const
 	uint32x4_t in0, in1, in2, in3;
 	uint32x4_t r0, r1, r2, r3;

-	in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
-	in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
+	in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
+	in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
 	in3 = (uint32x4_t) {
 		blkno,
 		le32dec(nonce),
@@ -304,19 +284,19 @@ chacha_stream_xor_neon(uint8_t *s, const
 		r2 = in2;
 		r3 = in3;
 		chacha_permute(&r0, &r1, &r2, &r3, nr);
-		r0 = vhtole_u32(vaddq_u32(r0, in0));
-		r1 = vhtole_u32(vaddq_u32(r1, in1));
-		r2 = vhtole_u32(vaddq_u32(r2, in2));
-		r3 = vhtole_u32(vaddq_u32(r3, in3));
+		r0 = vaddq_u32(r0, in0);
+		r1 = vaddq_u32(r1, in1);
+		r2 = vaddq_u32(r2, in2);
+		r3 = vaddq_u32(r3, in3);

 		if (n < 64) {
 			uint8_t buf[64] __aligned(16);
 			unsigned i;

-			vst1q_u32((uint32_t *)buf + 4*0, r0);
-			vst1q_u32((uint32_t *)buf + 4*1, r1);
-			vst1q_u32((uint32_t *)buf + 4*2, r2);
-			vst1q_u32((uint32_t *)buf + 4*3, r3);
+			vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
+			vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
+			vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
+			vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));

 			for (i = 0; i < n - n%4; i += 4)
 				le32enc(s + i,
@@ -327,14 +307,14 @@ chacha_stream_xor_neon(uint8_t *s, const
 			break;
 		}

-		r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
-		r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
-		r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
-		r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
-		vst1q_u32((uint32_t *)s + 4*0, r0);
-		vst1q_u32((uint32_t *)s + 4*1, r1);
-		vst1q_u32((uint32_t *)s + 4*2, r2);
-		vst1q_u32((uint32_t *)s + 4*3, r3);
+		r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0));
+		r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16));
+		r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32));
+		r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48));
+		vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
+		vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
+		vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
+		vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
 		in3 = vaddq_u32(in3, blkno_inc);
 	}
 }
diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_32.S
--- a/sys/crypto/chacha/arch/arm/chacha_neon_32.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon_32.S	Wed Aug 05 19:42:16 2020 +0000
@@ -54,7 +54,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
  */

	.macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
-	vld1.32	{\c2-\c3}, [fp, :256]
+	vld1.8	{\c2-\c3}, [fp, :256]
 	.endm

	.macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
@@ -80,7 +80,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vadd.u32	\c2, \c2, \d2
 	vadd.u32	\c3, \c3, \d3

-	vst1.32	{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */
+	vst1.8	{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */

 	veor	\c0, \b0, \c0
 	veor	\c1, \b1, \c1
@@ -118,7 +118,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vtbl.8	\d3l, {\d3l}, \c0l
 	vtbl.8	\d3h, {\d3h}, \c0l

-	vld1.32	{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */
+	vld1.8	{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */

 	/* c += d; b ^= c; b <<<= 7 */
 	vadd.u32	\c2, \c2, \d2
@@ -126,7 +126,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vadd.u32	\c0, \c0, \d0
 	vadd.u32	\c1, \c1, \d1

-	vst1.32	{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */
+	vst1.8	{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */

 	veor	\c2, \b2, \c2
 	veor	\c3, \b3, \c3
@@ -143,14 +143,6 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vsri.u32	\b1, \c3, #(32 - 7)
 	.endm

-#if _BYTE_ORDER == _LITTLE_ENDIAN
-#define	HTOLE32(x)
-#define	LE32TOH(x)
-#elif _BYTE_ORDER == _BIG_ENDIAN
-#define	HTOLE32(x)	vrev32.8 x, x
-#define	LE32TOH(x)	vrev32.8 x, x
-#endif
-
 	.text
 	.p2align 2
 .Lconstants_addr:
@@ -183,9 +175,9 @@ ENTRY(chacha_stream256_neon)
 	ldm	ip, {r4, r5}		/* r4 := const, r5 := nr */
 	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

-	vld1.32	{q12}, [r4]	/* q12 := constant */
-	vld1.32	{q13-q14}, [r3]	/* q13-q14 := key */
-	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */
+	vld1.8	{q12}, [r4]	/* q12 := constant */
+	vld1.8	{q13-q14}, [r3]	/* q13-q14 := key */
+	vld1.8	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */

 	vdup.32	q0, d24[0]	/* q0-q3 := constant */
 	vdup.32	q1, d24[1]
@@ -205,23 +197,6 @@ ENTRY(chacha_stream256_neon)
 	vdup.32	q14, r8
 	vdup.32	q15, r10

-	HTOLE32(q0)
-	HTOLE32(q1)
-	HTOLE32(q2)
-	HTOLE32(q3)
-	HTOLE32(q4)
-	HTOLE32(q5)
-	HTOLE32(q6)
-	HTOLE32(q7)
-	HTOLE32(q8)
-	HTOLE32(q9)
-	HTOLE32(q10)
-	HTOLE32(q11)
-	HTOLE32(q12)
-	HTOLE32(q13)
-	HTOLE32(q14)
-	HTOLE32(q15)
-
 	b	2f

 	_ALIGN_TEXT
@@ -275,7 +250,7 @@ 2:	subs	r5, r5, #2

 	sub	r7, r7, #0x10
 	vdup.32	q8, r1		/* q8 := (blkno, blkno, blkno, blkno) */
-	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */
+	vld1.8	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

 	vzip.32	q0, q1
 	vzip.32	q2, q3
@@ -283,9 +258,9 @@ 2:	subs	r5, r5, #2
 	vzip.32	q6, q7

 	vadd.u32	q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q9}, [r4]	/* q9 := constant */
+	vld1.8	{q9}, [r4]	/* q9 := constant */
 	vadd.u32	q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q8}, [r3]!	/* q8 := key[0:16) */
+	vld1.8	{q8}, [r3]!	/* q8 := key[0:16) */

 	vswp	d1, d4
 	vswp	d9, d12
@@ -330,19 +305,10 @@ 2:	subs	r5, r5, #2
 	vadd.u32	q3, q3, q8
 	vadd.u32	q7, q7, q8

-	vld1.32	{q8-q9}, [fp, :256]	/* restore q8-q9 */
+	vld1.8	{q8-q9}, [fp, :256]	/* restore q8-q9 */

-	LE32TOH(q0)
-	LE32TOH(q1)
-	LE32TOH(q2)
-	LE32TOH(q3)
-	LE32TOH(q4)
-	LE32TOH(q5)
-	LE32TOH(q6)
-	LE32TOH(q7)
-
-	vst1.32	{q0-q1}, [r0]!
-	vld1.32	{q0}, [r3]	/* q0 := key[16:32) */
+	vst1.8	{q0-q1}, [r0]!
+	vld1.8	{q0}, [r3]	/* q0 := key[16:32) */
 	mov	r3, #0	/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
 	vmov	d2, r3, r6
 	vmov	d3, r8, r10
@@ -370,23 +336,14 @@ 2:	subs	r5, r5, #2
 	vadd.u32	q11, q11, q1
 	vadd.u32	q15, q15, q1

-	LE32TOH(q8)
-	LE32TOH(q9)
-	LE32TOH(q10)
-	LE32TOH(q11)
-	LE32TOH(q12)
-	LE32TOH(q13)
-	LE32TOH(q14)
-	LE32TOH(q15)
-
-	/* vst1.32 {q0-q1}, [r0]! */
-	vst1.32	{q8-q9}, [r0]!
-	vst1.32	{q2-q3}, [r0]!
-	vst1.32	{q10-q11}, [r0]!
-	vst1.32	{q4-q5}, [r0]!
-	vst1.32	{q12-q13}, [r0]!
-	vst1.32	{q6-q7}, [r0]!
-	vst1.32	{q14-q15}, [r0]
+	/* vst1.8 {q0-q1}, [r0]! */
+	vst1.8	{q8-q9}, [r0]!
+	vst1.8	{q2-q3}, [r0]!
+	vst1.8	{q10-q11}, [r0]!
+	vst1.8	{q4-q5}, [r0]!
+	vst1.8	{q12-q13}, [r0]!
+	vst1.8	{q6-q7}, [r0]!
+	vst1.8	{q14-q15}, [r0]

 	/* zero temporary space on the stack */
 	vmov.i32	q0, #0
@@ -426,9 +383,9 @@ ENTRY(chacha_stream_xor256_neon)
 	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
 	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

-	vld1.32	{q12}, [r5]	/* q12 := constant */
-	vld1.32	{q13-q14}, [r4]	/* q13-q14 := key */
-	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */
+	vld1.8	{q12}, [r5]	/* q12 := constant */
+	vld1.8	{q13-q14}, [r4]	/* q13-q14 := key */
+	vld1.8	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */

 	vdup.32	q0, d24[0]	/* q0-q3 := constant */
 	vdup.32	q1, d24[1]
@@ -448,23 +405,6 @@ ENTRY(chacha_stream_xor256_neon)
 	vdup.32	q14, r8
 	vdup.32	q15, r10

-	HTOLE32(q0)
-	HTOLE32(q1)
-	HTOLE32(q2)
-	HTOLE32(q3)
-	HTOLE32(q4)
-	HTOLE32(q5)
-	HTOLE32(q6)
-	HTOLE32(q7)
-	HTOLE32(q8)
-	HTOLE32(q9)
-	HTOLE32(q10)
-	HTOLE32(q11)
-	HTOLE32(q12)
-	HTOLE32(q13)
-	HTOLE32(q14)
-	HTOLE32(q15)
-
 	b	2f

 	_ALIGN_TEXT
@@ -488,7 +428,7 @@ 2:	subs	ip, ip, #2

 	sub	r7, r7, #0x10
 	vdup.32	q8, r2		/* q8 := (blkno, blkno, blkno, blkno) */
-	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */
+	vld1.8	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

 	vzip.32	q0, q1
 	vzip.32	q2, q3
@@ -496,9 +436,9 @@ 2:	subs	ip, ip, #2
 	vzip.32	q6, q7

 	vadd.u32	q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q9}, [r5]	/* q9 := constant */
+	vld1.8	{q9}, [r5]	/* q9 := constant */
 	vadd.u32	q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q8}, [r4]!	/* q8 := key[0:16) */
+	vld1.8	{q8}, [r4]!	/* q8 := key[0:16) */

 	vswp	d3, d6
 	vswp	d9, d12
@@ -518,24 +458,15 @@ 2:	subs	ip, ip, #2
 	vadd.u32	q3, q3, q8
 	vadd.u32	q7, q7, q8

-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [0:32) */
-
-	LE32TOH(q0)
-	LE32TOH(q1)
-	LE32TOH(q2)
-	LE32TOH(q6)
-	LE32TOH(q4)
-	LE32TOH(q5)
-	LE32TOH(q3)
-	LE32TOH(q7)
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [0:32) */

 	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
 	veor	q1, q1, q9

-	vld1.32	{q8-q9}, [fp, :256]	/* restore q8-q9 */
+	vld1.8	{q8-q9}, [fp, :256]	/* restore q8-q9 */

-	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
-	vld1.32	{q0}, [r4]	/* q0 := key[16:32) */
+	vst1.8	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
+	vld1.8	{q0}, [r4]	/* q0 := key[16:32) */
 	mov	r3, #0	/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
 	vmov	d2, r3, r6
 	vmov	d3, r8, r10
@@ -563,57 +494,48 @@ 2:	subs	ip, ip, #2
 	vadd.u32	q11, q11, q1
 	vadd.u32	q15, q15, q1

-	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */
-
-	LE32TOH(q8)
-	LE32TOH(q9)
-	LE32TOH(q10)
-	LE32TOH(q11)
-	LE32TOH(q12)
-	LE32TOH(q13)
-	LE32TOH(q14)
-	LE32TOH(q15)
+	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */

 	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
 	veor	q1, q1, q9

-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
-	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
-	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
+	vst1.8	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
+	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */

 	veor	q2, q2, q8	/* compute ciphertext bytes [64:96) */
 	veor	q3, q3, q9

-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
-	vst1.32	{q2-q3}, [r0]!	/* store ciphertext bytes [64:80) */
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
+	vst1.8	{q2-q3}, [r0]!	/* store ciphertext bytes [64:80) */

 	veor	q10, q10, q0	/* compute ciphertext bytes [96:128) */
 	veor	q11, q11, q1

-	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
-	vst1.32	{q10-q11}, [r0]!	/* store ciphertext bytes [80:96) */
+	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
+	vst1.8	{q10-q11}, [r0]!	/* store ciphertext bytes [80:96) */

 	veor	q4, q4, q8	/* compute ciphertext bytes [128:160) */
 	veor	q5, q5, q9

-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
-	vst1.32	{q4-q5}, [r0]!	/* store ciphertext bytes [96:112) */
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
+	vst1.8	{q4-q5}, [r0]!	/* store ciphertext bytes [96:112) */

 	veor	q12, q12, q0	/* compute ciphertext bytes [160:192) */
 	veor	q13, q13, q1

-	vld1.32	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
-	vst1.32	{q12-q13}, [r0]!	/* store ciphertext bytes [112:128) */
+	vld1.8	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
+	vst1.8	{q12-q13}, [r0]!	/* store ciphertext bytes [112:128) */

 	veor	q6, q6, q8	/* compute ciphertext bytes [192:224) */
 	veor	q7, q7, q9

-	vst1.32	{q6-q7}, [r0]!	/* store ciphertext bytes [192:224) */
+	vst1.8	{q6-q7}, [r0]!	/* store ciphertext bytes [192:224) */

 	veor	q14, q14, q0	/* compute ciphertext bytes [224:256) */
 	veor	q15, q15, q1

-	vst1.32	{q14-q15}, [r0]	/* store ciphertext bytes [224:256) */
+	vst1.8	{q14-q15}, [r0]	/* store ciphertext bytes [224:256) */

 	/* zero temporary space on the stack */
 	vmov.i32	q0, #0
@@ -632,10 +554,10 @@ END(chacha_stream_xor256_neon)

 	.type	v0123,%object
 v0123:
-	.long	0, 1, 2, 3
+	.byte	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
 END(v0123)

 	.type	rot8,%object
 rot8:
-	.long	0x02010003, 0x06050407
+	.byte	3,0,1,2, 7,4,5,6
 END(rot8)
diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_64.S
--- a/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Wed Aug 05 19:42:16 2020 +0000
@@ -169,22 +169,22 @@ ENTRY(chacha_stream256_neon)
 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

-	HTOLE32(v0.16b)
-	HTOLE32(v1.16b)
-	HTOLE32(v2.16b)
-	HTOLE32(v3.16b)
-	HTOLE32(v4.16b)
-	HTOLE32(v5.16b)
-	HTOLE32(v6.16b)
-	HTOLE32(v7.16b)
-	HTOLE32(v8.16b)
-	HTOLE32(v9.16b)
-	HTOLE32(v10.16b)
-	HTOLE32(v11.16b)
-	HTOLE32(v12.16b)
-	HTOLE32(v13.16b)
-	HTOLE32(v14.16b)
-	HTOLE32(v15.16b)
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	/* LE32TOH(v12.16b) -- blkno, already host order */
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)

 	mov	v16.16b, v0.16b
 	mov	v17.16b, v1.16b
@@ -234,22 +234,22 @@ 1:	subs	w5, w5, #2
 	add	v14.4s, v14.4s, v30.4s
 	add	v15.4s, v15.4s, v31.4s

-	LE32TOH(v0.16b)
-	LE32TOH(v1.16b)
-	LE32TOH(v2.16b)
-	LE32TOH(v3.16b)
-	LE32TOH(v4.16b)
-	LE32TOH(v5.16b)
-	LE32TOH(v6.16b)
-	LE32TOH(v7.16b)
-	LE32TOH(v8.16b)
-	LE32TOH(v9.16b)
-	LE32TOH(v10.16b)
-	LE32TOH(v11.16b)
-	LE32TOH(v12.16b)
-	LE32TOH(v13.16b)
-	LE32TOH(v14.16b)
-	LE32TOH(v15.16b)
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)

 	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
 	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
@@ -308,22 +308,22 @@ ENTRY(chacha_stream_xor256_neon)
 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

-	HTOLE32(v0.16b)
-	HTOLE32(v1.16b)
-	HTOLE32(v2.16b)
-	HTOLE32(v3.16b)
-	HTOLE32(v4.16b)
-	HTOLE32(v5.16b)
-	HTOLE32(v6.16b)
-	HTOLE32(v7.16b)
-	HTOLE32(v8.16b)
-	HTOLE32(v9.16b)
-	HTOLE32(v10.16b)
-	HTOLE32(v11.16b)
-	HTOLE32(v12.16b)
-	HTOLE32(v13.16b)
-	HTOLE32(v14.16b)
-	HTOLE32(v15.16b)
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	/* LE32TOH(v12.16b) -- blkno, already host order */
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)

 	mov	v16.16b, v0.16b
 	mov	v17.16b, v1.16b
@@ -401,22 +401,22 @@ 1:	subs	w6, w6, #2
 	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
 	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

-	LE32TOH(v0.16b)
-	LE32TOH(v1.16b)
-	LE32TOH(v2.16b)
-	LE32TOH(v3.16b)
-	LE32TOH(v4.16b)
-	LE32TOH(v5.16b)
-	LE32TOH(v6.16b)
-	LE32TOH(v7.16b)
-	LE32TOH(v8.16b)
-	LE32TOH(v9.16b)
-	LE32TOH(v10.16b)
-	LE32TOH(v11.16b)
-	LE32TOH(v12.16b)
-	LE32TOH(v13.16b)
-	LE32TOH(v14.16b)
-	LE32TOH(v15.16b)
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)

 	eor	v16.16b, v16.16b, v0.16b
 	eor	v17.16b, v17.16b, v1.16b
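
For illustration only (not part of the patch): a minimal userland sketch of the byte-order convention the intrinsics side of this diff converges on for the AES-CCM counter word -- load and store blocks as bytes (vld1q_u8/vst1q_u8) and use vrev32q_u8 to move between the big-endian wire format and host-order 32-bit lanes, instead of wrapping element-sized loads in _BYTE_ORDER conditionals.  The helper names (ctr_inc_ref, ctr_inc_neon) are made up for the demo, and the lane numbering in the comments was only checked against a little-endian AArch64 host; treat it as a sketch, not as the kernel code.

/* demo.c -- hypothetical standalone test; build with: cc -O2 demo.c (AArch64) */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arm_neon.h>

/* Reference: increment the last (big-endian) 32-bit word of a CTR block. */
static void
ctr_inc_ref(uint8_t ctr[16])
{
	uint32_t w = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
	    ((uint32_t)ctr[14] << 8) | (uint32_t)ctr[15];

	w++;
	ctr[12] = w >> 24; ctr[13] = w >> 16; ctr[14] = w >> 8; ctr[15] = w;
}

/* NEON: the same operation via byte loads plus vrev32q_u8, as in the patch. */
static void
ctr_inc_neon(uint8_t ctr[16])
{
	const uint32x4_t inc = { 0, 0, 0, 1 };	/* bumps lane 3 = bytes 12..15 */
	uint8x16_t be = vld1q_u8(ctr);		/* bytes in memory order */
	uint32x4_t host = vreinterpretq_u32_u8(vrev32q_u8(be));

	host = vaddq_u32(host, inc);
	vst1q_u8(ctr, vrev32q_u8(vreinterpretq_u8_u32(host)));
}

int
main(void)
{
	uint8_t a[16], b[16];
	unsigned i;

	for (i = 0; i < 16; i++)	/* last byte ends at 0xff to exercise a carry */
		a[i] = b[i] = (uint8_t)(0xf0 + i);
	ctr_inc_ref(a);
	ctr_inc_neon(b);
	printf("%s\n", memcmp(a, b, 16) == 0 ? "match" : "MISMATCH");
	return 0;
}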