diff -r b9bd12fb9564 sys/arch/aarch64/aarch64/cpu.c
--- a/sys/arch/aarch64/aarch64/cpu.c	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/arch/aarch64/aarch64/cpu.c	Wed Aug 05 19:42:16 2020 +0000
@@ -621,6 +621,7 @@ cpu_setup_aes(device_t dv, struct cpu_in
 	switch (__SHIFTOUT(id->ac_aa64isar0, ID_AA64ISAR0_EL1_AES)) {
 	case ID_AA64ISAR0_EL1_AES_AES:
 	case ID_AA64ISAR0_EL1_AES_PMUL:
+		break;
 		aes_md_init(&aes_armv8_impl);
 		return;
 	default:
diff -r b9bd12fb9564 sys/crypto/aes/arch/arm/aes_armv8_64.S
--- a/sys/crypto/aes/arch/arm/aes_armv8_64.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_armv8_64.S	Wed Aug 05 19:42:16 2020 +0000
@@ -26,8 +26,6 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */

-#include <sys/endian.h>
-
 #include <aarch64/asm.h>

 RCSID("$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $")
@@ -921,19 +919,13 @@ ENTRY(aesarmv8_ccm_enc1)
 	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b	/* q2 := ctr (host-endian) */
-#endif
 	_ALIGN_TEXT
 1:	ldr	q3, [x1], #0x10		/* q3 := plaintext block */
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v1.16b, v2.16b	/* q1 := ctr (big-endian) */
-#else
-	mov	v1.16b, v2.16b	/* q1 := ctr (big-endian) */
-#endif
 	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
 	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
 					 * trash x0/x3/q16 */
@@ -941,9 +933,7 @@ 1:	ldr	q3, [x1], #0x10		/* q3 := plainte
 	subs	x10, x10, #0x10		/* count down bytes */
 	str	q3, [x2], #0x10		/* store ciphertext block */
 	b.ne	1b			/* repeat if more blocks */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b	/* q2 := ctr (big-endian) */
-#endif
 	stp	q0, q2, [x4]		/* store updated auth/ctr */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
@@ -968,18 +958,12 @@ ENTRY(aesarmv8_ccm_dec1)
 	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b	/* q2 := ctr (host-endian) */
-#endif

 	/* Decrypt the first block. */
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x3, x5			/* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
-#else
-	mov	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
-#endif
 	ldr	q3, [x1], #0x10		/* q3 := ctxt */
 	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
 	b	2f
@@ -995,11 +979,7 @@ 1:	/*
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
-#else
-	mov	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
-#endif
 	ldr	q3, [x1], #0x10		/* q3 := ctxt */
 	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
 					 * trash x0/x3/q16 */
@@ -1009,15 +989,14 @@ 2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := p
 	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
 	b.ne	1b
-#if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b	/* q2 := ctr (big-endian) */
-#endif

 	/* Authenticate the last block. */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
 	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
 	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */

+	stp	q0, q2, [x4]		/* store updated auth/ctr */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
diff -r b9bd12fb9564 sys/crypto/aes/arch/arm/aes_neon_32.S
--- a/sys/crypto/aes/arch/arm/aes_neon_32.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_32.S	Wed Aug 05 19:42:16 2020 +0000
@@ -270,7 +270,7 @@ ENTRY(aes_neon_enc1)
 	ldr	r12, .Lconstants_addr
 	adr	r11, .Lconstants_addr

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 	movw	r3, #0
 	vmov.i8	q1, #0x0f

@@ -280,8 +280,8 @@ ENTRY(aes_neon_enc1)
 	/* (q4, q5) := (iptlo, ipthi) */
 	add	r6, r12, #(iptlo - .Lconstants)
 	add	r7, r12, #(ipthi - .Lconstants)
-	vld1.64	{d8-d9}, [r6 :128]
-	vld1.64	{d10-d11}, [r7 :128]
+	vld1.8	{d8-d9}, [r6 :128]
+	vld1.8	{d10-d11}, [r7 :128]

 	/* load the rest of the constants */
 	add	r4, r12, #(sb1_0 - .Lconstants)
@@ -290,12 +290,12 @@ ENTRY(aes_neon_enc1)
 	add	r7, r12, #(sb2_1 - .Lconstants)
 	add	r8, r12, #(inv - .Lconstants)
 	add	r10, r12, #(inva - .Lconstants)
-	vld1.64	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
-	vld1.64	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
-	vld1.64	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
-	vld1.64	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
-	vld1.64	{d20-d21}, [r8 :128]	/* q10 = inv */
-	vld1.64	{d22-d23}, [r10 :128]	/* q11 = inva */
+	vld1.8	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
+	vld1.8	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
+	vld1.8	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
+	vld1.8	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
+	vld1.8	{d20-d21}, [r8 :128]	/* q10 = inv */
+	vld1.8	{d22-d23}, [r10 :128]	/* q11 = inva */

 	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
 	add	r4, r12, #(mc_forward - .Lconstants)
@@ -319,7 +319,7 @@ ENTRY(aes_neon_enc1)
 	b	2f

 	_ALIGN_TEXT
-1:	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+1:	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

 	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
 	vtbl.8	d24, {d12-d13}, d4
@@ -339,8 +339,8 @@ 1:	vld1.64	{d28-d29}, [r0 :128]!	/* q14
 	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
 	add	r6, r4, r3, lsl #4
 	add	r7, r5, r3, lsl #4
-	vld1.64	{d24-d25}, [r6]
-	vld1.64	{d26-d27}, [r7]
+	vld1.8	{d24-d25}, [r6]
+	vld1.8	{d26-d27}, [r7]

 	/* q15 := A2_B = A2 + A(mcf) */
 	vtbl.8	d30, {d0-d1}, d24
@@ -412,11 +412,11 @@ 2:	/*
 	add	r6, r12, #(sbo_0 - .Lconstants)
 	add	r7, r12, #(sbo_1 - .Lconstants)
 	add	r8, r8, r3, lsl #4
-	vld1.64	{d12-d13}, [r6 :128]
-	vld1.64	{d14-d15}, [r7 :128]
-	vld1.64	{d30-d31}, [r8 :128]
+	vld1.8	{d12-d13}, [r6 :128]
+	vld1.8	{d14-d15}, [r7 :128]
+	vld1.8	{d30-d31}, [r8 :128]

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

 	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
 	vtbl.8	d4, {d12-d13}, d4
@@ -489,7 +489,7 @@ ENTRY(aes_neon_dec1)
 	ldr	r12, .Lconstants_addr
 	adr	r11, .Lconstants_addr

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 	rsb	r3, r1, #0	/* r3 := ~(x - 1) = -x */
 	vmov.i8	q1, #0x0f
 	and	r3, r3, #3	/* r3 := 3 & ~(x - 1) */
@@ -500,8 +500,8 @@ ENTRY(aes_neon_dec1)
 	/* (q4, q5) := (diptlo, dipthi) */
 	add	r6, r12, #(diptlo - .Lconstants)
 	add	r7, r12, #(dipthi - .Lconstants)
-	vld1.64	{d8-d9}, [r6 :128]
-	vld1.64	{d10-d11}, [r7 :128]
+	vld1.8	{d8-d9}, [r6 :128]
+	vld1.8	{d10-d11}, [r7 :128]

 	/* load the rest of the constants */
 	add	r4, r12, #(dsbb_0 - .Lconstants)
@@ -509,11 +509,11 @@ ENTRY(aes_neon_dec1)
 	add	r6, r12, #(inv - .Lconstants)
 	add	r7, r12, #(inva - .Lconstants)
 	add	r8, r12, #(.Lmc_forward_3 - .Lconstants)
-	vld1.64	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
-	vld1.64	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
-	vld1.64	{d20-d21}, [r6 :128]	/* q10 := inv */
-	vld1.64	{d22-d23}, [r7 :128]	/* q11 := inva */
-	vld1.64	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */
+	vld1.8	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
+	vld1.8	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
+	vld1.8	{d20-d21}, [r6 :128]	/* q10 := inv */
+	vld1.8	{d22-d23}, [r7 :128]	/* q11 := inva */
+	vld1.8	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */

 	/* (q2, q3) := (lo, hi) */
 	vshr.u8	q3, q0, #4
@@ -529,8 +529,8 @@ ENTRY(aes_neon_dec1)
 	/* load dsb9 */
 	add	r4, r12, #(dsb9_0 - .Lconstants)
 	add	r5, r12, #(dsb9_1 - .Lconstants)
-	vld1.64	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
-	vld1.64	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */
+	vld1.8	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
+	vld1.8	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */

 	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
 	veor	q0, q14, q2
@@ -541,10 +541,10 @@ ENTRY(aes_neon_dec1)
 	_ALIGN_TEXT
 1:	/* load dsbd */
 	add	r4, r12, #(dsbd_0 - .Lconstants)
-	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
-	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */
+	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
+	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

 	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
 	vtbl.8	d24, {d8-d9}, d4
@@ -568,8 +568,8 @@ 1:	/* load dsbd */

 	/* load dsbe */
 	add	r4, r12, #(dsbe_0 - .Lconstants)
-	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
-	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */
+	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
+	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */

 	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
 	vtbl.8	d28, {d0-d1}, d30
@@ -647,11 +647,11 @@ 2:	/*
 	add	r6, r12, #(dsbo_0 - .Lconstants)
 	add	r7, r12, #(dsbo_1 - .Lconstants)
 	add	r8, r8, r3, lsl #4
-	vld1.64	{d12-d13}, [r6 :128]
-	vld1.64	{d14-d15}, [r7 :128]
-	vld1.64	{d30-d31}, [r8 :128]
+	vld1.8	{d12-d13}, [r6 :128]
+	vld1.8	{d14-d15}, [r7 :128]
+	vld1.8	{d30-d31}, [r8 :128]

-	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
+	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

 	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
 	vtbl.8	d4, {d12-d13}, d4
diff -r b9bd12fb9564 sys/crypto/aes/arch/arm/aes_neon_subr.c
--- a/sys/crypto/aes/arch/arm/aes_neon_subr.c	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_subr.c	Wed Aug 05 19:42:16 2020 +0000
@@ -161,32 +161,35 @@ static int
 aes_neon_xts_update_selftest(void)
 {
 	static const struct {
-		uint32_t in[4], out[4];
+		uint8_t in[16], out[16];
 	} cases[] = {
 		[0] = { {1}, {2} },
-		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
-		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
-		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
-		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
-		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
+		[1] = { {0,0,0,0x80, 0,0,0,0, 0,0,0,0, 0,0,0,0},
+			{0,0,0,0, 1,0,0,0, 0,0,0,0, 0,0,0,0} },
+		[2] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0},
+			{0,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} },
+		[3] = { {0,0,0,0, 0,0,0,0, 0,0,0,0x80, 0,0,0,0},
+			{0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0} },
+		[4] = { {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80},
+			{0x87,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0} },
+		[5] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0x80},
+			{0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} },
 	};
 	unsigned i;
-	uint32_t t[4];
+	uint8_t t[16];
 	int result = 0;

 	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
-		t[0] = cases[i].in[0];
-		t[1] = cases[i].in[1];
-		t[2] = cases[i].in[2];
-		t[3] = cases[i].in[3];
-		storeblock(t, aes_neon_xts_update(loadblock(t)));
-		if (t[0] != cases[i].out[0] ||
-		    t[1] != cases[i].out[1] ||
-		    t[2] != cases[i].out[2] ||
-		    t[3] != cases[i].out[3]) {
-			printf("%s %u:"
-			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
-			    __func__, i, t[0], t[1], t[2], t[3]);
+		storeblock(t, aes_neon_xts_update(loadblock(cases[i].in)));
+		if (memcmp(t, cases[i].out, 16)) {
+			char buf[33];
+			unsigned j;
+
+			for (j = 0; j < 16; j++) {
+				snprintf(buf + 2*j, sizeof(buf) - 2*j,
+				    " %02hhx", t[j]);
+			}
+			printf("%s %u: %s\n", __func__, i, buf);
 			result = -1;
 		}
 	}
@@ -289,16 +292,6 @@ aes_neon_cbcmac_update1(const struct aes
  * function, which should substantially improve CCM throughput.
  */

-#if _BYTE_ORDER == _LITTLE_ENDIAN
-#define	vbetoh32q_u8	vrev32q_u8
-#define	vhtobe32q_u8	vrev32q_u8
-#elif _BYTE_ORDER == _BIG_ENDIAN
-#define	vbetoh32q_u8(x)	(x)
-#define	vhtobe32q_u8(x)	(x)
-#else
-#error what kind of endian are you anyway
-#endif
-
 void
 aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
@@ -313,12 +306,12 @@ aes_neon_ccm_enc1(const struct aesenc *e
 	auth = loadblock(authctr);
 	ctr_be = loadblock(authctr + 16);
-	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	ctr = vreinterpretq_u32_u8(vrev32q_u8(ctr_be));
 	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 		uint8x16x2_t b2;

 		ptxt = loadblock(in);
 		ctr = vaddq_u32(ctr, ctr32_inc);
-		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr));
 		b2.val[0] = auth ^ ptxt;
 		b2.val[1] = ctr_be;
@@ -343,9 +336,9 @@ aes_neon_ccm_dec1(const struct aesenc *e
 	KASSERT(nbytes % 16 == 0);

 	ctr_be = loadblock(authctr + 16);
-	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	ctr = vreinterpretq_u32_u8(vrev32q_u8(ctr_be));
 	ctr = vaddq_u32(ctr, ctr32_inc);
-	ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+	ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr));
 	pad = aes_neon_enc1(enc, ctr_be, nrounds);
 	auth = loadblock(authctr);
 	for (;; in += 16, out += 16) {
@@ -359,7 +352,7 @@ aes_neon_ccm_dec1(const struct aesenc *e
 			break;

 		ctr = vaddq_u32(ctr, ctr32_inc);
-		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		ctr_be = vrev32q_u8(vreinterpretq_u8_u32(ctr));
 		b2.val[0] = auth;
 		b2.val[1] = ctr_be;
 		b2 = aes_neon_enc2(enc, b2, nrounds);
diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon.c
--- a/sys/crypto/chacha/arch/arm/chacha_neon.c	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon.c	Wed Aug 05 19:42:16 2020 +0000
@@ -46,26 +46,6 @@ vrolq_n_u32(uint32x4_t x, uint8_t n)
 	return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
 #endif
 }
-
-static inline uint32x4_t
-vhtole_u32(uint32x4_t x)
-{
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-	return x;
-#elif _BYTE_ORDER == _BIG_ENDIAN
-	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
-#endif
-}
-
-static inline uint32x4_t
-vletoh_u32(uint32x4_t x)
-{
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-	return x;
-#elif _BYTE_ORDER == _BIG_ENDIAN
-	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
-#endif
-}

 static inline uint32x4_t
 rol16(uint32x4_t x)
@@ -180,17 +160,17 @@ chacha_core_neon(uint8_t out[restrict st
 	uint32x4_t in0, in1, in2, in3;
 	uint32x4_t r0, r1, r2, r3;

-	r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
-	r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
-	r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
+	r0 = in0 = vreinterpretq_u32_u8(vld1q_u8(c));
+	r1 = in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	r2 = in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
+	r3 = in3 = vreinterpretq_u32_u8(vld1q_u8(in));

 	chacha_permute(&r0, &r1, &r2, &r3, nr);

-	vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
-	vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
-	vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
-	vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
+	vst1q_u8(out + 0, vreinterpretq_u8_u32(vaddq_u32(r0, in0)));
+	vst1q_u8(out + 16, vreinterpretq_u8_u32(vaddq_u32(r1, in1)));
+	vst1q_u8(out + 32, vreinterpretq_u8_u32(vaddq_u32(r2, in2)));
+	vst1q_u8(out + 48, vreinterpretq_u8_u32(vaddq_u32(r3, in3)));
 }

 void
@@ -202,15 +182,15 @@ hchacha_neon(uint8_t out[restrict static
 {
 	uint32x4_t r0, r1, r2, r3;

-	r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
-	r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
-	r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
+	r0 = vreinterpretq_u32_u8(vld1q_u8(c));
+	r1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	r2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
+	r3 = vreinterpretq_u32_u8(vld1q_u8(in));

 	chacha_permute(&r0, &r1, &r2, &r3, nr);

-	vst1q_u32((uint32_t *)out + 0, r0);
-	vst1q_u32((uint32_t *)out + 4, r3);
+	vst1q_u8(out + 0, vreinterpretq_u8_u32(r0));
+	vst1q_u8(out + 16, vreinterpretq_u8_u32(r3));
 }

 void
@@ -229,9 +209,9 @@ chacha_stream_neon(uint8_t *restrict s,
 	uint32x4_t in0, in1, in2, in3;
 	uint32x4_t r0, r1, r2, r3;

-	in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
-	in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
+	in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
+	in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
 	in3 = (uint32x4_t) {
 		blkno,
 		le32dec(nonce),
@@ -245,27 +225,27 @@ chacha_stream_neon(uint8_t *restrict s,
 		r2 = in2;
 		r3 = in3;
 		chacha_permute(&r0, &r1, &r2, &r3, nr);
-		r0 = vhtole_u32(vaddq_u32(r0, in0));
-		r1 = vhtole_u32(vaddq_u32(r1, in1));
-		r2 = vhtole_u32(vaddq_u32(r2, in2));
-		r3 = vhtole_u32(vaddq_u32(r3, in3));
+		r0 = vaddq_u32(r0, in0);
+		r1 = vaddq_u32(r1, in1);
+		r2 = vaddq_u32(r2, in2);
+		r3 = vaddq_u32(r3, in3);

 		if (n < 64) {
 			uint8_t buf[64] __aligned(16);

-			vst1q_u32((uint32_t *)buf + 4*0, r0);
-			vst1q_u32((uint32_t *)buf + 4*1, r1);
-			vst1q_u32((uint32_t *)buf + 4*2, r2);
-			vst1q_u32((uint32_t *)buf + 4*3, r3);
+			vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
+			vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
+			vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
+			vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));

 			memcpy(s, buf, n);
 			break;
 		}

-		vst1q_u32((uint32_t *)s + 4*0, r0);
-		vst1q_u32((uint32_t *)s + 4*1, r1);
-		vst1q_u32((uint32_t *)s + 4*2, r2);
-		vst1q_u32((uint32_t *)s + 4*3, r3);
+		vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
+		vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
+		vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
+		vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
 		in3 = vaddq_u32(in3, blkno_inc);
 	}
 }
@@ -288,9 +268,9 @@ chacha_stream_xor_neon(uint8_t *s, const
 	uint32x4_t in0, in1, in2, in3;
 	uint32x4_t r0, r1, r2, r3;

-	in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
-	in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
-	in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
+	in0 = vreinterpretq_u32_u8(vld1q_u8(chacha_const32));
+	in1 = vreinterpretq_u32_u8(vld1q_u8(k + 0));
+	in2 = vreinterpretq_u32_u8(vld1q_u8(k + 16));
 	in3 = (uint32x4_t) {
 		blkno,
 		le32dec(nonce),
@@ -304,19 +284,19 @@ chacha_stream_xor_neon(uint8_t *s, const
 		r2 = in2;
 		r3 = in3;
 		chacha_permute(&r0, &r1, &r2, &r3, nr);
-		r0 = vhtole_u32(vaddq_u32(r0, in0));
-		r1 = vhtole_u32(vaddq_u32(r1, in1));
-		r2 = vhtole_u32(vaddq_u32(r2, in2));
-		r3 = vhtole_u32(vaddq_u32(r3, in3));
+		r0 = vaddq_u32(r0, in0);
+		r1 = vaddq_u32(r1, in1);
+		r2 = vaddq_u32(r2, in2);
+		r3 = vaddq_u32(r3, in3);

 		if (n < 64) {
 			uint8_t buf[64] __aligned(16);
 			unsigned i;

-			vst1q_u32((uint32_t *)buf + 4*0, r0);
-			vst1q_u32((uint32_t *)buf + 4*1, r1);
-			vst1q_u32((uint32_t *)buf + 4*2, r2);
-			vst1q_u32((uint32_t *)buf + 4*3, r3);
+			vst1q_u8(buf + 0, vreinterpretq_u8_u32(r0));
+			vst1q_u8(buf + 16, vreinterpretq_u8_u32(r1));
+			vst1q_u8(buf + 32, vreinterpretq_u8_u32(r2));
+			vst1q_u8(buf + 48, vreinterpretq_u8_u32(r3));

 			for (i = 0; i < n - n%4; i += 4)
 				le32enc(s + i,
@@ -327,14 +307,14 @@ chacha_stream_xor_neon(uint8_t *s, const
 			break;
 		}

-		r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
-		r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
-		r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
-		r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
-		vst1q_u32((uint32_t *)s + 4*0, r0);
-		vst1q_u32((uint32_t *)s + 4*1, r1);
-		vst1q_u32((uint32_t *)s + 4*2, r2);
-		vst1q_u32((uint32_t *)s + 4*3, r3);
+		r0 ^= vreinterpretq_u32_u8(vld1q_u8(p + 0));
+		r1 ^= vreinterpretq_u32_u8(vld1q_u8(p + 16));
+		r2 ^= vreinterpretq_u32_u8(vld1q_u8(p + 32));
+		r3 ^= vreinterpretq_u32_u8(vld1q_u8(p + 48));
+		vst1q_u8(s + 0, vreinterpretq_u8_u32(r0));
+		vst1q_u8(s + 16, vreinterpretq_u8_u32(r1));
+		vst1q_u8(s + 32, vreinterpretq_u8_u32(r2));
+		vst1q_u8(s + 48, vreinterpretq_u8_u32(r3));
 		in3 = vaddq_u32(in3, blkno_inc);
 	}
 }
diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_32.S
--- a/sys/crypto/chacha/arch/arm/chacha_neon_32.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon_32.S	Wed Aug 05 19:42:16 2020 +0000
@@ -54,7 +54,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
  */

	.macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
-	vld1.32	{\c2-\c3}, [fp, :256]
+	vld1.8	{\c2-\c3}, [fp, :256]
 	.endm

	.macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
@@ -80,7 +80,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vadd.u32	\c2, \c2, \d2
 	vadd.u32	\c3, \c3, \d3

-	vst1.32	{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */
+	vst1.8	{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */

 	veor	\c0, \b0, \c0
 	veor	\c1, \b1, \c1
@@ -118,7 +118,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vtbl.8	\d3l, {\d3l}, \c0l
 	vtbl.8	\d3h, {\d3h}, \c0l

-	vld1.32	{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */
+	vld1.8	{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */

 	/* c += d; b ^= c; b <<<= 7 */
 	vadd.u32	\c2, \c2, \d2
@@ -126,7 +126,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vadd.u32	\c0, \c0, \d0
 	vadd.u32	\c1, \c1, \d1

-	vst1.32	{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */
+	vst1.8	{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */

 	veor	\c2, \b2, \c2
 	veor	\c3, \b3, \c3
@@ -143,14 +143,6 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vsri.u32	\b1, \c3, #(32 - 7)
 	.endm

-#if _BYTE_ORDER == _LITTLE_ENDIAN
-#define	HTOLE32(x)
-#define	LE32TOH(x)
-#elif _BYTE_ORDER == _BIG_ENDIAN
-#define	HTOLE32(x)	vrev32.8 x, x
-#define	LE32TOH(x)	vrev32.8 x, x
-#endif
-
 	.text
 	.p2align 2
 .Lconstants_addr:
@@ -183,9 +175,9 @@ ENTRY(chacha_stream256_neon)
 	ldm	ip, {r4, r5}		/* r4 := const, r5 := nr */
 	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

-	vld1.32	{q12}, [r4]	/* q12 := constant */
-	vld1.32	{q13-q14}, [r3]	/* q13-q14 := key */
-	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */
+	vld1.8	{q12}, [r4]	/* q12 := constant */
+	vld1.8	{q13-q14}, [r3]	/* q13-q14 := key */
+	vld1.8	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */

 	vdup.32	q0, d24[0]	/* q0-q3 := constant */
 	vdup.32	q1, d24[1]
@@ -205,23 +197,6 @@ ENTRY(chacha_stream256_neon)
 	vdup.32	q14, r8
 	vdup.32	q15, r10

-	HTOLE32(q0)
-	HTOLE32(q1)
-	HTOLE32(q2)
-	HTOLE32(q3)
-	HTOLE32(q4)
-	HTOLE32(q5)
-	HTOLE32(q6)
-	HTOLE32(q7)
-	HTOLE32(q8)
-	HTOLE32(q9)
-	HTOLE32(q10)
-	HTOLE32(q11)
-	HTOLE32(q12)
-	HTOLE32(q13)
-	HTOLE32(q14)
-	HTOLE32(q15)
-
 	b	2f

 	_ALIGN_TEXT
@@ -275,7 +250,7 @@ 2:	subs	r5, r5, #2

 	sub	r7, r7, #0x10
 	vdup.32	q8, r1		/* q8 := (blkno, blkno, blkno, blkno) */
-	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */
+	vld1.8	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

 	vzip.32	q0, q1
 	vzip.32	q2, q3
@@ -283,9 +258,9 @@ 2:	subs	r5, r5, #2
 	vzip.32	q6, q7

 	vadd.u32	q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q9}, [r4]	/* q9 := constant */
+	vld1.8	{q9}, [r4]	/* q9 := constant */
 	vadd.u32	q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q8}, [r3]!	/* q8 := key[0:16) */
+	vld1.8	{q8}, [r3]!	/* q8 := key[0:16) */

 	vswp	d1, d4
 	vswp	d9, d12
@@ -330,19 +305,10 @@ 2:	subs	r5, r5, #2
 	vadd.u32	q3, q3, q8
 	vadd.u32	q7, q7, q8

-	vld1.32	{q8-q9}, [fp, :256]	/* restore q8-q9 */
+	vld1.8	{q8-q9}, [fp, :256]	/* restore q8-q9 */

-	LE32TOH(q0)
-	LE32TOH(q1)
-	LE32TOH(q2)
-	LE32TOH(q3)
-	LE32TOH(q4)
-	LE32TOH(q5)
-	LE32TOH(q6)
-	LE32TOH(q7)
-
-	vst1.32	{q0-q1}, [r0]!
-	vld1.32	{q0}, [r3]	/* q0 := key[16:32) */
+	vst1.8	{q0-q1}, [r0]!
+	vld1.8	{q0}, [r3]	/* q0 := key[16:32) */
 	mov	r3, #0	/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
 	vmov	d2, r3, r6
 	vmov	d3, r8, r10
@@ -370,23 +336,14 @@ 2:	subs	r5, r5, #2
 	vadd.u32	q11, q11, q1
 	vadd.u32	q15, q15, q1

-	LE32TOH(q8)
-	LE32TOH(q9)
-	LE32TOH(q10)
-	LE32TOH(q11)
-	LE32TOH(q12)
-	LE32TOH(q13)
-	LE32TOH(q14)
-	LE32TOH(q15)
-
-	/* vst1.32 {q0-q1}, [r0]! */
-	vst1.32	{q8-q9}, [r0]!
-	vst1.32	{q2-q3}, [r0]!
-	vst1.32	{q10-q11}, [r0]!
-	vst1.32	{q4-q5}, [r0]!
-	vst1.32	{q12-q13}, [r0]!
-	vst1.32	{q6-q7}, [r0]!
-	vst1.32	{q14-q15}, [r0]
+	/* vst1.8 {q0-q1}, [r0]! */
+	vst1.8	{q8-q9}, [r0]!
+	vst1.8	{q2-q3}, [r0]!
+	vst1.8	{q10-q11}, [r0]!
+	vst1.8	{q4-q5}, [r0]!
+	vst1.8	{q12-q13}, [r0]!
+	vst1.8	{q6-q7}, [r0]!
+	vst1.8	{q14-q15}, [r0]

 	/* zero temporary space on the stack */
 	vmov.i32	q0, #0
@@ -426,9 +383,9 @@ ENTRY(chacha_stream_xor256_neon)
 	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
 	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

-	vld1.32	{q12}, [r5]	/* q12 := constant */
-	vld1.32	{q13-q14}, [r4]	/* q13-q14 := key */
-	vld1.32	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */
+	vld1.8	{q12}, [r5]	/* q12 := constant */
+	vld1.8	{q13-q14}, [r4]	/* q13-q14 := key */
+	vld1.8	{q15}, [r7, :128]!	/* q15 := (0, 1, 2, 3) (128-bit aligned) */

 	vdup.32	q0, d24[0]	/* q0-q3 := constant */
 	vdup.32	q1, d24[1]
@@ -448,23 +405,6 @@ ENTRY(chacha_stream_xor256_neon)
 	vdup.32	q14, r8
 	vdup.32	q15, r10

-	HTOLE32(q0)
-	HTOLE32(q1)
-	HTOLE32(q2)
-	HTOLE32(q3)
-	HTOLE32(q4)
-	HTOLE32(q5)
-	HTOLE32(q6)
-	HTOLE32(q7)
-	HTOLE32(q8)
-	HTOLE32(q9)
-	HTOLE32(q10)
-	HTOLE32(q11)
-	HTOLE32(q12)
-	HTOLE32(q13)
-	HTOLE32(q14)
-	HTOLE32(q15)
-
 	b	2f

 	_ALIGN_TEXT
@@ -488,7 +428,7 @@ 2:	subs	ip, ip, #2

 	sub	r7, r7, #0x10
 	vdup.32	q8, r2		/* q8 := (blkno, blkno, blkno, blkno) */
-	vld1.32	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */
+	vld1.8	{q9}, [r7, :128]	/* q9 := (0, 1, 2, 3) */

 	vzip.32	q0, q1
 	vzip.32	q2, q3
@@ -496,9 +436,9 @@ 2:	subs	ip, ip, #2
 	vzip.32	q6, q7

 	vadd.u32	q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q9}, [r5]	/* q9 := constant */
+	vld1.8	{q9}, [r5]	/* q9 := constant */
 	vadd.u32	q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
-	vld1.32	{q8}, [r4]!	/* q8 := key[0:16) */
+	vld1.8	{q8}, [r4]!	/* q8 := key[0:16) */

 	vswp	d3, d6
 	vswp	d9, d12
@@ -518,24 +458,15 @@ 2:	subs	ip, ip, #2
 	vadd.u32	q3, q3, q8
 	vadd.u32	q7, q7, q8

-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [0:32) */
-
-	LE32TOH(q0)
-	LE32TOH(q1)
-	LE32TOH(q2)
-	LE32TOH(q6)
-	LE32TOH(q4)
-	LE32TOH(q5)
-	LE32TOH(q3)
-	LE32TOH(q7)
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [0:32) */

 	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
 	veor	q1, q1, q9

-	vld1.32	{q8-q9}, [fp, :256]	/* restore q8-q9 */
+	vld1.8	{q8-q9}, [fp, :256]	/* restore q8-q9 */

-	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
-	vld1.32	{q0}, [r4]	/* q0 := key[16:32) */
+	vst1.8	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
+	vld1.8	{q0}, [r4]	/* q0 := key[16:32) */
 	mov	r3, #0	/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
 	vmov	d2, r3, r6
 	vmov	d3, r8, r10
@@ -563,57 +494,48 @@ 2:	subs	ip, ip, #2
 	vadd.u32	q11, q11, q1
 	vadd.u32	q15, q15, q1

-	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */
-
-	LE32TOH(q8)
-	LE32TOH(q9)
-	LE32TOH(q10)
-	LE32TOH(q11)
-	LE32TOH(q12)
-	LE32TOH(q13)
-	LE32TOH(q14)
-	LE32TOH(q15)
+	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */

 	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
 	veor	q1, q1, q9

-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
-	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
-	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
+	vst1.8	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
+	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */

 	veor	q2, q2, q8	/* compute ciphertext bytes [64:96) */
 	veor	q3, q3, q9

-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
-	vst1.32	{q2-q3}, [r0]!	/* store ciphertext bytes [64:80) */
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
+	vst1.8	{q2-q3}, [r0]!	/* store ciphertext bytes [64:80) */

 	veor	q10, q10, q0	/* compute ciphertext bytes [96:128) */
 	veor	q11, q11, q1

-	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
-	vst1.32	{q10-q11}, [r0]!	/* store ciphertext bytes [80:96) */
+	vld1.8	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
+	vst1.8	{q10-q11}, [r0]!	/* store ciphertext bytes [80:96) */

 	veor	q4, q4, q8	/* compute ciphertext bytes [128:160) */
 	veor	q5, q5, q9

-	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
-	vst1.32	{q4-q5}, [r0]!	/* store ciphertext bytes [96:112) */
+	vld1.8	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
+	vst1.8	{q4-q5}, [r0]!	/* store ciphertext bytes [96:112) */

 	veor	q12, q12, q0	/* compute ciphertext bytes [160:192) */
 	veor	q13, q13, q1

-	vld1.32	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
-	vst1.32	{q12-q13}, [r0]!	/* store ciphertext bytes [112:128) */
+	vld1.8	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
+	vst1.8	{q12-q13}, [r0]!	/* store ciphertext bytes [112:128) */

 	veor	q6, q6, q8	/* compute ciphertext bytes [192:224) */
 	veor	q7, q7, q9

-	vst1.32	{q6-q7}, [r0]!	/* store ciphertext bytes [192:224) */
+	vst1.8	{q6-q7}, [r0]!	/* store ciphertext bytes [192:224) */

 	veor	q14, q14, q0	/* compute ciphertext bytes [224:256) */
 	veor	q15, q15, q1

-	vst1.32	{q14-q15}, [r0]	/* store ciphertext bytes [224:256) */
+	vst1.8	{q14-q15}, [r0]	/* store ciphertext bytes [224:256) */

 	/* zero temporary space on the stack */
 	vmov.i32	q0, #0
@@ -632,10 +554,10 @@ END(chacha_stream_xor256_neon)

 	.type	v0123,%object
 v0123:
-	.long	0, 1, 2, 3
+	.byte	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
 END(v0123)

 	.type	rot8,%object
 rot8:
-	.long	0x02010003, 0x06050407
+	.byte	3,0,1,2, 7,4,5,6
 END(rot8)
diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_64.S
--- a/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Wed Aug 05 19:42:16 2020 +0000
@@ -169,22 +169,22 @@ ENTRY(chacha_stream256_neon)
 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

-	HTOLE32(v0.16b)
-	HTOLE32(v1.16b)
-	HTOLE32(v2.16b)
-	HTOLE32(v3.16b)
-	HTOLE32(v4.16b)
-	HTOLE32(v5.16b)
-	HTOLE32(v6.16b)
-	HTOLE32(v7.16b)
-	HTOLE32(v8.16b)
-	HTOLE32(v9.16b)
-	HTOLE32(v10.16b)
-	HTOLE32(v11.16b)
-	HTOLE32(v12.16b)
-	HTOLE32(v13.16b)
-	HTOLE32(v14.16b)
-	HTOLE32(v15.16b)
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	/* LE32TOH(v12.16b) -- blkno, already host order */
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)

 	mov	v16.16b, v0.16b
 	mov	v17.16b, v1.16b
@@ -234,22 +234,22 @@ 1:	subs	w5, w5, #2
 	add	v14.4s, v14.4s, v30.4s
 	add	v15.4s, v15.4s, v31.4s

-	LE32TOH(v0.16b)
-	LE32TOH(v1.16b)
-	LE32TOH(v2.16b)
-	LE32TOH(v3.16b)
-	LE32TOH(v4.16b)
-	LE32TOH(v5.16b)
-	LE32TOH(v6.16b)
-	LE32TOH(v7.16b)
-	LE32TOH(v8.16b)
-	LE32TOH(v9.16b)
-	LE32TOH(v10.16b)
-	LE32TOH(v11.16b)
-	LE32TOH(v12.16b)
-	LE32TOH(v13.16b)
-	LE32TOH(v14.16b)
-	LE32TOH(v15.16b)
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)

 	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
 	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
@@ -308,22 +308,22 @@ ENTRY(chacha_stream_xor256_neon)
 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

-	HTOLE32(v0.16b)
-	HTOLE32(v1.16b)
-	HTOLE32(v2.16b)
-	HTOLE32(v3.16b)
-	HTOLE32(v4.16b)
-	HTOLE32(v5.16b)
-	HTOLE32(v6.16b)
-	HTOLE32(v7.16b)
-	HTOLE32(v8.16b)
-	HTOLE32(v9.16b)
-	HTOLE32(v10.16b)
-	HTOLE32(v11.16b)
-	HTOLE32(v12.16b)
-	HTOLE32(v13.16b)
-	HTOLE32(v14.16b)
-	HTOLE32(v15.16b)
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	/* LE32TOH(v12.16b) -- blkno, already host order */
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)

 	mov	v16.16b, v0.16b
 	mov	v17.16b, v1.16b
@@ -401,22 +401,22 @@ 1:	subs	w6, w6, #2
 	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
 	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

-	LE32TOH(v0.16b)
-	LE32TOH(v1.16b)
-	LE32TOH(v2.16b)
-	LE32TOH(v3.16b)
-	LE32TOH(v4.16b)
-	LE32TOH(v5.16b)
-	LE32TOH(v6.16b)
-	LE32TOH(v7.16b)
-	LE32TOH(v8.16b)
-	LE32TOH(v9.16b)
-	LE32TOH(v10.16b)
-	LE32TOH(v11.16b)
-	LE32TOH(v12.16b)
-	LE32TOH(v13.16b)
-	LE32TOH(v14.16b)
-	LE32TOH(v15.16b)
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)

 	eor	v16.16b, v16.16b, v0.16b
 	eor	v17.16b, v17.16b, v1.16b
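
For illustration only (not part of the patch): a minimal userland sketch of the byte-order convention the intrinsics side of this diff converges on for the AES-CCM counter word -- load and store blocks as bytes (vld1q_u8/vst1q_u8) and use vrev32q_u8 to move between the big-endian wire format and host-order 32-bit lanes, instead of wrapping element-sized loads in _BYTE_ORDER conditionals.  The helper names (ctr_inc_ref, ctr_inc_neon) are made up for the demo, and the lane numbering in the comments was only checked against a little-endian AArch64 host; treat it as a sketch, not as the kernel code.

/* demo.c -- hypothetical standalone test; build with: cc -O2 demo.c (AArch64) */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arm_neon.h>

/* Reference: increment the last (big-endian) 32-bit word of a CTR block. */
static void
ctr_inc_ref(uint8_t ctr[16])
{
	uint32_t w = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
	    ((uint32_t)ctr[14] << 8) | (uint32_t)ctr[15];

	w++;
	ctr[12] = w >> 24; ctr[13] = w >> 16; ctr[14] = w >> 8; ctr[15] = w;
}

/* NEON: the same operation via byte loads plus vrev32q_u8, as in the patch. */
static void
ctr_inc_neon(uint8_t ctr[16])
{
	const uint32x4_t inc = { 0, 0, 0, 1 };	/* bumps lane 3 = bytes 12..15 */
	uint8x16_t be = vld1q_u8(ctr);		/* bytes in memory order */
	uint32x4_t host = vreinterpretq_u32_u8(vrev32q_u8(be));

	host = vaddq_u32(host, inc);
	vst1q_u8(ctr, vrev32q_u8(vreinterpretq_u8_u32(host)));
}

int
main(void)
{
	uint8_t a[16], b[16];
	unsigned i;

	for (i = 0; i < 16; i++)	/* last byte ends at 0xff to exercise a carry */
		a[i] = b[i] = (uint8_t)(0xf0 + i);
	ctr_inc_ref(a);
	ctr_inc_neon(b);
	printf("%s\n", memcmp(a, b, 16) == 0 ? "match" : "MISMATCH");
	return 0;
}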