diff -r b9bd12fb9564 sys/crypto/aes/arch/arm/aes_neon_subr.c
--- a/sys/crypto/aes/arch/arm/aes_neon_subr.c	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_subr.c	Tue Aug 04 15:54:38 2020 +0000
@@ -57,6 +57,20 @@ storeblock(void *out, uint8x16_t block)
 	vst1q_u8(out, block);
 }
 
+#if _BYTE_ORDER == _LITTLE_ENDIAN
+#define	vbetoh32q_u8	vrev32q_u8
+#define	vhtobe32q_u8	vrev32q_u8
+#define	vletoh32q_u8(x)	(x)
+#define	vhtole32q_u8(x)	(x)
+#elif _BYTE_ORDER == _BIG_ENDIAN
+#define	vbetoh32q_u8(x)	(x)
+#define	vhtobe32q_u8(x)	(x)
+#define	vletoh32q_u8	vrev32q_u8
+#define	vhtole32q_u8	vrev32q_u8
+#else
+#error what kind of endian are you anyway
+#endif
+
 void
 aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16],
     uint8_t out[static 16], uint32_t nrounds)
@@ -148,13 +162,13 @@ aes_neon_xts_update(uint8x16_t t8)
 	int32x4_t t, t_;
 	uint32x4_t mask;
 
-	t = vreinterpretq_s32_u8(t8);
+	t = vreinterpretq_s32_u8(vletoh32q_u8(t8));
 	mask = vcltq_s32(t, zero);		/* -1 if high bit set else 0 */
 	mask = vextq_u32(mask, mask, 3);	/* rotate quarters */
 	t_ = vsliq_n_s32(zero, t, 1);		/* shift */
 	t_ ^= carry & mask;
 
-	return vreinterpretq_u8_s32(t_);
+	return vhtole32q_u8(vreinterpretq_u8_s32(t_));
 }
 
 static int
@@ -289,16 +303,6 @@ aes_neon_cbcmac_update1(const struct aes
  * function, which should substantially improve CCM throughput.
  */
 
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-#define	vbetoh32q_u8	vrev32q_u8
-#define	vhtobe32q_u8	vrev32q_u8
-#elif _BYTE_ORDER == _BIG_ENDIAN
-#define	vbetoh32q_u8(x)	(x)
-#define	vhtobe32q_u8(x)	(x)
-#else
-#error what kind of endian are you anyway
-#endif
-
 void
 aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_32.S
--- a/sys/crypto/chacha/arch/arm/chacha_neon_32.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon_32.S	Tue Aug 04 15:54:38 2020 +0000
@@ -96,7 +96,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2
 	vsri.u32	\b2, \c0, #(32 - 12)
 	vsri.u32	\b3, \c1, #(32 - 12)
 
-	vld1.8		{\c0l}, [r7, :64]	/* load rot8 table */
+	vld1.32		{\c0l}, [r7, :64]	/* load rot8 table */
 
 	/* a += b; d ^= a; d <<<= 8 */
 	vadd.u32	\a0, \a0, \b0
@@ -205,22 +205,22 @@ ENTRY(chacha_stream256_neon)
 	vdup.32		q14, r8
 	vdup.32		q15, r10
 
-	HTOLE32(q0)
-	HTOLE32(q1)
-	HTOLE32(q2)
-	HTOLE32(q3)
-	HTOLE32(q4)
-	HTOLE32(q5)
-	HTOLE32(q6)
-	HTOLE32(q7)
-	HTOLE32(q8)
-	HTOLE32(q9)
-	HTOLE32(q10)
-	HTOLE32(q11)
-	HTOLE32(q12)
-	HTOLE32(q13)
-	HTOLE32(q14)
-	HTOLE32(q15)
+	LE32TOH(q0)
+	LE32TOH(q1)
+	LE32TOH(q2)
+	LE32TOH(q3)
+	LE32TOH(q4)
+	LE32TOH(q5)
+	LE32TOH(q6)
+	LE32TOH(q7)
+	LE32TOH(q8)
+	LE32TOH(q9)
+	LE32TOH(q10)
+	LE32TOH(q11)
+	/* LE32TOH(q12) -- blkno, already host order */
+	LE32TOH(q13)
+	LE32TOH(q14)
+	LE32TOH(q15)
 
 	b	2f
 
@@ -320,6 +320,9 @@ 2:	subs r5, r5, #2
 	vswp	q1, q4
 	vswp	q3, q6
 
+	LE32TOH(q9)
+	LE32TOH(q8)
+
 	vadd.u32	q0, q0, q9
 	vadd.u32	q4, q4, q9
 	vadd.u32	q2, q2, q9
@@ -332,14 +335,14 @@ 2:	subs r5, r5, #2
 
 	vld1.32		{q8-q9}, [fp, :256]	/* restore q8-q9 */
 
-	LE32TOH(q0)
-	LE32TOH(q1)
-	LE32TOH(q2)
-	LE32TOH(q3)
-	LE32TOH(q4)
-	LE32TOH(q5)
-	LE32TOH(q6)
-	LE32TOH(q7)
+	HTOLE32(q0)
+	HTOLE32(q1)
+	HTOLE32(q2)
+	HTOLE32(q3)
+	HTOLE32(q4)
+	HTOLE32(q5)
+	HTOLE32(q6)
+	HTOLE32(q7)
 
 	vst1.32		{q0-q1}, [r0]!
 	vld1.32		{q0}, [r3]	/* q0 := key[16:32) */
@@ -360,6 +363,9 @@ 2:	subs r5, r5, #2
 	vswp	q9, q12
 	vswp	q11, q14
 
+	LE32TOH(q0)
+	LE32TOH(q1)
+
 	vadd.u32	q8, q8, q0
 	vadd.u32	q12, q12, q0
 	vadd.u32	q10, q10, q0
@@ -370,14 +376,14 @@ 2:	subs r5, r5, #2
 	vadd.u32	q11, q11, q1
 	vadd.u32	q15, q15, q1
 
-	LE32TOH(q8)
-	LE32TOH(q9)
-	LE32TOH(q10)
-	LE32TOH(q11)
-	LE32TOH(q12)
-	LE32TOH(q13)
-	LE32TOH(q14)
-	LE32TOH(q15)
+	HTOLE32(q8)
+	HTOLE32(q9)
+	HTOLE32(q10)
+	HTOLE32(q11)
+	HTOLE32(q12)
+	HTOLE32(q13)
+	HTOLE32(q14)
+	HTOLE32(q15)
 
 	/* vst1.32	{q0-q1}, [r0]! */
 	vst1.32		{q8-q9}, [r0]!
@@ -448,22 +454,22 @@ ENTRY(chacha_stream_xor256_neon)
 	vdup.32		q14, r8
 	vdup.32		q15, r10
 
-	HTOLE32(q0)
-	HTOLE32(q1)
-	HTOLE32(q2)
-	HTOLE32(q3)
-	HTOLE32(q4)
-	HTOLE32(q5)
-	HTOLE32(q6)
-	HTOLE32(q7)
-	HTOLE32(q8)
-	HTOLE32(q9)
-	HTOLE32(q10)
-	HTOLE32(q11)
-	HTOLE32(q12)
-	HTOLE32(q13)
-	HTOLE32(q14)
-	HTOLE32(q15)
+	LE32TOH(q0)
+	LE32TOH(q1)
+	LE32TOH(q2)
+	LE32TOH(q3)
+	LE32TOH(q4)
+	LE32TOH(q5)
+	LE32TOH(q6)
+	LE32TOH(q7)
+	LE32TOH(q8)
+	LE32TOH(q9)
+	LE32TOH(q10)
+	LE32TOH(q11)
+	/* LE32TOH(q12) -- already host order, block number */
+	LE32TOH(q13)
+	LE32TOH(q14)
+	LE32TOH(q15)
 
 	b	2f
 
@@ -508,6 +514,9 @@ 2:	subs ip, ip, #2
 	vswp	q1, q4
 	vswp	q3, q6
 
+	LE32TOH(q9)
+	LE32TOH(q8)
+
 	vadd.u32	q0, q0, q9
 	vadd.u32	q4, q4, q9
 	vadd.u32	q2, q2, q9
@@ -520,14 +529,17 @@ 2:	subs ip, ip, #2
 
 	vld1.32		{q8-q9}, [r1]!	/* load plaintext bytes [0:32) */
 
-	LE32TOH(q0)
-	LE32TOH(q1)
-	LE32TOH(q2)
-	LE32TOH(q6)
-	LE32TOH(q4)
-	LE32TOH(q5)
-	LE32TOH(q3)
-	LE32TOH(q7)
+	HTOLE32(q0)
+	HTOLE32(q1)
+	HTOLE32(q2)
+	HTOLE32(q6)
+	HTOLE32(q4)
+	HTOLE32(q5)
+	HTOLE32(q3)
+	HTOLE32(q7)
+
+	LE32TOH(q8)
+	LE32TOH(q9)
 
 	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
 	veor	q1, q1, q9
@@ -553,6 +565,9 @@ 2:	subs ip, ip, #2
 	vswp	q9, q12		/* free up q9 earlier for consecutive q8-q9 */
 	vswp	q11, q14
 
+	LE32TOH(q0)
+	LE32TOH(q1)
+
 	vadd.u32	q8, q8, q0
 	vadd.u32	q12, q12, q0
 	vadd.u32	q10, q10, q0
@@ -565,14 +580,14 @@ 2:	subs ip, ip, #2
 
 	vld1.32		{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */
 
-	LE32TOH(q8)
-	LE32TOH(q9)
-	LE32TOH(q10)
-	LE32TOH(q11)
-	LE32TOH(q12)
-	LE32TOH(q13)
-	LE32TOH(q14)
-	LE32TOH(q15)
+	HTOLE32(q8)
+	HTOLE32(q9)
+	HTOLE32(q10)
+	HTOLE32(q11)
+	HTOLE32(q12)
+	HTOLE32(q13)
+	HTOLE32(q14)
+	HTOLE32(q15)
 
 	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
 	veor	q1, q1, q9
diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_64.S
--- a/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Sun Aug 02 18:20:51 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Tue Aug 04 15:54:38 2020 +0000
@@ -169,22 +169,22 @@ ENTRY(chacha_stream256_neon)
 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
 
-	HTOLE32(v0.16b)
-	HTOLE32(v1.16b)
-	HTOLE32(v2.16b)
-	HTOLE32(v3.16b)
-	HTOLE32(v4.16b)
-	HTOLE32(v5.16b)
-	HTOLE32(v6.16b)
-	HTOLE32(v7.16b)
-	HTOLE32(v8.16b)
-	HTOLE32(v9.16b)
-	HTOLE32(v10.16b)
-	HTOLE32(v11.16b)
-	HTOLE32(v12.16b)
-	HTOLE32(v13.16b)
-	HTOLE32(v14.16b)
-	HTOLE32(v15.16b)
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	/* LE32TOH(v12.16b) -- blkno, already host order */
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)
 
 	mov	v16.16b, v0.16b
 	mov	v17.16b, v1.16b
@@ -234,22 +234,22 @@ 1:	subs w5, w5, #2
 	add	v14.4s, v14.4s, v30.4s
 	add	v15.4s, v15.4s, v31.4s
 
-	LE32TOH(v0.16b)
-	LE32TOH(v1.16b)
-	LE32TOH(v2.16b)
-	LE32TOH(v3.16b)
-	LE32TOH(v4.16b)
-	LE32TOH(v5.16b)
-	LE32TOH(v6.16b)
-	LE32TOH(v7.16b)
-	LE32TOH(v8.16b)
-	LE32TOH(v9.16b)
-	LE32TOH(v10.16b)
-	LE32TOH(v11.16b)
-	LE32TOH(v12.16b)
-	LE32TOH(v13.16b)
-	LE32TOH(v14.16b)
-	LE32TOH(v15.16b)
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)
 
 	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
 	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
@@ -308,22 +308,22 @@ ENTRY(chacha_stream_xor256_neon)
 	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
 	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
 
-	HTOLE32(v0.16b)
-	HTOLE32(v1.16b)
-	HTOLE32(v2.16b)
-	HTOLE32(v3.16b)
-	HTOLE32(v4.16b)
-	HTOLE32(v5.16b)
-	HTOLE32(v6.16b)
-	HTOLE32(v7.16b)
-	HTOLE32(v8.16b)
-	HTOLE32(v9.16b)
-	HTOLE32(v10.16b)
-	HTOLE32(v11.16b)
-	HTOLE32(v12.16b)
-	HTOLE32(v13.16b)
-	HTOLE32(v14.16b)
-	HTOLE32(v15.16b)
+	LE32TOH(v0.16b)
+	LE32TOH(v1.16b)
+	LE32TOH(v2.16b)
+	LE32TOH(v3.16b)
+	LE32TOH(v4.16b)
+	LE32TOH(v5.16b)
+	LE32TOH(v6.16b)
+	LE32TOH(v7.16b)
+	LE32TOH(v8.16b)
+	LE32TOH(v9.16b)
+	LE32TOH(v10.16b)
+	LE32TOH(v11.16b)
+	/* LE32TOH(v12.16b) -- blkno, already host order */
+	LE32TOH(v13.16b)
+	LE32TOH(v14.16b)
+	LE32TOH(v15.16b)
 
 	mov	v16.16b, v0.16b
 	mov	v17.16b, v1.16b
@@ -401,22 +401,22 @@ 1:	subs w6, w6, #2
 	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
 	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
 
-	LE32TOH(v0.16b)
-	LE32TOH(v1.16b)
-	LE32TOH(v2.16b)
-	LE32TOH(v3.16b)
-	LE32TOH(v4.16b)
-	LE32TOH(v5.16b)
-	LE32TOH(v6.16b)
-	LE32TOH(v7.16b)
-	LE32TOH(v8.16b)
-	LE32TOH(v9.16b)
-	LE32TOH(v10.16b)
-	LE32TOH(v11.16b)
-	LE32TOH(v12.16b)
-	LE32TOH(v13.16b)
-	LE32TOH(v14.16b)
-	LE32TOH(v15.16b)
+	HTOLE32(v0.16b)
+	HTOLE32(v1.16b)
+	HTOLE32(v2.16b)
+	HTOLE32(v3.16b)
+	HTOLE32(v4.16b)
+	HTOLE32(v5.16b)
+	HTOLE32(v6.16b)
+	HTOLE32(v7.16b)
+	HTOLE32(v8.16b)
+	HTOLE32(v9.16b)
+	HTOLE32(v10.16b)
+	HTOLE32(v11.16b)
+	HTOLE32(v12.16b)
+	HTOLE32(v13.16b)
+	HTOLE32(v14.16b)
+	HTOLE32(v15.16b)
 
 	eor	v16.16b, v16.16b, v0.16b
 	eor	v17.16b, v17.16b, v1.16b