# HG changeset patch # User Taylor R Campbell # Date 1763785115 0 # Sat Nov 22 04:18:35 2025 +0000 # Branch trunk # Node ID 8da4be48b8762a07f5504a2b60c33d89229d236f # Parent 03d10f218fce5630dbd63fdd135fec096db9f6ca # EXP-Topic riastradh-pr59774-aesbear64 aes(9): New aes_keysched_enc/dec. These implement the standard key schedule. They are named independently of any particular AES implementation, so that: (a) we can swap between the BearSSL aes_ct and aes_ct64 code without changing all the callers who don't care which one they get, and (b) we could push it into the aes_impl abstraction if we wanted. This eliminates all br_aes_* references outside aes_bear.c, aes_ct*.c, and the new aes_keysched.c wrappers. PR kern/59774: bearssl 32-bit AES is too slow, want 64-bit optimized version in kernel diff -r 03d10f218fce -r 8da4be48b876 sys/arch/x86/x86/via_padlock.c --- a/sys/arch/x86/x86/via_padlock.c Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/arch/x86/x86/via_padlock.c Sat Nov 22 04:18:35 2025 +0000 @@ -37,7 +37,7 @@ __KERNEL_RCSID(0, "$NetBSD: via_padlock. #include #include -#include +#include #include #include @@ -176,23 +176,28 @@ via_padlock_crypto_newsession(void *arg, switch (c->cri_klen) { case 128: - br_aes_ct_keysched_stdenc(ses->ses_ekey, + /* + * XXX Is this needed? For AES-128 the + * VIA padlock instructions usually + * compute the key schedule internally. + */ + aes_keysched_enc(ses->ses_ekey, c->cri_key, 16); - br_aes_ct_keysched_stddec(ses->ses_dkey, + aes_keysched_dec(ses->ses_dkey, c->cri_key, 16); cw0 = C3_CRYPT_CWLO_KEY128; break; case 192: - br_aes_ct_keysched_stdenc(ses->ses_ekey, + aes_keysched_enc(ses->ses_ekey, c->cri_key, 24); - br_aes_ct_keysched_stddec(ses->ses_dkey, + aes_keysched_dec(ses->ses_dkey, c->cri_key, 24); cw0 = C3_CRYPT_CWLO_KEY192; break; case 256: - br_aes_ct_keysched_stdenc(ses->ses_ekey, + aes_keysched_enc(ses->ses_ekey, c->cri_key, 32); - br_aes_ct_keysched_stddec(ses->ses_dkey, + aes_keysched_dec(ses->ses_dkey, c->cri_key, 32); cw0 = C3_CRYPT_CWLO_KEY256; break; diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/aes.h --- a/sys/crypto/aes/aes.h Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/crypto/aes/aes.h Sat Nov 22 04:18:35 2025 +0000 @@ -35,7 +35,8 @@ /* * struct aes * - * Expanded round keys. + * Expanded round keys, in implementation-dependent format. (For + * the standard AES key schedule, see aes_keysched.h.) */ union aes { uint32_t aes_rk[60]; diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/aes_impl.c --- a/sys/crypto/aes/aes_impl.c Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/crypto/aes/aes_impl.c Sat Nov 22 04:18:35 2025 +0000 @@ -37,12 +37,13 @@ __KERNEL_RCSID(1, "$NetBSD: aes_impl.c,v #include #include +#include /* default implementation */ #include -#include /* default implementation */ #include +#include #include -static int aes_selftest_stdkeysched(void); +static int aes_keysched_selftest(void); static const struct aes_impl *aes_md_impl __read_mostly; static const struct aes_impl *aes_impl __read_mostly; @@ -101,7 +102,7 @@ aes_select(void) KASSERT(aes_impl == NULL); - if (aes_selftest_stdkeysched()) + if (aes_keysched_selftest()) panic("AES is busted"); if (aes_md_impl) { @@ -337,10 +338,13 @@ aes_ccm_dec1(const struct aesenc *enc, c } /* - * Known-answer self-tests for the standard key schedule. + * Known-answer self-tests for the standard key schedule, used by some + * drivers for hardware devices that compute AES encryption and + * decryption in hardware but rely on software to compute the standard + * key schedule. 
*/ static int -aes_selftest_stdkeysched(void) +aes_keysched_selftest(void) { static const uint8_t key[32] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, @@ -451,11 +455,11 @@ aes_selftest_stdkeysched(void) unsigned i; for (i = 0; i < __arraycount(C); i++) { - if (br_aes_ct_keysched_stdenc(rk, key, C[i].len) != C[i].nr) + if (aes_keysched_enc(rk, key, C[i].len) != C[i].nr) return -1; if (memcmp(rk, C[i].enc, 4*(C[i].nr + 1))) return -1; - if (br_aes_ct_keysched_stddec(rk, key, C[i].len) != C[i].nr) + if (aes_keysched_dec(rk, key, C[i].len) != C[i].nr) return -1; if (memcmp(rk, C[i].dec, 4*(C[i].nr + 1))) return -1; diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/aes_keysched.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_keysched.c Sat Nov 22 04:18:35 2025 +0000 @@ -0,0 +1,65 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include +#include + +/* + * aes_keysched_enc(rk, key, keybytes) + * + * Compute the standard AES encryption key schedule, expanding a + * 16-, 24-, or 32-byte key into 44, 52, or 60 32-bit round keys + * for encryption. Returns the number of rounds for the key of + * this length. + */ +u_int +aes_keysched_enc(uint32_t *rk, const void *key, size_t keybytes) +{ + + return br_aes_ct_keysched_stdenc(rk, key, keybytes); +} + +/* + * aes_keysched_dec(rk, key, keybytes) + * + * Compute the standard AES decryption key schedule, expanding a + * 16-, 24-, or 32-byte key into 44, 52, or 60 32-bit round keys + * and applying InvMixColumns for decryption. Returns the number + * of rounds for the key of this length. + */ +u_int +aes_keysched_dec(uint32_t *rk, const void *key, size_t keybytes) +{ + + return br_aes_ct_keysched_stddec(rk, key, keybytes); +} diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/aes_keysched.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_keysched.h Sat Nov 22 04:18:35 2025 +0000 @@ -0,0 +1,37 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_AES_KEYSCHED_H +#define _CRYPTO_AES_AES_KEYSCHED_H + +#include + +u_int aes_keysched_enc(uint32_t *, const void *, size_t); +u_int aes_keysched_dec(uint32_t *, const void *, size_t); + +#endif /* _CRYPTO_AES_AES_KEYSCHED_H */ diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/arch/x86/aes_via.c --- a/sys/crypto/aes/arch/x86/aes_via.c Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/crypto/aes/arch/x86/aes_via.c Sat Nov 22 04:18:35 2025 +0000 @@ -46,8 +46,8 @@ struct evcnt { uint64_t ev_count; }; #endif #include -#include #include +#include #ifdef _KERNEL #include @@ -107,6 +107,12 @@ aesvia_setenckey(struct aesenc *enc, con { size_t key_len; + /* + * For AES-128, VIA PadLock only needs the original key itself. + * + * For AES-192 and AES-256, VIA PadLock needs software to + * compute the standard AES key schedule. 
+ */ switch (nrounds) { case AES_128_NROUNDS: enc->aese_aes.aes_rk[0] = le32dec(key + 4*0); @@ -123,7 +129,7 @@ aesvia_setenckey(struct aesenc *enc, con default: panic("invalid AES nrounds: %u", nrounds); } - br_aes_ct_keysched_stdenc(enc->aese_aes.aes_rk, key, key_len); + aes_keysched_enc(enc->aese_aes.aes_rk, key, key_len); } static void @@ -147,7 +153,7 @@ aesvia_setdeckey(struct aesdec *dec, con default: panic("invalid AES nrounds: %u", nrounds); } - br_aes_ct_keysched_stddec(dec->aesd_aes.aes_rk, key, key_len); + aes_keysched_dec(dec->aesd_aes.aes_rk, key, key_len); } static inline void diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/files.aes --- a/sys/crypto/aes/files.aes Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/crypto/aes/files.aes Sat Nov 22 04:18:35 2025 +0000 @@ -9,4 +9,5 @@ file crypto/aes/aes_ct.c aes file crypto/aes/aes_ct_dec.c aes file crypto/aes/aes_ct_enc.c aes file crypto/aes/aes_impl.c aes +file crypto/aes/aes_keysched.c aes file crypto/aes/aes_selftest.c aes diff -r 03d10f218fce -r 8da4be48b876 tests/sys/crypto/aes/Makefile --- a/tests/sys/crypto/aes/Makefile Sun Oct 19 18:56:19 2025 +0000 +++ b/tests/sys/crypto/aes/Makefile Sat Nov 22 04:18:35 2025 +0000 @@ -17,6 +17,7 @@ SRCS.t_aes+= aes_bear.c SRCS.t_aes+= aes_ct.c SRCS.t_aes+= aes_ct_dec.c SRCS.t_aes+= aes_ct_enc.c +SRCS.t_aes+= aes_keysched.c SRCS.t_aes+= aes_selftest.c .if !empty(MACHINE_ARCH:Mearmv7*) || !empty(MACHINE_ARCH:Maarch64*) # HG changeset patch # User Taylor R Campbell # Date 1763789294 0 # Sat Nov 22 05:28:14 2025 +0000 # Branch trunk # Node ID cce15febbf047be806a40490ef2416f043f6db13 # Parent 8da4be48b8762a07f5504a2b60c33d89229d236f # EXP-Topic riastradh-pr59774-aesbear64 aes(9): New 64-bit bitsliced implementation. Derived from BearSSL's aes_ct64 code. Enable with `options AES_BEAR64'. Should be a reasonable default on all platforms with 64-bit integer registers. Caveat: uses about 1200 bytes of stack space. (Could approximately halve that, like the BearSSL aes_ct code, at some speed cost which I haven't measured -- by moving the br_aes_ct64_skey_expand logic into add_round_key in aes_ct64_enc/dec.c.) PR kern/59774: bearssl 32-bit AES is too slow, want 64-bit optimized version in kernel diff -r 8da4be48b876 -r cce15febbf04 sys/conf/copts.mk --- a/sys/conf/copts.mk Sat Nov 22 04:18:35 2025 +0000 +++ b/sys/conf/copts.mk Sat Nov 22 05:28:14 2025 +0000 @@ -39,6 +39,7 @@ COPTS.chacha_neon.c+= -flax-vector-conve .endif .if ${MACHINE_ARCH} == "x86_64" || ${MACHINE_ARCH} == "i386" +COPTS.aes_bear64.c+= ${CC_WNO_ARRAY_BOUNDS} ${CC_WNO_STRINGOP_OVERFLOW} COPTS.aes_ni.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} COPTS.aes_sse2_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ssse3_subr.c+=${CC_WNO_ARRAY_BOUNDS} diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_bear64.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_bear64.c Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,933 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#include +#include +#define KASSERT assert +#define panic(fmt, args...) err(1, fmt, args) +#endif + +#include +#include +#include + +static void +aesbear64_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds) +{ + size_t key_len; + + switch (nrounds) { + case 10: + key_len = 16; + break; + case 12: + key_len = 24; + break; + case 14: + key_len = 32; + break; + default: + panic("invalid AES nrounds: %u", nrounds); + } + + br_aes_ct64_keysched(rk, key, key_len); +} + +static void +aesbear64_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds) +{ + + aesbear64_setkey(enc->aese_aes.aes_rk64, key, nrounds); +} + +static void +aesbear64_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds) +{ + + /* + * BearSSL computes InvMixColumns on the fly -- no need for + * distinct decryption round keys. + */ + aesbear64_setkey(dec->aesd_aes.aes_rk64, key, nrounds); +} + +static void +aesbear64_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Load input block interleaved with garbage blocks. */ + w[0] = le32dec(in + 4*0); + w[1] = le32dec(in + 4*1); + w[2] = le32dec(in + 4*2); + w[3] = le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + /* Transform to bitslice, encrypt, transform from bitslice. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store output block. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + le32enc(out + 4*0, w[0]); + le32enc(out + 4*1, w[1]); + le32enc(out + 4*2, w[2]); + le32enc(out + 4*3, w[3]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); + + /* Load input block interleaved with garbage blocks. 
*/ + w[0] = le32dec(in + 4*0); + w[1] = le32dec(in + 4*1); + w[2] = le32dec(in + 4*2); + w[3] = le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + /* Transform to bitslice, decrypt, transform from bitslice. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store output block. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + le32enc(out + 4*0, w[0]); + le32enc(out + 4*1, w[1]); + le32enc(out + 4*2, w[2]); + le32enc(out + 4*3, w[3]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t cv0, cv1, cv2, cv3; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Initialize garbage blocks. */ + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + /* Load IV. */ + cv0 = le32dec(iv + 4*0); + cv1 = le32dec(iv + 4*1); + cv2 = le32dec(iv + 4*2); + cv3 = le32dec(iv + 4*3); + + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + /* Load input block and apply CV. */ + w[0] = cv0 ^ le32dec(in + 4*0); + w[1] = cv1 ^ le32dec(in + 4*1); + w[2] = cv2 ^ le32dec(in + 4*2); + w[3] = cv3 ^ le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + /* Transform to bitslice, encrypt, transform from bitslice. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Remember ciphertext as CV and store output block. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + cv0 = w[0]; + cv1 = w[1]; + cv2 = w[2]; + cv3 = w[3]; + le32enc(out + 4*0, cv0); + le32enc(out + 4*1, cv1); + le32enc(out + 4*2, cv2); + le32enc(out + 4*3, cv3); + } + + /* Store updated IV. */ + le32enc(iv + 4*0, cv0); + le32enc(iv + 4*1, cv1); + le32enc(iv + 4*2, cv2); + le32enc(iv + 4*3, cv3); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t cv0, cv1, cv2, cv3, iv0, iv1, iv2, iv3; + unsigned i; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); + + /* Load the IV. */ + iv0 = le32dec(iv + 4*0); + iv1 = le32dec(iv + 4*1); + iv2 = le32dec(iv + 4*2); + iv3 = le32dec(iv + 4*3); + + /* Load the last cipher block. */ + cv0 = le32dec(in + nbytes - 16 + 4*0); + cv1 = le32dec(in + nbytes - 16 + 4*1); + cv2 = le32dec(in + nbytes - 16 + 4*2); + cv3 = le32dec(in + nbytes - 16 + 4*3); + + /* Store the updated IV. */ + le32enc(iv + 4*0, cv0); + le32enc(iv + 4*1, cv1); + le32enc(iv + 4*2, cv2); + le32enc(iv + 4*3, cv3); + + /* Handle the last cipher block separately if odd number. 
*/ + if (nbytes % 64) { + unsigned n = (nbytes % 64)/16; + + KASSERT(n == 1 || n == 2 || n == 3); + + for (i = 4; i --> n;) + q[i] = q[4 + i] = 0; + KASSERT(i == n - 1); + w[0] = cv0; /* le32dec(in + nbytes - 16*n + 16*i + 4*0) */ + w[1] = cv1; /* le32dec(in + nbytes - 16*n + 16*i + 4*1) */ + w[2] = cv2; /* le32dec(in + nbytes - 16*n + 16*i + 4*2) */ + w[3] = cv3; /* le32dec(in + nbytes - 16*n + 16*i + 4*3) */ + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + while (i --> 0) { + w[0] = le32dec(in + nbytes - 16*n + 16*i + 4*0); + w[1] = le32dec(in + nbytes - 16*n + 16*i + 4*1); + w[2] = le32dec(in + nbytes - 16*n + 16*i + 4*2); + w[3] = le32dec(in + nbytes - 16*n + 16*i + 4*3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + + /* Decrypt. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + for (i = n; i --> 1;) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + cv0 = le32dec(in + nbytes - 16*n + 16*(i - 1) + 4*0); + cv1 = le32dec(in + nbytes - 16*n + 16*(i - 1) + 4*1); + cv2 = le32dec(in + nbytes - 16*n + 16*(i - 1) + 4*2); + cv3 = le32dec(in + nbytes - 16*n + 16*(i - 1) + 4*3); + le32enc(out + nbytes - 16*n + 16*i + 4*0, w[0] ^ cv0); + le32enc(out + nbytes - 16*n + 16*i + 4*1, w[1] ^ cv1); + le32enc(out + nbytes - 16*n + 16*i + 4*2, w[2] ^ cv2); + le32enc(out + nbytes - 16*n + 16*i + 4*3, w[3] ^ cv3); + } + br_aes_ct64_interleave_out(w, q[0], q[4]); + + /* If this was the only cipher block, we're done. */ + nbytes -= nbytes % 64; + if (nbytes == 0) + goto out; + + /* + * Otherwise, load up the previous cipher block, and + * store the output block. + */ + cv0 = le32dec(in + nbytes - 16 + 4*0); + cv1 = le32dec(in + nbytes - 16 + 4*1); + cv2 = le32dec(in + nbytes - 16 + 4*2); + cv3 = le32dec(in + nbytes - 16 + 4*3); + le32enc(out + nbytes + 4*0, cv0 ^ w[0]); + le32enc(out + nbytes + 4*1, cv1 ^ w[1]); + le32enc(out + nbytes + 4*2, cv2 ^ w[2]); + le32enc(out + nbytes + 4*3, cv3 ^ w[3]); + } + + for (;;) { + KASSERT(nbytes >= 64); + + /* Load the input blocks. */ + w[0] = cv0; /* le32dec(in + nbytes - 64 + 16*i + 4*0) */ + w[1] = cv1; /* le32dec(in + nbytes - 64 + 16*i + 4*1) */ + w[2] = cv2; /* le32dec(in + nbytes - 64 + 16*i + 4*2) */ + w[3] = cv3; /* le32dec(in + nbytes - 64 + 16*i + 4*3) */ + br_aes_ct64_interleave_in(&q[3], &q[7], w); + for (i = 3; i --> 0;) { + w[0] = le32dec(in + nbytes - 64 + 16*i + 4*0); + w[1] = le32dec(in + nbytes - 64 + 16*i + 4*1); + w[2] = le32dec(in + nbytes - 64 + 16*i + 4*2); + w[3] = le32dec(in + nbytes - 64 + 16*i + 4*3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + + /* Decrypt. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store the upper output blocks. */ + for (i = 4; i --> 1;) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + cv0 = le32dec(in + nbytes - 64 + 16*(i - 1) + 4*0); + cv1 = le32dec(in + nbytes - 64 + 16*(i - 1) + 4*1); + cv2 = le32dec(in + nbytes - 64 + 16*(i - 1) + 4*2); + cv3 = le32dec(in + nbytes - 64 + 16*(i - 1) + 4*3); + le32enc(out + nbytes - 64 + 16*i + 4*0, w[0] ^ cv0); + le32enc(out + nbytes - 64 + 16*i + 4*1, w[1] ^ cv1); + le32enc(out + nbytes - 64 + 16*i + 4*2, w[2] ^ cv2); + le32enc(out + nbytes - 64 + 16*i + 4*3, w[3] ^ cv3); + } + + /* Prepare the first output block. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + + /* Stop if we've reached the first output block. 
*/ + nbytes -= 64; + if (nbytes == 0) + goto out; + + /* + * Load the preceding cipher block, and apply it as the + * chaining value to this one. + */ + cv0 = le32dec(in + nbytes - 16 + 4*0); + cv1 = le32dec(in + nbytes - 16 + 4*1); + cv2 = le32dec(in + nbytes - 16 + 4*2); + cv3 = le32dec(in + nbytes - 16 + 4*3); + le32enc(out + nbytes + 4*0, w[0] ^ cv0); + le32enc(out + nbytes + 4*1, w[1] ^ cv1); + le32enc(out + nbytes + 4*2, w[2] ^ cv2); + le32enc(out + nbytes + 4*3, w[3] ^ cv3); + } + +out: /* Store the first output block. */ + le32enc(out + 4*0, w[0] ^ iv0); + le32enc(out + 4*1, w[1] ^ iv1); + le32enc(out + 4*2, w[2] ^ iv2); + le32enc(out + 4*3, w[3] ^ iv3); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static inline void +aesbear64_xts_update(uint32_t *t0, uint32_t *t1, uint32_t *t2, uint32_t *t3) +{ + uint32_t s0, s1, s2, s3; + + s0 = *t0 >> 31; + s1 = *t1 >> 31; + s2 = *t2 >> 31; + s3 = *t3 >> 31; + *t0 = (*t0 << 1) ^ (-s3 & 0x87); + *t1 = (*t1 << 1) ^ s0; + *t2 = (*t2 << 1) ^ s1; + *t3 = (*t3 << 1) ^ s2; +} + +static int +aesbear64_xts_update_selftest(void) +{ + static const struct { + uint32_t in[4], out[4]; + } cases[] = { + { {1}, {2} }, + { {0x80000000U,0,0,0}, {0,1,0,0} }, + { {0,0x80000000U,0,0}, {0,0,1,0} }, + { {0,0,0x80000000U,0}, {0,0,0,1} }, + { {0,0,0,0x80000000U}, {0x87,0,0,0} }, + { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, + }; + unsigned i; + uint32_t t0, t1, t2, t3; + + for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { + t0 = cases[i].in[0]; + t1 = cases[i].in[1]; + t2 = cases[i].in[2]; + t3 = cases[i].in[3]; + aesbear64_xts_update(&t0, &t1, &t2, &t3); + if (t0 != cases[i].out[0] || + t1 != cases[i].out[1] || + t2 != cases[i].out[2] || + t3 != cases[i].out[3]) + return -1; + } + + /* Success! */ + return 0; +} + +static void +aesbear64_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t t0, t1, t2, t3, u0, u1, u2, u3; + unsigned i; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Load tweak. */ + t0 = le32dec(tweak + 4*0); + t1 = le32dec(tweak + 4*1); + t2 = le32dec(tweak + 4*2); + t3 = le32dec(tweak + 4*3); + + /* Handle the first blocks separately if odd number. */ + if (nbytes % 64) { + unsigned n = (nbytes % 64)/16; + + /* Load up the first blocks and garbage. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < n; i++) { + w[0] = le32dec(in + 16*i + 4*0) ^ u0; + w[1] = le32dec(in + 16*i + 4*1) ^ u1; + w[2] = le32dec(in + 16*i + 4*2) ^ u2; + w[3] = le32dec(in + 16*i + 4*3) ^ u3; + aesbear64_xts_update(&u0, &u1, &u2, &u3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + for (; i < 4; i++) + q[i] = q[4 + i] = 0; + + /* Encrypt up to three blocks. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store up to three blocks. 
*/ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < n; i++) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + le32enc(out + 16*i + 4*0, w[0] ^ u0); + le32enc(out + 16*i + 4*1, w[1] ^ u1); + le32enc(out + 16*i + 4*2, w[2] ^ u2); + le32enc(out + 16*i + 4*3, w[3] ^ u3); + aesbear64_xts_update(&u0, &u1, &u2, &u3); + } + + /* Advance to the next block. */ + t0 = u0, t1 = u1, t2 = u2, t3 = u3; + if ((nbytes -= 16*n) == 0) + goto out; + in += 16*n; + out += 16*n; + } + + do { + KASSERT(nbytes >= 64); + + /* Load four blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < 4; i++) { + w[0] = le32dec(in + 16*i + 4*0) ^ u0; + w[1] = le32dec(in + 16*i + 4*1) ^ u1; + w[2] = le32dec(in + 16*i + 4*2) ^ u2; + w[3] = le32dec(in + 16*i + 4*3) ^ u3; + aesbear64_xts_update(&u0, &u1, &u2, &u3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + + /* Encrypt four blocks. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store four blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < 4; i++) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + le32enc(out + 16*i + 4*0, w[0] ^ u0); + le32enc(out + 16*i + 4*1, w[1] ^ u1); + le32enc(out + 16*i + 4*2, w[2] ^ u2); + le32enc(out + 16*i + 4*3, w[3] ^ u3); + aesbear64_xts_update(&u0, &u1, &u2, &u3); + } + + /* Advance to the next pair of blocks. */ + t0 = u0, t1 = u1, t2 = u2, t3 = u3; + in += 64; + out += 64; + } while (nbytes -= 64, nbytes); + +out: /* Store the updated tweak. */ + le32enc(tweak + 4*0, t0); + le32enc(tweak + 4*1, t1); + le32enc(tweak + 4*2, t2); + le32enc(tweak + 4*3, t3); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t t0, t1, t2, t3, u0, u1, u2, u3; + unsigned i; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); + + /* Load tweak. */ + t0 = le32dec(tweak + 4*0); + t1 = le32dec(tweak + 4*1); + t2 = le32dec(tweak + 4*2); + t3 = le32dec(tweak + 4*3); + + /* Handle the first blocks separately if odd number. */ + if (nbytes % 64) { + unsigned n = (nbytes % 64)/16; + + /* Load up the first blocks and garbage. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < n; i++) { + w[0] = le32dec(in + 16*i + 4*0) ^ u0; + w[1] = le32dec(in + 16*i + 4*1) ^ u1; + w[2] = le32dec(in + 16*i + 4*2) ^ u2; + w[3] = le32dec(in + 16*i + 4*3) ^ u3; + aesbear64_xts_update(&u0, &u1, &u2, &u3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + for (; i < 4; i++) + q[i] = q[4 + i] = 0; + + /* Decrypt up to three blocks. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store up to three blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < n; i++) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + le32enc(out + 16*i + 4*0, w[0] ^ u0); + le32enc(out + 16*i + 4*1, w[1] ^ u1); + le32enc(out + 16*i + 4*2, w[2] ^ u2); + le32enc(out + 16*i + 4*3, w[3] ^ u3); + aesbear64_xts_update(&u0, &u1, &u2, &u3); + } + + /* Advance to the next block. 
*/ + t0 = u0, t1 = u1, t2 = u2, t3 = u3; + if ((nbytes -= 16*n) == 0) + goto out; + in += 16*n; + out += 16*n; + } + + do { + KASSERT(nbytes >= 64); + + /* Load four blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < 4; i++) { + w[0] = le32dec(in + 16*i + 4*0) ^ u0; + w[1] = le32dec(in + 16*i + 4*1) ^ u1; + w[2] = le32dec(in + 16*i + 4*2) ^ u2; + w[3] = le32dec(in + 16*i + 4*3) ^ u3; + aesbear64_xts_update(&u0, &u1, &u2, &u3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + + /* Decrypt four blocks. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store four blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < 4; i++) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + le32enc(out + 16*i + 4*0, w[0] ^ u0); + le32enc(out + 16*i + 4*1, w[1] ^ u1); + le32enc(out + 16*i + 4*2, w[2] ^ u2); + le32enc(out + 16*i + 4*3, w[3] ^ u3); + aesbear64_xts_update(&u0, &u1, &u2, &u3); + } + + /* Advance to the next pair of blocks. */ + t0 = u0, t1 = u1, t2 = u2, t3 = u3; + in += 64; + out += 64; + } while (nbytes -= 64, nbytes); + +out: /* Store the updated tweak. */ + le32enc(tweak + 4*0, t0); + le32enc(tweak + 4*1, t1); + le32enc(tweak + 4*2, t2); + le32enc(tweak + 4*3, t3); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], + size_t nbytes, uint8_t auth[static 16], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Initialize garbage blocks. */ + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + /* Load initial authenticator. */ + w[0] = le32dec(auth + 4*0); + w[1] = le32dec(auth + 4*1); + w[2] = le32dec(auth + 4*2); + w[3] = le32dec(auth + 4*3); + + for (; nbytes; nbytes -= 16, in += 16) { + /* Combine input block. */ + w[0] ^= le32dec(in + 4*0); + w[1] ^= le32dec(in + 4*1); + w[2] ^= le32dec(in + 4*2); + w[3] ^= le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + /* Transform to bitslice, encrypt, transform from bitslice. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + br_aes_ct64_interleave_out(w, q[0], q[4]); + } + + /* Store updated authenticator. */ + le32enc(auth + 4*0, w[0]); + le32enc(auth + 4*1, w[1]); + le32enc(auth + 4*2, w[2]); + le32enc(auth + 4*3, w[3]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_ccm_enc1(const struct aesenc *enc, const uint8_t *in, uint8_t *out, + size_t nbytes, uint8_t authctr[32], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t c0, c1, c2, c3be; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Initialize garbage blocks. */ + q[2] = q[3] = 0; + q[6] = q[7] = 0; + + /* Set first block to authenticator. */ + w[0] = le32dec(authctr + 4*0); + w[1] = le32dec(authctr + 4*1); + w[2] = le32dec(authctr + 4*2); + w[3] = le32dec(authctr + 4*3); + + /* Load initial counter block, big-endian so we can increment it. 
*/ + c0 = le32dec(authctr + 16 + 4*0); + c1 = le32dec(authctr + 16 + 4*1); + c2 = le32dec(authctr + 16 + 4*2); + c3be = bswap32(le32dec(authctr + 16 + 4*3)); + + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + /* Update authenticator. */ + w[0] ^= le32dec(in + 4*0); + w[1] ^= le32dec(in + 4*1); + w[2] ^= le32dec(in + 4*2); + w[3] ^= le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + /* Increment 32-bit counter. */ + w[0] = c0; + w[1] = c1; + w[2] = c2; + w[3] = bswap32(++c3be); + br_aes_ct64_interleave_in(&q[1], &q[5], w); + + /* Encrypt authenticator and counter. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Encrypt with CTR output. */ + br_aes_ct64_interleave_out(w, q[1], q[5]); + le32enc(out + 4*0, le32dec(in + 4*0) ^ w[0]); + le32enc(out + 4*1, le32dec(in + 4*1) ^ w[1]); + le32enc(out + 4*2, le32dec(in + 4*2) ^ w[2]); + le32enc(out + 4*3, le32dec(in + 4*3) ^ w[3]); + + /* Fish out the authenticator so far. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + } + + /* Update authenticator. */ + le32enc(authctr + 4*0, w[0]); + le32enc(authctr + 4*1, w[1]); + le32enc(authctr + 4*2, w[2]); + le32enc(authctr + 4*3, w[3]); + + /* Update counter. */ + le32enc(authctr + 16 + 4*3, bswap32(c3be)); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_ccm_dec1(const struct aesenc *enc, const uint8_t *in, uint8_t *out, + size_t nbytes, uint8_t authctr[32], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t c0, c1, c2, c3be; + uint32_t b0, b1, b2, b3; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Initialize garbage blocks. */ + q[2] = q[3] = 0; + q[6] = q[7] = 0; + + /* Load initial counter block, big-endian so we can increment it. */ + c0 = le32dec(authctr + 16 + 4*0); + c1 = le32dec(authctr + 16 + 4*1); + c2 = le32dec(authctr + 16 + 4*2); + c3be = bswap32(le32dec(authctr + 16 + 4*3)); + + /* Increment 32-bit counter. */ + w[0] = c0; + w[1] = c1; + w[2] = c2; + w[3] = bswap32(++c3be); + br_aes_ct64_interleave_in(&q[1], &q[5], w); + + /* + * Set the other block to garbage -- we don't have any + * plaintext to authenticate yet. + */ + q[0] = q[4] = 0; + + /* Encrypt first CTR. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Load the initial authenticator. */ + w[0] = le32dec(authctr + 4*0); + w[1] = le32dec(authctr + 4*1); + w[2] = le32dec(authctr + 4*2); + w[3] = le32dec(authctr + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + for (;; in += 16, out += 16) { + /* Decrypt the block. */ + br_aes_ct64_interleave_out(w, q[1], q[5]); + b0 = le32dec(in + 4*0) ^ w[0]; + b1 = le32dec(in + 4*1) ^ w[1]; + b2 = le32dec(in + 4*2) ^ w[2]; + b3 = le32dec(in + 4*3) ^ w[3]; + + /* Update authenticator. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + w[0] ^= b0; + w[1] ^= b1; + w[2] ^= b2; + w[3] ^= b3; + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + /* Store plaintext. */ + le32enc(out + 4*0, b0); + le32enc(out + 4*1, b1); + le32enc(out + 4*2, b2); + le32enc(out + 4*3, b3); + + /* If this is the last block, stop. */ + if ((nbytes -= 16) == 0) + break; + + /* Increment 32-bit counter. 
*/ + w[0] = c0; + w[1] = c1; + w[2] = c2; + w[3] = bswap32(++c3be); + br_aes_ct64_interleave_in(&q[1], &q[5], w); + + /* Authenticate previous plaintext, encrypt next CTR. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + } + + /* + * Authenticate last plaintext. We're only doing this for the + * authenticator, not for the counter, so don't bother to + * initialize q[2*i]. (Even for the sake of sanitizers, + * they're already initialized to something by now.) + */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Update authenticator. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + le32enc(authctr + 4*0, w[0]); + le32enc(authctr + 4*1, w[1]); + le32enc(authctr + 4*2, w[2]); + le32enc(authctr + 4*3, w[3]); + + /* Update counter. */ + le32enc(authctr + 16 + 4*3, bswap32(c3be)); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static int +aesbear64_probe(void) +{ + + if (aesbear64_xts_update_selftest()) + return -1; + + /* XXX test br_aes_ct64_bitslice_decrypt */ + /* XXX test br_aes_ct64_bitslice_encrypt */ + /* XXX test br_aes_ct64_keysched */ + /* XXX test br_aes_ct64_ortho */ + /* XXX test br_aes_ct64_skey_expand */ + + return 0; +} + +struct aes_impl aes_bear64_impl = { + .ai_name = "BearSSL aes_ct64", + .ai_probe = aesbear64_probe, + .ai_setenckey = aesbear64_setenckey, + .ai_setdeckey = aesbear64_setdeckey, + .ai_enc = aesbear64_enc, + .ai_dec = aesbear64_dec, + .ai_cbc_enc = aesbear64_cbc_enc, + .ai_cbc_dec = aesbear64_cbc_dec, + .ai_xts_enc = aesbear64_xts_enc, + .ai_xts_dec = aesbear64_xts_dec, + .ai_cbcmac_update1 = aesbear64_cbcmac_update1, + .ai_ccm_enc1 = aesbear64_ccm_enc1, + .ai_ccm_dec1 = aesbear64_ccm_dec1, +}; diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_bear64.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_bear64.h Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,62 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _CRYPTO_AES_AES_BEAR64_H +#define _CRYPTO_AES_AES_BEAR64_H + +#include +#include + +#include + +#define br_dec32le le32dec +#define br_enc32le le32enc + +void br_aes_ct64_bitslice_Sbox(uint64_t[static 8]); +void br_aes_ct64_bitslice_invSbox(uint64_t[static 8]); +void br_aes_ct64_ortho(uint64_t[static 8]); +void br_aes_ct64_interleave_in(uint64_t[static 1], uint64_t[static 1], + const uint32_t[static 4]); +void br_aes_ct64_interleave_out(uint32_t[static 4], uint64_t, uint64_t); +u_int br_aes_ct64_keysched(uint64_t[static 30], const void *, size_t); +void br_aes_ct64_skey_expand(uint64_t[static 120], unsigned, + const uint64_t[static 30]); +void br_aes_ct64_bitslice_encrypt(unsigned, const uint64_t[static 120], + uint64_t[static 8]); +void br_aes_ct64_bitslice_decrypt(unsigned, const uint64_t[static 120], + uint64_t[static 8]); + +/* NetBSD additions */ + +void br_aes_ct64_inv_mix_columns(uint64_t[static 8]); +u_int br_aes_ct64_keysched_stdenc(uint32_t *, const void *, size_t); +u_int br_aes_ct64_keysched_stddec(uint32_t *, const void *, size_t); + +extern struct aes_impl aes_bear64_impl; + +#endif /* _CRYPTO_AES_AES_BEAR64_H */ diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_ct64.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_ct64.c Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,512 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + +#include + +static void +br_range_dec32le(uint32_t *p32, size_t nwords, const void *v) +{ + const uint8_t *p8 = v; + + while (nwords --> 0) { + uint32_t x0 = *p8++; + uint32_t x1 = *p8++; + uint32_t x2 = *p8++; + uint32_t x3 = *p8++; + + *p32++ = x0 | (x1 << 8) | (x2 << 16) | (x3 << 24); + } +} + +/* see inner.h */ +void +br_aes_ct64_bitslice_Sbox(uint64_t q[static 8]) +{ + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). 
+ */ + + uint64_t x0, x1, x2, x3, x4, x5, x6, x7; + uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint64_t y20, y21; + uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint64_t z10, z11, z12, z13, z14, z15, z16, z17; + uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint64_t t60, t61, t62, t63, t64, t65, t66, t67; + uint64_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. + */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
+ */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +/* see inner.h */ +void +br_aes_ct64_ortho(uint64_t q[static 8]) +{ +#define SWAPN(cl, ch, s, x, y) do { \ + uint64_t a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & (uint64_t)cl) | ((b & (uint64_t)cl) << (s)); \ + (y) = ((a & (uint64_t)ch) >> (s)) | (b & (uint64_t)ch); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +/* see inner.h */ +void +br_aes_ct64_interleave_in(uint64_t q0[static 1], uint64_t q1[static 1], + const uint32_t w[static 4]) +{ + uint64_t x0, x1, x2, x3; + + x0 = w[0]; + x1 = w[1]; + x2 = w[2]; + x3 = w[3]; + x0 |= (x0 << 16); + x1 |= (x1 << 16); + x2 |= (x2 << 16); + x3 |= (x3 << 16); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + x0 |= (x0 << 8); + x1 |= (x1 << 8); + x2 |= (x2 << 8); + x3 |= (x3 << 8); + x0 &= (uint64_t)0x00FF00FF00FF00FF; + x1 &= (uint64_t)0x00FF00FF00FF00FF; + x2 &= (uint64_t)0x00FF00FF00FF00FF; + x3 &= (uint64_t)0x00FF00FF00FF00FF; + *q0 = x0 | (x2 << 8); + *q1 = x1 | (x3 << 8); +} + +/* see inner.h */ +void +br_aes_ct64_interleave_out(uint32_t w[static 4], uint64_t q0, uint64_t q1) +{ + uint64_t x0, x1, x2, x3; + + x0 = q0 & (uint64_t)0x00FF00FF00FF00FF; + x1 = q1 & (uint64_t)0x00FF00FF00FF00FF; + x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x0 |= (x0 >> 8); + x1 |= (x1 >> 8); + x2 |= (x2 >> 8); + x3 |= (x3 >> 8); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16); + w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16); + w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16); + w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16); +} + +static const unsigned char Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t +sub_word(uint32_t x) +{ + uint64_t q[8]; + + memset(q, 0, sizeof q); + q[0] = x; + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_Sbox(q); + br_aes_ct64_ortho(q); + return (uint32_t)q[0]; +} + +/* see inner.h */ +unsigned +br_aes_ct64_keysched(uint64_t comp_skey[static 30], + const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[60]; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + 
break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + br_range_dec32le(skey, (key_len >> 2), key); + tmp = skey[(key_len >> 2) - 1]; + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + + for (i = 0, j = 0; i < nkf; i += 4, j += 2) { + uint64_t q[8]; + + br_aes_ct64_interleave_in(&q[0], &q[4], skey + i); + q[1] = q[0]; + q[2] = q[0]; + q[3] = q[0]; + q[5] = q[4]; + q[6] = q[4]; + q[7] = q[4]; + br_aes_ct64_ortho(q); + comp_skey[j + 0] = + (q[0] & (uint64_t)0x1111111111111111) + | (q[1] & (uint64_t)0x2222222222222222) + | (q[2] & (uint64_t)0x4444444444444444) + | (q[3] & (uint64_t)0x8888888888888888); + comp_skey[j + 1] = + (q[4] & (uint64_t)0x1111111111111111) + | (q[5] & (uint64_t)0x2222222222222222) + | (q[6] & (uint64_t)0x4444444444444444) + | (q[7] & (uint64_t)0x8888888888888888); + } + return num_rounds; +} + +/* see inner.h */ +void +br_aes_ct64_skey_expand(uint64_t skey[static 120], + unsigned num_rounds, const uint64_t comp_skey[static 30]) +{ + unsigned u, v, n; + + n = (num_rounds + 1) << 1; + for (u = 0, v = 0; u < n; u ++, v += 4) { + uint64_t x0, x1, x2, x3; + + x0 = x1 = x2 = x3 = comp_skey[u]; + x0 &= (uint64_t)0x1111111111111111; + x1 &= (uint64_t)0x2222222222222222; + x2 &= (uint64_t)0x4444444444444444; + x3 &= (uint64_t)0x8888888888888888; + x1 >>= 1; + x2 >>= 2; + x3 >>= 3; + skey[v + 0] = (x0 << 4) - x0; + skey[v + 1] = (x1 << 4) - x1; + skey[v + 2] = (x2 << 4) - x2; + skey[v + 3] = (x3 << 4) - x3; + } +} + +/* NetBSD additions, for computing the standard AES key schedule */ + +unsigned +br_aes_ct64_keysched_stdenc(uint32_t *skey, const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + tmp = 0; + for (i = 0; i < nk; i ++) { + tmp = br_dec32le((const unsigned char *)key + (i << 2)); + skey[i] = tmp; + } + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + return num_rounds; +} + +unsigned +br_aes_ct64_keysched_stddec(uint32_t *skey, const void *key, size_t key_len) +{ + uint32_t tkey[60]; + uint64_t q[8]; + unsigned num_rounds; + unsigned i; + + num_rounds = br_aes_ct64_keysched_stdenc(skey, key, key_len); + if (num_rounds == 0) + return 0; + + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + tkey[0] = skey[4*num_rounds + 0]; + tkey[1] = skey[4*num_rounds + 1]; + tkey[2] = skey[4*num_rounds + 2]; + tkey[3] = skey[4*num_rounds + 3]; + for (i = 1; i < num_rounds; i++) { + br_aes_ct64_interleave_in(&q[0], &q[4], skey + 4*i); + br_aes_ct64_ortho(q); + br_aes_ct64_inv_mix_columns(q); + br_aes_ct64_ortho(q); + br_aes_ct64_interleave_out(&tkey[4*(num_rounds - i)], + q[0], q[4]); + } + tkey[4*num_rounds + 0] = skey[0]; + tkey[4*num_rounds + 1] = skey[1]; + tkey[4*num_rounds + 2] = skey[2]; + tkey[4*num_rounds + 3] = skey[3]; + + memcpy(skey, tkey, 4*(num_rounds + 
1)*sizeof(uint32_t)); + explicit_memset(tkey, 0, 4*(num_rounds + 1)*sizeof(uint32_t)); + return num_rounds; +} diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_ct64_dec.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_ct64_dec.c Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,174 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include + +/* see inner.h */ +void +br_aes_ct64_bitslice_invSbox(uint64_t q[static 8]) +{ + /* + * See br_aes_ct_bitslice_invSbox(). This is the natural extension + * to 64-bit registers. + */ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; + + br_aes_ct64_bitslice_Sbox(q); + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; +} + +static void +add_round_key(uint64_t q[static 8], const uint64_t sk[static 8]) +{ + int i; + + for (i = 0; i < 8; i ++) { + q[i] ^= sk[i]; + } +} + +static void +inv_shift_rows(uint64_t q[static 8]) +{ + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x000000000FFF0000) << 4) + | ((x & (uint64_t)0x00000000F0000000) >> 12) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000F000000000000) << 12) + | ((x & (uint64_t)0xFFF0000000000000) >> 4); + } +} + +static inline uint64_t +rotr32(uint64_t x) +{ + return (x << 32) | (x >> 32); +} + +static void +inv_mix_columns(uint64_t q[static 8]) +{ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + 
r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr32(q0 ^ q5 ^ q6 ^ r0 ^ r5); + q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); + q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr32(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); + q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr32(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); + q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); + q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); + q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr32(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); + q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr32(q4 ^ q5 ^ q7 ^ r4 ^ r7); +} + +/* see inner.h */ +void +br_aes_ct64_bitslice_decrypt(unsigned num_rounds, + const uint64_t skey[static 120], uint64_t q[static 8]) +{ + unsigned u; + + add_round_key(q, skey + (num_rounds << 3)); + for (u = num_rounds - 1; u > 0; u --) { + inv_shift_rows(q); + br_aes_ct64_bitslice_invSbox(q); + add_round_key(q, skey + (u << 3)); + inv_mix_columns(q); + } + inv_shift_rows(q); + br_aes_ct64_bitslice_invSbox(q); + add_round_key(q, skey); +} + +/* NetBSD addition, for generating compatible decryption keys */ +void +br_aes_ct64_inv_mix_columns(uint64_t q[static 8]) +{ + + inv_mix_columns(q); +} diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_ct64_enc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_ct64_enc.c Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,122 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include + +static inline void +add_round_key(uint64_t q[static 8], const uint64_t sk[static 8]) +{ + q[0] ^= sk[0]; + q[1] ^= sk[1]; + q[2] ^= sk[2]; + q[3] ^= sk[3]; + q[4] ^= sk[4]; + q[5] ^= sk[5]; + q[6] ^= sk[6]; + q[7] ^= sk[7]; +} + +static inline void +shift_rows(uint64_t q[static 8]) +{ + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x00000000FFF00000) >> 4) + | ((x & (uint64_t)0x00000000000F0000) << 12) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0xF000000000000000) >> 12) + | ((x & (uint64_t)0x0FFF000000000000) << 4); + } +} + +static inline uint64_t +rotr32(uint64_t x) +{ + return (x << 32) | (x >> 32); +} + +static inline void +mix_columns(uint64_t q[static 8]) +{ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); +} + +/* see inner.h */ +void +br_aes_ct64_bitslice_encrypt(unsigned num_rounds, + const uint64_t skey[static 120], uint64_t q[static 8]) +{ + unsigned u; + + add_round_key(q, skey); + for (u = 1; u < num_rounds; u ++) { + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, skey + (u << 3)); + } + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, skey + (num_rounds << 3)); +} diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_impl.c --- a/sys/crypto/aes/aes_impl.c Sat Nov 22 04:18:35 2025 +0000 +++ b/sys/crypto/aes/aes_impl.c Sat Nov 22 05:28:14 2025 +0000 @@ -29,6 +29,10 @@ #include __KERNEL_RCSID(1, "$NetBSD: aes_impl.c,v 1.10 2022/11/05 17:36:33 jmcneill Exp $"); +#ifdef _KERNEL_OPT +#include "opt_aes.h" +#endif + #include #include #include @@ -37,12 +41,20 @@ __KERNEL_RCSID(1, "$NetBSD: aes_impl.c,v #include #include -#include /* default implementation */ #include #include #include #include +/* default implementation */ +#ifdef AES_BEAR64 +#include +static const struct aes_impl *aes_default_impl = &aes_bear64_impl; +#else +#include +static const struct aes_impl *aes_default_impl = &aes_bear_impl; +#endif + static int aes_keysched_selftest(void); static const struct aes_impl *aes_md_impl __read_mostly; @@ -113,11 +125,11 @@ aes_select(void) aes_impl = aes_md_impl; } if (aes_impl == NULL) { - if (aes_selftest(&aes_bear_impl)) + if (aes_selftest(aes_default_impl)) aprint_error("aes: self-test failed: %s\n", - aes_bear_impl.ai_name); + aes_default_impl->ai_name); else - aes_impl = &aes_bear_impl; + aes_impl = aes_default_impl; } if (aes_impl == NULL) panic("AES self-tests failed"); diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_keysched.c --- a/sys/crypto/aes/aes_keysched.c Sat Nov 22 04:18:35 2025 +0000 +++ b/sys/crypto/aes/aes_keysched.c Sat Nov 22 
05:28:14 2025 +0000 @@ -29,10 +29,19 @@ #include __KERNEL_RCSID(1, "$NetBSD$"); +#ifdef _KERNEL_OPT +#include "opt_aes.h" +#endif + #include +#include + +#ifdef AES_BEAR64 +#include +#else #include -#include +#endif /* * aes_keysched_enc(rk, key, keybytes) @@ -46,7 +55,11 @@ u_int aes_keysched_enc(uint32_t *rk, const void *key, size_t keybytes) { +#ifdef AES_BEAR64 + return br_aes_ct64_keysched_stdenc(rk, key, keybytes); +#else return br_aes_ct_keysched_stdenc(rk, key, keybytes); +#endif } /* @@ -61,5 +74,9 @@ u_int aes_keysched_dec(uint32_t *rk, const void *key, size_t keybytes) { +#ifdef AES_BEAR64 + return br_aes_ct64_keysched_stddec(rk, key, keybytes); +#else return br_aes_ct_keysched_stddec(rk, key, keybytes); +#endif } diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/files.aes --- a/sys/crypto/aes/files.aes Sat Nov 22 04:18:35 2025 +0000 +++ b/sys/crypto/aes/files.aes Sat Nov 22 05:28:14 2025 +0000 @@ -2,12 +2,18 @@ define aes -file crypto/aes/aes_bear.c aes +defflag opt_aes.h AES_BEAR64 + +file crypto/aes/aes_bear.c aes & !aes_bear64 +file crypto/aes/aes_bear64.c aes & aes_bear64 file crypto/aes/aes_ccm.c aes file crypto/aes/aes_ccm_mbuf.c aes -file crypto/aes/aes_ct.c aes -file crypto/aes/aes_ct_dec.c aes -file crypto/aes/aes_ct_enc.c aes +file crypto/aes/aes_ct.c aes & !aes_bear64 +file crypto/aes/aes_ct64.c aes & aes_bear64 +file crypto/aes/aes_ct64_dec.c aes & aes_bear64 +file crypto/aes/aes_ct64_enc.c aes & aes_bear64 +file crypto/aes/aes_ct_dec.c aes & !aes_bear64 +file crypto/aes/aes_ct_enc.c aes & !aes_bear64 file crypto/aes/aes_impl.c aes file crypto/aes/aes_keysched.c aes file crypto/aes/aes_selftest.c aes diff -r 8da4be48b876 -r cce15febbf04 tests/sys/crypto/aes/Makefile --- a/tests/sys/crypto/aes/Makefile Sat Nov 22 04:18:35 2025 +0000 +++ b/tests/sys/crypto/aes/Makefile Sat Nov 22 05:28:14 2025 +0000 @@ -17,6 +17,12 @@ SRCS.t_aes+= aes_bear.c SRCS.t_aes+= aes_ct.c SRCS.t_aes+= aes_ct_dec.c SRCS.t_aes+= aes_ct_enc.c + +SRCS.t_aes+= aes_bear64.c +SRCS.t_aes+= aes_ct64.c +SRCS.t_aes+= aes_ct64_dec.c +SRCS.t_aes+= aes_ct64_enc.c + SRCS.t_aes+= aes_keysched.c SRCS.t_aes+= aes_selftest.c @@ -85,6 +91,7 @@ WARNS= 5 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110878 COPTS.aes_armv8.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} COPTS.aes_bear.c+= ${CC_WNO_STRINGOP_OVERFLOW} ${CC_WNO_ARRAY_BOUNDS} +COPTS.aes_bear64.c+= ${CC_WNO_STRINGOP_OVERFLOW} ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_neon_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ni.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} diff -r 8da4be48b876 -r cce15febbf04 tests/sys/crypto/aes/t_aes.c --- a/tests/sys/crypto/aes/t_aes.c Sat Nov 22 04:18:35 2025 +0000 +++ b/tests/sys/crypto/aes/t_aes.c Sat Nov 22 05:28:14 2025 +0000 @@ -30,6 +30,7 @@ #include #include +#include #include #if defined(__i386__) || defined(__x86_64__) @@ -71,6 +72,28 @@ ATF_TC_BODY(aes_ct_selftest, tc) atf_tc_fail("BearSSL aes_ct self-test failed"); } +ATF_TC(aes_ct64_selftest); +ATF_TC_HEAD(aes_ct64_selftest, tc) +{ + + atf_tc_set_md_var(tc, "descr", "BearSSL aes_ct64 tests"); +} + +ATF_TC_BODY(aes_ct64_selftest, tc) +{ + + if (aes_bear64_impl.ai_probe()) { + /* + * aes_ct64 is the portable software fallback for LP64 + * platforms, so probe should never fail. 
+ */ + atf_tc_fail("BearSSL aes_ct probe64 failed"); + } + + if (aes_selftest(&aes_bear64_impl)) + atf_tc_fail("BearSSL aes_ct64 self-test failed"); +} + #define AES_SELFTEST(name, impl, descr) \ ATF_TC(name); \ ATF_TC_HEAD(name, tc) \ @@ -113,6 +136,7 @@ ATF_TP_ADD_TCS(tp) { ATF_TP_ADD_TC(tp, aes_ct_selftest); + ATF_TP_ADD_TC(tp, aes_ct64_selftest); #ifdef __aarch64__ ATF_TP_ADD_TC(tp, aes_armv8_selftest); # HG changeset patch # User Taylor R Campbell # Date 1763792551 0 # Sat Nov 22 06:22:31 2025 +0000 # Branch trunk # Node ID 4ef13977846541a224c93d56f4044dadb35ef919 # Parent cce15febbf047be806a40490ef2416f043f6db13 # EXP-Topic riastradh-pr59774-aesbear64 aes(9): Rewrite x86 SSE2 implementation. This computes eight AES_k instances simultaneously, using the bitsliced 32-bit aes_ct logic which computes two blocks at a time in uint32_t arithmetic, vectorized four ways. Previously, the SSE2 code was a very naive adaptation of aes_ct64, which computes four blocks at a time in uint64_t arithmetic, without any 2x vectorization -- I did it at the time because: (a) it was easier to get working, (b) it only affects really old hardware with neither AES-NI nor SSSE3 which are both much much faster. But it was bugging me that this was a kind of dumb use of SSE2. Substantially reduces stack usage (from ~1200 bytes to ~800 bytes) and should approximately double throughput for CBC decryption and for XTS encryption/decryption. I also tried a 2x64 version but cursory performance measurements didn't reveal much benefit over 4x32. (If anyone is interested in doing more serious performance measurements, on ancient hardware for which it might matter, I also have the 2x64 code around.) Prompted by: PR kern/59774: bearssl 32-bit AES is too slow, want 64-bit optimized version in kernel diff -r cce15febbf04 -r 4ef139778465 sys/arch/x86/x86/identcpu.c --- a/sys/arch/x86/x86/identcpu.c Sat Nov 22 05:28:14 2025 +0000 +++ b/sys/arch/x86/x86/identcpu.c Sat Nov 22 06:22:31 2025 +0000 @@ -42,7 +42,7 @@ __KERNEL_RCSID(0, "$NetBSD: identcpu.c,v #include #include -#include +#include #include #include #include @@ -1011,7 +1011,7 @@ cpu_probe(struct cpu_info *ci) (cpu_feature[1] & CPUID2_SSSE3)) aes_md_init(&aes_ssse3_impl); else if (i386_has_sse && i386_has_sse2) - aes_md_init(&aes_sse2_impl); + aes_md_init(&aes_sse2_4x32_impl); /* ChaCha */ if (i386_has_sse && i386_has_sse2) diff -r cce15febbf04 -r 4ef139778465 sys/conf/copts.mk --- a/sys/conf/copts.mk Sat Nov 22 05:28:14 2025 +0000 +++ b/sys/conf/copts.mk Sat Nov 22 06:22:31 2025 +0000 @@ -41,7 +41,7 @@ COPTS.chacha_neon.c+= -flax-vector-conve .if ${MACHINE_ARCH} == "x86_64" || ${MACHINE_ARCH} == "i386" COPTS.aes_bear64.c+= ${CC_WNO_ARRAY_BOUNDS} ${CC_WNO_STRINGOP_OVERFLOW} COPTS.aes_ni.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} -COPTS.aes_sse2_subr.c+= ${CC_WNO_ARRAY_BOUNDS} +COPTS.aes_sse2_4x32_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ssse3_subr.c+=${CC_WNO_ARRAY_BOUNDS} COPTS.aes_via.c+= ${CC_WNO_ARRAY_BOUNDS} .endif diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2.c --- a/sys/crypto/aes/arch/x86/aes_sse2.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,403 +0,0 @@ -/* - * Copyright (c) 2016 Thomas Pornin - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * 
distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $"); - -#include - -#ifdef _KERNEL -#include -#else -#include -#include -#endif - -#include "aes_sse2_impl.h" - -static void -br_range_dec32le(uint32_t *p32, size_t nwords, const void *v) -{ - const uint8_t *p8 = v; - - while (nwords --> 0) { - uint32_t x0 = *p8++; - uint32_t x1 = *p8++; - uint32_t x2 = *p8++; - uint32_t x3 = *p8++; - - *p32++ = x0 | (x1 << 8) | (x2 << 16) | (x3 << 24); - } -} - -void -aes_sse2_bitslice_Sbox(__m128i q[static 4]) -{ - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i y1, y2, y3, y4, y5, y6, y7, y8, y9; - __m128i y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; - __m128i y20, y21; - __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; - __m128i z10, z11, z12, z13, z14, z15, z16, z17; - __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - __m128i t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; - __m128i t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; - __m128i t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; - __m128i t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; - __m128i t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; - __m128i t60, t61, t62, t63, t64, t65, t66, t67; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - x0 = _mm_shuffle_epi32(q[3], 0x0e); - x1 = _mm_shuffle_epi32(q[2], 0x0e); - x2 = _mm_shuffle_epi32(q[1], 0x0e); - x3 = _mm_shuffle_epi32(q[0], 0x0e); - x4 = q[3]; - x5 = q[2]; - x6 = q[1]; - x7 = q[0]; - - /* - * Top linear transformation. - */ - y14 = x3 ^ x5; - y13 = x0 ^ x6; - y9 = x0 ^ x3; - y8 = x0 ^ x5; - t0 = x1 ^ x2; - y1 = t0 ^ x7; - y4 = y1 ^ x3; - y12 = y13 ^ y14; - y2 = y1 ^ x0; - y5 = y1 ^ x6; - y3 = y5 ^ y8; - t1 = x4 ^ y12; - y15 = t1 ^ x5; - y20 = t1 ^ x1; - y6 = y15 ^ x7; - y10 = y15 ^ t0; - y11 = y20 ^ y9; - y7 = x7 ^ y11; - y17 = y10 ^ y11; - y19 = y10 ^ y8; - y16 = t0 ^ y11; - y21 = y13 ^ y16; - y18 = x0 ^ y16; - - /* - * Non-linear section. 
- */ - t2 = y12 & y15; - t3 = y3 & y6; - t4 = t3 ^ t2; - t5 = y4 & x7; - t6 = t5 ^ t2; - t7 = y13 & y16; - t8 = y5 & y1; - t9 = t8 ^ t7; - t10 = y2 & y7; - t11 = t10 ^ t7; - t12 = y9 & y11; - t13 = y14 & y17; - t14 = t13 ^ t12; - t15 = y8 & y10; - t16 = t15 ^ t12; - t17 = t4 ^ t14; - t18 = t6 ^ t16; - t19 = t9 ^ t14; - t20 = t11 ^ t16; - t21 = t17 ^ y20; - t22 = t18 ^ y19; - t23 = t19 ^ y21; - t24 = t20 ^ y18; - - t25 = t21 ^ t22; - t26 = t21 & t23; - t27 = t24 ^ t26; - t28 = t25 & t27; - t29 = t28 ^ t22; - t30 = t23 ^ t24; - t31 = t22 ^ t26; - t32 = t31 & t30; - t33 = t32 ^ t24; - t34 = t23 ^ t33; - t35 = t27 ^ t33; - t36 = t24 & t35; - t37 = t36 ^ t34; - t38 = t27 ^ t36; - t39 = t29 & t38; - t40 = t25 ^ t39; - - t41 = t40 ^ t37; - t42 = t29 ^ t33; - t43 = t29 ^ t40; - t44 = t33 ^ t37; - t45 = t42 ^ t41; - z0 = t44 & y15; - z1 = t37 & y6; - z2 = t33 & x7; - z3 = t43 & y16; - z4 = t40 & y1; - z5 = t29 & y7; - z6 = t42 & y11; - z7 = t45 & y17; - z8 = t41 & y10; - z9 = t44 & y12; - z10 = t37 & y3; - z11 = t33 & y4; - z12 = t43 & y13; - z13 = t40 & y5; - z14 = t29 & y2; - z15 = t42 & y9; - z16 = t45 & y14; - z17 = t41 & y8; - - /* - * Bottom linear transformation. - */ - t46 = z15 ^ z16; - t47 = z10 ^ z11; - t48 = z5 ^ z13; - t49 = z9 ^ z10; - t50 = z2 ^ z12; - t51 = z2 ^ z5; - t52 = z7 ^ z8; - t53 = z0 ^ z3; - t54 = z6 ^ z7; - t55 = z16 ^ z17; - t56 = z12 ^ t48; - t57 = t50 ^ t53; - t58 = z4 ^ t46; - t59 = z3 ^ t54; - t60 = t46 ^ t57; - t61 = z14 ^ t57; - t62 = t52 ^ t58; - t63 = t49 ^ t58; - t64 = z4 ^ t59; - t65 = t61 ^ t62; - t66 = z1 ^ t63; - s0 = t59 ^ t63; - s6 = t56 ^ ~t62; - s7 = t48 ^ ~t60; - t67 = t64 ^ t65; - s3 = t53 ^ t66; - s4 = t51 ^ t66; - s5 = t47 ^ t65; - s1 = t64 ^ ~s3; - s2 = t55 ^ ~t67; - - q[3] = _mm_unpacklo_epi64(s4, s0); - q[2] = _mm_unpacklo_epi64(s5, s1); - q[1] = _mm_unpacklo_epi64(s6, s2); - q[0] = _mm_unpacklo_epi64(s7, s3); -} - -void -aes_sse2_ortho(__m128i q[static 4]) -{ -#define SWAPN(cl, ch, s, x, y) do { \ - __m128i a, b; \ - a = (x); \ - b = (y); \ - (x) = (a & _mm_set1_epi64x(cl)) | \ - _mm_slli_epi64(b & _mm_set1_epi64x(cl), (s)); \ - (y) = _mm_srli_epi64(a & _mm_set1_epi64x(ch), (s)) | \ - (b & _mm_set1_epi64x(ch)); \ - } while (0) - -#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) -#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) -#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) - - SWAP2(q[0], q[1]); - SWAP2(q[2], q[3]); - - SWAP4(q[0], q[2]); - SWAP4(q[1], q[3]); - - __m128i q0 = q[0]; - __m128i q1 = q[1]; - __m128i q2 = q[2]; - __m128i q3 = q[3]; - __m128i q4 = _mm_shuffle_epi32(q[0], 0x0e); - __m128i q5 = _mm_shuffle_epi32(q[1], 0x0e); - __m128i q6 = _mm_shuffle_epi32(q[2], 0x0e); - __m128i q7 = _mm_shuffle_epi32(q[3], 0x0e); - SWAP8(q0, q4); - SWAP8(q1, q5); - SWAP8(q2, q6); - SWAP8(q3, q7); - q[0] = _mm_unpacklo_epi64(q0, q4); - q[1] = _mm_unpacklo_epi64(q1, q5); - q[2] = _mm_unpacklo_epi64(q2, q6); - q[3] = _mm_unpacklo_epi64(q3, q7); -} - -__m128i -aes_sse2_interleave_in(__m128i w) -{ - __m128i lo, hi; - - lo = _mm_shuffle_epi32(w, 0x10); - hi = _mm_shuffle_epi32(w, 0x32); - lo &= _mm_set1_epi64x(0x00000000FFFFFFFF); - hi &= _mm_set1_epi64x(0x00000000FFFFFFFF); - lo |= _mm_slli_epi64(lo, 16); - hi |= _mm_slli_epi64(hi, 16); - lo &= _mm_set1_epi32(0x0000FFFF); - hi &= _mm_set1_epi32(0x0000FFFF); - lo |= _mm_slli_epi64(lo, 8); - hi |= _mm_slli_epi64(hi, 8); - lo &= _mm_set1_epi16(0x00FF); - hi &= _mm_set1_epi16(0x00FF); - return lo | _mm_slli_epi64(hi, 8); -} - 
-__m128i -aes_sse2_interleave_out(__m128i q) -{ - __m128i lo, hi; - - lo = q; - hi = _mm_srli_si128(q, 1); - lo &= _mm_set1_epi16(0x00FF); - hi &= _mm_set1_epi16(0x00FF); - lo |= _mm_srli_epi64(lo, 8); - hi |= _mm_srli_epi64(hi, 8); - lo &= _mm_set1_epi32(0x0000FFFF); - hi &= _mm_set1_epi32(0x0000FFFF); - lo |= _mm_srli_epi64(lo, 16); - hi |= _mm_srli_epi64(hi, 16); - return (__m128i)_mm_shuffle_ps((__m128)lo, (__m128)hi, 0x88); -} - -static const unsigned char Rcon[] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 -}; - -static uint32_t -sub_word(uint32_t x) -{ - __m128i q[4]; - uint32_t y; - - memset(q, 0, sizeof(q)); - q[0] = _mm_loadu_si32(&x); - aes_sse2_ortho(q); - aes_sse2_bitslice_Sbox(q); - aes_sse2_ortho(q); - _mm_storeu_si32(&y, q[0]); - return y; -} - -unsigned -aes_sse2_keysched(uint64_t *comp_skey, const void *key, size_t key_len) -{ - unsigned num_rounds; - int i, j, k, nk, nkf; - uint32_t tmp; - uint32_t skey[60]; - - switch (key_len) { - case 16: - num_rounds = 10; - break; - case 24: - num_rounds = 12; - break; - case 32: - num_rounds = 14; - break; - default: - /* abort(); */ - return 0; - } - nk = (int)(key_len >> 2); - nkf = (int)((num_rounds + 1) << 2); - br_range_dec32le(skey, (key_len >> 2), key); - tmp = skey[(key_len >> 2) - 1]; - for (i = nk, j = 0, k = 0; i < nkf; i ++) { - if (j == 0) { - tmp = (tmp << 24) | (tmp >> 8); - tmp = sub_word(tmp) ^ Rcon[k]; - } else if (nk > 6 && j == 4) { - tmp = sub_word(tmp); - } - tmp ^= skey[i - nk]; - skey[i] = tmp; - if (++ j == nk) { - j = 0; - k ++; - } - } - - for (i = 0, j = 0; i < nkf; i += 4, j += 2) { - __m128i q[4], q0, q1, q2, q3, q4, q5, q6, q7; - __m128i w; - - w = _mm_loadu_epi8(skey + i); - q[0] = q[1] = q[2] = q[3] = aes_sse2_interleave_in(w); - aes_sse2_ortho(q); - q0 = q[0] & _mm_set1_epi64x(0x1111111111111111); - q1 = q[1] & _mm_set1_epi64x(0x2222222222222222); - q2 = q[2] & _mm_set1_epi64x(0x4444444444444444); - q3 = q[3] & _mm_set1_epi64x(0x8888888888888888); - q4 = _mm_shuffle_epi32(q0, 0x0e); - q5 = _mm_shuffle_epi32(q1, 0x0e); - q6 = _mm_shuffle_epi32(q2, 0x0e); - q7 = _mm_shuffle_epi32(q3, 0x0e); - _mm_storeu_si64(&comp_skey[j + 0], q0 | q1 | q2 | q3); - _mm_storeu_si64(&comp_skey[j + 1], q4 | q5 | q6 | q7); - } - return num_rounds; -} - -void -aes_sse2_skey_expand(uint64_t *skey, - unsigned num_rounds, const uint64_t *comp_skey) -{ - unsigned u, v, n; - - n = (num_rounds + 1) << 1; - for (u = 0, v = 0; u < n; u ++, v += 4) { - __m128i x0, x1, x2, x3; - - x0 = x1 = x2 = x3 = _mm_loadu_si64(&comp_skey[u]); - x0 &= 0x1111111111111111; - x1 &= 0x2222222222222222; - x2 &= 0x4444444444444444; - x3 &= 0x8888888888888888; - x1 = _mm_srli_epi64(x1, 1); - x2 = _mm_srli_epi64(x2, 2); - x3 = _mm_srli_epi64(x3, 3); - x0 = _mm_sub_epi64(_mm_slli_epi64(x0, 4), x0); - x1 = _mm_sub_epi64(_mm_slli_epi64(x1, 4), x1); - x2 = _mm_sub_epi64(_mm_slli_epi64(x2, 4), x2); - x3 = _mm_sub_epi64(_mm_slli_epi64(x3, 4), x3); - _mm_storeu_si64(&skey[v + 0], x0); - _mm_storeu_si64(&skey[v + 1], x1); - _mm_storeu_si64(&skey[v + 2], x2); - _mm_storeu_si64(&skey[v + 3], x3); - } -} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2.h --- a/sys/crypto/aes/arch/x86/aes_sse2.h Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ -/* $NetBSD: aes_sse2.h,v 1.4 2020/07/25 22:29:56 riastradh Exp $ */ - -/*- - * Copyright (c) 2020 The NetBSD Foundation, Inc. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_H -#define _CRYPTO_AES_ARCH_X86_AES_SSE2_H - -#include - -#include - -struct aesenc; -struct aesdec; - -/* - * These functions MUST NOT use any vector registers for parameters or - * results -- the caller is compiled with -mno-sse &c. in the kernel, - * and dynamically turns on the vector unit just before calling them. - * Internal subroutines that use the vector unit for parameters are - * declared in aes_sse2_impl.h instead. 
- */ - -void aes_sse2_setkey(uint64_t[static 30], const void *, uint32_t); - -void aes_sse2_enc(const struct aesenc *, const uint8_t in[static 16], - uint8_t[static 16], uint32_t); -void aes_sse2_dec(const struct aesdec *, const uint8_t in[static 16], - uint8_t[static 16], uint32_t); -void aes_sse2_cbc_enc(const struct aesenc *, const uint8_t[static 16], - uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); -void aes_sse2_cbc_dec(const struct aesdec *, const uint8_t[static 16], - uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); -void aes_sse2_xts_enc(const struct aesenc *, const uint8_t[static 16], - uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); -void aes_sse2_xts_dec(const struct aesdec *, const uint8_t[static 16], - uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); -void aes_sse2_cbcmac_update1(const struct aesenc *, const uint8_t[static 16], - size_t, uint8_t[static 16], uint32_t); -void aes_sse2_ccm_enc1(const struct aesenc *, const uint8_t[static 16], - uint8_t[static 16], size_t, uint8_t[static 32], uint32_t); -void aes_sse2_ccm_dec1(const struct aesenc *, const uint8_t[static 16], - uint8_t[static 16], size_t, uint8_t[static 32], uint32_t); - -int aes_sse2_selftest(void); - -extern struct aes_impl aes_sse2_impl; - -#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,352 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + +#include "aes_sse2_4x32_impl.h" + +/* see inner.h */ +void +aes_sse2_4x32_bitslice_Sbox(__m128i q[static 8]) +{ + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). 
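+	 *
+	 * Each q[i] here is an __m128i holding bit-slice i (bit i of
+	 * every state byte) for up to eight blocks at once: each 32-bit
+	 * lane carries the two-block bitsliced layout of the scalar
+	 * aes_ct code, vectorized four ways across the lanes.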
+ */ + + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i y1, y2, y3, y4, y5, y6, y7, y8, y9; + __m128i y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + __m128i y20, y21; + __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + __m128i z10, z11, z12, z13, z14, z15, z16, z17; + __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + __m128i t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + __m128i t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + __m128i t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + __m128i t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + __m128i t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + __m128i t60, t61, t62, t63, t64, t65, t66, t67; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. + */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
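+	 * The complemented outputs (s1, s2, s6, s7) fold in the S-box's
+	 * final XOR with the constant 0x63.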
+ */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +/* see inner.h */ +void +aes_sse2_4x32_ortho(__m128i q[static 8]) +{ +#define SWAPN(cl, ch, s, x, y) do { \ + __m128i cl128 = _mm_set1_epi32(cl); \ + __m128i ch128 = _mm_set1_epi32(ch); \ + __m128i a, b; \ + a = _mm_load_si128(&(x)); \ + b = _mm_load_si128(&(y)); \ + _mm_store_si128(&(x), \ + (a & cl128) | _mm_slli_epi32((b & cl128), (s))); \ + _mm_store_si128(&(y), \ + _mm_srli_epi32((a & ch128), (s)) | (b & ch128)); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x55555555, 0xAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x33333333, 0xCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F, 0xF0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +static const unsigned char Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t +sub_word(uint32_t x) +{ + __m128i q[8]; + uint32_t y; + + memset(q, 0, sizeof(q)); + q[0] = _mm_loadu_si32(&x); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_Sbox(q); + aes_sse2_4x32_ortho(q); + _mm_storeu_si32(&y, q[0]); + return y; +} + +/* see inner.h */ +unsigned +aes_sse2_4x32_keysched(uint32_t comp_skey[static 60], const void *key, + size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[120]; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + tmp = 0; + for (i = 0; i < nk; i ++) { + tmp = br_dec32le((const unsigned char *)key + (i << 2)); + skey[(i << 1) + 0] = tmp; + skey[(i << 1) + 1] = tmp; + } + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[(i - nk) << 1]; + skey[(i << 1) + 0] = tmp; + skey[(i << 1) + 1] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + for (i = 0; i < nkf; i += 4) { + __m128i q[8]; + + for (j = 0; j < 8; j++) + q[j] = _mm_loadu_si32(&skey[(i << 1) + j]); + aes_sse2_4x32_ortho(q); + for (j = 0; j < 8; j++) + _mm_storeu_si32(&skey[(i << 1) + j], q[j]); + } + for (i = 0, j = 0; i < nkf; i ++, j += 2) { + comp_skey[i] = (skey[j + 0] & 0x55555555) + | (skey[j + 1] & 0xAAAAAAAA); + } + return num_rounds; +} + +/* see inner.h */ +void +aes_sse2_4x32_skey_expand(uint32_t skey[static 120], + unsigned num_rounds, const uint32_t comp_skey[static 60]) +{ + unsigned u, v, n; + + n = (num_rounds + 1) << 2; + for (u = 0, v = 0; u < n; u ++, v += 2) { + uint32_t x, y; + + x = y = comp_skey[u]; + x &= 0x55555555; + skey[v + 0] = x | (x 
<< 1); + y &= 0xAAAAAAAA; + skey[v + 1] = y | (y >> 1); + } +} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32.h Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,36 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_H +#define _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_H + +#include + +extern struct aes_impl aes_sse2_4x32_impl; + +#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_dec.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_dec.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,195 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include "aes_sse2_4x32_impl.h" + +/* see inner.h */ +void +aes_sse2_4x32_bitslice_invSbox(__m128i q[static 8]) +{ + /* + * AES S-box is: + * S(x) = A(I(x)) ^ 0x63 + * where I() is inversion in GF(256), and A() is a linear + * transform (0 is formally defined to be its own inverse). + * Since inversion is an involution, the inverse S-box can be + * computed from the S-box as: + * iS(x) = B(S(B(x ^ 0x63)) ^ 0x63) + * where B() is the inverse of A(). Indeed, for any y in GF(256): + * iS(S(y)) = B(A(I(B(A(I(y)) ^ 0x63 ^ 0x63))) ^ 0x63 ^ 0x63) = y + * + * Note: we reuse the implementation of the forward S-box, + * instead of duplicating it here, so that total code size is + * lower. By merging the B() transforms into the S-box circuit + * we could make faster CBC decryption, but CBC decryption is + * already quite faster than CBC encryption because we can + * process two blocks in parallel. + */ + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; + + aes_sse2_4x32_bitslice_Sbox(q); + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; +} + +static void +add_round_key(__m128i q[static 8], const uint32_t sk[static 8]) +{ + + q[0] ^= _mm_set1_epi32(sk[0]); + q[1] ^= _mm_set1_epi32(sk[1]); + q[2] ^= _mm_set1_epi32(sk[2]); + q[3] ^= _mm_set1_epi32(sk[3]); + q[4] ^= _mm_set1_epi32(sk[4]); + q[5] ^= _mm_set1_epi32(sk[5]); + q[6] ^= _mm_set1_epi32(sk[6]); + q[7] ^= _mm_set1_epi32(sk[7]); +} + +static inline __m128i +inv_shift_row(__m128i q) +{ + __m128i x, y0, y1, y2, y3, y4, y5, y6; + + x = q; + y0 = x & _mm_set1_epi32(0x000000FF); + y1 = _mm_slli_epi32(x & _mm_set1_epi32(0x00003F00), 2); + y2 = _mm_srli_epi32(x & _mm_set1_epi32(0x0000C000), 6); + y3 = _mm_slli_epi32(x & _mm_set1_epi32(0x000F0000), 4); + y4 = _mm_srli_epi32(x & _mm_set1_epi32(0x00F00000), 4); + y5 = _mm_slli_epi32(x & _mm_set1_epi32(0x03000000), 6); + y6 = _mm_srli_epi32(x & _mm_set1_epi32(0xFC000000), 2); + return y0 | y1 | y2 | y3 | y4 | y5 | y6; +} + +static void +inv_shift_rows(__m128i *q) +{ + + q[0] = inv_shift_row(q[0]); + q[1] = inv_shift_row(q[1]); + q[2] = inv_shift_row(q[2]); + q[3] = inv_shift_row(q[3]); + q[4] = inv_shift_row(q[4]); + q[5] = inv_shift_row(q[5]); + q[6] = inv_shift_row(q[6]); + q[7] = inv_shift_row(q[7]); +} + +static inline __m128i +rotr16(__m128i x) +{ + return _mm_slli_epi32(x, 16) | _mm_srli_epi32(x, 16); +} + +static void +inv_mix_columns(__m128i q[static 8]) +{ + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + __m128i r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = _mm_srli_epi32(q0, 8) | _mm_slli_epi32(q0, 24); + r1 = _mm_srli_epi32(q1, 8) | _mm_slli_epi32(q1, 24); + r2 = _mm_srli_epi32(q2, 8) | _mm_slli_epi32(q2, 24); + r3 = _mm_srli_epi32(q3, 8) | _mm_slli_epi32(q3, 24); + r4 = _mm_srli_epi32(q4, 8) | _mm_slli_epi32(q4, 24); + r5 = _mm_srli_epi32(q5, 8) | _mm_slli_epi32(q5, 24); + r6 = _mm_srli_epi32(q6, 8) | _mm_slli_epi32(q6, 
24); + r7 = _mm_srli_epi32(q7, 8) | _mm_slli_epi32(q7, 24); + + q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr16(q0 ^ q5 ^ q6 ^ r0 ^ r5); + q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); + q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr16(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); + q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr16(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); + q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); + q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); + q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr16(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); + q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr16(q4 ^ q5 ^ q7 ^ r4 ^ r7); +} + +/* see inner.h */ +void +aes_sse2_4x32_bitslice_decrypt(unsigned num_rounds, + const uint32_t skey[static 120], __m128i q[static 8]) +{ + unsigned u; + + add_round_key(q, skey + (num_rounds << 3)); + for (u = num_rounds - 1; u > 0; u --) { + inv_shift_rows(q); + aes_sse2_4x32_bitslice_invSbox(q); + add_round_key(q, skey + (u << 3)); + inv_mix_columns(q); + } + inv_shift_rows(q); + aes_sse2_4x32_bitslice_invSbox(q); + add_round_key(q, skey); +} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_enc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_enc.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,134 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include "aes_sse2_4x32_impl.h" + +static inline void +add_round_key(__m128i q[static 8], const uint32_t sk[static 8]) +{ + + q[0] ^= _mm_set1_epi32(sk[0]); + q[1] ^= _mm_set1_epi32(sk[1]); + q[2] ^= _mm_set1_epi32(sk[2]); + q[3] ^= _mm_set1_epi32(sk[3]); + q[4] ^= _mm_set1_epi32(sk[4]); + q[5] ^= _mm_set1_epi32(sk[5]); + q[6] ^= _mm_set1_epi32(sk[6]); + q[7] ^= _mm_set1_epi32(sk[7]); +} + +static inline __m128i +shift_row(__m128i q) +{ + __m128i x, y0, y1, y2, y3, y4, y5, y6; + + x = q; + y0 = x & _mm_set1_epi32(0x000000FF); + y1 = _mm_srli_epi32(x & _mm_set1_epi32(0x0000FC00), 2); + y2 = _mm_slli_epi32(x & _mm_set1_epi32(0x00000300), 6); + y3 = _mm_srli_epi32(x & _mm_set1_epi32(0x00F00000), 4); + y4 = _mm_slli_epi32(x & _mm_set1_epi32(0x000F0000), 4); + y5 = _mm_srli_epi32(x & _mm_set1_epi32(0xC0000000), 6); + y6 = _mm_slli_epi32(x & _mm_set1_epi32(0x3F000000), 2); + return y0 | y1 | y2 | y3 | y4 | y5 | y6; +} + +static inline void +shift_rows(__m128i q[static 8]) +{ + + q[0] = shift_row(q[0]); + q[1] = shift_row(q[1]); + q[2] = shift_row(q[2]); + q[3] = shift_row(q[3]); + q[4] = shift_row(q[4]); + q[5] = shift_row(q[5]); + q[6] = shift_row(q[6]); + q[7] = shift_row(q[7]); +} + +static inline __m128i +rotr16(__m128i x) +{ + return _mm_slli_epi32(x, 16) | _mm_srli_epi32(x, 16); +} + +static inline void +mix_columns(__m128i q[static 8]) +{ + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + __m128i r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = _mm_srli_epi32(q0, 8) | _mm_slli_epi32(q0, 24); + r1 = _mm_srli_epi32(q1, 8) | _mm_slli_epi32(q1, 24); + r2 = _mm_srli_epi32(q2, 8) | _mm_slli_epi32(q2, 24); + r3 = _mm_srli_epi32(q3, 8) | _mm_slli_epi32(q3, 24); + r4 = _mm_srli_epi32(q4, 8) | _mm_slli_epi32(q4, 24); + r5 = _mm_srli_epi32(q5, 8) | _mm_slli_epi32(q5, 24); + r6 = _mm_srli_epi32(q6, 8) | _mm_slli_epi32(q6, 24); + r7 = _mm_srli_epi32(q7, 8) | _mm_slli_epi32(q7, 24); + + q[0] = q7 ^ r7 ^ r0 ^ rotr16(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr16(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr16(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr16(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr16(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr16(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr16(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr16(q7 ^ r7); +} + +/* see inner.h */ +void +aes_sse2_4x32_bitslice_encrypt(unsigned num_rounds, + const uint32_t skey[static 120], __m128i q[static 8]) +{ + unsigned u; + + add_round_key(q, skey); + for (u = 1; u < num_rounds; u ++) { + aes_sse2_4x32_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, skey + (u << 3)); + } + aes_sse2_4x32_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, skey + (num_rounds << 3)); +} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_impl.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_impl.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,223 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include +#include + +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#include +#else +#include +#define fpu_kern_enter() ((void)0) +#define fpu_kern_leave() ((void)0) +#endif + +#include "aes_sse2_4x32_subr.h" + +static void +aes_sse2_4x32_setenckey_impl(struct aesenc *enc, const uint8_t *key, + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_setkey(enc->aese_aes.aes_rk, key, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_setdeckey_impl(struct aesdec *dec, const uint8_t *key, + uint32_t nrounds) +{ + + fpu_kern_enter(); + /* + * BearSSL computes InvMixColumns on the fly -- no need for + * distinct decryption round keys. + */ + aes_sse2_4x32_setkey(dec->aesd_aes.aes_rk, key, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_enc(enc, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_dec(dec, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_cbc_enc_impl(const struct aesenc *enc, + const uint8_t in[static 16], uint8_t out[static 16], + size_t nbytes, uint8_t iv[static 16], uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_sse2_4x32_cbc_enc(enc, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_cbc_dec_impl(const struct aesdec *dec, + const uint8_t in[static 16], uint8_t out[static 16], + size_t nbytes, uint8_t iv[static 16], uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_sse2_4x32_cbc_dec(dec, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_xts_enc_impl(const struct aesenc *enc, + const uint8_t in[static 16], uint8_t out[static 16], + size_t nbytes, uint8_t tweak[static 16], uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_sse2_4x32_xts_enc(enc, in, out, nbytes, tweak, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_xts_dec_impl(const struct aesdec *dec, + const uint8_t in[static 16], uint8_t out[static 16], + size_t nbytes, uint8_t tweak[static 16], uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_sse2_4x32_xts_dec(dec, in, out, nbytes, 
tweak, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_cbcmac_update1_impl(const struct aesenc *enc, + const uint8_t in[static 16], size_t nbytes, uint8_t auth[static 16], + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_cbcmac_update1(enc, in, nbytes, auth, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_ccm_enc1_impl(const struct aesenc *enc, + const uint8_t *in, uint8_t *out, + size_t nbytes, uint8_t authctr[32], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_ccm_enc1(enc, in, out, nbytes, authctr, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_ccm_dec1_impl(const struct aesenc *enc, + const uint8_t *in, uint8_t *out, + size_t nbytes, uint8_t authctr[32], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_ccm_dec1(enc, in, out, nbytes, authctr, nrounds); + fpu_kern_leave(); +} + +static int +aes_sse2_4x32_probe(void) +{ + int result = 0; + + /* Verify that the CPU supports SSE and SSE2. */ +#ifdef _KERNEL + if (!i386_has_sse) + return -1; + if (!i386_has_sse2) + return -1; +#else + unsigned eax, ebx, ecx, edx; + if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) + return -1; + if ((edx & bit_SSE) == 0) + return -1; + if ((edx & bit_SSE2) == 0) + return -1; +#endif + + fpu_kern_enter(); + result = aes_sse2_4x32_selftest(); + fpu_kern_leave(); + + return result; +} + +struct aes_impl aes_sse2_4x32_impl = { + .ai_name = "Intel SSE2 4x32 bitsliced", + .ai_probe = aes_sse2_4x32_probe, + .ai_setenckey = aes_sse2_4x32_setenckey_impl, + .ai_setdeckey = aes_sse2_4x32_setdeckey_impl, + .ai_enc = aes_sse2_4x32_enc_impl, + .ai_dec = aes_sse2_4x32_dec_impl, + .ai_cbc_enc = aes_sse2_4x32_cbc_enc_impl, + .ai_cbc_dec = aes_sse2_4x32_cbc_dec_impl, + .ai_xts_enc = aes_sse2_4x32_xts_enc_impl, + .ai_xts_dec = aes_sse2_4x32_xts_dec_impl, + .ai_cbcmac_update1 = aes_sse2_4x32_cbcmac_update1_impl, + .ai_ccm_enc1 = aes_sse2_4x32_ccm_enc1_impl, + .ai_ccm_dec1 = aes_sse2_4x32_ccm_dec1_impl, +}; diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_impl.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_impl.h Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,54 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_IMPL_H +#define _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_IMPL_H + +#include + +#include +#include +#include + +#include "aes_sse2_4x32_subr.h" + +#define br_dec32le le32dec +#define br_enc32le le32enc + +void aes_sse2_4x32_bitslice_Sbox(__m128i[static 8]); +void aes_sse2_4x32_bitslice_invSbox(__m128i[static 8]); +void aes_sse2_4x32_ortho(__m128i[static 8]); +unsigned aes_sse2_4x32_keysched(uint32_t[static 60], const void *, size_t); +void aes_sse2_4x32_skey_expand(uint32_t[static 120], unsigned, + const uint32_t[static 60]); +void aes_sse2_4x32_bitslice_encrypt(unsigned, const uint32_t[static 120], + __m128i[static 8]); +void aes_sse2_4x32_bitslice_decrypt(unsigned, const uint32_t[static 120], + __m128i[static 8]); + +#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_IMPL_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_subr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_subr.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,753 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#ifdef _KERNEL +#include +#include +#else +#include +#include +#include +#include +#include +#define KASSERT assert +#define panic(fmt, args...) 
err(1, fmt, ##args)
+#endif
+
+#include "aes_sse2_4x32_impl.h"
+#include "aes_sse2_4x32_subr.h"
+
+#ifndef _MM_TRANSPOSE4_EPI32
+#define _MM_TRANSPOSE4_EPI32(r0, r1, r2, r3) do \
+{ \
+ __m128i _mm_tmp0, _mm_tmp1, _mm_tmp2, _mm_tmp3; \
+ \
+ _mm_tmp0 = _mm_unpacklo_epi32(r0, r1); \
+ _mm_tmp2 = _mm_unpacklo_epi32(r2, r3); \
+ _mm_tmp1 = _mm_unpackhi_epi32(r0, r1); \
+ _mm_tmp3 = _mm_unpackhi_epi32(r2, r3); \
+ (r0) = (__m128i)_mm_movelh_ps((__m128)_mm_tmp0, (__m128)_mm_tmp2); \
+ (r1) = (__m128i)_mm_movehl_ps((__m128)_mm_tmp2, (__m128)_mm_tmp0); \
+ (r2) = (__m128i)_mm_movelh_ps((__m128)_mm_tmp1, (__m128)_mm_tmp3); \
+ (r3) = (__m128i)_mm_movehl_ps((__m128)_mm_tmp3, (__m128)_mm_tmp1); \
+} while (0)
+#endif
+
+void
+aes_sse2_4x32_setkey(uint32_t rk[static 60], const void *key, uint32_t nrounds)
+{
+ size_t key_len;
+
+ switch (nrounds) {
+ case 10:
+ key_len = 16;
+ break;
+ case 12:
+ key_len = 24;
+ break;
+ case 14:
+ key_len = 32;
+ break;
+ default:
+ panic("invalid AES nrounds: %u", nrounds);
+ }
+
+ aes_sse2_4x32_keysched(rk, key, key_len);
+}
+
+void
+aes_sse2_4x32_enc(const struct aesenc *enc, const uint8_t in[static 16],
+ uint8_t out[static 16], uint32_t nrounds)
+{
+ uint32_t sk_exp[120];
+ __m128i q[8];
+
+ /* Expand round keys for bitslicing. */
+ aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk);
+
+ /* Load input block interleaved with garbage blocks. */
+ q[0] = _mm_loadu_epi8(in);
+ q[2] = q[4] = q[6] = _mm_setzero_si128();
+ q[1] = q[3] = q[5] = q[7] = _mm_setzero_si128();
+
+ /* Transform to bitslice, encrypt, transform from bitslice. */
+ _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]);
+ aes_sse2_4x32_ortho(q);
+ aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q);
+ aes_sse2_4x32_ortho(q);
+ _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]);
+
+ /* Store output block. */
+ _mm_storeu_epi8(out, q[0]);
+
+ /* Paranoia: Zero temporary buffers. */
+ explicit_memset(sk_exp, 0, sizeof sk_exp);
+ explicit_memset(q, 0, sizeof q);
+}
+
+void
+aes_sse2_4x32_dec(const struct aesdec *dec, const uint8_t in[static 16],
+ uint8_t out[static 16], uint32_t nrounds)
+{
+ uint32_t sk_exp[120];
+ __m128i q[8];
+
+ /* Expand round keys for bitslicing. */
+ aes_sse2_4x32_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk);
+
+ /* Load input block interleaved with garbage blocks. */
+ q[0] = _mm_loadu_epi8(in);
+ q[2] = q[4] = q[6] = _mm_setzero_si128();
+ q[1] = q[3] = q[5] = q[7] = _mm_setzero_si128();
+
+ /* Transform to bitslice, decrypt, transform from bitslice. */
+ _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]);
+ aes_sse2_4x32_ortho(q);
+ aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q);
+ aes_sse2_4x32_ortho(q);
+ _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]);
+
+ /* Store output block. */
+ _mm_storeu_epi8(out, q[0]);
+
+ /* Paranoia: Zero temporary buffers. */
+ explicit_memset(sk_exp, 0, sizeof sk_exp);
+ explicit_memset(q, 0, sizeof q);
+}
+
+void
+aes_sse2_4x32_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
+ uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+ uint32_t nrounds)
+{
+ uint32_t sk_exp[120];
+ __m128i q[8];
+ __m128i cv;
+
+ KASSERT(nbytes);
+ KASSERT(nbytes % 16 == 0);
+
+ /* Expand round keys for bitslicing. */
+ aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk);
+
+ /* Load the IV. */
+ cv = _mm_loadu_epi8(iv);
+
+ /*
+ * Zero the registers we won't be using, since CBC encryption
+ * is inherently sequential so we can only do one block at a
+ * time.
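+ * (CBC decryption and XTS, below, have no such dependency and
+ * process up to eight blocks per pass.)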
+ */ + q[2] = q[4] = q[6] = _mm_setzero_si128(); + q[1] = q[3] = q[5] = q[7] = _mm_setzero_si128(); + + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + /* Load input block and apply CV. */ + q[0] = cv ^ _mm_loadu_epi8(in); + + /* Transform to bitslice, encrypt, transform from bitslice. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + + /* Remember ciphertext as CV and store output block. */ + cv = q[0]; + _mm_storeu_epi8(out, cv); + } + + /* Store updated IV. */ + _mm_storeu_epi8(iv, cv); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +void +aes_sse2_4x32_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16], + uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + __m128i cv, iv, w; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk); + + /* Load the IV. */ + iv = _mm_loadu_epi8(ivp); + + /* Load the last cipher block. */ + cv = _mm_loadu_epi8(in + nbytes - 16); + + /* Store the updated IV. */ + _mm_storeu_epi8(ivp, cv); + + /* Process the last blocks if not an even multiple of eight. */ + if (nbytes % (8*16)) { + unsigned i, n = (nbytes/16) % 8; + + KASSERT(n > 0); + KASSERT(n < 8); + + for (i = 8; i --> n;) + q[i] = _mm_setzero_si128(); + q[i] = cv; + while (i --> 0) + q[i] = _mm_loadu_epi8(in + nbytes - 16*n + 16*i); + + /* Decrypt up to seven blocks. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + do { + n--; + w = q[n]; + if ((nbytes -= 16) == 0) + goto out; + cv = _mm_loadu_epi8(in + nbytes - 16); + _mm_storeu_epi8(out + nbytes, w ^ cv); + } while (n); + } + + for (;;) { + KASSERT(nbytes >= 128); + nbytes -= 128; + + /* + * 1. Set up upper cipher block from cv. + * 2. Load lower cipher blocks from input. + */ + q[7] = cv; /* _mm_loadu_epi8(in + nbytes + 16*7) */ + q[6] = _mm_loadu_epi8(in + nbytes + 16*6); + q[5] = _mm_loadu_epi8(in + nbytes + 16*5); + q[4] = _mm_loadu_epi8(in + nbytes + 16*4); + q[3] = _mm_loadu_epi8(in + nbytes + 16*3); + q[2] = _mm_loadu_epi8(in + nbytes + 16*2); + q[1] = _mm_loadu_epi8(in + nbytes + 16*1); + q[0] = _mm_loadu_epi8(in + nbytes + 16*0); + + /* Decrypt eight blocks at a time. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the seven upper output blocks. 
*/ + cv = _mm_loadu_epi8(in + nbytes + 16*6); + _mm_storeu_epi8(out + nbytes + 16*7, cv ^ q[7]); + cv = _mm_loadu_epi8(in + nbytes + 16*5); + _mm_storeu_epi8(out + nbytes + 16*6, cv ^ q[6]); + cv = _mm_loadu_epi8(in + nbytes + 16*4); + _mm_storeu_epi8(out + nbytes + 16*5, cv ^ q[5]); + cv = _mm_loadu_epi8(in + nbytes + 16*3); + _mm_storeu_epi8(out + nbytes + 16*4, cv ^ q[4]); + cv = _mm_loadu_epi8(in + nbytes + 16*2); + _mm_storeu_epi8(out + nbytes + 16*3, cv ^ q[3]); + cv = _mm_loadu_epi8(in + nbytes + 16*1); + _mm_storeu_epi8(out + nbytes + 16*2, cv ^ q[2]); + cv = _mm_loadu_epi8(in + nbytes + 16*0); + _mm_storeu_epi8(out + nbytes + 16*1, cv ^ q[1]); + + /* + * Get the first output block, but don't load the CV + * yet -- it might be the previous ciphertext block, or + * it might be the IV. + */ + w = q[0]; + + /* Stop if we've reached the first output block. */ + if (nbytes == 0) + goto out; + + /* + * Load the preceding cipher block, and apply it as the + * chaining value to this one. + */ + cv = _mm_loadu_epi8(in + nbytes - 16); + _mm_storeu_epi8(out + nbytes, w ^ cv); + } + +out: /* Store the first output block. */ + _mm_storeu_epi8(out, w ^ iv); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static inline __m128i +aes_sse2_4x32_xts_update(__m128i t) +{ + const __m128i one = _mm_set_epi64x(1, 1); + __m128i s, m, c; + + s = _mm_srli_epi64(t, 63); /* 1 if high bit set else 0 */ + m = _mm_sub_epi64(s, one); /* 0 if high bit set else -1 */ + m = _mm_shuffle_epi32(m, 0x4e); /* swap halves */ + c = _mm_set_epi64x(1, 0x87); /* carry */ + + return _mm_slli_epi64(t, 1) ^ (c & ~m); +} + +static int +aes_sse2_4x32_xts_update_selftest(void) +{ + static const struct { + uint32_t in[4], out[4]; + } cases[] = { + [0] = { {1}, {2} }, + [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, + [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, + [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, + [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, + [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, + }; + unsigned i; + uint32_t t[4]; + int result = 0; + + for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { + t[0] = cases[i].in[0]; + t[1] = cases[i].in[1]; + t[2] = cases[i].in[2]; + t[3] = cases[i].in[3]; + _mm_storeu_epi8(t, aes_sse2_4x32_xts_update(_mm_loadu_epi8(t))); + if (t[0] != cases[i].out[0] || + t[1] != cases[i].out[1] || + t[2] != cases[i].out[2] || + t[3] != cases[i].out[3]) { + printf("%s %u:" + " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", + __func__, i, t[0], t[1], t[2], t[3]); + result = -1; + } + } + + return result; +} + +void +aes_sse2_4x32_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + __m128i t[9]; + unsigned i; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk); + + /* Load tweak. */ + t[0] = _mm_loadu_epi8(tweak); + + /* Handle the first block separately if odd number. */ + if (nbytes % (8*16)) { + /* Load up the tweaked inputs. */ + for (i = 0; i < (nbytes/16) % 8; i++) { + q[i] = _mm_loadu_epi8(in + 16*i) ^ t[i]; + t[i + 1] = aes_sse2_4x32_xts_update(t[i]); + } + for (; i < 8; i++) + q[i] = _mm_setzero_si128(); + + /* Encrypt up to seven blocks. 
*/ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the tweaked outputs. */ + for (i = 0; i < (nbytes/16) % 8; i++) + _mm_storeu_epi8(out + 16*i, q[i] ^ t[i]); + + /* Advance to the next block. */ + t[0] = t[i]; + in += nbytes % (8*16); + out += nbytes % (8*16); + nbytes -= nbytes % (8*16); + if (nbytes == 0) + goto out; + } + + do { + KASSERT(nbytes % 128 == 0); + KASSERT(nbytes >= 128); + + /* Load up the tweaked inputs. */ + for (i = 0; i < 8; i++) { + q[i] = _mm_loadu_epi8(in + 16*i) ^ t[i]; + t[i + 1] = aes_sse2_4x32_xts_update(t[i]); + } + + /* Encrypt eight blocks. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the tweaked outputs. */ + for (i = 0; i < 8; i++) + _mm_storeu_epi8(out + 16*i, q[i] ^ t[i]); + + /* Advance to the next block. */ + t[0] = t[8]; + in += 128; + out += 128; + nbytes -= 128; + } while (nbytes); + +out: /* Store the updated tweak. */ + _mm_storeu_epi8(tweak, t[0]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + explicit_memset(t, 0, sizeof t); +} + +void +aes_sse2_4x32_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + __m128i t[9]; + unsigned i; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk); + + /* Load tweak. */ + t[0] = _mm_loadu_epi8(tweak); + + /* Handle the first block separately if odd number. */ + if (nbytes % (8*16)) { + /* Load up the tweaked inputs. */ + for (i = 0; i < (nbytes/16) % 8; i++) { + q[i] = _mm_loadu_epi8(in + 16*i) ^ t[i]; + t[i + 1] = aes_sse2_4x32_xts_update(t[i]); + } + for (; i < 8; i++) + q[i] = _mm_setzero_si128(); + + /* Decrypt up to seven blocks. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the tweaked outputs. */ + for (i = 0; i < (nbytes/16) % 8; i++) + _mm_storeu_epi8(out + 16*i, q[i] ^ t[i]); + + /* Advance to the next block. */ + t[0] = t[i]; + in += nbytes % (8*16); + out += nbytes % (8*16); + nbytes -= nbytes % (8*16); + if (nbytes == 0) + goto out; + } + + do { + KASSERT(nbytes % 128 == 0); + KASSERT(nbytes >= 128); + + /* Load up the tweaked inputs. */ + for (i = 0; i < 8; i++) { + q[i] = _mm_loadu_epi8(in + 16*i) ^ t[i]; + t[i + 1] = aes_sse2_4x32_xts_update(t[i]); + } + + /* Decrypt eight blocks. 
*/ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the tweaked outputs. */ + for (i = 0; i < 8; i++) + _mm_storeu_epi8(out + 16*i, q[i] ^ t[i]); + + /* Advance to the next block. */ + t[0] = t[8]; + in += 128; + out += 128; + nbytes -= 128; + } while (nbytes); + +out: /* Store the updated tweak. */ + _mm_storeu_epi8(tweak, t[0]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + explicit_memset(t, 0, sizeof t); +} + +void +aes_sse2_4x32_cbcmac_update1(const struct aesenc *enc, + const uint8_t in[static 16], size_t nbytes, + uint8_t auth[static 16], uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk); + + /* Initialize garbage blocks. */ + q[1] = q[2] = q[3] = q[4] = q[5] = q[6] = q[7] = _mm_setzero_si128(); + + /* Load initial authenticator. */ + q[0] = _mm_loadu_epi8(auth); + + for (; nbytes; nbytes -= 16, in += 16) { + /* Combine input block. */ + q[0] ^= _mm_loadu_epi8(in); + + /* Transform to bitslice, encrypt, transform from bitslice. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + } + + /* Store updated authenticator. */ + _mm_storeu_epi8(auth, q[0]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +void +aes_sse2_4x32_ccm_enc1(const struct aesenc *enc, + const uint8_t in[static 16], uint8_t out[static 16], size_t nbytes, + uint8_t authctr[static 32], uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + uint32_t c0, c1, c2, c3be; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk); + + /* Set first block to authenticator. */ + q[0] = _mm_loadu_epi8(authctr); + + /* Load initial counter block, big-endian so we can increment it. */ + c0 = le32dec(authctr + 16 + 4*0); + c1 = le32dec(authctr + 16 + 4*1); + c2 = le32dec(authctr + 16 + 4*2); + c3be = bswap32(le32dec(authctr + 16 + 4*3)); + + /* Set other blocks to garbage -- can't take advantage. */ + q[1] = q[3] = q[4] = q[5] = q[6] = q[7] = _mm_setzero_si128(); + + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + /* Update authenticator. */ + q[0] ^= _mm_loadu_epi8(in); + + /* Increment 32-bit counter. */ + q[2] = _mm_set_epi32(bswap32(++c3be), c2, c1, c0); + + /* Encrypt authenticator and counter. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + + /* Encrypt with CTR output. */ + _mm_storeu_epi8(out, _mm_loadu_epi8(in) ^ q[2]); + } + + /* Update authenticator. */ + _mm_storeu_epi8(authctr, q[0]); + + /* Update counter. */ + le32enc(authctr + 16 + 4*3, bswap32(c3be)); + + /* Paranoia: Zero temporary buffers. 
*/ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +void +aes_sse2_4x32_ccm_dec1(const struct aesenc *enc, + const uint8_t in[static 16], uint8_t out[static 16], size_t nbytes, + uint8_t authctr[static 32], uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + uint32_t c0, c1, c2, c3be; + __m128i b; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk); + + /* Load initial counter block, big-endian so we can increment it. */ + c0 = le32dec(authctr + 16 + 4*0); + c1 = le32dec(authctr + 16 + 4*1); + c2 = le32dec(authctr + 16 + 4*2); + c3be = bswap32(le32dec(authctr + 16 + 4*3)); + + /* Increment 32-bit counter. */ + q[0] = _mm_set_epi32(bswap32(++c3be), c2, c1, c0); + + /* + * Set the other blocks to garbage -- we don't have any + * plaintext to authenticate yet. + */ + q[1] = q[2] = q[3] = q[4] = q[5] = q[6] = q[7] = _mm_setzero_si128(); + + /* Encrypt first CTR. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + + /* Load the initial authenticator. */ + q[2] = _mm_loadu_epi8(authctr); + + for (;; in += 16, out += 16) { + /* Decrypt the block. */ + b = _mm_loadu_epi8(in) ^ q[0]; + + /* Update authenticator. */ + q[2] ^= b; + + /* Store plaintext. */ + _mm_storeu_epi8(out, b); + + /* If this is the last block, stop. */ + if ((nbytes -= 16) == 0) + break; + + /* Increment 32-bit counter. */ + q[0] = _mm_set_epi32(bswap32(++c3be), c2, c1, c0); + + /* Authenticate previous plaintext, encrypt next CTR. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + } + + /* + * Authenticate last plaintext. We're only doing this for the + * authenticator, not for the counter, so don't bother to + * initialize q[0]. (Even for the sake of sanitizers, they're + * already initialized to something by now.) + */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + + /* Update authenticator. */ + _mm_storeu_epi8(authctr, q[2]); + + /* Update counter. */ + le32enc(authctr + 16 + 4*3, bswap32(c3be)); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +int +aes_sse2_4x32_selftest(void) +{ + + if (aes_sse2_4x32_xts_update_selftest()) + return -1; + + /* XXX test aes_sse2_4x32_bitslice_decrypt */ + /* XXX test aes_sse2_4x32_bitslice_encrypt */ + /* XXX test aes_sse2_4x32_keysched */ + /* XXX test aes_sse2_4x32_ortho */ + /* XXX test aes_sse2_4x32_skey_expand */ + + return 0; +} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_subr.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_subr.h Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,67 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_SUBR_H +#define _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_SUBR_H + +#include + +/* + * These functions MUST NOT use any vector registers for parameters or + * results -- the caller is compiled with -mno-sse &c. in the kernel, + * and dynamically turns on the vector unit just before calling them. + * Internal subroutines that use the vector unit for parameters are + * declared in aes_sse2_4x32_internal.h instead. + */ + +void aes_sse2_4x32_setkey(uint32_t[static 60], const void *, uint32_t); + +void aes_sse2_4x32_enc(const struct aesenc *, const uint8_t in[static 16], + uint8_t[static 16], uint32_t); +void aes_sse2_4x32_dec(const struct aesdec *, const uint8_t in[static 16], + uint8_t[static 16], uint32_t); +void aes_sse2_4x32_cbc_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_cbc_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_xts_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_xts_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_cbcmac_update1(const struct aesenc *, + const uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_ccm_enc1(const struct aesenc *, + const uint8_t[static 16], uint8_t[static 16], size_t, uint8_t[static 32], + uint32_t); +void aes_sse2_4x32_ccm_dec1(const struct aesenc *, + const uint8_t[static 16], uint8_t[static 16], size_t, uint8_t[static 32], + uint32_t); + +int aes_sse2_4x32_selftest(void); + +#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_SUBR_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_dec.c --- a/sys/crypto/aes/arch/x86/aes_sse2_dec.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2016 Thomas Pornin - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * 
distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2_dec.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $"); - -#include - -#include "aes_sse2_impl.h" - -/* see inner.h */ -void -aes_sse2_bitslice_invSbox(__m128i q[static 4]) -{ - /* - * See br_aes_ct_bitslice_invSbox(). This is the natural extension - * to 64-bit registers. - */ - __m128i q0, q1, q2, q3, q4, q5, q6, q7; - - q0 = ~q[0]; - q1 = ~q[1]; - q2 = q[2]; - q3 = q[3]; - q4 = _mm_shuffle_epi32(q[0], 0x0e); - q5 = _mm_shuffle_epi32(~q[1], 0x0e); - q6 = _mm_shuffle_epi32(~q[2], 0x0e); - q7 = _mm_shuffle_epi32(q[3], 0x0e); - - q[3] = _mm_unpacklo_epi64(q5 ^ q0 ^ q2, q1 ^ q4 ^ q6); - q[2] = _mm_unpacklo_epi64(q4 ^ q7 ^ q1, q0 ^ q3 ^ q5); - q[1] = _mm_unpacklo_epi64(q3 ^ q6 ^ q0, q7 ^ q2 ^ q4); - q[0] = _mm_unpacklo_epi64(q2 ^ q5 ^ q7, q6 ^ q1 ^ q3); - - aes_sse2_bitslice_Sbox(q); - - q0 = ~q[0]; - q1 = ~q[1]; - q2 = q[2]; - q3 = q[3]; - q4 = _mm_shuffle_epi32(q[0], 0x0e); - q5 = _mm_shuffle_epi32(~q[1], 0x0e); - q6 = _mm_shuffle_epi32(~q[2], 0x0e); - q7 = _mm_shuffle_epi32(q[3], 0x0e); - - q[3] = _mm_unpacklo_epi64(q5 ^ q0 ^ q2, q1 ^ q4 ^ q6); - q[2] = _mm_unpacklo_epi64(q4 ^ q7 ^ q1, q0 ^ q3 ^ q5); - q[1] = _mm_unpacklo_epi64(q3 ^ q6 ^ q0, q7 ^ q2 ^ q4); - q[0] = _mm_unpacklo_epi64(q2 ^ q5 ^ q7, q6 ^ q1 ^ q3); -} - -static inline void -add_round_key(__m128i q[static 4], const uint64_t sk[static 8]) -{ - q[0] ^= _mm_set_epi64x(sk[4], sk[0]); - q[1] ^= _mm_set_epi64x(sk[5], sk[1]); - q[2] ^= _mm_set_epi64x(sk[6], sk[2]); - q[3] ^= _mm_set_epi64x(sk[7], sk[3]); -} - -static inline __m128i -inv_shift_row(__m128i q) -{ - __m128i x, y0, y1, y2, y3, y4, y5, y6; - - x = q; - y0 = x & _mm_set1_epi64x(0x000000000000FFFF); - y1 = x & _mm_set1_epi64x(0x000000000FFF0000); - y2 = x & _mm_set1_epi64x(0x00000000F0000000); - y3 = x & _mm_set1_epi64x(0x000000FF00000000); - y4 = x & _mm_set1_epi64x(0x0000FF0000000000); - y5 = x & _mm_set1_epi64x(0x000F000000000000); - y6 = x & _mm_set1_epi64x(0xFFF0000000000000); - y1 = _mm_slli_epi64(y1, 4); - y2 = _mm_srli_epi64(y2, 12); - y3 = _mm_slli_epi64(y3, 8); - y4 = _mm_srli_epi64(y4, 8); - y5 = _mm_slli_epi64(y5, 12); - y6 = _mm_srli_epi64(y6, 4); - return y0 | y1 | y2 | y3 | y4 | y5 | y6; -} - -static inline void -inv_shift_rows(__m128i q[static 4]) -{ - - q[0] = inv_shift_row(q[0]); - q[1] = inv_shift_row(q[1]); - q[2] = inv_shift_row(q[2]); - q[3] = inv_shift_row(q[3]); -} - -static inline __m128i -rotr32(__m128i x) -{ - return _mm_slli_epi64(x, 32) | _mm_srli_epi64(x, 32); -} - -static inline void -inv_mix_columns(__m128i q[4]) -{ - __m128i q0, q1, q2, q3, q4, q5, q6, q7; - __m128i r0, r1, r2, r3, r4, r5, r6, r7; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - q0 = q[0]; - q1 = q[1]; - q2 = q[2]; - q3 = q[3]; - r0 = 
_mm_srli_epi64(q0, 16) | _mm_slli_epi64(q0, 48); - r1 = _mm_srli_epi64(q1, 16) | _mm_slli_epi64(q1, 48); - r2 = _mm_srli_epi64(q2, 16) | _mm_slli_epi64(q2, 48); - r3 = _mm_srli_epi64(q3, 16) | _mm_slli_epi64(q3, 48); - - q7 = _mm_shuffle_epi32(q3, 0x0e); - q6 = _mm_shuffle_epi32(q2, 0x0e); - q5 = _mm_shuffle_epi32(q1, 0x0e); - q4 = _mm_shuffle_epi32(q0, 0x0e); - - r7 = _mm_shuffle_epi32(r3, 0x0e); - r6 = _mm_shuffle_epi32(r2, 0x0e); - r5 = _mm_shuffle_epi32(r1, 0x0e); - r4 = _mm_shuffle_epi32(r0, 0x0e); - - s0 = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr32(q0 ^ q5 ^ q6 ^ r0 ^ r5); - s1 = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); - s2 = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr32(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); - s3 = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr32(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); - s4 = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); - s5 = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); - s6 = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr32(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); - s7 = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr32(q4 ^ q5 ^ q7 ^ r4 ^ r7); - - q[0] = _mm_unpacklo_epi64(s0, s4); - q[1] = _mm_unpacklo_epi64(s1, s5); - q[2] = _mm_unpacklo_epi64(s2, s6); - q[3] = _mm_unpacklo_epi64(s3, s7); -} - -/* see inner.h */ -void -aes_sse2_bitslice_decrypt(unsigned num_rounds, - const uint64_t *skey, __m128i q[static 4]) -{ - unsigned u; - - add_round_key(q, skey + (num_rounds << 3)); - for (u = num_rounds - 1; u > 0; u --) { - inv_shift_rows(q); - aes_sse2_bitslice_invSbox(q); - add_round_key(q, skey + (u << 3)); - inv_mix_columns(q); - } - inv_shift_rows(q); - aes_sse2_bitslice_invSbox(q); - add_round_key(q, skey); -} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_enc.c --- a/sys/crypto/aes/arch/x86/aes_sse2_enc.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2016 Thomas Pornin - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2_enc.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $"); - -#include - -#include "aes_sse2_impl.h" - -static inline void -add_round_key(__m128i q[static 4], const uint64_t sk[static 8]) -{ - q[0] ^= _mm_set_epi64x(sk[4], sk[0]); - q[1] ^= _mm_set_epi64x(sk[5], sk[1]); - q[2] ^= _mm_set_epi64x(sk[6], sk[2]); - q[3] ^= _mm_set_epi64x(sk[7], sk[3]); -} - -static inline __m128i -shift_row(__m128i q) -{ - __m128i x, y0, y1, y2, y3, y4, y5, y6; - - x = q; - y0 = x & _mm_set1_epi64x(0x000000000000FFFF); - y1 = x & _mm_set1_epi64x(0x00000000FFF00000); - y2 = x & _mm_set1_epi64x(0x00000000000F0000); - y3 = x & _mm_set1_epi64x(0x0000FF0000000000); - y4 = x & _mm_set1_epi64x(0x000000FF00000000); - y5 = x & _mm_set1_epi64x(0xF000000000000000); - y6 = x & _mm_set1_epi64x(0x0FFF000000000000); - y1 = _mm_srli_epi64(y1, 4); - y2 = _mm_slli_epi64(y2, 12); - y3 = _mm_srli_epi64(y3, 8); - y4 = _mm_slli_epi64(y4, 8); - y5 = _mm_srli_epi64(y5, 12); - y6 = _mm_slli_epi64(y6, 4); - return y0 | y1 | y2 | y3 | y4 | y5 | y6; -} - -static inline void -shift_rows(__m128i q[static 4]) -{ - - q[0] = shift_row(q[0]); - q[1] = shift_row(q[1]); - q[2] = shift_row(q[2]); - q[3] = shift_row(q[3]); -} - -static inline __m128i -rotr32(__m128i x) -{ - return _mm_slli_epi64(x, 32) | _mm_srli_epi64(x, 32); -} - -static inline void -mix_columns(__m128i q[static 4]) -{ - __m128i q0, q1, q2, q3, q4, q5, q6, q7; - __m128i r0, r1, r2, r3, r4, r5, r6, r7; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - q0 = q[0]; - q1 = q[1]; - q2 = q[2]; - q3 = q[3]; - r0 = _mm_srli_epi64(q0, 16) | _mm_slli_epi64(q0, 48); - r1 = _mm_srli_epi64(q1, 16) | _mm_slli_epi64(q1, 48); - r2 = _mm_srli_epi64(q2, 16) | _mm_slli_epi64(q2, 48); - r3 = _mm_srli_epi64(q3, 16) | _mm_slli_epi64(q3, 48); - - q7 = _mm_shuffle_epi32(q3, 0x0e); - q6 = _mm_shuffle_epi32(q2, 0x0e); - q5 = _mm_shuffle_epi32(q1, 0x0e); - q4 = _mm_shuffle_epi32(q0, 0x0e); - - r7 = _mm_shuffle_epi32(r3, 0x0e); - r6 = _mm_shuffle_epi32(r2, 0x0e); - r5 = _mm_shuffle_epi32(r1, 0x0e); - r4 = _mm_shuffle_epi32(r0, 0x0e); - - s0 = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); - s1 = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); - s2 = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); - s3 = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); - s4 = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); - s5 = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); - s6 = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); - s7 = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); - - q[0] = _mm_unpacklo_epi64(s0, s4); - q[1] = _mm_unpacklo_epi64(s1, s5); - q[2] = _mm_unpacklo_epi64(s2, s6); - q[3] = _mm_unpacklo_epi64(s3, s7); -} - -void -aes_sse2_bitslice_encrypt(unsigned num_rounds, - const uint64_t *skey, __m128i q[static 4]) -{ - unsigned u; - - add_round_key(q, skey); - for (u = 1; u < num_rounds; u ++) { - aes_sse2_bitslice_Sbox(q); - shift_rows(q); - mix_columns(q); - add_round_key(q, skey + (u << 3)); - } - aes_sse2_bitslice_Sbox(q); - shift_rows(q); - add_round_key(q, skey + (num_rounds << 3)); -} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_impl.c --- a/sys/crypto/aes/arch/x86/aes_sse2_impl.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,221 +0,0 @@ -/* $NetBSD: aes_sse2_impl.c,v 1.5 2020/07/25 22:29:56 riastradh Exp $ */ - -/*- - * Copyright (c) 2020 The NetBSD Foundation, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2_impl.c,v 1.5 2020/07/25 22:29:56 riastradh Exp $"); - -#include -#include - -#include -#include -#include - -#ifdef _KERNEL -#include -#include -#include -#include -#else -#include -#define fpu_kern_enter() ((void)0) -#define fpu_kern_leave() ((void)0) -#endif - -static void -aes_sse2_setenckey_impl(struct aesenc *enc, const uint8_t *key, - uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_setkey(enc->aese_aes.aes_rk64, key, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_setdeckey_impl(struct aesdec *dec, const uint8_t *key, - uint32_t nrounds) -{ - - fpu_kern_enter(); - /* - * BearSSL computes InvMixColumns on the fly -- no need for - * distinct decryption round keys. 
- */ - aes_sse2_setkey(dec->aesd_aes.aes_rk64, key, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_enc(enc, in, out, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_dec(dec, in, out, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_cbc_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], - uint32_t nrounds) -{ - - if (nbytes == 0) - return; - fpu_kern_enter(); - aes_sse2_cbc_enc(enc, in, out, nbytes, iv, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_cbc_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], - uint32_t nrounds) -{ - - if (nbytes == 0) - return; - fpu_kern_enter(); - aes_sse2_cbc_dec(dec, in, out, nbytes, iv, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_xts_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], - uint32_t nrounds) -{ - - if (nbytes == 0) - return; - fpu_kern_enter(); - aes_sse2_xts_enc(enc, in, out, nbytes, tweak, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_xts_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], - uint32_t nrounds) -{ - - if (nbytes == 0) - return; - fpu_kern_enter(); - aes_sse2_xts_dec(dec, in, out, nbytes, tweak, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_cbcmac_update1_impl(const struct aesenc *enc, - const uint8_t in[static 16], size_t nbytes, uint8_t auth[static 16], - uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_cbcmac_update1(enc, in, nbytes, auth, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_ccm_enc1_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], - uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_ccm_enc1(enc, in, out, nbytes, authctr, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_ccm_dec1_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], - uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_ccm_dec1(enc, in, out, nbytes, authctr, nrounds); - fpu_kern_leave(); -} - -static int -aes_sse2_probe(void) -{ - int result = 0; - - /* Verify that the CPU supports SSE and SSE2. 
*/ -#ifdef _KERNEL - if (!i386_has_sse) - return -1; - if (!i386_has_sse2) - return -1; -#else - unsigned eax, ebx, ecx, edx; - if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) - return -1; - if ((edx & bit_SSE) == 0) - return -1; - if ((edx & bit_SSE2) == 0) - return -1; -#endif - - fpu_kern_enter(); - result = aes_sse2_selftest(); - fpu_kern_leave(); - - return result; -} - -struct aes_impl aes_sse2_impl = { - .ai_name = "Intel SSE2 bitsliced", - .ai_probe = aes_sse2_probe, - .ai_setenckey = aes_sse2_setenckey_impl, - .ai_setdeckey = aes_sse2_setdeckey_impl, - .ai_enc = aes_sse2_enc_impl, - .ai_dec = aes_sse2_dec_impl, - .ai_cbc_enc = aes_sse2_cbc_enc_impl, - .ai_cbc_dec = aes_sse2_cbc_dec_impl, - .ai_xts_enc = aes_sse2_xts_enc_impl, - .ai_xts_dec = aes_sse2_xts_dec_impl, - .ai_cbcmac_update1 = aes_sse2_cbcmac_update1_impl, - .ai_ccm_enc1 = aes_sse2_ccm_enc1_impl, - .ai_ccm_dec1 = aes_sse2_ccm_dec1_impl, -}; diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_impl.h --- a/sys/crypto/aes/arch/x86/aes_sse2_impl.h Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -/* $NetBSD: aes_sse2_impl.h,v 1.3 2023/08/07 01:07:36 rin Exp $ */ - -/*- - * Copyright (c) 2020 The NetBSD Foundation, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H -#define _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H - -#include - -#include -#include -#include -#include - -void aes_sse2_bitslice_Sbox(__m128i[static 4]); -void aes_sse2_bitslice_invSbox(__m128i[static 4]); -void aes_sse2_ortho(__m128i[static 4]); -__m128i aes_sse2_interleave_in(__m128i); -__m128i aes_sse2_interleave_out(__m128i); -unsigned aes_sse2_keysched(uint64_t *, const void *, size_t); -void aes_sse2_skey_expand(uint64_t *, unsigned, const uint64_t *); -void aes_sse2_bitslice_encrypt(unsigned, const uint64_t *, __m128i[static 4]); -void aes_sse2_bitslice_decrypt(unsigned, const uint64_t *, __m128i[static 4]); - -#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_subr.c --- a/sys/crypto/aes/arch/x86/aes_sse2_subr.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,711 +0,0 @@ -/* $NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $ */ - -/*- - * Copyright (c) 2020 The NetBSD Foundation, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $"); - -#ifdef _KERNEL -#include -#include -#else -#include -#include -#include -#include -#include -#define KASSERT assert -#define panic(fmt, args...) err(1, fmt, ##args) -#endif - -#include -#include - -#include "aes_sse2_impl.h" - -void -aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds) -{ - size_t key_len; - - switch (nrounds) { - case 10: - key_len = 16; - break; - case 12: - key_len = 24; - break; - case 14: - key_len = 32; - break; - default: - panic("invalid AES nrounds: %u", nrounds); - } - - aes_sse2_keysched(rk, key, key_len); -} - -void -aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load input block interleaved with garbage blocks. 
*/ - q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in)); - q[1] = q[2] = q[3] = _mm_setzero_si128(); - - /* Transform to bitslice, decrypt, transform from bitslice. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store output block. */ - _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0])); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); - - /* Load input block interleaved with garbage blocks. */ - q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in)); - q[1] = q[2] = q[3] = _mm_setzero_si128(); - - /* Transform to bitslice, decrypt, transform from bitslice. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store output block. */ - _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0])); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i cv; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load the IV. */ - cv = _mm_loadu_epi8(iv); - - for (; nbytes; nbytes -= 16, in += 16, out += 16) { - /* Load input block and apply CV. */ - q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in)); - - /* Transform to bitslice, encrypt, transform from bitslice. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Remember ciphertext as CV and store output block. */ - cv = aes_sse2_interleave_out(q[0]); - _mm_storeu_epi8(out, cv); - } - - /* Store updated IV. */ - _mm_storeu_epi8(iv, cv); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i cv, iv, w; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); - - /* Load the IV. */ - iv = _mm_loadu_epi8(ivp); - - /* Load the last cipher block. */ - cv = _mm_loadu_epi8(in + nbytes - 16); - - /* Store the updated IV. */ - _mm_storeu_epi8(ivp, cv); - - /* Process the last blocks if not an even multiple of four. */ - if (nbytes % (4*16)) { - unsigned n = (nbytes/16) % 4; - - KASSERT(n > 0); - KASSERT(n < 4); - - q[1] = q[2] = q[3] = _mm_setzero_si128(); - q[n - 1] = aes_sse2_interleave_in(cv); - switch (nbytes % 64) { - case 48: - w = _mm_loadu_epi8(in + nbytes - 32); - q[1] = aes_sse2_interleave_in(w); - w = _mm_loadu_epi8(in + nbytes - 48); - q[0] = aes_sse2_interleave_in(w); - break; - case 32: - w = _mm_loadu_epi8(in + nbytes - 32); - q[0] = aes_sse2_interleave_in(w); - break; - case 16: - break; - } - - /* Decrypt. 
*/ - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - do { - n--; - w = aes_sse2_interleave_out(q[n]); - if ((nbytes -= 16) == 0) - goto out; - cv = _mm_loadu_epi8(in + nbytes - 16); - _mm_storeu_epi8(out + nbytes, w ^ cv); - } while (n); - } - - for (;;) { - KASSERT(nbytes >= 64); - nbytes -= 64; - - /* - * 1. Set up upper cipher block from cv. - * 2. Load lower cipher block into cv and set it up. - * 3. Decrypt. - */ - q[3] = aes_sse2_interleave_in(cv); - - w = _mm_loadu_epi8(in + nbytes + 4*8); - q[2] = aes_sse2_interleave_in(w); - - w = _mm_loadu_epi8(in + nbytes + 4*4); - q[1] = aes_sse2_interleave_in(w); - - w = _mm_loadu_epi8(in + nbytes + 4*0); - q[0] = aes_sse2_interleave_in(w); - - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the upper output block. */ - w = aes_sse2_interleave_out(q[3]); - cv = _mm_loadu_epi8(in + nbytes + 4*8); - _mm_storeu_epi8(out + nbytes + 4*12, w ^ cv); - - /* Store the middle output blocks. */ - w = aes_sse2_interleave_out(q[2]); - cv = _mm_loadu_epi8(in + nbytes + 4*4); - _mm_storeu_epi8(out + nbytes + 4*8, w ^ cv); - - w = aes_sse2_interleave_out(q[1]); - cv = _mm_loadu_epi8(in + nbytes + 4*0); - _mm_storeu_epi8(out + nbytes + 4*4, w ^ cv); - - /* - * Get the first output block, but don't load the CV - * yet -- it might be the previous ciphertext block, or - * it might be the IV. - */ - w = aes_sse2_interleave_out(q[0]); - - /* Stop if we've reached the first output block. */ - if (nbytes == 0) - goto out; - - /* - * Load the preceding cipher block, and apply it as the - * chaining value to this one. - */ - cv = _mm_loadu_epi8(in + nbytes - 16); - _mm_storeu_epi8(out + nbytes, w ^ cv); - } - -out: /* Store the first output block. */ - _mm_storeu_epi8(out, w ^ iv); - - /* Paranoia: Zero temporary buffers. 
*/ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -static inline __m128i -aes_sse2_xts_update(__m128i t) -{ - const __m128i one = _mm_set_epi64x(1, 1); - __m128i s, m, c; - - s = _mm_srli_epi64(t, 63); /* 1 if high bit set else 0 */ - m = _mm_sub_epi64(s, one); /* 0 if high bit set else -1 */ - m = _mm_shuffle_epi32(m, 0x4e); /* swap halves */ - c = _mm_set_epi64x(1, 0x87); /* carry */ - - return _mm_slli_epi64(t, 1) ^ (c & ~m); -} - -static int -aes_sse2_xts_update_selftest(void) -{ - static const struct { - uint32_t in[4], out[4]; - } cases[] = { - [0] = { {1}, {2} }, - [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, - [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, - [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, - [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, - [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, - }; - unsigned i; - uint32_t t[4]; - int result = 0; - - for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { - t[0] = cases[i].in[0]; - t[1] = cases[i].in[1]; - t[2] = cases[i].in[2]; - t[3] = cases[i].in[3]; - _mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t))); - if (t[0] != cases[i].out[0] || - t[1] != cases[i].out[1] || - t[2] != cases[i].out[2] || - t[3] != cases[i].out[3]) { - printf("%s %u:" - " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", - __func__, i, t[0], t[1], t[2], t[3]); - result = -1; - } - } - - return result; -} - -void -aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i w; - __m128i t[5]; - unsigned i; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load tweak. */ - t[0] = _mm_loadu_epi8(tweak); - - /* Handle the first block separately if odd number. */ - if (nbytes % (4*16)) { - /* Load up the tweaked inputs. */ - for (i = 0; i < (nbytes/16) % 4; i++) { - w = _mm_loadu_epi8(in + 16*i) ^ t[i]; - q[i] = aes_sse2_interleave_in(w); - t[i + 1] = aes_sse2_xts_update(t[i]); - } - for (; i < 4; i++) - q[i] = _mm_setzero_si128(); - - /* Encrypt up to four blocks. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the tweaked outputs. */ - for (i = 0; i < (nbytes/16) % 4; i++) { - w = aes_sse2_interleave_out(q[i]); - _mm_storeu_epi8(out + 16*i, w ^ t[i]); - } - - /* Advance to the next block. */ - t[0] = t[i]; - in += nbytes % (4*16); - out += nbytes % (4*16); - nbytes -= nbytes % (4*16); - if (nbytes == 0) - goto out; - } - - do { - KASSERT(nbytes % 64 == 0); - KASSERT(nbytes >= 64); - - /* Load up the tweaked inputs. */ - for (i = 0; i < 4; i++) { - w = _mm_loadu_epi8(in + 16*i) ^ t[i]; - q[i] = aes_sse2_interleave_in(w); - t[i + 1] = aes_sse2_xts_update(t[i]); - } - - /* Encrypt four blocks. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the tweaked outputs. */ - for (i = 0; i < 4; i++) { - w = aes_sse2_interleave_out(q[i]); - _mm_storeu_epi8(out + 16*i, w ^ t[i]); - } - - /* Advance to the next block. */ - t[0] = t[4]; - in += 64; - out += 64; - nbytes -= 64; - } while (nbytes); - -out: /* Store the updated tweak. */ - _mm_storeu_epi8(tweak, t[0]); - - /* Paranoia: Zero temporary buffers. 
*/ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); - explicit_memset(t, 0, sizeof t); -} - -void -aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i w; - __m128i t[5]; - unsigned i; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); - - /* Load tweak. */ - t[0] = _mm_loadu_epi8(tweak); - - /* Handle the first block separately if odd number. */ - if (nbytes % (4*16)) { - /* Load up the tweaked inputs. */ - for (i = 0; i < (nbytes/16) % 4; i++) { - w = _mm_loadu_epi8(in + 16*i) ^ t[i]; - q[i] = aes_sse2_interleave_in(w); - t[i + 1] = aes_sse2_xts_update(t[i]); - } - for (; i < 4; i++) - q[i] = _mm_setzero_si128(); - - /* Decrypt up to four blocks. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the tweaked outputs. */ - for (i = 0; i < (nbytes/16) % 4; i++) { - w = aes_sse2_interleave_out(q[i]); - _mm_storeu_epi8(out + 16*i, w ^ t[i]); - } - - /* Advance to the next block. */ - t[0] = t[i]; - in += nbytes % (4*16); - out += nbytes % (4*16); - nbytes -= nbytes % (4*16); - if (nbytes == 0) - goto out; - } - - do { - KASSERT(nbytes % 64 == 0); - KASSERT(nbytes >= 64); - - /* Load up the tweaked inputs. */ - for (i = 0; i < 4; i++) { - w = _mm_loadu_epi8(in + 16*i) ^ t[i]; - q[i] = aes_sse2_interleave_in(w); - t[i + 1] = aes_sse2_xts_update(t[i]); - } - - /* Decrypt four blocks. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the tweaked outputs. */ - for (i = 0; i < 4; i++) { - w = aes_sse2_interleave_out(q[i]); - _mm_storeu_epi8(out + 16*i, w ^ t[i]); - } - - /* Advance to the next block. */ - t[0] = t[4]; - in += 64; - out += 64; - nbytes -= 64; - } while (nbytes); - -out: /* Store the updated tweak. */ - _mm_storeu_epi8(tweak, t[0]); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); - explicit_memset(t, 0, sizeof t); -} - -void -aes_sse2_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], - size_t nbytes, uint8_t auth[static 16], uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load initial authenticator. */ - q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(auth)); - - for (; nbytes; nbytes -= 16, in += 16) { - q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in)); - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - } - - /* Store updated authenticator. */ - _mm_storeu_epi8(auth, aes_sse2_interleave_out(q[0])); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i ctr; - uint32_t c0, c1, c2, c3; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. 
*/ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Set first block to authenticator. */ - q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr)); - - /* Load initial counter block, big-endian so we can increment it. */ - c0 = le32dec(authctr + 16 + 4*0); - c1 = le32dec(authctr + 16 + 4*1); - c2 = le32dec(authctr + 16 + 4*2); - c3 = be32dec(authctr + 16 + 4*3); - - /* Set other blocks to garbage -- can't take advantage. */ - q[2] = q[3] = _mm_setzero_si128(); - - for (; nbytes; nbytes -= 16, in += 16, out += 16) { - /* Update authenticator. */ - q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in)); - - /* Increment 32-bit counter. */ - ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0); - q[1] = aes_sse2_interleave_in(ctr); - - /* Encrypt authenticator and counter. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Encrypt with CTR output. */ - _mm_storeu_epi8(out, - _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[1])); - } - - /* Update authenticator. */ - _mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[0])); - - /* Update counter. */ - be32enc(authctr + 16 + 4*3, c3); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i ctr, block; - uint32_t c0, c1, c2, c3; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load initial counter block, big-endian so we can increment it. */ - c0 = le32dec(authctr + 16 + 4*0); - c1 = le32dec(authctr + 16 + 4*1); - c2 = le32dec(authctr + 16 + 4*2); - c3 = be32dec(authctr + 16 + 4*3); - - /* Increment 32-bit counter. */ - ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0); - q[0] = aes_sse2_interleave_in(ctr); - - /* - * Set the other blocks to garbage -- we don't have any - * plaintext to authenticate yet. - */ - q[1] = q[2] = q[3] = _mm_setzero_si128(); - - /* Encrypt first CTR. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Load the initial authenticator. */ - q[1] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr)); - - for (;; in += 16, out += 16) { - /* Decrypt the block. */ - block = _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[0]); - - /* Update authenticator. */ - q[1] ^= aes_sse2_interleave_in(block); - - /* Store plaintext. */ - _mm_storeu_epi8(out, block); - - /* If this is the last block, stop. */ - if ((nbytes -= 16) == 0) - break; - - /* Increment 32-bit counter. */ - ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0); - q[0] = aes_sse2_interleave_in(ctr); - - /* Authenticate previous plaintext, encrypt next CTR. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - } - - /* - * Authenticate last plaintext. We're only doing this for the - * authenticator, not for the counter, so don't bother to - * initialize q[0], q[2], q[3]. (Even for the sake of - * sanitizers, they're already initialized to something by - * now.) - */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Update authenticator. */ - _mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[1])); - - /* Update counter. 
*/ - be32enc(authctr + 16 + 4*3, c3); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -int -aes_sse2_selftest(void) -{ - - if (aes_sse2_xts_update_selftest()) - return -1; - - /* XXX test aes_sse2_bitslice_decrypt */ - /* XXX test aes_sse2_bitslice_encrypt */ - /* XXX test aes_sse2_keysched */ - /* XXX test aes_sse2_ortho */ - /* XXX test aes_sse2_skey_expand */ - - return 0; -} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/files.aessse2 --- a/sys/crypto/aes/arch/x86/files.aessse2 Sat Nov 22 05:28:14 2025 +0000 +++ b/sys/crypto/aes/arch/x86/files.aessse2 Sat Nov 22 06:22:31 2025 +0000 @@ -1,12 +1,12 @@ # $NetBSD: files.aessse2,v 1.2 2020/06/29 23:50:05 riastradh Exp $ -makeoptions aes "COPTS.aes_sse2.c"+="-msse -msse2" -makeoptions aes "COPTS.aes_sse2_dec.c"+="-msse -msse2" -makeoptions aes "COPTS.aes_sse2_enc.c"+="-msse -msse2" -makeoptions aes "COPTS.aes_sse2_subr.c"+="-msse -msse2" +makeoptions aes "COPTS.aes_sse2_4x32.c"+="-msse -msse2" +makeoptions aes "COPTS.aes_sse2_4x32_dec.c"+="-msse -msse2" +makeoptions aes "COPTS.aes_sse2_4x32_enc.c"+="-msse -msse2" +makeoptions aes "COPTS.aes_sse2_4x32_subr.c"+="-msse -msse2" -file crypto/aes/arch/x86/aes_sse2.c aes -file crypto/aes/arch/x86/aes_sse2_dec.c aes -file crypto/aes/arch/x86/aes_sse2_enc.c aes -file crypto/aes/arch/x86/aes_sse2_impl.c aes -file crypto/aes/arch/x86/aes_sse2_subr.c aes +file crypto/aes/arch/x86/aes_sse2_4x32.c aes +file crypto/aes/arch/x86/aes_sse2_4x32_dec.c aes +file crypto/aes/arch/x86/aes_sse2_4x32_enc.c aes +file crypto/aes/arch/x86/aes_sse2_4x32_impl.c aes +file crypto/aes/arch/x86/aes_sse2_4x32_subr.c aes diff -r cce15febbf04 -r 4ef139778465 tests/sys/crypto/aes/Makefile --- a/tests/sys/crypto/aes/Makefile Sat Nov 22 05:28:14 2025 +0000 +++ b/tests/sys/crypto/aes/Makefile Sat Nov 22 06:22:31 2025 +0000 @@ -65,15 +65,15 @@ SRCS.t_aes+= aes_ni.c SRCS.t_aes+= aes_ni_64.S .endif -SRCS.t_aes+= aes_sse2.c -SRCS.t_aes+= aes_sse2_dec.c -SRCS.t_aes+= aes_sse2_enc.c -SRCS.t_aes+= aes_sse2_impl.c -SRCS.t_aes+= aes_sse2_subr.c -COPTS.aes_sse2.c+= -msse -msse2 -COPTS.aes_sse2_dec.c+= -msse -msse2 -COPTS.aes_sse2_enc.c+= -msse -msse2 -COPTS.aes_sse2_subr.c+= -msse -msse2 +SRCS.t_aes+= aes_sse2_4x32.c +SRCS.t_aes+= aes_sse2_4x32_dec.c +SRCS.t_aes+= aes_sse2_4x32_enc.c +SRCS.t_aes+= aes_sse2_4x32_impl.c +SRCS.t_aes+= aes_sse2_4x32_subr.c +COPTS.aes_sse2_4x32.c+= -msse -msse2 +COPTS.aes_sse2_4x32_dec.c+= -msse -msse2 +COPTS.aes_sse2_4x32_enc.c+= -msse -msse2 +COPTS.aes_sse2_4x32_subr.c+= -msse -msse2 SRCS.t_aes+= aes_ssse3.c SRCS.t_aes+= aes_ssse3_impl.c @@ -95,7 +95,7 @@ COPTS.aes_bear64.c+= ${CC_WNO_STRINGOP_O COPTS.aes_neon_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ni.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} -COPTS.aes_sse2_subr.c+= ${CC_WNO_ARRAY_BOUNDS} +COPTS.aes_sse2_4x32_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ssse3_subr.c+=${CC_WNO_ARRAY_BOUNDS} COPTS.aes_via.c+= ${CC_WNO_ARRAY_BOUNDS} diff -r cce15febbf04 -r 4ef139778465 tests/sys/crypto/aes/t_aes.c --- a/tests/sys/crypto/aes/t_aes.c Sat Nov 22 05:28:14 2025 +0000 +++ b/tests/sys/crypto/aes/t_aes.c Sat Nov 22 06:22:31 2025 +0000 @@ -35,7 +35,7 @@ #if defined(__i386__) || defined(__x86_64__) #include -#include +#include #include #include #endif @@ -125,8 +125,8 @@ AES_SELFTEST(aes_ni_selftest, &aes_ni_im #endif #if defined(__i386__) || defined(__x86_64__) -AES_SELFTEST(aes_sse2_selftest, &aes_sse2_impl, - "Intel SSE2 bitsliced self-test") 
+AES_SELFTEST(aes_sse2_4x32_selftest, &aes_sse2_4x32_impl, + "Intel SSE2 4x32 bitsliced self-test") AES_SELFTEST(aes_ssse3_selftest, &aes_ssse3_impl, "Intel SSSE3 vpaes self-test") AES_SELFTEST(aes_via_selftest, &aes_via_impl, "VIA ACE AES self-test") @@ -151,7 +151,7 @@ ATF_TP_ADD_TCS(tp) #endif #if defined(__i386__) || defined(__x86_64__) - ATF_TP_ADD_TC(tp, aes_sse2_selftest); + ATF_TP_ADD_TC(tp, aes_sse2_4x32_selftest); ATF_TP_ADD_TC(tp, aes_ssse3_selftest); ATF_TP_ADD_TC(tp, aes_via_selftest); #endif