# HG changeset patch # User Taylor R Campbell # Date 1763785115 0 # Sat Nov 22 04:18:35 2025 +0000 # Branch trunk # Node ID 8da4be48b8762a07f5504a2b60c33d89229d236f # Parent 03d10f218fce5630dbd63fdd135fec096db9f6ca # EXP-Topic riastradh-pr59774-aesbear64 aes(9): New aes_keysched_enc/dec. These implement the standard key schedule. They are named independently of any particular AES implementation, so that: (a) we can swap between the BearSSL aes_ct and aes_ct64 code without changing all the callers who don't care which one they get, and (b) we could push it into the aes_impl abstraction if we wanted. This eliminates all br_aes_* references outside aes_bear.c, aes_ct*.c, and the new aes_keysched.c wrappers. PR kern/59774: bearssl 32-bit AES is too slow, want 64-bit optimized version in kernel diff -r 03d10f218fce -r 8da4be48b876 sys/arch/x86/x86/via_padlock.c --- a/sys/arch/x86/x86/via_padlock.c Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/arch/x86/x86/via_padlock.c Sat Nov 22 04:18:35 2025 +0000 @@ -37,7 +37,7 @@ __KERNEL_RCSID(0, "$NetBSD: via_padlock. #include #include -#include +#include #include #include @@ -176,23 +176,28 @@ via_padlock_crypto_newsession(void *arg, switch (c->cri_klen) { case 128: - br_aes_ct_keysched_stdenc(ses->ses_ekey, + /* + * XXX Is this needed? For AES-128 the + * VIA padlock instructions usually + * compute the key schedule internally. + */ + aes_keysched_enc(ses->ses_ekey, c->cri_key, 16); - br_aes_ct_keysched_stddec(ses->ses_dkey, + aes_keysched_dec(ses->ses_dkey, c->cri_key, 16); cw0 = C3_CRYPT_CWLO_KEY128; break; case 192: - br_aes_ct_keysched_stdenc(ses->ses_ekey, + aes_keysched_enc(ses->ses_ekey, c->cri_key, 24); - br_aes_ct_keysched_stddec(ses->ses_dkey, + aes_keysched_dec(ses->ses_dkey, c->cri_key, 24); cw0 = C3_CRYPT_CWLO_KEY192; break; case 256: - br_aes_ct_keysched_stdenc(ses->ses_ekey, + aes_keysched_enc(ses->ses_ekey, c->cri_key, 32); - br_aes_ct_keysched_stddec(ses->ses_dkey, + aes_keysched_dec(ses->ses_dkey, c->cri_key, 32); cw0 = C3_CRYPT_CWLO_KEY256; break; diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/aes.h --- a/sys/crypto/aes/aes.h Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/crypto/aes/aes.h Sat Nov 22 04:18:35 2025 +0000 @@ -35,7 +35,8 @@ /* * struct aes * - * Expanded round keys. + * Expanded round keys, in implementation-dependent format. (For + * the standard AES key schedule, see aes_keysched.h.) */ union aes { uint32_t aes_rk[60]; diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/aes_impl.c --- a/sys/crypto/aes/aes_impl.c Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/crypto/aes/aes_impl.c Sat Nov 22 04:18:35 2025 +0000 @@ -37,12 +37,13 @@ __KERNEL_RCSID(1, "$NetBSD: aes_impl.c,v #include #include +#include /* default implementation */ #include -#include /* default implementation */ #include +#include #include -static int aes_selftest_stdkeysched(void); +static int aes_keysched_selftest(void); static const struct aes_impl *aes_md_impl __read_mostly; static const struct aes_impl *aes_impl __read_mostly; @@ -101,7 +102,7 @@ aes_select(void) KASSERT(aes_impl == NULL); - if (aes_selftest_stdkeysched()) + if (aes_keysched_selftest()) panic("AES is busted"); if (aes_md_impl) { @@ -337,10 +338,13 @@ aes_ccm_dec1(const struct aesenc *enc, c } /* - * Known-answer self-tests for the standard key schedule. + * Known-answer self-tests for the standard key schedule, used by some + * drivers for hardware devices that compute AES encryption and + * decryption in hardware but rely on software to compute the standard + * key schedule. 
*/ static int -aes_selftest_stdkeysched(void) +aes_keysched_selftest(void) { static const uint8_t key[32] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, @@ -451,11 +455,11 @@ aes_selftest_stdkeysched(void) unsigned i; for (i = 0; i < __arraycount(C); i++) { - if (br_aes_ct_keysched_stdenc(rk, key, C[i].len) != C[i].nr) + if (aes_keysched_enc(rk, key, C[i].len) != C[i].nr) return -1; if (memcmp(rk, C[i].enc, 4*(C[i].nr + 1))) return -1; - if (br_aes_ct_keysched_stddec(rk, key, C[i].len) != C[i].nr) + if (aes_keysched_dec(rk, key, C[i].len) != C[i].nr) return -1; if (memcmp(rk, C[i].dec, 4*(C[i].nr + 1))) return -1; diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/aes_keysched.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_keysched.c Sat Nov 22 04:18:35 2025 +0000 @@ -0,0 +1,65 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include +#include + +/* + * aes_keysched_enc(rk, key, keybytes) + * + * Compute the standard AES encryption key schedule, expanding a + * 16-, 24-, or 32-byte key into 44, 52, or 60 32-bit round keys + * for encryption. Returns the number of rounds for the key of + * this length. + */ +u_int +aes_keysched_enc(uint32_t *rk, const void *key, size_t keybytes) +{ + + return br_aes_ct_keysched_stdenc(rk, key, keybytes); +} + +/* + * aes_keysched_dec(rk, key, keybytes) + * + * Compute the standard AES decryption key schedule, expanding a + * 16-, 24-, or 32-byte key into 44, 52, or 60 32-bit round keys + * and applying InvMixColumns for decryption. Returns the number + * of rounds for the key of this length. + */ +u_int +aes_keysched_dec(uint32_t *rk, const void *key, size_t keybytes) +{ + + return br_aes_ct_keysched_stddec(rk, key, keybytes); +} diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/aes_keysched.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_keysched.h Sat Nov 22 04:18:35 2025 +0000 @@ -0,0 +1,37 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_AES_KEYSCHED_H +#define _CRYPTO_AES_AES_KEYSCHED_H + +#include + +u_int aes_keysched_enc(uint32_t *, const void *, size_t); +u_int aes_keysched_dec(uint32_t *, const void *, size_t); + +#endif /* _CRYPTO_AES_AES_KEYSCHED_H */ diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/arch/x86/aes_via.c --- a/sys/crypto/aes/arch/x86/aes_via.c Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/crypto/aes/arch/x86/aes_via.c Sat Nov 22 04:18:35 2025 +0000 @@ -46,8 +46,8 @@ struct evcnt { uint64_t ev_count; }; #endif #include -#include #include +#include #ifdef _KERNEL #include @@ -107,6 +107,12 @@ aesvia_setenckey(struct aesenc *enc, con { size_t key_len; + /* + * For AES-128, VIA PadLock only needs the original key itself. + * + * For AES-192 and AES-256, VIA PadLock needs software to + * compute the standard AES key schedule. 
+ */ switch (nrounds) { case AES_128_NROUNDS: enc->aese_aes.aes_rk[0] = le32dec(key + 4*0); @@ -123,7 +129,7 @@ aesvia_setenckey(struct aesenc *enc, con default: panic("invalid AES nrounds: %u", nrounds); } - br_aes_ct_keysched_stdenc(enc->aese_aes.aes_rk, key, key_len); + aes_keysched_enc(enc->aese_aes.aes_rk, key, key_len); } static void @@ -147,7 +153,7 @@ aesvia_setdeckey(struct aesdec *dec, con default: panic("invalid AES nrounds: %u", nrounds); } - br_aes_ct_keysched_stddec(dec->aesd_aes.aes_rk, key, key_len); + aes_keysched_dec(dec->aesd_aes.aes_rk, key, key_len); } static inline void diff -r 03d10f218fce -r 8da4be48b876 sys/crypto/aes/files.aes --- a/sys/crypto/aes/files.aes Sun Oct 19 18:56:19 2025 +0000 +++ b/sys/crypto/aes/files.aes Sat Nov 22 04:18:35 2025 +0000 @@ -9,4 +9,5 @@ file crypto/aes/aes_ct.c aes file crypto/aes/aes_ct_dec.c aes file crypto/aes/aes_ct_enc.c aes file crypto/aes/aes_impl.c aes +file crypto/aes/aes_keysched.c aes file crypto/aes/aes_selftest.c aes diff -r 03d10f218fce -r 8da4be48b876 tests/sys/crypto/aes/Makefile --- a/tests/sys/crypto/aes/Makefile Sun Oct 19 18:56:19 2025 +0000 +++ b/tests/sys/crypto/aes/Makefile Sat Nov 22 04:18:35 2025 +0000 @@ -17,6 +17,7 @@ SRCS.t_aes+= aes_bear.c SRCS.t_aes+= aes_ct.c SRCS.t_aes+= aes_ct_dec.c SRCS.t_aes+= aes_ct_enc.c +SRCS.t_aes+= aes_keysched.c SRCS.t_aes+= aes_selftest.c .if !empty(MACHINE_ARCH:Mearmv7*) || !empty(MACHINE_ARCH:Maarch64*) # HG changeset patch # User Taylor R Campbell # Date 1763789294 0 # Sat Nov 22 05:28:14 2025 +0000 # Branch trunk # Node ID cce15febbf047be806a40490ef2416f043f6db13 # Parent 8da4be48b8762a07f5504a2b60c33d89229d236f # EXP-Topic riastradh-pr59774-aesbear64 aes(9): New 64-bit bitsliced implementation. Derived from BearSSL's aes_ct64 code. Enable with `options AES_BEAR64'. Should be a reasonable default on all platforms with 64-bit integer registers. Caveat: uses about 1200 bytes of stack space. (Could approximately halve that, like the BearSSL aes_ct code, at some speed cost which I haven't measured -- by moving the br_aes_ct64_skey_expand logic into add_round_key in aes_ct64_enc/dec.c.) PR kern/59774: bearssl 32-bit AES is too slow, want 64-bit optimized version in kernel diff -r 8da4be48b876 -r cce15febbf04 sys/conf/copts.mk --- a/sys/conf/copts.mk Sat Nov 22 04:18:35 2025 +0000 +++ b/sys/conf/copts.mk Sat Nov 22 05:28:14 2025 +0000 @@ -39,6 +39,7 @@ COPTS.chacha_neon.c+= -flax-vector-conve .endif .if ${MACHINE_ARCH} == "x86_64" || ${MACHINE_ARCH} == "i386" +COPTS.aes_bear64.c+= ${CC_WNO_ARRAY_BOUNDS} ${CC_WNO_STRINGOP_OVERFLOW} COPTS.aes_ni.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} COPTS.aes_sse2_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ssse3_subr.c+=${CC_WNO_ARRAY_BOUNDS} diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_bear64.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_bear64.c Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,933 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#include +#include +#define KASSERT assert +#define panic(fmt, args...) err(1, fmt, args) +#endif + +#include +#include +#include + +static void +aesbear64_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds) +{ + size_t key_len; + + switch (nrounds) { + case 10: + key_len = 16; + break; + case 12: + key_len = 24; + break; + case 14: + key_len = 32; + break; + default: + panic("invalid AES nrounds: %u", nrounds); + } + + br_aes_ct64_keysched(rk, key, key_len); +} + +static void +aesbear64_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds) +{ + + aesbear64_setkey(enc->aese_aes.aes_rk64, key, nrounds); +} + +static void +aesbear64_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds) +{ + + /* + * BearSSL computes InvMixColumns on the fly -- no need for + * distinct decryption round keys. + */ + aesbear64_setkey(dec->aesd_aes.aes_rk64, key, nrounds); +} + +static void +aesbear64_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Load input block interleaved with garbage blocks. */ + w[0] = le32dec(in + 4*0); + w[1] = le32dec(in + 4*1); + w[2] = le32dec(in + 4*2); + w[3] = le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + /* Transform to bitslice, encrypt, transform from bitslice. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store output block. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + le32enc(out + 4*0, w[0]); + le32enc(out + 4*1, w[1]); + le32enc(out + 4*2, w[2]); + le32enc(out + 4*3, w[3]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); + + /* Load input block interleaved with garbage blocks. 
*/ + w[0] = le32dec(in + 4*0); + w[1] = le32dec(in + 4*1); + w[2] = le32dec(in + 4*2); + w[3] = le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + /* Transform to bitslice, decrypt, transform from bitslice. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store output block. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + le32enc(out + 4*0, w[0]); + le32enc(out + 4*1, w[1]); + le32enc(out + 4*2, w[2]); + le32enc(out + 4*3, w[3]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t cv0, cv1, cv2, cv3; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Initialize garbage blocks. */ + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + /* Load IV. */ + cv0 = le32dec(iv + 4*0); + cv1 = le32dec(iv + 4*1); + cv2 = le32dec(iv + 4*2); + cv3 = le32dec(iv + 4*3); + + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + /* Load input block and apply CV. */ + w[0] = cv0 ^ le32dec(in + 4*0); + w[1] = cv1 ^ le32dec(in + 4*1); + w[2] = cv2 ^ le32dec(in + 4*2); + w[3] = cv3 ^ le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + /* Transform to bitslice, encrypt, transform from bitslice. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Remember ciphertext as CV and store output block. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + cv0 = w[0]; + cv1 = w[1]; + cv2 = w[2]; + cv3 = w[3]; + le32enc(out + 4*0, cv0); + le32enc(out + 4*1, cv1); + le32enc(out + 4*2, cv2); + le32enc(out + 4*3, cv3); + } + + /* Store updated IV. */ + le32enc(iv + 4*0, cv0); + le32enc(iv + 4*1, cv1); + le32enc(iv + 4*2, cv2); + le32enc(iv + 4*3, cv3); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t cv0, cv1, cv2, cv3, iv0, iv1, iv2, iv3; + unsigned i; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); + + /* Load the IV. */ + iv0 = le32dec(iv + 4*0); + iv1 = le32dec(iv + 4*1); + iv2 = le32dec(iv + 4*2); + iv3 = le32dec(iv + 4*3); + + /* Load the last cipher block. */ + cv0 = le32dec(in + nbytes - 16 + 4*0); + cv1 = le32dec(in + nbytes - 16 + 4*1); + cv2 = le32dec(in + nbytes - 16 + 4*2); + cv3 = le32dec(in + nbytes - 16 + 4*3); + + /* Store the updated IV. */ + le32enc(iv + 4*0, cv0); + le32enc(iv + 4*1, cv1); + le32enc(iv + 4*2, cv2); + le32enc(iv + 4*3, cv3); + + /* Handle the last cipher block separately if odd number. 
*/ + if (nbytes % 64) { + unsigned n = (nbytes % 64)/16; + + KASSERT(n == 1 || n == 2 || n == 3); + + for (i = 4; i --> n;) + q[i] = q[4 + i] = 0; + KASSERT(i == n - 1); + w[0] = cv0; /* le32dec(in + nbytes - 16*n + 16*i + 4*0) */ + w[1] = cv1; /* le32dec(in + nbytes - 16*n + 16*i + 4*1) */ + w[2] = cv2; /* le32dec(in + nbytes - 16*n + 16*i + 4*2) */ + w[3] = cv3; /* le32dec(in + nbytes - 16*n + 16*i + 4*3) */ + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + while (i --> 0) { + w[0] = le32dec(in + nbytes - 16*n + 16*i + 4*0); + w[1] = le32dec(in + nbytes - 16*n + 16*i + 4*1); + w[2] = le32dec(in + nbytes - 16*n + 16*i + 4*2); + w[3] = le32dec(in + nbytes - 16*n + 16*i + 4*3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + + /* Decrypt. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + for (i = n; i --> 1;) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + cv0 = le32dec(in + nbytes - 16*n + 16*(i - 1) + 4*0); + cv1 = le32dec(in + nbytes - 16*n + 16*(i - 1) + 4*1); + cv2 = le32dec(in + nbytes - 16*n + 16*(i - 1) + 4*2); + cv3 = le32dec(in + nbytes - 16*n + 16*(i - 1) + 4*3); + le32enc(out + nbytes - 16*n + 16*i + 4*0, w[0] ^ cv0); + le32enc(out + nbytes - 16*n + 16*i + 4*1, w[1] ^ cv1); + le32enc(out + nbytes - 16*n + 16*i + 4*2, w[2] ^ cv2); + le32enc(out + nbytes - 16*n + 16*i + 4*3, w[3] ^ cv3); + } + br_aes_ct64_interleave_out(w, q[0], q[4]); + + /* If this was the only cipher block, we're done. */ + nbytes -= nbytes % 64; + if (nbytes == 0) + goto out; + + /* + * Otherwise, load up the previous cipher block, and + * store the output block. + */ + cv0 = le32dec(in + nbytes - 16 + 4*0); + cv1 = le32dec(in + nbytes - 16 + 4*1); + cv2 = le32dec(in + nbytes - 16 + 4*2); + cv3 = le32dec(in + nbytes - 16 + 4*3); + le32enc(out + nbytes + 4*0, cv0 ^ w[0]); + le32enc(out + nbytes + 4*1, cv1 ^ w[1]); + le32enc(out + nbytes + 4*2, cv2 ^ w[2]); + le32enc(out + nbytes + 4*3, cv3 ^ w[3]); + } + + for (;;) { + KASSERT(nbytes >= 64); + + /* Load the input blocks. */ + w[0] = cv0; /* le32dec(in + nbytes - 64 + 16*i + 4*0) */ + w[1] = cv1; /* le32dec(in + nbytes - 64 + 16*i + 4*1) */ + w[2] = cv2; /* le32dec(in + nbytes - 64 + 16*i + 4*2) */ + w[3] = cv3; /* le32dec(in + nbytes - 64 + 16*i + 4*3) */ + br_aes_ct64_interleave_in(&q[3], &q[7], w); + for (i = 3; i --> 0;) { + w[0] = le32dec(in + nbytes - 64 + 16*i + 4*0); + w[1] = le32dec(in + nbytes - 64 + 16*i + 4*1); + w[2] = le32dec(in + nbytes - 64 + 16*i + 4*2); + w[3] = le32dec(in + nbytes - 64 + 16*i + 4*3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + + /* Decrypt. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store the upper output blocks. */ + for (i = 4; i --> 1;) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + cv0 = le32dec(in + nbytes - 64 + 16*(i - 1) + 4*0); + cv1 = le32dec(in + nbytes - 64 + 16*(i - 1) + 4*1); + cv2 = le32dec(in + nbytes - 64 + 16*(i - 1) + 4*2); + cv3 = le32dec(in + nbytes - 64 + 16*(i - 1) + 4*3); + le32enc(out + nbytes - 64 + 16*i + 4*0, w[0] ^ cv0); + le32enc(out + nbytes - 64 + 16*i + 4*1, w[1] ^ cv1); + le32enc(out + nbytes - 64 + 16*i + 4*2, w[2] ^ cv2); + le32enc(out + nbytes - 64 + 16*i + 4*3, w[3] ^ cv3); + } + + /* Prepare the first output block. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + + /* Stop if we've reached the first output block. 
*/ + nbytes -= 64; + if (nbytes == 0) + goto out; + + /* + * Load the preceding cipher block, and apply it as the + * chaining value to this one. + */ + cv0 = le32dec(in + nbytes - 16 + 4*0); + cv1 = le32dec(in + nbytes - 16 + 4*1); + cv2 = le32dec(in + nbytes - 16 + 4*2); + cv3 = le32dec(in + nbytes - 16 + 4*3); + le32enc(out + nbytes + 4*0, w[0] ^ cv0); + le32enc(out + nbytes + 4*1, w[1] ^ cv1); + le32enc(out + nbytes + 4*2, w[2] ^ cv2); + le32enc(out + nbytes + 4*3, w[3] ^ cv3); + } + +out: /* Store the first output block. */ + le32enc(out + 4*0, w[0] ^ iv0); + le32enc(out + 4*1, w[1] ^ iv1); + le32enc(out + 4*2, w[2] ^ iv2); + le32enc(out + 4*3, w[3] ^ iv3); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static inline void +aesbear64_xts_update(uint32_t *t0, uint32_t *t1, uint32_t *t2, uint32_t *t3) +{ + uint32_t s0, s1, s2, s3; + + s0 = *t0 >> 31; + s1 = *t1 >> 31; + s2 = *t2 >> 31; + s3 = *t3 >> 31; + *t0 = (*t0 << 1) ^ (-s3 & 0x87); + *t1 = (*t1 << 1) ^ s0; + *t2 = (*t2 << 1) ^ s1; + *t3 = (*t3 << 1) ^ s2; +} + +static int +aesbear64_xts_update_selftest(void) +{ + static const struct { + uint32_t in[4], out[4]; + } cases[] = { + { {1}, {2} }, + { {0x80000000U,0,0,0}, {0,1,0,0} }, + { {0,0x80000000U,0,0}, {0,0,1,0} }, + { {0,0,0x80000000U,0}, {0,0,0,1} }, + { {0,0,0,0x80000000U}, {0x87,0,0,0} }, + { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, + }; + unsigned i; + uint32_t t0, t1, t2, t3; + + for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { + t0 = cases[i].in[0]; + t1 = cases[i].in[1]; + t2 = cases[i].in[2]; + t3 = cases[i].in[3]; + aesbear64_xts_update(&t0, &t1, &t2, &t3); + if (t0 != cases[i].out[0] || + t1 != cases[i].out[1] || + t2 != cases[i].out[2] || + t3 != cases[i].out[3]) + return -1; + } + + /* Success! */ + return 0; +} + +static void +aesbear64_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t t0, t1, t2, t3, u0, u1, u2, u3; + unsigned i; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Load tweak. */ + t0 = le32dec(tweak + 4*0); + t1 = le32dec(tweak + 4*1); + t2 = le32dec(tweak + 4*2); + t3 = le32dec(tweak + 4*3); + + /* Handle the first blocks separately if odd number. */ + if (nbytes % 64) { + unsigned n = (nbytes % 64)/16; + + /* Load up the first blocks and garbage. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < n; i++) { + w[0] = le32dec(in + 16*i + 4*0) ^ u0; + w[1] = le32dec(in + 16*i + 4*1) ^ u1; + w[2] = le32dec(in + 16*i + 4*2) ^ u2; + w[3] = le32dec(in + 16*i + 4*3) ^ u3; + aesbear64_xts_update(&u0, &u1, &u2, &u3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + for (; i < 4; i++) + q[i] = q[4 + i] = 0; + + /* Encrypt up to three blocks. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store up to three blocks. 
*/ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < n; i++) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + le32enc(out + 16*i + 4*0, w[0] ^ u0); + le32enc(out + 16*i + 4*1, w[1] ^ u1); + le32enc(out + 16*i + 4*2, w[2] ^ u2); + le32enc(out + 16*i + 4*3, w[3] ^ u3); + aesbear64_xts_update(&u0, &u1, &u2, &u3); + } + + /* Advance to the next block. */ + t0 = u0, t1 = u1, t2 = u2, t3 = u3; + if ((nbytes -= 16*n) == 0) + goto out; + in += 16*n; + out += 16*n; + } + + do { + KASSERT(nbytes >= 64); + + /* Load four blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < 4; i++) { + w[0] = le32dec(in + 16*i + 4*0) ^ u0; + w[1] = le32dec(in + 16*i + 4*1) ^ u1; + w[2] = le32dec(in + 16*i + 4*2) ^ u2; + w[3] = le32dec(in + 16*i + 4*3) ^ u3; + aesbear64_xts_update(&u0, &u1, &u2, &u3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + + /* Encrypt four blocks. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store four blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < 4; i++) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + le32enc(out + 16*i + 4*0, w[0] ^ u0); + le32enc(out + 16*i + 4*1, w[1] ^ u1); + le32enc(out + 16*i + 4*2, w[2] ^ u2); + le32enc(out + 16*i + 4*3, w[3] ^ u3); + aesbear64_xts_update(&u0, &u1, &u2, &u3); + } + + /* Advance to the next pair of blocks. */ + t0 = u0, t1 = u1, t2 = u2, t3 = u3; + in += 64; + out += 64; + } while (nbytes -= 64, nbytes); + +out: /* Store the updated tweak. */ + le32enc(tweak + 4*0, t0); + le32enc(tweak + 4*1, t1); + le32enc(tweak + 4*2, t2); + le32enc(tweak + 4*3, t3); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t t0, t1, t2, t3, u0, u1, u2, u3; + unsigned i; + + KASSERT(nbytes % 16 == 0); + + /* Skip if there's nothing to do. */ + if (nbytes == 0) + return; + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); + + /* Load tweak. */ + t0 = le32dec(tweak + 4*0); + t1 = le32dec(tweak + 4*1); + t2 = le32dec(tweak + 4*2); + t3 = le32dec(tweak + 4*3); + + /* Handle the first blocks separately if odd number. */ + if (nbytes % 64) { + unsigned n = (nbytes % 64)/16; + + /* Load up the first blocks and garbage. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < n; i++) { + w[0] = le32dec(in + 16*i + 4*0) ^ u0; + w[1] = le32dec(in + 16*i + 4*1) ^ u1; + w[2] = le32dec(in + 16*i + 4*2) ^ u2; + w[3] = le32dec(in + 16*i + 4*3) ^ u3; + aesbear64_xts_update(&u0, &u1, &u2, &u3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + for (; i < 4; i++) + q[i] = q[4 + i] = 0; + + /* Decrypt up to three blocks. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store up to three blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < n; i++) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + le32enc(out + 16*i + 4*0, w[0] ^ u0); + le32enc(out + 16*i + 4*1, w[1] ^ u1); + le32enc(out + 16*i + 4*2, w[2] ^ u2); + le32enc(out + 16*i + 4*3, w[3] ^ u3); + aesbear64_xts_update(&u0, &u1, &u2, &u3); + } + + /* Advance to the next block. 
*/ + t0 = u0, t1 = u1, t2 = u2, t3 = u3; + if ((nbytes -= 16*n) == 0) + goto out; + in += 16*n; + out += 16*n; + } + + do { + KASSERT(nbytes >= 64); + + /* Load four blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < 4; i++) { + w[0] = le32dec(in + 16*i + 4*0) ^ u0; + w[1] = le32dec(in + 16*i + 4*1) ^ u1; + w[2] = le32dec(in + 16*i + 4*2) ^ u2; + w[3] = le32dec(in + 16*i + 4*3) ^ u3; + aesbear64_xts_update(&u0, &u1, &u2, &u3); + br_aes_ct64_interleave_in(&q[i], &q[4 + i], w); + } + + /* Decrypt four blocks. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Store four blocks. */ + for (i = 0, u0 = t0, u1 = t1, u2 = t2, u3 = t3; i < 4; i++) { + br_aes_ct64_interleave_out(w, q[i], q[4 + i]); + le32enc(out + 16*i + 4*0, w[0] ^ u0); + le32enc(out + 16*i + 4*1, w[1] ^ u1); + le32enc(out + 16*i + 4*2, w[2] ^ u2); + le32enc(out + 16*i + 4*3, w[3] ^ u3); + aesbear64_xts_update(&u0, &u1, &u2, &u3); + } + + /* Advance to the next pair of blocks. */ + t0 = u0, t1 = u1, t2 = u2, t3 = u3; + in += 64; + out += 64; + } while (nbytes -= 64, nbytes); + +out: /* Store the updated tweak. */ + le32enc(tweak + 4*0, t0); + le32enc(tweak + 4*1, t1); + le32enc(tweak + 4*2, t2); + le32enc(tweak + 4*3, t3); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], + size_t nbytes, uint8_t auth[static 16], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Initialize garbage blocks. */ + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + /* Load initial authenticator. */ + w[0] = le32dec(auth + 4*0); + w[1] = le32dec(auth + 4*1); + w[2] = le32dec(auth + 4*2); + w[3] = le32dec(auth + 4*3); + + for (; nbytes; nbytes -= 16, in += 16) { + /* Combine input block. */ + w[0] ^= le32dec(in + 4*0); + w[1] ^= le32dec(in + 4*1); + w[2] ^= le32dec(in + 4*2); + w[3] ^= le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + /* Transform to bitslice, encrypt, transform from bitslice. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + br_aes_ct64_interleave_out(w, q[0], q[4]); + } + + /* Store updated authenticator. */ + le32enc(auth + 4*0, w[0]); + le32enc(auth + 4*1, w[1]); + le32enc(auth + 4*2, w[2]); + le32enc(auth + 4*3, w[3]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_ccm_enc1(const struct aesenc *enc, const uint8_t *in, uint8_t *out, + size_t nbytes, uint8_t authctr[32], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t c0, c1, c2, c3be; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Initialize garbage blocks. */ + q[2] = q[3] = 0; + q[6] = q[7] = 0; + + /* Set first block to authenticator. */ + w[0] = le32dec(authctr + 4*0); + w[1] = le32dec(authctr + 4*1); + w[2] = le32dec(authctr + 4*2); + w[3] = le32dec(authctr + 4*3); + + /* Load initial counter block, big-endian so we can increment it. 
*/ + c0 = le32dec(authctr + 16 + 4*0); + c1 = le32dec(authctr + 16 + 4*1); + c2 = le32dec(authctr + 16 + 4*2); + c3be = bswap32(le32dec(authctr + 16 + 4*3)); + + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + /* Update authenticator. */ + w[0] ^= le32dec(in + 4*0); + w[1] ^= le32dec(in + 4*1); + w[2] ^= le32dec(in + 4*2); + w[3] ^= le32dec(in + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + /* Increment 32-bit counter. */ + w[0] = c0; + w[1] = c1; + w[2] = c2; + w[3] = bswap32(++c3be); + br_aes_ct64_interleave_in(&q[1], &q[5], w); + + /* Encrypt authenticator and counter. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Encrypt with CTR output. */ + br_aes_ct64_interleave_out(w, q[1], q[5]); + le32enc(out + 4*0, le32dec(in + 4*0) ^ w[0]); + le32enc(out + 4*1, le32dec(in + 4*1) ^ w[1]); + le32enc(out + 4*2, le32dec(in + 4*2) ^ w[2]); + le32enc(out + 4*3, le32dec(in + 4*3) ^ w[3]); + + /* Fish out the authenticator so far. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + } + + /* Update authenticator. */ + le32enc(authctr + 4*0, w[0]); + le32enc(authctr + 4*1, w[1]); + le32enc(authctr + 4*2, w[2]); + le32enc(authctr + 4*3, w[3]); + + /* Update counter. */ + le32enc(authctr + 16 + 4*3, bswap32(c3be)); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static void +aesbear64_ccm_dec1(const struct aesenc *enc, const uint8_t *in, uint8_t *out, + size_t nbytes, uint8_t authctr[32], uint32_t nrounds) +{ + uint64_t sk_exp[120]; + uint32_t w[4]; + uint64_t q[8]; + uint32_t c0, c1, c2, c3be; + uint32_t b0, b1, b2, b3; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + br_aes_ct64_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); + + /* Initialize garbage blocks. */ + q[2] = q[3] = 0; + q[6] = q[7] = 0; + + /* Load initial counter block, big-endian so we can increment it. */ + c0 = le32dec(authctr + 16 + 4*0); + c1 = le32dec(authctr + 16 + 4*1); + c2 = le32dec(authctr + 16 + 4*2); + c3be = bswap32(le32dec(authctr + 16 + 4*3)); + + /* Increment 32-bit counter. */ + w[0] = c0; + w[1] = c1; + w[2] = c2; + w[3] = bswap32(++c3be); + br_aes_ct64_interleave_in(&q[1], &q[5], w); + + /* + * Set the other block to garbage -- we don't have any + * plaintext to authenticate yet. + */ + q[0] = q[4] = 0; + + /* Encrypt first CTR. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Load the initial authenticator. */ + w[0] = le32dec(authctr + 4*0); + w[1] = le32dec(authctr + 4*1); + w[2] = le32dec(authctr + 4*2); + w[3] = le32dec(authctr + 4*3); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + for (;; in += 16, out += 16) { + /* Decrypt the block. */ + br_aes_ct64_interleave_out(w, q[1], q[5]); + b0 = le32dec(in + 4*0) ^ w[0]; + b1 = le32dec(in + 4*1) ^ w[1]; + b2 = le32dec(in + 4*2) ^ w[2]; + b3 = le32dec(in + 4*3) ^ w[3]; + + /* Update authenticator. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + w[0] ^= b0; + w[1] ^= b1; + w[2] ^= b2; + w[3] ^= b3; + br_aes_ct64_interleave_in(&q[0], &q[4], w); + + /* Store plaintext. */ + le32enc(out + 4*0, b0); + le32enc(out + 4*1, b1); + le32enc(out + 4*2, b2); + le32enc(out + 4*3, b3); + + /* If this is the last block, stop. */ + if ((nbytes -= 16) == 0) + break; + + /* Increment 32-bit counter. 
*/ + w[0] = c0; + w[1] = c1; + w[2] = c2; + w[3] = bswap32(++c3be); + br_aes_ct64_interleave_in(&q[1], &q[5], w); + + /* Authenticate previous plaintext, encrypt next CTR. */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + } + + /* + * Authenticate last plaintext. We're only doing this for the + * authenticator, not for the counter, so don't bother to + * initialize q[2*i]. (Even for the sake of sanitizers, + * they're already initialized to something by now.) + */ + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(nrounds, sk_exp, q); + br_aes_ct64_ortho(q); + + /* Update authenticator. */ + br_aes_ct64_interleave_out(w, q[0], q[4]); + le32enc(authctr + 4*0, w[0]); + le32enc(authctr + 4*1, w[1]); + le32enc(authctr + 4*2, w[2]); + le32enc(authctr + 4*3, w[3]); + + /* Update counter. */ + le32enc(authctr + 16 + 4*3, bswap32(c3be)); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static int +aesbear64_probe(void) +{ + + if (aesbear64_xts_update_selftest()) + return -1; + + /* XXX test br_aes_ct64_bitslice_decrypt */ + /* XXX test br_aes_ct64_bitslice_encrypt */ + /* XXX test br_aes_ct64_keysched */ + /* XXX test br_aes_ct64_ortho */ + /* XXX test br_aes_ct64_skey_expand */ + + return 0; +} + +struct aes_impl aes_bear64_impl = { + .ai_name = "BearSSL aes_ct64", + .ai_probe = aesbear64_probe, + .ai_setenckey = aesbear64_setenckey, + .ai_setdeckey = aesbear64_setdeckey, + .ai_enc = aesbear64_enc, + .ai_dec = aesbear64_dec, + .ai_cbc_enc = aesbear64_cbc_enc, + .ai_cbc_dec = aesbear64_cbc_dec, + .ai_xts_enc = aesbear64_xts_enc, + .ai_xts_dec = aesbear64_xts_dec, + .ai_cbcmac_update1 = aesbear64_cbcmac_update1, + .ai_ccm_enc1 = aesbear64_ccm_enc1, + .ai_ccm_dec1 = aesbear64_ccm_dec1, +}; diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_bear64.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_bear64.h Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,62 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _CRYPTO_AES_AES_BEAR64_H +#define _CRYPTO_AES_AES_BEAR64_H + +#include +#include + +#include + +#define br_dec32le le32dec +#define br_enc32le le32enc + +void br_aes_ct64_bitslice_Sbox(uint64_t[static 8]); +void br_aes_ct64_bitslice_invSbox(uint64_t[static 8]); +void br_aes_ct64_ortho(uint64_t[static 8]); +void br_aes_ct64_interleave_in(uint64_t[static 1], uint64_t[static 1], + const uint32_t[static 4]); +void br_aes_ct64_interleave_out(uint32_t[static 4], uint64_t, uint64_t); +u_int br_aes_ct64_keysched(uint64_t[static 30], const void *, size_t); +void br_aes_ct64_skey_expand(uint64_t[static 120], unsigned, + const uint64_t[static 30]); +void br_aes_ct64_bitslice_encrypt(unsigned, const uint64_t[static 120], + uint64_t[static 8]); +void br_aes_ct64_bitslice_decrypt(unsigned, const uint64_t[static 120], + uint64_t[static 8]); + +/* NetBSD additions */ + +void br_aes_ct64_inv_mix_columns(uint64_t[static 8]); +u_int br_aes_ct64_keysched_stdenc(uint32_t *, const void *, size_t); +u_int br_aes_ct64_keysched_stddec(uint32_t *, const void *, size_t); + +extern struct aes_impl aes_bear64_impl; + +#endif /* _CRYPTO_AES_AES_BEAR64_H */ diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_ct64.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_ct64.c Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,512 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + +#include + +static void +br_range_dec32le(uint32_t *p32, size_t nwords, const void *v) +{ + const uint8_t *p8 = v; + + while (nwords --> 0) { + uint32_t x0 = *p8++; + uint32_t x1 = *p8++; + uint32_t x2 = *p8++; + uint32_t x3 = *p8++; + + *p32++ = x0 | (x1 << 8) | (x2 << 16) | (x3 << 24); + } +} + +/* see inner.h */ +void +br_aes_ct64_bitslice_Sbox(uint64_t q[static 8]) +{ + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). 
+ */ + + uint64_t x0, x1, x2, x3, x4, x5, x6, x7; + uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint64_t y20, y21; + uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint64_t z10, z11, z12, z13, z14, z15, z16, z17; + uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint64_t t60, t61, t62, t63, t64, t65, t66, t67; + uint64_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. + */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
+ */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +/* see inner.h */ +void +br_aes_ct64_ortho(uint64_t q[static 8]) +{ +#define SWAPN(cl, ch, s, x, y) do { \ + uint64_t a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & (uint64_t)cl) | ((b & (uint64_t)cl) << (s)); \ + (y) = ((a & (uint64_t)ch) >> (s)) | (b & (uint64_t)ch); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +/* see inner.h */ +void +br_aes_ct64_interleave_in(uint64_t q0[static 1], uint64_t q1[static 1], + const uint32_t w[static 4]) +{ + uint64_t x0, x1, x2, x3; + + x0 = w[0]; + x1 = w[1]; + x2 = w[2]; + x3 = w[3]; + x0 |= (x0 << 16); + x1 |= (x1 << 16); + x2 |= (x2 << 16); + x3 |= (x3 << 16); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + x0 |= (x0 << 8); + x1 |= (x1 << 8); + x2 |= (x2 << 8); + x3 |= (x3 << 8); + x0 &= (uint64_t)0x00FF00FF00FF00FF; + x1 &= (uint64_t)0x00FF00FF00FF00FF; + x2 &= (uint64_t)0x00FF00FF00FF00FF; + x3 &= (uint64_t)0x00FF00FF00FF00FF; + *q0 = x0 | (x2 << 8); + *q1 = x1 | (x3 << 8); +} + +/* see inner.h */ +void +br_aes_ct64_interleave_out(uint32_t w[static 4], uint64_t q0, uint64_t q1) +{ + uint64_t x0, x1, x2, x3; + + x0 = q0 & (uint64_t)0x00FF00FF00FF00FF; + x1 = q1 & (uint64_t)0x00FF00FF00FF00FF; + x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x0 |= (x0 >> 8); + x1 |= (x1 >> 8); + x2 |= (x2 >> 8); + x3 |= (x3 >> 8); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16); + w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16); + w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16); + w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16); +} + +static const unsigned char Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t +sub_word(uint32_t x) +{ + uint64_t q[8]; + + memset(q, 0, sizeof q); + q[0] = x; + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_Sbox(q); + br_aes_ct64_ortho(q); + return (uint32_t)q[0]; +} + +/* see inner.h */ +unsigned +br_aes_ct64_keysched(uint64_t comp_skey[static 30], + const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[60]; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + 
break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + br_range_dec32le(skey, (key_len >> 2), key); + tmp = skey[(key_len >> 2) - 1]; + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + + for (i = 0, j = 0; i < nkf; i += 4, j += 2) { + uint64_t q[8]; + + br_aes_ct64_interleave_in(&q[0], &q[4], skey + i); + q[1] = q[0]; + q[2] = q[0]; + q[3] = q[0]; + q[5] = q[4]; + q[6] = q[4]; + q[7] = q[4]; + br_aes_ct64_ortho(q); + comp_skey[j + 0] = + (q[0] & (uint64_t)0x1111111111111111) + | (q[1] & (uint64_t)0x2222222222222222) + | (q[2] & (uint64_t)0x4444444444444444) + | (q[3] & (uint64_t)0x8888888888888888); + comp_skey[j + 1] = + (q[4] & (uint64_t)0x1111111111111111) + | (q[5] & (uint64_t)0x2222222222222222) + | (q[6] & (uint64_t)0x4444444444444444) + | (q[7] & (uint64_t)0x8888888888888888); + } + return num_rounds; +} + +/* see inner.h */ +void +br_aes_ct64_skey_expand(uint64_t skey[static 120], + unsigned num_rounds, const uint64_t comp_skey[static 30]) +{ + unsigned u, v, n; + + n = (num_rounds + 1) << 1; + for (u = 0, v = 0; u < n; u ++, v += 4) { + uint64_t x0, x1, x2, x3; + + x0 = x1 = x2 = x3 = comp_skey[u]; + x0 &= (uint64_t)0x1111111111111111; + x1 &= (uint64_t)0x2222222222222222; + x2 &= (uint64_t)0x4444444444444444; + x3 &= (uint64_t)0x8888888888888888; + x1 >>= 1; + x2 >>= 2; + x3 >>= 3; + skey[v + 0] = (x0 << 4) - x0; + skey[v + 1] = (x1 << 4) - x1; + skey[v + 2] = (x2 << 4) - x2; + skey[v + 3] = (x3 << 4) - x3; + } +} + +/* NetBSD additions, for computing the standard AES key schedule */ + +unsigned +br_aes_ct64_keysched_stdenc(uint32_t *skey, const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + tmp = 0; + for (i = 0; i < nk; i ++) { + tmp = br_dec32le((const unsigned char *)key + (i << 2)); + skey[i] = tmp; + } + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + return num_rounds; +} + +unsigned +br_aes_ct64_keysched_stddec(uint32_t *skey, const void *key, size_t key_len) +{ + uint32_t tkey[60]; + uint64_t q[8]; + unsigned num_rounds; + unsigned i; + + num_rounds = br_aes_ct64_keysched_stdenc(skey, key, key_len); + if (num_rounds == 0) + return 0; + + q[1] = q[2] = q[3] = 0; + q[5] = q[6] = q[7] = 0; + + tkey[0] = skey[4*num_rounds + 0]; + tkey[1] = skey[4*num_rounds + 1]; + tkey[2] = skey[4*num_rounds + 2]; + tkey[3] = skey[4*num_rounds + 3]; + for (i = 1; i < num_rounds; i++) { + br_aes_ct64_interleave_in(&q[0], &q[4], skey + 4*i); + br_aes_ct64_ortho(q); + br_aes_ct64_inv_mix_columns(q); + br_aes_ct64_ortho(q); + br_aes_ct64_interleave_out(&tkey[4*(num_rounds - i)], + q[0], q[4]); + } + tkey[4*num_rounds + 0] = skey[0]; + tkey[4*num_rounds + 1] = skey[1]; + tkey[4*num_rounds + 2] = skey[2]; + tkey[4*num_rounds + 3] = skey[3]; + + memcpy(skey, tkey, 4*(num_rounds + 
1)*sizeof(uint32_t)); + explicit_memset(tkey, 0, 4*(num_rounds + 1)*sizeof(uint32_t)); + return num_rounds; +} diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_ct64_dec.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_ct64_dec.c Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,174 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include + +/* see inner.h */ +void +br_aes_ct64_bitslice_invSbox(uint64_t q[static 8]) +{ + /* + * See br_aes_ct_bitslice_invSbox(). This is the natural extension + * to 64-bit registers. + */ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; + + br_aes_ct64_bitslice_Sbox(q); + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; +} + +static void +add_round_key(uint64_t q[static 8], const uint64_t sk[static 8]) +{ + int i; + + for (i = 0; i < 8; i ++) { + q[i] ^= sk[i]; + } +} + +static void +inv_shift_rows(uint64_t q[static 8]) +{ + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x000000000FFF0000) << 4) + | ((x & (uint64_t)0x00000000F0000000) >> 12) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000F000000000000) << 12) + | ((x & (uint64_t)0xFFF0000000000000) >> 4); + } +} + +static inline uint64_t +rotr32(uint64_t x) +{ + return (x << 32) | (x >> 32); +} + +static void +inv_mix_columns(uint64_t q[static 8]) +{ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + 
r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr32(q0 ^ q5 ^ q6 ^ r0 ^ r5); + q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); + q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr32(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); + q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr32(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); + q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); + q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); + q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr32(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); + q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr32(q4 ^ q5 ^ q7 ^ r4 ^ r7); +} + +/* see inner.h */ +void +br_aes_ct64_bitslice_decrypt(unsigned num_rounds, + const uint64_t skey[static 120], uint64_t q[static 8]) +{ + unsigned u; + + add_round_key(q, skey + (num_rounds << 3)); + for (u = num_rounds - 1; u > 0; u --) { + inv_shift_rows(q); + br_aes_ct64_bitslice_invSbox(q); + add_round_key(q, skey + (u << 3)); + inv_mix_columns(q); + } + inv_shift_rows(q); + br_aes_ct64_bitslice_invSbox(q); + add_round_key(q, skey); +} + +/* NetBSD addition, for generating compatible decryption keys */ +void +br_aes_ct64_inv_mix_columns(uint64_t q[static 8]) +{ + + inv_mix_columns(q); +} diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_ct64_enc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/aes_ct64_enc.c Sat Nov 22 05:28:14 2025 +0000 @@ -0,0 +1,122 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include + +static inline void +add_round_key(uint64_t q[static 8], const uint64_t sk[static 8]) +{ + q[0] ^= sk[0]; + q[1] ^= sk[1]; + q[2] ^= sk[2]; + q[3] ^= sk[3]; + q[4] ^= sk[4]; + q[5] ^= sk[5]; + q[6] ^= sk[6]; + q[7] ^= sk[7]; +} + +static inline void +shift_rows(uint64_t q[static 8]) +{ + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x00000000FFF00000) >> 4) + | ((x & (uint64_t)0x00000000000F0000) << 12) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0xF000000000000000) >> 12) + | ((x & (uint64_t)0x0FFF000000000000) << 4); + } +} + +static inline uint64_t +rotr32(uint64_t x) +{ + return (x << 32) | (x >> 32); +} + +static inline void +mix_columns(uint64_t q[static 8]) +{ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); +} + +/* see inner.h */ +void +br_aes_ct64_bitslice_encrypt(unsigned num_rounds, + const uint64_t skey[static 120], uint64_t q[static 8]) +{ + unsigned u; + + add_round_key(q, skey); + for (u = 1; u < num_rounds; u ++) { + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, skey + (u << 3)); + } + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, skey + (num_rounds << 3)); +} diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_impl.c --- a/sys/crypto/aes/aes_impl.c Sat Nov 22 04:18:35 2025 +0000 +++ b/sys/crypto/aes/aes_impl.c Sat Nov 22 05:28:14 2025 +0000 @@ -29,6 +29,10 @@ #include __KERNEL_RCSID(1, "$NetBSD: aes_impl.c,v 1.10 2022/11/05 17:36:33 jmcneill Exp $"); +#ifdef _KERNEL_OPT +#include "opt_aes.h" +#endif + #include #include #include @@ -37,12 +41,20 @@ __KERNEL_RCSID(1, "$NetBSD: aes_impl.c,v #include #include -#include /* default implementation */ #include #include #include #include +/* default implementation */ +#ifdef AES_BEAR64 +#include +static const struct aes_impl *aes_default_impl = &aes_bear64_impl; +#else +#include +static const struct aes_impl *aes_default_impl = &aes_bear_impl; +#endif + static int aes_keysched_selftest(void); static const struct aes_impl *aes_md_impl __read_mostly; @@ -113,11 +125,11 @@ aes_select(void) aes_impl = aes_md_impl; } if (aes_impl == NULL) { - if (aes_selftest(&aes_bear_impl)) + if (aes_selftest(aes_default_impl)) aprint_error("aes: self-test failed: %s\n", - aes_bear_impl.ai_name); + aes_default_impl->ai_name); else - aes_impl = &aes_bear_impl; + aes_impl = aes_default_impl; } if (aes_impl == NULL) panic("AES self-tests failed"); diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/aes_keysched.c --- a/sys/crypto/aes/aes_keysched.c Sat Nov 22 04:18:35 2025 +0000 +++ b/sys/crypto/aes/aes_keysched.c Sat Nov 22 
05:28:14 2025 +0000 @@ -29,10 +29,19 @@ #include __KERNEL_RCSID(1, "$NetBSD$"); +#ifdef _KERNEL_OPT +#include "opt_aes.h" +#endif + #include +#include + +#ifdef AES_BEAR64 +#include +#else #include -#include +#endif /* * aes_keysched_enc(rk, key, keybytes) @@ -46,7 +55,11 @@ u_int aes_keysched_enc(uint32_t *rk, const void *key, size_t keybytes) { +#ifdef AES_BEAR64 + return br_aes_ct64_keysched_stdenc(rk, key, keybytes); +#else return br_aes_ct_keysched_stdenc(rk, key, keybytes); +#endif } /* @@ -61,5 +74,9 @@ u_int aes_keysched_dec(uint32_t *rk, const void *key, size_t keybytes) { +#ifdef AES_BEAR64 + return br_aes_ct64_keysched_stddec(rk, key, keybytes); +#else return br_aes_ct_keysched_stddec(rk, key, keybytes); +#endif } diff -r 8da4be48b876 -r cce15febbf04 sys/crypto/aes/files.aes --- a/sys/crypto/aes/files.aes Sat Nov 22 04:18:35 2025 +0000 +++ b/sys/crypto/aes/files.aes Sat Nov 22 05:28:14 2025 +0000 @@ -2,12 +2,18 @@ define aes -file crypto/aes/aes_bear.c aes +defflag opt_aes.h AES_BEAR64 + +file crypto/aes/aes_bear.c aes & !aes_bear64 +file crypto/aes/aes_bear64.c aes & aes_bear64 file crypto/aes/aes_ccm.c aes file crypto/aes/aes_ccm_mbuf.c aes -file crypto/aes/aes_ct.c aes -file crypto/aes/aes_ct_dec.c aes -file crypto/aes/aes_ct_enc.c aes +file crypto/aes/aes_ct.c aes & !aes_bear64 +file crypto/aes/aes_ct64.c aes & aes_bear64 +file crypto/aes/aes_ct64_dec.c aes & aes_bear64 +file crypto/aes/aes_ct64_enc.c aes & aes_bear64 +file crypto/aes/aes_ct_dec.c aes & !aes_bear64 +file crypto/aes/aes_ct_enc.c aes & !aes_bear64 file crypto/aes/aes_impl.c aes file crypto/aes/aes_keysched.c aes file crypto/aes/aes_selftest.c aes diff -r 8da4be48b876 -r cce15febbf04 tests/sys/crypto/aes/Makefile --- a/tests/sys/crypto/aes/Makefile Sat Nov 22 04:18:35 2025 +0000 +++ b/tests/sys/crypto/aes/Makefile Sat Nov 22 05:28:14 2025 +0000 @@ -17,6 +17,12 @@ SRCS.t_aes+= aes_bear.c SRCS.t_aes+= aes_ct.c SRCS.t_aes+= aes_ct_dec.c SRCS.t_aes+= aes_ct_enc.c + +SRCS.t_aes+= aes_bear64.c +SRCS.t_aes+= aes_ct64.c +SRCS.t_aes+= aes_ct64_dec.c +SRCS.t_aes+= aes_ct64_enc.c + SRCS.t_aes+= aes_keysched.c SRCS.t_aes+= aes_selftest.c @@ -85,6 +91,7 @@ WARNS= 5 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110878 COPTS.aes_armv8.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} COPTS.aes_bear.c+= ${CC_WNO_STRINGOP_OVERFLOW} ${CC_WNO_ARRAY_BOUNDS} +COPTS.aes_bear64.c+= ${CC_WNO_STRINGOP_OVERFLOW} ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_neon_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ni.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} diff -r 8da4be48b876 -r cce15febbf04 tests/sys/crypto/aes/t_aes.c --- a/tests/sys/crypto/aes/t_aes.c Sat Nov 22 04:18:35 2025 +0000 +++ b/tests/sys/crypto/aes/t_aes.c Sat Nov 22 05:28:14 2025 +0000 @@ -30,6 +30,7 @@ #include #include +#include #include #if defined(__i386__) || defined(__x86_64__) @@ -71,6 +72,28 @@ ATF_TC_BODY(aes_ct_selftest, tc) atf_tc_fail("BearSSL aes_ct self-test failed"); } +ATF_TC(aes_ct64_selftest); +ATF_TC_HEAD(aes_ct64_selftest, tc) +{ + + atf_tc_set_md_var(tc, "descr", "BearSSL aes_ct64 tests"); +} + +ATF_TC_BODY(aes_ct64_selftest, tc) +{ + + if (aes_bear64_impl.ai_probe()) { + /* + * aes_ct64 is the portable software fallback for LP64 + * platforms, so probe should never fail. 
+ */ + atf_tc_fail("BearSSL aes_ct probe64 failed"); + } + + if (aes_selftest(&aes_bear64_impl)) + atf_tc_fail("BearSSL aes_ct64 self-test failed"); +} + #define AES_SELFTEST(name, impl, descr) \ ATF_TC(name); \ ATF_TC_HEAD(name, tc) \ @@ -113,6 +136,7 @@ ATF_TP_ADD_TCS(tp) { ATF_TP_ADD_TC(tp, aes_ct_selftest); + ATF_TP_ADD_TC(tp, aes_ct64_selftest); #ifdef __aarch64__ ATF_TP_ADD_TC(tp, aes_armv8_selftest); # HG changeset patch # User Taylor R Campbell # Date 1763792551 0 # Sat Nov 22 06:22:31 2025 +0000 # Branch trunk # Node ID 4ef13977846541a224c93d56f4044dadb35ef919 # Parent cce15febbf047be806a40490ef2416f043f6db13 # EXP-Topic riastradh-pr59774-aesbear64 aes(9): Rewrite x86 SSE2 implementation. This computes eight AES_k instances simultaneously, using the bitsliced 32-bit aes_ct logic which computes two blocks at a time in uint32_t arithmetic, vectorized four ways. Previously, the SSE2 code was a very naive adaptation of aes_ct64, which computes four blocks at a time in uint64_t arithmetic, without any 2x vectorization -- I did it at the time because: (a) it was easier to get working, (b) it only affects really old hardware with neither AES-NI nor SSSE3 which are both much much faster. But it was bugging me that this was a kind of dumb use of SSE2. Substantially reduces stack usage (from ~1200 bytes to ~800 bytes) and should approximately double throughput for CBC decryption and for XTS encryption/decryption. I also tried a 2x64 version but cursory performance measurements didn't reveal much benefit over 4x32. (If anyone is interested in doing more serious performance measurements, on ancient hardware for which it might matter, I also have the 2x64 code around.) Prompted by: PR kern/59774: bearssl 32-bit AES is too slow, want 64-bit optimized version in kernel diff -r cce15febbf04 -r 4ef139778465 sys/arch/x86/x86/identcpu.c --- a/sys/arch/x86/x86/identcpu.c Sat Nov 22 05:28:14 2025 +0000 +++ b/sys/arch/x86/x86/identcpu.c Sat Nov 22 06:22:31 2025 +0000 @@ -42,7 +42,7 @@ __KERNEL_RCSID(0, "$NetBSD: identcpu.c,v #include #include -#include +#include #include #include #include @@ -1011,7 +1011,7 @@ cpu_probe(struct cpu_info *ci) (cpu_feature[1] & CPUID2_SSSE3)) aes_md_init(&aes_ssse3_impl); else if (i386_has_sse && i386_has_sse2) - aes_md_init(&aes_sse2_impl); + aes_md_init(&aes_sse2_4x32_impl); /* ChaCha */ if (i386_has_sse && i386_has_sse2) diff -r cce15febbf04 -r 4ef139778465 sys/conf/copts.mk --- a/sys/conf/copts.mk Sat Nov 22 05:28:14 2025 +0000 +++ b/sys/conf/copts.mk Sat Nov 22 06:22:31 2025 +0000 @@ -41,7 +41,7 @@ COPTS.chacha_neon.c+= -flax-vector-conve .if ${MACHINE_ARCH} == "x86_64" || ${MACHINE_ARCH} == "i386" COPTS.aes_bear64.c+= ${CC_WNO_ARRAY_BOUNDS} ${CC_WNO_STRINGOP_OVERFLOW} COPTS.aes_ni.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} -COPTS.aes_sse2_subr.c+= ${CC_WNO_ARRAY_BOUNDS} +COPTS.aes_sse2_4x32_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ssse3_subr.c+=${CC_WNO_ARRAY_BOUNDS} COPTS.aes_via.c+= ${CC_WNO_ARRAY_BOUNDS} .endif diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2.c --- a/sys/crypto/aes/arch/x86/aes_sse2.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,403 +0,0 @@ -/* - * Copyright (c) 2016 Thomas Pornin - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * 
distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $"); - -#include - -#ifdef _KERNEL -#include -#else -#include -#include -#endif - -#include "aes_sse2_impl.h" - -static void -br_range_dec32le(uint32_t *p32, size_t nwords, const void *v) -{ - const uint8_t *p8 = v; - - while (nwords --> 0) { - uint32_t x0 = *p8++; - uint32_t x1 = *p8++; - uint32_t x2 = *p8++; - uint32_t x3 = *p8++; - - *p32++ = x0 | (x1 << 8) | (x2 << 16) | (x3 << 24); - } -} - -void -aes_sse2_bitslice_Sbox(__m128i q[static 4]) -{ - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i y1, y2, y3, y4, y5, y6, y7, y8, y9; - __m128i y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; - __m128i y20, y21; - __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; - __m128i z10, z11, z12, z13, z14, z15, z16, z17; - __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - __m128i t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; - __m128i t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; - __m128i t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; - __m128i t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; - __m128i t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; - __m128i t60, t61, t62, t63, t64, t65, t66, t67; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - x0 = _mm_shuffle_epi32(q[3], 0x0e); - x1 = _mm_shuffle_epi32(q[2], 0x0e); - x2 = _mm_shuffle_epi32(q[1], 0x0e); - x3 = _mm_shuffle_epi32(q[0], 0x0e); - x4 = q[3]; - x5 = q[2]; - x6 = q[1]; - x7 = q[0]; - - /* - * Top linear transformation. - */ - y14 = x3 ^ x5; - y13 = x0 ^ x6; - y9 = x0 ^ x3; - y8 = x0 ^ x5; - t0 = x1 ^ x2; - y1 = t0 ^ x7; - y4 = y1 ^ x3; - y12 = y13 ^ y14; - y2 = y1 ^ x0; - y5 = y1 ^ x6; - y3 = y5 ^ y8; - t1 = x4 ^ y12; - y15 = t1 ^ x5; - y20 = t1 ^ x1; - y6 = y15 ^ x7; - y10 = y15 ^ t0; - y11 = y20 ^ y9; - y7 = x7 ^ y11; - y17 = y10 ^ y11; - y19 = y10 ^ y8; - y16 = t0 ^ y11; - y21 = y13 ^ y16; - y18 = x0 ^ y16; - - /* - * Non-linear section. 
- */ - t2 = y12 & y15; - t3 = y3 & y6; - t4 = t3 ^ t2; - t5 = y4 & x7; - t6 = t5 ^ t2; - t7 = y13 & y16; - t8 = y5 & y1; - t9 = t8 ^ t7; - t10 = y2 & y7; - t11 = t10 ^ t7; - t12 = y9 & y11; - t13 = y14 & y17; - t14 = t13 ^ t12; - t15 = y8 & y10; - t16 = t15 ^ t12; - t17 = t4 ^ t14; - t18 = t6 ^ t16; - t19 = t9 ^ t14; - t20 = t11 ^ t16; - t21 = t17 ^ y20; - t22 = t18 ^ y19; - t23 = t19 ^ y21; - t24 = t20 ^ y18; - - t25 = t21 ^ t22; - t26 = t21 & t23; - t27 = t24 ^ t26; - t28 = t25 & t27; - t29 = t28 ^ t22; - t30 = t23 ^ t24; - t31 = t22 ^ t26; - t32 = t31 & t30; - t33 = t32 ^ t24; - t34 = t23 ^ t33; - t35 = t27 ^ t33; - t36 = t24 & t35; - t37 = t36 ^ t34; - t38 = t27 ^ t36; - t39 = t29 & t38; - t40 = t25 ^ t39; - - t41 = t40 ^ t37; - t42 = t29 ^ t33; - t43 = t29 ^ t40; - t44 = t33 ^ t37; - t45 = t42 ^ t41; - z0 = t44 & y15; - z1 = t37 & y6; - z2 = t33 & x7; - z3 = t43 & y16; - z4 = t40 & y1; - z5 = t29 & y7; - z6 = t42 & y11; - z7 = t45 & y17; - z8 = t41 & y10; - z9 = t44 & y12; - z10 = t37 & y3; - z11 = t33 & y4; - z12 = t43 & y13; - z13 = t40 & y5; - z14 = t29 & y2; - z15 = t42 & y9; - z16 = t45 & y14; - z17 = t41 & y8; - - /* - * Bottom linear transformation. - */ - t46 = z15 ^ z16; - t47 = z10 ^ z11; - t48 = z5 ^ z13; - t49 = z9 ^ z10; - t50 = z2 ^ z12; - t51 = z2 ^ z5; - t52 = z7 ^ z8; - t53 = z0 ^ z3; - t54 = z6 ^ z7; - t55 = z16 ^ z17; - t56 = z12 ^ t48; - t57 = t50 ^ t53; - t58 = z4 ^ t46; - t59 = z3 ^ t54; - t60 = t46 ^ t57; - t61 = z14 ^ t57; - t62 = t52 ^ t58; - t63 = t49 ^ t58; - t64 = z4 ^ t59; - t65 = t61 ^ t62; - t66 = z1 ^ t63; - s0 = t59 ^ t63; - s6 = t56 ^ ~t62; - s7 = t48 ^ ~t60; - t67 = t64 ^ t65; - s3 = t53 ^ t66; - s4 = t51 ^ t66; - s5 = t47 ^ t65; - s1 = t64 ^ ~s3; - s2 = t55 ^ ~t67; - - q[3] = _mm_unpacklo_epi64(s4, s0); - q[2] = _mm_unpacklo_epi64(s5, s1); - q[1] = _mm_unpacklo_epi64(s6, s2); - q[0] = _mm_unpacklo_epi64(s7, s3); -} - -void -aes_sse2_ortho(__m128i q[static 4]) -{ -#define SWAPN(cl, ch, s, x, y) do { \ - __m128i a, b; \ - a = (x); \ - b = (y); \ - (x) = (a & _mm_set1_epi64x(cl)) | \ - _mm_slli_epi64(b & _mm_set1_epi64x(cl), (s)); \ - (y) = _mm_srli_epi64(a & _mm_set1_epi64x(ch), (s)) | \ - (b & _mm_set1_epi64x(ch)); \ - } while (0) - -#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) -#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) -#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) - - SWAP2(q[0], q[1]); - SWAP2(q[2], q[3]); - - SWAP4(q[0], q[2]); - SWAP4(q[1], q[3]); - - __m128i q0 = q[0]; - __m128i q1 = q[1]; - __m128i q2 = q[2]; - __m128i q3 = q[3]; - __m128i q4 = _mm_shuffle_epi32(q[0], 0x0e); - __m128i q5 = _mm_shuffle_epi32(q[1], 0x0e); - __m128i q6 = _mm_shuffle_epi32(q[2], 0x0e); - __m128i q7 = _mm_shuffle_epi32(q[3], 0x0e); - SWAP8(q0, q4); - SWAP8(q1, q5); - SWAP8(q2, q6); - SWAP8(q3, q7); - q[0] = _mm_unpacklo_epi64(q0, q4); - q[1] = _mm_unpacklo_epi64(q1, q5); - q[2] = _mm_unpacklo_epi64(q2, q6); - q[3] = _mm_unpacklo_epi64(q3, q7); -} - -__m128i -aes_sse2_interleave_in(__m128i w) -{ - __m128i lo, hi; - - lo = _mm_shuffle_epi32(w, 0x10); - hi = _mm_shuffle_epi32(w, 0x32); - lo &= _mm_set1_epi64x(0x00000000FFFFFFFF); - hi &= _mm_set1_epi64x(0x00000000FFFFFFFF); - lo |= _mm_slli_epi64(lo, 16); - hi |= _mm_slli_epi64(hi, 16); - lo &= _mm_set1_epi32(0x0000FFFF); - hi &= _mm_set1_epi32(0x0000FFFF); - lo |= _mm_slli_epi64(lo, 8); - hi |= _mm_slli_epi64(hi, 8); - lo &= _mm_set1_epi16(0x00FF); - hi &= _mm_set1_epi16(0x00FF); - return lo | _mm_slli_epi64(hi, 8); -} - 
-__m128i -aes_sse2_interleave_out(__m128i q) -{ - __m128i lo, hi; - - lo = q; - hi = _mm_srli_si128(q, 1); - lo &= _mm_set1_epi16(0x00FF); - hi &= _mm_set1_epi16(0x00FF); - lo |= _mm_srli_epi64(lo, 8); - hi |= _mm_srli_epi64(hi, 8); - lo &= _mm_set1_epi32(0x0000FFFF); - hi &= _mm_set1_epi32(0x0000FFFF); - lo |= _mm_srli_epi64(lo, 16); - hi |= _mm_srli_epi64(hi, 16); - return (__m128i)_mm_shuffle_ps((__m128)lo, (__m128)hi, 0x88); -} - -static const unsigned char Rcon[] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 -}; - -static uint32_t -sub_word(uint32_t x) -{ - __m128i q[4]; - uint32_t y; - - memset(q, 0, sizeof(q)); - q[0] = _mm_loadu_si32(&x); - aes_sse2_ortho(q); - aes_sse2_bitslice_Sbox(q); - aes_sse2_ortho(q); - _mm_storeu_si32(&y, q[0]); - return y; -} - -unsigned -aes_sse2_keysched(uint64_t *comp_skey, const void *key, size_t key_len) -{ - unsigned num_rounds; - int i, j, k, nk, nkf; - uint32_t tmp; - uint32_t skey[60]; - - switch (key_len) { - case 16: - num_rounds = 10; - break; - case 24: - num_rounds = 12; - break; - case 32: - num_rounds = 14; - break; - default: - /* abort(); */ - return 0; - } - nk = (int)(key_len >> 2); - nkf = (int)((num_rounds + 1) << 2); - br_range_dec32le(skey, (key_len >> 2), key); - tmp = skey[(key_len >> 2) - 1]; - for (i = nk, j = 0, k = 0; i < nkf; i ++) { - if (j == 0) { - tmp = (tmp << 24) | (tmp >> 8); - tmp = sub_word(tmp) ^ Rcon[k]; - } else if (nk > 6 && j == 4) { - tmp = sub_word(tmp); - } - tmp ^= skey[i - nk]; - skey[i] = tmp; - if (++ j == nk) { - j = 0; - k ++; - } - } - - for (i = 0, j = 0; i < nkf; i += 4, j += 2) { - __m128i q[4], q0, q1, q2, q3, q4, q5, q6, q7; - __m128i w; - - w = _mm_loadu_epi8(skey + i); - q[0] = q[1] = q[2] = q[3] = aes_sse2_interleave_in(w); - aes_sse2_ortho(q); - q0 = q[0] & _mm_set1_epi64x(0x1111111111111111); - q1 = q[1] & _mm_set1_epi64x(0x2222222222222222); - q2 = q[2] & _mm_set1_epi64x(0x4444444444444444); - q3 = q[3] & _mm_set1_epi64x(0x8888888888888888); - q4 = _mm_shuffle_epi32(q0, 0x0e); - q5 = _mm_shuffle_epi32(q1, 0x0e); - q6 = _mm_shuffle_epi32(q2, 0x0e); - q7 = _mm_shuffle_epi32(q3, 0x0e); - _mm_storeu_si64(&comp_skey[j + 0], q0 | q1 | q2 | q3); - _mm_storeu_si64(&comp_skey[j + 1], q4 | q5 | q6 | q7); - } - return num_rounds; -} - -void -aes_sse2_skey_expand(uint64_t *skey, - unsigned num_rounds, const uint64_t *comp_skey) -{ - unsigned u, v, n; - - n = (num_rounds + 1) << 1; - for (u = 0, v = 0; u < n; u ++, v += 4) { - __m128i x0, x1, x2, x3; - - x0 = x1 = x2 = x3 = _mm_loadu_si64(&comp_skey[u]); - x0 &= 0x1111111111111111; - x1 &= 0x2222222222222222; - x2 &= 0x4444444444444444; - x3 &= 0x8888888888888888; - x1 = _mm_srli_epi64(x1, 1); - x2 = _mm_srli_epi64(x2, 2); - x3 = _mm_srli_epi64(x3, 3); - x0 = _mm_sub_epi64(_mm_slli_epi64(x0, 4), x0); - x1 = _mm_sub_epi64(_mm_slli_epi64(x1, 4), x1); - x2 = _mm_sub_epi64(_mm_slli_epi64(x2, 4), x2); - x3 = _mm_sub_epi64(_mm_slli_epi64(x3, 4), x3); - _mm_storeu_si64(&skey[v + 0], x0); - _mm_storeu_si64(&skey[v + 1], x1); - _mm_storeu_si64(&skey[v + 2], x2); - _mm_storeu_si64(&skey[v + 3], x3); - } -} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2.h --- a/sys/crypto/aes/arch/x86/aes_sse2.h Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ -/* $NetBSD: aes_sse2.h,v 1.4 2020/07/25 22:29:56 riastradh Exp $ */ - -/*- - * Copyright (c) 2020 The NetBSD Foundation, Inc. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_H -#define _CRYPTO_AES_ARCH_X86_AES_SSE2_H - -#include - -#include - -struct aesenc; -struct aesdec; - -/* - * These functions MUST NOT use any vector registers for parameters or - * results -- the caller is compiled with -mno-sse &c. in the kernel, - * and dynamically turns on the vector unit just before calling them. - * Internal subroutines that use the vector unit for parameters are - * declared in aes_sse2_impl.h instead. 
- */ - -void aes_sse2_setkey(uint64_t[static 30], const void *, uint32_t); - -void aes_sse2_enc(const struct aesenc *, const uint8_t in[static 16], - uint8_t[static 16], uint32_t); -void aes_sse2_dec(const struct aesdec *, const uint8_t in[static 16], - uint8_t[static 16], uint32_t); -void aes_sse2_cbc_enc(const struct aesenc *, const uint8_t[static 16], - uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); -void aes_sse2_cbc_dec(const struct aesdec *, const uint8_t[static 16], - uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); -void aes_sse2_xts_enc(const struct aesenc *, const uint8_t[static 16], - uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); -void aes_sse2_xts_dec(const struct aesdec *, const uint8_t[static 16], - uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); -void aes_sse2_cbcmac_update1(const struct aesenc *, const uint8_t[static 16], - size_t, uint8_t[static 16], uint32_t); -void aes_sse2_ccm_enc1(const struct aesenc *, const uint8_t[static 16], - uint8_t[static 16], size_t, uint8_t[static 32], uint32_t); -void aes_sse2_ccm_dec1(const struct aesenc *, const uint8_t[static 16], - uint8_t[static 16], size_t, uint8_t[static 32], uint32_t); - -int aes_sse2_selftest(void); - -extern struct aes_impl aes_sse2_impl; - -#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,352 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + +#include "aes_sse2_4x32_impl.h" + +/* see inner.h */ +void +aes_sse2_4x32_bitslice_Sbox(__m128i q[static 8]) +{ + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). 
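+	 *
+	 * Each q[i] here is an __m128i holding bit-slice i (bit i of
+	 * every state byte) for up to eight blocks at once: each 32-bit
+	 * lane carries the two-block bitsliced layout of the scalar
+	 * aes_ct code, vectorized four ways across the lanes.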
+ */ + + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i y1, y2, y3, y4, y5, y6, y7, y8, y9; + __m128i y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + __m128i y20, y21; + __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + __m128i z10, z11, z12, z13, z14, z15, z16, z17; + __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + __m128i t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + __m128i t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + __m128i t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + __m128i t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + __m128i t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + __m128i t60, t61, t62, t63, t64, t65, t66, t67; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. + */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
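+	 * The complemented outputs (s1, s2, s6, s7) fold in the S-box's
+	 * final XOR with the constant 0x63.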
+ */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +/* see inner.h */ +void +aes_sse2_4x32_ortho(__m128i q[static 8]) +{ +#define SWAPN(cl, ch, s, x, y) do { \ + __m128i cl128 = _mm_set1_epi32(cl); \ + __m128i ch128 = _mm_set1_epi32(ch); \ + __m128i a, b; \ + a = _mm_load_si128(&(x)); \ + b = _mm_load_si128(&(y)); \ + _mm_store_si128(&(x), \ + (a & cl128) | _mm_slli_epi32((b & cl128), (s))); \ + _mm_store_si128(&(y), \ + _mm_srli_epi32((a & ch128), (s)) | (b & ch128)); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x55555555, 0xAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x33333333, 0xCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F, 0xF0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +static const unsigned char Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t +sub_word(uint32_t x) +{ + __m128i q[8]; + uint32_t y; + + memset(q, 0, sizeof(q)); + q[0] = _mm_loadu_si32(&x); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_Sbox(q); + aes_sse2_4x32_ortho(q); + _mm_storeu_si32(&y, q[0]); + return y; +} + +/* see inner.h */ +unsigned +aes_sse2_4x32_keysched(uint32_t comp_skey[static 60], const void *key, + size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[120]; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + tmp = 0; + for (i = 0; i < nk; i ++) { + tmp = br_dec32le((const unsigned char *)key + (i << 2)); + skey[(i << 1) + 0] = tmp; + skey[(i << 1) + 1] = tmp; + } + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[(i - nk) << 1]; + skey[(i << 1) + 0] = tmp; + skey[(i << 1) + 1] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + for (i = 0; i < nkf; i += 4) { + __m128i q[8]; + + for (j = 0; j < 8; j++) + q[j] = _mm_loadu_si32(&skey[(i << 1) + j]); + aes_sse2_4x32_ortho(q); + for (j = 0; j < 8; j++) + _mm_storeu_si32(&skey[(i << 1) + j], q[j]); + } + for (i = 0, j = 0; i < nkf; i ++, j += 2) { + comp_skey[i] = (skey[j + 0] & 0x55555555) + | (skey[j + 1] & 0xAAAAAAAA); + } + return num_rounds; +} + +/* see inner.h */ +void +aes_sse2_4x32_skey_expand(uint32_t skey[static 120], + unsigned num_rounds, const uint32_t comp_skey[static 60]) +{ + unsigned u, v, n; + + n = (num_rounds + 1) << 2; + for (u = 0, v = 0; u < n; u ++, v += 2) { + uint32_t x, y; + + x = y = comp_skey[u]; + x &= 0x55555555; + skey[v + 0] = x | (x 
<< 1); + y &= 0xAAAAAAAA; + skey[v + 1] = y | (y >> 1); + } +} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32.h Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,36 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_H +#define _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_H + +#include + +extern struct aes_impl aes_sse2_4x32_impl; + +#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_dec.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_dec.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,195 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include "aes_sse2_4x32_impl.h" + +/* see inner.h */ +void +aes_sse2_4x32_bitslice_invSbox(__m128i q[static 8]) +{ + /* + * AES S-box is: + * S(x) = A(I(x)) ^ 0x63 + * where I() is inversion in GF(256), and A() is a linear + * transform (0 is formally defined to be its own inverse). + * Since inversion is an involution, the inverse S-box can be + * computed from the S-box as: + * iS(x) = B(S(B(x ^ 0x63)) ^ 0x63) + * where B() is the inverse of A(). Indeed, for any y in GF(256): + * iS(S(y)) = B(A(I(B(A(I(y)) ^ 0x63 ^ 0x63))) ^ 0x63 ^ 0x63) = y + * + * Note: we reuse the implementation of the forward S-box, + * instead of duplicating it here, so that total code size is + * lower. By merging the B() transforms into the S-box circuit + * we could make faster CBC decryption, but CBC decryption is + * already quite faster than CBC encryption because we can + * process two blocks in parallel. + */ + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; + + aes_sse2_4x32_bitslice_Sbox(q); + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; +} + +static void +add_round_key(__m128i q[static 8], const uint32_t sk[static 8]) +{ + + q[0] ^= _mm_set1_epi32(sk[0]); + q[1] ^= _mm_set1_epi32(sk[1]); + q[2] ^= _mm_set1_epi32(sk[2]); + q[3] ^= _mm_set1_epi32(sk[3]); + q[4] ^= _mm_set1_epi32(sk[4]); + q[5] ^= _mm_set1_epi32(sk[5]); + q[6] ^= _mm_set1_epi32(sk[6]); + q[7] ^= _mm_set1_epi32(sk[7]); +} + +static inline __m128i +inv_shift_row(__m128i q) +{ + __m128i x, y0, y1, y2, y3, y4, y5, y6; + + x = q; + y0 = x & _mm_set1_epi32(0x000000FF); + y1 = _mm_slli_epi32(x & _mm_set1_epi32(0x00003F00), 2); + y2 = _mm_srli_epi32(x & _mm_set1_epi32(0x0000C000), 6); + y3 = _mm_slli_epi32(x & _mm_set1_epi32(0x000F0000), 4); + y4 = _mm_srli_epi32(x & _mm_set1_epi32(0x00F00000), 4); + y5 = _mm_slli_epi32(x & _mm_set1_epi32(0x03000000), 6); + y6 = _mm_srli_epi32(x & _mm_set1_epi32(0xFC000000), 2); + return y0 | y1 | y2 | y3 | y4 | y5 | y6; +} + +static void +inv_shift_rows(__m128i *q) +{ + + q[0] = inv_shift_row(q[0]); + q[1] = inv_shift_row(q[1]); + q[2] = inv_shift_row(q[2]); + q[3] = inv_shift_row(q[3]); + q[4] = inv_shift_row(q[4]); + q[5] = inv_shift_row(q[5]); + q[6] = inv_shift_row(q[6]); + q[7] = inv_shift_row(q[7]); +} + +static inline __m128i +rotr16(__m128i x) +{ + return _mm_slli_epi32(x, 16) | _mm_srli_epi32(x, 16); +} + +static void +inv_mix_columns(__m128i q[static 8]) +{ + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + __m128i r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = _mm_srli_epi32(q0, 8) | _mm_slli_epi32(q0, 24); + r1 = _mm_srli_epi32(q1, 8) | _mm_slli_epi32(q1, 24); + r2 = _mm_srli_epi32(q2, 8) | _mm_slli_epi32(q2, 24); + r3 = _mm_srli_epi32(q3, 8) | _mm_slli_epi32(q3, 24); + r4 = _mm_srli_epi32(q4, 8) | _mm_slli_epi32(q4, 24); + r5 = _mm_srli_epi32(q5, 8) | _mm_slli_epi32(q5, 24); + r6 = _mm_srli_epi32(q6, 8) | _mm_slli_epi32(q6, 
24); + r7 = _mm_srli_epi32(q7, 8) | _mm_slli_epi32(q7, 24); + + q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr16(q0 ^ q5 ^ q6 ^ r0 ^ r5); + q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); + q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr16(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); + q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr16(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); + q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); + q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); + q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr16(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); + q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr16(q4 ^ q5 ^ q7 ^ r4 ^ r7); +} + +/* see inner.h */ +void +aes_sse2_4x32_bitslice_decrypt(unsigned num_rounds, + const uint32_t skey[static 120], __m128i q[static 8]) +{ + unsigned u; + + add_round_key(q, skey + (num_rounds << 3)); + for (u = num_rounds - 1; u > 0; u --) { + inv_shift_rows(q); + aes_sse2_4x32_bitslice_invSbox(q); + add_round_key(q, skey + (u << 3)); + inv_mix_columns(q); + } + inv_shift_rows(q); + aes_sse2_4x32_bitslice_invSbox(q); + add_round_key(q, skey); +} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_enc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_enc.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,134 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include "aes_sse2_4x32_impl.h" + +static inline void +add_round_key(__m128i q[static 8], const uint32_t sk[static 8]) +{ + + q[0] ^= _mm_set1_epi32(sk[0]); + q[1] ^= _mm_set1_epi32(sk[1]); + q[2] ^= _mm_set1_epi32(sk[2]); + q[3] ^= _mm_set1_epi32(sk[3]); + q[4] ^= _mm_set1_epi32(sk[4]); + q[5] ^= _mm_set1_epi32(sk[5]); + q[6] ^= _mm_set1_epi32(sk[6]); + q[7] ^= _mm_set1_epi32(sk[7]); +} + +static inline __m128i +shift_row(__m128i q) +{ + __m128i x, y0, y1, y2, y3, y4, y5, y6; + + x = q; + y0 = x & _mm_set1_epi32(0x000000FF); + y1 = _mm_srli_epi32(x & _mm_set1_epi32(0x0000FC00), 2); + y2 = _mm_slli_epi32(x & _mm_set1_epi32(0x00000300), 6); + y3 = _mm_srli_epi32(x & _mm_set1_epi32(0x00F00000), 4); + y4 = _mm_slli_epi32(x & _mm_set1_epi32(0x000F0000), 4); + y5 = _mm_srli_epi32(x & _mm_set1_epi32(0xC0000000), 6); + y6 = _mm_slli_epi32(x & _mm_set1_epi32(0x3F000000), 2); + return y0 | y1 | y2 | y3 | y4 | y5 | y6; +} + +static inline void +shift_rows(__m128i q[static 8]) +{ + + q[0] = shift_row(q[0]); + q[1] = shift_row(q[1]); + q[2] = shift_row(q[2]); + q[3] = shift_row(q[3]); + q[4] = shift_row(q[4]); + q[5] = shift_row(q[5]); + q[6] = shift_row(q[6]); + q[7] = shift_row(q[7]); +} + +static inline __m128i +rotr16(__m128i x) +{ + return _mm_slli_epi32(x, 16) | _mm_srli_epi32(x, 16); +} + +static inline void +mix_columns(__m128i q[static 8]) +{ + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + __m128i r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = _mm_srli_epi32(q0, 8) | _mm_slli_epi32(q0, 24); + r1 = _mm_srli_epi32(q1, 8) | _mm_slli_epi32(q1, 24); + r2 = _mm_srli_epi32(q2, 8) | _mm_slli_epi32(q2, 24); + r3 = _mm_srli_epi32(q3, 8) | _mm_slli_epi32(q3, 24); + r4 = _mm_srli_epi32(q4, 8) | _mm_slli_epi32(q4, 24); + r5 = _mm_srli_epi32(q5, 8) | _mm_slli_epi32(q5, 24); + r6 = _mm_srli_epi32(q6, 8) | _mm_slli_epi32(q6, 24); + r7 = _mm_srli_epi32(q7, 8) | _mm_slli_epi32(q7, 24); + + q[0] = q7 ^ r7 ^ r0 ^ rotr16(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr16(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr16(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr16(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr16(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr16(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr16(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr16(q7 ^ r7); +} + +/* see inner.h */ +void +aes_sse2_4x32_bitslice_encrypt(unsigned num_rounds, + const uint32_t skey[static 120], __m128i q[static 8]) +{ + unsigned u; + + add_round_key(q, skey); + for (u = 1; u < num_rounds; u ++) { + aes_sse2_4x32_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, skey + (u << 3)); + } + aes_sse2_4x32_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, skey + (num_rounds << 3)); +} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_impl.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_impl.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,223 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include +#include + +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#include +#else +#include +#define fpu_kern_enter() ((void)0) +#define fpu_kern_leave() ((void)0) +#endif + +#include "aes_sse2_4x32_subr.h" + +static void +aes_sse2_4x32_setenckey_impl(struct aesenc *enc, const uint8_t *key, + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_setkey(enc->aese_aes.aes_rk, key, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_setdeckey_impl(struct aesdec *dec, const uint8_t *key, + uint32_t nrounds) +{ + + fpu_kern_enter(); + /* + * BearSSL computes InvMixColumns on the fly -- no need for + * distinct decryption round keys. + */ + aes_sse2_4x32_setkey(dec->aesd_aes.aes_rk, key, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_enc(enc, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_dec(dec, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_cbc_enc_impl(const struct aesenc *enc, + const uint8_t in[static 16], uint8_t out[static 16], + size_t nbytes, uint8_t iv[static 16], uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_sse2_4x32_cbc_enc(enc, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_cbc_dec_impl(const struct aesdec *dec, + const uint8_t in[static 16], uint8_t out[static 16], + size_t nbytes, uint8_t iv[static 16], uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_sse2_4x32_cbc_dec(dec, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_xts_enc_impl(const struct aesenc *enc, + const uint8_t in[static 16], uint8_t out[static 16], + size_t nbytes, uint8_t tweak[static 16], uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_sse2_4x32_xts_enc(enc, in, out, nbytes, tweak, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_xts_dec_impl(const struct aesdec *dec, + const uint8_t in[static 16], uint8_t out[static 16], + size_t nbytes, uint8_t tweak[static 16], uint32_t nrounds) +{ + + if (nbytes == 0) + return; + fpu_kern_enter(); + aes_sse2_4x32_xts_dec(dec, in, out, nbytes, 
tweak, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_cbcmac_update1_impl(const struct aesenc *enc, + const uint8_t in[static 16], size_t nbytes, uint8_t auth[static 16], + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_cbcmac_update1(enc, in, nbytes, auth, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_ccm_enc1_impl(const struct aesenc *enc, + const uint8_t *in, uint8_t *out, + size_t nbytes, uint8_t authctr[32], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_ccm_enc1(enc, in, out, nbytes, authctr, nrounds); + fpu_kern_leave(); +} + +static void +aes_sse2_4x32_ccm_dec1_impl(const struct aesenc *enc, + const uint8_t *in, uint8_t *out, + size_t nbytes, uint8_t authctr[32], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_sse2_4x32_ccm_dec1(enc, in, out, nbytes, authctr, nrounds); + fpu_kern_leave(); +} + +static int +aes_sse2_4x32_probe(void) +{ + int result = 0; + + /* Verify that the CPU supports SSE and SSE2. */ +#ifdef _KERNEL + if (!i386_has_sse) + return -1; + if (!i386_has_sse2) + return -1; +#else + unsigned eax, ebx, ecx, edx; + if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) + return -1; + if ((edx & bit_SSE) == 0) + return -1; + if ((edx & bit_SSE2) == 0) + return -1; +#endif + + fpu_kern_enter(); + result = aes_sse2_4x32_selftest(); + fpu_kern_leave(); + + return result; +} + +struct aes_impl aes_sse2_4x32_impl = { + .ai_name = "Intel SSE2 4x32 bitsliced", + .ai_probe = aes_sse2_4x32_probe, + .ai_setenckey = aes_sse2_4x32_setenckey_impl, + .ai_setdeckey = aes_sse2_4x32_setdeckey_impl, + .ai_enc = aes_sse2_4x32_enc_impl, + .ai_dec = aes_sse2_4x32_dec_impl, + .ai_cbc_enc = aes_sse2_4x32_cbc_enc_impl, + .ai_cbc_dec = aes_sse2_4x32_cbc_dec_impl, + .ai_xts_enc = aes_sse2_4x32_xts_enc_impl, + .ai_xts_dec = aes_sse2_4x32_xts_dec_impl, + .ai_cbcmac_update1 = aes_sse2_4x32_cbcmac_update1_impl, + .ai_ccm_enc1 = aes_sse2_4x32_ccm_enc1_impl, + .ai_ccm_dec1 = aes_sse2_4x32_ccm_dec1_impl, +}; diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_impl.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_impl.h Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,54 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_IMPL_H +#define _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_IMPL_H + +#include + +#include +#include +#include + +#include "aes_sse2_4x32_subr.h" + +#define br_dec32le le32dec +#define br_enc32le le32enc + +void aes_sse2_4x32_bitslice_Sbox(__m128i[static 8]); +void aes_sse2_4x32_bitslice_invSbox(__m128i[static 8]); +void aes_sse2_4x32_ortho(__m128i[static 8]); +unsigned aes_sse2_4x32_keysched(uint32_t[static 60], const void *, size_t); +void aes_sse2_4x32_skey_expand(uint32_t[static 120], unsigned, + const uint32_t[static 60]); +void aes_sse2_4x32_bitslice_encrypt(unsigned, const uint32_t[static 120], + __m128i[static 8]); +void aes_sse2_4x32_bitslice_decrypt(unsigned, const uint32_t[static 120], + __m128i[static 8]); + +#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_IMPL_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_subr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_subr.c Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,753 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#ifdef _KERNEL +#include +#include +#else +#include +#include +#include +#include +#include +#define KASSERT assert +#define panic(fmt, args...) 
err(1, fmt, ##args)
+#endif
+
+#include "aes_sse2_4x32_impl.h"
+#include "aes_sse2_4x32_subr.h"
+
+#ifndef _MM_TRANSPOSE4_EPI32
+#define _MM_TRANSPOSE4_EPI32(r0, r1, r2, r3) do \
+{ \
+ __m128i _mm_tmp0, _mm_tmp1, _mm_tmp2, _mm_tmp3; \
+ \
+ _mm_tmp0 = _mm_unpacklo_epi32(r0, r1); \
+ _mm_tmp2 = _mm_unpacklo_epi32(r2, r3); \
+ _mm_tmp1 = _mm_unpackhi_epi32(r0, r1); \
+ _mm_tmp3 = _mm_unpackhi_epi32(r2, r3); \
+ (r0) = (__m128i)_mm_movelh_ps((__m128)_mm_tmp0, (__m128)_mm_tmp2); \
+ (r1) = (__m128i)_mm_movehl_ps((__m128)_mm_tmp2, (__m128)_mm_tmp0); \
+ (r2) = (__m128i)_mm_movelh_ps((__m128)_mm_tmp1, (__m128)_mm_tmp3); \
+ (r3) = (__m128i)_mm_movehl_ps((__m128)_mm_tmp3, (__m128)_mm_tmp1); \
+} while (0)
+#endif
+
+void
+aes_sse2_4x32_setkey(uint32_t rk[static 60], const void *key, uint32_t nrounds)
+{
+ size_t key_len;
+
+ switch (nrounds) {
+ case 10:
+ key_len = 16;
+ break;
+ case 12:
+ key_len = 24;
+ break;
+ case 14:
+ key_len = 32;
+ break;
+ default:
+ panic("invalid AES nrounds: %u", nrounds);
+ }
+
+ aes_sse2_4x32_keysched(rk, key, key_len);
+}
+
+void
+aes_sse2_4x32_enc(const struct aesenc *enc, const uint8_t in[static 16],
+ uint8_t out[static 16], uint32_t nrounds)
+{
+ uint32_t sk_exp[120];
+ __m128i q[8];
+
+ /* Expand round keys for bitslicing. */
+ aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk);
+
+ /* Load input block interleaved with garbage blocks. */
+ q[0] = _mm_loadu_epi8(in);
+ q[2] = q[4] = q[6] = _mm_setzero_si128();
+ q[1] = q[3] = q[5] = q[7] = _mm_setzero_si128();
+
+ /* Transform to bitslice, encrypt, transform from bitslice. */
+ _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]);
+ aes_sse2_4x32_ortho(q);
+ aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q);
+ aes_sse2_4x32_ortho(q);
+ _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]);
+
+ /* Store output block. */
+ _mm_storeu_epi8(out, q[0]);
+
+ /* Paranoia: Zero temporary buffers. */
+ explicit_memset(sk_exp, 0, sizeof sk_exp);
+ explicit_memset(q, 0, sizeof q);
+}
+
+void
+aes_sse2_4x32_dec(const struct aesdec *dec, const uint8_t in[static 16],
+ uint8_t out[static 16], uint32_t nrounds)
+{
+ uint32_t sk_exp[120];
+ __m128i q[8];
+
+ /* Expand round keys for bitslicing. */
+ aes_sse2_4x32_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk);
+
+ /* Load input block interleaved with garbage blocks. */
+ q[0] = _mm_loadu_epi8(in);
+ q[2] = q[4] = q[6] = _mm_setzero_si128();
+ q[1] = q[3] = q[5] = q[7] = _mm_setzero_si128();
+
+ /* Transform to bitslice, decrypt, transform from bitslice. */
+ _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]);
+ aes_sse2_4x32_ortho(q);
+ aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q);
+ aes_sse2_4x32_ortho(q);
+ _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]);
+
+ /* Store output block. */
+ _mm_storeu_epi8(out, q[0]);
+
+ /* Paranoia: Zero temporary buffers. */
+ explicit_memset(sk_exp, 0, sizeof sk_exp);
+ explicit_memset(q, 0, sizeof q);
+}
+
+void
+aes_sse2_4x32_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
+ uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
+ uint32_t nrounds)
+{
+ uint32_t sk_exp[120];
+ __m128i q[8];
+ __m128i cv;
+
+ KASSERT(nbytes);
+ KASSERT(nbytes % 16 == 0);
+
+ /* Expand round keys for bitslicing. */
+ aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk);
+
+ /* Load the IV. */
+ cv = _mm_loadu_epi8(iv);
+
+ /*
+ * Zero the registers we won't be using, since CBC encryption
+ * is inherently sequential so we can only do one block at a
+ * time.
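+ * (CBC decryption and XTS, below, have no such dependency and
+ * process up to eight blocks per pass.)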
+ */ + q[2] = q[4] = q[6] = _mm_setzero_si128(); + q[1] = q[3] = q[5] = q[7] = _mm_setzero_si128(); + + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + /* Load input block and apply CV. */ + q[0] = cv ^ _mm_loadu_epi8(in); + + /* Transform to bitslice, encrypt, transform from bitslice. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + + /* Remember ciphertext as CV and store output block. */ + cv = q[0]; + _mm_storeu_epi8(out, cv); + } + + /* Store updated IV. */ + _mm_storeu_epi8(iv, cv); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +void +aes_sse2_4x32_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16], + uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + __m128i cv, iv, w; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk); + + /* Load the IV. */ + iv = _mm_loadu_epi8(ivp); + + /* Load the last cipher block. */ + cv = _mm_loadu_epi8(in + nbytes - 16); + + /* Store the updated IV. */ + _mm_storeu_epi8(ivp, cv); + + /* Process the last blocks if not an even multiple of eight. */ + if (nbytes % (8*16)) { + unsigned i, n = (nbytes/16) % 8; + + KASSERT(n > 0); + KASSERT(n < 8); + + for (i = 8; i --> n;) + q[i] = _mm_setzero_si128(); + q[i] = cv; + while (i --> 0) + q[i] = _mm_loadu_epi8(in + nbytes - 16*n + 16*i); + + /* Decrypt up to seven blocks. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + do { + n--; + w = q[n]; + if ((nbytes -= 16) == 0) + goto out; + cv = _mm_loadu_epi8(in + nbytes - 16); + _mm_storeu_epi8(out + nbytes, w ^ cv); + } while (n); + } + + for (;;) { + KASSERT(nbytes >= 128); + nbytes -= 128; + + /* + * 1. Set up upper cipher block from cv. + * 2. Load lower cipher blocks from input. + */ + q[7] = cv; /* _mm_loadu_epi8(in + nbytes + 16*7) */ + q[6] = _mm_loadu_epi8(in + nbytes + 16*6); + q[5] = _mm_loadu_epi8(in + nbytes + 16*5); + q[4] = _mm_loadu_epi8(in + nbytes + 16*4); + q[3] = _mm_loadu_epi8(in + nbytes + 16*3); + q[2] = _mm_loadu_epi8(in + nbytes + 16*2); + q[1] = _mm_loadu_epi8(in + nbytes + 16*1); + q[0] = _mm_loadu_epi8(in + nbytes + 16*0); + + /* Decrypt eight blocks at a time. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the seven upper output blocks. 
*/ + cv = _mm_loadu_epi8(in + nbytes + 16*6); + _mm_storeu_epi8(out + nbytes + 16*7, cv ^ q[7]); + cv = _mm_loadu_epi8(in + nbytes + 16*5); + _mm_storeu_epi8(out + nbytes + 16*6, cv ^ q[6]); + cv = _mm_loadu_epi8(in + nbytes + 16*4); + _mm_storeu_epi8(out + nbytes + 16*5, cv ^ q[5]); + cv = _mm_loadu_epi8(in + nbytes + 16*3); + _mm_storeu_epi8(out + nbytes + 16*4, cv ^ q[4]); + cv = _mm_loadu_epi8(in + nbytes + 16*2); + _mm_storeu_epi8(out + nbytes + 16*3, cv ^ q[3]); + cv = _mm_loadu_epi8(in + nbytes + 16*1); + _mm_storeu_epi8(out + nbytes + 16*2, cv ^ q[2]); + cv = _mm_loadu_epi8(in + nbytes + 16*0); + _mm_storeu_epi8(out + nbytes + 16*1, cv ^ q[1]); + + /* + * Get the first output block, but don't load the CV + * yet -- it might be the previous ciphertext block, or + * it might be the IV. + */ + w = q[0]; + + /* Stop if we've reached the first output block. */ + if (nbytes == 0) + goto out; + + /* + * Load the preceding cipher block, and apply it as the + * chaining value to this one. + */ + cv = _mm_loadu_epi8(in + nbytes - 16); + _mm_storeu_epi8(out + nbytes, w ^ cv); + } + +out: /* Store the first output block. */ + _mm_storeu_epi8(out, w ^ iv); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +static inline __m128i +aes_sse2_4x32_xts_update(__m128i t) +{ + const __m128i one = _mm_set_epi64x(1, 1); + __m128i s, m, c; + + s = _mm_srli_epi64(t, 63); /* 1 if high bit set else 0 */ + m = _mm_sub_epi64(s, one); /* 0 if high bit set else -1 */ + m = _mm_shuffle_epi32(m, 0x4e); /* swap halves */ + c = _mm_set_epi64x(1, 0x87); /* carry */ + + return _mm_slli_epi64(t, 1) ^ (c & ~m); +} + +static int +aes_sse2_4x32_xts_update_selftest(void) +{ + static const struct { + uint32_t in[4], out[4]; + } cases[] = { + [0] = { {1}, {2} }, + [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, + [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, + [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, + [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, + [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, + }; + unsigned i; + uint32_t t[4]; + int result = 0; + + for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { + t[0] = cases[i].in[0]; + t[1] = cases[i].in[1]; + t[2] = cases[i].in[2]; + t[3] = cases[i].in[3]; + _mm_storeu_epi8(t, aes_sse2_4x32_xts_update(_mm_loadu_epi8(t))); + if (t[0] != cases[i].out[0] || + t[1] != cases[i].out[1] || + t[2] != cases[i].out[2] || + t[3] != cases[i].out[3]) { + printf("%s %u:" + " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", + __func__, i, t[0], t[1], t[2], t[3]); + result = -1; + } + } + + return result; +} + +void +aes_sse2_4x32_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + __m128i t[9]; + unsigned i; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk); + + /* Load tweak. */ + t[0] = _mm_loadu_epi8(tweak); + + /* Handle the first block separately if odd number. */ + if (nbytes % (8*16)) { + /* Load up the tweaked inputs. */ + for (i = 0; i < (nbytes/16) % 8; i++) { + q[i] = _mm_loadu_epi8(in + 16*i) ^ t[i]; + t[i + 1] = aes_sse2_4x32_xts_update(t[i]); + } + for (; i < 8; i++) + q[i] = _mm_setzero_si128(); + + /* Encrypt up to seven blocks. 
*/ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the tweaked outputs. */ + for (i = 0; i < (nbytes/16) % 8; i++) + _mm_storeu_epi8(out + 16*i, q[i] ^ t[i]); + + /* Advance to the next block. */ + t[0] = t[i]; + in += nbytes % (8*16); + out += nbytes % (8*16); + nbytes -= nbytes % (8*16); + if (nbytes == 0) + goto out; + } + + do { + KASSERT(nbytes % 128 == 0); + KASSERT(nbytes >= 128); + + /* Load up the tweaked inputs. */ + for (i = 0; i < 8; i++) { + q[i] = _mm_loadu_epi8(in + 16*i) ^ t[i]; + t[i + 1] = aes_sse2_4x32_xts_update(t[i]); + } + + /* Encrypt eight blocks. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the tweaked outputs. */ + for (i = 0; i < 8; i++) + _mm_storeu_epi8(out + 16*i, q[i] ^ t[i]); + + /* Advance to the next block. */ + t[0] = t[8]; + in += 128; + out += 128; + nbytes -= 128; + } while (nbytes); + +out: /* Store the updated tweak. */ + _mm_storeu_epi8(tweak, t[0]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + explicit_memset(t, 0, sizeof t); +} + +void +aes_sse2_4x32_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + __m128i t[9]; + unsigned i; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk); + + /* Load tweak. */ + t[0] = _mm_loadu_epi8(tweak); + + /* Handle the first block separately if odd number. */ + if (nbytes % (8*16)) { + /* Load up the tweaked inputs. */ + for (i = 0; i < (nbytes/16) % 8; i++) { + q[i] = _mm_loadu_epi8(in + 16*i) ^ t[i]; + t[i + 1] = aes_sse2_4x32_xts_update(t[i]); + } + for (; i < 8; i++) + q[i] = _mm_setzero_si128(); + + /* Decrypt up to seven blocks. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the tweaked outputs. */ + for (i = 0; i < (nbytes/16) % 8; i++) + _mm_storeu_epi8(out + 16*i, q[i] ^ t[i]); + + /* Advance to the next block. */ + t[0] = t[i]; + in += nbytes % (8*16); + out += nbytes % (8*16); + nbytes -= nbytes % (8*16); + if (nbytes == 0) + goto out; + } + + do { + KASSERT(nbytes % 128 == 0); + KASSERT(nbytes >= 128); + + /* Load up the tweaked inputs. */ + for (i = 0; i < 8; i++) { + q[i] = _mm_loadu_epi8(in + 16*i) ^ t[i]; + t[i + 1] = aes_sse2_4x32_xts_update(t[i]); + } + + /* Decrypt eight blocks. 
*/ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_decrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + _MM_TRANSPOSE4_EPI32(q[1], q[3], q[5], q[7]); + + /* Store the tweaked outputs. */ + for (i = 0; i < 8; i++) + _mm_storeu_epi8(out + 16*i, q[i] ^ t[i]); + + /* Advance to the next block. */ + t[0] = t[8]; + in += 128; + out += 128; + nbytes -= 128; + } while (nbytes); + +out: /* Store the updated tweak. */ + _mm_storeu_epi8(tweak, t[0]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); + explicit_memset(t, 0, sizeof t); +} + +void +aes_sse2_4x32_cbcmac_update1(const struct aesenc *enc, + const uint8_t in[static 16], size_t nbytes, + uint8_t auth[static 16], uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk); + + /* Initialize garbage blocks. */ + q[1] = q[2] = q[3] = q[4] = q[5] = q[6] = q[7] = _mm_setzero_si128(); + + /* Load initial authenticator. */ + q[0] = _mm_loadu_epi8(auth); + + for (; nbytes; nbytes -= 16, in += 16) { + /* Combine input block. */ + q[0] ^= _mm_loadu_epi8(in); + + /* Transform to bitslice, encrypt, transform from bitslice. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + } + + /* Store updated authenticator. */ + _mm_storeu_epi8(auth, q[0]); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +void +aes_sse2_4x32_ccm_enc1(const struct aesenc *enc, + const uint8_t in[static 16], uint8_t out[static 16], size_t nbytes, + uint8_t authctr[static 32], uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + uint32_t c0, c1, c2, c3be; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk); + + /* Set first block to authenticator. */ + q[0] = _mm_loadu_epi8(authctr); + + /* Load initial counter block, big-endian so we can increment it. */ + c0 = le32dec(authctr + 16 + 4*0); + c1 = le32dec(authctr + 16 + 4*1); + c2 = le32dec(authctr + 16 + 4*2); + c3be = bswap32(le32dec(authctr + 16 + 4*3)); + + /* Set other blocks to garbage -- can't take advantage. */ + q[1] = q[3] = q[4] = q[5] = q[6] = q[7] = _mm_setzero_si128(); + + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + /* Update authenticator. */ + q[0] ^= _mm_loadu_epi8(in); + + /* Increment 32-bit counter. */ + q[2] = _mm_set_epi32(bswap32(++c3be), c2, c1, c0); + + /* Encrypt authenticator and counter. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + + /* Encrypt with CTR output. */ + _mm_storeu_epi8(out, _mm_loadu_epi8(in) ^ q[2]); + } + + /* Update authenticator. */ + _mm_storeu_epi8(authctr, q[0]); + + /* Update counter. */ + le32enc(authctr + 16 + 4*3, bswap32(c3be)); + + /* Paranoia: Zero temporary buffers. 
*/ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +void +aes_sse2_4x32_ccm_dec1(const struct aesenc *enc, + const uint8_t in[static 16], uint8_t out[static 16], size_t nbytes, + uint8_t authctr[static 32], uint32_t nrounds) +{ + uint32_t sk_exp[120]; + __m128i q[8]; + uint32_t c0, c1, c2, c3be; + __m128i b; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + /* Expand round keys for bitslicing. */ + aes_sse2_4x32_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk); + + /* Load initial counter block, big-endian so we can increment it. */ + c0 = le32dec(authctr + 16 + 4*0); + c1 = le32dec(authctr + 16 + 4*1); + c2 = le32dec(authctr + 16 + 4*2); + c3be = bswap32(le32dec(authctr + 16 + 4*3)); + + /* Increment 32-bit counter. */ + q[0] = _mm_set_epi32(bswap32(++c3be), c2, c1, c0); + + /* + * Set the other blocks to garbage -- we don't have any + * plaintext to authenticate yet. + */ + q[1] = q[2] = q[3] = q[4] = q[5] = q[6] = q[7] = _mm_setzero_si128(); + + /* Encrypt first CTR. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + + /* Load the initial authenticator. */ + q[2] = _mm_loadu_epi8(authctr); + + for (;; in += 16, out += 16) { + /* Decrypt the block. */ + b = _mm_loadu_epi8(in) ^ q[0]; + + /* Update authenticator. */ + q[2] ^= b; + + /* Store plaintext. */ + _mm_storeu_epi8(out, b); + + /* If this is the last block, stop. */ + if ((nbytes -= 16) == 0) + break; + + /* Increment 32-bit counter. */ + q[0] = _mm_set_epi32(bswap32(++c3be), c2, c1, c0); + + /* Authenticate previous plaintext, encrypt next CTR. */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + } + + /* + * Authenticate last plaintext. We're only doing this for the + * authenticator, not for the counter, so don't bother to + * initialize q[0]. (Even for the sake of sanitizers, they're + * already initialized to something by now.) + */ + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + aes_sse2_4x32_ortho(q); + aes_sse2_4x32_bitslice_encrypt(nrounds, sk_exp, q); + aes_sse2_4x32_ortho(q); + _MM_TRANSPOSE4_EPI32(q[0], q[2], q[4], q[6]); + + /* Update authenticator. */ + _mm_storeu_epi8(authctr, q[2]); + + /* Update counter. */ + le32enc(authctr + 16 + 4*3, bswap32(c3be)); + + /* Paranoia: Zero temporary buffers. */ + explicit_memset(sk_exp, 0, sizeof sk_exp); + explicit_memset(q, 0, sizeof q); +} + +int +aes_sse2_4x32_selftest(void) +{ + + if (aes_sse2_4x32_xts_update_selftest()) + return -1; + + /* XXX test aes_sse2_4x32_bitslice_decrypt */ + /* XXX test aes_sse2_4x32_bitslice_encrypt */ + /* XXX test aes_sse2_4x32_keysched */ + /* XXX test aes_sse2_4x32_ortho */ + /* XXX test aes_sse2_4x32_skey_expand */ + + return 0; +} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_4x32_subr.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/x86/aes_sse2_4x32_subr.h Sat Nov 22 06:22:31 2025 +0000 @@ -0,0 +1,67 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2025 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_SUBR_H +#define _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_SUBR_H + +#include + +/* + * These functions MUST NOT use any vector registers for parameters or + * results -- the caller is compiled with -mno-sse &c. in the kernel, + * and dynamically turns on the vector unit just before calling them. + * Internal subroutines that use the vector unit for parameters are + * declared in aes_sse2_4x32_internal.h instead. + */ + +void aes_sse2_4x32_setkey(uint32_t[static 60], const void *, uint32_t); + +void aes_sse2_4x32_enc(const struct aesenc *, const uint8_t in[static 16], + uint8_t[static 16], uint32_t); +void aes_sse2_4x32_dec(const struct aesdec *, const uint8_t in[static 16], + uint8_t[static 16], uint32_t); +void aes_sse2_4x32_cbc_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_cbc_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_xts_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_xts_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t nbytes, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_cbcmac_update1(const struct aesenc *, + const uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aes_sse2_4x32_ccm_enc1(const struct aesenc *, + const uint8_t[static 16], uint8_t[static 16], size_t, uint8_t[static 32], + uint32_t); +void aes_sse2_4x32_ccm_dec1(const struct aesenc *, + const uint8_t[static 16], uint8_t[static 16], size_t, uint8_t[static 32], + uint32_t); + +int aes_sse2_4x32_selftest(void); + +#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_4X32_SUBR_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_dec.c --- a/sys/crypto/aes/arch/x86/aes_sse2_dec.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2016 Thomas Pornin - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * 
distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2_dec.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $"); - -#include - -#include "aes_sse2_impl.h" - -/* see inner.h */ -void -aes_sse2_bitslice_invSbox(__m128i q[static 4]) -{ - /* - * See br_aes_ct_bitslice_invSbox(). This is the natural extension - * to 64-bit registers. - */ - __m128i q0, q1, q2, q3, q4, q5, q6, q7; - - q0 = ~q[0]; - q1 = ~q[1]; - q2 = q[2]; - q3 = q[3]; - q4 = _mm_shuffle_epi32(q[0], 0x0e); - q5 = _mm_shuffle_epi32(~q[1], 0x0e); - q6 = _mm_shuffle_epi32(~q[2], 0x0e); - q7 = _mm_shuffle_epi32(q[3], 0x0e); - - q[3] = _mm_unpacklo_epi64(q5 ^ q0 ^ q2, q1 ^ q4 ^ q6); - q[2] = _mm_unpacklo_epi64(q4 ^ q7 ^ q1, q0 ^ q3 ^ q5); - q[1] = _mm_unpacklo_epi64(q3 ^ q6 ^ q0, q7 ^ q2 ^ q4); - q[0] = _mm_unpacklo_epi64(q2 ^ q5 ^ q7, q6 ^ q1 ^ q3); - - aes_sse2_bitslice_Sbox(q); - - q0 = ~q[0]; - q1 = ~q[1]; - q2 = q[2]; - q3 = q[3]; - q4 = _mm_shuffle_epi32(q[0], 0x0e); - q5 = _mm_shuffle_epi32(~q[1], 0x0e); - q6 = _mm_shuffle_epi32(~q[2], 0x0e); - q7 = _mm_shuffle_epi32(q[3], 0x0e); - - q[3] = _mm_unpacklo_epi64(q5 ^ q0 ^ q2, q1 ^ q4 ^ q6); - q[2] = _mm_unpacklo_epi64(q4 ^ q7 ^ q1, q0 ^ q3 ^ q5); - q[1] = _mm_unpacklo_epi64(q3 ^ q6 ^ q0, q7 ^ q2 ^ q4); - q[0] = _mm_unpacklo_epi64(q2 ^ q5 ^ q7, q6 ^ q1 ^ q3); -} - -static inline void -add_round_key(__m128i q[static 4], const uint64_t sk[static 8]) -{ - q[0] ^= _mm_set_epi64x(sk[4], sk[0]); - q[1] ^= _mm_set_epi64x(sk[5], sk[1]); - q[2] ^= _mm_set_epi64x(sk[6], sk[2]); - q[3] ^= _mm_set_epi64x(sk[7], sk[3]); -} - -static inline __m128i -inv_shift_row(__m128i q) -{ - __m128i x, y0, y1, y2, y3, y4, y5, y6; - - x = q; - y0 = x & _mm_set1_epi64x(0x000000000000FFFF); - y1 = x & _mm_set1_epi64x(0x000000000FFF0000); - y2 = x & _mm_set1_epi64x(0x00000000F0000000); - y3 = x & _mm_set1_epi64x(0x000000FF00000000); - y4 = x & _mm_set1_epi64x(0x0000FF0000000000); - y5 = x & _mm_set1_epi64x(0x000F000000000000); - y6 = x & _mm_set1_epi64x(0xFFF0000000000000); - y1 = _mm_slli_epi64(y1, 4); - y2 = _mm_srli_epi64(y2, 12); - y3 = _mm_slli_epi64(y3, 8); - y4 = _mm_srli_epi64(y4, 8); - y5 = _mm_slli_epi64(y5, 12); - y6 = _mm_srli_epi64(y6, 4); - return y0 | y1 | y2 | y3 | y4 | y5 | y6; -} - -static inline void -inv_shift_rows(__m128i q[static 4]) -{ - - q[0] = inv_shift_row(q[0]); - q[1] = inv_shift_row(q[1]); - q[2] = inv_shift_row(q[2]); - q[3] = inv_shift_row(q[3]); -} - -static inline __m128i -rotr32(__m128i x) -{ - return _mm_slli_epi64(x, 32) | _mm_srli_epi64(x, 32); -} - -static inline void -inv_mix_columns(__m128i q[4]) -{ - __m128i q0, q1, q2, q3, q4, q5, q6, q7; - __m128i r0, r1, r2, r3, r4, r5, r6, r7; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - q0 = q[0]; - q1 = q[1]; - q2 = q[2]; - q3 = q[3]; - r0 = 
_mm_srli_epi64(q0, 16) | _mm_slli_epi64(q0, 48); - r1 = _mm_srli_epi64(q1, 16) | _mm_slli_epi64(q1, 48); - r2 = _mm_srli_epi64(q2, 16) | _mm_slli_epi64(q2, 48); - r3 = _mm_srli_epi64(q3, 16) | _mm_slli_epi64(q3, 48); - - q7 = _mm_shuffle_epi32(q3, 0x0e); - q6 = _mm_shuffle_epi32(q2, 0x0e); - q5 = _mm_shuffle_epi32(q1, 0x0e); - q4 = _mm_shuffle_epi32(q0, 0x0e); - - r7 = _mm_shuffle_epi32(r3, 0x0e); - r6 = _mm_shuffle_epi32(r2, 0x0e); - r5 = _mm_shuffle_epi32(r1, 0x0e); - r4 = _mm_shuffle_epi32(r0, 0x0e); - - s0 = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr32(q0 ^ q5 ^ q6 ^ r0 ^ r5); - s1 = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); - s2 = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr32(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); - s3 = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr32(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); - s4 = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); - s5 = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); - s6 = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr32(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); - s7 = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr32(q4 ^ q5 ^ q7 ^ r4 ^ r7); - - q[0] = _mm_unpacklo_epi64(s0, s4); - q[1] = _mm_unpacklo_epi64(s1, s5); - q[2] = _mm_unpacklo_epi64(s2, s6); - q[3] = _mm_unpacklo_epi64(s3, s7); -} - -/* see inner.h */ -void -aes_sse2_bitslice_decrypt(unsigned num_rounds, - const uint64_t *skey, __m128i q[static 4]) -{ - unsigned u; - - add_round_key(q, skey + (num_rounds << 3)); - for (u = num_rounds - 1; u > 0; u --) { - inv_shift_rows(q); - aes_sse2_bitslice_invSbox(q); - add_round_key(q, skey + (u << 3)); - inv_mix_columns(q); - } - inv_shift_rows(q); - aes_sse2_bitslice_invSbox(q); - add_round_key(q, skey); -} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_enc.c --- a/sys/crypto/aes/arch/x86/aes_sse2_enc.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2016 Thomas Pornin - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2_enc.c,v 1.1 2020/06/29 23:47:54 riastradh Exp $"); - -#include - -#include "aes_sse2_impl.h" - -static inline void -add_round_key(__m128i q[static 4], const uint64_t sk[static 8]) -{ - q[0] ^= _mm_set_epi64x(sk[4], sk[0]); - q[1] ^= _mm_set_epi64x(sk[5], sk[1]); - q[2] ^= _mm_set_epi64x(sk[6], sk[2]); - q[3] ^= _mm_set_epi64x(sk[7], sk[3]); -} - -static inline __m128i -shift_row(__m128i q) -{ - __m128i x, y0, y1, y2, y3, y4, y5, y6; - - x = q; - y0 = x & _mm_set1_epi64x(0x000000000000FFFF); - y1 = x & _mm_set1_epi64x(0x00000000FFF00000); - y2 = x & _mm_set1_epi64x(0x00000000000F0000); - y3 = x & _mm_set1_epi64x(0x0000FF0000000000); - y4 = x & _mm_set1_epi64x(0x000000FF00000000); - y5 = x & _mm_set1_epi64x(0xF000000000000000); - y6 = x & _mm_set1_epi64x(0x0FFF000000000000); - y1 = _mm_srli_epi64(y1, 4); - y2 = _mm_slli_epi64(y2, 12); - y3 = _mm_srli_epi64(y3, 8); - y4 = _mm_slli_epi64(y4, 8); - y5 = _mm_srli_epi64(y5, 12); - y6 = _mm_slli_epi64(y6, 4); - return y0 | y1 | y2 | y3 | y4 | y5 | y6; -} - -static inline void -shift_rows(__m128i q[static 4]) -{ - - q[0] = shift_row(q[0]); - q[1] = shift_row(q[1]); - q[2] = shift_row(q[2]); - q[3] = shift_row(q[3]); -} - -static inline __m128i -rotr32(__m128i x) -{ - return _mm_slli_epi64(x, 32) | _mm_srli_epi64(x, 32); -} - -static inline void -mix_columns(__m128i q[static 4]) -{ - __m128i q0, q1, q2, q3, q4, q5, q6, q7; - __m128i r0, r1, r2, r3, r4, r5, r6, r7; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - q0 = q[0]; - q1 = q[1]; - q2 = q[2]; - q3 = q[3]; - r0 = _mm_srli_epi64(q0, 16) | _mm_slli_epi64(q0, 48); - r1 = _mm_srli_epi64(q1, 16) | _mm_slli_epi64(q1, 48); - r2 = _mm_srli_epi64(q2, 16) | _mm_slli_epi64(q2, 48); - r3 = _mm_srli_epi64(q3, 16) | _mm_slli_epi64(q3, 48); - - q7 = _mm_shuffle_epi32(q3, 0x0e); - q6 = _mm_shuffle_epi32(q2, 0x0e); - q5 = _mm_shuffle_epi32(q1, 0x0e); - q4 = _mm_shuffle_epi32(q0, 0x0e); - - r7 = _mm_shuffle_epi32(r3, 0x0e); - r6 = _mm_shuffle_epi32(r2, 0x0e); - r5 = _mm_shuffle_epi32(r1, 0x0e); - r4 = _mm_shuffle_epi32(r0, 0x0e); - - s0 = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); - s1 = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); - s2 = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); - s3 = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); - s4 = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); - s5 = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); - s6 = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); - s7 = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); - - q[0] = _mm_unpacklo_epi64(s0, s4); - q[1] = _mm_unpacklo_epi64(s1, s5); - q[2] = _mm_unpacklo_epi64(s2, s6); - q[3] = _mm_unpacklo_epi64(s3, s7); -} - -void -aes_sse2_bitslice_encrypt(unsigned num_rounds, - const uint64_t *skey, __m128i q[static 4]) -{ - unsigned u; - - add_round_key(q, skey); - for (u = 1; u < num_rounds; u ++) { - aes_sse2_bitslice_Sbox(q); - shift_rows(q); - mix_columns(q); - add_round_key(q, skey + (u << 3)); - } - aes_sse2_bitslice_Sbox(q); - shift_rows(q); - add_round_key(q, skey + (num_rounds << 3)); -} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_impl.c --- a/sys/crypto/aes/arch/x86/aes_sse2_impl.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,221 +0,0 @@ -/* $NetBSD: aes_sse2_impl.c,v 1.5 2020/07/25 22:29:56 riastradh Exp $ */ - -/*- - * Copyright (c) 2020 The NetBSD Foundation, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2_impl.c,v 1.5 2020/07/25 22:29:56 riastradh Exp $"); - -#include -#include - -#include -#include -#include - -#ifdef _KERNEL -#include -#include -#include -#include -#else -#include -#define fpu_kern_enter() ((void)0) -#define fpu_kern_leave() ((void)0) -#endif - -static void -aes_sse2_setenckey_impl(struct aesenc *enc, const uint8_t *key, - uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_setkey(enc->aese_aes.aes_rk64, key, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_setdeckey_impl(struct aesdec *dec, const uint8_t *key, - uint32_t nrounds) -{ - - fpu_kern_enter(); - /* - * BearSSL computes InvMixColumns on the fly -- no need for - * distinct decryption round keys. 
- */ - aes_sse2_setkey(dec->aesd_aes.aes_rk64, key, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_enc(enc, in, out, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_dec(dec, in, out, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_cbc_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], - uint32_t nrounds) -{ - - if (nbytes == 0) - return; - fpu_kern_enter(); - aes_sse2_cbc_enc(enc, in, out, nbytes, iv, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_cbc_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], - uint32_t nrounds) -{ - - if (nbytes == 0) - return; - fpu_kern_enter(); - aes_sse2_cbc_dec(dec, in, out, nbytes, iv, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_xts_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], - uint32_t nrounds) -{ - - if (nbytes == 0) - return; - fpu_kern_enter(); - aes_sse2_xts_enc(enc, in, out, nbytes, tweak, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_xts_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], - uint32_t nrounds) -{ - - if (nbytes == 0) - return; - fpu_kern_enter(); - aes_sse2_xts_dec(dec, in, out, nbytes, tweak, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_cbcmac_update1_impl(const struct aesenc *enc, - const uint8_t in[static 16], size_t nbytes, uint8_t auth[static 16], - uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_cbcmac_update1(enc, in, nbytes, auth, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_ccm_enc1_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], - uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_ccm_enc1(enc, in, out, nbytes, authctr, nrounds); - fpu_kern_leave(); -} - -static void -aes_sse2_ccm_dec1_impl(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], - uint32_t nrounds) -{ - - fpu_kern_enter(); - aes_sse2_ccm_dec1(enc, in, out, nbytes, authctr, nrounds); - fpu_kern_leave(); -} - -static int -aes_sse2_probe(void) -{ - int result = 0; - - /* Verify that the CPU supports SSE and SSE2. 
*/ -#ifdef _KERNEL - if (!i386_has_sse) - return -1; - if (!i386_has_sse2) - return -1; -#else - unsigned eax, ebx, ecx, edx; - if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) - return -1; - if ((edx & bit_SSE) == 0) - return -1; - if ((edx & bit_SSE2) == 0) - return -1; -#endif - - fpu_kern_enter(); - result = aes_sse2_selftest(); - fpu_kern_leave(); - - return result; -} - -struct aes_impl aes_sse2_impl = { - .ai_name = "Intel SSE2 bitsliced", - .ai_probe = aes_sse2_probe, - .ai_setenckey = aes_sse2_setenckey_impl, - .ai_setdeckey = aes_sse2_setdeckey_impl, - .ai_enc = aes_sse2_enc_impl, - .ai_dec = aes_sse2_dec_impl, - .ai_cbc_enc = aes_sse2_cbc_enc_impl, - .ai_cbc_dec = aes_sse2_cbc_dec_impl, - .ai_xts_enc = aes_sse2_xts_enc_impl, - .ai_xts_dec = aes_sse2_xts_dec_impl, - .ai_cbcmac_update1 = aes_sse2_cbcmac_update1_impl, - .ai_ccm_enc1 = aes_sse2_ccm_enc1_impl, - .ai_ccm_dec1 = aes_sse2_ccm_dec1_impl, -}; diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_impl.h --- a/sys/crypto/aes/arch/x86/aes_sse2_impl.h Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -/* $NetBSD: aes_sse2_impl.h,v 1.3 2023/08/07 01:07:36 rin Exp $ */ - -/*- - * Copyright (c) 2020 The NetBSD Foundation, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H -#define _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H - -#include - -#include -#include -#include -#include - -void aes_sse2_bitslice_Sbox(__m128i[static 4]); -void aes_sse2_bitslice_invSbox(__m128i[static 4]); -void aes_sse2_ortho(__m128i[static 4]); -__m128i aes_sse2_interleave_in(__m128i); -__m128i aes_sse2_interleave_out(__m128i); -unsigned aes_sse2_keysched(uint64_t *, const void *, size_t); -void aes_sse2_skey_expand(uint64_t *, unsigned, const uint64_t *); -void aes_sse2_bitslice_encrypt(unsigned, const uint64_t *, __m128i[static 4]); -void aes_sse2_bitslice_decrypt(unsigned, const uint64_t *, __m128i[static 4]); - -#endif /* _CRYPTO_AES_ARCH_X86_AES_SSE2_IMPL_H */ diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/aes_sse2_subr.c --- a/sys/crypto/aes/arch/x86/aes_sse2_subr.c Sat Nov 22 05:28:14 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,711 +0,0 @@ -/* $NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $ */ - -/*- - * Copyright (c) 2020 The NetBSD Foundation, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include -__KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $"); - -#ifdef _KERNEL -#include -#include -#else -#include -#include -#include -#include -#include -#define KASSERT assert -#define panic(fmt, args...) err(1, fmt, ##args) -#endif - -#include -#include - -#include "aes_sse2_impl.h" - -void -aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds) -{ - size_t key_len; - - switch (nrounds) { - case 10: - key_len = 16; - break; - case 12: - key_len = 24; - break; - case 14: - key_len = 32; - break; - default: - panic("invalid AES nrounds: %u", nrounds); - } - - aes_sse2_keysched(rk, key, key_len); -} - -void -aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load input block interleaved with garbage blocks. 
*/ - q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in)); - q[1] = q[2] = q[3] = _mm_setzero_si128(); - - /* Transform to bitslice, decrypt, transform from bitslice. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store output block. */ - _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0])); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); - - /* Load input block interleaved with garbage blocks. */ - q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in)); - q[1] = q[2] = q[3] = _mm_setzero_si128(); - - /* Transform to bitslice, decrypt, transform from bitslice. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store output block. */ - _mm_storeu_epi8(out, aes_sse2_interleave_out(q[0])); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i cv; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load the IV. */ - cv = _mm_loadu_epi8(iv); - - for (; nbytes; nbytes -= 16, in += 16, out += 16) { - /* Load input block and apply CV. */ - q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in)); - - /* Transform to bitslice, encrypt, transform from bitslice. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Remember ciphertext as CV and store output block. */ - cv = aes_sse2_interleave_out(q[0]); - _mm_storeu_epi8(out, cv); - } - - /* Store updated IV. */ - _mm_storeu_epi8(iv, cv); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i cv, iv, w; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); - - /* Load the IV. */ - iv = _mm_loadu_epi8(ivp); - - /* Load the last cipher block. */ - cv = _mm_loadu_epi8(in + nbytes - 16); - - /* Store the updated IV. */ - _mm_storeu_epi8(ivp, cv); - - /* Process the last blocks if not an even multiple of four. */ - if (nbytes % (4*16)) { - unsigned n = (nbytes/16) % 4; - - KASSERT(n > 0); - KASSERT(n < 4); - - q[1] = q[2] = q[3] = _mm_setzero_si128(); - q[n - 1] = aes_sse2_interleave_in(cv); - switch (nbytes % 64) { - case 48: - w = _mm_loadu_epi8(in + nbytes - 32); - q[1] = aes_sse2_interleave_in(w); - w = _mm_loadu_epi8(in + nbytes - 48); - q[0] = aes_sse2_interleave_in(w); - break; - case 32: - w = _mm_loadu_epi8(in + nbytes - 32); - q[0] = aes_sse2_interleave_in(w); - break; - case 16: - break; - } - - /* Decrypt. 
*/ - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - do { - n--; - w = aes_sse2_interleave_out(q[n]); - if ((nbytes -= 16) == 0) - goto out; - cv = _mm_loadu_epi8(in + nbytes - 16); - _mm_storeu_epi8(out + nbytes, w ^ cv); - } while (n); - } - - for (;;) { - KASSERT(nbytes >= 64); - nbytes -= 64; - - /* - * 1. Set up upper cipher block from cv. - * 2. Load lower cipher block into cv and set it up. - * 3. Decrypt. - */ - q[3] = aes_sse2_interleave_in(cv); - - w = _mm_loadu_epi8(in + nbytes + 4*8); - q[2] = aes_sse2_interleave_in(w); - - w = _mm_loadu_epi8(in + nbytes + 4*4); - q[1] = aes_sse2_interleave_in(w); - - w = _mm_loadu_epi8(in + nbytes + 4*0); - q[0] = aes_sse2_interleave_in(w); - - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the upper output block. */ - w = aes_sse2_interleave_out(q[3]); - cv = _mm_loadu_epi8(in + nbytes + 4*8); - _mm_storeu_epi8(out + nbytes + 4*12, w ^ cv); - - /* Store the middle output blocks. */ - w = aes_sse2_interleave_out(q[2]); - cv = _mm_loadu_epi8(in + nbytes + 4*4); - _mm_storeu_epi8(out + nbytes + 4*8, w ^ cv); - - w = aes_sse2_interleave_out(q[1]); - cv = _mm_loadu_epi8(in + nbytes + 4*0); - _mm_storeu_epi8(out + nbytes + 4*4, w ^ cv); - - /* - * Get the first output block, but don't load the CV - * yet -- it might be the previous ciphertext block, or - * it might be the IV. - */ - w = aes_sse2_interleave_out(q[0]); - - /* Stop if we've reached the first output block. */ - if (nbytes == 0) - goto out; - - /* - * Load the preceding cipher block, and apply it as the - * chaining value to this one. - */ - cv = _mm_loadu_epi8(in + nbytes - 16); - _mm_storeu_epi8(out + nbytes, w ^ cv); - } - -out: /* Store the first output block. */ - _mm_storeu_epi8(out, w ^ iv); - - /* Paranoia: Zero temporary buffers. 
*/ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -static inline __m128i -aes_sse2_xts_update(__m128i t) -{ - const __m128i one = _mm_set_epi64x(1, 1); - __m128i s, m, c; - - s = _mm_srli_epi64(t, 63); /* 1 if high bit set else 0 */ - m = _mm_sub_epi64(s, one); /* 0 if high bit set else -1 */ - m = _mm_shuffle_epi32(m, 0x4e); /* swap halves */ - c = _mm_set_epi64x(1, 0x87); /* carry */ - - return _mm_slli_epi64(t, 1) ^ (c & ~m); -} - -static int -aes_sse2_xts_update_selftest(void) -{ - static const struct { - uint32_t in[4], out[4]; - } cases[] = { - [0] = { {1}, {2} }, - [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, - [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, - [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, - [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, - [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, - }; - unsigned i; - uint32_t t[4]; - int result = 0; - - for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { - t[0] = cases[i].in[0]; - t[1] = cases[i].in[1]; - t[2] = cases[i].in[2]; - t[3] = cases[i].in[3]; - _mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t))); - if (t[0] != cases[i].out[0] || - t[1] != cases[i].out[1] || - t[2] != cases[i].out[2] || - t[3] != cases[i].out[3]) { - printf("%s %u:" - " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", - __func__, i, t[0], t[1], t[2], t[3]); - result = -1; - } - } - - return result; -} - -void -aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i w; - __m128i t[5]; - unsigned i; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load tweak. */ - t[0] = _mm_loadu_epi8(tweak); - - /* Handle the first block separately if odd number. */ - if (nbytes % (4*16)) { - /* Load up the tweaked inputs. */ - for (i = 0; i < (nbytes/16) % 4; i++) { - w = _mm_loadu_epi8(in + 16*i) ^ t[i]; - q[i] = aes_sse2_interleave_in(w); - t[i + 1] = aes_sse2_xts_update(t[i]); - } - for (; i < 4; i++) - q[i] = _mm_setzero_si128(); - - /* Encrypt up to four blocks. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the tweaked outputs. */ - for (i = 0; i < (nbytes/16) % 4; i++) { - w = aes_sse2_interleave_out(q[i]); - _mm_storeu_epi8(out + 16*i, w ^ t[i]); - } - - /* Advance to the next block. */ - t[0] = t[i]; - in += nbytes % (4*16); - out += nbytes % (4*16); - nbytes -= nbytes % (4*16); - if (nbytes == 0) - goto out; - } - - do { - KASSERT(nbytes % 64 == 0); - KASSERT(nbytes >= 64); - - /* Load up the tweaked inputs. */ - for (i = 0; i < 4; i++) { - w = _mm_loadu_epi8(in + 16*i) ^ t[i]; - q[i] = aes_sse2_interleave_in(w); - t[i + 1] = aes_sse2_xts_update(t[i]); - } - - /* Encrypt four blocks. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the tweaked outputs. */ - for (i = 0; i < 4; i++) { - w = aes_sse2_interleave_out(q[i]); - _mm_storeu_epi8(out + 16*i, w ^ t[i]); - } - - /* Advance to the next block. */ - t[0] = t[4]; - in += 64; - out += 64; - nbytes -= 64; - } while (nbytes); - -out: /* Store the updated tweak. */ - _mm_storeu_epi8(tweak, t[0]); - - /* Paranoia: Zero temporary buffers. 
*/ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); - explicit_memset(t, 0, sizeof t); -} - -void -aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i w; - __m128i t[5]; - unsigned i; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64); - - /* Load tweak. */ - t[0] = _mm_loadu_epi8(tweak); - - /* Handle the first block separately if odd number. */ - if (nbytes % (4*16)) { - /* Load up the tweaked inputs. */ - for (i = 0; i < (nbytes/16) % 4; i++) { - w = _mm_loadu_epi8(in + 16*i) ^ t[i]; - q[i] = aes_sse2_interleave_in(w); - t[i + 1] = aes_sse2_xts_update(t[i]); - } - for (; i < 4; i++) - q[i] = _mm_setzero_si128(); - - /* Decrypt up to four blocks. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the tweaked outputs. */ - for (i = 0; i < (nbytes/16) % 4; i++) { - w = aes_sse2_interleave_out(q[i]); - _mm_storeu_epi8(out + 16*i, w ^ t[i]); - } - - /* Advance to the next block. */ - t[0] = t[i]; - in += nbytes % (4*16); - out += nbytes % (4*16); - nbytes -= nbytes % (4*16); - if (nbytes == 0) - goto out; - } - - do { - KASSERT(nbytes % 64 == 0); - KASSERT(nbytes >= 64); - - /* Load up the tweaked inputs. */ - for (i = 0; i < 4; i++) { - w = _mm_loadu_epi8(in + 16*i) ^ t[i]; - q[i] = aes_sse2_interleave_in(w); - t[i + 1] = aes_sse2_xts_update(t[i]); - } - - /* Decrypt four blocks. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_decrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Store the tweaked outputs. */ - for (i = 0; i < 4; i++) { - w = aes_sse2_interleave_out(q[i]); - _mm_storeu_epi8(out + 16*i, w ^ t[i]); - } - - /* Advance to the next block. */ - t[0] = t[4]; - in += 64; - out += 64; - nbytes -= 64; - } while (nbytes); - -out: /* Store the updated tweak. */ - _mm_storeu_epi8(tweak, t[0]); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); - explicit_memset(t, 0, sizeof t); -} - -void -aes_sse2_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], - size_t nbytes, uint8_t auth[static 16], uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load initial authenticator. */ - q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(auth)); - - for (; nbytes; nbytes -= 16, in += 16) { - q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in)); - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - } - - /* Store updated authenticator. */ - _mm_storeu_epi8(auth, aes_sse2_interleave_out(q[0])); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i ctr; - uint32_t c0, c1, c2, c3; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. 
*/ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Set first block to authenticator. */ - q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr)); - - /* Load initial counter block, big-endian so we can increment it. */ - c0 = le32dec(authctr + 16 + 4*0); - c1 = le32dec(authctr + 16 + 4*1); - c2 = le32dec(authctr + 16 + 4*2); - c3 = be32dec(authctr + 16 + 4*3); - - /* Set other blocks to garbage -- can't take advantage. */ - q[2] = q[3] = _mm_setzero_si128(); - - for (; nbytes; nbytes -= 16, in += 16, out += 16) { - /* Update authenticator. */ - q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in)); - - /* Increment 32-bit counter. */ - ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0); - q[1] = aes_sse2_interleave_in(ctr); - - /* Encrypt authenticator and counter. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Encrypt with CTR output. */ - _mm_storeu_epi8(out, - _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[1])); - } - - /* Update authenticator. */ - _mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[0])); - - /* Update counter. */ - be32enc(authctr + 16 + 4*3, c3); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -void -aes_sse2_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], - uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], - uint32_t nrounds) -{ - uint64_t sk_exp[120]; - __m128i q[4]; - __m128i ctr, block; - uint32_t c0, c1, c2, c3; - - KASSERT(nbytes); - KASSERT(nbytes % 16 == 0); - - /* Expand round keys for bitslicing. */ - aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64); - - /* Load initial counter block, big-endian so we can increment it. */ - c0 = le32dec(authctr + 16 + 4*0); - c1 = le32dec(authctr + 16 + 4*1); - c2 = le32dec(authctr + 16 + 4*2); - c3 = be32dec(authctr + 16 + 4*3); - - /* Increment 32-bit counter. */ - ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0); - q[0] = aes_sse2_interleave_in(ctr); - - /* - * Set the other blocks to garbage -- we don't have any - * plaintext to authenticate yet. - */ - q[1] = q[2] = q[3] = _mm_setzero_si128(); - - /* Encrypt first CTR. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Load the initial authenticator. */ - q[1] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr)); - - for (;; in += 16, out += 16) { - /* Decrypt the block. */ - block = _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[0]); - - /* Update authenticator. */ - q[1] ^= aes_sse2_interleave_in(block); - - /* Store plaintext. */ - _mm_storeu_epi8(out, block); - - /* If this is the last block, stop. */ - if ((nbytes -= 16) == 0) - break; - - /* Increment 32-bit counter. */ - ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0); - q[0] = aes_sse2_interleave_in(ctr); - - /* Authenticate previous plaintext, encrypt next CTR. */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - } - - /* - * Authenticate last plaintext. We're only doing this for the - * authenticator, not for the counter, so don't bother to - * initialize q[0], q[2], q[3]. (Even for the sake of - * sanitizers, they're already initialized to something by - * now.) - */ - aes_sse2_ortho(q); - aes_sse2_bitslice_encrypt(nrounds, sk_exp, q); - aes_sse2_ortho(q); - - /* Update authenticator. */ - _mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[1])); - - /* Update counter. 
*/ - be32enc(authctr + 16 + 4*3, c3); - - /* Paranoia: Zero temporary buffers. */ - explicit_memset(sk_exp, 0, sizeof sk_exp); - explicit_memset(q, 0, sizeof q); -} - -int -aes_sse2_selftest(void) -{ - - if (aes_sse2_xts_update_selftest()) - return -1; - - /* XXX test aes_sse2_bitslice_decrypt */ - /* XXX test aes_sse2_bitslice_encrypt */ - /* XXX test aes_sse2_keysched */ - /* XXX test aes_sse2_ortho */ - /* XXX test aes_sse2_skey_expand */ - - return 0; -} diff -r cce15febbf04 -r 4ef139778465 sys/crypto/aes/arch/x86/files.aessse2 --- a/sys/crypto/aes/arch/x86/files.aessse2 Sat Nov 22 05:28:14 2025 +0000 +++ b/sys/crypto/aes/arch/x86/files.aessse2 Sat Nov 22 06:22:31 2025 +0000 @@ -1,12 +1,12 @@ # $NetBSD: files.aessse2,v 1.2 2020/06/29 23:50:05 riastradh Exp $ -makeoptions aes "COPTS.aes_sse2.c"+="-msse -msse2" -makeoptions aes "COPTS.aes_sse2_dec.c"+="-msse -msse2" -makeoptions aes "COPTS.aes_sse2_enc.c"+="-msse -msse2" -makeoptions aes "COPTS.aes_sse2_subr.c"+="-msse -msse2" +makeoptions aes "COPTS.aes_sse2_4x32.c"+="-msse -msse2" +makeoptions aes "COPTS.aes_sse2_4x32_dec.c"+="-msse -msse2" +makeoptions aes "COPTS.aes_sse2_4x32_enc.c"+="-msse -msse2" +makeoptions aes "COPTS.aes_sse2_4x32_subr.c"+="-msse -msse2" -file crypto/aes/arch/x86/aes_sse2.c aes -file crypto/aes/arch/x86/aes_sse2_dec.c aes -file crypto/aes/arch/x86/aes_sse2_enc.c aes -file crypto/aes/arch/x86/aes_sse2_impl.c aes -file crypto/aes/arch/x86/aes_sse2_subr.c aes +file crypto/aes/arch/x86/aes_sse2_4x32.c aes +file crypto/aes/arch/x86/aes_sse2_4x32_dec.c aes +file crypto/aes/arch/x86/aes_sse2_4x32_enc.c aes +file crypto/aes/arch/x86/aes_sse2_4x32_impl.c aes +file crypto/aes/arch/x86/aes_sse2_4x32_subr.c aes diff -r cce15febbf04 -r 4ef139778465 tests/sys/crypto/aes/Makefile --- a/tests/sys/crypto/aes/Makefile Sat Nov 22 05:28:14 2025 +0000 +++ b/tests/sys/crypto/aes/Makefile Sat Nov 22 06:22:31 2025 +0000 @@ -65,15 +65,15 @@ SRCS.t_aes+= aes_ni.c SRCS.t_aes+= aes_ni_64.S .endif -SRCS.t_aes+= aes_sse2.c -SRCS.t_aes+= aes_sse2_dec.c -SRCS.t_aes+= aes_sse2_enc.c -SRCS.t_aes+= aes_sse2_impl.c -SRCS.t_aes+= aes_sse2_subr.c -COPTS.aes_sse2.c+= -msse -msse2 -COPTS.aes_sse2_dec.c+= -msse -msse2 -COPTS.aes_sse2_enc.c+= -msse -msse2 -COPTS.aes_sse2_subr.c+= -msse -msse2 +SRCS.t_aes+= aes_sse2_4x32.c +SRCS.t_aes+= aes_sse2_4x32_dec.c +SRCS.t_aes+= aes_sse2_4x32_enc.c +SRCS.t_aes+= aes_sse2_4x32_impl.c +SRCS.t_aes+= aes_sse2_4x32_subr.c +COPTS.aes_sse2_4x32.c+= -msse -msse2 +COPTS.aes_sse2_4x32_dec.c+= -msse -msse2 +COPTS.aes_sse2_4x32_enc.c+= -msse -msse2 +COPTS.aes_sse2_4x32_subr.c+= -msse -msse2 SRCS.t_aes+= aes_ssse3.c SRCS.t_aes+= aes_ssse3_impl.c @@ -95,7 +95,7 @@ COPTS.aes_bear64.c+= ${CC_WNO_STRINGOP_O COPTS.aes_neon_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ni.c+= ${CC_WNO_STRINGOP_OVERREAD} ${CC_WNO_STRINGOP_OVERFLOW} -COPTS.aes_sse2_subr.c+= ${CC_WNO_ARRAY_BOUNDS} +COPTS.aes_sse2_4x32_subr.c+= ${CC_WNO_ARRAY_BOUNDS} COPTS.aes_ssse3_subr.c+=${CC_WNO_ARRAY_BOUNDS} COPTS.aes_via.c+= ${CC_WNO_ARRAY_BOUNDS} diff -r cce15febbf04 -r 4ef139778465 tests/sys/crypto/aes/t_aes.c --- a/tests/sys/crypto/aes/t_aes.c Sat Nov 22 05:28:14 2025 +0000 +++ b/tests/sys/crypto/aes/t_aes.c Sat Nov 22 06:22:31 2025 +0000 @@ -35,7 +35,7 @@ #if defined(__i386__) || defined(__x86_64__) #include -#include +#include #include #include #endif @@ -125,8 +125,8 @@ AES_SELFTEST(aes_ni_selftest, &aes_ni_im #endif #if defined(__i386__) || defined(__x86_64__) -AES_SELFTEST(aes_sse2_selftest, &aes_sse2_impl, - "Intel SSE2 bitsliced self-test") 
+AES_SELFTEST(aes_sse2_4x32_selftest, &aes_sse2_4x32_impl, + "Intel SSE2 4x32 bitsliced self-test") AES_SELFTEST(aes_ssse3_selftest, &aes_ssse3_impl, "Intel SSSE3 vpaes self-test") AES_SELFTEST(aes_via_selftest, &aes_via_impl, "VIA ACE AES self-test") @@ -151,7 +151,7 @@ ATF_TP_ADD_TCS(tp) #endif #if defined(__i386__) || defined(__x86_64__) - ATF_TP_ADD_TC(tp, aes_sse2_selftest); + ATF_TP_ADD_TC(tp, aes_sse2_4x32_selftest); ATF_TP_ADD_TC(tp, aes_ssse3_selftest); ATF_TP_ADD_TC(tp, aes_via_selftest); #endif