/* $NetBSD$ */ /*- * Copyright (c) 2014 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Legacy arc4random(3) API from OpenBSD, reimplemented to use the * ChaCha stream cipher, which in 2014 remains unbroken with a high * security margin (only attack better than brute force is against 7 * rounds of 20), unlike RC4, which is completely broken. * * For a 256-bit key k and 128-bit input i, ChaCha_k(i) yields a * 512-bit output. ChaCha is conjectured to be a PRF -- that is, if K * is a random variable in {0,1}^256 with uniform distribution, then * the ChaCha_K random variable is computationally indistinguishable * from a random variable in functions {0,1}^128 --> {0,1}^512 with * uniform distribution. Computing ChaCha_k(i) for any k, i takes * about 300 cycles on an Intel Ivy Bridge CPU with naive C code. * * The arc4random(3) state is a 256-bit key from sysctl(KERN_URND), a * 128-bit nonce, and a 512-bit output buffer aligned to 32-bit words. * Whenever the buffer is exhausted we refill it with a single ChaCha * output and increment a 128-bit counter. Long requests are served by * generating a key from the buffer and producing a new stream from * that key. The state is per-thread if _REENTRANT is defined, or * global if not. * * CPRNG state is stored in pages mapped so that they are inherited as * zero in the child. That way, if the child uses arc4random, it will * request a new key from the kernel so that its subsequent output will * be distinct from the parent's, and if the child drops privileges, it * won't be able to see the parent's secrets. */ #include __RCSID("$NetBSD$"); #include "namespace.h" #include "reentrant.h" #include #include #include #include #include #include #include #include #include #include #ifdef __weak_alias __weak_alias(arc4random,_arc4random) __weak_alias(arc4random_addrandom,_arc4random_addrandom) __weak_alias(arc4random_buf,_arc4random_buf) __weak_alias(arc4random_stir,_arc4random_stir) __weak_alias(arc4random_uniform,_arc4random_uniform) #endif /* ChaCha core */ #define crypto_core_OUTPUTWORDS 16 #define crypto_core_INPUTWORDS 4 #define crypto_core_KEYWORDS 8 #define crypto_core_CONSTWORDS 4 #define crypto_core_ROUNDS 20 static uint32_t rotate(uint32_t u, unsigned c) { return (u << c) | (u >> (32 - c)); } #define QUARTERROUND(a, b, c, d) do { \ (a) += (b); (d) ^= (a); (d) = rotate((d), 16); \ (c) += (d); (b) ^= (c); (b) = rotate((b), 12); \ (a) += (b); (d) ^= (a); (d) = rotate((d), 8); \ (c) += (d); (b) ^= (c); (b) = rotate((b), 7); \ } while (0) static void crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k, const uint32_t *c) { uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; int i; x0 = c[0]; x1 = c[1]; x2 = c[2]; x3 = c[3]; x4 = k[0]; x5 = k[1]; x6 = k[2]; x7 = k[3]; x8 = k[4]; x9 = k[5]; x10 = k[6]; x11 = k[7]; x12 = in[0]; x13 = in[1]; x14 = in[2]; x15 = in[3]; for (i = crypto_core_ROUNDS; i > 0; i -= 2) { QUARTERROUND( x0, x4, x8,x12); QUARTERROUND( x1, x5, x9,x13); QUARTERROUND( x2, x6,x10,x14); QUARTERROUND( x3, x7,x11,x15); QUARTERROUND( x0, x5,x10,x15); QUARTERROUND( x1, x6,x11,x12); QUARTERROUND( x2, x7, x8,x13); QUARTERROUND( x3, x4, x9,x14); } out[0] = x0 + c[0]; out[1] = x1 + c[1]; out[2] = x2 + c[2]; out[3] = x3 + c[3]; out[4] = x4 + k[0]; out[5] = x5 + k[1]; out[6] = x6 + k[2]; out[7] = x7 + k[3]; out[8] = x8 + k[4]; out[9] = x9 + k[5]; out[10] = x10 + k[6]; out[11] = x11 + k[7]; out[12] = x12 + in[0]; out[13] = x13 + in[1]; out[14] = x14 + in[2]; out[15] = x15 + in[3]; } /* `expand 32-byte k' */ static const uint32_t crypto_core_constant32[4] = { 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U, }; /* * Test vector for ChaCha20 from * , * test vectors for ChaCha12 and ChaCha8 generated by the same * crypto_core code with crypto_core_ROUNDS varied. */ #define check(E) do { if (!(E)) abort(); } while (0) static void crypto_core_selftest(void) { const uint32_t zero32[8] = {0}; const uint8_t sigma[] = "expand 32-byte k"; uint32_t block[16]; unsigned i; #if crypto_core_ROUNDS == 8 static const uint8_t out[64] = { 0x3e,0x00,0xef,0x2f,0x89,0x5f,0x40,0xd6, 0x7f,0x5b,0xb8,0xe8,0x1f,0x09,0xa5,0xa1, 0x2c,0x84,0x0e,0xc3,0xce,0x9a,0x7f,0x3b, 0x18,0x1b,0xe1,0x88,0xef,0x71,0x1a,0x1e, 0x98,0x4c,0xe1,0x72,0xb9,0x21,0x6f,0x41, 0x9f,0x44,0x53,0x67,0x45,0x6d,0x56,0x19, 0x31,0x4a,0x42,0xa3,0xda,0x86,0xb0,0x01, 0x38,0x7b,0xfd,0xb8,0x0e,0x0c,0xfe,0x42, }; #elif crypto_core_ROUNDS == 12 static const uint8_t out[64] = { 0x9b,0xf4,0x9a,0x6a,0x07,0x55,0xf9,0x53, 0x81,0x1f,0xce,0x12,0x5f,0x26,0x83,0xd5, 0x04,0x29,0xc3,0xbb,0x49,0xe0,0x74,0x14, 0x7e,0x00,0x89,0xa5,0x2e,0xae,0x15,0x5f, 0x05,0x64,0xf8,0x79,0xd2,0x7a,0xe3,0xc0, 0x2c,0xe8,0x28,0x34,0xac,0xfa,0x8c,0x79, 0x3a,0x62,0x9f,0x2c,0xa0,0xde,0x69,0x19, 0x61,0x0b,0xe8,0x2f,0x41,0x13,0x26,0xbe, }; #elif crypto_core_ROUNDS == 20 static const uint8_t out[64] = { 0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90, 0x40,0x5d,0x6a,0xe5,0x53,0x86,0xbd,0x28, 0xbd,0xd2,0x19,0xb8,0xa0,0x8d,0xed,0x1a, 0xa8,0x36,0xef,0xcc,0x8b,0x77,0x0d,0xc7, 0xda,0x41,0x59,0x7c,0x51,0x57,0x48,0x8d, 0x77,0x24,0xe0,0x3f,0xb8,0xd8,0x4a,0x37, 0x6a,0x43,0xb8,0xf4,0x15,0x18,0xa1,0x1c, 0xc3,0x87,0xb6,0x69,0xb2,0xee,0x65,0x86, }; #else #error crypto_core_ROUNDS must be 8, 12, or 20. #endif check(crypto_core_constant32[0] == le32dec(&sigma[0])); check(crypto_core_constant32[1] == le32dec(&sigma[4])); check(crypto_core_constant32[2] == le32dec(&sigma[8])); check(crypto_core_constant32[3] == le32dec(&sigma[12])); crypto_core(block, zero32, zero32, crypto_core_constant32); for (i = 0; i < 16; i++) check(block[i] == le32dec(&out[i*4])); } #undef check /* CPRNG algorithm */ /* * The state consists of a key, the current nonce, and a 64-byte buffer * of output. Since we fill the buffer only when we need output, and * eat a 32-bit word at a time, one 32-bit word of the buffer would be * wasted. Instead, we repurpose it to count the number of entries in * the buffer remaining, counting from high to low in order to allow * comparison to zero to detect when we need to refill it. */ #define CPRNG_BUFIDX (crypto_core_OUTPUTWORDS - 1) #define CPRNG_SEED_BYTES (crypto_core_KEYWORDS * sizeof(uint32_t)) struct cprng { uint32_t buffer[crypto_core_OUTPUTWORDS]; uint32_t key[crypto_core_KEYWORDS]; uint32_t nonce[crypto_core_INPUTWORDS]; }; __CTASSERT(sizeof ((struct cprng *)0)->key == CPRNG_SEED_BYTES); static void cprng_seed(struct cprng *cprng, const void *seed) { (void)memset(cprng->buffer, 0, sizeof cprng->buffer); (void)memcpy(cprng->key, seed, sizeof cprng->key); (void)memset(cprng->nonce, 0, sizeof cprng->nonce); } static inline uint32_t cprng_word(struct cprng *cprng) { uint32_t v; if (__predict_true(0 < cprng->buffer[CPRNG_BUFIDX])) { v = cprng->buffer[--cprng->buffer[CPRNG_BUFIDX]]; } else { /* If we don't have enough words, refill the buffer. */ crypto_core(cprng->buffer, cprng->nonce, cprng->key, crypto_core_constant32); if (++cprng->nonce[0] == 0) cprng->nonce[1]++; v = cprng->buffer[CPRNG_BUFIDX]; cprng->buffer[CPRNG_BUFIDX] = CPRNG_BUFIDX; } return v; } static inline void cprng_buf(struct cprng *cprng, void *buf, unsigned n) { uint8_t *p = buf; uint32_t v; unsigned w, r; w = n / sizeof(uint32_t); while (w--) { v = cprng_word(cprng); (void)memcpy(p, &v, 4); p += 4; } r = n % sizeof(uint32_t); if (r) { v = cprng_word(cprng); while (r--) { *p++ = (v & 0xff); v >>= 8; } } } /* * crypto_onetimestream: Expand a short unpredictable one-time seed * into a long unpredictable output. */ static void crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS], void *buf, size_t n) { uint32_t block[crypto_core_OUTPUTWORDS]; uint32_t nonce[crypto_core_INPUTWORDS] = {0}; uint8_t *p8; uint32_t *p32; size_t ni, nb, nf; /* * Guarantee we can generate up to n bytes. We have * 2^(32*INPUTWORDS) possible inputs yielding output of * 4*OUTPUTWORDS*2^(32*INPUTWORDS) bytes. It suffices to * require that sizeof n > (1/CHAR_BIT) log_2 n be less than * (1/CHAR_BIT) log_2 of the total output stream length. We * have * * log_2 (4 o 2^(32 i)) = log_2 (4 o) + log_2 2^(32 i) * = 2 + log_2 o + 32 i. */ __CTASSERT(CHAR_BIT*sizeof n <= (2 + ilog2(crypto_core_OUTPUTWORDS) + 32*crypto_core_INPUTWORDS)); p8 = buf; p32 = (uint32_t *)roundup2((uintptr_t)p8, sizeof(uint32_t)); ni = (uint8_t *)p32 - p8; if (n < ni) ni = n; nb = (n - ni) / sizeof block; nf = (n - ni) % sizeof block; _DIAGASSERT(((uintptr_t)p32 & 3) == 0); _DIAGASSERT(ni <= n); _DIAGASSERT(nb <= (n / sizeof block)); _DIAGASSERT(nf <= n); _DIAGASSERT(n == (ni + (nb * sizeof block) + nf)); _DIAGASSERT(ni < sizeof(uint32_t)); _DIAGASSERT(nf < sizeof block); if (ni) { crypto_core(block, nonce, seed, crypto_core_constant32); nonce[0]++; (void)memcpy(p8, block, ni); } while (nb--) { crypto_core(p32, nonce, seed, crypto_core_constant32); if (++nonce[0] == 0) nonce[1]++; p32 += crypto_core_OUTPUTWORDS; } if (nf) { crypto_core(block, nonce, seed, crypto_core_constant32); if (++nonce[0] == 0) nonce[1]++; (void)memcpy(p32, block, nf); } if (ni | nf) (void)explicit_memset(block, 0, sizeof block); } /* CPRNG state: per-thread, per-process (zeroed in child on fork) */ struct arc4random_cprng { struct cprng arc4_cprng; bool arc4_seeded:1; }; static void arc4random_cprng_addrandom(struct arc4random_cprng *cprng, const void *data, size_t datalen) { const int mib[] = { CTL_KERN, KERN_ARND }; uint8_t seed[CPRNG_SEED_BYTES]; size_t seedlen = sizeof seed; __CTASSERT(sizeof seed == SHA256_DIGEST_LENGTH); if (sysctl(mib, __arraycount(mib), seed, &seedlen, NULL, 0) == -1) abort(); if (seedlen != sizeof seed) abort(); if (data != NULL) { SHA256_CTX ctx; SHA256_Init(&ctx); SHA256_Update(&ctx, seed, sizeof seed); SHA256_Update(&ctx, data, datalen); SHA256_Final(seed, &ctx); (void)explicit_memset(&ctx, 0, sizeof ctx); } cprng_seed(&cprng->arc4_cprng, seed); (void)explicit_memset(seed, 0, sizeof seed); cprng->arc4_seeded = true; } static __noinline struct arc4random_cprng * arc4random_cprng_create(void) { struct arc4random_cprng *cprng; const size_t size = roundup(sizeof(*cprng), sysconf(_SC_PAGESIZE)); crypto_core_selftest(); cprng = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANON, -1, 0); if (cprng == MAP_FAILED) abort(); #ifdef MAP_INHERIT_ZERO if (minherit(cprng, size, MAP_INHERIT_ZERO) == -1) abort(); #else #warning This arc4random is not fork-safe! #endif (void)memset(cprng, 0, sizeof cprng); arc4random_cprng_addrandom(cprng, NULL, 0); return cprng; } /* Library state */ #if !defined(_REENTRANT) #define __arc4random_thread /* global */ #elif defined(__HAVE_TLS_VARIANT_I) || defined(__HAVE_TLS_VARIANT_II) #define __arc4random_thread __thread #endif #ifdef __arc4random_thread static __arc4random_thread struct arc4random_cprng *arc4random_cprng_tsd; static inline struct arc4random_cprng * arc4random_cprng(void) { struct arc4random_cprng *cprng; cprng = arc4random_cprng_tsd; if (__predict_false(cprng == NULL)) cprng = arc4random_cprng_tsd = arc4random_cprng_create(); if (__predict_false(!cprng->arc4_seeded)) arc4random_cprng_addrandom(cprng, NULL, 0); return cprng; } #else /* !defined(__arc4random_thread) */ static struct { mutex_t lock; thread_key_t thread_key; /* struct arc4random_cprng * */ bool initialized; } arc4random_global = { .lock = MUTEX_INITIALIZER, .initialized = false, }; static void arc4random_initialize(void) { mutex_lock(&arc4random_global.lock); if (!arc4random_global.initialized) { if (thr_keycreate(&arc4random_global.thread_key, NULL) != 0) abort(); arc4random_global.initialized = true; } mutex_unlock(&arc4random_global.lock); } static inline struct arc4random_cprng * arc4random_cprng(void) { struct arc4random_cprng *cprng; if (__predict_false(!arc4random_global.initialized)) arc4random_initialize(); cprng = thr_getspecific(arc4random_global.thread_key); if (__predict_false(cprng == NULL)) { cprng = arc4random_cprng_create(); thr_setspecific(arc4random_global.thread_key, cprng); } if (__predict_false(!cprng->arc4_seeded)) arc4random_cprng_addrandom(cprng, NULL, 0); return cprng; } #endif /* defined(__arc4random_thread) */ /* Public API */ uint32_t arc4random(void) { return cprng_word(&arc4random_cprng()->arc4_cprng); } static void arc4random_buf_short(void *buf, size_t len) { struct arc4random_cprng *const cprng = arc4random_cprng(); cprng_buf(&cprng->arc4_cprng, buf, len); } /* * Keep this out-of-line to avoid allocating a giant stack frame * unconditionally in arc4random_buf, which measurably slows it down * for short requests. */ static __noinline void arc4random_buf_long(void *buf, size_t len) { struct arc4random_cprng *const cprng = arc4random_cprng(); uint32_t seed[crypto_core_KEYWORDS]; cprng_buf(&cprng->arc4_cprng, seed, sizeof seed); crypto_onetimestream(seed, buf, len); } void arc4random_buf(void *buf, size_t len) { /* 1024 chosen by some kludgey measurements to optimize throughput. */ if (__predict_true(len <= 1024)) arc4random_buf_short(buf, len); else arc4random_buf_long(buf, len); } uint32_t arc4random_uniform(uint32_t bound) { struct arc4random_cprng *const cprng = arc4random_cprng(); uint32_t minimum, r; /* * We want a uniform random choice in [0, n), and arc4random() * makes a uniform random choice in [0, 2^32). If we reduce * that modulo n, values in [0, 2^32 mod n) will be represented * slightly more than values in [2^32 mod n, n). Instead we * choose only from [2^32 mod n, 2^32) by rejecting samples in * [0, 2^32 mod n), to avoid counting the extra representative * of [0, 2^32 mod n). To compute 2^32 mod n, note that * * 2^32 mod n = 2^32 mod n - 0 * = 2^32 mod n - n mod n * = (2^32 - n) mod n, * * the last of which is what we compute in 32-bit arithmetic. */ minimum = (-bound % bound); do r = cprng_word(&cprng->arc4_cprng); while (r < minimum); return (r % bound); } void arc4random_stir(void) { struct arc4random_cprng *const cprng = arc4random_cprng(); arc4random_cprng_addrandom(cprng, NULL, 0); } /* * Silly signature here is for hysterical raisins. Should instead be * const void *data and size_t datalen. */ void arc4random_addrandom(u_char *data, int datalen) { struct arc4random_cprng *const cprng = arc4random_cprng(); _DIAGASSERT(0 <= datalen); arc4random_cprng_addrandom(cprng, data, datalen); } #ifdef _ARC4RANDOM_TEST #include #include #include int main(int argc __unused, char **argv __unused) { const uint8_t zero64[64] = {0}; uint8_t buf[2048]; unsigned i, a, n; /* Test arc4random: should not be deterministic. */ if (printf("arc4random: %08"PRIx32"\n", arc4random()) < 0) err(1, "printf"); /* Test stirring: should definitely not be deterministic. */ arc4random_stir(); /* Test small buffer. */ arc4random_buf(buf, 8); if (printf("arc4randombuf small:") < 0) err(1, "printf"); for (i = 0; i < 8; i++) if (printf(" %02x", buf[i]) < 0) err(1, "printf"); if (printf("\n") < 0) err(1, "printf"); /* Test addrandom: should not make the rest deterministic. */ arc4random_addrandom((void *)crypto_core_constant32, sizeof crypto_core_constant32); /* Test large buffer. */ arc4random_buf(buf, sizeof buf); if (printf("arc4randombuf_large:") < 0) err(1, "printf"); for (i = 0; i < sizeof buf; i++) if (printf(" %02x", buf[i]) < 0) err(1, "printf"); if (printf("\n") < 0) err(1, "printf"); /* Test misaligned small and large. */ for (a = 0; a < 64; a++) { for (n = a; n < sizeof buf; n++) { (void)memset(buf, 0, sizeof buf); arc4random_buf(buf, n - a); if (memcmp(buf + n - a, zero64, a) != 0) errx(1, "arc4random buffer overflow 0"); (void)memset(buf, 0, sizeof buf); arc4random_buf(buf + a, n - a); if (memcmp(buf, zero64, a) != 0) errx(1, "arc4random buffer overflow 1"); if ((2*a) <= n) { (void)memset(buf, 0, sizeof buf); arc4random_buf(buf + a, n - a - a); if (memcmp(buf + n - a, zero64, a) != 0) errx(1, "arc4random buffer overflow 2"); } } } /* Test fork-safety. */ { pid_t pid, rpid; int status; pid = fork(); switch (pid) { case -1: err(1, "fork"); case 0: _exit(arc4random_cprng()->arc4_seeded); default: rpid = waitpid(pid, &status, 0); if (rpid == -1) err(1, "waitpid"); if (rpid != pid) errx(1, "waitpid returned wrong pid" ": %"PRIdMAX" != %"PRIdMAX, (intmax_t)rpid, (intmax_t)pid); if (WIFEXITED(status)) { if (WEXITSTATUS(status) != 0) errx(1, "child exited with %d", WEXITSTATUS(status)); } else if (WIFSIGNALED(status)) { errx(1, "child terminated on signal %d", WTERMSIG(status)); } else { errx(1, "child died mysteriously: %d", status); } } } return 0; } #endif