monocypher.c (110141B)
1 // Monocypher version 3.1.2 2 // 3 // This file is dual-licensed. Choose whichever licence you want from 4 // the two licences listed below. 5 // 6 // The first licence is a regular 2-clause BSD licence. The second licence 7 // is the CC-0 from Creative Commons. It is intended to release Monocypher 8 // to the public domain. The BSD licence serves as a fallback option. 9 // 10 // SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0 11 // 12 // ------------------------------------------------------------------------ 13 // 14 // Copyright (c) 2017-2020, Loup Vaillant 15 // All rights reserved. 16 // 17 // 18 // Redistribution and use in source and binary forms, with or without 19 // modification, are permitted provided that the following conditions are 20 // met: 21 // 22 // 1. Redistributions of source code must retain the above copyright 23 // notice, this list of conditions and the following disclaimer. 24 // 25 // 2. Redistributions in binary form must reproduce the above copyright 26 // notice, this list of conditions and the following disclaimer in the 27 // documentation and/or other materials provided with the 28 // distribution. 29 // 30 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 31 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 32 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 33 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 34 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 35 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 36 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 37 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 38 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 39 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 40 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//
// ------------------------------------------------------------------------
//
// Written in 2017-2020 by Loup Vaillant
//
// To the extent possible under law, the author(s) have dedicated all copyright
// and related neighboring rights to this software to the public domain
// worldwide. This software is distributed without any warranty.
//
// You should have received a copy of the CC0 Public Domain Dedication along
// with this software. If not, see
// <https://creativecommons.org/publicdomain/zero/1.0/>

#include "monocypher.h"

/////////////////
/// Utilities ///
/////////////////
#define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
#define FOR(i, start, end)         FOR_T(size_t, i, start, end)
#define COPY(dst, src, size)       FOR(i, 0, size) (dst)[i] = (src)[i]
#define ZERO(buf, size)            FOR(i, 0, size) (buf)[i] = 0
#define WIPE_CTX(ctx)              crypto_wipe(ctx   , sizeof(*(ctx)))
#define WIPE_BUFFER(buffer)        crypto_wipe(buffer, sizeof(buffer))
#define MIN(a, b)                  ((a) <= (b) ? (a) : (b))
#define MAX(a, b)                  ((a) >= (b) ? (a) : (b))

// Short aliases for the fixed-width integer types.
typedef int8_t   i8;
typedef uint8_t  u8;
typedef int16_t  i16;
typedef uint32_t u32;
typedef int32_t  i32;
typedef int64_t  i64;
typedef uint64_t u64;

// All-zero buffer, used as a default input (e.g. keystream generation).
static const u8 zero[128] = {0};

// returns the smallest positive integer y such that
// (x + y) % pow_2 == 0
// Basically, it's how many bytes we need to add to "align" x.
// Only works when pow_2 is a power of 2.
// Note: we use ~x+1 instead of -x to avoid compiler warnings
static size_t align(size_t x, size_t pow_2)
{
    return (~x + 1) & (pow_2 - 1);
}

// Endianness-neutral little-endian loads and stores.
// Byte-by-byte access avoids alignment and strict-aliasing issues.
static u32 load24_le(const u8 s[3])
{
    return (u32)s[0]
        | ((u32)s[1] << 8)
        | ((u32)s[2] << 16);
}

static u32 load32_le(const u8 s[4])
{
    return (u32)s[0]
        | ((u32)s[1] << 8)
        | ((u32)s[2] << 16)
        | ((u32)s[3] << 24);
}

static u64 load64_le(const u8 s[8])
{
    return load32_le(s) | ((u64)load32_le(s+4) << 32);
}

static void store32_le(u8 out[4], u32 in)
{
    out[0] =  in        & 0xff;
    out[1] = (in >>  8) & 0xff;
    out[2] = (in >> 16) & 0xff;
    out[3] = (in >> 24) & 0xff;
}

static void store64_le(u8 out[8], u64 in)
{
    store32_le(out    , (u32)in );
    store32_le(out + 4, in >> 32);
}

// Bulk variants: convert `size` words between byte buffers and word arrays.
static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
    FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
}
static void load64_le_buf (u64 *dst, const u8 *src, size_t size) {
    FOR(i, 0, size) { dst[i] = load64_le(src + i*8); }
}
static void store32_le_buf(u8 *dst, const u32 *src, size_t size) {
    FOR(i, 0, size) { store32_le(dst + i*4, src[i]); }
}
static void store64_le_buf(u8 *dst, const u64 *src, size_t size) {
    FOR(i, 0, size) { store64_le(dst + i*8, src[i]); }
}

// Bit rotations (callers never pass n == 0, so the (width - n) shift is safe).
static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }

static int neq0(u64 diff)
{   // constant time comparison to zero
    // return diff != 0 ? -1 : 0
    u64 half = (diff >> 32) | ((u32)diff);
    return (1 & ((half - 1) >> 32)) - 1;
}

// Constant-time difference accumulators: OR together the XOR of each
// 64-bit lane; the result is zero iff the buffers are equal.
static u64 x16(const u8 a[16], const u8 b[16])
{
    return (load64_le(a + 0) ^ load64_le(b + 0))
        |  (load64_le(a + 8) ^ load64_le(b + 8));
}
static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); }

// Erases secrets.  The volatile pointer keeps the compiler from
// optimising the stores away (the buffer is "dead" after this call).
void crypto_wipe(void *secret, size_t size)
{
    volatile u8 *v_secret = (u8*)secret;
    ZERO(v_secret, size);
}

/////////////////
/// Chacha 20 ///
/////////////////
#define QUARTERROUND(a, b, c, d)     \
    a += b;  d = rotl32(d ^ a, 16);  \
    c += d;  b = rotl32(b ^ c, 12);  \
    a += b;  d = rotl32(d ^ a,  8);  \
    c += d;  b = rotl32(b ^ c,  7)

// 20 ChaCha rounds (no feed-forward; callers add `in` back themselves).
// Note: out and in may alias (crypto_hchacha20 relies on this).
static void chacha20_rounds(u32 out[16], const u32 in[16])
{
    // The temporary variables make Chacha20 10% faster.
    u32 t0  = in[ 0];  u32 t1  = in[ 1];  u32 t2  = in[ 2];  u32 t3  = in[ 3];
    u32 t4  = in[ 4];  u32 t5  = in[ 5];  u32 t6  = in[ 6];  u32 t7  = in[ 7];
    u32 t8  = in[ 8];  u32 t9  = in[ 9];  u32 t10 = in[10];  u32 t11 = in[11];
    u32 t12 = in[12];  u32 t13 = in[13];  u32 t14 = in[14];  u32 t15 = in[15];

    FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
        QUARTERROUND(t0, t4, t8 , t12); // column 0
        QUARTERROUND(t1, t5, t9 , t13); // column 1
        QUARTERROUND(t2, t6, t10, t14); // column 2
        QUARTERROUND(t3, t7, t11, t15); // column 3
        QUARTERROUND(t0, t5, t10, t15); // diagonal 0
        QUARTERROUND(t1, t6, t11, t12); // diagonal 1
        QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
        QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
    }
    out[ 0] = t0;   out[ 1] = t1;   out[ 2] = t2;   out[ 3] = t3;
    out[ 4] = t4;   out[ 5] = t5;   out[ 6] = t6;   out[ 7] = t7;
    out[ 8] = t8;   out[ 9] = t9;   out[10] = t10;  out[11] = t11;
    out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
}

// Fills words 0-11 of the ChaCha state: the "expand 32-byte k"
// constant followed by the 256-bit key.
static void chacha20_init_key(u32 block[16], const u8 key[32])
{
    load32_le_buf(block  , (const u8*)"expand 32-byte k", 4); // constant
    load32_le_buf(block+4, key                          , 8); // key
}

// HChaCha20: derives a sub-key from a key and a 128-bit input.
// Used by XChaCha20 to extend the nonce.
void crypto_hchacha20(u8 out[32], const u8 key[32], const u8 in [16])
{
    u32 block[16];
    chacha20_init_key(block, key);
    // input
    load32_le_buf(block + 12, in, 4);
    chacha20_rounds(block, block);
    // prevent reversal of the rounds by revealing only half of the buffer.
    store32_le_buf(out   , block   , 4); // constant
    store32_le_buf(out+16, block+12, 4); // counter and nonce
    WIPE_BUFFER(block);
}

// ChaCha20 with an explicit 64-bit block counter.
// plain_text may be NULL, in which case raw keystream is produced.
// Returns the counter value for the next block.
u64 crypto_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
                        size_t text_size, const u8 key[32], const u8 nonce[8],
                        u64 ctr)
{
    u32 input[16];
    chacha20_init_key(input, key);
    input[12] = (u32) ctr;
    input[13] = (u32)(ctr >> 32);
    load32_le_buf(input+14, nonce, 2);

    // Whole blocks
    u32    pool[16];
    size_t nb_blocks = text_size >> 6;
    FOR (i, 0, nb_blocks) {
        chacha20_rounds(pool, input);
        if (plain_text != 0) {
            FOR (j, 0, 16) {
                u32 p = pool[j] + input[j];
                store32_le(cipher_text, p ^ load32_le(plain_text));
                cipher_text += 4;
                plain_text  += 4;
            }
        } else {
            FOR (j, 0, 16) {
                u32 p = pool[j] + input[j];
                store32_le(cipher_text, p);
                cipher_text += 4;
            }
        }
        input[12]++;
        if (input[12] == 0) {
            input[13]++;
        }
    }
    text_size &= 63;

    // Last (incomplete) block
    if (text_size > 0) {
        if (plain_text == 0) {
            plain_text = zero;
        }
        chacha20_rounds(pool, input);
        u8 tmp[64];
        FOR (i, 0, 16) {
            store32_le(tmp + i*4, pool[i] + input[i]);
        }
        FOR (i, 0, text_size) {
            cipher_text[i] = tmp[i] ^ plain_text[i];
        }
        WIPE_BUFFER(tmp);
    }
    ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);

    WIPE_BUFFER(pool);
    WIPE_BUFFER(input);
    return ctr;
}

// IETF (RFC 8439) variant: 96-bit nonce, 32-bit counter.
// Implemented by folding the first nonce word into the 64-bit counter.
u32 crypto_ietf_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
                             size_t text_size,
                             const u8 key[32], const u8 nonce[12], u32 ctr)
{
    u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
    return (u32)crypto_chacha20_ctr(cipher_text, plain_text, text_size,
                                    key, nonce + 4, big_ctr);
}

// XChaCha20: 192-bit nonce, via an HChaCha20-derived sub-key.
u64 crypto_xchacha20_ctr(u8 *cipher_text, const u8 *plain_text,
                         size_t text_size,
                         const u8 key[32], const u8 nonce[24], u64 ctr)
{
    u8 sub_key[32];
    crypto_hchacha20(sub_key, key, nonce);
    ctr = crypto_chacha20_ctr(cipher_text, plain_text, text_size,
                              sub_key, nonce+16, ctr);
    WIPE_BUFFER(sub_key);
    return ctr;
}

// Convenience wrappers starting at counter 0.
void crypto_chacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
                     const u8 key[32], const u8 nonce[8])
{
    crypto_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
}

void crypto_ietf_chacha20(u8 *cipher_text, const u8 *plain_text,
                          size_t text_size,
                          const u8 key[32], const u8 nonce[12])
{
    crypto_ietf_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
}

void crypto_xchacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
                      const u8 key[32], const u8 nonce[24])
{
    crypto_xchacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
}

/////////////////
/// Poly 1305 ///
/////////////////

// h = (h + c) * r
// preconditions:
//   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
//   ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff
//   ctx->r <=   0ffffffc_0ffffffc_0ffffffc_0fffffff
// Postcondition:
//   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
static void poly_block(crypto_poly1305_ctx *ctx)
{
    // s = h + c, without carry propagation
    const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe
    const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe
    const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe
    const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe
    const u32 s4 = ctx->h[4] +      ctx->c[4]; // s4 <=          5

    // Local all the things!
    const u32 r0  = ctx->r[0];       // r0  <= 0fffffff
    const u32 r1  = ctx->r[1];       // r1  <= 0ffffffc
    const u32 r2  = ctx->r[2];       // r2  <= 0ffffffc
    const u32 r3  = ctx->r[3];       // r3  <= 0ffffffc
    const u32 rr0 = (r0 >> 2) * 5;   // rr0 <= 13fffffb // lose 2 bits...
    const u32 rr1 = (r1 >> 2) + r1;  // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
    const u32 rr2 = (r2 >> 2) + r2;  // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5
    const u32 rr3 = (r3 >> 2) + r3;  // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5

    // (h + c) * r, without carry propagation
    const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0; // <= 97ffffe007fffff8
    const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1; // <= 8fffffe20ffffff6
    const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2; // <= 87ffffe417fffff4
    const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3; // <= 7fffffe61ffffff2
    const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits    // <=                f

    // partial reduction modulo 2^130 - 5
    const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
    const u64 u0 = (u5 >>  2) * 5 + (x0 & 0xffffffff);
    const u64 u1 = (u0 >> 32)     + (x1 & 0xffffffff) + (x0 >> 32);
    const u64 u2 = (u1 >> 32)     + (x2 & 0xffffffff) + (x1 >> 32);
    const u64 u3 = (u2 >> 32)     + (x3 & 0xffffffff) + (x2 >> 32);
    const u64 u4 = (u3 >> 32)     + (u5 & 3);

    // Update the hash
    ctx->h[0] = (u32)u0; // u0 <= 1_9ffffff0
    ctx->h[1] = (u32)u1; // u1 <= 1_97ffffe0
    ctx->h[2] = (u32)u2; // u2 <= 1_8fffffe2
    ctx->h[3] = (u32)u3; // u3 <= 1_87ffffe4
    ctx->h[4] = (u32)u4; // u4 <=          4
}

// (re-)initialises the input counter and input buffer
static void poly_clear_c(crypto_poly1305_ctx *ctx)
{
    ZERO(ctx->c, 4);
    ctx->c_idx = 0;
}

// Appends one byte to the 16-byte input buffer ctx->c.
static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input)
{
    size_t word = ctx->c_idx >> 2;
    size_t byte = ctx->c_idx & 3;
    ctx->c[word] |= (u32)input << (byte * 8);
    ctx->c_idx++;
}

// Byte-by-byte update; processes a block whenever the buffer fills.
static void poly_update(crypto_poly1305_ctx *ctx,
                        const u8 *message, size_t message_size)
{
    FOR (i, 0, message_size) {
        poly_take_input(ctx, message[i]);
        if (ctx->c_idx == 16) {
            poly_block(ctx);
            poly_clear_c(ctx);
        }
    }
}

void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
{
    // Initial hash is zero
    ZERO(ctx->h, 5);
    // add 2^130 to every input block
    ctx->c[4] = 1;
    poly_clear_c(ctx);
    // load r and pad (r has some of its bits cleared)
    load32_le_buf(ctx->r  , key   , 4);
    load32_le_buf(ctx->pad, key+16, 4);
    FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
    FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
}

void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
                            const u8 *message, size_t message_size)
{
    if (message_size == 0) {
        return;
    }
    // Align ourselves with block boundaries
    size_t aligned = MIN(align(ctx->c_idx, 16), message_size);
    poly_update(ctx, message, aligned);
    message      += aligned;
    message_size -= aligned;

    // Process the message block by block
    size_t nb_blocks = message_size >> 4;
    FOR (i, 0, nb_blocks) {
        load32_le_buf(ctx->c, message, 4);
        poly_block(ctx);
        message += 16;
    }
    if (nb_blocks > 0) {
        poly_clear_c(ctx);
    }
    message_size &= 15;

    // remaining bytes
    poly_update(ctx, message, message_size);
}

void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
{
    // Process the last block (if any)
    if (ctx->c_idx != 0) {
        // move the final 1 according to remaining input length
        // (We may add less than 2^130 to the last input block)
        ctx->c[4] = 0;
        poly_take_input(ctx, 1);
        // one last hash update
        poly_block(ctx);
    }

    // check if we should subtract 2^130-5 by performing the
    // corresponding carry propagation.
    u64 c = 5;
    FOR (i, 0, 4) {
        c  += ctx->h[i];
        c >>= 32;
    }
    c += ctx->h[4];
    c  = (c >> 2) * 5; // shift the carry back to the beginning
    // c now indicates how many times we should subtract 2^130-5 (0 or 1)
    FOR (i, 0, 4) {
        c += (u64)ctx->h[i] + ctx->pad[i];
        store32_le(mac + i*4, (u32)c);
        c = c >> 32;
    }
    WIPE_CTX(ctx);
}

// One-shot Poly1305 MAC.
void crypto_poly1305(u8 mac[16], const u8 *message,
                     size_t message_size, const u8 key[32])
{
    crypto_poly1305_ctx ctx;
    crypto_poly1305_init  (&ctx, key);
    crypto_poly1305_update(&ctx, message, message_size);
    crypto_poly1305_final (&ctx, mac);
}

////////////////
/// Blake2 b ///
////////////////
// BLAKE2b initialisation vector (same as SHA-512's).
static const u64 iv[8] = {
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
};

// increment the input offset (128-bit counter, carried across two words)
static void blake2b_incr(crypto_blake2b_ctx *ctx)
{
    u64   *x = ctx->input_offset;
    size_t y = ctx->input_idx;
    x[0] += y;
    if (x[0] < y) {
        x[1]++;
    }
}

// Compresses the 128-byte input buffer into the hash state.
// is_last_block inverts the v14 finalisation flag for the final block.
static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
{
    static const u8 sigma[12][16] = {
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    };

    // init work vector
    u64 v0 = ctx->hash[0];  u64 v8  = iv[0];
    u64 v1 = ctx->hash[1];  u64 v9  = iv[1];
    u64 v2 = ctx->hash[2];  u64 v10 = iv[2];
    u64 v3 = ctx->hash[3];  u64 v11 = iv[3];
    u64 v4 = ctx->hash[4];  u64 v12 = iv[4] ^ ctx->input_offset[0];
    u64 v5 = ctx->hash[5];  u64 v13 = iv[5] ^ ctx->input_offset[1];
    u64 v6 = ctx->hash[6];  u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
    u64 v7 = ctx->hash[7];  u64 v15 = iv[7];

    // mangle work vector
    u64 *input = ctx->input;
#define BLAKE2_G(a, b, c, d, x, y)      \
    a += b + x;  d = rotr64(d ^ a, 32); \
    c += d;      b = rotr64(b ^ c, 24); \
    a += b + y;  d = rotr64(d ^ a, 16); \
    c += d;      b = rotr64(b ^ c, 63)
#define BLAKE2_ROUND(i)                                                    \
    BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]);  \
    BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]);  \
    BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]);  \
    BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]);  \
    BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]);  \
    BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]);  \
    BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]);  \
    BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]])

#ifdef BLAKE2_NO_UNROLLING
    FOR (i, 0, 12) {
        BLAKE2_ROUND(i);
    }
#else
    BLAKE2_ROUND(0);  BLAKE2_ROUND(1);  BLAKE2_ROUND(2);  BLAKE2_ROUND(3);
    BLAKE2_ROUND(4);  BLAKE2_ROUND(5);  BLAKE2_ROUND(6);  BLAKE2_ROUND(7);
    BLAKE2_ROUND(8);  BLAKE2_ROUND(9);  BLAKE2_ROUND(10); BLAKE2_ROUND(11);
#endif

    // update hash
    ctx->hash[0] ^= v0 ^ v8;   ctx->hash[1] ^= v1 ^ v9;
    ctx->hash[2] ^= v2 ^ v10;  ctx->hash[3] ^= v3 ^ v11;
    ctx->hash[4] ^= v4 ^ v12;  ctx->hash[5] ^= v5 ^ v13;
    ctx->hash[6] ^= v6 ^ v14;  ctx->hash[7] ^= v7 ^ v15;
}

// Writes one byte into the input buffer, zeroing the whole buffer
// first when starting a fresh block (index == 0).
static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input, size_t index)
{
    if (index == 0) {
        ZERO(ctx->input, 16);
    }
    size_t word = index >> 3;
    size_t byte = index & 7;
    ctx->input[word] |= (u64)input << (byte << 3);
}

static void blake2b_end_block(crypto_blake2b_ctx *ctx)
{
    if (ctx->input_idx == 128) {  // If buffer is full,
        blake2b_incr(ctx);        // update the input offset
        blake2b_compress(ctx, 0); // and compress the (not last) block
        ctx->input_idx = 0;
    }
}

// Byte-by-byte update (used for the unaligned head and tail of a message).
static void blake2b_update(crypto_blake2b_ctx *ctx,
                           const u8 *message, size_t message_size)
{
    FOR (i, 0, message_size) {
        blake2b_end_block(ctx);
        blake2b_set_input(ctx, message[i], ctx->input_idx);
        ctx->input_idx++;
    }
}

// Initialises a keyed, variable-length BLAKE2b hash.
// key may be NULL when key_size is 0.
void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
                                 const u8           *key, size_t key_size)
{
    // initial hash
    COPY(ctx->hash, iv, 8);
    ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;

    ctx->input_offset[0] = 0; // beginning of the input, no offset
    ctx->input_offset[1] = 0; // beginning of the input, no offset
    ctx->hash_size       = hash_size; // remember the hash size we want
    ctx->input_idx       = 0;

    // if there is a key, the first block is that key (padded with zeroes)
    if (key_size > 0) {
        u8 key_block[128] = {0};
        COPY(key_block, key, key_size);
        // same as calling crypto_blake2b_update(ctx, key_block , 128)
        load64_le_buf(ctx->input, key_block, 16);
        ctx->input_idx = 128;
    }
}

void crypto_blake2b_init(crypto_blake2b_ctx *ctx)
{
    crypto_blake2b_general_init(ctx, 64, 0, 0);
}

void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
                           const u8 *message, size_t message_size)
{
    if (message_size == 0) {
        return;
    }
    // Align ourselves with block boundaries
    size_t aligned = MIN(align(ctx->input_idx, 128), message_size);
    blake2b_update(ctx, message, aligned);
    message      += aligned;
    message_size -= aligned;

    // Process the message block by block
    FOR (i, 0, message_size >> 7) { // number of blocks
        blake2b_end_block(ctx);
        load64_le_buf(ctx->input, message, 16);
        message += 128;
        ctx->input_idx = 128;
    }
    message_size &= 127;

    // remaining bytes
    blake2b_update(ctx, message, message_size);
}

void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
{
    // Pad the end of the block with zeroes
    FOR (i, ctx->input_idx, 128) {
        blake2b_set_input(ctx, 0, i);
    }
    blake2b_incr(ctx);        // update the input offset
    blake2b_compress(ctx, 1); // compress the last block
    size_t nb_words = ctx->hash_size >> 3;
    store64_le_buf(hash, ctx->hash, nb_words);
    // bytes of a trailing partial word (when hash_size is not a multiple of 8)
    FOR (i, nb_words << 3, ctx->hash_size) {
        hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
    }
    WIPE_CTX(ctx);
}

// One-shot keyed, variable-length BLAKE2b.
void crypto_blake2b_general(u8       *hash   , size_t hash_size,
                            const u8 *key    , size_t key_size,
                            const u8 *message, size_t message_size)
{
    crypto_blake2b_ctx ctx;
    crypto_blake2b_general_init(&ctx, hash_size, key, key_size);
    crypto_blake2b_update(&ctx, message, message_size);
    crypto_blake2b_final(&ctx, hash);
}

// One-shot unkeyed 64-byte BLAKE2b.
void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size)
{
    crypto_blake2b_general(hash, 64, 0, 0, message, message_size);
}

// vtable adapters so EdDSA code can use BLAKE2b as its hash function.
static void blake2b_vtable_init(void *ctx) {
    crypto_blake2b_init(&((crypto_sign_ctx*)ctx)->hash);
}
static void blake2b_vtable_update(void *ctx, const u8 *m, size_t s) {
    crypto_blake2b_update(&((crypto_sign_ctx*)ctx)->hash, m, s);
}
static void blake2b_vtable_final(void *ctx, u8 *h) {
    crypto_blake2b_final(&((crypto_sign_ctx*)ctx)->hash, h);
}
const crypto_sign_vtable crypto_blake2b_vtable = {
    crypto_blake2b,
    blake2b_vtable_init,
    blake2b_vtable_update,
    blake2b_vtable_final,
    sizeof(crypto_sign_ctx),
};

////////////////
/// Argon2 i ///
////////////////
// references to R, Z, Q etc. come from the spec

// Argon2 operates on 1024 byte blocks.
typedef struct { u64 a[128]; } block;

// Wipes one 1024-byte block (volatile pointer prevents the compiler
// from optimising the stores away).
static void wipe_block(block *b)
{
    volatile u64* a = b->a;
    ZERO(a, 128);
}

// updates a Blake2 hash with a 32 bit word, little endian.
static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
{
    u8 buf[4];
    store32_le(buf, input);
    crypto_blake2b_update(ctx, buf, 4);
    WIPE_BUFFER(buf);
}

static void load_block(block *b, const u8 bytes[1024])
{
    load64_le_buf(b->a, bytes, 128);
}

static void store_block(u8 bytes[1024], const block *b)
{
    store64_le_buf(bytes, b->a, 128);
}

static void copy_block(block *o,const block*in){FOR(i,0,128)o->a[i] = in->a[i];}
static void  xor_block(block *o,const block*in){FOR(i,0,128)o->a[i]^= in->a[i];}

// Hash with a virtually unlimited digest size.
// Doesn't extract more entropy than the base hash function.
// Mainly used for filling a whole kilobyte block with pseudo-random bytes.
// (One could use a stream cipher with a seed hash as the key, but
//  this would introduce another dependency —and point of failure.)
static void extended_hash(u8       *digest, u32 digest_size,
                          const u8 *input , u32 input_size)
{
    crypto_blake2b_ctx ctx;
    crypto_blake2b_general_init(&ctx, MIN(digest_size, 64), 0, 0);
    blake_update_32            (&ctx, digest_size);
    crypto_blake2b_update      (&ctx, input, input_size);
    crypto_blake2b_final       (&ctx, digest);

    if (digest_size > 64) {
        // the conversion to u64 avoids integer overflow on
        // ludicrously big hash sizes.
        u32 r   = (u32)(((u64)digest_size + 31) >> 5) - 2;
        u32 i   =  1;
        u32 in  =  0;
        u32 out = 32;
        while (i < r) {
            // Input and output overlap. This is intentional
            crypto_blake2b(digest + out, digest + in, 64);
            i   +=  1;
            in  += 32;
            out += 32;
        }
        crypto_blake2b_general(digest + out, digest_size - (32 * r),
                               0, 0, // no key
                               digest + in , 64);
    }
}

#define LSB(x) ((x) & 0xffffffff)
#define G(a, b, c, d)                                            \
    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 32);   \
    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 24);   \
    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 16);   \
    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 63)
#define ROUND(v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,    \
              v8,  v9, v10, v11, v12, v13, v14, v15)    \
    G(v0, v4,  v8, v12);  G(v1, v5,  v9, v13);          \
    G(v2, v6, v10, v14);  G(v3, v7, v11, v15);          \
    G(v0, v5, v10, v15);  G(v1, v6, v11, v12);          \
    G(v2, v7,  v8, v13);  G(v3, v4,  v9, v14)

// Core of the compression function G.  Computes Z from R in place.
static void g_rounds(block *work_block)
{
    // column rounds (work_block = Q)
    for (int i = 0; i < 128; i += 16) {
        ROUND(work_block->a[i     ], work_block->a[i +  1],
              work_block->a[i +  2], work_block->a[i +  3],
              work_block->a[i +  4], work_block->a[i +  5],
              work_block->a[i +  6], work_block->a[i +  7],
              work_block->a[i +  8], work_block->a[i +  9],
              work_block->a[i + 10], work_block->a[i + 11],
              work_block->a[i + 12], work_block->a[i + 13],
              work_block->a[i + 14], work_block->a[i + 15]);
    }
    // row rounds (work_block = Z)
    for (int i = 0; i < 16; i += 2) {
        ROUND(work_block->a[i      ], work_block->a[i +   1],
              work_block->a[i +  16], work_block->a[i +  17],
              work_block->a[i +  32], work_block->a[i +  33],
              work_block->a[i +  48], work_block->a[i +  49],
              work_block->a[i +  64], work_block->a[i +  65],
              work_block->a[i +  80], work_block->a[i +  81],
              work_block->a[i +  96], work_block->a[i +  97],
              work_block->a[i + 112], work_block->a[i + 113]);
    }
}

// The compression function G (copy version for the first pass)
static void g_copy(block *result, const block *x, const block *y, block* tmp)
{
    copy_block(tmp   , x  ); // tmp    = X
    xor_block (tmp   , y  ); // tmp    = X ^ Y = R
    copy_block(result, tmp); // result = R         (only difference with g_xor)
    g_rounds  (tmp);         // tmp    = Z
    xor_block (result, tmp); // result = R ^ Z
}

// The compression function G (xor version for subsequent passes)
static void g_xor(block *result, const block *x, const block *y, block *tmp)
{
    copy_block(tmp   , x  ); // tmp    = X
    xor_block (tmp   , y  ); // tmp    = X ^ Y = R
    xor_block (result, tmp); // result = R ^ old   (only difference with g_copy)
    g_rounds  (tmp);         // tmp    = Z
    xor_block (result, tmp); // result = R ^ old ^ Z
}

// Unary version of the compression function.
// The missing argument is implied zero.
// Does the transformation in place.
static void unary_g(block *work_block, block *tmp)
{
    // work_block == R
    copy_block(tmp, work_block); // tmp        = R
    g_rounds  (work_block);      // work_block = Z
    xor_block (work_block, tmp); // work_block = Z ^ R
}

// Argon2i uses a kind of stream cipher to determine which reference
// block it will take to synthesise the next block.  This context hold
// that stream's state.  (It's very similar to Chacha20.  The block b
// is analogous to Chacha's own pool)
typedef struct {
    block b;
    u32 pass_number;
    u32 slice_number;
    u32 nb_blocks;
    u32 nb_iterations;
    u32 ctr;
    u32 offset;
} gidx_ctx;

// The block in the context will determine array indices.  To avoid
// timing attacks, it only depends on public information.  No looking
// at a previous block to seed the next.  This makes offline attacks
// easier, but timing attacks are the bigger threat in many settings.
static void gidx_refresh(gidx_ctx *ctx)
{
    // seed the beginning of the block...
    ctx->b.a[0] = ctx->pass_number;
    ctx->b.a[1] = 0;  // lane number (we have only one)
    ctx->b.a[2] = ctx->slice_number;
    ctx->b.a[3] = ctx->nb_blocks;
    ctx->b.a[4] = ctx->nb_iterations;
    ctx->b.a[5] = 1;  // type: Argon2i
    ctx->b.a[6] = ctx->ctr;
    ZERO(ctx->b.a + 7, 121); // ...then zero the rest out

    // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
    // (G "square" function), to get cheap pseudo-random numbers.
    block tmp;
    unary_g(&ctx->b, &tmp);
    unary_g(&ctx->b, &tmp);
    wipe_block(&tmp);
}

static void gidx_init(gidx_ctx *ctx,
                      u32 pass_number, u32 slice_number,
                      u32 nb_blocks,   u32 nb_iterations)
{
    ctx->pass_number   = pass_number;
    ctx->slice_number  = slice_number;
    ctx->nb_blocks     = nb_blocks;
    ctx->nb_iterations = nb_iterations;
    ctx->ctr           = 0;

    // Offset from the beginning of the segment.  For the first slice
    // of the first pass, we start at the *third* block, so the offset
    // starts at 2, not 0.
    if (pass_number != 0 || slice_number != 0) {
        ctx->offset = 0;
    } else {
        ctx->offset = 2;
        ctx->ctr++;         // Compensates for missed lazy creation
        gidx_refresh(ctx);  // at the start of gidx_next()
    }
}

// Returns the index of the reference block for the current block.
static u32 gidx_next(gidx_ctx *ctx)
{
    // lazily creates the offset block we need
    if ((ctx->offset & 127) == 0) {
        ctx->ctr++;
        gidx_refresh(ctx);
    }
    u32 index  = ctx->offset & 127; // save index  for current call
    u32 offset = ctx->offset;       // save offset for current call
    ctx->offset++;                  // update offset for next call

    // Computes the area size.
    // Pass 0 : all already finished segments plus already constructed
    //          blocks in this segment
    // Pass 1+: 3 last segments plus already constructed
    //          blocks in this segment.  THE SPEC SUGGESTS OTHERWISE.
    //          I CONFORM TO THE REFERENCE IMPLEMENTATION.
    int first_pass  = ctx->pass_number == 0;
    u32 slice_size  = ctx->nb_blocks >> 2;
    u32 nb_segments = first_pass ? ctx->slice_number : 3;
    u32 area_size   = nb_segments * slice_size + offset - 1;

    // Computes the starting position of the reference area.
    // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
    // NEXT SEGMENT, NOT THE NEXT BLOCK.
    u32 next_slice = ((ctx->slice_number + 1) & 3) * slice_size;
    u32 start_pos  = first_pass ? 0 : next_slice;

    // Generate offset from J1 (no need for J2, there's only one lane)
    u64 j1  = ctx->b.a[index] & 0xffffffff; // pseudo-random number
    u64 x   = (j1 * j1)       >> 32;
    u64 y   = (area_size * x) >> 32;
    u64 z   = (area_size - 1) - y;
    u64 ref = start_pos + z; // ref < 2 * nb_blocks
    return (u32)(ref < ctx->nb_blocks ? ref : ref - ctx->nb_blocks);
}

// Main algorithm
void crypto_argon2i_general(u8       *hash,      u32 hash_size,
                            void     *work_area, u32 nb_blocks,
                            u32 nb_iterations,
                            const u8 *password,  u32 password_size,
                            const u8 *salt,      u32 salt_size,
                            const u8 *key,       u32 key_size,
                            const u8 *ad,        u32 ad_size)
{
    // work area seen as blocks (must be suitably aligned)
    block *blocks = (block*)work_area;
    {
        crypto_blake2b_ctx ctx;
        crypto_blake2b_init(&ctx);

        blake_update_32      (&ctx, 1            ); // p: number of threads
        blake_update_32      (&ctx, hash_size    );
        blake_update_32      (&ctx, nb_blocks    );
        blake_update_32      (&ctx, nb_iterations);
        blake_update_32      (&ctx, 0x13         ); // v: version number
        blake_update_32      (&ctx, 1            ); // y: Argon2i
        blake_update_32      (&ctx, password_size);
        crypto_blake2b_update(&ctx, password, password_size);
        blake_update_32      (&ctx, salt_size);
        crypto_blake2b_update(&ctx, salt, salt_size);
        blake_update_32      (&ctx, key_size);
        crypto_blake2b_update(&ctx, key, key_size);
        blake_update_32      (&ctx, ad_size);
        crypto_blake2b_update(&ctx, ad, ad_size);

        u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
        crypto_blake2b_final(&ctx, initial_hash);

        // fill first 2 blocks
        block tmp_block;
        u8    hash_area[1024];
        store32_le(initial_hash + 64, 0); // first  additional word
        store32_le(initial_hash + 68, 0); // second additional word
        extended_hash(hash_area, 1024, initial_hash, 72);
        load_block(&tmp_block, hash_area);
        copy_block(blocks, &tmp_block);

        store32_le(initial_hash + 64, 1); // slight modification
        extended_hash(hash_area, 1024, initial_hash, 72);
        load_block(&tmp_block, hash_area);
        copy_block(blocks + 1, &tmp_block);

        WIPE_BUFFER(initial_hash);
        WIPE_BUFFER(hash_area);
        wipe_block(&tmp_block);
    }

    // Actual number of blocks
    nb_blocks -= nb_blocks & 3; // round down to 4 p (p == 1 thread)
    const u32 segment_size = nb_blocks >> 2;

    // fill (then re-fill) the rest of the blocks
    block tmp;
    gidx_ctx ctx; // public information, no need to wipe
    FOR_T (u32, pass_number, 0, nb_iterations) {
        int first_pass = pass_number == 0;

        FOR_T (u32, segment, 0, 4) {
            gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);

            // On the first segment of the first pass,
            // blocks 0 and 1 are already filled.
            // We use the offset to skip them.
            u32 start_offset  = first_pass && segment == 0 ? 2 : 0;
            u32 segment_start = segment * segment_size + start_offset;
            u32 segment_end   = (segment + 1) * segment_size;
            FOR_T (u32, current_block, segment_start, segment_end) {
                u32 reference_block = gidx_next(&ctx);
                u32 previous_block  = current_block == 0
                                    ? nb_blocks - 1
                                    : current_block - 1;
                block *c = blocks + current_block;
                block *p = blocks + previous_block;
                block *r = blocks + reference_block;
                if (first_pass) { g_copy(c, p, r, &tmp); }
                else            { g_xor (c, p, r, &tmp); }
            }
        }
    }
    wipe_block(&tmp);
    u8 final_block[1024];
    store_block(final_block, blocks + (nb_blocks - 1));

    // wipe work area
    volatile u64 *p = (u64*)work_area;
    ZERO(p, 128 * nb_blocks);

    // hash the very last block with H' into the output hash
    extended_hash(hash, hash_size, final_block, 1024);
    WIPE_BUFFER(final_block);
}

// Convenience wrapper: Argon2i without the optional key and
// additional data inputs.
void crypto_argon2i(u8   *hash,      u32 hash_size,
                    void *work_area, u32 nb_blocks, u32 nb_iterations,
                    const u8 *password,  u32 password_size,
                    const u8 *salt,      u32 salt_size)
{
    crypto_argon2i_general(hash, hash_size, work_area, nb_blocks, nb_iterations,
                           password, password_size, salt , salt_size, 0,0,0,0);
}

////////////////////////////////////
/// Arithmetic modulo 2^255 - 19 ///
////////////////////////////////////
//  Originally taken from SUPERCOP's ref10 implementation.
//  A bit bigger than TweetNaCl, over 4 times faster.

// field element: ten signed limbs, alternating 26 and 25 bits
// (limb i holds roughly 2^(25.5 * i) worth of the value)
typedef i32 fe[10];

// field constants
//
// fe_one      : 1
// sqrtm1      : sqrt(-1)
// d           : -121665 / 121666
// D2          : 2 * -121665 / 121666
// lop_x, lop_y: low order point in Edwards coordinates
// ufactor     : -sqrt(-1) * 2
// A2          : 486662^2  (A squared)
static const fe fe_one  = {1};
static const fe sqrtm1  = {-32595792, -7943725, 9377950, 3500415, 12389472,
                           -272473, -25146209, -2005654, 326686, 11406482,};
static const fe d       = {-10913610, 13857413, -15372611, 6949391, 114729,
                           -8787816, -6275908, -3247719, -18696448, -12055116,};
static const fe D2      = {-21827239, -5839606, -30745221, 13898782, 229458,
                           15978800, -12551817, -6495438, 29715968, 9444199,};
static const fe lop_x   = {21352778, 5345713, 4660180, -8347857, 24143090,
                           14568123, 30185756, -12247770, -33528939, 8345319,};
static const fe lop_y   = {-6952922, -1265500, 6862341, -7057498, -4037696,
                           -5447722, 31680899, -15325402, -19365852, 1569102,};
static const fe ufactor = {-1917299, 15887451, -18755900, -7000830, -24778944,
                           544946, -16816446, 4011309, -653372, 10741468,};
static const fe A2      = {12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0,};

// h = 0 and h = 1 respectively
static void fe_0(fe h) {           ZERO(h  , 10); }
static void fe_1(fe h) { h[0] = 1; ZERO(h+1,  9); }

// limb-wise copy, negation, addition and subtraction (no carry)
static void fe_copy(fe h,const fe f           ){FOR(i,0,10) h[i] =  f[i];      }
static void fe_neg (fe h,const fe f           ){FOR(i,0,10) h[i] = -f[i];      }
static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}

// Conditional swap: exchanges f and g iff b == 1.
// Constant time: uses an arithmetic mask, no data-dependent branch.
static void fe_cswap(fe f, fe g, int b)
{
    i32 mask = -b; // -1 = 0xffffffff
    FOR (i, 0, 10) {
        i32 x = (f[i] ^ g[i]) & mask;
        f[i] = f[i] ^ x;
        g[i] = g[i] ^ x;
    }
}

// Conditional copy: f = g iff b == 1.  Constant time, same mask trick.
static void fe_ccopy(fe f, const fe g, int b)
{
    i32 mask = -b; // -1 = 0xffffffff
    FOR (i, 0, 10) {
        i32 x = (f[i] ^ g[i]) & mask;
        f[i] = f[i] ^ x;
    }
}


// Signed carry propagation
// ------------------------
//
// Let t be a number.  It can be uniquely decomposed thus:
//
//    t = h*2^26 + l
//    such that -2^25 <= l < 2^25
//
// Let c = (t + 2^25) / 2^26 (rounded down)
//     c = (h*2^26 + l + 2^25) / 2^26 (rounded down)
//     c = h + (l + 2^25) / 2^26 (rounded down)
//     c = h (exactly)
// Because 0 <= l + 2^25 < 2^26
//
// Let u = t - c*2^26
//     u = h*2^26 + l - h*2^26
//     u = l
// Therefore, -2^25 <= u < 2^25
//
// Additionally, if |t| < x, then |h| < x/2^26 (rounded down)
//
// Notations:
// - In C, 1<<25 means 2^25.
// - In C, x>>25 means floor(x / (2^25)).
// - All of the above applies with 25 & 24 as well as 26 & 25.
//
//
// Note on negative right shifts
// -----------------------------
//
// In C, x >> n, where x is a negative integer, is implementation
// defined.  In practice, all platforms do arithmetic shift, which is
// equivalent to division by 2^26, rounded down.  Some compilers, like
// GCC, even guarantee it.
//
// If we ever stumble upon a platform that does not propagate the sign
// bit (we won't), visible failures will show at the slightest test, and
// the signed shifts can be replaced by the following:
//
//     typedef struct { i64 x:39; } s25;
//     typedef struct { i64 x:38; } s26;
//     i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
//     i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
//
// Current compilers cannot optimise this, causing a 30% drop in
// performance.  Fairly expensive for something that never happens.
//
//
// Precondition
// ------------
//
// |t0|       < 2^63
// |t1|..|t9| < 2^62
//
// Algorithm
// ---------
// c  = t0 + 2^25 / 2^26   -- |c|  <= 2^36
// t0 -= c * 2^26          -- |t0| <= 2^25
// t1 += c                 -- |t1| <= 2^63
//
// c  = t4 + 2^25 / 2^26   -- |c|  <= 2^36
// t4 -= c * 2^26          -- |t4| <= 2^25
// t5 += c                 -- |t5| <= 2^63
//
// c  = t1 + 2^24 / 2^25   -- |c|  <= 2^38
// t1 -= c * 2^25          -- |t1| <= 2^24
// t2 += c                 -- |t2| <= 2^63
//
// c  = t5 + 2^24 / 2^25   -- |c|  <= 2^38
// t5 -= c * 2^25          -- |t5| <= 2^24
// t6 += c                 -- |t6| <= 2^63
//
// c  = t2 + 2^25 / 2^26   -- |c|  <= 2^37
// t2 -= c * 2^26          -- |t2| <= 2^25  < 1.1 * 2^25 (final t2)
// t3 += c                 -- |t3| <= 2^63
//
// c  = t6 + 2^25 / 2^26   -- |c|  <= 2^37
// t6 -= c * 2^26          -- |t6| <= 2^25  < 1.1 * 2^25 (final t6)
// t7 += c                 -- |t7| <= 2^63
//
// c  = t3 + 2^24 / 2^25   -- |c|  <= 2^38
// t3 -= c * 2^25          -- |t3| <= 2^24  < 1.1 * 2^24 (final t3)
// t4 += c                 -- |t4| <= 2^25 + 2^38 < 2^39
//
// c  = t7 + 2^24 / 2^25   -- |c|  <= 2^38
// t7 -= c * 2^25          -- |t7| <= 2^24  < 1.1 * 2^24 (final t7)
// t8 += c                 -- |t8| <= 2^63
//
// c  = t4 + 2^25 / 2^26   -- |c|  <= 2^13
// t4 -= c * 2^26          -- |t4| <= 2^25  < 1.1 * 2^25 (final t4)
// t5 += c                 -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24 (final t5)
//
// c  = t8 + 2^25 / 2^26   -- |c|  <= 2^37
// t8 -= c * 2^26          -- |t8| <= 2^25  < 1.1 * 2^25 (final t8)
// t9 += c                 -- |t9| <= 2^63
//
// c  = t9 + 2^24 / 2^25   -- |c|  <= 2^38
// t9 -= c * 2^25          -- |t9| <= 2^24  < 1.1 * 2^24 (final t9)
// t0 += c * 19            -- |t0| <= 2^25 + 2^38*19 < 2^44
//
// c  = t0 + 2^25 / 2^26   -- |c|  <= 2^18
// t0 -= c * 2^26          -- |t0| <= 2^25  < 1.1 * 2^25 (final t0)
// t1 += c                 -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24 (final t1)
//
// Postcondition
// -------------
//   |t0|, |t2|, |t4|, |t6|, |t8|  <  1.1 * 2^25
//   |t1|, |t3|, |t5|, |t7|, |t9|  <  1.1 * 2^24
//
// The macro expects i64 locals t0..t9 and an fe h in scope.
#define FE_CARRY                                                        \
    i64 c;                                                              \
    c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
    c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
    c = (t1 + ((i64)1<<24)) >> 25;  t1 -= c * ((i64)1 << 25);  t2 += c; \
    c = (t5 + ((i64)1<<24)) >> 25;  t5 -= c * ((i64)1 << 25);  t6 += c; \
    c = (t2 + ((i64)1<<25)) >> 26;  t2 -= c * ((i64)1 << 26);  t3 += c; \
    c = (t6 + ((i64)1<<25)) >> 26;  t6 -= c * ((i64)1 << 26);  t7 += c; \
    c = (t3 + ((i64)1<<24)) >> 25;  t3 -= c * ((i64)1 << 25);  t4 += c; \
    c = (t7 + ((i64)1<<24)) >> 25;  t7 -= c * ((i64)1 << 25);  t8 += c; \
    c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
    c = (t8 + ((i64)1<<25)) >> 26;  t8 -= c * ((i64)1 << 26);  t9 += c; \
    c = (t9 + ((i64)1<<24)) >> 25;  t9 -= c * ((i64)1 << 25);  t0 += c * 19; \
    c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
    h[0]=(i32)t0;  h[1]=(i32)t1;  h[2]=(i32)t2;  h[3]=(i32)t3;  h[4]=(i32)t4; \
    h[5]=(i32)t5;  h[6]=(i32)t6;  h[7]=(i32)t7;  h[8]=(i32)t8;  h[9]=(i32)t9

// Decodes 32 little-endian bytes into a field element
// (the top bit of the last byte is masked off).
static void fe_frombytes(fe h, const u8 s[32])
{
    i64 t0 =  load32_le(s);                        // t0 < 2^32
    i64 t1 =  load24_le(s +  4) << 6;              // t1 < 2^30
    i64 t2 =  load24_le(s +  7) << 5;              // t2 < 2^29
    i64 t3 =  load24_le(s + 10) << 3;              // t3 < 2^27
    i64 t4 =  load24_le(s + 13) << 2;              // t4 < 2^26
    i64 t5 =  load32_le(s + 16);                   // t5 < 2^32
    i64 t6 =  load24_le(s + 20) << 7;              // t6 < 2^31
    i64 t7 =  load24_le(s + 23) << 5;              // t7 < 2^29
    i64 t8 =  load24_le(s + 26) << 4;              // t8 < 2^28
    i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;  // t9 < 2^25
    FE_CARRY;                                      // Carry precondition OK
}

// Fully reduces h and encodes it as 32 little-endian bytes.
//
// Precondition
//   |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]| < 1.1 * 2^25
//   |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]| < 1.1 * 2^24
//
// Therefore, |h| < 2^255-19
// There are two possibilities:
//
// - If h is positive, all we need to do is reduce its individual
//   limbs down to their tight positive range.
// - If h is negative, we also need to add 2^255-19 to it.
//   Or just remove 19 and chop off any excess bit.
static void fe_tobytes(u8 s[32], const fe h)
{
    i32 t[10];
    COPY(t, h, 10);
    i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
    //                 |t9|                    < 1.1 * 2^24
    //  -1.1 * 2^24  <  t9                     < 1.1 * 2^24
    //  -21  * 2^24  <  19 * t9                < 21  * 2^24
    //  -2^29        <  19 * t9 + 2^24         < 2^29
    //  -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
    //  -16          < (19 * t9 + 2^24) / 2^25 < 16
    FOR (i, 0, 5) {
        q += t[2*i  ]; q >>= 26; // q = 0 or -1
        q += t[2*i+1]; q >>= 25; // q = 0 or -1
    }
    // q =  0 iff h >= 0
    // q = -1 iff h <  0
    // Adding q * 19 to h reduces h to its proper range.
    q *= 19;  // Shift carry back to the beginning
    FOR (i, 0, 5) {
        t[i*2  ] += q;  q = t[i*2  ] >> 26;  t[i*2  ] -= q * ((i32)1 << 26);
        t[i*2+1] += q;  q = t[i*2+1] >> 25;  t[i*2+1] -= q * ((i32)1 << 25);
    }
    // h is now fully reduced, and q represents the excess bit.

    store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
    store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
    store32_le(s +  8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
    store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] <<  6));
    store32_le(s + 16, ((u32)t[5] >>  0) | ((u32)t[6] << 25));
    store32_le(s + 20, ((u32)t[6] >>  7) | ((u32)t[7] << 19));
    store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
    store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] <<  6));

    WIPE_BUFFER(t);
}

// h = f * g, where g is a small signed constant.
//
// Precondition
// -------------
//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
//
//   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
//   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
static void fe_mul_small(fe h, const fe f, i32 g)
{
    i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
    i64 t2 = f[2] * (i64) g;  i64 t3 = f[3] * (i64) g;
    i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
    i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
    i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
    // |t0|, |t2|, |t4|, |t6|, |t8|  <  1.65 * 2^26 * 2^31  < 2^58
    // |t1|, |t3|, |t5|, |t7|, |t9|  <  1.65 * 2^25 * 2^31  < 2^57

    FE_CARRY; // Carry precondition OK
}

// h = f * g
//
// Precondition
// -------------
//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
//
//   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
//   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
static void fe_mul(fe h, const fe f, const fe g)
{
    // Everything is unrolled and put in temporary variables.
    // We could roll the loop, but that would make curve25519 twice as slow.
    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
    i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
    i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
    // Pre-multiplied limbs: F* fold in the 25/26-bit limb alternation,
    // G* fold in the reduction modulo 2^255-19 (hence the factor 19).
    i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
    i32 G1 = g1*19;  i32 G2 = g2*19;  i32 G3 = g3*19;
    i32 G4 = g4*19;  i32 G5 = g5*19;  i32 G6 = g6*19;
    i32 G7 = g7*19;  i32 G8 = g8*19;  i32 G9 = g9*19;
    // |F1|, |F3|, |F5|, |F7|, |F9|  <  1.65 * 2^26
    // |G0|, |G2|, |G4|, |G6|, |G8|  <  2^31
    // |G1|, |G3|, |G5|, |G7|, |G9|  <  2^30

    i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
           + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
    i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
           + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
    i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
           + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
    i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
           + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
    i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
           + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
    i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
           + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
    i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
           + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
    i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
           + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
    i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
           + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
    i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
           + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
    // t0 < 0.67 * 2^61
    // t1 < 0.41 * 2^61
    // t2 < 0.52 * 2^61
    // t3 < 0.32 * 2^61
    // t4 < 0.38 * 2^61
    // t5 < 0.22 * 2^61
    // t6 < 0.23 * 2^61
    // t7 < 0.13 * 2^61
    // t8 < 0.09 * 2^61
    // t9 < 0.03 * 2^61

    FE_CARRY; // Everything below 2^62, Carry precondition OK
}

// h = f^2
//
// Precondition
// -------------
//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
//
// Note: we could use fe_mul() for this, but this is significantly faster
static void fe_sq(fe h, const fe f)
{
    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
    i32 f0_2  = f0*2;   i32 f1_2  = f1*2;   i32 f2_2  = f2*2;  i32 f3_2 = f3*2;
    i32 f4_2  = f4*2;   i32 f5_2  = f5*2;   i32 f6_2  = f6*2;  i32 f7_2 = f7*2;
    i32 f5_38 = f5*38;  i32 f6_19 = f6*19;  i32 f7_38 = f7*38;
    i32 f8_19 = f8*19;  i32 f9_38 = f9*38;
    // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2|  <  1.65 * 2^27
    // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2|  <  1.65 * 2^26
    // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| <  2^31

    i64 t0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
           + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
    i64 t1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19
           + f4  *(i64)f7_38 + f5_2*(i64)f6_19;
    i64 t2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38
           + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
    i64 t3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38
           + f5_2*(i64)f8_19 + f6  *(i64)f7_38;
    i64 t4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2
           + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
    i64 t5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3
           + f6  *(i64)f9_38 + f7_2*(i64)f8_19;
    i64 t6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4
           + f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
    i64 t7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5
           + f3_2*(i64)f4    + f8  *(i64)f9_38;
    i64 t8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6
           + f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
    i64 t9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7
           + f3_2*(i64)f6    + f4  *(i64)f5_2;
    // t0 < 0.67 * 2^61
    // t1 < 0.41 * 2^61
    // t2 < 0.52 * 2^61
    // t3 < 0.32 * 2^61
    // t4 < 0.38 * 2^61
    // t5 < 0.22 * 2^61
    // t6 < 0.23 * 2^61
    // t7 < 0.13 * 2^61
    // t8 < 0.09 * 2^61
    // t9 < 0.03 * 2^61

    FE_CARRY;
}

// h = 2 * (f^2)
//
// Precondition
// -------------
//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
//
// Note: we could implement fe_sq2() by copying fe_sq(), multiplying
// each limb by 2, *then* perform the carry.  This saves one carry.
// However, doing so with the stated preconditions does not work (t2
// would overflow).  There are 3 ways to solve this:
//
// 1. Show that t2 actually never overflows (it really does not).
// 2. Accept an additional carry, at a small loss of performance.
// 3. Make sure the input of fe_sq2() is freshly carried.
//
// SUPERCOP ref10 relies on (1).
// Monocypher chose (2) and (3), mostly to save code.
static void fe_sq2(fe h, const fe f)
{
    fe_sq(h, f);
    fe_mul_small(h, h, 2);
}

// out = z^((p-5)/8), with p = 2^255 - 19 (exponentiation ladder).
// This could be simplified, but it would be slower
static void fe_pow22523(fe out, const fe z)
{
    fe t0, t1, t2;
    fe_sq(t0, z);
    fe_sq(t1,t0);                   fe_sq(t1, t1);  fe_mul(t1,  z, t1);
    fe_mul(t0, t0, t1);
    fe_sq(t0, t0);                                  fe_mul(t0, t1, t0);
    fe_sq(t1, t0);  FOR (i, 1,   5) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
    fe_sq(t1, t0);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
    fe_sq(t2, t1);  FOR (i, 1,  20) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
    fe_sq(t1, t1);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
    fe_sq(t1, t0);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
    fe_sq(t2, t1);  FOR (i, 1, 100) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
    fe_sq(t1, t1);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
    fe_sq(t0, t0);  FOR (i, 1,   2) fe_sq(t0, t0);  fe_mul(out, t0, z);
    WIPE_BUFFER(t0);
    WIPE_BUFFER(t1);
    WIPE_BUFFER(t2);
}

// Inverting means multiplying by 2^255 - 21
// 2^255 - 21 = (2^252 - 3) * 8 + 3
// So we reuse the multiplication chain of fe_pow22523
static void fe_invert(fe out, const fe z)
{
    fe tmp;
    fe_pow22523(tmp, z);
    // tmp2^8 * z^3
    fe_sq(tmp, tmp);                        // 0
    fe_sq(tmp, tmp);  fe_mul(tmp, tmp, z);  // 1
    fe_sq(tmp, tmp);  fe_mul(out, tmp, z);  // 1
    WIPE_BUFFER(tmp);
}

// Parity check.  Returns 0 if even, 1 if odd
// (parity of the fully reduced representative)
static int fe_isodd(const fe f)
{
    u8 s[32];
    fe_tobytes(s, f);
    u8 isodd = s[0] & 1;
    WIPE_BUFFER(s);
    return isodd;
}

// Returns 1 if equal, 0 if not equal
// (compares the fully reduced serialised forms in constant time;
// crypto_verify32 yields 0 on equality, hence the "1 +")
static int fe_isequal(const fe f, const fe g)
{
    u8 fs[32];
    u8 gs[32];
    fe_tobytes(fs, f);
    fe_tobytes(gs, g);
    int isdifferent = crypto_verify32(fs, gs);
    WIPE_BUFFER(fs);
    WIPE_BUFFER(gs);
    return 1 + isdifferent;
}

// Inverse square root.
// Returns true if x is a non zero square, false otherwise.
// After the call:
//   isr = sqrt(1/x)        if x is non-zero square.
//   isr = sqrt(sqrt(-1)/x) if x is not a square.
//   isr = 0                if x is zero.
// We do not guarantee the sign of the square root.
//
// Notes:
// Let quartic = x^((p-1)/4)
//
// x^((p-1)/2) = chi(x)
// quartic^2   = chi(x)
// quartic     = sqrt(chi(x))
// quartic     = 1 or -1 or sqrt(-1) or -sqrt(-1)
//
// Note that x is a square if quartic is 1 or -1
// There are 4 cases to consider:
//
// if   quartic         =  1  (x is a square)
// then x^((p-1)/4)     =  1
//      x^((p-5)/4) * x =  1
//      x^((p-5)/4)     =  1/x
//      x^((p-5)/8)     =  sqrt(1/x) or -sqrt(1/x)
//
// if   quartic                = -1  (x is a square)
// then x^((p-1)/4)            = -1
//      x^((p-5)/4) * x        = -1
//      x^((p-5)/4)            = -1/x
//      x^((p-5)/8)            =  sqrt(-1)   / sqrt(x)
//      x^((p-5)/8) * sqrt(-1) =  sqrt(-1)^2 / sqrt(x)
//      x^((p-5)/8) * sqrt(-1) = -1/sqrt(x)
//      x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x)
//
// if   quartic         = sqrt(-1)  (x is not a square)
// then x^((p-1)/4)     = sqrt(-1)
//      x^((p-5)/4) * x = sqrt(-1)
//      x^((p-5)/4)     = sqrt(-1)/x
//      x^((p-5)/8)     = sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x)
//
// Note that the product of two non-squares is always a square:
//   For any non-squares a and b, chi(a) = -1 and chi(b) = -1.
//   Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1.
//   Therefore a*b is a square.
//
//   Since sqrt(-1) and x are both non-squares, their product is a
//   square, and we can compute their square root.
//
// if   quartic                = -sqrt(-1)  (x is not a square)
// then x^((p-1)/4)            = -sqrt(-1)
//      x^((p-5)/4) * x        = -sqrt(-1)
//      x^((p-5)/4)            = -sqrt(-1)/x
//      x^((p-5)/8)            = sqrt(-sqrt(-1)/x)
//      x^((p-5)/8)            = sqrt( sqrt(-1)/x) * sqrt(-1)
//      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * sqrt(-1)^2
//      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * -1
//      x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x)
static int invsqrt(fe isr, const fe x)
{
    fe check, quartic;
    fe_copy(check, x);
    fe_pow22523(isr, check);
    fe_sq (quartic, isr);
    fe_mul(quartic, quartic, check);
    fe_1  (check);          int p1 = fe_isequal(quartic, check);
    fe_neg(check, check );  int m1 = fe_isequal(quartic, check);
    fe_neg(check, sqrtm1);  int ms = fe_isequal(quartic, check);
    // constant-time selection of the correct square root
    fe_mul(check, isr, sqrtm1);
    fe_ccopy(isr, check, m1 | ms);
    WIPE_BUFFER(quartic);
    WIPE_BUFFER(check);
    return p1 | m1;
}

// trim a scalar for scalar multiplication:
// clear the low 3 bits and the top bit, set bit 254 (X25519 clamping)
static void trim_scalar(u8 scalar[32])
{
    scalar[ 0] &= 248;
    scalar[31] &= 127;
    scalar[31] |= 64;
}

// get bit from scalar at position i
static int scalar_bit(const u8 s[32], int i)
{
    if (i < 0) { return 0; } // handle -1 for sliding windows
    return (s[i>>3] >> (i&7)) & 1;
}

///////////////
/// X-25519 /// Taken from SUPERCOP's ref10 implementation.
///////////////

// Montgomery ladder over Curve25519: q = scalar * p (x-coordinates only).
// Constant time for a fixed nb_bits: one cswap + one ladder step per bit,
// with no secret-dependent branch or memory access.
static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32],
                       int nb_bits)
{
    // computes the scalar product
    fe x1;
    fe_frombytes(x1, p);

    // computes the actual scalar product (the result is in x2 and z2)
    fe x2, z2, x3, z3, t0, t1;
    // Montgomery ladder
    // In projective coordinates, to avoid divisions: x = X / Z
    // We don't care about the y coordinate, it's only 1 bit of information
    fe_1(x2);        fe_0(z2); // "zero" point
    fe_copy(x3, x1); fe_1(z3); // "one"  point
    int swap = 0;
    for (int pos = nb_bits-1; pos >= 0; --pos) {
        // constant time conditional swap before ladder step
        int b = scalar_bit(scalar, pos);
        swap ^= b; // xor trick avoids swapping at the end of the loop
        fe_cswap(x2, x3, swap);
        fe_cswap(z2, z3, swap);
        swap = b;  // anticipates one last swap after the loop

        // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
        // with differential addition
        fe_sub(t0, x3, z3);
        fe_sub(t1, x2, z2);
        fe_add(x2, x2, z2);
        fe_add(z2, x3, z3);
        fe_mul(z3, t0, x2);
        fe_mul(z2, z2, t1);
        fe_sq (t0, t1  );
        fe_sq (t1, x2  );
        fe_add(x3, z3, z2);
        fe_sub(z2, z3, z2);
        fe_mul(x2, t1, t0);
        fe_sub(t1, t1, t0);
        fe_sq (z2, z2  );
        fe_mul_small(z3, t1, 121666);
        fe_sq (x3, x3  );
        fe_add(t0, t0, z3);
        fe_mul(z3, x1, z2);
        fe_mul(z2, t1, t0);
    }
    // last swap is necessary to compensate for the xor trick
    // Note: after this swap, P3 == P2 + P1.
    fe_cswap(x2, x3, swap);
    fe_cswap(z2, z3, swap);

    // normalises the coordinates: x == X / Z
    fe_invert(z2, z2);
    fe_mul(x2, x2, z2);
    fe_tobytes(q, x2);

    WIPE_BUFFER(x1);
    WIPE_BUFFER(x2);  WIPE_BUFFER(z2);  WIPE_BUFFER(t0);
    WIPE_BUFFER(x3);  WIPE_BUFFER(z3);  WIPE_BUFFER(t1);
}

// X25519 key exchange: shared secret from our secret key and their
// public key.  The secret key is clamped before the ladder.
void crypto_x25519(u8       raw_shared_secret[32],
                   const u8 your_secret_key  [32],
                   const u8 their_public_key [32])
{
    // restrict the possible scalar values
    u8 e[32];
    COPY(e, your_secret_key, 32);
    trim_scalar(e);
    scalarmult(raw_shared_secret, e, their_public_key, 255);
    WIPE_BUFFER(e);
}

// Public key = secret_key * base point (u = 9).
void crypto_x25519_public_key(u8       public_key[32],
                              const u8 secret_key[32])
{
    static const u8 base_point[32] = {9};
    crypto_x25519(public_key, secret_key, base_point);
}

///////////////////////////
/// Arithmetic modulo L ///
///////////////////////////

// L, the prime order of the Ed25519 base point,
// as eight 32-bit little-endian words.
static const u32 L[8] = {0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
                         0x00000000, 0x00000000, 0x00000000, 0x10000000,};

// p = a*b + p  (schoolbook 8x8 -> 16 word multiply-accumulate)
static void multiply(u32 p[16], const u32 a[8], const u32 b[8])
{
    FOR (i, 0, 8) {
        u64 carry = 0;
        FOR (j, 0, 8) {
            carry  += p[i+j] + (u64)a[i] * b[j];
            p[i+j]  = (u32)carry;
            carry >>= 32;
        }
        p[i+8] = (u32)carry;
    }
}

// Returns 1 if x >= L, 0 otherwise (constant time:
// computes x - L and keeps only the final borrow/carry).
static int is_above_l(const u32 x[8])
{
    // We work with L directly, in a 2's complement encoding
    // (-L == ~L + 1)
    u64 carry = 1;
    FOR (i, 0, 8) {
        carry  += (u64)x[i] + ~L[i];
        carry >>= 32;
    }
    return carry;
}

// Final reduction modulo L, by conditionally removing L.
// if x < l        , then r = x
// if l <= x < 2*l , then r = x-l
// otherwise the result will be wrong
static void remove_l(u32 r[8], const u32 x[8])
{
    // mask is all-ones iff x >= L; the subtraction is unconditional
    // (constant time), the mask decides whether it takes effect.
    u64 carry = is_above_l(x);
    u32 mask  = ~(u32)carry + 1; // carry == 0 or 1
    FOR (i, 0, 8) {
        carry += (u64)x[i] + (~L[i] & mask);
        r[i]   = (u32)carry;
        carry >>= 32;
    }
}

// Full reduction modulo L (Barrett reduction)
// reduced = x mod L, serialised as 32 little-endian bytes.
static void mod_l(u8 reduced[32], const u32 x[16])
{
    // r = 2^512 / L, the precomputed Barrett constant
    static const u32 r[9] = {0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d,
                             0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,};
    // xr = x * r
    u32 xr[25] = {0};
    FOR (i, 0, 9) {
        u64 carry = 0;
        FOR (j, 0, 16) {
            carry   += xr[i+j] + (u64)r[i] * x[j];
            xr[i+j]  = (u32)carry;
            carry  >>= 32;
        }
        xr[i+16] = (u32)carry;
    }
    // xr = floor(xr / 2^512) * L
    // Since the result is guaranteed to be below 2*L,
    // it is enough to only compute the first 256 bits.
    // The division is performed by saying xr[i+16]. (16 * 32 = 512)
    ZERO(xr, 8);
    FOR (i, 0, 8) {
        u64 carry = 0;
        FOR (j, 0, 8-i) {
            carry   += xr[i+j] + (u64)xr[i+16] * L[j];
            xr[i+j]  = (u32)carry;
            carry  >>= 32;
        }
    }
    // xr = x - xr
    u64 carry = 1;
    FOR (i, 0, 8) {
        carry  += (u64)x[i] + ~xr[i];
        xr[i]   = (u32)carry;
        carry >>= 32;
    }
    // Final reduction modulo L (conditional subtraction)
    remove_l(xr, xr);
    store32_le_buf(reduced, xr, 8);

    WIPE_BUFFER(xr);
}

// In-place: r (64 bytes, little endian) becomes r mod L (first 32 bytes).
static void reduce(u8 r[64])
{
    u32 x[16];
    load32_le_buf(x, r, 16);
    mod_l(r, x);
    WIPE_BUFFER(x);
}

// r = (a * b) + c  (mod L); all operands 32 bytes little endian
static void mul_add(u8 r[32], const u8 a[32], const u8 b[32], const u8 c[32])
{
    u32 A[8];  load32_le_buf(A, a, 8);
    u32 B[8];  load32_le_buf(B, b, 8);
    u32 p[16];
    load32_le_buf(p, c, 8); // seed the accumulator with c
    ZERO(p + 8, 8);
    multiply(p, A, B);
    mod_l(r, p);
    WIPE_BUFFER(p);
    WIPE_BUFFER(A);
    WIPE_BUFFER(B);
}

///////////////
/// Ed25519 ///
///////////////

// Point (group element, ge) in a twisted Edwards curve,
// in extended projective coordinates.
// ge        : x = X/Z, y = Y/Z, T = XY/Z
// ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2
// ge_precomp: Z = 1
typedef struct { fe X;  fe Y;  fe Z; fe T;  } ge;
typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached;
typedef struct { fe Yp; fe Ym;       fe T2; } ge_precomp;

// p = neutral element (0, 1)
static void ge_zero(ge *p)
{
    fe_0(p->X);
    fe_1(p->Y);
    fe_1(p->Z);
    fe_0(p->T);
}

// Serialises h: 255 bits of y, plus the sign of x in the top bit.
static void ge_tobytes(u8 s[32], const ge *h)
{
    fe recip, x, y;
    fe_invert(recip, h->Z);
    fe_mul(x, h->X, recip);
    fe_mul(y, h->Y, recip);
    fe_tobytes(s, y);
    s[31] ^= fe_isodd(x) << 7;

    WIPE_BUFFER(recip);
    WIPE_BUFFER(x);
    WIPE_BUFFER(y);
}

// h = s, where s is a point encoded in 32 bytes
// Returns 0 on success, -1 if s does not encode a curve point.
//
// Variable time!  Inputs must not be secret!
// => Use only to *check* signatures.
//
// From the specifications:
//   The encoding of s contains y and the sign of x
//   x = sqrt((y^2 - 1) / (d*y^2 + 1))
// In extended coordinates:
//   X = x, Y = y, Z = 1, T = x*y
//
//    Note that num * den is a square iff num / den is a square
//    If num * den is not a square, the point was not on the curve.
// From the above:
//   Let num =   y^2 - 1
//   Let den = d*y^2 + 1
//   x = sqrt((y^2 - 1) / (d*y^2 + 1))
//   x = sqrt(num / den)
//   x = sqrt(num^2 / (num * den))
//   x = num * sqrt(1 / (num * den))
//
// Therefore, we can just compute:
//   num = y^2 - 1
//   den = d*y^2 + 1
//   isr = invsqrt(num * den)  // abort if not square
//   x   = num * isr
// Finally, negate x if its sign is not as specified.
static int ge_frombytes_vartime(ge *h, const u8 s[32])
{
    fe_frombytes(h->Y, s);
    fe_1(h->Z);
    fe_sq (h->T, h->Y);        // t =   y^2
    fe_mul(h->X, h->T, d   );  // x = d*y^2
    fe_sub(h->T, h->T, h->Z);  // t =   y^2 - 1
    fe_add(h->X, h->X, h->Z);  // x = d*y^2 + 1
    fe_mul(h->X, h->T, h->X);  // x = (y^2 - 1) * (d*y^2 + 1)
    int is_square = invsqrt(h->X, h->X);
    if (!is_square) {
        return -1;             // Not on the curve, abort
    }
    fe_mul(h->X, h->T, h->X);  // x = sqrt((y^2 - 1) / (d*y^2 + 1))
    if (fe_isodd(h->X) != (s[31] >> 7)) {
        fe_neg(h->X, h->X);
    }
    fe_mul(h->T, h->X, h->Y);
    return 0;
}

// Converts p to the cached form used by ge_add()/ge_sub().
static void ge_cache(ge_cached *c, const ge *p)
{
    fe_add (c->Yp, p->Y, p->X);
    fe_sub (c->Ym, p->Y, p->X);
    fe_copy(c->Z , p->Z      );
    fe_mul (c->T2, p->T, D2  );
}

// Internal buffers are not wiped!  Inputs must not be secret!
// => Use only to *check* signatures.
// s = p + q (extended coordinates + cached point)
static void ge_add(ge *s, const ge *p, const ge_cached *q)
{
    fe a, b;
    fe_add(a   , p->Y, p->X );
    fe_sub(b   , p->Y, p->X );
    fe_mul(a   , a   , q->Yp);
    fe_mul(b   , b   , q->Ym);
    fe_add(s->Y, a   , b    );
    fe_sub(s->X, a   , b    );

    fe_add(s->Z, p->Z, p->Z );
    fe_mul(s->Z, s->Z, q->Z );
    fe_mul(s->T, p->T, q->T2);
    fe_add(a   , s->Z, s->T );
    fe_sub(b   , s->Z, s->T );

    fe_mul(s->T, s->X, s->Y);
    fe_mul(s->X, s->X, b   );
    fe_mul(s->Y, s->Y, a   );
    fe_mul(s->Z, a   , b   );
}

// s = p - q, implemented as p + (-q)
//
// Internal buffers are not wiped! Inputs must not be secret!
// => Use only to *check* signatures.
static void ge_sub(ge *s, const ge *p, const ge_cached *q)
{
    // Negating a cached point swaps Yp/Ym and negates T2
    ge_cached neg;
    fe_copy(neg.Ym, q->Yp);
    fe_copy(neg.Yp, q->Ym);
    fe_copy(neg.Z , q->Z );
    fe_neg (neg.T2, q->T2);
    ge_add(s, p, &neg);
}

// s = p + q (mixed addition: q is precomputed, Z = 1).
// a and b are caller-supplied temporaries (not wiped here).
static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
{
    fe_add(a   , p->Y, p->X );
    fe_sub(b   , p->Y, p->X );
    fe_mul(a   , a   , q->Yp);
    fe_mul(b   , b   , q->Ym);
    fe_add(s->Y, a   , b    );
    fe_sub(s->X, a   , b    );

    fe_add(s->Z, p->Z, p->Z );
    fe_mul(s->T, p->T, q->T2);
    fe_add(a   , s->Z, s->T );
    fe_sub(b   , s->Z, s->T );

    fe_mul(s->T, s->X, s->Y);
    fe_mul(s->X, s->X, b   );
    fe_mul(s->Y, s->Y, a   );
    fe_mul(s->Z, a   , b   );
}

// s = p - q (mixed subtraction: q is precomputed, Z = 1).
// Same as ge_madd with Yp/Ym swapped and the T terms exchanged.
// a and b are caller-supplied temporaries (not wiped here).
static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
{
    fe_add(a   , p->Y, p->X );
    fe_sub(b   , p->Y, p->X );
    fe_mul(a   , a   , q->Ym);
    fe_mul(b   , b   , q->Yp);
    fe_add(s->Y, a   , b    );
    fe_sub(s->X, a   , b    );

    fe_add(s->Z, p->Z, p->Z );
    fe_mul(s->T, p->T, q->T2);
    fe_sub(a   , s->Z, s->T );
    fe_add(b   , s->Z, s->T );

    fe_mul(s->T, s->X, s->Y);
    fe_mul(s->X, s->X, b   );
    fe_mul(s->Y, s->Y, a   );
    fe_mul(s->Z, a   , b   );
}

// s = 2 * p.  q is a caller-supplied scratch point (clobbered).
static void ge_double(ge *s, const ge *p, ge *q)
{
    fe_sq (q->X, p->X);
    fe_sq (q->Y, p->Y);
    fe_sq2(q->Z, p->Z);
    fe_add(q->T, p->X, p->Y);
    fe_sq (s->T, q->T);
    fe_add(q->T, q->Y, q->X);
    fe_sub(q->Y, q->Y, q->X);
    fe_sub(q->X, s->T, q->T);
    fe_sub(q->Z, q->Z, q->Y);

    fe_mul(s->X, q->X , q->Z);
    fe_mul(s->Y, q->T , q->Y);
    fe_mul(s->Z, q->Y , q->Z);
    fe_mul(s->T, q->X , q->T);
}

// 5-bit signed window in cached format (Niels coordinates, Z=1)
static const ge_precomp b_window[8] = {
    {{25967493,-14356035,29566456,3660896,-12694345,
      4014787,27544626,-11754271,-6079156,2047605,},
     {-12545711,934262,-2722910,3049990,-727428,
      9406986,12720692,5043384,19500929,-15469378,},
     {-8738181,4489570,9688441,-14785194,10184609,
      -12363380,29287919,11864899,-24514362,-4438546,},},
    {{15636291,-9688557,24204773,-7912398,616977,
      -16685262,27787600,-14772189,28944400,-1550024,},
     {16568933,4717097,-11556148,-1102322,15682896,
      -11807043,16354577,-11775962,7689662,11199574,},
     {30464156,-5976125,-11779434,-15670865,23220365,
      15915852,7512774,10017326,-17749093,-9920357,},},
    {{10861363,11473154,27284546,1981175,-30064349,
      12577861,32867885,14515107,-15438304,10819380,},
     {4708026,6336745,20377586,9066809,-11272109,
      6594696,-25653668,12483688,-12668491,5581306,},
     {19563160,16186464,-29386857,4097519,10237984,
      -4348115,28542350,13850243,-23678021,-15815942,},},
    {{5153746,9909285,1723747,-2777874,30523605,
      5516873,19480852,5230134,-23952439,-15175766,},
     {-30269007,-3463509,7665486,10083793,28475525,
      1649722,20654025,16520125,30598449,7715701,},
     {28881845,14381568,9657904,3680757,-20181635,
      7843316,-31400660,1370708,29794553,-1409300,},},
    {{-22518993,-6692182,14201702,-8745502,-23510406,
      8844726,18474211,-1361450,-13062696,13821877,},
     {-6455177,-7839871,3374702,-4740862,-27098617,
      -10571707,31655028,-7212327,18853322,-14220951,},
     {4566830,-12963868,-28974889,-12240689,-7602672,
      -2830569,-8514358,-10431137,2207753,-3209784,},},
    {{-25154831,-4185821,29681144,7868801,-6854661,
      -9423865,-12437364,-663000,-31111463,-16132436,},
     {25576264,-2703214,7349804,-11814844,16472782,
      9300885,3844789,15725684,171356,6466918,},
     {23103977,13316479,9739013,-16149481,817875,
      -15038942,8965339,-14088058,-30714912,16193877,},},
    {{-33521811,3180713,-2394130,14003687,-16903474,
      -16270840,17238398,4729455,-18074513,9256800,},
     {-25182317,-4174131,32336398,5036987,-21236817,
      11360617,22616405,9761698,-19827198,630305,},
     {-13720693,2639453,-24237460,-7406481,9494427,
      -5774029,-6554551,-15960994,-2449256,-14291300,},},
    {{-3151181,-5046075,9282714,6866145,-31907062,
      -863023,-18940575,15033784,25105118,-7894876,},
     {-24326370,15950226,-31801215,-14592823,-11662737,
      -5090925,1573892,-2625887,2198790,-15804619,},
     {-3099351,10324967,-2241613,7453183,-5446979,
      -2735503,-13812022,-16236442,-32461234,-12290683,},},
};

// Incremental sliding windows (left to right)
// Based on Roberto Maria Avanzi[2005]
typedef struct {
    i16 next_index; // position of the next signed digit
    i8  next_digit; // next signed digit (odd number below 2^window_width)
    u8  next_check; // point at which we must check for a new window
} slide_ctx;

static void slide_init(slide_ctx *ctx, const u8 scalar[32])
{
    // scalar is guaranteed to be below L, either because we checked (s),
    // or because we reduced it modulo L (h_ram). L is under 2^253, so
    // bits 253 to 255 are guaranteed to be zero. No need to test them.
    //
    // Note however that L is very close to 2^252, so bit 252 is almost
    // always zero.  If we were to start at bit 251, the tests wouldn't
    // catch the off-by-one error (constructing one that does would be
    // prohibitively expensive).
    //
    // We should still check bit 252, though.
    int i = 252;
    while (i > 0 && scalar_bit(scalar, i) == 0) {
        i--;
    }
    ctx->next_check = (u8)(i + 1);
    ctx->next_index = -1;
    ctx->next_digit = -1;
}

// Returns the signed digit to apply at bit position i (0 if none),
// advancing the sliding window state as needed.
static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
{
    if (i == ctx->next_check) {
        if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
            ctx->next_check--;
        } else {
            // compute digit of next window
            int w = MIN(width, i + 1);
            int v = -(scalar_bit(scalar, i) << (w-1));
            FOR_T (int, j, 0, w-1) {
                v += scalar_bit(scalar, i-(w-1)+j) << j;
            }
            v += scalar_bit(scalar, i-w);
            int lsb = v & (~v + 1);          // smallest bit of v
            int s   = (  ((lsb & 0xAA) != 0) // log2(lsb)
                       | (((lsb & 0xCC) != 0) << 1)
                       | (((lsb & 0xF0) != 0) << 2));
            ctx->next_index  = (i16)(i-(w-1)+s);
            ctx->next_digit  = (i8) (v >> s   );
            ctx->next_check -= (u8) w;
        }
    }
    return i == ctx->next_index ? ctx->next_digit: 0;
}

#define P_W_WIDTH 3 // Affects the size of the stack
#define B_W_WIDTH 5 // Affects the size of the binary
#define P_W_SIZE  (1<<(P_W_WIDTH-2))

// P = [b]B + [p]P, where B is the base point
//
// Variable time! Internal buffers are not wiped! Inputs must not be secret!
// => Use only to *check* signatures.
static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32])
{
    // cache P window for addition
    ge_cached cP[P_W_SIZE];
    {
        ge P2, tmp;
        ge_double(&P2, P, &tmp);
        ge_cache(&cP[0], P);
        FOR (i, 1, P_W_SIZE) {
            // cP[i] = (2*i + 1) * P  (odd multiples only)
            ge_add(&tmp, &P2, &cP[i-1]);
            ge_cache(&cP[i], &tmp);
        }
    }

    // Merged double and add ladder, fused with sliding
    slide_ctx p_slide;  slide_init(&p_slide, p);
    slide_ctx b_slide;  slide_init(&b_slide, b);
    int i = MAX(p_slide.next_check, b_slide.next_check);
    ge *sum = P; // the result overwrites P in place
    ge_zero(sum);
    while (i >= 0) {
        ge tmp;
        ge_double(sum, sum, &tmp);
        int p_digit = slide_step(&p_slide, P_W_WIDTH, i, p);
        int b_digit = slide_step(&b_slide, B_W_WIDTH, i, b);
        // digits are odd, so digit/2 indexes the table of odd multiples
        if (p_digit > 0) { ge_add(sum, sum, &cP[ p_digit / 2]); }
        if (p_digit < 0) { ge_sub(sum, sum, &cP[-p_digit / 2]); }
        fe t1, t2;
        if (b_digit > 0) { ge_madd(sum, sum, b_window +  b_digit/2, t1, t2); }
        if (b_digit < 0) { ge_msub(sum, sum, b_window + -b_digit/2, t1, t2); }
        i--;
    }
}

// R_check = s[B] - h_ram[pk], where B is the base point
//
// Variable time! Internal buffers are not wiped! Inputs must not be secret!
// => Use only to *check* signatures.
static int ge_r_check(u8 R_check[32], u8 s[32], u8 h_ram[32], u8 pk[32])
{
    ge  A;      // not secret, not wiped
    u32 s32[8]; // not secret, not wiped
    load32_le_buf(s32, s, 8);
    if (ge_frombytes_vartime(&A, pk) ||         // A = pk
        is_above_l(s32)) {                      // prevent s malleability
        return -1;
    }
    fe_neg(A.X, A.X);
    fe_neg(A.T, A.T);                           // A = -pk
    ge_double_scalarmult_vartime(&A, h_ram, s); // A = [s]B - [h_ram]pk
    ge_tobytes(R_check, &A);                    // R_check = A
    return 0;
}

// 5-bit signed comb in cached format (Niels coordinates, Z=1)
static const ge_precomp b_comb_low[8] = {
    {{-6816601,-2324159,-22559413,124364,18015490,
      8373481,19993724,1979872,-18549925,9085059,},
     {10306321,403248,14839893,9633706,8463310,
      -8354981,-14305673,14668847,26301366,2818560,},
     {-22701500,-3210264,-13831292,-2927732,-16326337,
      -14016360,12940910,177905,12165515,-2397893,},},
    {{-12282262,-7022066,9920413,-3064358,-32147467,
      2927790,22392436,-14852487,2719975,16402117,},
     {-7236961,-4729776,2685954,-6525055,-24242706,
      -15940211,-6238521,14082855,10047669,12228189,},
     {-30495588,-12893761,-11161261,3539405,-11502464,
      16491580,-27286798,-15030530,-7272871,-15934455,},},
    {{17650926,582297,-860412,-187745,-12072900,
      -10683391,-20352381,15557840,-31072141,-5019061,},
     {-6283632,-2259834,-4674247,-4598977,-4089240,
      12435688,-31278303,1060251,6256175,10480726,},
     {-13871026,2026300,-21928428,-2741605,-2406664,
      -8034988,7355518,15733500,-23379862,7489131,},},
    {{6883359,695140,23196907,9644202,-33430614,
      11354760,-20134606,6388313,-8263585,-8491918,},
     {-7716174,-13605463,-13646110,14757414,-19430591,
      -14967316,10359532,-11059670,-21935259,12082603,},
     {-11253345,-15943946,10046784,5414629,24840771,
      8086951,-6694742,9868723,15842692,-16224787,},},
    {{9639399,11810955,-24007778,-9320054,3912937,
      -9856959,996125,-8727907,-8919186,-14097242,},
     {7248867,14468564,25228636,-8795035,14346339,
      8224790,6388427,-7181107,6468218,-8720783,},
     {15513115,15439095,7342322,-10157390,18005294,
      -7265713,2186239,4884640,10826567,7135781,},},
    {{-14204238,5297536,-5862318,-6004934,28095835,
      4236101,-14203318,1958636,-16816875,3837147,},
     {-5511166,-13176782,-29588215,12339465,15325758,
      -15945770,-8813185,11075932,-19608050,-3776283,},
     {11728032,9603156,-4637821,-5304487,-7827751,
      2724948,31236191,-16760175,-7268616,14799772,},},
    {{-28842672,4840636,-12047946,-9101456,-1445464,
      381905,-30977094,-16523389,1290540,12798615,},
     {27246947,-10320914,14792098,-14518944,5302070,
      -8746152,-3403974,-4149637,-27061213,10749585,},
     {25572375,-6270368,-15353037,16037944,1146292,
      32198,23487090,9585613,24714571,-1418265,},},
    {{19844825,282124,-17583147,11004019,-32004269,
      -2716035,6105106,-1711007,-21010044,14338445,},
     {8027505,8191102,-18504907,-12335737,25173494,
      -5923905,15446145,7483684,-30440441,10009108,},
     {-14134701,-4174411,10246585,-14677495,33553567,
      -14012935,23366126,15080531,-7969992,7663473,},},
};

static const ge_precomp b_comb_high[8] = {
    {{33055887,-4431773,-521787,6654165,951411,
      -6266464,-5158124,6995613,-5397442,-6985227,},
     {4014062,6967095,-11977872,3960002,8001989,
      5130302,-2154812,-1899602,-31954493,-16173976,},
     {16271757,-9212948,23792794,731486,-25808309,
      -3546396,6964344,-4767590,10976593,10050757,},},
    {{2533007,-4288439,-24467768,-12387405,-13450051,
      14542280,12876301,13893535,15067764,8594792,},
     {20073501,-11623621,3165391,-13119866,13188608,
      -11540496,-10751437,-13482671,29588810,2197295,},
     {-1084082,11831693,6031797,14062724,14748428,
      -8159962,-20721760,11742548,31368706,13161200,},},
    {{2050412,-6457589,15321215,5273360,25484180,
      124590,-18187548,-7097255,-6691621,-14604792,},
     {9938196,2162889,-6158074,-1711248,4278932,
      -2598531,-22865792,-7168500,-24323168,11746309,},
     {-22691768,-14268164,5965485,9383325,20443693,
      5854192,28250679,-1381811,-10837134,13717818,},},
    {{-8495530,16382250,9548884,-4971523,-4491811,
      -3902147,6182256,-12832479,26628081,10395408,},
     {27329048,-15853735,7715764,8717446,-9215518,
      -14633480,28982250,-5668414,4227628,242148,},
     {-13279943,-7986904,-7100016,8764468,-27276630,
      3096719,29678419,-9141299,3906709,11265498,},},
    {{11918285,15686328,-17757323,-11217300,-27548967,
      4853165,-27168827,6807359,6871949,-1075745,},
     {-29002610,13984323,-27111812,-2713442,28107359,
      -13266203,6155126,15104658,3538727,-7513788,},
     {14103158,11233913,-33165269,9279850,31014152,
      4335090,-1827936,4590951,13960841,12787712,},},
    {{1469134,-16738009,33411928,13942824,8092558,
      -8778224,-11165065,1437842,22521552,-2792954,},
     {31352705,-4807352,-25327300,3962447,12541566,
      -9399651,-27425693,7964818,-23829869,5541287,},
     {-25732021,-6864887,23848984,3039395,-9147354,
      6022816,-27421653,10590137,25309915,-1584678,},},
    {{-22951376,5048948,31139401,-190316,-19542447,
      -626310,-17486305,-16511925,-18851313,-12985140,},
     {-9684890,14681754,30487568,7717771,-10829709,
      9630497,30290549,-10531496,-27798994,-13812825,},
     {5827835,16097107,-24501327,12094619,7413972,
      11447087,28057551,-1793987,-14056981,4359312,},},
    {{26323183,2342588,-21887793,-1623758,-6062284,
      2107090,-28724907,9036464,-19618351,-13055189,},
     {-29697200,14829398,-4596333,14220089,-30022969,
      2955645,12094100,-13693652,-5941445,7047569,},
     {-3201977,14413268,-12058324,-16417589,-9035655,
      -7224648,9258160,1399236,30397584,-5684634,},},
};

// Constant-time selection of a comb entry by signed index, followed by a
// mixed addition: p += sign * comb[index].  The four "teeth" of the comb
// are taken from bits i, i+32, i+64, i+96 of the scalar; tmp_a/tmp_b/tmp_c
// are caller-supplied temporaries (not wiped here).
static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
                       const ge_precomp comb[8], const u8 scalar[32], int i)
{
    u8 teeth = (u8)((scalar_bit(scalar, i)          ) +
                    (scalar_bit(scalar, i + 32) << 1) +
                    (scalar_bit(scalar, i + 64) << 2) +
                    (scalar_bit(scalar, i + 96) << 3));
    u8 high  = teeth >> 3;              // sign of the signed digit
    u8 index = (teeth ^ (high - 1)) & 7; // absolute value - style index
    FOR (j, 0, 8) {
        // select == 1 only when j == index (branch-free)
        i32 select = 1 & (((j ^ index) - 1) >> 8);
        fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
        fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
        fe_ccopy(tmp_c->T2, comb[j].T2, select);
    }
    // Conditionally negate the selected point (swap Yp/Ym, negate T2)
    fe_neg(tmp_a, tmp_c->T2);
    fe_cswap(tmp_c->T2, tmp_a    , high ^ 1);
    fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
    ge_madd(p, p, tmp_c, tmp_a, tmp_b);
}

// p = [scalar]B, where B is the base point
static void ge_scalarmult_base(ge *p, const u8 scalar[32])
{
    // twin 4-bits signed combs, from Mike Hamburg's
    // Fast and compact elliptic-curve cryptography (2012)
    // 1 / 2 modulo L
    static const u8 half_mod_L[32] = {
        247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8, };
    // (2^256 - 1) / 2 modulo L
    static const u8 half_ones[32] = {
        142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7, };

    // All bits set form: 1 means 1, 0 means -1
    u8 s_scalar[32];
    mul_add(s_scalar, scalar, half_mod_L, half_ones);

    // Double and add ladder
    fe tmp_a, tmp_b;  // temporaries for addition
    ge_precomp tmp_c; // temporary for comb lookup
    ge tmp_d;         // temporary for doubling
    fe_1(tmp_c.Yp);
    fe_1(tmp_c.Ym);
    fe_0(tmp_c.T2);

    // Save a double on the first iteration
    ge_zero(p);
    lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31);
    lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128);
    // Regular double & add for the rest
    for (int i = 30; i >= 0; i--) {
        ge_double(p, p, &tmp_d);
        lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i);
        lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128);
    }
    // Note: we could save one addition at the end if we assumed the
    // scalar fit in 252 bits.  Which it does in practice if it is
    // selected at random.  However, non-random, non-hashed scalars
    // *can* overflow 252 bits in practice.  Better account for that
    // than leaving that kind of subtle corner case.

    WIPE_BUFFER(tmp_a);  WIPE_CTX(&tmp_d);
    WIPE_BUFFER(tmp_b);  WIPE_CTX(&tmp_c);
    WIPE_BUFFER(s_scalar);
}

// public_key = [trim(hash(secret_key))]B, using the caller-supplied hash.
void crypto_sign_public_key_custom_hash(u8 public_key[32],
                                        const u8 secret_key[32],
                                        const crypto_sign_vtable *hash)
{
    u8 a[64];
    hash->hash(a, secret_key, 32);
    trim_scalar(a);
    ge A;
    ge_scalarmult_base(&A, a);
    ge_tobytes(public_key, &A);
    WIPE_BUFFER(a);
    WIPE_CTX(&A);
}

// Same as above, with the default BLAKE2b hash.
void crypto_sign_public_key(u8 public_key[32], const u8 secret_key[32])
{
    crypto_sign_public_key_custom_hash(public_key, secret_key,
                                       &crypto_blake2b_vtable);
}

void crypto_sign_init_first_pass_custom_hash(crypto_sign_ctx_abstract *ctx,
                                             const u8 secret_key[32],
                                             const u8 public_key[32],
                                             const crypto_sign_vtable *hash)
{
    ctx->hash = hash; // set vtable
    u8 *a      = ctx->buf;      // secret scalar
    u8 *prefix = ctx->buf + 32; // nonce prefix
    ctx->hash->hash(a, secret_key, 32);
    trim_scalar(a);

    if (public_key == 0) {
        crypto_sign_public_key_custom_hash(ctx->pk, secret_key, ctx->hash);
    } else {
        COPY(ctx->pk, public_key, 32);
    }

    // Deterministic part of EdDSA: Construct a nonce by hashing the message
    // instead of generating a random number.
    // An actual random number would work just fine, and would save us
    // the trouble of hashing the message twice.  If we did that
    // however, the user could misuse it and reuse the nonce.
    ctx->hash->init  (ctx);
    ctx->hash->update(ctx, prefix , 32);
}

// First pass with the default BLAKE2b hash.
void crypto_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
                                 const u8 secret_key[32],
                                 const u8 public_key[32])
{
    crypto_sign_init_first_pass_custom_hash(ctx, secret_key, public_key,
                                            &crypto_blake2b_vtable);
}

// Feeds message bytes to the current hash pass (used by both passes).
void crypto_sign_update(crypto_sign_ctx_abstract *ctx,
                        const u8 *msg, size_t msg_size)
{
    ctx->hash->update(ctx, msg, msg_size);
}

void crypto_sign_init_second_pass(crypto_sign_ctx_abstract *ctx)
{
    u8 *r        = ctx->buf + 32; // nonce (hash of prefix + message, mod L)
    u8 *half_sig = ctx->buf + 64; // R, first half of the signature
    ctx->hash->final(ctx, r);
    reduce(r);

    // first half of the signature = "random" nonce times the base point
    ge R;
    ge_scalarmult_base(&R, r);
    ge_tobytes(half_sig, &R);
    WIPE_CTX(&R);

    // Hash R, the public key, and the message together.
    // It cannot be done in parallel with the first hash.
    ctx->hash->init  (ctx);
    ctx->hash->update(ctx, half_sig, 32);
    ctx->hash->update(ctx, ctx->pk , 32);
}

void crypto_sign_final(crypto_sign_ctx_abstract *ctx, u8 signature[64])
{
    u8 *a        = ctx->buf;      // secret scalar
    u8 *r        = ctx->buf + 32; // nonce
    u8 *half_sig = ctx->buf + 64; // R
    u8 h_ram[64];
    ctx->hash->final(ctx, h_ram);
    reduce(h_ram);
    COPY(signature, half_sig, 32);
    mul_add(signature + 32, h_ram, a, r); // s = h_ram * a + r
    WIPE_BUFFER(h_ram);
    crypto_wipe(ctx, ctx->hash->ctx_size);
}

// One-shot signature: runs both passes over the message.
void crypto_sign(u8 signature[64],
                 const u8 secret_key[32],
                 const u8 public_key[32],
                 const u8 *message, size_t message_size)
{
    crypto_sign_ctx ctx;
    crypto_sign_ctx_abstract *actx = (crypto_sign_ctx_abstract*)&ctx;
    crypto_sign_init_first_pass (actx, secret_key, public_key);
    crypto_sign_update          (actx, message, message_size);
    crypto_sign_init_second_pass(actx);
    crypto_sign_update          (actx, message, message_size);
    crypto_sign_final           (actx, signature);
}

void crypto_check_init_custom_hash(crypto_check_ctx_abstract *ctx,
                                   const u8 signature[64],
                                   const u8 public_key[32],
                                   const crypto_sign_vtable *hash)
{
    ctx->hash = hash; // set vtable
    COPY(ctx->buf, signature , 64);
    COPY(ctx->pk , public_key, 32);
    ctx->hash->init  (ctx);
    ctx->hash->update(ctx, signature , 32);
    ctx->hash->update(ctx, public_key, 32);
}

// Verification init with the default BLAKE2b hash.
void crypto_check_init(crypto_check_ctx_abstract *ctx, const u8 signature[64],
                       const u8 public_key[32])
{
    crypto_check_init_custom_hash(ctx, signature, public_key,
                                  &crypto_blake2b_vtable);
}

// Feeds message bytes to the verification hash.
void crypto_check_update(crypto_check_ctx_abstract *ctx,
                         const u8 *msg, size_t msg_size)
{
    ctx->hash->update(ctx, msg, msg_size);
}

int crypto_check_final(crypto_check_ctx_abstract *ctx)
{
    u8 h_ram[64];
    ctx->hash->final(ctx, h_ram);
    reduce(h_ram);
    u8 *R       = ctx->buf;      // R
    u8 *s       = ctx->buf + 32; // s
    u8 *R_check = ctx->pk;       // overwrite ctx->pk to save stack space
    if (ge_r_check(R_check, s, h_ram, ctx->pk)) {
        return -1;
    }
    return crypto_verify32(R, R_check); // R == R_check ?
    // 0 if equal (OK), -1 otherwise (fail)
}

// One-shot signature verification.  Returns 0 if valid, -1 otherwise.
int crypto_check(const u8 signature[64], const u8 public_key[32],
                 const u8 *message, size_t message_size)
{
    crypto_check_ctx ctx;
    crypto_check_ctx_abstract *actx = (crypto_check_ctx_abstract*)&ctx;
    crypto_check_init  (actx, signature, public_key);
    crypto_check_update(actx, message, message_size);
    return crypto_check_final(actx);
}

///////////////////////
/// EdDSA to X25519 ///
///////////////////////
// The X25519 secret is the first 32 bytes of the hash of the EdDSA secret.
void crypto_from_eddsa_private(u8 x25519[32], const u8 eddsa[32])
{
    u8 a[64];
    crypto_blake2b(a, eddsa, 32);
    COPY(x25519, a, 32);
    WIPE_BUFFER(a);
}

// Birational map from Edwards y to Montgomery u: u = (1+y) / (1-y)
void crypto_from_eddsa_public(u8 x25519[32], const u8 eddsa[32])
{
    fe t1, t2;
    fe_frombytes(t2, eddsa);
    fe_add(t1, fe_one, t2);
    fe_sub(t2, fe_one, t2);
    fe_invert(t2, t2);
    fe_mul(t1, t1, t2);
    fe_tobytes(x25519, t1);
    WIPE_BUFFER(t1);
    WIPE_BUFFER(t2);
}

/////////////////////////////////////////////
/// Dirty ephemeral public key generation ///
/////////////////////////////////////////////

// Those functions generate a public key, *without* clearing the
// cofactor.  Sending that key over the network leaks 3 bits of the
// private key.  Use only to generate ephemeral keys that will be hidden
// with crypto_curve_to_hidden().
//
// The public key is otherwise compatible with crypto_x25519() and
// crypto_key_exchange() (those properly clear the cofactor).
//
// Note that the distribution of the resulting public keys is almost
// uniform.  Flipping the sign of the v coordinate (not provided by this
// function), covers the entire key space almost perfectly, where
// "almost" means a 2^-128 bias (undetectable).  This uniformity is
// needed to ensure the proper randomness of the resulting
// representatives (once we apply crypto_curve_to_hidden()).
//
// Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
// to be confused with the prime order of the main subgroup, L, which is
// 8 times less than that).
//
// Generating all points would require us to multiply a point of order C
// (the base point plus any point of order 8) by all scalars from 0 to
// C-1.  Clamping limits us to scalars between 2^254 and 2^255 - 1.  But
// by negating the resulting point at random, we also cover scalars from
// -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
//
// In practice:
// - Scalars from 0         to e + 1      are never generated
// - Scalars from 2^255     to 2^255 + e  are never generated
// - Scalars from 2^254 + 1 to 2^254 + e  are generated twice
//
// Since e < 2^128, detecting this bias requires observing over 2^100
// representatives from a given source (this will never happen), *and*
// recovering enough of the private key to determine that they do, or do
// not, belong to the biased set (this practically requires solving
// discrete logarithm, which is conjecturally intractable).
//
// In practice, this means the bias is impossible to detect.

// s + (x*L) % 8*L
// Guaranteed to fit in 256 bits iff s fits in 255 bits.
//   L             < 2^253
//   x%8           < 2^3
//   L * (x%8)     < 2^255
//   s             < 2^255
//   s + L * (x%8) < 2^256
static void add_xl(u8 s[32], u8 x)
{
    u64 mod8  = x & 7;
    u64 carry = 0;
    FOR (i , 0, 8) {
        carry = carry + load32_le(s + 4*i) + L[i] * mod8;
        store32_le(s + 4*i, (u32)carry);
        carry >>= 32;
    }
}

// "Small" dirty ephemeral key.
// Use if you need to shrink the size of the binary, and can afford to
// slow down by a factor of two (compared to the fast version)
//
// This version works by decoupling the cofactor from the main factor.
//
// - The trimmed scalar determines the main factor
// - The clamped bits of the scalar determine the cofactor.
//
// Cofactor and main factor are combined into a single scalar, which is
// then multiplied by a point of order 8*L (unlike the base point, which
// has prime order).  That "dirty" base point is the addition of the
// regular base point (9), and a point of order 8.
void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
{
    // Base point of order 8*L
    // Raw scalar multiplication with it does not clear the cofactor,
    // and the resulting public key will reveal 3 bits of the scalar.
    static const u8 dirty_base_point[32] = {
        0x34, 0xfc, 0x6c, 0xb7, 0xc8, 0xde, 0x58, 0x97, 0x77, 0x70, 0xd9, 0x52,
        0x16, 0xcc, 0xdc, 0x6c, 0x85, 0x90, 0xbe, 0xcd, 0x91, 0x9c, 0x07, 0x59,
        0x94, 0x14, 0x56, 0x3b, 0x4b, 0xa4, 0x47, 0x0f, };
    // separate the main factor & the cofactor of the scalar
    u8 scalar[32];
    COPY(scalar, secret_key, 32);
    trim_scalar(scalar);

    // Separate the main factor and the cofactor
    //
    // The scalar is trimmed, so its cofactor is cleared.  The three
    // least significant bits however still have a main factor.  We must
    // remove it for X25519 compatibility.
    //
    // We exploit the fact that 5*L = 1 (modulo 8)
    //   cofactor = lsb * 5 * L             (modulo 8*L)
    //   combined = scalar + cofactor       (modulo 8*L)
    //   combined = scalar + (lsb * 5 * L)  (modulo 8*L)
    add_xl(scalar, secret_key[0] * 5);
    scalarmult(public_key, scalar, dirty_base_point, 256);
    WIPE_BUFFER(scalar);
}

// "Fast" dirty ephemeral key
// We use this one by default.
//
// This version works by performing a regular scalar multiplication,
// then adding a low order point.  The scalar multiplication is done in
// Edwards space for more speed (*2 compared to the "small" version).
// The cost is a bigger binary for programs that don't also sign messages.
void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
{
    u8 scalar[32];
    ge pk;
    COPY(scalar, secret_key, 32);
    trim_scalar(scalar);
    ge_scalarmult_base(&pk, scalar);

    // Select low order point
    // We're computing the [cofactor]lop scalar multiplication, where:
    //   cofactor = tweak & 7.
    //   lop      = (lop_x, lop_y)
    //   lop_x    = sqrt((sqrt(d + 1) + 1) / d)
    //   lop_y    = -lop_x * sqrtm1
    // Notes:
    // - A (single) Montgomery ladder would be twice as slow.
    // - An actual scalar multiplication would hurt performance.
    // - A full table lookup would take more code.
    u8 cofactor = secret_key[0] & 7;
    int a = (cofactor >> 2) & 1;
    int b = (cofactor >> 1) & 1;
    int c = (cofactor >> 0) & 1;
    fe t1, t2, t3;
    fe_0(t1);
    fe_ccopy(t1, sqrtm1, b);
    fe_ccopy(t1, lop_x , c);
    fe_neg  (t3, t1);
    fe_ccopy(t1, t3, a);
    fe_1(t2);
    fe_0(t3);
    fe_ccopy(t2, t3   , b);
    fe_ccopy(t2, lop_y, c);
    fe_neg  (t3, t2);
    fe_ccopy(t2, t3, a^b);
    ge_precomp low_order_point;
    fe_add(low_order_point.Yp, t2, t1);
    fe_sub(low_order_point.Ym, t2, t1);
    fe_mul(low_order_point.T2, t2, t1);
    fe_mul(low_order_point.T2, low_order_point.T2, D2);

    // Add low order point to the public key
    ge_madd(&pk, &pk, &low_order_point, t1, t2);

    // Convert to Montgomery u coordinate (we ignore the sign)
    fe_add(t1, pk.Z, pk.Y);
    fe_sub(t2, pk.Z, pk.Y);
    fe_invert(t2, t2);
    fe_mul(t1, t1, t2);

    fe_tobytes(public_key, t1);

    WIPE_BUFFER(t1);  WIPE_BUFFER(scalar);
    WIPE_BUFFER(t2);  WIPE_CTX(&pk);
    WIPE_BUFFER(t3);  WIPE_CTX(&low_order_point);
}

///////////////////
/// Elligator 2 ///
///////////////////
static const fe A = {486662}; // Montgomery curve constant

// Elligator direct map
//
// Computes
// the point corresponding to a representative, encoded in 32
// bytes (little Endian).  Since positive representatives fit in 254
// bits, the two most significant bits are ignored.
//
// From the paper:
//   w = -A / (fe(1) + non_square * r^2)
//   e = chi(w^3 + A*w^2 + w)
//   u = e*w - (fe(1)-e)*(A//2)
//   v = -e * sqrt(u^3 + A*u^2 + u)
//
// We ignore v because we don't need it for X25519 (the Montgomery
// ladder only uses u).
//
// Note that e is either 0, 1 or -1
//   if e = 0    u = 0  and v = 0
//   if e = 1    u = w
//   if e = -1   u = -w - A = w * non_square * r^2
//
// Let r1 = non_square * r^2
// Let r2 = 1 + r1
// Note that r2 cannot be zero, -1/non_square is not a square.
// We can (tediously) verify that:
//   w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
// Therefore:
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3) * r2^6)
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * A * r2^3)
// Corollary:
//   e =  1 if (A^2*r1 - r2^2) * A * r2^3) is a non-zero square
//   e = -1 if (A^2*r1 - r2^2) * A * r2^3) is not a square
// Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
//   w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
//   w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
//   w^3 + A*w^2 + w = w * (w + A/2)^2 - A^2/4 + 1)
// which is zero only if:
//   w = 0                   (impossible)
//   (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
//
// Let isr = invsqrt((A^2*r1 - r2^2) * A * r2^3)
//   isr = sqrt(1        / ((A^2*r1 - r2^2) * A * r2^3)) if e =  1
//   isr = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) * A * r2^3)) if e = -1
//
// if e = 1
//   let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
//       u1 = w
//       u1 = u
//
// if e = -1
//   let ufactor = -non_square * sqrt(-1) * r^2
//   let vfactor = sqrt(ufactor)
//   let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
//       u2 = w * -1 * -non_square * r^2
//       u2 = w * non_square * r^2
//       u2 = u
void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32])
{
    // Representatives are encoded in 254 bits.
    // The two most significant ones are random padding that must be ignored.
    u8 clamped[32];
    COPY(clamped, hidden, 32);
    clamped[31] &= 0x3f;

    fe r, u, t1, t2, t3;
    fe_frombytes(r, clamped);
    fe_sq2(t1, r);             // t1 = non_square * r^2  (= r1)
    fe_add(u, t1, fe_one);     // u  = r2
    fe_sq (t2, u);             // t2 = r2^2
    fe_mul(t3, A2, t1);        // t3 = A^2 * r1
    fe_sub(t3, t3, t2);        // t3 = A^2*r1 - r2^2
    fe_mul(t3, t3, A);         // t3 = (A^2*r1 - r2^2) * A
    fe_mul(t1, t2, u);         // t1 = r2^3
    fe_mul(t1, t3, t1);        // t1 = (A^2*r1 - r2^2) * A * r2^3
    int is_square = invsqrt(t1, t1);
    fe_sq(u, r);
    fe_mul(u, u, ufactor);     // u = ufactor for the e = -1 case
    fe_ccopy(u, fe_one, is_square); // constant-time select on e
    fe_sq (t1, t1);            // t1 = isr^2
    fe_mul(u, u, A);
    fe_mul(u, u, t3);
    fe_mul(u, u, t2);
    fe_mul(u, u, t1);
    fe_neg(u, u);              // u = -A * ... (see derivation above)
    fe_tobytes(curve, u);

    WIPE_BUFFER(t1);  WIPE_BUFFER(r);
    WIPE_BUFFER(t2);  WIPE_BUFFER(u);
    WIPE_BUFFER(t3);  WIPE_BUFFER(clamped);
}

// Elligator inverse map
//
// Computes the representative of a point, if possible.  If not, it does
// nothing and returns -1.  Note that the success of the operation
// depends only on the point (more precisely its u coordinate).  The
// tweak parameter is used only upon success.
//
// The tweak should be a random byte.  Beyond that, its contents are an
// implementation detail.  Currently, the tweak comprises:
// - Bit  1   : sign of the v coordinate (0 if positive, 1 if negative)
// - Bits 2-5 : not used
// - Bits 6-7 : random padding
//
// From the paper:
// Let sq = -non_square * u * (u+A)
// if sq is not a square, or u = -A, there is no mapping
// Assuming there is a mapping:
//   if v is positive: r = sqrt(-(u+A) / u)
//   if v is negative: r = sqrt(-u / (u+A))
//
// We compute isr = invsqrt(-non_square * u * (u+A))
// if it wasn't a non-zero square, abort.
// else, isr = sqrt(-1 / (non_square * u * (u+A))
//
// This causes us to abort if u is zero, even though we shouldn't.  This
// never happens in practice, because (i) a random point on the curve has
// a negligible chance of being zero, and (ii) scalar multiplication with
// a trimmed scalar *never* yields zero.
//
// Since:
//   isr * (u+A) = sqrt(-1     / (non_square * u * (u+A)) * (u+A)
//   isr * (u+A) = sqrt(-(u+A) / (non_square * u * (u+A))
// and:
//   isr * u = sqrt(-1 / (non_square * u * (u+A)) * u
//   isr * u = sqrt(-u / (non_square * u * (u+A))
// Therefore:
//   if v is positive: r = isr * (u+A)
//   if v is negative: r = isr * u
int crypto_curve_to_hidden(u8 hidden[32], const u8 public_key[32], u8 tweak)
{
    fe t1, t2, t3;
    fe_frombytes(t1, public_key);

    fe_add(t2, t1, A);          // t2 = u + A
    fe_mul(t3, t1, t2);         // t3 = u * (u+A)
    fe_mul_small(t3, t3, -2);   // t3 = -non_square * u * (u+A)
    int is_square = invsqrt(t3, t3);
    if (!is_square) {
        // The only variable time bit.  This ultimately reveals how many
        // tries it took us to find a representable key.
        // This does not affect security as long as we try keys at random.
        WIPE_BUFFER(t1);
        WIPE_BUFFER(t2);
        WIPE_BUFFER(t3);
        return -1;
    }
    fe_ccopy    (t1, t2, tweak & 1); // select u or u+A by sign of v
    fe_mul      (t3, t1, t3);        // r = isr * (u or u+A)
    fe_mul_small(t1, t3, 2);
    fe_neg      (t2, t3);
    fe_ccopy    (t3, t2, fe_isodd(t1)); // pick the "positive" representative
    fe_tobytes(hidden, t3);

    // Pad with two random bits
    hidden[31] |= tweak & 0xc0;

    WIPE_BUFFER(t1);
    WIPE_BUFFER(t2);
    WIPE_BUFFER(t3);
    return 0;
}

// Generates a hidden (Elligator-encoded) key pair.  Retries with a new
// seed until the public key is representable; wipes the caller's seed.
void crypto_hidden_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
{
    u8 pk [32]; // public key
    u8 buf[64]; // seed + representative
    COPY(buf + 32, seed, 32);
    do {
        crypto_chacha20(buf, 0, 64, buf+32, zero);
        crypto_x25519_dirty_fast(pk, buf); // or the "small" version
    } while(crypto_curve_to_hidden(buf+32, pk, buf[32]));
    // Note that the return value of crypto_curve_to_hidden() is
    // independent from its tweak parameter.
    // Therefore, buf[32] is not actually reused.  Either we loop one
    // more time and buf[32] is used for the new seed, or we succeeded,
    // and buf[32] becomes the tweak parameter.

    crypto_wipe(seed, 32);
    COPY(hidden    , buf + 32, 32);
    COPY(secret_key, buf     , 32);
    WIPE_BUFFER(buf);
    WIPE_BUFFER(pk);
}

////////////////////
/// Key exchange ///
////////////////////
// shared_key = HChaCha20(X25519(your_secret_key, their_public_key), zero)
void crypto_key_exchange(u8 shared_key[32],
                         const u8 your_secret_key [32],
                         const u8 their_public_key[32])
{
    crypto_x25519(shared_key, your_secret_key, their_public_key);
    crypto_hchacha20(shared_key, shared_key, zero);
}

///////////////////////
/// Scalar division ///
///////////////////////

// Montgomery reduction.
// Divides x by (2^256), and reduces the result modulo L
//
// Precondition:
//   x < L * 2^256
// Constants:
//   r = 2^256                 (makes division by r trivial)
//   k = (r * (1/r) - 1) // L  (1/r is computed modulo L)
// Algorithm:
//   s = (x * k) % r
//   t = x + s*L   (t is always a multiple of r)
//   u = (t/r) % L (u is always below 2*L, conditional subtraction is enough)
static void redc(u32 u[8], u32 x[16])
{
    static const u32 k[8] = { 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
                              0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,};
    static const u32 l[8] = { 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
                              0x00000000, 0x00000000, 0x00000000, 0x10000000,};
    // s = x * k (modulo 2^256)
    // This is cheaper than the full multiplication.
    // (Partial schoolbook product: limbs i+j >= 8 are discarded.)
    u32 s[8] = {0};
    FOR (i, 0, 8) {
        u64 carry = 0;
        FOR (j, 0, 8-i) {
            carry  += s[i+j] + (u64)x[i] * k[j];
            s[i+j]  = (u32)carry;
            carry >>= 32;
        }
    }
    u32 t[16] = {0};
    multiply(t, s, l);

    // t = t + x
    u64 carry = 0;
    FOR (i, 0, 16) {
        carry += (u64)t[i] + x[i];
        t[i]   = (u32)carry;
        carry >>= 32;
    }

    // u = (t / 2^256) % L
    // Note that t / 2^256 is always below 2*L,
    // So a constant time conditional subtraction is enough
    // We work with L directly, in a 2's complement encoding
    // (-L == ~L + 1)
    remove_l(u, t+8);

    WIPE_BUFFER(s);
    WIPE_BUFFER(t);
}

// Computes blind_salt = (1/scalar) * curve_point, where scalar is the
// trimmed private key.  The modular inverse is computed with Fermat's
// little theorem: inv = scalar^(L-2) (mod L), using Montgomery
// multiplication (see redc() above) for the square-and-multiply ladder.
void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
                           const u8 curve_point[32])
{
    static const u8 Lm2[32] = { // L - 2
        0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2,
        0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, };
    // 1 in Montgomery form (i.e. 2^256 modulo L)
    u32 m_inv [8] = {0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
                     0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,};

    u8 scalar[32];
    COPY(scalar, private_key, 32);
    trim_scalar(scalar);

    // Convert the scalar in Montgomery form
    // m_scl = scalar * 2^256 (modulo L)
    u32 m_scl[8];
    {
        u32 tmp[16];
        ZERO(tmp, 8);
        load32_le_buf(tmp+8, scalar, 8);
        mod_l(scalar, tmp);
        load32_le_buf(m_scl, scalar, 8);
        WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
    }

    // Square-and-multiply exponentiation by L-2.  The branch below
    // depends only on the bits of the *public* constant Lm2, so it does
    // not leak secret data.
    u32 product[16];
    for (int i = 252; i >= 0; i--) {
        ZERO(product, 16);
        multiply(product, m_inv, m_inv);
        redc(m_inv, product);
        if (scalar_bit(Lm2, i)) {
            ZERO(product, 16);
            multiply(product, m_inv, m_scl);
            redc(m_inv, product);
        }
    }
    // Convert the inverse *out* of Montgomery form
    // scalar = m_inv / 2^256 (modulo L)
    COPY(product, m_inv, 8);
    ZERO(product + 8, 8);
    redc(m_inv, product);
    store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar

    // Clear the cofactor of scalar:
    //   cleared = scalar * (3*L + 1)      (modulo 8*L)
    //   cleared = scalar + scalar * 3 * L (modulo 8*L)
    // Note that (scalar * 3) is reduced modulo 8, so we only need the
    // first byte.
    add_xl(scalar, scalar[0] * 3);

    // Recall that 8*L < 2^256.  However it is also very close to
    // 2^255.  If we spanned the ladder over 255 bits, random tests
    // wouldn't catch the off-by-one error.
    scalarmult(blind_salt, scalar, curve_point, 256);

    WIPE_BUFFER(scalar);  WIPE_BUFFER(m_scl);
    WIPE_BUFFER(product); WIPE_BUFFER(m_inv);
}

////////////////////////////////
/// Authenticated encryption ///
////////////////////////////////

// Computes the Poly1305 MAC over the additional data and the cipher
// text, laid out as in the RFC 8439 AEAD construction: ad, zero
// padding to a 16-byte boundary, cipher text, zero padding to a
// 16-byte boundary, then both sizes as 64-bit little-endian integers.
static void lock_auth(u8 mac[16], const u8  auth_key[32],
                      const u8 *ad         , size_t ad_size,
                      const u8 *cipher_text, size_t text_size)
{
    u8 sizes[16]; // Not secret, not wiped
    store64_le(sizes + 0, ad_size);
    store64_le(sizes + 8, text_size);
    crypto_poly1305_ctx poly_ctx;           // auto wiped...
    crypto_poly1305_init  (&poly_ctx, auth_key);
    crypto_poly1305_update(&poly_ctx, ad         , ad_size);
    crypto_poly1305_update(&poly_ctx, zero       , align(ad_size, 16));
    crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
    crypto_poly1305_update(&poly_ctx, zero       , align(text_size, 16));
    crypto_poly1305_update(&poly_ctx, sizes      , 16);
    crypto_poly1305_final (&poly_ctx, mac); // ...here
}

// Authenticated encryption (XChaCha20 + Poly1305):
// - HChaCha20 turns the key and the first 16 nonce bytes into a sub key;
// - ChaCha20 block 0 (with the last 8 nonce bytes) yields the Poly1305
//   key, blocks 1+ encrypt the plain text;
// - lock_auth() computes the MAC over ad and cipher text.
void crypto_lock_aead(u8 mac[16], u8 *cipher_text,
                      const u8  key[32], const u8  nonce[24],
                      const u8 *ad        , size_t ad_size,
                      const u8 *plain_text, size_t text_size)
{
    u8 sub_key[32];
    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
    crypto_hchacha20(sub_key, key, nonce);
    crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
    crypto_chacha20_ctr(cipher_text, plain_text, text_size,
                        sub_key, nonce + 16, 1);
    lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
    WIPE_BUFFER(sub_key);
    WIPE_BUFFER(auth_key);
}

// Authenticated decryption.  Returns -1 (and writes nothing to
// plain_text) when the MAC doesn't match; 0 on success.  The MAC is
// compared in constant time (crypto_verify16), and decryption happens
// only *after* successful verification.
int crypto_unlock_aead(u8 *plain_text, const u8 key[32], const u8 nonce[24],
                       const u8  mac[16],
                       const u8 *ad         , size_t ad_size,
                       const u8 *cipher_text, size_t text_size)
{
    u8 sub_key[32];
    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
    crypto_hchacha20(sub_key, key, nonce);
    crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
    u8 real_mac[16];
    lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
    WIPE_BUFFER(auth_key);
    if (crypto_verify16(mac, real_mac)) {
        WIPE_BUFFER(sub_key);
        WIPE_BUFFER(real_mac);
        return -1;
    }
    crypto_chacha20_ctr(plain_text, cipher_text, text_size,
                        sub_key, nonce + 16, 1);
    WIPE_BUFFER(sub_key);
    WIPE_BUFFER(real_mac);
    return 0;
}

// Convenience wrapper: authenticated encryption without additional data.
void crypto_lock(u8 mac[16], u8 *cipher_text,
                 const u8 key[32], const u8 nonce[24],
                 const u8 *plain_text, size_t text_size)
{
    crypto_lock_aead(mac, cipher_text, key, nonce, 0, 0, plain_text, text_size);
}

// Convenience wrapper: authenticated decryption without additional data.
// Returns -1 on MAC mismatch, 0 on success.
int crypto_unlock(u8 *plain_text,
                  const u8 key[32], const u8 nonce[24], const u8 mac[16],
                  const u8 *cipher_text, size_t text_size)
{
    return crypto_unlock_aead(plain_text, key, nonce, mac, 0, 0,
                              cipher_text, text_size);
}