passman

dead-simple password manager using state-of-the-art encryption
git clone git://kqueue.dev/passman.git

monocypher.c (110141B)


      1 // Monocypher version 3.1.2
      2 //
      3 // This file is dual-licensed.  Choose whichever licence you want from
      4 // the two licences listed below.
      5 //
      6 // The first licence is a regular 2-clause BSD licence.  The second licence
      7 // is the CC-0 from Creative Commons. It is intended to release Monocypher
      8 // to the public domain.  The BSD licence serves as a fallback option.
      9 //
     10 // SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
     11 //
     12 // ------------------------------------------------------------------------
     13 //
     14 // Copyright (c) 2017-2020, Loup Vaillant
     15 // All rights reserved.
     16 //
     17 //
     18 // Redistribution and use in source and binary forms, with or without
     19 // modification, are permitted provided that the following conditions are
     20 // met:
     21 //
     22 // 1. Redistributions of source code must retain the above copyright
     23 //    notice, this list of conditions and the following disclaimer.
     24 //
     25 // 2. Redistributions in binary form must reproduce the above copyright
     26 //    notice, this list of conditions and the following disclaimer in the
     27 //    documentation and/or other materials provided with the
     28 //    distribution.
     29 //
     30 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     31 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     32 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     33 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     34 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     35 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     36 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     37 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     38 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     39 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     40 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     41 //
     42 // ------------------------------------------------------------------------
     43 //
     44 // Written in 2017-2020 by Loup Vaillant
     45 //
     46 // To the extent possible under law, the author(s) have dedicated all copyright
     47 // and related neighboring rights to this software to the public domain
     48 // worldwide.  This software is distributed without any warranty.
     49 //
     50 // You should have received a copy of the CC0 Public Domain Dedication along
     51 // with this software.  If not, see
     52 // <https://creativecommons.org/publicdomain/zero/1.0/>
     53 
     54 #include "monocypher.h"
     55 
     56 /////////////////
     57 /// Utilities ///
     58 /////////////////
     59 #define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
     60 #define FOR(i, start, end)         FOR_T(size_t, i, start, end)
     61 #define COPY(dst, src, size)       FOR(i, 0, size) (dst)[i] = (src)[i]
     62 #define ZERO(buf, size)            FOR(i, 0, size) (buf)[i] = 0
     63 #define WIPE_CTX(ctx)              crypto_wipe(ctx   , sizeof(*(ctx)))
     64 #define WIPE_BUFFER(buffer)        crypto_wipe(buffer, sizeof(buffer))
     65 #define MIN(a, b)                  ((a) <= (b) ? (a) : (b))
     66 #define MAX(a, b)                  ((a) >= (b) ? (a) : (b))
     67 
     68 typedef int8_t   i8;
     69 typedef uint8_t  u8;
     70 typedef int16_t  i16;
     71 typedef uint32_t u32;
     72 typedef int32_t  i32;
     73 typedef int64_t  i64;
     74 typedef uint64_t u64;
     75 
     76 static const u8 zero[128] = {0};
     77 
     78 // returns the smallest non-negative integer y such that
     79 // (x + y) % pow_2  == 0
     80 // Basically, it's how many bytes we need to add to "align" x.
     81 // Only works when pow_2 is a power of 2.
     82 // Note: we use ~x+1 instead of -x to avoid compiler warnings
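        // For example, align(13, 16) == 3 and align(32, 16) == 0 (already aligned).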
     83 static size_t align(size_t x, size_t pow_2)
     84 {
     85     return (~x + 1) & (pow_2 - 1);
     86 }
     87 
     88 static u32 load24_le(const u8 s[3])
     89 {
     90     return (u32)s[0]
     91         | ((u32)s[1] <<  8)
     92         | ((u32)s[2] << 16);
     93 }
     94 
     95 static u32 load32_le(const u8 s[4])
     96 {
     97     return (u32)s[0]
     98         | ((u32)s[1] <<  8)
     99         | ((u32)s[2] << 16)
    100         | ((u32)s[3] << 24);
    101 }
    102 
    103 static u64 load64_le(const u8 s[8])
    104 {
    105     return load32_le(s) | ((u64)load32_le(s+4) << 32);
    106 }
    107 
    108 static void store32_le(u8 out[4], u32 in)
    109 {
    110     out[0] =  in        & 0xff;
    111     out[1] = (in >>  8) & 0xff;
    112     out[2] = (in >> 16) & 0xff;
    113     out[3] = (in >> 24) & 0xff;
    114 }
    115 
    116 static void store64_le(u8 out[8], u64 in)
    117 {
    118     store32_le(out    , (u32)in );
    119     store32_le(out + 4, in >> 32);
    120 }
    121 
    122 static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
    123     FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
    124 }
    125 static void load64_le_buf (u64 *dst, const u8 *src, size_t size) {
    126     FOR(i, 0, size) { dst[i] = load64_le(src + i*8); }
    127 }
    128 static void store32_le_buf(u8 *dst, const u32 *src, size_t size) {
    129     FOR(i, 0, size) { store32_le(dst + i*4, src[i]); }
    130 }
    131 static void store64_le_buf(u8 *dst, const u64 *src, size_t size) {
    132     FOR(i, 0, size) { store64_le(dst + i*8, src[i]); }
    133 }
    134 
    135 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
    136 static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
    137 
    138 static int neq0(u64 diff)
    139 {   // constant time comparison to zero
    140     // return diff != 0 ? -1 : 0
    141     u64 half = (diff >> 32) | ((u32)diff);
    142     return (1 & ((half - 1) >> 32)) - 1;
    143 }
    144 
    145 static u64 x16(const u8 a[16], const u8 b[16])
    146 {
    147     return (load64_le(a + 0) ^ load64_le(b + 0))
    148         |  (load64_le(a + 8) ^ load64_le(b + 8));
    149 }
    150 static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
    151 static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
    152 int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
    153 int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
    154 int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); }
    155 
    156 void crypto_wipe(void *secret, size_t size)
    157 {
    158     volatile u8 *v_secret = (u8*)secret;
    159     ZERO(v_secret, size);
    160 }
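        // Example sketch (editor addition, not part of the original Monocypher
        // source): the helpers above in use.  crypto_verify16/32/64 compare
        // fixed size buffers in constant time (0 means equal, -1 means
        // different), and crypto_wipe erases secrets through a volatile
        // pointer so the compiler does not optimise the stores away.
        static int example_compare_and_wipe(u8 a[32], u8 b[32])
        {
            int different = crypto_verify32(a, b); // constant time comparison
            crypto_wipe(a, 32);                    // erase both secrets
            crypto_wipe(b, 32);
            return different;                      // 0 if the secrets were equal
        }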
    161 
    162 /////////////////
    163 /// Chacha 20 ///
    164 /////////////////
    165 #define QUARTERROUND(a, b, c, d)     \
    166     a += b;  d = rotl32(d ^ a, 16);  \
    167     c += d;  b = rotl32(b ^ c, 12);  \
    168     a += b;  d = rotl32(d ^ a,  8);  \
    169     c += d;  b = rotl32(b ^ c,  7)
    170 
    171 static void chacha20_rounds(u32 out[16], const u32 in[16])
    172 {
    173     // The temporary variables make Chacha20 10% faster.
    174     u32 t0  = in[ 0];  u32 t1  = in[ 1];  u32 t2  = in[ 2];  u32 t3  = in[ 3];
    175     u32 t4  = in[ 4];  u32 t5  = in[ 5];  u32 t6  = in[ 6];  u32 t7  = in[ 7];
    176     u32 t8  = in[ 8];  u32 t9  = in[ 9];  u32 t10 = in[10];  u32 t11 = in[11];
    177     u32 t12 = in[12];  u32 t13 = in[13];  u32 t14 = in[14];  u32 t15 = in[15];
    178 
    179     FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
    180         QUARTERROUND(t0, t4, t8 , t12); // column 0
    181         QUARTERROUND(t1, t5, t9 , t13); // column 1
    182         QUARTERROUND(t2, t6, t10, t14); // column 2
    183         QUARTERROUND(t3, t7, t11, t15); // column 3
    184         QUARTERROUND(t0, t5, t10, t15); // diagonal 0
    185         QUARTERROUND(t1, t6, t11, t12); // diagonal 1
    186         QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
    187         QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
    188     }
    189     out[ 0] = t0;   out[ 1] = t1;   out[ 2] = t2;   out[ 3] = t3;
    190     out[ 4] = t4;   out[ 5] = t5;   out[ 6] = t6;   out[ 7] = t7;
    191     out[ 8] = t8;   out[ 9] = t9;   out[10] = t10;  out[11] = t11;
    192     out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
    193 }
    194 
    195 static void chacha20_init_key(u32 block[16], const u8 key[32])
    196 {
    197     load32_le_buf(block  , (const u8*)"expand 32-byte k", 4); // constant
    198     load32_le_buf(block+4, key                          , 8); // key
    199 }
    200 
    201 void crypto_hchacha20(u8 out[32], const u8 key[32], const u8 in [16])
    202 {
    203     u32 block[16];
    204     chacha20_init_key(block, key);
    205     // input
    206     load32_le_buf(block + 12, in, 4);
    207     chacha20_rounds(block, block);
    208     // prevent reversal of the rounds by revealing only half of the buffer.
    209     store32_le_buf(out   , block   , 4); // constant
    210     store32_le_buf(out+16, block+12, 4); // counter and nonce
    211     WIPE_BUFFER(block);
    212 }
    213 
    214 u64 crypto_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
    215                         size_t text_size, const u8 key[32], const u8 nonce[8],
    216                         u64 ctr)
    217 {
    218     u32 input[16];
    219     chacha20_init_key(input, key);
    220     input[12] = (u32) ctr;
    221     input[13] = (u32)(ctr >> 32);
    222     load32_le_buf(input+14, nonce, 2);
    223 
    224     // Whole blocks
    225     u32    pool[16];
    226     size_t nb_blocks = text_size >> 6;
    227     FOR (i, 0, nb_blocks) {
    228         chacha20_rounds(pool, input);
    229         if (plain_text != 0) {
    230             FOR (j, 0, 16) {
    231                 u32 p = pool[j] + input[j];
    232                 store32_le(cipher_text, p ^ load32_le(plain_text));
    233                 cipher_text += 4;
    234                 plain_text  += 4;
    235             }
    236         } else {
    237             FOR (j, 0, 16) {
    238                 u32 p = pool[j] + input[j];
    239                 store32_le(cipher_text, p);
    240                 cipher_text += 4;
    241             }
    242         }
    243         input[12]++;
    244         if (input[12] == 0) {
    245             input[13]++;
    246         }
    247     }
    248     text_size &= 63;
    249 
    250     // Last (incomplete) block
    251     if (text_size > 0) {
    252         if (plain_text == 0) {
    253             plain_text = zero;
    254         }
    255         chacha20_rounds(pool, input);
    256         u8 tmp[64];
    257         FOR (i, 0, 16) {
    258             store32_le(tmp + i*4, pool[i] + input[i]);
    259         }
    260         FOR (i, 0, text_size) {
    261             cipher_text[i] = tmp[i] ^ plain_text[i];
    262         }
    263         WIPE_BUFFER(tmp);
    264     }
    265     ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);
    266 
    267     WIPE_BUFFER(pool);
    268     WIPE_BUFFER(input);
    269     return ctr;
    270 }
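        // Example sketch (editor addition, not part of the original Monocypher
        // source): the returned counter lets a stream be resumed.  As long as
        // the split point is a multiple of 64 bytes (a whole block), encrypting
        // in two calls gives the same ciphertext as one.  Key and nonce here
        // are placeholders and must be random / unique in real use.
        static void example_chacha20_resume(void)
        {
            u8 key  [32]  = {1}; // placeholder key
            u8 nonce[ 8]  = {2}; // placeholder nonce
            u8 msg  [192] = {0}; // three 64-byte blocks
            u8 whole[192];       // ciphertext from a single call
            u8 split[192];       // ciphertext from two calls
            crypto_chacha20_ctr(whole, msg, 192, key, nonce, 0);
            u64 ctr = crypto_chacha20_ctr(split, msg, 128, key, nonce, 0);
            crypto_chacha20_ctr(split + 128, msg + 128, 64, key, nonce, ctr);
            // whole and split are now byte for byte identical (ctr == 2 here).
        }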
    271 
    272 u32 crypto_ietf_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
    273                              size_t text_size,
    274                              const u8 key[32], const u8 nonce[12], u32 ctr)
    275 {
    276     u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
    277     return (u32)crypto_chacha20_ctr(cipher_text, plain_text, text_size,
    278                                     key, nonce + 4, big_ctr);
    279 }
    280 
    281 u64 crypto_xchacha20_ctr(u8 *cipher_text, const u8 *plain_text,
    282                          size_t text_size,
    283                          const u8 key[32], const u8 nonce[24], u64 ctr)
    284 {
    285     u8 sub_key[32];
    286     crypto_hchacha20(sub_key, key, nonce);
    287     ctr = crypto_chacha20_ctr(cipher_text, plain_text, text_size,
    288                               sub_key, nonce+16, ctr);
    289     WIPE_BUFFER(sub_key);
    290     return ctr;
    291 }
    292 
    293 void crypto_chacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
    294                      const u8 key[32], const u8 nonce[8])
    295 {
    296     crypto_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
    297 
    298 }
    299 void crypto_ietf_chacha20(u8 *cipher_text, const u8 *plain_text,
    300                           size_t text_size,
    301                           const u8 key[32], const u8 nonce[12])
    302 {
    303     crypto_ietf_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
    304 }
    305 
    306 void crypto_xchacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
    307                       const u8 key[32], const u8 nonce[24])
    308 {
    309     crypto_xchacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
    310 }
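        // Example sketch (editor addition, not part of the original Monocypher
        // source): one shot XChaCha20.  Running the same call twice decrypts,
        // since the cipher only XORs a key stream into the buffer.  The key and
        // nonce below are placeholders: a real key must be uniformly random and
        // a (key, nonce) pair must never be reused.
        static void example_xchacha20(void)
        {
            u8 key  [32] = {0};           // placeholder 32-byte secret key
            u8 nonce[24] = {0};           // placeholder 24-byte nonce
            u8 text [12] = "hello world"; // 11 characters plus '\0'
            crypto_xchacha20(text, text, sizeof(text), key, nonce); // encrypt
            crypto_xchacha20(text, text, sizeof(text), key, nonce); // decrypt
            crypto_wipe(key, sizeof(key));
        }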
    311 
    312 /////////////////
    313 /// Poly 1305 ///
    314 /////////////////
    315 
    316 // h = (h + c) * r
    317 // preconditions:
    318 //   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
    319 //   ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff
    320 //   ctx->r <=   0ffffffc_0ffffffc_0ffffffc_0fffffff
    321 // Postcondition:
    322 //   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
    323 static void poly_block(crypto_poly1305_ctx *ctx)
    324 {
    325     // s = h + c, without carry propagation
    326     const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe
    327     const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe
    328     const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe
    329     const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe
    330     const u32 s4 = ctx->h[4] +      ctx->c[4]; // s4 <=          5
    331 
    332     // Local all the things!
    333     const u32 r0 = ctx->r[0];       // r0  <= 0fffffff
    334     const u32 r1 = ctx->r[1];       // r1  <= 0ffffffc
    335     const u32 r2 = ctx->r[2];       // r2  <= 0ffffffc
    336     const u32 r3 = ctx->r[3];       // r3  <= 0ffffffc
    337     const u32 rr0 = (r0 >> 2) * 5;  // rr0 <= 13fffffb // lose 2 bits...
    338     const u32 rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
    339     const u32 rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5
    340     const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5
    341 
    342     // (h + c) * r, without carry propagation
    343     const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0; // <= 97ffffe007fffff8
    344     const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1; // <= 8fffffe20ffffff6
    345     const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2; // <= 87ffffe417fffff4
    346     const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3; // <= 7fffffe61ffffff2
    347     const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits    // <=                f
    348 
    349     // partial reduction modulo 2^130 - 5
    350     const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
    351     const u64 u0 = (u5 >>  2) * 5 + (x0 & 0xffffffff);
    352     const u64 u1 = (u0 >> 32)     + (x1 & 0xffffffff) + (x0 >> 32);
    353     const u64 u2 = (u1 >> 32)     + (x2 & 0xffffffff) + (x1 >> 32);
    354     const u64 u3 = (u2 >> 32)     + (x3 & 0xffffffff) + (x2 >> 32);
    355     const u64 u4 = (u3 >> 32)     + (u5 & 3);
    356 
    357     // Update the hash
    358     ctx->h[0] = (u32)u0; // u0 <= 1_9ffffff0
    359     ctx->h[1] = (u32)u1; // u1 <= 1_97ffffe0
    360     ctx->h[2] = (u32)u2; // u2 <= 1_8fffffe2
    361     ctx->h[3] = (u32)u3; // u3 <= 1_87ffffe4
    362     ctx->h[4] = (u32)u4; // u4 <=          4
    363 }
    364 
    365 // (re-)initialises the input counter and input buffer
    366 static void poly_clear_c(crypto_poly1305_ctx *ctx)
    367 {
    368     ZERO(ctx->c, 4);
    369     ctx->c_idx = 0;
    370 }
    371 
    372 static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input)
    373 {
    374     size_t word = ctx->c_idx >> 2;
    375     size_t byte = ctx->c_idx & 3;
    376     ctx->c[word] |= (u32)input << (byte * 8);
    377     ctx->c_idx++;
    378 }
    379 
    380 static void poly_update(crypto_poly1305_ctx *ctx,
    381                         const u8 *message, size_t message_size)
    382 {
    383     FOR (i, 0, message_size) {
    384         poly_take_input(ctx, message[i]);
    385         if (ctx->c_idx == 16) {
    386             poly_block(ctx);
    387             poly_clear_c(ctx);
    388         }
    389     }
    390 }
    391 
    392 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
    393 {
    394     // Initial hash is zero
    395     ZERO(ctx->h, 5);
    396     // add 2^130 to every input block
    397     ctx->c[4] = 1;
    398     poly_clear_c(ctx);
    399     // load r and pad (r has some of its bits cleared)
    400     load32_le_buf(ctx->r  , key   , 4);
    401     load32_le_buf(ctx->pad, key+16, 4);
    402     FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
    403     FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
    404 }
    405 
    406 void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
    407                             const u8 *message, size_t message_size)
    408 {
    409     if (message_size == 0) {
    410         return;
    411     }
    412     // Align ourselves with block boundaries
    413     size_t aligned = MIN(align(ctx->c_idx, 16), message_size);
    414     poly_update(ctx, message, aligned);
    415     message      += aligned;
    416     message_size -= aligned;
    417 
    418     // Process the message block by block
    419     size_t nb_blocks = message_size >> 4;
    420     FOR (i, 0, nb_blocks) {
    421         load32_le_buf(ctx->c, message, 4);
    422         poly_block(ctx);
    423         message += 16;
    424     }
    425     if (nb_blocks > 0) {
    426         poly_clear_c(ctx);
    427     }
    428     message_size &= 15;
    429 
    430     // remaining bytes
    431     poly_update(ctx, message, message_size);
    432 }
    433 
    434 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
    435 {
    436     // Process the last block (if any)
    437     if (ctx->c_idx != 0) {
    438         // move the final 1 according to remaining input length
    439         // (We may add less than 2^130 to the last input block)
    440         ctx->c[4] = 0;
    441         poly_take_input(ctx, 1);
    442         // one last hash update
    443         poly_block(ctx);
    444     }
    445 
    446     // check if we should subtract 2^130-5 by performing the
    447     // corresponding carry propagation.
    448     u64 c = 5;
    449     FOR (i, 0, 4) {
    450         c  += ctx->h[i];
    451         c >>= 32;
    452     }
    453     c += ctx->h[4];
    454     c  = (c >> 2) * 5; // shift the carry back to the beginning
    455     // c now indicates how many times we should subtract 2^130-5 (0 or 1)
    456     FOR (i, 0, 4) {
    457         c += (u64)ctx->h[i] + ctx->pad[i];
    458         store32_le(mac + i*4, (u32)c);
    459         c = c >> 32;
    460     }
    461     WIPE_CTX(ctx);
    462 }
    463 
    464 void crypto_poly1305(u8     mac[16],  const u8 *message,
    465                      size_t message_size, const u8  key[32])
    466 {
    467     crypto_poly1305_ctx ctx;
    468     crypto_poly1305_init  (&ctx, key);
    469     crypto_poly1305_update(&ctx, message, message_size);
    470     crypto_poly1305_final (&ctx, mac);
    471 }
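        // Example sketch (editor addition, not part of the original Monocypher
        // source): checking a received Poly1305 tag.  The comparison must use
        // crypto_verify16 (constant time), never memcmp, and a Poly1305 key
        // must authenticate a single message only.
        static int example_poly1305_check(const u8 mac[16], const u8 *msg,
                                          size_t msg_size, const u8 key[32])
        {
            u8 expected[16];
            crypto_poly1305(expected, msg, msg_size, key);
            int mismatch = crypto_verify16(expected, mac); // 0 = match, -1 = not
            crypto_wipe(expected, 16);
            return mismatch;
        }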
    472 
    473 ////////////////
    474 /// Blake2 b ///
    475 ////////////////
    476 static const u64 iv[8] = {
    477     0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
    478     0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
    479     0x510e527fade682d1, 0x9b05688c2b3e6c1f,
    480     0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
    481 };
    482 
    483 // increment the input offset
    484 static void blake2b_incr(crypto_blake2b_ctx *ctx)
    485 {
    486     u64   *x = ctx->input_offset;
    487     size_t y = ctx->input_idx;
    488     x[0] += y;
    489     if (x[0] < y) {
    490         x[1]++;
    491     }
    492 }
    493 
    494 static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
    495 {
    496     static const u8 sigma[12][16] = {
    497         {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    498         { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    499         { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    500         {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    501         {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    502         {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    503         { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    504         { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    505         {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    506         { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
    507         {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    508         { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    509     };
    510 
    511     // init work vector
    512     u64 v0 = ctx->hash[0];  u64 v8  = iv[0];
    513     u64 v1 = ctx->hash[1];  u64 v9  = iv[1];
    514     u64 v2 = ctx->hash[2];  u64 v10 = iv[2];
    515     u64 v3 = ctx->hash[3];  u64 v11 = iv[3];
    516     u64 v4 = ctx->hash[4];  u64 v12 = iv[4] ^ ctx->input_offset[0];
    517     u64 v5 = ctx->hash[5];  u64 v13 = iv[5] ^ ctx->input_offset[1];
    518     u64 v6 = ctx->hash[6];  u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
    519     u64 v7 = ctx->hash[7];  u64 v15 = iv[7];
    520 
    521     // mangle work vector
    522     u64 *input = ctx->input;
    523 #define BLAKE2_G(a, b, c, d, x, y)      \
    524     a += b + x;  d = rotr64(d ^ a, 32); \
    525     c += d;      b = rotr64(b ^ c, 24); \
    526     a += b + y;  d = rotr64(d ^ a, 16); \
    527     c += d;      b = rotr64(b ^ c, 63)
    528 #define BLAKE2_ROUND(i)                                                 \
    529     BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]); \
    530     BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]); \
    531     BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]); \
    532     BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]); \
    533     BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]); \
    534     BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]); \
    535     BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]); \
    536     BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]])
    537 
    538 #ifdef BLAKE2_NO_UNROLLING
    539     FOR (i, 0, 12) {
    540         BLAKE2_ROUND(i);
    541     }
    542 #else
    543     BLAKE2_ROUND(0);  BLAKE2_ROUND(1);  BLAKE2_ROUND(2);  BLAKE2_ROUND(3);
    544     BLAKE2_ROUND(4);  BLAKE2_ROUND(5);  BLAKE2_ROUND(6);  BLAKE2_ROUND(7);
    545     BLAKE2_ROUND(8);  BLAKE2_ROUND(9);  BLAKE2_ROUND(10); BLAKE2_ROUND(11);
    546 #endif
    547 
    548     // update hash
    549     ctx->hash[0] ^= v0 ^ v8;   ctx->hash[1] ^= v1 ^ v9;
    550     ctx->hash[2] ^= v2 ^ v10;  ctx->hash[3] ^= v3 ^ v11;
    551     ctx->hash[4] ^= v4 ^ v12;  ctx->hash[5] ^= v5 ^ v13;
    552     ctx->hash[6] ^= v6 ^ v14;  ctx->hash[7] ^= v7 ^ v15;
    553 }
    554 
    555 static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input, size_t index)
    556 {
    557     if (index == 0) {
    558         ZERO(ctx->input, 16);
    559     }
    560     size_t word = index >> 3;
    561     size_t byte = index & 7;
    562     ctx->input[word] |= (u64)input << (byte << 3);
    563 
    564 }
    565 
    566 static void blake2b_end_block(crypto_blake2b_ctx *ctx)
    567 {
    568     if (ctx->input_idx == 128) {  // If buffer is full,
    569         blake2b_incr(ctx);        // update the input offset
    570         blake2b_compress(ctx, 0); // and compress the (not last) block
    571         ctx->input_idx = 0;
    572     }
    573 }
    574 
    575 static void blake2b_update(crypto_blake2b_ctx *ctx,
    576                            const u8 *message, size_t message_size)
    577 {
    578     FOR (i, 0, message_size) {
    579         blake2b_end_block(ctx);
    580         blake2b_set_input(ctx, message[i], ctx->input_idx);
    581         ctx->input_idx++;
    582     }
    583 }
    584 
    585 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
    586                                  const u8           *key, size_t key_size)
    587 {
    588     // initial hash
    589     COPY(ctx->hash, iv, 8);
    590     ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;
    591 
    592     ctx->input_offset[0] = 0;         // beginning of the input, no offset
    593     ctx->input_offset[1] = 0;         // beginning of the input, no offset
    594     ctx->hash_size       = hash_size; // remember the hash size we want
    595     ctx->input_idx       = 0;
    596 
    597     // if there is a key, the first block is that key (padded with zeroes)
    598     if (key_size > 0) {
    599         u8 key_block[128] = {0};
    600         COPY(key_block, key, key_size);
    601         // same as calling crypto_blake2b_update(ctx, key_block , 128)
    602         load64_le_buf(ctx->input, key_block, 16);
    603         ctx->input_idx = 128;
    604     }
    605 }
    606 
    607 void crypto_blake2b_init(crypto_blake2b_ctx *ctx)
    608 {
    609     crypto_blake2b_general_init(ctx, 64, 0, 0);
    610 }
    611 
    612 void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
    613                            const u8 *message, size_t message_size)
    614 {
    615     if (message_size == 0) {
    616         return;
    617     }
    618     // Align ourselves with block boundaries
    619     size_t aligned = MIN(align(ctx->input_idx, 128), message_size);
    620     blake2b_update(ctx, message, aligned);
    621     message      += aligned;
    622     message_size -= aligned;
    623 
    624     // Process the message block by block
    625     FOR (i, 0, message_size >> 7) { // number of blocks
    626         blake2b_end_block(ctx);
    627         load64_le_buf(ctx->input, message, 16);
    628         message += 128;
    629         ctx->input_idx = 128;
    630     }
    631     message_size &= 127;
    632 
    633     // remaining bytes
    634     blake2b_update(ctx, message, message_size);
    635 }
    636 
    637 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
    638 {
    639     // Pad the end of the block with zeroes
    640     FOR (i, ctx->input_idx, 128) {
    641         blake2b_set_input(ctx, 0, i);
    642     }
    643     blake2b_incr(ctx);        // update the input offset
    644     blake2b_compress(ctx, 1); // compress the last block
    645     size_t nb_words = ctx->hash_size >> 3;
    646     store64_le_buf(hash, ctx->hash, nb_words);
    647     FOR (i, nb_words << 3, ctx->hash_size) {
    648         hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
    649     }
    650     WIPE_CTX(ctx);
    651 }
    652 
    653 void crypto_blake2b_general(u8       *hash   , size_t hash_size,
    654                             const u8 *key    , size_t key_size,
    655                             const u8 *message, size_t message_size)
    656 {
    657     crypto_blake2b_ctx ctx;
    658     crypto_blake2b_general_init(&ctx, hash_size, key, key_size);
    659     crypto_blake2b_update(&ctx, message, message_size);
    660     crypto_blake2b_final(&ctx, hash);
    661 }
    662 
    663 void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size)
    664 {
    665     crypto_blake2b_general(hash, 64, 0, 0, message, message_size);
    666 }
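        // Example sketch (editor addition, not part of the original Monocypher
        // source): unkeyed, keyed, and incremental BLAKE2b.  The digest size
        // may be 1..64 bytes and the key size 0..64 bytes; the key below is a
        // placeholder.
        static void example_blake2b(const u8 *msg, size_t msg_size)
        {
            u8 key [32] = {0}; // placeholder MAC / KDF key
            u8 hash[64];

            // one shot, unkeyed, full 64-byte digest
            crypto_blake2b(hash, msg, msg_size);

            // one shot, keyed, truncated to a 32-byte digest
            crypto_blake2b_general(hash, 32, key, sizeof(key), msg, msg_size);

            // incremental version of the unkeyed call above
            crypto_blake2b_ctx ctx;
            crypto_blake2b_init  (&ctx);
            crypto_blake2b_update(&ctx, msg, msg_size);
            crypto_blake2b_final (&ctx, hash);
        }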
    667 
    668 static void blake2b_vtable_init(void *ctx) {
    669     crypto_blake2b_init(&((crypto_sign_ctx*)ctx)->hash);
    670 }
    671 static void blake2b_vtable_update(void *ctx, const u8 *m, size_t s) {
    672     crypto_blake2b_update(&((crypto_sign_ctx*)ctx)->hash, m, s);
    673 }
    674 static void blake2b_vtable_final(void *ctx, u8 *h) {
    675     crypto_blake2b_final(&((crypto_sign_ctx*)ctx)->hash, h);
    676 }
    677 const crypto_sign_vtable crypto_blake2b_vtable = {
    678     crypto_blake2b,
    679     blake2b_vtable_init,
    680     blake2b_vtable_update,
    681     blake2b_vtable_final,
    682     sizeof(crypto_sign_ctx),
    683 };
    684 
    685 ////////////////
    686 /// Argon2 i ///
    687 ////////////////
    688 // references to R, Z, Q etc. come from the spec
    689 
    690 // Argon2 operates on 1024 byte blocks.
    691 typedef struct { u64 a[128]; } block;
    692 
    693 static void wipe_block(block *b)
    694 {
    695     volatile u64* a = b->a;
    696     ZERO(a, 128);
    697 }
    698 
    699 // updates a Blake2 hash with a 32 bit word, little endian.
    700 static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
    701 {
    702     u8 buf[4];
    703     store32_le(buf, input);
    704     crypto_blake2b_update(ctx, buf, 4);
    705     WIPE_BUFFER(buf);
    706 }
    707 
    708 static void load_block(block *b, const u8 bytes[1024])
    709 {
    710     load64_le_buf(b->a, bytes, 128);
    711 }
    712 
    713 static void store_block(u8 bytes[1024], const block *b)
    714 {
    715     store64_le_buf(bytes, b->a, 128);
    716 }
    717 
    718 static void copy_block(block *o,const block*in){FOR(i,0,128)o->a[i] = in->a[i];}
    719 static void  xor_block(block *o,const block*in){FOR(i,0,128)o->a[i]^= in->a[i];}
    720 
    721 // Hash with a virtually unlimited digest size.
    722 // Doesn't extract more entropy than the base hash function.
    723 // Mainly used for filling a whole kilobyte block with pseudo-random bytes.
    724 // (One could use a stream cipher with a seed hash as the key, but
    725 //  this would introduce another dependency —and point of failure.)
    726 static void extended_hash(u8       *digest, u32 digest_size,
    727                           const u8 *input , u32 input_size)
    728 {
    729     crypto_blake2b_ctx ctx;
    730     crypto_blake2b_general_init(&ctx, MIN(digest_size, 64), 0, 0);
    731     blake_update_32            (&ctx, digest_size);
    732     crypto_blake2b_update      (&ctx, input, input_size);
    733     crypto_blake2b_final       (&ctx, digest);
    734 
    735     if (digest_size > 64) {
    736         // the conversion to u64 avoids integer overflow on
    737         // ludicrously big hash sizes.
    738         u32 r   = (u32)(((u64)digest_size + 31) >> 5) - 2;
    739         u32 i   =  1;
    740         u32 in  =  0;
    741         u32 out = 32;
    742         while (i < r) {
    743             // Input and output overlap. This is intentional
    744             crypto_blake2b(digest + out, digest + in, 64);
    745             i   +=  1;
    746             in  += 32;
    747             out += 32;
    748         }
    749         crypto_blake2b_general(digest + out, digest_size - (32 * r),
    750                                0, 0, // no key
    751                                digest + in , 64);
    752     }
    753 }
    754 
    755 #define LSB(x) ((x) & 0xffffffff)
    756 #define G(a, b, c, d)                                            \
    757     a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 32);   \
    758     c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 24);   \
    759     a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 16);   \
    760     c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 63)
    761 #define ROUND(v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,    \
    762               v8,  v9, v10, v11, v12, v13, v14, v15)    \
    763     G(v0, v4,  v8, v12);  G(v1, v5,  v9, v13);          \
    764     G(v2, v6, v10, v14);  G(v3, v7, v11, v15);          \
    765     G(v0, v5, v10, v15);  G(v1, v6, v11, v12);          \
    766     G(v2, v7,  v8, v13);  G(v3, v4,  v9, v14)
    767 
    768 // Core of the compression function G.  Computes Z from R in place.
    769 static void g_rounds(block *work_block)
    770 {
    771     // column rounds (work_block = Q)
    772     for (int i = 0; i < 128; i += 16) {
    773         ROUND(work_block->a[i     ], work_block->a[i +  1],
    774               work_block->a[i +  2], work_block->a[i +  3],
    775               work_block->a[i +  4], work_block->a[i +  5],
    776               work_block->a[i +  6], work_block->a[i +  7],
    777               work_block->a[i +  8], work_block->a[i +  9],
    778               work_block->a[i + 10], work_block->a[i + 11],
    779               work_block->a[i + 12], work_block->a[i + 13],
    780               work_block->a[i + 14], work_block->a[i + 15]);
    781     }
    782     // row rounds (work_block = Z)
    783     for (int i = 0; i < 16; i += 2) {
    784         ROUND(work_block->a[i      ], work_block->a[i +   1],
    785               work_block->a[i +  16], work_block->a[i +  17],
    786               work_block->a[i +  32], work_block->a[i +  33],
    787               work_block->a[i +  48], work_block->a[i +  49],
    788               work_block->a[i +  64], work_block->a[i +  65],
    789               work_block->a[i +  80], work_block->a[i +  81],
    790               work_block->a[i +  96], work_block->a[i +  97],
    791               work_block->a[i + 112], work_block->a[i + 113]);
    792     }
    793 }
    794 
    795 // The compression function G (copy version for the first pass)
    796 static void g_copy(block *result, const block *x, const block *y, block* tmp)
    797 {
    798     copy_block(tmp   , x  ); // tmp    = X
    799     xor_block (tmp   , y  ); // tmp    = X ^ Y = R
    800     copy_block(result, tmp); // result = R         (only difference with g_xor)
    801     g_rounds  (tmp);         // tmp    = Z
    802     xor_block (result, tmp); // result = R ^ Z
    803 }
    804 
    805 // The compression function G (xor version for subsequent passes)
    806 static void g_xor(block *result, const block *x, const block *y, block *tmp)
    807 {
    808     copy_block(tmp   , x  ); // tmp    = X
    809     xor_block (tmp   , y  ); // tmp    = X ^ Y = R
    810     xor_block (result, tmp); // result = R ^ old   (only difference with g_copy)
    811     g_rounds  (tmp);         // tmp    = Z
    812     xor_block (result, tmp); // result = R ^ old ^ Z
    813 }
    814 
    815 // Unary version of the compression function.
    816 // The missing argument is implied zero.
    817 // Does the transformation in place.
    818 static void unary_g(block *work_block, block *tmp)
    819 {
    820     // work_block == R
    821     copy_block(tmp, work_block); // tmp        = R
    822     g_rounds  (work_block);      // work_block = Z
    823     xor_block (work_block, tmp); // work_block = Z ^ R
    824 }
    825 
    826 // Argon2i uses a kind of stream cipher to determine which reference
    827 // block it will take to synthesise the next block.  This context holds
    828 // that stream's state.  (It's very similar to Chacha20.  The block b
    829 // is analogous to Chacha's own pool)
    830 typedef struct {
    831     block b;
    832     u32 pass_number;
    833     u32 slice_number;
    834     u32 nb_blocks;
    835     u32 nb_iterations;
    836     u32 ctr;
    837     u32 offset;
    838 } gidx_ctx;
    839 
    840 // The block in the context will determine array indices. To avoid
    841 // timing attacks, it only depends on public information.  No looking
    842 // at a previous block to seed the next.  This makes offline attacks
    843 // easier, but timing attacks are the bigger threat in many settings.
    844 static void gidx_refresh(gidx_ctx *ctx)
    845 {
    846     // seed the beginning of the block...
    847     ctx->b.a[0] = ctx->pass_number;
    848     ctx->b.a[1] = 0;  // lane number (we have only one)
    849     ctx->b.a[2] = ctx->slice_number;
    850     ctx->b.a[3] = ctx->nb_blocks;
    851     ctx->b.a[4] = ctx->nb_iterations;
    852     ctx->b.a[5] = 1;  // type: Argon2i
    853     ctx->b.a[6] = ctx->ctr;
    854     ZERO(ctx->b.a + 7, 121); // ...then zero the rest out
    855 
    856     // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
    857     // (G "square" function), to get cheap pseudo-random numbers.
    858     block tmp;
    859     unary_g(&ctx->b, &tmp);
    860     unary_g(&ctx->b, &tmp);
    861     wipe_block(&tmp);
    862 }
    863 
    864 static void gidx_init(gidx_ctx *ctx,
    865                       u32 pass_number, u32 slice_number,
    866                       u32 nb_blocks,   u32 nb_iterations)
    867 {
    868     ctx->pass_number   = pass_number;
    869     ctx->slice_number  = slice_number;
    870     ctx->nb_blocks     = nb_blocks;
    871     ctx->nb_iterations = nb_iterations;
    872     ctx->ctr           = 0;
    873 
    874     // Offset from the beginning of the segment.  For the first slice
    875     // of the first pass, we start at the *third* block, so the offset
    876     // starts at 2, not 0.
    877     if (pass_number != 0 || slice_number != 0) {
    878         ctx->offset = 0;
    879     } else {
    880         ctx->offset = 2;
    881         ctx->ctr++;         // Compensates for missed lazy creation
    882         gidx_refresh(ctx);  // at the start of gidx_next()
    883     }
    884 }
    885 
    886 static u32 gidx_next(gidx_ctx *ctx)
    887 {
    888     // lazily creates the offset block we need
    889     if ((ctx->offset & 127) == 0) {
    890         ctx->ctr++;
    891         gidx_refresh(ctx);
    892     }
    893     u32 index  = ctx->offset & 127; // save index  for current call
    894     u32 offset = ctx->offset;       // save offset for current call
    895     ctx->offset++;                  // update offset for next call
    896 
    897     // Computes the area size.
    898     // Pass 0 : all already finished segments plus already constructed
    899     //          blocks in this segment
    900     // Pass 1+: 3 last segments plus already constructed
    901     //          blocks in this segment.  THE SPEC SUGGESTS OTHERWISE.
    902     //          I CONFORM TO THE REFERENCE IMPLEMENTATION.
    903     int first_pass  = ctx->pass_number == 0;
    904     u32 slice_size  = ctx->nb_blocks >> 2;
    905     u32 nb_segments = first_pass ? ctx->slice_number : 3;
    906     u32 area_size   = nb_segments * slice_size + offset - 1;
    907 
    908     // Computes the starting position of the reference area.
    909     // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
    910     // NEXT SEGMENT, NOT THE NEXT BLOCK.
    911     u32 next_slice = ((ctx->slice_number + 1) & 3) * slice_size;
    912     u32 start_pos  = first_pass ? 0 : next_slice;
    913 
    914     // Generate offset from J1 (no need for J2, there's only one lane)
    915     u64 j1  = ctx->b.a[index] & 0xffffffff; // pseudo-random number
    916     u64 x   = (j1 * j1)       >> 32;
    917     u64 y   = (area_size * x) >> 32;
    918     u64 z   = (area_size - 1) - y;
    919     u64 ref = start_pos + z;                // ref < 2 * nb_blocks
    920     return (u32)(ref < ctx->nb_blocks ? ref : ref - ctx->nb_blocks);
    921 }
    922 
    923 // Main algorithm
    924 void crypto_argon2i_general(u8       *hash,      u32 hash_size,
    925                             void     *work_area, u32 nb_blocks,
    926                             u32 nb_iterations,
    927                             const u8 *password,  u32 password_size,
    928                             const u8 *salt,      u32 salt_size,
    929                             const u8 *key,       u32 key_size,
    930                             const u8 *ad,        u32 ad_size)
    931 {
    932     // work area seen as blocks (must be suitably aligned)
    933     block *blocks = (block*)work_area;
    934     {
    935         crypto_blake2b_ctx ctx;
    936         crypto_blake2b_init(&ctx);
    937 
    938         blake_update_32      (&ctx, 1            ); // p: number of threads
    939         blake_update_32      (&ctx, hash_size    );
    940         blake_update_32      (&ctx, nb_blocks    );
    941         blake_update_32      (&ctx, nb_iterations);
    942         blake_update_32      (&ctx, 0x13         ); // v: version number
    943         blake_update_32      (&ctx, 1            ); // y: Argon2i
    944         blake_update_32      (&ctx,           password_size);
    945         crypto_blake2b_update(&ctx, password, password_size);
    946         blake_update_32      (&ctx,           salt_size);
    947         crypto_blake2b_update(&ctx, salt,     salt_size);
    948         blake_update_32      (&ctx,           key_size);
    949         crypto_blake2b_update(&ctx, key,      key_size);
    950         blake_update_32      (&ctx,           ad_size);
    951         crypto_blake2b_update(&ctx, ad,       ad_size);
    952 
    953         u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
    954         crypto_blake2b_final(&ctx, initial_hash);
    955 
    956         // fill first 2 blocks
    957         block tmp_block;
    958         u8    hash_area[1024];
    959         store32_le(initial_hash + 64, 0); // first  additional word
    960         store32_le(initial_hash + 68, 0); // second additional word
    961         extended_hash(hash_area, 1024, initial_hash, 72);
    962         load_block(&tmp_block, hash_area);
    963         copy_block(blocks, &tmp_block);
    964 
    965         store32_le(initial_hash + 64, 1); // slight modification
    966         extended_hash(hash_area, 1024, initial_hash, 72);
    967         load_block(&tmp_block, hash_area);
    968         copy_block(blocks + 1, &tmp_block);
    969 
    970         WIPE_BUFFER(initial_hash);
    971         WIPE_BUFFER(hash_area);
    972         wipe_block(&tmp_block);
    973     }
    974 
    975     // Actual number of blocks
    976     nb_blocks -= nb_blocks & 3; // round down to a multiple of 4*p (p == 1 thread)
    977     const u32 segment_size = nb_blocks >> 2;
    978 
    979     // fill (then re-fill) the rest of the blocks
    980     block tmp;
    981     gidx_ctx ctx; // public information, no need to wipe
    982     FOR_T (u32, pass_number, 0, nb_iterations) {
    983         int first_pass = pass_number == 0;
    984 
    985         FOR_T (u32, segment, 0, 4) {
    986             gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);
    987 
    988             // On the first segment of the first pass,
    989             // blocks 0 and 1 are already filled.
    990             // We use the offset to skip them.
    991             u32 start_offset  = first_pass && segment == 0 ? 2 : 0;
    992             u32 segment_start = segment * segment_size + start_offset;
    993             u32 segment_end   = (segment + 1) * segment_size;
    994             FOR_T (u32, current_block, segment_start, segment_end) {
    995                 u32 reference_block = gidx_next(&ctx);
    996                 u32 previous_block  = current_block == 0
    997                                     ? nb_blocks - 1
    998                                     : current_block - 1;
    999                 block *c = blocks + current_block;
   1000                 block *p = blocks + previous_block;
   1001                 block *r = blocks + reference_block;
   1002                 if (first_pass) { g_copy(c, p, r, &tmp); }
   1003                 else            { g_xor (c, p, r, &tmp); }
   1004             }
   1005         }
   1006     }
   1007     wipe_block(&tmp);
   1008     u8 final_block[1024];
   1009     store_block(final_block, blocks + (nb_blocks - 1));
   1010 
   1011     // wipe work area
   1012     volatile u64 *p = (u64*)work_area;
   1013     ZERO(p, 128 * nb_blocks);
   1014 
   1015     // hash the very last block with H' into the output hash
   1016     extended_hash(hash, hash_size, final_block, 1024);
   1017     WIPE_BUFFER(final_block);
   1018 }
   1019 
   1020 void crypto_argon2i(u8   *hash,      u32 hash_size,
   1021                     void *work_area, u32 nb_blocks, u32 nb_iterations,
   1022                     const u8 *password,  u32 password_size,
   1023                     const u8 *salt,      u32 salt_size)
   1024 {
   1025     crypto_argon2i_general(hash, hash_size, work_area, nb_blocks, nb_iterations,
   1026                            password, password_size, salt , salt_size, 0,0,0,0);
   1027 }
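        // Example sketch (editor addition, not part of the original Monocypher
        // source): deriving a 32-byte key from a password.  The work area must
        // hold nb_blocks * 1024 bytes and be aligned for 64-bit words; a static
        // u64 buffer avoids malloc() here.  Real code should raise nb_blocks
        // (memory cost) and nb_iterations (time cost) as high as it can afford,
        // and the salt must be random and at least 8 bytes long.
        static void example_argon2i(const u8 *password, u32 password_size)
        {
            static u64 work_area[100 * 1024 / 8]; // 100 blocks = 100 KiB
            u8 salt[16] = {0};                    // placeholder, must be random
            u8 key [32];
            crypto_argon2i(key, 32, work_area, 100, 3,
                           password, password_size, salt, sizeof(salt));
            // ...use key, then wipe it
            crypto_wipe(key, 32);
        }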
   1028 
   1029 ////////////////////////////////////
   1030 /// Arithmetic modulo 2^255 - 19 ///
   1031 ////////////////////////////////////
   1032 //  Originally taken from SUPERCOP's ref10 implementation.
   1033 //  A bit bigger than TweetNaCl, over 4 times faster.
   1034 
   1035 // field element
   1036 typedef i32 fe[10];
   1037 
   1038 // field constants
   1039 //
   1040 // fe_one      : 1
   1041 // sqrtm1      : sqrt(-1)
   1042 // d           :     -121665 / 121666
   1043 // D2          : 2 * -121665 / 121666
   1044 // lop_x, lop_y: low order point in Edwards coordinates
   1045 // ufactor     : -sqrt(-1) * 2
   1046 // A2          : 486662^2  (A squared)
   1047 static const fe fe_one  = {1};
   1048 static const fe sqrtm1  = {-32595792, -7943725, 9377950, 3500415, 12389472,
   1049                            -272473, -25146209, -2005654, 326686, 11406482,};
   1050 static const fe d       = {-10913610, 13857413, -15372611, 6949391, 114729,
   1051                            -8787816, -6275908, -3247719, -18696448, -12055116,};
   1052 static const fe D2      = {-21827239, -5839606, -30745221, 13898782, 229458,
   1053                            15978800, -12551817, -6495438, 29715968, 9444199,};
   1054 static const fe lop_x   = {21352778, 5345713, 4660180, -8347857, 24143090,
   1055                            14568123, 30185756, -12247770, -33528939, 8345319,};
   1056 static const fe lop_y   = {-6952922, -1265500, 6862341, -7057498, -4037696,
   1057                            -5447722, 31680899, -15325402, -19365852, 1569102,};
   1058 static const fe ufactor = {-1917299, 15887451, -18755900, -7000830, -24778944,
   1059                            544946, -16816446, 4011309, -653372, 10741468,};
   1060 static const fe A2      = {12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0,};
   1061 
   1062 static void fe_0(fe h) {           ZERO(h  , 10); }
   1063 static void fe_1(fe h) { h[0] = 1; ZERO(h+1,  9); }
   1064 
   1065 static void fe_copy(fe h,const fe f           ){FOR(i,0,10) h[i] =  f[i];      }
   1066 static void fe_neg (fe h,const fe f           ){FOR(i,0,10) h[i] = -f[i];      }
   1067 static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
   1068 static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}
   1069 
   1070 static void fe_cswap(fe f, fe g, int b)
   1071 {
   1072     i32 mask = -b; // -1 = 0xffffffff
   1073     FOR (i, 0, 10) {
   1074         i32 x = (f[i] ^ g[i]) & mask;
   1075         f[i] = f[i] ^ x;
   1076         g[i] = g[i] ^ x;
   1077     }
   1078 }
   1079 
   1080 static void fe_ccopy(fe f, const fe g, int b)
   1081 {
   1082     i32 mask = -b; // -1 = 0xffffffff
   1083     FOR (i, 0, 10) {
   1084         i32 x = (f[i] ^ g[i]) & mask;
   1085         f[i] = f[i] ^ x;
   1086     }
   1087 }
   1088 
   1089 
   1090 // Signed carry propagation
   1091 // ------------------------
   1092 //
   1093 // Let t be a number.  It can be uniquely decomposed thus:
   1094 //
   1095 //    t = h*2^26 + l
   1096 //    such that -2^25 <= l < 2^25
   1097 //
   1098 // Let c = (t + 2^25) / 2^26            (rounded down)
   1099 //     c = (h*2^26 + l + 2^25) / 2^26   (rounded down)
   1100 //     c =  h   +   (l + 2^25) / 2^26   (rounded down)
   1101 //     c =  h                           (exactly)
   1102 // Because 0 <= l + 2^25 < 2^26
   1103 //
   1104 // Let u = t          - c*2^26
   1105 //     u = h*2^26 + l - h*2^26
   1106 //     u = l
   1107 // Therefore, -2^25 <= u < 2^25
   1108 //
   1109 // Additionally, if |t| < x, then |h| < x/2^26 (rounded down)
   1110 //
   1111 // Notations:
   1112 // - In C, 1<<25 means 2^25.
   1113 // - In C, x>>25 means floor(x / (2^25)).
   1114 // - All of the above applies with 25 & 24 as well as 26 & 25.
   1115 //
   1116 //
   1117 // Note on negative right shifts
   1118 // -----------------------------
   1119 //
   1120 // In C, x >> n, where x is a negative integer, is implementation
   1121 // defined.  In practice, all platforms do arithmetic shift, which is
   1122 // equivalent to division by 2^26, rounded down.  Some compilers, like
   1123 // GCC, even guarantee it.
   1124 //
   1125 // If we ever stumble upon a platform that does not propagate the sign
   1126 // bit (we won't), visible failures will show at the slightest test, and
   1127 // the signed shifts can be replaced by the following:
   1128 //
   1129 //     typedef struct { i64 x:39; } s25;
   1130 //     typedef struct { i64 x:38; } s26;
   1131 //     i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
   1132 //     i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
   1133 //
   1134 // Current compilers cannot optimise this, causing a 30% drop in
   1135 // performance.  Fairly expensive for something that never happens.
   1136 //
   1137 //
   1138 // Precondition
   1139 // ------------
   1140 //
   1141 // |t0|       < 2^63
   1142 // |t1|..|t9| < 2^62
   1143 //
   1144 // Algorithm
   1145 // ---------
   1146 // c   = t0 + 2^25 / 2^26   -- |c|  <= 2^36
   1147 // t0 -= c * 2^26           -- |t0| <= 2^25
   1148 // t1 += c                  -- |t1| <= 2^63
   1149 //
   1150 // c   = t4 + 2^25 / 2^26   -- |c|  <= 2^36
   1151 // t4 -= c * 2^26           -- |t4| <= 2^25
   1152 // t5 += c                  -- |t5| <= 2^63
   1153 //
   1154 // c   = t1 + 2^24 / 2^25   -- |c|  <= 2^38
   1155 // t1 -= c * 2^25           -- |t1| <= 2^24
   1156 // t2 += c                  -- |t2| <= 2^63
   1157 //
   1158 // c   = t5 + 2^24 / 2^25   -- |c|  <= 2^38
   1159 // t5 -= c * 2^25           -- |t5| <= 2^24
   1160 // t6 += c                  -- |t6| <= 2^63
   1161 //
   1162 // c   = t2 + 2^25 / 2^26   -- |c|  <= 2^37
   1163 // t2 -= c * 2^26           -- |t2| <= 2^25        < 1.1 * 2^25  (final t2)
   1164 // t3 += c                  -- |t3| <= 2^63
   1165 //
   1166 // c   = t6 + 2^25 / 2^26   -- |c|  <= 2^37
   1167 // t6 -= c * 2^26           -- |t6| <= 2^25        < 1.1 * 2^25  (final t6)
   1168 // t7 += c                  -- |t7| <= 2^63
   1169 //
   1170 // c   = t3 + 2^24 / 2^25   -- |c|  <= 2^38
   1171 // t3 -= c * 2^25           -- |t3| <= 2^24        < 1.1 * 2^24  (final t3)
   1172 // t4 += c                  -- |t4| <= 2^25 + 2^38 < 2^39
   1173 //
   1174 // c   = t7 + 2^24 / 2^25   -- |c|  <= 2^38
   1175 // t7 -= c * 2^25           -- |t7| <= 2^24        < 1.1 * 2^24  (final t7)
   1176 // t8 += c                  -- |t8| <= 2^63
   1177 //
   1178 // c   = t4 + 2^25 / 2^26   -- |c|  <= 2^13
   1179 // t4 -= c * 2^26           -- |t4| <= 2^25        < 1.1 * 2^25  (final t4)
   1180 // t5 += c                  -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24  (final t5)
   1181 //
   1182 // c   = t8 + 2^25 / 2^26   -- |c|  <= 2^37
   1183 // t8 -= c * 2^26           -- |t8| <= 2^25        < 1.1 * 2^25  (final t8)
   1184 // t9 += c                  -- |t9| <= 2^63
   1185 //
   1186 // c   = t9 + 2^24 / 2^25   -- |c|  <= 2^38
   1187 // t9 -= c * 2^25           -- |t9| <= 2^24        < 1.1 * 2^24  (final t9)
   1188 // t0 += c * 19             -- |t0| <= 2^25 + 2^38*19 < 2^44
   1189 //
   1190 // c   = t0 + 2^25 / 2^26   -- |c|  <= 2^18
   1191 // t0 -= c * 2^26           -- |t0| <= 2^25        < 1.1 * 2^25  (final t0)
   1192 // t1 += c                  -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24  (final t1)
   1193 //
   1194 // Postcondition
   1195 // -------------
   1196 //   |t0|, |t2|, |t4|, |t6|, |t8|  <  1.1 * 2^25
   1197 //   |t1|, |t3|, |t5|, |t7|, |t9|  <  1.1 * 2^24
   1198 #define FE_CARRY                                                        \
   1199     i64 c;                                                              \
   1200     c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
   1201     c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
   1202     c = (t1 + ((i64)1<<24)) >> 25;  t1 -= c * ((i64)1 << 25);  t2 += c; \
   1203     c = (t5 + ((i64)1<<24)) >> 25;  t5 -= c * ((i64)1 << 25);  t6 += c; \
   1204     c = (t2 + ((i64)1<<25)) >> 26;  t2 -= c * ((i64)1 << 26);  t3 += c; \
   1205     c = (t6 + ((i64)1<<25)) >> 26;  t6 -= c * ((i64)1 << 26);  t7 += c; \
   1206     c = (t3 + ((i64)1<<24)) >> 25;  t3 -= c * ((i64)1 << 25);  t4 += c; \
   1207     c = (t7 + ((i64)1<<24)) >> 25;  t7 -= c * ((i64)1 << 25);  t8 += c; \
   1208     c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
   1209     c = (t8 + ((i64)1<<25)) >> 26;  t8 -= c * ((i64)1 << 26);  t9 += c; \
   1210     c = (t9 + ((i64)1<<24)) >> 25;  t9 -= c * ((i64)1 << 25);  t0 += c * 19; \
   1211     c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
   1212     h[0]=(i32)t0;  h[1]=(i32)t1;  h[2]=(i32)t2;  h[3]=(i32)t3;  h[4]=(i32)t4; \
   1213     h[5]=(i32)t5;  h[6]=(i32)t6;  h[7]=(i32)t7;  h[8]=(i32)t8;  h[9]=(i32)t9
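        // Worked example (editor note): if t0 = 3*2^26 + 5, the first line gives
        // c = (t0 + 2^25) >> 26 = 3, so t0 becomes 5 and t1 absorbs the carry 3.
        // Each line of FE_CARRY is one such step; the wrap from t9 into t0 is
        // multiplied by 19 because 2^255 == 19 modulo 2^255 - 19.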
   1214 
   1215 static void fe_frombytes(fe h, const u8 s[32])
   1216 {
   1217     i64 t0 =  load32_le(s);                        // t0 < 2^32
   1218     i64 t1 =  load24_le(s +  4) << 6;              // t1 < 2^30
   1219     i64 t2 =  load24_le(s +  7) << 5;              // t2 < 2^29
   1220     i64 t3 =  load24_le(s + 10) << 3;              // t3 < 2^27
   1221     i64 t4 =  load24_le(s + 13) << 2;              // t4 < 2^26
   1222     i64 t5 =  load32_le(s + 16);                   // t5 < 2^32
   1223     i64 t6 =  load24_le(s + 20) << 7;              // t6 < 2^31
   1224     i64 t7 =  load24_le(s + 23) << 5;              // t7 < 2^29
   1225     i64 t8 =  load24_le(s + 26) << 4;              // t8 < 2^28
   1226     i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;  // t9 < 2^25
   1227     FE_CARRY;                                      // Carry precondition OK
   1228 }
   1229 
   1230 // Precondition
   1231 //   |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]|  <  1.1 * 2^25
   1232 //   |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]|  <  1.1 * 2^24
   1233 //
   1234 // Therefore, |h| < 2^255-19
   1235 // There are two possibilities:
   1236 //
   1237 // - If h is positive, all we need to do is reduce its individual
   1238 //   limbs down to their tight positive range.
   1239 // - If h is negative, we also need to add 2^255-19 to it.
   1240 //   Or just remove 19 and chop off any excess bit.
   1241 static void fe_tobytes(u8 s[32], const fe h)
   1242 {
   1243     i32 t[10];
   1244     COPY(t, h, 10);
   1245     i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
   1246     //                 |t9|                    < 1.1 * 2^24
   1247     //  -1.1 * 2^24  <  t9                     < 1.1 * 2^24
   1248     //  -21  * 2^24  <  19 * t9                < 21  * 2^24
   1249     //  -2^29        <  19 * t9 + 2^24         < 2^29
   1250     //  -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
   1251     //  -16          < (19 * t9 + 2^24) / 2^25 < 16
   1252     FOR (i, 0, 5) {
   1253         q += t[2*i  ]; q >>= 26; // q = 0 or -1
   1254         q += t[2*i+1]; q >>= 25; // q = 0 or -1
   1255     }
   1256     // q =  0 iff h >= 0
   1257     // q = -1 iff h <  0
   1258     // Adding q * 19 to h reduces h to its proper range.
   1259     q *= 19;  // Shift carry back to the beginning
   1260     FOR (i, 0, 5) {
   1261         t[i*2  ] += q;  q = t[i*2  ] >> 26;  t[i*2  ] -= q * ((i32)1 << 26);
   1262         t[i*2+1] += q;  q = t[i*2+1] >> 25;  t[i*2+1] -= q * ((i32)1 << 25);
   1263     }
   1264     // h is now fully reduced, and q represents the excess bit.
   1265 
   1266     store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
   1267     store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
   1268     store32_le(s +  8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
   1269     store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] <<  6));
   1270     store32_le(s + 16, ((u32)t[5] >>  0) | ((u32)t[6] << 25));
   1271     store32_le(s + 20, ((u32)t[6] >>  7) | ((u32)t[7] << 19));
   1272     store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
   1273     store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] <<  6));
   1274 
   1275     WIPE_BUFFER(t);
   1276 }
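         // For reference, the limb layout used by fe_frombytes() and
         // fe_tobytes() above is the usual ref10 "radix 2^25.5" packing:
         // limbs alternate between 26 and 25 bits, so a field element is
         //     h = t0 + t1*2^26 + t2*2^51 + t3*2^77 + t4*2^102 + t5*2^128
         //           + t6*2^153 + t7*2^179 + t8*2^204 + t9*2^230
         // which is where the shift counts in the load/store calls above
         // come from.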
   1277 
   1278 // Precondition
   1279 // -------------
   1280 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
   1281 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
   1282 //
    1283 //   g is a plain 32-bit integer (not a field element), so the only
    1284 //   requirement on it is |g| < 2^31.
   1285 static void fe_mul_small(fe h, const fe f, i32 g)
   1286 {
   1287     i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
   1288     i64 t2 = f[2] * (i64) g;  i64 t3 = f[3] * (i64) g;
   1289     i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
   1290     i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
   1291     i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
   1292     // |t0|, |t2|, |t4|, |t6|, |t8|  <  1.65 * 2^26 * 2^31  < 2^58
   1293     // |t1|, |t3|, |t5|, |t7|, |t9|  <  1.65 * 2^25 * 2^31  < 2^57
   1294 
   1295     FE_CARRY; // Carry precondition OK
   1296 }
   1297 
   1298 // Precondition
   1299 // -------------
   1300 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
   1301 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
   1302 //
   1303 //   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
   1304 //   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
   1305 static void fe_mul(fe h, const fe f, const fe g)
   1306 {
   1307     // Everything is unrolled and put in temporary variables.
   1308     // We could roll the loop, but that would make curve25519 twice as slow.
   1309     i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
   1310     i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
   1311     i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
   1312     i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
   1313     i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
   1314     i32 G1 = g1*19;  i32 G2 = g2*19;  i32 G3 = g3*19;
   1315     i32 G4 = g4*19;  i32 G5 = g5*19;  i32 G6 = g6*19;
   1316     i32 G7 = g7*19;  i32 G8 = g8*19;  i32 G9 = g9*19;
   1317     // |F1|, |F3|, |F5|, |F7|, |F9|  <  1.65 * 2^26
   1318     // |G0|, |G2|, |G4|, |G6|, |G8|  <  2^31
   1319     // |G1|, |G3|, |G5|, |G7|, |G9|  <  2^30
   1320 
   1321     i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
   1322         +    F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
   1323     i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
   1324         +    f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
   1325     i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
   1326         +    F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
   1327     i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
   1328         +    f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
   1329     i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
   1330         +    F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
   1331     i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
   1332         +    f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
   1333     i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
   1334         +    F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
   1335     i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
   1336         +    f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
   1337     i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
   1338         +    F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
   1339     i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
   1340         +    f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
   1341     // t0 < 0.67 * 2^61
   1342     // t1 < 0.41 * 2^61
   1343     // t2 < 0.52 * 2^61
   1344     // t3 < 0.32 * 2^61
   1345     // t4 < 0.38 * 2^61
   1346     // t5 < 0.22 * 2^61
   1347     // t6 < 0.23 * 2^61
   1348     // t7 < 0.13 * 2^61
   1349     // t8 < 0.09 * 2^61
   1350     // t9 < 0.03 * 2^61
   1351 
   1352     FE_CARRY; // Everything below 2^62, Carry precondition OK
   1353 }
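         // A worked note on the constants above: limb i has weight
         // 2^ceil(25.5*i).  The product of two odd-indexed limbs carries
         // one extra factor of 2 relative to its target limb, hence the
         // pre-doubled F1, F3, F5, F7, F9.  And since 2^255 = 19 (mod p),
         // any product that would land beyond limb 9 wraps around with a
         // factor of 19, hence G1..G9 = g1..g9 * 19.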
   1354 
   1355 // Precondition
   1356 // -------------
   1357 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
   1358 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
   1359 //
   1360 // Note: we could use fe_mul() for this, but this is significantly faster
   1361 static void fe_sq(fe h, const fe f)
   1362 {
   1363     i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
   1364     i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
   1365     i32 f0_2  = f0*2;   i32 f1_2  = f1*2;   i32 f2_2  = f2*2;   i32 f3_2 = f3*2;
   1366     i32 f4_2  = f4*2;   i32 f5_2  = f5*2;   i32 f6_2  = f6*2;   i32 f7_2 = f7*2;
   1367     i32 f5_38 = f5*38;  i32 f6_19 = f6*19;  i32 f7_38 = f7*38;
   1368     i32 f8_19 = f8*19;  i32 f9_38 = f9*38;
   1369     // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2|  <  1.65 * 2^27
   1370     // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2|  <  1.65 * 2^26
   1371     // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| <  2^31
   1372 
   1373     i64 t0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
   1374         +    f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
   1375     i64 t1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19
   1376         +    f4  *(i64)f7_38 + f5_2*(i64)f6_19;
   1377     i64 t2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38
   1378         +    f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
   1379     i64 t3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38
   1380         +    f5_2*(i64)f8_19 + f6  *(i64)f7_38;
   1381     i64 t4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2
   1382         +    f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
   1383     i64 t5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3
   1384         +    f6  *(i64)f9_38 + f7_2*(i64)f8_19;
   1385     i64 t6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4
   1386         +    f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
   1387     i64 t7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5
   1388         +    f3_2*(i64)f4    + f8  *(i64)f9_38;
   1389     i64 t8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6
   1390         +    f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
   1391     i64 t9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7
   1392         +    f3_2*(i64)f6    + f4  *(i64)f5_2;
   1393     // t0 < 0.67 * 2^61
   1394     // t1 < 0.41 * 2^61
   1395     // t2 < 0.52 * 2^61
   1396     // t3 < 0.32 * 2^61
   1397     // t4 < 0.38 * 2^61
   1398     // t5 < 0.22 * 2^61
   1399     // t6 < 0.23 * 2^61
   1400     // t7 < 0.13 * 2^61
   1401     // t8 < 0.09 * 2^61
   1402     // t9 < 0.03 * 2^61
   1403 
   1404     FE_CARRY;
   1405 }
   1406 
   1407 // h = 2 * (f^2)
   1408 //
   1409 // Precondition
   1410 // -------------
   1411 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
   1412 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
   1413 //
   1414 // Note: we could implement fe_sq2() by copying fe_sq(), multiplying
   1415 // each limb by 2, *then* perform the carry.  This saves one carry.
   1416 // However, doing so with the stated preconditions does not work (t2
   1417 // would overflow).  There are 3 ways to solve this:
   1418 //
   1419 // 1. Show that t2 actually never overflows (it really does not).
    1420 // 2. Accept an additional carry, at a small loss of performance.
   1421 // 3. Make sure the input of fe_sq2() is freshly carried.
   1422 //
   1423 // SUPERCOP ref10 relies on (1).
   1424 // Monocypher chose (2) and (3), mostly to save code.
   1425 static void fe_sq2(fe h, const fe f)
   1426 {
   1427     fe_sq(h, f);
   1428     fe_mul_small(h, h, 2);
   1429 }
   1430 
   1431 // This could be simplified, but it would be slower
   1432 static void fe_pow22523(fe out, const fe z)
   1433 {
   1434     fe t0, t1, t2;
   1435     fe_sq(t0, z);
   1436     fe_sq(t1,t0);                   fe_sq(t1, t1);  fe_mul(t1, z, t1);
   1437     fe_mul(t0, t0, t1);
   1438     fe_sq(t0, t0);                                  fe_mul(t0, t1, t0);
   1439     fe_sq(t1, t0);  FOR (i, 1,   5) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
   1440     fe_sq(t1, t0);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
   1441     fe_sq(t2, t1);  FOR (i, 1,  20) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
   1442     fe_sq(t1, t1);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
   1443     fe_sq(t1, t0);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
   1444     fe_sq(t2, t1);  FOR (i, 1, 100) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
   1445     fe_sq(t1, t1);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
   1446     fe_sq(t0, t0);  FOR (i, 1,   2) fe_sq(t0, t0);  fe_mul(out, t0, z);
   1447     WIPE_BUFFER(t0);
   1448     WIPE_BUFFER(t1);
   1449     WIPE_BUFFER(t2);
   1450 }
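         // The chain above computes out = z^(2^252 - 3) = z^((p-5)/8),
         // with p = 2^255 - 19.  See invsqrt() below for why this exponent
         // is the useful one; fe_invert() below reuses the same chain.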
   1451 
    1452 // Inverting means raising to the power p - 2 = 2^255 - 21
    1453 // (by Fermat's little theorem, z^(p-2) = 1/z for non-zero z).
    1454 // 2^255 - 21 = (2^252 - 3) * 8 + 3, so we reuse the chain of fe_pow22523
   1455 static void fe_invert(fe out, const fe z)
   1456 {
   1457     fe tmp;
   1458     fe_pow22523(tmp, z);
    1459     // out = tmp^8 * z^3
   1460     fe_sq(tmp, tmp);                        // 0
   1461     fe_sq(tmp, tmp);  fe_mul(tmp, tmp, z);  // 1
   1462     fe_sq(tmp, tmp);  fe_mul(out, tmp, z);  // 1
   1463     WIPE_BUFFER(tmp);
   1464 }
   1465 
   1466 //  Parity check.  Returns 0 if even, 1 if odd
   1467 static int fe_isodd(const fe f)
   1468 {
   1469     u8 s[32];
   1470     fe_tobytes(s, f);
   1471     u8 isodd = s[0] & 1;
   1472     WIPE_BUFFER(s);
   1473     return isodd;
   1474 }
   1475 
   1476 // Returns 1 if equal, 0 if not equal
   1477 static int fe_isequal(const fe f, const fe g)
   1478 {
   1479     u8 fs[32];
   1480     u8 gs[32];
   1481     fe_tobytes(fs, f);
   1482     fe_tobytes(gs, g);
   1483     int isdifferent = crypto_verify32(fs, gs);
   1484     WIPE_BUFFER(fs);
   1485     WIPE_BUFFER(gs);
   1486     return 1 + isdifferent;
   1487 }
   1488 
   1489 // Inverse square root.
    1490 // Returns true if x is a non-zero square, false otherwise.
    1491 // After the call:
    1492 //   isr = sqrt(1/x)        if x is a non-zero square.
   1493 //   isr = sqrt(sqrt(-1)/x) if x is not a square.
   1494 //   isr = 0                if x is zero.
   1495 // We do not guarantee the sign of the square root.
   1496 //
   1497 // Notes:
   1498 // Let quartic = x^((p-1)/4)
   1499 //
   1500 // x^((p-1)/2) = chi(x)
   1501 // quartic^2   = chi(x)
   1502 // quartic     = sqrt(chi(x))
   1503 // quartic     = 1 or -1 or sqrt(-1) or -sqrt(-1)
   1504 //
   1505 // Note that x is a square if quartic is 1 or -1
   1506 // There are 4 cases to consider:
   1507 //
   1508 // if   quartic         = 1  (x is a square)
   1509 // then x^((p-1)/4)     = 1
   1510 //      x^((p-5)/4) * x = 1
   1511 //      x^((p-5)/4)     = 1/x
   1512 //      x^((p-5)/8)     = sqrt(1/x) or -sqrt(1/x)
   1513 //
   1514 // if   quartic                = -1  (x is a square)
   1515 // then x^((p-1)/4)            = -1
   1516 //      x^((p-5)/4) * x        = -1
   1517 //      x^((p-5)/4)            = -1/x
   1518 //      x^((p-5)/8)            = sqrt(-1)   / sqrt(x)
   1519 //      x^((p-5)/8) * sqrt(-1) = sqrt(-1)^2 / sqrt(x)
   1520 //      x^((p-5)/8) * sqrt(-1) = -1/sqrt(x)
   1521 //      x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x)
   1522 //
   1523 // if   quartic         = sqrt(-1)  (x is not a square)
   1524 // then x^((p-1)/4)     = sqrt(-1)
   1525 //      x^((p-5)/4) * x = sqrt(-1)
   1526 //      x^((p-5)/4)     = sqrt(-1)/x
   1527 //      x^((p-5)/8)     = sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x)
   1528 //
   1529 // Note that the product of two non-squares is always a square:
   1530 //   For any non-squares a and b, chi(a) = -1 and chi(b) = -1.
   1531 //   Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1.
   1532 //   Therefore a*b is a square.
   1533 //
   1534 //   Since sqrt(-1) and x are both non-squares, their product is a
   1535 //   square, and we can compute their square root.
   1536 //
   1537 // if   quartic                = -sqrt(-1)  (x is not a square)
   1538 // then x^((p-1)/4)            = -sqrt(-1)
   1539 //      x^((p-5)/4) * x        = -sqrt(-1)
   1540 //      x^((p-5)/4)            = -sqrt(-1)/x
   1541 //      x^((p-5)/8)            = sqrt(-sqrt(-1)/x)
   1542 //      x^((p-5)/8)            = sqrt( sqrt(-1)/x) * sqrt(-1)
   1543 //      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * sqrt(-1)^2
   1544 //      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * -1
   1545 //      x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x)
   1546 static int invsqrt(fe isr, const fe x)
   1547 {
   1548     fe check, quartic;
   1549     fe_copy(check, x);
   1550     fe_pow22523(isr, check);
   1551     fe_sq (quartic, isr);
   1552     fe_mul(quartic, quartic, check);
   1553     fe_1  (check);          int p1 = fe_isequal(quartic, check);
   1554     fe_neg(check, check );  int m1 = fe_isequal(quartic, check);
   1555     fe_neg(check, sqrtm1);  int ms = fe_isequal(quartic, check);
   1556     fe_mul(check, isr, sqrtm1);
   1557     fe_ccopy(isr, check, m1 | ms);
   1558     WIPE_BUFFER(quartic);
   1559     WIPE_BUFFER(check);
   1560     return p1 | m1;
   1561 }
   1562 
    1563 // trim a scalar for scalar multiplication (clamp: clear the 3 low cofactor bits, clear bit 255, set bit 254)
   1564 static void trim_scalar(u8 scalar[32])
   1565 {
   1566     scalar[ 0] &= 248;
   1567     scalar[31] &= 127;
   1568     scalar[31] |= 64;
   1569 }
   1570 
   1571 // get bit from scalar at position i
   1572 static int scalar_bit(const u8 s[32], int i)
   1573 {
   1574     if (i < 0) { return 0; } // handle -1 for sliding windows
   1575     return (s[i>>3] >> (i&7)) & 1;
   1576 }
   1577 
   1578 ///////////////
   1579 /// X-25519 /// Taken from SUPERCOP's ref10 implementation.
   1580 ///////////////
   1581 static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32],
   1582                        int nb_bits)
   1583 {
    1584     // unpack the point coordinate
   1585     fe x1;
   1586     fe_frombytes(x1, p);
   1587 
   1588     // computes the actual scalar product (the result is in x2 and z2)
   1589     fe x2, z2, x3, z3, t0, t1;
   1590     // Montgomery ladder
   1591     // In projective coordinates, to avoid divisions: x = X / Z
    1592     // We don't care about the y coordinate: it's only 1 bit of information
   1593     fe_1(x2);        fe_0(z2); // "zero" point
   1594     fe_copy(x3, x1); fe_1(z3); // "one"  point
   1595     int swap = 0;
   1596     for (int pos = nb_bits-1; pos >= 0; --pos) {
   1597         // constant time conditional swap before ladder step
   1598         int b = scalar_bit(scalar, pos);
   1599         swap ^= b; // xor trick avoids swapping at the end of the loop
   1600         fe_cswap(x2, x3, swap);
   1601         fe_cswap(z2, z3, swap);
   1602         swap = b;  // anticipates one last swap after the loop
   1603 
   1604         // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
   1605         // with differential addition
   1606         fe_sub(t0, x3, z3);
   1607         fe_sub(t1, x2, z2);
   1608         fe_add(x2, x2, z2);
   1609         fe_add(z2, x3, z3);
   1610         fe_mul(z3, t0, x2);
   1611         fe_mul(z2, z2, t1);
   1612         fe_sq (t0, t1    );
   1613         fe_sq (t1, x2    );
   1614         fe_add(x3, z3, z2);
   1615         fe_sub(z2, z3, z2);
   1616         fe_mul(x2, t1, t0);
   1617         fe_sub(t1, t1, t0);
   1618         fe_sq (z2, z2    );
   1619         fe_mul_small(z3, t1, 121666);
   1620         fe_sq (x3, x3    );
   1621         fe_add(t0, t0, z3);
   1622         fe_mul(z3, x1, z2);
   1623         fe_mul(z2, t1, t0);
   1624     }
   1625     // last swap is necessary to compensate for the xor trick
   1626     // Note: after this swap, P3 == P2 + P1.
   1627     fe_cswap(x2, x3, swap);
   1628     fe_cswap(z2, z3, swap);
   1629 
   1630     // normalises the coordinates: x == X / Z
   1631     fe_invert(z2, z2);
   1632     fe_mul(x2, x2, z2);
   1633     fe_tobytes(q, x2);
   1634 
   1635     WIPE_BUFFER(x1);
   1636     WIPE_BUFFER(x2);  WIPE_BUFFER(z2);  WIPE_BUFFER(t0);
   1637     WIPE_BUFFER(x3);  WIPE_BUFFER(z3);  WIPE_BUFFER(t1);
   1638 }
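         // Ladder invariant (informal): up to the conditional swaps, if k
         // denotes the scalar bits processed so far, then (x2:z2) = [k]P
         // and (x3:z3) = [k+1]P.  Their difference is always the input
         // point P itself, whose x coordinate (x1) is exactly what the
         // differential addition above requires.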
   1639 
   1640 void crypto_x25519(u8       raw_shared_secret[32],
   1641                    const u8 your_secret_key  [32],
   1642                    const u8 their_public_key [32])
   1643 {
   1644     // restrict the possible scalar values
   1645     u8 e[32];
   1646     COPY(e, your_secret_key, 32);
   1647     trim_scalar(e);
   1648     scalarmult(raw_shared_secret, e, their_public_key, 255);
   1649     WIPE_BUFFER(e);
   1650 }
   1651 
   1652 void crypto_x25519_public_key(u8       public_key[32],
   1653                               const u8 secret_key[32])
   1654 {
   1655     static const u8 base_point[32] = {9};
   1656     crypto_x25519(public_key, secret_key, base_point);
   1657 }
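         // Usage sketch (illustrative only; buffer names are hypothetical):
         //
         //     u8 alice_sk[32], alice_pk[32];   // alice_sk: 32 random bytes
         //     u8 bob_sk  [32], bob_pk  [32];   // bob_sk:   32 random bytes
         //     crypto_x25519_public_key(alice_pk, alice_sk);
         //     crypto_x25519_public_key(bob_pk  , bob_sk  );
         //     u8 shared_a[32], shared_b[32];
         //     crypto_x25519(shared_a, alice_sk, bob_pk  );
         //     crypto_x25519(shared_b, bob_sk  , alice_pk);
         //     // shared_a == shared_b: the raw shared secret, normally
         //     // hashed before being used as a symmetric key.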
   1658 
   1659 ///////////////////////////
   1660 /// Arithmetic modulo L ///
   1661 ///////////////////////////
   1662 static const u32 L[8] = {0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
   1663                          0x00000000, 0x00000000, 0x00000000, 0x10000000,};
   1664 
   1665 //  p = a*b + p
   1666 static void multiply(u32 p[16], const u32 a[8], const u32 b[8])
   1667 {
   1668     FOR (i, 0, 8) {
   1669         u64 carry = 0;
   1670         FOR (j, 0, 8) {
   1671             carry  += p[i+j] + (u64)a[i] * b[j];
   1672             p[i+j]  = (u32)carry;
   1673             carry >>= 32;
   1674         }
   1675         p[i+8] = (u32)carry;
   1676     }
   1677 }
   1678 
   1679 static int is_above_l(const u32 x[8])
   1680 {
   1681     // We work with L directly, in a 2's complement encoding
   1682     // (-L == ~L + 1)
   1683     u64 carry = 1;
   1684     FOR (i, 0, 8) {
   1685         carry += (u64)x[i] + ~L[i];
   1686         carry >>= 32;
   1687     }
   1688     return carry;
   1689 }
   1690 
   1691 // Final reduction modulo L, by conditionally removing L.
   1692 // if x < l     , then r = x
    1693 // if l <= x < 2*l, then r = x-l
   1694 // otherwise the result will be wrong
   1695 static void remove_l(u32 r[8], const u32 x[8])
   1696 {
   1697     u64 carry = is_above_l(x);
   1698     u32 mask  = ~(u32)carry + 1; // carry == 0 or 1
   1699     FOR (i, 0, 8) {
   1700         carry += (u64)x[i] + (~L[i] & mask);
   1701         r[i]   = (u32)carry;
   1702         carry >>= 32;
   1703     }
   1704 }
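         // (How the loop above works: the initial carry is the result of
         //  is_above_l(x).  When it is 1, mask is all ones and the loop
         //  adds ~L + 1 = -L in two's complement, i.e. r = x - L.  When it
         //  is 0, mask is zero and the loop simply copies x into r.)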
   1705 
   1706 // Full reduction modulo L (Barrett reduction)
   1707 static void mod_l(u8 reduced[32], const u32 x[16])
   1708 {
   1709     static const u32 r[9] = {0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d,
   1710                              0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,};
   1711     // xr = x * r
   1712     u32 xr[25] = {0};
   1713     FOR (i, 0, 9) {
   1714         u64 carry = 0;
   1715         FOR (j, 0, 16) {
   1716             carry  += xr[i+j] + (u64)r[i] * x[j];
   1717             xr[i+j] = (u32)carry;
   1718             carry >>= 32;
   1719         }
   1720         xr[i+16] = (u32)carry;
   1721     }
   1722     // xr = floor(xr / 2^512) * L
   1723     // Since the result is guaranteed to be below 2*L,
   1724     // it is enough to only compute the first 256 bits.
    1725     // The division by 2^512 is done by simply reading xr[i+16]. (16 * 32 = 512)
   1726     ZERO(xr, 8);
   1727     FOR (i, 0, 8) {
   1728         u64 carry = 0;
   1729         FOR (j, 0, 8-i) {
   1730             carry   += xr[i+j] + (u64)xr[i+16] * L[j];
   1731             xr[i+j] = (u32)carry;
   1732             carry >>= 32;
   1733         }
   1734     }
   1735     // xr = x - xr
   1736     u64 carry = 1;
   1737     FOR (i, 0, 8) {
   1738         carry  += (u64)x[i] + ~xr[i];
   1739         xr[i]   = (u32)carry;
   1740         carry >>= 32;
   1741     }
   1742     // Final reduction modulo L (conditional subtraction)
   1743     remove_l(xr, xr);
   1744     store32_le_buf(reduced, xr, 8);
   1745 
   1746     WIPE_BUFFER(xr);
   1747 }
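         // Barrett reduction in a nutshell: the constant r above is
         // floor(2^512 / L), so q = floor(x*r / 2^512) underestimates
         // floor(x / L) by at most one.  x - q*L is therefore below 2*L,
         // and the single conditional subtraction in remove_l() finishes
         // the reduction.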
   1748 
   1749 static void reduce(u8 r[64])
   1750 {
   1751     u32 x[16];
   1752     load32_le_buf(x, r, 16);
   1753     mod_l(r, x);
   1754     WIPE_BUFFER(x);
   1755 }
   1756 
   1757 // r = (a * b) + c
   1758 static void mul_add(u8 r[32], const u8 a[32], const u8 b[32], const u8 c[32])
   1759 {
   1760     u32 A[8];  load32_le_buf(A, a, 8);
   1761     u32 B[8];  load32_le_buf(B, b, 8);
   1762     u32 p[16];
   1763     load32_le_buf(p, c, 8);
   1764     ZERO(p + 8, 8);
   1765     multiply(p, A, B);
   1766     mod_l(r, p);
   1767     WIPE_BUFFER(p);
   1768     WIPE_BUFFER(A);
   1769     WIPE_BUFFER(B);
   1770 }
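         // (This is the workhorse of the signing code below:
         //  crypto_sign_final() uses it to compute s = h_ram * a + r mod L.)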
   1771 
   1772 ///////////////
   1773 /// Ed25519 ///
   1774 ///////////////
   1775 
   1776 // Point (group element, ge) in a twisted Edwards curve,
   1777 // in extended projective coordinates.
   1778 // ge        : x  = X/Z, y  = Y/Z, T  = XY/Z
   1779 // ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2
   1780 // ge_precomp: Z  = 1
   1781 typedef struct { fe X;  fe Y;  fe Z; fe T;  } ge;
   1782 typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached;
   1783 typedef struct { fe Yp; fe Ym;       fe T2; } ge_precomp;
   1784 
   1785 static void ge_zero(ge *p)
   1786 {
   1787     fe_0(p->X);
   1788     fe_1(p->Y);
   1789     fe_1(p->Z);
   1790     fe_0(p->T);
   1791 }
   1792 
   1793 static void ge_tobytes(u8 s[32], const ge *h)
   1794 {
   1795     fe recip, x, y;
   1796     fe_invert(recip, h->Z);
   1797     fe_mul(x, h->X, recip);
   1798     fe_mul(y, h->Y, recip);
   1799     fe_tobytes(s, y);
   1800     s[31] ^= fe_isodd(x) << 7;
   1801 
   1802     WIPE_BUFFER(recip);
   1803     WIPE_BUFFER(x);
   1804     WIPE_BUFFER(y);
   1805 }
   1806 
   1807 // h = s, where s is a point encoded in 32 bytes
   1808 //
   1809 // Variable time!  Inputs must not be secret!
   1810 // => Use only to *check* signatures.
   1811 //
   1812 // From the specifications:
   1813 //   The encoding of s contains y and the sign of x
   1814 //   x = sqrt((y^2 - 1) / (d*y^2 + 1))
   1815 // In extended coordinates:
   1816 //   X = x, Y = y, Z = 1, T = x*y
   1817 //
   1818 //    Note that num * den is a square iff num / den is a square
   1819 //    If num * den is not a square, the point was not on the curve.
   1820 // From the above:
   1821 //   Let num =   y^2 - 1
   1822 //   Let den = d*y^2 + 1
   1823 //   x = sqrt((y^2 - 1) / (d*y^2 + 1))
   1824 //   x = sqrt(num / den)
   1825 //   x = sqrt(num^2 / (num * den))
   1826 //   x = num * sqrt(1 / (num * den))
   1827 //
   1828 // Therefore, we can just compute:
   1829 //   num =   y^2 - 1
   1830 //   den = d*y^2 + 1
   1831 //   isr = invsqrt(num * den)  // abort if not square
   1832 //   x   = num * isr
   1833 // Finally, negate x if its sign is not as specified.
   1834 static int ge_frombytes_vartime(ge *h, const u8 s[32])
   1835 {
   1836     fe_frombytes(h->Y, s);
   1837     fe_1(h->Z);
   1838     fe_sq (h->T, h->Y);        // t =   y^2
   1839     fe_mul(h->X, h->T, d   );  // x = d*y^2
   1840     fe_sub(h->T, h->T, h->Z);  // t =   y^2 - 1
   1841     fe_add(h->X, h->X, h->Z);  // x = d*y^2 + 1
   1842     fe_mul(h->X, h->T, h->X);  // x = (y^2 - 1) * (d*y^2 + 1)
   1843     int is_square = invsqrt(h->X, h->X);
   1844     if (!is_square) {
   1845         return -1;             // Not on the curve, abort
   1846     }
   1847     fe_mul(h->X, h->T, h->X);  // x = sqrt((y^2 - 1) / (d*y^2 + 1))
   1848     if (fe_isodd(h->X) != (s[31] >> 7)) {
   1849         fe_neg(h->X, h->X);
   1850     }
   1851     fe_mul(h->T, h->X, h->Y);
   1852     return 0;
   1853 }
   1854 
   1855 static void ge_cache(ge_cached *c, const ge *p)
   1856 {
   1857     fe_add (c->Yp, p->Y, p->X);
   1858     fe_sub (c->Ym, p->Y, p->X);
   1859     fe_copy(c->Z , p->Z      );
   1860     fe_mul (c->T2, p->T, D2  );
   1861 }
   1862 
   1863 // Internal buffers are not wiped! Inputs must not be secret!
   1864 // => Use only to *check* signatures.
   1865 static void ge_add(ge *s, const ge *p, const ge_cached *q)
   1866 {
   1867     fe a, b;
   1868     fe_add(a   , p->Y, p->X );
   1869     fe_sub(b   , p->Y, p->X );
   1870     fe_mul(a   , a   , q->Yp);
   1871     fe_mul(b   , b   , q->Ym);
   1872     fe_add(s->Y, a   , b    );
   1873     fe_sub(s->X, a   , b    );
   1874 
   1875     fe_add(s->Z, p->Z, p->Z );
   1876     fe_mul(s->Z, s->Z, q->Z );
   1877     fe_mul(s->T, p->T, q->T2);
   1878     fe_add(a   , s->Z, s->T );
   1879     fe_sub(b   , s->Z, s->T );
   1880 
   1881     fe_mul(s->T, s->X, s->Y);
   1882     fe_mul(s->X, s->X, b   );
   1883     fe_mul(s->Y, s->Y, a   );
   1884     fe_mul(s->Z, a   , b   );
   1885 }
   1886 
   1887 // Internal buffers are not wiped! Inputs must not be secret!
   1888 // => Use only to *check* signatures.
   1889 static void ge_sub(ge *s, const ge *p, const ge_cached *q)
   1890 {
   1891     ge_cached neg;
   1892     fe_copy(neg.Ym, q->Yp);
   1893     fe_copy(neg.Yp, q->Ym);
   1894     fe_copy(neg.Z , q->Z );
   1895     fe_neg (neg.T2, q->T2);
   1896     ge_add(s, p, &neg);
   1897 }
   1898 
   1899 static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
   1900 {
   1901     fe_add(a   , p->Y, p->X );
   1902     fe_sub(b   , p->Y, p->X );
   1903     fe_mul(a   , a   , q->Yp);
   1904     fe_mul(b   , b   , q->Ym);
   1905     fe_add(s->Y, a   , b    );
   1906     fe_sub(s->X, a   , b    );
   1907 
   1908     fe_add(s->Z, p->Z, p->Z );
   1909     fe_mul(s->T, p->T, q->T2);
   1910     fe_add(a   , s->Z, s->T );
   1911     fe_sub(b   , s->Z, s->T );
   1912 
   1913     fe_mul(s->T, s->X, s->Y);
   1914     fe_mul(s->X, s->X, b   );
   1915     fe_mul(s->Y, s->Y, a   );
   1916     fe_mul(s->Z, a   , b   );
   1917 }
   1918 
   1919 static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
   1920 {
   1921     fe_add(a   , p->Y, p->X );
   1922     fe_sub(b   , p->Y, p->X );
   1923     fe_mul(a   , a   , q->Ym);
   1924     fe_mul(b   , b   , q->Yp);
   1925     fe_add(s->Y, a   , b    );
   1926     fe_sub(s->X, a   , b    );
   1927 
   1928     fe_add(s->Z, p->Z, p->Z );
   1929     fe_mul(s->T, p->T, q->T2);
   1930     fe_sub(a   , s->Z, s->T );
   1931     fe_add(b   , s->Z, s->T );
   1932 
   1933     fe_mul(s->T, s->X, s->Y);
   1934     fe_mul(s->X, s->X, b   );
   1935     fe_mul(s->Y, s->Y, a   );
   1936     fe_mul(s->Z, a   , b   );
   1937 }
   1938 
   1939 static void ge_double(ge *s, const ge *p, ge *q)
   1940 {
   1941     fe_sq (q->X, p->X);
   1942     fe_sq (q->Y, p->Y);
   1943     fe_sq2(q->Z, p->Z);
   1944     fe_add(q->T, p->X, p->Y);
   1945     fe_sq (s->T, q->T);
   1946     fe_add(q->T, q->Y, q->X);
   1947     fe_sub(q->Y, q->Y, q->X);
   1948     fe_sub(q->X, s->T, q->T);
   1949     fe_sub(q->Z, q->Z, q->Y);
   1950 
   1951     fe_mul(s->X, q->X , q->Z);
   1952     fe_mul(s->Y, q->T , q->Y);
   1953     fe_mul(s->Z, q->Y , q->Z);
   1954     fe_mul(s->T, q->X , q->T);
   1955 }
   1956 
   1957 // 5-bit signed window in cached format (Niels coordinates, Z=1)
   1958 static const ge_precomp b_window[8] = {
   1959     {{25967493,-14356035,29566456,3660896,-12694345,
   1960       4014787,27544626,-11754271,-6079156,2047605,},
   1961      {-12545711,934262,-2722910,3049990,-727428,
   1962       9406986,12720692,5043384,19500929,-15469378,},
   1963      {-8738181,4489570,9688441,-14785194,10184609,
   1964       -12363380,29287919,11864899,-24514362,-4438546,},},
   1965     {{15636291,-9688557,24204773,-7912398,616977,
   1966       -16685262,27787600,-14772189,28944400,-1550024,},
   1967      {16568933,4717097,-11556148,-1102322,15682896,
   1968       -11807043,16354577,-11775962,7689662,11199574,},
   1969      {30464156,-5976125,-11779434,-15670865,23220365,
   1970       15915852,7512774,10017326,-17749093,-9920357,},},
   1971     {{10861363,11473154,27284546,1981175,-30064349,
   1972       12577861,32867885,14515107,-15438304,10819380,},
   1973      {4708026,6336745,20377586,9066809,-11272109,
   1974       6594696,-25653668,12483688,-12668491,5581306,},
   1975      {19563160,16186464,-29386857,4097519,10237984,
   1976       -4348115,28542350,13850243,-23678021,-15815942,},},
   1977     {{5153746,9909285,1723747,-2777874,30523605,
   1978       5516873,19480852,5230134,-23952439,-15175766,},
   1979      {-30269007,-3463509,7665486,10083793,28475525,
   1980       1649722,20654025,16520125,30598449,7715701,},
   1981      {28881845,14381568,9657904,3680757,-20181635,
   1982       7843316,-31400660,1370708,29794553,-1409300,},},
   1983     {{-22518993,-6692182,14201702,-8745502,-23510406,
   1984       8844726,18474211,-1361450,-13062696,13821877,},
   1985      {-6455177,-7839871,3374702,-4740862,-27098617,
   1986       -10571707,31655028,-7212327,18853322,-14220951,},
   1987      {4566830,-12963868,-28974889,-12240689,-7602672,
   1988       -2830569,-8514358,-10431137,2207753,-3209784,},},
   1989     {{-25154831,-4185821,29681144,7868801,-6854661,
   1990       -9423865,-12437364,-663000,-31111463,-16132436,},
   1991      {25576264,-2703214,7349804,-11814844,16472782,
   1992       9300885,3844789,15725684,171356,6466918,},
   1993      {23103977,13316479,9739013,-16149481,817875,
   1994       -15038942,8965339,-14088058,-30714912,16193877,},},
   1995     {{-33521811,3180713,-2394130,14003687,-16903474,
   1996       -16270840,17238398,4729455,-18074513,9256800,},
   1997      {-25182317,-4174131,32336398,5036987,-21236817,
   1998       11360617,22616405,9761698,-19827198,630305,},
   1999      {-13720693,2639453,-24237460,-7406481,9494427,
   2000       -5774029,-6554551,-15960994,-2449256,-14291300,},},
   2001     {{-3151181,-5046075,9282714,6866145,-31907062,
   2002       -863023,-18940575,15033784,25105118,-7894876,},
   2003      {-24326370,15950226,-31801215,-14592823,-11662737,
   2004       -5090925,1573892,-2625887,2198790,-15804619,},
   2005      {-3099351,10324967,-2241613,7453183,-5446979,
   2006       -2735503,-13812022,-16236442,-32461234,-12290683,},},
   2007 };
   2008 
   2009 // Incremental sliding windows (left to right)
    2010 // Based on Roberto Maria Avanzi [2005]
   2011 typedef struct {
   2012     i16 next_index; // position of the next signed digit
   2013     i8  next_digit; // next signed digit (odd number below 2^window_width)
   2014     u8  next_check; // point at which we must check for a new window
   2015 } slide_ctx;
   2016 
   2017 static void slide_init(slide_ctx *ctx, const u8 scalar[32])
   2018 {
   2019     // scalar is guaranteed to be below L, either because we checked (s),
   2020     // or because we reduced it modulo L (h_ram). L is under 2^253, so
    2021     // bits 253 to 255 are guaranteed to be zero. No need to test them.
   2022     //
   2023     // Note however that L is very close to 2^252, so bit 252 is almost
   2024     // always zero.  If we were to start at bit 251, the tests wouldn't
   2025     // catch the off-by-one error (constructing one that does would be
   2026     // prohibitively expensive).
   2027     //
   2028     // We should still check bit 252, though.
   2029     int i = 252;
   2030     while (i > 0 && scalar_bit(scalar, i) == 0) {
   2031         i--;
   2032     }
   2033     ctx->next_check = (u8)(i + 1);
   2034     ctx->next_index = -1;
   2035     ctx->next_digit = -1;
   2036 }
   2037 
   2038 static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
   2039 {
   2040     if (i == ctx->next_check) {
   2041         if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
   2042             ctx->next_check--;
   2043         } else {
   2044             // compute digit of next window
   2045             int w = MIN(width, i + 1);
   2046             int v = -(scalar_bit(scalar, i) << (w-1));
   2047             FOR_T (int, j, 0, w-1) {
   2048                 v += scalar_bit(scalar, i-(w-1)+j) << j;
   2049             }
   2050             v += scalar_bit(scalar, i-w);
   2051             int lsb = v & (~v + 1);            // smallest bit of v
   2052             int s   = (   ((lsb & 0xAA) != 0)  // log2(lsb)
   2053                        | (((lsb & 0xCC) != 0) << 1)
   2054                        | (((lsb & 0xF0) != 0) << 2));
   2055             ctx->next_index  = (i16)(i-(w-1)+s);
   2056             ctx->next_digit  = (i8) (v >> s   );
   2057             ctx->next_check -= (u8) w;
   2058         }
   2059     }
   2060     return i == ctx->next_index ? ctx->next_digit: 0;
   2061 }
   2062 
   2063 #define P_W_WIDTH 3 // Affects the size of the stack
   2064 #define B_W_WIDTH 5 // Affects the size of the binary
   2065 #define P_W_SIZE  (1<<(P_W_WIDTH-2))
   2066 
   2067 // P = [b]B + [p]P, where B is the base point
   2068 //
   2069 // Variable time! Internal buffers are not wiped! Inputs must not be secret!
   2070 // => Use only to *check* signatures.
   2071 static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32])
   2072 {
   2073     // cache P window for addition
   2074     ge_cached cP[P_W_SIZE];
   2075     {
   2076         ge P2, tmp;
   2077         ge_double(&P2, P, &tmp);
   2078         ge_cache(&cP[0], P);
   2079         FOR (i, 1, P_W_SIZE) {
   2080             ge_add(&tmp, &P2, &cP[i-1]);
   2081             ge_cache(&cP[i], &tmp);
   2082         }
   2083     }
   2084 
   2085     // Merged double and add ladder, fused with sliding
   2086     slide_ctx p_slide;  slide_init(&p_slide, p);
   2087     slide_ctx b_slide;  slide_init(&b_slide, b);
   2088     int i = MAX(p_slide.next_check, b_slide.next_check);
   2089     ge *sum = P;
   2090     ge_zero(sum);
   2091     while (i >= 0) {
   2092         ge tmp;
   2093         ge_double(sum, sum, &tmp);
   2094         int p_digit = slide_step(&p_slide, P_W_WIDTH, i, p);
   2095         int b_digit = slide_step(&b_slide, B_W_WIDTH, i, b);
   2096         if (p_digit > 0) { ge_add(sum, sum, &cP[ p_digit / 2]); }
   2097         if (p_digit < 0) { ge_sub(sum, sum, &cP[-p_digit / 2]); }
   2098         fe t1, t2;
   2099         if (b_digit > 0) { ge_madd(sum, sum, b_window +  b_digit/2, t1, t2); }
   2100         if (b_digit < 0) { ge_msub(sum, sum, b_window + -b_digit/2, t1, t2); }
   2101         i--;
   2102     }
   2103 }
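         // Note on the tables: cP[k] holds (2k+1)*P and b_window[k] holds
         // (2k+1)*B (odd multiples only), which matches the odd signed
         // digits produced by slide_step(): a digit d selects entry |d|/2.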
   2104 
    2105 // R_check = [s]B - [h_ram]pk, where B is the base point
   2106 //
   2107 // Variable time! Internal buffers are not wiped! Inputs must not be secret!
   2108 // => Use only to *check* signatures.
   2109 static int ge_r_check(u8 R_check[32], u8 s[32], u8 h_ram[32], u8 pk[32])
   2110 {
   2111     ge  A;      // not secret, not wiped
   2112     u32 s32[8]; // not secret, not wiped
   2113     load32_le_buf(s32, s, 8);
   2114     if (ge_frombytes_vartime(&A, pk) ||         // A = pk
   2115         is_above_l(s32)) {                      // prevent s malleability
   2116         return -1;
   2117     }
   2118     fe_neg(A.X, A.X);
   2119     fe_neg(A.T, A.T);                           // A = -pk
   2120     ge_double_scalarmult_vartime(&A, h_ram, s); // A = [s]B - [h_ram]pk
   2121     ge_tobytes(R_check, &A);                    // R_check = A
   2122     return 0;
   2123 }
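         // (This is the usual EdDSA verification equation
         //  [s]B = R + [h_ram]A, rearranged as R = [s]B - [h_ram]A so that
         //  R can be recomputed here and compared, in crypto_check_final(),
         //  against the R half of the signature.)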
   2124 
    2125 // 4-bit signed comb in cached format (Niels coordinates, Z=1)
   2126 static const ge_precomp b_comb_low[8] = {
   2127     {{-6816601,-2324159,-22559413,124364,18015490,
   2128       8373481,19993724,1979872,-18549925,9085059,},
   2129      {10306321,403248,14839893,9633706,8463310,
   2130       -8354981,-14305673,14668847,26301366,2818560,},
   2131      {-22701500,-3210264,-13831292,-2927732,-16326337,
   2132       -14016360,12940910,177905,12165515,-2397893,},},
   2133     {{-12282262,-7022066,9920413,-3064358,-32147467,
   2134       2927790,22392436,-14852487,2719975,16402117,},
   2135      {-7236961,-4729776,2685954,-6525055,-24242706,
   2136       -15940211,-6238521,14082855,10047669,12228189,},
   2137      {-30495588,-12893761,-11161261,3539405,-11502464,
   2138       16491580,-27286798,-15030530,-7272871,-15934455,},},
   2139     {{17650926,582297,-860412,-187745,-12072900,
   2140       -10683391,-20352381,15557840,-31072141,-5019061,},
   2141      {-6283632,-2259834,-4674247,-4598977,-4089240,
   2142       12435688,-31278303,1060251,6256175,10480726,},
   2143      {-13871026,2026300,-21928428,-2741605,-2406664,
   2144       -8034988,7355518,15733500,-23379862,7489131,},},
   2145     {{6883359,695140,23196907,9644202,-33430614,
   2146       11354760,-20134606,6388313,-8263585,-8491918,},
   2147      {-7716174,-13605463,-13646110,14757414,-19430591,
   2148       -14967316,10359532,-11059670,-21935259,12082603,},
   2149      {-11253345,-15943946,10046784,5414629,24840771,
   2150       8086951,-6694742,9868723,15842692,-16224787,},},
   2151     {{9639399,11810955,-24007778,-9320054,3912937,
   2152       -9856959,996125,-8727907,-8919186,-14097242,},
   2153      {7248867,14468564,25228636,-8795035,14346339,
   2154       8224790,6388427,-7181107,6468218,-8720783,},
   2155      {15513115,15439095,7342322,-10157390,18005294,
   2156       -7265713,2186239,4884640,10826567,7135781,},},
   2157     {{-14204238,5297536,-5862318,-6004934,28095835,
   2158       4236101,-14203318,1958636,-16816875,3837147,},
   2159      {-5511166,-13176782,-29588215,12339465,15325758,
   2160       -15945770,-8813185,11075932,-19608050,-3776283,},
   2161      {11728032,9603156,-4637821,-5304487,-7827751,
   2162       2724948,31236191,-16760175,-7268616,14799772,},},
   2163     {{-28842672,4840636,-12047946,-9101456,-1445464,
   2164       381905,-30977094,-16523389,1290540,12798615,},
   2165      {27246947,-10320914,14792098,-14518944,5302070,
   2166       -8746152,-3403974,-4149637,-27061213,10749585,},
   2167      {25572375,-6270368,-15353037,16037944,1146292,
   2168       32198,23487090,9585613,24714571,-1418265,},},
   2169     {{19844825,282124,-17583147,11004019,-32004269,
   2170       -2716035,6105106,-1711007,-21010044,14338445,},
   2171      {8027505,8191102,-18504907,-12335737,25173494,
   2172       -5923905,15446145,7483684,-30440441,10009108,},
   2173      {-14134701,-4174411,10246585,-14677495,33553567,
   2174       -14012935,23366126,15080531,-7969992,7663473,},},
   2175 };
   2176 
   2177 static const ge_precomp b_comb_high[8] = {
   2178     {{33055887,-4431773,-521787,6654165,951411,
   2179       -6266464,-5158124,6995613,-5397442,-6985227,},
   2180      {4014062,6967095,-11977872,3960002,8001989,
   2181       5130302,-2154812,-1899602,-31954493,-16173976,},
   2182      {16271757,-9212948,23792794,731486,-25808309,
   2183       -3546396,6964344,-4767590,10976593,10050757,},},
   2184     {{2533007,-4288439,-24467768,-12387405,-13450051,
   2185       14542280,12876301,13893535,15067764,8594792,},
   2186      {20073501,-11623621,3165391,-13119866,13188608,
   2187       -11540496,-10751437,-13482671,29588810,2197295,},
   2188      {-1084082,11831693,6031797,14062724,14748428,
   2189       -8159962,-20721760,11742548,31368706,13161200,},},
   2190     {{2050412,-6457589,15321215,5273360,25484180,
   2191       124590,-18187548,-7097255,-6691621,-14604792,},
   2192      {9938196,2162889,-6158074,-1711248,4278932,
   2193       -2598531,-22865792,-7168500,-24323168,11746309,},
   2194      {-22691768,-14268164,5965485,9383325,20443693,
   2195       5854192,28250679,-1381811,-10837134,13717818,},},
   2196     {{-8495530,16382250,9548884,-4971523,-4491811,
   2197       -3902147,6182256,-12832479,26628081,10395408,},
   2198      {27329048,-15853735,7715764,8717446,-9215518,
   2199       -14633480,28982250,-5668414,4227628,242148,},
   2200      {-13279943,-7986904,-7100016,8764468,-27276630,
   2201       3096719,29678419,-9141299,3906709,11265498,},},
   2202     {{11918285,15686328,-17757323,-11217300,-27548967,
   2203       4853165,-27168827,6807359,6871949,-1075745,},
   2204      {-29002610,13984323,-27111812,-2713442,28107359,
   2205       -13266203,6155126,15104658,3538727,-7513788,},
   2206      {14103158,11233913,-33165269,9279850,31014152,
   2207       4335090,-1827936,4590951,13960841,12787712,},},
   2208     {{1469134,-16738009,33411928,13942824,8092558,
   2209       -8778224,-11165065,1437842,22521552,-2792954,},
   2210      {31352705,-4807352,-25327300,3962447,12541566,
   2211       -9399651,-27425693,7964818,-23829869,5541287,},
   2212      {-25732021,-6864887,23848984,3039395,-9147354,
   2213       6022816,-27421653,10590137,25309915,-1584678,},},
   2214     {{-22951376,5048948,31139401,-190316,-19542447,
   2215       -626310,-17486305,-16511925,-18851313,-12985140,},
   2216      {-9684890,14681754,30487568,7717771,-10829709,
   2217       9630497,30290549,-10531496,-27798994,-13812825,},
   2218      {5827835,16097107,-24501327,12094619,7413972,
   2219       11447087,28057551,-1793987,-14056981,4359312,},},
   2220     {{26323183,2342588,-21887793,-1623758,-6062284,
   2221       2107090,-28724907,9036464,-19618351,-13055189,},
   2222      {-29697200,14829398,-4596333,14220089,-30022969,
   2223       2955645,12094100,-13693652,-5941445,7047569,},
   2224      {-3201977,14413268,-12058324,-16417589,-9035655,
   2225       -7224648,9258160,1399236,30397584,-5684634,},},
   2226 };
   2227 
   2228 static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
   2229                        const ge_precomp comb[8], const u8 scalar[32], int i)
   2230 {
   2231     u8 teeth = (u8)((scalar_bit(scalar, i)          ) +
   2232                     (scalar_bit(scalar, i + 32) << 1) +
   2233                     (scalar_bit(scalar, i + 64) << 2) +
   2234                     (scalar_bit(scalar, i + 96) << 3));
   2235     u8 high  = teeth >> 3;
   2236     u8 index = (teeth ^ (high - 1)) & 7;
   2237     FOR (j, 0, 8) {
   2238         i32 select = 1 & (((j ^ index) - 1) >> 8);
   2239         fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
   2240         fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
   2241         fe_ccopy(tmp_c->T2, comb[j].T2, select);
   2242     }
   2243     fe_neg(tmp_a, tmp_c->T2);
   2244     fe_cswap(tmp_c->T2, tmp_a    , high ^ 1);
   2245     fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
   2246     ge_madd(p, p, tmp_c, tmp_a, tmp_b);
   2247 }
   2248 
   2249 // p = [scalar]B, where B is the base point
   2250 static void ge_scalarmult_base(ge *p, const u8 scalar[32])
   2251 {
   2252     // twin 4-bits signed combs, from Mike Hamburg's
   2253     // Fast and compact elliptic-curve cryptography (2012)
   2254     // 1 / 2 modulo L
   2255     static const u8 half_mod_L[32] = {
   2256         247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
   2257         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8, };
   2258     // (2^256 - 1) / 2 modulo L
   2259     static const u8 half_ones[32] = {
   2260         142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
   2261         255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7, };
   2262 
   2263     // All bits set form: 1 means 1, 0 means -1
   2264     u8 s_scalar[32];
   2265     mul_add(s_scalar, scalar, half_mod_L, half_ones);
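             // (Why this works: reading each bit b of s_scalar as the digit
             //  2*b - 1, the value represented by the comb is
             //  2*s_scalar - (2^256 - 1).  For that to equal the original
             //  scalar modulo L, we need
             //  s_scalar = scalar * (1/2) + (2^256 - 1)/2  (mod L),
             //  which is exactly the mul_add() above.)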
   2266 
   2267     // Double and add ladder
   2268     fe tmp_a, tmp_b;  // temporaries for addition
   2269     ge_precomp tmp_c; // temporary for comb lookup
   2270     ge tmp_d;         // temporary for doubling
   2271     fe_1(tmp_c.Yp);
   2272     fe_1(tmp_c.Ym);
   2273     fe_0(tmp_c.T2);
   2274 
   2275     // Save a double on the first iteration
   2276     ge_zero(p);
   2277     lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31);
   2278     lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128);
   2279     // Regular double & add for the rest
   2280     for (int i = 30; i >= 0; i--) {
   2281         ge_double(p, p, &tmp_d);
   2282         lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i);
   2283         lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128);
   2284     }
    2285     // Note: we could save one addition at the end if we assumed the
    2286     // scalar fits in 252 bits, which it does in practice when it is
    2287     // selected at random.  However, non-random, non-hashed scalars
    2288     // *can* overflow 252 bits in practice.  Better to account for that
    2289     // than to leave that kind of subtle corner case.
   2290 
   2291     WIPE_BUFFER(tmp_a);  WIPE_CTX(&tmp_d);
   2292     WIPE_BUFFER(tmp_b);  WIPE_CTX(&tmp_c);
   2293     WIPE_BUFFER(s_scalar);
   2294 }
   2295 
   2296 void crypto_sign_public_key_custom_hash(u8       public_key[32],
   2297                                         const u8 secret_key[32],
   2298                                         const crypto_sign_vtable *hash)
   2299 {
   2300     u8 a[64];
   2301     hash->hash(a, secret_key, 32);
   2302     trim_scalar(a);
   2303     ge A;
   2304     ge_scalarmult_base(&A, a);
   2305     ge_tobytes(public_key, &A);
   2306     WIPE_BUFFER(a);
   2307     WIPE_CTX(&A);
   2308 }
   2309 
   2310 void crypto_sign_public_key(u8 public_key[32], const u8 secret_key[32])
   2311 {
   2312     crypto_sign_public_key_custom_hash(public_key, secret_key,
   2313                                        &crypto_blake2b_vtable);
   2314 }
   2315 
   2316 void crypto_sign_init_first_pass_custom_hash(crypto_sign_ctx_abstract *ctx,
   2317                                              const u8 secret_key[32],
   2318                                              const u8 public_key[32],
   2319                                              const crypto_sign_vtable *hash)
   2320 {
   2321     ctx->hash  = hash; // set vtable
   2322     u8 *a      = ctx->buf;
   2323     u8 *prefix = ctx->buf + 32;
   2324     ctx->hash->hash(a, secret_key, 32);
   2325     trim_scalar(a);
   2326 
   2327     if (public_key == 0) {
   2328         crypto_sign_public_key_custom_hash(ctx->pk, secret_key, ctx->hash);
   2329     } else {
   2330         COPY(ctx->pk, public_key, 32);
   2331     }
   2332 
   2333     // Deterministic part of EdDSA: Construct a nonce by hashing the message
   2334     // instead of generating a random number.
   2335     // An actual random number would work just fine, and would save us
   2336     // the trouble of hashing the message twice.  If we did that
   2337     // however, the user could fuck it up and reuse the nonce.
   2338     ctx->hash->init  (ctx);
   2339     ctx->hash->update(ctx, prefix , 32);
   2340 }
   2341 
   2342 void crypto_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
   2343                                  const u8 secret_key[32],
   2344                                  const u8 public_key[32])
   2345 {
   2346     crypto_sign_init_first_pass_custom_hash(ctx, secret_key, public_key,
   2347                                             &crypto_blake2b_vtable);
   2348 }
   2349 
   2350 void crypto_sign_update(crypto_sign_ctx_abstract *ctx,
   2351                         const u8 *msg, size_t msg_size)
   2352 {
   2353     ctx->hash->update(ctx, msg, msg_size);
   2354 }
   2355 
   2356 void crypto_sign_init_second_pass(crypto_sign_ctx_abstract *ctx)
   2357 {
   2358     u8 *r        = ctx->buf + 32;
   2359     u8 *half_sig = ctx->buf + 64;
   2360     ctx->hash->final(ctx, r);
   2361     reduce(r);
   2362 
   2363     // first half of the signature = "random" nonce times the base point
   2364     ge R;
   2365     ge_scalarmult_base(&R, r);
   2366     ge_tobytes(half_sig, &R);
   2367     WIPE_CTX(&R);
   2368 
   2369     // Hash R, the public key, and the message together.
   2370     // It cannot be done in parallel with the first hash.
   2371     ctx->hash->init  (ctx);
   2372     ctx->hash->update(ctx, half_sig, 32);
   2373     ctx->hash->update(ctx, ctx->pk , 32);
   2374 }
   2375 
   2376 void crypto_sign_final(crypto_sign_ctx_abstract *ctx, u8 signature[64])
   2377 {
   2378     u8 *a        = ctx->buf;
   2379     u8 *r        = ctx->buf + 32;
   2380     u8 *half_sig = ctx->buf + 64;
   2381     u8  h_ram[64];
   2382     ctx->hash->final(ctx, h_ram);
   2383     reduce(h_ram);
   2384     COPY(signature, half_sig, 32);
   2385     mul_add(signature + 32, h_ram, a, r); // s = h_ram * a + r
   2386     WIPE_BUFFER(h_ram);
   2387     crypto_wipe(ctx, ctx->hash->ctx_size);
   2388 }
   2389 
   2390 void crypto_sign(u8        signature[64],
   2391                  const u8  secret_key[32],
   2392                  const u8  public_key[32],
   2393                  const u8 *message, size_t message_size)
   2394 {
   2395     crypto_sign_ctx ctx;
   2396     crypto_sign_ctx_abstract *actx = (crypto_sign_ctx_abstract*)&ctx;
   2397     crypto_sign_init_first_pass (actx, secret_key, public_key);
   2398     crypto_sign_update          (actx, message, message_size);
   2399     crypto_sign_init_second_pass(actx);
   2400     crypto_sign_update          (actx, message, message_size);
   2401     crypto_sign_final           (actx, signature);
   2402 }
   2403 
   2404 void crypto_check_init_custom_hash(crypto_check_ctx_abstract *ctx,
   2405                                    const u8 signature[64],
   2406                                    const u8 public_key[32],
   2407                                    const crypto_sign_vtable *hash)
   2408 {
   2409     ctx->hash = hash; // set vtable
   2410     COPY(ctx->buf, signature , 64);
   2411     COPY(ctx->pk , public_key, 32);
   2412     ctx->hash->init  (ctx);
   2413     ctx->hash->update(ctx, signature , 32);
   2414     ctx->hash->update(ctx, public_key, 32);
   2415 }
   2416 
   2417 void crypto_check_init(crypto_check_ctx_abstract *ctx, const u8 signature[64],
   2418                        const u8 public_key[32])
   2419 {
   2420     crypto_check_init_custom_hash(ctx, signature, public_key,
   2421                                   &crypto_blake2b_vtable);
   2422 }
   2423 
   2424 void crypto_check_update(crypto_check_ctx_abstract *ctx,
   2425                          const u8 *msg, size_t msg_size)
   2426 {
   2427     ctx->hash->update(ctx, msg, msg_size);
   2428 }
   2429 
   2430 int crypto_check_final(crypto_check_ctx_abstract *ctx)
   2431 {
   2432     u8 h_ram[64];
   2433     ctx->hash->final(ctx, h_ram);
   2434     reduce(h_ram);
   2435     u8 *R       = ctx->buf;      // R
   2436     u8 *s       = ctx->buf + 32; // s
   2437     u8 *R_check = ctx->pk;       // overwrite ctx->pk to save stack space
   2438     if (ge_r_check(R_check, s, h_ram, ctx->pk)) {
   2439         return -1;
   2440     }
   2441     return crypto_verify32(R, R_check); // R == R_check ? OK : fail
   2442 }
   2443 
   2444 int crypto_check(const u8  signature[64], const u8 public_key[32],
   2445                  const u8 *message, size_t message_size)
   2446 {
   2447     crypto_check_ctx ctx;
   2448     crypto_check_ctx_abstract *actx = (crypto_check_ctx_abstract*)&ctx;
   2449     crypto_check_init  (actx, signature, public_key);
   2450     crypto_check_update(actx, message, message_size);
   2451     return crypto_check_final(actx);
   2452 }
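         // Usage sketch (illustrative only; names are hypothetical):
         //
         //     u8 sk[32], pk[32], sig[64];         // sk: 32 random bytes
         //     crypto_sign_public_key(pk, sk);
         //     crypto_sign(sig, sk, pk, msg, msg_size);
         //     if (crypto_check(sig, pk, msg, msg_size) != 0) {
         //         // corrupted or forged message, reject it
         //     }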
   2453 
   2454 ///////////////////////
   2455 /// EdDSA to X25519 ///
   2456 ///////////////////////
   2457 void crypto_from_eddsa_private(u8 x25519[32], const u8 eddsa[32])
   2458 {
   2459     u8 a[64];
   2460     crypto_blake2b(a, eddsa, 32);
   2461     COPY(x25519, a, 32);
   2462     WIPE_BUFFER(a);
   2463 }
   2464 
   2465 void crypto_from_eddsa_public(u8 x25519[32], const u8 eddsa[32])
   2466 {
   2467     fe t1, t2;
   2468     fe_frombytes(t2, eddsa);
   2469     fe_add(t1, fe_one, t2);
   2470     fe_sub(t2, fe_one, t2);
   2471     fe_invert(t2, t2);
   2472     fe_mul(t1, t1, t2);
   2473     fe_tobytes(x25519, t1);
   2474     WIPE_BUFFER(t1);
   2475     WIPE_BUFFER(t2);
   2476 }
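         // (The conversion above is the birational map from the Edwards y
         //  coordinate to the Montgomery u coordinate: u = (1 + y) / (1 - y).)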
   2477 
   2478 /////////////////////////////////////////////
   2479 /// Dirty ephemeral public key generation ///
   2480 /////////////////////////////////////////////
   2481 
    2482 // These functions generate a public key, *without* clearing the
   2483 // cofactor.  Sending that key over the network leaks 3 bits of the
   2484 // private key.  Use only to generate ephemeral keys that will be hidden
   2485 // with crypto_curve_to_hidden().
   2486 //
   2487 // The public key is otherwise compatible with crypto_x25519() and
   2488 // crypto_key_exchange() (those properly clear the cofactor).
   2489 //
   2490 // Note that the distribution of the resulting public keys is almost
    2491 // uniform.  Flipping the sign of the v coordinate (not provided by these
    2492 // functions) covers the entire key space almost perfectly, where
   2493 // "almost" means a 2^-128 bias (undetectable).  This uniformity is
   2494 // needed to ensure the proper randomness of the resulting
   2495 // representatives (once we apply crypto_curve_to_hidden()).
   2496 //
   2497 // Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
   2498 // to be confused with the prime order of the main subgroup, L, which is
   2499 // 8 times less than that).
   2500 //
   2501 // Generating all points would require us to multiply a point of order C
   2502 // (the base point plus any point of order 8) by all scalars from 0 to
   2503 // C-1.  Clamping limits us to scalars between 2^254 and 2^255 - 1. But
   2504 // by negating the resulting point at random, we also cover scalars from
   2505 // -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
   2506 //
   2507 // In practice:
   2508 // - Scalars from 0         to e + 1     are never generated
   2509 // - Scalars from 2^255     to 2^255 + e are never generated
   2510 // - Scalars from 2^254 + 1 to 2^254 + e are generated twice
   2511 //
   2512 // Since e < 2^128, detecting this bias requires observing over 2^100
   2513 // representatives from a given source (this will never happen), *and*
   2514 // recovering enough of the private key to determine that they do, or do
   2515 // not, belong to the biased set (this practically requires solving
   2516 // discrete logarithm, which is conjecturally intractable).
   2517 //
   2518 // In practice, this means the bias is impossible to detect.
   2519 
    2520 // s += (x % 8) * L
   2521 // Guaranteed to fit in 256 bits iff s fits in 255 bits.
   2522 //   L             < 2^253
   2523 //   x%8           < 2^3
   2524 //   L * (x%8)     < 2^255
   2525 //   s             < 2^255
   2526 //   s + L * (x%8) < 2^256
   2527 static void add_xl(u8 s[32], u8 x)
   2528 {
   2529     u64 mod8  = x & 7;
   2530     u64 carry = 0;
   2531     FOR (i , 0, 8) {
   2532         carry = carry + load32_le(s + 4*i) + L[i] * mod8;
   2533         store32_le(s + 4*i, (u32)carry);
   2534         carry >>= 32;
   2535     }
   2536 }
   2537 
   2538 // "Small" dirty ephemeral key.
   2539 // Use if you need to shrink the size of the binary, and can afford to
   2540 // slow down by a factor of two (compared to the fast version)
   2541 //
   2542 // This version works by decoupling the cofactor from the main factor.
   2543 //
   2544 // - The trimmed scalar determines the main factor
   2545 // - The clamped bits of the scalar determine the cofactor.
   2546 //
   2547 // Cofactor and main factor are combined into a single scalar, which is
   2548 // then multiplied by a point of order 8*L (unlike the base point, which
   2549 // has prime order).  That "dirty" base point is the addition of the
   2550 // regular base point (9), and a point of order 8.
   2551 void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
   2552 {
   2553     // Base point of order 8*L
   2554     // Raw scalar multiplication with it does not clear the cofactor,
   2555     // and the resulting public key will reveal 3 bits of the scalar.
   2556     static const u8 dirty_base_point[32] = {
   2557         0x34, 0xfc, 0x6c, 0xb7, 0xc8, 0xde, 0x58, 0x97, 0x77, 0x70, 0xd9, 0x52,
   2558         0x16, 0xcc, 0xdc, 0x6c, 0x85, 0x90, 0xbe, 0xcd, 0x91, 0x9c, 0x07, 0x59,
   2559         0x94, 0x14, 0x56, 0x3b, 0x4b, 0xa4, 0x47, 0x0f, };
   2560     // separate the main factor & the cofactor of the scalar
   2561     u8 scalar[32];
   2562     COPY(scalar, secret_key, 32);
   2563     trim_scalar(scalar);
   2564 
   2565     // Separate the main factor and the cofactor
   2566     //
   2567     // The scalar is trimmed, so its cofactor is cleared.  The three
   2568     // least significant bits however still have a main factor.  We must
   2569     // remove it for X25519 compatibility.
   2570     //
   2571     // We exploit the fact that 5*L = 1 (modulo 8)
   2572     //   cofactor = lsb * 5 * L             (modulo 8*L)
   2573     //   combined = scalar + cofactor       (modulo 8*L)
   2574     //   combined = scalar + (lsb * 5 * L)  (modulo 8*L)
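            // Sanity check of that combination, with lsb = secret_key[0] % 8:
            //   combined % L = scalar % L          (the main factor is unchanged)
            //   combined % 8 = (lsb * 5 * L) % 8   (scalar % 8 = 0 after trimming)
            //                = lsb                 (because 5*L = 1 modulo 8)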
   2575     add_xl(scalar, secret_key[0] * 5);
   2576     scalarmult(public_key, scalar, dirty_base_point, 256);
   2577     WIPE_BUFFER(scalar);
   2578 }
   2579 
   2580 // "Fast" dirty ephemeral key
   2581 // We use this one by default.
   2582 //
   2583 // This version works by performing a regular scalar multiplication,
   2584 // then adding a low order point.  The scalar multiplication is done in
   2585 // Edwards space for more speed (*2 compared to the "small" version).
   2586 // The cost is a bigger binary for programs that don't also sign messages.
   2587 void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
   2588 {
   2589     u8 scalar[32];
   2590     ge pk;
   2591     COPY(scalar, secret_key, 32);
   2592     trim_scalar(scalar);
   2593     ge_scalarmult_base(&pk, scalar);
   2594 
   2595     // Select low order point
   2596     // We're computing the [cofactor]lop scalar multiplication, where:
   2597     //   cofactor = secret_key[0] & 7.
   2598     //   lop      = (lop_x, lop_y)
   2599     //   lop_x    = sqrt((sqrt(d + 1) + 1) / d)
   2600     //   lop_y    = -lop_x * sqrtm1
   2601     // Notes:
   2602     // - A (single) Montgomery ladder would be twice as slow.
   2603     // - An actual scalar multiplication would hurt performance.
   2604     // - A full table lookup would take more code.
   2605     u8 cofactor = secret_key[0] & 7;
   2606     int a = (cofactor >> 2) & 1;
   2607     int b = (cofactor >> 1) & 1;
   2608     int c = (cofactor >> 0) & 1;
   2609     fe t1, t2, t3;
   2610     fe_0(t1);
   2611     fe_ccopy(t1, sqrtm1, b);
   2612     fe_ccopy(t1, lop_x , c);
   2613     fe_neg  (t3, t1);
   2614     fe_ccopy(t1, t3, a);
   2615     fe_1(t2);
   2616     fe_0(t3);
   2617     fe_ccopy(t2, t3   , b);
   2618     fe_ccopy(t2, lop_y, c);
   2619     fe_neg  (t3, t2);
   2620     fe_ccopy(t2, t3, a^b);
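            // (t1, t2) now hold the affine (x, y) of the chosen low order
            // point.  The lines below store it in the precomputed form
            // (Y+X, Y-X, X*Y*2*d) that ge_madd() takes.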
   2621     ge_precomp low_order_point;
   2622     fe_add(low_order_point.Yp, t2, t1);
   2623     fe_sub(low_order_point.Ym, t2, t1);
   2624     fe_mul(low_order_point.T2, t2, t1);
   2625     fe_mul(low_order_point.T2, low_order_point.T2, D2);
   2626 
   2627     // Add low order point to the public key
   2628     ge_madd(&pk, &pk, &low_order_point, t1, t2);
   2629 
   2630     // Convert to Montgomery u coordinate (we ignore the sign)
   2631     fe_add(t1, pk.Z, pk.Y);
   2632     fe_sub(t2, pk.Z, pk.Y);
   2633     fe_invert(t2, t2);
   2634     fe_mul(t1, t1, t2);
   2635 
   2636     fe_tobytes(public_key, t1);
   2637 
   2638     WIPE_BUFFER(t1);  WIPE_BUFFER(scalar);
   2639     WIPE_BUFFER(t2);  WIPE_CTX(&pk);
   2640     WIPE_BUFFER(t3);  WIPE_CTX(&low_order_point);
   2641 }
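        // Typical use (sketch): make a dirty ephemeral key, then try to hide
        // it with crypto_curve_to_hidden().  The names below are illustrative.
        //
        //     u8 esk[32], epk[32], hidden[32], tweak;
        //     // fill esk and tweak with random bytes
        //     crypto_x25519_dirty_fast(epk, esk);
        //     if (crypto_curve_to_hidden(hidden, epk, tweak) != 0) {
        //         // epk is not representable (roughly half the time):
        //         // retry with a fresh esk, as crypto_hidden_key_pair() does.
        //     }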
   2642 
   2643 ///////////////////
   2644 /// Elligator 2 ///
   2645 ///////////////////
   2646 static const fe A = {486662};
   2647 
   2648 // Elligator direct map
   2649 //
   2650 // Computes the point corresponding to a representative, encoded in 32
   2651 // bytes (little endian).  Since positive representatives fit in 254
   2652 // bits, the two most significant bits are ignored.
   2653 //
   2654 // From the paper:
   2655 // w = -A / (fe(1) + non_square * r^2)
   2656 // e = chi(w^3 + A*w^2 + w)
   2657 // u = e*w - (fe(1)-e)*(A//2)
   2658 // v = -e * sqrt(u^3 + A*u^2 + u)
   2659 //
   2660 // We ignore v because we don't need it for X25519 (the Montgomery
   2661 // ladder only uses u).
   2662 //
   2663 // Note that e is either 0, 1 or -1
   2664 // if e = 0    u = 0  and v = 0
   2665 // if e = 1    u = w
   2666 // if e = -1   u = -w - A = w * non_square * r^2
   2667 //
   2668 // Let r1 = non_square * r^2
   2669 // Let r2 = 1 + r1
   2670 // Note that r2 cannot be zero, since -1/non_square is not a square.
   2671 // We can (tediously) verify that:
   2672 //   w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
   2673 // Therefore:
   2674 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
   2675 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
   2676 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
   2677 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)  *     r2^6)
   2678 //   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) *  A * r2^3)
   2679 // Corollary:
   2680 //   e =  1 if (A^2*r1 - r2^2) * A * r2^3 is a non-zero square
   2681 //   e = -1 if (A^2*r1 - r2^2) * A * r2^3 is not a square
   2682 //   Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
   2683 //     w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
   2684 //     w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
   2685 //     w^3 + A*w^2 + w = w * ((w + A/2)^2       - A^2/4 + 1)
   2686 //     which is zero only if:
   2687 //       w = 0                   (impossible)
   2688 //       (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
   2689 //
   2690 // Let isr   = invsqrt((A^2*r1 - r2^2) *  A * r2^3)
   2691 //     isr   = sqrt(1        / ((A^2*r1 - r2^2) *  A * r2^3)) if e =  1
   2692 //     isr   = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) *  A * r2^3)) if e = -1
   2693 //
   2694 // if e = 1
   2695 //   let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
   2696 //       u1 = w
   2697 //       u1 = u
   2698 //
   2699 // if e = -1
   2700 //   let ufactor = -non_square * sqrt(-1) * r^2
   2701 //   let vfactor = sqrt(ufactor)
   2702 //   let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
   2703 //       u2 = w * -1 * -non_square * r^2
   2704 //       u2 = w * non_square * r^2
   2705 //       u2 = u
   2706 void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32])
   2707 {
   2708     // Representatives are encoded in 254 bits.
   2709     // The two most significant ones are random padding that must be ignored.
   2710     u8 clamped[32];
   2711     COPY(clamped, hidden, 32);
   2712     clamped[31] &= 0x3f;
   2713 
   2714     fe r, u, t1, t2, t3;
   2715     fe_frombytes(r, clamped);
   2716     fe_sq2(t1, r);                   // t1 = r1 = non_square * r^2
   2717     fe_add(u, t1, fe_one);           // u  = r2 = 1 + r1
   2718     fe_sq (t2, u);                   // t2 = r2^2
   2719     fe_mul(t3, A2, t1);              // t3 = A^2 * r1
   2720     fe_sub(t3, t3, t2);              // t3 = A^2*r1 - r2^2
   2721     fe_mul(t3, t3, A);               // t3 = (A^2*r1 - r2^2) * A
   2722     fe_mul(t1, t2, u);               // t1 = r2^3
   2723     fe_mul(t1, t3, t1);              // t1 = (A^2*r1 - r2^2) * A * r2^3
   2724     int is_square = invsqrt(t1, t1); // t1 = isr,  e = 1 iff is_square
   2725     fe_sq(u, r);                     // u  = r^2
   2726     fe_mul(u, u, ufactor);           // u  = ufactor * r^2  (e = -1 case)
   2727     fe_ccopy(u, fe_one, is_square);  // u  = 1              (e =  1 case)
   2728     fe_sq (t1, t1);                  // t1 = isr^2
   2729     fe_mul(u, u, A);
   2730     fe_mul(u, u, t3);
   2731     fe_mul(u, u, t2);
   2732     fe_mul(u, u, t1);                // u  = -u1 (or -u2), see above
   2733     fe_neg(u, u);                    // u  = the Montgomery u coordinate
   2734     fe_tobytes(curve, u);
   2735 
   2736     WIPE_BUFFER(t1);  WIPE_BUFFER(r);
   2737     WIPE_BUFFER(t2);  WIPE_BUFFER(u);
   2738     WIPE_BUFFER(t3);  WIPE_BUFFER(clamped);
   2739 }
   2740 
   2741 // Elligator inverse map
   2742 //
   2743 // Computes the representative of a point, if possible.  If not, it does
   2744 // nothing and returns -1.  Note that the success of the operation
   2745 // depends only on the point (more precisely its u coordinate).  The
   2746 // tweak parameter is used only upon success
   2747 //
   2748 // The tweak should be a random byte.  Beyond that, its contents are an
   2749 // implementation detail. Currently, the tweak comprises:
   2750 // - Bit  0  : sign of the v coordinate (0 if positive, 1 if negative)
   2751 // - Bits 1-5: not used
   2752 // - Bits 6-7: random padding
   2753 //
   2754 // From the paper:
   2755 // Let sq = -non_square * u * (u+A)
   2756 // if sq is not a square, or u = -A, there is no mapping
   2757 // Assuming there is a mapping:
   2758 //   if v is positive: r = sqrt(-(u+A) / u)
   2759 //   if v is negative: r = sqrt(-u / (u+A))
   2760 //
   2761 // We compute isr = invsqrt(-non_square * u * (u+A))
   2762 // if it wasn't a non-zero square, abort.
   2763 // else, isr = sqrt(-1 / (non_square * u * (u+A)))
   2764 //
   2765 // This causes us to abort if u is zero, even though we shouldn't. This
   2766 // never happens in practice, because (i) a random point on the curve has
   2767 // a negligible chance of being zero, and (ii) scalar multiplication with
   2768 // a trimmed scalar *never* yields zero.
   2769 //
   2770 // Since:
   2771 //   isr * (u+A) = sqrt(-1     / (non_square * u * (u+A))) * (u+A)
   2772 //   isr * (u+A) = sqrt(-(u+A) / (non_square * u))
   2773 // and:
   2774 //   isr * u     = sqrt(-1 / (non_square * u * (u+A))) * u
   2775 //   isr * u     = sqrt(-u / (non_square * (u+A)))
   2776 // Therefore:
   2777 //   if v is positive: r = isr * (u+A)
   2778 //   if v is negative: r = isr * u
   2779 int crypto_curve_to_hidden(u8 hidden[32], const u8 public_key[32], u8 tweak)
   2780 {
   2781     fe t1, t2, t3;
   2782     fe_frombytes(t1, public_key);
   2783 
   2784     fe_add(t2, t1, A);
   2785     fe_mul(t3, t1, t2);
   2786     fe_mul_small(t3, t3, -2);
   2787     int is_square = invsqrt(t3, t3);
   2788     if (!is_square) {
   2789         // The only variable time bit.  This ultimately reveals how many
   2790         // tries it took us to find a representable key.
   2791         // This does not affect security as long as we try keys at random.
   2792         WIPE_BUFFER(t1);
   2793         WIPE_BUFFER(t2);
   2794         WIPE_BUFFER(t3);
   2795         return -1;
   2796     }
   2797     fe_ccopy    (t1, t2, tweak & 1);
   2798     fe_mul      (t3, t1, t3);
   2799     fe_mul_small(t1, t3, 2);
   2800     fe_neg      (t2, t3);
   2801     fe_ccopy    (t3, t2, fe_isodd(t1));
   2802     fe_tobytes(hidden, t3);
   2803 
   2804     // Pad with two random bits
   2805     hidden[31] |= tweak & 0xc0;
   2806 
   2807     WIPE_BUFFER(t1);
   2808     WIPE_BUFFER(t2);
   2809     WIPE_BUFFER(t3);
   2810     return 0;
   2811 }
   2812 
   2813 void crypto_hidden_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
   2814 {
   2815     u8 pk [32]; // public key
   2816     u8 buf[64]; // seed + representative
   2817     COPY(buf + 32, seed, 32);
   2818     do {
   2819         crypto_chacha20(buf, 0, 64, buf+32, zero);
   2820         crypto_x25519_dirty_fast(pk, buf); // or the "small" version
   2821     } while(crypto_curve_to_hidden(buf+32, pk, buf[32]));
   2822     // Note that the return value of crypto_curve_to_hidden() is
   2823 // independent of its tweak parameter.
   2824     // Therefore, buf[32] is not actually reused.  Either we loop one
   2825     // more time and buf[32] is used for the new seed, or we succeeded,
   2826     // and buf[32] becomes the tweak parameter.
   2827 
   2828     crypto_wipe(seed, 32);
   2829     COPY(hidden    , buf + 32, 32);
   2830     COPY(secret_key, buf     , 32);
   2831     WIPE_BUFFER(buf);
   2832     WIPE_BUFFER(pk);
   2833 }
   2834 
   2835 ////////////////////
   2836 /// Key exchange ///
   2837 ////////////////////
   2838 void crypto_key_exchange(u8       shared_key[32],
   2839                          const u8 your_secret_key [32],
   2840                          const u8 their_public_key[32])
   2841 {
   2842     crypto_x25519(shared_key, your_secret_key, their_public_key);
   2843     crypto_hchacha20(shared_key, shared_key, zero);
   2844 }
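        // Example (sketch, not part of the library): Alice and Bob derive the
        // same shared secret.  The key names are illustrative; the public
        // keys are assumed to come from crypto_x25519_public_key().
        //
        //     u8 shared[32];
        //     crypto_key_exchange(shared, alice_secret_key, bob_public_key);
        //     // Bob gets the same 32 bytes with:
        //     // crypto_key_exchange(shared, bob_secret_key, alice_public_key);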
   2845 
   2846 ///////////////////////
   2847 /// Scalar division ///
   2848 ///////////////////////
   2849 
   2850 // Montgomery reduction.
   2851 // Divides x by (2^256), and reduces the result modulo L
   2852 //
   2853 // Precondition:
   2854 //   x < L * 2^256
   2855 // Constants:
   2856 //   r = 2^256                 (makes division by r trivial)
   2857 //   k = (r * (1/r) - 1) // L  (1/r is computed modulo L   )
   2858 // Algorithm:
   2859 //   s = (x * k) % r
   2860 //   t = x + s*L      (t is always a multiple of r)
   2861 //   u = (t/r) % L    (u is always below 2*L, conditional subtraction is enough)
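        // Why t is a multiple of r: by construction k*L = r * (1/r) - 1, which
        // is -1 modulo r, so t = x + s*L = x + x*k*L = x - x = 0 (modulo r).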
   2862 static void redc(u32 u[8], u32 x[16])
   2863 {
   2864     static const u32 k[8]  = { 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
   2865                                0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,};
   2866     static const u32 l[8]  = { 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
   2867                                0x00000000, 0x00000000, 0x00000000, 0x10000000,};
   2868     // s = x * k (modulo 2^256)
   2869     // This is cheaper than the full multiplication.
   2870     u32 s[8] = {0};
   2871     FOR (i, 0, 8) {
   2872         u64 carry = 0;
   2873         FOR (j, 0, 8-i) {
   2874             carry  += s[i+j] + (u64)x[i] * k[j];
   2875             s[i+j]  = (u32)carry;
   2876             carry >>= 32;
   2877         }
   2878     }
   2879     u32 t[16] = {0};
   2880     multiply(t, s, l);
   2881 
   2882     // t = t + x
   2883     u64 carry = 0;
   2884     FOR (i, 0, 16) {
   2885         carry  += (u64)t[i] + x[i];
   2886         t[i]    = (u32)carry;
   2887         carry >>= 32;
   2888     }
   2889 
   2890     // u = (t / 2^256) % L
   2891     // Note that t / 2^256 is always below 2*L,
   2892     // So a constant time conditional subtraction is enough
   2893     // We work with L directly, in a 2's complement encoding
   2894     // (-L == ~L + 1)
   2895     remove_l(u, t+8);
   2896 
   2897     WIPE_BUFFER(s);
   2898     WIPE_BUFFER(t);
   2899 }
   2900 
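        // Scalar "division" (sketch of the intent): multiplies curve_point by
        // the modular inverse of the trimmed private key, so that for points
        // in the prime order subgroup, a subsequent
        // crypto_x25519(out, private_key, blind_salt) recovers curve_point.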
   2901 void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
   2902                            const u8 curve_point[32])
   2903 {
   2904     static const  u8 Lm2[32] = { // L - 2
   2905         0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2,
   2906         0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   2907         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, };
   2908     // 1 in Montgomery form
   2909     u32 m_inv [8] = {0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
   2910                      0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,};
   2911 
   2912     u8 scalar[32];
   2913     COPY(scalar, private_key, 32);
   2914     trim_scalar(scalar);
   2915 
   2916     // Convert the scalar to Montgomery form
   2917     // m_scl = scalar * 2^256 (modulo L)
   2918     u32 m_scl[8];
   2919     {
   2920         u32 tmp[16];
   2921         ZERO(tmp, 8);
   2922         load32_le_buf(tmp+8, scalar, 8);
   2923         mod_l(scalar, tmp);
   2924         load32_le_buf(m_scl, scalar, 8);
   2925         WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
   2926     }
   2927 
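            // Fermat inversion: m_inv = m_scl^(L-2) = 1/m_scl (modulo L),
            // computed by square-and-multiply over the bits of Lm2 (L is prime).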
   2928     u32 product[16];
   2929     for (int i = 252; i >= 0; i--) {
   2930         ZERO(product, 16);
   2931         multiply(product, m_inv, m_inv);
   2932         redc(m_inv, product);
   2933         if (scalar_bit(Lm2, i)) {
   2934             ZERO(product, 16);
   2935             multiply(product, m_inv, m_scl);
   2936             redc(m_inv, product);
   2937         }
   2938     }
   2939     // Convert the inverse *out* of Montgomery form
   2940     // scalar = m_inv / 2^256 (modulo L)
   2941     COPY(product, m_inv, 8);
   2942     ZERO(product + 8, 8);
   2943     redc(m_inv, product);
   2944     store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar
   2945 
   2946     // Clear the cofactor of scalar:
   2947     //   cleared = scalar * (3*L + 1)      (modulo 8*L)
   2948     //   cleared = scalar + scalar * 3 * L (modulo 8*L)
   2949     // Note that (scalar * 3) is reduced modulo 8, so we only need the
   2950     // first byte.
   2951     add_xl(scalar, scalar[0] * 3);
   2952 
   2953     // Recall that 8*L < 2^256. However it is also very close to
   2954     // 2^255. If we spanned the ladder over 255 bits, random tests
   2955     // wouldn't catch the off-by-one error.
   2956     scalarmult(blind_salt, scalar, curve_point, 256);
   2957 
   2958     WIPE_BUFFER(scalar);   WIPE_BUFFER(m_scl);
   2959     WIPE_BUFFER(product);  WIPE_BUFFER(m_inv);
   2960 }
   2961 
   2962 ////////////////////////////////
   2963 /// Authenticated encryption ///
   2964 ////////////////////////////////
   2965 static void lock_auth(u8 mac[16], const u8  auth_key[32],
   2966                       const u8 *ad         , size_t ad_size,
   2967                       const u8 *cipher_text, size_t text_size)
   2968 {
   2969     u8 sizes[16]; // Not secret, not wiped
   2970     store64_le(sizes + 0, ad_size);
   2971     store64_le(sizes + 8, text_size);
   2972     crypto_poly1305_ctx poly_ctx;           // auto wiped...
   2973     crypto_poly1305_init  (&poly_ctx, auth_key);
   2974     crypto_poly1305_update(&poly_ctx, ad         , ad_size);
   2975     crypto_poly1305_update(&poly_ctx, zero       , align(ad_size, 16));
   2976     crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
   2977     crypto_poly1305_update(&poly_ctx, zero       , align(text_size, 16));
   2978     crypto_poly1305_update(&poly_ctx, sizes      , 16);
   2979     crypto_poly1305_final (&poly_ctx, mac); // ...here
   2980 }
   2981 
   2982 void crypto_lock_aead(u8 mac[16], u8 *cipher_text,
   2983                       const u8  key[32], const u8  nonce[24],
   2984                       const u8 *ad        , size_t ad_size,
   2985                       const u8 *plain_text, size_t text_size)
   2986 {
   2987     u8 sub_key[32];
   2988     u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
   2989     crypto_hchacha20(sub_key, key, nonce);
   2990     crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
   2991     crypto_chacha20_ctr(cipher_text, plain_text, text_size,
   2992                         sub_key, nonce + 16, 1);
   2993     lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
   2994     WIPE_BUFFER(sub_key);
   2995     WIPE_BUFFER(auth_key);
   2996 }
   2997 
   2998 int crypto_unlock_aead(u8 *plain_text, const u8 key[32], const u8 nonce[24],
   2999                        const u8  mac[16],
   3000                        const u8 *ad         , size_t ad_size,
   3001                        const u8 *cipher_text, size_t text_size)
   3002 {
   3003     u8 sub_key[32];
   3004     u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
   3005     crypto_hchacha20(sub_key, key, nonce);
   3006     crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
   3007     u8 real_mac[16];
   3008     lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
   3009     WIPE_BUFFER(auth_key);
   3010     if (crypto_verify16(mac, real_mac)) {
   3011         WIPE_BUFFER(sub_key);
   3012         WIPE_BUFFER(real_mac);
   3013         return -1;
   3014     }
   3015     crypto_chacha20_ctr(plain_text, cipher_text, text_size,
   3016                         sub_key, nonce + 16, 1);
   3017     WIPE_BUFFER(sub_key);
   3018     WIPE_BUFFER(real_mac);
   3019     return 0;
   3020 }
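        // Example (sketch, not part of the library): an authenticated
        // encryption round trip, in place, with additional data.  The key and
        // nonce are placeholders; real callers need a random key and a nonce
        // that is never reused with the same key.
        //
        //     u8 key  [32] = {1};
        //     u8 nonce[24] = {2};
        //     u8 ad   [4]  = "head";
        //     u8 text [5]  = "Lorem";
        //     u8 mac  [16];
        //     crypto_lock_aead(mac, text, key, nonce, ad, 4, text, 5);
        //     if (crypto_unlock_aead(text, key, nonce, mac, ad, 4, text, 5)) {
        //         // corrupted or forged message: the mac does not match,
        //         // and text has not been decrypted
        //     }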
   3021 
   3022 void crypto_lock(u8 mac[16], u8 *cipher_text,
   3023                  const u8 key[32], const u8 nonce[24],
   3024                  const u8 *plain_text, size_t text_size)
   3025 {
   3026     crypto_lock_aead(mac, cipher_text, key, nonce, 0, 0, plain_text, text_size);
   3027 }
   3028 
   3029 int crypto_unlock(u8 *plain_text,
   3030                   const u8 key[32], const u8 nonce[24], const u8 mac[16],
   3031                   const u8 *cipher_text, size_t text_size)
   3032 {
   3033     return crypto_unlock_aead(plain_text, key, nonce, mac, 0, 0,
   3034                               cipher_text, text_size);
   3035 }