trezor-firmware/crypto/bignum.c

/**
 * Copyright (c) 2013-2014 Tomas Dzetkulic
 * Copyright (c) 2013-2014 Pavol Rusnak
 * Copyright (c)      2015 Jochen Hoenicke
 * Copyright (c)      2016 Alex Beregszaszi
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
 * OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "bignum.h"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "memzero.h"
#include "script.h"

/*
 This library implements 256-bit numbers arithmetic.

 An unsigned 256-bit number is represented by a bignum256 structure, that is an
 array of nine 32-bit values called limbs. Limbs are digits of the number in
 the base 2**29 representation in the little endian order. This means that
   bignum256 x;
 represents the value
   sum([x[i] * 2**(29*i) for i in range(9)).

 A limb of a bignum256 is *normalized* iff it's less than 2**29.
 A bignum256 is *normalized* iff every its limb is normalized.
 A number is *fully reduced modulo p* iff it is less than p.
 A number is *partly reduced modulo p* iff is is less than 2*p.
 The number p is usually a prime number such that 2^256 - 2^224 <= p <= 2^256.

 All functions except bn_fast_mod expect that all their bignum256 inputs are
 normalized. (The function bn_fast_mod allows the input number to have the
 most significant limb unnormalized). All bignum256 outputs of all functions
 are guaranteed to be  normalized.

 A number can be partly reduced with bn_fast_mod, a partly reduced number can
 be fully reduced with bn_mod.

 A function has *constant control flow with regard to its argument* iff the
 order in which instructions of the function are executed doesn't depend on the
 value of the argument.
 A function has *constant memory access flow with regard to its argument* iff
 the memory addresses that are acessed and the order in which they are accessed
 don't depend on the value of the argument.
 A function *has contant control (memory access) flow* iff it has constant
 control (memory access) flow with regard to all its arguments.

 The following function has contant control flow with regard to its arugment
 n, however is doesn't have constant memory access flow with regard to it:
 void (int n, int *a) }
   a[0] = 0;
   a[n] = 0; // memory address reveals the value of n
 }

 Unless stated otherwise all functions are supposed to have both constant
 control flow and constant memory access flow.
 */

#define BN_MAX_DECIMAL_DIGITS \
  79  // floor(log(2**(LIMBS * BITS_PER_LIMB), 10)) + 1

// out_number = (bignum256) in_number
// Assumes in_number is a raw bigendian 256-bit number
// Guarantees out_number is normalized
void bn_read_be(const uint8_t *in_number, bignum256 *out_number) {
  uint32_t temp = 0;

  for (int i = 0; i < BN_LIMBS - 1; i++) {
    uint32_t limb = read_be(in_number + (BN_LIMBS - 2 - i) * 4);

    temp |= limb << (BN_EXTRA_BITS * i);
    out_number->val[i] = temp & BN_LIMB_MASK;

    temp = limb >> (32 - BN_EXTRA_BITS * (i + 1));
  }

  out_number->val[BN_LIMBS - 1] = temp;
}

// out_number = (256BE) in_number
// Assumes in_number < 2**256
// Guarantess out_number is a raw bigendian 256-bit number
void bn_write_be(const bignum256 *in_number, uint8_t *out_number) {
  uint32_t temp = in_number->val[BN_LIMBS - 1];
  for (int i = BN_LIMBS - 2; i >= 0; i--) {
    uint32_t limb = in_number->val[i];

    temp = (temp << (BN_BITS_PER_LIMB - BN_EXTRA_BITS * i)) |
           (limb >> (BN_EXTRA_BITS * i));
    write_be(out_number + (BN_LIMBS - 2 - i) * 4, temp);

    temp = limb;
  }
}

// out_number = (bignum256) in_number
// Assumes in_number is a raw little endian 256-bit number
// Guarantees out_number is normalized
void bn_read_le(const uint8_t *in_number, bignum256 *out_number) {
  uint32_t temp = 0;
  for (int i = 0; i < BN_LIMBS - 1; i++) {
    uint32_t limb = read_le(in_number + i * 4);

    temp |= limb << (BN_EXTRA_BITS * i);
    out_number->val[i] = temp & BN_LIMB_MASK;
    temp = limb >> (32 - BN_EXTRA_BITS * (i + 1));
  }

  out_number->val[BN_LIMBS - 1] = temp;
}

// out_number = (256LE) in_number
// Assumes in_number < 2**256
// Guarantess out_number is a raw little endian 256-bit number
void bn_write_le(const bignum256 *in_number, uint8_t *out_number) {
  uint32_t temp = in_number->val[BN_LIMBS - 1];

  for (int i = BN_LIMBS - 2; i >= 0; i--) {
    uint32_t limb = in_number->val[i];
    temp = (temp << (BN_BITS_PER_LIMB - BN_EXTRA_BITS * i)) |
           (limb >> (BN_EXTRA_BITS * i));
    write_le(out_number + i * 4, temp);
    temp = limb;
  }
}

// out_number = (bignum256) in_number
// Guarantees out_number is normalized
void bn_read_uint32(uint32_t in_number, bignum256 *out_number) {
  out_number->val[0] = in_number & BN_LIMB_MASK;
  out_number->val[1] = in_number >> BN_BITS_PER_LIMB;
  for (uint32_t i = 2; i < BN_LIMBS; i++) out_number->val[i] = 0;
}

// out_number = (bignum256) in_number
// Guarantees out_number is normalized
void bn_read_uint64(uint64_t in_number, bignum256 *out_number) {
  out_number->val[0] = in_number & BN_LIMB_MASK;
  out_number->val[1] = (in_number >>= BN_BITS_PER_LIMB) & BN_LIMB_MASK;
  out_number->val[2] = in_number >> BN_BITS_PER_LIMB;
  for (uint32_t i = 3; i < BN_LIMBS; i++) out_number->val[i] = 0;
}

// Returns the bitsize of x
// Assumes x is normalized
// The function doesn't have neither constant control flow nor constant memory
//   access flow
int bn_bitcount(const bignum256 *x) {
  for (int i = BN_LIMBS - 1; i >= 0; i--) {
    uint32_t limb = x->val[i];
    if (limb != 0) {
      // __builtin_clz returns the number of leading zero bits starting at the
      // most significant bit position
      return i * BN_BITS_PER_LIMB + (32 - __builtin_clz(limb));
    }
  }
  return 0;
}

// Returns the number of decimal digits of x; if x is 0, returns 1
// Assumes x is normalized
// The function doesn't have neither constant control flow nor constant memory
//   access flow
unsigned int bn_digitcount(const bignum256 *x) {
  bignum256 val = {0};
  bn_copy(x, &val);

  unsigned int digits = 1;
  for (unsigned int i = 0; i < BN_MAX_DECIMAL_DIGITS; i += 3) {
    uint32_t limb = 0;

    bn_divmod1000(&val, &limb);

    if (limb >= 100) {
      digits = i + 3;
    } else if (limb >= 10) {
      digits = i + 2;
    } else if (limb >= 1) {
      digits = i + 1;
    }
  }

  memzero(&val, sizeof(val));

  return digits;
}

// x = 0
// Guarantees x is normalized
void bn_zero(bignum256 *x) {
  for (int i = 0; i < BN_LIMBS; i++) {
    x->val[i] = 0;
  }
}

// x = 1
// Guarantees x is normalized
void bn_one(bignum256 *x) {
  x->val[0] = 1;
  for (int i = 1; i < BN_LIMBS; i++) {
    x->val[i] = 0;
  }
}

// Returns x == 0
// Assumes x is normalized
int bn_is_zero(const bignum256 *x) {
  uint32_t result = 0;
  for (int i = 0; i < BN_LIMBS; i++) {
    result |= x->val[i];
  }
  return !result;
}

// Returns x == 1
// Assumes x is normalized
int bn_is_one(const bignum256 *x) {
  uint32_t result = x->val[0] ^ 1;
  for (int i = 1; i < BN_LIMBS; i++) {
    result |= x->val[i];
  }
  return !result;
}

// Returns x < y
// Assumes x, y are normalized
int bn_is_less(const bignum256 *x, const bignum256 *y) {
  uint32_t res1 = 0;
  uint32_t res2 = 0;
  for (int i = BN_LIMBS - 1; i >= 0; i--) {
    res1 = (res1 << 1) | (x->val[i] < y->val[i]);
    res2 = (res2 << 1) | (x->val[i] > y->val[i]);
  }
  return res1 > res2;
}

// Returns x == y
// Assumes x, y are normalized
int bn_is_equal(const bignum256 *x, const bignum256 *y) {
  uint32_t result = 0;
  for (int i = 0; i < BN_LIMBS; i++) {
    result |= x->val[i] ^ y->val[i];
  }
  return !result;
}

// res = cond if truecase else falsecase
// Assumes cond is either 0 or 1
// Works properly even if &res == &truecase or &res == &falsecase or
//   &truecase == &falsecase or &res == &truecase == &falsecase
void bn_cmov(bignum256 *res, volatile uint32_t cond, const bignum256 *truecase,
             const bignum256 *falsecase) {
  // Intentional use of bitwise OR operator to ensure constant-time
  assert((int)(cond == 1) | (int)(cond == 0));

  uint32_t tmask = -cond;   // tmask = 0xFFFFFFFF if cond else 0x00000000
  uint32_t fmask = ~tmask;  // fmask = 0x00000000 if cond else 0xFFFFFFFF

  for (int i = 0; i < BN_LIMBS; i++) {
    res->val[i] = (truecase->val[i] & tmask) | (falsecase->val[i] & fmask);
  }
}

// x = -x % prime if cond else x,
// Explicitly x = (3 * prime - x if x > prime else 2 * prime - x) if cond else
//   else (x if x > prime else x + prime)
// Assumes x is normalized and partly reduced
// Assumes cond is either 1 or 0
// Guarantees x is normalized
// Assumes prime is normalized and
//   0 < prime < 2**260 == 2**(BITS_PER_LIMB * LIMBS - 1)
void bn_cnegate(volatile uint32_t cond, bignum256 *x, const bignum256 *prime) {
  // Intentional use of bitwise OR operator to ensure constant time
  assert((int)(cond == 1) | (int)(cond == 0));

  uint32_t tmask = -cond;   // tmask = 0xFFFFFFFF if cond else 0x00000000
  uint32_t fmask = ~tmask;  // fmask = 0x00000000 if cond else 0xFFFFFFFF

  bn_mod(x, prime);
  // x < prime

  uint32_t acc1 = 1;
  uint32_t acc2 = 0;

  for (int i = 0; i < BN_LIMBS; i++) {
    acc1 += (BN_BASE - 1) + 2 * prime->val[i] - x->val[i];
    // acc1 neither overflows 32 bits nor underflows 0
    // Proof:
    //   acc1 + (BASE - 1) + 2 * prime[i] - x[i]
    //     >= (BASE - 1) - x >= (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1)
    //     == 0
    //   acc1 + (BASE - 1) + 2 * prime[i] - x[i]
    //     <= acc1 + (BASE - 1) + 2 * prime[i]
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 2 * (2**BITS_PER_LIMB - 1) +
    //       (2**BITS_PER_LIMB - 1)
    //     == 7 + 3 * 2**29 < 2**32

    acc2 += prime->val[i] + x->val[i];
    // acc2 doesn't overflow 32 bits
    // Proof:
    //   acc2 + prime[i] + x[i]
    //     <= 2**(32 - BITS_PER_LIMB) - 1 + 2 * (2**BITS_PER_LIMB - 1)
    //     == 2**(32 - BITS_PER_LIMB) + 2**(BITS_PER_LIMB + 1) - 2
    //     == 2**30 + 5 < 2**32

    // x = acc1 & LIMB_MASK if cond else acc2 & LIMB_MASK
    x->val[i] = ((acc1 & tmask) | (acc2 & fmask)) & BN_LIMB_MASK;

    acc1 >>= BN_BITS_PER_LIMB;
    // acc1 <= 7 == 2**(32 - BITS_PER_LIMB) - 1
    // acc1 == 2**(BITS_PER_LIMB * (i + 1)) + 2 * prime[:i + 1] - x[:i + 1]
    //   >> BITS_PER_LIMB * (i + 1)

    acc2 >>= BN_BITS_PER_LIMB;
    // acc2 <= 7 == 2**(32 - BITS_PER_LIMB) - 1
    // acc2 == prime[:i + 1] + x[:i + 1] >> BITS_PER_LIMB * (i + 1)
  }

  // assert(acc1 == 1); // assert prime <= 2**260
  // assert(acc2 == 0);

  // clang-format off
  // acc1 == 1
  // Proof:
  //   acc1 == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime[:LIMBS] - x[:LIMBS] >> BITS_PER_LIMB * LIMBS
  //     == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime - x >> BITS_PER_LIMB * LIMBS
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2 * prime >> BITS_PER_LIMB * LIMBS
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2 * (2**(BITS_PER_LIMB * LIMBS - 1) - 1) >> BITS_PER_LIMB * LIMBS
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2**(BITS_PER_LIMB * LIMBS) - 2 >> BITS_PER_LIMB * LIMBS
  //     == 1

  //   acc1 == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime[:LIMBS] - x[:LIMBS] >> BITS_PER_LIMB * LIMBS
  //     == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime - x >> BITS_PER_LIMB * LIMBS
  //     >= 2**(BITS_PER_LIMB * LIMBS) + 0 >> BITS_PER_LIMB * LIMBS
  //     == 1

  // acc2 == 0
  // Proof:
  //   acc2 == prime[:LIMBS] + x[:LIMBS] >> BITS_PER_LIMB * LIMBS
  //     == prime + x >> BITS_PER_LIMB * LIMBS
  //     <= 2 * prime - 1 >> BITS_PER_LIMB * LIMBS
  //     <= 2 * (2**(BITS_PER_LIMB * LIMBS - 1) - 1) - 1 >> 261
  //     == 2**(BITS_PER_LIMB * LIMBS) - 3 >> BITS_PER_LIMB * LIMBS
  //     == 0
  // clang-format on
}

// x <<= 1
// Assumes x is normalized, x < 2**260 == 2**(LIMBS*BITS_PER_LIMB - 1)
// Guarantees x is normalized
void bn_lshift(bignum256 *x) {
  for (int i = BN_LIMBS - 1; i > 0; i--) {
    x->val[i] = ((x->val[i] << 1) & BN_LIMB_MASK) |
                (x->val[i - 1] >> (BN_BITS_PER_LIMB - 1));
  }
  x->val[0] = (x->val[0] << 1) & BN_LIMB_MASK;
}

// x >>= 1, i.e. x = floor(x/2)
// Assumes x is normalized
// Guarantees x is normalized
// If x is partly reduced (fully reduced) modulo prime,
//   guarantess x will be partly reduced (fully reduced) modulo prime
void bn_rshift(bignum256 *x) {
  for (int i = 0; i < BN_LIMBS - 1; i++) {
    x->val[i] =
        (x->val[i] >> 1) | ((x->val[i + 1] & 1) << (BN_BITS_PER_LIMB - 1));
  }
  x->val[BN_LIMBS - 1] >>= 1;
}

// Sets i-th least significant bit (counting from zero)
// Assumes x is normalized and 0 <= i < 261 == LIMBS*BITS_PER_LIMB
// Guarantees x is normalized
// The function has constant control flow but not constant memory access flow
//   with regard to i
void bn_setbit(bignum256 *x, uint16_t i) {
  assert(i < BN_LIMBS * BN_BITS_PER_LIMB);
  x->val[i / BN_BITS_PER_LIMB] |= (1u << (i % BN_BITS_PER_LIMB));
}

// clears i-th least significant bit (counting from zero)
// Assumes x is normalized and 0 <= i < 261 == LIMBS*BITS_PER_LIMB
// Guarantees x is normalized
// The function has constant control flow but not constant memory access flow
//   with regard to i
void bn_clearbit(bignum256 *x, uint16_t i) {
  assert(i < BN_LIMBS * BN_BITS_PER_LIMB);
  x->val[i / BN_BITS_PER_LIMB] &= ~(1u << (i % BN_BITS_PER_LIMB));
}

// returns i-th least significant bit (counting from zero)
// Assumes x is normalized and 0 <= i < 261 == LIMBS*BITS_PER_LIMB
// The function has constant control flow but not constant memory access flow
//   with regard to i
uint32_t bn_testbit(const bignum256 *x, uint16_t i) {
  assert(i < BN_LIMBS * BN_BITS_PER_LIMB);
  return (x->val[i / BN_BITS_PER_LIMB] >> (i % BN_BITS_PER_LIMB)) & 1;
}

// res = x ^ y
// Assumes x, y are normalized
// Guarantees res is normalized
// Works properly even if &res == &x or &res == &y or &res == &x == &y
void bn_xor(bignum256 *res, const bignum256 *x, const bignum256 *y) {
  for (int i = 0; i < BN_LIMBS; i++) {
    res->val[i] = x->val[i] ^ y->val[i];
  }
}

// x = x / 2 % prime
// Explicitly x = x / 2 if is_even(x) else (x + prime) / 2
// Assumes x is normalized, x + prime < 261 == LIMBS * BITS_PER_LIMB
// Guarantees x is normalized
// If x is partly reduced (fully reduced) modulo prime,
//   guarantess x will be partly reduced (fully reduced) modulo prime
// Assumes prime is an odd number and normalized
void bn_mult_half(bignum256 *x, const bignum256 *prime) {
  // x = x / 2 if is_even(x) else (x + prime) / 2

  uint32_t x_is_odd_mask =
      -(x->val[0] & 1);  // x_is_odd_mask = 0xFFFFFFFF if is_odd(x) else 0

  uint32_t acc = (x->val[0] + (prime->val[0] & x_is_odd_mask)) >> 1;
  // acc < 2**BITS_PER_LIMB
  // Proof:
  //   acc == x[0] + prime[0] & x_is_odd_mask >> 1
  //     <= (2**(BITS_PER_LIMB) - 1) + (2**(BITS_PER_LIMB) - 1) >> 1
  //     == 2**(BITS_PER_LIMB + 1) - 2 >> 1
  //     <  2**(BITS_PER_LIMB)

  for (int i = 0; i < BN_LIMBS - 1; i++) {
    uint32_t temp = (x->val[i + 1] + (prime->val[i + 1] & x_is_odd_mask));
    // temp < 2**(BITS_PER_LIMB + 1)
    // Proof:
    //   temp == x[i + 1] + val[i + 1] & x_is_odd_mask
    //     <= (2**(BITS_PER_LIMB) - 1) + (2**(BITS_PER_LIMB) - 1)
    //     <  2**(BITS_PER_LIMB + 1)

    acc += (temp & 1) << (BN_BITS_PER_LIMB - 1);
    // acc doesn't overflow 32 bits
    // Proof:
    //   acc + (temp & 1 << BITS_PER_LIMB - 1)
    //     <= 2**(BITS_PER_LIMB + 1) + 2**(BITS_PER_LIMB - 1)
    //     <= 2**30 + 2**28 < 2**32

    x->val[i] = acc & BN_LIMB_MASK;
    acc >>= BN_BITS_PER_LIMB;
    acc += temp >> 1;
    // acc < 2**(BITS_PER_LIMB + 1)
    // Proof:
    //   acc + (temp >> 1)
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + (2**(BITS_PER_LIMB + 1) - 1 >> 1)
    //     == 7 + 2**(BITS_PER_LIMB) - 1 < 2**(BITS_PER_LIMB + 1)

    // acc == x[:i+2]+(prime[:i+2] & x_is_odd_mask) >> BITS_PER_LIMB * (i+1)
  }
  x->val[BN_LIMBS - 1] = acc;

  // assert(acc >> BITS_PER_LIMB == 0);
  // acc >> BITS_PER_LIMB == 0
  // Proof:
  //   acc
  //     == x[:LIMBS] + (prime[:LIMBS] & x_is_odd_mask) >> BITS_PER_LIMB*LIMBS
  //     == x + (prime & x_is_odd_mask) >> BITS_PER_LIMB * LIMBS
  //     <= x + prime >> BITS_PER_LIMB * LIMBS
  //     <= 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
  //     == 0
}

// x = x * k % prime
// Assumes x is normalized, 0 <= k <= 8 = 2**(32 - BITS_PER_LIMB)
// Assumes prime is normalized and 2^256 - 2^224 <= prime <= 2^256
// Guarantees x is normalized and partly reduced modulo prime
void bn_mult_k(bignum256 *x, uint8_t k, const bignum256 *prime) {
  assert(k <= 8);

  for (int i = 0; i < BN_LIMBS; i++) {
    x->val[i] = k * x->val[i];
    // x[i] doesn't overflow 32 bits
    // k * x[i] <= 2**(32 - BITS_PER_LIMB) * (2**BITS_PER_LIMB - 1)
    //   < 2**(32 - BITS_PER_LIMB) * 2**BITS_PER_LIMB == 2**32
  }

  bn_fast_mod(x, prime);
}

// Reduces partly reduced x modulo prime
// Explicitly x = x if x < prime else x - prime
// Assumes x is partly reduced modulo prime
// Guarantees x is fully reduced modulo prime
// Assumes prime is nonzero and normalized
void bn_mod(bignum256 *x, const bignum256 *prime) {
  uint32_t x_less_prime = bn_is_less(x, prime);

  bignum256 temp = {0};
  bn_subtract(x, prime, &temp);
  bn_cmov(x, x_less_prime, x, &temp);

  memzero(&temp, sizeof(temp));
}

// Auxiliary function for bn_multiply
// res = k * x
// Assumes k and x are normalized
// Guarantees res is normalized 18 digit little endian number in base 2**29
void bn_multiply_long(const bignum256 *k, const bignum256 *x,
                      uint32_t res[2 * BN_LIMBS]) {
  // Uses long multiplication in base 2**29, see
  // https://en.wikipedia.org/wiki/Multiplication_algorithm#Long_multiplication

  uint64_t acc = 0;

  // compute lower half
  for (int i = 0; i < BN_LIMBS; i++) {
    for (int j = 0; j <= i; j++) {
      acc += k->val[j] * (uint64_t)x->val[i - j];
      // acc doesn't overflow 64 bits
      // Proof:
      //   acc <= acc + sum([k[j] * x[i-j] for j in range(i)])
      //     <= (2**(64 - BITS_PER_LIMB) - 1) +
      //       LIMBS * (2**BITS_PER_LIMB - 1) * (2**BITS_PER_LIMB - 1)
      //     == (2**35 - 1) + 9 * (2**29 - 1) * (2**29 - 1)
      //     <= 2**35 + 9 * 2**58 < 2**64
    }

    res[i] = acc & BN_LIMB_MASK;
    acc >>= BN_BITS_PER_LIMB;
    // acc <= 2**35 - 1 == 2**(64 - BITS_PER_LIMB) - 1
  }

  // compute upper half
  for (int i = BN_LIMBS; i < 2 * BN_LIMBS - 1; i++) {
    for (int j = i - BN_LIMBS + 1; j < BN_LIMBS; j++) {
      acc += k->val[j] * (uint64_t)x->val[i - j];
      // acc doesn't overflow 64 bits
      // Proof:
      //   acc <= acc + sum([k[j] * x[i-j] for j in range(i)])
      //     <= (2**(64 - BITS_PER_LIMB) - 1)
      //       LIMBS * (2**BITS_PER_LIMB - 1) * (2**BITS_PER_LIMB - 1)
      //     == (2**35 - 1) + 9 * (2**29 - 1) * (2**29 - 1)
      //     <= 2**35 + 9 * 2**58 < 2**64
    }

    res[i] = acc & (BN_BASE - 1);
    acc >>= BN_BITS_PER_LIMB;
    // acc < 2**35 == 2**(64 - BITS_PER_LIMB)
  }

  res[2 * BN_LIMBS - 1] = acc;
}

// Auxiliary function for bn_multiply
// Assumes 0 <= d <= 8 == LIMBS - 1
// Assumes res is normalized and res < 2**(256 + 29*d + 31)
// Guarantess res in normalized and res < 2 * prime * 2**(29*d)
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
void bn_multiply_reduce_step(uint32_t res[2 * BN_LIMBS], const bignum256 *prime,
                             uint32_t d) {
  // clang-format off
  // Computes res = res - (res // 2**(256 + BITS_PER_LIMB * d)) * prime * 2**(BITS_PER_LIMB * d)

  // res - (res // 2**(256 + BITS_PER_LIMB * d)) * prime * 2**(BITS_PER_LIMB * d) < 2 * prime * 2**(BITS_PER_LIMB * d)
  // Proof:
  //   res - res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * prime
  //     == res - res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * (2**256 - (2**256 - prime))
  //     == res - res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * 2**256 + res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * (2**256 - prime)
  //     == (res % 2**(256 + BITS_PER_LIMB * d)) + res // (2**256 + BITS_PER_LIMB * d) * 2**(BITS_PER_LIMB * d) * (2**256 - prime)
  //     <= (2**(256 + 29*d + 31) % 2**(256 + 29*d)) + (2**(256 + 29*d + 31) - 1) / (2**256 + 29*d) * 2**(29*d) * (2**256 - prime)
  //     <= 2**(256 + 29*d) + 2**(256 + 29*d + 31) / (2**256 + 29*d) * 2**(29*d) * (2**256 - prime)
  //     == 2**(256 + 29*d) + 2**31 * 2**(29*d) * (2**256 - prime)
  //     == 2**(29*d) * (2**256 + 2**31 * (2*256 - prime))
  //     <= 2**(29*d) * (2**256 + 2**31 * 2*224)
  //     <= 2**(29*d) * (2**256 + 2**255)
  //     <= 2**(29*d) * 2 * (2**256 - 2**224)
  //     <= 2 * prime * 2**(29*d)
  // clang-format on

  uint32_t coef =
      (res[d + BN_LIMBS - 1] >> (256 - (BN_LIMBS - 1) * BN_BITS_PER_LIMB)) +
      (res[d + BN_LIMBS] << ((BN_LIMBS * BN_BITS_PER_LIMB) - 256));

  // coef == res // 2**(256 + BITS_PER_LIMB * d)

  // coef <  2**31
  // Proof:
  //   coef == res // 2**(256 + BITS_PER_LIMB * d)
  //     <  2**(256 + 29 * d + 31) // 2**(256 + 29 * d)
  //     == 2**31

  const int shift = 31;
  uint64_t acc = 1ull << shift;

  for (int i = 0; i < BN_LIMBS; i++) {
    acc += (((uint64_t)(BN_BASE - 1)) << shift) + res[d + i] -
           prime->val[i] * (uint64_t)coef;
    // acc neither overflow 64 bits nor underflow zero
    // Proof:
    //   acc + ((BASE - 1) << shift) + res[d + i] - prime[i] * coef
    //     >= ((BASE - 1) << shift) - prime[i] * coef
    //     == 2**shift * (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1) *
    //       (2**31 - 1)
    //     == (2**shift - 2**31 + 1) * (2**BITS_PER_LIMB - 1)
    //     == (2**31 - 2**31 + 1) * (2**29 - 1)
    //     == 2**29 - 1 > 0
    //   acc + ((BASE - 1) << shift) + res[d + i] - prime[i] * coef
    //     <= acc + ((BASE - 1) << shift) + res[d+i]
    //     <= (2**(64 - BITS_PER_LIMB) - 1) + 2**shift * (2**BITS_PER_LIMB - 1)
    //       + (2*BITS_PER_LIMB - 1)
    //     == (2**(64 - BITS_PER_LIMB) - 1) + (2**shift + 1) *
    //       (2**BITS_PER_LIMB - 1)
    //     == (2**35 - 1) + (2**31 + 1) * (2**29 - 1)
    //     <= 2**35 + 2**60 + 2**29 < 2**64

    res[d + i] = acc & BN_LIMB_MASK;
    acc >>= BN_BITS_PER_LIMB;
    // acc <= 2**(64 - BITS_PER_LIMB) - 1 == 2**35 - 1

    // acc == (1 << BITS_PER_LIMB * (i + 1) + shift) + res[d : d + i + 1]
    //   - coef * prime[:i + 1] >> BITS_PER_LIMB * (i + 1)
  }

  // acc += (((uint64_t)(BASE - 1)) << shift) + res[d + LIMBS];
  // acc >>= BITS_PER_LIMB;
  // assert(acc <= 1ul << shift);

  // clang-format off
  // acc == 1 << shift
  // Proof:
  //   acc
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + res[d : d + LIMBS + 1] - coef * prime[:LIMBS] >> BITS_PER_LIMB * (LIMBS + 1)
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + res[d : d + LIMBS + 1] - coef * prime >> BITS_PER_LIMB * (LIMBS + 1)

  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (res[d : d + LIMBS + 1] - coef * prime) >> BITS_PER_LIMB * (LIMBS + 1)
  //     <= (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (res[:d] + BASE**d * res[d : d + LIMBS + 1] - BASE**d * coef * prime)//BASE**d >> BITS_PER_LIMB * (LIMBS + 1)
  //     <= (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (res - BASE**d * coef * prime) // BASE**d >> BITS_PER_LIMB * (LIMBS + 1)
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (2 * prime * BASE**d) // BASE**d >> BITS_PER_LIMB * (LIMBS + 1)
  //     <= (1 << 321) + 2 * 2**256 >> 290
  //     == 1 << 31 == 1 << shift

  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + res[d : d + LIMBS + 1] - coef * prime[:LIMBS + 1] >> BITS_PER_LIMB * (LIMBS + 1)
  //     >= (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + 0 >> BITS_PER_LIMB * (LIMBS + 1)
  //     == 1 << shift
  // clang-format on

  res[d + BN_LIMBS] = 0;
}

// Auxiliary function for bn_multiply
// Partly reduces res and stores both in x and res
// Assumes res in normalized and res < 2**519
// Guarantees x is normalized and partly reduced modulo prime
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
void bn_multiply_reduce(bignum256 *x, uint32_t res[2 * BN_LIMBS],
                        const bignum256 *prime) {
  for (int i = BN_LIMBS - 1; i >= 0; i--) {
    // res < 2**(256 + 29*i + 31)
    // Proof:
    //   if i == LIMBS - 1:
    //     res < 2**519
    //       == 2**(256 + 29 * 8 + 31)
    //       == 2**(256 + 29 * (LIMBS - 1) + 31)
    //   else:
    //     res < 2 * prime * 2**(29 * (i + 1))
    //       <= 2**256 * 2**(29*i + 29) < 2**(256 + 29*i + 31)
    bn_multiply_reduce_step(res, prime, i);
  }

  for (int i = 0; i < BN_LIMBS; i++) {
    x->val[i] = res[i];
  }
}

// x = k * x % prime
// Assumes k, x are normalized, k * x < 2**519
// Guarantees x is normalized and partly reduced modulo prime
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
void bn_multiply(const bignum256 *k, bignum256 *x, const bignum256 *prime) {
  uint32_t res[2 * BN_LIMBS] = {0};

  bn_multiply_long(k, x, res);
  bn_multiply_reduce(x, res, prime);

  memzero(res, sizeof(res));
}

// Partly reduces x modulo prime
// Assumes limbs of x except the last (the most significant) one are normalized
// Assumes prime is normalized and 2^256 - 2^224 <= prime <= 2^256
// Guarantees x is normalized and partly reduced modulo prime
void bn_fast_mod(bignum256 *x, const bignum256 *prime) {
  // Computes x = x - (x // 2**256) * prime

  // x < 2**((LIMBS - 1) * BITS_PER_LIMB + 32) == 2**264

  // x - (x // 2**256) * prime < 2 * prime
  // Proof:
  //   x - (x // 2**256) * prime
  //     == x - (x // 2**256) * (2**256 - (2**256 - prime))
  //     == x - ((x // 2**256) * 2**256) + (x // 2**256) * (2**256 - prime)
  //     == (x % prime) + (x // 2**256) * (2**256 - prime)
  //     <= prime - 1 + (2**264 // 2**256) * (2**256 - prime)
  //     <= 2**256 + 2**8 * 2**224 == 2**256 + 2**232
  //     <  2 * (2**256 - 2**224)
  //     <= 2 * prime

  // x - (x // 2**256 - 1) * prime < 2 * prime
  // Proof:
  //   x - (x // 2**256) * prime + prime
  //     == x - (x // 2**256) * (2**256 - (2**256 - prime)) + prime
  //     == x - ((x//2**256) * 2**256) + (x//2**256) * (2**256 - prime) + prime
  //     == (x % prime) + (x // 2**256) * (2**256 - prime) + prime
  //     <= 2 * prime - 1 + (2**264 // 2**256) * (2**256 - prime)
  //     <= 2 * prime + 2**8 * 2**224 == 2**256 + 2**232 + 2**256 - 2**224
  //     <  2 * (2**256 - 2**224)
  //     <= 2 * prime

  uint32_t coef =
      x->val[BN_LIMBS - 1] >> (256 - ((BN_LIMBS - 1) * BN_BITS_PER_LIMB));

  // clang-format off
  // coef == x // 2**256
  // 0 <= coef < 2**((LIMBS - 1) * BITS_PER_LIMB + 32 - 256) == 256
  // Proof:
  //*  Let x[[a : b] be the number consisting of a-th to (b-1)-th bit of the number x.
  //   x[LIMBS - 1] >> (256 - ((LIMBS - 1) * BITS_PER_LIMB))
  //     == x[[(LIMBS - 1) * BITS_PER_LIMB : (LIMBS - 1) * BITS_PER_LIMB + 32]] >> (256 - ((LIMBS - 1) * BITS_PER_LIMB))
  //     == x[[256 - ((LIMBS - 1) * BITS_PER_LIMB) + (LIMBS - 1) * BITS_PER_LIMB : (LIMBS - 1) * BITS_PER_LIMB + 32]]
  //     == x[[256 : (LIMBS - 1) * BITS_PER_LIMB + 32]]
  //     == x[[256 : 264]] == x // 2**256
  // clang-format on

  const int shift = 8;
  uint64_t acc = 1ull << shift;

  for (int i = 0; i < BN_LIMBS; i++) {
    acc += (((uint64_t)(BN_BASE - 1)) << shift) + x->val[i] -
           prime->val[i] * (uint64_t)coef;
    // acc neither overflows 64 bits nor underflows 0
    // Proof:
    //   acc + (BASE - 1 << shift) + x[i] - prime[i] * coef
    //     >= (BASE - 1 << shift) - prime[i] * coef
    //     >= 2**shift * (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1) * 255
    //     == (2**shift - 255) * (2**BITS_PER_LIMB - 1)
    //     == (2**8 - 255) * (2**29 - 1) == 2**29 - 1 >= 0
    //   acc + (BASE - 1 << shift) + x[i] - prime[i] * coef
    //     <= acc + ((BASE - 1) << shift) + x[i]
    //     <= (2**(64 - BITS_PER_LIMB) - 1) + 2**shift * (2**BITS_PER_LIMB - 1)
    //       + (2**32 - 1)
    //     == (2**35 - 1) + 2**8 * (2**29 - 1) + 2**32
    //     <  2**35 + 2**37 + 2**32 < 2**64

    x->val[i] = acc & BN_LIMB_MASK;
    acc >>= BN_BITS_PER_LIMB;
    // acc <= 2**(64 - BITS_PER_LIMB) - 1 == 2**35 - 1

    // acc == (1 << BITS_PER_LIMB * (i + 1) + shift) + x[:i + 1]
    //   - coef * prime[:i + 1] >> BITS_PER_LIMB * (i + 1)
  }

  // assert(acc == 1 << shift);

  // clang-format off
  // acc == 1 << shift
  // Proof:
  //   acc
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + x[:LIMBS] - coef * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + (x - coef * prime) >> BITS_PER_LIMB * LIMBS
  //     <= (1 << BITS_PER_LIMB * LIMBS + shift) + (2 * prime) >> BITS_PER_LIMB * LIMBS
  //     <= (1 << BITS_PER_LIMB * LIMBS + shift) + 2 * 2**256 >> BITS_PER_LIMB * LIMBS
  //     <= 2**269 + 2**257 >> 2**261
  //     <= 1 << 8 == 1 << shift

  //   acc
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + x[:LIMBS] - coef * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
  //     >= (1 << BITS_PER_LIMB * LIMBS + shift) + 0 >> BITS_PER_LIMB * LIMBS
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + 0 >> BITS_PER_LIMB * LIMBS
  //     <= 1 << 8 == 1 << shift
  // clang-format on
}

// res = x**e % prime
// Assumes both x and e are normalized, x < 2**259
// Guarantees res is normalized and partly reduced modulo prime
// Works properly even if &x == &res
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
// The function doesn't have neither constant control flow nor constant memory
//  access flow with regard to e
void bn_power_mod(const bignum256 *x, const bignum256 *e,
                  const bignum256 *prime, bignum256 *res) {
  // Uses iterative right-to-left exponentiation by squaring, see
  // https://en.wikipedia.org/wiki/Modular_exponentiation#Right-to-left_binary_method

  bignum256 acc = {0};
  bn_copy(x, &acc);

  bn_one(res);
  for (int i = 0; i < BN_LIMBS; i++) {
    uint32_t limb = e->val[i];

    for (int j = 0; j < BN_BITS_PER_LIMB; j++) {
      // Break if the following bits of the last limb are zero
      if (i == BN_LIMBS - 1 && limb == 0) break;

      if (limb & 1)
        // acc * res < 2**519
        // Proof:
        //   acc * res <= max(2**259 - 1, 2 * prime) * (2 * prime)
        //     == max(2**259 - 1, 2**257) * 2**257 < 2**259 * 2**257
        //     == 2**516 < 2**519
        bn_multiply(&acc, res, prime);

      limb >>= 1;
      // acc * acc < 2**519
      // Proof:
      //   acc * acc <= max(2**259 - 1, 2 * prime)**2
      //     <= (2**259)**2 == 2**518 < 2**519
      bn_multiply(&acc, &acc, prime);
    }
    // acc == x**(e[:i + 1]) % prime
  }

  memzero(&acc, sizeof(acc));
}

// x = sqrt(x) % prime
// Explicitly x = x**((prime+1)/4) % prime
// The other root is -sqrt(x)
// Assumes x is normalized, x < 2**259 and quadratic residuum mod prime
// Assumes prime is a prime number, prime % 4 == 3, it is normalized and
//   2**256 - 2**224 <= prime <= 2**256
// Guarantees x is normalized and fully reduced modulo prime
// The function doesn't have neither constant control flow nor constant memory
//  access flow with regard to prime
void bn_sqrt(bignum256 *x, const bignum256 *prime) {
  // Uses the Lagrange formula for the primes of the special form, see
  // http://en.wikipedia.org/wiki/Quadratic_residue#Prime_or_prime_power_modulus
  // If prime % 4 == 3, then sqrt(x) % prime == x**((prime+1)//4) % prime

  assert(prime->val[BN_LIMBS - 1] % 4 == 3);

  // e = (prime + 1) // 4
  bignum256 e = {0};
  bn_copy(prime, &e);
  bn_addi(&e, 1);
  bn_rshift(&e);
  bn_rshift(&e);

  bn_power_mod(x, &e, prime, x);
  bn_mod(x, prime);

  memzero(&e, sizeof(e));
}

// a = 1/a % 2**n
// Assumes a is odd, 1 <= n <= 32
// The function doesn't have neither constant control flow nor constant memory
//   access flow with regard to n
uint32_t inverse_mod_power_two(uint32_t a, uint32_t n) {
  // Uses "Explicit Quadratic Modular inverse modulo 2" from section 3.3 of "On
  // Newton-Raphson iteration for multiplicative inverses modulo prime powers"
  // by Jean-Guillaume Dumas, see
  // https://arxiv.org/pdf/1209.6626.pdf

  // 1/a % 2**n
  //   = (2-a) * product([1 + (a-1)**(2**i) for i in range(1, floor(log2(n)))])

  uint32_t acc = 2 - a;
  uint32_t f = a - 1;

  // mask = (1 << n) - 1
  uint32_t mask = n == 32 ? 0xFFFFFFFF : (1u << n) - 1;

  for (uint32_t i = 1; i < n; i <<= 1) {
    f = (f * f) & mask;
    acc = (acc * (1 + f)) & mask;
  }

  return acc;
}

// x = (x / 2**BITS_PER_LIMB) % prime
// Assumes both x and prime are normalized
// Assumes prime is an odd number and normalized
// Guarantees x is normalized
// If x is partly reduced (fully reduced) modulo prime,
//   guarantess x will be partly reduced (fully reduced) modulo prime
void bn_divide_base(bignum256 *x, const bignum256 *prime) {
  // Uses an explicit formula for the modular inverse of power of two
  // (x / 2**n) % prime == (x + ((-x / prime) % 2**n) * prime) // 2**n
  // Proof:
  //   (x + ((-x / prime) % 2**n) * prime) % 2**n
  //     == (x - x / prime * prime) % 2**n
  //     == 0
  //   (x + ((-1 / prime) % 2**n) * prime) % prime
  //     == x
  //   if x < prime:
  //     (x + ((-x / prime) % 2**n) * prime) // 2**n
  //       <= ((prime - 1) + (2**n - 1) * prime) / 2**n
  //       == (2**n * prime - 1) / 2**n == prime - 1 / 2**n < prime
  //   if x < 2 * prime:
  //     (x + ((-x / prime) % 2**n) * prime) // 2**n
  //       <= ((2 * prime - 1) + (2**n - 1) * prime) / 2**n
  //       == (2**n * prime + prime - 1) / 2**n
  //       == prime + (prime - 1) / 2**n < 2 * prime

  // m = (-x / prime) % 2**BITS_PER_LIMB
  uint32_t m = (x->val[0] * (BN_BASE - inverse_mod_power_two(
                                           prime->val[0], BN_BITS_PER_LIMB))) &
               BN_LIMB_MASK;
  // m < 2**BITS_PER_LIMB

  uint64_t acc = x->val[0] + (uint64_t)m * prime->val[0];
  acc >>= BN_BITS_PER_LIMB;

  for (int i = 1; i < BN_LIMBS; i++) {
    acc = acc + x->val[i] + (uint64_t)m * prime->val[i];
    // acc does not overflow 64 bits
    // acc == acc + x + m * prime
    //    <= 2**(64 - BITS_PER_LIMB) + 2**(BITS_PER_LIMB)
    //      2**(BITS_PER_LIMB) * 2**(BITS_PER_LIMB)
    //    <= 2**(2 * BITS_PER_LIMB) + 2**(64 - BITS_PER_LIMB) +
    //      2**(BITS_PER_LIMB)
    //    <= 2**58 + 2**35 + 2**29 < 2**64

    x->val[i - 1] = acc & BN_LIMB_MASK;
    acc >>= BN_BITS_PER_LIMB;
    // acc < 2**35 == 2**(64 - BITS_PER_LIMB)

    // acc == x[:i + 1] + m * prime[:i + 1] >> BITS_PER_LIMB * (i + 1)
  }

  x->val[BN_LIMBS - 1] = acc;

  assert(acc >> BN_BITS_PER_LIMB == 0);

  // clang-format off
  // acc >> BITS_PER_LIMB == 0
  // Proof:
  //   acc >> BITS_PER_LIMB
  //     == (x[:LIMB] + m * prime[:LIMB] >> BITS_PER_LIMB * LIMBS) >> BITS_PER_LIMB * (LIMBS + 1)
  //     == x + m * prime >> BITS_PER_LIMB * (LIMBS + 1)
  //     <= (2**(BITS_PER_LIMB * LIMBS) - 1) + (2**BITS_PER_LIMB - 1) * (2**(BITS_PER_LIMB * LIMBS) - 1) >> BITS_PER_LIMB * (LIMBS + 1)
  //     == 2**(BITS_PER_LIMB * LIMBS) - 1 + 2**(BITS_PER_LIMB * (LIMBS + 1)) - 2**(BITS_PER_LIMB * LIMBS) - 2**BITS_PER_LIMB + 1 >> BITS_PER_LIMB * (LIMBS + 1)
  //     == 2**(BITS_PER_LIMB * (LIMBS + 1)) - 2**BITS_PER_LIMB >> BITS_PER_LIMB * (LIMBS + 1)
  //     == 0
  // clang-format on
}

#if !USE_INVERSE_FAST
// x = 1/x % prime if x != 0 else 0
// Assumes x is normalized
// Assumes prime is a prime number
// Guarantees x is normalized and fully reduced modulo prime
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
// The function doesn't have neither constant control flow nor constant memory
//   access flow with regard to prime
static void bn_inverse_slow(bignum256 *x, const bignum256 *prime) {
  // Uses formula 1/x % prime == x**(prime - 2) % prime
  // See https://en.wikipedia.org/wiki/Fermat%27s_little_theorem

  bn_fast_mod(x, prime);

  // e = prime - 2
  bignum256 e = {0};
  bn_read_uint32(2, &e);
  bn_subtract(prime, &e, &e);

  bn_power_mod(x, &e, prime, x);
  bn_mod(x, prime);

  memzero(&e, sizeof(e));
}
#endif

#if false
// x = 1/x % prime if x != 0 else 0
// Assumes x is is_normalized
// Assumes GCD(x, prime) = 1
// Guarantees x is normalized and fully reduced modulo prime
// Assumes prime is odd, normalized, 2**256 - 2**224 <= prime <= 2**256
// The function doesn't have neither constant control flow nor constant memory
//   access flow with regard to prime and x
static void bn_inverse_fast(bignum256 *x, const bignum256 *prime) {
  // "The Almost Montgomery Inverse" from the section 3 of "Constant Time
  // Modular Inversion" by Joppe W. Bos
  // See http://www.joppebos.com/files/CTInversion.pdf

  /*
    u = prime
    v = x & prime
    s = 1
    r = 0

    k = 0
    while v != 1:
      k += 1
      if is_even(u):
        u = u // 2
        s = 2 * s
      elif is_even(v):
        v = v // 2
        r = 2 * r
      elif v < u:
        u = (u - v) // 2
        r = r + s
        s = 2 * s
      else:
        v = (v - u) // 2
        s = r + s
        r = 2 * r

    s = (s / 2**k) % prime
    return s
  */

  if (bn_is_zero(x)) return;

  bn_fast_mod(x, prime);
  bn_mod(x, prime);

  bignum256 u = {0}, v = {0}, r = {0}, s = {0};
  bn_copy(prime, &u);
  bn_copy(x, &v);
  bn_one(&s);
  bn_zero(&r);

  int k = 0;
  while (!bn_is_one(&v)) {
    if ((u.val[0] & 1) == 0) {
      bn_rshift(&u);
      bn_lshift(&s);
    } else if ((v.val[0] & 1) == 0) {
      bn_rshift(&v);
      bn_lshift(&r);
    } else if (bn_is_less(&v, &u)) {
      bn_subtract(&u, &v, &u);
      bn_rshift(&u);
      bn_add(&r, &s);
      bn_lshift(&s);
    } else {
      bn_subtract(&v, &u, &v);
      bn_rshift(&v);
      bn_add(&s, &r);
      bn_lshift(&r);
    }
    k += 1;
    assert(!bn_is_zero(&v)); // assert GCD(x, prime) == 1
  }

  // s = s / 2**(k // BITS_PER_LIMB * BITS_PER_LIMB)
  for (int i = 0; i < k / BITS_PER_LIMB; i++) {
    bn_divide_base(&s, prime);
  }

  // s = s / 2**(k % BITS_PER_LIMB)
  for (int i = 0; i < k % BN_BITS_PER_LIMB; i++) {
    bn_mult_half(&s, prime);
  }

  bn_copy(&s, x);

  memzero(&u, sizeof(u));
  memzero(&v, sizeof(v));
  memzero(&r, sizeof(r));
  memzero(&s, sizeof(s));
}
#endif

#if USE_INVERSE_FAST
// x = 1/x % prime if x != 0 else 0
// Assumes x is is_normalized
// Assumes GCD(x, prime) = 1
// Guarantees x is normalized and fully reduced modulo prime
// Assumes prime is odd, normalized, 2**256 - 2**224 <= prime <= 2**256
// The function has constant control flow but not constant memory access flow
//   with regard to prime and x
static void bn_inverse_fast(bignum256 *x, const bignum256 *prime) {
  // Custom constant time version of "The Almost Montgomery Inverse" from the
  // section 3 of "Constant Time Modular Inversion" by Joppe W. Bos
  // See http://www.joppebos.com/files/CTInversion.pdf

  /*
    u = prime
    v = x % prime
    s = 1
    r = 0

    k = 0
    while v != 1:
      k += 1
      if is_even(u): # b1
        u = u // 2
        s = 2 * s
      elif is_even(v): # b2
        v = v // 2
        r = 2 * r
      elif v < u: # b3
        u = (u - v) // 2
        r = r + s
        s = 2 * s
      else: # b4
        v = (v - u) // 2
        s = r + s
        r = 2 * r

    s = (s / 2**k) % prime
    return s
  */

  bn_fast_mod(x, prime);
  bn_mod(x, prime);

  bignum256 u = {0}, v = {0}, r = {0}, s = {0};
  bn_copy(prime, &u);
  bn_copy(x, &v);
  bn_one(&s);
  bn_zero(&r);

  bignum256 zero = {0};
  bn_zero(&zero);

  int k = 0;

  int finished = 0, u_even = 0, v_even = 0, v_less_u = 0, b1 = 0, b2 = 0,
      b3 = 0, b4 = 0;
  finished = 0;

  for (int i = 0; i < 2 * BN_LIMBS * BN_BITS_PER_LIMB; i++) {
    finished = finished | -bn_is_one(&v);
    u_even = -bn_is_even(&u);
    v_even = -bn_is_even(&v);
    v_less_u = -bn_is_less(&v, &u);

    b1 = ~finished & u_even;
    b2 = ~finished & ~b1 & v_even;
    b3 = ~finished & ~b1 & ~b2 & v_less_u;
    b4 = ~finished & ~b1 & ~b2 & ~b3;

// The ternary operator for pointers with constant control flow
// BN_INVERSE_FAST_TERNARY(c, t, f) = t if c else f
// Very nasty hack, sorry for that
#define BN_INVERSE_FAST_TERNARY(c, t, f) \
  ((void *)(((c) & (uintptr_t)(t)) | (~(c) & (uintptr_t)(f))))

    bn_subtract(BN_INVERSE_FAST_TERNARY(b3, &u, &v),
                BN_INVERSE_FAST_TERNARY(
                    b3 | b4, BN_INVERSE_FAST_TERNARY(b3, &v, &u), &zero),
                BN_INVERSE_FAST_TERNARY(b3, &u, &v));

    bn_add(BN_INVERSE_FAST_TERNARY(b3, &r, &s),
           BN_INVERSE_FAST_TERNARY(b3 | b4, BN_INVERSE_FAST_TERNARY(b3, &s, &r),
                                   &zero));
    bn_rshift(BN_INVERSE_FAST_TERNARY(b1 | b3, &u, &v));
    bn_lshift(BN_INVERSE_FAST_TERNARY(b1 | b3, &s, &r));

    k = k - ~finished;
  }

  // s = s / 2**(k // BITS_PER_LIMB * BITS_PER_LIMB)
  for (int i = 0; i < 2 * BN_LIMBS; i++) {
    // s = s / 2**BITS_PER_LIMB % prime if i < k // BITS_PER_LIMB else s
    bn_copy(&s, &r);
    bn_divide_base(&r, prime);
    bn_cmov(&s, i < k / BN_BITS_PER_LIMB, &r, &s);
  }

  // s = s / 2**(k % BITS_PER_LIMB)
  for (int i = 0; i < BN_BITS_PER_LIMB; i++) {
    // s = s / 2 % prime if i < k % BITS_PER_LIMB else s
    bn_copy(&s, &r);
    bn_mult_half(&r, prime);
    bn_cmov(&s, i < k % BN_BITS_PER_LIMB, &r, &s);
  }

  bn_cmov(x, bn_is_zero(x), x, &s);

  memzero(&u, sizeof(u));
  memzero(&v, sizeof(v));
  memzero(&r, sizeof(s));
  memzero(&s, sizeof(s));
}
#endif

#if false
// x = 1/x % prime if x != 0 else 0
// Assumes x is is_normalized
// Assumes GCD(x, prime) = 1
// Guarantees x is normalized and fully reduced modulo prime
// Assumes prime is odd, normalized, 2**256 - 2**224 <= prime <= 2**256
static void bn_inverse_fast(bignum256 *x, const bignum256 *prime) {
  // Custom constant time version of "The Almost Montgomery Inverse" from the
  // section 3 of "Constant Time Modular Inversion" by Joppe W. Bos
  // See http://www.joppebos.com/files/CTInversion.pdf

  /*
    u = prime
    v = x % prime
    s = 1
    r = 0

    k = 0
    while v != 1:
      k += 1
      if is_even(u): # b1
        u = u // 2
        s = 2 * s
      elif is_even(v): # b2
        v = v // 2
        r = 2 * r
      elif v < u: # b3
        u = (u - v) // 2
        r = r + s
        s = 2 * s
      else: # b4
        v = (v - u) // 2
        s = r + s
        r = 2 * r

    s = (s / 2**k) % prime
    return s
  */

  bn_fast_mod(x, prime);
  bn_mod(x, prime);

  bignum256 u = {0}, v = {0}, r = {0}, s = {0};
  bn_copy(prime, &u);
  bn_copy(x, &v);
  bn_one(&s);
  bn_zero(&r);

  bignum256 zero = {0};
  bn_zero(&zero);

  int k = 0;

  uint32_t finished = 0, u_even = 0, v_even = 0, v_less_u = 0, b1 = 0, b2 = 0,
      b3 = 0, b4 = 0;
  finished = 0;

  bignum256 u_half = {0}, v_half = {0}, u_minus_v_half = {0}, v_minus_u_half = {0}, r_plus_s = {0}, r_twice = {0}, s_twice = {0};
  for (int i = 0; i < 2 * BN_LIMBS * BN_BITS_PER_LIMB; i++) {
    finished = finished | bn_is_one(&v);
    u_even = bn_is_even(&u);
    v_even = bn_is_even(&v);
    v_less_u = bn_is_less(&v, &u);

    b1 = (finished ^ 1) & u_even;
    b2 = (finished ^ 1) & (b1 ^ 1) & v_even;
    b3 = (finished ^ 1) & (b1 ^ 1) & (b2 ^ 1) & v_less_u;
    b4 = (finished ^ 1) & (b1 ^ 1) & (b2 ^ 1) & (b3 ^ 1);

    // u_half = u // 2
    bn_copy(&u, &u_half);
    bn_rshift(&u_half);

    // v_half = v // 2
    bn_copy(&v, &v_half);
    bn_rshift(&v_half);

    // u_minus_v_half  = (u - v) // 2
    bn_subtract(&u, &v, &u_minus_v_half);
    bn_rshift(&u_minus_v_half);

    // v_minus_u_half  = (v - u) // 2
    bn_subtract(&v, &u, &v_minus_u_half);
    bn_rshift(&v_minus_u_half);

    // r_plus_s = r + s
    bn_copy(&r, &r_plus_s);
    bn_add(&r_plus_s, &s);

    // r_twice = 2 * r
    bn_copy(&r, &r_twice);
    bn_lshift(&r_twice);

    // s_twice = 2 * s
    bn_copy(&s, &s_twice);
    bn_lshift(&s_twice);

    bn_cmov(&u, b1, &u_half, &u);
    bn_cmov(&u, b3, &u_minus_v_half, &u);

    bn_cmov(&v, b2, &v_half, &v);
    bn_cmov(&v, b4, &v_minus_u_half, &v);

    bn_cmov(&r, b2 | b4, &r_twice, &r);
    bn_cmov(&r, b3, &r_plus_s, &r);

    bn_cmov(&s, b1 | b3, &s_twice, &s);
    bn_cmov(&s, b4, &r_plus_s, &s);

    k = k + (finished ^ 1);
  }

  // s = s / 2**(k // BITS_PER_LIMB * BITS_PER_LIMB)
  for (int i = 0; i < 2 * BN_LIMBS; i++) {
    // s = s / 2**BITS_PER_LIMB % prime if i < k // BITS_PER_LIMB else s
    bn_copy(&s, &r);
    bn_divide_base(&r, prime);
    bn_cmov(&s, i < k / BITS_PER_LIMB, &r, &s);
  }

  // s = s / 2**(k % BITS_PER_LIMB)
  for (int i = 0; i < BN_BITS_PER_LIMB; i++) {
    // s = s / 2 % prime if i < k % BITS_PER_LIMB else s
    bn_copy(&s, &r);
    bn_mult_half(&r, prime);
    bn_cmov(&s, i < k % BN_BITS_PER_LIMB, &r, &s);
  }

  bn_cmov(x, bn_is_zero(x), x, &s);

  memzero(&u, sizeof(u));
  memzero(&v, sizeof(v));
  memzero(&r, sizeof(r));
  memzero(&s, sizeof(s));
  memzero(&u_half, sizeof(u_half));
  memzero(&v_half, sizeof(v_half));
  memzero(&u_minus_v_half, sizeof(u_minus_v_half));
  memzero(&v_minus_u_half, sizeof(v_minus_u_half));
  memzero(&r_twice, sizeof(r_twice));
  memzero(&s_twice, sizeof(s_twice));
  memzero(&r_plus_s, sizeof(r_plus_s));
}
#endif

// Normalizes x
// Assumes x < 2**261 == 2**(LIMBS * BITS_PER_LIMB)
// Guarantees x is normalized
void bn_normalize(bignum256 *x) {
  uint32_t acc = 0;

  for (int i = 0; i < BN_LIMBS; i++) {
    acc += x->val[i];
    // acc doesn't overflow 32 bits
    // Proof:
    //   acc + x[i]
    //      <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1)
    //      == 7 + 2**29 - 1 < 2**32

    x->val[i] = acc & BN_LIMB_MASK;
    acc >>= (BN_BITS_PER_LIMB);
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
  }
}

// x = x + y
// Assumes x, y are normalized, x + y < 2**(LIMBS*BITS_PER_LIMB) == 2**261
// Guarantees x is normalized
// Works properly even if &x == &y
void bn_add(bignum256 *x, const bignum256 *y) {
  uint32_t acc = 0;
  for (int i = 0; i < BN_LIMBS; i++) {
    acc += x->val[i] + y->val[i];
    // acc doesn't overflow 32 bits
    // Proof:
    //   acc + x[i] + y[i]
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 2 * (2**BITS_PER_LIMB - 1)
    //     == (2**(32 - BITS_PER_LIMB) - 1) + 2**(BITS_PER_LIMB + 1) - 2
    //     == 7 + 2**30 - 2 < 2**32

    x->val[i] = acc & BN_LIMB_MASK;
    acc >>= BN_BITS_PER_LIMB;
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1

    // acc == x[:i + 1] + y[:i + 1] >> BITS_PER_LIMB * (i + 1)
  }

  // assert(acc == 0); // assert x + y < 2**261
  // acc == 0
  // Proof:
  //   acc == x[:LIMBS] + y[:LIMBS] >> LIMBS * BITS_PER_LIMB
  //     == x + y >> LIMBS * BITS_PER_LIMB
  //     <= 2**(LIMBS * BITS_PER_LIMB) - 1 >> LIMBS * BITS_PER_LIMB == 0
}

// x = x + y % prime
// Assumes x, y are normalized
// Guarantees x is normalized and partly reduced modulo prime
// Assumes prime is normalized and 2^256 - 2^224 <= prime <= 2^256
void bn_addmod(bignum256 *x, const bignum256 *y, const bignum256 *prime) {
  for (int i = 0; i < BN_LIMBS; i++) {
    x->val[i] += y->val[i];
    // x[i] doesn't overflow 32 bits
    // Proof:
    //   x[i] + y[i]
    //     <= 2 * (2**BITS_PER_LIMB - 1)
    //     == 2**30 - 2 < 2**32
  }

  bn_fast_mod(x, prime);
}

// x = x + y
// Assumes x is normalized
// Assumes y <= 2**32 - 2**29 == 2**32 - 2**BITS_PER_LIMB and
//   x + y < 2**261 == 2**(LIMBS * BITS_PER_LIMB)
// Guarantees x is normalized
void bn_addi(bignum256 *x, uint32_t y) {
  // assert(y <= 3758096384); // assert y <= 2**32 - 2**29
  uint32_t acc = y;

  for (int i = 0; i < BN_LIMBS; i++) {
    acc += x->val[i];
    // acc doesn't overflow 32 bits
    // Proof:
    //   if i == 0:
    //     acc + x[i] == y + x[0]
    //       <= (2**32 - 2**BITS_PER_LIMB) + (2**BITS_PER_LIMB - 1)
    //       == 2**32 - 1 < 2**32
    //   else:
    //     acc + x[i]
    //       <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1)
    //       == 7 + 2**29 - 1 < 2**32

    x->val[i] = acc & BN_LIMB_MASK;
    acc >>= (BN_BITS_PER_LIMB);
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1

    // acc == x[:i + 1] + y >> BITS_PER_LIMB * (i + 1)
  }

  // assert(acc == 0); // assert x + y < 2**261
  // acc == 0
  // Proof:
  //   acc == x[:LIMBS] + y << LIMBS * BITS_PER_LIMB
  //     == x + y << LIMBS * BITS_PER_LIMB
  //     <= 2**(LIMBS + BITS_PER_LIMB) - 1 << LIMBS * BITS_PER_LIMB
  //     == 0
}

// x = x - y % prime
// Explicitly x = x + prime - y
// Assumes x, y are normalized
// Assumes y < prime[0], x + prime - y < 2**261 == 2**(LIMBS * BITS_PER_LIMB)
// Guarantees x is normalized
// If x is fully reduced modulo prime,
//   guarantess x will be partly reduced modulo prime
// Assumes prime is nonzero and normalized
void bn_subi(bignum256 *x, uint32_t y, const bignum256 *prime) {
  assert(y < prime->val[0]);

  // x = x + prime - y

  uint32_t acc = -y;
  for (int i = 0; i < BN_LIMBS; i++) {
    acc += x->val[i] + prime->val[i];
    // acc neither overflows 32 bits nor underflows 0
    // Proof:
    //   acc + x[i] + prime[i]
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 2 * (2**BITS_PER_LIMB - 1)
    //     <= 7 + 2**30 - 2 < 2**32
    //   acc + x[i] + prime[i]
    //     >= -y + prime[0] >= 0

    x->val[i] = acc & BN_LIMB_MASK;
    acc >>= BN_BITS_PER_LIMB;
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1

    // acc == x[:i + 1] + prime[:i + 1] - y >> BITS_PER_LIMB * (i + 1)
  }

  // assert(acc == 0); // assert x + prime - y < 2**261
  // acc == 0
  // Proof:
  //   acc == x[:LIMBS] + prime[:LIMBS] - y >> BITS_PER_LIMB * LIMBS
  //     == x + prime - y >> BITS_PER_LIMB * LIMBS
  //     <= 2**(LIMBS * BITS_PER_LIMB) - 1 >> BITS_PER_LIMB * LIMBS == 0
}

// res = x - y % prime
// Explicitly res = x + (2 * prime - y)
// Assumes x, y are normalized, y is partly reduced
// Assumes x + 2 * prime - y < 2**261 == 2**(BITS_PER_LIMB * LIMBS)
// Guarantees res is normalized
// Assumes prime is nonzero and normalized
void bn_subtractmod(const bignum256 *x, const bignum256 *y, bignum256 *res,
                    const bignum256 *prime) {
  // res = x + (2 * prime - y)

  uint32_t acc = 1;

  for (int i = 0; i < BN_LIMBS; i++) {
    acc += (BN_BASE - 1) + x->val[i] + 2 * prime->val[i] - y->val[i];
    // acc neither overflows 32 bits nor underflows 0
    // Proof:
    //   acc + (BASE - 1) + x[i] + 2 * prime[i] - y[i]
    //     >= (BASE - 1) - y[i]
    //     == (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1) == 0
    //   acc + (BASE - 1) + x[i] + 2 * prime[i] - y[i]
    //     <= acc + (BASE - 1) + x[i] + 2 * prime[i]
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1) +
    //     (2**BITS_PER_LIMB - 1) + 2 * (2**BITS_PER_LIMB - 1)
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 4 * (2**BITS_PER_LIMB - 1)
    //     == 7 + 4 * 2**29 - 4 == 2**31 + 3 < 2**32

    res->val[i] = acc & (BN_BASE - 1);
    acc >>= BN_BITS_PER_LIMB;
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1

    // acc == 2**(BITS_PER_LIMB * (i + 1)) + x[:i+1] - y[:i+1] + 2*prime[:i+1]
    //   >> BITS_PER_LIMB * (i+1)
  }

  // assert(acc == 1); // assert x + 2 * prime - y < 2**261

  // clang-format off
  // acc == 1
  // Proof:
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] + 2 * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y + 2 * prime >> BITS_PER_LIMB * LIMBS
  //     == 2**(BITS_PER_LIMB * LIMBS) + x + (2 * prime - y) >> BITS_PER_LIMB * LIMBS
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
  //     <= 2 * 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
  //     == 1

  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] + 2 * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y + 2 * prime >> BITS_PER_LIMB * LIMBS
  //     == 2**(BITS_PER_LIMB * LIMBS) + x + (2 * prime - y) >> BITS_PER_LIMB * LIMBS
  //     >= 2**(BITS_PER_LIMB * LIMBS) + 0 + 1 >> BITS_PER_LIMB * LIMBS
  //     == 1
  // clang-format on
}

// res = x - y
// Assumes x, y are normalized and x >= y
// Guarantees res is normalized
// Works properly even if &x == &y or &x == &res or &y == &res or
//   &x == &y == &res
void bn_subtract(const bignum256 *x, const bignum256 *y, bignum256 *res) {
  uint32_t acc = 1;
  for (int i = 0; i < BN_LIMBS; i++) {
    acc += (BN_BASE - 1) + x->val[i] - y->val[i];
    // acc neither overflows 32 bits nor underflows 0
    // Proof:
    //   acc + (BASE - 1) + x[i] - y[i]
    //     >= (BASE - 1) - y == (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1)
    //     == 0
    //   acc + (BASE - 1) + x[i] - y[i]
    //     <= acc + (BASE - 1) + x[i]
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1) +
    //       (2**BITS_PER_LIMB - 1)
    //     == 7 + 2 * 2**29 < 2 **32

    res->val[i] = acc & BN_LIMB_MASK;
    acc >>= BN_BITS_PER_LIMB;
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1

    // acc == 2**(BITS_PER_LIMB * (i + 1)) + x[:i + 1] - y[:i + 1]
    //   >> BITS_PER_LIMB * (i + 1)
  }

  // assert(acc == 1); // assert x >= y

  // clang-format off
  // acc == 1
  // Proof:
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] >> BITS_PER_LIMB * LIMBS
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y >> BITS_PER_LIMB * LIMBS
  //     == 2**(BITS_PER_LIMB * LIMBS) + x >> BITS_PER_LIMB * LIMBS
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
  //     <= 2 * 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
  //     == 1

  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] >> BITS_PER_LIMB * LIMBS
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y >> BITS_PER_LIMB * LIMBS
  //     >= 2**(BITS_PER_LIMB * LIMBS) >> BITS_PER_LIMB * LIMBS
  //     == 1
}

// q = x // d, r = x % d
// Assumes x is normalized, 1 <= d <= 61304
// Guarantees q is normalized
void bn_long_division(bignum256 *x, uint32_t d, bignum256 *q, uint32_t *r) {
  assert(1 <= d && d < 61304);

  uint32_t acc = 0;

  *r = x->val[BN_LIMBS - 1] % d;
  q->val[BN_LIMBS - 1] = x->val[BN_LIMBS - 1] / d;

  for (int i = BN_LIMBS - 2; i >= 0; i--) {
    acc = *r * (BN_BASE % d) + x->val[i];
    // acc doesn't overflow 32 bits
    // Proof:
    //   r * (BASE % d) + x[i]
    //     <= (d - 1) * (d - 1) + (2**BITS_PER_LIMB - 1)
    //     == d**2 - 2*d + 2**BITS_PER_LIMB
    //     == 61304**2 - 2 * 61304 + 2**29
    //     == 3758057808 + 2**29  < 2**32

    q->val[i] = *r * (BN_BASE / d) + (acc / d);
    // q[i] doesn't overflow 32 bits
    // Proof:
    //   r * (BASE // d) + (acc // d)
    //     <= (d - 1) * (2**BITS_PER_LIMB / d) +
    //      ((d**2 - 2*d + 2**BITS_PER_LIMB) / d)
    //     <= (d - 1) * (2**BITS_PER_LIMB / d) + (d - 2 + 2**BITS_PER_LIMB / d)
    //     == (d - 1 + 1) * (2**BITS_PER_LIMB / d) + d - 2
    //     == 2**BITS_PER_LIMB + d - 2 <= 2**29 + 61304 < 2**32

    // q[i] == (r * BASE + x[i]) // d
    // Proof:
    //   q[i] == r * (BASE // d) + (acc // d)
    //     == r * (BASE // d) + (r * (BASE % d) + x[i]) // d
    //     == (r * d * (BASE // d) + r * (BASE % d) + x[i]) // d
    //     == (r * (d * (BASE // d) + (BASE % d)) + x[i]) // d
    //     == (r * BASE + x[i]) // d

    // q[i] < 2**BITS_PER_LIMB
    // Proof:
    //   q[i] == (r * BASE + x[i]) // d
    //     <= ((d - 1) * 2**BITS_PER_LIMB + (2**BITS_PER_LIMB - 1)) / d
    //     == (d * 2**BITS_PER_LIMB - 1) / d == 2**BITS_PER_LIMB - 1 / d
    //     <  2**BITS_PER_LIMB

    *r = acc % d;
    // r == (r * BASE + x[i]) % d
    // Proof:
    //   r == acc % d == (r * (BASE % d) + x[i]) % d
    //     == (r * BASE + x[i]) % d

    // x[:i] == q[:i] * d + r
  }
}

// x = x // 58, r = x % 58
// Assumes x is normalized
// Guarantees x is normalized
void bn_divmod58(bignum256 *x, uint32_t *r) { bn_long_division(x, 58, x, r); }

// x = x // 1000, r = x % 1000
// Assumes x is normalized
// Guarantees x is normalized
void bn_divmod1000(bignum256 *x, uint32_t *r) {
  bn_long_division(x, 1000, x, r);
}

// x = x // 10, r = x % 10
// Assumes x is normalized
// Guarantees x is normalized
void bn_divmod10(bignum256 *x, uint32_t *r) { bn_long_division(x, 10, x, r); }

// Formats amount
// Assumes amount is normalized
// Assumes prefix and suffix are null-terminated strings
// Assumes output is an array of length output_length
// The function doesn't have neither constant control flow nor constant memory
//   access flow with regard to any its argument
size_t bn_format(const bignum256 *amount, const char *prefix, const char *suffix, unsigned int decimals, int exponent, bool trailing, char thousands, char *output, size_t output_length) {

/*
  Python prototype of the function:

  def format(amount, prefix, suffix, decimals, exponent, trailing, thousands):
      if exponent >= 0:
          amount *= 10**exponent
      else:
          amount //= 10 ** (-exponent)

      d = pow(10, decimals)

      integer_part = amount // d
      integer_str = f"{integer_part:,}".replace(",", thousands or "")

      if decimals:
          decimal_part = amount % d
          decimal_str = f".{decimal_part:0{decimals}d}"
          if not trailing:
              decimal_str = decimal_str.rstrip("0").rstrip(".")
      else:
          decimal_str = ""

      return prefix + integer_str + decimal_str + suffix
*/

// Auxiliary macro for bn_format
// If enough space adds one character to output starting from the end
#define BN_FORMAT_ADD_OUTPUT_CHAR(c)                               \
  {                                                                \
    --position;                                                    \
    if (output <= position && position < output + output_length) { \
      *position = (c);                                             \
    } else {                                                       \
      memset(output, '\0', output_length);                         \
      return 0;                                                    \
    }                                                              \
  }

  bignum256 temp = {0};
  bn_copy(amount, &temp);
  uint32_t digit = 0;

  char *position = output + output_length;

  // Add string ending character
  BN_FORMAT_ADD_OUTPUT_CHAR('\0');

  // Add suffix
  size_t suffix_length = suffix ? strlen(suffix) : 0;
  for (int i = suffix_length - 1; i >= 0; --i)
    BN_FORMAT_ADD_OUTPUT_CHAR(suffix[i])

  // amount //= 10**exponent
  for (; exponent < 0; ++exponent) {
    // if temp == 0, there is no need to divide it by 10 anymore
    if (bn_is_zero(&temp)) {
      exponent = 0;
      break;
    }
    bn_divmod10(&temp, &digit);
  }

  // exponent >= 0 && decimals >= 0

  bool fractional_part = false;  // is fractional-part of amount present

  {  // Add fractional-part digits of amount
    // Add trailing zeroes
    unsigned int trailing_zeros = decimals < (unsigned int) exponent ? decimals : (unsigned int) exponent;
    // When casting a negative int to unsigned int, UINT_MAX is added to the int before
    // Since exponent >= 0, the value remains unchanged
    decimals -= trailing_zeros;
    exponent -= trailing_zeros;

    if (trailing && trailing_zeros) {
      fractional_part = true;
      for (; trailing_zeros > 0; --trailing_zeros)
          BN_FORMAT_ADD_OUTPUT_CHAR('0')
    }

    // exponent == 0 || decimals == 0

    // Add significant digits and leading zeroes
    for (; decimals > 0; --decimals) {
      bn_divmod10(&temp, &digit);

      if (fractional_part || digit || trailing) {
        fractional_part = true;
        BN_FORMAT_ADD_OUTPUT_CHAR('0' + digit)
      }
      else if (bn_is_zero(&temp)) {
        // We break since the remaining digits are zeroes and fractional_part == trailing == false
        decimals = 0;
        break;
      }
    }
    // decimals == 0
  }

  if (fractional_part) {
    BN_FORMAT_ADD_OUTPUT_CHAR('.')
  }

  {  // Add integer-part digits of amount
    // Add trailing zeroes
    int digits = 0;
    if (!bn_is_zero(&temp)) {
      for (; exponent > 0; --exponent) {
        ++digits;
        BN_FORMAT_ADD_OUTPUT_CHAR('0')
        if (thousands != 0 && digits % 3 == 0) {
          BN_FORMAT_ADD_OUTPUT_CHAR(thousands)
        }
      }
    }
    // decimals == 0 && exponent == 0

    // Add significant digits
    bool is_zero = false;
    do {
      ++digits;
      bn_divmod10(&temp, &digit);
      is_zero = bn_is_zero(&temp);
      BN_FORMAT_ADD_OUTPUT_CHAR('0' + digit)
      if (thousands != 0 && !is_zero && digits % 3 == 0) {
        BN_FORMAT_ADD_OUTPUT_CHAR(thousands)
      }
    } while (!is_zero);
  }

  // Add prefix
  size_t prefix_length = prefix ? strlen(prefix) : 0;
  for (int i = prefix_length - 1; i >= 0; --i)
    BN_FORMAT_ADD_OUTPUT_CHAR(prefix[i])

  // Move formatted amount to the start of output
  int length = output - position + output_length;
  memmove(output, position, length);
  return length - 1;
}

#if USE_BN_PRINT
// Prints x in hexadecimal
// Assumes x is normalized and x < 2**256
void bn_print(const bignum256 *x) {
  printf("%06x", x->val[8]);
  printf("%08x", ((x->val[7] << 3) | (x->val[6] >> 26)));
  printf("%07x", ((x->val[6] << 2) | (x->val[5] >> 27)) & 0x0FFFFFFF);
  printf("%07x", ((x->val[5] << 1) | (x->val[4] >> 28)) & 0x0FFFFFFF);
  printf("%07x", x->val[4] & 0x0FFFFFFF);
  printf("%08x", ((x->val[3] << 3) | (x->val[2] >> 26)));
  printf("%07x", ((x->val[2] << 2) | (x->val[1] >> 27)) & 0x0FFFFFFF);
  printf("%07x", ((x->val[1] << 1) | (x->val[0] >> 28)) & 0x0FFFFFFF);
  printf("%07x", x->val[0] & 0x0FFFFFFF);
}

// Prints comma separated list of limbs of x
void bn_print_raw(const bignum256 *x) {
  for (int i = 0; i < BN_LIMBS - 1; i++) {
    printf("0x%08x, ", x->val[i]);
  }
  printf("0x%08x", x->val[BN_LIMBS - 1]);
}
#endif

#if USE_INVERSE_FAST
void bn_inverse(bignum256 *x, const bignum256 *prime) {
  bn_inverse_fast(x, prime);
}
#else
void bn_inverse(bignum256 *x, const bignum256 *prime) {
  bn_inverse_slow(x, prime);
}
#endif