1
0
mirror of https://github.com/hashcat/hashcat.git synced 2024-11-14 03:39:09 +00:00
hashcat/OpenCL/inc_ecc_secp256k1.cl

2065 lines
45 KiB
Common Lisp

/**
* Author......: See docs/credits.txt
* License.....: MIT
*
* Furthermore, since elliptic curve operations are highly researched and optimized,
* we've consulted a lot of online resources to implement this, including several papers and
* example code.
*
* Credits where credits are due: there are a lot of nice projects that explain and/or optimize
* elliptic curve operations (especially elliptic curve multiplications by a scalar).
*
* We want to shout out following projects, which were quite helpful when implementing this:
* - secp256k1 by Pieter Wuille (https://github.com/bitcoin-core/secp256k1/, MIT)
* - secp256k1-cl by hhanh00 (https://github.com/hhanh00/secp256k1-cl/, MIT)
* - ec_pure_c by masterzorag (https://github.com/masterzorag/ec_pure_c/)
* - ecc-gmp by leivaburto (https://github.com/leivaburto/ecc-gmp)
* - micro-ecc by Ken MacKay (https://github.com/kmackay/micro-ecc/, BSD)
* - curve_example by willem (https://gist.github.com/nlitsme/c9031c7b9bf6bb009e5a)
* - py_ecc by Vitalik Buterin (https://github.com/ethereum/py_ecc/, MIT)
*
*
* Some BigNum operations are implemented similar to micro-ecc which is licensed under these terms:
* Copyright 2014 Ken MacKay, 2-Clause BSD License
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* ATTENTION: this code is NOT meant to be used in security critical environments that are at risk
* of side-channel or timing attacks etc, it's only purpose is to make it work fast for GPGPU
* (OpenCL/CUDA). Some attack vectors like side-channel and timing-attacks might be possible,
* because of some optimizations used within this code (non-constant time etc).
*/
/*
* Implementation considerations:
* point double and point add are implemented similar to algorithms mentioned in this 2011 paper:
* http://eprint.iacr.org/2011/338.pdf
* (Fast and Regular Algorithms for Scalar Multiplication over Elliptic Curves by Matthieu Rivain)
*
* In theory we could use the Jacobian Co-Z enhancement to get rid of the larger buffer caused by
* the z coordinates (and in this way reduce register pressure etc).
* For the Co-Z improvement there are a lot of fast algorithms, but we might still be faster
* with this implementation (b/c we allow non-constant time) without the Brier/Joye Montgomery-like
* ladder. Of course, this claim would need to be verified and tested to see which one is faster
* for our specific scenario at the end.
*
* We accomplish a "little" speedup by using scalars converted to w-NAF (non-adjacent form):
* The general idea of w-NAF is to pre-compute some zi coefficients like below to reduce the
* costly point additions by using a non-binary ("signed") number system (values other than just
* 0 and 1, but ranging from -2^(w-1)-1 to 2^(w-1)-1). This works best with the left-to-right
* binary algorithm such that we just add zi * P when adding point P (we pre-compute all the
* possible zi * P values because the x/y coordinates are known before the kernel starts):
*
* // Example with window size w = 2 (i.e. mod 4 => & 3):
* // 173 => 1 0 -1 0 -1 0 -1 0 1 = 2^8 - 2^6 - 2^4 - 2^2 + 1
* int e = 0b10101101; // 173
* int z[8 + 1] = { 0 }; // our zi/di, we need one extra slot to make the subtraction work
*
* int i = 0;
*
* while (e)
* {
* if (e & 1)
* {
* // for window size w = 3 it would be:
* // => 2^(w-0) = 2^3 = 8
* // => 2^(w-1) = 2^2 = 4
*
* int bit; // = 2 - (e & 3) for w = 2
*
* if ((e & 3) >= 2) // e % 4 == e & 3, use (e & 7) >= 4 for w = 3
* bit = (e & 3) - 4; // (e & 7) - 8 for w = 3
* else
* bit = e & 3; // e & 7 for w = 3
*
* z[i] = bit;
* e -= bit;
* }
*
* e >>= 1; // e / 2
* i++;
* }
*/
#include "inc_ecc_secp256k1.h"
DECLSPEC u32 sub (u32 *r, const u32 *a, const u32 *b)
{
u32 c = 0; // carry/borrow
#if defined IS_NV && HAS_SUB == 1 && HAS_SUBC == 1
asm volatile
(
"sub.cc.u32 %0, %9, %17;"
"subc.cc.u32 %1, %10, %18;"
"subc.cc.u32 %2, %11, %19;"
"subc.cc.u32 %3, %12, %20;"
"subc.cc.u32 %4, %13, %21;"
"subc.cc.u32 %5, %14, %22;"
"subc.cc.u32 %6, %15, %23;"
"subc.cc.u32 %7, %16, %24;"
"subc.u32 %8, 0, 0;"
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
"=r"(c)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
);
#elif defined IS_AMD && HAS_VSUB == 1 && HAS_VSUBB == 1
__asm__ __volatile__
(
"V_SUB_U32 %0, %9, %17;"
"V_SUBB_U32 %1, %10, %18;"
"V_SUBB_U32 %2, %11, %19;"
"V_SUBB_U32 %3, %12, %20;"
"V_SUBB_U32 %4, %13, %21;"
"V_SUBB_U32 %5, %14, %22;"
"V_SUBB_U32 %6, %15, %23;"
"V_SUBB_U32 %7, %16, %24;"
"V_SUBB_U32 %8, 0, 0;"
: "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]),
"=v"(c)
: "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]),
"v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7])
);
#else
for (u32 i = 0; i < 8; i++)
{
const u32 diff = a[i] - b[i] - c;
if (diff != a[i]) c = (diff > a[i]);
r[i] = diff;
}
#endif
return c;
}
DECLSPEC u32 add (u32 *r, const u32 *a, const u32 *b)
{
u32 c = 0; // carry/borrow
#if defined IS_NV && HAS_ADD == 1 && HAS_ADDC == 1
asm volatile
(
"add.cc.u32 %0, %9, %17;"
"addc.cc.u32 %1, %10, %18;"
"addc.cc.u32 %2, %11, %19;"
"addc.cc.u32 %3, %12, %20;"
"addc.cc.u32 %4, %13, %21;"
"addc.cc.u32 %5, %14, %22;"
"addc.cc.u32 %6, %15, %23;"
"addc.cc.u32 %7, %16, %24;"
"addc.u32 %8, 0, 0;"
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
"=r"(c)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
);
#elif defined IS_AMD && HAS_VADD == 1 && HAS_VADDC == 1
__asm__ __volatile__
(
"V_ADD_U32 %0, %9, %17;"
"V_ADDC_U32 %1, %10, %18;"
"V_ADDC_U32 %2, %11, %19;"
"V_ADDC_U32 %3, %12, %20;"
"V_ADDC_U32 %4, %13, %21;"
"V_ADDC_U32 %5, %14, %22;"
"V_ADDC_U32 %6, %15, %23;"
"V_ADDC_U32 %7, %16, %24;"
"V_ADDC_U32 %8, 0, 0;"
: "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]),
"=v"(c)
: "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]),
"v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7])
);
#else
for (u32 i = 0; i < 8; i++)
{
const u32 t = a[i] + b[i] + c;
if (t != a[i]) c = (t < a[i]);
r[i] = t;
}
#endif
return c;
}
DECLSPEC void sub_mod (u32 *r, const u32 *a, const u32 *b)
{
const u32 c = sub (r, a, b); // carry
if (c)
{
u32 t[8];
t[0] = SECP256K1_P0;
t[1] = SECP256K1_P1;
t[2] = SECP256K1_P2;
t[3] = SECP256K1_P3;
t[4] = SECP256K1_P4;
t[5] = SECP256K1_P5;
t[6] = SECP256K1_P6;
t[7] = SECP256K1_P7;
add (r, r, t);
}
}
DECLSPEC void add_mod (u32 *r, const u32 *a, const u32 *b)
{
const u32 c = add (r, a, b); // carry
/*
* Modulo operation:
*/
// note: we could have an early exit in case of c == 1 => sub ()
u32 t[8];
t[0] = SECP256K1_P0;
t[1] = SECP256K1_P1;
t[2] = SECP256K1_P2;
t[3] = SECP256K1_P3;
t[4] = SECP256K1_P4;
t[5] = SECP256K1_P5;
t[6] = SECP256K1_P6;
t[7] = SECP256K1_P7;
// check if modulo operation is needed
u32 mod = 1;
if (c == 0)
{
for (int i = 7; i >= 0; i--)
{
if (r[i] < t[i])
{
mod = 0;
break; // or return ! (check if faster)
}
if (r[i] > t[i]) break;
}
}
if (mod == 1)
{
sub (r, r, t);
}
}
DECLSPEC void mod_512 (u32 *n)
{
// we need to perform a modulo operation with 512-bit % 256-bit (bignum modulo):
// the modulus is the secp256k1 group order
// ATTENTION: for this function the byte-order is reversed (most significant bytes
// at the left)
/*
the general modulo by shift and substract code (a = a % b):
x = b;
t = a >> 1;
while (x <= t) x <<= 1;
while (a >= b)
{
if (a >= x) a -= x;
x >>= 1;
}
return a; // remainder
*/
u32 a[16];
a[ 0] = n[ 0];
a[ 1] = n[ 1];
a[ 2] = n[ 2];
a[ 3] = n[ 3];
a[ 4] = n[ 4];
a[ 5] = n[ 5];
a[ 6] = n[ 6];
a[ 7] = n[ 7];
a[ 8] = n[ 8];
a[ 9] = n[ 9];
a[10] = n[10];
a[11] = n[11];
a[12] = n[12];
a[13] = n[13];
a[14] = n[14];
a[15] = n[15];
u32 b[16];
b[ 0] = 0x00000000;
b[ 1] = 0x00000000;
b[ 2] = 0x00000000;
b[ 3] = 0x00000000;
b[ 4] = 0x00000000;
b[ 5] = 0x00000000;
b[ 6] = 0x00000000;
b[ 7] = 0x00000000;
b[ 8] = SECP256K1_N7;
b[ 9] = SECP256K1_N6;
b[10] = SECP256K1_N5;
b[11] = SECP256K1_N4;
b[12] = SECP256K1_N3;
b[13] = SECP256K1_N2;
b[14] = SECP256K1_N1;
b[15] = SECP256K1_N0;
/*
* Start:
*/
// x = b (but with a fast "shift" trick to avoid the while loop)
u32 x[16];
x[ 0] = b[ 8]; // this is a trick: we just put the group order's most significant bit all the
x[ 1] = b[ 9]; // way to the top to avoid doing the initial: while (x <= t) x <<= 1
x[ 2] = b[10];
x[ 3] = b[11];
x[ 4] = b[12];
x[ 5] = b[13];
x[ 6] = b[14];
x[ 7] = b[15];
x[ 8] = 0x00000000;
x[ 9] = 0x00000000;
x[10] = 0x00000000;
x[11] = 0x00000000;
x[12] = 0x00000000;
x[13] = 0x00000000;
x[14] = 0x00000000;
x[15] = 0x00000000;
// a >= b
while (a[0] >= b[0])
{
u32 l00 = a[ 0] < b[ 0];
u32 l01 = a[ 1] < b[ 1];
u32 l02 = a[ 2] < b[ 2];
u32 l03 = a[ 3] < b[ 3];
u32 l04 = a[ 4] < b[ 4];
u32 l05 = a[ 5] < b[ 5];
u32 l06 = a[ 6] < b[ 6];
u32 l07 = a[ 7] < b[ 7];
u32 l08 = a[ 8] < b[ 8];
u32 l09 = a[ 9] < b[ 9];
u32 l10 = a[10] < b[10];
u32 l11 = a[11] < b[11];
u32 l12 = a[12] < b[12];
u32 l13 = a[13] < b[13];
u32 l14 = a[14] < b[14];
u32 l15 = a[15] < b[15];
u32 e00 = a[ 0] == b[ 0];
u32 e01 = a[ 1] == b[ 1];
u32 e02 = a[ 2] == b[ 2];
u32 e03 = a[ 3] == b[ 3];
u32 e04 = a[ 4] == b[ 4];
u32 e05 = a[ 5] == b[ 5];
u32 e06 = a[ 6] == b[ 6];
u32 e07 = a[ 7] == b[ 7];
u32 e08 = a[ 8] == b[ 8];
u32 e09 = a[ 9] == b[ 9];
u32 e10 = a[10] == b[10];
u32 e11 = a[11] == b[11];
u32 e12 = a[12] == b[12];
u32 e13 = a[13] == b[13];
u32 e14 = a[14] == b[14];
if (l00) break;
if (l01 && e00) break;
if (l02 && e00 && e01) break;
if (l03 && e00 && e01 && e02) break;
if (l04 && e00 && e01 && e02 && e03) break;
if (l05 && e00 && e01 && e02 && e03 && e04) break;
if (l06 && e00 && e01 && e02 && e03 && e04 && e05) break;
if (l07 && e00 && e01 && e02 && e03 && e04 && e05 && e06) break;
if (l08 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07) break;
if (l09 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08) break;
if (l10 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09) break;
if (l11 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10) break;
if (l12 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11) break;
if (l13 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12) break;
if (l14 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13) break;
if (l15 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13 && e14) break;
// r = x (copy it to have the original values for the subtraction)
u32 r[16];
r[ 0] = x[ 0];
r[ 1] = x[ 1];
r[ 2] = x[ 2];
r[ 3] = x[ 3];
r[ 4] = x[ 4];
r[ 5] = x[ 5];
r[ 6] = x[ 6];
r[ 7] = x[ 7];
r[ 8] = x[ 8];
r[ 9] = x[ 9];
r[10] = x[10];
r[11] = x[11];
r[12] = x[12];
r[13] = x[13];
r[14] = x[14];
r[15] = x[15];
// x <<= 1
x[15] = x[15] >> 1 | x[14] << 31;
x[14] = x[14] >> 1 | x[13] << 31;
x[13] = x[13] >> 1 | x[12] << 31;
x[12] = x[12] >> 1 | x[11] << 31;
x[11] = x[11] >> 1 | x[10] << 31;
x[10] = x[10] >> 1 | x[ 9] << 31;
x[ 9] = x[ 9] >> 1 | x[ 8] << 31;
x[ 8] = x[ 8] >> 1 | x[ 7] << 31;
x[ 7] = x[ 7] >> 1 | x[ 6] << 31;
x[ 6] = x[ 6] >> 1 | x[ 5] << 31;
x[ 5] = x[ 5] >> 1 | x[ 4] << 31;
x[ 4] = x[ 4] >> 1 | x[ 3] << 31;
x[ 3] = x[ 3] >> 1 | x[ 2] << 31;
x[ 2] = x[ 2] >> 1 | x[ 1] << 31;
x[ 1] = x[ 1] >> 1 | x[ 0] << 31;
x[ 0] = x[ 0] >> 1;
// if (a >= r) a -= r;
l00 = a[ 0] < r[ 0];
l01 = a[ 1] < r[ 1];
l02 = a[ 2] < r[ 2];
l03 = a[ 3] < r[ 3];
l04 = a[ 4] < r[ 4];
l05 = a[ 5] < r[ 5];
l06 = a[ 6] < r[ 6];
l07 = a[ 7] < r[ 7];
l08 = a[ 8] < r[ 8];
l09 = a[ 9] < r[ 9];
l10 = a[10] < r[10];
l11 = a[11] < r[11];
l12 = a[12] < r[12];
l13 = a[13] < r[13];
l14 = a[14] < r[14];
l15 = a[15] < r[15];
e00 = a[ 0] == r[ 0];
e01 = a[ 1] == r[ 1];
e02 = a[ 2] == r[ 2];
e03 = a[ 3] == r[ 3];
e04 = a[ 4] == r[ 4];
e05 = a[ 5] == r[ 5];
e06 = a[ 6] == r[ 6];
e07 = a[ 7] == r[ 7];
e08 = a[ 8] == r[ 8];
e09 = a[ 9] == r[ 9];
e10 = a[10] == r[10];
e11 = a[11] == r[11];
e12 = a[12] == r[12];
e13 = a[13] == r[13];
e14 = a[14] == r[14];
if (l00) continue;
if (l01 && e00) continue;
if (l02 && e00 && e01) continue;
if (l03 && e00 && e01 && e02) continue;
if (l04 && e00 && e01 && e02 && e03) continue;
if (l05 && e00 && e01 && e02 && e03 && e04) continue;
if (l06 && e00 && e01 && e02 && e03 && e04 && e05) continue;
if (l07 && e00 && e01 && e02 && e03 && e04 && e05 && e06) continue;
if (l08 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07) continue;
if (l09 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08) continue;
if (l10 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09) continue;
if (l11 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10) continue;
if (l12 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11) continue;
if (l13 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12) continue;
if (l14 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13) continue;
if (l15 && e00 && e01 && e02 && e03 && e04 && e05 && e06 && e07 && e08 && e09 && e10 && e11 && e12 && e13 && e14) continue;
// substract (a -= r):
if ((r[ 0] | r[ 1] | r[ 2] | r[ 3] | r[ 4] | r[ 5] | r[ 6] | r[ 7] |
r[ 8] | r[ 9] | r[10] | r[11] | r[12] | r[13] | r[14] | r[15]) == 0) break;
r[ 0] = a[ 0] - r[ 0];
r[ 1] = a[ 1] - r[ 1];
r[ 2] = a[ 2] - r[ 2];
r[ 3] = a[ 3] - r[ 3];
r[ 4] = a[ 4] - r[ 4];
r[ 5] = a[ 5] - r[ 5];
r[ 6] = a[ 6] - r[ 6];
r[ 7] = a[ 7] - r[ 7];
r[ 8] = a[ 8] - r[ 8];
r[ 9] = a[ 9] - r[ 9];
r[10] = a[10] - r[10];
r[11] = a[11] - r[11];
r[12] = a[12] - r[12];
r[13] = a[13] - r[13];
r[14] = a[14] - r[14];
r[15] = a[15] - r[15];
// take care of the "borrow" (we can't do it the other way around 15...1 because r[x] is changed!)
if (r[ 1] > a[ 1]) r[ 0]--;
if (r[ 2] > a[ 2]) r[ 1]--;
if (r[ 3] > a[ 3]) r[ 2]--;
if (r[ 4] > a[ 4]) r[ 3]--;
if (r[ 5] > a[ 5]) r[ 4]--;
if (r[ 6] > a[ 6]) r[ 5]--;
if (r[ 7] > a[ 7]) r[ 6]--;
if (r[ 8] > a[ 8]) r[ 7]--;
if (r[ 9] > a[ 9]) r[ 8]--;
if (r[10] > a[10]) r[ 9]--;
if (r[11] > a[11]) r[10]--;
if (r[12] > a[12]) r[11]--;
if (r[13] > a[13]) r[12]--;
if (r[14] > a[14]) r[13]--;
if (r[15] > a[15]) r[14]--;
a[ 0] = r[ 0];
a[ 1] = r[ 1];
a[ 2] = r[ 2];
a[ 3] = r[ 3];
a[ 4] = r[ 4];
a[ 5] = r[ 5];
a[ 6] = r[ 6];
a[ 7] = r[ 7];
a[ 8] = r[ 8];
a[ 9] = r[ 9];
a[10] = r[10];
a[11] = r[11];
a[12] = r[12];
a[13] = r[13];
a[14] = r[14];
a[15] = r[15];
}
n[ 0] = a[ 0];
n[ 1] = a[ 1];
n[ 2] = a[ 2];
n[ 3] = a[ 3];
n[ 4] = a[ 4];
n[ 5] = a[ 5];
n[ 6] = a[ 6];
n[ 7] = a[ 7];
n[ 8] = a[ 8];
n[ 9] = a[ 9];
n[10] = a[10];
n[11] = a[11];
n[12] = a[12];
n[13] = a[13];
n[14] = a[14];
n[15] = a[15];
}
DECLSPEC void mul_mod (u32 *r, const u32 *a, const u32 *b) // TODO get rid of u64 ?
{
u32 t[16] = { 0 }; // we need up to double the space (2 * 8)
/*
* First start with the basic a * b multiplication:
*/
u32 t0 = 0;
u32 t1 = 0;
u32 c = 0;
for (u32 i = 0; i < 8; i++)
{
for (u32 j = 0; j <= i; j++)
{
u64 p = ((u64) a[j]) * b[i - j];
u64 d = ((u64) t1) << 32 | t0;
d += p;
t0 = (u32) d;
t1 = d >> 32;
c += d < p; // carry
}
t[i] = t0;
t0 = t1;
t1 = c;
c = 0;
}
for (u32 i = 8; i < 15; i++)
{
for (u32 j = i - 7; j < 8; j++)
{
u64 p = ((u64) a[j]) * b[i - j];
u64 d = ((u64) t1) << 32 | t0;
d += p;
t0 = (u32) d;
t1 = d >> 32;
c += d < p;
}
t[i] = t0;
t0 = t1;
t1 = c;
c = 0;
}
t[15] = t0;
/*
* Now do the modulo operation:
* (r = t % p)
*
* http://www.isys.uni-klu.ac.at/PDF/2001-0126-MT.pdf (p.354 or p.9 in that document)
*/
u32 tmp[16] = { 0 };
// c = 0;
// Note: SECP256K1_P = 2^256 - 2^32 - 977 (0x03d1 = 977)
// multiply t[8]...t[15] by omega:
for (u32 i = 0, j = 8; i < 8; i++, j++)
{
u64 p = ((u64) 0x03d1) * t[j] + c;
tmp[i] = (u32) p;
c = p >> 32;
}
tmp[8] = c;
c = add (tmp + 1, tmp + 1, t + 8); // modifies tmp[1]...tmp[8]
tmp[9] = c;
// r = t + tmp
c = add (r, t, tmp);
// multiply t[0]...t[7] by omega:
u32 c2 = 0;
// memset (t, 0, sizeof (t));
for (u32 i = 0, j = 8; i < 8; i++, j++)
{
u64 p = ((u64) 0x3d1) * tmp[j] + c2;
t[i] = (u32) p;
c2 = p >> 32;
}
t[8] = c2;
c2 = add (t + 1, t + 1, tmp + 8); // modifies t[1]...t[8]
t[9] = c2;
// r = r + t
c2 = add (r, r, t);
c += c2;
t[0] = SECP256K1_P0;
t[1] = SECP256K1_P1;
t[2] = SECP256K1_P2;
t[3] = SECP256K1_P3;
t[4] = SECP256K1_P4;
t[5] = SECP256K1_P5;
t[6] = SECP256K1_P6;
t[7] = SECP256K1_P7;
for (u32 i = c; i > 0; i--)
{
sub (r, r, t);
}
for (int i = 7; i >= 0; i--)
{
if (r[i] < t[i]) break;
if (r[i] > t[i])
{
sub (r, r, t);
break;
}
}
}
DECLSPEC void sqrt_mod (u32 *r)
{
// Fermat's Little Theorem
// secp256k1: y^2 = x^3 + 7 % p
// y ^ (p - 1) = 1
// y ^ (p - 1) = (y^2) ^ ((p - 1) / 2) = 1 => y^2 = (y^2) ^ (((p - 1) / 2) + 1)
// => y = (y^2) ^ ((((p - 1) / 2) + 1) / 2)
// y = (y^2) ^ (((p - 1 + 2) / 2) / 2) = (y^2) ^ ((p + 1) / 4)
// y1 = (x^3 + 7) ^ ((p + 1) / 4)
// y2 = p - y1 (or y2 = y1 * -1 % p)
u32 s[8];
s[0] = SECP256K1_P0 + 1; // because of (p + 1) / 4 or use add (s, s, 1)
s[1] = SECP256K1_P1;
s[2] = SECP256K1_P2;
s[3] = SECP256K1_P3;
s[4] = SECP256K1_P4;
s[5] = SECP256K1_P5;
s[6] = SECP256K1_P6;
s[7] = SECP256K1_P7;
u32 t[8] = { 0 };
t[0] = 1;
for (u32 i = 255; i > 1; i--) // we just skip the last 2 multiplications (=> exp / 4)
{
mul_mod (t, t, t); // r * r
u32 idx = i >> 5;
u32 mask = 1 << (i & 0x1f);
if (s[idx] & mask)
{
mul_mod (t, t, r); // t * r
}
}
r[0] = t[0];
r[1] = t[1];
r[2] = t[2];
r[3] = t[3];
r[4] = t[4];
r[5] = t[5];
r[6] = t[6];
r[7] = t[7];
}
// (inverse (a, p) * a) % p == 1 (or think of a * a^-1 = a / a = 1)
DECLSPEC void inv_mod (u32 *a)
{
// How often does this really happen? it should "almost" never happen (but would be safer)
// if ((a[0] | a[1] | a[2] | a[3] | a[4] | a[5] | a[6] | a[7]) == 0) return;
u32 t0[8];
t0[0] = a[0];
t0[1] = a[1];
t0[2] = a[2];
t0[3] = a[3];
t0[4] = a[4];
t0[5] = a[5];
t0[6] = a[6];
t0[7] = a[7];
u32 p[8];
p[0] = SECP256K1_P0;
p[1] = SECP256K1_P1;
p[2] = SECP256K1_P2;
p[3] = SECP256K1_P3;
p[4] = SECP256K1_P4;
p[5] = SECP256K1_P5;
p[6] = SECP256K1_P6;
p[7] = SECP256K1_P7;
u32 t1[8];
t1[0] = SECP256K1_P0;
t1[1] = SECP256K1_P1;
t1[2] = SECP256K1_P2;
t1[3] = SECP256K1_P3;
t1[4] = SECP256K1_P4;
t1[5] = SECP256K1_P5;
t1[6] = SECP256K1_P6;
t1[7] = SECP256K1_P7;
u32 t2[8] = { 0 };
t2[0] = 0x00000001;
u32 t3[8] = { 0 };
u32 b = (t0[0] != t1[0])
| (t0[1] != t1[1])
| (t0[2] != t1[2])
| (t0[3] != t1[3])
| (t0[4] != t1[4])
| (t0[5] != t1[5])
| (t0[6] != t1[6])
| (t0[7] != t1[7]);
while (b)
{
if ((t0[0] & 1) == 0) // even
{
t0[0] = t0[0] >> 1 | t0[1] << 31;
t0[1] = t0[1] >> 1 | t0[2] << 31;
t0[2] = t0[2] >> 1 | t0[3] << 31;
t0[3] = t0[3] >> 1 | t0[4] << 31;
t0[4] = t0[4] >> 1 | t0[5] << 31;
t0[5] = t0[5] >> 1 | t0[6] << 31;
t0[6] = t0[6] >> 1 | t0[7] << 31;
t0[7] = t0[7] >> 1;
u32 c = 0;
if (t2[0] & 1) c = add (t2, t2, p);
t2[0] = t2[0] >> 1 | t2[1] << 31;
t2[1] = t2[1] >> 1 | t2[2] << 31;
t2[2] = t2[2] >> 1 | t2[3] << 31;
t2[3] = t2[3] >> 1 | t2[4] << 31;
t2[4] = t2[4] >> 1 | t2[5] << 31;
t2[5] = t2[5] >> 1 | t2[6] << 31;
t2[6] = t2[6] >> 1 | t2[7] << 31;
t2[7] = t2[7] >> 1 | c << 31;
}
else if ((t1[0] & 1) == 0)
{
t1[0] = t1[0] >> 1 | t1[1] << 31;
t1[1] = t1[1] >> 1 | t1[2] << 31;
t1[2] = t1[2] >> 1 | t1[3] << 31;
t1[3] = t1[3] >> 1 | t1[4] << 31;
t1[4] = t1[4] >> 1 | t1[5] << 31;
t1[5] = t1[5] >> 1 | t1[6] << 31;
t1[6] = t1[6] >> 1 | t1[7] << 31;
t1[7] = t1[7] >> 1;
u32 c = 0;
if (t3[0] & 1) c = add (t3, t3, p);
t3[0] = t3[0] >> 1 | t3[1] << 31;
t3[1] = t3[1] >> 1 | t3[2] << 31;
t3[2] = t3[2] >> 1 | t3[3] << 31;
t3[3] = t3[3] >> 1 | t3[4] << 31;
t3[4] = t3[4] >> 1 | t3[5] << 31;
t3[5] = t3[5] >> 1 | t3[6] << 31;
t3[6] = t3[6] >> 1 | t3[7] << 31;
t3[7] = t3[7] >> 1 | c << 31;
}
else
{
u32 gt = 0;
for (int i = 7; i >= 0; i--)
{
if (t0[i] > t1[i])
{
gt = 1;
break;
}
if (t0[i] < t1[i]) break;
}
if (gt)
{
sub (t0, t0, t1);
t0[0] = t0[0] >> 1 | t0[1] << 31;
t0[1] = t0[1] >> 1 | t0[2] << 31;
t0[2] = t0[2] >> 1 | t0[3] << 31;
t0[3] = t0[3] >> 1 | t0[4] << 31;
t0[4] = t0[4] >> 1 | t0[5] << 31;
t0[5] = t0[5] >> 1 | t0[6] << 31;
t0[6] = t0[6] >> 1 | t0[7] << 31;
t0[7] = t0[7] >> 1;
u32 lt = 0;
for (int i = 7; i >= 0; i--)
{
if (t2[i] < t3[i])
{
lt = 1;
break;
}
if (t2[i] > t3[i]) break;
}
if (lt) add (t2, t2, p);
sub (t2, t2, t3);
u32 c = 0;
if (t2[0] & 1) c = add (t2, t2, p);
t2[0] = t2[0] >> 1 | t2[1] << 31;
t2[1] = t2[1] >> 1 | t2[2] << 31;
t2[2] = t2[2] >> 1 | t2[3] << 31;
t2[3] = t2[3] >> 1 | t2[4] << 31;
t2[4] = t2[4] >> 1 | t2[5] << 31;
t2[5] = t2[5] >> 1 | t2[6] << 31;
t2[6] = t2[6] >> 1 | t2[7] << 31;
t2[7] = t2[7] >> 1 | c << 31;
}
else
{
sub (t1, t1, t0);
t1[0] = t1[0] >> 1 | t1[1] << 31;
t1[1] = t1[1] >> 1 | t1[2] << 31;
t1[2] = t1[2] >> 1 | t1[3] << 31;
t1[3] = t1[3] >> 1 | t1[4] << 31;
t1[4] = t1[4] >> 1 | t1[5] << 31;
t1[5] = t1[5] >> 1 | t1[6] << 31;
t1[6] = t1[6] >> 1 | t1[7] << 31;
t1[7] = t1[7] >> 1;
u32 lt = 0;
for (int i = 7; i >= 0; i--)
{
if (t3[i] < t2[i])
{
lt = 1;
break;
}
if (t3[i] > t2[i]) break;
}
if (lt) add (t3, t3, p);
sub (t3, t3, t2);
u32 c = 0;
if (t3[0] & 1) c = add (t3, t3, p);
t3[0] = t3[0] >> 1 | t3[1] << 31;
t3[1] = t3[1] >> 1 | t3[2] << 31;
t3[2] = t3[2] >> 1 | t3[3] << 31;
t3[3] = t3[3] >> 1 | t3[4] << 31;
t3[4] = t3[4] >> 1 | t3[5] << 31;
t3[5] = t3[5] >> 1 | t3[6] << 31;
t3[6] = t3[6] >> 1 | t3[7] << 31;
t3[7] = t3[7] >> 1 | c << 31;
}
}
// update b:
b = (t0[0] != t1[0])
| (t0[1] != t1[1])
| (t0[2] != t1[2])
| (t0[3] != t1[3])
| (t0[4] != t1[4])
| (t0[5] != t1[5])
| (t0[6] != t1[6])
| (t0[7] != t1[7]);
}
// set result:
a[0] = t2[0];
a[1] = t2[1];
a[2] = t2[2];
a[3] = t2[3];
a[4] = t2[4];
a[5] = t2[5];
a[6] = t2[6];
a[7] = t2[7];
}
/*
// everything from the formulas below of course MOD the prime:
// we use this formula:
X = (3/2 * x^2)^2 - 2 * x * y^2
Y = (3/2 * x^2) * (x * y^2 - X) - y^4
Z = y * z
this is identical to the more frequently used form:
X = (3 * x^2)^2 - 8 * x * y^2
Y = 3 * x^2 * (4 * x * y^2 - X) - 8 * y^4
Z = 2 * y * z
*/
DECLSPEC void point_double (u32 *x, u32 *y, u32 *z)
{
// How often does this really happen? it should "almost" never happen (but would be safer)
/*
if ((y[0] | y[1] | y[2] | y[3] | y[4] | y[5] | y[6] | y[7]) == 0)
{
x[0] = 0;
x[1] = 0;
x[2] = 0;
x[3] = 0;
x[4] = 0;
x[5] = 0;
x[6] = 0;
x[7] = 0;
y[0] = 0;
y[1] = 0;
y[2] = 0;
y[3] = 0;
y[4] = 0;
y[5] = 0;
y[6] = 0;
y[7] = 0;
z[0] = 0;
z[1] = 0;
z[2] = 0;
z[3] = 0;
z[4] = 0;
z[5] = 0;
z[6] = 0;
z[7] = 0;
return;
}
*/
u32 t1[8];
t1[0] = x[0];
t1[1] = x[1];
t1[2] = x[2];
t1[3] = x[3];
t1[4] = x[4];
t1[5] = x[5];
t1[6] = x[6];
t1[7] = x[7];
u32 t2[8];
t2[0] = y[0];
t2[1] = y[1];
t2[2] = y[2];
t2[3] = y[3];
t2[4] = y[4];
t2[5] = y[5];
t2[6] = y[6];
t2[7] = y[7];
u32 t3[8];
t3[0] = z[0];
t3[1] = z[1];
t3[2] = z[2];
t3[3] = z[3];
t3[4] = z[4];
t3[5] = z[5];
t3[6] = z[6];
t3[7] = z[7];
u32 t4[8];
u32 t5[8];
u32 t6[8];
mul_mod (t4, t1, t1); // t4 = x^2
mul_mod (t5, t2, t2); // t5 = y^2
mul_mod (t1, t1, t5); // t1 = x*y^2
mul_mod (t5, t5, t5); // t5 = t5^2 = y^4
// here the z^2 and z^4 is not needed for a = 0
mul_mod (t3, t2, t3); // t3 = x * z
add_mod (t2, t4, t4); // t2 = 2 * t4 = 2 * x^2
add_mod (t4, t4, t2); // t4 = 3 * t4 = 3 * x^2
// a * z^4 = 0 * 1^4 = 0
// don't discard the least significant bit it's important too!
u32 c = 0;
if (t4[0] & 1)
{
u32 t[8];
t[0] = SECP256K1_P0;
t[1] = SECP256K1_P1;
t[2] = SECP256K1_P2;
t[3] = SECP256K1_P3;
t[4] = SECP256K1_P4;
t[5] = SECP256K1_P5;
t[6] = SECP256K1_P6;
t[7] = SECP256K1_P7;
c = add (t4, t4, t); // t4 + SECP256K1_P
}
// right shift (t4 / 2):
t4[0] = t4[0] >> 1 | t4[1] << 31;
t4[1] = t4[1] >> 1 | t4[2] << 31;
t4[2] = t4[2] >> 1 | t4[3] << 31;
t4[3] = t4[3] >> 1 | t4[4] << 31;
t4[4] = t4[4] >> 1 | t4[5] << 31;
t4[5] = t4[5] >> 1 | t4[6] << 31;
t4[6] = t4[6] >> 1 | t4[7] << 31;
t4[7] = t4[7] >> 1 | c << 31;
mul_mod (t6, t4, t4); // t6 = t4^2 = (3/2 * x^2)^2
add_mod (t2, t1, t1); // t2 = 2 * t1
sub_mod (t6, t6, t2); // t6 = t6 - t2
sub_mod (t1, t1, t6); // t1 = t1 - t6
mul_mod (t4, t4, t1); // t4 = t4 * t1
sub_mod (t1, t4, t5); // t1 = t4 - t5
// => x = t6, y = t1, z = t3:
x[0] = t6[0];
x[1] = t6[1];
x[2] = t6[2];
x[3] = t6[3];
x[4] = t6[4];
x[5] = t6[5];
x[6] = t6[6];
x[7] = t6[7];
y[0] = t1[0];
y[1] = t1[1];
y[2] = t1[2];
y[3] = t1[3];
y[4] = t1[4];
y[5] = t1[5];
y[6] = t1[6];
y[7] = t1[7];
z[0] = t3[0];
z[1] = t3[1];
z[2] = t3[2];
z[3] = t3[3];
z[4] = t3[4];
z[5] = t3[5];
z[6] = t3[6];
z[7] = t3[7];
}
/*
* madd-2004-hmv:
* (from https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html)
* t1 = z1^2
* t2 = t1*z1
* t1 = t1*x2
* t2 = t2*y2
* t1 = t1-x1
* t2 = t2-y1
* z3 = z1*t1
* t3 = t1^2
* t4 = t3*t1
* t3 = t3*x1
* t1 = 2*t3
* x3 = t2^2
* x3 = x3-t1
* x3 = x3-t4
* t3 = t3-x3
* t3 = t3*t2
* t4 = t4*y1
* y3 = t3-t4
*/
DECLSPEC void point_add (u32 *x1, u32 *y1, u32 *z1, u32 *x2, u32 *y2) // z2 = 1
{
// How often does this really happen? it should "almost" never happen (but would be safer)
/*
if ((y2[0] | y2[1] | y2[2] | y2[3] | y2[4] | y2[5] | y2[6] | y2[7]) == 0) return;
if ((y1[0] | y1[1] | y1[2] | y1[3] | y1[4] | y1[5] | y1[6] | y1[7]) == 0)
{
x1[0] = x2[0];
x1[1] = x2[1];
x1[2] = x2[2];
x1[3] = x2[3];
x1[4] = x2[4];
x1[5] = x2[5];
x1[6] = x2[6];
x1[7] = x2[7];
y1[0] = y2[0];
y1[1] = y2[1];
y1[2] = y2[2];
y1[3] = y2[3];
y1[4] = y2[4];
y1[5] = y2[5];
y1[6] = y2[6];
y1[7] = y2[7];
z1[0] = z2[0];
z1[1] = z2[1];
z1[2] = z2[2];
z1[3] = z2[3];
z1[4] = z2[4];
z1[5] = z2[5];
z1[6] = z2[6];
z1[7] = z2[7];
return;
}
*/
// if x1 == x2 and y2 == y2 and z2 == z2 we need to double instead?
// x1/y1/z1:
u32 t1[8];
t1[0] = x1[0];
t1[1] = x1[1];
t1[2] = x1[2];
t1[3] = x1[3];
t1[4] = x1[4];
t1[5] = x1[5];
t1[6] = x1[6];
t1[7] = x1[7];
u32 t2[8];
t2[0] = y1[0];
t2[1] = y1[1];
t2[2] = y1[2];
t2[3] = y1[3];
t2[4] = y1[4];
t2[5] = y1[5];
t2[6] = y1[6];
t2[7] = y1[7];
u32 t3[8];
t3[0] = z1[0];
t3[1] = z1[1];
t3[2] = z1[2];
t3[3] = z1[3];
t3[4] = z1[4];
t3[5] = z1[5];
t3[6] = z1[6];
t3[7] = z1[7];
// x2/y2:
u32 t4[8];
t4[0] = x2[0];
t4[1] = x2[1];
t4[2] = x2[2];
t4[3] = x2[3];
t4[4] = x2[4];
t4[5] = x2[5];
t4[6] = x2[6];
t4[7] = x2[7];
u32 t5[8];
t5[0] = y2[0];
t5[1] = y2[1];
t5[2] = y2[2];
t5[3] = y2[3];
t5[4] = y2[4];
t5[5] = y2[5];
t5[6] = y2[6];
t5[7] = y2[7];
u32 t6[8];
u32 t7[8];
u32 t8[8];
u32 t9[8];
mul_mod (t6, t3, t3); // t6 = t3^2
mul_mod (t7, t6, t3); // t7 = t6*t3
mul_mod (t6, t6, t4); // t6 = t6*t4
mul_mod (t7, t7, t5); // t7 = t7*t5
sub_mod (t6, t6, t1); // t6 = t6-t1
sub_mod (t7, t7, t2); // t7 = t7-t2
mul_mod (t8, t3, t6); // t8 = t3*t6
mul_mod (t4, t6, t6); // t4 = t6^2
mul_mod (t9, t4, t6); // t9 = t4*t6
mul_mod (t4, t4, t1); // t4 = t4*t1
// left shift (t4 * 2):
t6[7] = t4[7] << 1 | t4[6] >> 31;
t6[6] = t4[6] << 1 | t4[5] >> 31;
t6[5] = t4[5] << 1 | t4[4] >> 31;
t6[4] = t4[4] << 1 | t4[3] >> 31;
t6[3] = t4[3] << 1 | t4[2] >> 31;
t6[2] = t4[2] << 1 | t4[1] >> 31;
t6[1] = t4[1] << 1 | t4[0] >> 31;
t6[0] = t4[0] << 1;
// don't discard the most significant bit, it's important too!
if (t4[7] & 0x80000000)
{
// use most significant bit and perform mod P, since we have: t4 * 2 % P
u32 a[8] = { 0 };
a[1] = 1;
a[0] = 0x000003d1; // omega (see: mul_mod ())
add (t6, t6, a);
}
mul_mod (t5, t7, t7); // t5 = t7*t7
sub_mod (t5, t5, t6); // t5 = t5-t6
sub_mod (t5, t5, t9); // t5 = t5-t9
sub_mod (t4, t4, t5); // t4 = t4-t5
mul_mod (t4, t4, t7); // t4 = t4*t7
mul_mod (t9, t9, t2); // t9 = t9*t2
sub_mod (t9, t4, t9); // t9 = t4-t9
x1[0] = t5[0];
x1[1] = t5[1];
x1[2] = t5[2];
x1[3] = t5[3];
x1[4] = t5[4];
x1[5] = t5[5];
x1[6] = t5[6];
x1[7] = t5[7];
y1[0] = t9[0];
y1[1] = t9[1];
y1[2] = t9[2];
y1[3] = t9[3];
y1[4] = t9[4];
y1[5] = t9[5];
y1[6] = t9[6];
y1[7] = t9[7];
z1[0] = t8[0];
z1[1] = t8[1];
z1[2] = t8[2];
z1[3] = t8[3];
z1[4] = t8[4];
z1[5] = t8[5];
z1[6] = t8[6];
z1[7] = t8[7];
}
DECLSPEC void point_get_coords (secp256k1_t *r, const u32 *x, const u32 *y)
{
/*
pre-compute 1/-1, 3/-3, 5/-5, 7/-7 times P (x, y)
for wNAF with window size 4 (max/min: +/- 2^3-1): -7, -5, -3, -1, 1, 3, 5, 7
+x1 ( 0)
+y1 ( 8)
-y1 (16)
+x3 (24)
+y3 (32)
-y3 (40)
+x5 (48)
+y5 (56)
-y5 (64)
+x7 (72)
+y7 (80)
-y7 (88)
*/
// note: we use jacobian forms with (x, y, z) for computation, but affine
// (or just converted to z = 1) for storage
// 1:
r->xy[ 0] = x[0];
r->xy[ 1] = x[1];
r->xy[ 2] = x[2];
r->xy[ 3] = x[3];
r->xy[ 4] = x[4];
r->xy[ 5] = x[5];
r->xy[ 6] = x[6];
r->xy[ 7] = x[7];
r->xy[ 8] = y[0];
r->xy[ 9] = y[1];
r->xy[10] = y[2];
r->xy[11] = y[3];
r->xy[12] = y[4];
r->xy[13] = y[5];
r->xy[14] = y[6];
r->xy[15] = y[7];
// -1:
u32 p[8];
p[0] = SECP256K1_P0;
p[1] = SECP256K1_P1;
p[2] = SECP256K1_P2;
p[3] = SECP256K1_P3;
p[4] = SECP256K1_P4;
p[5] = SECP256K1_P5;
p[6] = SECP256K1_P6;
p[7] = SECP256K1_P7;
u32 neg[8];
neg[0] = y[0];
neg[1] = y[1];
neg[2] = y[2];
neg[3] = y[3];
neg[4] = y[4];
neg[5] = y[5];
neg[6] = y[6];
neg[7] = y[7];
sub_mod (neg, p, neg); // -y = p - y
r->xy[16] = neg[0];
r->xy[17] = neg[1];
r->xy[18] = neg[2];
r->xy[19] = neg[3];
r->xy[20] = neg[4];
r->xy[21] = neg[5];
r->xy[22] = neg[6];
r->xy[23] = neg[7];
// copy of 1:
u32 tx[8];
tx[0] = x[0];
tx[1] = x[1];
tx[2] = x[2];
tx[3] = x[3];
tx[4] = x[4];
tx[5] = x[5];
tx[6] = x[6];
tx[7] = x[7];
u32 ty[8];
ty[0] = y[0];
ty[1] = y[1];
ty[2] = y[2];
ty[3] = y[3];
ty[4] = y[4];
ty[5] = y[5];
ty[6] = y[6];
ty[7] = y[7];
u32 rx[8];
rx[0] = x[0];
rx[1] = x[1];
rx[2] = x[2];
rx[3] = x[3];
rx[4] = x[4];
rx[5] = x[5];
rx[6] = x[6];
rx[7] = x[7];
u32 ry[8];
ry[0] = y[0];
ry[1] = y[1];
ry[2] = y[2];
ry[3] = y[3];
ry[4] = y[4];
ry[5] = y[5];
ry[6] = y[6];
ry[7] = y[7];
u32 rz[8] = { 0 };
rz[0] = 1;
// 3:
point_double (rx, ry, rz); // 2
point_add (rx, ry, rz, tx, ty); // 3
// to affine:
inv_mod (rz);
mul_mod (neg, rz, rz); // neg is temporary variable (z^2)
mul_mod (rx, rx, neg);
mul_mod (rz, neg, rz);
mul_mod (ry, ry, rz);
r->xy[24] = rx[0];
r->xy[25] = rx[1];
r->xy[26] = rx[2];
r->xy[27] = rx[3];
r->xy[28] = rx[4];
r->xy[29] = rx[5];
r->xy[30] = rx[6];
r->xy[31] = rx[7];
r->xy[32] = ry[0];
r->xy[33] = ry[1];
r->xy[34] = ry[2];
r->xy[35] = ry[3];
r->xy[36] = ry[4];
r->xy[37] = ry[5];
r->xy[38] = ry[6];
r->xy[39] = ry[7];
// -3:
neg[0] = ry[0];
neg[1] = ry[1];
neg[2] = ry[2];
neg[3] = ry[3];
neg[4] = ry[4];
neg[5] = ry[5];
neg[6] = ry[6];
neg[7] = ry[7];
sub_mod (neg, p, neg);
r->xy[40] = neg[0];
r->xy[41] = neg[1];
r->xy[42] = neg[2];
r->xy[43] = neg[3];
r->xy[44] = neg[4];
r->xy[45] = neg[5];
r->xy[46] = neg[6];
r->xy[47] = neg[7];
// 5:
rz[0] = 1; // actually we could take advantage of rz being 1 too (alternative point_add ()),
rz[1] = 0; // but it is not important because this is performed only once per "hash"
rz[2] = 0;
rz[3] = 0;
rz[4] = 0;
rz[5] = 0;
rz[6] = 0;
rz[7] = 0;
point_add (rx, ry, rz, tx, ty); // 4
point_add (rx, ry, rz, tx, ty); // 5
// to affine:
inv_mod (rz);
mul_mod (neg, rz, rz);
mul_mod (rx, rx, neg);
mul_mod (rz, neg, rz);
mul_mod (ry, ry, rz);
r->xy[48] = rx[0];
r->xy[49] = rx[1];
r->xy[50] = rx[2];
r->xy[51] = rx[3];
r->xy[52] = rx[4];
r->xy[53] = rx[5];
r->xy[54] = rx[6];
r->xy[55] = rx[7];
r->xy[56] = ry[0];
r->xy[57] = ry[1];
r->xy[58] = ry[2];
r->xy[59] = ry[3];
r->xy[60] = ry[4];
r->xy[61] = ry[5];
r->xy[62] = ry[6];
r->xy[63] = ry[7];
// -5:
neg[0] = ry[0];
neg[1] = ry[1];
neg[2] = ry[2];
neg[3] = ry[3];
neg[4] = ry[4];
neg[5] = ry[5];
neg[6] = ry[6];
neg[7] = ry[7];
sub_mod (neg, p, neg);
r->xy[64] = neg[0];
r->xy[65] = neg[1];
r->xy[66] = neg[2];
r->xy[67] = neg[3];
r->xy[68] = neg[4];
r->xy[69] = neg[5];
r->xy[70] = neg[6];
r->xy[71] = neg[7];
// 7:
rz[0] = 1;
rz[1] = 0;
rz[2] = 0;
rz[3] = 0;
rz[4] = 0;
rz[5] = 0;
rz[6] = 0;
rz[7] = 0;
point_add (rx, ry, rz, tx, ty); // 6
point_add (rx, ry, rz, tx, ty); // 7
// to affine:
inv_mod (rz);
mul_mod (neg, rz, rz);
mul_mod (rx, rx, neg);
mul_mod (rz, neg, rz);
mul_mod (ry, ry, rz);
r->xy[72] = rx[0];
r->xy[73] = rx[1];
r->xy[74] = rx[2];
r->xy[75] = rx[3];
r->xy[76] = rx[4];
r->xy[77] = rx[5];
r->xy[78] = rx[6];
r->xy[79] = rx[7];
r->xy[80] = ry[0];
r->xy[81] = ry[1];
r->xy[82] = ry[2];
r->xy[83] = ry[3];
r->xy[84] = ry[4];
r->xy[85] = ry[5];
r->xy[86] = ry[6];
r->xy[87] = ry[7];
// -7:
neg[0] = ry[0];
neg[1] = ry[1];
neg[2] = ry[2];
neg[3] = ry[3];
neg[4] = ry[4];
neg[5] = ry[5];
neg[6] = ry[6];
neg[7] = ry[7];
sub_mod (neg, p, neg);
r->xy[88] = neg[0];
r->xy[89] = neg[1];
r->xy[90] = neg[2];
r->xy[91] = neg[3];
r->xy[92] = neg[4];
r->xy[93] = neg[5];
r->xy[94] = neg[6];
r->xy[95] = neg[7];
}
DECLSPEC void point_mul (u32 *r, const u32 *k, GLOBAL_AS const secp256k1_t *tmps)
{
/*
* Convert the tweak/scalar k to w-NAF (window size is 4)
*/
u32 n[9];
n[0] = 0; // we need this extra slot sometimes for the subtraction to work
n[1] = k[7];
n[2] = k[6];
n[3] = k[5];
n[4] = k[4];
n[5] = k[3];
n[6] = k[2];
n[7] = k[1];
n[8] = k[0];
u32 naf[32 + 1] = { 0 }; // we need one extra slot
int loop_start = 0;
for (int i = 0; i <= 256; i++)
{
if (n[8] & 1)
{
// for window size w = 4:
// => 2^(w-0) = 2^4 = 16 (0x10)
// => 2^(w-1) = 2^3 = 8 (0x08)
int diff = n[8] & 0x0f; // n % 2^w == n & (2^w - 1)
// convert diff to val according to this table:
// 1 -> +1 -> 1
// 3 -> +3 -> 3
// 5 -> +5 -> 5
// 7 -> +7 -> 7
// 9 -> -7 -> 8
// 11 -> -5 -> 6
// 13 -> -3 -> 4
// 15 -> -1 -> 2
int val = diff;
if (diff >= 0x08)
{
diff -= 0x10;
val = 0x11 - val;
}
naf[i >> 3] |= val << ((i & 7) << 2);
u32 t = n[8]; // t is the (temporary) old/unmodified value
n[8] -= diff;
// we need to take care of the carry/borrow:
u32 k = 8;
if (diff > 0)
{
while (n[k] > t) // overflow propagation
{
if (k == 0) break; // needed ?
k--;
t = n[k];
n[k]--;
}
}
else // if (diff < 0)
{
while (t > n[k]) // overflow propagation
{
if (k == 0) break;
k--;
t = n[k];
n[k]++;
}
}
// update start:
loop_start = i;
}
// n = n / 2:
n[8] = n[8] >> 1 | n[7] << 31;
n[7] = n[7] >> 1 | n[6] << 31;
n[6] = n[6] >> 1 | n[5] << 31;
n[5] = n[5] >> 1 | n[4] << 31;
n[4] = n[4] >> 1 | n[3] << 31;
n[3] = n[3] >> 1 | n[2] << 31;
n[2] = n[2] >> 1 | n[1] << 31;
n[1] = n[1] >> 1 | n[0] << 31;
n[0] = n[0] >> 1;
}
// first set:
const u32 multiplier = (naf[loop_start >> 3] >> ((loop_start & 7) << 2)) & 0x0f; // or use u8 ?
const u32 odd = multiplier & 1;
const u32 x_pos = ((multiplier - 1 + odd) >> 1) * 24;
const u32 y_pos = odd ? (x_pos + 8) : (x_pos + 16);
u32 x1[8];
x1[0] = tmps->xy[x_pos + 0];
x1[1] = tmps->xy[x_pos + 1];
x1[2] = tmps->xy[x_pos + 2];
x1[3] = tmps->xy[x_pos + 3];
x1[4] = tmps->xy[x_pos + 4];
x1[5] = tmps->xy[x_pos + 5];
x1[6] = tmps->xy[x_pos + 6];
x1[7] = tmps->xy[x_pos + 7];
u32 y1[8];
y1[0] = tmps->xy[y_pos + 0];
y1[1] = tmps->xy[y_pos + 1];
y1[2] = tmps->xy[y_pos + 2];
y1[3] = tmps->xy[y_pos + 3];
y1[4] = tmps->xy[y_pos + 4];
y1[5] = tmps->xy[y_pos + 5];
y1[6] = tmps->xy[y_pos + 6];
y1[7] = tmps->xy[y_pos + 7];
u32 z1[8] = { 0 };
z1[0] = 1;
/*
* Start:
*/
// main loop (left-to-right binary algorithm):
for (int pos = loop_start - 1; pos >= 0; pos--) // -1 because we've set/add the point already
{
// always double:
point_double (x1, y1, z1);
// add only if needed:
const u32 multiplier = (naf[pos >> 3] >> ((pos & 7) << 2)) & 0x0f;
if (multiplier)
{
/*
m -> y | y = ((m - (m & 1)) / 2) * 24
----------------------------------
1 -> 0 | 1/2 * 24 = 0
2 -> 16
3 -> 24 | 3/2 * 24 = 24
4 -> 40
5 -> 48 | 5/2 * 24 = 2*24
6 -> 64
7 -> 72 | 7/2 * 24 = 3*24
8 -> 88
*/
const u32 odd = multiplier & 1;
const u32 x_pos = ((multiplier - 1 + odd) >> 1) * 24;
const u32 y_pos = odd ? (x_pos + 8) : (x_pos + 16);
u32 x2[8];
x2[0] = tmps->xy[x_pos + 0];
x2[1] = tmps->xy[x_pos + 1];
x2[2] = tmps->xy[x_pos + 2];
x2[3] = tmps->xy[x_pos + 3];
x2[4] = tmps->xy[x_pos + 4];
x2[5] = tmps->xy[x_pos + 5];
x2[6] = tmps->xy[x_pos + 6];
x2[7] = tmps->xy[x_pos + 7];
u32 y2[8];
y2[0] = tmps->xy[y_pos + 0];
y2[1] = tmps->xy[y_pos + 1];
y2[2] = tmps->xy[y_pos + 2];
y2[3] = tmps->xy[y_pos + 3];
y2[4] = tmps->xy[y_pos + 4];
y2[5] = tmps->xy[y_pos + 5];
y2[6] = tmps->xy[y_pos + 6];
y2[7] = tmps->xy[y_pos + 7];
// (x1, y1, z1) + multiplier * (x, y, z) = (x1, y1, z1) + (x2, y2, z2)
point_add (x1, y1, z1, x2, y2);
// optimization (there can't be any adds after an add for w-1 times):
// (but it seems to be faster without this manipulation of "pos")
//for (u32 i = 0; i < 3; i++)
//{
// if (pos == 0) break;
// point_double (x1, y1, z1);
// pos--;
//}
}
}
/*
* Get the corresponding affine coordinates x/y:
*
* Note:
* x1_affine = x1_jacobian / z1^2 = x1_jacobian * z1_inv^2
* y1_affine = y1_jacobian / z1^2 = y1_jacobian * z1_inv^2
*
*/
inv_mod (z1);
u32 z2[8];
mul_mod (z2, z1, z1); // z1^2
mul_mod (x1, x1, z2); // x1_affine
mul_mod (z1, z2, z1); // z1^3
mul_mod (y1, y1, z1); // y1_affine
/*
* output:
*/
// shift by 1 byte (8 bits) to make room and add the parity/sign (for odd/even y):
r[8] = (x1[0] << 24);
r[7] = (x1[0] >> 8) | (x1[1] << 24);
r[6] = (x1[1] >> 8) | (x1[2] << 24);
r[5] = (x1[2] >> 8) | (x1[3] << 24);
r[4] = (x1[3] >> 8) | (x1[4] << 24);
r[3] = (x1[4] >> 8) | (x1[5] << 24);
r[2] = (x1[5] >> 8) | (x1[6] << 24);
r[1] = (x1[6] >> 8) | (x1[7] << 24);
r[0] = (x1[7] >> 8);
const u32 type = 0x02 | (y1[0] & 1); // (note: 0b10 | 0b01 = 0x03)
r[0] = r[0] | type << 24; // 0x02 or 0x03
}
DECLSPEC u32 parse_public (secp256k1_t *r, const u32 *k)
{
// verify:
const u32 first_byte = k[0] & 0xff;
if ((first_byte != '\x02') && (first_byte != '\x03'))
{
return 1;
}
// load k into x without the first byte:
u32 x[8];
x[0] = (k[7] & 0xff00) << 16 | (k[7] & 0xff0000) | (k[7] & 0xff000000) >> 16 | (k[8] & 0xff);
x[1] = (k[6] & 0xff00) << 16 | (k[6] & 0xff0000) | (k[6] & 0xff000000) >> 16 | (k[7] & 0xff);
x[2] = (k[5] & 0xff00) << 16 | (k[5] & 0xff0000) | (k[5] & 0xff000000) >> 16 | (k[6] & 0xff);
x[3] = (k[4] & 0xff00) << 16 | (k[4] & 0xff0000) | (k[4] & 0xff000000) >> 16 | (k[5] & 0xff);
x[4] = (k[3] & 0xff00) << 16 | (k[3] & 0xff0000) | (k[3] & 0xff000000) >> 16 | (k[4] & 0xff);
x[5] = (k[2] & 0xff00) << 16 | (k[2] & 0xff0000) | (k[2] & 0xff000000) >> 16 | (k[3] & 0xff);
x[6] = (k[1] & 0xff00) << 16 | (k[1] & 0xff0000) | (k[1] & 0xff000000) >> 16 | (k[2] & 0xff);
x[7] = (k[0] & 0xff00) << 16 | (k[0] & 0xff0000) | (k[0] & 0xff000000) >> 16 | (k[1] & 0xff);
u32 p[8];
p[0] = SECP256K1_P0;
p[1] = SECP256K1_P1;
p[2] = SECP256K1_P2;
p[3] = SECP256K1_P3;
p[4] = SECP256K1_P4;
p[5] = SECP256K1_P5;
p[6] = SECP256K1_P6;
p[7] = SECP256K1_P7;
// x must be smaller than p (because of y ^ 2 = x ^ 3 % p)
for (int i = 7; i >= 0; i--)
{
if (x[i] < p[i]) break;
if (x[i] > p[i]) return 1;
}
// get y^2 = x^3 + 7:
u32 b[8] = { 0 };
b[0] = SECP256K1_B;
u32 y[8];
mul_mod (y, x, x);
mul_mod (y, y, x);
add_mod (y, y, b);
// get y = sqrt (y^2):
sqrt_mod (y);
// check if it's of the correct parity that we want (odd/even):
if ((first_byte & 1) != (y[0] & 1))
{
// y2 = p - y1 (or y2 = y1 * -1)
sub_mod (y, p, y);
}
// get xy:
point_get_coords (r, x, y);
return 0;
}