1
0
mirror of https://github.com/trezor/trezor-firmware.git synced 2024-11-22 07:28:10 +00:00

use curve25519-donna from floodyberry

This commit is contained in:
Pavol Rusnak 2016-10-24 20:51:57 +02:00
parent bede439a62
commit 0abc61f672
No known key found for this signature in database
GPG Key ID: 91F3B339B9A02A3D
16 changed files with 2368 additions and 877 deletions

View File

@ -46,7 +46,7 @@ SRCS += sha2.c
SRCS += sha3.c
SRCS += aescrypt.c aeskey.c aestab.c aes_modes.c
SRCS += ed25519-donna/ed25519.c
SRCS += curve25519-donna/curve25519-donna.c
SRCS += curve25519-donna/curve25519.c
OBJS = $(SRCS:.c=.o)
@ -62,7 +62,7 @@ tests: tests.o $(OBJS)
$(CC) tests.o $(OBJS) $(TESTLIBS) -o tests
test_speed: test_speed.o $(OBJS)
$(CC) test_speed.o $(OBJS) $(TESTLIBS) -o test_speed
$(CC) test_speed.o $(OBJS) -o test_speed
test-openssl: test-openssl.o $(OBJS)
$(CC) test-openssl.o $(OBJS) $(TESTSSLLIBS) -o test-openssl

View File

@ -37,7 +37,7 @@
#include "secp256k1.h"
#include "nist256p1.h"
#include "ed25519.h"
#include "curve25519-donna.h"
#include "curve25519.h"
#if USE_ETHEREUM
#include "sha3.h"
#endif
@ -400,7 +400,7 @@ void hdnode_fill_public_key(HDNode *node)
ed25519_publickey(node->private_key, node->public_key + 1);
} else if (node->curve == &curve25519_info) {
node->public_key[0] = 1;
curve25519_publickey(node->public_key + 1, node->private_key);
curve25519_donna_basepoint(node->public_key + 1, node->private_key);
} else {
ecdsa_get_public_key33(node->curve->params, node->private_key, node->public_key);
}
@ -466,7 +466,7 @@ int hdnode_get_shared_key(const HDNode *node, const uint8_t *peer_public_key, ui
if (peer_public_key[0] != 0x40) {
return 1; // Curve25519 public key should start with 0x40 byte.
}
curve25519_scalarmult(session_key + 1, node->private_key, peer_public_key + 1);
curve25519_donna(session_key + 1, node->private_key, peer_public_key + 1);
*result_size = 33;
return 0;
} else {

107
curve25519-donna/README.md Normal file
View File

@ -0,0 +1,107 @@
[curve25519](http://cr.yp.to/ecdh.html) is an elliptic curve, developed by
[Dan Bernstein](http://cr.yp.to/djb.html), for fast
[Diffie-Hellman](http://en.wikipedia.org/wiki/Diffie-Hellman) key agreement.
DJB's [original implementation](http://cr.yp.to/ecdh.html) was written in a
language of his own devising called [qhasm](http://cr.yp.to/qhasm.html).
The original qhasm source isn't available, only the x86 32-bit assembly output.
This project provides performant, portable 32-bit & 64-bit implementations.
All implementations are of course constant time in regard to secret data.
#### Performance
Compilers versions are gcc 4.6.3, icc 13.1.1, clang 3.4-1~exp1.
Counts are in thousands of cycles.
Note that SSE2 performance may be less impressive on AMD & older CPUs with slower SSE ops!
##### E5200 @ 2.5ghz, march=core2
<table>
<thead><tr><th>Version</th><th>gcc</th><th>icc</th><th>clang</th></tr></thead>
<tbody>
<tr><td>64-bit SSE2 </td><td> 278k</td><td> 265k</td><td> 302k</td></tr>
<tr><td>64-bit </td><td> 273k</td><td> 271k</td><td> 377k</td></tr>
<tr><td>32-bit SSE2 </td><td> 304k</td><td> 289k</td><td> 317k</td></tr>
<tr><td>32-bit </td><td> 1417k</td><td> 845k</td><td> 981k</td></tr>
</tbody>
</table>
##### E3-1270 @ 3.4ghz, march=corei7-avx
<table>
<thead><tr><th>Version</th><th>gcc</th><th>icc</th><th>clang</th></tr></thead>
<tbody>
<tr><td>64-bit </td><td> 201k</td><td> 192k</td><td> 233k</td></tr>
<tr><td>64-bit SSE2 </td><td> 201k</td><td> 201k</td><td> 261k</td></tr>
<tr><td>32-bit SSE2 </td><td> 238k</td><td> 225k</td><td> 250k</td></tr>
<tr><td>32-bit </td><td> 1293k</td><td> 822k</td><td> 848k</td></tr>
</tbody>
</table>
#### Compilation
No configuration is needed.
##### 32-bit
gcc curve25519.c -m32 -O3 -c
##### 64-bit
gcc curve25519.c -m64 -O3 -c
##### SSE2
gcc curve25519.c -m32 -O3 -c -DCURVE25519_SSE2 -msse2
gcc curve25519.c -m64 -O3 -c -DCURVE25519_SSE2
clang, icc, and msvc are also supported
##### Named Versions
Define CURVE25519_SUFFIX to append a suffix to public functions, e.g.
`-DCURVE25519_SUFFIX=_sse2` to create curve25519_donna_sse2 and
curve25519_donna_basepoint_sse2.
#### Usage
To use the code, link against `curve25519.o` and:
#include "curve25519.h"
To generate a private/secret key, generate 32 cryptographically random bytes:
curve25519_key sk;
randombytes(sk, sizeof(curve25519_key));
Manual clamping is not needed, and it is actually not possible to use unclamped
keys due to the code taking advantage of the clamped bits internally.
To generate the public key from the private/secret key:
curve25519_key pk;
curve25519_donna_basepoint(pk, sk);
To generate a shared key with your private/secret key and someone elses public key:
curve25519_key shared;
curve25519_donna(shared, mysk, yourpk);
And hash `shared` with a cryptographic hash before using, or e.g. pass `shared` through
HSalsa20/HChacha as NaCl does.
#### Testing
Fuzzing against a reference implemenation is now available. See [fuzz/README](fuzz/README.md).
Building `curve25519.c` and linking with `test.c` will run basic sanity tests and benchmark curve25519_donna.
#### Papers
[djb's curve25519 paper](http://cr.yp.to/ecdh/curve25519-20060209.pdf)
#### License
Public Domain, or MIT

View File

@ -0,0 +1,466 @@
typedef uint32_t bignum25519[10];
static const uint32_t reduce_mask_26 = (1 << 26) - 1;
static const uint32_t reduce_mask_25 = (1 << 25) - 1;
/* out = in */
DONNA_INLINE static void
curve25519_copy(bignum25519 out, const bignum25519 in) {
out[0] = in[0];
out[1] = in[1];
out[2] = in[2];
out[3] = in[3];
out[4] = in[4];
out[5] = in[5];
out[6] = in[6];
out[7] = in[7];
out[8] = in[8];
out[9] = in[9];
}
/* out = a + b */
DONNA_INLINE static void
curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
out[0] = a[0] + b[0];
out[1] = a[1] + b[1];
out[2] = a[2] + b[2];
out[3] = a[3] + b[3];
out[4] = a[4] + b[4];
out[5] = a[5] + b[5];
out[6] = a[6] + b[6];
out[7] = a[7] + b[7];
out[8] = a[8] + b[8];
out[9] = a[9] + b[9];
}
/* out = a - b */
DONNA_INLINE static void
curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
uint32_t c;
out[0] = 0x7ffffda + a[0] - b[0] ; c = (out[0] >> 26); out[0] &= reduce_mask_26;
out[1] = 0x3fffffe + a[1] - b[1] + c; c = (out[1] >> 25); out[1] &= reduce_mask_25;
out[2] = 0x7fffffe + a[2] - b[2] + c; c = (out[2] >> 26); out[2] &= reduce_mask_26;
out[3] = 0x3fffffe + a[3] - b[3] + c; c = (out[3] >> 25); out[3] &= reduce_mask_25;
out[4] = 0x7fffffe + a[4] - b[4] + c; c = (out[4] >> 26); out[4] &= reduce_mask_26;
out[5] = 0x3fffffe + a[5] - b[5] + c; c = (out[5] >> 25); out[5] &= reduce_mask_25;
out[6] = 0x7fffffe + a[6] - b[6] + c; c = (out[6] >> 26); out[6] &= reduce_mask_26;
out[7] = 0x3fffffe + a[7] - b[7] + c; c = (out[7] >> 25); out[7] &= reduce_mask_25;
out[8] = 0x7fffffe + a[8] - b[8] + c; c = (out[8] >> 26); out[8] &= reduce_mask_26;
out[9] = 0x3fffffe + a[9] - b[9] + c; c = (out[9] >> 25); out[9] &= reduce_mask_25;
out[0] += 19 * c;
}
/* out = in * scalar */
DONNA_INLINE static void
curve25519_scalar_product(bignum25519 out, const bignum25519 in, const uint32_t scalar) {
uint64_t a;
uint32_t c;
a = mul32x32_64(in[0], scalar); out[0] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
a = mul32x32_64(in[1], scalar) + c; out[1] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
a = mul32x32_64(in[2], scalar) + c; out[2] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
a = mul32x32_64(in[3], scalar) + c; out[3] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
a = mul32x32_64(in[4], scalar) + c; out[4] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
a = mul32x32_64(in[5], scalar) + c; out[5] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
a = mul32x32_64(in[6], scalar) + c; out[6] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
a = mul32x32_64(in[7], scalar) + c; out[7] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
a = mul32x32_64(in[8], scalar) + c; out[8] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
a = mul32x32_64(in[9], scalar) + c; out[9] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
out[0] += c * 19;
}
/* out = a * b */
DONNA_INLINE static void
curve25519_mul(bignum25519 out, const bignum25519 a, const bignum25519 b) {
uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
uint32_t s0,s1,s2,s3,s4,s5,s6,s7,s8,s9;
uint64_t m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,c;
uint32_t p;
r0 = b[0];
r1 = b[1];
r2 = b[2];
r3 = b[3];
r4 = b[4];
r5 = b[5];
r6 = b[6];
r7 = b[7];
r8 = b[8];
r9 = b[9];
s0 = a[0];
s1 = a[1];
s2 = a[2];
s3 = a[3];
s4 = a[4];
s5 = a[5];
s6 = a[6];
s7 = a[7];
s8 = a[8];
s9 = a[9];
m1 = mul32x32_64(r0, s1) + mul32x32_64(r1, s0);
m3 = mul32x32_64(r0, s3) + mul32x32_64(r1, s2) + mul32x32_64(r2, s1) + mul32x32_64(r3, s0);
m5 = mul32x32_64(r0, s5) + mul32x32_64(r1, s4) + mul32x32_64(r2, s3) + mul32x32_64(r3, s2) + mul32x32_64(r4, s1) + mul32x32_64(r5, s0);
m7 = mul32x32_64(r0, s7) + mul32x32_64(r1, s6) + mul32x32_64(r2, s5) + mul32x32_64(r3, s4) + mul32x32_64(r4, s3) + mul32x32_64(r5, s2) + mul32x32_64(r6, s1) + mul32x32_64(r7, s0);
m9 = mul32x32_64(r0, s9) + mul32x32_64(r1, s8) + mul32x32_64(r2, s7) + mul32x32_64(r3, s6) + mul32x32_64(r4, s5) + mul32x32_64(r5, s4) + mul32x32_64(r6, s3) + mul32x32_64(r7, s2) + mul32x32_64(r8, s1) + mul32x32_64(r9, s0);
r1 *= 2;
r3 *= 2;
r5 *= 2;
r7 *= 2;
m0 = mul32x32_64(r0, s0);
m2 = mul32x32_64(r0, s2) + mul32x32_64(r1, s1) + mul32x32_64(r2, s0);
m4 = mul32x32_64(r0, s4) + mul32x32_64(r1, s3) + mul32x32_64(r2, s2) + mul32x32_64(r3, s1) + mul32x32_64(r4, s0);
m6 = mul32x32_64(r0, s6) + mul32x32_64(r1, s5) + mul32x32_64(r2, s4) + mul32x32_64(r3, s3) + mul32x32_64(r4, s2) + mul32x32_64(r5, s1) + mul32x32_64(r6, s0);
m8 = mul32x32_64(r0, s8) + mul32x32_64(r1, s7) + mul32x32_64(r2, s6) + mul32x32_64(r3, s5) + mul32x32_64(r4, s4) + mul32x32_64(r5, s3) + mul32x32_64(r6, s2) + mul32x32_64(r7, s1) + mul32x32_64(r8, s0);
r1 *= 19;
r2 *= 19;
r3 = (r3 / 2) * 19;
r4 *= 19;
r5 = (r5 / 2) * 19;
r6 *= 19;
r7 = (r7 / 2) * 19;
r8 *= 19;
r9 *= 19;
m1 += (mul32x32_64(r9, s2) + mul32x32_64(r8, s3) + mul32x32_64(r7, s4) + mul32x32_64(r6, s5) + mul32x32_64(r5, s6) + mul32x32_64(r4, s7) + mul32x32_64(r3, s8) + mul32x32_64(r2, s9));
m3 += (mul32x32_64(r9, s4) + mul32x32_64(r8, s5) + mul32x32_64(r7, s6) + mul32x32_64(r6, s7) + mul32x32_64(r5, s8) + mul32x32_64(r4, s9));
m5 += (mul32x32_64(r9, s6) + mul32x32_64(r8, s7) + mul32x32_64(r7, s8) + mul32x32_64(r6, s9));
m7 += (mul32x32_64(r9, s8) + mul32x32_64(r8, s9));
r3 *= 2;
r5 *= 2;
r7 *= 2;
r9 *= 2;
m0 += (mul32x32_64(r9, s1) + mul32x32_64(r8, s2) + mul32x32_64(r7, s3) + mul32x32_64(r6, s4) + mul32x32_64(r5, s5) + mul32x32_64(r4, s6) + mul32x32_64(r3, s7) + mul32x32_64(r2, s8) + mul32x32_64(r1, s9));
m2 += (mul32x32_64(r9, s3) + mul32x32_64(r8, s4) + mul32x32_64(r7, s5) + mul32x32_64(r6, s6) + mul32x32_64(r5, s7) + mul32x32_64(r4, s8) + mul32x32_64(r3, s9));
m4 += (mul32x32_64(r9, s5) + mul32x32_64(r8, s6) + mul32x32_64(r7, s7) + mul32x32_64(r6, s8) + mul32x32_64(r5, s9));
m6 += (mul32x32_64(r9, s7) + mul32x32_64(r8, s8) + mul32x32_64(r7, s9));
m8 += (mul32x32_64(r9, s9));
r0 = (uint32_t)m0 & reduce_mask_26; c = (m0 >> 26);
m1 += c; r1 = (uint32_t)m1 & reduce_mask_25; c = (m1 >> 25);
m2 += c; r2 = (uint32_t)m2 & reduce_mask_26; c = (m2 >> 26);
m3 += c; r3 = (uint32_t)m3 & reduce_mask_25; c = (m3 >> 25);
m4 += c; r4 = (uint32_t)m4 & reduce_mask_26; c = (m4 >> 26);
m5 += c; r5 = (uint32_t)m5 & reduce_mask_25; c = (m5 >> 25);
m6 += c; r6 = (uint32_t)m6 & reduce_mask_26; c = (m6 >> 26);
m7 += c; r7 = (uint32_t)m7 & reduce_mask_25; c = (m7 >> 25);
m8 += c; r8 = (uint32_t)m8 & reduce_mask_26; c = (m8 >> 26);
m9 += c; r9 = (uint32_t)m9 & reduce_mask_25; p = (uint32_t)(m9 >> 25);
m0 = r0 + mul32x32_64(p,19); r0 = (uint32_t)m0 & reduce_mask_26; p = (uint32_t)(m0 >> 26);
r1 += p;
out[0] = r0;
out[1] = r1;
out[2] = r2;
out[3] = r3;
out[4] = r4;
out[5] = r5;
out[6] = r6;
out[7] = r7;
out[8] = r8;
out[9] = r9;
}
/* out = in * in */
DONNA_INLINE static void
curve25519_square(bignum25519 out, const bignum25519 in) {
uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
uint32_t d6,d7,d8,d9;
uint64_t m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,c;
uint32_t p;
r0 = in[0];
r1 = in[1];
r2 = in[2];
r3 = in[3];
r4 = in[4];
r5 = in[5];
r6 = in[6];
r7 = in[7];
r8 = in[8];
r9 = in[9];
m0 = mul32x32_64(r0, r0);
r0 *= 2;
m1 = mul32x32_64(r0, r1);
m2 = mul32x32_64(r0, r2) + mul32x32_64(r1, r1 * 2);
r1 *= 2;
m3 = mul32x32_64(r0, r3) + mul32x32_64(r1, r2 );
m4 = mul32x32_64(r0, r4) + mul32x32_64(r1, r3 * 2) + mul32x32_64(r2, r2);
r2 *= 2;
m5 = mul32x32_64(r0, r5) + mul32x32_64(r1, r4 ) + mul32x32_64(r2, r3);
m6 = mul32x32_64(r0, r6) + mul32x32_64(r1, r5 * 2) + mul32x32_64(r2, r4) + mul32x32_64(r3, r3 * 2);
r3 *= 2;
m7 = mul32x32_64(r0, r7) + mul32x32_64(r1, r6 ) + mul32x32_64(r2, r5) + mul32x32_64(r3, r4 );
m8 = mul32x32_64(r0, r8) + mul32x32_64(r1, r7 * 2) + mul32x32_64(r2, r6) + mul32x32_64(r3, r5 * 2) + mul32x32_64(r4, r4 );
m9 = mul32x32_64(r0, r9) + mul32x32_64(r1, r8 ) + mul32x32_64(r2, r7) + mul32x32_64(r3, r6 ) + mul32x32_64(r4, r5 * 2);
d6 = r6 * 19;
d7 = r7 * 2 * 19;
d8 = r8 * 19;
d9 = r9 * 2 * 19;
m0 += (mul32x32_64(d9, r1 ) + mul32x32_64(d8, r2 ) + mul32x32_64(d7, r3 ) + mul32x32_64(d6, r4 * 2) + mul32x32_64(r5, r5 * 2 * 19));
m1 += (mul32x32_64(d9, r2 / 2) + mul32x32_64(d8, r3 ) + mul32x32_64(d7, r4 ) + mul32x32_64(d6, r5 * 2));
m2 += (mul32x32_64(d9, r3 ) + mul32x32_64(d8, r4 * 2) + mul32x32_64(d7, r5 * 2) + mul32x32_64(d6, r6 ));
m3 += (mul32x32_64(d9, r4 ) + mul32x32_64(d8, r5 * 2) + mul32x32_64(d7, r6 ));
m4 += (mul32x32_64(d9, r5 * 2) + mul32x32_64(d8, r6 * 2) + mul32x32_64(d7, r7 ));
m5 += (mul32x32_64(d9, r6 ) + mul32x32_64(d8, r7 * 2));
m6 += (mul32x32_64(d9, r7 * 2) + mul32x32_64(d8, r8 ));
m7 += (mul32x32_64(d9, r8 ));
m8 += (mul32x32_64(d9, r9 ));
r0 = (uint32_t)m0 & reduce_mask_26; c = (m0 >> 26);
m1 += c; r1 = (uint32_t)m1 & reduce_mask_25; c = (m1 >> 25);
m2 += c; r2 = (uint32_t)m2 & reduce_mask_26; c = (m2 >> 26);
m3 += c; r3 = (uint32_t)m3 & reduce_mask_25; c = (m3 >> 25);
m4 += c; r4 = (uint32_t)m4 & reduce_mask_26; c = (m4 >> 26);
m5 += c; r5 = (uint32_t)m5 & reduce_mask_25; c = (m5 >> 25);
m6 += c; r6 = (uint32_t)m6 & reduce_mask_26; c = (m6 >> 26);
m7 += c; r7 = (uint32_t)m7 & reduce_mask_25; c = (m7 >> 25);
m8 += c; r8 = (uint32_t)m8 & reduce_mask_26; c = (m8 >> 26);
m9 += c; r9 = (uint32_t)m9 & reduce_mask_25; p = (uint32_t)(m9 >> 25);
m0 = r0 + mul32x32_64(p,19); r0 = (uint32_t)m0 & reduce_mask_26; p = (uint32_t)(m0 >> 26);
r1 += p;
out[0] = r0;
out[1] = r1;
out[2] = r2;
out[3] = r3;
out[4] = r4;
out[5] = r5;
out[6] = r6;
out[7] = r7;
out[8] = r8;
out[9] = r9;
}
/* out = in^(2 * count) */
static void
curve25519_square_times(bignum25519 out, const bignum25519 in, int count) {
uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
uint32_t d6,d7,d8,d9;
uint64_t m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,c;
uint32_t p;
r0 = in[0];
r1 = in[1];
r2 = in[2];
r3 = in[3];
r4 = in[4];
r5 = in[5];
r6 = in[6];
r7 = in[7];
r8 = in[8];
r9 = in[9];
do {
m0 = mul32x32_64(r0, r0);
r0 *= 2;
m1 = mul32x32_64(r0, r1);
m2 = mul32x32_64(r0, r2) + mul32x32_64(r1, r1 * 2);
r1 *= 2;
m3 = mul32x32_64(r0, r3) + mul32x32_64(r1, r2 );
m4 = mul32x32_64(r0, r4) + mul32x32_64(r1, r3 * 2) + mul32x32_64(r2, r2);
r2 *= 2;
m5 = mul32x32_64(r0, r5) + mul32x32_64(r1, r4 ) + mul32x32_64(r2, r3);
m6 = mul32x32_64(r0, r6) + mul32x32_64(r1, r5 * 2) + mul32x32_64(r2, r4) + mul32x32_64(r3, r3 * 2);
r3 *= 2;
m7 = mul32x32_64(r0, r7) + mul32x32_64(r1, r6 ) + mul32x32_64(r2, r5) + mul32x32_64(r3, r4 );
m8 = mul32x32_64(r0, r8) + mul32x32_64(r1, r7 * 2) + mul32x32_64(r2, r6) + mul32x32_64(r3, r5 * 2) + mul32x32_64(r4, r4 );
m9 = mul32x32_64(r0, r9) + mul32x32_64(r1, r8 ) + mul32x32_64(r2, r7) + mul32x32_64(r3, r6 ) + mul32x32_64(r4, r5 * 2);
d6 = r6 * 19;
d7 = r7 * 2 * 19;
d8 = r8 * 19;
d9 = r9 * 2 * 19;
m0 += (mul32x32_64(d9, r1 ) + mul32x32_64(d8, r2 ) + mul32x32_64(d7, r3 ) + mul32x32_64(d6, r4 * 2) + mul32x32_64(r5, r5 * 2 * 19));
m1 += (mul32x32_64(d9, r2 / 2) + mul32x32_64(d8, r3 ) + mul32x32_64(d7, r4 ) + mul32x32_64(d6, r5 * 2));
m2 += (mul32x32_64(d9, r3 ) + mul32x32_64(d8, r4 * 2) + mul32x32_64(d7, r5 * 2) + mul32x32_64(d6, r6 ));
m3 += (mul32x32_64(d9, r4 ) + mul32x32_64(d8, r5 * 2) + mul32x32_64(d7, r6 ));
m4 += (mul32x32_64(d9, r5 * 2) + mul32x32_64(d8, r6 * 2) + mul32x32_64(d7, r7 ));
m5 += (mul32x32_64(d9, r6 ) + mul32x32_64(d8, r7 * 2));
m6 += (mul32x32_64(d9, r7 * 2) + mul32x32_64(d8, r8 ));
m7 += (mul32x32_64(d9, r8 ));
m8 += (mul32x32_64(d9, r9 ));
r0 = (uint32_t)m0 & reduce_mask_26; c = (m0 >> 26);
m1 += c; r1 = (uint32_t)m1 & reduce_mask_25; c = (m1 >> 25);
m2 += c; r2 = (uint32_t)m2 & reduce_mask_26; c = (m2 >> 26);
m3 += c; r3 = (uint32_t)m3 & reduce_mask_25; c = (m3 >> 25);
m4 += c; r4 = (uint32_t)m4 & reduce_mask_26; c = (m4 >> 26);
m5 += c; r5 = (uint32_t)m5 & reduce_mask_25; c = (m5 >> 25);
m6 += c; r6 = (uint32_t)m6 & reduce_mask_26; c = (m6 >> 26);
m7 += c; r7 = (uint32_t)m7 & reduce_mask_25; c = (m7 >> 25);
m8 += c; r8 = (uint32_t)m8 & reduce_mask_26; c = (m8 >> 26);
m9 += c; r9 = (uint32_t)m9 & reduce_mask_25; p = (uint32_t)(m9 >> 25);
m0 = r0 + mul32x32_64(p,19); r0 = (uint32_t)m0 & reduce_mask_26; p = (uint32_t)(m0 >> 26);
r1 += p;
} while (--count);
out[0] = r0;
out[1] = r1;
out[2] = r2;
out[3] = r3;
out[4] = r4;
out[5] = r5;
out[6] = r6;
out[7] = r7;
out[8] = r8;
out[9] = r9;
}
/* Take a little-endian, 32-byte number and expand it into polynomial form */
static void
curve25519_expand(bignum25519 out, const unsigned char in[32]) {
static const union { uint8_t b[2]; uint16_t s; } endian_check = {{1,0}};
uint32_t x0,x1,x2,x3,x4,x5,x6,x7;
if (endian_check.s == 1) {
x0 = *(uint32_t *)(in + 0);
x1 = *(uint32_t *)(in + 4);
x2 = *(uint32_t *)(in + 8);
x3 = *(uint32_t *)(in + 12);
x4 = *(uint32_t *)(in + 16);
x5 = *(uint32_t *)(in + 20);
x6 = *(uint32_t *)(in + 24);
x7 = *(uint32_t *)(in + 28);
} else {
#define F(s) \
((((uint32_t)in[s + 0]) ) | \
(((uint32_t)in[s + 1]) << 8) | \
(((uint32_t)in[s + 2]) << 16) | \
(((uint32_t)in[s + 3]) << 24))
x0 = F(0);
x1 = F(4);
x2 = F(8);
x3 = F(12);
x4 = F(16);
x5 = F(20);
x6 = F(24);
x7 = F(28);
#undef F
}
out[0] = ( x0 ) & reduce_mask_26;
out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & reduce_mask_25;
out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & reduce_mask_26;
out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & reduce_mask_25;
out[4] = (( x3) >> 6) & reduce_mask_26;
out[5] = ( x4 ) & reduce_mask_25;
out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & reduce_mask_26;
out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & reduce_mask_25;
out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & reduce_mask_26;
out[9] = (( x7) >> 6) & reduce_mask_25; /* ignore the top bit */
}
/* Take a fully reduced polynomial form number and contract it into a little-endian, 32-byte array */
static void
curve25519_contract(unsigned char out[32], const bignum25519 in) {
bignum25519 f;
curve25519_copy(f, in);
#define carry_pass() \
f[1] += f[0] >> 26; f[0] &= reduce_mask_26; \
f[2] += f[1] >> 25; f[1] &= reduce_mask_25; \
f[3] += f[2] >> 26; f[2] &= reduce_mask_26; \
f[4] += f[3] >> 25; f[3] &= reduce_mask_25; \
f[5] += f[4] >> 26; f[4] &= reduce_mask_26; \
f[6] += f[5] >> 25; f[5] &= reduce_mask_25; \
f[7] += f[6] >> 26; f[6] &= reduce_mask_26; \
f[8] += f[7] >> 25; f[7] &= reduce_mask_25; \
f[9] += f[8] >> 26; f[8] &= reduce_mask_26;
#define carry_pass_full() \
carry_pass() \
f[0] += 19 * (f[9] >> 25); f[9] &= reduce_mask_25;
#define carry_pass_final() \
carry_pass() \
f[9] &= reduce_mask_25;
carry_pass_full()
carry_pass_full()
/* now t is between 0 and 2^255-1, properly carried. */
/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
f[0] += 19;
carry_pass_full()
/* now between 19 and 2^255-1 in both cases, and offset by 19. */
f[0] += (1 << 26) - 19;
f[1] += (1 << 25) - 1;
f[2] += (1 << 26) - 1;
f[3] += (1 << 25) - 1;
f[4] += (1 << 26) - 1;
f[5] += (1 << 25) - 1;
f[6] += (1 << 26) - 1;
f[7] += (1 << 25) - 1;
f[8] += (1 << 26) - 1;
f[9] += (1 << 25) - 1;
/* now between 2^255 and 2^256-20, and offset by 2^255. */
carry_pass_final()
#undef carry_pass
#undef carry_full
#undef carry_final
f[1] <<= 2;
f[2] <<= 3;
f[3] <<= 5;
f[4] <<= 6;
f[6] <<= 1;
f[7] <<= 3;
f[8] <<= 4;
f[9] <<= 6;
#define F(i, s) \
out[s+0] |= (unsigned char )(f[i] & 0xff); \
out[s+1] = (unsigned char )((f[i] >> 8) & 0xff); \
out[s+2] = (unsigned char )((f[i] >> 16) & 0xff); \
out[s+3] = (unsigned char )((f[i] >> 24) & 0xff);
out[0] = 0;
out[16] = 0;
F(0,0);
F(1,3);
F(2,6);
F(3,9);
F(4,12);
F(5,16);
F(6,19);
F(7,22);
F(8,25);
F(9,28);
#undef F
}
/*
* Swap the contents of [qx] and [qpx] iff @swap is non-zero
*/
DONNA_INLINE static void
curve25519_swap_conditional(bignum25519 x, bignum25519 qpx, uint32_t iswap) {
const uint32_t swap = (uint32_t)(-(int32_t)iswap);
uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9;
x0 = swap & (x[0] ^ qpx[0]); x[0] ^= x0; qpx[0] ^= x0;
x1 = swap & (x[1] ^ qpx[1]); x[1] ^= x1; qpx[1] ^= x1;
x2 = swap & (x[2] ^ qpx[2]); x[2] ^= x2; qpx[2] ^= x2;
x3 = swap & (x[3] ^ qpx[3]); x[3] ^= x3; qpx[3] ^= x3;
x4 = swap & (x[4] ^ qpx[4]); x[4] ^= x4; qpx[4] ^= x4;
x5 = swap & (x[5] ^ qpx[5]); x[5] ^= x5; qpx[5] ^= x5;
x6 = swap & (x[6] ^ qpx[6]); x[6] ^= x6; qpx[6] ^= x6;
x7 = swap & (x[7] ^ qpx[7]); x[7] ^= x7; qpx[7] ^= x7;
x8 = swap & (x[8] ^ qpx[8]); x[8] ^= x8; qpx[8] ^= x8;
x9 = swap & (x[9] ^ qpx[9]); x[9] ^= x9; qpx[9] ^= x9;
}

View File

@ -0,0 +1,345 @@
typedef uint64_t bignum25519[5];
static const uint64_t reduce_mask_51 = ((uint64_t)1 << 51) - 1;
static const uint64_t reduce_mask_52 = ((uint64_t)1 << 52) - 1;
/* out = in */
DONNA_INLINE static void
curve25519_copy(bignum25519 out, const bignum25519 in) {
out[0] = in[0];
out[1] = in[1];
out[2] = in[2];
out[3] = in[3];
out[4] = in[4];
}
/* out = a + b */
DONNA_INLINE static void
curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
out[0] = a[0] + b[0];
out[1] = a[1] + b[1];
out[2] = a[2] + b[2];
out[3] = a[3] + b[3];
out[4] = a[4] + b[4];
}
static const uint64_t two54m152 = (((uint64_t)1) << 54) - 152;
static const uint64_t two54m8 = (((uint64_t)1) << 54) - 8;
/* out = a - b */
DONNA_INLINE static void
curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
out[0] = a[0] + two54m152 - b[0];
out[1] = a[1] + two54m8 - b[1];
out[2] = a[2] + two54m8 - b[2];
out[3] = a[3] + two54m8 - b[3];
out[4] = a[4] + two54m8 - b[4];
}
/* out = (in * scalar) */
DONNA_INLINE static void
curve25519_scalar_product(bignum25519 out, const bignum25519 in, const uint64_t scalar) {
uint128_t a;
uint64_t c;
#if defined(HAVE_NATIVE_UINT128)
a = ((uint128_t) in[0]) * scalar; out[0] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
a = ((uint128_t) in[1]) * scalar + c; out[1] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
a = ((uint128_t) in[2]) * scalar + c; out[2] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
a = ((uint128_t) in[3]) * scalar + c; out[3] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
a = ((uint128_t) in[4]) * scalar + c; out[4] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
out[0] += c * 19;
#else
mul64x64_128(a, in[0], scalar) out[0] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
mul64x64_128(a, in[1], scalar) add128_64(a, c) out[1] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
mul64x64_128(a, in[2], scalar) add128_64(a, c) out[2] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
mul64x64_128(a, in[3], scalar) add128_64(a, c) out[3] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
mul64x64_128(a, in[4], scalar) add128_64(a, c) out[4] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
out[0] += c * 19;
#endif
}
/* out = a * b */
DONNA_INLINE static void
curve25519_mul(bignum25519 out, const bignum25519 a, const bignum25519 b) {
#if !defined(HAVE_NATIVE_UINT128)
uint128_t mul;
#endif
uint128_t t[5];
uint64_t r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c;
r0 = b[0];
r1 = b[1];
r2 = b[2];
r3 = b[3];
r4 = b[4];
s0 = a[0];
s1 = a[1];
s2 = a[2];
s3 = a[3];
s4 = a[4];
#if defined(HAVE_NATIVE_UINT128)
t[0] = ((uint128_t) r0) * s0;
t[1] = ((uint128_t) r0) * s1 + ((uint128_t) r1) * s0;
t[2] = ((uint128_t) r0) * s2 + ((uint128_t) r2) * s0 + ((uint128_t) r1) * s1;
t[3] = ((uint128_t) r0) * s3 + ((uint128_t) r3) * s0 + ((uint128_t) r1) * s2 + ((uint128_t) r2) * s1;
t[4] = ((uint128_t) r0) * s4 + ((uint128_t) r4) * s0 + ((uint128_t) r3) * s1 + ((uint128_t) r1) * s3 + ((uint128_t) r2) * s2;
#else
mul64x64_128(t[0], r0, s0)
mul64x64_128(t[1], r0, s1) mul64x64_128(mul, r1, s0) add128(t[1], mul)
mul64x64_128(t[2], r0, s2) mul64x64_128(mul, r2, s0) add128(t[2], mul) mul64x64_128(mul, r1, s1) add128(t[2], mul)
mul64x64_128(t[3], r0, s3) mul64x64_128(mul, r3, s0) add128(t[3], mul) mul64x64_128(mul, r1, s2) add128(t[3], mul) mul64x64_128(mul, r2, s1) add128(t[3], mul)
mul64x64_128(t[4], r0, s4) mul64x64_128(mul, r4, s0) add128(t[4], mul) mul64x64_128(mul, r3, s1) add128(t[4], mul) mul64x64_128(mul, r1, s3) add128(t[4], mul) mul64x64_128(mul, r2, s2) add128(t[4], mul)
#endif
r1 *= 19;
r2 *= 19;
r3 *= 19;
r4 *= 19;
#if defined(HAVE_NATIVE_UINT128)
t[0] += ((uint128_t) r4) * s1 + ((uint128_t) r1) * s4 + ((uint128_t) r2) * s3 + ((uint128_t) r3) * s2;
t[1] += ((uint128_t) r4) * s2 + ((uint128_t) r2) * s4 + ((uint128_t) r3) * s3;
t[2] += ((uint128_t) r4) * s3 + ((uint128_t) r3) * s4;
t[3] += ((uint128_t) r4) * s4;
#else
mul64x64_128(mul, r4, s1) add128(t[0], mul) mul64x64_128(mul, r1, s4) add128(t[0], mul) mul64x64_128(mul, r2, s3) add128(t[0], mul) mul64x64_128(mul, r3, s2) add128(t[0], mul)
mul64x64_128(mul, r4, s2) add128(t[1], mul) mul64x64_128(mul, r2, s4) add128(t[1], mul) mul64x64_128(mul, r3, s3) add128(t[1], mul)
mul64x64_128(mul, r4, s3) add128(t[2], mul) mul64x64_128(mul, r3, s4) add128(t[2], mul)
mul64x64_128(mul, r4, s4) add128(t[3], mul)
#endif
r0 = lo128(t[0]) & reduce_mask_51; shr128(c, t[0], 51);
add128_64(t[1], c) r1 = lo128(t[1]) & reduce_mask_51; shr128(c, t[1], 51);
add128_64(t[2], c) r2 = lo128(t[2]) & reduce_mask_51; shr128(c, t[2], 51);
add128_64(t[3], c) r3 = lo128(t[3]) & reduce_mask_51; shr128(c, t[3], 51);
add128_64(t[4], c) r4 = lo128(t[4]) & reduce_mask_51; shr128(c, t[4], 51);
r0 += c * 19; c = r0 >> 51; r0 = r0 & reduce_mask_51;
r1 += c;
out[0] = r0;
out[1] = r1;
out[2] = r2;
out[3] = r3;
out[4] = r4;
}
/* out = in^(2 * count) */
DONNA_INLINE static void
curve25519_square_times(bignum25519 out, const bignum25519 in, uint64_t count) {
#if !defined(HAVE_NATIVE_UINT128)
uint128_t mul;
#endif
uint128_t t[5];
uint64_t r0,r1,r2,r3,r4,c;
uint64_t d0,d1,d2,d4,d419;
r0 = in[0];
r1 = in[1];
r2 = in[2];
r3 = in[3];
r4 = in[4];
do {
d0 = r0 * 2;
d1 = r1 * 2;
d2 = r2 * 2 * 19;
d419 = r4 * 19;
d4 = d419 * 2;
#if defined(HAVE_NATIVE_UINT128)
t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1 + (((uint128_t) d2) * (r3 ));
t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2 + (((uint128_t) r3) * (r3 * 19));
t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1 + (((uint128_t) d4) * (r3 ));
t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2 + (((uint128_t) r4) * (d419 ));
t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3 + (((uint128_t) r2) * (r2 ));
#else
mul64x64_128(t[0], r0, r0) mul64x64_128(mul, d4, r1) add128(t[0], mul) mul64x64_128(mul, d2, r3) add128(t[0], mul)
mul64x64_128(t[1], d0, r1) mul64x64_128(mul, d4, r2) add128(t[1], mul) mul64x64_128(mul, r3, r3 * 19) add128(t[1], mul)
mul64x64_128(t[2], d0, r2) mul64x64_128(mul, r1, r1) add128(t[2], mul) mul64x64_128(mul, d4, r3) add128(t[2], mul)
mul64x64_128(t[3], d0, r3) mul64x64_128(mul, d1, r2) add128(t[3], mul) mul64x64_128(mul, r4, d419) add128(t[3], mul)
mul64x64_128(t[4], d0, r4) mul64x64_128(mul, d1, r3) add128(t[4], mul) mul64x64_128(mul, r2, r2) add128(t[4], mul)
#endif
r0 = lo128(t[0]) & reduce_mask_51; shr128(c, t[0], 51);
add128_64(t[1], c) r1 = lo128(t[1]) & reduce_mask_51; shr128(c, t[1], 51);
add128_64(t[2], c) r2 = lo128(t[2]) & reduce_mask_51; shr128(c, t[2], 51);
add128_64(t[3], c) r3 = lo128(t[3]) & reduce_mask_51; shr128(c, t[3], 51);
add128_64(t[4], c) r4 = lo128(t[4]) & reduce_mask_51; shr128(c, t[4], 51);
r0 += c * 19; c = r0 >> 51; r0 = r0 & reduce_mask_51;
r1 += c;
} while(--count);
out[0] = r0;
out[1] = r1;
out[2] = r2;
out[3] = r3;
out[4] = r4;
}
DONNA_INLINE static void
curve25519_square(bignum25519 out, const bignum25519 in) {
#if !defined(HAVE_NATIVE_UINT128)
uint128_t mul;
#endif
uint128_t t[5];
uint64_t r0,r1,r2,r3,r4,c;
uint64_t d0,d1,d2,d4,d419;
r0 = in[0];
r1 = in[1];
r2 = in[2];
r3 = in[3];
r4 = in[4];
d0 = r0 * 2;
d1 = r1 * 2;
d2 = r2 * 2 * 19;
d419 = r4 * 19;
d4 = d419 * 2;
#if defined(HAVE_NATIVE_UINT128)
t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1 + (((uint128_t) d2) * (r3 ));
t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2 + (((uint128_t) r3) * (r3 * 19));
t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1 + (((uint128_t) d4) * (r3 ));
t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2 + (((uint128_t) r4) * (d419 ));
t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3 + (((uint128_t) r2) * (r2 ));
#else
mul64x64_128(t[0], r0, r0) mul64x64_128(mul, d4, r1) add128(t[0], mul) mul64x64_128(mul, d2, r3) add128(t[0], mul)
mul64x64_128(t[1], d0, r1) mul64x64_128(mul, d4, r2) add128(t[1], mul) mul64x64_128(mul, r3, r3 * 19) add128(t[1], mul)
mul64x64_128(t[2], d0, r2) mul64x64_128(mul, r1, r1) add128(t[2], mul) mul64x64_128(mul, d4, r3) add128(t[2], mul)
mul64x64_128(t[3], d0, r3) mul64x64_128(mul, d1, r2) add128(t[3], mul) mul64x64_128(mul, r4, d419) add128(t[3], mul)
mul64x64_128(t[4], d0, r4) mul64x64_128(mul, d1, r3) add128(t[4], mul) mul64x64_128(mul, r2, r2) add128(t[4], mul)
#endif
r0 = lo128(t[0]) & reduce_mask_51; shr128(c, t[0], 51);
add128_64(t[1], c) r1 = lo128(t[1]) & reduce_mask_51; shr128(c, t[1], 51);
add128_64(t[2], c) r2 = lo128(t[2]) & reduce_mask_51; shr128(c, t[2], 51);
add128_64(t[3], c) r3 = lo128(t[3]) & reduce_mask_51; shr128(c, t[3], 51);
add128_64(t[4], c) r4 = lo128(t[4]) & reduce_mask_51; shr128(c, t[4], 51);
r0 += c * 19; c = r0 >> 51; r0 = r0 & reduce_mask_51;
r1 += c;
out[0] = r0;
out[1] = r1;
out[2] = r2;
out[3] = r3;
out[4] = r4;
}
/* Take a little-endian, 32-byte number and expand it into polynomial form */
DONNA_INLINE static void
curve25519_expand(bignum25519 out, const unsigned char *in) {
static const union { uint8_t b[2]; uint16_t s; } endian_check = {{1,0}};
uint64_t x0,x1,x2,x3;
if (endian_check.s == 1) {
x0 = *(uint64_t *)(in + 0);
x1 = *(uint64_t *)(in + 8);
x2 = *(uint64_t *)(in + 16);
x3 = *(uint64_t *)(in + 24);
} else {
#define F(s) \
((((uint64_t)in[s + 0]) ) | \
(((uint64_t)in[s + 1]) << 8) | \
(((uint64_t)in[s + 2]) << 16) | \
(((uint64_t)in[s + 3]) << 24) | \
(((uint64_t)in[s + 4]) << 32) | \
(((uint64_t)in[s + 5]) << 40) | \
(((uint64_t)in[s + 6]) << 48) | \
(((uint64_t)in[s + 7]) << 56))
x0 = F(0);
x1 = F(8);
x2 = F(16);
x3 = F(24);
}
out[0] = x0 & reduce_mask_51; x0 = (x0 >> 51) | (x1 << 13);
out[1] = x0 & reduce_mask_51; x1 = (x1 >> 38) | (x2 << 26);
out[2] = x1 & reduce_mask_51; x2 = (x2 >> 25) | (x3 << 39);
out[3] = x2 & reduce_mask_51; x3 = (x3 >> 12);
out[4] = x3 & reduce_mask_51; /* ignore the top bit */
}
/* Take a fully reduced polynomial form number and contract it into a
* little-endian, 32-byte array
*/
DONNA_INLINE static void
curve25519_contract(unsigned char *out, const bignum25519 input) {
uint64_t t[5];
uint64_t f, i;
t[0] = input[0];
t[1] = input[1];
t[2] = input[2];
t[3] = input[3];
t[4] = input[4];
#define curve25519_contract_carry() \
t[1] += t[0] >> 51; t[0] &= reduce_mask_51; \
t[2] += t[1] >> 51; t[1] &= reduce_mask_51; \
t[3] += t[2] >> 51; t[2] &= reduce_mask_51; \
t[4] += t[3] >> 51; t[3] &= reduce_mask_51;
#define curve25519_contract_carry_full() curve25519_contract_carry() \
t[0] += 19 * (t[4] >> 51); t[4] &= reduce_mask_51;
#define curve25519_contract_carry_final() curve25519_contract_carry() \
t[4] &= reduce_mask_51;
curve25519_contract_carry_full()
curve25519_contract_carry_full()
/* now t is between 0 and 2^255-1, properly carried. */
/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
t[0] += 19;
curve25519_contract_carry_full()
/* now between 19 and 2^255-1 in both cases, and offset by 19. */
t[0] += 0x8000000000000 - 19;
t[1] += 0x8000000000000 - 1;
t[2] += 0x8000000000000 - 1;
t[3] += 0x8000000000000 - 1;
t[4] += 0x8000000000000 - 1;
/* now between 2^255 and 2^256-20, and offset by 2^255. */
curve25519_contract_carry_final()
#define write51full(n,shift) \
f = ((t[n] >> shift) | (t[n+1] << (51 - shift))); \
for (i = 0; i < 8; i++, f >>= 8) *out++ = (unsigned char)f;
#define write51(n) write51full(n,13*n)
write51(0)
write51(1)
write51(2)
write51(3)
#undef curve25519_contract_carry
#undef curve25519_contract_carry_full
#undef curve25519_contract_carry_final
#undef write51full
#undef write51
}
/*
* Swap the contents of [qx] and [qpx] iff @swap is non-zero
*/
DONNA_INLINE static void
curve25519_swap_conditional(bignum25519 x, bignum25519 qpx, uint64_t iswap) {
const uint64_t swap = (uint64_t)(-(int64_t)iswap);
uint64_t x0,x1,x2,x3,x4;
x0 = swap & (x[0] ^ qpx[0]); x[0] ^= x0; qpx[0] ^= x0;
x1 = swap & (x[1] ^ qpx[1]); x[1] ^= x1; qpx[1] ^= x1;
x2 = swap & (x[2] ^ qpx[2]); x[2] ^= x2; qpx[2] ^= x2;
x3 = swap & (x[3] ^ qpx[3]); x[3] ^= x3; qpx[3] ^= x3;
x4 = swap & (x[4] ^ qpx[4]); x[4] ^= x4; qpx[4] ^= x4;
}

View File

@ -0,0 +1,43 @@
/*
* In: b = 2^5 - 2^0
* Out: b = 2^250 - 2^0
*/
static void
curve25519_pow_two5mtwo0_two250mtwo0(bignum25519 b) {
bignum25519 ALIGN(16) t0,c;
/* 2^5 - 2^0 */ /* b */
/* 2^10 - 2^5 */ curve25519_square_times(t0, b, 5);
/* 2^10 - 2^0 */ curve25519_mul(b, t0, b);
/* 2^20 - 2^10 */ curve25519_square_times(t0, b, 10);
/* 2^20 - 2^0 */ curve25519_mul(c, t0, b);
/* 2^40 - 2^20 */ curve25519_square_times(t0, c, 20);
/* 2^40 - 2^0 */ curve25519_mul(t0, t0, c);
/* 2^50 - 2^10 */ curve25519_square_times(t0, t0, 10);
/* 2^50 - 2^0 */ curve25519_mul(b, t0, b);
/* 2^100 - 2^50 */ curve25519_square_times(t0, b, 50);
/* 2^100 - 2^0 */ curve25519_mul(c, t0, b);
/* 2^200 - 2^100 */ curve25519_square_times(t0, c, 100);
/* 2^200 - 2^0 */ curve25519_mul(t0, t0, c);
/* 2^250 - 2^50 */ curve25519_square_times(t0, t0, 50);
/* 2^250 - 2^0 */ curve25519_mul(b, t0, b);
}
/*
* z^(p - 2) = z(2^255 - 21)
*/
static void
curve25519_recip(bignum25519 out, const bignum25519 z) {
bignum25519 ALIGN(16) a,t0,b;
/* 2 */ curve25519_square(a, z); /* a = 2 */
/* 8 */ curve25519_square_times(t0, a, 2);
/* 9 */ curve25519_mul(b, t0, z); /* b = 9 */
/* 11 */ curve25519_mul(a, b, a); /* a = 11 */
/* 22 */ curve25519_square(t0, a);
/* 2^5 - 2^0 = 31 */ curve25519_mul(b, t0, b);
/* 2^250 - 2^0 */ curve25519_pow_two5mtwo0_two250mtwo0(b);
/* 2^255 - 2^5 */ curve25519_square_times(b, b, 5);
/* 2^255 - 21 */ curve25519_mul(out, b, a);
}

View File

@ -0,0 +1,103 @@
/* os */
#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
#define OS_WINDOWS
#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
#define OS_SOLARIS
#else
#include <sys/param.h> /* need this to define BSD */
#define OS_NIX
#if defined(__linux__)
#define OS_LINUX
#elif defined(BSD)
#define OS_BSD
#if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__))
#define OS_OSX
#elif defined(macintosh) || defined(Macintosh)
#define OS_MAC
#elif defined(__OpenBSD__)
#define OS_OPENBSD
#endif
#endif
#endif
/* compiler */
#if defined(_MSC_VER)
#define COMPILER_MSVC
#endif
#if defined(__ICC)
#define COMPILER_INTEL
#endif
#if defined(__GNUC__)
#if (__GNUC__ >= 3)
#define COMPILER_GCC ((__GNUC__ * 10000) + (__GNUC_MINOR__ * 100) + (__GNUC_PATCHLEVEL__))
#else
#define COMPILER_GCC ((__GNUC__ * 10000) + (__GNUC_MINOR__ * 100) )
#endif
#endif
#if defined(__PATHCC__)
#define COMPILER_PATHCC
#endif
#if defined(__clang__)
#define COMPILER_CLANG ((__clang_major__ * 10000) + (__clang_minor__ * 100) + (__clang_patchlevel__))
#endif
/* cpu */
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
#define CPU_X86_64
#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
#define CPU_X86 500
#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
#define CPU_X86 400
#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
#define CPU_X86 300
#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
#define CPU_IA64
#endif
#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
#define CPU_SPARC
#if defined(__sparcv9)
#define CPU_SPARC64
#endif
#endif
#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
#define CPU_PPC
#if defined(_ARCH_PWR7)
#define CPU_POWER7
#elif defined(__64BIT__)
#define CPU_PPC64
#else
#define CPU_PPC32
#endif
#endif
#if defined(__hppa__) || defined(__hppa)
#define CPU_HPPA
#endif
#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
#define CPU_ALPHA
#endif
/* 64 bit cpu */
#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
#define CPU_64BITS
#endif
#if defined(COMPILER_MSVC)
typedef signed char int8_t;
typedef unsigned char uint8_t;
typedef signed short int16_t;
typedef unsigned short uint16_t;
typedef signed int int32_t;
typedef unsigned int uint32_t;
typedef signed __int64 int64_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

View File

@ -0,0 +1,92 @@
#include "curve25519-donna-portable-identify.h"
#define mul32x32_64(a,b) (((uint64_t)(a))*(b))
/* platform */
#if defined(COMPILER_MSVC)
#include <intrin.h>
#if !defined(_DEBUG)
#undef mul32x32_64
#define mul32x32_64(a,b) __emulu(a,b)
#endif
#undef inline
#define inline __forceinline
#define DONNA_INLINE __forceinline
#define DONNA_NOINLINE __declspec(noinline)
#define ALIGN(x) __declspec(align(x))
#define ROTL32(a,b) _rotl(a,b)
#define ROTR32(a,b) _rotr(a,b)
#else
#include <sys/param.h>
#define DONNA_INLINE inline __attribute__((always_inline))
#define DONNA_NOINLINE __attribute__((noinline))
#define ALIGN(x) __attribute__((aligned(x)))
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
#endif
/* uint128_t */
#if defined(CPU_64BITS) && !defined(ED25519_FORCE_32BIT)
#if defined(COMPILER_CLANG) && (COMPILER_CLANG >= 30100)
#define HAVE_NATIVE_UINT128
typedef unsigned __int128 uint128_t;
#elif defined(COMPILER_MSVC)
#define HAVE_UINT128
typedef struct uint128_t {
uint64_t lo, hi;
} uint128_t;
#define mul64x64_128(out,a,b) out.lo = _umul128(a,b,&out.hi);
#define shr128_pair(out,hi,lo,shift) out = __shiftright128(lo, hi, shift);
#define shl128_pair(out,hi,lo,shift) out = __shiftleft128(lo, hi, shift);
#define shr128(out,in,shift) shr128_pair(out, in.hi, in.lo, shift)
#define shl128(out,in,shift) shl128_pair(out, in.hi, in.lo, shift)
#define add128(a,b) { uint64_t p = a.lo; a.lo += b.lo; a.hi += b.hi + (a.lo < p); }
#define add128_64(a,b) { uint64_t p = a.lo; a.lo += b; a.hi += (a.lo < p); }
#define lo128(a) (a.lo)
#define hi128(a) (a.hi)
#elif defined(COMPILER_GCC) && !defined(HAVE_NATIVE_UINT128)
#if defined(__SIZEOF_INT128__)
#define HAVE_NATIVE_UINT128
typedef unsigned __int128 uint128_t;
#elif (COMPILER_GCC >= 40400)
#define HAVE_NATIVE_UINT128
typedef unsigned uint128_t __attribute__((mode(TI)));
#elif defined(CPU_X86_64)
#define HAVE_UINT128
typedef struct uint128_t {
uint64_t lo, hi;
} uint128_t;
#define mul64x64_128(out,a,b) __asm__ ("mulq %3" : "=a" (out.lo), "=d" (out.hi) : "a" (a), "rm" (b));
#define shr128_pair(out,hi,lo,shift) __asm__ ("shrdq %2,%1,%0" : "+r" (lo) : "r" (hi), "J" (shift)); out = lo;
#define shl128_pair(out,hi,lo,shift) __asm__ ("shldq %2,%1,%0" : "+r" (hi) : "r" (lo), "J" (shift)); out = hi;
#define shr128(out,in,shift) shr128_pair(out,in.hi, in.lo, shift)
#define shl128(out,in,shift) shl128_pair(out,in.hi, in.lo, shift)
#define add128(a,b) __asm__ ("addq %4,%2; adcq %5,%3" : "=r" (a.hi), "=r" (a.lo) : "1" (a.lo), "0" (a.hi), "rm" (b.lo), "rm" (b.hi) : "cc");
#define add128_64(a,b) __asm__ ("addq %4,%2; adcq $0,%3" : "=r" (a.hi), "=r" (a.lo) : "1" (a.lo), "0" (a.hi), "rm" (b) : "cc");
#define lo128(a) (a.lo)
#define hi128(a) (a.hi)
#endif
#endif
#if defined(HAVE_NATIVE_UINT128)
#define HAVE_UINT128
#define mul64x64_128(out,a,b) out = (uint128_t)a * b;
#define shr128_pair(out,hi,lo,shift) out = (uint64_t)((((uint128_t)hi << 64) | lo) >> (shift));
#define shl128_pair(out,hi,lo,shift) out = (uint64_t)(((((uint128_t)hi << 64) | lo) << (shift)) >> 64);
#define shr128(out,in,shift) out = (uint64_t)(in >> (shift));
#define shl128(out,in,shift) out = (uint64_t)((in << shift) >> 64);
#define add128(a,b) a += b;
#define add128_64(a,b) a += (uint64_t)b;
#define lo128(a) ((uint64_t)a)
#define hi128(a) ((uint64_t)(a >> 64))
#endif
#if !defined(HAVE_UINT128)
#error Need a uint128_t implementation!
#endif
#endif
#include <stdlib.h>
#include <string.h>

View File

@ -0,0 +1,66 @@
/* Calculates nQ where Q is the x-coordinate of a point on the curve
*
* mypublic: the packed little endian x coordinate of the resulting curve point
* n: a little endian, 32-byte number
* basepoint: a packed little endian point of the curve
*/
static void
curve25519_scalarmult_donna(curve25519_key mypublic, const curve25519_key n, const curve25519_key basepoint) {
bignum25519 nqpqx = {1}, nqpqz = {0}, nqz = {1}, nqx;
bignum25519 q, qx, qpqx, qqx, zzz, zmone;
size_t bit, lastbit;
int32_t i;
curve25519_expand(q, basepoint);
curve25519_copy(nqx, q);
/* bit 255 is always 0, and bit 254 is always 1, so skip bit 255 and
start pre-swapped on bit 254 */
lastbit = 1;
/* we are doing bits 254..3 in the loop, but are swapping in bits 253..2 */
for (i = 253; i >= 2; i--) {
curve25519_add(qx, nqx, nqz);
curve25519_sub(nqz, nqx, nqz);
curve25519_add(qpqx, nqpqx, nqpqz);
curve25519_sub(nqpqz, nqpqx, nqpqz);
curve25519_mul(nqpqx, qpqx, nqz);
curve25519_mul(nqpqz, qx, nqpqz);
curve25519_add(qqx, nqpqx, nqpqz);
curve25519_sub(nqpqz, nqpqx, nqpqz);
curve25519_square(nqpqz, nqpqz);
curve25519_square(nqpqx, qqx);
curve25519_mul(nqpqz, nqpqz, q);
curve25519_square(qx, qx);
curve25519_square(nqz, nqz);
curve25519_mul(nqx, qx, nqz);
curve25519_sub(nqz, qx, nqz);
curve25519_scalar_product(zzz, nqz, 121665);
curve25519_add(zzz, zzz, qx);
curve25519_mul(nqz, nqz, zzz);
bit = (n[i/8] >> (i & 7)) & 1;
curve25519_swap_conditional(nqx, nqpqx, bit ^ lastbit);
curve25519_swap_conditional(nqz, nqpqz, bit ^ lastbit);
lastbit = bit;
}
/* the final 3 bits are always zero, so we only need to double */
for (i = 0; i < 3; i++) {
curve25519_add(qx, nqx, nqz);
curve25519_sub(nqz, nqx, nqz);
curve25519_square(qx, qx);
curve25519_square(nqz, nqz);
curve25519_mul(nqx, qx, nqz);
curve25519_sub(nqz, qx, nqz);
curve25519_scalar_product(zzz, nqz, 121665);
curve25519_add(zzz, zzz, qx);
curve25519_mul(nqz, nqz, zzz);
}
curve25519_recip(zmone, nqz);
curve25519_mul(nqz, nqx, zmone);
curve25519_contract(mypublic, nqz);
}

View File

@ -0,0 +1,65 @@
/* Calculates nQ where Q is the x-coordinate of a point on the curve
*
* mypublic: the packed little endian x coordinate of the resulting curve point
* n: a little endian, 32-byte number
* basepoint: a packed little endian point of the curve
*/
static void
curve25519_scalarmult_donna(curve25519_key mypublic, const curve25519_key n, const curve25519_key basepoint) {
bignum25519 ALIGN(16) nqx = {1}, nqpqz = {1}, nqz = {0}, nqpqx, zmone;
packed32bignum25519 qx, qz, pqz, pqx;
packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
bignum25519mulprecomp preq;
size_t bit, lastbit, i;
curve25519_expand(nqpqx, basepoint);
curve25519_mul_precompute(&preq, nqpqx);
/* do bits 254..3 */
for (i = 254, lastbit = 0; i >= 3; i--) {
bit = (n[i/8] >> (i & 7)) & 1;
curve25519_swap_conditional(nqx, nqpqx, bit ^ lastbit);
curve25519_swap_conditional(nqz, nqpqz, bit ^ lastbit);
lastbit = bit;
curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */
curve25519_tangle32(qz, nqz, nqpqz); /* qz = [nqz,nqpqz] */
curve25519_add_packed32(pqx, qx, qz); /* pqx = [nqx+nqz,nqpqx+nqpqz] */
curve25519_sub_packed32(pqz, qx, qz); /* pqz = [nqx-nqz,nqpqx-nqpqz] */
curve25519_make_nqpq(primex, primez, pqx, pqz); /* primex = [nqx+nqz,nqpqx+nqpqz], primez = [nqpqx-nqpqz,nqx-nqz] */
curve25519_mul_packed64(prime, primex, primez); /* prime = [nqx+nqz,nqpqx+nqpqz] * [nqpqx-nqpqz,nqx-nqz] */
curve25519_addsub_packed64(prime); /* prime = [prime.x+prime.z,prime.x-prime.z] */
curve25519_square_packed64(nqpq, prime); /* nqpq = prime^2 */
curve25519_untangle64(nqpqx, nqpqz, nqpq);
curve25519_mul_precomputed(nqpqz, nqpqz, &preq); /* nqpqz = nqpqz * q */
/* (((sq.x-sq.z)*121665)+sq.x) * (sq.x-sq.z) is equivalent to (sq.x*121666-sq.z*121665) * (sq.x-sq.z) */
curve25519_make_nq(nq, pqx, pqz); /* nq = [nqx+nqz,nqx-nqz] */
curve25519_square_packed64(sq, nq); /* sq = nq^2 */
curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
curve25519_untangle64(nqx, nqz, nq);
};
/* it's possible to get rid of this swap with the swap in the above loop
at the bottom instead of the top, but compilers seem to optimize better this way */
curve25519_swap_conditional(nqx, nqpqx, bit);
curve25519_swap_conditional(nqz, nqpqz, bit);
/* do bits 2..0 */
for (i = 0; i < 3; i++) {
curve25519_compute_nq(nq, nqx, nqz);
curve25519_square_packed64(sq, nq); /* sq = nq^2 */
curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
curve25519_untangle64(nqx, nqz, nq);
}
curve25519_recip(zmone, nqz);
curve25519_mul(nqz, nqx, zmone);
curve25519_contract(mypublic, nqz);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,863 +0,0 @@
/* Copyright 2008, Google Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* curve25519-donna: Curve25519 elliptic curve, public key function
*
* http://code.google.com/p/curve25519-donna/
*
* Adam Langley <agl@imperialviolet.org>
*
* Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
*
* More information about curve25519 can be found here
* http://cr.yp.to/ecdh.html
*
* djb's sample implementation of curve25519 is written in a special assembly
* language called qhasm and uses the floating point registers.
*
* This is, almost, a clean room reimplementation from the curve25519 paper. It
* uses many of the tricks described therein. Only the crecip function is taken
* from the sample implementation. */
#include <string.h>
#include "curve25519-donna.h"
#ifdef _MSC_VER
#define inline __inline
#endif
typedef int32_t s32;
typedef int64_t limb;
/* Field element representation:
*
* Field elements are written as an array of signed, 64-bit limbs, least
* significant first. The value of the field element is:
* x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
*
* i.e. the limbs are 26, 25, 26, 25, ... bits wide. */
/* Sum two numbers: output += in */
static void fsum(limb *output, const limb *in) {
unsigned i;
for (i = 0; i < 10; i += 2) {
output[0+i] = output[0+i] + in[0+i];
output[1+i] = output[1+i] + in[1+i];
}
}
/* Find the difference of two numbers: output = in - output
* (note the order of the arguments!). */
static void fdifference(limb *output, const limb *in) {
unsigned i;
for (i = 0; i < 10; ++i) {
output[i] = in[i] - output[i];
}
}
/* Multiply a number by a scalar: output = in * scalar */
static void fscalar_product(limb *output, const limb *in, const limb scalar) {
unsigned i;
for (i = 0; i < 10; ++i) {
output[i] = in[i] * scalar;
}
}
/* Multiply two numbers: output = in2 * in
*
* output must be distinct to both inputs. The inputs are reduced coefficient
* form, the output is not.
*
* output[x] <= 14 * the largest product of the input limbs. */
static void fproduct(limb *output, const limb *in2, const limb *in) {
output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
((limb) ((s32) in2[1])) * ((s32) in[0]);
output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
((limb) ((s32) in2[0])) * ((s32) in[2]) +
((limb) ((s32) in2[2])) * ((s32) in[0]);
output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
((limb) ((s32) in2[2])) * ((s32) in[1]) +
((limb) ((s32) in2[0])) * ((s32) in[3]) +
((limb) ((s32) in2[3])) * ((s32) in[0]);
output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
((limb) ((s32) in2[3])) * ((s32) in[1])) +
((limb) ((s32) in2[0])) * ((s32) in[4]) +
((limb) ((s32) in2[4])) * ((s32) in[0]);
output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
((limb) ((s32) in2[3])) * ((s32) in[2]) +
((limb) ((s32) in2[1])) * ((s32) in[4]) +
((limb) ((s32) in2[4])) * ((s32) in[1]) +
((limb) ((s32) in2[0])) * ((s32) in[5]) +
((limb) ((s32) in2[5])) * ((s32) in[0]);
output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
((limb) ((s32) in2[1])) * ((s32) in[5]) +
((limb) ((s32) in2[5])) * ((s32) in[1])) +
((limb) ((s32) in2[2])) * ((s32) in[4]) +
((limb) ((s32) in2[4])) * ((s32) in[2]) +
((limb) ((s32) in2[0])) * ((s32) in[6]) +
((limb) ((s32) in2[6])) * ((s32) in[0]);
output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
((limb) ((s32) in2[4])) * ((s32) in[3]) +
((limb) ((s32) in2[2])) * ((s32) in[5]) +
((limb) ((s32) in2[5])) * ((s32) in[2]) +
((limb) ((s32) in2[1])) * ((s32) in[6]) +
((limb) ((s32) in2[6])) * ((s32) in[1]) +
((limb) ((s32) in2[0])) * ((s32) in[7]) +
((limb) ((s32) in2[7])) * ((s32) in[0]);
output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
((limb) ((s32) in2[5])) * ((s32) in[3]) +
((limb) ((s32) in2[1])) * ((s32) in[7]) +
((limb) ((s32) in2[7])) * ((s32) in[1])) +
((limb) ((s32) in2[2])) * ((s32) in[6]) +
((limb) ((s32) in2[6])) * ((s32) in[2]) +
((limb) ((s32) in2[0])) * ((s32) in[8]) +
((limb) ((s32) in2[8])) * ((s32) in[0]);
output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
((limb) ((s32) in2[5])) * ((s32) in[4]) +
((limb) ((s32) in2[3])) * ((s32) in[6]) +
((limb) ((s32) in2[6])) * ((s32) in[3]) +
((limb) ((s32) in2[2])) * ((s32) in[7]) +
((limb) ((s32) in2[7])) * ((s32) in[2]) +
((limb) ((s32) in2[1])) * ((s32) in[8]) +
((limb) ((s32) in2[8])) * ((s32) in[1]) +
((limb) ((s32) in2[0])) * ((s32) in[9]) +
((limb) ((s32) in2[9])) * ((s32) in[0]);
output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
((limb) ((s32) in2[3])) * ((s32) in[7]) +
((limb) ((s32) in2[7])) * ((s32) in[3]) +
((limb) ((s32) in2[1])) * ((s32) in[9]) +
((limb) ((s32) in2[9])) * ((s32) in[1])) +
((limb) ((s32) in2[4])) * ((s32) in[6]) +
((limb) ((s32) in2[6])) * ((s32) in[4]) +
((limb) ((s32) in2[2])) * ((s32) in[8]) +
((limb) ((s32) in2[8])) * ((s32) in[2]);
output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
((limb) ((s32) in2[6])) * ((s32) in[5]) +
((limb) ((s32) in2[4])) * ((s32) in[7]) +
((limb) ((s32) in2[7])) * ((s32) in[4]) +
((limb) ((s32) in2[3])) * ((s32) in[8]) +
((limb) ((s32) in2[8])) * ((s32) in[3]) +
((limb) ((s32) in2[2])) * ((s32) in[9]) +
((limb) ((s32) in2[9])) * ((s32) in[2]);
output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
((limb) ((s32) in2[7])) * ((s32) in[5]) +
((limb) ((s32) in2[3])) * ((s32) in[9]) +
((limb) ((s32) in2[9])) * ((s32) in[3])) +
((limb) ((s32) in2[4])) * ((s32) in[8]) +
((limb) ((s32) in2[8])) * ((s32) in[4]);
output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
((limb) ((s32) in2[7])) * ((s32) in[6]) +
((limb) ((s32) in2[5])) * ((s32) in[8]) +
((limb) ((s32) in2[8])) * ((s32) in[5]) +
((limb) ((s32) in2[4])) * ((s32) in[9]) +
((limb) ((s32) in2[9])) * ((s32) in[4]);
output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
((limb) ((s32) in2[5])) * ((s32) in[9]) +
((limb) ((s32) in2[9])) * ((s32) in[5])) +
((limb) ((s32) in2[6])) * ((s32) in[8]) +
((limb) ((s32) in2[8])) * ((s32) in[6]);
output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
((limb) ((s32) in2[8])) * ((s32) in[7]) +
((limb) ((s32) in2[6])) * ((s32) in[9]) +
((limb) ((s32) in2[9])) * ((s32) in[6]);
output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
((limb) ((s32) in2[9])) * ((s32) in[7]));
output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
((limb) ((s32) in2[9])) * ((s32) in[8]);
output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
}
/* Reduce a long form to a short form by taking the input mod 2^255 - 19.
*
* On entry: |output[i]| < 14*2^54
* On exit: |output[0..8]| < 280*2^54 */
static void freduce_degree(limb *output) {
/* Each of these shifts and adds ends up multiplying the value by 19.
*
* For output[0..8], the absolute entry value is < 14*2^54 and we add, at
* most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */
output[8] += output[18] << 4;
output[8] += output[18] << 1;
output[8] += output[18];
output[7] += output[17] << 4;
output[7] += output[17] << 1;
output[7] += output[17];
output[6] += output[16] << 4;
output[6] += output[16] << 1;
output[6] += output[16];
output[5] += output[15] << 4;
output[5] += output[15] << 1;
output[5] += output[15];
output[4] += output[14] << 4;
output[4] += output[14] << 1;
output[4] += output[14];
output[3] += output[13] << 4;
output[3] += output[13] << 1;
output[3] += output[13];
output[2] += output[12] << 4;
output[2] += output[12] << 1;
output[2] += output[12];
output[1] += output[11] << 4;
output[1] += output[11] << 1;
output[1] += output[11];
output[0] += output[10] << 4;
output[0] += output[10] << 1;
output[0] += output[10];
}
#if (-1 & 3) != 3
#error "This code only works on a two's complement system"
#endif
/* return v / 2^26, using only shifts and adds.
*
* On entry: v can take any value. */
static inline limb
div_by_2_26(const limb v)
{
/* High word of v; no shift needed. */
const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
/* Set to all 1s if v was negative; else set to 0s. */
const int32_t sign = ((int32_t) highword) >> 31;
/* Set to 0x3ffffff if v was negative; else set to 0. */
const int32_t roundoff = ((uint32_t) sign) >> 6;
/* Should return v / (1<<26) */
return (v + roundoff) >> 26;
}
/* return v / (2^25), using only shifts and adds.
*
* On entry: v can take any value. */
static inline limb
div_by_2_25(const limb v)
{
/* High word of v; no shift needed*/
const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
/* Set to all 1s if v was negative; else set to 0s. */
const int32_t sign = ((int32_t) highword) >> 31;
/* Set to 0x1ffffff if v was negative; else set to 0. */
const int32_t roundoff = ((uint32_t) sign) >> 7;
/* Should return v / (1<<25) */
return (v + roundoff) >> 25;
}
/* Reduce all coefficients of the short form input so that |x| < 2^26.
*
* On entry: |output[i]| < 280*2^54 */
static void freduce_coefficients(limb *output) {
unsigned i;
output[10] = 0;
for (i = 0; i < 10; i += 2) {
limb over = div_by_2_26(output[i]);
/* The entry condition (that |output[i]| < 280*2^54) means that over is, at
* most, 280*2^28 in the first iteration of this loop. This is added to the
* next limb and we can approximate the resulting bound of that limb by
* 281*2^54. */
output[i] -= over << 26;
output[i+1] += over;
/* For the first iteration, |output[i+1]| < 281*2^54, thus |over| <
* 281*2^29. When this is added to the next limb, the resulting bound can
* be approximated as 281*2^54.
*
* For subsequent iterations of the loop, 281*2^54 remains a conservative
* bound and no overflow occurs. */
over = div_by_2_25(output[i+1]);
output[i+1] -= over << 25;
output[i+2] += over;
}
/* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */
output[0] += output[10] << 4;
output[0] += output[10] << 1;
output[0] += output[10];
output[10] = 0;
/* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29
* So |over| will be no more than 2^16. */
{
limb over = div_by_2_26(output[0]);
output[0] -= over << 26;
output[1] += over;
}
/* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The
* bound on |output[1]| is sufficient to meet our needs. */
}
/* A helpful wrapper around fproduct: output = in * in2.
*
* On entry: |in[i]| < 2^27 and |in2[i]| < 2^27.
*
* output must be distinct to both inputs. The output is reduced degree
* (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */
static void
fmul(limb *output, const limb *in, const limb *in2) {
limb t[19];
fproduct(t, in, in2);
/* |t[i]| < 14*2^54 */
freduce_degree(t);
freduce_coefficients(t);
/* |t[i]| < 2^26 */
memcpy(output, t, sizeof(limb) * 10);
}
/* Square a number: output = in**2
*
* output must be distinct from the input. The inputs are reduced coefficient
* form, the output is not.
*
* output[x] <= 14 * the largest product of the input limbs. */
static void fsquare_inner(limb *output, const limb *in) {
output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
((limb) ((s32) in[0])) * ((s32) in[2]));
output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
((limb) ((s32) in[0])) * ((s32) in[3]));
output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
((limb) ((s32) in[1])) * ((s32) in[4]) +
((limb) ((s32) in[0])) * ((s32) in[5]));
output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
((limb) ((s32) in[2])) * ((s32) in[4]) +
((limb) ((s32) in[0])) * ((s32) in[6]) +
2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
((limb) ((s32) in[2])) * ((s32) in[5]) +
((limb) ((s32) in[1])) * ((s32) in[6]) +
((limb) ((s32) in[0])) * ((s32) in[7]));
output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
((limb) ((s32) in[0])) * ((s32) in[8]) +
2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
((limb) ((s32) in[3])) * ((s32) in[5])));
output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
((limb) ((s32) in[3])) * ((s32) in[6]) +
((limb) ((s32) in[2])) * ((s32) in[7]) +
((limb) ((s32) in[1])) * ((s32) in[8]) +
((limb) ((s32) in[0])) * ((s32) in[9]));
output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
((limb) ((s32) in[4])) * ((s32) in[6]) +
((limb) ((s32) in[2])) * ((s32) in[8]) +
2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
((limb) ((s32) in[1])) * ((s32) in[9])));
output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
((limb) ((s32) in[4])) * ((s32) in[7]) +
((limb) ((s32) in[3])) * ((s32) in[8]) +
((limb) ((s32) in[2])) * ((s32) in[9]));
output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
((limb) ((s32) in[3])) * ((s32) in[9])));
output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
((limb) ((s32) in[5])) * ((s32) in[8]) +
((limb) ((s32) in[4])) * ((s32) in[9]));
output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
((limb) ((s32) in[6])) * ((s32) in[8]) +
2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
((limb) ((s32) in[6])) * ((s32) in[9]));
output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
}
/* fsquare sets output = in^2.
*
* On entry: The |in| argument is in reduced coefficients form and |in[i]| <
* 2^27.
*
* On exit: The |output| argument is in reduced coefficients form (indeed, one
* need only provide storage for 10 limbs) and |out[i]| < 2^26. */
static void
fsquare(limb *output, const limb *in) {
limb t[19];
fsquare_inner(t, in);
/* |t[i]| < 14*2^54 because the largest product of two limbs will be <
* 2^(27+27) and fsquare_inner adds together, at most, 14 of those
* products. */
freduce_degree(t);
freduce_coefficients(t);
/* |t[i]| < 2^26 */
memcpy(output, t, sizeof(limb) * 10);
}
/* Take a little-endian, 32-byte number and expand it into polynomial form */
static void
fexpand(limb *output, const u8 *input) {
#define F(n,start,shift,mask) \
output[n] = ((((limb) input[start + 0]) | \
((limb) input[start + 1]) << 8 | \
((limb) input[start + 2]) << 16 | \
((limb) input[start + 3]) << 24) >> shift) & mask;
F(0, 0, 0, 0x3ffffff);
F(1, 3, 2, 0x1ffffff);
F(2, 6, 3, 0x3ffffff);
F(3, 9, 5, 0x1ffffff);
F(4, 12, 6, 0x3ffffff);
F(5, 16, 0, 0x1ffffff);
F(6, 19, 1, 0x3ffffff);
F(7, 22, 3, 0x1ffffff);
F(8, 25, 4, 0x3ffffff);
F(9, 28, 6, 0x1ffffff);
#undef F
}
#if (-32 >> 1) != -16
#error "This code only works when >> does sign-extension on negative numbers"
#endif
/* s32_eq returns 0xffffffff iff a == b and zero otherwise. */
static s32 s32_eq(s32 a, s32 b) {
a = ~(a ^ b);
a &= a << 16;
a &= a << 8;
a &= a << 4;
a &= a << 2;
a &= a << 1;
return a >> 31;
}
/* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are
* both non-negative. */
static s32 s32_gte(s32 a, s32 b) {
a -= b;
/* a >= 0 iff a >= b. */
return ~(a >> 31);
}
/* Take a fully reduced polynomial form number and contract it into a
* little-endian, 32-byte array.
*
* On entry: |input_limbs[i]| < 2^26 */
static void
fcontract(u8 *output, limb *input_limbs) {
int i;
int j;
s32 input[10];
/* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */
for (i = 0; i < 10; i++) {
input[i] = input_limbs[i];
}
for (j = 0; j < 2; ++j) {
for (i = 0; i < 9; ++i) {
if ((i & 1) == 1) {
/* This calculation is a time-invariant way to make input[i]
* non-negative by borrowing from the next-larger limb. */
const s32 mask = input[i] >> 31;
const s32 carry = -((input[i] & mask) >> 25);
input[i] = input[i] + (carry << 25);
input[i+1] = input[i+1] - carry;
} else {
const s32 mask = input[i] >> 31;
const s32 carry = -((input[i] & mask) >> 26);
input[i] = input[i] + (carry << 26);
input[i+1] = input[i+1] - carry;
}
}
/* There's no greater limb for input[9] to borrow from, but we can multiply
* by 19 and borrow from input[0], which is valid mod 2^255-19. */
{
const s32 mask = input[9] >> 31;
const s32 carry = -((input[9] & mask) >> 25);
input[9] = input[9] + (carry << 25);
input[0] = input[0] - (carry * 19);
}
/* After the first iteration, input[1..9] are non-negative and fit within
* 25 or 26 bits, depending on position. However, input[0] may be
* negative. */
}
/* The first borrow-propagation pass above ended with every limb
except (possibly) input[0] non-negative.
If input[0] was negative after the first pass, then it was because of a
carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most,
one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19.
In the second pass, each limb is decreased by at most one. Thus the second
borrow-propagation pass could only have wrapped around to decrease
input[0] again if the first pass left input[0] negative *and* input[1]
through input[9] were all zero. In that case, input[1] is now 2^25 - 1,
and this last borrow-propagation step will leave input[1] non-negative. */
{
const s32 mask = input[0] >> 31;
const s32 carry = -((input[0] & mask) >> 26);
input[0] = input[0] + (carry << 26);
input[1] = input[1] - carry;
}
/* All input[i] are now non-negative. However, there might be values between
* 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */
for (j = 0; j < 2; j++) {
for (i = 0; i < 9; i++) {
if ((i & 1) == 1) {
const s32 carry = input[i] >> 25;
input[i] &= 0x1ffffff;
input[i+1] += carry;
} else {
const s32 carry = input[i] >> 26;
input[i] &= 0x3ffffff;
input[i+1] += carry;
}
}
{
const s32 carry = input[9] >> 25;
input[9] &= 0x1ffffff;
input[0] += 19*carry;
}
}
/* If the first carry-chain pass, just above, ended up with a carry from
* input[9], and that caused input[0] to be out-of-bounds, then input[0] was
* < 2^26 + 2*19, because the carry was, at most, two.
*
* If the second pass carried from input[9] again then input[0] is < 2*19 and
* the input[9] -> input[0] carry didn't push input[0] out of bounds. */
/* It still remains the case that input might be between 2^255-19 and 2^255.
* In this case, input[1..9] must take their maximum value and input[0] must
* be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */
s32 mask = s32_gte(input[0], 0x3ffffed);
for (i = 1; i < 10; i++) {
if ((i & 1) == 1) {
mask &= s32_eq(input[i], 0x1ffffff);
} else {
mask &= s32_eq(input[i], 0x3ffffff);
}
}
/* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus
* this conditionally subtracts 2^255-19. */
input[0] -= mask & 0x3ffffed;
for (i = 1; i < 10; i++) {
if ((i & 1) == 1) {
input[i] -= mask & 0x1ffffff;
} else {
input[i] -= mask & 0x3ffffff;
}
}
input[1] <<= 2;
input[2] <<= 3;
input[3] <<= 5;
input[4] <<= 6;
input[6] <<= 1;
input[7] <<= 3;
input[8] <<= 4;
input[9] <<= 6;
#define F(i, s) \
output[s+0] |= input[i] & 0xff; \
output[s+1] = (input[i] >> 8) & 0xff; \
output[s+2] = (input[i] >> 16) & 0xff; \
output[s+3] = (input[i] >> 24) & 0xff;
output[0] = 0;
output[16] = 0;
F(0,0);
F(1,3);
F(2,6);
F(3,9);
F(4,12);
F(5,16);
F(6,19);
F(7,22);
F(8,25);
F(9,28);
#undef F
}
/* Input: Q, Q', Q-Q'
* Output: 2Q, Q+Q'
*
* x2 z3: long form
* x3 z3: long form
* x z: short form, destroyed
* xprime zprime: short form, destroyed
* qmqp: short form, preserved
*
* On entry and exit, the absolute value of the limbs of all inputs and outputs
* are < 2^26. */
static void fmonty(limb *x2, limb *z2, /* output 2Q */
limb *x3, limb *z3, /* output Q + Q' */
limb *x, limb *z, /* input Q */
limb *xprime, limb *zprime, /* input Q' */
const limb *qmqp /* input Q - Q' */) {
limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
zzprime[19], zzzprime[19], xxxprime[19];
memcpy(origx, x, 10 * sizeof(limb));
fsum(x, z);
/* |x[i]| < 2^27 */
fdifference(z, origx); /* does x - z */
/* |z[i]| < 2^27 */
memcpy(origxprime, xprime, sizeof(limb) * 10);
fsum(xprime, zprime);
/* |xprime[i]| < 2^27 */
fdifference(zprime, origxprime);
/* |zprime[i]| < 2^27 */
fproduct(xxprime, xprime, z);
/* |xxprime[i]| < 14*2^54: the largest product of two limbs will be <
* 2^(27+27) and fproduct adds together, at most, 14 of those products.
* (Approximating that to 2^58 doesn't work out.) */
fproduct(zzprime, x, zprime);
/* |zzprime[i]| < 14*2^54 */
freduce_degree(xxprime);
freduce_coefficients(xxprime);
/* |xxprime[i]| < 2^26 */
freduce_degree(zzprime);
freduce_coefficients(zzprime);
/* |zzprime[i]| < 2^26 */
memcpy(origxprime, xxprime, sizeof(limb) * 10);
fsum(xxprime, zzprime);
/* |xxprime[i]| < 2^27 */
fdifference(zzprime, origxprime);
/* |zzprime[i]| < 2^27 */
fsquare(xxxprime, xxprime);
/* |xxxprime[i]| < 2^26 */
fsquare(zzzprime, zzprime);
/* |zzzprime[i]| < 2^26 */
fproduct(zzprime, zzzprime, qmqp);
/* |zzprime[i]| < 14*2^52 */
freduce_degree(zzprime);
freduce_coefficients(zzprime);
/* |zzprime[i]| < 2^26 */
memcpy(x3, xxxprime, sizeof(limb) * 10);
memcpy(z3, zzprime, sizeof(limb) * 10);
fsquare(xx, x);
/* |xx[i]| < 2^26 */
fsquare(zz, z);
/* |zz[i]| < 2^26 */
fproduct(x2, xx, zz);
/* |x2[i]| < 14*2^52 */
freduce_degree(x2);
freduce_coefficients(x2);
/* |x2[i]| < 2^26 */
fdifference(zz, xx); // does zz = xx - zz
/* |zz[i]| < 2^27 */
memset(zzz + 10, 0, sizeof(limb) * 9);
fscalar_product(zzz, zz, 121665);
/* |zzz[i]| < 2^(27+17) */
/* No need to call freduce_degree here:
fscalar_product doesn't increase the degree of its input. */
freduce_coefficients(zzz);
/* |zzz[i]| < 2^26 */
fsum(zzz, xx);
/* |zzz[i]| < 2^27 */
fproduct(z2, zz, zzz);
/* |z2[i]| < 14*2^(26+27) */
freduce_degree(z2);
freduce_coefficients(z2);
/* |z2|i| < 2^26 */
}
/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
* them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
* side-channel attacks.
*
* NOTE that this function requires that 'iswap' be 1 or 0; other values give
* wrong results. Also, the two limb arrays must be in reduced-coefficient,
* reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
* and all all values in a[0..9],b[0..9] must have magnitude less than
* INT32_MAX. */
static void
swap_conditional(limb a[19], limb b[19], limb iswap) {
unsigned i;
const s32 swap = (s32) -iswap;
for (i = 0; i < 10; ++i) {
const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
a[i] = ((s32)a[i]) ^ x;
b[i] = ((s32)b[i]) ^ x;
}
}
/* Calculates nQ where Q is the x-coordinate of a point on the curve
*
* resultx/resultz: the x coordinate of the resulting curve point (short form)
* n: a little endian, 32-byte number
* q: a point of the curve (short form) */
static void
cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
unsigned i, j;
memcpy(nqpqx, q, sizeof(limb) * 10);
for (i = 0; i < 32; ++i) {
u8 byte = n[31 - i];
for (j = 0; j < 8; ++j) {
const limb bit = byte >> 7;
swap_conditional(nqx, nqpqx, bit);
swap_conditional(nqz, nqpqz, bit);
fmonty(nqx2, nqz2,
nqpqx2, nqpqz2,
nqx, nqz,
nqpqx, nqpqz,
q);
swap_conditional(nqx2, nqpqx2, bit);
swap_conditional(nqz2, nqpqz2, bit);
t = nqx;
nqx = nqx2;
nqx2 = t;
t = nqz;
nqz = nqz2;
nqz2 = t;
t = nqpqx;
nqpqx = nqpqx2;
nqpqx2 = t;
t = nqpqz;
nqpqz = nqpqz2;
nqpqz2 = t;
byte <<= 1;
}
}
memcpy(resultx, nqx, sizeof(limb) * 10);
memcpy(resultz, nqz, sizeof(limb) * 10);
}
// -----------------------------------------------------------------------------
// Shamelessly copied from djb's code
// -----------------------------------------------------------------------------
static void
crecip(limb *out, const limb *z) {
limb z2[10];
limb z9[10];
limb z11[10];
limb z2_5_0[10];
limb z2_10_0[10];
limb z2_20_0[10];
limb z2_50_0[10];
limb z2_100_0[10];
limb t0[10];
limb t1[10];
int i;
/* 2 */ fsquare(z2,z);
/* 4 */ fsquare(t1,z2);
/* 8 */ fsquare(t0,t1);
/* 9 */ fmul(z9,t0,z);
/* 11 */ fmul(z11,z9,z2);
/* 22 */ fsquare(t0,z11);
/* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
/* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
/* 2^7 - 2^2 */ fsquare(t1,t0);
/* 2^8 - 2^3 */ fsquare(t0,t1);
/* 2^9 - 2^4 */ fsquare(t1,t0);
/* 2^10 - 2^5 */ fsquare(t0,t1);
/* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
/* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
/* 2^12 - 2^2 */ fsquare(t1,t0);
/* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
/* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
/* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
/* 2^22 - 2^2 */ fsquare(t1,t0);
/* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
/* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
/* 2^41 - 2^1 */ fsquare(t1,t0);
/* 2^42 - 2^2 */ fsquare(t0,t1);
/* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
/* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
/* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
/* 2^52 - 2^2 */ fsquare(t1,t0);
/* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
/* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
/* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
/* 2^102 - 2^2 */ fsquare(t0,t1);
/* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
/* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
/* 2^201 - 2^1 */ fsquare(t0,t1);
/* 2^202 - 2^2 */ fsquare(t1,t0);
/* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
/* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
/* 2^251 - 2^1 */ fsquare(t1,t0);
/* 2^252 - 2^2 */ fsquare(t0,t1);
/* 2^253 - 2^3 */ fsquare(t1,t0);
/* 2^254 - 2^4 */ fsquare(t0,t1);
/* 2^255 - 2^5 */ fsquare(t1,t0);
/* 2^255 - 21 */ fmul(out,t1,z11);
}
static const u8 curve25519_basepoint[32] = {9};
void curve25519_scalarmult(u8 *result, const u8 *secret, const u8 *basepoint) {
limb bp[10], x[10], z[11], zmone[10];
uint8_t e[32];
int i;
for (i = 0; i < 32; ++i) e[i] = secret[i];
e[0] &= 248;
e[31] &= 127;
e[31] |= 64;
fexpand(bp, basepoint);
cmult(x, z, e, bp);
crecip(zmone, z);
fmul(z, x, zmone);
fcontract(result, z);
}
void curve25519_publickey(u8 *public, const u8 *secret) {
curve25519_scalarmult(public, secret, curve25519_basepoint);
}

View File

@ -1,11 +1,32 @@
#ifndef CURVE25519_H
#define CURVE25519_H
#include "curve25519.h"
#include "curve25519-donna-portable.h"
#include <stdint.h>
#if defined(CURVE25519_SSE2)
#else
#if defined(HAVE_UINT128) && !defined(CURVE25519_FORCE_32BIT)
#define CURVE25519_64BIT
#else
#define CURVE25519_32BIT
#endif
#endif
typedef uint8_t u8;
#if !defined(CURVE25519_NO_INLINE_ASM)
#endif
void curve25519_scalarmult(u8 *result, const u8 *secret, const u8 *basepoint);
void curve25519_publickey(u8 *public, const u8 *secret);
#endif // CURVE25519_H
#if defined(CURVE25519_SSE2)
#include "curve25519-donna-sse2.h"
#elif defined(CURVE25519_64BIT)
#include "curve25519-donna-64bit.h"
#else
#include "curve25519-donna-32bit.h"
#endif
#include "curve25519-donna-common.h"
#if defined(CURVE25519_SSE2)
#include "curve25519-donna-scalarmult-sse2.h"
#else
#include "curve25519-donna-scalarmult-base.h"
#endif

View File

@ -0,0 +1,27 @@
#include "curve25519-donna.h"
#if !defined(CURVE25519_SUFFIX)
#define CURVE25519_SUFFIX
#endif
#define CURVE25519_FN3(fn,suffix) fn##suffix
#define CURVE25519_FN2(fn,suffix) CURVE25519_FN3(fn,suffix)
#define CURVE25519_FN(fn) CURVE25519_FN2(fn,CURVE25519_SUFFIX)
void
CURVE25519_FN(curve25519_donna) (curve25519_key mypublic, const curve25519_key secret, const curve25519_key basepoint) {
curve25519_key e;
size_t i;
for (i = 0;i < 32;++i) e[i] = secret[i];
e[0] &= 0xf8;
e[31] &= 0x7f;
e[31] |= 0x40;
curve25519_scalarmult_donna(mypublic, e, basepoint);
}
void
CURVE25519_FN(curve25519_donna_basepoint) (curve25519_key mypublic, const curve25519_key secret) {
static const curve25519_key basepoint = {9};
CURVE25519_FN(curve25519_donna)(mypublic, secret, basepoint);
}

View File

@ -0,0 +1,10 @@
#ifndef CURVE25519_H
#define CURVE25519_H
typedef unsigned char curve25519_key[32];
void curve25519_donna(curve25519_key mypublic, const curve25519_key secret, const curve25519_key basepoint);
void curve25519_donna_basepoint(curve25519_key mypublic, const curve25519_key secret);
#endif /* CURVE25519_H */

View File

@ -8,7 +8,7 @@
#include "secp256k1.h"
#include "nist256p1.h"
#include "ed25519.h"
#include "curve25519-donna.h"
#include "curve25519.h"
static uint8_t msg[32];
@ -92,7 +92,7 @@ void bench_curve25519(void)
clock_t t = clock();
for (int i = 0 ; i < 500; i++) {
curve25519_scalarmult(result, secret, basepoint);
curve25519_donna(result, secret, basepoint);
}
printf("Curve25519 multiplying speed: %0.2f mul/s\n", 500.0f / ((float)(clock() - t) / CLOCKS_PER_SEC));
}