merge curve25519 into ed25519, code reuse

2024-12-26 16:18:22 +00:00 · 2017-03-28 18:53:09 +02:00 · 2017-03-28 18:53:09 +02:00 · 092d8e7bf1
commit 092d8e7bf1
parent 397a13f654
15 changed files with 43 additions and 2285 deletions
--- a/7
+++ b/7
@ -26,7 +26,7 @@ CFLAGS   += $(OPTFLAGS) \
 # disable sequence point warning because of AES code
 CFLAGS += -Wno-sequence-point
-CFLAGS += -Ied25519-donna -Icurve25519-donna -I.
+CFLAGS += -Ied25519-donna -I.
 CFLAGS += -DUSE_ETHEREUM=1
 CFLAGS += -DUSE_GRAPHENE=1
@ -42,8 +42,7 @@ SRCS  += ripemd160.c
 SRCS  += sha2.c
 SRCS  += sha3.c
 SRCS  += aescrypt.c aeskey.c aestab.c aes_modes.c
-SRCS  += ed25519-donna/ed25519.c
+SRCS  += ed25519-donna/ed25519.c ed25519-donna/curve25519.c
 SRCS  += curve25519-donna/curve25519.c
 SRCS  += blake2b.c blake2s.c
 OBJS   = $(SRCS:.c=.o)
@ -80,5 +79,5 @@ tools/bip39bruteforce: tools/bip39bruteforce.o $(OBJS)
 	$(CC) tools/bip39bruteforce.o $(OBJS) -o tools/bip39bruteforce
 clean:
-	rm -f *.o ed25519-donna/*.o curve25519-donna/*.o tests test_speed test-openssl libtrezor-crypto.so
+	rm -f *.o ed25519-donna/*.o tests test_speed test-openssl libtrezor-crypto.so
 	rm -f tools/*.o tools/xpubaddrgen tools/mktable tools/bip39bruteforce
--- a/curve25519-donna/README.md
+++ b/curve25519-donna/README.md
@ -1,107 +0,0 @@
 [curve25519](http://cr.yp.to/ecdh.html) is an elliptic curve, developed by 
 [Dan Bernstein](http://cr.yp.to/djb.html), for fast 
 [Diffie-Hellman](http://en.wikipedia.org/wiki/Diffie-Hellman) key agreement. 
 DJB's [original implementation](http://cr.yp.to/ecdh.html) was written in a 
 language of his own devising called [qhasm](http://cr.yp.to/qhasm.html). 
 The original qhasm source isn't available, only the x86 32-bit assembly output.
 This project provides performant, portable 32-bit & 64-bit implementations. 
 All implementations are of course constant time in regard to secret data.
 #### Performance 
 Compiler versions are gcc 4.6.3, icc 13.1.1, clang 3.4-1~exp1.
 Counts are in thousands of cycles.
 Note that SSE2 performance may be less impressive on AMD & older CPUs with slower SSE ops!
 ##### E5200 @ 2.5ghz, march=core2
 <table>
 <thead><tr><th>Version</th><th>gcc</th><th>icc</th><th>clang</th></tr></thead>
 <tbody>
 <tr><td>64-bit SSE2  </td><td>  278k</td><td>  265k</td><td>  302k</td></tr>
 <tr><td>64-bit       </td><td>  273k</td><td>  271k</td><td>  377k</td></tr>
 <tr><td>32-bit SSE2  </td><td>  304k</td><td>  289k</td><td>  317k</td></tr>
 <tr><td>32-bit       </td><td> 1417k</td><td>  845k</td><td>  981k</td></tr>
 </tbody>
 </table>
 ##### E3-1270 @ 3.4ghz, march=corei7-avx
 <table>
 <thead><tr><th>Version</th><th>gcc</th><th>icc</th><th>clang</th></tr></thead>
 <tbody>
 <tr><td>64-bit       </td><td>  201k</td><td>  192k</td><td>  233k</td></tr>
 <tr><td>64-bit SSE2  </td><td>  201k</td><td>  201k</td><td>  261k</td></tr>
 <tr><td>32-bit SSE2  </td><td>  238k</td><td>  225k</td><td>  250k</td></tr>
 <tr><td>32-bit       </td><td> 1293k</td><td>  822k</td><td>  848k</td></tr>
 </tbody>
 </table>
 #### Compilation
 No configuration is needed.
 ##### 32-bit
 	gcc curve25519.c -m32 -O3 -c
 ##### 64-bit
 	gcc curve25519.c -m64 -O3 -c
 ##### SSE2
 	gcc curve25519.c -m32 -O3 -c -DCURVE25519_SSE2 -msse2
 	gcc curve25519.c -m64 -O3 -c -DCURVE25519_SSE2
 clang, icc, and msvc are also supported
 ##### Named Versions
 Define CURVE25519_SUFFIX to append a suffix to public functions, e.g. 
 `-DCURVE25519_SUFFIX=_sse2` to create curve25519_donna_sse2 and 
 curve25519_donna_basepoint_sse2.
 #### Usage
 To use the code, link against `curve25519.o` and:
 	#include "curve25519.h"
 To generate a private/secret key, generate 32 cryptographically random bytes: 
 	curve25519_key sk;
 	randombytes(sk, sizeof(curve25519_key));
 Manual clamping is not needed, and it is actually not possible to use unclamped
 keys due to the code taking advantage of the clamped bits internally.
 To generate the public key from the private/secret key:
 	curve25519_key pk;
 	curve25519_donna_basepoint(pk, sk);
 To generate a shared key with your private/secret key and someone else's public key:
 	curve25519_key shared;
 	curve25519_donna(shared, mysk, yourpk);
 And hash `shared` with a cryptographic hash before using, or e.g. pass `shared` through
 HSalsa20/HChacha as NaCl does.
 #### Testing
 Fuzzing against a reference implementation is now available. See [fuzz/README](fuzz/README.md).
 Building `curve25519.c` and linking with `test.c` will run basic sanity tests and benchmark curve25519_donna.
 #### Papers
 [djb's curve25519 paper](http://cr.yp.to/ecdh/curve25519-20060209.pdf)
 #### License
 Public Domain, or MIT
--- a/curve25519-donna/curve25519-donna-32bit.h
+++ b/curve25519-donna/curve25519-donna-32bit.h
@ -1,466 +0,0 @@
 typedef uint32_t bignum25519[10];
 static const uint32_t reduce_mask_26 = (1 << 26) - 1;
 static const uint32_t reduce_mask_25 = (1 << 25) - 1;
 /* out = in: copies all ten limbs of the field element */
 DONNA_INLINE static void
 curve25519_copy(bignum25519 out, const bignum25519 in) {
 	int limb;
 	for (limb = 0; limb < 10; limb++) {
 		out[limb] = in[limb];
 	}
 }
 /* out = a + b: limb-wise sum, no carry propagation or reduction is done here */
 DONNA_INLINE static void
 curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
 	int limb;
 	for (limb = 0; limb < 10; limb++) {
 		out[limb] = a[limb] + b[limb];
 	}
 }
 /*
 * out = a - b
 *
 * Every limb is biased by twice the matching limb of 2^255 - 19 before b is
 * subtracted: 0x7ffffda = 2*(2^26 - 19), 0x3fffffe = 2*(2^25 - 1),
 * 0x7fffffe = 2*(2^26 - 1). NOTE(review): presumably this keeps the unsigned
 * limb arithmetic from wrapping for suitably bounded b - confirm the bounds
 * with the callers.
 *
 * After each limb the excess above 26 (even limbs) or 25 (odd limbs) bits is
 * carried into the next limb.
 */
 DONNA_INLINE static void
 curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
 	uint32_t c;
 	out[0] = 0x7ffffda + a[0] - b[0]    ; c = (out[0] >> 26); out[0] &= reduce_mask_26;
 	out[1] = 0x3fffffe + a[1] - b[1] + c; c = (out[1] >> 25); out[1] &= reduce_mask_25;
 	out[2] = 0x7fffffe + a[2] - b[2] + c; c = (out[2] >> 26); out[2] &= reduce_mask_26;
 	out[3] = 0x3fffffe + a[3] - b[3] + c; c = (out[3] >> 25); out[3] &= reduce_mask_25;
 	out[4] = 0x7fffffe + a[4] - b[4] + c; c = (out[4] >> 26); out[4] &= reduce_mask_26;
 	out[5] = 0x3fffffe + a[5] - b[5] + c; c = (out[5] >> 25); out[5] &= reduce_mask_25;
 	out[6] = 0x7fffffe + a[6] - b[6] + c; c = (out[6] >> 26); out[6] &= reduce_mask_26;
 	out[7] = 0x3fffffe + a[7] - b[7] + c; c = (out[7] >> 25); out[7] &= reduce_mask_25;
 	out[8] = 0x7fffffe + a[8] - b[8] + c; c = (out[8] >> 26); out[8] &= reduce_mask_26;
 	out[9] = 0x3fffffe + a[9] - b[9] + c; c = (out[9] >> 25); out[9] &= reduce_mask_25;
 	/* the carry out of limb 9 has weight 2^255, which is 19 mod 2^255 - 19 */
 	out[0] += 19 * c;
 }
 /*
 * out = in * scalar
 *
 * Each limb is multiplied by the 32-bit scalar into a 64-bit accumulator,
 * the low 26 (even limbs) or 25 (odd limbs) bits are stored, and the rest is
 * carried into the next limb. mul32x32_64 is defined outside this block;
 * presumably a 32x32 -> 64-bit multiply.
 */
 DONNA_INLINE static void
 curve25519_scalar_product(bignum25519 out, const bignum25519 in, const uint32_t scalar) {
 	uint64_t a;
 	uint32_t c;
 	a = mul32x32_64(in[0], scalar);     out[0] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
 	a = mul32x32_64(in[1], scalar) + c; out[1] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
 	a = mul32x32_64(in[2], scalar) + c; out[2] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
 	a = mul32x32_64(in[3], scalar) + c; out[3] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
 	a = mul32x32_64(in[4], scalar) + c; out[4] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
 	a = mul32x32_64(in[5], scalar) + c; out[5] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
 	a = mul32x32_64(in[6], scalar) + c; out[6] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
 	a = mul32x32_64(in[7], scalar) + c; out[7] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
 	a = mul32x32_64(in[8], scalar) + c; out[8] = (uint32_t)a & reduce_mask_26; c = (uint32_t)(a >> 26);
 	a = mul32x32_64(in[9], scalar) + c; out[9] = (uint32_t)a & reduce_mask_25; c = (uint32_t)(a >> 25);
 	/* the carry out of limb 9 is folded back into limb 0 multiplied by 19 */
 	                                    out[0] += c * 19;
 }
 /*
 * out = a * b
 *
 * Schoolbook multiplication of two ten-limb values. m0..m9 accumulate the
 * 64-bit column sums; partial products whose limb indices sum to 10 or more
 * wrap around into the low columns multiplied by 19 (2^255 = 19 mod
 * 2^255 - 19). All inputs are read into locals before out is written, so out
 * may be the same array as a or b.
 */
 DONNA_INLINE static void
 curve25519_mul(bignum25519 out, const bignum25519 a, const bignum25519 b) {
 	uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
 	uint32_t s0,s1,s2,s3,s4,s5,s6,s7,s8,s9;
 	uint64_t m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,c;
 	uint32_t p;
 	r0 = b[0];
 	r1 = b[1];
 	r2 = b[2];
 	r3 = b[3];
 	r4 = b[4];
 	r5 = b[5];
 	r6 = b[6];
 	r7 = b[7];
 	r8 = b[8];
 	r9 = b[9];
 	s0 = a[0];
 	s1 = a[1];
 	s2 = a[2];
 	s3 = a[3];
 	s4 = a[4];
 	s5 = a[5];
 	s6 = a[6];
 	s7 = a[7];
 	s8 = a[8];
 	s9 = a[9];
 	/* odd columns, non-wrapping terms */
 	m1 = mul32x32_64(r0, s1) + mul32x32_64(r1, s0);
 	m3 = mul32x32_64(r0, s3) + mul32x32_64(r1, s2) + mul32x32_64(r2, s1) + mul32x32_64(r3, s0);
 	m5 = mul32x32_64(r0, s5) + mul32x32_64(r1, s4) + mul32x32_64(r2, s3) + mul32x32_64(r3, s2) + mul32x32_64(r4, s1) + mul32x32_64(r5, s0);
 	m7 = mul32x32_64(r0, s7) + mul32x32_64(r1, s6) + mul32x32_64(r2, s5) + mul32x32_64(r3, s4) + mul32x32_64(r4, s3) + mul32x32_64(r5, s2) + mul32x32_64(r6, s1) + mul32x32_64(r7, s0);
 	m9 = mul32x32_64(r0, s9) + mul32x32_64(r1, s8) + mul32x32_64(r2, s7) + mul32x32_64(r3, s6) + mul32x32_64(r4, s5) + mul32x32_64(r5, s4) + mul32x32_64(r6, s3) + mul32x32_64(r7, s2) + mul32x32_64(r8, s1) + mul32x32_64(r9, s0);
 	/*
 	 * The odd limbs of b are doubled before the even columns are summed.
 	 * NOTE(review): this appears to compensate for odd*odd products in the
 	 * mixed 26/25-bit radix - confirm against the limb weights.
 	 */
 	r1 *= 2;
 	r3 *= 2;
 	r5 *= 2;
 	r7 *= 2;
 	/* even columns, non-wrapping terms */
 	m0 = mul32x32_64(r0, s0);
 	m2 = mul32x32_64(r0, s2) + mul32x32_64(r1, s1) + mul32x32_64(r2, s0);
 	m4 = mul32x32_64(r0, s4) + mul32x32_64(r1, s3) + mul32x32_64(r2, s2) + mul32x32_64(r3, s1) + mul32x32_64(r4, s0);
 	m6 = mul32x32_64(r0, s6) + mul32x32_64(r1, s5) + mul32x32_64(r2, s4) + mul32x32_64(r3, s3) + mul32x32_64(r4, s2) + mul32x32_64(r5, s1) + mul32x32_64(r6, s0);
 	m8 = mul32x32_64(r0, s8) + mul32x32_64(r1, s7) + mul32x32_64(r2, s6) + mul32x32_64(r3, s5) + mul32x32_64(r4, s4) + mul32x32_64(r5, s3) + mul32x32_64(r6, s2) + mul32x32_64(r7, s1) + mul32x32_64(r8, s0);
 	/* scale b's limbs by 19 for the wrapping terms; r3, r5, r7 first undo the doubling above */
 	r1 *= 19;
 	r2 *= 19;
 	r3 = (r3 / 2) * 19;
 	r4 *= 19;
 	r5 = (r5 / 2) * 19;
 	r6 *= 19;
 	r7 = (r7 / 2) * 19;
 	r8 *= 19;
 	r9 *= 19;
 	/* odd columns, wrapping terms */
 	m1 += (mul32x32_64(r9, s2) + mul32x32_64(r8, s3) + mul32x32_64(r7, s4) + mul32x32_64(r6, s5) + mul32x32_64(r5, s6) + mul32x32_64(r4, s7) + mul32x32_64(r3, s8) + mul32x32_64(r2, s9));
 	m3 += (mul32x32_64(r9, s4) + mul32x32_64(r8, s5) + mul32x32_64(r7, s6) + mul32x32_64(r6, s7) + mul32x32_64(r5, s8) + mul32x32_64(r4, s9));
 	m5 += (mul32x32_64(r9, s6) + mul32x32_64(r8, s7) + mul32x32_64(r7, s8) + mul32x32_64(r6, s9));
 	m7 += (mul32x32_64(r9, s8) + mul32x32_64(r8, s9));
 	/* double the odd limbs again (r1 is still doubled and now carries the factor 19) */
 	r3 *= 2;
 	r5 *= 2;
 	r7 *= 2;
 	r9 *= 2;
 	/* even columns, wrapping terms */
 	m0 += (mul32x32_64(r9, s1) + mul32x32_64(r8, s2) + mul32x32_64(r7, s3) + mul32x32_64(r6, s4) + mul32x32_64(r5, s5) + mul32x32_64(r4, s6) + mul32x32_64(r3, s7) + mul32x32_64(r2, s8) + mul32x32_64(r1, s9));
 	m2 += (mul32x32_64(r9, s3) + mul32x32_64(r8, s4) + mul32x32_64(r7, s5) + mul32x32_64(r6, s6) + mul32x32_64(r5, s7) + mul32x32_64(r4, s8) + mul32x32_64(r3, s9));
 	m4 += (mul32x32_64(r9, s5) + mul32x32_64(r8, s6) + mul32x32_64(r7, s7) + mul32x32_64(r6, s8) + mul32x32_64(r5, s9));
 	m6 += (mul32x32_64(r9, s7) + mul32x32_64(r8, s8) + mul32x32_64(r7, s9));
 	m8 += (mul32x32_64(r9, s9));
 	/* carry chain: keep 26/25 bits per limb and push the rest into the next column */
 	                             r0 = (uint32_t)m0 & reduce_mask_26; c = (m0 >> 26);
 	m1 += c;                     r1 = (uint32_t)m1 & reduce_mask_25; c = (m1 >> 25);
 	m2 += c;                     r2 = (uint32_t)m2 & reduce_mask_26; c = (m2 >> 26);
 	m3 += c;                     r3 = (uint32_t)m3 & reduce_mask_25; c = (m3 >> 25);
 	m4 += c;                     r4 = (uint32_t)m4 & reduce_mask_26; c = (m4 >> 26);
 	m5 += c;                     r5 = (uint32_t)m5 & reduce_mask_25; c = (m5 >> 25);
 	m6 += c;                     r6 = (uint32_t)m6 & reduce_mask_26; c = (m6 >> 26);
 	m7 += c;                     r7 = (uint32_t)m7 & reduce_mask_25; c = (m7 >> 25);
 	m8 += c;                     r8 = (uint32_t)m8 & reduce_mask_26; c = (m8 >> 26);
 	m9 += c;                     r9 = (uint32_t)m9 & reduce_mask_25; p = (uint32_t)(m9 >> 25);
 	/* fold the top carry back into limb 0 times 19; one more carry goes into limb 1 unmasked */
 	m0 = r0 + mul32x32_64(p,19); r0 = (uint32_t)m0 & reduce_mask_26; p = (uint32_t)(m0 >> 26);
 	r1 += p;
 	out[0] = r0;
 	out[1] = r1;
 	out[2] = r2;
 	out[3] = r3;
 	out[4] = r4;
 	out[5] = r5;
 	out[6] = r6;
 	out[7] = r7;
 	out[8] = r8;
 	out[9] = r9;
 }
 /*
 * out = in * in
 *
 * Same column structure as curve25519_mul, but symmetric cross products are
 * computed once with one factor doubled. r0..r3 are doubled in place as the
 * computation proceeds, so the statement order below matters. All limbs of
 * in are read before out is written, so out may be the same array as in.
 */
 DONNA_INLINE static void
 curve25519_square(bignum25519 out, const bignum25519 in) {
 	uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
 	uint32_t d6,d7,d8,d9;
 	uint64_t m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,c;
 	uint32_t p;
 	r0 = in[0];
 	r1 = in[1];
 	r2 = in[2];
 	r3 = in[3];
 	r4 = in[4];
 	r5 = in[5];
 	r6 = in[6];
 	r7 = in[7];
 	r8 = in[8];
 	r9 = in[9];
 	/* non-wrapping terms; each rN is doubled right after its own square term has been used */
 	m0 = mul32x32_64(r0, r0);
 	r0 *= 2;
 	m1 = mul32x32_64(r0, r1);
 	m2 = mul32x32_64(r0, r2) + mul32x32_64(r1, r1 * 2);
 	r1 *= 2;
 	m3 = mul32x32_64(r0, r3) + mul32x32_64(r1, r2    );
 	m4 = mul32x32_64(r0, r4) + mul32x32_64(r1, r3 * 2) + mul32x32_64(r2, r2);
 	r2 *= 2;
 	m5 = mul32x32_64(r0, r5) + mul32x32_64(r1, r4    ) + mul32x32_64(r2, r3);
 	m6 = mul32x32_64(r0, r6) + mul32x32_64(r1, r5 * 2) + mul32x32_64(r2, r4) + mul32x32_64(r3, r3 * 2);
 	r3 *= 2;
 	m7 = mul32x32_64(r0, r7) + mul32x32_64(r1, r6    ) + mul32x32_64(r2, r5) + mul32x32_64(r3, r4    );
 	m8 = mul32x32_64(r0, r8) + mul32x32_64(r1, r7 * 2) + mul32x32_64(r2, r6) + mul32x32_64(r3, r5 * 2) + mul32x32_64(r4, r4    );
 	m9 = mul32x32_64(r0, r9) + mul32x32_64(r1, r8    ) + mul32x32_64(r2, r7) + mul32x32_64(r3, r6    ) + mul32x32_64(r4, r5 * 2);
 	/* high limbs pre-scaled by 19 (odd limbs also doubled) for the wrapping terms */
 	d6 = r6 * 19;
 	d7 = r7 * 2 * 19;
 	d8 = r8 * 19;
 	d9 = r9 * 2 * 19;
 	/* wrapping terms; r2 / 2 undoes the in-place doubling of r2 above */
 	m0 += (mul32x32_64(d9, r1    ) + mul32x32_64(d8, r2    ) + mul32x32_64(d7, r3    ) + mul32x32_64(d6, r4 * 2) + mul32x32_64(r5, r5 * 2 * 19));
 	m1 += (mul32x32_64(d9, r2 / 2) + mul32x32_64(d8, r3    ) + mul32x32_64(d7, r4    ) + mul32x32_64(d6, r5 * 2));
 	m2 += (mul32x32_64(d9, r3    ) + mul32x32_64(d8, r4 * 2) + mul32x32_64(d7, r5 * 2) + mul32x32_64(d6, r6    ));
 	m3 += (mul32x32_64(d9, r4    ) + mul32x32_64(d8, r5 * 2) + mul32x32_64(d7, r6    ));
 	m4 += (mul32x32_64(d9, r5 * 2) + mul32x32_64(d8, r6 * 2) + mul32x32_64(d7, r7    ));
 	m5 += (mul32x32_64(d9, r6    ) + mul32x32_64(d8, r7 * 2));
 	m6 += (mul32x32_64(d9, r7 * 2) + mul32x32_64(d8, r8    ));
 	m7 += (mul32x32_64(d9, r8    ));
 	m8 += (mul32x32_64(d9, r9    ));
 	/* carry chain: keep 26/25 bits per limb and push the rest into the next column */
 	                             r0 = (uint32_t)m0 & reduce_mask_26; c = (m0 >> 26);
 	m1 += c;                     r1 = (uint32_t)m1 & reduce_mask_25; c = (m1 >> 25);
 	m2 += c;                     r2 = (uint32_t)m2 & reduce_mask_26; c = (m2 >> 26);
 	m3 += c;                     r3 = (uint32_t)m3 & reduce_mask_25; c = (m3 >> 25);
 	m4 += c;                     r4 = (uint32_t)m4 & reduce_mask_26; c = (m4 >> 26);
 	m5 += c;                     r5 = (uint32_t)m5 & reduce_mask_25; c = (m5 >> 25);
 	m6 += c;                     r6 = (uint32_t)m6 & reduce_mask_26; c = (m6 >> 26);
 	m7 += c;                     r7 = (uint32_t)m7 & reduce_mask_25; c = (m7 >> 25);
 	m8 += c;                     r8 = (uint32_t)m8 & reduce_mask_26; c = (m8 >> 26);
 	m9 += c;                     r9 = (uint32_t)m9 & reduce_mask_25; p = (uint32_t)(m9 >> 25);
 	/* fold the top carry back into limb 0 times 19; one more carry goes into limb 1 unmasked */
 	m0 = r0 + mul32x32_64(p,19); r0 = (uint32_t)m0 & reduce_mask_26; p = (uint32_t)(m0 >> 26);
 	r1 += p;
 	out[0] = r0;
 	out[1] = r1;
 	out[2] = r2;
 	out[3] = r3;
 	out[4] = r4;
 	out[5] = r5;
 	out[6] = r6;
 	out[7] = r7;
 	out[8] = r8;
 	out[9] = r9;
 }
 /*
 * out = in^(2^count): squares the input count times in a row.
 *
 * count must be at least 1. The loop is a do/while that decrements before
 * testing, so the body always runs once and count == 0 would not terminate
 * after zero iterations.
 */
 static void
 curve25519_square_times(bignum25519 out, const bignum25519 in, int count) {
 	uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
 	uint32_t d6,d7,d8,d9;
 	uint64_t m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,c;
 	uint32_t p;
 	r0 = in[0];
 	r1 = in[1];
 	r2 = in[2];
 	r3 = in[3];
 	r4 = in[4];
 	r5 = in[5];
 	r6 = in[6];
 	r7 = in[7];
 	r8 = in[8];
 	r9 = in[9];
 	/* each iteration is one squaring, same sequence as curve25519_square, kept in r0..r9 */
 	do {
 		m0 = mul32x32_64(r0, r0);
 		r0 *= 2;
 		m1 = mul32x32_64(r0, r1);
 		m2 = mul32x32_64(r0, r2) + mul32x32_64(r1, r1 * 2);
 		r1 *= 2;
 		m3 = mul32x32_64(r0, r3) + mul32x32_64(r1, r2    );
 		m4 = mul32x32_64(r0, r4) + mul32x32_64(r1, r3 * 2) + mul32x32_64(r2, r2);
 		r2 *= 2;
 		m5 = mul32x32_64(r0, r5) + mul32x32_64(r1, r4    ) + mul32x32_64(r2, r3);
 		m6 = mul32x32_64(r0, r6) + mul32x32_64(r1, r5 * 2) + mul32x32_64(r2, r4) + mul32x32_64(r3, r3 * 2);
 		r3 *= 2;
 		m7 = mul32x32_64(r0, r7) + mul32x32_64(r1, r6    ) + mul32x32_64(r2, r5) + mul32x32_64(r3, r4    );
 		m8 = mul32x32_64(r0, r8) + mul32x32_64(r1, r7 * 2) + mul32x32_64(r2, r6) + mul32x32_64(r3, r5 * 2) + mul32x32_64(r4, r4    );
 		m9 = mul32x32_64(r0, r9) + mul32x32_64(r1, r8    ) + mul32x32_64(r2, r7) + mul32x32_64(r3, r6    ) + mul32x32_64(r4, r5 * 2);
 		/* high limbs pre-scaled by 19 (odd limbs also doubled) for the wrapping terms */
 		d6 = r6 * 19;
 		d7 = r7 * 2 * 19;
 		d8 = r8 * 19;
 		d9 = r9 * 2 * 19;
 		m0 += (mul32x32_64(d9, r1    ) + mul32x32_64(d8, r2    ) + mul32x32_64(d7, r3    ) + mul32x32_64(d6, r4 * 2) + mul32x32_64(r5, r5 * 2 * 19));
 		m1 += (mul32x32_64(d9, r2 / 2) + mul32x32_64(d8, r3    ) + mul32x32_64(d7, r4    ) + mul32x32_64(d6, r5 * 2));
 		m2 += (mul32x32_64(d9, r3    ) + mul32x32_64(d8, r4 * 2) + mul32x32_64(d7, r5 * 2) + mul32x32_64(d6, r6    ));
 		m3 += (mul32x32_64(d9, r4    ) + mul32x32_64(d8, r5 * 2) + mul32x32_64(d7, r6    ));
 		m4 += (mul32x32_64(d9, r5 * 2) + mul32x32_64(d8, r6 * 2) + mul32x32_64(d7, r7    ));
 		m5 += (mul32x32_64(d9, r6    ) + mul32x32_64(d8, r7 * 2));
 		m6 += (mul32x32_64(d9, r7 * 2) + mul32x32_64(d8, r8    ));
 		m7 += (mul32x32_64(d9, r8    ));
 		m8 += (mul32x32_64(d9, r9    ));
 		/* carry chain, then fold the top carry back into limb 0 times 19 */
 		                             r0 = (uint32_t)m0 & reduce_mask_26; c = (m0 >> 26);
 		m1 += c;                     r1 = (uint32_t)m1 & reduce_mask_25; c = (m1 >> 25);
 		m2 += c;                     r2 = (uint32_t)m2 & reduce_mask_26; c = (m2 >> 26);
 		m3 += c;                     r3 = (uint32_t)m3 & reduce_mask_25; c = (m3 >> 25);
 		m4 += c;                     r4 = (uint32_t)m4 & reduce_mask_26; c = (m4 >> 26);
 		m5 += c;                     r5 = (uint32_t)m5 & reduce_mask_25; c = (m5 >> 25);
 		m6 += c;                     r6 = (uint32_t)m6 & reduce_mask_26; c = (m6 >> 26);
 		m7 += c;                     r7 = (uint32_t)m7 & reduce_mask_25; c = (m7 >> 25);
 		m8 += c;                     r8 = (uint32_t)m8 & reduce_mask_26; c = (m8 >> 26);
 		m9 += c;                     r9 = (uint32_t)m9 & reduce_mask_25; p = (uint32_t)(m9 >> 25);
 		m0 = r0 + mul32x32_64(p,19); r0 = (uint32_t)m0 & reduce_mask_26; p = (uint32_t)(m0 >> 26);
 		r1 += p;
 	} while (--count);
 	out[0] = r0;
 	out[1] = r1;
 	out[2] = r2;
 	out[3] = r3;
 	out[4] = r4;
 	out[5] = r5;
 	out[6] = r6;
 	out[7] = r7;
 	out[8] = r8;
 	out[9] = r9;
 }
 /*
 * Take a little-endian, 32-byte number and expand it into polynomial form.
 *
 * in  - 32 bytes, least significant byte first; bit 255 is ignored
 * out - receives ten limbs of alternating 26 and 25 bits
 *
 * The eight 32-bit words are always assembled from individual bytes. Reading
 * them through a casted uint32_t pointer would discard const, require an
 * alignment that `in` does not guarantee, and violate the strict-aliasing
 * rule. The byte-wise form yields the same words on every byte order.
 */
 static void
 curve25519_expand(bignum25519 out, const unsigned char in[32]) {
 	uint32_t x0,x1,x2,x3,x4,x5,x6,x7;
 	/* little-endian 32-bit load starting at byte offset s */
 	#define F(s)                         \
 		((((uint32_t)in[s + 0])      ) | \
 		 (((uint32_t)in[s + 1]) <<  8) | \
 		 (((uint32_t)in[s + 2]) << 16) | \
 		 (((uint32_t)in[s + 3]) << 24))
 	x0 = F(0);
 	x1 = F(4);
 	x2 = F(8);
 	x3 = F(12);
 	x4 = F(16);
 	x5 = F(20);
 	x6 = F(24);
 	x7 = F(28);
 	#undef F
 	/* slice the 256-bit value into limbs; limbs 0-4 come from x0..x3, limbs 5-9 from x4..x7 */
 	out[0] = (                        x0       ) & reduce_mask_26;
 	out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & reduce_mask_25;
 	out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & reduce_mask_26;
 	out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & reduce_mask_25;
 	out[4] = ((                       x3) >>  6) & reduce_mask_26;
 	out[5] = (                        x4       ) & reduce_mask_25;
 	out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & reduce_mask_26;
 	out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & reduce_mask_25;
 	out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & reduce_mask_26;
 	out[9] = ((                       x7) >>  6) & reduce_mask_25; /* ignore the top bit */
 }
 /*
 * Take a polynomial form number and contract it into a little-endian,
 * 32-byte array.
 *
 * in  - ten limbs; copied to a local, the caller's value is not modified
 * out - receives 32 bytes, least significant byte first
 *
 * The value is carried, then 19 is added and later 2^255 - 19 more, so that
 * the final masked result is the canonical representative below 2^255 - 19
 * (see the range comments in the body).
 */
 static void
 curve25519_contract(unsigned char out[32], const bignum25519 in) {
 	bignum25519 f;
 	curve25519_copy(f, in);
 	/* one carry sweep from limb 0 up to limb 9 */
 	#define carry_pass() \
 		f[1] += f[0] >> 26; f[0] &= reduce_mask_26; \
 		f[2] += f[1] >> 25; f[1] &= reduce_mask_25; \
 		f[3] += f[2] >> 26; f[2] &= reduce_mask_26; \
 		f[4] += f[3] >> 25; f[3] &= reduce_mask_25; \
 		f[5] += f[4] >> 26; f[4] &= reduce_mask_26; \
 		f[6] += f[5] >> 25; f[5] &= reduce_mask_25; \
 		f[7] += f[6] >> 26; f[6] &= reduce_mask_26; \
 		f[8] += f[7] >> 25; f[7] &= reduce_mask_25; \
 		f[9] += f[8] >> 26; f[8] &= reduce_mask_26;
 	/* sweep, then fold the overflow of limb 9 back into limb 0 times 19 */
 	#define carry_pass_full() \
 		carry_pass() \
 		f[0] += 19 * (f[9] >> 25); f[9] &= reduce_mask_25;
 	/* sweep, then drop the overflow of limb 9 (discards the 2^255 offset) */
 	#define carry_pass_final() \
 		carry_pass() \
 		f[9] &= reduce_mask_25;
 	carry_pass_full()
 	carry_pass_full()
 	/* now f is between 0 and 2^255-1, properly carried. */
 	/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
 	f[0] += 19;
 	carry_pass_full()
 	/* now between 19 and 2^255-1 in both cases, and offset by 19. */
 	f[0] += (1 << 26) - 19;
 	f[1] += (1 << 25) - 1;
 	f[2] += (1 << 26) - 1;
 	f[3] += (1 << 25) - 1;
 	f[4] += (1 << 26) - 1;
 	f[5] += (1 << 25) - 1;
 	f[6] += (1 << 26) - 1;
 	f[7] += (1 << 25) - 1;
 	f[8] += (1 << 26) - 1;
 	f[9] += (1 << 25) - 1;
 	/* now between 2^255 and 2^256-20, and offset by 2^255. */
 	carry_pass_final()
 	/* undefine the helpers under the names they were defined with, so none leak */
 	#undef carry_pass
 	#undef carry_pass_full
 	#undef carry_pass_final
 	/* shift each limb so that its bits start at the right position inside its first output byte */
 	f[1] <<= 2;
 	f[2] <<= 3;
 	f[3] <<= 5;
 	f[4] <<= 6;
 	f[6] <<= 1;
 	f[7] <<= 3;
 	f[8] <<= 4;
 	f[9] <<= 6;
 	/*
 	 * Store limb i as four bytes starting at out[s]. The first byte is ORed
 	 * because it is shared with the top byte of the previous limb.
 	 */
 	#define F(i, s) \
 		out[s+0] |= (unsigned char )(f[i] & 0xff); \
 		out[s+1] = (unsigned char )((f[i] >> 8) & 0xff); \
 		out[s+2] = (unsigned char )((f[i] >> 16) & 0xff); \
 		out[s+3] = (unsigned char )((f[i] >> 24) & 0xff);
 	/* out[0] and out[16] have no previous limb writing them, so clear them before the OR */
 	out[0] = 0;
 	out[16] = 0;
 	F(0,0);
 	F(1,3);
 	F(2,6);
 	F(3,9);
 	F(4,12);
 	F(5,16);
 	F(6,19);
 	F(7,22);
 	F(8,25);
 	F(9,28);
 	#undef F
 }
 /*
 * Exchange the contents of x and qpx when iswap is 1, leave both untouched
 * when iswap is 0. No branch depends on iswap: the flag is negated into a
 * mask (all ones for 1, zero for 0) that gates an XOR swap of every limb.
 */
 DONNA_INLINE static void
 curve25519_swap_conditional(bignum25519 x, bignum25519 qpx, uint32_t iswap) {
 	const uint32_t mask = (uint32_t)(-(int32_t)iswap);
 	uint32_t diff;
 	int limb;
 	for (limb = 0; limb < 10; limb++) {
 		diff = mask & (x[limb] ^ qpx[limb]);
 		x[limb] ^= diff;
 		qpx[limb] ^= diff;
 	}
 }
--- a/curve25519-donna/curve25519-donna-64bit.h
+++ b/curve25519-donna/curve25519-donna-64bit.h
@ -1,345 +0,0 @@
 typedef uint64_t bignum25519[5];
 static const uint64_t reduce_mask_51 = ((uint64_t)1 << 51) - 1;
 static const uint64_t reduce_mask_52 = ((uint64_t)1 << 52) - 1;
 /* out = in: copies all five limbs of the field element */
 DONNA_INLINE static void
 curve25519_copy(bignum25519 out, const bignum25519 in) {
 	int limb;
 	for (limb = 0; limb < 5; limb++) {
 		out[limb] = in[limb];
 	}
 }
 /* out = a + b: limb-wise sum, no carry propagation or reduction is done here */
 DONNA_INLINE static void
 curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
 	int limb;
 	for (limb = 0; limb < 5; limb++) {
 		out[limb] = a[limb] + b[limb];
 	}
 }
 static const uint64_t two54m152 = (((uint64_t)1) << 54) - 152;
 static const uint64_t two54m8 = (((uint64_t)1) << 54) - 8;
 /*
 * out = a - b, limb by limb. Each limb of a is first biased by two54m152
 * (limb 0) or two54m8 (limbs 1-4), i.e. eight times the matching limb of
 * 2^255 - 19, before b is subtracted. No carry propagation is done here.
 */
 DONNA_INLINE static void
 curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
 	int limb;
 	out[0] = (a[0] + two54m152) - b[0];
 	for (limb = 1; limb < 5; limb++) {
 		out[limb] = (a[limb] + two54m8) - b[limb];
 	}
 }
 /*
 * out = (in * scalar)
 *
 * Each limb is multiplied by the scalar into a 128-bit accumulator, the low
 * 51 bits are stored and the rest is carried into the next limb. Two
 * variants exist: plain uint128_t arithmetic when HAVE_NATIVE_UINT128 is
 * defined, otherwise the mul64x64_128 / add128_64 / shr128 / lo128 helper
 * macros, which are defined outside this block and are used here as complete
 * statements without a trailing semicolon.
 */
 DONNA_INLINE static void
 curve25519_scalar_product(bignum25519 out, const bignum25519 in, const uint64_t scalar) {
  uint128_t a;
  uint64_t c;
 #if defined(HAVE_NATIVE_UINT128)
 	a = ((uint128_t) in[0]) * scalar;     out[0] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
 	a = ((uint128_t) in[1]) * scalar + c; out[1] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
 	a = ((uint128_t) in[2]) * scalar + c; out[2] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
 	a = ((uint128_t) in[3]) * scalar + c; out[3] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
 	a = ((uint128_t) in[4]) * scalar + c; out[4] = (uint64_t)a & reduce_mask_51; c = (uint64_t)(a >> 51);
 	/* the carry out of limb 4 has weight 2^255, folded back into limb 0 as 19 */
 	                                      out[0] += c * 19;
 #else
 	mul64x64_128(a, in[0], scalar)                  out[0] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
 	mul64x64_128(a, in[1], scalar) add128_64(a, c)  out[1] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
 	mul64x64_128(a, in[2], scalar) add128_64(a, c)  out[2] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
 	mul64x64_128(a, in[3], scalar) add128_64(a, c)  out[3] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
 	mul64x64_128(a, in[4], scalar) add128_64(a, c)  out[4] = lo128(a) & reduce_mask_51; shr128(c, a, 51);
 	/* the carry out of limb 4 has weight 2^255, folded back into limb 0 as 19 */
 	                                                out[0] += c * 19;
 #endif
 }
 /*
 * out = a * b
 *
 * Schoolbook multiplication of two five-limb (51 bits each) values. t[0..4]
 * accumulate the 128-bit column sums; partial products whose limb indices
 * sum to 5 or more wrap around into the low columns multiplied by 19
 * (2^255 = 19 mod 2^255 - 19). All inputs are read into locals before out is
 * written, so out may be the same array as a or b.
 */
 DONNA_INLINE static void
 curve25519_mul(bignum25519 out, const bignum25519 a, const bignum25519 b) {
 #if !defined(HAVE_NATIVE_UINT128)
 	uint128_t mul;
 #endif
 	uint128_t t[5];
 	uint64_t r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c;
 	r0 = b[0];
 	r1 = b[1];
 	r2 = b[2];
 	r3 = b[3];
 	r4 = b[4];
 	s0 = a[0];
 	s1 = a[1];
 	s2 = a[2];
 	s3 = a[3];
 	s4 = a[4];
 	/* non-wrapping terms: column k collects every r_i * s_j with i + j == k */
 #if defined(HAVE_NATIVE_UINT128)
 	t[0]  =  ((uint128_t) r0) * s0;
 	t[1]  =  ((uint128_t) r0) * s1 + ((uint128_t) r1) * s0;
 	t[2]  =  ((uint128_t) r0) * s2 + ((uint128_t) r2) * s0 + ((uint128_t) r1) * s1;
 	t[3]  =  ((uint128_t) r0) * s3 + ((uint128_t) r3) * s0 + ((uint128_t) r1) * s2 + ((uint128_t) r2) * s1;
 	t[4]  =  ((uint128_t) r0) * s4 + ((uint128_t) r4) * s0 + ((uint128_t) r3) * s1 + ((uint128_t) r1) * s3 + ((uint128_t) r2) * s2;
 #else
 	mul64x64_128(t[0], r0, s0)
 	mul64x64_128(t[1], r0, s1) mul64x64_128(mul, r1, s0) add128(t[1], mul)
 	mul64x64_128(t[2], r0, s2) mul64x64_128(mul, r2, s0) add128(t[2], mul) mul64x64_128(mul, r1, s1) add128(t[2], mul)
 	mul64x64_128(t[3], r0, s3) mul64x64_128(mul, r3, s0) add128(t[3], mul) mul64x64_128(mul, r1, s2) add128(t[3], mul) mul64x64_128(mul, r2, s1) add128(t[3], mul)
 	mul64x64_128(t[4], r0, s4) mul64x64_128(mul, r4, s0) add128(t[4], mul) mul64x64_128(mul, r3, s1) add128(t[4], mul) mul64x64_128(mul, r1, s3) add128(t[4], mul) mul64x64_128(mul, r2, s2) add128(t[4], mul)
 #endif
 	/* pre-scale b's upper limbs by 19 for the wrapping terms (i + j == k + 5) */
 	r1 *= 19;
 	r2 *= 19;
 	r3 *= 19;
 	r4 *= 19;
 #if defined(HAVE_NATIVE_UINT128)
 	t[0] += ((uint128_t) r4) * s1 + ((uint128_t) r1) * s4 + ((uint128_t) r2) * s3 + ((uint128_t) r3) * s2;
 	t[1] += ((uint128_t) r4) * s2 + ((uint128_t) r2) * s4 + ((uint128_t) r3) * s3;
 	t[2] += ((uint128_t) r4) * s3 + ((uint128_t) r3) * s4;
 	t[3] += ((uint128_t) r4) * s4;
 #else
 	mul64x64_128(mul, r4, s1) add128(t[0], mul) mul64x64_128(mul, r1, s4) add128(t[0], mul) mul64x64_128(mul, r2, s3) add128(t[0], mul) mul64x64_128(mul, r3, s2) add128(t[0], mul)
 	mul64x64_128(mul, r4, s2) add128(t[1], mul) mul64x64_128(mul, r2, s4) add128(t[1], mul) mul64x64_128(mul, r3, s3) add128(t[1], mul)
 	mul64x64_128(mul, r4, s3) add128(t[2], mul) mul64x64_128(mul, r3, s4) add128(t[2], mul)
 	mul64x64_128(mul, r4, s4) add128(t[3], mul)
 #endif
 	/* carry chain: keep 51 bits per limb and push the rest into the next column */
 	                     r0 = lo128(t[0]) & reduce_mask_51; shr128(c, t[0], 51);
 	add128_64(t[1], c)   r1 = lo128(t[1]) & reduce_mask_51; shr128(c, t[1], 51);
 	add128_64(t[2], c)   r2 = lo128(t[2]) & reduce_mask_51; shr128(c, t[2], 51);
 	add128_64(t[3], c)   r3 = lo128(t[3]) & reduce_mask_51; shr128(c, t[3], 51);
 	add128_64(t[4], c)   r4 = lo128(t[4]) & reduce_mask_51; shr128(c, t[4], 51);
 	/* fold the top carry back into limb 0 times 19; one more carry goes into limb 1 unmasked */
 	r0 +=   c * 19; c = r0 >> 51; r0 = r0 & reduce_mask_51;
 	r1 +=   c;
 	out[0] = r0;
 	out[1] = r1;
 	out[2] = r2;
 	out[3] = r3;
 	out[4] = r4;
 }
 /*
 * out = in^(2^count): squares the input count times in a row.
 *
 * count must be at least 1. The loop is a do/while that decrements before
 * testing, so the body always runs once and count == 0 would wrap the
 * unsigned counter instead of stopping.
 */
 DONNA_INLINE static void
 curve25519_square_times(bignum25519 out, const bignum25519 in, uint64_t count) {
 #if !defined(HAVE_NATIVE_UINT128)
 	uint128_t mul;
 #endif
 	uint128_t t[5];
 	uint64_t r0,r1,r2,r3,r4,c;
 	uint64_t d0,d1,d2,d4,d419;
 	r0 = in[0];
 	r1 = in[1];
 	r2 = in[2];
 	r3 = in[3];
 	r4 = in[4];
 	do {
 		/* doubled (and, for wrapping terms, 19-scaled) copies used for the cross products */
 		d0 = r0 * 2;
 		d1 = r1 * 2;
 		d2 = r2 * 2 * 19;
 		d419 = r4 * 19;
 		d4 = d419 * 2;
 #if defined(HAVE_NATIVE_UINT128)
 		t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1 + (((uint128_t) d2) * (r3     ));
 		t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2 + (((uint128_t) r3) * (r3 * 19));
 		t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1 + (((uint128_t) d4) * (r3     ));
 		t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2 + (((uint128_t) r4) * (d419   ));
 		t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3 + (((uint128_t) r2) * (r2     ));
 #else
 		mul64x64_128(t[0], r0, r0) mul64x64_128(mul, d4, r1) add128(t[0], mul) mul64x64_128(mul, d2,      r3) add128(t[0], mul)
 		mul64x64_128(t[1], d0, r1) mul64x64_128(mul, d4, r2) add128(t[1], mul) mul64x64_128(mul, r3, r3 * 19) add128(t[1], mul)
 		mul64x64_128(t[2], d0, r2) mul64x64_128(mul, r1, r1) add128(t[2], mul) mul64x64_128(mul, d4,      r3) add128(t[2], mul)
 		mul64x64_128(t[3], d0, r3) mul64x64_128(mul, d1, r2) add128(t[3], mul) mul64x64_128(mul, r4,    d419) add128(t[3], mul)
 		mul64x64_128(t[4], d0, r4) mul64x64_128(mul, d1, r3) add128(t[4], mul) mul64x64_128(mul, r2,      r2) add128(t[4], mul)
 #endif
 		/* carry chain, then fold the top carry back into limb 0 times 19 */
 		                     r0 = lo128(t[0]) & reduce_mask_51; shr128(c, t[0], 51);
 		add128_64(t[1], c)   r1 = lo128(t[1]) & reduce_mask_51; shr128(c, t[1], 51);
 		add128_64(t[2], c)   r2 = lo128(t[2]) & reduce_mask_51; shr128(c, t[2], 51);
 		add128_64(t[3], c)   r3 = lo128(t[3]) & reduce_mask_51; shr128(c, t[3], 51);
 		add128_64(t[4], c)   r4 = lo128(t[4]) & reduce_mask_51; shr128(c, t[4], 51);
 		r0 +=   c * 19; c = r0 >> 51; r0 = r0 & reduce_mask_51;
 		r1 +=   c;
 	} while(--count);
 	out[0] = r0;
 	out[1] = r1;
 	out[2] = r2;
 	out[3] = r3;
 	out[4] = r4;
 }
 /*
 * out = in * in
 *
 * Single squaring; the body is the same computation as one iteration of
 * curve25519_square_times. All limbs of in are read before out is written,
 * so out may be the same array as in.
 */
 DONNA_INLINE static void
 curve25519_square(bignum25519 out, const bignum25519 in) {
 #if !defined(HAVE_NATIVE_UINT128)
 	uint128_t mul;
 #endif
 	uint128_t t[5];
 	uint64_t r0,r1,r2,r3,r4,c;
 	uint64_t d0,d1,d2,d4,d419;
 	r0 = in[0];
 	r1 = in[1];
 	r2 = in[2];
 	r3 = in[3];
 	r4 = in[4];
 	/* doubled (and, for wrapping terms, 19-scaled) copies used for the cross products */
 	d0 = r0 * 2;
 	d1 = r1 * 2;
 	d2 = r2 * 2 * 19;
 	d419 = r4 * 19;
 	d4 = d419 * 2;
 #if defined(HAVE_NATIVE_UINT128)
 	t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1 + (((uint128_t) d2) * (r3     ));
 	t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2 + (((uint128_t) r3) * (r3 * 19));
 	t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1 + (((uint128_t) d4) * (r3     ));
 	t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2 + (((uint128_t) r4) * (d419   ));
 	t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3 + (((uint128_t) r2) * (r2     ));
 #else
 	mul64x64_128(t[0], r0, r0) mul64x64_128(mul, d4, r1) add128(t[0], mul) mul64x64_128(mul, d2,      r3) add128(t[0], mul)
 	mul64x64_128(t[1], d0, r1) mul64x64_128(mul, d4, r2) add128(t[1], mul) mul64x64_128(mul, r3, r3 * 19) add128(t[1], mul)
 	mul64x64_128(t[2], d0, r2) mul64x64_128(mul, r1, r1) add128(t[2], mul) mul64x64_128(mul, d4,      r3) add128(t[2], mul)
 	mul64x64_128(t[3], d0, r3) mul64x64_128(mul, d1, r2) add128(t[3], mul) mul64x64_128(mul, r4,    d419) add128(t[3], mul)
 	mul64x64_128(t[4], d0, r4) mul64x64_128(mul, d1, r3) add128(t[4], mul) mul64x64_128(mul, r2,      r2) add128(t[4], mul)
 #endif
 	/* carry chain: keep 51 bits per limb and push the rest into the next column */
 	                     r0 = lo128(t[0]) & reduce_mask_51; shr128(c, t[0], 51);
 	add128_64(t[1], c)   r1 = lo128(t[1]) & reduce_mask_51; shr128(c, t[1], 51);
 	add128_64(t[2], c)   r2 = lo128(t[2]) & reduce_mask_51; shr128(c, t[2], 51);
 	add128_64(t[3], c)   r3 = lo128(t[3]) & reduce_mask_51; shr128(c, t[3], 51);
 	add128_64(t[4], c)   r4 = lo128(t[4]) & reduce_mask_51; shr128(c, t[4], 51);
 	/* fold the top carry back into limb 0 times 19; one more carry goes into limb 1 unmasked */
 	r0 +=   c * 19; c = r0 >> 51; r0 = r0 & reduce_mask_51;
 	r1 +=   c;
 	out[0] = r0;
 	out[1] = r1;
 	out[2] = r2;
 	out[3] = r3;
 	out[4] = r4;
 }
 /*
 * Take a little-endian, 32-byte number and expand it into polynomial form.
 *
 * in  - 32 bytes, least significant byte first; bit 255 is ignored
 * out - receives five limbs of 51 bits each
 *
 * The four 64-bit words are always assembled from individual bytes. Reading
 * them through a casted uint64_t pointer would discard const, require an
 * alignment that `in` does not guarantee, and violate the strict-aliasing
 * rule. The byte-wise form yields the same words on every byte order.
 */
 DONNA_INLINE static void
 curve25519_expand(bignum25519 out, const unsigned char *in) {
 	uint64_t x0,x1,x2,x3;
 	/* little-endian 64-bit load starting at byte offset s */
 	#define F(s)                         \
 		((((uint64_t)in[s + 0])      ) | \
 		 (((uint64_t)in[s + 1]) <<  8) | \
 		 (((uint64_t)in[s + 2]) << 16) | \
 		 (((uint64_t)in[s + 3]) << 24) | \
 		 (((uint64_t)in[s + 4]) << 32) | \
 		 (((uint64_t)in[s + 5]) << 40) | \
 		 (((uint64_t)in[s + 6]) << 48) | \
 		 (((uint64_t)in[s + 7]) << 56))
 	x0 = F(0);
 	x1 = F(8);
 	x2 = F(16);
 	x3 = F(24);
 	/* the helper macro must not outlive this function */
 	#undef F
 	/* peel off 51 bits at a time, refilling each word from the next one */
 	out[0] = x0 & reduce_mask_51; x0 = (x0 >> 51) | (x1 << 13);
 	out[1] = x0 & reduce_mask_51; x1 = (x1 >> 38) | (x2 << 26);
 	out[2] = x1 & reduce_mask_51; x2 = (x2 >> 25) | (x3 << 39);
 	out[3] = x2 & reduce_mask_51; x3 = (x3 >> 12);
 	out[4] = x3 & reduce_mask_51; /* ignore the top bit */
 }
 /* Take a polynomial form number and contract it into a
 * little-endian, 32-byte array
 *
 * input - five limbs; copied to a local, the caller's value is not modified
 * out   - receives 32 bytes, least significant byte first; the local pointer
 *         is advanced while writing
 *
 * The value is carried, then 19 is added and later 2^255 - 19 more, so that
 * the final masked result is the canonical representative below 2^255 - 19
 * (see the range comments in the body).
 */
 DONNA_INLINE static void
 curve25519_contract(unsigned char *out, const bignum25519 input) {
 	uint64_t t[5];
 	uint64_t f, i;
 	t[0] = input[0];
 	t[1] = input[1];
 	t[2] = input[2];
 	t[3] = input[3];
 	t[4] = input[4];
 	/* one carry sweep from limb 0 up to limb 4 */
 	#define curve25519_contract_carry() \
 		t[1] += t[0] >> 51; t[0] &= reduce_mask_51; \
 		t[2] += t[1] >> 51; t[1] &= reduce_mask_51; \
 		t[3] += t[2] >> 51; t[2] &= reduce_mask_51; \
 		t[4] += t[3] >> 51; t[3] &= reduce_mask_51;
 	/* sweep, then fold the overflow of limb 4 back into limb 0 times 19 */
 	#define curve25519_contract_carry_full() curve25519_contract_carry() \
 		t[0] += 19 * (t[4] >> 51); t[4] &= reduce_mask_51;
 	/* sweep, then drop the overflow of limb 4 (discards the 2^255 offset) */
 	#define curve25519_contract_carry_final() curve25519_contract_carry() \
 		t[4] &= reduce_mask_51;
 	curve25519_contract_carry_full()
 	curve25519_contract_carry_full()
 	/* now t is between 0 and 2^255-1, properly carried. */
 	/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
 	t[0] += 19;
 	curve25519_contract_carry_full()
 	/* now between 19 and 2^255-1 in both cases, and offset by 19. */
 	t[0] += 0x8000000000000 - 19;
 	t[1] += 0x8000000000000 - 1;
 	t[2] += 0x8000000000000 - 1;
 	t[3] += 0x8000000000000 - 1;
 	t[4] += 0x8000000000000 - 1;
 	/* now between 2^255 and 2^256-20, and offset by 2^255. */
 	curve25519_contract_carry_final()
 	/*
 	 * write51(n) emits output bytes 8n .. 8n+7: the remaining bits of limb n
 	 * (shifted down by 13*n) joined with the low bits of limb n+1, written
 	 * least significant byte first.
 	 */
 	#define write51full(n,shift) \
 		f = ((t[n] >> shift) | (t[n+1] << (51 - shift))); \
 		for (i = 0; i < 8; i++, f >>= 8) *out++ = (unsigned char)f;
 	#define write51(n) write51full(n,13*n)
 	write51(0)
 	write51(1)
 	write51(2)
 	write51(3)
 	#undef curve25519_contract_carry
 	#undef curve25519_contract_carry_full
 	#undef curve25519_contract_carry_final
 	#undef write51full
 	#undef write51
 }
 /*
 * Exchange the contents of x and qpx when iswap is 1, leave both untouched
 * when iswap is 0. No branch depends on iswap: the flag is negated into a
 * mask (all ones for 1, zero for 0) that gates an XOR swap of every limb.
 */
 DONNA_INLINE static void
 curve25519_swap_conditional(bignum25519 x, bignum25519 qpx, uint64_t iswap) {
 	const uint64_t mask = (uint64_t)(-(int64_t)iswap);
 	uint64_t diff;
 	int limb;
 	for (limb = 0; limb < 5; limb++) {
 		diff = mask & (x[limb] ^ qpx[limb]);
 		x[limb] ^= diff;
 		qpx[limb] ^= diff;
 	}
 }
--- a/curve25519-donna/curve25519-donna-common.h
+++ b/curve25519-donna/curve25519-donna-common.h
@ -1,43 +0,0 @@
 /*
 * Exponentiation-chain helper. The expressions in the comments are exponents
 * of the original base, e.g. "2^5 - 2^0" stands for base^(2^5 - 2^0).
 *
 * In:  b =   2^5 - 2^0
 * Out: b = 2^250 - 2^0
 *
 * b is overwritten in place; t0 and c are local scratch values.
 * Each step either squares n times (shifting the exponent up by n bits) or
 * multiplies two earlier results (adding their exponents).
 */
 static void
 curve25519_pow_two5mtwo0_two250mtwo0(bignum25519 b) {
 	bignum25519 ALIGN(16) t0,c;
 	/* 2^5  - 2^0 */ /* b */
 	/* 2^10 - 2^5 */ curve25519_square_times(t0, b, 5);
 	/* 2^10 - 2^0 */ curve25519_mul(b, t0, b);
 	/* 2^20 - 2^10 */ curve25519_square_times(t0, b, 10);
 	/* 2^20 - 2^0 */ curve25519_mul(c, t0, b);
 	/* 2^40 - 2^20 */ curve25519_square_times(t0, c, 20);
 	/* 2^40 - 2^0 */ curve25519_mul(t0, t0, c);
 	/* 2^50 - 2^10 */ curve25519_square_times(t0, t0, 10);
 	/* 2^50 - 2^0 */ curve25519_mul(b, t0, b);
 	/* 2^100 - 2^50 */ curve25519_square_times(t0, b, 50);
 	/* 2^100 - 2^0 */ curve25519_mul(c, t0, b);
 	/* 2^200 - 2^100 */ curve25519_square_times(t0, c, 100);
 	/* 2^200 - 2^0 */ curve25519_mul(t0, t0, c);
 	/* 2^250 - 2^50 */ curve25519_square_times(t0, t0, 50);
 	/* 2^250 - 2^0 */ curve25519_mul(b, t0, b);
 }
 /*
 * out = z^(p - 2) = z^(2^255 - 21)
 *
 * The leading comment on each line is the exponent of z held by the value
 * that line produces; a, t0 and b are local temporaries.
 */
 static void
 curve25519_recip(bignum25519 out, const bignum25519 z) {
 	bignum25519 ALIGN(16) a,t0,b;
 	/* 2 */ curve25519_square(a, z); /* a = 2 */
 	/* 8 */ curve25519_square_times(t0, a, 2);
 	/* 9 */ curve25519_mul(b, t0, z); /* b = 9 */
 	/* 11 */ curve25519_mul(a, b, a); /* a = 11 */
 	/* 22 */ curve25519_square(t0, a);
 	/* 2^5 - 2^0 = 31 */ curve25519_mul(b, t0, b);
 	/* 2^250 - 2^0 */ curve25519_pow_two5mtwo0_two250mtwo0(b);
 	/* 2^255 - 2^5 */ curve25519_square_times(b, b, 5);
 	/* 2^255 - 21 */ curve25519_mul(out, b, a); /* (2^255 - 2^5) + 11 */
 }
--- a/curve25519-donna/curve25519-donna-portable-identify.h
+++ b/curve25519-donna/curve25519-donna-portable-identify.h
@ -1,103 +0,0 @@
 /*
 * os
 *
 * Defines exactly one of OS_WINDOWS, OS_SOLARIS or OS_NIX. Under OS_NIX a
 * finer flag may be added: OS_LINUX, or OS_BSD optionally refined to OS_OSX,
 * OS_MAC or OS_OPENBSD.
 */
 #if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
 	#define OS_WINDOWS
 #elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
 	#define OS_SOLARIS
 #else
 	#include <sys/param.h> /* need this to define BSD */
 	#define OS_NIX
 	#if defined(__linux__)
 		#define OS_LINUX
 	#elif defined(BSD)
 		#define OS_BSD
 		/* logical &&: both macros must be defined (was a bitwise & between defined() terms) */
 		#if defined(MACOS_X) || (defined(__APPLE__) && defined(__MACH__))
 			#define OS_OSX
 		#elif defined(macintosh) || defined(Macintosh)
 			#define OS_MAC
 		#elif defined(__OpenBSD__)
 			#define OS_OPENBSD
 		#endif
 	#endif
 #endif
 /*
 * compiler
 *
 * Each check is an independent #if, so a compiler that predefines several of
 * the tested macros ends up with several COMPILER_* flags at once.
 * COMPILER_GCC and COMPILER_CLANG expand to a comparable version number:
 * major * 10000 + minor * 100 + patchlevel.
 */
 #if defined(_MSC_VER)
 	#define COMPILER_MSVC
 #endif
 #if defined(__ICC)
 	#define COMPILER_INTEL
 #endif
 #if defined(__GNUC__)
 	/* __GNUC_PATCHLEVEL__ is only used for major version 3 and later */
 	#if (__GNUC__ >= 3)
 		#define COMPILER_GCC ((__GNUC__ * 10000) + (__GNUC_MINOR__ * 100) + (__GNUC_PATCHLEVEL__))
 	#else
 		#define COMPILER_GCC ((__GNUC__ * 10000) + (__GNUC_MINOR__ * 100)                        )
 	#endif
 #endif
 #if defined(__PATHCC__)
 	#define COMPILER_PATHCC
 #endif
 #if defined(__clang__)
 	#define COMPILER_CLANG ((__clang_major__ * 10000) + (__clang_minor__ * 100) + (__clang_patchlevel__))
 #endif
 /*
 * cpu
 *
 * x86 family: CPU_X86_64 is a bare flag, while 32-bit x86 defines CPU_X86 with
 * a value of 500, 400 or 300 chosen by the i586+/i486/i386 checks below.
 * IA-64 shares the same #if chain, so it is mutually exclusive with x86.
 */
 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
 	#define CPU_X86_64
 #elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
 	#define CPU_X86 500
 #elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
 	#define CPU_X86 400
 #elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
 	#define CPU_X86 300
 #elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
 	#define CPU_IA64
 #endif
 /* the remaining architectures are probed independently of the chain above */
 #if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
 	#define CPU_SPARC
 	#if defined(__sparcv9)
 		#define CPU_SPARC64
 	#endif
 #endif
 #if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
 	#define CPU_PPC
 	/* exactly one of CPU_POWER7 / CPU_PPC64 / CPU_PPC32 accompanies CPU_PPC */
 	#if defined(_ARCH_PWR7)
 		#define CPU_POWER7
 	#elif defined(__64BIT__)
 		#define CPU_PPC64
 	#else
 		#define CPU_PPC32
 	#endif
 #endif
 #if defined(__hppa__) || defined(__hppa)
 	#define CPU_HPPA
 #endif
 #if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
 	#define CPU_ALPHA
 #endif
 /*
 * 64 bit cpu
 *
 * Set from the 64-bit CPU flags above or from the generic data-model macros
 * (__64BIT__, __LP64__, _LP64, _MIPS_SZLONG == 64).
 */
 #if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
 	#define CPU_64BITS
 #endif
 /*
 * Fixed-width integer types: hand-written typedefs when COMPILER_MSVC is set,
 * <stdint.h> everywhere else.
 * NOTE(review): presumably a workaround for MSVC releases that shipped without
 * <stdint.h>; on toolchains that do provide it these typedefs may clash - confirm.
 */
 #if defined(COMPILER_MSVC)
 	typedef signed char int8_t;
 	typedef unsigned char uint8_t;
 	typedef signed short int16_t;
 	typedef unsigned short uint16_t;
 	typedef signed int int32_t;
 	typedef unsigned int uint32_t;
 	typedef signed __int64 int64_t;
 	typedef unsigned __int64 uint64_t;
 #else
 	#include <stdint.h>
 #endif
--- a/curve25519-donna/curve25519-donna-portable.h
+++ b/curve25519-donna/curve25519-donna-portable.h
@ -1,94 +0,0 @@
 #include "curve25519-donna-portable-identify.h"
 /* 32x32 -> 64 bit multiply: a is widened to uint64_t before the product is formed */
 #define mul32x32_64(a,b) (((uint64_t)(a))*(b))
 /*
 * platform
 *
 * Provides DONNA_INLINE, DONNA_NOINLINE, ALIGN(x), ROTL32 and ROTR32 for the
 * current compiler. The MSVC branch additionally redefines `inline` and, in
 * non-debug builds, maps mul32x32_64 onto the __emulu intrinsic.
 */
 #if defined(COMPILER_MSVC)
 	#include <intrin.h>
 	#if !defined(_DEBUG)
 		#undef mul32x32_64
 		#define mul32x32_64(a,b) __emulu(a,b)
 	#endif
 	#undef inline
 	#define inline __forceinline
 	#define DONNA_INLINE __forceinline
 	#define DONNA_NOINLINE __declspec(noinline)
 	#undef ALIGN
 	#define ALIGN(x) __declspec(align(x))
 	#define ROTL32(a,b) _rotl(a,b)
 	#define ROTR32(a,b) _rotr(a,b)
 #else
 	#include <sys/param.h>
 	#define DONNA_INLINE inline __attribute__((always_inline))
 	#define DONNA_NOINLINE __attribute__((noinline))
 	#undef ALIGN
 	#define ALIGN(x) __attribute__((aligned(x)))
 	/*
 	 * 32-bit rotates; `a` must be a 32-bit unsigned value and 0 < b < 32.
 	 * b is parenthesized in (32 - (b)) so that expression arguments such as
 	 * `8 + 4` rotate by the intended amount.
 	 */
 	#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
 	#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))
 #endif
 /*
 * uint128_t
 *
 * Only evaluated on CPU_64BITS builds without ED25519_FORCE_32BIT. Picks one
 * of three 128-bit back ends and defines HAVE_UINT128 plus a common macro set
 * (mul64x64_128, shr128_pair, shl128_pair, shr128, shl128, add128, add128_64,
 * lo128, hi128):
 *   - a native 128-bit integer type (HAVE_NATIVE_UINT128),
 *   - a {lo, hi} struct driven by MSVC intrinsics,
 *   - a {lo, hi} struct driven by x86-64 inline assembly for GCC.
 * Compilation stops with #error when none of them applies.
 *
 * Most of these macros expand to complete statements, several with their own
 * trailing ';', so they are not usable as expressions.
 *
 * NOTE(review): the guard tests ED25519_FORCE_32BIT, whereas
 * curve25519-donna.h selects its back end with CURVE25519_FORCE_32BIT;
 * confirm the two switches are meant to be independent.
 */
 #if defined(CPU_64BITS) && !defined(ED25519_FORCE_32BIT)
 	#if defined(COMPILER_CLANG) && (COMPILER_CLANG >= 30100)
 		#define HAVE_NATIVE_UINT128
 		typedef unsigned __int128 uint128_t;
 	#elif defined(COMPILER_MSVC)
 		#define HAVE_UINT128
 		/* emulated 128-bit value: two 64-bit halves */
 		typedef struct uint128_t {
 			uint64_t lo, hi;
 		} uint128_t;
 		#define mul64x64_128(out,a,b) out.lo = _umul128(a,b,&out.hi);
 		#define shr128_pair(out,hi,lo,shift) out = __shiftright128(lo, hi, shift);
 		#define shl128_pair(out,hi,lo,shift) out = __shiftleft128(lo, hi, shift);
 		#define shr128(out,in,shift) shr128_pair(out, in.hi, in.lo, shift)
 		#define shl128(out,in,shift) shl128_pair(out, in.hi, in.lo, shift)
 		/* (a.lo < p) is the carry out of the low-half addition */
 		#define add128(a,b) { uint64_t p = a.lo; a.lo += b.lo; a.hi += b.hi + (a.lo < p); }
 		#define add128_64(a,b) { uint64_t p = a.lo; a.lo += b; a.hi += (a.lo < p); }
 		#define lo128(a) (a.lo)
 		#define hi128(a) (a.hi)
 	#elif defined(COMPILER_GCC) && !defined(HAVE_NATIVE_UINT128)
 		#if defined(__SIZEOF_INT128__)
 			#define HAVE_NATIVE_UINT128
 			typedef unsigned __int128 uint128_t;
 		#elif (COMPILER_GCC >= 40400)
 			#define HAVE_NATIVE_UINT128
 			typedef unsigned uint128_t __attribute__((mode(TI)));
 		#elif defined(CPU_X86_64)
 			#define HAVE_UINT128
 			/* emulated 128-bit value: two 64-bit halves, operated on with inline asm */
 			typedef struct uint128_t {
 				uint64_t lo, hi;
 			} uint128_t;
 			#define mul64x64_128(out,a,b) __asm__ ("mulq %3" : "=a" (out.lo), "=d" (out.hi) : "a" (a), "rm" (b));
 			#define shr128_pair(out,hi,lo,shift) __asm__ ("shrdq %2,%1,%0" : "+r" (lo) : "r" (hi), "J" (shift)); out = lo;
 			#define shl128_pair(out,hi,lo,shift) __asm__ ("shldq %2,%1,%0" : "+r" (hi) : "r" (lo), "J" (shift)); out = hi;
 			#define shr128(out,in,shift) shr128_pair(out,in.hi, in.lo, shift)
 			#define shl128(out,in,shift) shl128_pair(out,in.hi, in.lo, shift)
 			#define add128(a,b) __asm__ ("addq %4,%2; adcq %5,%3" : "=r" (a.hi), "=r" (a.lo) : "1" (a.lo), "0" (a.hi), "rm" (b.lo), "rm" (b.hi) : "cc");
 			#define add128_64(a,b) __asm__ ("addq %4,%2; adcq $0,%3" : "=r" (a.hi), "=r" (a.lo) : "1" (a.lo), "0" (a.hi), "rm" (b) : "cc");
 			#define lo128(a) (a.lo)
 			#define hi128(a) (a.hi)
 		#endif
 	#endif
 	/* native back end: the macro set is plain arithmetic on the 128-bit type */
 	#if defined(HAVE_NATIVE_UINT128)
 		#define HAVE_UINT128
 		#define mul64x64_128(out,a,b) out = (uint128_t)a * b;
 		#define shr128_pair(out,hi,lo,shift) out = (uint64_t)((((uint128_t)hi << 64) | lo) >> (shift));
 		#define shl128_pair(out,hi,lo,shift) out = (uint64_t)(((((uint128_t)hi << 64) | lo) << (shift)) >> 64);
 		#define shr128(out,in,shift) out = (uint64_t)(in >> (shift));
 		#define shl128(out,in,shift) out = (uint64_t)((in << shift) >> 64);
 		#define add128(a,b) a += b;
 		#define add128_64(a,b) a += (uint64_t)b;
 		#define lo128(a) ((uint64_t)a)
 		#define hi128(a) ((uint64_t)(a >> 64))
 	#endif
 	#if !defined(HAVE_UINT128)
 		#error Need a uint128_t implementation!
 	#endif
 #endif
 #include <stdlib.h>
 #include <string.h>
--- a/curve25519-donna/curve25519-donna-scalarmult-sse2.h
+++ b/curve25519-donna/curve25519-donna-scalarmult-sse2.h
@ -1,65 +0,0 @@
 /* Calculates nQ where Q is the x-coordinate of a point on the curve
 *
 *   mypublic: the packed little endian x coordinate of the resulting curve point
 *   n: a little endian, 32-byte number
 *   basepoint: a packed little endian point of the curve
 *
 * Only bits 254..3 of n are read. The final three ladder steps run the
 * doubling half alone, i.e. bits 2..0 are treated as zero, and bit 255 is
 * never inspected.
 * NOTE(review): presumably callers pass an already-clamped scalar - confirm.
 */
 static void
 curve25519_scalarmult_donna(curve25519_key mypublic, const curve25519_key n, const curve25519_key basepoint) {
 	bignum25519 ALIGN(16) nqx = {1}, nqpqz = {1}, nqz = {0}, nqpqx, zmone;
 	packed32bignum25519 qx, qz, pqz, pqx;
 	packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
 	bignum25519mulprecomp preq;
 	/* bit is always assigned by the first loop; the initializer only makes that explicit */
 	size_t bit = 0, lastbit, i;
 	curve25519_expand(nqpqx, basepoint);
 	curve25519_mul_precompute(&preq, nqpqx);
 	/* do bits 254..3 */
 	for (i = 254, lastbit = 0; i >= 3; i--) {
 		bit = (n[i/8] >> (i & 7)) & 1;
 		/* swap only when the bit differs from the previous one, so the pair ordering tracks the current bit */
 		curve25519_swap_conditional(nqx, nqpqx, bit ^ lastbit);
 		curve25519_swap_conditional(nqz, nqpqz, bit ^ lastbit);
 		lastbit = bit;
 		curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */
 		curve25519_tangle32(qz, nqz, nqpqz); /* qz = [nqz,nqpqz] */
 		curve25519_add_packed32(pqx, qx, qz); /* pqx = [nqx+nqz,nqpqx+nqpqz] */
 		curve25519_sub_packed32(pqz, qx, qz); /* pqz = [nqx-nqz,nqpqx-nqpqz] */
 		curve25519_make_nqpq(primex, primez, pqx, pqz); /* primex = [nqx+nqz,nqpqx+nqpqz], primez = [nqpqx-nqpqz,nqx-nqz] */
 		curve25519_mul_packed64(prime, primex, primez); /* prime = [nqx+nqz,nqpqx+nqpqz] * [nqpqx-nqpqz,nqx-nqz] */
 		curve25519_addsub_packed64(prime); /* prime = [prime.x+prime.z,prime.x-prime.z] */
 		curve25519_square_packed64(nqpq, prime); /* nqpq = prime^2 */
 		curve25519_untangle64(nqpqx, nqpqz, nqpq);
 		curve25519_mul_precomputed(nqpqz, nqpqz, &preq); /* nqpqz = nqpqz * q */
 		/* (((sq.x-sq.z)*121665)+sq.x) * (sq.x-sq.z) is equivalent to (sq.x*121666-sq.z*121665) * (sq.x-sq.z) */
 		curve25519_make_nq(nq, pqx, pqz); /* nq = [nqx+nqz,nqx-nqz] */
 		curve25519_square_packed64(sq, nq); /* sq = nq^2 */
 		curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
 		curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
 		curve25519_untangle64(nqx, nqz, nq);
 	}
 	/* it's possible to get rid of this swap with the swap in the above loop 
 	   at the bottom instead of the top, but compilers seem to optimize better this way */
 	curve25519_swap_conditional(nqx, nqpqx, bit);
 	curve25519_swap_conditional(nqz, nqpqz, bit);
 	/* do bits 2..0 */
 	for (i = 0; i < 3; i++) {
 		curve25519_compute_nq(nq, nqx, nqz);
 		curve25519_square_packed64(sq, nq); /* sq = nq^2 */
 		curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
 		curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
 		curve25519_untangle64(nqx, nqz, nq);
 	}
 	/* convert the projective result to affine x = nqx / nqz and pack it */
 	curve25519_recip(zmone, nqz);
 	curve25519_mul(nqz, nqx, zmone);
 	curve25519_contract(mypublic, nqz);
 }
--- a/curve25519-donna/curve25519-donna-sse2.h
+++ b/curve25519-donna/curve25519-donna-sse2.h
--- a/curve25519-donna/curve25519-donna.h
+++ b/curve25519-donna/curve25519-donna.h
@ -1,32 +0,0 @@
 #include "curve25519.h"
 #include "curve25519-donna-portable.h"
 /*
 * Back-end selection. With CURVE25519_SSE2 set nothing is defined here;
 * otherwise CURVE25519_64BIT is chosen when a uint128 implementation is
 * available (HAVE_UINT128) and CURVE25519_FORCE_32BIT is not set, and
 * CURVE25519_32BIT in every other case.
 */
 #if defined(CURVE25519_SSE2)
 #else
 	#if defined(HAVE_UINT128) && !defined(CURVE25519_FORCE_32BIT)
 		#define CURVE25519_64BIT
 	#else
 		#define CURVE25519_32BIT
 	#endif
 #endif
 /* NOTE(review): this conditional is empty as shown - presumably leftover scaffolding; confirm */
 #if !defined(CURVE25519_NO_INLINE_ASM)
 #endif
 /* field arithmetic for the selected back end */
 #if defined(CURVE25519_SSE2)
 	#include "curve25519-donna-sse2.h"
 #elif defined(CURVE25519_64BIT)
 	#include "curve25519-donna-64bit.h"
 #else
 	#include "curve25519-donna-32bit.h"
 #endif
 /* helpers shared by all back ends, included after the field arithmetic they call */
 #include "curve25519-donna-common.h"
 /* scalar multiplication: a dedicated SSE2 ladder, or the generic one */
 #if defined(CURVE25519_SSE2)
 	#include "curve25519-donna-scalarmult-sse2.h"
 #else
 	#include "curve25519-donna-scalarmult-base.h"
 #endif
--- a/ed25519-donna/curve25519-donna-32bit.h
+++ b/ed25519-donna/curve25519-donna-32bit.h
@ -6,12 +6,10 @@
 */
 /* field element: ten 32-bit limbs; even-indexed limbs are masked to 26 bits and odd-indexed limbs to 25 bits */
 typedef uint32_t bignum25519[10];
 /* twelve-word variant - NOTE(review): presumably padding for 16-byte aligned storage; confirm */
 typedef uint32_t bignum25519align16[12];
 /* limb masks: the low 25 and the low 26 bits respectively */
 static const uint32_t reduce_mask_25 = (1 << 25) - 1;
 static const uint32_t reduce_mask_26 = (1 << 26) - 1;
 /* out = in */
 DONNA_INLINE static void
 curve25519_copy(bignum25519 out, const bignum25519 in) {
@ -98,6 +96,24 @@ curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
 	out[9] = twoP13579 + a[9] - b[9]    ;
 }
 /*
 * out = in * scalar
 *
 * Every limb is multiplied by scalar in 64 bits, the previous limb's carry is
 * added, and the result is split into a masked limb and a new carry. Limbs
 * with an even index keep 26 bits, limbs with an odd index keep 25 bits.
 * The carry leaving limb 9 is folded back into limb 0 with a factor of 19.
 * in[k] is read before out[k] is written, so out may be the same array as in.
 */
 DONNA_INLINE static void
 curve25519_scalar_product(bignum25519 out, const bignum25519 in, const uint32_t scalar) {
 	uint64_t wide;
 	uint32_t carry = 0;
 	int limb;
 	for (limb = 0; limb < 10; limb++) {
 		wide = mul32x32_64(in[limb], scalar) + carry;
 		if (limb & 1) {
 			out[limb] = (uint32_t)wide & reduce_mask_25;
 			carry = (uint32_t)(wide >> 25);
 		} else {
 			out[limb] = (uint32_t)wide & reduce_mask_26;
 			carry = (uint32_t)(wide >> 26);
 		}
 	}
 	out[0] += carry * 19;
 }
 /* out = a - b, where a is the result of a basic op (add,sub) */
 DONNA_INLINE static void
 curve25519_sub_after_basic(bignum25519 out, const bignum25519 a, const bignum25519 b) {
@ -150,7 +166,7 @@ curve25519_neg(bignum25519 out, const bignum25519 a) {
 /* out = a * b */
 #define curve25519_mul_noinline curve25519_mul
-static void
+DONNA_INLINE static void
 curve25519_mul(bignum25519 out, const bignum25519 a, const bignum25519 b) {
 	uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
 	uint32_t s0,s1,s2,s3,s4,s5,s6,s7,s8,s9;
@ -247,8 +263,8 @@ curve25519_mul(bignum25519 out, const bignum25519 a, const bignum25519 b) {
 	out[9] = r9;
 }
-/* out = in*in */
+/* out = in * in */
-static void
+DONNA_INLINE static void
 curve25519_square(bignum25519 out, const bignum25519 in) {
 	uint32_t r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
 	uint32_t d6,d7,d8,d9;
@ -321,7 +337,6 @@ curve25519_square(bignum25519 out, const bignum25519 in) {
 	out[9] = r9;
 }
 /* out = in ^ (2 ^ count) */
 static void
 curve25519_square_times(bignum25519 out, const bignum25519 in, int count) {
@ -430,16 +445,16 @@ curve25519_expand(bignum25519 out, const unsigned char in[32]) {
 		#undef F
 	}
-	out[0] = (                        x0       ) & 0x3ffffff;
+	out[0] = (                        x0       ) & reduce_mask_26;
-	out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
+	out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & reduce_mask_25;
-	out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
+	out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & reduce_mask_26;
-	out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
+	out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & reduce_mask_25;
-	out[4] = ((                       x3) >>  6) & 0x3ffffff;
+	out[4] = ((                       x3) >>  6) & reduce_mask_26;
-	out[5] = (                        x4       ) & 0x1ffffff;
+	out[5] = (                        x4       ) & reduce_mask_25;
-	out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
+	out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & reduce_mask_26;
-	out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
+	out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & reduce_mask_25;
-	out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
+	out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & reduce_mask_26;
-	out[9] = ((                       x7) >>  6) & 0x1ffffff;
+	out[9] = ((                       x7) >>  6) & reduce_mask_25; /* ignore the top bit */
 }
 /* Take a fully reduced polynomial form number and contract it into a
@ -526,7 +541,6 @@ curve25519_contract(unsigned char out[32], const bignum25519 in) {
 	#undef F
 }
 /* out = (flag) ? in : out */
 DONNA_INLINE static void
 curve25519_move_conditional_bytes(uint8_t out[96], const uint8_t in[96], uint32_t flag) {
--- a/curve25519-donna/curve25519-donna-scalarmult-base.h
+++ b/curve25519-donna/curve25519-donna-scalarmult-base.h
--- a/ed25519-donna/curve25519-donna.h
+++ b/ed25519-donna/curve25519-donna.h
@ -0,0 +1,9 @@
 #include "curve25519.h"
 #include "ed25519-donna-portable.h"
 #include "curve25519-donna-32bit.h"
 #include "curve25519-donna-helpers.h"
 #include "curve25519-donna-scalarmult-base.h"
--- a/curve25519-donna/curve25519.c
+++ b/curve25519-donna/curve25519.c
--- a/curve25519-donna/curve25519.h
+++ b/curve25519-donna/curve25519.h