1 files changed, 477 insertions, 0 deletions
diff --git a/nacl/crypto_scalarmult/curve25519/donna_c64/smult.c b/nacl/crypto_scalarmult/curve25519/donna_c64/smult.c
new file mode 100644
index 00000000..6d26956b
--- /dev/null
+++ b/nacl/crypto_scalarmult/curve25519/donna_c64/smult.c
@@ -0,0 +1,477 @@
+/* Copyright 2008, Google Inc.
+ * All rights reserved.
+ *
+ * Code released into the public domain.
+ *
+ * curve25519-donna: Curve25519 elliptic curve, public key function
+ *
+ * http://code.google.com/p/curve25519-donna/
+ *
+ * Adam Langley <agl@imperialviolet.org>
+ *
+ * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
+ *
+ * More information about curve25519 can be found here
+ *   http://cr.yp.to/ecdh.html
+ *
+ * djb's sample implementation of curve25519 is written in a special assembly
+ * language called qhasm and uses the floating point registers.
+ *
+ * This is, almost, a clean room reimplementation from the curve25519 paper. It
+ * uses many of the tricks described therein. Only the crecip function is taken
+ * from the sample implementation.
+ */
+#include <string.h>
+#include <stdint.h>
+#include "crypto_scalarmult.h"
+typedef uint8_t u8;
+typedef uint64_t felem;
+// This is a special gcc mode for 128-bit integers. It's implemented on 64-bit
+// platforms only as far as I know.
+typedef unsigned uint128_t __attribute__((mode(TI)));
+/* Sum two numbers: output += in */
+static void fsum(felem *output, const felem *in) {
+  unsigned i;
+  for (i = 0; i < 5; ++i) output[i] += in[i];
+}
+/* Find the difference of two numbers: output = in - output
+ * (note the order of the arguments!)
+ */
+static void fdifference_backwards(felem *ioutput, const felem *iin) {
+  static const int64_t twotothe51 = (1l << 51);
+  const int64_t *in = (const int64_t *) iin;
+  int64_t *out = (int64_t *) ioutput;
+  out[0] = in[0] - out[0];
+  out[1] = in[1] - out[1];
+  out[2] = in[2] - out[2];
+  out[3] = in[3] - out[3];
+  out[4] = in[4] - out[4];
+  // An arithmetic shift right of 63 places turns a positive number to 0 and a
+  // negative number to all 1's. This gives us a bitmask that lets us avoid
+  // side-channel prone branches.
+  int64_t t;
+#define NEGCHAIN(a,b) \
+  t = out[a] >> 63; \
+  out[a] += twotothe51 & t; \
+  out[b] -= 1 & t;
+#define NEGCHAIN19(a,b) \
+  t = out[a] >> 63; \
+  out[a] += twotothe51 & t; \
+  out[b] -= 19 & t;
+  NEGCHAIN(0, 1);
+  NEGCHAIN(1, 2);
+  NEGCHAIN(2, 3);
+  NEGCHAIN(3, 4);
+  NEGCHAIN19(4, 0);
+  NEGCHAIN(0, 1);
+  NEGCHAIN(1, 2);
+  NEGCHAIN(2, 3);
+  NEGCHAIN(3, 4);
+}
+/* Multiply a number by a scalar: output = in * scalar */
+static void fscalar_product(felem *output, const felem *in, const felem scalar) {
+  uint128_t a;
+  a = ((uint128_t) in[0]) * scalar;
+  output[0] = a & 0x7ffffffffffff;
+  a = ((uint128_t) in[1]) * scalar + (a >> 51);
+  output[1] = a & 0x7ffffffffffff;
+  a = ((uint128_t) in[2]) * scalar + (a >> 51);
+  output[2] = a & 0x7ffffffffffff;
+  a = ((uint128_t) in[3]) * scalar + (a >> 51);
+  output[3] = a & 0x7ffffffffffff;
+  a = ((uint128_t) in[4]) * scalar + (a >> 51);
+  output[4] = a & 0x7ffffffffffff;
+  output[0] += (a >> 51) * 19;
+}
+/* Multiply two numbers: output = in2 * in
+ *
+ * output must be distinct to both inputs. The inputs are reduced coefficient
+ * form, the output is not.
+ */
+static void fmul(felem *output, const felem *in2, const felem *in) {
+  uint128_t t[9];
+  t[0] = ((uint128_t) in[0]) * in2[0];
+  t[1] = ((uint128_t) in[0]) * in2[1] +
+         ((uint128_t) in[1]) * in2[0];
+  t[2] = ((uint128_t) in[0]) * in2[2] +
+         ((uint128_t) in[2]) * in2[0] +
+         ((uint128_t) in[1]) * in2[1];
+  t[3] = ((uint128_t) in[0]) * in2[3] +
+         ((uint128_t) in[3]) * in2[0] +
+         ((uint128_t) in[1]) * in2[2] +
+         ((uint128_t) in[2]) * in2[1];
+  t[4] = ((uint128_t) in[0]) * in2[4] +
+         ((uint128_t) in[4]) * in2[0] +
+         ((uint128_t) in[3]) * in2[1] +
+         ((uint128_t) in[1]) * in2[3] +
+         ((uint128_t) in[2]) * in2[2];
+  t[5] = ((uint128_t) in[4]) * in2[1] +
+         ((uint128_t) in[1]) * in2[4] +
+         ((uint128_t) in[2]) * in2[3] +
+         ((uint128_t) in[3]) * in2[2];
+  t[6] = ((uint128_t) in[4]) * in2[2] +
+         ((uint128_t) in[2]) * in2[4] +
+         ((uint128_t) in[3]) * in2[3];
+  t[7] = ((uint128_t) in[3]) * in2[4] +
+         ((uint128_t) in[4]) * in2[3];
+  t[8] = ((uint128_t) in[4]) * in2[4];
+  t[0] += t[5] * 19;
+  t[1] += t[6] * 19;
+  t[2] += t[7] * 19;
+  t[3] += t[8] * 19;
+  t[1] += t[0] >> 51;
+  t[0] &= 0x7ffffffffffff;
+  t[2] += t[1] >> 51;
+  t[1] &= 0x7ffffffffffff;
+  t[3] += t[2] >> 51;
+  t[2] &= 0x7ffffffffffff;
+  t[4] += t[3] >> 51;
+  t[3] &= 0x7ffffffffffff;
+  t[0] += 19 * (t[4] >> 51);
+  t[4] &= 0x7ffffffffffff;
+  t[1] += t[0] >> 51;
+  t[0] &= 0x7ffffffffffff;
+  t[2] += t[1] >> 51;
+  t[1] &= 0x7ffffffffffff;
+  output[0] = t[0];
+  output[1] = t[1];
+  output[2] = t[2];
+  output[3] = t[3];
+  output[4] = t[4];
+}
+static void
+fsquare(felem *output, const felem *in) {
+  uint128_t t[9];
+  t[0] = ((uint128_t) in[0]) * in[0];
+  t[1] = ((uint128_t) in[0]) * in[1] * 2;
+  t[2] = ((uint128_t) in[0]) * in[2] * 2 +
+         ((uint128_t) in[1]) * in[1];
+  t[3] = ((uint128_t) in[0]) * in[3] * 2 +
+         ((uint128_t) in[1]) * in[2] * 2;
+  t[4] = ((uint128_t) in[0]) * in[4] * 2 +
+         ((uint128_t) in[3]) * in[1] * 2 +
+         ((uint128_t) in[2]) * in[2];
+  t[5] = ((uint128_t) in[4]) * in[1] * 2 +
+         ((uint128_t) in[2]) * in[3] * 2;
+  t[6] = ((uint128_t) in[4]) * in[2] * 2 +
+         ((uint128_t) in[3]) * in[3];
+  t[7] = ((uint128_t) in[3]) * in[4] * 2;
+  t[8] = ((uint128_t) in[4]) * in[4];
+  t[0] += t[5] * 19;
+  t[1] += t[6] * 19;
+  t[2] += t[7] * 19;
+  t[3] += t[8] * 19;
+  t[1] += t[0] >> 51;
+  t[0] &= 0x7ffffffffffff;
+  t[2] += t[1] >> 51;
+  t[1] &= 0x7ffffffffffff;
+  t[3] += t[2] >> 51;
+  t[2] &= 0x7ffffffffffff;
+  t[4] += t[3] >> 51;
+  t[3] &= 0x7ffffffffffff;
+  t[0] += 19 * (t[4] >> 51);
+  t[4] &= 0x7ffffffffffff;
+  t[1] += t[0] >> 51;
+  t[0] &= 0x7ffffffffffff;
+  output[0] = t[0];
+  output[1] = t[1];
+  output[2] = t[2];
+  output[3] = t[3];
+  output[4] = t[4];
+}
+/* Take a little-endian, 32-byte number and expand it into polynomial form */
+static void
+fexpand(felem *output, const u8 *in) {
+  output[0] = *((const uint64_t *)(in)) & 0x7ffffffffffff;
+  output[1] = (*((const uint64_t *)(in+6)) >> 3) & 0x7ffffffffffff;
+  output[2] = (*((const uint64_t *)(in+12)) >> 6) & 0x7ffffffffffff;
+  output[3] = (*((const uint64_t *)(in+19)) >> 1) & 0x7ffffffffffff;
+  output[4] = (*((const uint64_t *)(in+25)) >> 4) & 0x7ffffffffffff;
+}
+/* Take a fully reduced polynomial form number and contract it into a
+ * little-endian, 32-byte array
+ */
+static void
+fcontract(u8 *output, const felem *input) {
+  uint128_t t[5];
+  t[0] = input[0];
+  t[1] = input[1];
+  t[2] = input[2];
+  t[3] = input[3];
+  t[4] = input[4];
+  t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
+  t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
+  t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
+  t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
+  t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff;
+  t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
+  t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
+  t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
+  t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
+  t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff;
+  /* now t is between 0 and 2^255-1, properly carried. */
+  /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
+  t[0] += 19;
+  t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
+  t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
+  t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
+  t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
+  t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff;
+  /* now between 19 and 2^255-1 in both cases, and offset by 19. */
+  t[0] += 0x8000000000000 - 19;
+  t[1] += 0x8000000000000 - 1;
+  t[2] += 0x8000000000000 - 1;
+  t[3] += 0x8000000000000 - 1;
+  t[4] += 0x8000000000000 - 1;
+  /* now between 2^255 and 2^256-20, and offset by 2^255. */
+  t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
+  t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
+  t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
+  t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
+  t[4] &= 0x7ffffffffffff;
+  *((uint64_t *)(output)) = t[0] | (t[1] << 51);
+  *((uint64_t *)(output+8)) = (t[1] >> 13) | (t[2] << 38);
+  *((uint64_t *)(output+16)) = (t[2] >> 26) | (t[3] << 25);
+  *((uint64_t *)(output+24)) = (t[3] >> 39) | (t[4] << 12);
+}
+/* Input: Q, Q', Q-Q'
+ * Output: 2Q, Q+Q'
+ *
+ *   x2 z3: long form
+ *   x3 z3: long form
+ *   x z: short form, destroyed
+ *   xprime zprime: short form, destroyed
+ *   qmqp: short form, preserved
+ */
+static void
+fmonty(felem *x2, felem *z2, /* output 2Q */
+       felem *x3, felem *z3, /* output Q + Q' */
+       felem *x, felem *z,   /* input Q */
+       felem *xprime, felem *zprime, /* input Q' */
+       const felem *qmqp /* input Q - Q' */) {
+  felem origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5],
+        zzprime[5], zzzprime[5];
+  memcpy(origx, x, 5 * sizeof(felem));
+  fsum(x, z);
+  fdifference_backwards(z, origx);  // does x - z
+  memcpy(origxprime, xprime, sizeof(felem) * 5);
+  fsum(xprime, zprime);
+  fdifference_backwards(zprime, origxprime);
+  fmul(xxprime, xprime, z);
+  fmul(zzprime, x, zprime);
+  memcpy(origxprime, xxprime, sizeof(felem) * 5);
+  fsum(xxprime, zzprime);
+  fdifference_backwards(zzprime, origxprime);
+  fsquare(x3, xxprime);
+  fsquare(zzzprime, zzprime);
+  fmul(z3, zzzprime, qmqp);
+  fsquare(xx, x);
+  fsquare(zz, z);
+  fmul(x2, xx, zz);
+  fdifference_backwards(zz, xx);  // does zz = xx - zz
+  fscalar_product(zzz, zz, 121665);
+  fsum(zzz, xx);
+  fmul(z2, zz, zzz);
+}
+// -----------------------------------------------------------------------------
+// Maybe swap the contents of two felem arrays (@a and @b), each @len elements
+// long. Perform the swap iff @swap is non-zero.
+//
+// This function performs the swap without leaking any side-channel
+// information.
+// -----------------------------------------------------------------------------
+static void
+swap_conditional(felem *a, felem *b, unsigned len, felem iswap) {
+  unsigned i;
+  const felem swap = -iswap;
+  for (i = 0; i < len; ++i) {
+    const felem x = swap & (a[i] ^ b[i]);
+    a[i] ^= x;
+    b[i] ^= x;
+  }
+}
+/* Calculates nQ where Q is the x-coordinate of a point on the curve
+ *
+ *   resultx/resultz: the x coordinate of the resulting curve point (short form)
+ *   n: a little endian, 32-byte number
+ *   q: a point of the curve (short form)
+ */
+static void
+cmult(felem *resultx, felem *resultz, const u8 *n, const felem *q) {
+  felem a[5] = {0}, b[5] = {1}, c[5] = {1}, d[5] = {0};
+  felem *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
+  felem e[5] = {0}, f[5] = {1}, g[5] = {0}, h[5] = {1};
+  felem *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
+  unsigned i, j;
+  memcpy(nqpqx, q, sizeof(felem) * 5);
+  for (i = 0; i < 32; ++i) {
+    u8 byte = n[31 - i];
+    for (j = 0; j < 8; ++j) {
+      const felem bit = byte >> 7;
+      swap_conditional(nqx, nqpqx, 5, bit);
+      swap_conditional(nqz, nqpqz, 5, bit);
+      fmonty(nqx2, nqz2,
+             nqpqx2, nqpqz2,
+             nqx, nqz,
+             nqpqx, nqpqz,
+             q);
+      swap_conditional(nqx2, nqpqx2, 5, bit);
+      swap_conditional(nqz2, nqpqz2, 5, bit);
+      t = nqx;
+      nqx = nqx2;
+      nqx2 = t;
+      t = nqz;
+      nqz = nqz2;
+      nqz2 = t;
+      t = nqpqx;
+      nqpqx = nqpqx2;
+      nqpqx2 = t;
+      t = nqpqz;
+      nqpqz = nqpqz2;
+      nqpqz2 = t;
+      byte <<= 1;
+    }
+  }
+  memcpy(resultx, nqx, sizeof(felem) * 5);
+  memcpy(resultz, nqz, sizeof(felem) * 5);
+}
+// -----------------------------------------------------------------------------
+// Shamelessly copied from djb's code
+// -----------------------------------------------------------------------------
+static void
+crecip(felem *out, const felem *z) {
+  felem z2[5];
+  felem z9[5];
+  felem z11[5];
+  felem z2_5_0[5];
+  felem z2_10_0[5];
+  felem z2_20_0[5];
+  felem z2_50_0[5];
+  felem z2_100_0[5];
+  felem t0[5];
+  felem t1[5];
+  int i;
+  /* 2 */ fsquare(z2,z);
+  /* 4 */ fsquare(t1,z2);
+  /* 8 */ fsquare(t0,t1);
+  /* 9 */ fmul(z9,t0,z);
+  /* 11 */ fmul(z11,z9,z2);
+  /* 22 */ fsquare(t0,z11);
+  /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
+  /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
+  /* 2^7 - 2^2 */ fsquare(t1,t0);
+  /* 2^8 - 2^3 */ fsquare(t0,t1);
+  /* 2^9 - 2^4 */ fsquare(t1,t0);
+  /* 2^10 - 2^5 */ fsquare(t0,t1);
+  /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
+  /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
+  /* 2^12 - 2^2 */ fsquare(t1,t0);
+  /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+  /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
+  /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
+  /* 2^22 - 2^2 */ fsquare(t1,t0);
+  /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+  /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
+  /* 2^41 - 2^1 */ fsquare(t1,t0);
+  /* 2^42 - 2^2 */ fsquare(t0,t1);
+  /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
+  /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
+  /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
+  /* 2^52 - 2^2 */ fsquare(t1,t0);
+  /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+  /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
+  /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
+  /* 2^102 - 2^2 */ fsquare(t0,t1);
+  /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
+  /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
+  /* 2^201 - 2^1 */ fsquare(t0,t1);
+  /* 2^202 - 2^2 */ fsquare(t1,t0);
+  /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+  /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
+  /* 2^251 - 2^1 */ fsquare(t1,t0);
+  /* 2^252 - 2^2 */ fsquare(t0,t1);
+  /* 2^253 - 2^3 */ fsquare(t1,t0);
+  /* 2^254 - 2^4 */ fsquare(t0,t1);
+  /* 2^255 - 2^5 */ fsquare(t1,t0);
+  /* 2^255 - 21 */ fmul(out,t1,z11);
+}
+int
+crypto_scalarmult(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
+  felem bp[5], x[5], z[5], zmone[5];
+  unsigned char e[32];
+  int i;
+  for (i = 0;i < 32;++i) e[i] = secret[i];
+  e[0] &= 248;
+  e[31] &= 127;
+  e[31] |= 64;
+  fexpand(bp, basepoint);
+  cmult(x, z, e, bp);
+  crecip(zmone, z);
+  fmul(z, x, zmone);
+  fcontract(mypublic, z);
+  return 0;
+}

diff --git a/nacl/crypto_scalarmult/curve25519/donna_c64/smult.c b/nacl/crypto_scalarmult/curve25519/donna_c64/smult.c new file mode 100644 index 00000000..6d26956b --- /dev/null +++ b/nacl/crypto_scalarmult/curve25519/donna_c64/smult.c
@@ -0,0 +1,477 @@
	1	/* Copyright 2008, Google Inc.
	2	* All rights reserved.
	3	*
	4	* Code released into the public domain.
	5	*
	6	* curve25519-donna: Curve25519 elliptic curve, public key function
	7	*
	8	* http://code.google.com/p/curve25519-donna/
	9	*
	10	* Adam Langley <agl@imperialviolet.org>
	11	*
	12	* Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
	13	*
	14	* More information about curve25519 can be found here
	15	* http://cr.yp.to/ecdh.html
	16	*
	17	* djb's sample implementation of curve25519 is written in a special assembly
	18	* language called qhasm and uses the floating point registers.
	19	*
	20	* This is, almost, a clean room reimplementation from the curve25519 paper. It
	21	* uses many of the tricks described therein. Only the crecip function is taken
	22	* from the sample implementation.
	23	*/
	24
	25	#include <string.h>
	26	#include <stdint.h>
	27	#include "crypto_scalarmult.h"
	28
	29	typedef uint8_t u8;
	30	typedef uint64_t felem;
	31	// This is a special gcc mode for 128-bit integers. It's implemented on 64-bit
	32	// platforms only as far as I know.
	33	typedef unsigned uint128_t __attribute__((mode(TI)));
	34
	35	/* Sum two numbers: output += in */
	36	static void fsum(felem output, const felem in) {
	37	unsigned i;
	38	for (i = 0; i < 5; ++i) output[i] += in[i];
	39	}
	40
	41	/* Find the difference of two numbers: output = in - output
	42	* (note the order of the arguments!)
	43	*/
	44	static void fdifference_backwards(felem ioutput, const felem iin) {
	45	static const int64_t twotothe51 = (1l << 51);
	46	const int64_t in = (const int64_t ) iin;
	47	int64_t out = (int64_t ) ioutput;
	48
	49	out[0] = in[0] - out[0];
	50	out[1] = in[1] - out[1];
	51	out[2] = in[2] - out[2];
	52	out[3] = in[3] - out[3];
	53	out[4] = in[4] - out[4];
	54
	55	// An arithmetic shift right of 63 places turns a positive number to 0 and a
	56	// negative number to all 1's. This gives us a bitmask that lets us avoid
	57	// side-channel prone branches.
	58	int64_t t;
	59
	60	#define NEGCHAIN(a,b) \
	61	t = out[a] >> 63; \
	62	out[a] += twotothe51 & t; \
	63	out[b] -= 1 & t;
	64
	65	#define NEGCHAIN19(a,b) \
	66	t = out[a] >> 63; \
	67	out[a] += twotothe51 & t; \
	68	out[b] -= 19 & t;
	69
	70	NEGCHAIN(0, 1);
	71	NEGCHAIN(1, 2);
	72	NEGCHAIN(2, 3);
	73	NEGCHAIN(3, 4);
	74	NEGCHAIN19(4, 0);
	75	NEGCHAIN(0, 1);
	76	NEGCHAIN(1, 2);
	77	NEGCHAIN(2, 3);
	78	NEGCHAIN(3, 4);
	79	}
	80
	81	/* Multiply a number by a scalar: output = in * scalar */
	82	static void fscalar_product(felem output, const felem in, const felem scalar) {
	83	uint128_t a;
	84
	85	a = ((uint128_t) in[0]) * scalar;
	86	output[0] = a & 0x7ffffffffffff;
	87
	88	a = ((uint128_t) in[1]) * scalar + (a >> 51);
	89	output[1] = a & 0x7ffffffffffff;
	90
	91	a = ((uint128_t) in[2]) * scalar + (a >> 51);
	92	output[2] = a & 0x7ffffffffffff;
	93
	94	a = ((uint128_t) in[3]) * scalar + (a >> 51);
	95	output[3] = a & 0x7ffffffffffff;
	96
	97	a = ((uint128_t) in[4]) * scalar + (a >> 51);
	98	output[4] = a & 0x7ffffffffffff;
	99
	100	output[0] += (a >> 51) * 19;
	101	}
	102
	103	/* Multiply two numbers: output = in2 * in
	104	*
	105	* output must be distinct to both inputs. The inputs are reduced coefficient
	106	* form, the output is not.
	107	*/
	108	static void fmul(felem output, const felem in2, const felem *in) {
	109	uint128_t t[9];
	110
	111	t[0] = ((uint128_t) in[0]) * in2[0];
	112	t[1] = ((uint128_t) in[0]) * in2[1] +
	113	((uint128_t) in[1]) * in2[0];
	114	t[2] = ((uint128_t) in[0]) * in2[2] +
	115	((uint128_t) in[2]) * in2[0] +
	116	((uint128_t) in[1]) * in2[1];
	117	t[3] = ((uint128_t) in[0]) * in2[3] +
	118	((uint128_t) in[3]) * in2[0] +
	119	((uint128_t) in[1]) * in2[2] +
	120	((uint128_t) in[2]) * in2[1];
	121	t[4] = ((uint128_t) in[0]) * in2[4] +
	122	((uint128_t) in[4]) * in2[0] +
	123	((uint128_t) in[3]) * in2[1] +
	124	((uint128_t) in[1]) * in2[3] +
	125	((uint128_t) in[2]) * in2[2];
	126	t[5] = ((uint128_t) in[4]) * in2[1] +
	127	((uint128_t) in[1]) * in2[4] +
	128	((uint128_t) in[2]) * in2[3] +
	129	((uint128_t) in[3]) * in2[2];
	130	t[6] = ((uint128_t) in[4]) * in2[2] +
	131	((uint128_t) in[2]) * in2[4] +
	132	((uint128_t) in[3]) * in2[3];
	133	t[7] = ((uint128_t) in[3]) * in2[4] +
	134	((uint128_t) in[4]) * in2[3];
	135	t[8] = ((uint128_t) in[4]) * in2[4];
	136
	137	t[0] += t[5] * 19;
	138	t[1] += t[6] * 19;
	139	t[2] += t[7] * 19;
	140	t[3] += t[8] * 19;
	141
	142	t[1] += t[0] >> 51;
	143	t[0] &= 0x7ffffffffffff;
	144	t[2] += t[1] >> 51;
	145	t[1] &= 0x7ffffffffffff;
	146	t[3] += t[2] >> 51;
	147	t[2] &= 0x7ffffffffffff;
	148	t[4] += t[3] >> 51;
	149	t[3] &= 0x7ffffffffffff;
	150	t[0] += 19 * (t[4] >> 51);
	151	t[4] &= 0x7ffffffffffff;
	152	t[1] += t[0] >> 51;
	153	t[0] &= 0x7ffffffffffff;
	154	t[2] += t[1] >> 51;
	155	t[1] &= 0x7ffffffffffff;
	156
	157	output[0] = t[0];
	158	output[1] = t[1];
	159	output[2] = t[2];
	160	output[3] = t[3];
	161	output[4] = t[4];
	162	}
	163
	164	static void
	165	fsquare(felem output, const felem in) {
	166	uint128_t t[9];
	167
	168	t[0] = ((uint128_t) in[0]) * in[0];
	169	t[1] = ((uint128_t) in[0]) * in[1] * 2;
	170	t[2] = ((uint128_t) in[0]) * in[2] * 2 +
	171	((uint128_t) in[1]) * in[1];
	172	t[3] = ((uint128_t) in[0]) * in[3] * 2 +
	173	((uint128_t) in[1]) * in[2] * 2;
	174	t[4] = ((uint128_t) in[0]) * in[4] * 2 +
	175	((uint128_t) in[3]) * in[1] * 2 +
	176	((uint128_t) in[2]) * in[2];
	177	t[5] = ((uint128_t) in[4]) * in[1] * 2 +
	178	((uint128_t) in[2]) * in[3] * 2;
	179	t[6] = ((uint128_t) in[4]) * in[2] * 2 +
	180	((uint128_t) in[3]) * in[3];
	181	t[7] = ((uint128_t) in[3]) * in[4] * 2;
	182	t[8] = ((uint128_t) in[4]) * in[4];
	183
	184	t[0] += t[5] * 19;
	185	t[1] += t[6] * 19;
	186	t[2] += t[7] * 19;
	187	t[3] += t[8] * 19;
	188
	189	t[1] += t[0] >> 51;
	190	t[0] &= 0x7ffffffffffff;
	191	t[2] += t[1] >> 51;
	192	t[1] &= 0x7ffffffffffff;
	193	t[3] += t[2] >> 51;
	194	t[2] &= 0x7ffffffffffff;
	195	t[4] += t[3] >> 51;
	196	t[3] &= 0x7ffffffffffff;
	197	t[0] += 19 * (t[4] >> 51);
	198	t[4] &= 0x7ffffffffffff;
	199	t[1] += t[0] >> 51;
	200	t[0] &= 0x7ffffffffffff;
	201
	202	output[0] = t[0];
	203	output[1] = t[1];
	204	output[2] = t[2];
	205	output[3] = t[3];
	206	output[4] = t[4];
	207	}
	208
	209	/* Take a little-endian, 32-byte number and expand it into polynomial form */
	210	static void
	211	fexpand(felem output, const u8 in) {
	212	output[0] = ((const uint64_t )(in)) & 0x7ffffffffffff;
	213	output[1] = (((const uint64_t )(in+6)) >> 3) & 0x7ffffffffffff;
	214	output[2] = (((const uint64_t )(in+12)) >> 6) & 0x7ffffffffffff;
	215	output[3] = (((const uint64_t )(in+19)) >> 1) & 0x7ffffffffffff;
	216	output[4] = (((const uint64_t )(in+25)) >> 4) & 0x7ffffffffffff;
	217	}
	218
	219	/* Take a fully reduced polynomial form number and contract it into a
	220	* little-endian, 32-byte array
	221	*/
	222	static void
	223	fcontract(u8 output, const felem input) {
	224	uint128_t t[5];
	225
	226	t[0] = input[0];
	227	t[1] = input[1];
	228	t[2] = input[2];
	229	t[3] = input[3];
	230	t[4] = input[4];
	231
	232	t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
	233	t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
	234	t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
	235	t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
	236	t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff;
	237
	238	t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
	239	t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
	240	t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
	241	t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
	242	t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff;
	243
	244	/* now t is between 0 and 2^255-1, properly carried. */
	245	/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
	246
	247	t[0] += 19;
	248
	249	t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
	250	t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
	251	t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
	252	t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
	253	t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff;
	254
	255	/* now between 19 and 2^255-1 in both cases, and offset by 19. */
	256
	257	t[0] += 0x8000000000000 - 19;
	258	t[1] += 0x8000000000000 - 1;
	259	t[2] += 0x8000000000000 - 1;
	260	t[3] += 0x8000000000000 - 1;
	261	t[4] += 0x8000000000000 - 1;
	262
	263	/* now between 2^255 and 2^256-20, and offset by 2^255. */
	264
	265	t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
	266	t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
	267	t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
	268	t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
	269	t[4] &= 0x7ffffffffffff;
	270
	271	((uint64_t )(output)) = t[0] \| (t[1] << 51);
	272	((uint64_t )(output+8)) = (t[1] >> 13) \| (t[2] << 38);
	273	((uint64_t )(output+16)) = (t[2] >> 26) \| (t[3] << 25);
	274	((uint64_t )(output+24)) = (t[3] >> 39) \| (t[4] << 12);
	275	}
	276
	277	/* Input: Q, Q', Q-Q'
	278	* Output: 2Q, Q+Q'
	279	*
	280	* x2 z3: long form
	281	* x3 z3: long form
	282	* x z: short form, destroyed
	283	* xprime zprime: short form, destroyed
	284	* qmqp: short form, preserved
	285	*/
	286	static void
	287	fmonty(felem x2, felem z2, /* output 2Q */
	288	felem x3, felem z3, /* output Q + Q' */
	289	felem x, felem z, /* input Q */
	290	felem xprime, felem zprime, /* input Q' */
	291	const felem qmqp / input Q - Q' */) {
	292	felem origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5],
	293	zzprime[5], zzzprime[5];
	294
	295	memcpy(origx, x, 5 * sizeof(felem));
	296	fsum(x, z);
	297	fdifference_backwards(z, origx); // does x - z
	298
	299	memcpy(origxprime, xprime, sizeof(felem) * 5);
	300	fsum(xprime, zprime);
	301	fdifference_backwards(zprime, origxprime);
	302	fmul(xxprime, xprime, z);
	303	fmul(zzprime, x, zprime);
	304	memcpy(origxprime, xxprime, sizeof(felem) * 5);
	305	fsum(xxprime, zzprime);
	306	fdifference_backwards(zzprime, origxprime);
	307	fsquare(x3, xxprime);
	308	fsquare(zzzprime, zzprime);
	309	fmul(z3, zzzprime, qmqp);
	310
	311	fsquare(xx, x);
	312	fsquare(zz, z);
	313	fmul(x2, xx, zz);
	314	fdifference_backwards(zz, xx); // does zz = xx - zz
	315	fscalar_product(zzz, zz, 121665);
	316	fsum(zzz, xx);
	317	fmul(z2, zz, zzz);
	318	}
	319
	320	// -----------------------------------------------------------------------------
	321	// Maybe swap the contents of two felem arrays (@a and @b), each @len elements
	322	// long. Perform the swap iff @swap is non-zero.
	323	//
	324	// This function performs the swap without leaking any side-channel
	325	// information.
	326	// -----------------------------------------------------------------------------
	327	static void
	328	swap_conditional(felem a, felem b, unsigned len, felem iswap) {
	329	unsigned i;
	330	const felem swap = -iswap;
	331
	332	for (i = 0; i < len; ++i) {
	333	const felem x = swap & (a[i] ^ b[i]);
	334	a[i] ^= x;
	335	b[i] ^= x;
	336	}
	337	}
	338
	339	/* Calculates nQ where Q is the x-coordinate of a point on the curve
	340	*
	341	* resultx/resultz: the x coordinate of the resulting curve point (short form)
	342	* n: a little endian, 32-byte number
	343	* q: a point of the curve (short form)
	344	*/
	345	static void
	346	cmult(felem resultx, felem resultz, const u8 n, const felem q) {
	347	felem a[5] = {0}, b[5] = {1}, c[5] = {1}, d[5] = {0};
	348	felem nqpqx = a, nqpqz = b, nqx = c, nqz = d, *t;
	349	felem e[5] = {0}, f[5] = {1}, g[5] = {0}, h[5] = {1};
	350	felem nqpqx2 = e, nqpqz2 = f, nqx2 = g, nqz2 = h;
	351
	352	unsigned i, j;
	353
	354	memcpy(nqpqx, q, sizeof(felem) * 5);
	355
	356	for (i = 0; i < 32; ++i) {
	357	u8 byte = n[31 - i];
	358	for (j = 0; j < 8; ++j) {
	359	const felem bit = byte >> 7;
	360
	361	swap_conditional(nqx, nqpqx, 5, bit);
	362	swap_conditional(nqz, nqpqz, 5, bit);
	363	fmonty(nqx2, nqz2,
	364	nqpqx2, nqpqz2,
	365	nqx, nqz,
	366	nqpqx, nqpqz,
	367	q);
	368	swap_conditional(nqx2, nqpqx2, 5, bit);
	369	swap_conditional(nqz2, nqpqz2, 5, bit);
	370
	371	t = nqx;
	372	nqx = nqx2;
	373	nqx2 = t;
	374	t = nqz;
	375	nqz = nqz2;
	376	nqz2 = t;
	377	t = nqpqx;
	378	nqpqx = nqpqx2;
	379	nqpqx2 = t;
	380	t = nqpqz;
	381	nqpqz = nqpqz2;
	382	nqpqz2 = t;
	383
	384	byte <<= 1;
	385	}
	386	}
	387
	388	memcpy(resultx, nqx, sizeof(felem) * 5);
	389	memcpy(resultz, nqz, sizeof(felem) * 5);
	390	}
	391
	392	// -----------------------------------------------------------------------------
	393	// Shamelessly copied from djb's code
	394	// -----------------------------------------------------------------------------
	395	static void
	396	crecip(felem out, const felem z) {
	397	felem z2[5];
	398	felem z9[5];
	399	felem z11[5];
	400	felem z2_5_0[5];
	401	felem z2_10_0[5];
	402	felem z2_20_0[5];
	403	felem z2_50_0[5];
	404	felem z2_100_0[5];
	405	felem t0[5];
	406	felem t1[5];
	407	int i;
	408
	409	/* 2 */ fsquare(z2,z);
	410	/* 4 */ fsquare(t1,z2);
	411	/* 8 */ fsquare(t0,t1);
	412	/* 9 */ fmul(z9,t0,z);
	413	/* 11 */ fmul(z11,z9,z2);
	414	/* 22 */ fsquare(t0,z11);
	415	/* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
	416
	417	/* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
	418	/* 2^7 - 2^2 */ fsquare(t1,t0);
	419	/* 2^8 - 2^3 */ fsquare(t0,t1);
	420	/* 2^9 - 2^4 */ fsquare(t1,t0);
	421	/* 2^10 - 2^5 */ fsquare(t0,t1);
	422	/* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
	423
	424	/* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
	425	/* 2^12 - 2^2 */ fsquare(t1,t0);
	426	/* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
	427	/* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
	428
	429	/* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
	430	/* 2^22 - 2^2 */ fsquare(t1,t0);
	431	/* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
	432	/* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
	433
	434	/* 2^41 - 2^1 */ fsquare(t1,t0);
	435	/* 2^42 - 2^2 */ fsquare(t0,t1);
	436	/* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
	437	/* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
	438
	439	/* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
	440	/* 2^52 - 2^2 */ fsquare(t1,t0);
	441	/* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
	442	/* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
	443
	444	/* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
	445	/* 2^102 - 2^2 */ fsquare(t0,t1);
	446	/* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
	447	/* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
	448
	449	/* 2^201 - 2^1 */ fsquare(t0,t1);
	450	/* 2^202 - 2^2 */ fsquare(t1,t0);
	451	/* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
	452	/* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
	453
	454	/* 2^251 - 2^1 */ fsquare(t1,t0);
	455	/* 2^252 - 2^2 */ fsquare(t0,t1);
	456	/* 2^253 - 2^3 */ fsquare(t1,t0);
	457	/* 2^254 - 2^4 */ fsquare(t0,t1);
	458	/* 2^255 - 2^5 */ fsquare(t1,t0);
	459	/* 2^255 - 21 */ fmul(out,t1,z11);
	460	}
	461
	462	int
	463	crypto_scalarmult(u8 mypublic, const u8 secret, const u8 *basepoint) {
	464	felem bp[5], x[5], z[5], zmone[5];
	465	unsigned char e[32];
	466	int i;
	467	for (i = 0;i < 32;++i) e[i] = secret[i];
	468	e[0] &= 248;
	469	e[31] &= 127;
	470	e[31] \|= 64;
	471	fexpand(bp, basepoint);
	472	cmult(x, z, e, bp);
	473	crecip(zmone, z);
	474	fmul(z, x, zmone);
	475	fcontract(mypublic, z);
	476	return 0;
	477	}