1 // The MIT License (MIT)
2 //
3 // Copyright (c) 2015-2016 the fiat-crypto authors (see the AUTHORS file).
4 //
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
11 //
12 // The above copyright notice and this permission notice shall be included in all
13 // copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 // SOFTWARE.
22 
23 // Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP
24 // 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
25 // public domain but parts have been replaced with code generated by Fiat
26 // (https://github.com/mit-plv/fiat-crypto), which is MIT licensed.
27 //
28 // The field functions are shared by Ed25519 and X25519 where possible.
29 
30 #include <openssl/curve25519.h>
31 
32 #include <assert.h>
33 #include <string.h>
34 
35 #include <openssl/cpu.h>
36 #include <openssl/mem.h>
37 #include <openssl/rand.h>
38 #include <openssl/sha.h>
39 #include <openssl/type_check.h>
40 
41 #include "internal.h"
42 #include "../../crypto/internal.h"
43 
44 
45 // Various pre-computed constants.
46 #include "./curve25519_tables.h"
47 
48 #if defined(BORINGSSL_CURVE25519_64BIT)
49 #include "./curve25519_64.h"
50 #else
51 #include "./curve25519_32.h"
52 #endif  // BORINGSSL_CURVE25519_64BIT
53 
54 
55 // Low-level intrinsic operations
56 
load_3(const uint8_t * in)57 static uint64_t load_3(const uint8_t *in) {
58   uint64_t result;
59   result = (uint64_t)in[0];
60   result |= ((uint64_t)in[1]) << 8;
61   result |= ((uint64_t)in[2]) << 16;
62   return result;
63 }
64 
load_4(const uint8_t * in)65 static uint64_t load_4(const uint8_t *in) {
66   uint64_t result;
67   result = (uint64_t)in[0];
68   result |= ((uint64_t)in[1]) << 8;
69   result |= ((uint64_t)in[2]) << 16;
70   result |= ((uint64_t)in[3]) << 24;
71   return result;
72 }
73 
74 
75 // Field operations.
76 
77 #if defined(BORINGSSL_CURVE25519_64BIT)
78 
79 typedef uint64_t fe_limb_t;
80 #define FE_NUM_LIMBS 5
81 
82 // assert_fe asserts that |f| satisfies bounds:
83 //
84 //  [[0x0 ~> 0x8cccccccccccc],
85 //   [0x0 ~> 0x8cccccccccccc],
86 //   [0x0 ~> 0x8cccccccccccc],
87 //   [0x0 ~> 0x8cccccccccccc],
88 //   [0x0 ~> 0x8cccccccccccc]]
89 //
90 // See comments in curve25519_64.h for which functions use these bounds for
91 // inputs or outputs.
92 #define assert_fe(f)                                                    \
93   do {                                                                  \
94     for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
95       assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc));             \
96     }                                                                   \
97   } while (0)
98 
99 // assert_fe_loose asserts that |f| satisfies bounds:
100 //
101 //  [[0x0 ~> 0x1a666666666664],
102 //   [0x0 ~> 0x1a666666666664],
103 //   [0x0 ~> 0x1a666666666664],
104 //   [0x0 ~> 0x1a666666666664],
105 //   [0x0 ~> 0x1a666666666664]]
106 //
107 // See comments in curve25519_64.h for which functions use these bounds for
108 // inputs or outputs.
109 #define assert_fe_loose(f)                                              \
110   do {                                                                  \
111     for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
112       assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664));            \
113     }                                                                   \
114   } while (0)
115 
116 #else
117 
118 typedef uint32_t fe_limb_t;
119 #define FE_NUM_LIMBS 10
120 
121 // assert_fe asserts that |f| satisfies bounds:
122 //
123 //  [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
124 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
125 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
126 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
127 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
128 //
129 // See comments in curve25519_32.h for which functions use these bounds for
130 // inputs or outputs.
131 #define assert_fe(f)                                                     \
132   do {                                                                   \
133     for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
134       assert(f[_assert_fe_i] <=                                          \
135              ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u));            \
136     }                                                                    \
137   } while (0)
138 
139 // assert_fe_loose asserts that |f| satisfies bounds:
140 //
141 //  [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
142 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
143 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
144 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
145 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
146 //
147 // See comments in curve25519_32.h for which functions use these bounds for
148 // inputs or outputs.
149 #define assert_fe_loose(f)                                               \
150   do {                                                                   \
151     for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
152       assert(f[_assert_fe_i] <=                                          \
153              ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u));            \
154     }                                                                    \
155   } while (0)
156 
157 #endif  // BORINGSSL_CURVE25519_64BIT
158 
159 OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
160                       "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");
161 
fe_frombytes_strict(fe * h,const uint8_t s[32])162 static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
163   // |fiat_25519_from_bytes| requires the top-most bit be clear.
164   assert((s[31] & 0x80) == 0);
165   fiat_25519_from_bytes(h->v, s);
166   assert_fe(h->v);
167 }
168 
fe_frombytes(fe * h,const uint8_t s[32])169 static void fe_frombytes(fe *h, const uint8_t s[32]) {
170   uint8_t s_copy[32];
171   OPENSSL_memcpy(s_copy, s, 32);
172   s_copy[31] &= 0x7f;
173   fe_frombytes_strict(h, s_copy);
174 }
175 
fe_tobytes(uint8_t s[32],const fe * f)176 static void fe_tobytes(uint8_t s[32], const fe *f) {
177   assert_fe(f->v);
178   fiat_25519_to_bytes(s, f->v);
179 }
180 
181 // h = 0
fe_0(fe * h)182 static void fe_0(fe *h) {
183   OPENSSL_memset(h, 0, sizeof(fe));
184 }
185 
fe_loose_0(fe_loose * h)186 static void fe_loose_0(fe_loose *h) {
187   OPENSSL_memset(h, 0, sizeof(fe_loose));
188 }
189 
190 // h = 1
fe_1(fe * h)191 static void fe_1(fe *h) {
192   OPENSSL_memset(h, 0, sizeof(fe));
193   h->v[0] = 1;
194 }
195 
fe_loose_1(fe_loose * h)196 static void fe_loose_1(fe_loose *h) {
197   OPENSSL_memset(h, 0, sizeof(fe_loose));
198   h->v[0] = 1;
199 }
200 
201 // h = f + g
202 // Can overlap h with f or g.
fe_add(fe_loose * h,const fe * f,const fe * g)203 static void fe_add(fe_loose *h, const fe *f, const fe *g) {
204   assert_fe(f->v);
205   assert_fe(g->v);
206   fiat_25519_add(h->v, f->v, g->v);
207   assert_fe_loose(h->v);
208 }
209 
210 // h = f - g
211 // Can overlap h with f or g.
fe_sub(fe_loose * h,const fe * f,const fe * g)212 static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
213   assert_fe(f->v);
214   assert_fe(g->v);
215   fiat_25519_sub(h->v, f->v, g->v);
216   assert_fe_loose(h->v);
217 }
218 
fe_carry(fe * h,const fe_loose * f)219 static void fe_carry(fe *h, const fe_loose* f) {
220   assert_fe_loose(f->v);
221   fiat_25519_carry(h->v, f->v);
222   assert_fe(h->v);
223 }
224 
fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],const fe_limb_t in1[FE_NUM_LIMBS],const fe_limb_t in2[FE_NUM_LIMBS])225 static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
226                         const fe_limb_t in1[FE_NUM_LIMBS],
227                         const fe_limb_t in2[FE_NUM_LIMBS]) {
228   assert_fe_loose(in1);
229   assert_fe_loose(in2);
230   fiat_25519_carry_mul(out, in1, in2);
231   assert_fe(out);
232 }
233 
fe_mul_ltt(fe_loose * h,const fe * f,const fe * g)234 static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
235   fe_mul_impl(h->v, f->v, g->v);
236 }
237 
fe_mul_llt(fe_loose * h,const fe_loose * f,const fe * g)238 static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {
239   fe_mul_impl(h->v, f->v, g->v);
240 }
241 
fe_mul_ttt(fe * h,const fe * f,const fe * g)242 static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
243   fe_mul_impl(h->v, f->v, g->v);
244 }
245 
fe_mul_tlt(fe * h,const fe_loose * f,const fe * g)246 static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
247   fe_mul_impl(h->v, f->v, g->v);
248 }
249 
fe_mul_ttl(fe * h,const fe * f,const fe_loose * g)250 static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
251   fe_mul_impl(h->v, f->v, g->v);
252 }
253 
fe_mul_tll(fe * h,const fe_loose * f,const fe_loose * g)254 static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
255   fe_mul_impl(h->v, f->v, g->v);
256 }
257 
fe_sq_tl(fe * h,const fe_loose * f)258 static void fe_sq_tl(fe *h, const fe_loose *f) {
259   assert_fe_loose(f->v);
260   fiat_25519_carry_square(h->v, f->v);
261   assert_fe(h->v);
262 }
263 
fe_sq_tt(fe * h,const fe * f)264 static void fe_sq_tt(fe *h, const fe *f) {
265   assert_fe_loose(f->v);
266   fiat_25519_carry_square(h->v, f->v);
267   assert_fe(h->v);
268 }
269 
270 // Replace (f,g) with (g,f) if b == 1;
271 // replace (f,g) with (f,g) if b == 0.
272 //
273 // Preconditions: b in {0,1}.
fe_cswap(fe * f,fe * g,fe_limb_t b)274 static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
275   b = 0-b;
276   for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
277     fe_limb_t x = f->v[i] ^ g->v[i];
278     x &= b;
279     f->v[i] ^= x;
280     g->v[i] ^= x;
281   }
282 }
283 
fe_mul121666(fe * h,const fe_loose * f)284 static void fe_mul121666(fe *h, const fe_loose *f) {
285   assert_fe_loose(f->v);
286   fiat_25519_carry_scmul_121666(h->v, f->v);
287   assert_fe(h->v);
288 }
289 
290 // h = -f
fe_neg(fe_loose * h,const fe * f)291 static void fe_neg(fe_loose *h, const fe *f) {
292   assert_fe(f->v);
293   fiat_25519_opp(h->v, f->v);
294   assert_fe_loose(h->v);
295 }
296 
297 // Replace (f,g) with (g,g) if b == 1;
298 // replace (f,g) with (f,g) if b == 0.
299 //
300 // Preconditions: b in {0,1}.
fe_cmov(fe_loose * f,const fe_loose * g,fe_limb_t b)301 static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
302   // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the
303   // calling convention the rest of this code wants, so implement it by hand.
304   //
305   // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
306   // different one.
307   (void)fiat_25519_selectznz;
308 
309   b = 0-b;
310   for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
311     fe_limb_t x = f->v[i] ^ g->v[i];
312     x &= b;
313     f->v[i] ^= x;
314   }
315 }
316 
317 // h = f
fe_copy(fe * h,const fe * f)318 static void fe_copy(fe *h, const fe *f) {
319   OPENSSL_memmove(h, f, sizeof(fe));
320 }
321 
fe_copy_lt(fe_loose * h,const fe * f)322 static void fe_copy_lt(fe_loose *h, const fe *f) {
323   OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe),
324                         "fe and fe_loose mismatch");
325   OPENSSL_memmove(h, f, sizeof(fe));
326 }
327 #if !defined(OPENSSL_SMALL)
fe_copy_ll(fe_loose * h,const fe_loose * f)328 static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
329   OPENSSL_memmove(h, f, sizeof(fe_loose));
330 }
331 #endif // !defined(OPENSSL_SMALL)
332 
fe_loose_invert(fe * out,const fe_loose * z)333 static void fe_loose_invert(fe *out, const fe_loose *z) {
334   fe t0;
335   fe t1;
336   fe t2;
337   fe t3;
338   int i;
339 
340   fe_sq_tl(&t0, z);
341   fe_sq_tt(&t1, &t0);
342   for (i = 1; i < 2; ++i) {
343     fe_sq_tt(&t1, &t1);
344   }
345   fe_mul_tlt(&t1, z, &t1);
346   fe_mul_ttt(&t0, &t0, &t1);
347   fe_sq_tt(&t2, &t0);
348   fe_mul_ttt(&t1, &t1, &t2);
349   fe_sq_tt(&t2, &t1);
350   for (i = 1; i < 5; ++i) {
351     fe_sq_tt(&t2, &t2);
352   }
353   fe_mul_ttt(&t1, &t2, &t1);
354   fe_sq_tt(&t2, &t1);
355   for (i = 1; i < 10; ++i) {
356     fe_sq_tt(&t2, &t2);
357   }
358   fe_mul_ttt(&t2, &t2, &t1);
359   fe_sq_tt(&t3, &t2);
360   for (i = 1; i < 20; ++i) {
361     fe_sq_tt(&t3, &t3);
362   }
363   fe_mul_ttt(&t2, &t3, &t2);
364   fe_sq_tt(&t2, &t2);
365   for (i = 1; i < 10; ++i) {
366     fe_sq_tt(&t2, &t2);
367   }
368   fe_mul_ttt(&t1, &t2, &t1);
369   fe_sq_tt(&t2, &t1);
370   for (i = 1; i < 50; ++i) {
371     fe_sq_tt(&t2, &t2);
372   }
373   fe_mul_ttt(&t2, &t2, &t1);
374   fe_sq_tt(&t3, &t2);
375   for (i = 1; i < 100; ++i) {
376     fe_sq_tt(&t3, &t3);
377   }
378   fe_mul_ttt(&t2, &t3, &t2);
379   fe_sq_tt(&t2, &t2);
380   for (i = 1; i < 50; ++i) {
381     fe_sq_tt(&t2, &t2);
382   }
383   fe_mul_ttt(&t1, &t2, &t1);
384   fe_sq_tt(&t1, &t1);
385   for (i = 1; i < 5; ++i) {
386     fe_sq_tt(&t1, &t1);
387   }
388   fe_mul_ttt(out, &t1, &t0);
389 }
390 
fe_invert(fe * out,const fe * z)391 static void fe_invert(fe *out, const fe *z) {
392   fe_loose l;
393   fe_copy_lt(&l, z);
394   fe_loose_invert(out, &l);
395 }
396 
397 // return 0 if f == 0
398 // return 1 if f != 0
fe_isnonzero(const fe_loose * f)399 static int fe_isnonzero(const fe_loose *f) {
400   fe tight;
401   fe_carry(&tight, f);
402   uint8_t s[32];
403   fe_tobytes(s, &tight);
404 
405   static const uint8_t zero[32] = {0};
406   return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;
407 }
408 
409 // return 1 if f is in {1,3,5,...,q-2}
410 // return 0 if f is in {0,2,4,...,q-1}
fe_isnegative(const fe * f)411 static int fe_isnegative(const fe *f) {
412   uint8_t s[32];
413   fe_tobytes(s, f);
414   return s[0] & 1;
415 }
416 
fe_sq2_tt(fe * h,const fe * f)417 static void fe_sq2_tt(fe *h, const fe *f) {
418   // h = f^2
419   fe_sq_tt(h, f);
420 
421   // h = h + h
422   fe_loose tmp;
423   fe_add(&tmp, h, h);
424   fe_carry(h, &tmp);
425 }
426 
fe_pow22523(fe * out,const fe * z)427 static void fe_pow22523(fe *out, const fe *z) {
428   fe t0;
429   fe t1;
430   fe t2;
431   int i;
432 
433   fe_sq_tt(&t0, z);
434   fe_sq_tt(&t1, &t0);
435   for (i = 1; i < 2; ++i) {
436     fe_sq_tt(&t1, &t1);
437   }
438   fe_mul_ttt(&t1, z, &t1);
439   fe_mul_ttt(&t0, &t0, &t1);
440   fe_sq_tt(&t0, &t0);
441   fe_mul_ttt(&t0, &t1, &t0);
442   fe_sq_tt(&t1, &t0);
443   for (i = 1; i < 5; ++i) {
444     fe_sq_tt(&t1, &t1);
445   }
446   fe_mul_ttt(&t0, &t1, &t0);
447   fe_sq_tt(&t1, &t0);
448   for (i = 1; i < 10; ++i) {
449     fe_sq_tt(&t1, &t1);
450   }
451   fe_mul_ttt(&t1, &t1, &t0);
452   fe_sq_tt(&t2, &t1);
453   for (i = 1; i < 20; ++i) {
454     fe_sq_tt(&t2, &t2);
455   }
456   fe_mul_ttt(&t1, &t2, &t1);
457   fe_sq_tt(&t1, &t1);
458   for (i = 1; i < 10; ++i) {
459     fe_sq_tt(&t1, &t1);
460   }
461   fe_mul_ttt(&t0, &t1, &t0);
462   fe_sq_tt(&t1, &t0);
463   for (i = 1; i < 50; ++i) {
464     fe_sq_tt(&t1, &t1);
465   }
466   fe_mul_ttt(&t1, &t1, &t0);
467   fe_sq_tt(&t2, &t1);
468   for (i = 1; i < 100; ++i) {
469     fe_sq_tt(&t2, &t2);
470   }
471   fe_mul_ttt(&t1, &t2, &t1);
472   fe_sq_tt(&t1, &t1);
473   for (i = 1; i < 50; ++i) {
474     fe_sq_tt(&t1, &t1);
475   }
476   fe_mul_ttt(&t0, &t1, &t0);
477   fe_sq_tt(&t0, &t0);
478   for (i = 1; i < 2; ++i) {
479     fe_sq_tt(&t0, &t0);
480   }
481   fe_mul_ttt(out, &t0, z);
482 }
483 
484 
485 // Group operations.
486 
x25519_ge_tobytes(uint8_t s[32],const ge_p2 * h)487 void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {
488   fe recip;
489   fe x;
490   fe y;
491 
492   fe_invert(&recip, &h->Z);
493   fe_mul_ttt(&x, &h->X, &recip);
494   fe_mul_ttt(&y, &h->Y, &recip);
495   fe_tobytes(s, &y);
496   s[31] ^= fe_isnegative(&x) << 7;
497 }
498 
ge_p3_tobytes(uint8_t s[32],const ge_p3 * h)499 static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) {
500   fe recip;
501   fe x;
502   fe y;
503 
504   fe_invert(&recip, &h->Z);
505   fe_mul_ttt(&x, &h->X, &recip);
506   fe_mul_ttt(&y, &h->Y, &recip);
507   fe_tobytes(s, &y);
508   s[31] ^= fe_isnegative(&x) << 7;
509 }
510 
x25519_ge_frombytes_vartime(ge_p3 * h,const uint8_t s[32])511 int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
512   fe u;
513   fe_loose v;
514   fe v3;
515   fe vxx;
516   fe_loose check;
517 
518   fe_frombytes(&h->Y, s);
519   fe_1(&h->Z);
520   fe_sq_tt(&v3, &h->Y);
521   fe_mul_ttt(&vxx, &v3, &d);
522   fe_sub(&v, &v3, &h->Z);  // u = y^2-1
523   fe_carry(&u, &v);
524   fe_add(&v, &vxx, &h->Z);  // v = dy^2+1
525 
526   fe_sq_tl(&v3, &v);
527   fe_mul_ttl(&v3, &v3, &v);  // v3 = v^3
528   fe_sq_tt(&h->X, &v3);
529   fe_mul_ttl(&h->X, &h->X, &v);
530   fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^7
531 
532   fe_pow22523(&h->X, &h->X);  // x = (uv^7)^((q-5)/8)
533   fe_mul_ttt(&h->X, &h->X, &v3);
534   fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^3(uv^7)^((q-5)/8)
535 
536   fe_sq_tt(&vxx, &h->X);
537   fe_mul_ttl(&vxx, &vxx, &v);
538   fe_sub(&check, &vxx, &u);
539   if (fe_isnonzero(&check)) {
540     fe_add(&check, &vxx, &u);
541     if (fe_isnonzero(&check)) {
542       return 0;
543     }
544     fe_mul_ttt(&h->X, &h->X, &sqrtm1);
545   }
546 
547   if (fe_isnegative(&h->X) != (s[31] >> 7)) {
548     fe_loose t;
549     fe_neg(&t, &h->X);
550     fe_carry(&h->X, &t);
551   }
552 
553   fe_mul_ttt(&h->T, &h->X, &h->Y);
554   return 1;
555 }
556 
ge_p2_0(ge_p2 * h)557 static void ge_p2_0(ge_p2 *h) {
558   fe_0(&h->X);
559   fe_1(&h->Y);
560   fe_1(&h->Z);
561 }
562 
ge_p3_0(ge_p3 * h)563 static void ge_p3_0(ge_p3 *h) {
564   fe_0(&h->X);
565   fe_1(&h->Y);
566   fe_1(&h->Z);
567   fe_0(&h->T);
568 }
569 
ge_cached_0(ge_cached * h)570 static void ge_cached_0(ge_cached *h) {
571   fe_loose_1(&h->YplusX);
572   fe_loose_1(&h->YminusX);
573   fe_loose_1(&h->Z);
574   fe_loose_0(&h->T2d);
575 }
576 
ge_precomp_0(ge_precomp * h)577 static void ge_precomp_0(ge_precomp *h) {
578   fe_loose_1(&h->yplusx);
579   fe_loose_1(&h->yminusx);
580   fe_loose_0(&h->xy2d);
581 }
582 
583 // r = p
ge_p3_to_p2(ge_p2 * r,const ge_p3 * p)584 static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
585   fe_copy(&r->X, &p->X);
586   fe_copy(&r->Y, &p->Y);
587   fe_copy(&r->Z, &p->Z);
588 }
589 
590 // r = p
x25519_ge_p3_to_cached(ge_cached * r,const ge_p3 * p)591 void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
592   fe_add(&r->YplusX, &p->Y, &p->X);
593   fe_sub(&r->YminusX, &p->Y, &p->X);
594   fe_copy_lt(&r->Z, &p->Z);
595   fe_mul_ltt(&r->T2d, &p->T, &d2);
596 }
597 
598 // r = p
x25519_ge_p1p1_to_p2(ge_p2 * r,const ge_p1p1 * p)599 void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
600   fe_mul_tll(&r->X, &p->X, &p->T);
601   fe_mul_tll(&r->Y, &p->Y, &p->Z);
602   fe_mul_tll(&r->Z, &p->Z, &p->T);
603 }
604 
605 // r = p
x25519_ge_p1p1_to_p3(ge_p3 * r,const ge_p1p1 * p)606 void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
607   fe_mul_tll(&r->X, &p->X, &p->T);
608   fe_mul_tll(&r->Y, &p->Y, &p->Z);
609   fe_mul_tll(&r->Z, &p->Z, &p->T);
610   fe_mul_tll(&r->T, &p->X, &p->Y);
611 }
612 
613 // r = p
ge_p1p1_to_cached(ge_cached * r,const ge_p1p1 * p)614 static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
615   ge_p3 t;
616   x25519_ge_p1p1_to_p3(&t, p);
617   x25519_ge_p3_to_cached(r, &t);
618 }
619 
620 // r = 2 * p
ge_p2_dbl(ge_p1p1 * r,const ge_p2 * p)621 static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
622   fe trX, trZ, trT;
623   fe t0;
624 
625   fe_sq_tt(&trX, &p->X);
626   fe_sq_tt(&trZ, &p->Y);
627   fe_sq2_tt(&trT, &p->Z);
628   fe_add(&r->Y, &p->X, &p->Y);
629   fe_sq_tl(&t0, &r->Y);
630 
631   fe_add(&r->Y, &trZ, &trX);
632   fe_sub(&r->Z, &trZ, &trX);
633   fe_carry(&trZ, &r->Y);
634   fe_sub(&r->X, &t0, &trZ);
635   fe_carry(&trZ, &r->Z);
636   fe_sub(&r->T, &trT, &trZ);
637 }
638 
639 // r = 2 * p
ge_p3_dbl(ge_p1p1 * r,const ge_p3 * p)640 static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
641   ge_p2 q;
642   ge_p3_to_p2(&q, p);
643   ge_p2_dbl(r, &q);
644 }
645 
646 // r = p + q
ge_madd(ge_p1p1 * r,const ge_p3 * p,const ge_precomp * q)647 static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
648   fe trY, trZ, trT;
649 
650   fe_add(&r->X, &p->Y, &p->X);
651   fe_sub(&r->Y, &p->Y, &p->X);
652   fe_mul_tll(&trZ, &r->X, &q->yplusx);
653   fe_mul_tll(&trY, &r->Y, &q->yminusx);
654   fe_mul_tlt(&trT, &q->xy2d, &p->T);
655   fe_add(&r->T, &p->Z, &p->Z);
656   fe_sub(&r->X, &trZ, &trY);
657   fe_add(&r->Y, &trZ, &trY);
658   fe_carry(&trZ, &r->T);
659   fe_add(&r->Z, &trZ, &trT);
660   fe_sub(&r->T, &trZ, &trT);
661 }
662 
663 // r = p - q
ge_msub(ge_p1p1 * r,const ge_p3 * p,const ge_precomp * q)664 static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
665   fe trY, trZ, trT;
666 
667   fe_add(&r->X, &p->Y, &p->X);
668   fe_sub(&r->Y, &p->Y, &p->X);
669   fe_mul_tll(&trZ, &r->X, &q->yminusx);
670   fe_mul_tll(&trY, &r->Y, &q->yplusx);
671   fe_mul_tlt(&trT, &q->xy2d, &p->T);
672   fe_add(&r->T, &p->Z, &p->Z);
673   fe_sub(&r->X, &trZ, &trY);
674   fe_add(&r->Y, &trZ, &trY);
675   fe_carry(&trZ, &r->T);
676   fe_sub(&r->Z, &trZ, &trT);
677   fe_add(&r->T, &trZ, &trT);
678 }
679 
680 // r = p + q
x25519_ge_add(ge_p1p1 * r,const ge_p3 * p,const ge_cached * q)681 void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
682   fe trX, trY, trZ, trT;
683 
684   fe_add(&r->X, &p->Y, &p->X);
685   fe_sub(&r->Y, &p->Y, &p->X);
686   fe_mul_tll(&trZ, &r->X, &q->YplusX);
687   fe_mul_tll(&trY, &r->Y, &q->YminusX);
688   fe_mul_tlt(&trT, &q->T2d, &p->T);
689   fe_mul_ttl(&trX, &p->Z, &q->Z);
690   fe_add(&r->T, &trX, &trX);
691   fe_sub(&r->X, &trZ, &trY);
692   fe_add(&r->Y, &trZ, &trY);
693   fe_carry(&trZ, &r->T);
694   fe_add(&r->Z, &trZ, &trT);
695   fe_sub(&r->T, &trZ, &trT);
696 }
697 
698 // r = p - q
x25519_ge_sub(ge_p1p1 * r,const ge_p3 * p,const ge_cached * q)699 void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
700   fe trX, trY, trZ, trT;
701 
702   fe_add(&r->X, &p->Y, &p->X);
703   fe_sub(&r->Y, &p->Y, &p->X);
704   fe_mul_tll(&trZ, &r->X, &q->YminusX);
705   fe_mul_tll(&trY, &r->Y, &q->YplusX);
706   fe_mul_tlt(&trT, &q->T2d, &p->T);
707   fe_mul_ttl(&trX, &p->Z, &q->Z);
708   fe_add(&r->T, &trX, &trX);
709   fe_sub(&r->X, &trZ, &trY);
710   fe_add(&r->Y, &trZ, &trY);
711   fe_carry(&trZ, &r->T);
712   fe_sub(&r->Z, &trZ, &trT);
713   fe_add(&r->T, &trZ, &trT);
714 }
715 
equal(signed char b,signed char c)716 static uint8_t equal(signed char b, signed char c) {
717   uint8_t ub = b;
718   uint8_t uc = c;
719   uint8_t x = ub ^ uc;  // 0: yes; 1..255: no
720   uint32_t y = x;       // 0: yes; 1..255: no
721   y -= 1;               // 4294967295: yes; 0..254: no
722   y >>= 31;             // 1: yes; 0: no
723   return y;
724 }
725 
cmov(ge_precomp * t,const ge_precomp * u,uint8_t b)726 static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
727   fe_cmov(&t->yplusx, &u->yplusx, b);
728   fe_cmov(&t->yminusx, &u->yminusx, b);
729   fe_cmov(&t->xy2d, &u->xy2d, b);
730 }
731 
x25519_ge_scalarmult_small_precomp(ge_p3 * h,const uint8_t a[32],const uint8_t precomp_table[15* 2* 32])732 void x25519_ge_scalarmult_small_precomp(
733     ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
734   // precomp_table is first expanded into matching |ge_precomp|
735   // elements.
736   ge_precomp multiples[15];
737 
738   unsigned i;
739   for (i = 0; i < 15; i++) {
740     // The precomputed table is assumed to already clear the top bit, so
741     // |fe_frombytes_strict| may be used directly.
742     const uint8_t *bytes = &precomp_table[i*(2 * 32)];
743     fe x, y;
744     fe_frombytes_strict(&x, bytes);
745     fe_frombytes_strict(&y, bytes + 32);
746 
747     ge_precomp *out = &multiples[i];
748     fe_add(&out->yplusx, &y, &x);
749     fe_sub(&out->yminusx, &y, &x);
750     fe_mul_ltt(&out->xy2d, &x, &y);
751     fe_mul_llt(&out->xy2d, &out->xy2d, &d2);
752   }
753 
754   // See the comment above |k25519SmallPrecomp| about the structure of the
755   // precomputed elements. This loop does 64 additions and 64 doublings to
756   // calculate the result.
757   ge_p3_0(h);
758 
759   for (i = 63; i < 64; i--) {
760     unsigned j;
761     signed char index = 0;
762 
763     for (j = 0; j < 4; j++) {
764       const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
765       index |= (bit << j);
766     }
767 
768     ge_precomp e;
769     ge_precomp_0(&e);
770 
771     for (j = 1; j < 16; j++) {
772       cmov(&e, &multiples[j-1], equal(index, j));
773     }
774 
775     ge_cached cached;
776     ge_p1p1 r;
777     x25519_ge_p3_to_cached(&cached, h);
778     x25519_ge_add(&r, h, &cached);
779     x25519_ge_p1p1_to_p3(h, &r);
780 
781     ge_madd(&r, h, &e);
782     x25519_ge_p1p1_to_p3(h, &r);
783   }
784 }
785 
786 #if defined(OPENSSL_SMALL)
787 
x25519_ge_scalarmult_base(ge_p3 * h,const uint8_t a[32])788 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
789   x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
790 }
791 
792 #else
793 
negative(signed char b)794 static uint8_t negative(signed char b) {
795   uint32_t x = b;
796   x >>= 31;  // 1: yes; 0: no
797   return x;
798 }
799 
table_select(ge_precomp * t,int pos,signed char b)800 static void table_select(ge_precomp *t, int pos, signed char b) {
801   ge_precomp minust;
802   uint8_t bnegative = negative(b);
803   uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);
804 
805   ge_precomp_0(t);
806   cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
807   cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
808   cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
809   cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
810   cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
811   cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
812   cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
813   cmov(t, &k25519Precomp[pos][7], equal(babs, 8));
814   fe_copy_ll(&minust.yplusx, &t->yminusx);
815   fe_copy_ll(&minust.yminusx, &t->yplusx);
816 
817   // NOTE: the input table is canonical, but types don't encode it
818   fe tmp;
819   fe_carry(&tmp, &t->xy2d);
820   fe_neg(&minust.xy2d, &tmp);
821 
822   cmov(t, &minust, bnegative);
823 }
824 
825 // h = a * B
826 // where a = a[0]+256*a[1]+...+256^31 a[31]
827 // B is the Ed25519 base point (x,4/5) with x positive.
828 //
829 // Preconditions:
830 //   a[31] <= 127
x25519_ge_scalarmult_base(ge_p3 * h,const uint8_t * a)831 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t *a) {
832   signed char e[64];
833   signed char carry;
834   ge_p1p1 r;
835   ge_p2 s;
836   ge_precomp t;
837   int i;
838 
839   for (i = 0; i < 32; ++i) {
840     e[2 * i + 0] = (a[i] >> 0) & 15;
841     e[2 * i + 1] = (a[i] >> 4) & 15;
842   }
843   // each e[i] is between 0 and 15
844   // e[63] is between 0 and 7
845 
846   carry = 0;
847   for (i = 0; i < 63; ++i) {
848     e[i] += carry;
849     carry = e[i] + 8;
850     carry >>= 4;
851     e[i] -= carry << 4;
852   }
853   e[63] += carry;
854   // each e[i] is between -8 and 8
855 
856   ge_p3_0(h);
857   for (i = 1; i < 64; i += 2) {
858     table_select(&t, i / 2, e[i]);
859     ge_madd(&r, h, &t);
860     x25519_ge_p1p1_to_p3(h, &r);
861   }
862 
863   ge_p3_dbl(&r, h);
864   x25519_ge_p1p1_to_p2(&s, &r);
865   ge_p2_dbl(&r, &s);
866   x25519_ge_p1p1_to_p2(&s, &r);
867   ge_p2_dbl(&r, &s);
868   x25519_ge_p1p1_to_p2(&s, &r);
869   ge_p2_dbl(&r, &s);
870   x25519_ge_p1p1_to_p3(h, &r);
871 
872   for (i = 0; i < 64; i += 2) {
873     table_select(&t, i / 2, e[i]);
874     ge_madd(&r, h, &t);
875     x25519_ge_p1p1_to_p3(h, &r);
876   }
877 }
878 
879 #endif
880 
cmov_cached(ge_cached * t,ge_cached * u,uint8_t b)881 static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {
882   fe_cmov(&t->YplusX, &u->YplusX, b);
883   fe_cmov(&t->YminusX, &u->YminusX, b);
884   fe_cmov(&t->Z, &u->Z, b);
885   fe_cmov(&t->T2d, &u->T2d, b);
886 }
887 
888 // r = scalar * A.
889 // where a = a[0]+256*a[1]+...+256^31 a[31].
x25519_ge_scalarmult(ge_p2 * r,const uint8_t * scalar,const ge_p3 * A)890 void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
891   ge_p2 Ai_p2[8];
892   ge_cached Ai[16];
893   ge_p1p1 t;
894 
895   ge_cached_0(&Ai[0]);
896   x25519_ge_p3_to_cached(&Ai[1], A);
897   ge_p3_to_p2(&Ai_p2[1], A);
898 
899   unsigned i;
900   for (i = 2; i < 16; i += 2) {
901     ge_p2_dbl(&t, &Ai_p2[i / 2]);
902     ge_p1p1_to_cached(&Ai[i], &t);
903     if (i < 8) {
904       x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
905     }
906     x25519_ge_add(&t, A, &Ai[i]);
907     ge_p1p1_to_cached(&Ai[i + 1], &t);
908     if (i < 7) {
909       x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
910     }
911   }
912 
913   ge_p2_0(r);
914   ge_p3 u;
915 
916   for (i = 0; i < 256; i += 4) {
917     ge_p2_dbl(&t, r);
918     x25519_ge_p1p1_to_p2(r, &t);
919     ge_p2_dbl(&t, r);
920     x25519_ge_p1p1_to_p2(r, &t);
921     ge_p2_dbl(&t, r);
922     x25519_ge_p1p1_to_p2(r, &t);
923     ge_p2_dbl(&t, r);
924     x25519_ge_p1p1_to_p3(&u, &t);
925 
926     uint8_t index = scalar[31 - i/8];
927     index >>= 4 - (i & 4);
928     index &= 0xf;
929 
930     unsigned j;
931     ge_cached selected;
932     ge_cached_0(&selected);
933     for (j = 0; j < 16; j++) {
934       cmov_cached(&selected, &Ai[j], equal(j, index));
935     }
936 
937     x25519_ge_add(&t, &u, &selected);
938     x25519_ge_p1p1_to_p2(r, &t);
939   }
940 }
941 
slide(signed char * r,const uint8_t * a)942 static void slide(signed char *r, const uint8_t *a) {
943   int i;
944   int b;
945   int k;
946 
947   for (i = 0; i < 256; ++i) {
948     r[i] = 1 & (a[i >> 3] >> (i & 7));
949   }
950 
951   for (i = 0; i < 256; ++i) {
952     if (r[i]) {
953       for (b = 1; b <= 6 && i + b < 256; ++b) {
954         if (r[i + b]) {
955           if (r[i] + (r[i + b] << b) <= 15) {
956             r[i] += r[i + b] << b;
957             r[i + b] = 0;
958           } else if (r[i] - (r[i + b] << b) >= -15) {
959             r[i] -= r[i + b] << b;
960             for (k = i + b; k < 256; ++k) {
961               if (!r[k]) {
962                 r[k] = 1;
963                 break;
964               }
965               r[k] = 0;
966             }
967           } else {
968             break;
969           }
970         }
971       }
972     }
973   }
974 }
975 
976 // r = a * A + b * B
977 // where a = a[0]+256*a[1]+...+256^31 a[31].
978 // and b = b[0]+256*b[1]+...+256^31 b[31].
979 // B is the Ed25519 base point (x,4/5) with x positive.
ge_double_scalarmult_vartime(ge_p2 * r,const uint8_t * a,const ge_p3 * A,const uint8_t * b)980 static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
981                                          const ge_p3 *A, const uint8_t *b) {
982   signed char aslide[256];
983   signed char bslide[256];
984   ge_cached Ai[8];  // A,3A,5A,7A,9A,11A,13A,15A
985   ge_p1p1 t;
986   ge_p3 u;
987   ge_p3 A2;
988   int i;
989 
990   slide(aslide, a);
991   slide(bslide, b);
992 
993   x25519_ge_p3_to_cached(&Ai[0], A);
994   ge_p3_dbl(&t, A);
995   x25519_ge_p1p1_to_p3(&A2, &t);
996   x25519_ge_add(&t, &A2, &Ai[0]);
997   x25519_ge_p1p1_to_p3(&u, &t);
998   x25519_ge_p3_to_cached(&Ai[1], &u);
999   x25519_ge_add(&t, &A2, &Ai[1]);
1000   x25519_ge_p1p1_to_p3(&u, &t);
1001   x25519_ge_p3_to_cached(&Ai[2], &u);
1002   x25519_ge_add(&t, &A2, &Ai[2]);
1003   x25519_ge_p1p1_to_p3(&u, &t);
1004   x25519_ge_p3_to_cached(&Ai[3], &u);
1005   x25519_ge_add(&t, &A2, &Ai[3]);
1006   x25519_ge_p1p1_to_p3(&u, &t);
1007   x25519_ge_p3_to_cached(&Ai[4], &u);
1008   x25519_ge_add(&t, &A2, &Ai[4]);
1009   x25519_ge_p1p1_to_p3(&u, &t);
1010   x25519_ge_p3_to_cached(&Ai[5], &u);
1011   x25519_ge_add(&t, &A2, &Ai[5]);
1012   x25519_ge_p1p1_to_p3(&u, &t);
1013   x25519_ge_p3_to_cached(&Ai[6], &u);
1014   x25519_ge_add(&t, &A2, &Ai[6]);
1015   x25519_ge_p1p1_to_p3(&u, &t);
1016   x25519_ge_p3_to_cached(&Ai[7], &u);
1017 
1018   ge_p2_0(r);
1019 
1020   for (i = 255; i >= 0; --i) {
1021     if (aslide[i] || bslide[i]) {
1022       break;
1023     }
1024   }
1025 
1026   for (; i >= 0; --i) {
1027     ge_p2_dbl(&t, r);
1028 
1029     if (aslide[i] > 0) {
1030       x25519_ge_p1p1_to_p3(&u, &t);
1031       x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
1032     } else if (aslide[i] < 0) {
1033       x25519_ge_p1p1_to_p3(&u, &t);
1034       x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
1035     }
1036 
1037     if (bslide[i] > 0) {
1038       x25519_ge_p1p1_to_p3(&u, &t);
1039       ge_madd(&t, &u, &Bi[bslide[i] / 2]);
1040     } else if (bslide[i] < 0) {
1041       x25519_ge_p1p1_to_p3(&u, &t);
1042       ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
1043     }
1044 
1045     x25519_ge_p1p1_to_p2(r, &t);
1046   }
1047 }
1048 
1049 // int64_lshift21 returns |a << 21| but is defined when shifting bits into the
1050 // sign bit. This works around a language flaw in C.
int64_lshift21(int64_t a)1051 static inline int64_t int64_lshift21(int64_t a) {
1052   return (int64_t)((uint64_t)a << 21);
1053 }
1054 
1055 // The set of scalars is \Z/l
1056 // where l = 2^252 + 27742317777372353535851937790883648493.
1057 
1058 // Input:
1059 //   s[0]+256*s[1]+...+256^63*s[63] = s
1060 //
1061 // Output:
1062 //   s[0]+256*s[1]+...+256^31*s[31] = s mod l
1063 //   where l = 2^252 + 27742317777372353535851937790883648493.
1064 //   Overwrites s in place.
x25519_sc_reduce(uint8_t s[64])1065 void x25519_sc_reduce(uint8_t s[64]) {
1066   int64_t s0 = 2097151 & load_3(s);
1067   int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
1068   int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
1069   int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
1070   int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
1071   int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
1072   int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
1073   int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
1074   int64_t s8 = 2097151 & load_3(s + 21);
1075   int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
1076   int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
1077   int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
1078   int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
1079   int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
1080   int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
1081   int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
1082   int64_t s16 = 2097151 & load_3(s + 42);
1083   int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
1084   int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
1085   int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
1086   int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
1087   int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
1088   int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
1089   int64_t s23 = (load_4(s + 60) >> 3);
1090   int64_t carry0;
1091   int64_t carry1;
1092   int64_t carry2;
1093   int64_t carry3;
1094   int64_t carry4;
1095   int64_t carry5;
1096   int64_t carry6;
1097   int64_t carry7;
1098   int64_t carry8;
1099   int64_t carry9;
1100   int64_t carry10;
1101   int64_t carry11;
1102   int64_t carry12;
1103   int64_t carry13;
1104   int64_t carry14;
1105   int64_t carry15;
1106   int64_t carry16;
1107 
1108   s11 += s23 * 666643;
1109   s12 += s23 * 470296;
1110   s13 += s23 * 654183;
1111   s14 -= s23 * 997805;
1112   s15 += s23 * 136657;
1113   s16 -= s23 * 683901;
1114   s23 = 0;
1115 
1116   s10 += s22 * 666643;
1117   s11 += s22 * 470296;
1118   s12 += s22 * 654183;
1119   s13 -= s22 * 997805;
1120   s14 += s22 * 136657;
1121   s15 -= s22 * 683901;
1122   s22 = 0;
1123 
1124   s9 += s21 * 666643;
1125   s10 += s21 * 470296;
1126   s11 += s21 * 654183;
1127   s12 -= s21 * 997805;
1128   s13 += s21 * 136657;
1129   s14 -= s21 * 683901;
1130   s21 = 0;
1131 
1132   s8 += s20 * 666643;
1133   s9 += s20 * 470296;
1134   s10 += s20 * 654183;
1135   s11 -= s20 * 997805;
1136   s12 += s20 * 136657;
1137   s13 -= s20 * 683901;
1138   s20 = 0;
1139 
1140   s7 += s19 * 666643;
1141   s8 += s19 * 470296;
1142   s9 += s19 * 654183;
1143   s10 -= s19 * 997805;
1144   s11 += s19 * 136657;
1145   s12 -= s19 * 683901;
1146   s19 = 0;
1147 
1148   s6 += s18 * 666643;
1149   s7 += s18 * 470296;
1150   s8 += s18 * 654183;
1151   s9 -= s18 * 997805;
1152   s10 += s18 * 136657;
1153   s11 -= s18 * 683901;
1154   s18 = 0;
1155 
1156   carry6 = (s6 + (1 << 20)) >> 21;
1157   s7 += carry6;
1158   s6 -= int64_lshift21(carry6);
1159   carry8 = (s8 + (1 << 20)) >> 21;
1160   s9 += carry8;
1161   s8 -= int64_lshift21(carry8);
1162   carry10 = (s10 + (1 << 20)) >> 21;
1163   s11 += carry10;
1164   s10 -= int64_lshift21(carry10);
1165   carry12 = (s12 + (1 << 20)) >> 21;
1166   s13 += carry12;
1167   s12 -= int64_lshift21(carry12);
1168   carry14 = (s14 + (1 << 20)) >> 21;
1169   s15 += carry14;
1170   s14 -= int64_lshift21(carry14);
1171   carry16 = (s16 + (1 << 20)) >> 21;
1172   s17 += carry16;
1173   s16 -= int64_lshift21(carry16);
1174 
1175   carry7 = (s7 + (1 << 20)) >> 21;
1176   s8 += carry7;
1177   s7 -= int64_lshift21(carry7);
1178   carry9 = (s9 + (1 << 20)) >> 21;
1179   s10 += carry9;
1180   s9 -= int64_lshift21(carry9);
1181   carry11 = (s11 + (1 << 20)) >> 21;
1182   s12 += carry11;
1183   s11 -= int64_lshift21(carry11);
1184   carry13 = (s13 + (1 << 20)) >> 21;
1185   s14 += carry13;
1186   s13 -= int64_lshift21(carry13);
1187   carry15 = (s15 + (1 << 20)) >> 21;
1188   s16 += carry15;
1189   s15 -= int64_lshift21(carry15);
1190 
1191   s5 += s17 * 666643;
1192   s6 += s17 * 470296;
1193   s7 += s17 * 654183;
1194   s8 -= s17 * 997805;
1195   s9 += s17 * 136657;
1196   s10 -= s17 * 683901;
1197   s17 = 0;
1198 
1199   s4 += s16 * 666643;
1200   s5 += s16 * 470296;
1201   s6 += s16 * 654183;
1202   s7 -= s16 * 997805;
1203   s8 += s16 * 136657;
1204   s9 -= s16 * 683901;
1205   s16 = 0;
1206 
1207   s3 += s15 * 666643;
1208   s4 += s15 * 470296;
1209   s5 += s15 * 654183;
1210   s6 -= s15 * 997805;
1211   s7 += s15 * 136657;
1212   s8 -= s15 * 683901;
1213   s15 = 0;
1214 
1215   s2 += s14 * 666643;
1216   s3 += s14 * 470296;
1217   s4 += s14 * 654183;
1218   s5 -= s14 * 997805;
1219   s6 += s14 * 136657;
1220   s7 -= s14 * 683901;
1221   s14 = 0;
1222 
1223   s1 += s13 * 666643;
1224   s2 += s13 * 470296;
1225   s3 += s13 * 654183;
1226   s4 -= s13 * 997805;
1227   s5 += s13 * 136657;
1228   s6 -= s13 * 683901;
1229   s13 = 0;
1230 
1231   s0 += s12 * 666643;
1232   s1 += s12 * 470296;
1233   s2 += s12 * 654183;
1234   s3 -= s12 * 997805;
1235   s4 += s12 * 136657;
1236   s5 -= s12 * 683901;
1237   s12 = 0;
1238 
1239   carry0 = (s0 + (1 << 20)) >> 21;
1240   s1 += carry0;
1241   s0 -= int64_lshift21(carry0);
1242   carry2 = (s2 + (1 << 20)) >> 21;
1243   s3 += carry2;
1244   s2 -= int64_lshift21(carry2);
1245   carry4 = (s4 + (1 << 20)) >> 21;
1246   s5 += carry4;
1247   s4 -= int64_lshift21(carry4);
1248   carry6 = (s6 + (1 << 20)) >> 21;
1249   s7 += carry6;
1250   s6 -= int64_lshift21(carry6);
1251   carry8 = (s8 + (1 << 20)) >> 21;
1252   s9 += carry8;
1253   s8 -= int64_lshift21(carry8);
1254   carry10 = (s10 + (1 << 20)) >> 21;
1255   s11 += carry10;
1256   s10 -= int64_lshift21(carry10);
1257 
1258   carry1 = (s1 + (1 << 20)) >> 21;
1259   s2 += carry1;
1260   s1 -= int64_lshift21(carry1);
1261   carry3 = (s3 + (1 << 20)) >> 21;
1262   s4 += carry3;
1263   s3 -= int64_lshift21(carry3);
1264   carry5 = (s5 + (1 << 20)) >> 21;
1265   s6 += carry5;
1266   s5 -= int64_lshift21(carry5);
1267   carry7 = (s7 + (1 << 20)) >> 21;
1268   s8 += carry7;
1269   s7 -= int64_lshift21(carry7);
1270   carry9 = (s9 + (1 << 20)) >> 21;
1271   s10 += carry9;
1272   s9 -= int64_lshift21(carry9);
1273   carry11 = (s11 + (1 << 20)) >> 21;
1274   s12 += carry11;
1275   s11 -= int64_lshift21(carry11);
1276 
1277   s0 += s12 * 666643;
1278   s1 += s12 * 470296;
1279   s2 += s12 * 654183;
1280   s3 -= s12 * 997805;
1281   s4 += s12 * 136657;
1282   s5 -= s12 * 683901;
1283   s12 = 0;
1284 
1285   carry0 = s0 >> 21;
1286   s1 += carry0;
1287   s0 -= int64_lshift21(carry0);
1288   carry1 = s1 >> 21;
1289   s2 += carry1;
1290   s1 -= int64_lshift21(carry1);
1291   carry2 = s2 >> 21;
1292   s3 += carry2;
1293   s2 -= int64_lshift21(carry2);
1294   carry3 = s3 >> 21;
1295   s4 += carry3;
1296   s3 -= int64_lshift21(carry3);
1297   carry4 = s4 >> 21;
1298   s5 += carry4;
1299   s4 -= int64_lshift21(carry4);
1300   carry5 = s5 >> 21;
1301   s6 += carry5;
1302   s5 -= int64_lshift21(carry5);
1303   carry6 = s6 >> 21;
1304   s7 += carry6;
1305   s6 -= int64_lshift21(carry6);
1306   carry7 = s7 >> 21;
1307   s8 += carry7;
1308   s7 -= int64_lshift21(carry7);
1309   carry8 = s8 >> 21;
1310   s9 += carry8;
1311   s8 -= int64_lshift21(carry8);
1312   carry9 = s9 >> 21;
1313   s10 += carry9;
1314   s9 -= int64_lshift21(carry9);
1315   carry10 = s10 >> 21;
1316   s11 += carry10;
1317   s10 -= int64_lshift21(carry10);
1318   carry11 = s11 >> 21;
1319   s12 += carry11;
1320   s11 -= int64_lshift21(carry11);
1321 
1322   s0 += s12 * 666643;
1323   s1 += s12 * 470296;
1324   s2 += s12 * 654183;
1325   s3 -= s12 * 997805;
1326   s4 += s12 * 136657;
1327   s5 -= s12 * 683901;
1328   s12 = 0;
1329 
1330   carry0 = s0 >> 21;
1331   s1 += carry0;
1332   s0 -= int64_lshift21(carry0);
1333   carry1 = s1 >> 21;
1334   s2 += carry1;
1335   s1 -= int64_lshift21(carry1);
1336   carry2 = s2 >> 21;
1337   s3 += carry2;
1338   s2 -= int64_lshift21(carry2);
1339   carry3 = s3 >> 21;
1340   s4 += carry3;
1341   s3 -= int64_lshift21(carry3);
1342   carry4 = s4 >> 21;
1343   s5 += carry4;
1344   s4 -= int64_lshift21(carry4);
1345   carry5 = s5 >> 21;
1346   s6 += carry5;
1347   s5 -= int64_lshift21(carry5);
1348   carry6 = s6 >> 21;
1349   s7 += carry6;
1350   s6 -= int64_lshift21(carry6);
1351   carry7 = s7 >> 21;
1352   s8 += carry7;
1353   s7 -= int64_lshift21(carry7);
1354   carry8 = s8 >> 21;
1355   s9 += carry8;
1356   s8 -= int64_lshift21(carry8);
1357   carry9 = s9 >> 21;
1358   s10 += carry9;
1359   s9 -= int64_lshift21(carry9);
1360   carry10 = s10 >> 21;
1361   s11 += carry10;
1362   s10 -= int64_lshift21(carry10);
1363 
1364   s[0] = s0 >> 0;
1365   s[1] = s0 >> 8;
1366   s[2] = (s0 >> 16) | (s1 << 5);
1367   s[3] = s1 >> 3;
1368   s[4] = s1 >> 11;
1369   s[5] = (s1 >> 19) | (s2 << 2);
1370   s[6] = s2 >> 6;
1371   s[7] = (s2 >> 14) | (s3 << 7);
1372   s[8] = s3 >> 1;
1373   s[9] = s3 >> 9;
1374   s[10] = (s3 >> 17) | (s4 << 4);
1375   s[11] = s4 >> 4;
1376   s[12] = s4 >> 12;
1377   s[13] = (s4 >> 20) | (s5 << 1);
1378   s[14] = s5 >> 7;
1379   s[15] = (s5 >> 15) | (s6 << 6);
1380   s[16] = s6 >> 2;
1381   s[17] = s6 >> 10;
1382   s[18] = (s6 >> 18) | (s7 << 3);
1383   s[19] = s7 >> 5;
1384   s[20] = s7 >> 13;
1385   s[21] = s8 >> 0;
1386   s[22] = s8 >> 8;
1387   s[23] = (s8 >> 16) | (s9 << 5);
1388   s[24] = s9 >> 3;
1389   s[25] = s9 >> 11;
1390   s[26] = (s9 >> 19) | (s10 << 2);
1391   s[27] = s10 >> 6;
1392   s[28] = (s10 >> 14) | (s11 << 7);
1393   s[29] = s11 >> 1;
1394   s[30] = s11 >> 9;
1395   s[31] = s11 >> 17;
1396 }
1397 
1398 // Input:
1399 //   a[0]+256*a[1]+...+256^31*a[31] = a
1400 //   b[0]+256*b[1]+...+256^31*b[31] = b
1401 //   c[0]+256*c[1]+...+256^31*c[31] = c
1402 //
1403 // Output:
1404 //   s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
1405 //   where l = 2^252 + 27742317777372353535851937790883648493.
sc_muladd(uint8_t * s,const uint8_t * a,const uint8_t * b,const uint8_t * c)1406 static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b,
1407                       const uint8_t *c) {
1408   int64_t a0 = 2097151 & load_3(a);
1409   int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
1410   int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
1411   int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
1412   int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
1413   int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
1414   int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
1415   int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
1416   int64_t a8 = 2097151 & load_3(a + 21);
1417   int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
1418   int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
1419   int64_t a11 = (load_4(a + 28) >> 7);
1420   int64_t b0 = 2097151 & load_3(b);
1421   int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
1422   int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
1423   int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
1424   int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
1425   int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
1426   int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
1427   int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
1428   int64_t b8 = 2097151 & load_3(b + 21);
1429   int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
1430   int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
1431   int64_t b11 = (load_4(b + 28) >> 7);
1432   int64_t c0 = 2097151 & load_3(c);
1433   int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
1434   int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
1435   int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
1436   int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
1437   int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
1438   int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
1439   int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
1440   int64_t c8 = 2097151 & load_3(c + 21);
1441   int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
1442   int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
1443   int64_t c11 = (load_4(c + 28) >> 7);
1444   int64_t s0;
1445   int64_t s1;
1446   int64_t s2;
1447   int64_t s3;
1448   int64_t s4;
1449   int64_t s5;
1450   int64_t s6;
1451   int64_t s7;
1452   int64_t s8;
1453   int64_t s9;
1454   int64_t s10;
1455   int64_t s11;
1456   int64_t s12;
1457   int64_t s13;
1458   int64_t s14;
1459   int64_t s15;
1460   int64_t s16;
1461   int64_t s17;
1462   int64_t s18;
1463   int64_t s19;
1464   int64_t s20;
1465   int64_t s21;
1466   int64_t s22;
1467   int64_t s23;
1468   int64_t carry0;
1469   int64_t carry1;
1470   int64_t carry2;
1471   int64_t carry3;
1472   int64_t carry4;
1473   int64_t carry5;
1474   int64_t carry6;
1475   int64_t carry7;
1476   int64_t carry8;
1477   int64_t carry9;
1478   int64_t carry10;
1479   int64_t carry11;
1480   int64_t carry12;
1481   int64_t carry13;
1482   int64_t carry14;
1483   int64_t carry15;
1484   int64_t carry16;
1485   int64_t carry17;
1486   int64_t carry18;
1487   int64_t carry19;
1488   int64_t carry20;
1489   int64_t carry21;
1490   int64_t carry22;
1491 
1492   s0 = c0 + a0 * b0;
1493   s1 = c1 + a0 * b1 + a1 * b0;
1494   s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0;
1495   s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
1496   s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
1497   s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
1498   s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0;
1499   s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 +
1500        a6 * b1 + a7 * b0;
1501   s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 +
1502        a6 * b2 + a7 * b1 + a8 * b0;
1503   s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 +
1504        a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0;
1505   s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 +
1506         a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0;
1507   s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 +
1508         a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0;
1509   s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 +
1510         a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1;
1511   s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 +
1512         a9 * b4 + a10 * b3 + a11 * b2;
1513   s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 +
1514         a10 * b4 + a11 * b3;
1515   s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 +
1516         a11 * b4;
1517   s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5;
1518   s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6;
1519   s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7;
1520   s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8;
1521   s20 = a9 * b11 + a10 * b10 + a11 * b9;
1522   s21 = a10 * b11 + a11 * b10;
1523   s22 = a11 * b11;
1524   s23 = 0;
1525 
1526   carry0 = (s0 + (1 << 20)) >> 21;
1527   s1 += carry0;
1528   s0 -= int64_lshift21(carry0);
1529   carry2 = (s2 + (1 << 20)) >> 21;
1530   s3 += carry2;
1531   s2 -= int64_lshift21(carry2);
1532   carry4 = (s4 + (1 << 20)) >> 21;
1533   s5 += carry4;
1534   s4 -= int64_lshift21(carry4);
1535   carry6 = (s6 + (1 << 20)) >> 21;
1536   s7 += carry6;
1537   s6 -= int64_lshift21(carry6);
1538   carry8 = (s8 + (1 << 20)) >> 21;
1539   s9 += carry8;
1540   s8 -= int64_lshift21(carry8);
1541   carry10 = (s10 + (1 << 20)) >> 21;
1542   s11 += carry10;
1543   s10 -= int64_lshift21(carry10);
1544   carry12 = (s12 + (1 << 20)) >> 21;
1545   s13 += carry12;
1546   s12 -= int64_lshift21(carry12);
1547   carry14 = (s14 + (1 << 20)) >> 21;
1548   s15 += carry14;
1549   s14 -= int64_lshift21(carry14);
1550   carry16 = (s16 + (1 << 20)) >> 21;
1551   s17 += carry16;
1552   s16 -= int64_lshift21(carry16);
1553   carry18 = (s18 + (1 << 20)) >> 21;
1554   s19 += carry18;
1555   s18 -= int64_lshift21(carry18);
1556   carry20 = (s20 + (1 << 20)) >> 21;
1557   s21 += carry20;
1558   s20 -= int64_lshift21(carry20);
1559   carry22 = (s22 + (1 << 20)) >> 21;
1560   s23 += carry22;
1561   s22 -= int64_lshift21(carry22);
1562 
1563   carry1 = (s1 + (1 << 20)) >> 21;
1564   s2 += carry1;
1565   s1 -= int64_lshift21(carry1);
1566   carry3 = (s3 + (1 << 20)) >> 21;
1567   s4 += carry3;
1568   s3 -= int64_lshift21(carry3);
1569   carry5 = (s5 + (1 << 20)) >> 21;
1570   s6 += carry5;
1571   s5 -= int64_lshift21(carry5);
1572   carry7 = (s7 + (1 << 20)) >> 21;
1573   s8 += carry7;
1574   s7 -= int64_lshift21(carry7);
1575   carry9 = (s9 + (1 << 20)) >> 21;
1576   s10 += carry9;
1577   s9 -= int64_lshift21(carry9);
1578   carry11 = (s11 + (1 << 20)) >> 21;
1579   s12 += carry11;
1580   s11 -= int64_lshift21(carry11);
1581   carry13 = (s13 + (1 << 20)) >> 21;
1582   s14 += carry13;
1583   s13 -= int64_lshift21(carry13);
1584   carry15 = (s15 + (1 << 20)) >> 21;
1585   s16 += carry15;
1586   s15 -= int64_lshift21(carry15);
1587   carry17 = (s17 + (1 << 20)) >> 21;
1588   s18 += carry17;
1589   s17 -= int64_lshift21(carry17);
1590   carry19 = (s19 + (1 << 20)) >> 21;
1591   s20 += carry19;
1592   s19 -= int64_lshift21(carry19);
1593   carry21 = (s21 + (1 << 20)) >> 21;
1594   s22 += carry21;
1595   s21 -= int64_lshift21(carry21);
1596 
1597   s11 += s23 * 666643;
1598   s12 += s23 * 470296;
1599   s13 += s23 * 654183;
1600   s14 -= s23 * 997805;
1601   s15 += s23 * 136657;
1602   s16 -= s23 * 683901;
1603   s23 = 0;
1604 
1605   s10 += s22 * 666643;
1606   s11 += s22 * 470296;
1607   s12 += s22 * 654183;
1608   s13 -= s22 * 997805;
1609   s14 += s22 * 136657;
1610   s15 -= s22 * 683901;
1611   s22 = 0;
1612 
1613   s9 += s21 * 666643;
1614   s10 += s21 * 470296;
1615   s11 += s21 * 654183;
1616   s12 -= s21 * 997805;
1617   s13 += s21 * 136657;
1618   s14 -= s21 * 683901;
1619   s21 = 0;
1620 
1621   s8 += s20 * 666643;
1622   s9 += s20 * 470296;
1623   s10 += s20 * 654183;
1624   s11 -= s20 * 997805;
1625   s12 += s20 * 136657;
1626   s13 -= s20 * 683901;
1627   s20 = 0;
1628 
1629   s7 += s19 * 666643;
1630   s8 += s19 * 470296;
1631   s9 += s19 * 654183;
1632   s10 -= s19 * 997805;
1633   s11 += s19 * 136657;
1634   s12 -= s19 * 683901;
1635   s19 = 0;
1636 
1637   s6 += s18 * 666643;
1638   s7 += s18 * 470296;
1639   s8 += s18 * 654183;
1640   s9 -= s18 * 997805;
1641   s10 += s18 * 136657;
1642   s11 -= s18 * 683901;
1643   s18 = 0;
1644 
1645   carry6 = (s6 + (1 << 20)) >> 21;
1646   s7 += carry6;
1647   s6 -= int64_lshift21(carry6);
1648   carry8 = (s8 + (1 << 20)) >> 21;
1649   s9 += carry8;
1650   s8 -= int64_lshift21(carry8);
1651   carry10 = (s10 + (1 << 20)) >> 21;
1652   s11 += carry10;
1653   s10 -= int64_lshift21(carry10);
1654   carry12 = (s12 + (1 << 20)) >> 21;
1655   s13 += carry12;
1656   s12 -= int64_lshift21(carry12);
1657   carry14 = (s14 + (1 << 20)) >> 21;
1658   s15 += carry14;
1659   s14 -= int64_lshift21(carry14);
1660   carry16 = (s16 + (1 << 20)) >> 21;
1661   s17 += carry16;
1662   s16 -= int64_lshift21(carry16);
1663 
1664   carry7 = (s7 + (1 << 20)) >> 21;
1665   s8 += carry7;
1666   s7 -= int64_lshift21(carry7);
1667   carry9 = (s9 + (1 << 20)) >> 21;
1668   s10 += carry9;
1669   s9 -= int64_lshift21(carry9);
1670   carry11 = (s11 + (1 << 20)) >> 21;
1671   s12 += carry11;
1672   s11 -= int64_lshift21(carry11);
1673   carry13 = (s13 + (1 << 20)) >> 21;
1674   s14 += carry13;
1675   s13 -= int64_lshift21(carry13);
1676   carry15 = (s15 + (1 << 20)) >> 21;
1677   s16 += carry15;
1678   s15 -= int64_lshift21(carry15);
1679 
1680   s5 += s17 * 666643;
1681   s6 += s17 * 470296;
1682   s7 += s17 * 654183;
1683   s8 -= s17 * 997805;
1684   s9 += s17 * 136657;
1685   s10 -= s17 * 683901;
1686   s17 = 0;
1687 
1688   s4 += s16 * 666643;
1689   s5 += s16 * 470296;
1690   s6 += s16 * 654183;
1691   s7 -= s16 * 997805;
1692   s8 += s16 * 136657;
1693   s9 -= s16 * 683901;
1694   s16 = 0;
1695 
1696   s3 += s15 * 666643;
1697   s4 += s15 * 470296;
1698   s5 += s15 * 654183;
1699   s6 -= s15 * 997805;
1700   s7 += s15 * 136657;
1701   s8 -= s15 * 683901;
1702   s15 = 0;
1703 
1704   s2 += s14 * 666643;
1705   s3 += s14 * 470296;
1706   s4 += s14 * 654183;
1707   s5 -= s14 * 997805;
1708   s6 += s14 * 136657;
1709   s7 -= s14 * 683901;
1710   s14 = 0;
1711 
1712   s1 += s13 * 666643;
1713   s2 += s13 * 470296;
1714   s3 += s13 * 654183;
1715   s4 -= s13 * 997805;
1716   s5 += s13 * 136657;
1717   s6 -= s13 * 683901;
1718   s13 = 0;
1719 
1720   s0 += s12 * 666643;
1721   s1 += s12 * 470296;
1722   s2 += s12 * 654183;
1723   s3 -= s12 * 997805;
1724   s4 += s12 * 136657;
1725   s5 -= s12 * 683901;
1726   s12 = 0;
1727 
1728   carry0 = (s0 + (1 << 20)) >> 21;
1729   s1 += carry0;
1730   s0 -= int64_lshift21(carry0);
1731   carry2 = (s2 + (1 << 20)) >> 21;
1732   s3 += carry2;
1733   s2 -= int64_lshift21(carry2);
1734   carry4 = (s4 + (1 << 20)) >> 21;
1735   s5 += carry4;
1736   s4 -= int64_lshift21(carry4);
1737   carry6 = (s6 + (1 << 20)) >> 21;
1738   s7 += carry6;
1739   s6 -= int64_lshift21(carry6);
1740   carry8 = (s8 + (1 << 20)) >> 21;
1741   s9 += carry8;
1742   s8 -= int64_lshift21(carry8);
1743   carry10 = (s10 + (1 << 20)) >> 21;
1744   s11 += carry10;
1745   s10 -= int64_lshift21(carry10);
1746 
1747   carry1 = (s1 + (1 << 20)) >> 21;
1748   s2 += carry1;
1749   s1 -= int64_lshift21(carry1);
1750   carry3 = (s3 + (1 << 20)) >> 21;
1751   s4 += carry3;
1752   s3 -= int64_lshift21(carry3);
1753   carry5 = (s5 + (1 << 20)) >> 21;
1754   s6 += carry5;
1755   s5 -= int64_lshift21(carry5);
1756   carry7 = (s7 + (1 << 20)) >> 21;
1757   s8 += carry7;
1758   s7 -= int64_lshift21(carry7);
1759   carry9 = (s9 + (1 << 20)) >> 21;
1760   s10 += carry9;
1761   s9 -= int64_lshift21(carry9);
1762   carry11 = (s11 + (1 << 20)) >> 21;
1763   s12 += carry11;
1764   s11 -= int64_lshift21(carry11);
1765 
1766   s0 += s12 * 666643;
1767   s1 += s12 * 470296;
1768   s2 += s12 * 654183;
1769   s3 -= s12 * 997805;
1770   s4 += s12 * 136657;
1771   s5 -= s12 * 683901;
1772   s12 = 0;
1773 
1774   carry0 = s0 >> 21;
1775   s1 += carry0;
1776   s0 -= int64_lshift21(carry0);
1777   carry1 = s1 >> 21;
1778   s2 += carry1;
1779   s1 -= int64_lshift21(carry1);
1780   carry2 = s2 >> 21;
1781   s3 += carry2;
1782   s2 -= int64_lshift21(carry2);
1783   carry3 = s3 >> 21;
1784   s4 += carry3;
1785   s3 -= int64_lshift21(carry3);
1786   carry4 = s4 >> 21;
1787   s5 += carry4;
1788   s4 -= int64_lshift21(carry4);
1789   carry5 = s5 >> 21;
1790   s6 += carry5;
1791   s5 -= int64_lshift21(carry5);
1792   carry6 = s6 >> 21;
1793   s7 += carry6;
1794   s6 -= int64_lshift21(carry6);
1795   carry7 = s7 >> 21;
1796   s8 += carry7;
1797   s7 -= int64_lshift21(carry7);
1798   carry8 = s8 >> 21;
1799   s9 += carry8;
1800   s8 -= int64_lshift21(carry8);
1801   carry9 = s9 >> 21;
1802   s10 += carry9;
1803   s9 -= int64_lshift21(carry9);
1804   carry10 = s10 >> 21;
1805   s11 += carry10;
1806   s10 -= int64_lshift21(carry10);
1807   carry11 = s11 >> 21;
1808   s12 += carry11;
1809   s11 -= int64_lshift21(carry11);
1810 
1811   s0 += s12 * 666643;
1812   s1 += s12 * 470296;
1813   s2 += s12 * 654183;
1814   s3 -= s12 * 997805;
1815   s4 += s12 * 136657;
1816   s5 -= s12 * 683901;
1817   s12 = 0;
1818 
1819   carry0 = s0 >> 21;
1820   s1 += carry0;
1821   s0 -= int64_lshift21(carry0);
1822   carry1 = s1 >> 21;
1823   s2 += carry1;
1824   s1 -= int64_lshift21(carry1);
1825   carry2 = s2 >> 21;
1826   s3 += carry2;
1827   s2 -= int64_lshift21(carry2);
1828   carry3 = s3 >> 21;
1829   s4 += carry3;
1830   s3 -= int64_lshift21(carry3);
1831   carry4 = s4 >> 21;
1832   s5 += carry4;
1833   s4 -= int64_lshift21(carry4);
1834   carry5 = s5 >> 21;
1835   s6 += carry5;
1836   s5 -= int64_lshift21(carry5);
1837   carry6 = s6 >> 21;
1838   s7 += carry6;
1839   s6 -= int64_lshift21(carry6);
1840   carry7 = s7 >> 21;
1841   s8 += carry7;
1842   s7 -= int64_lshift21(carry7);
1843   carry8 = s8 >> 21;
1844   s9 += carry8;
1845   s8 -= int64_lshift21(carry8);
1846   carry9 = s9 >> 21;
1847   s10 += carry9;
1848   s9 -= int64_lshift21(carry9);
1849   carry10 = s10 >> 21;
1850   s11 += carry10;
1851   s10 -= int64_lshift21(carry10);
1852 
1853   s[0] = s0 >> 0;
1854   s[1] = s0 >> 8;
1855   s[2] = (s0 >> 16) | (s1 << 5);
1856   s[3] = s1 >> 3;
1857   s[4] = s1 >> 11;
1858   s[5] = (s1 >> 19) | (s2 << 2);
1859   s[6] = s2 >> 6;
1860   s[7] = (s2 >> 14) | (s3 << 7);
1861   s[8] = s3 >> 1;
1862   s[9] = s3 >> 9;
1863   s[10] = (s3 >> 17) | (s4 << 4);
1864   s[11] = s4 >> 4;
1865   s[12] = s4 >> 12;
1866   s[13] = (s4 >> 20) | (s5 << 1);
1867   s[14] = s5 >> 7;
1868   s[15] = (s5 >> 15) | (s6 << 6);
1869   s[16] = s6 >> 2;
1870   s[17] = s6 >> 10;
1871   s[18] = (s6 >> 18) | (s7 << 3);
1872   s[19] = s7 >> 5;
1873   s[20] = s7 >> 13;
1874   s[21] = s8 >> 0;
1875   s[22] = s8 >> 8;
1876   s[23] = (s8 >> 16) | (s9 << 5);
1877   s[24] = s9 >> 3;
1878   s[25] = s9 >> 11;
1879   s[26] = (s9 >> 19) | (s10 << 2);
1880   s[27] = s10 >> 6;
1881   s[28] = (s10 >> 14) | (s11 << 7);
1882   s[29] = s11 >> 1;
1883   s[30] = s11 >> 9;
1884   s[31] = s11 >> 17;
1885 }
1886 
ED25519_keypair(uint8_t out_public_key[32],uint8_t out_private_key[64])1887 void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) {
1888   uint8_t seed[32];
1889   RAND_bytes(seed, 32);
1890   ED25519_keypair_from_seed(out_public_key, out_private_key, seed);
1891 }
1892 
ED25519_sign(uint8_t out_sig[64],const uint8_t * message,size_t message_len,const uint8_t private_key[64])1893 int ED25519_sign(uint8_t out_sig[64], const uint8_t *message,
1894                  size_t message_len, const uint8_t private_key[64]) {
1895   // NOTE: The documentation on this function says that it returns zero on
1896   // allocation failure. While that can't happen with the current
1897   // implementation, we want to reserve the ability to allocate in this
1898   // implementation in the future.
1899 
1900   uint8_t az[SHA512_DIGEST_LENGTH];
1901   SHA512(private_key, 32, az);
1902 
1903   az[0] &= 248;
1904   az[31] &= 63;
1905   az[31] |= 64;
1906 
1907   SHA512_CTX hash_ctx;
1908   SHA512_Init(&hash_ctx);
1909   SHA512_Update(&hash_ctx, az + 32, 32);
1910   SHA512_Update(&hash_ctx, message, message_len);
1911   uint8_t nonce[SHA512_DIGEST_LENGTH];
1912   SHA512_Final(nonce, &hash_ctx);
1913 
1914   x25519_sc_reduce(nonce);
1915   ge_p3 R;
1916   x25519_ge_scalarmult_base(&R, nonce);
1917   ge_p3_tobytes(out_sig, &R);
1918 
1919   SHA512_Init(&hash_ctx);
1920   SHA512_Update(&hash_ctx, out_sig, 32);
1921   SHA512_Update(&hash_ctx, private_key + 32, 32);
1922   SHA512_Update(&hash_ctx, message, message_len);
1923   uint8_t hram[SHA512_DIGEST_LENGTH];
1924   SHA512_Final(hram, &hash_ctx);
1925 
1926   x25519_sc_reduce(hram);
1927   sc_muladd(out_sig + 32, hram, az, nonce);
1928 
1929   return 1;
1930 }
1931 
ED25519_verify(const uint8_t * message,size_t message_len,const uint8_t signature[64],const uint8_t public_key[32])1932 int ED25519_verify(const uint8_t *message, size_t message_len,
1933                    const uint8_t signature[64], const uint8_t public_key[32]) {
1934   ge_p3 A;
1935   if ((signature[63] & 224) != 0 ||
1936       !x25519_ge_frombytes_vartime(&A, public_key)) {
1937     return 0;
1938   }
1939 
1940   fe_loose t;
1941   fe_neg(&t, &A.X);
1942   fe_carry(&A.X, &t);
1943   fe_neg(&t, &A.T);
1944   fe_carry(&A.T, &t);
1945 
1946   uint8_t pkcopy[32];
1947   OPENSSL_memcpy(pkcopy, public_key, 32);
1948   uint8_t rcopy[32];
1949   OPENSSL_memcpy(rcopy, signature, 32);
1950   union {
1951     uint64_t u64[4];
1952     uint8_t u8[32];
1953   } scopy;
1954   OPENSSL_memcpy(&scopy.u8[0], signature + 32, 32);
1955 
1956   // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in
1957   // the range [0, order) in order to prevent signature malleability.
1958 
1959   // kOrder is the order of Curve25519 in little-endian form.
1960   static const uint64_t kOrder[4] = {
1961     UINT64_C(0x5812631a5cf5d3ed),
1962     UINT64_C(0x14def9dea2f79cd6),
1963     0,
1964     UINT64_C(0x1000000000000000),
1965   };
1966   for (size_t i = 3;; i--) {
1967     if (scopy.u64[i] > kOrder[i]) {
1968       return 0;
1969     } else if (scopy.u64[i] < kOrder[i]) {
1970       break;
1971     } else if (i == 0) {
1972       return 0;
1973     }
1974   }
1975 
1976   SHA512_CTX hash_ctx;
1977   SHA512_Init(&hash_ctx);
1978   SHA512_Update(&hash_ctx, signature, 32);
1979   SHA512_Update(&hash_ctx, public_key, 32);
1980   SHA512_Update(&hash_ctx, message, message_len);
1981   uint8_t h[SHA512_DIGEST_LENGTH];
1982   SHA512_Final(h, &hash_ctx);
1983 
1984   x25519_sc_reduce(h);
1985 
1986   ge_p2 R;
1987   ge_double_scalarmult_vartime(&R, h, &A, scopy.u8);
1988 
1989   uint8_t rcheck[32];
1990   x25519_ge_tobytes(rcheck, &R);
1991 
1992   return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0;
1993 }
1994 
ED25519_keypair_from_seed(uint8_t out_public_key[32],uint8_t out_private_key[64],const uint8_t seed[32])1995 void ED25519_keypair_from_seed(uint8_t out_public_key[32],
1996                                uint8_t out_private_key[64],
1997                                const uint8_t seed[32]) {
1998   uint8_t az[SHA512_DIGEST_LENGTH];
1999   SHA512(seed, 32, az);
2000 
2001   az[0] &= 248;
2002   az[31] &= 127;
2003   az[31] |= 64;
2004 
2005   ge_p3 A;
2006   x25519_ge_scalarmult_base(&A, az);
2007   ge_p3_tobytes(out_public_key, &A);
2008 
2009   OPENSSL_memcpy(out_private_key, seed, 32);
2010   OPENSSL_memcpy(out_private_key + 32, out_public_key, 32);
2011 }
2012 
2013 
x25519_scalar_mult_generic(uint8_t out[32],const uint8_t scalar[32],const uint8_t point[32])2014 static void x25519_scalar_mult_generic(uint8_t out[32],
2015                                        const uint8_t scalar[32],
2016                                        const uint8_t point[32]) {
2017   fe x1, x2, z2, x3, z3, tmp0, tmp1;
2018   fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
2019 
2020   uint8_t e[32];
2021   OPENSSL_memcpy(e, scalar, 32);
2022   e[0] &= 248;
2023   e[31] &= 127;
2024   e[31] |= 64;
2025 
2026   // The following implementation was transcribed to Coq and proven to
2027   // correspond to unary scalar multiplication in affine coordinates given that
2028   // x1 != 0 is the x coordinate of some point on the curve. It was also checked
2029   // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
2030   // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
2031   // underlying field, so it applies to Curve25519 itself and the quadratic
2032   // twist of Curve25519. It was not proven in Coq that prime-field arithmetic
2033   // correctly simulates extension-field arithmetic on prime-field values.
2034   // The decoding of the byte array representation of e was not considered.
2035   // Specification of Montgomery curves in affine coordinates:
2036   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
2037   // Proof that these form a group that is isomorphic to a Weierstrass curve:
2038   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
2039   // Coq transcription and correctness proof of the loop (where scalarbits=255):
2040   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
2041   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
2042   // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
2043   fe_frombytes(&x1, point);
2044   fe_1(&x2);
2045   fe_0(&z2);
2046   fe_copy(&x3, &x1);
2047   fe_1(&z3);
2048 
2049   unsigned swap = 0;
2050   int pos;
2051   for (pos = 254; pos >= 0; --pos) {
2052     // loop invariant as of right before the test, for the case where x1 != 0:
2053     //   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
2054     //   let r := e >> (pos+1) in the following equalities of projective points:
2055     //   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
2056     //   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
2057     //   x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
2058     unsigned b = 1 & (e[pos / 8] >> (pos & 7));
2059     swap ^= b;
2060     fe_cswap(&x2, &x3, swap);
2061     fe_cswap(&z2, &z3, swap);
2062     swap = b;
2063     // Coq transcription of ladderstep formula (called from transcribed loop):
2064     // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
2065     // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
2066     // x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
2067     // x1  = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
2068     fe_sub(&tmp0l, &x3, &z3);
2069     fe_sub(&tmp1l, &x2, &z2);
2070     fe_add(&x2l, &x2, &z2);
2071     fe_add(&z2l, &x3, &z3);
2072     fe_mul_tll(&z3, &tmp0l, &x2l);
2073     fe_mul_tll(&z2, &z2l, &tmp1l);
2074     fe_sq_tl(&tmp0, &tmp1l);
2075     fe_sq_tl(&tmp1, &x2l);
2076     fe_add(&x3l, &z3, &z2);
2077     fe_sub(&z2l, &z3, &z2);
2078     fe_mul_ttt(&x2, &tmp1, &tmp0);
2079     fe_sub(&tmp1l, &tmp1, &tmp0);
2080     fe_sq_tl(&z2, &z2l);
2081     fe_mul121666(&z3, &tmp1l);
2082     fe_sq_tl(&x3, &x3l);
2083     fe_add(&tmp0l, &tmp0, &z3);
2084     fe_mul_ttt(&z3, &x1, &z2);
2085     fe_mul_tll(&z2, &tmp1l, &tmp0l);
2086   }
2087   // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
2088   fe_cswap(&x2, &x3, swap);
2089   fe_cswap(&z2, &z3, swap);
2090 
2091   fe_invert(&z2, &z2);
2092   fe_mul_ttt(&x2, &x2, &z2);
2093   fe_tobytes(out, &x2);
2094 }
2095 
x25519_scalar_mult(uint8_t out[32],const uint8_t scalar[32],const uint8_t point[32])2096 static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
2097                                const uint8_t point[32]) {
2098 #if defined(BORINGSSL_X25519_NEON)
2099   if (CRYPTO_is_NEON_capable()) {
2100     x25519_NEON(out, scalar, point);
2101     return;
2102   }
2103 #endif
2104 
2105   x25519_scalar_mult_generic(out, scalar, point);
2106 }
2107 
X25519_keypair(uint8_t out_public_value[32],uint8_t out_private_key[32])2108 void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) {
2109   RAND_bytes(out_private_key, 32);
2110 
2111   // All X25519 implementations should decode scalars correctly (see
2112   // https://tools.ietf.org/html/rfc7748#section-5). However, if an
2113   // implementation doesn't then it might interoperate with random keys a
2114   // fraction of the time because they'll, randomly, happen to be correctly
2115   // formed.
2116   //
2117   // Thus we do the opposite of the masking here to make sure that our private
2118   // keys are never correctly masked and so, hopefully, any incorrect
2119   // implementations are deterministically broken.
2120   //
2121   // This does not affect security because, although we're throwing away
2122   // entropy, a valid implementation of scalarmult should throw away the exact
2123   // same bits anyway.
2124   out_private_key[0] |= ~248;
2125   out_private_key[31] &= ~64;
2126   out_private_key[31] |= ~127;
2127 
2128   X25519_public_from_private(out_public_value, out_private_key);
2129 }
2130 
X25519(uint8_t out_shared_key[32],const uint8_t private_key[32],const uint8_t peer_public_value[32])2131 int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
2132            const uint8_t peer_public_value[32]) {
2133   static const uint8_t kZeros[32] = {0};
2134   x25519_scalar_mult(out_shared_key, private_key, peer_public_value);
2135   // The all-zero output results when the input is a point of small order.
2136   return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0;
2137 }
2138 
X25519_public_from_private(uint8_t out_public_value[32],const uint8_t private_key[32])2139 void X25519_public_from_private(uint8_t out_public_value[32],
2140                                 const uint8_t private_key[32]) {
2141 #if defined(BORINGSSL_X25519_NEON)
2142   if (CRYPTO_is_NEON_capable()) {
2143     static const uint8_t kMongomeryBasePoint[32] = {9};
2144     x25519_NEON(out_public_value, private_key, kMongomeryBasePoint);
2145     return;
2146   }
2147 #endif
2148 
2149   uint8_t e[32];
2150   OPENSSL_memcpy(e, private_key, 32);
2151   e[0] &= 248;
2152   e[31] &= 127;
2153   e[31] |= 64;
2154 
2155   ge_p3 A;
2156   x25519_ge_scalarmult_base(&A, e);
2157 
2158   // We only need the u-coordinate of the curve25519 point. The map is
2159   // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y).
2160   fe_loose zplusy, zminusy;
2161   fe zminusy_inv;
2162   fe_add(&zplusy, &A.Z, &A.Y);
2163   fe_sub(&zminusy, &A.Z, &A.Y);
2164   fe_loose_invert(&zminusy_inv, &zminusy);
2165   fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv);
2166   fe_tobytes(out_public_value, &zminusy_inv);
2167 }
2168