1 #include <openssl/curve25519.h>
2 
3 #include <string.h>
4 
5 #include "internal.h"
6 
7 
8 #if defined(BORINGSSL_X25519_X86_64)
9 
10 typedef struct { uint64_t v[5]; } fe25519;
11 
12 /* These functions are defined in asm/x25519-x86_64.S */
13 void x25519_x86_64_work_cswap(fe25519 *, uint64_t);
14 void x25519_x86_64_mul(fe25519 *out, const fe25519 *a, const fe25519 *b);
15 void x25519_x86_64_square(fe25519 *out, const fe25519 *a);
16 void x25519_x86_64_freeze(fe25519 *);
17 void x25519_x86_64_ladderstep(fe25519 *work);
18 
fe25519_setint(fe25519 * r,unsigned v)19 static void fe25519_setint(fe25519 *r, unsigned v) {
20   r->v[0] = v;
21   r->v[1] = 0;
22   r->v[2] = 0;
23   r->v[3] = 0;
24   r->v[4] = 0;
25 }
26 
27 /* Assumes input x being reduced below 2^255 */
fe25519_pack(unsigned char r[32],const fe25519 * x)28 static void fe25519_pack(unsigned char r[32], const fe25519 *x) {
29   fe25519 t;
30   t = *x;
31   x25519_x86_64_freeze(&t);
32 
33   r[0] = (uint8_t)(t.v[0] & 0xff);
34   r[1] = (uint8_t)((t.v[0] >> 8) & 0xff);
35   r[2] = (uint8_t)((t.v[0] >> 16) & 0xff);
36   r[3] = (uint8_t)((t.v[0] >> 24) & 0xff);
37   r[4] = (uint8_t)((t.v[0] >> 32) & 0xff);
38   r[5] = (uint8_t)((t.v[0] >> 40) & 0xff);
39   r[6] = (uint8_t)((t.v[0] >> 48));
40 
41   r[6] ^= (uint8_t)((t.v[1] << 3) & 0xf8);
42   r[7] = (uint8_t)((t.v[1] >> 5) & 0xff);
43   r[8] = (uint8_t)((t.v[1] >> 13) & 0xff);
44   r[9] = (uint8_t)((t.v[1] >> 21) & 0xff);
45   r[10] = (uint8_t)((t.v[1] >> 29) & 0xff);
46   r[11] = (uint8_t)((t.v[1] >> 37) & 0xff);
47   r[12] = (uint8_t)((t.v[1] >> 45));
48 
49   r[12] ^= (uint8_t)((t.v[2] << 6) & 0xc0);
50   r[13] = (uint8_t)((t.v[2] >> 2) & 0xff);
51   r[14] = (uint8_t)((t.v[2] >> 10) & 0xff);
52   r[15] = (uint8_t)((t.v[2] >> 18) & 0xff);
53   r[16] = (uint8_t)((t.v[2] >> 26) & 0xff);
54   r[17] = (uint8_t)((t.v[2] >> 34) & 0xff);
55   r[18] = (uint8_t)((t.v[2] >> 42) & 0xff);
56   r[19] = (uint8_t)((t.v[2] >> 50));
57 
58   r[19] ^= (uint8_t)((t.v[3] << 1) & 0xfe);
59   r[20] = (uint8_t)((t.v[3] >> 7) & 0xff);
60   r[21] = (uint8_t)((t.v[3] >> 15) & 0xff);
61   r[22] = (uint8_t)((t.v[3] >> 23) & 0xff);
62   r[23] = (uint8_t)((t.v[3] >> 31) & 0xff);
63   r[24] = (uint8_t)((t.v[3] >> 39) & 0xff);
64   r[25] = (uint8_t)((t.v[3] >> 47));
65 
66   r[25] ^= (uint8_t)((t.v[4] << 4) & 0xf0);
67   r[26] = (uint8_t)((t.v[4] >> 4) & 0xff);
68   r[27] = (uint8_t)((t.v[4] >> 12) & 0xff);
69   r[28] = (uint8_t)((t.v[4] >> 20) & 0xff);
70   r[29] = (uint8_t)((t.v[4] >> 28) & 0xff);
71   r[30] = (uint8_t)((t.v[4] >> 36) & 0xff);
72   r[31] = (uint8_t)((t.v[4] >> 44));
73 }
74 
fe25519_unpack(fe25519 * r,const uint8_t x[32])75 static void fe25519_unpack(fe25519 *r, const uint8_t x[32]) {
76   r->v[0] = x[0];
77   r->v[0] += (uint64_t)x[1] << 8;
78   r->v[0] += (uint64_t)x[2] << 16;
79   r->v[0] += (uint64_t)x[3] << 24;
80   r->v[0] += (uint64_t)x[4] << 32;
81   r->v[0] += (uint64_t)x[5] << 40;
82   r->v[0] += ((uint64_t)x[6] & 7) << 48;
83 
84   r->v[1] = x[6] >> 3;
85   r->v[1] += (uint64_t)x[7] << 5;
86   r->v[1] += (uint64_t)x[8] << 13;
87   r->v[1] += (uint64_t)x[9] << 21;
88   r->v[1] += (uint64_t)x[10] << 29;
89   r->v[1] += (uint64_t)x[11] << 37;
90   r->v[1] += ((uint64_t)x[12] & 63) << 45;
91 
92   r->v[2] = x[12] >> 6;
93   r->v[2] += (uint64_t)x[13] << 2;
94   r->v[2] += (uint64_t)x[14] << 10;
95   r->v[2] += (uint64_t)x[15] << 18;
96   r->v[2] += (uint64_t)x[16] << 26;
97   r->v[2] += (uint64_t)x[17] << 34;
98   r->v[2] += (uint64_t)x[18] << 42;
99   r->v[2] += ((uint64_t)x[19] & 1) << 50;
100 
101   r->v[3] = x[19] >> 1;
102   r->v[3] += (uint64_t)x[20] << 7;
103   r->v[3] += (uint64_t)x[21] << 15;
104   r->v[3] += (uint64_t)x[22] << 23;
105   r->v[3] += (uint64_t)x[23] << 31;
106   r->v[3] += (uint64_t)x[24] << 39;
107   r->v[3] += ((uint64_t)x[25] & 15) << 47;
108 
109   r->v[4] = x[25] >> 4;
110   r->v[4] += (uint64_t)x[26] << 4;
111   r->v[4] += (uint64_t)x[27] << 12;
112   r->v[4] += (uint64_t)x[28] << 20;
113   r->v[4] += (uint64_t)x[29] << 28;
114   r->v[4] += (uint64_t)x[30] << 36;
115   r->v[4] += ((uint64_t)x[31] & 127) << 44;
116 }
117 
fe25519_invert(fe25519 * r,const fe25519 * x)118 static void fe25519_invert(fe25519 *r, const fe25519 *x) {
119   fe25519 z2;
120   fe25519 z9;
121   fe25519 z11;
122   fe25519 z2_5_0;
123   fe25519 z2_10_0;
124   fe25519 z2_20_0;
125   fe25519 z2_50_0;
126   fe25519 z2_100_0;
127   fe25519 t;
128   int i;
129 
130   /* 2 */ x25519_x86_64_square(&z2, x);
131   /* 4 */ x25519_x86_64_square(&t, &z2);
132   /* 8 */ x25519_x86_64_square(&t, &t);
133   /* 9 */ x25519_x86_64_mul(&z9, &t, x);
134   /* 11 */ x25519_x86_64_mul(&z11, &z9, &z2);
135   /* 22 */ x25519_x86_64_square(&t, &z11);
136   /* 2^5 - 2^0 = 31 */ x25519_x86_64_mul(&z2_5_0, &t, &z9);
137 
138   /* 2^6 - 2^1 */ x25519_x86_64_square(&t, &z2_5_0);
139   /* 2^20 - 2^10 */ for (i = 1; i < 5; i++) { x25519_x86_64_square(&t, &t); }
140   /* 2^10 - 2^0 */ x25519_x86_64_mul(&z2_10_0, &t, &z2_5_0);
141 
142   /* 2^11 - 2^1 */ x25519_x86_64_square(&t, &z2_10_0);
143   /* 2^20 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
144   /* 2^20 - 2^0 */ x25519_x86_64_mul(&z2_20_0, &t, &z2_10_0);
145 
146   /* 2^21 - 2^1 */ x25519_x86_64_square(&t, &z2_20_0);
147   /* 2^40 - 2^20 */ for (i = 1; i < 20; i++) { x25519_x86_64_square(&t, &t); }
148   /* 2^40 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_20_0);
149 
150   /* 2^41 - 2^1 */ x25519_x86_64_square(&t, &t);
151   /* 2^50 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
152   /* 2^50 - 2^0 */ x25519_x86_64_mul(&z2_50_0, &t, &z2_10_0);
153 
154   /* 2^51 - 2^1 */ x25519_x86_64_square(&t, &z2_50_0);
155   /* 2^100 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
156   /* 2^100 - 2^0 */ x25519_x86_64_mul(&z2_100_0, &t, &z2_50_0);
157 
158   /* 2^101 - 2^1 */ x25519_x86_64_square(&t, &z2_100_0);
159   /* 2^200 - 2^100 */ for (i = 1; i < 100; i++) {
160     x25519_x86_64_square(&t, &t);
161   }
162   /* 2^200 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_100_0);
163 
164   /* 2^201 - 2^1 */ x25519_x86_64_square(&t, &t);
165   /* 2^250 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
166   /* 2^250 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_50_0);
167 
168   /* 2^251 - 2^1 */ x25519_x86_64_square(&t, &t);
169   /* 2^252 - 2^2 */ x25519_x86_64_square(&t, &t);
170   /* 2^253 - 2^3 */ x25519_x86_64_square(&t, &t);
171 
172   /* 2^254 - 2^4 */ x25519_x86_64_square(&t, &t);
173 
174   /* 2^255 - 2^5 */ x25519_x86_64_square(&t, &t);
175   /* 2^255 - 21 */ x25519_x86_64_mul(r, &t, &z11);
176 }
177 
mladder(fe25519 * xr,fe25519 * zr,const uint8_t s[32])178 static void mladder(fe25519 *xr, fe25519 *zr, const uint8_t s[32]) {
179   fe25519 work[5];
180 
181   work[0] = *xr;
182   fe25519_setint(work + 1, 1);
183   fe25519_setint(work + 2, 0);
184   work[3] = *xr;
185   fe25519_setint(work + 4, 1);
186 
187   int i, j;
188   uint8_t prevbit = 0;
189 
190   j = 6;
191   for (i = 31; i >= 0; i--) {
192     while (j >= 0) {
193       const uint8_t bit = 1 & (s[i] >> j);
194       const uint64_t swap = bit ^ prevbit;
195       prevbit = bit;
196       x25519_x86_64_work_cswap(work + 1, swap);
197       x25519_x86_64_ladderstep(work);
198       j -= 1;
199     }
200     j = 7;
201   }
202 
203   *xr = work[1];
204   *zr = work[2];
205 }
206 
x25519_x86_64(uint8_t out[32],const uint8_t scalar[32],const uint8_t point[32])207 void x25519_x86_64(uint8_t out[32], const uint8_t scalar[32],
208                   const uint8_t point[32]) {
209   uint8_t e[32];
210   memcpy(e, scalar, sizeof(e));
211 
212   e[0] &= 248;
213   e[31] &= 127;
214   e[31] |= 64;
215 
216   fe25519 t;
217   fe25519 z;
218   fe25519_unpack(&t, point);
219   mladder(&t, &z, e);
220   fe25519_invert(&z, &z);
221   x25519_x86_64_mul(&t, &t, &z);
222   fe25519_pack(out, &t);
223 }
224 
225 #endif  /* BORINGSSL_X25519_X86_64 */
226