1 /* x86_64 BIGNUM accelerator version 0.1, December 2002.
2 *
3 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4 * project.
5 *
6 * Rights for redistribution and usage in source and binary forms are
7 * granted according to the OpenSSL license. Warranty of any kind is
8 * disclaimed.
9 *
10 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
11 * versions, like 1.0...
12 * A. Well, that's because this code is basically a quick-n-dirty
13 * proof-of-concept hack. As you can see it's implemented with
14 * inline assembler, which means that you're bound to GCC and that
15 * there might be enough room for further improvement.
16 *
17 * Q. Why inline assembler?
18 * A. x86_64 features own ABI which I'm not familiar with. This is
 * why I decided to let the compiler take care of subroutine
 * prologue/epilogue as well as register allocation. For reference,
 * Win64 implements a different ABI for AMD64 than Linux does.
22 *
23 * Q. How much faster does it get?
24 * A. 'apps/openssl speed rsa dsa' output with no-asm:
25 *
26 * sign verify sign/s verify/s
27 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
28 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
29 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
30 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
31 * sign verify sign/s verify/s
32 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
33 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
34 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
35 *
36 * 'apps/openssl speed rsa dsa' output with this module:
37 *
38 * sign verify sign/s verify/s
39 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
40 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
41 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
42 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
43 * sign verify sign/s verify/s
44 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
45 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
46 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
47 *
48 * For the reference. IA-32 assembler implementation performs
49 * very much like 64-bit code compiled with no-asm on the same
50 * machine.
51 */
52
53 #include <openssl/bn.h>
54
55 /* TODO(davidben): Get this file working on Windows x64. */
56 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
57
58 #include "../internal.h"
59
60
#undef mul
#undef mul_add

#define asm __asm__

/*
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) let the compiler to decide where does it
 * want to keep the value of zero;
 */
/* mul_add(r, a, word, carry): r += a * word + carry, leaving the outgoing
 * carry word in |carry|. Step 1: mulq computes rdx:rax = a * word (|word|
 * is bound to rax, the product's high half to rdx). Step 2 folds the
 * incoming carry into the low word; step 3 folds the low word into r. In
 * both steps "adcq %3,%1" with a zero operand propagates the addition's
 * carry flag into the high word — this cannot overflow because the high
 * word of a 64x64 product is never all-ones (see the note above the
 * comba macros below). */
#define mul_add(r, a, word, carry) \
  do { \
    register BN_ULONG high, low; \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1" \
        : "+r"(carry), "+d"(high) \
        : "a"(low), "g"(0) \
        : "cc"); \
    asm("addq %2,%0; adcq %3,%1" \
        : "+m"(r), "+d"(high) \
        : "r"(carry), "g"(0) \
        : "cc"); \
    (carry) = high; \
  } while (0)

/* mul(r, a, word, carry): r = a * word + carry (the old value of r is not
 * read), leaving the new carry (the high word) in |carry|. */
#define mul(r, a, word, carry) \
  do { \
    register BN_ULONG high, low; \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1" \
        : "+r"(carry), "+d"(high) \
        : "a"(low), "g"(0) \
        : "cc"); \
    (r) = (carry); \
    (carry) = high; \
  } while (0)
#undef sqr
/* sqr(r0, r1, a): full 128-bit square, (r1:r0) = a * a. */
#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
99
/* bn_mul_add_words: rp[i] += ap[i] * w for 0 <= i < num, propagating the
 * carry between words. Returns the carry out of the most significant word;
 * returns 0 when num <= 0. */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG carry = 0;

  if (num <= 0) {
    return carry;
  }

  /* Main loop, unrolled four words per iteration. */
  while (num >= 4) {
    mul_add(rp[0], ap[0], w, carry);
    mul_add(rp[1], ap[1], w, carry);
    mul_add(rp[2], ap[2], w, carry);
    mul_add(rp[3], ap[3], w, carry);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  /* Tail: at most three words remain. */
  switch (num) {
    case 3:
      mul_add(rp[0], ap[0], w, carry);
      mul_add(rp[1], ap[1], w, carry);
      mul_add(rp[2], ap[2], w, carry);
      break;
    case 2:
      mul_add(rp[0], ap[0], w, carry);
      mul_add(rp[1], ap[1], w, carry);
      break;
    case 1:
      mul_add(rp[0], ap[0], w, carry);
      break;
    default:
      break;
  }

  return carry;
}
132
/* bn_mul_words: rp[i] = ap[i] * w plus the carry from the previous word,
 * for 0 <= i < num (rp's old contents are overwritten). Returns the carry
 * out of the most significant word; returns 0 when num <= 0. */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG carry = 0;

  if (num <= 0) {
    return carry;
  }

  /* Main loop, unrolled four words per iteration. */
  while (num >= 4) {
    mul(rp[0], ap[0], w, carry);
    mul(rp[1], ap[1], w, carry);
    mul(rp[2], ap[2], w, carry);
    mul(rp[3], ap[3], w, carry);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  /* Tail: at most three words remain. */
  switch (num) {
    case 3:
      mul(rp[0], ap[0], w, carry);
      mul(rp[1], ap[1], w, carry);
      mul(rp[2], ap[2], w, carry);
      break;
    case 2:
      mul(rp[0], ap[0], w, carry);
      mul(rp[1], ap[1], w, carry);
      break;
    case 1:
      mul(rp[0], ap[0], w, carry);
      break;
    default:
      break;
  }

  return carry;
}
162
/* bn_sqr_words: squares each word independently — (r[2i+1], r[2i]) =
 * a[i]^2 for 0 <= i < n. There are no cross terms and no carries between
 * words; r must have room for 2*n words. No-op when n <= 0. */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  if (n <= 0) {
    return;
  }

  /* Main loop, unrolled four squarings per iteration. */
  while (n >= 4) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }

  /* Tail: at most three words remain. */
  switch (n) {
    case 3:
      sqr(r[0], r[1], a[0]);
      sqr(r[2], r[3], a[1]);
      sqr(r[4], r[5], a[2]);
      break;
    case 2:
      sqr(r[0], r[1], a[0]);
      sqr(r[2], r[3], a[1]);
      break;
    case 1:
      sqr(r[0], r[1], a[0]);
      break;
    default:
      break;
  }
}
189
/* bn_add_words: rp[i] = ap[i] + bp[i] with carry propagation, for
 * 0 <= i < n. Returns the carry out of the most significant word (0 or 1);
 * returns 0 when n <= 0. */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0) {
    return 0;
  }

  /* n is bound to rcx ("+c") for the loop instruction; i ("+r") indexes all
   * three arrays in the scaled-index addressing below. */
  asm volatile (
      /* subq of a register with itself zeroes ret AND clears CF before the
       * first adcq. */
      " subq %0,%0 \n" /* clear carry */
      " jmp 1f \n"
      ".p2align 4 \n" /* align the loop head */
      "1: movq (%4,%2,8),%0 \n" /* ret = ap[i] */
      " adcq (%5,%2,8),%0 \n" /* ret += bp[i] + CF */
      " movq %0,(%3,%2,8) \n" /* rp[i] = ret */
      " lea 1(%2),%2 \n" /* i++ via lea, which leaves CF intact */
      " loop 1b \n" /* decrement rcx, branch while nonzero; loop
                     * does not modify the flags */
      " sbbq %0,%0 \n" /* ret = CF ? ~0 : 0 (capture final carry) */
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1; /* collapse the 0/~0 mask to the 0/1 carry */
}
215
/* bn_sub_words: rp[i] = ap[i] - bp[i] with borrow propagation, for
 * 0 <= i < n. Returns the borrow out of the most significant word (0 or 1);
 * returns 0 when n <= 0. */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0) {
    return 0;
  }

  /* Same register assignments as bn_add_words: n in rcx for loop, i as the
   * scaled index into all three arrays. */
  asm volatile (
      /* subq of a register with itself zeroes ret AND clears CF (the
       * borrow) before the first sbbq. */
      " subq %0,%0 \n" /* clear borrow */
      " jmp 1f \n"
      ".p2align 4 \n" /* align the loop head */
      "1: movq (%4,%2,8),%0 \n" /* ret = ap[i] */
      " sbbq (%5,%2,8),%0 \n" /* ret -= bp[i] + CF (borrow) */
      " movq %0,(%3,%2,8) \n" /* rp[i] = ret */
      " lea 1(%2),%2 \n" /* i++ via lea, which leaves CF intact */
      " loop 1b \n" /* decrement rcx, branch while nonzero; loop
                     * does not modify the flags */
      " sbbq %0,%0 \n" /* ret = borrow ? ~0 : 0 */
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1; /* collapse the 0/~0 mask to the 0/1 borrow */
}
241
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
 */

/* Keep in mind that carrying into high part of multiplication result can not
 * overflow, because it cannot be all-ones. */
/* mul_add_c: mulq produces rdx:rax = a * b (t1 = low, t2 = high); the
 * 128-bit product is then added into the three-word accumulator
 * (c2,c1,c0) with one add/adc/adc chain; the final adcq with a zero
 * operand folds the carry into c2. */
#define mul_add_c(a, b, c0, c1, c2) \
  do { \
    BN_ULONG t1, t2; \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
        : "+r"(c0), "+r"(c1), "+r"(c2) \
        : "r"(t1), "r"(t2), "g"(0) \
        : "cc"); \
  } while (0)

/* sqr_add_c: same accumulation as mul_add_c, with the product a[i]*a[i]
 * (mulq with the operand in rax squares it). */
#define sqr_add_c(a, i, c0, c1, c2) \
  do { \
    BN_ULONG t1, t2; \
    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
        : "+r"(c0), "+r"(c1), "+r"(c2) \
        : "r"(t1), "r"(t2), "g"(0) \
        : "cc"); \
  } while (0)

/* mul_add_c2: c += 2*a*b, implemented by computing the product once and
 * adding it into (c2,c1,c0) twice with two add/adc/adc chains. */
#define mul_add_c2(a, b, c0, c1, c2) \
  do { \
    BN_ULONG t1, t2; \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
        : "+r"(c0), "+r"(c1), "+r"(c2) \
        : "r"(t1), "r"(t2), "g"(0) \
        : "cc"); \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
        : "+r"(c0), "+r"(c1), "+r"(c2) \
        : "r"(t1), "r"(t2), "g"(0) \
        : "cc"); \
  } while (0)

/* sqr_add_c2: c += 2*a[i]*a[j], the doubled cross term used in squaring. */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
285
/* bn_mul_comba8: r[0..15] = a[0..7] * b[0..7], fully unrolled
 * column-by-column (comba) multiplication. (c1, c2, c3) rotate as a
 * three-word accumulator: for column k every product a[i]*b[j] with
 * i + j == k is accumulated, the low word is stored to r[k], and the two
 * upper words carry into the next column; the just-stored word's
 * accumulator slot is zeroed for reuse as the new top word. */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  /* column 7 (the widest: eight products) */
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  /* column 8 */
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  /* column 9 */
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  /* column 10 */
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  /* column 11 */
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  /* column 12 */
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  /* column 13 */
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  /* column 14 */
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1; /* final carry word */
}
387
/* bn_mul_comba4: r[0..7] = a[0..3] * b[0..3], fully unrolled comba
 * multiplication. Same scheme as bn_mul_comba8: (c1, c2, c3) rotate as a
 * three-word accumulator, one column k (all a[i]*b[j] with i + j == k)
 * per group, low word stored to r[k]. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 (the widest: four products) */
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2; /* final carry word */
}
425
/* bn_sqr_comba8: r[0..15] = a[0..7]^2, fully unrolled comba squaring.
 * Per column k: the square term a[k/2]^2 appears once (sqr_add_c, even k
 * only) and each cross term a[i]*a[j] with i + j == k, i > j is added
 * doubled via sqr_add_c2, so every product is computed only once.
 * (c1, c2, c3) rotate as the three-word column accumulator, as in
 * bn_mul_comba8. */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  /* column 7 */
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  /* column 8 */
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  /* column 9 */
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  /* column 10 */
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  /* column 11 */
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  /* column 12 */
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  /* column 13 */
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  /* column 14 */
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1; /* final carry word */
}
499
/* bn_sqr_comba4: r[0..7] = a[0..3]^2, fully unrolled comba squaring.
 * Same scheme as bn_sqr_comba8: one square term per even column
 * (sqr_add_c) plus doubled cross terms (sqr_add_c2); (c1, c2, c3) rotate
 * as the three-word column accumulator. */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2; /* final carry word */
}
531
532 #endif /* !NO_ASM && X86_64 && __GNUC__ */
533