1 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
2 * All rights reserved.
3 *
4 * This package is an SSL implementation written
5 * by Eric Young (eay@cryptsoft.com).
6 * The implementation was written so as to conform with Netscapes SSL.
7 *
8 * This library is free for commercial and non-commercial use as long as
9 * the following conditions are aheared to. The following conditions
10 * apply to all code found in this distribution, be it the RC4, RSA,
11 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
12 * included with this distribution is covered by the same copyright terms
13 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
14 *
15 * Copyright remains Eric Young's, and as such any Copyright notices in
16 * the code are not to be removed.
17 * If this package is used in a product, Eric Young should be given attribution
18 * as the author of the parts of the library used.
19 * This can be in the form of a textual message at program startup or
20 * in documentation (online or textual) provided with the package.
21 *
22 * Redistribution and use in source and binary forms, with or without
23 * modification, are permitted provided that the following conditions
24 * are met:
25 * 1. Redistributions of source code must retain the copyright
26 * notice, this list of conditions and the following disclaimer.
27 * 2. Redistributions in binary form must reproduce the above copyright
28 * notice, this list of conditions and the following disclaimer in the
29 * documentation and/or other materials provided with the distribution.
30 * 3. All advertising materials mentioning features or use of this software
31 * must display the following acknowledgement:
32 * "This product includes cryptographic software written by
33 * Eric Young (eay@cryptsoft.com)"
34 * The word 'cryptographic' can be left out if the rouines from the library
35 * being used are not cryptographic related :-).
36 * 4. If you include any Windows specific code (or a derivative thereof) from
37 * the apps directory (application code) you must include an acknowledgement:
38 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
39 *
40 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * The licence and distribution terms for any publically available version or
53 * derivative of this code cannot be changed. i.e. this code cannot simply be
54 * copied and put under another distribution licence
55 * [including the GNU Public Licence.] */
56
57 #include <openssl/bn.h>
58
59 #include <assert.h>
60
61 #include "internal.h"
62
63
64 /* Generic implementations of most operations are needed for:
65 * - Configurations without inline assembly.
66 * - Architectures other than x86 or x86_64.
67 * - Windows x84_64; x86_64-gcc.c does not build on MSVC. */
68 #if defined(OPENSSL_NO_ASM) || \
69 (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
70 (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))
71
72 #ifdef BN_ULLONG
73 #define mul_add(r, a, w, c) \
74 { \
75 BN_ULLONG t; \
76 t = (BN_ULLONG)w * (a) + (r) + (c); \
77 (r) = Lw(t); \
78 (c) = Hw(t); \
79 }
80
81 #define mul(r, a, w, c) \
82 { \
83 BN_ULLONG t; \
84 t = (BN_ULLONG)w * (a) + (c); \
85 (r) = Lw(t); \
86 (c) = Hw(t); \
87 }
88
89 #define sqr(r0, r1, a) \
90 { \
91 BN_ULLONG t; \
92 t = (BN_ULLONG)(a) * (a); \
93 (r0) = Lw(t); \
94 (r1) = Hw(t); \
95 }
96
97 #elif defined(BN_UMULT_LOHI)
98 #define mul_add(r, a, w, c) \
99 { \
100 BN_ULONG high, low, ret, tmp = (a); \
101 ret = (r); \
102 BN_UMULT_LOHI(low, high, w, tmp); \
103 ret += (c); \
104 (c) = (ret < (c)) ? 1 : 0; \
105 (c) += high; \
106 ret += low; \
107 (c) += (ret < low) ? 1 : 0; \
108 (r) = ret; \
109 }
110
111 #define mul(r, a, w, c) \
112 { \
113 BN_ULONG high, low, ret, ta = (a); \
114 BN_UMULT_LOHI(low, high, w, ta); \
115 ret = low + (c); \
116 (c) = high; \
117 (c) += (ret < low) ? 1 : 0; \
118 (r) = ret; \
119 }
120
121 #define sqr(r0, r1, a) \
122 { \
123 BN_ULONG tmp = (a); \
124 BN_UMULT_LOHI(r0, r1, tmp, tmp); \
125 }
126
127 #else
128
129 /*************************************************************
130 * No long long type
131 */
132
133 #define LBITS(a) ((a) & BN_MASK2l)
134 #define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
135 #define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
136
137 #define LLBITS(a) ((a) & BN_MASKl)
138 #define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
139 #define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
140
141 #define mul64(l, h, bl, bh) \
142 { \
143 BN_ULONG m, m1, lt, ht; \
144 \
145 lt = l; \
146 ht = h; \
147 m = (bh) * (lt); \
148 lt = (bl) * (lt); \
149 m1 = (bl) * (ht); \
150 ht = (bh) * (ht); \
151 m = (m + m1) & BN_MASK2; \
152 if (m < m1) \
153 ht += L2HBITS((BN_ULONG)1); \
154 ht += HBITS(m); \
155 m1 = L2HBITS(m); \
156 lt = (lt + m1) & BN_MASK2; \
157 if (lt < m1) \
158 ht++; \
159 (l) = lt; \
160 (h) = ht; \
161 }
162
163 #define sqr64(lo, ho, in) \
164 { \
165 BN_ULONG l, h, m; \
166 \
167 h = (in); \
168 l = LBITS(h); \
169 h = HBITS(h); \
170 m = (l) * (h); \
171 l *= l; \
172 h *= h; \
173 h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
174 m = (m & BN_MASK2l) << (BN_BITS4 + 1); \
175 l = (l + m) & BN_MASK2; \
176 if (l < m) \
177 h++; \
178 (lo) = l; \
179 (ho) = h; \
180 }
181
182 #define mul_add(r, a, bl, bh, c) \
183 { \
184 BN_ULONG l, h; \
185 \
186 h = (a); \
187 l = LBITS(h); \
188 h = HBITS(h); \
189 mul64(l, h, (bl), (bh)); \
190 \
191 /* non-multiply part */ \
192 l = (l + (c)) & BN_MASK2; \
193 if (l < (c)) \
194 h++; \
195 (c) = (r); \
196 l = (l + (c)) & BN_MASK2; \
197 if (l < (c)) \
198 h++; \
199 (c) = h & BN_MASK2; \
200 (r) = l; \
201 }
202
203 #define mul(r, a, bl, bh, c) \
204 { \
205 BN_ULONG l, h; \
206 \
207 h = (a); \
208 l = LBITS(h); \
209 h = HBITS(h); \
210 mul64(l, h, (bl), (bh)); \
211 \
212 /* non-multiply part */ \
213 l += (c); \
214 if ((l & BN_MASK2) < (c)) \
215 h++; \
216 (c) = h & BN_MASK2; \
217 (r) = l & BN_MASK2; \
218 }
219 #endif /* !BN_ULLONG */
220
221 #if defined(BN_ULLONG) || defined(BN_UMULT_HIGH)
222
bn_mul_add_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)223 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
224 BN_ULONG w) {
225 BN_ULONG c1 = 0;
226
227 assert(num >= 0);
228 if (num <= 0) {
229 return c1;
230 }
231
232 while (num & ~3) {
233 mul_add(rp[0], ap[0], w, c1);
234 mul_add(rp[1], ap[1], w, c1);
235 mul_add(rp[2], ap[2], w, c1);
236 mul_add(rp[3], ap[3], w, c1);
237 ap += 4;
238 rp += 4;
239 num -= 4;
240 }
241
242 while (num) {
243 mul_add(rp[0], ap[0], w, c1);
244 ap++;
245 rp++;
246 num--;
247 }
248
249 return c1;
250 }
251
bn_mul_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)252 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
253 BN_ULONG c1 = 0;
254
255 assert(num >= 0);
256 if (num <= 0) {
257 return c1;
258 }
259
260 while (num & ~3) {
261 mul(rp[0], ap[0], w, c1);
262 mul(rp[1], ap[1], w, c1);
263 mul(rp[2], ap[2], w, c1);
264 mul(rp[3], ap[3], w, c1);
265 ap += 4;
266 rp += 4;
267 num -= 4;
268 }
269 while (num) {
270 mul(rp[0], ap[0], w, c1);
271 ap++;
272 rp++;
273 num--;
274 }
275 return c1;
276 }
277
bn_sqr_words(BN_ULONG * r,const BN_ULONG * a,int n)278 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
279 assert(n >= 0);
280 if (n <= 0) {
281 return;
282 }
283
284 while (n & ~3) {
285 sqr(r[0], r[1], a[0]);
286 sqr(r[2], r[3], a[1]);
287 sqr(r[4], r[5], a[2]);
288 sqr(r[6], r[7], a[3]);
289 a += 4;
290 r += 8;
291 n -= 4;
292 }
293 while (n) {
294 sqr(r[0], r[1], a[0]);
295 a++;
296 r += 2;
297 n--;
298 }
299 }
300
301 #else /* !(defined(BN_ULLONG) || defined(BN_UMULT_HIGH)) */
302
bn_mul_add_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)303 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
304 BN_ULONG w) {
305 BN_ULONG c = 0;
306 BN_ULONG bl, bh;
307
308 assert(num >= 0);
309 if (num <= 0) {
310 return (BN_ULONG)0;
311 }
312
313 bl = LBITS(w);
314 bh = HBITS(w);
315
316 while (num & ~3) {
317 mul_add(rp[0], ap[0], bl, bh, c);
318 mul_add(rp[1], ap[1], bl, bh, c);
319 mul_add(rp[2], ap[2], bl, bh, c);
320 mul_add(rp[3], ap[3], bl, bh, c);
321 ap += 4;
322 rp += 4;
323 num -= 4;
324 }
325 while (num) {
326 mul_add(rp[0], ap[0], bl, bh, c);
327 ap++;
328 rp++;
329 num--;
330 }
331 return c;
332 }
333
bn_mul_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)334 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
335 BN_ULONG carry = 0;
336 BN_ULONG bl, bh;
337
338 assert(num >= 0);
339 if (num <= 0) {
340 return (BN_ULONG)0;
341 }
342
343 bl = LBITS(w);
344 bh = HBITS(w);
345
346 while (num & ~3) {
347 mul(rp[0], ap[0], bl, bh, carry);
348 mul(rp[1], ap[1], bl, bh, carry);
349 mul(rp[2], ap[2], bl, bh, carry);
350 mul(rp[3], ap[3], bl, bh, carry);
351 ap += 4;
352 rp += 4;
353 num -= 4;
354 }
355 while (num) {
356 mul(rp[0], ap[0], bl, bh, carry);
357 ap++;
358 rp++;
359 num--;
360 }
361 return carry;
362 }
363
bn_sqr_words(BN_ULONG * r,const BN_ULONG * a,int n)364 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
365 assert(n >= 0);
366 if (n <= 0) {
367 return;
368 }
369
370 while (n & ~3) {
371 sqr64(r[0], r[1], a[0]);
372 sqr64(r[2], r[3], a[1]);
373 sqr64(r[4], r[5], a[2]);
374 sqr64(r[6], r[7], a[3]);
375 a += 4;
376 r += 8;
377 n -= 4;
378 }
379 while (n) {
380 sqr64(r[0], r[1], a[0]);
381 a++;
382 r += 2;
383 n--;
384 }
385 }
386
387 #endif /* !(defined(BN_ULLONG) || defined(BN_UMULT_HIGH)) */
388
389 #if defined(BN_ULLONG)
390
bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d)391 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
392 return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
393 }
394
395 #else
396
397 /* Divide h,l by d and return the result. */
bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d)398 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
399 BN_ULONG dh, dl, q, ret = 0, th, tl, t;
400 int i, count = 2;
401
402 if (d == 0) {
403 return BN_MASK2;
404 }
405
406 i = BN_num_bits_word(d);
407 assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
408
409 i = BN_BITS2 - i;
410 if (h >= d) {
411 h -= d;
412 }
413
414 if (i) {
415 d <<= i;
416 h = (h << i) | (l >> (BN_BITS2 - i));
417 l <<= i;
418 }
419 dh = (d & BN_MASK2h) >> BN_BITS4;
420 dl = (d & BN_MASK2l);
421 for (;;) {
422 if ((h >> BN_BITS4) == dh) {
423 q = BN_MASK2l;
424 } else {
425 q = h / dh;
426 }
427
428 th = q * dh;
429 tl = dl * q;
430 for (;;) {
431 t = h - th;
432 if ((t & BN_MASK2h) ||
433 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
434 break;
435 }
436 q--;
437 th -= dh;
438 tl -= dl;
439 }
440 t = (tl >> BN_BITS4);
441 tl = (tl << BN_BITS4) & BN_MASK2h;
442 th += t;
443
444 if (l < tl) {
445 th++;
446 }
447 l -= tl;
448 if (h < th) {
449 h += d;
450 q--;
451 }
452 h -= th;
453
454 if (--count == 0) {
455 break;
456 }
457
458 ret = q << BN_BITS4;
459 h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
460 l = (l & BN_MASK2l) << BN_BITS4;
461 }
462
463 ret |= q;
464 return ret;
465 }
466
467 #endif /* !defined(BN_ULLONG) */
468
469 #ifdef BN_ULLONG
bn_add_words(BN_ULONG * r,const BN_ULONG * a,const BN_ULONG * b,int n)470 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
471 int n) {
472 BN_ULLONG ll = 0;
473
474 assert(n >= 0);
475 if (n <= 0) {
476 return (BN_ULONG)0;
477 }
478
479 while (n & ~3) {
480 ll += (BN_ULLONG)a[0] + b[0];
481 r[0] = (BN_ULONG)ll & BN_MASK2;
482 ll >>= BN_BITS2;
483 ll += (BN_ULLONG)a[1] + b[1];
484 r[1] = (BN_ULONG)ll & BN_MASK2;
485 ll >>= BN_BITS2;
486 ll += (BN_ULLONG)a[2] + b[2];
487 r[2] = (BN_ULONG)ll & BN_MASK2;
488 ll >>= BN_BITS2;
489 ll += (BN_ULLONG)a[3] + b[3];
490 r[3] = (BN_ULONG)ll & BN_MASK2;
491 ll >>= BN_BITS2;
492 a += 4;
493 b += 4;
494 r += 4;
495 n -= 4;
496 }
497 while (n) {
498 ll += (BN_ULLONG)a[0] + b[0];
499 r[0] = (BN_ULONG)ll & BN_MASK2;
500 ll >>= BN_BITS2;
501 a++;
502 b++;
503 r++;
504 n--;
505 }
506 return (BN_ULONG)ll;
507 }
508
509 #else /* !BN_ULLONG */
510
bn_add_words(BN_ULONG * r,const BN_ULONG * a,const BN_ULONG * b,int n)511 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
512 int n) {
513 BN_ULONG c, l, t;
514
515 assert(n >= 0);
516 if (n <= 0) {
517 return (BN_ULONG)0;
518 }
519
520 c = 0;
521 while (n & ~3) {
522 t = a[0];
523 t = (t + c) & BN_MASK2;
524 c = (t < c);
525 l = (t + b[0]) & BN_MASK2;
526 c += (l < t);
527 r[0] = l;
528 t = a[1];
529 t = (t + c) & BN_MASK2;
530 c = (t < c);
531 l = (t + b[1]) & BN_MASK2;
532 c += (l < t);
533 r[1] = l;
534 t = a[2];
535 t = (t + c) & BN_MASK2;
536 c = (t < c);
537 l = (t + b[2]) & BN_MASK2;
538 c += (l < t);
539 r[2] = l;
540 t = a[3];
541 t = (t + c) & BN_MASK2;
542 c = (t < c);
543 l = (t + b[3]) & BN_MASK2;
544 c += (l < t);
545 r[3] = l;
546 a += 4;
547 b += 4;
548 r += 4;
549 n -= 4;
550 }
551 while (n) {
552 t = a[0];
553 t = (t + c) & BN_MASK2;
554 c = (t < c);
555 l = (t + b[0]) & BN_MASK2;
556 c += (l < t);
557 r[0] = l;
558 a++;
559 b++;
560 r++;
561 n--;
562 }
563 return (BN_ULONG)c;
564 }
565
566 #endif /* !BN_ULLONG */
567
bn_sub_words(BN_ULONG * r,const BN_ULONG * a,const BN_ULONG * b,int n)568 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
569 int n) {
570 BN_ULONG t1, t2;
571 int c = 0;
572
573 assert(n >= 0);
574 if (n <= 0) {
575 return (BN_ULONG)0;
576 }
577
578 while (n & ~3) {
579 t1 = a[0];
580 t2 = b[0];
581 r[0] = (t1 - t2 - c) & BN_MASK2;
582 if (t1 != t2) {
583 c = (t1 < t2);
584 }
585 t1 = a[1];
586 t2 = b[1];
587 r[1] = (t1 - t2 - c) & BN_MASK2;
588 if (t1 != t2) {
589 c = (t1 < t2);
590 }
591 t1 = a[2];
592 t2 = b[2];
593 r[2] = (t1 - t2 - c) & BN_MASK2;
594 if (t1 != t2) {
595 c = (t1 < t2);
596 }
597 t1 = a[3];
598 t2 = b[3];
599 r[3] = (t1 - t2 - c) & BN_MASK2;
600 if (t1 != t2) {
601 c = (t1 < t2);
602 }
603 a += 4;
604 b += 4;
605 r += 4;
606 n -= 4;
607 }
608 while (n) {
609 t1 = a[0];
610 t2 = b[0];
611 r[0] = (t1 - t2 - c) & BN_MASK2;
612 if (t1 != t2) {
613 c = (t1 < t2);
614 }
615 a++;
616 b++;
617 r++;
618 n--;
619 }
620 return c;
621 }
622
623 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
624 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
625 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
626 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
627
628 #ifdef BN_ULLONG
629
630 /* Keep in mind that additions to multiplication result can not overflow,
631 * because its high half cannot be all-ones. */
632 #define mul_add_c(a, b, c0, c1, c2) \
633 do { \
634 BN_ULONG hi; \
635 BN_ULLONG t = (BN_ULLONG)(a) * (b); \
636 t += c0; /* no carry */ \
637 c0 = (BN_ULONG)Lw(t); \
638 hi = (BN_ULONG)Hw(t); \
639 c1 = (c1 + hi) & BN_MASK2; \
640 if (c1 < hi) \
641 c2++; \
642 } while (0)
643
644 #define mul_add_c2(a, b, c0, c1, c2) \
645 do { \
646 BN_ULONG hi; \
647 BN_ULLONG t = (BN_ULLONG)(a) * (b); \
648 BN_ULLONG tt = t + c0; /* no carry */ \
649 c0 = (BN_ULONG)Lw(tt); \
650 hi = (BN_ULONG)Hw(tt); \
651 c1 = (c1 + hi) & BN_MASK2; \
652 if (c1 < hi) \
653 c2++; \
654 t += c0; /* no carry */ \
655 c0 = (BN_ULONG)Lw(t); \
656 hi = (BN_ULONG)Hw(t); \
657 c1 = (c1 + hi) & BN_MASK2; \
658 if (c1 < hi) \
659 c2++; \
660 } while (0)
661
662 #define sqr_add_c(a, i, c0, c1, c2) \
663 do { \
664 BN_ULONG hi; \
665 BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
666 t += c0; /* no carry */ \
667 c0 = (BN_ULONG)Lw(t); \
668 hi = (BN_ULONG)Hw(t); \
669 c1 = (c1 + hi) & BN_MASK2; \
670 if (c1 < hi) \
671 c2++; \
672 } while (0)
673
674 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
675
676 #elif defined(BN_UMULT_LOHI)
677
678 /* Keep in mind that additions to hi can not overflow, because the high word of
679 * a multiplication result cannot be all-ones. */
680 #define mul_add_c(a, b, c0, c1, c2) \
681 do { \
682 BN_ULONG ta = (a), tb = (b); \
683 BN_ULONG lo, hi; \
684 BN_UMULT_LOHI(lo, hi, ta, tb); \
685 c0 += lo; \
686 hi += (c0 < lo) ? 1 : 0; \
687 c1 += hi; \
688 c2 += (c1 < hi) ? 1 : 0; \
689 } while (0)
690
691 #define mul_add_c2(a, b, c0, c1, c2) \
692 do { \
693 BN_ULONG ta = (a), tb = (b); \
694 BN_ULONG lo, hi, tt; \
695 BN_UMULT_LOHI(lo, hi, ta, tb); \
696 c0 += lo; \
697 tt = hi + ((c0 < lo) ? 1 : 0); \
698 c1 += tt; \
699 c2 += (c1 < tt) ? 1 : 0; \
700 c0 += lo; \
701 hi += (c0 < lo) ? 1 : 0; \
702 c1 += hi; \
703 c2 += (c1 < hi) ? 1 : 0; \
704 } while (0)
705
706 #define sqr_add_c(a, i, c0, c1, c2) \
707 do { \
708 BN_ULONG ta = (a)[i]; \
709 BN_ULONG lo, hi; \
710 BN_UMULT_LOHI(lo, hi, ta, ta); \
711 c0 += lo; \
712 hi += (c0 < lo) ? 1 : 0; \
713 c1 += hi; \
714 c2 += (c1 < hi) ? 1 : 0; \
715 } while (0)
716
717 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
718
719 #else /* !BN_ULLONG */
720
721 /* Keep in mind that additions to hi can not overflow, because
722 * the high word of a multiplication result cannot be all-ones. */
723
724 #define mul_add_c(a, b, c0, c1, c2) \
725 do { \
726 BN_ULONG lo = LBITS(a), hi = HBITS(a); \
727 BN_ULONG bl = LBITS(b), bh = HBITS(b); \
728 mul64(lo, hi, bl, bh); \
729 c0 = (c0 + lo) & BN_MASK2; \
730 if (c0 < lo) \
731 hi++; \
732 c1 = (c1 + hi) & BN_MASK2; \
733 if (c1 < hi) \
734 c2++; \
735 } while (0)
736
737 #define mul_add_c2(a, b, c0, c1, c2) \
738 do { \
739 BN_ULONG tt; \
740 BN_ULONG lo = LBITS(a), hi = HBITS(a); \
741 BN_ULONG bl = LBITS(b), bh = HBITS(b); \
742 mul64(lo, hi, bl, bh); \
743 tt = hi; \
744 c0 = (c0 + lo) & BN_MASK2; \
745 if (c0 < lo) \
746 tt++; \
747 c1 = (c1 + tt) & BN_MASK2; \
748 if (c1 < tt) \
749 c2++; \
750 c0 = (c0 + lo) & BN_MASK2; \
751 if (c0 < lo) \
752 hi++; \
753 c1 = (c1 + hi) & BN_MASK2; \
754 if (c1 < hi) \
755 c2++; \
756 } while (0)
757
758 #define sqr_add_c(a, i, c0, c1, c2) \
759 do { \
760 BN_ULONG lo, hi; \
761 sqr64(lo, hi, (a)[i]); \
762 c0 = (c0 + lo) & BN_MASK2; \
763 if (c0 < lo) \
764 hi++; \
765 c1 = (c1 + hi) & BN_MASK2; \
766 if (c1 < hi) \
767 c2++; \
768 } while (0)
769
770 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
771 #endif /* !BN_ULLONG */
772
bn_mul_comba8(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)773 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
774 BN_ULONG c1, c2, c3;
775
776 c1 = 0;
777 c2 = 0;
778 c3 = 0;
779 mul_add_c(a[0], b[0], c1, c2, c3);
780 r[0] = c1;
781 c1 = 0;
782 mul_add_c(a[0], b[1], c2, c3, c1);
783 mul_add_c(a[1], b[0], c2, c3, c1);
784 r[1] = c2;
785 c2 = 0;
786 mul_add_c(a[2], b[0], c3, c1, c2);
787 mul_add_c(a[1], b[1], c3, c1, c2);
788 mul_add_c(a[0], b[2], c3, c1, c2);
789 r[2] = c3;
790 c3 = 0;
791 mul_add_c(a[0], b[3], c1, c2, c3);
792 mul_add_c(a[1], b[2], c1, c2, c3);
793 mul_add_c(a[2], b[1], c1, c2, c3);
794 mul_add_c(a[3], b[0], c1, c2, c3);
795 r[3] = c1;
796 c1 = 0;
797 mul_add_c(a[4], b[0], c2, c3, c1);
798 mul_add_c(a[3], b[1], c2, c3, c1);
799 mul_add_c(a[2], b[2], c2, c3, c1);
800 mul_add_c(a[1], b[3], c2, c3, c1);
801 mul_add_c(a[0], b[4], c2, c3, c1);
802 r[4] = c2;
803 c2 = 0;
804 mul_add_c(a[0], b[5], c3, c1, c2);
805 mul_add_c(a[1], b[4], c3, c1, c2);
806 mul_add_c(a[2], b[3], c3, c1, c2);
807 mul_add_c(a[3], b[2], c3, c1, c2);
808 mul_add_c(a[4], b[1], c3, c1, c2);
809 mul_add_c(a[5], b[0], c3, c1, c2);
810 r[5] = c3;
811 c3 = 0;
812 mul_add_c(a[6], b[0], c1, c2, c3);
813 mul_add_c(a[5], b[1], c1, c2, c3);
814 mul_add_c(a[4], b[2], c1, c2, c3);
815 mul_add_c(a[3], b[3], c1, c2, c3);
816 mul_add_c(a[2], b[4], c1, c2, c3);
817 mul_add_c(a[1], b[5], c1, c2, c3);
818 mul_add_c(a[0], b[6], c1, c2, c3);
819 r[6] = c1;
820 c1 = 0;
821 mul_add_c(a[0], b[7], c2, c3, c1);
822 mul_add_c(a[1], b[6], c2, c3, c1);
823 mul_add_c(a[2], b[5], c2, c3, c1);
824 mul_add_c(a[3], b[4], c2, c3, c1);
825 mul_add_c(a[4], b[3], c2, c3, c1);
826 mul_add_c(a[5], b[2], c2, c3, c1);
827 mul_add_c(a[6], b[1], c2, c3, c1);
828 mul_add_c(a[7], b[0], c2, c3, c1);
829 r[7] = c2;
830 c2 = 0;
831 mul_add_c(a[7], b[1], c3, c1, c2);
832 mul_add_c(a[6], b[2], c3, c1, c2);
833 mul_add_c(a[5], b[3], c3, c1, c2);
834 mul_add_c(a[4], b[4], c3, c1, c2);
835 mul_add_c(a[3], b[5], c3, c1, c2);
836 mul_add_c(a[2], b[6], c3, c1, c2);
837 mul_add_c(a[1], b[7], c3, c1, c2);
838 r[8] = c3;
839 c3 = 0;
840 mul_add_c(a[2], b[7], c1, c2, c3);
841 mul_add_c(a[3], b[6], c1, c2, c3);
842 mul_add_c(a[4], b[5], c1, c2, c3);
843 mul_add_c(a[5], b[4], c1, c2, c3);
844 mul_add_c(a[6], b[3], c1, c2, c3);
845 mul_add_c(a[7], b[2], c1, c2, c3);
846 r[9] = c1;
847 c1 = 0;
848 mul_add_c(a[7], b[3], c2, c3, c1);
849 mul_add_c(a[6], b[4], c2, c3, c1);
850 mul_add_c(a[5], b[5], c2, c3, c1);
851 mul_add_c(a[4], b[6], c2, c3, c1);
852 mul_add_c(a[3], b[7], c2, c3, c1);
853 r[10] = c2;
854 c2 = 0;
855 mul_add_c(a[4], b[7], c3, c1, c2);
856 mul_add_c(a[5], b[6], c3, c1, c2);
857 mul_add_c(a[6], b[5], c3, c1, c2);
858 mul_add_c(a[7], b[4], c3, c1, c2);
859 r[11] = c3;
860 c3 = 0;
861 mul_add_c(a[7], b[5], c1, c2, c3);
862 mul_add_c(a[6], b[6], c1, c2, c3);
863 mul_add_c(a[5], b[7], c1, c2, c3);
864 r[12] = c1;
865 c1 = 0;
866 mul_add_c(a[6], b[7], c2, c3, c1);
867 mul_add_c(a[7], b[6], c2, c3, c1);
868 r[13] = c2;
869 c2 = 0;
870 mul_add_c(a[7], b[7], c3, c1, c2);
871 r[14] = c3;
872 r[15] = c1;
873 }
874
bn_mul_comba4(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)875 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
876 BN_ULONG c1, c2, c3;
877
878 c1 = 0;
879 c2 = 0;
880 c3 = 0;
881 mul_add_c(a[0], b[0], c1, c2, c3);
882 r[0] = c1;
883 c1 = 0;
884 mul_add_c(a[0], b[1], c2, c3, c1);
885 mul_add_c(a[1], b[0], c2, c3, c1);
886 r[1] = c2;
887 c2 = 0;
888 mul_add_c(a[2], b[0], c3, c1, c2);
889 mul_add_c(a[1], b[1], c3, c1, c2);
890 mul_add_c(a[0], b[2], c3, c1, c2);
891 r[2] = c3;
892 c3 = 0;
893 mul_add_c(a[0], b[3], c1, c2, c3);
894 mul_add_c(a[1], b[2], c1, c2, c3);
895 mul_add_c(a[2], b[1], c1, c2, c3);
896 mul_add_c(a[3], b[0], c1, c2, c3);
897 r[3] = c1;
898 c1 = 0;
899 mul_add_c(a[3], b[1], c2, c3, c1);
900 mul_add_c(a[2], b[2], c2, c3, c1);
901 mul_add_c(a[1], b[3], c2, c3, c1);
902 r[4] = c2;
903 c2 = 0;
904 mul_add_c(a[2], b[3], c3, c1, c2);
905 mul_add_c(a[3], b[2], c3, c1, c2);
906 r[5] = c3;
907 c3 = 0;
908 mul_add_c(a[3], b[3], c1, c2, c3);
909 r[6] = c1;
910 r[7] = c2;
911 }
912
bn_sqr_comba8(BN_ULONG * r,const BN_ULONG * a)913 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
914 BN_ULONG c1, c2, c3;
915
916 c1 = 0;
917 c2 = 0;
918 c3 = 0;
919 sqr_add_c(a, 0, c1, c2, c3);
920 r[0] = c1;
921 c1 = 0;
922 sqr_add_c2(a, 1, 0, c2, c3, c1);
923 r[1] = c2;
924 c2 = 0;
925 sqr_add_c(a, 1, c3, c1, c2);
926 sqr_add_c2(a, 2, 0, c3, c1, c2);
927 r[2] = c3;
928 c3 = 0;
929 sqr_add_c2(a, 3, 0, c1, c2, c3);
930 sqr_add_c2(a, 2, 1, c1, c2, c3);
931 r[3] = c1;
932 c1 = 0;
933 sqr_add_c(a, 2, c2, c3, c1);
934 sqr_add_c2(a, 3, 1, c2, c3, c1);
935 sqr_add_c2(a, 4, 0, c2, c3, c1);
936 r[4] = c2;
937 c2 = 0;
938 sqr_add_c2(a, 5, 0, c3, c1, c2);
939 sqr_add_c2(a, 4, 1, c3, c1, c2);
940 sqr_add_c2(a, 3, 2, c3, c1, c2);
941 r[5] = c3;
942 c3 = 0;
943 sqr_add_c(a, 3, c1, c2, c3);
944 sqr_add_c2(a, 4, 2, c1, c2, c3);
945 sqr_add_c2(a, 5, 1, c1, c2, c3);
946 sqr_add_c2(a, 6, 0, c1, c2, c3);
947 r[6] = c1;
948 c1 = 0;
949 sqr_add_c2(a, 7, 0, c2, c3, c1);
950 sqr_add_c2(a, 6, 1, c2, c3, c1);
951 sqr_add_c2(a, 5, 2, c2, c3, c1);
952 sqr_add_c2(a, 4, 3, c2, c3, c1);
953 r[7] = c2;
954 c2 = 0;
955 sqr_add_c(a, 4, c3, c1, c2);
956 sqr_add_c2(a, 5, 3, c3, c1, c2);
957 sqr_add_c2(a, 6, 2, c3, c1, c2);
958 sqr_add_c2(a, 7, 1, c3, c1, c2);
959 r[8] = c3;
960 c3 = 0;
961 sqr_add_c2(a, 7, 2, c1, c2, c3);
962 sqr_add_c2(a, 6, 3, c1, c2, c3);
963 sqr_add_c2(a, 5, 4, c1, c2, c3);
964 r[9] = c1;
965 c1 = 0;
966 sqr_add_c(a, 5, c2, c3, c1);
967 sqr_add_c2(a, 6, 4, c2, c3, c1);
968 sqr_add_c2(a, 7, 3, c2, c3, c1);
969 r[10] = c2;
970 c2 = 0;
971 sqr_add_c2(a, 7, 4, c3, c1, c2);
972 sqr_add_c2(a, 6, 5, c3, c1, c2);
973 r[11] = c3;
974 c3 = 0;
975 sqr_add_c(a, 6, c1, c2, c3);
976 sqr_add_c2(a, 7, 5, c1, c2, c3);
977 r[12] = c1;
978 c1 = 0;
979 sqr_add_c2(a, 7, 6, c2, c3, c1);
980 r[13] = c2;
981 c2 = 0;
982 sqr_add_c(a, 7, c3, c1, c2);
983 r[14] = c3;
984 r[15] = c1;
985 }
986
bn_sqr_comba4(BN_ULONG * r,const BN_ULONG * a)987 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
988 BN_ULONG c1, c2, c3;
989
990 c1 = 0;
991 c2 = 0;
992 c3 = 0;
993 sqr_add_c(a, 0, c1, c2, c3);
994 r[0] = c1;
995 c1 = 0;
996 sqr_add_c2(a, 1, 0, c2, c3, c1);
997 r[1] = c2;
998 c2 = 0;
999 sqr_add_c(a, 1, c3, c1, c2);
1000 sqr_add_c2(a, 2, 0, c3, c1, c2);
1001 r[2] = c3;
1002 c3 = 0;
1003 sqr_add_c2(a, 3, 0, c1, c2, c3);
1004 sqr_add_c2(a, 2, 1, c1, c2, c3);
1005 r[3] = c1;
1006 c1 = 0;
1007 sqr_add_c(a, 2, c2, c3, c1);
1008 sqr_add_c2(a, 3, 1, c2, c3, c1);
1009 r[4] = c2;
1010 c2 = 0;
1011 sqr_add_c2(a, 3, 2, c3, c1, c2);
1012 r[5] = c3;
1013 c3 = 0;
1014 sqr_add_c(a, 3, c1, c2, c3);
1015 r[6] = c1;
1016 r[7] = c2;
1017 }
1018
1019 #endif
1020