1 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
2  * All rights reserved.
3  *
4  * This package is an SSL implementation written
5  * by Eric Young (eay@cryptsoft.com).
6  * The implementation was written so as to conform with Netscapes SSL.
7  *
8  * This library is free for commercial and non-commercial use as long as
9  * the following conditions are aheared to.  The following conditions
10  * apply to all code found in this distribution, be it the RC4, RSA,
11  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
12  * included with this distribution is covered by the same copyright terms
13  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
14  *
15  * Copyright remains Eric Young's, and as such any Copyright notices in
16  * the code are not to be removed.
17  * If this package is used in a product, Eric Young should be given attribution
18  * as the author of the parts of the library used.
19  * This can be in the form of a textual message at program startup or
20  * in documentation (online or textual) provided with the package.
21  *
22  * Redistribution and use in source and binary forms, with or without
23  * modification, are permitted provided that the following conditions
24  * are met:
25  * 1. Redistributions of source code must retain the copyright
26  *    notice, this list of conditions and the following disclaimer.
27  * 2. Redistributions in binary form must reproduce the above copyright
28  *    notice, this list of conditions and the following disclaimer in the
29  *    documentation and/or other materials provided with the distribution.
30  * 3. All advertising materials mentioning features or use of this software
31  *    must display the following acknowledgement:
32  *    "This product includes cryptographic software written by
33  *     Eric Young (eay@cryptsoft.com)"
34  *    The word 'cryptographic' can be left out if the rouines from the library
35  *    being used are not cryptographic related :-).
36  * 4. If you include any Windows specific code (or a derivative thereof) from
37  *    the apps directory (application code) you must include an acknowledgement:
38  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
39  *
40  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  * The licence and distribution terms for any publically available version or
53  * derivative of this code cannot be changed.  i.e. this code cannot simply be
54  * copied and put under another distribution licence
55  * [including the GNU Public Licence.] */
56 
57 #include <openssl/bn.h>
58 
59 #include <assert.h>
60 
61 #include "internal.h"
62 
63 
64 /* Generic implementations of most operations are needed for:
65  * - Configurations without inline assembly.
66  * - Architectures other than x86 or x86_64.
67  * - Windows x84_64; x86_64-gcc.c does not build on MSVC. */
68 #if defined(OPENSSL_NO_ASM) || \
69     (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
70     (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))
71 
72 #ifdef BN_ULLONG
73 #define mul_add(r, a, w, c)             \
74   {                                     \
75     BN_ULLONG t;                        \
76     t = (BN_ULLONG)w * (a) + (r) + (c); \
77     (r) = Lw(t);                        \
78     (c) = Hw(t);                        \
79   }
80 
81 #define mul(r, a, w, c)           \
82   {                               \
83     BN_ULLONG t;                  \
84     t = (BN_ULLONG)w * (a) + (c); \
85     (r) = Lw(t);                  \
86     (c) = Hw(t);                  \
87   }
88 
89 #define sqr(r0, r1, a)        \
90   {                           \
91     BN_ULLONG t;              \
92     t = (BN_ULLONG)(a) * (a); \
93     (r0) = Lw(t);             \
94     (r1) = Hw(t);             \
95   }
96 
97 #elif defined(BN_UMULT_LOHI)
98 #define mul_add(r, a, w, c)             \
99   {                                     \
100     BN_ULONG high, low, ret, tmp = (a); \
101     ret = (r);                          \
102     BN_UMULT_LOHI(low, high, w, tmp);   \
103     ret += (c);                         \
104     (c) = (ret < (c)) ? 1 : 0;          \
105     (c) += high;                        \
106     ret += low;                         \
107     (c) += (ret < low) ? 1 : 0;         \
108     (r) = ret;                          \
109   }
110 
111 #define mul(r, a, w, c)                \
112   {                                    \
113     BN_ULONG high, low, ret, ta = (a); \
114     BN_UMULT_LOHI(low, high, w, ta);   \
115     ret = low + (c);                   \
116     (c) = high;                        \
117     (c) += (ret < low) ? 1 : 0;        \
118     (r) = ret;                         \
119   }
120 
121 #define sqr(r0, r1, a)               \
122   {                                  \
123     BN_ULONG tmp = (a);              \
124     BN_UMULT_LOHI(r0, r1, tmp, tmp); \
125   }
126 
127 #else
128 
129 /*************************************************************
130  * No long long type
131  */
132 
133 #define LBITS(a) ((a) & BN_MASK2l)
134 #define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
135 #define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
136 
137 #define LLBITS(a) ((a) & BN_MASKl)
138 #define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
139 #define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
140 
141 #define mul64(l, h, bl, bh)       \
142   {                               \
143     BN_ULONG m, m1, lt, ht;       \
144                                   \
145     lt = l;                       \
146     ht = h;                       \
147     m = (bh) * (lt);              \
148     lt = (bl) * (lt);             \
149     m1 = (bl) * (ht);             \
150     ht = (bh) * (ht);             \
151     m = (m + m1) & BN_MASK2;      \
152     if (m < m1)                   \
153       ht += L2HBITS((BN_ULONG)1); \
154     ht += HBITS(m);               \
155     m1 = L2HBITS(m);              \
156     lt = (lt + m1) & BN_MASK2;    \
157     if (lt < m1)                  \
158       ht++;                       \
159     (l) = lt;                     \
160     (h) = ht;                     \
161   }
162 
163 #define sqr64(lo, ho, in)                    \
164   {                                          \
165     BN_ULONG l, h, m;                        \
166                                              \
167     h = (in);                                \
168     l = LBITS(h);                            \
169     h = HBITS(h);                            \
170     m = (l) * (h);                           \
171     l *= l;                                  \
172     h *= h;                                  \
173     h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
174     m = (m & BN_MASK2l) << (BN_BITS4 + 1);   \
175     l = (l + m) & BN_MASK2;                  \
176     if (l < m)                               \
177       h++;                                   \
178     (lo) = l;                                \
179     (ho) = h;                                \
180   }
181 
182 #define mul_add(r, a, bl, bh, c) \
183   {                              \
184     BN_ULONG l, h;               \
185                                  \
186     h = (a);                     \
187     l = LBITS(h);                \
188     h = HBITS(h);                \
189     mul64(l, h, (bl), (bh));     \
190                                  \
191     /* non-multiply part */      \
192     l = (l + (c)) & BN_MASK2;    \
193     if (l < (c))                 \
194       h++;                       \
195     (c) = (r);                   \
196     l = (l + (c)) & BN_MASK2;    \
197     if (l < (c))                 \
198       h++;                       \
199     (c) = h & BN_MASK2;          \
200     (r) = l;                     \
201   }
202 
203 #define mul(r, a, bl, bh, c)  \
204   {                           \
205     BN_ULONG l, h;            \
206                               \
207     h = (a);                  \
208     l = LBITS(h);             \
209     h = HBITS(h);             \
210     mul64(l, h, (bl), (bh));  \
211                               \
212     /* non-multiply part */   \
213     l += (c);                 \
214     if ((l & BN_MASK2) < (c)) \
215       h++;                    \
216     (c) = h & BN_MASK2;       \
217     (r) = l & BN_MASK2;       \
218   }
219 #endif /* !BN_ULLONG */
220 
221 #if defined(BN_ULLONG) || defined(BN_UMULT_HIGH)
222 
bn_mul_add_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)223 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
224                           BN_ULONG w) {
225   BN_ULONG c1 = 0;
226 
227   assert(num >= 0);
228   if (num <= 0) {
229     return c1;
230   }
231 
232   while (num & ~3) {
233     mul_add(rp[0], ap[0], w, c1);
234     mul_add(rp[1], ap[1], w, c1);
235     mul_add(rp[2], ap[2], w, c1);
236     mul_add(rp[3], ap[3], w, c1);
237     ap += 4;
238     rp += 4;
239     num -= 4;
240   }
241 
242   while (num) {
243     mul_add(rp[0], ap[0], w, c1);
244     ap++;
245     rp++;
246     num--;
247   }
248 
249   return c1;
250 }
251 
bn_mul_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)252 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
253   BN_ULONG c1 = 0;
254 
255   assert(num >= 0);
256   if (num <= 0) {
257     return c1;
258   }
259 
260   while (num & ~3) {
261     mul(rp[0], ap[0], w, c1);
262     mul(rp[1], ap[1], w, c1);
263     mul(rp[2], ap[2], w, c1);
264     mul(rp[3], ap[3], w, c1);
265     ap += 4;
266     rp += 4;
267     num -= 4;
268   }
269   while (num) {
270     mul(rp[0], ap[0], w, c1);
271     ap++;
272     rp++;
273     num--;
274   }
275   return c1;
276 }
277 
bn_sqr_words(BN_ULONG * r,const BN_ULONG * a,int n)278 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
279   assert(n >= 0);
280   if (n <= 0) {
281     return;
282   }
283 
284   while (n & ~3) {
285     sqr(r[0], r[1], a[0]);
286     sqr(r[2], r[3], a[1]);
287     sqr(r[4], r[5], a[2]);
288     sqr(r[6], r[7], a[3]);
289     a += 4;
290     r += 8;
291     n -= 4;
292   }
293   while (n) {
294     sqr(r[0], r[1], a[0]);
295     a++;
296     r += 2;
297     n--;
298   }
299 }
300 
301 #else /* !(defined(BN_ULLONG) || defined(BN_UMULT_HIGH)) */
302 
bn_mul_add_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)303 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
304                           BN_ULONG w) {
305   BN_ULONG c = 0;
306   BN_ULONG bl, bh;
307 
308   assert(num >= 0);
309   if (num <= 0) {
310     return (BN_ULONG)0;
311   }
312 
313   bl = LBITS(w);
314   bh = HBITS(w);
315 
316   while (num & ~3) {
317     mul_add(rp[0], ap[0], bl, bh, c);
318     mul_add(rp[1], ap[1], bl, bh, c);
319     mul_add(rp[2], ap[2], bl, bh, c);
320     mul_add(rp[3], ap[3], bl, bh, c);
321     ap += 4;
322     rp += 4;
323     num -= 4;
324   }
325   while (num) {
326     mul_add(rp[0], ap[0], bl, bh, c);
327     ap++;
328     rp++;
329     num--;
330   }
331   return c;
332 }
333 
bn_mul_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)334 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
335   BN_ULONG carry = 0;
336   BN_ULONG bl, bh;
337 
338   assert(num >= 0);
339   if (num <= 0) {
340     return (BN_ULONG)0;
341   }
342 
343   bl = LBITS(w);
344   bh = HBITS(w);
345 
346   while (num & ~3) {
347     mul(rp[0], ap[0], bl, bh, carry);
348     mul(rp[1], ap[1], bl, bh, carry);
349     mul(rp[2], ap[2], bl, bh, carry);
350     mul(rp[3], ap[3], bl, bh, carry);
351     ap += 4;
352     rp += 4;
353     num -= 4;
354   }
355   while (num) {
356     mul(rp[0], ap[0], bl, bh, carry);
357     ap++;
358     rp++;
359     num--;
360   }
361   return carry;
362 }
363 
bn_sqr_words(BN_ULONG * r,const BN_ULONG * a,int n)364 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
365   assert(n >= 0);
366   if (n <= 0) {
367     return;
368   }
369 
370   while (n & ~3) {
371     sqr64(r[0], r[1], a[0]);
372     sqr64(r[2], r[3], a[1]);
373     sqr64(r[4], r[5], a[2]);
374     sqr64(r[6], r[7], a[3]);
375     a += 4;
376     r += 8;
377     n -= 4;
378   }
379   while (n) {
380     sqr64(r[0], r[1], a[0]);
381     a++;
382     r += 2;
383     n--;
384   }
385 }
386 
387 #endif /* !(defined(BN_ULLONG) || defined(BN_UMULT_HIGH)) */
388 
389 #if defined(BN_ULLONG)
390 
bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d)391 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
392   return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
393 }
394 
395 #else
396 
397 /* Divide h,l by d and return the result. */
bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d)398 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
399   BN_ULONG dh, dl, q, ret = 0, th, tl, t;
400   int i, count = 2;
401 
402   if (d == 0) {
403     return BN_MASK2;
404   }
405 
406   i = BN_num_bits_word(d);
407   assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
408 
409   i = BN_BITS2 - i;
410   if (h >= d) {
411     h -= d;
412   }
413 
414   if (i) {
415     d <<= i;
416     h = (h << i) | (l >> (BN_BITS2 - i));
417     l <<= i;
418   }
419   dh = (d & BN_MASK2h) >> BN_BITS4;
420   dl = (d & BN_MASK2l);
421   for (;;) {
422     if ((h >> BN_BITS4) == dh) {
423       q = BN_MASK2l;
424     } else {
425       q = h / dh;
426     }
427 
428     th = q * dh;
429     tl = dl * q;
430     for (;;) {
431       t = h - th;
432       if ((t & BN_MASK2h) ||
433           ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
434         break;
435       }
436       q--;
437       th -= dh;
438       tl -= dl;
439     }
440     t = (tl >> BN_BITS4);
441     tl = (tl << BN_BITS4) & BN_MASK2h;
442     th += t;
443 
444     if (l < tl) {
445       th++;
446     }
447     l -= tl;
448     if (h < th) {
449       h += d;
450       q--;
451     }
452     h -= th;
453 
454     if (--count == 0) {
455       break;
456     }
457 
458     ret = q << BN_BITS4;
459     h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
460     l = (l & BN_MASK2l) << BN_BITS4;
461   }
462 
463   ret |= q;
464   return ret;
465 }
466 
467 #endif /* !defined(BN_ULLONG) */
468 
469 #ifdef BN_ULLONG
bn_add_words(BN_ULONG * r,const BN_ULONG * a,const BN_ULONG * b,int n)470 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
471                       int n) {
472   BN_ULLONG ll = 0;
473 
474   assert(n >= 0);
475   if (n <= 0) {
476     return (BN_ULONG)0;
477   }
478 
479   while (n & ~3) {
480     ll += (BN_ULLONG)a[0] + b[0];
481     r[0] = (BN_ULONG)ll & BN_MASK2;
482     ll >>= BN_BITS2;
483     ll += (BN_ULLONG)a[1] + b[1];
484     r[1] = (BN_ULONG)ll & BN_MASK2;
485     ll >>= BN_BITS2;
486     ll += (BN_ULLONG)a[2] + b[2];
487     r[2] = (BN_ULONG)ll & BN_MASK2;
488     ll >>= BN_BITS2;
489     ll += (BN_ULLONG)a[3] + b[3];
490     r[3] = (BN_ULONG)ll & BN_MASK2;
491     ll >>= BN_BITS2;
492     a += 4;
493     b += 4;
494     r += 4;
495     n -= 4;
496   }
497   while (n) {
498     ll += (BN_ULLONG)a[0] + b[0];
499     r[0] = (BN_ULONG)ll & BN_MASK2;
500     ll >>= BN_BITS2;
501     a++;
502     b++;
503     r++;
504     n--;
505   }
506   return (BN_ULONG)ll;
507 }
508 
509 #else /* !BN_ULLONG */
510 
bn_add_words(BN_ULONG * r,const BN_ULONG * a,const BN_ULONG * b,int n)511 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
512                       int n) {
513   BN_ULONG c, l, t;
514 
515   assert(n >= 0);
516   if (n <= 0) {
517     return (BN_ULONG)0;
518   }
519 
520   c = 0;
521   while (n & ~3) {
522     t = a[0];
523     t = (t + c) & BN_MASK2;
524     c = (t < c);
525     l = (t + b[0]) & BN_MASK2;
526     c += (l < t);
527     r[0] = l;
528     t = a[1];
529     t = (t + c) & BN_MASK2;
530     c = (t < c);
531     l = (t + b[1]) & BN_MASK2;
532     c += (l < t);
533     r[1] = l;
534     t = a[2];
535     t = (t + c) & BN_MASK2;
536     c = (t < c);
537     l = (t + b[2]) & BN_MASK2;
538     c += (l < t);
539     r[2] = l;
540     t = a[3];
541     t = (t + c) & BN_MASK2;
542     c = (t < c);
543     l = (t + b[3]) & BN_MASK2;
544     c += (l < t);
545     r[3] = l;
546     a += 4;
547     b += 4;
548     r += 4;
549     n -= 4;
550   }
551   while (n) {
552     t = a[0];
553     t = (t + c) & BN_MASK2;
554     c = (t < c);
555     l = (t + b[0]) & BN_MASK2;
556     c += (l < t);
557     r[0] = l;
558     a++;
559     b++;
560     r++;
561     n--;
562   }
563   return (BN_ULONG)c;
564 }
565 
566 #endif /* !BN_ULLONG */
567 
bn_sub_words(BN_ULONG * r,const BN_ULONG * a,const BN_ULONG * b,int n)568 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
569                       int n) {
570   BN_ULONG t1, t2;
571   int c = 0;
572 
573   assert(n >= 0);
574   if (n <= 0) {
575     return (BN_ULONG)0;
576   }
577 
578   while (n & ~3) {
579     t1 = a[0];
580     t2 = b[0];
581     r[0] = (t1 - t2 - c) & BN_MASK2;
582     if (t1 != t2) {
583       c = (t1 < t2);
584     }
585     t1 = a[1];
586     t2 = b[1];
587     r[1] = (t1 - t2 - c) & BN_MASK2;
588     if (t1 != t2) {
589       c = (t1 < t2);
590     }
591     t1 = a[2];
592     t2 = b[2];
593     r[2] = (t1 - t2 - c) & BN_MASK2;
594     if (t1 != t2) {
595       c = (t1 < t2);
596     }
597     t1 = a[3];
598     t2 = b[3];
599     r[3] = (t1 - t2 - c) & BN_MASK2;
600     if (t1 != t2) {
601       c = (t1 < t2);
602     }
603     a += 4;
604     b += 4;
605     r += 4;
606     n -= 4;
607   }
608   while (n) {
609     t1 = a[0];
610     t2 = b[0];
611     r[0] = (t1 - t2 - c) & BN_MASK2;
612     if (t1 != t2) {
613       c = (t1 < t2);
614     }
615     a++;
616     b++;
617     r++;
618     n--;
619   }
620   return c;
621 }
622 
623 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
624 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
625 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
626 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
627 
628 #ifdef BN_ULLONG
629 
630 /* Keep in mind that additions to multiplication result can not overflow,
631  * because its high half cannot be all-ones. */
632 #define mul_add_c(a, b, c0, c1, c2)     \
633   do {                                  \
634     BN_ULONG hi;                        \
635     BN_ULLONG t = (BN_ULLONG)(a) * (b); \
636     t += c0; /* no carry */             \
637     c0 = (BN_ULONG)Lw(t);               \
638     hi = (BN_ULONG)Hw(t);               \
639     c1 = (c1 + hi) & BN_MASK2;          \
640     if (c1 < hi)                        \
641       c2++;                             \
642   } while (0)
643 
644 #define mul_add_c2(a, b, c0, c1, c2)      \
645   do {                                    \
646     BN_ULONG hi;                          \
647     BN_ULLONG t = (BN_ULLONG)(a) * (b);   \
648     BN_ULLONG tt = t + c0; /* no carry */ \
649     c0 = (BN_ULONG)Lw(tt);                \
650     hi = (BN_ULONG)Hw(tt);                \
651     c1 = (c1 + hi) & BN_MASK2;            \
652     if (c1 < hi)                          \
653       c2++;                               \
654     t += c0; /* no carry */               \
655     c0 = (BN_ULONG)Lw(t);                 \
656     hi = (BN_ULONG)Hw(t);                 \
657     c1 = (c1 + hi) & BN_MASK2;            \
658     if (c1 < hi)                          \
659       c2++;                               \
660   } while (0)
661 
662 #define sqr_add_c(a, i, c0, c1, c2)       \
663   do {                                    \
664     BN_ULONG hi;                          \
665     BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
666     t += c0; /* no carry */               \
667     c0 = (BN_ULONG)Lw(t);                 \
668     hi = (BN_ULONG)Hw(t);                 \
669     c1 = (c1 + hi) & BN_MASK2;            \
670     if (c1 < hi)                          \
671       c2++;                               \
672   } while (0)
673 
674 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
675 
676 #elif defined(BN_UMULT_LOHI)
677 
678 /* Keep in mind that additions to hi can not overflow, because the high word of
679  * a multiplication result cannot be all-ones. */
680 #define mul_add_c(a, b, c0, c1, c2) \
681   do {                              \
682     BN_ULONG ta = (a), tb = (b);    \
683     BN_ULONG lo, hi;                \
684     BN_UMULT_LOHI(lo, hi, ta, tb);  \
685     c0 += lo;                       \
686     hi += (c0 < lo) ? 1 : 0;        \
687     c1 += hi;                       \
688     c2 += (c1 < hi) ? 1 : 0;        \
689   } while (0)
690 
691 #define mul_add_c2(a, b, c0, c1, c2) \
692   do {                               \
693     BN_ULONG ta = (a), tb = (b);     \
694     BN_ULONG lo, hi, tt;             \
695     BN_UMULT_LOHI(lo, hi, ta, tb);   \
696     c0 += lo;                        \
697     tt = hi + ((c0 < lo) ? 1 : 0);   \
698     c1 += tt;                        \
699     c2 += (c1 < tt) ? 1 : 0;         \
700     c0 += lo;                        \
701     hi += (c0 < lo) ? 1 : 0;         \
702     c1 += hi;                        \
703     c2 += (c1 < hi) ? 1 : 0;         \
704   } while (0)
705 
706 #define sqr_add_c(a, i, c0, c1, c2) \
707   do {                              \
708     BN_ULONG ta = (a)[i];           \
709     BN_ULONG lo, hi;                \
710     BN_UMULT_LOHI(lo, hi, ta, ta);  \
711     c0 += lo;                       \
712     hi += (c0 < lo) ? 1 : 0;        \
713     c1 += hi;                       \
714     c2 += (c1 < hi) ? 1 : 0;        \
715   } while (0)
716 
717 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
718 
719 #else /* !BN_ULLONG */
720 
721 /* Keep in mind that additions to hi can not overflow, because
722  * the high word of a multiplication result cannot be all-ones. */
723 
724 #define mul_add_c(a, b, c0, c1, c2)        \
725   do {                                     \
726     BN_ULONG lo = LBITS(a), hi = HBITS(a); \
727     BN_ULONG bl = LBITS(b), bh = HBITS(b); \
728     mul64(lo, hi, bl, bh);                 \
729     c0 = (c0 + lo) & BN_MASK2;             \
730     if (c0 < lo)                           \
731       hi++;                                \
732     c1 = (c1 + hi) & BN_MASK2;             \
733     if (c1 < hi)                           \
734       c2++;                                \
735   } while (0)
736 
737 #define mul_add_c2(a, b, c0, c1, c2)       \
738   do {                                     \
739     BN_ULONG tt;                           \
740     BN_ULONG lo = LBITS(a), hi = HBITS(a); \
741     BN_ULONG bl = LBITS(b), bh = HBITS(b); \
742     mul64(lo, hi, bl, bh);                 \
743     tt = hi;                               \
744     c0 = (c0 + lo) & BN_MASK2;             \
745     if (c0 < lo)                           \
746       tt++;                                \
747     c1 = (c1 + tt) & BN_MASK2;             \
748     if (c1 < tt)                           \
749       c2++;                                \
750     c0 = (c0 + lo) & BN_MASK2;             \
751     if (c0 < lo)                           \
752       hi++;                                \
753     c1 = (c1 + hi) & BN_MASK2;             \
754     if (c1 < hi)                           \
755       c2++;                                \
756   } while (0)
757 
758 #define sqr_add_c(a, i, c0, c1, c2) \
759   do {                              \
760     BN_ULONG lo, hi;                \
761     sqr64(lo, hi, (a)[i]);          \
762     c0 = (c0 + lo) & BN_MASK2;      \
763     if (c0 < lo)                    \
764       hi++;                         \
765     c1 = (c1 + hi) & BN_MASK2;      \
766     if (c1 < hi)                    \
767       c2++;                         \
768   } while (0)
769 
770 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
771 #endif /* !BN_ULLONG */
772 
bn_mul_comba8(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)773 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
774   BN_ULONG c1, c2, c3;
775 
776   c1 = 0;
777   c2 = 0;
778   c3 = 0;
779   mul_add_c(a[0], b[0], c1, c2, c3);
780   r[0] = c1;
781   c1 = 0;
782   mul_add_c(a[0], b[1], c2, c3, c1);
783   mul_add_c(a[1], b[0], c2, c3, c1);
784   r[1] = c2;
785   c2 = 0;
786   mul_add_c(a[2], b[0], c3, c1, c2);
787   mul_add_c(a[1], b[1], c3, c1, c2);
788   mul_add_c(a[0], b[2], c3, c1, c2);
789   r[2] = c3;
790   c3 = 0;
791   mul_add_c(a[0], b[3], c1, c2, c3);
792   mul_add_c(a[1], b[2], c1, c2, c3);
793   mul_add_c(a[2], b[1], c1, c2, c3);
794   mul_add_c(a[3], b[0], c1, c2, c3);
795   r[3] = c1;
796   c1 = 0;
797   mul_add_c(a[4], b[0], c2, c3, c1);
798   mul_add_c(a[3], b[1], c2, c3, c1);
799   mul_add_c(a[2], b[2], c2, c3, c1);
800   mul_add_c(a[1], b[3], c2, c3, c1);
801   mul_add_c(a[0], b[4], c2, c3, c1);
802   r[4] = c2;
803   c2 = 0;
804   mul_add_c(a[0], b[5], c3, c1, c2);
805   mul_add_c(a[1], b[4], c3, c1, c2);
806   mul_add_c(a[2], b[3], c3, c1, c2);
807   mul_add_c(a[3], b[2], c3, c1, c2);
808   mul_add_c(a[4], b[1], c3, c1, c2);
809   mul_add_c(a[5], b[0], c3, c1, c2);
810   r[5] = c3;
811   c3 = 0;
812   mul_add_c(a[6], b[0], c1, c2, c3);
813   mul_add_c(a[5], b[1], c1, c2, c3);
814   mul_add_c(a[4], b[2], c1, c2, c3);
815   mul_add_c(a[3], b[3], c1, c2, c3);
816   mul_add_c(a[2], b[4], c1, c2, c3);
817   mul_add_c(a[1], b[5], c1, c2, c3);
818   mul_add_c(a[0], b[6], c1, c2, c3);
819   r[6] = c1;
820   c1 = 0;
821   mul_add_c(a[0], b[7], c2, c3, c1);
822   mul_add_c(a[1], b[6], c2, c3, c1);
823   mul_add_c(a[2], b[5], c2, c3, c1);
824   mul_add_c(a[3], b[4], c2, c3, c1);
825   mul_add_c(a[4], b[3], c2, c3, c1);
826   mul_add_c(a[5], b[2], c2, c3, c1);
827   mul_add_c(a[6], b[1], c2, c3, c1);
828   mul_add_c(a[7], b[0], c2, c3, c1);
829   r[7] = c2;
830   c2 = 0;
831   mul_add_c(a[7], b[1], c3, c1, c2);
832   mul_add_c(a[6], b[2], c3, c1, c2);
833   mul_add_c(a[5], b[3], c3, c1, c2);
834   mul_add_c(a[4], b[4], c3, c1, c2);
835   mul_add_c(a[3], b[5], c3, c1, c2);
836   mul_add_c(a[2], b[6], c3, c1, c2);
837   mul_add_c(a[1], b[7], c3, c1, c2);
838   r[8] = c3;
839   c3 = 0;
840   mul_add_c(a[2], b[7], c1, c2, c3);
841   mul_add_c(a[3], b[6], c1, c2, c3);
842   mul_add_c(a[4], b[5], c1, c2, c3);
843   mul_add_c(a[5], b[4], c1, c2, c3);
844   mul_add_c(a[6], b[3], c1, c2, c3);
845   mul_add_c(a[7], b[2], c1, c2, c3);
846   r[9] = c1;
847   c1 = 0;
848   mul_add_c(a[7], b[3], c2, c3, c1);
849   mul_add_c(a[6], b[4], c2, c3, c1);
850   mul_add_c(a[5], b[5], c2, c3, c1);
851   mul_add_c(a[4], b[6], c2, c3, c1);
852   mul_add_c(a[3], b[7], c2, c3, c1);
853   r[10] = c2;
854   c2 = 0;
855   mul_add_c(a[4], b[7], c3, c1, c2);
856   mul_add_c(a[5], b[6], c3, c1, c2);
857   mul_add_c(a[6], b[5], c3, c1, c2);
858   mul_add_c(a[7], b[4], c3, c1, c2);
859   r[11] = c3;
860   c3 = 0;
861   mul_add_c(a[7], b[5], c1, c2, c3);
862   mul_add_c(a[6], b[6], c1, c2, c3);
863   mul_add_c(a[5], b[7], c1, c2, c3);
864   r[12] = c1;
865   c1 = 0;
866   mul_add_c(a[6], b[7], c2, c3, c1);
867   mul_add_c(a[7], b[6], c2, c3, c1);
868   r[13] = c2;
869   c2 = 0;
870   mul_add_c(a[7], b[7], c3, c1, c2);
871   r[14] = c3;
872   r[15] = c1;
873 }
874 
bn_mul_comba4(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)875 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
876   BN_ULONG c1, c2, c3;
877 
878   c1 = 0;
879   c2 = 0;
880   c3 = 0;
881   mul_add_c(a[0], b[0], c1, c2, c3);
882   r[0] = c1;
883   c1 = 0;
884   mul_add_c(a[0], b[1], c2, c3, c1);
885   mul_add_c(a[1], b[0], c2, c3, c1);
886   r[1] = c2;
887   c2 = 0;
888   mul_add_c(a[2], b[0], c3, c1, c2);
889   mul_add_c(a[1], b[1], c3, c1, c2);
890   mul_add_c(a[0], b[2], c3, c1, c2);
891   r[2] = c3;
892   c3 = 0;
893   mul_add_c(a[0], b[3], c1, c2, c3);
894   mul_add_c(a[1], b[2], c1, c2, c3);
895   mul_add_c(a[2], b[1], c1, c2, c3);
896   mul_add_c(a[3], b[0], c1, c2, c3);
897   r[3] = c1;
898   c1 = 0;
899   mul_add_c(a[3], b[1], c2, c3, c1);
900   mul_add_c(a[2], b[2], c2, c3, c1);
901   mul_add_c(a[1], b[3], c2, c3, c1);
902   r[4] = c2;
903   c2 = 0;
904   mul_add_c(a[2], b[3], c3, c1, c2);
905   mul_add_c(a[3], b[2], c3, c1, c2);
906   r[5] = c3;
907   c3 = 0;
908   mul_add_c(a[3], b[3], c1, c2, c3);
909   r[6] = c1;
910   r[7] = c2;
911 }
912 
bn_sqr_comba8(BN_ULONG * r,const BN_ULONG * a)913 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
914   BN_ULONG c1, c2, c3;
915 
916   c1 = 0;
917   c2 = 0;
918   c3 = 0;
919   sqr_add_c(a, 0, c1, c2, c3);
920   r[0] = c1;
921   c1 = 0;
922   sqr_add_c2(a, 1, 0, c2, c3, c1);
923   r[1] = c2;
924   c2 = 0;
925   sqr_add_c(a, 1, c3, c1, c2);
926   sqr_add_c2(a, 2, 0, c3, c1, c2);
927   r[2] = c3;
928   c3 = 0;
929   sqr_add_c2(a, 3, 0, c1, c2, c3);
930   sqr_add_c2(a, 2, 1, c1, c2, c3);
931   r[3] = c1;
932   c1 = 0;
933   sqr_add_c(a, 2, c2, c3, c1);
934   sqr_add_c2(a, 3, 1, c2, c3, c1);
935   sqr_add_c2(a, 4, 0, c2, c3, c1);
936   r[4] = c2;
937   c2 = 0;
938   sqr_add_c2(a, 5, 0, c3, c1, c2);
939   sqr_add_c2(a, 4, 1, c3, c1, c2);
940   sqr_add_c2(a, 3, 2, c3, c1, c2);
941   r[5] = c3;
942   c3 = 0;
943   sqr_add_c(a, 3, c1, c2, c3);
944   sqr_add_c2(a, 4, 2, c1, c2, c3);
945   sqr_add_c2(a, 5, 1, c1, c2, c3);
946   sqr_add_c2(a, 6, 0, c1, c2, c3);
947   r[6] = c1;
948   c1 = 0;
949   sqr_add_c2(a, 7, 0, c2, c3, c1);
950   sqr_add_c2(a, 6, 1, c2, c3, c1);
951   sqr_add_c2(a, 5, 2, c2, c3, c1);
952   sqr_add_c2(a, 4, 3, c2, c3, c1);
953   r[7] = c2;
954   c2 = 0;
955   sqr_add_c(a, 4, c3, c1, c2);
956   sqr_add_c2(a, 5, 3, c3, c1, c2);
957   sqr_add_c2(a, 6, 2, c3, c1, c2);
958   sqr_add_c2(a, 7, 1, c3, c1, c2);
959   r[8] = c3;
960   c3 = 0;
961   sqr_add_c2(a, 7, 2, c1, c2, c3);
962   sqr_add_c2(a, 6, 3, c1, c2, c3);
963   sqr_add_c2(a, 5, 4, c1, c2, c3);
964   r[9] = c1;
965   c1 = 0;
966   sqr_add_c(a, 5, c2, c3, c1);
967   sqr_add_c2(a, 6, 4, c2, c3, c1);
968   sqr_add_c2(a, 7, 3, c2, c3, c1);
969   r[10] = c2;
970   c2 = 0;
971   sqr_add_c2(a, 7, 4, c3, c1, c2);
972   sqr_add_c2(a, 6, 5, c3, c1, c2);
973   r[11] = c3;
974   c3 = 0;
975   sqr_add_c(a, 6, c1, c2, c3);
976   sqr_add_c2(a, 7, 5, c1, c2, c3);
977   r[12] = c1;
978   c1 = 0;
979   sqr_add_c2(a, 7, 6, c2, c3, c1);
980   r[13] = c2;
981   c2 = 0;
982   sqr_add_c(a, 7, c3, c1, c2);
983   r[14] = c3;
984   r[15] = c1;
985 }
986 
bn_sqr_comba4(BN_ULONG * r,const BN_ULONG * a)987 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
988   BN_ULONG c1, c2, c3;
989 
990   c1 = 0;
991   c2 = 0;
992   c3 = 0;
993   sqr_add_c(a, 0, c1, c2, c3);
994   r[0] = c1;
995   c1 = 0;
996   sqr_add_c2(a, 1, 0, c2, c3, c1);
997   r[1] = c2;
998   c2 = 0;
999   sqr_add_c(a, 1, c3, c1, c2);
1000   sqr_add_c2(a, 2, 0, c3, c1, c2);
1001   r[2] = c3;
1002   c3 = 0;
1003   sqr_add_c2(a, 3, 0, c1, c2, c3);
1004   sqr_add_c2(a, 2, 1, c1, c2, c3);
1005   r[3] = c1;
1006   c1 = 0;
1007   sqr_add_c(a, 2, c2, c3, c1);
1008   sqr_add_c2(a, 3, 1, c2, c3, c1);
1009   r[4] = c2;
1010   c2 = 0;
1011   sqr_add_c2(a, 3, 2, c3, c1, c2);
1012   r[5] = c3;
1013   c3 = 0;
1014   sqr_add_c(a, 3, c1, c2, c3);
1015   r[6] = c1;
1016   r[7] = c2;
1017 }
1018 
1019 #endif
1020