1 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
2  * All rights reserved.
3  *
4  * This package is an SSL implementation written
5  * by Eric Young (eay@cryptsoft.com).
6  * The implementation was written so as to conform with Netscapes SSL.
7  *
8  * This library is free for commercial and non-commercial use as long as
9  * the following conditions are aheared to.  The following conditions
10  * apply to all code found in this distribution, be it the RC4, RSA,
11  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
12  * included with this distribution is covered by the same copyright terms
13  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
14  *
15  * Copyright remains Eric Young's, and as such any Copyright notices in
16  * the code are not to be removed.
17  * If this package is used in a product, Eric Young should be given attribution
18  * as the author of the parts of the library used.
19  * This can be in the form of a textual message at program startup or
20  * in documentation (online or textual) provided with the package.
21  *
22  * Redistribution and use in source and binary forms, with or without
23  * modification, are permitted provided that the following conditions
24  * are met:
25  * 1. Redistributions of source code must retain the copyright
26  *    notice, this list of conditions and the following disclaimer.
27  * 2. Redistributions in binary form must reproduce the above copyright
28  *    notice, this list of conditions and the following disclaimer in the
29  *    documentation and/or other materials provided with the distribution.
30  * 3. All advertising materials mentioning features or use of this software
31  *    must display the following acknowledgement:
32  *    "This product includes cryptographic software written by
33  *     Eric Young (eay@cryptsoft.com)"
34  *    The word 'cryptographic' can be left out if the rouines from the library
35  *    being used are not cryptographic related :-).
36  * 4. If you include any Windows specific code (or a derivative thereof) from
37  *    the apps directory (application code) you must include an acknowledgement:
38  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
39  *
40  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  * The licence and distribution terms for any publically available version or
53  * derivative of this code cannot be changed.  i.e. this code cannot simply be
54  * copied and put under another distribution licence
55  * [including the GNU Public Licence.]
56  */
57 /* ====================================================================
58  * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
59  *
60  * Redistribution and use in source and binary forms, with or without
61  * modification, are permitted provided that the following conditions
62  * are met:
63  *
64  * 1. Redistributions of source code must retain the above copyright
65  *    notice, this list of conditions and the following disclaimer.
66  *
67  * 2. Redistributions in binary form must reproduce the above copyright
68  *    notice, this list of conditions and the following disclaimer in
69  *    the documentation and/or other materials provided with the
70  *    distribution.
71  *
72  * 3. All advertising materials mentioning features or use of this
73  *    software must display the following acknowledgment:
74  *    "This product includes software developed by the OpenSSL Project
75  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
76  *
77  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
78  *    endorse or promote products derived from this software without
79  *    prior written permission. For written permission, please contact
80  *    openssl-core@openssl.org.
81  *
82  * 5. Products derived from this software may not be called "OpenSSL"
83  *    nor may "OpenSSL" appear in their names without prior written
84  *    permission of the OpenSSL Project.
85  *
86  * 6. Redistributions of any form whatsoever must retain the following
87  *    acknowledgment:
88  *    "This product includes software developed by the OpenSSL Project
89  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
90  *
91  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
92  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
93  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
94  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
95  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
96  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
97  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
98  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
99  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
100  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
101  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
102  * OF THE POSSIBILITY OF SUCH DAMAGE.
103  * ====================================================================
104  *
105  * This product includes cryptographic software written by Eric Young
106  * (eay@cryptsoft.com).  This product includes software written by Tim
107  * Hudson (tjh@cryptsoft.com). */
108 
109 #include <openssl/bn.h>
110 
111 #include <assert.h>
112 #include <string.h>
113 
114 #include <openssl/cpu.h>
115 #include <openssl/err.h>
116 #include <openssl/mem.h>
117 
118 #include "internal.h"
119 
120 
121 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
122 #define OPENSSL_BN_ASM_MONT5
123 #define RSAZ_ENABLED
124 
125 #include "rsaz_exp.h"
126 
127 void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, const void *table,
128                          const BN_ULONG *np, const BN_ULONG *n0, int num,
129                          int power);
130 void bn_scatter5(const BN_ULONG *inp, size_t num, void *table, size_t power);
131 void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power);
132 void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const void *table,
133                const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
134 int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap,
135                        const BN_ULONG *not_used, const BN_ULONG *np,
136                        const BN_ULONG *n0, int num);
137 #endif
138 
BN_exp(BIGNUM * r,const BIGNUM * a,const BIGNUM * p,BN_CTX * ctx)139 int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
140   int i, bits, ret = 0;
141   BIGNUM *v, *rr;
142 
143   if ((p->flags & BN_FLG_CONSTTIME) != 0) {
144     /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
145     OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
146     return 0;
147   }
148 
149   BN_CTX_start(ctx);
150   if (r == a || r == p) {
151     rr = BN_CTX_get(ctx);
152   } else {
153     rr = r;
154   }
155 
156   v = BN_CTX_get(ctx);
157   if (rr == NULL || v == NULL) {
158     goto err;
159   }
160 
161   if (BN_copy(v, a) == NULL) {
162     goto err;
163   }
164   bits = BN_num_bits(p);
165 
166   if (BN_is_odd(p)) {
167     if (BN_copy(rr, a) == NULL) {
168       goto err;
169     }
170   } else {
171     if (!BN_one(rr)) {
172       goto err;
173     }
174   }
175 
176   for (i = 1; i < bits; i++) {
177     if (!BN_sqr(v, v, ctx)) {
178       goto err;
179     }
180     if (BN_is_bit_set(p, i)) {
181       if (!BN_mul(rr, rr, v, ctx)) {
182         goto err;
183       }
184     }
185   }
186 
187   if (r != rr && !BN_copy(r, rr)) {
188     goto err;
189   }
190   ret = 1;
191 
192 err:
193   BN_CTX_end(ctx);
194   return ret;
195 }
196 
197 /* maximum precomputation table size for *variable* sliding windows */
198 #define TABLE_SIZE 32
199 
200 typedef struct bn_recp_ctx_st {
201   BIGNUM N;  /* the divisor */
202   BIGNUM Nr; /* the reciprocal */
203   int num_bits;
204   int shift;
205   int flags;
206 } BN_RECP_CTX;
207 
BN_RECP_CTX_init(BN_RECP_CTX * recp)208 static void BN_RECP_CTX_init(BN_RECP_CTX *recp) {
209   BN_init(&recp->N);
210   BN_init(&recp->Nr);
211   recp->num_bits = 0;
212   recp->flags = 0;
213 }
214 
BN_RECP_CTX_free(BN_RECP_CTX * recp)215 static void BN_RECP_CTX_free(BN_RECP_CTX *recp) {
216   if (recp == NULL) {
217     return;
218   }
219 
220   BN_free(&recp->N);
221   BN_free(&recp->Nr);
222 }
223 
BN_RECP_CTX_set(BN_RECP_CTX * recp,const BIGNUM * d,BN_CTX * ctx)224 static int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx) {
225   if (!BN_copy(&(recp->N), d)) {
226     return 0;
227   }
228   BN_zero(&recp->Nr);
229   recp->num_bits = BN_num_bits(d);
230   recp->shift = 0;
231 
232   return 1;
233 }
234 
235 /* len is the expected size of the result We actually calculate with an extra
236  * word of precision, so we can do faster division if the remainder is not
237  * required.
238  * r := 2^len / m */
BN_reciprocal(BIGNUM * r,const BIGNUM * m,int len,BN_CTX * ctx)239 static int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx) {
240   int ret = -1;
241   BIGNUM *t;
242 
243   BN_CTX_start(ctx);
244   t = BN_CTX_get(ctx);
245   if (t == NULL) {
246     goto err;
247   }
248 
249   if (!BN_set_bit(t, len)) {
250     goto err;
251   }
252 
253   if (!BN_div(r, NULL, t, m, ctx)) {
254     goto err;
255   }
256 
257   ret = len;
258 
259 err:
260   BN_CTX_end(ctx);
261   return ret;
262 }
263 
BN_div_recp(BIGNUM * dv,BIGNUM * rem,const BIGNUM * m,BN_RECP_CTX * recp,BN_CTX * ctx)264 static int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
265                        BN_RECP_CTX *recp, BN_CTX *ctx) {
266   int i, j, ret = 0;
267   BIGNUM *a, *b, *d, *r;
268 
269   BN_CTX_start(ctx);
270   a = BN_CTX_get(ctx);
271   b = BN_CTX_get(ctx);
272   if (dv != NULL) {
273     d = dv;
274   } else {
275     d = BN_CTX_get(ctx);
276   }
277 
278   if (rem != NULL) {
279     r = rem;
280   } else {
281     r = BN_CTX_get(ctx);
282   }
283 
284   if (a == NULL || b == NULL || d == NULL || r == NULL) {
285     goto err;
286   }
287 
288   if (BN_ucmp(m, &recp->N) < 0) {
289     BN_zero(d);
290     if (!BN_copy(r, m)) {
291       goto err;
292     }
293     BN_CTX_end(ctx);
294     return 1;
295   }
296 
297   /* We want the remainder
298    * Given input of ABCDEF / ab
299    * we need multiply ABCDEF by 3 digests of the reciprocal of ab */
300 
301   /* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */
302   i = BN_num_bits(m);
303   j = recp->num_bits << 1;
304   if (j > i) {
305     i = j;
306   }
307 
308   /* Nr := round(2^i / N) */
309   if (i != recp->shift) {
310     recp->shift =
311         BN_reciprocal(&(recp->Nr), &(recp->N), i,
312                       ctx); /* BN_reciprocal returns i, or -1 for an error */
313   }
314 
315   if (recp->shift == -1) {
316     goto err;
317   }
318 
319   /* d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i -
320    * BN_num_bits(N)))|
321    *    = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i -
322    * BN_num_bits(N)))|
323    *   <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)|
324    *    = |m/N| */
325   if (!BN_rshift(a, m, recp->num_bits)) {
326     goto err;
327   }
328   if (!BN_mul(b, a, &(recp->Nr), ctx)) {
329     goto err;
330   }
331   if (!BN_rshift(d, b, i - recp->num_bits)) {
332     goto err;
333   }
334   d->neg = 0;
335 
336   if (!BN_mul(b, &(recp->N), d, ctx)) {
337     goto err;
338   }
339   if (!BN_usub(r, m, b)) {
340     goto err;
341   }
342   r->neg = 0;
343 
344   j = 0;
345   while (BN_ucmp(r, &(recp->N)) >= 0) {
346     if (j++ > 2) {
347       OPENSSL_PUT_ERROR(BN, BN_R_BAD_RECIPROCAL);
348       goto err;
349     }
350     if (!BN_usub(r, r, &(recp->N))) {
351       goto err;
352     }
353     if (!BN_add_word(d, 1)) {
354       goto err;
355     }
356   }
357 
358   r->neg = BN_is_zero(r) ? 0 : m->neg;
359   d->neg = m->neg ^ recp->N.neg;
360   ret = 1;
361 
362 err:
363   BN_CTX_end(ctx);
364   return ret;
365 }
366 
BN_mod_mul_reciprocal(BIGNUM * r,const BIGNUM * x,const BIGNUM * y,BN_RECP_CTX * recp,BN_CTX * ctx)367 static int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
368                                  BN_RECP_CTX *recp, BN_CTX *ctx) {
369   int ret = 0;
370   BIGNUM *a;
371   const BIGNUM *ca;
372 
373   BN_CTX_start(ctx);
374   a = BN_CTX_get(ctx);
375   if (a == NULL) {
376     goto err;
377   }
378 
379   if (y != NULL) {
380     if (x == y) {
381       if (!BN_sqr(a, x, ctx)) {
382         goto err;
383       }
384     } else {
385       if (!BN_mul(a, x, y, ctx)) {
386         goto err;
387       }
388     }
389     ca = a;
390   } else {
391     ca = x; /* Just do the mod */
392   }
393 
394   ret = BN_div_recp(NULL, r, ca, recp, ctx);
395 
396 err:
397   BN_CTX_end(ctx);
398   return ret;
399 }
400 
401 /* BN_window_bits_for_exponent_size -- macro for sliding window mod_exp
402  * functions
403  *
404  * For window size 'w' (w >= 2) and a random 'b' bits exponent, the number of
405  * multiplications is a constant plus on average
406  *
407  *    2^(w-1) + (b-w)/(w+1);
408  *
409  * here 2^(w-1)  is for precomputing the table (we actually need entries only
410  * for windows that have the lowest bit set), and (b-w)/(w+1)  is an
411  * approximation for the expected number of w-bit windows, not counting the
412  * first one.
413  *
414  * Thus we should use
415  *
416  *    w >= 6  if        b > 671
417  *     w = 5  if  671 > b > 239
418  *     w = 4  if  239 > b >  79
419  *     w = 3  if   79 > b >  23
420  *    w <= 2  if   23 > b
421  *
422  * (with draws in between).  Very small exponents are often selected
423  * with low Hamming weight, so we use  w = 1  for b <= 23. */
424 #define BN_window_bits_for_exponent_size(b) \
425 		((b) > 671 ? 6 : \
426 		 (b) > 239 ? 5 : \
427 		 (b) >  79 ? 4 : \
428 		 (b) >  23 ? 3 : 1)
429 
mod_exp_recp(BIGNUM * r,const BIGNUM * a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx)430 static int mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
431                         const BIGNUM *m, BN_CTX *ctx) {
432   int i, j, bits, ret = 0, wstart, window;
433   int start = 1;
434   BIGNUM *aa;
435   /* Table of variables obtained from 'ctx' */
436   BIGNUM *val[TABLE_SIZE];
437   BN_RECP_CTX recp;
438 
439   if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
440     /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
441     OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
442     return 0;
443   }
444 
445   bits = BN_num_bits(p);
446 
447   if (bits == 0) {
448     /* x**0 mod 1 is still zero. */
449     if (BN_is_one(m)) {
450       BN_zero(r);
451       return 1;
452     }
453     return BN_one(r);
454   }
455 
456   BN_CTX_start(ctx);
457   aa = BN_CTX_get(ctx);
458   val[0] = BN_CTX_get(ctx);
459   if (!aa || !val[0]) {
460     goto err;
461   }
462 
463   BN_RECP_CTX_init(&recp);
464   if (m->neg) {
465     /* ignore sign of 'm' */
466     if (!BN_copy(aa, m)) {
467       goto err;
468     }
469     aa->neg = 0;
470     if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0) {
471       goto err;
472     }
473   } else {
474     if (BN_RECP_CTX_set(&recp, m, ctx) <= 0) {
475       goto err;
476     }
477   }
478 
479   if (!BN_nnmod(val[0], a, m, ctx)) {
480     goto err; /* 1 */
481   }
482   if (BN_is_zero(val[0])) {
483     BN_zero(r);
484     ret = 1;
485     goto err;
486   }
487 
488   window = BN_window_bits_for_exponent_size(bits);
489   if (window > 1) {
490     if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx)) {
491       goto err; /* 2 */
492     }
493     j = 1 << (window - 1);
494     for (i = 1; i < j; i++) {
495       if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
496           !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx)) {
497         goto err;
498       }
499     }
500   }
501 
502   start = 1; /* This is used to avoid multiplication etc
503               * when there is only the value '1' in the
504               * buffer. */
505   wstart = bits - 1; /* The top bit of the window */
506 
507   if (!BN_one(r)) {
508     goto err;
509   }
510 
511   for (;;) {
512     int wvalue; /* The 'value' of the window */
513     int wend; /* The bottom bit of the window */
514 
515     if (BN_is_bit_set(p, wstart) == 0) {
516       if (!start) {
517         if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) {
518           goto err;
519         }
520       }
521       if (wstart == 0) {
522         break;
523       }
524       wstart--;
525       continue;
526     }
527 
528     /* We now have wstart on a 'set' bit, we now need to work out
529      * how bit a window to do.  To do this we need to scan
530      * forward until the last set bit before the end of the
531      * window */
532     wvalue = 1;
533     wend = 0;
534     for (i = 1; i < window; i++) {
535       if (wstart - i < 0) {
536         break;
537       }
538       if (BN_is_bit_set(p, wstart - i)) {
539         wvalue <<= (i - wend);
540         wvalue |= 1;
541         wend = i;
542       }
543     }
544 
545     /* wend is the size of the current window */
546     j = wend + 1;
547     /* add the 'bytes above' */
548     if (!start) {
549       for (i = 0; i < j; i++) {
550         if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) {
551           goto err;
552         }
553       }
554     }
555 
556     /* wvalue will be an odd number < 2^window */
557     if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx)) {
558       goto err;
559     }
560 
561     /* move the 'window' down further */
562     wstart -= wend + 1;
563     start = 0;
564     if (wstart < 0) {
565       break;
566     }
567   }
568   ret = 1;
569 
570 err:
571   BN_CTX_end(ctx);
572   BN_RECP_CTX_free(&recp);
573   return ret;
574 }
575 
BN_mod_exp(BIGNUM * r,const BIGNUM * a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx)576 int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
577                BN_CTX *ctx) {
578   /* For even modulus  m = 2^k*m_odd,  it might make sense to compute
579    * a^p mod m_odd  and  a^p mod 2^k  separately (with Montgomery
580    * exponentiation for the odd part), using appropriate exponent
581    * reductions, and combine the results using the CRT.
582    *
583    * For now, we use Montgomery only if the modulus is odd; otherwise,
584    * exponentiation using the reciprocal-based quick remaindering
585    * algorithm is used.
586    *
587    * (Timing obtained with expspeed.c [computations  a^p mod m
588    * where  a, p, m  are of the same length: 256, 512, 1024, 2048,
589    * 4096, 8192 bits], compared to the running time of the
590    * standard algorithm:
591    *
592    *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]
593    *                     55 .. 77 %  [UltraSparc processor, but
594    *                                  debug-solaris-sparcv8-gcc conf.]
595    *
596    *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]
597    *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
598    *
599    * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
600    * at 2048 and more bits, but at 512 and 1024 bits, it was
601    * slower even than the standard algorithm!
602    *
603    * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
604    * should be obtained when the new Montgomery reduction code
605    * has been integrated into OpenSSL.) */
606 
607   if (BN_is_odd(m)) {
608     if (a->top == 1 && !a->neg && BN_get_flags(p, BN_FLG_CONSTTIME) == 0) {
609       BN_ULONG A = a->d[0];
610       return BN_mod_exp_mont_word(r, A, p, m, ctx, NULL);
611     }
612 
613     return BN_mod_exp_mont(r, a, p, m, ctx, NULL);
614   }
615 
616   return mod_exp_recp(r, a, p, m, ctx);
617 }
618 
BN_mod_exp_mont(BIGNUM * rr,const BIGNUM * a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx,const BN_MONT_CTX * mont)619 int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
620                     const BIGNUM *m, BN_CTX *ctx, const BN_MONT_CTX *mont) {
621   int i, j, bits, ret = 0, wstart, window;
622   int start = 1;
623   BIGNUM *d, *r;
624   const BIGNUM *aa;
625   /* Table of variables obtained from 'ctx' */
626   BIGNUM *val[TABLE_SIZE];
627   BN_MONT_CTX *new_mont = NULL;
628 
629   if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
630     return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, mont);
631   }
632 
633   if (!BN_is_odd(m)) {
634     OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
635     return 0;
636   }
637   bits = BN_num_bits(p);
638   if (bits == 0) {
639     /* x**0 mod 1 is still zero. */
640     if (BN_is_one(m)) {
641       BN_zero(rr);
642       return 1;
643     }
644     return BN_one(rr);
645   }
646 
647   BN_CTX_start(ctx);
648   d = BN_CTX_get(ctx);
649   r = BN_CTX_get(ctx);
650   val[0] = BN_CTX_get(ctx);
651   if (!d || !r || !val[0]) {
652     goto err;
653   }
654 
655   /* Allocate a montgomery context if it was not supplied by the caller. */
656   if (mont == NULL) {
657     new_mont = BN_MONT_CTX_new();
658     if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
659       goto err;
660     }
661     mont = new_mont;
662   }
663 
664   if (a->neg || BN_ucmp(a, m) >= 0) {
665     if (!BN_nnmod(val[0], a, m, ctx)) {
666       goto err;
667     }
668     aa = val[0];
669   } else {
670     aa = a;
671   }
672 
673   if (BN_is_zero(aa)) {
674     BN_zero(rr);
675     ret = 1;
676     goto err;
677   }
678   if (!BN_to_montgomery(val[0], aa, mont, ctx)) {
679     goto err; /* 1 */
680   }
681 
682   window = BN_window_bits_for_exponent_size(bits);
683   if (window > 1) {
684     if (!BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) {
685       goto err; /* 2 */
686     }
687     j = 1 << (window - 1);
688     for (i = 1; i < j; i++) {
689       if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
690           !BN_mod_mul_montgomery(val[i], val[i - 1], d, mont, ctx)) {
691         goto err;
692       }
693     }
694   }
695 
696   start = 1; /* This is used to avoid multiplication etc
697               * when there is only the value '1' in the
698               * buffer. */
699   wstart = bits - 1; /* The top bit of the window */
700 
701   j = m->top; /* borrow j */
702   if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
703     if (bn_wexpand(r, j) == NULL) {
704       goto err;
705     }
706     /* 2^(top*BN_BITS2) - m */
707     r->d[0] = (0 - m->d[0]) & BN_MASK2;
708     for (i = 1; i < j; i++) {
709       r->d[i] = (~m->d[i]) & BN_MASK2;
710     }
711     r->top = j;
712     /* Upper words will be zero if the corresponding words of 'm'
713      * were 0xfff[...], so decrement r->top accordingly. */
714     bn_correct_top(r);
715   } else if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) {
716     goto err;
717   }
718 
719   for (;;) {
720     int wvalue; /* The 'value' of the window */
721     int wend; /* The bottom bit of the window */
722 
723     if (BN_is_bit_set(p, wstart) == 0) {
724       if (!start && !BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
725         goto err;
726       }
727       if (wstart == 0) {
728         break;
729       }
730       wstart--;
731       continue;
732     }
733 
734     /* We now have wstart on a 'set' bit, we now need to work out how bit a
735      * window to do.  To do this we need to scan forward until the last set bit
736      * before the end of the window */
737     wvalue = 1;
738     wend = 0;
739     for (i = 1; i < window; i++) {
740       if (wstart - i < 0) {
741         break;
742       }
743       if (BN_is_bit_set(p, wstart - i)) {
744         wvalue <<= (i - wend);
745         wvalue |= 1;
746         wend = i;
747       }
748     }
749 
750     /* wend is the size of the current window */
751     j = wend + 1;
752     /* add the 'bytes above' */
753     if (!start) {
754       for (i = 0; i < j; i++) {
755         if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
756           goto err;
757         }
758       }
759     }
760 
761     /* wvalue will be an odd number < 2^window */
762     if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) {
763       goto err;
764     }
765 
766     /* move the 'window' down further */
767     wstart -= wend + 1;
768     start = 0;
769     if (wstart < 0) {
770       break;
771     }
772   }
773 
774   if (!BN_from_montgomery(rr, r, mont, ctx)) {
775     goto err;
776   }
777   ret = 1;
778 
779 err:
780   BN_MONT_CTX_free(new_mont);
781   BN_CTX_end(ctx);
782   return ret;
783 }
784 
785 /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific
786  * layout so that accessing any of these table values shows the same access
787  * pattern as far as cache lines are concerned. The following functions are
788  * used to transfer a BIGNUM from/to that table. */
copy_to_prebuf(const BIGNUM * b,int top,unsigned char * buf,int idx,int width)789 static int copy_to_prebuf(const BIGNUM *b, int top, unsigned char *buf, int idx,
790                           int width) {
791   size_t i, j;
792 
793   if (top > b->top) {
794     top = b->top; /* this works because 'buf' is explicitly zeroed */
795   }
796   for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
797     buf[j] = ((unsigned char *)b->d)[i];
798   }
799 
800   return 1;
801 }
802 
copy_from_prebuf(BIGNUM * b,int top,unsigned char * buf,int idx,int width)803 static int copy_from_prebuf(BIGNUM *b, int top, unsigned char *buf, int idx,
804                             int width) {
805   size_t i, j;
806 
807   if (bn_wexpand(b, top) == NULL) {
808     return 0;
809   }
810 
811   for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
812     ((unsigned char *)b->d)[i] = buf[j];
813   }
814 
815   b->top = top;
816   bn_correct_top(b);
817   return 1;
818 }
819 
820 /* BN_mod_exp_mont_conttime is based on the assumption that the L1 data cache
821  * line width of the target processor is at least the following value. */
822 #define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH (64)
823 #define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK \
824   (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
825 
826 /* Window sizes optimized for fixed window size modular exponentiation
827  * algorithm (BN_mod_exp_mont_consttime).
828  *
829  * To achieve the security goals of BN_mode_exp_mont_consttime, the maximum
830  * size of the window must not exceed
831  * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH).
832  *
833  * Window size thresholds are defined for cache line sizes of 32 and 64, cache
834  * line sizes where log_2(32)=5 and log_2(64)=6 respectively. A window size of
835  * 7 should only be used on processors that have a 128 byte or greater cache
836  * line size. */
837 #if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
838 
839 #define BN_window_bits_for_ctime_exponent_size(b) \
840   ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1)
841 #define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
842 
843 #elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
844 
845 #define BN_window_bits_for_ctime_exponent_size(b) \
846   ((b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1)
847 #define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
848 
849 #endif
850 
851 /* Given a pointer value, compute the next address that is a cache line
852  * multiple. */
853 #define MOD_EXP_CTIME_ALIGN(x_)          \
854   ((unsigned char *)(x_) +               \
855    (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - \
856     (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
857 
858 /* This variant of BN_mod_exp_mont() uses fixed windows and the special
859  * precomputation memory layout to limit data-dependency to a minimum
860  * to protect secret exponents (cf. the hyper-threading timing attacks
861  * pointed out by Colin Percival,
862  * http://www.daemonology.net/hyperthreading-considered-harmful/)
863  */
BN_mod_exp_mont_consttime(BIGNUM * rr,const BIGNUM * a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx,const BN_MONT_CTX * mont)864 int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
865                               const BIGNUM *m, BN_CTX *ctx,
866                               const BN_MONT_CTX *mont) {
867   int i, bits, ret = 0, window, wvalue;
868   int top;
869   BN_MONT_CTX *new_mont = NULL;
870 
871   int numPowers;
872   unsigned char *powerbufFree = NULL;
873   int powerbufLen = 0;
874   unsigned char *powerbuf = NULL;
875   BIGNUM tmp, am;
876 
877   if (!BN_is_odd(m)) {
878     OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
879     return 0;
880   }
881 
882   top = m->top;
883 
884   bits = BN_num_bits(p);
885   if (bits == 0) {
886     /* x**0 mod 1 is still zero. */
887     if (BN_is_one(m)) {
888       BN_zero(rr);
889       return 1;
890     }
891     return BN_one(rr);
892   }
893 
894   BN_CTX_start(ctx);
895 
896   /* Allocate a montgomery context if it was not supplied by the caller. */
897   if (mont == NULL) {
898     new_mont = BN_MONT_CTX_new();
899     if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
900       goto err;
901     }
902     mont = new_mont;
903   }
904 
905 #ifdef RSAZ_ENABLED
906   /* If the size of the operands allow it, perform the optimized
907    * RSAZ exponentiation. For further information see
908    * crypto/bn/rsaz_exp.c and accompanying assembly modules. */
909   if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024) &&
910       rsaz_avx2_eligible()) {
911     if (NULL == bn_wexpand(rr, 16)) {
912       goto err;
913     }
914     RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0]);
915     rr->top = 16;
916     rr->neg = 0;
917     bn_correct_top(rr);
918     ret = 1;
919     goto err;
920   } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) {
921     if (NULL == bn_wexpand(rr, 8)) {
922       goto err;
923     }
924     RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);
925     rr->top = 8;
926     rr->neg = 0;
927     bn_correct_top(rr);
928     ret = 1;
929     goto err;
930   }
931 #endif
932 
933   /* Get the window size to use with size of p. */
934   window = BN_window_bits_for_ctime_exponent_size(bits);
935 #if defined(OPENSSL_BN_ASM_MONT5)
936   if (window >= 5) {
937     window = 5; /* ~5% improvement for RSA2048 sign, and even for RSA4096 */
938     if ((top & 7) == 0) {
939       powerbufLen += 2 * top * sizeof(m->d[0]);
940     }
941   }
942 #endif
943 
944   /* Allocate a buffer large enough to hold all of the pre-computed
945    * powers of am, am itself and tmp.
946    */
947   numPowers = 1 << window;
948   powerbufLen +=
949       sizeof(m->d[0]) *
950       (top * numPowers + ((2 * top) > numPowers ? (2 * top) : numPowers));
951 #ifdef alloca
952   if (powerbufLen < 3072) {
953     powerbufFree = alloca(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
954   } else
955 #endif
956   {
957     if ((powerbufFree = (unsigned char *)OPENSSL_malloc(
958             powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL) {
959       goto err;
960     }
961   }
962 
963   powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
964   memset(powerbuf, 0, powerbufLen);
965 
966 #ifdef alloca
967   if (powerbufLen < 3072) {
968     powerbufFree = NULL;
969   }
970 #endif
971 
972   /* lay down tmp and am right after powers table */
973   tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
974   am.d = tmp.d + top;
975   tmp.top = am.top = 0;
976   tmp.dmax = am.dmax = top;
977   tmp.neg = am.neg = 0;
978   tmp.flags = am.flags = BN_FLG_STATIC_DATA;
979 
980 /* prepare a^0 in Montgomery domain */
981 /* by Shay Gueron's suggestion */
982   if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
983     /* 2^(top*BN_BITS2) - m */
984     tmp.d[0] = (0 - m->d[0]) & BN_MASK2;
985     for (i = 1; i < top; i++) {
986       tmp.d[i] = (~m->d[i]) & BN_MASK2;
987     }
988     tmp.top = top;
989   } else if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx)) {
990     goto err;
991   }
992 
993   /* prepare a^1 in Montgomery domain */
994   if (a->neg || BN_ucmp(a, m) >= 0) {
995     if (!BN_mod(&am, a, m, ctx) ||
996         !BN_to_montgomery(&am, &am, mont, ctx)) {
997       goto err;
998     }
999   } else if (!BN_to_montgomery(&am, a, mont, ctx)) {
1000     goto err;
1001   }
1002 
1003 #if defined(OPENSSL_BN_ASM_MONT5)
1004   /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
1005    * specifically optimization of cache-timing attack countermeasures
1006    * and pre-computation optimization. */
1007 
1008   /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
1009    * 512-bit RSA is hardly relevant, we omit it to spare size... */
1010   if (window == 5 && top > 1) {
1011     const BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
1012 
1013     /* BN_to_montgomery can contaminate words above .top
1014      * [in BN_DEBUG[_DEBUG] build]... */
1015     for (i = am.top; i < top; i++) {
1016       am.d[i] = 0;
1017     }
1018     for (i = tmp.top; i < top; i++) {
1019       tmp.d[i] = 0;
1020     }
1021 
1022     if (top & 7) {
1023       np2 = np;
1024     } else {
1025       BN_ULONG *np_double = am.d + top;
1026       for (i = 0; i < top; i++) {
1027         np_double[2 * i] = np[i];
1028       }
1029       np2 = np_double;
1030     }
1031 
1032     bn_scatter5(tmp.d, top, powerbuf, 0);
1033     bn_scatter5(am.d, am.top, powerbuf, 1);
1034     bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
1035     bn_scatter5(tmp.d, top, powerbuf, 2);
1036 
1037     /* same as above, but uses squaring for 1/2 of operations */
1038     for (i = 4; i < 32; i *= 2) {
1039       bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1040       bn_scatter5(tmp.d, top, powerbuf, i);
1041     }
1042     for (i = 3; i < 8; i += 2) {
1043       int j;
1044       bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
1045       bn_scatter5(tmp.d, top, powerbuf, i);
1046       for (j = 2 * i; j < 32; j *= 2) {
1047         bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1048         bn_scatter5(tmp.d, top, powerbuf, j);
1049       }
1050     }
1051     for (; i < 16; i += 2) {
1052       bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
1053       bn_scatter5(tmp.d, top, powerbuf, i);
1054       bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1055       bn_scatter5(tmp.d, top, powerbuf, 2 * i);
1056     }
1057     for (; i < 32; i += 2) {
1058       bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
1059       bn_scatter5(tmp.d, top, powerbuf, i);
1060     }
1061 
1062     bits--;
1063     for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) {
1064       wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
1065     }
1066     bn_gather5(tmp.d, top, powerbuf, wvalue);
1067 
1068     /* At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit
1069      * that has not been read yet.) */
1070     assert(bits >= -1 && (bits == -1 || bits % 5 == 4));
1071 
1072     /* Scan the exponent one window at a time starting from the most
1073      * significant bits.
1074      */
1075     if (top & 7) {
1076       while (bits >= 0) {
1077         for (wvalue = 0, i = 0; i < 5; i++, bits--) {
1078           wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
1079         }
1080 
1081         bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1082         bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1083         bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1084         bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1085         bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1086         bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
1087       }
1088     } else {
1089       const uint8_t *p_bytes = (const uint8_t *)p->d;
1090       int max_bits = p->top * BN_BITS2;
1091       assert(bits < max_bits);
1092       /* |p = 0| has been handled as a special case, so |max_bits| is at least
1093        * one word. */
1094       assert(max_bits >= 64);
1095 
1096       /* If the first bit to be read lands in the last byte, unroll the first
1097        * iteration to avoid reading past the bounds of |p->d|. (After the first
1098        * iteration, we are guaranteed to be past the last byte.) Note |bits|
1099        * here is the top bit, inclusive. */
1100       if (bits - 4 >= max_bits - 8) {
1101         /* Read five bits from |bits-4| through |bits|, inclusive. */
1102         wvalue = p_bytes[p->top * BN_BYTES - 1];
1103         wvalue >>= (bits - 4) & 7;
1104         wvalue &= 0x1f;
1105         bits -= 5;
1106         bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
1107       }
1108       while (bits >= 0) {
1109         /* Read five bits from |bits-4| through |bits|, inclusive. */
1110         int first_bit = bits - 4;
1111         wvalue = *(const uint16_t *) (p_bytes + (first_bit >> 3));
1112         wvalue >>= first_bit & 7;
1113         wvalue &= 0x1f;
1114         bits -= 5;
1115         bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
1116       }
1117     }
1118 
1119     ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
1120     tmp.top = top;
1121     bn_correct_top(&tmp);
1122     if (ret) {
1123       if (!BN_copy(rr, &tmp)) {
1124         ret = 0;
1125       }
1126       goto err; /* non-zero ret means it's not error */
1127     }
1128   } else
1129 #endif
1130   {
1131     if (!copy_to_prebuf(&tmp, top, powerbuf, 0, numPowers) ||
1132         !copy_to_prebuf(&am, top, powerbuf, 1, numPowers)) {
1133       goto err;
1134     }
1135 
1136     /* If the window size is greater than 1, then calculate
1137      * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
1138      * (even powers could instead be computed as (a^(i/2))^2
1139      * to use the slight performance advantage of sqr over mul).
1140      */
1141     if (window > 1) {
1142       if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx) ||
1143           !copy_to_prebuf(&tmp, top, powerbuf, 2, numPowers)) {
1144         goto err;
1145       }
1146       for (i = 3; i < numPowers; i++) {
1147         /* Calculate a^i = a^(i-1) * a */
1148         if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx) ||
1149             !copy_to_prebuf(&tmp, top, powerbuf, i, numPowers)) {
1150           goto err;
1151         }
1152       }
1153     }
1154 
1155     bits--;
1156     for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) {
1157       wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
1158     }
1159     if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, numPowers)) {
1160       goto err;
1161     }
1162 
1163     /* Scan the exponent one window at a time starting from the most
1164      * significant bits.
1165      */
1166     while (bits >= 0) {
1167       wvalue = 0; /* The 'value' of the window */
1168 
1169       /* Scan the window, squaring the result as we go */
1170       for (i = 0; i < window; i++, bits--) {
1171         if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx)) {
1172           goto err;
1173         }
1174         wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
1175       }
1176 
1177       /* Fetch the appropriate pre-computed value from the pre-buf */
1178       if (!copy_from_prebuf(&am, top, powerbuf, wvalue, numPowers)) {
1179         goto err;
1180       }
1181 
1182       /* Multiply the result into the intermediate result */
1183       if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx)) {
1184         goto err;
1185       }
1186     }
1187   }
1188 
1189   /* Convert the final result from montgomery to standard format */
1190   if (!BN_from_montgomery(rr, &tmp, mont, ctx)) {
1191     goto err;
1192   }
1193   ret = 1;
1194 
1195 err:
1196   BN_MONT_CTX_free(new_mont);
1197   if (powerbuf != NULL) {
1198     OPENSSL_cleanse(powerbuf, powerbufLen);
1199     OPENSSL_free(powerbufFree);
1200   }
1201   BN_CTX_end(ctx);
1202   return (ret);
1203 }
1204 
BN_mod_exp_mont_word(BIGNUM * rr,BN_ULONG a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx,const BN_MONT_CTX * mont)1205 int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
1206                          const BIGNUM *m, BN_CTX *ctx,
1207                          const BN_MONT_CTX *mont) {
1208   BN_MONT_CTX *new_mont = NULL;
1209   int b, bits, ret = 0;
1210   int r_is_one;
1211   BN_ULONG w, next_w;
1212   BIGNUM *d, *r, *t;
1213   BIGNUM *swap_tmp;
1214 #define BN_MOD_MUL_WORD(r, w, m)   \
1215   (BN_mul_word(r, (w)) &&          \
1216    (/* BN_ucmp(r, (m)) < 0 ? 1 :*/ \
1217     (BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
1218   /* BN_MOD_MUL_WORD is only used with 'w' large, so the BN_ucmp test is
1219    * probably more overhead than always using BN_mod (which uses BN_copy if a
1220    * similar test returns true). We can use BN_mod and do not need BN_nnmod
1221    * because our accumulator is never negative (the result of BN_mod does not
1222    * depend on the sign of the modulus). */
1223 #define BN_TO_MONTGOMERY_WORD(r, w, mont) \
1224   (BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
1225 
1226   if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
1227     /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
1228     OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
1229     return 0;
1230   }
1231 
1232   if (!BN_is_odd(m)) {
1233     OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
1234     return 0;
1235   }
1236 
1237   if (m->top == 1) {
1238     a %= m->d[0]; /* make sure that 'a' is reduced */
1239   }
1240 
1241   bits = BN_num_bits(p);
1242   if (bits == 0) {
1243     /* x**0 mod 1 is still zero. */
1244     if (BN_is_one(m)) {
1245       BN_zero(rr);
1246       return 1;
1247     }
1248     return BN_one(rr);
1249   }
1250   if (a == 0) {
1251     BN_zero(rr);
1252     return 1;
1253   }
1254 
1255   BN_CTX_start(ctx);
1256   d = BN_CTX_get(ctx);
1257   r = BN_CTX_get(ctx);
1258   t = BN_CTX_get(ctx);
1259   if (d == NULL || r == NULL || t == NULL) {
1260     goto err;
1261   }
1262 
1263   /* Allocate a montgomery context if it was not supplied by the caller. */
1264   if (mont == NULL) {
1265     new_mont = BN_MONT_CTX_new();
1266     if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
1267       goto err;
1268     }
1269     mont = new_mont;
1270   }
1271 
1272   r_is_one = 1; /* except for Montgomery factor */
1273 
1274   /* bits-1 >= 0 */
1275 
1276   /* The result is accumulated in the product r*w. */
1277   w = a; /* bit 'bits-1' of 'p' is always set */
1278   for (b = bits - 2; b >= 0; b--) {
1279     /* First, square r*w. */
1280     next_w = w * w;
1281     if ((next_w / w) != w) {
1282       /* overflow */
1283       if (r_is_one) {
1284         if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) {
1285           goto err;
1286         }
1287         r_is_one = 0;
1288       } else {
1289         if (!BN_MOD_MUL_WORD(r, w, m)) {
1290           goto err;
1291         }
1292       }
1293       next_w = 1;
1294     }
1295 
1296     w = next_w;
1297     if (!r_is_one) {
1298       if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
1299         goto err;
1300       }
1301     }
1302 
1303     /* Second, multiply r*w by 'a' if exponent bit is set. */
1304     if (BN_is_bit_set(p, b)) {
1305       next_w = w * a;
1306       if ((next_w / a) != w) {
1307         /* overflow */
1308         if (r_is_one) {
1309           if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) {
1310             goto err;
1311           }
1312           r_is_one = 0;
1313         } else {
1314           if (!BN_MOD_MUL_WORD(r, w, m)) {
1315             goto err;
1316           }
1317         }
1318         next_w = a;
1319       }
1320       w = next_w;
1321     }
1322   }
1323 
1324   /* Finally, set r:=r*w. */
1325   if (w != 1) {
1326     if (r_is_one) {
1327       if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) {
1328         goto err;
1329       }
1330       r_is_one = 0;
1331     } else {
1332       if (!BN_MOD_MUL_WORD(r, w, m)) {
1333         goto err;
1334       }
1335     }
1336   }
1337 
1338   if (r_is_one) {
1339     /* can happen only if a == 1*/
1340     if (!BN_one(rr)) {
1341       goto err;
1342     }
1343   } else {
1344     if (!BN_from_montgomery(rr, r, mont, ctx)) {
1345       goto err;
1346     }
1347   }
1348   ret = 1;
1349 
1350 err:
1351   BN_MONT_CTX_free(new_mont);
1352   BN_CTX_end(ctx);
1353   return ret;
1354 }
1355 
1356 #define TABLE_SIZE 32
1357 
BN_mod_exp2_mont(BIGNUM * rr,const BIGNUM * a1,const BIGNUM * p1,const BIGNUM * a2,const BIGNUM * p2,const BIGNUM * m,BN_CTX * ctx,const BN_MONT_CTX * mont)1358 int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
1359                      const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
1360                      BN_CTX *ctx, const BN_MONT_CTX *mont) {
1361   int i, j, bits, b, bits1, bits2, ret = 0, wpos1, wpos2, window1, window2,
1362                                    wvalue1, wvalue2;
1363   int r_is_one = 1;
1364   BIGNUM *d, *r;
1365   const BIGNUM *a_mod_m;
1366   /* Tables of variables obtained from 'ctx' */
1367   BIGNUM *val1[TABLE_SIZE], *val2[TABLE_SIZE];
1368   BN_MONT_CTX *new_mont = NULL;
1369 
1370   if (!(m->d[0] & 1)) {
1371     OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
1372     return 0;
1373   }
1374   bits1 = BN_num_bits(p1);
1375   bits2 = BN_num_bits(p2);
1376   if (bits1 == 0 && bits2 == 0) {
1377     ret = BN_one(rr);
1378     return ret;
1379   }
1380 
1381   bits = (bits1 > bits2) ? bits1 : bits2;
1382 
1383   BN_CTX_start(ctx);
1384   d = BN_CTX_get(ctx);
1385   r = BN_CTX_get(ctx);
1386   val1[0] = BN_CTX_get(ctx);
1387   val2[0] = BN_CTX_get(ctx);
1388   if (!d || !r || !val1[0] || !val2[0]) {
1389     goto err;
1390   }
1391 
1392   /* Allocate a montgomery context if it was not supplied by the caller. */
1393   if (mont == NULL) {
1394     new_mont = BN_MONT_CTX_new();
1395     if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
1396       goto err;
1397     }
1398     mont = new_mont;
1399   }
1400 
1401   window1 = BN_window_bits_for_exponent_size(bits1);
1402   window2 = BN_window_bits_for_exponent_size(bits2);
1403 
1404   /* Build table for a1:   val1[i] := a1^(2*i + 1) mod m  for i = 0 ..
1405    * 2^(window1-1) */
1406   if (a1->neg || BN_ucmp(a1, m) >= 0) {
1407     if (!BN_mod(val1[0], a1, m, ctx)) {
1408       goto err;
1409     }
1410     a_mod_m = val1[0];
1411   } else {
1412     a_mod_m = a1;
1413   }
1414 
1415   if (BN_is_zero(a_mod_m)) {
1416     BN_zero(rr);
1417     ret = 1;
1418     goto err;
1419   }
1420 
1421   if (!BN_to_montgomery(val1[0], a_mod_m, mont, ctx)) {
1422     goto err;
1423   }
1424 
1425   if (window1 > 1) {
1426     if (!BN_mod_mul_montgomery(d, val1[0], val1[0], mont, ctx)) {
1427       goto err;
1428     }
1429 
1430     j = 1 << (window1 - 1);
1431     for (i = 1; i < j; i++) {
1432       if (((val1[i] = BN_CTX_get(ctx)) == NULL) ||
1433           !BN_mod_mul_montgomery(val1[i], val1[i - 1], d, mont, ctx)) {
1434         goto err;
1435       }
1436     }
1437   }
1438 
1439   /* Build table for a2:   val2[i] := a2^(2*i + 1) mod m  for i = 0 ..
1440    * 2^(window2-1) */
1441   if (a2->neg || BN_ucmp(a2, m) >= 0) {
1442     if (!BN_mod(val2[0], a2, m, ctx)) {
1443       goto err;
1444     }
1445     a_mod_m = val2[0];
1446   } else {
1447     a_mod_m = a2;
1448   }
1449 
1450   if (BN_is_zero(a_mod_m)) {
1451     BN_zero(rr);
1452     ret = 1;
1453     goto err;
1454   }
1455 
1456   if (!BN_to_montgomery(val2[0], a_mod_m, mont, ctx)) {
1457     goto err;
1458   }
1459 
1460   if (window2 > 1) {
1461     if (!BN_mod_mul_montgomery(d, val2[0], val2[0], mont, ctx)) {
1462       goto err;
1463     }
1464 
1465     j = 1 << (window2 - 1);
1466     for (i = 1; i < j; i++) {
1467       if (((val2[i] = BN_CTX_get(ctx)) == NULL) ||
1468           !BN_mod_mul_montgomery(val2[i], val2[i - 1], d, mont, ctx)) {
1469         goto err;
1470       }
1471     }
1472   }
1473 
1474   /* Now compute the power product, using independent windows. */
1475   r_is_one = 1;
1476   wvalue1 = 0; /* The 'value' of the first window */
1477   wvalue2 = 0; /* The 'value' of the second window */
1478   wpos1 = 0;   /* If wvalue1 > 0, the bottom bit of the first window */
1479   wpos2 = 0;   /* If wvalue2 > 0, the bottom bit of the second window */
1480 
1481   if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) {
1482     goto err;
1483   }
1484 
1485   for (b = bits - 1; b >= 0; b--) {
1486     if (!r_is_one) {
1487       if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
1488         goto err;
1489       }
1490     }
1491 
1492     if (!wvalue1 && BN_is_bit_set(p1, b)) {
1493       /* consider bits b-window1+1 .. b for this window */
1494       i = b - window1 + 1;
1495       /* works for i<0 */
1496       while (!BN_is_bit_set(p1, i)) {
1497         i++;
1498       }
1499       wpos1 = i;
1500       wvalue1 = 1;
1501       for (i = b - 1; i >= wpos1; i--) {
1502         wvalue1 <<= 1;
1503         if (BN_is_bit_set(p1, i)) {
1504           wvalue1++;
1505         }
1506       }
1507     }
1508 
1509     if (!wvalue2 && BN_is_bit_set(p2, b)) {
1510       /* consider bits b-window2+1 .. b for this window */
1511       i = b - window2 + 1;
1512       while (!BN_is_bit_set(p2, i)) {
1513         i++;
1514       }
1515       wpos2 = i;
1516       wvalue2 = 1;
1517       for (i = b - 1; i >= wpos2; i--) {
1518         wvalue2 <<= 1;
1519         if (BN_is_bit_set(p2, i)) {
1520           wvalue2++;
1521         }
1522       }
1523     }
1524 
1525     if (wvalue1 && b == wpos1) {
1526       /* wvalue1 is odd and < 2^window1 */
1527       if (!BN_mod_mul_montgomery(r, r, val1[wvalue1 >> 1], mont, ctx)) {
1528         goto err;
1529       }
1530       wvalue1 = 0;
1531       r_is_one = 0;
1532     }
1533 
1534     if (wvalue2 && b == wpos2) {
1535       /* wvalue2 is odd and < 2^window2 */
1536       if (!BN_mod_mul_montgomery(r, r, val2[wvalue2 >> 1], mont, ctx)) {
1537         goto err;
1538       }
1539       wvalue2 = 0;
1540       r_is_one = 0;
1541     }
1542   }
1543 
1544   if (!BN_from_montgomery(rr, r, mont, ctx)) {
1545     goto err;
1546   }
1547   ret = 1;
1548 
1549 err:
1550   BN_MONT_CTX_free(new_mont);
1551   BN_CTX_end(ctx);
1552   return ret;
1553 }
1554