1 /* ====================================================================
2  * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ==================================================================== */
48 
49 #include <openssl/base.h>
50 
51 #include <assert.h>
52 #include <string.h>
53 
54 #include <openssl/mem.h>
55 #include <openssl/cpu.h>
56 
57 #include "internal.h"
58 #include "../../internal.h"
59 
60 
// PACK moves the 16-bit constant |s| into the most significant 16 bits of a
// |size_t|, whatever the width of |size_t| is.
#define PACK(s) ((size_t)(s) << (8 * sizeof(size_t) - 16))

// REDUCE1BIT shifts the 128-bit value |V| (a struct with 64-bit |hi| and |lo|
// fields) right by one bit. If the bit shifted out of |lo| was set, 0xe1 is
// XORed into the most significant byte of |hi|. Both arms produce the same
// result; the second one builds the mask with 32-bit arithmetic for targets
// where |size_t| is not eight bytes wide.
#define REDUCE1BIT(V)                                                  \
  do {                                                                 \
    if (sizeof(size_t) == 8) {                                         \
      const uint64_t reduce_mask_ =                                    \
          (0 - ((V).lo & 1)) & UINT64_C(0xe100000000000000);           \
      (V).lo = ((V).lo >> 1) | ((V).hi << 63);                         \
      (V).hi = reduce_mask_ ^ ((V).hi >> 1);                           \
    } else {                                                           \
      const uint32_t reduce_mask_ =                                    \
          (0 - (uint32_t)((V).lo & 1)) & 0xe1000000U;                  \
      (V).lo = ((V).lo >> 1) | ((V).hi << 63);                         \
      (V).hi = ((uint64_t)reduce_mask_ << 32) ^ ((V).hi >> 1);         \
    }                                                                  \
  } while (0)
74 
// kSizeTWithoutLower4Bits has every bit of a |size_t| set except the lowest
// four. ANDing a length with it rounds the length down to a multiple of 16.
static const size_t kSizeTWithoutLower4Bits = ~(size_t)0xf;
78 
// gcm_init_4bit fills the 16-entry table |Htable| from the hash key |H|, where
// |H[0]| is the high and |H[1]| the low 64-bit half.
//
// Entry 0 is zero and entry 8 is |H| itself. Entries 4, 2 and 1 are obtained by
// applying REDUCE1BIT one, two and three times to |H|. Every other entry is the
// XOR of the power-of-two entries selected by the bits of its index.
void gcm_init_4bit(u128 Htable[16], const uint64_t H[2]) {
  Htable[0].hi = 0;
  Htable[0].lo = 0;

  u128 cur;
  cur.hi = H[0];
  cur.lo = H[1];

  // Single-bit indices: 8, then 4, 2, 1, each one REDUCE1BIT step further.
  Htable[8] = cur;
  for (int idx = 4; idx > 0; idx >>= 1) {
    REDUCE1BIT(cur);
    Htable[idx] = cur;
  }

  // Composite indices: Htable[base + k] = Htable[base] ^ Htable[k] for every
  // k below the power of two |base|. Entries below |base| are already final
  // by the time they are read.
  for (int base = 2; base < 16; base <<= 1) {
    const u128 top = Htable[base];
    for (int k = 1; k < base; ++k) {
      Htable[base + k].hi = top.hi ^ Htable[k].hi;
      Htable[base + k].lo = top.lo ^ Htable[k].lo;
    }
  }

#if defined(GHASH_ASM) && defined(OPENSSL_ARM)
  // With the ARM assembly enabled, each entry is stored with its two halves
  // exchanged.
  for (int entry = 0; entry < 16; ++entry) {
    const u128 saved = Htable[entry];
    Htable[entry].hi = saved.lo;
    Htable[entry].lo = saved.hi;
  }
#endif
}
116 
117 #if !defined(GHASH_ASM) || defined(OPENSSL_AARCH64) || defined(OPENSSL_PPC64LE)
118 static const size_t rem_4bit[16] = {
119     PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
120     PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
121     PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
122     PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)};
123 
gcm_gmult_4bit(uint64_t Xi[2],const u128 Htable[16])124 void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]) {
125   u128 Z;
126   int cnt = 15;
127   size_t rem, nlo, nhi;
128 
129   nlo = ((const uint8_t *)Xi)[15];
130   nhi = nlo >> 4;
131   nlo &= 0xf;
132 
133   Z.hi = Htable[nlo].hi;
134   Z.lo = Htable[nlo].lo;
135 
136   while (1) {
137     rem = (size_t)Z.lo & 0xf;
138     Z.lo = (Z.hi << 60) | (Z.lo >> 4);
139     Z.hi = (Z.hi >> 4);
140     if (sizeof(size_t) == 8) {
141       Z.hi ^= rem_4bit[rem];
142     } else {
143       Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
144     }
145 
146     Z.hi ^= Htable[nhi].hi;
147     Z.lo ^= Htable[nhi].lo;
148 
149     if (--cnt < 0) {
150       break;
151     }
152 
153     nlo = ((const uint8_t *)Xi)[cnt];
154     nhi = nlo >> 4;
155     nlo &= 0xf;
156 
157     rem = (size_t)Z.lo & 0xf;
158     Z.lo = (Z.hi << 60) | (Z.lo >> 4);
159     Z.hi = (Z.hi >> 4);
160     if (sizeof(size_t) == 8) {
161       Z.hi ^= rem_4bit[rem];
162     } else {
163       Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
164     }
165 
166     Z.hi ^= Htable[nlo].hi;
167     Z.lo ^= Htable[nlo].lo;
168   }
169 
170   Xi[0] = CRYPTO_bswap8(Z.hi);
171   Xi[1] = CRYPTO_bswap8(Z.lo);
172 }
173 
174 // Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
175 // details... Compiler-generated code doesn't seem to give any
176 // performance improvement, at least not on x86[_64]. It's here
177 // mostly as reference and a placeholder for possible future
178 // non-trivial optimization[s]...
gcm_ghash_4bit(uint64_t Xi[2],const u128 Htable[16],const uint8_t * inp,size_t len)179 void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
180                     size_t len) {
181   u128 Z;
182   int cnt;
183   size_t rem, nlo, nhi;
184 
185   do {
186     cnt = 15;
187     nlo = ((const uint8_t *)Xi)[15];
188     nlo ^= inp[15];
189     nhi = nlo >> 4;
190     nlo &= 0xf;
191 
192     Z.hi = Htable[nlo].hi;
193     Z.lo = Htable[nlo].lo;
194 
195     while (1) {
196       rem = (size_t)Z.lo & 0xf;
197       Z.lo = (Z.hi << 60) | (Z.lo >> 4);
198       Z.hi = (Z.hi >> 4);
199       if (sizeof(size_t) == 8) {
200         Z.hi ^= rem_4bit[rem];
201       } else {
202         Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
203       }
204 
205       Z.hi ^= Htable[nhi].hi;
206       Z.lo ^= Htable[nhi].lo;
207 
208       if (--cnt < 0) {
209         break;
210       }
211 
212       nlo = ((const uint8_t *)Xi)[cnt];
213       nlo ^= inp[cnt];
214       nhi = nlo >> 4;
215       nlo &= 0xf;
216 
217       rem = (size_t)Z.lo & 0xf;
218       Z.lo = (Z.hi << 60) | (Z.lo >> 4);
219       Z.hi = (Z.hi >> 4);
220       if (sizeof(size_t) == 8) {
221         Z.hi ^= rem_4bit[rem];
222       } else {
223         Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
224       }
225 
226       Z.hi ^= Htable[nlo].hi;
227       Z.lo ^= Htable[nlo].lo;
228     }
229 
230     Xi[0] = CRYPTO_bswap8(Z.hi);
231     Xi[1] = CRYPTO_bswap8(Z.lo);
232   } while (inp += 16, len -= 16);
233 }
234 #endif   // !GHASH_ASM || AARCH64 || PPC64LE
235 
// GCM_MUL runs the single-block multiply on the |field| member of |ctx| (for
// example |Xi| or |Yi|) using the key's table.
#define GCM_MUL(ctx, field) \
  gcm_gmult_4bit((ctx)->field.u, (ctx)->gcm_key.Htable)

// GHASH folds |data_len| bytes at |data| into |ctx->Xi|.
#define GHASH(ctx, data, data_len) \
  gcm_ghash_4bit((ctx)->Xi.u, (ctx)->gcm_key.Htable, data, data_len)

// GHASH_CHUNK is a stride parameter meant to mitigate cache thrashing: the
// idea is to hash data while it is still in L1 cache after the encryption
// pass.
#define GHASH_CHUNK (3 * 1024)
243 
244 #if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
// gcm_init_ssse3 builds the table consumed by the SSSE3 GHASH code. It starts
// from the table produced by |gcm_init_4bit| and then rearranges it in place.
void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) {
  gcm_init_4bit(Htable, Xi);

  // Exchange the halves of every entry. The 4-bit code keeps |hi| first and
  // handles the two fields independently, so their order is irrelevant there,
  // but ghash-ssse3 loads a whole entry into one 128-bit register.
  for (u128 *entry = Htable; entry != Htable + 16; ++entry) {
    const uint64_t old_lo = entry->lo;
    entry->lo = entry->hi;
    entry->hi = old_lo;
  }

  // View |Htable| as a 16x16 matrix of bytes and transpose it, so that
  // Htable[i] contains the i'th byte of j*H for all j. Each off-diagonal pair
  // is exchanged exactly once.
  uint8_t *cells = (uint8_t *)Htable;
  for (int col = 0; col < 16; ++col) {
    for (int row = col + 1; row < 16; ++row) {
      uint8_t *below = &cells[16 * row + col];
      uint8_t *above = &cells[16 * col + row];
      const uint8_t saved = *below;
      *below = *above;
      *above = saved;
    }
  }
}
269 #endif  // GHASH_ASM_X86_64 || GHASH_ASM_X86
270 
#ifdef GCM_FUNCREF_4BIT
// When GCM_FUNCREF_4BIT is defined, GCM_MUL and GHASH call through the
// function pointers |gcm_gmult_p| and |gcm_ghash_p|, which each user of the
// macros declares locally.
#undef GHASH
#undef GCM_MUL
#define GCM_MUL(ctx, field) \
  (*gcm_gmult_p)((ctx)->field.u, (ctx)->gcm_key.Htable)
#define GHASH(ctx, data, data_len) \
  (*gcm_ghash_p)((ctx)->Xi.u, (ctx)->gcm_key.Htable, data, data_len)
#endif  // GCM_FUNCREF_4BIT
278 
// CRYPTO_ghash_init prepares GHASH for the 16-byte hash key |gcm_key|.
//
// It writes the key, with each 64-bit half byte-swapped, to |*out_key|, fills
// |out_table| with the table for the selected implementation, and stores that
// implementation's multiply and hash functions in |*out_mult| and |*out_hash|.
// |*out_is_avx| is set to one only when the AVX implementation is chosen and
// to zero otherwise.
void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
                       u128 *out_key, u128 out_table[16], int *out_is_avx,
                       const uint8_t gcm_key[16]) {
  *out_is_avx = 0;

  union {
    uint64_t u[2];
    uint8_t c[16];
  } hkey;

  OPENSSL_memcpy(hkey.c, gcm_key, 16);

  // The key is kept in host byte order.
  for (size_t i = 0; i < 2; ++i) {
    hkey.u[i] = CRYPTO_bswap8(hkey.u[i]);
  }

  OPENSSL_memcpy(out_key, hkey.c, 16);

#if defined(GHASH_ASM_X86_64)
  if (crypto_gcm_clmul_enabled()) {
    // AVX+MOVBE
    const int has_avx_movbe =
        ((OPENSSL_ia32cap_get()[1] >> 22) & 0x41) == 0x41;
    if (has_avx_movbe) {
      gcm_init_avx(out_table, hkey.u);
      *out_mult = gcm_gmult_avx;
      *out_hash = gcm_ghash_avx;
      *out_is_avx = 1;
    } else {
      gcm_init_clmul(out_table, hkey.u);
      *out_mult = gcm_gmult_clmul;
      *out_hash = gcm_ghash_clmul;
    }
    return;
  }
  if (gcm_ssse3_capable()) {
    gcm_init_ssse3(out_table, hkey.u);
    *out_mult = gcm_gmult_ssse3;
    *out_hash = gcm_ghash_ssse3;
    return;
  }
#elif defined(GHASH_ASM_X86)
  if (crypto_gcm_clmul_enabled()) {
    gcm_init_clmul(out_table, hkey.u);
    *out_mult = gcm_gmult_clmul;
    *out_hash = gcm_ghash_clmul;
    return;
  }
  if (gcm_ssse3_capable()) {
    gcm_init_ssse3(out_table, hkey.u);
    *out_mult = gcm_gmult_ssse3;
    *out_hash = gcm_ghash_ssse3;
    return;
  }
#elif defined(GHASH_ASM_ARM)
  if (gcm_pmull_capable()) {
    gcm_init_v8(out_table, hkey.u);
    *out_mult = gcm_gmult_v8;
    *out_hash = gcm_ghash_v8;
    return;
  }

  if (gcm_neon_capable()) {
    gcm_init_neon(out_table, hkey.u);
    *out_mult = gcm_gmult_neon;
    *out_hash = gcm_ghash_neon;
    return;
  }
#elif defined(GHASH_ASM_PPC64LE)
  if (CRYPTO_is_PPC64LE_vcrypto_capable()) {
    gcm_init_p8(out_table, hkey.u);
    *out_mult = gcm_gmult_p8;
    *out_hash = gcm_ghash_p8;
    return;
  }
#endif

  // No accelerated implementation was selected: use the 4-bit table code.
  gcm_init_4bit(out_table, hkey.u);
#if defined(GHASH_ASM_X86)
  *out_mult = gcm_gmult_4bit_mmx;
  *out_hash = gcm_ghash_4bit_mmx;
#else
  *out_mult = gcm_gmult_4bit;
  *out_hash = gcm_ghash_4bit;
#endif
}
362 
// CRYPTO_gcm128_init_key zeroes |*gcm_key| and initialises it for use with the
// block function |block| and its key |aes_key|.
//
// The GHASH key is the result of applying |block| to an all-zero 16-byte
// block. |use_aesni_gcm_crypt| is set to one only when
// |CRYPTO_ghash_init| reports the AVX implementation and |block_is_hwaes| is
// non-zero.
void CRYPTO_gcm128_init_key(GCM128_KEY *gcm_key, const AES_KEY *aes_key,
                            block128_f block, int block_is_hwaes) {
  OPENSSL_memset(gcm_key, 0, sizeof(*gcm_key));
  gcm_key->block = block;

  // Encrypt the zero block in place to derive the hash key.
  uint8_t ghash_key[16] = {0};
  (*block)(ghash_key, ghash_key, aes_key);

  int is_avx;
  CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, &gcm_key->H,
                    gcm_key->Htable, &is_avx, ghash_key);

  gcm_key->use_aesni_gcm_crypt = (is_avx != 0 && block_is_hwaes != 0);
}
378 
// CRYPTO_gcm128_setiv resets the per-message state of |ctx| and sets the IV to
// the |len| bytes at |iv|.
//
// A 12-byte IV is copied into the counter block directly with the counter set
// to one. Any other length is hashed into the counter block, followed by the
// IV length in bits. The initial counter block is then encrypted with |key|
// into |ctx->EK0| and the counter is advanced by one.
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key,
                         const uint8_t *iv, size_t len) {
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
#endif

  // Clear everything left over from a previous message.
  ctx->ares = 0;
  ctx->mres = 0;
  ctx->len.u[0] = 0;  // AAD length
  ctx->len.u[1] = 0;  // message length
  ctx->Xi.u[0] = 0;
  ctx->Xi.u[1] = 0;
  ctx->Yi.u[0] = 0;
  ctx->Yi.u[1] = 0;

  uint32_t ctr;
  if (len != 12) {
    const uint64_t iv_bits = (uint64_t)len << 3;

    // Full 16-byte blocks of the IV.
    for (; len >= 16; iv += 16, len -= 16) {
      for (size_t i = 0; i < 16; ++i) {
        ctx->Yi.c[i] ^= iv[i];
      }
      GCM_MUL(ctx, Yi);
    }

    // Trailing partial block, if any.
    if (len != 0) {
      for (size_t i = 0; i < len; ++i) {
        ctx->Yi.c[i] ^= iv[i];
      }
      GCM_MUL(ctx, Yi);
    }

    // Final block carries the IV length in bits.
    ctx->Yi.u[1] ^= CRYPTO_bswap8(iv_bits);
    GCM_MUL(ctx, Yi);

    ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
  } else {
    OPENSSL_memcpy(ctx->Yi.c, iv, 12);
    ctx->Yi.c[15] = 1;
    ctr = 1;
  }

  (*ctx->gcm_key.block)(ctx->Yi.c, ctx->EK0.c, key);
  ++ctr;
  ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
}
428 
// CRYPTO_gcm128_aad hashes |len| bytes of additional authenticated data from
// |aad| into |ctx|. It may be called repeatedly; a trailing partial block is
// left XORed into |ctx->Xi| and its size recorded in |ctx->ares|.
//
// It returns one on success. It returns zero, without hashing anything, if
// message data has already been processed or if the total AAD length would
// exceed 2^61 bytes.
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
#endif

  // AAD must come before any message data.
  if (ctx->len.u[1]) {
    return 0;
  }

  const uint64_t total = ctx->len.u[0] + len;
  if ((sizeof(len) == 8 && total < len) || total > (UINT64_C(1) << 61)) {
    return 0;
  }
  ctx->len.u[0] = total;

  unsigned n = ctx->ares;
  if (n) {
    // Top up the partial block left by an earlier call.
    for (; n != 0 && len != 0; --len) {
      ctx->Xi.c[n] ^= *(aad++);
      n = (n + 1) % 16;
    }
    if (n != 0) {
      // Input ran out before the block was complete.
      ctx->ares = n;
      return 1;
    }
    GCM_MUL(ctx, Xi);
  }

  // Process a whole number of blocks.
  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    GHASH(ctx, aad, len_blocks);
    aad += len_blocks;
    len -= len_blocks;
  }

  // Process the remainder: XOR it in now, multiply once the block completes.
  if (len != 0) {
    for (size_t i = 0; i < len; ++i) {
      ctx->Xi.c[i] ^= aad[i];
    }
    n = (unsigned int)len;
  }

  ctx->ares = n;
  return 1;
}
483 
// CRYPTO_gcm128_encrypt encrypts |len| bytes from |in| to |out| using the
// block function stored in |ctx| with |key|, and hashes the resulting
// ciphertext into |ctx->Xi|. It may be called repeatedly; the offset within a
// partially used keystream block is kept in |ctx->mres|.
//
// It returns one on success and zero, without processing anything, if the
// total message length would exceed 2^36 - 32 bytes.
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
                          const uint8_t *in, uint8_t *out, size_t len) {
  block128_f block = ctx->gcm_key.block;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif

  const uint64_t total = ctx->len.u[1] + len;
  if ((sizeof(len) == 8 && total < len) ||
      total > ((UINT64_C(1) << 36) - 32)) {
    return 0;
  }
  ctx->len.u[1] = total;

  if (ctx->ares) {
    // First call to encrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  unsigned n = ctx->mres;
  if (n) {
    // Use up the keystream left over from the previous call.
    for (; n != 0 && len != 0; --len) {
      const uint8_t ct = (uint8_t)(*(in++) ^ ctx->EKi.c[n]);
      *(out++) = ct;
      ctx->Xi.c[n] ^= ct;
      n = (n + 1) % 16;
    }
    if (n != 0) {
      ctx->mres = n;
      return 1;
    }
    GCM_MUL(ctx, Xi);
  }

  uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);

  // Encrypt one GHASH_CHUNK at a time, hashing each chunk of ciphertext right
  // after it has been produced.
  for (; len >= GHASH_CHUNK; len -= GHASH_CHUNK) {
    for (size_t done = 0; done < GHASH_CHUNK; done += 16) {
      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      for (size_t off = 0; off < 16; off += sizeof(size_t)) {
        const size_t ks = ctx->EKi.t[off / sizeof(size_t)];
        store_word_le(out + off, load_word_le(in + off) ^ ks);
      }
      in += 16;
      out += 16;
    }
    GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
  }

  // Remaining whole blocks, hashed together afterwards.
  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    for (; len >= 16; len -= 16) {
      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      for (size_t off = 0; off < 16; off += sizeof(size_t)) {
        const size_t ks = ctx->EKi.t[off / sizeof(size_t)];
        store_word_le(out + off, load_word_le(in + off) ^ ks);
      }
      in += 16;
      out += 16;
    }
    GHASH(ctx, out - len_blocks, len_blocks);
  }

  // Trailing partial block: generate one more keystream block and use its
  // first |len| bytes. |n| is zero on entry here.
  if (len) {
    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    do {
      const uint8_t ct = (uint8_t)(in[n] ^ ctx->EKi.c[n]);
      out[n] = ct;
      ctx->Xi.c[n] ^= ct;
      ++n;
    } while (--len);
  }

  ctx->mres = n;
  return 1;
}
570 
// CRYPTO_gcm128_decrypt decrypts |len| bytes from |in| to |out| using the
// block function stored in |ctx| with |key|, and hashes the ciphertext read
// from |in| into |ctx->Xi|. It may be called repeatedly; the offset within a
// partially used keystream block is kept in |ctx->mres|.
//
// It returns one on success and zero, without processing anything, if the
// total message length would exceed 2^36 - 32 bytes.
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
                          const unsigned char *in, unsigned char *out,
                          size_t len) {
  block128_f block = ctx->gcm_key.block;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif

  const uint64_t total = ctx->len.u[1] + len;
  if ((sizeof(len) == 8 && total < len) ||
      total > ((UINT64_C(1) << 36) - 32)) {
    return 0;
  }
  ctx->len.u[1] = total;

  if (ctx->ares) {
    // First call to decrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  unsigned n = ctx->mres;
  if (n) {
    // Use up the keystream left over from the previous call. The ciphertext
    // byte is read before |out| is written so that in-place use works.
    for (; n != 0 && len != 0; --len) {
      const uint8_t ct = *(in++);
      *(out++) = ct ^ ctx->EKi.c[n];
      ctx->Xi.c[n] ^= ct;
      n = (n + 1) % 16;
    }
    if (n != 0) {
      ctx->mres = n;
      return 1;
    }
    GCM_MUL(ctx, Xi);
  }

  uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);

  // One GHASH_CHUNK at a time: hash the ciphertext first, then decrypt it.
  for (; len >= GHASH_CHUNK; len -= GHASH_CHUNK) {
    GHASH(ctx, in, GHASH_CHUNK);
    for (size_t done = 0; done < GHASH_CHUNK; done += 16) {
      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      for (size_t off = 0; off < 16; off += sizeof(size_t)) {
        const size_t ks = ctx->EKi.t[off / sizeof(size_t)];
        store_word_le(out + off, load_word_le(in + off) ^ ks);
      }
      in += 16;
      out += 16;
    }
  }

  // Remaining whole blocks, hashed together before being decrypted.
  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    GHASH(ctx, in, len_blocks);
    for (; len >= 16; len -= 16) {
      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      for (size_t off = 0; off < 16; off += sizeof(size_t)) {
        const size_t ks = ctx->EKi.t[off / sizeof(size_t)];
        store_word_le(out + off, load_word_le(in + off) ^ ks);
      }
      in += 16;
      out += 16;
    }
  }

  // Trailing partial block: generate one more keystream block and use its
  // first |len| bytes. |n| is zero on entry here.
  if (len) {
    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    do {
      const uint8_t ct = in[n];
      ctx->Xi.c[n] ^= ct;
      out[n] = ct ^ ctx->EKi.c[n];
      ++n;
    } while (--len);
  }

  ctx->mres = n;
  return 1;
}
662 
// CRYPTO_gcm128_encrypt_ctr32 behaves like |CRYPTO_gcm128_encrypt| but
// encrypts whole blocks with the counter-mode function |stream| rather than
// one block at a time. When AESNI_GCM is defined and the key has
// |use_aesni_gcm_crypt| set, |aesni_gcm_encrypt| is given the input first and
// whatever it leaves is handled by the generic path.
//
// It returns one on success and zero, without processing anything, if the
// total message length would exceed 2^36 - 32 bytes.
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif

  const uint64_t total = ctx->len.u[1] + len;
  if ((sizeof(len) == 8 && total < len) ||
      total > ((UINT64_C(1) << 36) - 32)) {
    return 0;
  }
  ctx->len.u[1] = total;

  if (ctx->ares) {
    // First call to encrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  unsigned n = ctx->mres;
  if (n) {
    // Use up the keystream left over from the previous call.
    for (; n != 0 && len != 0; --len) {
      const uint8_t ct = (uint8_t)(*(in++) ^ ctx->EKi.c[n]);
      *(out++) = ct;
      ctx->Xi.c[n] ^= ct;
      n = (n + 1) % 16;
    }
    if (n != 0) {
      ctx->mres = n;
      return 1;
    }
    GCM_MUL(ctx, Xi);
  }

#if defined(AESNI_GCM)
  if (ctx->gcm_key.use_aesni_gcm_crypt) {
    // |aesni_gcm_encrypt| may not process all the input given to it. It may
    // not process *any* of its input if it is deemed too small.
    const size_t bulk =
        aesni_gcm_encrypt(in, out, len, key, ctx->Yi.c, ctx->Xi.u);
    in += bulk;
    out += bulk;
    len -= bulk;
  }
#endif

  uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);

  // Encrypt a GHASH_CHUNK, then hash the ciphertext just written.
  for (; len >= GHASH_CHUNK; len -= GHASH_CHUNK) {
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
    ctr += GHASH_CHUNK / 16;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    GHASH(ctx, out, GHASH_CHUNK);
    in += GHASH_CHUNK;
    out += GHASH_CHUNK;
  }

  // Remaining whole blocks in a single call.
  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    const size_t num_blocks = len_blocks / 16;

    (*stream)(in, out, num_blocks, key, ctx->Yi.c);
    ctr += (unsigned int)num_blocks;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    GHASH(ctx, out, len_blocks);
    in += len_blocks;
    out += len_blocks;
    len -= len_blocks;
  }

  // Trailing partial block: generate one keystream block with the block
  // function and use its first |len| bytes. |n| is zero on entry here.
  if (len) {
    (*ctx->gcm_key.block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    do {
      const uint8_t ct = (uint8_t)(in[n] ^ ctx->EKi.c[n]);
      out[n] = ct;
      ctx->Xi.c[n] ^= ct;
      ++n;
    } while (--len);
  }

  ctx->mres = n;
  return 1;
}
747 
// CRYPTO_gcm128_decrypt_ctr32 behaves like |CRYPTO_gcm128_decrypt| but
// decrypts whole blocks with the counter-mode function |stream| rather than
// one block at a time. When AESNI_GCM is defined and the key has
// |use_aesni_gcm_crypt| set, |aesni_gcm_decrypt| is given the input first and
// whatever it leaves is handled by the generic path.
//
// It returns one on success and zero, without processing anything, if the
// total message length would exceed 2^36 - 32 bytes.
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif

  const uint64_t total = ctx->len.u[1] + len;
  if ((sizeof(len) == 8 && total < len) ||
      total > ((UINT64_C(1) << 36) - 32)) {
    return 0;
  }
  ctx->len.u[1] = total;

  if (ctx->ares) {
    // First call to decrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  unsigned n = ctx->mres;
  if (n) {
    // Use up the keystream left over from the previous call. The ciphertext
    // byte is read before |out| is written so that in-place use works.
    for (; n != 0 && len != 0; --len) {
      const uint8_t ct = *(in++);
      *(out++) = ct ^ ctx->EKi.c[n];
      ctx->Xi.c[n] ^= ct;
      n = (n + 1) % 16;
    }
    if (n != 0) {
      ctx->mres = n;
      return 1;
    }
    GCM_MUL(ctx, Xi);
  }

#if defined(AESNI_GCM)
  if (ctx->gcm_key.use_aesni_gcm_crypt) {
    // |aesni_gcm_decrypt| may not process all the input given to it. It may
    // not process *any* of its input if it is deemed too small.
    const size_t bulk =
        aesni_gcm_decrypt(in, out, len, key, ctx->Yi.c, ctx->Xi.u);
    in += bulk;
    out += bulk;
    len -= bulk;
  }
#endif

  uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);

  // Hash a GHASH_CHUNK of ciphertext, then decrypt it.
  for (; len >= GHASH_CHUNK; len -= GHASH_CHUNK) {
    GHASH(ctx, in, GHASH_CHUNK);
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
    ctr += GHASH_CHUNK / 16;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    in += GHASH_CHUNK;
    out += GHASH_CHUNK;
  }

  // Remaining whole blocks in a single call.
  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    const size_t num_blocks = len_blocks / 16;

    GHASH(ctx, in, len_blocks);
    (*stream)(in, out, num_blocks, key, ctx->Yi.c);
    ctr += (unsigned int)num_blocks;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    in += len_blocks;
    out += len_blocks;
    len -= len_blocks;
  }

  // Trailing partial block: generate one keystream block with the block
  // function and use its first |len| bytes. |n| is zero on entry here.
  if (len) {
    (*ctx->gcm_key.block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    do {
      const uint8_t ct = in[n];
      ctx->Xi.c[n] ^= ct;
      out[n] = ct ^ ctx->EKi.c[n];
      ++n;
    } while (--len);
  }

  ctx->mres = n;
  return 1;
}
836 
// CRYPTO_gcm128_finish completes the GHASH computation in |ctx|, leaving the
// authentication tag in |ctx->Xi|.
//
// If |tag| is non-NULL and |len| is at most the size of |ctx->Xi|, it returns
// one when the first |len| bytes of the computed tag equal |tag| under
// |CRYPTO_memcmp|, and zero when they differ. In every other case it returns
// zero; the tag is still left in |ctx->Xi|.
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) {
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
#endif

  // Flush a pending partial AAD or message block.
  if (ctx->mres || ctx->ares) {
    GCM_MUL(ctx, Xi);
  }

  // Mix in the AAD and message lengths, converted from bytes to bits.
  const uint64_t aad_bits = ctx->len.u[0] << 3;
  const uint64_t msg_bits = ctx->len.u[1] << 3;
  ctx->Xi.u[0] ^= CRYPTO_bswap8(aad_bits);
  ctx->Xi.u[1] ^= CRYPTO_bswap8(msg_bits);
  GCM_MUL(ctx, Xi);

  // Mask with the encrypted initial counter block.
  ctx->Xi.u[0] ^= ctx->EK0.u[0];
  ctx->Xi.u[1] ^= ctx->EK0.u[1];

  if (tag == NULL || len > sizeof(ctx->Xi)) {
    return 0;
  }
  return CRYPTO_memcmp(ctx->Xi.c, tag, len) == 0;
}
860 
// CRYPTO_gcm128_tag finalises |ctx| with |CRYPTO_gcm128_finish| and copies the
// resulting tag to |tag|. At most |sizeof(ctx->Xi.c)| bytes are written; a
// smaller |len| truncates the tag.
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) {
  CRYPTO_gcm128_finish(ctx, NULL, 0);

  size_t copy_len = sizeof(ctx->Xi.c);
  if (len <= copy_len) {
    copy_len = len;
  }
  OPENSSL_memcpy(tag, ctx->Xi.c, copy_len);
}
866 
867 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
// crypto_gcm_clmul_enabled returns one if both the FXSR and PCLMULQDQ bits are
// set in the capability words from |OPENSSL_ia32cap_get|, and zero otherwise.
// Without GHASH_ASM it always returns zero.
int crypto_gcm_clmul_enabled(void) {
#ifdef GHASH_ASM
  const uint32_t *caps = OPENSSL_ia32cap_get();
  const int has_fxsr = (caps[0] & (1 << 24)) != 0;    // check FXSR bit
  const int has_pclmul = (caps[1] & (1 << 1)) != 0;   // check PCLMULQDQ bit
  return has_fxsr && has_pclmul;
#else
  return 0;
#endif
}
877 #endif
878