/* ====================================================================
 * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ==================================================================== */

#include <openssl/base.h>

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include <openssl/mem.h>
#include <openssl/cpu.h>

#include "internal.h"
#include "../../internal.h"

#if !defined(OPENSSL_NO_ASM) &&                         \
    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
     defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) || \
     defined(OPENSSL_PPC64LE))
#define GHASH_ASM
#endif

#define PACK(s) ((size_t)(s) << (sizeof(size_t) * 8 - 16))
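// REDUCE1BIT multiplies the bit-reversed GF(2^128) element |V| by x: the
// coefficients shift one position (a right shift of the byte string) and, if a
// set bit falls off the end, the result is reduced by the GCM polynomial
// x^128 + x^7 + x^2 + x + 1, whose reflected low terms are the 0xe1 constant
// folded into the top bits. The 0 - (bit) expression builds an all-ones or
// all-zero mask so the reduction is branchless.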
#define REDUCE1BIT(V)                                                 \
  do {                                                                \
    if (sizeof(size_t) == 8) {                                        \
      uint64_t T = UINT64_C(0xe100000000000000) & (0 - ((V).lo & 1)); \
      (V).lo = ((V).hi << 63) | ((V).lo >> 1);                        \
      (V).hi = ((V).hi >> 1) ^ T;                                     \
    } else {                                                          \
      uint32_t T = 0xe1000000U & (0 - (uint32_t)((V).lo & 1));        \
      (V).lo = ((V).hi << 63) | ((V).lo >> 1);                        \
      (V).hi = ((V).hi >> 1) ^ ((uint64_t)T << 32);                   \
    }                                                                 \
  } while (0)

// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
// bits of a |size_t|.
static const size_t kSizeTWithoutLower4Bits = (size_t) -16;

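// gcm_init_4bit expands the hash key |H| into a 16-entry table for the 4-bit
// windowed GHASH below. In GCM's bit-reversed convention, Htable[8] = H,
// Htable[4] = H*x, Htable[2] = H*x^2 and Htable[1] = H*x^3; the remaining
// entries are XOR combinations, so Htable[i ^ j] = Htable[i] ^ Htable[j].
// gcm_gmult_4bit and gcm_ghash_4bit then process Xi a nibble at a time,
// looking each nibble up in this table.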
static void gcm_init_4bit(u128 Htable[16], uint64_t H[2]) {
  u128 V;

  Htable[0].hi = 0;
  Htable[0].lo = 0;
  V.hi = H[0];
  V.lo = H[1];

  Htable[8] = V;
  REDUCE1BIT(V);
  Htable[4] = V;
  REDUCE1BIT(V);
  Htable[2] = V;
  REDUCE1BIT(V);
  Htable[1] = V;
  Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
  V = Htable[4];
  Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
  Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
  Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
  V = Htable[8];
  Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
  Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
  Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
  Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
  Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
  Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
  Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;

#if defined(GHASH_ASM) && defined(OPENSSL_ARM)
  for (int j = 0; j < 16; ++j) {
    V = Htable[j];
    Htable[j].hi = V.lo;
    Htable[j].lo = V.hi;
  }
#endif
}

#if !defined(GHASH_ASM) || defined(OPENSSL_AARCH64) || defined(OPENSSL_PPC64LE)
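// rem_4bit[r] is the GHASH reduction value for the four bits |r| shifted out
// of Z on each 4-bit step, pre-shifted so it can be XORed straight into the
// top of Z. PACK places the 16-bit constant in the top bits of a size_t so the
// same table serves 32- and 64-bit builds.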
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)};

static void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]) {
  u128 Z;
  int cnt = 15;
  size_t rem, nlo, nhi;

  nlo = ((const uint8_t *)Xi)[15];
  nhi = nlo >> 4;
  nlo &= 0xf;

  Z.hi = Htable[nlo].hi;
  Z.lo = Htable[nlo].lo;

  while (1) {
    rem = (size_t)Z.lo & 0xf;
    Z.lo = (Z.hi << 60) | (Z.lo >> 4);
    Z.hi = (Z.hi >> 4);
    if (sizeof(size_t) == 8) {
      Z.hi ^= rem_4bit[rem];
    } else {
      Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
    }

    Z.hi ^= Htable[nhi].hi;
    Z.lo ^= Htable[nhi].lo;

    if (--cnt < 0) {
      break;
    }

    nlo = ((const uint8_t *)Xi)[cnt];
    nhi = nlo >> 4;
    nlo &= 0xf;

    rem = (size_t)Z.lo & 0xf;
    Z.lo = (Z.hi << 60) | (Z.lo >> 4);
    Z.hi = (Z.hi >> 4);
    if (sizeof(size_t) == 8) {
      Z.hi ^= rem_4bit[rem];
    } else {
      Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
    }

    Z.hi ^= Htable[nlo].hi;
    Z.lo ^= Htable[nlo].lo;
  }

  Xi[0] = CRYPTO_bswap8(Z.hi);
  Xi[1] = CRYPTO_bswap8(Z.lo);
}

// Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt for
// details. Compiler-generated code doesn't seem to give any performance
// improvement, at least not on x86[_64]. It's here mostly as a reference and a
// placeholder for possible future non-trivial optimizations.
static void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16],
                           const uint8_t *inp, size_t len) {
  u128 Z;
  int cnt;
  size_t rem, nlo, nhi;

  do {
    cnt = 15;
    nlo = ((const uint8_t *)Xi)[15];
    nlo ^= inp[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
      rem = (size_t)Z.lo & 0xf;
      Z.lo = (Z.hi << 60) | (Z.lo >> 4);
      Z.hi = (Z.hi >> 4);
      if (sizeof(size_t) == 8) {
        Z.hi ^= rem_4bit[rem];
      } else {
        Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
      }

      Z.hi ^= Htable[nhi].hi;
      Z.lo ^= Htable[nhi].lo;

      if (--cnt < 0) {
        break;
      }

      nlo = ((const uint8_t *)Xi)[cnt];
      nlo ^= inp[cnt];
      nhi = nlo >> 4;
      nlo &= 0xf;

      rem = (size_t)Z.lo & 0xf;
      Z.lo = (Z.hi << 60) | (Z.lo >> 4);
      Z.hi = (Z.hi >> 4);
      if (sizeof(size_t) == 8) {
        Z.hi ^= rem_4bit[rem];
      } else {
        Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
      }

      Z.hi ^= Htable[nlo].hi;
      Z.lo ^= Htable[nlo].lo;
    }

    Xi[0] = CRYPTO_bswap8(Z.hi);
    Xi[1] = CRYPTO_bswap8(Z.lo);
  } while (inp += 16, len -= 16);
}
#else  // GHASH_ASM
void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                    size_t len);
#endif

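// GCM_MUL multiplies |ctx->Xi| by H in place, folding in one 16-byte block.
// GHASH, when an assembly implementation is available, hashes |len| bytes
// (a multiple of 16) of |in| into |ctx->Xi| in a single call.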
#define GCM_MUL(ctx, Xi) gcm_gmult_4bit((ctx)->Xi.u, (ctx)->Htable)
#if defined(GHASH_ASM)
#define GHASH(ctx, in, len) gcm_ghash_4bit((ctx)->Xi.u, (ctx)->Htable, in, len)
// GHASH_CHUNK is a "stride parameter" intended to mitigate cache-thrashing
// effects. The idea is to hash data while it is still in the L1 cache after
// the encryption pass.
#define GHASH_CHUNK (3 * 1024)
#endif


#if defined(GHASH_ASM)

#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
#define GCM_FUNCREF_4BIT
void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                     size_t len);

#if defined(OPENSSL_X86_64)
#define GHASH_ASM_X86_64
void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
                   size_t len);
#define AESNI_GCM
size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                         const void *key, uint8_t ivec[16], uint64_t *Xi);
size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
                         const void *key, uint8_t ivec[16], uint64_t *Xi);
#endif

#if defined(OPENSSL_X86)
#define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(uint64_t Xi[2], const u128 Htable[16],
                        const uint8_t *inp, size_t len);
#endif

#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#include <openssl/arm_arch.h>
#if __ARM_ARCH__ >= 7
#define GHASH_ASM_ARM
#define GCM_FUNCREF_4BIT

static int pmull_capable(void) {
  return CRYPTO_is_ARMv8_PMULL_capable();
}

void gcm_init_v8(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                  size_t len);

#if defined(OPENSSL_ARM)
// 32-bit ARM also has support for doing GCM with NEON instructions.
static int neon_capable(void) {
  return CRYPTO_is_NEON_capable();
}

void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                    size_t len);
#else
// AArch64 only has the ARMv8 versions of these functions.
static int neon_capable(void) {
  return 0;
}
static void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) {
  abort();
}
static void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) {
  abort();
}
static void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16],
                           const uint8_t *inp, size_t len) {
  abort();
}
#endif

#endif
#elif defined(OPENSSL_PPC64LE)
#define GHASH_ASM_PPC64LE
#define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_p8(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                  size_t len);
#endif
#endif

#ifdef GCM_FUNCREF_4BIT
#undef GCM_MUL
#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi.u, (ctx)->Htable)
#ifdef GHASH
#undef GHASH
#define GHASH(ctx, in, len) (*gcm_ghash_p)((ctx)->Xi.u, (ctx)->Htable, in, len)
#endif
#endif

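// CRYPTO_ghash_init writes the GHASH key and precomputed table for |gcm_key|
// to |out_key| and |out_table| and selects the fastest gmult/ghash
// implementation available on the current CPU. When GCM_FUNCREF_4BIT is
// defined, GCM_MUL and GHASH above dispatch through per-context function
// pointers loaded from this selection. |*out_is_avx| is set to one iff the AVX
// implementation was chosen, which lets callers decide whether the stitched
// AES-NI/GCM assembly may be used.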
void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
                       u128 *out_key, u128 out_table[16], int *out_is_avx,
                       const uint8_t *gcm_key) {
  *out_is_avx = 0;

  union {
    uint64_t u[2];
    uint8_t c[16];
  } H;

  OPENSSL_memcpy(H.c, gcm_key, 16);

  // H is stored in host byte order
  H.u[0] = CRYPTO_bswap8(H.u[0]);
  H.u[1] = CRYPTO_bswap8(H.u[1]);

  OPENSSL_memcpy(out_key, H.c, 16);

#if defined(GHASH_ASM_X86_64)
  if (crypto_gcm_clmul_enabled()) {
    if (((OPENSSL_ia32cap_get()[1] >> 22) & 0x41) == 0x41) {  // AVX+MOVBE
      gcm_init_avx(out_table, H.u);
      *out_mult = gcm_gmult_avx;
      *out_hash = gcm_ghash_avx;
      *out_is_avx = 1;
      return;
    }
    gcm_init_clmul(out_table, H.u);
    *out_mult = gcm_gmult_clmul;
    *out_hash = gcm_ghash_clmul;
    return;
  }
#elif defined(GHASH_ASM_X86)
  if (crypto_gcm_clmul_enabled()) {
    gcm_init_clmul(out_table, H.u);
    *out_mult = gcm_gmult_clmul;
    *out_hash = gcm_ghash_clmul;
    return;
  }
#elif defined(GHASH_ASM_ARM)
  if (pmull_capable()) {
    gcm_init_v8(out_table, H.u);
    *out_mult = gcm_gmult_v8;
    *out_hash = gcm_ghash_v8;
    return;
  }

  if (neon_capable()) {
    gcm_init_neon(out_table, H.u);
    *out_mult = gcm_gmult_neon;
    *out_hash = gcm_ghash_neon;
    return;
  }
#elif defined(GHASH_ASM_PPC64LE)
  if (CRYPTO_is_PPC64LE_vcrypto_capable()) {
    gcm_init_p8(out_table, H.u);
    *out_mult = gcm_gmult_p8;
    *out_hash = gcm_ghash_p8;
    return;
  }
#endif

  gcm_init_4bit(out_table, H.u);
#if defined(GHASH_ASM_X86)
  *out_mult = gcm_gmult_4bit_mmx;
  *out_hash = gcm_ghash_4bit_mmx;
#else
  *out_mult = gcm_gmult_4bit;
  *out_hash = gcm_ghash_4bit;
#endif
}

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, const void *aes_key,
                        block128_f block, int is_aesni_encrypt) {
  OPENSSL_memset(ctx, 0, sizeof(*ctx));
  ctx->block = block;

  uint8_t gcm_key[16];
  OPENSSL_memset(gcm_key, 0, sizeof(gcm_key));
  (*block)(gcm_key, gcm_key, aes_key);

  int is_avx;
  CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, &ctx->H, ctx->Htable, &is_avx,
                    gcm_key);

  ctx->use_aesni_gcm_crypt = (is_avx && is_aesni_encrypt) ? 1 : 0;
}

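// A minimal usage sketch (hypothetical caller; assumes an AES_KEY |aes_key|
// already set up with AES_set_encrypt_key, and omits error checking):
//
//   GCM128_CONTEXT gcm;
//   uint8_t tag[16];
//   CRYPTO_gcm128_init(&gcm, &aes_key, (block128_f)AES_encrypt, 0);
//   CRYPTO_gcm128_setiv(&gcm, &aes_key, iv, iv_len);
//   CRYPTO_gcm128_aad(&gcm, aad, aad_len);
//   CRYPTO_gcm128_encrypt(&gcm, &aes_key, plaintext, ciphertext, len);
//   CRYPTO_gcm128_tag(&gcm, tag, sizeof(tag));
//
// Decryption is analogous, using CRYPTO_gcm128_decrypt and a final
// CRYPTO_gcm128_finish to check the expected tag in constant time.

// CRYPTO_gcm128_setiv computes the pre-counter block J0 from |iv|: a 96-bit IV
// is used directly with a 32-bit counter of one appended, while any other
// length is hashed with GHASH together with its 64-bit bit length, per NIST SP
// 800-38D. It also precomputes EK0, the encryption of J0 used to mask the tag.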
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const void *key,
                         const uint8_t *iv, size_t len) {
  unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

  ctx->Yi.u[0] = 0;
  ctx->Yi.u[1] = 0;
  ctx->Xi.u[0] = 0;
  ctx->Xi.u[1] = 0;
  ctx->len.u[0] = 0;  // AAD length
  ctx->len.u[1] = 0;  // message length
  ctx->ares = 0;
  ctx->mres = 0;

  if (len == 12) {
    OPENSSL_memcpy(ctx->Yi.c, iv, 12);
    ctx->Yi.c[15] = 1;
    ctr = 1;
  } else {
    uint64_t len0 = len;

    while (len >= 16) {
      for (size_t i = 0; i < 16; ++i) {
        ctx->Yi.c[i] ^= iv[i];
      }
      GCM_MUL(ctx, Yi);
      iv += 16;
      len -= 16;
    }
    if (len) {
      for (size_t i = 0; i < len; ++i) {
        ctx->Yi.c[i] ^= iv[i];
      }
      GCM_MUL(ctx, Yi);
    }
    len0 <<= 3;
    ctx->Yi.u[1] ^= CRYPTO_bswap8(len0);

    GCM_MUL(ctx, Yi);
    ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
  }

  (*ctx->block)(ctx->Yi.c, ctx->EK0.c, key);
  ++ctr;
  ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
}

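// CRYPTO_gcm128_aad absorbs additional authenticated data into Xi. It must be
// called before any encrypt or decrypt call (it fails once ctx->len.u[1] is
// non-zero) and rejects AAD longer than 2^61 bytes, i.e. 2^64 bits, matching
// the SP 800-38D limit on len(A).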
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
  unsigned int n;
  uint64_t alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  if (ctx->len.u[1]) {
    return 0;
  }

  alen += len;
  if (alen > (UINT64_C(1) << 61) || (sizeof(len) == 8 && alen < len)) {
    return 0;
  }
  ctx->len.u[0] = alen;

  n = ctx->ares;
  if (n) {
    while (n && len) {
      ctx->Xi.c[n] ^= *(aad++);
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->ares = n;
      return 1;
    }
  }

  // Process a whole number of blocks.
#ifdef GHASH
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    GHASH(ctx, aad, len_blocks);
    aad += len_blocks;
    len -= len_blocks;
  }
#else
  while (len >= 16) {
    for (size_t i = 0; i < 16; ++i) {
      ctx->Xi.c[i] ^= aad[i];
    }
    GCM_MUL(ctx, Xi);
    aad += 16;
    len -= 16;
  }
#endif

  // Process the remainder.
  if (len != 0) {
    n = (unsigned int)len;
    for (size_t i = 0; i < len; ++i) {
      ctx->Xi.c[i] ^= aad[i];
    }
  }

  ctx->ares = n;
  return 1;
}

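// CRYPTO_gcm128_encrypt CTR-encrypts |len| bytes using the context's block
// function and hashes the resulting ciphertext into Xi. The total message
// length is capped at 2^36 - 32 bytes (2^39 - 256 bits), the SP 800-38D
// plaintext limit. Partial blocks are buffered via |mres|, so the function may
// be called repeatedly on a stream.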
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const void *key,
                          const uint8_t *in, uint8_t *out, size_t len) {
  unsigned int n, ctr;
  uint64_t mlen = ctx->len.u[1];
  block128_f block = ctx->block;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  mlen += len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.u[1] = mlen;

  if (ctx->ares) {
    // First call to encrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  ctr = CRYPTO_bswap4(ctx->Yi.d[3]);

  n = ctx->mres;
  if (n) {
    while (n && len) {
      ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
  if (STRICT_ALIGNMENT &&
      ((uintptr_t)in | (uintptr_t)out) % sizeof(size_t) != 0) {
    for (size_t i = 0; i < len; ++i) {
      if (n == 0) {
        (*block)(ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      }
      ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
      n = (n + 1) % 16;
      if (n == 0) {
        GCM_MUL(ctx, Xi);
      }
    }

    ctx->mres = n;
    return 1;
  }
#if defined(GHASH) && defined(GHASH_CHUNK)
  while (len >= GHASH_CHUNK) {
    size_t j = GHASH_CHUNK;

    while (j) {
      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      for (size_t i = 0; i < 16; i += sizeof(size_t)) {
        store_word_le(out + i,
                      load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
      }
      out += 16;
      in += 16;
      j -= 16;
    }
    GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
    len -= GHASH_CHUNK;
  }
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    while (len >= 16) {
      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      for (size_t i = 0; i < 16; i += sizeof(size_t)) {
        store_word_le(out + i,
                      load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
      }
      out += 16;
      in += 16;
      len -= 16;
    }
    GHASH(ctx, out - len_blocks, len_blocks);
  }
#else
  while (len >= 16) {
    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    for (size_t i = 0; i < 16; i += sizeof(size_t)) {
      size_t tmp = load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)];
      store_word_le(out + i, tmp);
      ctx->Xi.t[i / sizeof(size_t)] ^= tmp;
    }
    GCM_MUL(ctx, Xi);
    out += 16;
    in += 16;
    len -= 16;
  }
#endif
  if (len) {
    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    while (len--) {
      ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
      ++n;
    }
  }

  ctx->mres = n;
  return 1;
}

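// CRYPTO_gcm128_decrypt mirrors CRYPTO_gcm128_encrypt, except that the
// ciphertext (the input) is what gets hashed into Xi, before it is XORed with
// the key stream to recover the plaintext.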
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const void *key,
                          const unsigned char *in, unsigned char *out,
                          size_t len) {
  unsigned int n, ctr;
  uint64_t mlen = ctx->len.u[1];
  block128_f block = ctx->block;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  mlen += len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.u[1] = mlen;

  if (ctx->ares) {
    // First call to decrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  ctr = CRYPTO_bswap4(ctx->Yi.d[3]);

  n = ctx->mres;
  if (n) {
    while (n && len) {
      uint8_t c = *(in++);
      *(out++) = c ^ ctx->EKi.c[n];
      ctx->Xi.c[n] ^= c;
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
  if (STRICT_ALIGNMENT &&
      ((uintptr_t)in | (uintptr_t)out) % sizeof(size_t) != 0) {
    for (size_t i = 0; i < len; ++i) {
      uint8_t c;
      if (n == 0) {
        (*block)(ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      }
      c = in[i];
      out[i] = c ^ ctx->EKi.c[n];
      ctx->Xi.c[n] ^= c;
      n = (n + 1) % 16;
      if (n == 0) {
        GCM_MUL(ctx, Xi);
      }
    }

    ctx->mres = n;
    return 1;
  }
#if defined(GHASH) && defined(GHASH_CHUNK)
  while (len >= GHASH_CHUNK) {
    size_t j = GHASH_CHUNK;

    GHASH(ctx, in, GHASH_CHUNK);
    while (j) {
      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      for (size_t i = 0; i < 16; i += sizeof(size_t)) {
        store_word_le(out + i,
                      load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
      }
      out += 16;
      in += 16;
      j -= 16;
    }
    len -= GHASH_CHUNK;
  }
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    GHASH(ctx, in, len_blocks);
    while (len >= 16) {
      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
      for (size_t i = 0; i < 16; i += sizeof(size_t)) {
        store_word_le(out + i,
                      load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
      }
      out += 16;
      in += 16;
      len -= 16;
    }
  }
#else
  while (len >= 16) {
    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    for (size_t i = 0; i < 16; i += sizeof(size_t)) {
      size_t c = load_word_le(in + i);
      store_word_le(out + i, c ^ ctx->EKi.t[i / sizeof(size_t)]);
      ctx->Xi.t[i / sizeof(size_t)] ^= c;
    }
    GCM_MUL(ctx, Xi);
    out += 16;
    in += 16;
    len -= 16;
  }
#endif
  if (len) {
    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    while (len--) {
      uint8_t c = in[n];
      ctx->Xi.c[n] ^= c;
      out[n] = c ^ ctx->EKi.c[n];
      ++n;
    }
  }

  ctx->mres = n;
  return 1;
}

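// CRYPTO_gcm128_encrypt_ctr32 is the bulk variant used with ciphers that
// provide a |ctr128_f| routine encrypting whole blocks with a 32-bit
// big-endian counter (e.g. hardware-accelerated AES-CTR). On x86-64 with the
// AVX GHASH key it may also hand large inputs to the stitched
// aesni_gcm_encrypt assembly, which advances Yi and Xi itself.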
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const void *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
  unsigned int n, ctr;
  uint64_t mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  mlen += len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.u[1] = mlen;

  if (ctx->ares) {
    // First call to encrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  n = ctx->mres;
  if (n) {
    while (n && len) {
      ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }

#if defined(AESNI_GCM)
  if (ctx->use_aesni_gcm_crypt) {
    // |aesni_gcm_encrypt| may not process all the input given to it. It may
    // not process *any* of its input if it is deemed too small.
    size_t bulk = aesni_gcm_encrypt(in, out, len, key, ctx->Yi.c, ctx->Xi.u);
    in += bulk;
    out += bulk;
    len -= bulk;
  }
#endif

  ctr = CRYPTO_bswap4(ctx->Yi.d[3]);

#if defined(GHASH)
  while (len >= GHASH_CHUNK) {
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
    ctr += GHASH_CHUNK / 16;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    GHASH(ctx, out, GHASH_CHUNK);
    out += GHASH_CHUNK;
    in += GHASH_CHUNK;
    len -= GHASH_CHUNK;
  }
#endif
  size_t i = len & kSizeTWithoutLower4Bits;
  if (i != 0) {
    size_t j = i / 16;

    (*stream)(in, out, j, key, ctx->Yi.c);
    ctr += (unsigned int)j;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    in += i;
    len -= i;
#if defined(GHASH)
    GHASH(ctx, out, i);
    out += i;
#else
    while (j--) {
      for (i = 0; i < 16; ++i) {
        ctx->Xi.c[i] ^= out[i];
      }
      GCM_MUL(ctx, Xi);
      out += 16;
    }
#endif
  }
  if (len) {
    (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    while (len--) {
      ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
      ++n;
    }
  }

  ctx->mres = n;
  return 1;
}

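// CRYPTO_gcm128_decrypt_ctr32 is the decryption counterpart of
// CRYPTO_gcm128_encrypt_ctr32: the ciphertext is hashed first, then decrypted
// with the 32-bit-counter stream function (or the stitched aesni_gcm_decrypt
// path when available).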
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const void *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
  unsigned int n, ctr;
  uint64_t mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  mlen += len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.u[1] = mlen;

  if (ctx->ares) {
    // First call to decrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  n = ctx->mres;
  if (n) {
    while (n && len) {
      uint8_t c = *(in++);
      *(out++) = c ^ ctx->EKi.c[n];
      ctx->Xi.c[n] ^= c;
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }

#if defined(AESNI_GCM)
  if (ctx->use_aesni_gcm_crypt) {
    // |aesni_gcm_decrypt| may not process all the input given to it. It may
    // not process *any* of its input if it is deemed too small.
    size_t bulk = aesni_gcm_decrypt(in, out, len, key, ctx->Yi.c, ctx->Xi.u);
    in += bulk;
    out += bulk;
    len -= bulk;
  }
#endif

  ctr = CRYPTO_bswap4(ctx->Yi.d[3]);

#if defined(GHASH)
  while (len >= GHASH_CHUNK) {
    GHASH(ctx, in, GHASH_CHUNK);
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
    ctr += GHASH_CHUNK / 16;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    out += GHASH_CHUNK;
    in += GHASH_CHUNK;
    len -= GHASH_CHUNK;
  }
#endif
  size_t i = len & kSizeTWithoutLower4Bits;
  if (i != 0) {
    size_t j = i / 16;

#if defined(GHASH)
    GHASH(ctx, in, i);
#else
    while (j--) {
      size_t k;
      for (k = 0; k < 16; ++k) {
        ctx->Xi.c[k] ^= in[k];
      }
      GCM_MUL(ctx, Xi);
      in += 16;
    }
    j = i / 16;
    in -= i;
#endif
    (*stream)(in, out, j, key, ctx->Yi.c);
    ctr += (unsigned int)j;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    out += i;
    in += i;
    len -= i;
  }
  if (len) {
    (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
    while (len--) {
      uint8_t c = in[n];
      ctx->Xi.c[n] ^= c;
      out[n] = c ^ ctx->EKi.c[n];
      ++n;
    }
  }

  ctx->mres = n;
  return 1;
}

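// CRYPTO_gcm128_finish completes the GHASH computation by folding in the
// 64-bit AAD and ciphertext bit lengths, masks the result with EK0 to form the
// tag, and, if |tag| is non-NULL, compares it against the expected value in
// constant time with CRYPTO_memcmp. CRYPTO_gcm128_tag instead copies the
// computed tag out for the caller.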
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) {
  uint64_t alen = ctx->len.u[0] << 3;
  uint64_t clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

  if (ctx->mres || ctx->ares) {
    GCM_MUL(ctx, Xi);
  }

  alen = CRYPTO_bswap8(alen);
  clen = CRYPTO_bswap8(clen);

  ctx->Xi.u[0] ^= alen;
  ctx->Xi.u[1] ^= clen;
  GCM_MUL(ctx, Xi);

  ctx->Xi.u[0] ^= ctx->EK0.u[0];
  ctx->Xi.u[1] ^= ctx->EK0.u[1];

  if (tag && len <= sizeof(ctx->Xi)) {
    return CRYPTO_memcmp(ctx->Xi.c, tag, len) == 0;
  } else {
    return 0;
  }
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) {
  CRYPTO_gcm128_finish(ctx, NULL, 0);
  OPENSSL_memcpy(tag, ctx->Xi.c,
                 len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}

#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
int crypto_gcm_clmul_enabled(void) {
#ifdef GHASH_ASM
  const uint32_t *ia32cap = OPENSSL_ia32cap_get();
  return (ia32cap[0] & (1 << 24)) &&  // check FXSR bit
         (ia32cap[1] & (1 << 1));     // check PCLMULQDQ bit
#else
  return 0;
#endif
}
#endif