/* ====================================================================
 * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ==================================================================== */

#include <openssl/base.h>

#include <assert.h>
#include <stdlib.h> /* for abort() in the AArch64 NEON stubs below */
#include <string.h>

#include <openssl/mem.h>
#include <openssl/cpu.h>

#include "internal.h"


#if !defined(OPENSSL_NO_ASM) &&                         \
    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
     defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
#define GHASH_ASM
#endif

#if defined(BSWAP4) && STRICT_ALIGNMENT == 1
/* Redefine, because alignment is ensured. */
#undef GETU32
#define GETU32(p) BSWAP4(*(const uint32_t *)(p))
#undef PUTU32
#define PUTU32(p, v) *(uint32_t *)(p) = BSWAP4(v)
#endif
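
/* For example, with |p| pointing at the bytes {0x01, 0x02, 0x03, 0x04},
 * GETU32(p) yields 0x01020304 on a little-endian machine: BSWAP4 reverses the
 * bytes of the aligned 32-bit load, giving a big-endian read. PUTU32 is the
 * corresponding big-endian store. */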

#define PACK(s) ((size_t)(s) << (sizeof(size_t) * 8 - 16))
#define REDUCE1BIT(V)                                                  \
  do {                                                                 \
    if (sizeof(size_t) == 8) {                                         \
      uint64_t T = UINT64_C(0xe100000000000000) & (0 - (V.lo & 1));    \
      V.lo = (V.hi << 63) | (V.lo >> 1);                               \
      V.hi = (V.hi >> 1) ^ T;                                          \
    } else {                                                           \
      uint32_t T = 0xe1000000U & (0 - (uint32_t)(V.lo & 1));           \
      V.lo = (V.hi << 63) | (V.lo >> 1);                               \
      V.hi = (V.hi >> 1) ^ ((uint64_t)T << 32);                        \
    }                                                                  \
  } while (0)
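
/* REDUCE1BIT performs one step of multiplication by x in GF(2^128), using
 * GCM's bit-reversed representation: shifting the 128-bit value (V.hi, V.lo)
 * right by one multiplies by x, and the bit shifted out of V.lo is folded
 * back in via the reduction polynomial x^128 + x^7 + x^2 + x + 1, whose
 * non-leading terms appear bit-reversed as 0xe1 in the top byte. A minimal
 * sketch of the same step as a plain function (illustration only, not used
 * by this file):
 *
 *   static void reduce1bit(uint64_t *hi, uint64_t *lo) {
 *     uint64_t T = UINT64_C(0xe100000000000000) & (0 - (*lo & 1));
 *     *lo = (*hi << 63) | (*lo >> 1);
 *     *hi = (*hi >> 1) ^ T;
 *   }
 */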

/* kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
 * bits of a |size_t|. */
static const size_t kSizeTWithoutLower4Bits = (size_t) -16;
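
/* For example, (100 & kSizeTWithoutLower4Bits) == 96: ANDing with this mask
 * rounds a byte count down to a whole number of 16-byte blocks, which is how
 * the bulk-processing loops below split their input. */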

static void gcm_init_4bit(u128 Htable[16], uint64_t H[2]) {
  u128 V;

  Htable[0].hi = 0;
  Htable[0].lo = 0;
  V.hi = H[0];
  V.lo = H[1];

  Htable[8] = V;
  REDUCE1BIT(V);
  Htable[4] = V;
  REDUCE1BIT(V);
  Htable[2] = V;
  REDUCE1BIT(V);
  Htable[1] = V;
  Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
  V = Htable[4];
  Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
  Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
  Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
  V = Htable[8];
  Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
  Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
  Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
  Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
  Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
  Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
  Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;

#if defined(GHASH_ASM) && defined(OPENSSL_ARM)
  /* ARM assembler expects specific dword order in Htable. */
  {
    int j;
    const union {
      long one;
      char little;
    } is_endian = {1};

    if (is_endian.little) {
      for (j = 0; j < 16; ++j) {
        V = Htable[j];
        Htable[j].hi = V.lo;
        Htable[j].lo = V.hi;
      }
    } else {
      for (j = 0; j < 16; ++j) {
        V = Htable[j];
        Htable[j].hi = V.lo << 32 | V.lo >> 32;
        Htable[j].lo = V.hi << 32 | V.hi >> 32;
      }
    }
  }
#endif
}
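
/* The table built above is GF(2)-linear in its index: the power-of-two
 * entries come from repeated REDUCE1BIT, and every other entry is the XOR of
 * power-of-two entries, so Htable[a ^ b] = Htable[a] ^ Htable[b] whenever |a|
 * and |b| share no set bits. Each entry therefore holds the product of H with
 * one 4-bit multiplier, letting gcm_gmult_4bit process Xi one nibble per
 * table lookup. For example, Htable[11] = Htable[8] ^ Htable[2] ^ Htable[1]
 * (assembled above as Htable[8] ^ Htable[3]). */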

#if !defined(GHASH_ASM) || defined(OPENSSL_AARCH64)
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)};

static void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]) {
  u128 Z;
  int cnt = 15;
  size_t rem, nlo, nhi;
  const union {
    long one;
    char little;
  } is_endian = {1};

  nlo = ((const uint8_t *)Xi)[15];
  nhi = nlo >> 4;
  nlo &= 0xf;

  Z.hi = Htable[nlo].hi;
  Z.lo = Htable[nlo].lo;

  while (1) {
    rem = (size_t)Z.lo & 0xf;
    Z.lo = (Z.hi << 60) | (Z.lo >> 4);
    Z.hi = (Z.hi >> 4);
    if (sizeof(size_t) == 8) {
      Z.hi ^= rem_4bit[rem];
    } else {
      Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
    }

    Z.hi ^= Htable[nhi].hi;
    Z.lo ^= Htable[nhi].lo;

    if (--cnt < 0) {
      break;
    }

    nlo = ((const uint8_t *)Xi)[cnt];
    nhi = nlo >> 4;
    nlo &= 0xf;

    rem = (size_t)Z.lo & 0xf;
    Z.lo = (Z.hi << 60) | (Z.lo >> 4);
    Z.hi = (Z.hi >> 4);
    if (sizeof(size_t) == 8) {
      Z.hi ^= rem_4bit[rem];
    } else {
      Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
    }

    Z.hi ^= Htable[nlo].hi;
    Z.lo ^= Htable[nlo].lo;
  }

  if (is_endian.little) {
#ifdef BSWAP8
    Xi[0] = BSWAP8(Z.hi);
    Xi[1] = BSWAP8(Z.lo);
#else
    uint8_t *p = (uint8_t *)Xi;
    uint32_t v;
    v = (uint32_t)(Z.hi >> 32);
    PUTU32(p, v);
    v = (uint32_t)(Z.hi);
    PUTU32(p + 4, v);
    v = (uint32_t)(Z.lo >> 32);
    PUTU32(p + 8, v);
    v = (uint32_t)(Z.lo);
    PUTU32(p + 12, v);
#endif
  } else {
    Xi[0] = Z.hi;
    Xi[1] = Z.lo;
  }
}

/* Streamed version of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt for
 * details. Compiler-generated code doesn't seem to give any performance
 * improvement, at least not on x86[_64]. It is kept mostly as a reference
 * and as a placeholder for possible future non-trivial optimizations. */
static void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16],
                           const uint8_t *inp, size_t len) {
  u128 Z;
  int cnt;
  size_t rem, nlo, nhi;
  const union {
    long one;
    char little;
  } is_endian = {1};

  do {
    cnt = 15;
    nlo = ((const uint8_t *)Xi)[15];
    nlo ^= inp[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
      rem = (size_t)Z.lo & 0xf;
      Z.lo = (Z.hi << 60) | (Z.lo >> 4);
      Z.hi = (Z.hi >> 4);
      if (sizeof(size_t) == 8) {
        Z.hi ^= rem_4bit[rem];
      } else {
        Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
      }

      Z.hi ^= Htable[nhi].hi;
      Z.lo ^= Htable[nhi].lo;

      if (--cnt < 0) {
        break;
      }

      nlo = ((const uint8_t *)Xi)[cnt];
      nlo ^= inp[cnt];
      nhi = nlo >> 4;
      nlo &= 0xf;

      rem = (size_t)Z.lo & 0xf;
      Z.lo = (Z.hi << 60) | (Z.lo >> 4);
      Z.hi = (Z.hi >> 4);
      if (sizeof(size_t) == 8) {
        Z.hi ^= rem_4bit[rem];
      } else {
        Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
      }

      Z.hi ^= Htable[nlo].hi;
      Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
#ifdef BSWAP8
      Xi[0] = BSWAP8(Z.hi);
      Xi[1] = BSWAP8(Z.lo);
#else
      uint8_t *p = (uint8_t *)Xi;
      uint32_t v;
      v = (uint32_t)(Z.hi >> 32);
      PUTU32(p, v);
      v = (uint32_t)(Z.hi);
      PUTU32(p + 4, v);
      v = (uint32_t)(Z.lo >> 32);
      PUTU32(p + 8, v);
      v = (uint32_t)(Z.lo);
      PUTU32(p + 12, v);
#endif
    } else {
      Xi[0] = Z.hi;
      Xi[1] = Z.lo;
    }
  } while (inp += 16, len -= 16);
}
#else /* GHASH_ASM */
void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                    size_t len);
#endif

#define GCM_MUL(ctx, Xi) gcm_gmult_4bit(ctx->Xi.u, ctx->Htable)
#if defined(GHASH_ASM)
#define GHASH(ctx, in, len) gcm_ghash_4bit((ctx)->Xi.u, (ctx)->Htable, in, len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate cache-thrashing
 * effects. The idea is to hash data while it is still in the L1 cache after
 * the encryption pass. */
#define GHASH_CHUNK (3 * 1024)
#endif
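
/* At 3 * 1024 bytes, each chunk covers 3072 / 16 = 192 blocks: the bulk loops
 * below encrypt one chunk's worth of output, then GHASH it in a single call
 * while those bytes are presumably still cache-resident. */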


#if defined(GHASH_ASM)
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
#define GHASH_ASM_X86_OR_64
#define GCM_FUNCREF_4BIT
void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                     size_t len);

#if defined(OPENSSL_X86)
#define gcm_init_avx gcm_init_clmul
#define gcm_gmult_avx gcm_gmult_clmul
#define gcm_ghash_avx gcm_ghash_clmul
#else
void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                   size_t len);
#endif

#if defined(OPENSSL_X86)
#define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(uint64_t Xi[2], const u128 Htable[16],
                        const uint8_t *inp, size_t len);

void gcm_gmult_4bit_x86(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(uint64_t Xi[2], const u128 Htable[16],
                        const uint8_t *inp, size_t len);
#endif
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#include <openssl/arm_arch.h>
#if __ARM_ARCH__ >= 7
#define GHASH_ASM_ARM
#define GCM_FUNCREF_4BIT

static int pmull_capable(void) {
  return CRYPTO_is_ARMv8_PMULL_capable();
}

void gcm_init_v8(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                  size_t len);

#if defined(OPENSSL_ARM)
/* 32-bit ARM also has support for doing GCM with NEON instructions. */
static int neon_capable(void) {
  return CRYPTO_is_NEON_capable();
}

void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                    size_t len);
#else
/* AArch64 only has the ARMv8 versions of functions. */
static int neon_capable(void) {
  return 0;
}
void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) {
  abort();
}
void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) {
  abort();
}
void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                    size_t len) {
  abort();
}
#endif

#endif
#endif
#endif

#ifdef GCM_FUNCREF_4BIT
#undef GCM_MUL
#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)(ctx->Xi.u, ctx->Htable)
#ifdef GHASH
#undef GHASH
#define GHASH(ctx, in, len) (*gcm_ghash_p)(ctx->Xi.u, ctx->Htable, in, len)
#endif
#endif

GCM128_CONTEXT *CRYPTO_gcm128_new(const void *key, block128_f block) {
  GCM128_CONTEXT *ret;

  ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT));
  if (ret != NULL) {
    CRYPTO_gcm128_init(ret, key, block);
  }

  return ret;
}
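
/* Illustrative one-shot usage of this API (a sketch under assumptions, not
 * part of this file): |AES_KEY| and |AES_encrypt| stand in for any cipher
 * with a |block128_f|-compatible block function.
 *
 *   GCM128_CONTEXT *ctx = CRYPTO_gcm128_new(&ks, (block128_f)AES_encrypt);
 *   if (ctx == NULL) {
 *     return 0;
 *   }
 *   CRYPTO_gcm128_setiv(ctx, &ks, iv, 12);         // 96-bit nonce
 *   CRYPTO_gcm128_aad(ctx, aad, aad_len);          // AAD before the payload
 *   CRYPTO_gcm128_encrypt(ctx, &ks, pt, ct, len);  // may be called repeatedly
 *   uint8_t tag[16];
 *   CRYPTO_gcm128_tag(ctx, tag, sizeof(tag));
 *   CRYPTO_gcm128_release(ctx);
 */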

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, const void *key,
                        block128_f block) {
  const union {
    long one;
    char little;
  } is_endian = {1};

  memset(ctx, 0, sizeof(*ctx));
  ctx->block = block;

  (*block)(ctx->H.c, ctx->H.c, key);

  if (is_endian.little) {
/* H is stored in host byte order */
#ifdef BSWAP8
    ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
    ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
    uint8_t *p = ctx->H.c;
    uint64_t hi, lo;
    hi = (uint64_t)GETU32(p) << 32 | GETU32(p + 4);
    lo = (uint64_t)GETU32(p + 8) << 32 | GETU32(p + 12);
    ctx->H.u[0] = hi;
    ctx->H.u[1] = lo;
#endif
  }

#if defined(GHASH_ASM_X86_OR_64)
  if (crypto_gcm_clmul_enabled()) {
    if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
      gcm_init_avx(ctx->Htable, ctx->H.u);
      ctx->gmult = gcm_gmult_avx;
      ctx->ghash = gcm_ghash_avx;
    } else {
      gcm_init_clmul(ctx->Htable, ctx->H.u);
      ctx->gmult = gcm_gmult_clmul;
      ctx->ghash = gcm_ghash_clmul;
    }
    return;
  }
  gcm_init_4bit(ctx->Htable, ctx->H.u);
#if defined(GHASH_ASM_X86) /* x86 only */
  if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
    ctx->gmult = gcm_gmult_4bit_mmx;
    ctx->ghash = gcm_ghash_4bit_mmx;
  } else {
    ctx->gmult = gcm_gmult_4bit_x86;
    ctx->ghash = gcm_ghash_4bit_x86;
  }
#else
  ctx->gmult = gcm_gmult_4bit;
  ctx->ghash = gcm_ghash_4bit;
#endif
#elif defined(GHASH_ASM_ARM)
  if (pmull_capable()) {
    gcm_init_v8(ctx->Htable, ctx->H.u);
    ctx->gmult = gcm_gmult_v8;
    ctx->ghash = gcm_ghash_v8;
  } else if (neon_capable()) {
    gcm_init_neon(ctx->Htable, ctx->H.u);
    ctx->gmult = gcm_gmult_neon;
    ctx->ghash = gcm_ghash_neon;
  } else {
    gcm_init_4bit(ctx->Htable, ctx->H.u);
    ctx->gmult = gcm_gmult_4bit;
    ctx->ghash = gcm_ghash_4bit;
  }
#else
  gcm_init_4bit(ctx->Htable, ctx->H.u);
  ctx->gmult = gcm_gmult_4bit;
  ctx->ghash = gcm_ghash_4bit;
#endif
}

void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const void *key,
                         const uint8_t *iv, size_t len) {
  const union {
    long one;
    char little;
  } is_endian = {1};
  unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

  ctx->Yi.u[0] = 0;
  ctx->Yi.u[1] = 0;
  ctx->Xi.u[0] = 0;
  ctx->Xi.u[1] = 0;
  ctx->len.u[0] = 0; /* AAD length */
  ctx->len.u[1] = 0; /* message length */
  ctx->ares = 0;
  ctx->mres = 0;

  if (len == 12) {
    memcpy(ctx->Yi.c, iv, 12);
    ctx->Yi.c[15] = 1;
    ctr = 1;
  } else {
    size_t i;
    uint64_t len0 = len;

    while (len >= 16) {
      for (i = 0; i < 16; ++i) {
        ctx->Yi.c[i] ^= iv[i];
      }
      GCM_MUL(ctx, Yi);
      iv += 16;
      len -= 16;
    }
    if (len) {
      for (i = 0; i < len; ++i) {
        ctx->Yi.c[i] ^= iv[i];
      }
      GCM_MUL(ctx, Yi);
    }
    len0 <<= 3;
    if (is_endian.little) {
#ifdef BSWAP8
      ctx->Yi.u[1] ^= BSWAP8(len0);
#else
      ctx->Yi.c[8] ^= (uint8_t)(len0 >> 56);
      ctx->Yi.c[9] ^= (uint8_t)(len0 >> 48);
      ctx->Yi.c[10] ^= (uint8_t)(len0 >> 40);
      ctx->Yi.c[11] ^= (uint8_t)(len0 >> 32);
      ctx->Yi.c[12] ^= (uint8_t)(len0 >> 24);
      ctx->Yi.c[13] ^= (uint8_t)(len0 >> 16);
      ctx->Yi.c[14] ^= (uint8_t)(len0 >> 8);
      ctx->Yi.c[15] ^= (uint8_t)(len0);
#endif
    } else {
      ctx->Yi.u[1] ^= len0;
    }

    GCM_MUL(ctx, Yi);

    if (is_endian.little) {
      ctr = GETU32(ctx->Yi.c + 12);
    } else {
      ctr = ctx->Yi.d[3];
    }
  }

  (*ctx->block)(ctx->Yi.c, ctx->EK0.c, key);
  ++ctr;
  if (is_endian.little) {
    PUTU32(ctx->Yi.c + 12, ctr);
  } else {
    ctx->Yi.d[3] = ctr;
  }
}
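
/* The two IV paths above follow the GCM specification: a 96-bit IV is used
 * directly, with the counter word initialized to one (Y0 = IV || 0x00000001);
 * any other length is absorbed through GHASH, including a final block that
 * mixes in the IV's 64-bit length in bits. EK0, the encryption of Y0, is
 * saved for masking the tag in CRYPTO_gcm128_finish. */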

int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
  size_t i;
  unsigned int n;
  uint64_t alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  if (ctx->len.u[1]) {
    return 0;
  }

  alen += len;
  if (alen > (UINT64_C(1) << 61) || (sizeof(len) == 8 && alen < len)) {
    return 0;
  }
  ctx->len.u[0] = alen;

  n = ctx->ares;
  if (n) {
    while (n && len) {
      ctx->Xi.c[n] ^= *(aad++);
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->ares = n;
      return 1;
    }
  }

#ifdef GHASH
  if ((i = (len & (size_t) -16))) {
    GHASH(ctx, aad, i);
    aad += i;
    len -= i;
  }
#else
  while (len >= 16) {
    for (i = 0; i < 16; ++i) {
      ctx->Xi.c[i] ^= aad[i];
    }
    GCM_MUL(ctx, Xi);
    aad += 16;
    len -= 16;
  }
#endif
  if (len) {
    n = (unsigned int)len;
    for (i = 0; i < len; ++i) {
      ctx->Xi.c[i] ^= aad[i];
    }
  }

  ctx->ares = n;
  return 1;
}
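
/* AAD may be supplied across several calls; a trailing partial block is
 * buffered via |ctx->ares| and folded in once completed. For example (a
 * sketch, assuming a prepared |ctx|):
 *
 *   CRYPTO_gcm128_aad(ctx, aad, 5);       // 5 bytes buffered, ares == 5
 *   CRYPTO_gcm128_aad(ctx, aad + 5, 11);  // block complete, GCM_MUL runs
 *
 * Note the ctx->len.u[1] check above: all AAD must precede the first call to
 * encrypt or decrypt. */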

int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const void *key,
                          const unsigned char *in, unsigned char *out,
                          size_t len) {
  const union {
    long one;
    char little;
  } is_endian = {1};
  unsigned int n, ctr;
  size_t i;
  uint64_t mlen = ctx->len.u[1];
  block128_f block = ctx->block;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  mlen += len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.u[1] = mlen;

  if (ctx->ares) {
    /* First call to encrypt finalizes GHASH(AAD) */
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  if (is_endian.little) {
    ctr = GETU32(ctx->Yi.c + 12);
  } else {
    ctr = ctx->Yi.d[3];
  }

  n = ctx->mres;
  if (n) {
    while (n && len) {
      ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
  if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out) % sizeof(size_t) != 0) {
    for (i = 0; i < len; ++i) {
      if (n == 0) {
        (*block)(ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little) {
          PUTU32(ctx->Yi.c + 12, ctr);
        } else {
          ctx->Yi.d[3] = ctr;
        }
      }
      ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
      n = (n + 1) % 16;
      if (n == 0) {
        GCM_MUL(ctx, Xi);
      }
    }

    ctx->mres = n;
    return 1;
  }
#if defined(GHASH) && defined(GHASH_CHUNK)
  while (len >= GHASH_CHUNK) {
    size_t j = GHASH_CHUNK;

    while (j) {
      size_t *out_t = (size_t *)out;
      const size_t *in_t = (const size_t *)in;

      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      if (is_endian.little) {
        PUTU32(ctx->Yi.c + 12, ctr);
      } else {
        ctx->Yi.d[3] = ctr;
      }
      for (i = 0; i < 16 / sizeof(size_t); ++i) {
        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
      }
      out += 16;
      in += 16;
      j -= 16;
    }
    GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
    len -= GHASH_CHUNK;
  }
  if ((i = (len & (size_t) -16))) {
    size_t j = i;

    while (len >= 16) {
      size_t *out_t = (size_t *)out;
      const size_t *in_t = (const size_t *)in;

      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      if (is_endian.little) {
        PUTU32(ctx->Yi.c + 12, ctr);
      } else {
        ctx->Yi.d[3] = ctr;
      }
      for (i = 0; i < 16 / sizeof(size_t); ++i) {
        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
      }
      out += 16;
      in += 16;
      len -= 16;
    }
    GHASH(ctx, out - j, j);
  }
#else
  while (len >= 16) {
    size_t *out_t = (size_t *)out;
    const size_t *in_t = (const size_t *)in;

    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    for (i = 0; i < 16 / sizeof(size_t); ++i) {
      ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
    }
    GCM_MUL(ctx, Xi);
    out += 16;
    in += 16;
    len -= 16;
  }
#endif
  if (len) {
    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    while (len--) {
      ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
      ++n;
    }
  }

  ctx->mres = n;
  return 1;
}
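
/* Encryption may likewise be streamed across calls; a trailing partial block
 * leaves the unused key-stream bytes in |ctx->EKi| with the position recorded
 * in |ctx->mres|. For example (a sketch):
 *
 *   CRYPTO_gcm128_encrypt(ctx, key, in, out, 20);            // mres == 4
 *   CRYPTO_gcm128_encrypt(ctx, key, in + 20, out + 20, 12);  // block done
 *
 * The length check above enforces GCM's per-message plaintext limit of
 * 2^36 - 32 bytes. */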

int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const void *key,
                          const unsigned char *in, unsigned char *out,
                          size_t len) {
  const union {
    long one;
    char little;
  } is_endian = {1};
  unsigned int n, ctr;
  size_t i;
  uint64_t mlen = ctx->len.u[1];
  block128_f block = ctx->block;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  mlen += len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.u[1] = mlen;

  if (ctx->ares) {
    /* First call to decrypt finalizes GHASH(AAD) */
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  if (is_endian.little) {
    ctr = GETU32(ctx->Yi.c + 12);
  } else {
    ctr = ctx->Yi.d[3];
  }

  n = ctx->mres;
  if (n) {
    while (n && len) {
      uint8_t c = *(in++);
      *(out++) = c ^ ctx->EKi.c[n];
      ctx->Xi.c[n] ^= c;
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
  if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out) % sizeof(size_t) != 0) {
    for (i = 0; i < len; ++i) {
      uint8_t c;
      if (n == 0) {
        (*block)(ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little) {
          PUTU32(ctx->Yi.c + 12, ctr);
        } else {
          ctx->Yi.d[3] = ctr;
        }
      }
      c = in[i];
      out[i] = c ^ ctx->EKi.c[n];
      ctx->Xi.c[n] ^= c;
      n = (n + 1) % 16;
      if (n == 0) {
        GCM_MUL(ctx, Xi);
      }
    }

    ctx->mres = n;
    return 1;
  }
#if defined(GHASH) && defined(GHASH_CHUNK)
  while (len >= GHASH_CHUNK) {
    size_t j = GHASH_CHUNK;

    GHASH(ctx, in, GHASH_CHUNK);
    while (j) {
      size_t *out_t = (size_t *)out;
      const size_t *in_t = (const size_t *)in;

      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      if (is_endian.little) {
        PUTU32(ctx->Yi.c + 12, ctr);
      } else {
        ctx->Yi.d[3] = ctr;
      }
      for (i = 0; i < 16 / sizeof(size_t); ++i) {
        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
      }
      out += 16;
      in += 16;
      j -= 16;
    }
    len -= GHASH_CHUNK;
  }
  if ((i = (len & (size_t) -16))) {
    GHASH(ctx, in, i);
    while (len >= 16) {
      size_t *out_t = (size_t *)out;
      const size_t *in_t = (const size_t *)in;

      (*block)(ctx->Yi.c, ctx->EKi.c, key);
      ++ctr;
      if (is_endian.little) {
        PUTU32(ctx->Yi.c + 12, ctr);
      } else {
        ctx->Yi.d[3] = ctr;
      }
      for (i = 0; i < 16 / sizeof(size_t); ++i) {
        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
      }
      out += 16;
      in += 16;
      len -= 16;
    }
  }
#else
  while (len >= 16) {
    size_t *out_t = (size_t *)out;
    const size_t *in_t = (const size_t *)in;

    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    for (i = 0; i < 16 / sizeof(size_t); ++i) {
      size_t c = in_t[i];
      out_t[i] = c ^ ctx->EKi.t[i];
      ctx->Xi.t[i] ^= c;
    }
    GCM_MUL(ctx, Xi);
    out += 16;
    in += 16;
    len -= 16;
  }
#endif
  if (len) {
    (*block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    while (len--) {
      uint8_t c = in[n];
      ctx->Xi.c[n] ^= c;
      out[n] = c ^ ctx->EKi.c[n];
      ++n;
    }
  }

  ctx->mres = n;
  return 1;
}

int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const void *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
  const union {
    long one;
    char little;
  } is_endian = {1};
  unsigned int n, ctr;
  uint64_t mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  mlen += len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.u[1] = mlen;

  if (ctx->ares) {
    /* First call to encrypt finalizes GHASH(AAD) */
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  if (is_endian.little) {
    ctr = GETU32(ctx->Yi.c + 12);
  } else {
    ctr = ctx->Yi.d[3];
  }

  n = ctx->mres;
  if (n) {
    while (n && len) {
      ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
#if defined(GHASH)
  while (len >= GHASH_CHUNK) {
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
    ctr += GHASH_CHUNK / 16;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    GHASH(ctx, out, GHASH_CHUNK);
    out += GHASH_CHUNK;
    in += GHASH_CHUNK;
    len -= GHASH_CHUNK;
  }
#endif
  size_t i = len & kSizeTWithoutLower4Bits;
  if (i != 0) {
    size_t j = i / 16;

    (*stream)(in, out, j, key, ctx->Yi.c);
    ctr += (unsigned int)j;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    in += i;
    len -= i;
#if defined(GHASH)
    GHASH(ctx, out, i);
    out += i;
#else
    while (j--) {
      for (i = 0; i < 16; ++i) {
        ctx->Xi.c[i] ^= out[i];
      }
      GCM_MUL(ctx, Xi);
      out += 16;
    }
#endif
  }
  if (len) {
    (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    while (len--) {
      ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
      ++n;
    }
  }

  ctx->mres = n;
  return 1;
}
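
/* A |ctr128_f| processes a whole run of counter-mode blocks in one call
 * (e.g. a hardware-accelerated AES-CTR routine), which is why the loops above
 * hand it GHASH_CHUNK / 16 or i / 16 blocks at a time instead of invoking
 * |ctx->block| once per block. Conceptually (a sketch, not a function defined
 * here):
 *
 *   void ctr32_sketch(const uint8_t *in, uint8_t *out, size_t blocks,
 *                     const void *key, const uint8_t ivec[16]) {
 *     // For each of |blocks| blocks: encrypt ivec, XOR the result onto
 *     // in -> out, then increment the final (big-endian) 32 bits of ivec.
 *   }
 */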

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const void *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
  const union {
    long one;
    char little;
  } is_endian = {1};
  unsigned int n, ctr;
  uint64_t mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#ifdef GHASH
  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->ghash;
#endif
#endif

  mlen += len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.u[1] = mlen;

  if (ctx->ares) {
    /* First call to decrypt finalizes GHASH(AAD) */
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }

  if (is_endian.little) {
    ctr = GETU32(ctx->Yi.c + 12);
  } else {
    ctr = ctx->Yi.d[3];
  }

  n = ctx->mres;
  if (n) {
    while (n && len) {
      uint8_t c = *(in++);
      *(out++) = c ^ ctx->EKi.c[n];
      ctx->Xi.c[n] ^= c;
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
#if defined(GHASH)
  while (len >= GHASH_CHUNK) {
    GHASH(ctx, in, GHASH_CHUNK);
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
    ctr += GHASH_CHUNK / 16;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    out += GHASH_CHUNK;
    in += GHASH_CHUNK;
    len -= GHASH_CHUNK;
  }
#endif
  size_t i = len & kSizeTWithoutLower4Bits;
  if (i != 0) {
    size_t j = i / 16;

#if defined(GHASH)
    GHASH(ctx, in, i);
#else
    while (j--) {
      size_t k;
      for (k = 0; k < 16; ++k) {
        ctx->Xi.c[k] ^= in[k];
      }
      GCM_MUL(ctx, Xi);
      in += 16;
    }
    j = i / 16;
    in -= i;
#endif
    (*stream)(in, out, j, key, ctx->Yi.c);
    ctr += (unsigned int)j;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    out += i;
    in += i;
    len -= i;
  }
  if (len) {
    (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
    ++ctr;
    if (is_endian.little) {
      PUTU32(ctx->Yi.c + 12, ctr);
    } else {
      ctx->Yi.d[3] = ctr;
    }
    while (len--) {
      uint8_t c = in[n];
      ctx->Xi.c[n] ^= c;
      out[n] = c ^ ctx->EKi.c[n];
      ++n;
    }
  }

  ctx->mres = n;
  return 1;
}

int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) {
  const union {
    long one;
    char little;
  } is_endian = {1};
  uint64_t alen = ctx->len.u[0] << 3;
  uint64_t clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

  if (ctx->mres || ctx->ares) {
    GCM_MUL(ctx, Xi);
  }

  if (is_endian.little) {
#ifdef BSWAP8
    alen = BSWAP8(alen);
    clen = BSWAP8(clen);
#else
    uint8_t *p = ctx->len.c;

    ctx->len.u[0] = alen;
    ctx->len.u[1] = clen;

    alen = (uint64_t)GETU32(p) << 32 | GETU32(p + 4);
    clen = (uint64_t)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
  }

  ctx->Xi.u[0] ^= alen;
  ctx->Xi.u[1] ^= clen;
  GCM_MUL(ctx, Xi);

  ctx->Xi.u[0] ^= ctx->EK0.u[0];
  ctx->Xi.u[1] ^= ctx->EK0.u[1];

  if (tag && len <= sizeof(ctx->Xi)) {
    return CRYPTO_memcmp(ctx->Xi.c, tag, len) == 0;
  } else {
    return 0;
  }
}
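
/* Tag verification on the receiving side goes through CRYPTO_gcm128_finish
 * directly; CRYPTO_memcmp keeps the comparison constant-time. For example (a
 * sketch):
 *
 *   if (!CRYPTO_gcm128_finish(ctx, received_tag, 16)) {
 *     // Authentication failed: the decrypted output must be discarded.
 *   }
 */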

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) {
  CRYPTO_gcm128_finish(ctx, NULL, 0);
  memcpy(tag, ctx->Xi.c, len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx) {
  if (ctx) {
    OPENSSL_cleanse(ctx, sizeof(*ctx));
    OPENSSL_free(ctx);
  }
}

#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
int crypto_gcm_clmul_enabled(void) {
#ifdef GHASH_ASM
  return OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */
         OPENSSL_ia32cap_P[1] & (1 << 1);    /* check PCLMULQDQ bit */
#else
  return 0;
#endif
}
#endif