/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif
#include <assert.h>

#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef	GETU32
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
#undef	PUTU32
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif

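/*
 * PACK() positions a 16-bit reduction constant in the top bits of a
 * size_t, so that it can be XOR-ed into the most significant end of
 * the accumulator on both 32- and 64-bit targets. REDUCE1BIT(V)
 * multiplies V by x in GF(2^128): in GHASH's bit-reversed
 * representation that amounts to a one-bit right shift, folding the
 * bit that falls off back in with the reduction polynomial
 * x^128+x^7+x^2+x+1, whose reversed top byte is 0xe1.
 */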
#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)	do { \
	if (sizeof(size_t)==8) { \
		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^T; \
	} \
	else { \
		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^((u64)T<<32); \
	} \
} while(0)

/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification.
 * In other words OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations. Why? In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that it's
 * trivial to see that cache-timing information can reveal a fair
 * portion of the intermediate hash value. Given that the ciphertext is
 * always available to an attacker, it's possible to attempt to deduce
 * the secret parameter H and, if successful, tamper with messages
 * [which is trivial in CTR mode]. In the "Shoup's" case it's not as
 * easy, but there is no reason to believe that it's resistant to
 * cache-timing attacks either. As for the "8-bit" implementation, it
 * consumes 16 (sixteen) times more memory, 4KB per individual
 * key + 1KB shared. On the pros side it should be twice as fast as the
 * "4-bit" version, and for gcc-generated x86[_64] code the "8-bit"
 * version was observed to run ~75% faster, closer to 100% for
 * commercial compilers... Yet the "4-bit" procedure is preferred,
 * because it's believed to provide a better security-performance
 * balance and adequate all-round performance. "All-round" refers to
 * things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even in the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
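/*
 * Whatever the table size, every implementation below computes the
 * same GHASH recurrence over 16-byte blocks:
 *
 *	Xi = (Xi ^ inp) * H	in GF(2^128)
 *
 * with H derived from the block cipher as E(K, 0^128) (see
 * CRYPTO_gcm128_init). The tables merely precompute multiples of H so
 * that the multiplication can be done a nibble (or byte) at a time.
 */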
#if	TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int  i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128]=V, i=64; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<256; i<<=1) {
		u128 *Hi = Htable+i, H0 = *Hi;
		for (j=1; j<i; ++j) {
			Hi[j].hi = H0.hi^Htable[j].hi;
			Hi[j].lo = H0.lo^Htable[j].lo;
		}
	}
}

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;
	size_t rem, n = *xi;
	const union { long one; char little; } is_endian = {1};
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;

		n = *(--xi);

		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif	TABLE_BITS==4

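/*
 * gcm_init_4bit fills Htable with the products of H and all 16
 * four-bit polynomials: Htable[8] = H, each REDUCE1BIT step
 * multiplies by x to produce Htable[4], Htable[2] and Htable[1],
 * and the remaining entries are XOR combinations of those.
 * gcm_gmult_4bit below can then process Xi one nibble at a time,
 * with one table lookup and one 4-bit shift-with-reduction per nibble.
 */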
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int   j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int j;
	const union { long one; char little; } is_endian = {1};

	if (is_endian.little)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}

#ifndef GHASH_ASM
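/*
 * rem_4bit[i] holds the pre-reduced contribution of the four bits
 * that a 4-bit shift pushes out of the low end of Z; PACK() places
 * the constants so that the same table serves both 32- and 64-bit
 * size_t builds.
 */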
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;
	const union { long one; char little; } is_endian = {1};

	nlo  = ((const u8 *)Xi)[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;

		nlo  = ((const u8 *)Xi)[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}

#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
	cnt  = 15;
	nlo  = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;

		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows the procedure down by
     * approximately the same amount of time as it makes each loop
     * spin faster. In other words single-block performance is
     * approximately the same as for the straightforward "4-bit"
     * implementation, and from there on it only gets faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
	Z.hi = Htable[cnt].hi;
	Z.lo = Htable[cnt].lo;
	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
	Hshr4[cnt].hi = (Z.hi>>4);
	Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo&0xff;

		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);

		Z.hi ^= Hshr4[nhi].hi;
		Z.lo ^= Hshr4[nhi].lo;
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	}

	nlo  = ((const u8 *)Xi)[0];
	nlo ^= inp[0];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi ^= Htable[nlo].hi;
	Z.lo ^= Htable[nlo].lo;

	rem = (size_t)Z.lo&0xf;

	Z.lo = (Z.hi<<60)|(Z.lo>>4);
	Z.hi = (Z.hi>>4);

	Z.hi ^= Htable[nhi].hi;
	Z.lo ^= Htable[nhi].lo;
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
    } while (inp+=16, len-=16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate cache-
 * thrashing effects. In other words the idea is to hash data while
 * it's still in L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif

#else	/* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int  i,j;
	const long *xi = (const long *)Xi;
	const union { long one; char little; } is_endian = {1};

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j=0; j<16/sizeof(long); ++j) {
		if (is_endian.little) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);
		}
	}

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if	TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if	!defined(I386_ONLY) && \
	(defined(__i386)	|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE	(OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE	(OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

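/*
 * Typical usage, sketched after the SELFTEST code at the end of this
 * file (AES chosen for illustration, error checking omitted):
 *
 *	AES_KEY ks;
 *	GCM128_CONTEXT ctx;
 *	unsigned char tag[16];
 *
 *	AES_set_encrypt_key(key,128,&ks);
 *	CRYPTO_gcm128_init(&ctx,&ks,(block128_f)AES_encrypt);
 *	CRYPTO_gcm128_setiv(&ctx,iv,iv_len);
 *	CRYPTO_gcm128_aad(&ctx,aad,aad_len);		(optional)
 *	CRYPTO_gcm128_encrypt(&ctx,plaintext,ciphertext,len);
 *	CRYPTO_gcm128_tag(&ctx,tag,sizeof(tag));
 *
 * Decryption is the same with CRYPTO_gcm128_decrypt and a final
 * CRYPTO_gcm128_finish(&ctx,expected_tag,16)==0 check.
 */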
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	const union { long one; char little; } is_endian = {1};

	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	(*block)(ctx->H.c,ctx->H.c,key);

	if (is_endian.little) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
#   else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
	if (PMULL_CAPABLE) {
		gcm_init_v8(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_v8;
		ctx->ghash = gcm_ghash_v8;
	} else
#  endif
#  ifdef NEON_CAPABLE
	if (NEON_CAPABLE) {
		gcm_init_neon(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else
#  endif
	{
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}

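/*
 * CRYPTO_gcm128_setiv starts a new message. For the recommended
 * 96-bit IV the initial counter block is simply IV||1; any other
 * length is folded through GHASH together with its bit length, as
 * specified for GCM.
 */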
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		if (is_endian.little)
#ifdef BSWAP4
			ctr = BSWAP4(ctx->Yi.d[3]);
#else
			ctr = GETU32(ctx->Yi.c+12);
#endif
		else
			ctr = ctx->Yi.d[3];
	}

	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (is_endian.little)
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
	else
		ctx->Yi.d[3] = ctr;
}

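/*
 * CRYPTO_gcm128_aad may be called repeatedly, but only before any
 * message data is processed: it returns -2 once ctx->len.u[1] is
 * non-zero, and -1 if the total AAD length overflows the GCM limit.
 * Partial blocks are buffered via ctx->ares and folded into Xi once
 * completed.
 */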
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}

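/*
 * CRYPTO_gcm128_encrypt is plain CTR encryption with Yi as the
 * counter block, GHASH-ing the produced ciphertext into Xi as it
 * goes. The total message length is capped at 2^36-32 bytes as per
 * the GCM specification; -1 is returned if the cap is exceeded.
 */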
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

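/*
 * CRYPTO_gcm128_decrypt mirrors the encrypt path, except that the
 * *input* (i.e. the ciphertext) is what gets GHASH-ed into Xi,
 * before it is XOR-ed with the key stream.
 */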
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i) {
				size_t c = in[i];
				out[i] = c^ctx->EKi.t[i];
				ctx->Xi.t[i] ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

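/*
 * CRYPTO_gcm128_encrypt_ctr32 is the bulk variant: whole blocks are
 * handed to a ctr128_f stream routine (typically a hardware-assisted
 * CTR module) that increments only the low 32 bits of the counter,
 * while leftover bytes fall back to ctx->block.
 */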
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

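/*
 * Decryption counterpart of CRYPTO_gcm128_encrypt_ctr32: GHASH is
 * applied to the ciphertext before the ctr128_f routine decrypts it.
 */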
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

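/*
 * CRYPTO_gcm128_finish completes GHASH with the encoded bit lengths
 * of the AAD and the message, XORs in EK0 and compares the result
 * against the caller's tag: 0 means the tag matched, non-zero means
 * failure (including tag==NULL or len>16). CRYPTO_gcm128_tag instead
 * copies the computed tag out, e.g. when encrypting.
 */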
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
			size_t len)
{
	const union { long one; char little; } is_endian = {1};
	u64 alen = ctx->len.u[0]<<3;
	u64 clen = ctx->len.u[1]<<3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx,Xi);

	if (is_endian.little) {
#ifdef BSWAP8
		alen = BSWAP8(alen);
		clen = BSWAP8(clen);
#else
		u8 *p = ctx->len.c;

		ctx->len.u[0] = alen;
		ctx->len.u[1] = clen;

		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
	}

	ctx->Xi.u[0] ^= alen;
	ctx->Xi.u[1] ^= clen;
	GCM_MUL(ctx,Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len<=sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c,tag,len);
	else
		return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret,key,block);

	return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	if (ctx) {
		OPENSSL_cleanse(ctx,sizeof(*ctx));
		OPENSSL_free(ctx);
	}
}

#if defined(SELFTEST)
#include <stdio.h>
#include <openssl/aes.h>

/* Test Case 1 */
static const u8	K1[16],
		*P1=NULL,
		*A1=NULL,
		IV1[12],
		*C1=NULL,
		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8	P2[16],
		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};

/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5 */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};

/* Test Case 7 */
static const u8 K7[24],
		*P7=NULL,
		*A7=NULL,
		IV7[12],
		*C7=NULL,
		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8	P8[16],
		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9 */
#define A9 A8
static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};

/* Test Case 13 */
static const u8	K13[32],
		*P13=NULL,
		*A13=NULL,
		IV13[12],
		*C13=NULL,
		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14 */
#define K14 K13
#define A14 A13
static const u8	P14[16],
		IV14[12],
		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/* Test Case 15 */
#define A15 A14
static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16 */
#define K16 K15
#define IV16 IV15
static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18 */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};

/* Test Case 19 */
#define K19 K1
#define P19 P1
#define IV19 IV1
#define C19 C1
static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
			0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
		T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};

/* Test Case 20 */
#define K20 K1
#define A20 A1
static const u8 IV20[64]={0xff,0xff,0xff,0xff},	/* this results in 0xff in counter LSB */
		P20[288],
		C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
			0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
			0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
			0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
			0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
			0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
			0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
			0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
			0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
			0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
			0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
			0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
			0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
			0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
			0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
			0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
			0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
			0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
		T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};

#define TEST_CASE(n)	do {					\
	u8 out[sizeof(P##n)];					\
	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);		\
	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);	\
	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
	memset(out,0,sizeof(out));				\
	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
	if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));	\
	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
	    (C##n && memcmp(out,C##n,sizeof(out))))		\
		ret++, printf ("encrypt test#%d failed.\n",n);	\
	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
	memset(out,0,sizeof(out));				\
	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
	if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));	\
	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
	    (P##n && memcmp(out,P##n,sizeof(out))))		\
		ret++, printf ("decrypt test#%d failed.\n",n);	\
	} while(0)

int main()
{
	GCM128_CONTEXT ctx;
	AES_KEY key;
	int ret=0;

	TEST_CASE(1);
	TEST_CASE(2);
	TEST_CASE(3);
	TEST_CASE(4);
	TEST_CASE(5);
	TEST_CASE(6);
	TEST_CASE(7);
	TEST_CASE(8);
	TEST_CASE(9);
	TEST_CASE(10);
	TEST_CASE(11);
	TEST_CASE(12);
	TEST_CASE(13);
	TEST_CASE(14);
	TEST_CASE(15);
	TEST_CASE(16);
	TEST_CASE(17);
	TEST_CASE(18);
	TEST_CASE(19);
	TEST_CASE(20);

#ifdef OPENSSL_CPUID_OBJ
	{
	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
	union { u64 u; u8 c[1024]; } buf;
	int i;

	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));

	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
	start = OPENSSL_rdtsc();
	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
	gcm_t = OPENSSL_rdtsc() - start;

	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
			(block128_f)AES_encrypt);
	start = OPENSSL_rdtsc();
	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
			(block128_f)AES_encrypt);
	ctr_t = OPENSSL_rdtsc() - start;

	printf("%.2f-%.2f=%.2f\n",
			gcm_t/(double)sizeof(buf),
			ctr_t/(double)sizeof(buf),
			(gcm_t-ctr_t)/(double)sizeof(buf));
#ifdef GHASH
	{
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx.ghash;

	GHASH((&ctx),buf.c,sizeof(buf));
	start = OPENSSL_rdtsc();
	for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
	gcm_t = OPENSSL_rdtsc() - start;
	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
	}
#endif
	}
#endif

	return ret;
}
#endif