1/* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in
13 *    the documentation and/or other materials provided with the
14 *    distribution.
15 *
16 * 3. All advertising materials mentioning features or use of this
17 *    software must display the following acknowledgment:
18 *    "This product includes software developed by the OpenSSL Project
19 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20 *
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 *    endorse or promote products derived from this software without
23 *    prior written permission. For written permission, please contact
24 *    openssl-core@openssl.org.
25 *
26 * 5. Products derived from this software may not be called "OpenSSL"
27 *    nor may "OpenSSL" appear in their names without prior written
28 *    permission of the OpenSSL Project.
29 *
30 * 6. Redistributions of any form whatsoever must retain the following
31 *    acknowledgment:
32 *    "This product includes software developed by the OpenSSL Project
33 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
48 */
49
50#define OPENSSL_FIPSAPI
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58#  define NDEBUG
59# endif
60#endif
61#include <assert.h>
62
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/*
 * Redefine GETU32/PUTU32 in terms of a single 32-bit access plus BSWAP4,
 * because alignment is ensured. Both macros are fully parenthesized so
 * that they behave as ordinary expressions wherever they are expanded.
 */
#undef	GETU32
#define	GETU32(p)	(BSWAP4(*(const u32 *)(p)))
#undef	PUTU32
#define	PUTU32(p,v)	(*(u32 *)(p) = BSWAP4(v))
#endif
70
/*
 * PACK(s) moves the 16-bit constant s into the 16 most significant bits
 * of a size_t, whatever the width of size_t is.
 */
#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))

/*
 * REDUCE1BIT(V) shifts the 128-bit value V (fields .hi and .lo) right by
 * one bit; if the bit shifted out of V.lo was set, 0xe1 is XORed into the
 * most significant byte of V.hi. The two branches compute the same value;
 * sizeof(size_t) is a compile-time constant, so only one of them is
 * actually used on a given platform. V is evaluated several times and
 * must be a side-effect-free lvalue.
 */
#define REDUCE1BIT(V)	do { \
	if (sizeof(size_t)==8) { \
		u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
		(V).lo  = ((V).hi<<63)|((V).lo>>1); \
		(V).hi  = ((V).hi>>1 )^T; \
	} \
	else { \
		u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
		(V).lo  = ((V).hi<<63)|((V).lo>>1); \
		(V).hi  = ((V).hi>>1 )^((u64)T<<32); \
	} \
} while(0)
84
85/*
86 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87 * never be set to 8. 8 is effectively reserved for testing purposes.
88 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90 * whole spectrum of possible table driven implementations. Why? In
91 * non-"Shoup's" case memory access pattern is segmented in such manner,
92 * that it's trivial to see that cache timing information can reveal
93 * fair portion of intermediate hash value. Given that ciphertext is
94 * always available to attacker, it's possible for him to attempt to
95 * deduce secret parameter H and if successful, tamper with messages
96 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97 * not as trivial, but there is no reason to believe that it's resistant
 * to cache-timing attacks. The "8-bit" implementation also consumes
 * 16 (sixteen) times more memory: 4KB per individual key + 1KB shared.
 * On the plus side, it should be twice as fast as the "4-bit" version;
 * for gcc-generated x86[_64] code the "8-bit" version was observed to
 * run ~75% faster, and closer to 100% faster with commercial
 * compilers. Yet the "4-bit" procedure is preferred, because it is
 * believed to provide a better security-performance balance and
 * adequate all-round performance. "All-round" refers to things like:
106 *
107 * - shorter setup time effectively improves overall timing for
108 *   handling short messages;
109 * - larger table allocation can become unbearable because of VM
110 *   subsystem penalties (for example on Windows large enough free
111 *   results in VM working set trimming, meaning that consequent
112 *   malloc would immediately incur working set expansion);
113 * - larger table has larger cache footprint, which can affect
114 *   performance of other code paths (not necessarily even from same
115 *   thread in Hyper-Threading world);
116 *
117 * Value of 1 is not appropriate for performance reasons.
118 */
119#if	TABLE_BITS==8
120
121static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122{
123	int  i, j;
124	u128 V;
125
126	Htable[0].hi = 0;
127	Htable[0].lo = 0;
128	V.hi = H[0];
129	V.lo = H[1];
130
131	for (Htable[128]=V, i=64; i>0; i>>=1) {
132		REDUCE1BIT(V);
133		Htable[i] = V;
134	}
135
136	for (i=2; i<256; i<<=1) {
137		u128 *Hi = Htable+i, H0 = *Hi;
138		for (j=1; j<i; ++j) {
139			Hi[j].hi = H0.hi^Htable[j].hi;
140			Hi[j].lo = H0.lo^Htable[j].lo;
141		}
142	}
143}
144
145static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146{
147	u128 Z = { 0, 0};
148	const u8 *xi = (const u8 *)Xi+15;
149	size_t rem, n = *xi;
150	const union { long one; char little; } is_endian = {1};
151	static const size_t rem_8bit[256] = {
152		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216
217	while (1) {
218		Z.hi ^= Htable[n].hi;
219		Z.lo ^= Htable[n].lo;
220
221		if ((u8 *)Xi==xi)	break;
222
223		n = *(--xi);
224
225		rem  = (size_t)Z.lo&0xff;
226		Z.lo = (Z.hi<<56)|(Z.lo>>8);
227		Z.hi = (Z.hi>>8);
228		if (sizeof(size_t)==8)
229			Z.hi ^= rem_8bit[rem];
230		else
231			Z.hi ^= (u64)rem_8bit[rem]<<32;
232	}
233
234	if (is_endian.little) {
235#ifdef BSWAP8
236		Xi[0] = BSWAP8(Z.hi);
237		Xi[1] = BSWAP8(Z.lo);
238#else
239		u8 *p = (u8 *)Xi;
240		u32 v;
241		v = (u32)(Z.hi>>32);	PUTU32(p,v);
242		v = (u32)(Z.hi);	PUTU32(p+4,v);
243		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
244		v = (u32)(Z.lo);	PUTU32(p+12,v);
245#endif
246	}
247	else {
248		Xi[0] = Z.hi;
249		Xi[1] = Z.lo;
250	}
251}
252#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254#elif	TABLE_BITS==4
255
256static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257{
258	u128 V;
259#if defined(OPENSSL_SMALL_FOOTPRINT)
260	int  i;
261#endif
262
263	Htable[0].hi = 0;
264	Htable[0].lo = 0;
265	V.hi = H[0];
266	V.lo = H[1];
267
268#if defined(OPENSSL_SMALL_FOOTPRINT)
269	for (Htable[8]=V, i=4; i>0; i>>=1) {
270		REDUCE1BIT(V);
271		Htable[i] = V;
272	}
273
274	for (i=2; i<16; i<<=1) {
275		u128 *Hi = Htable+i;
276		int   j;
277		for (V=*Hi, j=1; j<i; ++j) {
278			Hi[j].hi = V.hi^Htable[j].hi;
279			Hi[j].lo = V.lo^Htable[j].lo;
280		}
281	}
282#else
283	Htable[8] = V;
284	REDUCE1BIT(V);
285	Htable[4] = V;
286	REDUCE1BIT(V);
287	Htable[2] = V;
288	REDUCE1BIT(V);
289	Htable[1] = V;
290	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
291	V=Htable[4];
292	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
293	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
294	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
295	V=Htable[8];
296	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
297	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303#endif
304#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305	/*
306	 * ARM assembler expects specific dword order in Htable.
307	 */
308	{
309	int j;
310	const union { long one; char little; } is_endian = {1};
311
312	if (is_endian.little)
313		for (j=0;j<16;++j) {
314			V = Htable[j];
315			Htable[j].hi = V.lo;
316			Htable[j].lo = V.hi;
317		}
318	else
319		for (j=0;j<16;++j) {
320			V = Htable[j];
321			Htable[j].hi = V.lo<<32|V.lo>>32;
322			Htable[j].lo = V.hi<<32|V.hi>>32;
323		}
324	}
325#endif
326}
327
328#ifndef GHASH_ASM
329static const size_t rem_4bit[16] = {
330	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
335static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
336{
337	u128 Z;
338	int cnt = 15;
339	size_t rem, nlo, nhi;
340	const union { long one; char little; } is_endian = {1};
341
342	nlo  = ((const u8 *)Xi)[15];
343	nhi  = nlo>>4;
344	nlo &= 0xf;
345
346	Z.hi = Htable[nlo].hi;
347	Z.lo = Htable[nlo].lo;
348
349	while (1) {
350		rem  = (size_t)Z.lo&0xf;
351		Z.lo = (Z.hi<<60)|(Z.lo>>4);
352		Z.hi = (Z.hi>>4);
353		if (sizeof(size_t)==8)
354			Z.hi ^= rem_4bit[rem];
355		else
356			Z.hi ^= (u64)rem_4bit[rem]<<32;
357
358		Z.hi ^= Htable[nhi].hi;
359		Z.lo ^= Htable[nhi].lo;
360
361		if (--cnt<0)		break;
362
363		nlo  = ((const u8 *)Xi)[cnt];
364		nhi  = nlo>>4;
365		nlo &= 0xf;
366
367		rem  = (size_t)Z.lo&0xf;
368		Z.lo = (Z.hi<<60)|(Z.lo>>4);
369		Z.hi = (Z.hi>>4);
370		if (sizeof(size_t)==8)
371			Z.hi ^= rem_4bit[rem];
372		else
373			Z.hi ^= (u64)rem_4bit[rem]<<32;
374
375		Z.hi ^= Htable[nlo].hi;
376		Z.lo ^= Htable[nlo].lo;
377	}
378
379	if (is_endian.little) {
380#ifdef BSWAP8
381		Xi[0] = BSWAP8(Z.hi);
382		Xi[1] = BSWAP8(Z.lo);
383#else
384		u8 *p = (u8 *)Xi;
385		u32 v;
386		v = (u32)(Z.hi>>32);	PUTU32(p,v);
387		v = (u32)(Z.hi);	PUTU32(p+4,v);
388		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
389		v = (u32)(Z.lo);	PUTU32(p+12,v);
390#endif
391	}
392	else {
393		Xi[0] = Z.hi;
394		Xi[1] = Z.lo;
395	}
396}
397
398#if !defined(OPENSSL_SMALL_FOOTPRINT)
399/*
400 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401 * details... Compiler-generated code doesn't seem to give any
402 * performance improvement, at least not on x86[_64]. It's here
403 * mostly as reference and a placeholder for possible future
404 * non-trivial optimization[s]...
405 */
406static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407				const u8 *inp,size_t len)
408{
409    u128 Z;
410    int cnt;
411    size_t rem, nlo, nhi;
412    const union { long one; char little; } is_endian = {1};
413
414#if 1
415    do {
416	cnt  = 15;
417	nlo  = ((const u8 *)Xi)[15];
418	nlo ^= inp[15];
419	nhi  = nlo>>4;
420	nlo &= 0xf;
421
422	Z.hi = Htable[nlo].hi;
423	Z.lo = Htable[nlo].lo;
424
425	while (1) {
426		rem  = (size_t)Z.lo&0xf;
427		Z.lo = (Z.hi<<60)|(Z.lo>>4);
428		Z.hi = (Z.hi>>4);
429		if (sizeof(size_t)==8)
430			Z.hi ^= rem_4bit[rem];
431		else
432			Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434		Z.hi ^= Htable[nhi].hi;
435		Z.lo ^= Htable[nhi].lo;
436
437		if (--cnt<0)		break;
438
439		nlo  = ((const u8 *)Xi)[cnt];
440		nlo ^= inp[cnt];
441		nhi  = nlo>>4;
442		nlo &= 0xf;
443
444		rem  = (size_t)Z.lo&0xf;
445		Z.lo = (Z.hi<<60)|(Z.lo>>4);
446		Z.hi = (Z.hi>>4);
447		if (sizeof(size_t)==8)
448			Z.hi ^= rem_4bit[rem];
449		else
450			Z.hi ^= (u64)rem_4bit[rem]<<32;
451
452		Z.hi ^= Htable[nlo].hi;
453		Z.lo ^= Htable[nlo].lo;
454	}
455#else
456    /*
457     * Extra 256+16 bytes per-key plus 512 bytes shared tables
458     * [should] give ~50% improvement... One could have PACK()-ed
459     * the rem_8bit even here, but the priority is to minimize
460     * cache footprint...
461     */
462    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
463    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
464    static const unsigned short rem_8bit[256] = {
465	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
497    /*
498     * This pre-processing phase slows down procedure by approximately
499     * same time as it makes each loop spin faster. In other words
500     * single block performance is approximately same as straightforward
501     * "4-bit" implementation, and then it goes only faster...
502     */
503    for (cnt=0; cnt<16; ++cnt) {
504	Z.hi = Htable[cnt].hi;
505	Z.lo = Htable[cnt].lo;
506	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507	Hshr4[cnt].hi = (Z.hi>>4);
508	Hshl4[cnt]    = (u8)(Z.lo<<4);
509    }
510
511    do {
512	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513		nlo  = ((const u8 *)Xi)[cnt];
514		nlo ^= inp[cnt];
515		nhi  = nlo>>4;
516		nlo &= 0xf;
517
518		Z.hi ^= Htable[nlo].hi;
519		Z.lo ^= Htable[nlo].lo;
520
521		rem = (size_t)Z.lo&0xff;
522
523		Z.lo = (Z.hi<<56)|(Z.lo>>8);
524		Z.hi = (Z.hi>>8);
525
526		Z.hi ^= Hshr4[nhi].hi;
527		Z.lo ^= Hshr4[nhi].lo;
528		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
529	}
530
531	nlo  = ((const u8 *)Xi)[0];
532	nlo ^= inp[0];
533	nhi  = nlo>>4;
534	nlo &= 0xf;
535
536	Z.hi ^= Htable[nlo].hi;
537	Z.lo ^= Htable[nlo].lo;
538
539	rem = (size_t)Z.lo&0xf;
540
541	Z.lo = (Z.hi<<60)|(Z.lo>>4);
542	Z.hi = (Z.hi>>4);
543
544	Z.hi ^= Htable[nhi].hi;
545	Z.lo ^= Htable[nhi].lo;
546	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
547#endif
548
549	if (is_endian.little) {
550#ifdef BSWAP8
551		Xi[0] = BSWAP8(Z.hi);
552		Xi[1] = BSWAP8(Z.lo);
553#else
554		u8 *p = (u8 *)Xi;
555		u32 v;
556		v = (u32)(Z.hi>>32);	PUTU32(p,v);
557		v = (u32)(Z.hi);	PUTU32(p+4,v);
558		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
559		v = (u32)(Z.lo);	PUTU32(p+12,v);
560#endif
561	}
562	else {
563		Xi[0] = Z.hi;
564		Xi[1] = Z.lo;
565	}
566    } while (inp+=16, len-=16);
567}
568#endif
569#else
570void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572#endif
573
/*
 * GCM_MUL(ctx,Xi) multiplies the named context field (Xi or Yi) by the
 * hash key in place. GHASH(ctx,in,len) folds len bytes at in into
 * ctx->Xi; it is only available when gcm_ghash_4bit exists, i.e. with
 * GHASH_ASM or without OPENSSL_SMALL_FOOTPRINT.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit((ctx)->Xi.u,(ctx)->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" intended to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while it's
 * still in L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif
582
583#else	/* TABLE_BITS */
584
585static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586{
587	u128 V,Z = { 0,0 };
588	long X;
589	int  i,j;
590	const long *xi = (const long *)Xi;
591	const union { long one; char little; } is_endian = {1};
592
593	V.hi = H[0];	/* H is in host byte order, no byte swapping */
594	V.lo = H[1];
595
596	for (j=0; j<16/sizeof(long); ++j) {
597		if (is_endian.little) {
598			if (sizeof(long)==8) {
599#ifdef BSWAP8
600				X = (long)(BSWAP8(xi[j]));
601#else
602				const u8 *p = (const u8 *)(xi+j);
603				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604#endif
605			}
606			else {
607				const u8 *p = (const u8 *)(xi+j);
608				X = (long)GETU32(p);
609			}
610		}
611		else
612			X = xi[j];
613
614		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615			u64 M = (u64)(X>>(8*sizeof(long)-1));
616			Z.hi ^= V.hi&M;
617			Z.lo ^= V.lo&M;
618
619			REDUCE1BIT(V);
620		}
621	}
622
623	if (is_endian.little) {
624#ifdef BSWAP8
625		Xi[0] = BSWAP8(Z.hi);
626		Xi[1] = BSWAP8(Z.lo);
627#else
628		u8 *p = (u8 *)Xi;
629		u32 v;
630		v = (u32)(Z.hi>>32);	PUTU32(p,v);
631		v = (u32)(Z.hi);	PUTU32(p+4,v);
632		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
633		v = (u32)(Z.lo);	PUTU32(p+12,v);
634#endif
635	}
636	else {
637		Xi[0] = Z.hi;
638		Xi[1] = Z.lo;
639	}
640}
641#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643#endif
644
#if	TABLE_BITS==4 && defined(GHASH_ASM)
/*
 * Declarations for the platform-specific GHASH routines that may be
 * selected at run time. GCM_FUNCREF_4BIT is defined whenever such a
 * choice exists, so that GCM_MUL/GHASH go through per-context function
 * pointers. Only prototypes appear here; the definitions are not part
 * of this C code.
 */
# if	!defined(I386_ONLY) && \
	(defined(__i386)	|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

/* carry-less-multiplication based routines */
void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16],
				const u8 *inp, size_t len);

#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
/* 32-bit x86 only: MMX flavour ... */
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16],
				const u8 *inp, size_t len);

/* ... and plain integer flavour */
void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16],
				const u8 *inp, size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
/* NEON routines, available from ARMv7 on */
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
				const u8 *inp, size_t len);
#  endif
# endif
#endif
676
#ifdef GCM_FUNCREF_4BIT
/*
 * When the implementation is chosen at run time, route GCM_MUL and GHASH
 * through the function pointers gcm_gmult_p / gcm_ghash_p. Every function
 * that uses these macros must declare locals with exactly those names.
 */
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)((ctx)->Xi.u,(ctx)->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)((ctx)->Xi.u,(ctx)->Htable,in,len)
# endif
#endif
685
686void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
687{
688	const union { long one; char little; } is_endian = {1};
689
690	memset(ctx,0,sizeof(*ctx));
691	ctx->block = block;
692	ctx->key   = key;
693
694	(*block)(ctx->H.c,ctx->H.c,key);
695
696	if (is_endian.little) {
697		/* H is stored in host byte order */
698#ifdef BSWAP8
699		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
700		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
701#else
702		u8 *p = ctx->H.c;
703		u64 hi,lo;
704		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
705		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
706		ctx->H.u[0] = hi;
707		ctx->H.u[1] = lo;
708#endif
709	}
710
711#if	TABLE_BITS==8
712	gcm_init_8bit(ctx->Htable,ctx->H.u);
713#elif	TABLE_BITS==4
714# if	defined(GHASH_ASM_X86_OR_64)
715#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
716	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
717	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
718		gcm_init_clmul(ctx->Htable,ctx->H.u);
719		ctx->gmult = gcm_gmult_clmul;
720		ctx->ghash = gcm_ghash_clmul;
721		return;
722	}
723#  endif
724	gcm_init_4bit(ctx->Htable,ctx->H.u);
725#  if	defined(GHASH_ASM_X86)			/* x86 only */
726#   if	defined(OPENSSL_IA32_SSE2)
727	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
728#   else
729	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
730#   endif
731		ctx->gmult = gcm_gmult_4bit_mmx;
732		ctx->ghash = gcm_ghash_4bit_mmx;
733	} else {
734		ctx->gmult = gcm_gmult_4bit_x86;
735		ctx->ghash = gcm_ghash_4bit_x86;
736	}
737#  else
738	ctx->gmult = gcm_gmult_4bit;
739	ctx->ghash = gcm_ghash_4bit;
740#  endif
741# elif	defined(GHASH_ASM_ARM)
742	if (OPENSSL_armcap_P & ARMV7_NEON) {
743		ctx->gmult = gcm_gmult_neon;
744		ctx->ghash = gcm_ghash_neon;
745	} else {
746		gcm_init_4bit(ctx->Htable,ctx->H.u);
747		ctx->gmult = gcm_gmult_4bit;
748		ctx->ghash = gcm_ghash_4bit;
749	}
750# else
751	gcm_init_4bit(ctx->Htable,ctx->H.u);
752# endif
753#endif
754}
755
756void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
757{
758	const union { long one; char little; } is_endian = {1};
759	unsigned int ctr;
760#ifdef GCM_FUNCREF_4BIT
761	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
762#endif
763
764	ctx->Yi.u[0]  = 0;
765	ctx->Yi.u[1]  = 0;
766	ctx->Xi.u[0]  = 0;
767	ctx->Xi.u[1]  = 0;
768	ctx->len.u[0] = 0;	/* AAD length */
769	ctx->len.u[1] = 0;	/* message length */
770	ctx->ares = 0;
771	ctx->mres = 0;
772
773	if (len==12) {
774		memcpy(ctx->Yi.c,iv,12);
775		ctx->Yi.c[15]=1;
776		ctr=1;
777	}
778	else {
779		size_t i;
780		u64 len0 = len;
781
782		while (len>=16) {
783			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
784			GCM_MUL(ctx,Yi);
785			iv += 16;
786			len -= 16;
787		}
788		if (len) {
789			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
790			GCM_MUL(ctx,Yi);
791		}
792		len0 <<= 3;
793		if (is_endian.little) {
794#ifdef BSWAP8
795			ctx->Yi.u[1]  ^= BSWAP8(len0);
796#else
797			ctx->Yi.c[8]  ^= (u8)(len0>>56);
798			ctx->Yi.c[9]  ^= (u8)(len0>>48);
799			ctx->Yi.c[10] ^= (u8)(len0>>40);
800			ctx->Yi.c[11] ^= (u8)(len0>>32);
801			ctx->Yi.c[12] ^= (u8)(len0>>24);
802			ctx->Yi.c[13] ^= (u8)(len0>>16);
803			ctx->Yi.c[14] ^= (u8)(len0>>8);
804			ctx->Yi.c[15] ^= (u8)(len0);
805#endif
806		}
807		else
808			ctx->Yi.u[1]  ^= len0;
809
810		GCM_MUL(ctx,Yi);
811
812		if (is_endian.little)
813			ctr = GETU32(ctx->Yi.c+12);
814		else
815			ctr = ctx->Yi.d[3];
816	}
817
818	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
819	++ctr;
820	if (is_endian.little)
821		PUTU32(ctx->Yi.c+12,ctr);
822	else
823		ctx->Yi.d[3] = ctr;
824}
825
826int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
827{
828	size_t i;
829	unsigned int n;
830	u64 alen = ctx->len.u[0];
831#ifdef GCM_FUNCREF_4BIT
832	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
833# ifdef GHASH
834	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
835				const u8 *inp,size_t len)	= ctx->ghash;
836# endif
837#endif
838
839	if (ctx->len.u[1]) return -2;
840
841	alen += len;
842	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
843		return -1;
844	ctx->len.u[0] = alen;
845
846	n = ctx->ares;
847	if (n) {
848		while (n && len) {
849			ctx->Xi.c[n] ^= *(aad++);
850			--len;
851			n = (n+1)%16;
852		}
853		if (n==0) GCM_MUL(ctx,Xi);
854		else {
855			ctx->ares = n;
856			return 0;
857		}
858	}
859
860#ifdef GHASH
861	if ((i = (len&(size_t)-16))) {
862		GHASH(ctx,aad,i);
863		aad += i;
864		len -= i;
865	}
866#else
867	while (len>=16) {
868		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
869		GCM_MUL(ctx,Xi);
870		aad += 16;
871		len -= 16;
872	}
873#endif
874	if (len) {
875		n = (unsigned int)len;
876		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
877	}
878
879	ctx->ares = n;
880	return 0;
881}
882
883int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
884		const unsigned char *in, unsigned char *out,
885		size_t len)
886{
887	const union { long one; char little; } is_endian = {1};
888	unsigned int n, ctr;
889	size_t i;
890	u64        mlen  = ctx->len.u[1];
891	block128_f block = ctx->block;
892	void      *key   = ctx->key;
893#ifdef GCM_FUNCREF_4BIT
894	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
895# ifdef GHASH
896	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
897				const u8 *inp,size_t len)	= ctx->ghash;
898# endif
899#endif
900
901#if 0
902	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
903#endif
904	mlen += len;
905	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
906		return -1;
907	ctx->len.u[1] = mlen;
908
909	if (ctx->ares) {
910		/* First call to encrypt finalizes GHASH(AAD) */
911		GCM_MUL(ctx,Xi);
912		ctx->ares = 0;
913	}
914
915	if (is_endian.little)
916		ctr = GETU32(ctx->Yi.c+12);
917	else
918		ctr = ctx->Yi.d[3];
919
920	n = ctx->mres;
921#if !defined(OPENSSL_SMALL_FOOTPRINT)
922	if (16%sizeof(size_t) == 0) do {	/* always true actually */
923		if (n) {
924			while (n && len) {
925				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
926				--len;
927				n = (n+1)%16;
928			}
929			if (n==0) GCM_MUL(ctx,Xi);
930			else {
931				ctx->mres = n;
932				return 0;
933			}
934		}
935#if defined(STRICT_ALIGNMENT)
936		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
937			break;
938#endif
939#if defined(GHASH) && defined(GHASH_CHUNK)
940		while (len>=GHASH_CHUNK) {
941		    size_t j=GHASH_CHUNK;
942
943		    while (j) {
944			(*block)(ctx->Yi.c,ctx->EKi.c,key);
945			++ctr;
946			if (is_endian.little)
947				PUTU32(ctx->Yi.c+12,ctr);
948			else
949				ctx->Yi.d[3] = ctr;
950			for (i=0; i<16; i+=sizeof(size_t))
951				*(size_t *)(out+i) =
952				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
953			out += 16;
954			in  += 16;
955			j   -= 16;
956		    }
957		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
958		    len -= GHASH_CHUNK;
959		}
960		if ((i = (len&(size_t)-16))) {
961		    size_t j=i;
962
963		    while (len>=16) {
964			(*block)(ctx->Yi.c,ctx->EKi.c,key);
965			++ctr;
966			if (is_endian.little)
967				PUTU32(ctx->Yi.c+12,ctr);
968			else
969				ctx->Yi.d[3] = ctr;
970			for (i=0; i<16; i+=sizeof(size_t))
971				*(size_t *)(out+i) =
972				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
973			out += 16;
974			in  += 16;
975			len -= 16;
976		    }
977		    GHASH(ctx,out-j,j);
978		}
979#else
980		while (len>=16) {
981			(*block)(ctx->Yi.c,ctx->EKi.c,key);
982			++ctr;
983			if (is_endian.little)
984				PUTU32(ctx->Yi.c+12,ctr);
985			else
986				ctx->Yi.d[3] = ctr;
987			for (i=0; i<16; i+=sizeof(size_t))
988				*(size_t *)(ctx->Xi.c+i) ^=
989				*(size_t *)(out+i) =
990				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
991			GCM_MUL(ctx,Xi);
992			out += 16;
993			in  += 16;
994			len -= 16;
995		}
996#endif
997		if (len) {
998			(*block)(ctx->Yi.c,ctx->EKi.c,key);
999			++ctr;
1000			if (is_endian.little)
1001				PUTU32(ctx->Yi.c+12,ctr);
1002			else
1003				ctx->Yi.d[3] = ctr;
1004			while (len--) {
1005				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1006				++n;
1007			}
1008		}
1009
1010		ctx->mres = n;
1011		return 0;
1012	} while(0);
1013#endif
1014	for (i=0;i<len;++i) {
1015		if (n==0) {
1016			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1017			++ctr;
1018			if (is_endian.little)
1019				PUTU32(ctx->Yi.c+12,ctr);
1020			else
1021				ctx->Yi.d[3] = ctr;
1022		}
1023		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1024		n = (n+1)%16;
1025		if (n==0)
1026			GCM_MUL(ctx,Xi);
1027	}
1028
1029	ctx->mres = n;
1030	return 0;
1031}
1032
1033int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1034		const unsigned char *in, unsigned char *out,
1035		size_t len)
1036{
1037	const union { long one; char little; } is_endian = {1};
1038	unsigned int n, ctr;
1039	size_t i;
1040	u64        mlen  = ctx->len.u[1];
1041	block128_f block = ctx->block;
1042	void      *key   = ctx->key;
1043#ifdef GCM_FUNCREF_4BIT
1044	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1045# ifdef GHASH
1046	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1047				const u8 *inp,size_t len)	= ctx->ghash;
1048# endif
1049#endif
1050
1051	mlen += len;
1052	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1053		return -1;
1054	ctx->len.u[1] = mlen;
1055
1056	if (ctx->ares) {
1057		/* First call to decrypt finalizes GHASH(AAD) */
1058		GCM_MUL(ctx,Xi);
1059		ctx->ares = 0;
1060	}
1061
1062	if (is_endian.little)
1063		ctr = GETU32(ctx->Yi.c+12);
1064	else
1065		ctr = ctx->Yi.d[3];
1066
1067	n = ctx->mres;
1068#if !defined(OPENSSL_SMALL_FOOTPRINT)
1069	if (16%sizeof(size_t) == 0) do {	/* always true actually */
1070		if (n) {
1071			while (n && len) {
1072				u8 c = *(in++);
1073				*(out++) = c^ctx->EKi.c[n];
1074				ctx->Xi.c[n] ^= c;
1075				--len;
1076				n = (n+1)%16;
1077			}
1078			if (n==0) GCM_MUL (ctx,Xi);
1079			else {
1080				ctx->mres = n;
1081				return 0;
1082			}
1083		}
1084#if defined(STRICT_ALIGNMENT)
1085		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1086			break;
1087#endif
1088#if defined(GHASH) && defined(GHASH_CHUNK)
1089		while (len>=GHASH_CHUNK) {
1090		    size_t j=GHASH_CHUNK;
1091
1092		    GHASH(ctx,in,GHASH_CHUNK);
1093		    while (j) {
1094			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1095			++ctr;
1096			if (is_endian.little)
1097				PUTU32(ctx->Yi.c+12,ctr);
1098			else
1099				ctx->Yi.d[3] = ctr;
1100			for (i=0; i<16; i+=sizeof(size_t))
1101				*(size_t *)(out+i) =
1102				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1103			out += 16;
1104			in  += 16;
1105			j   -= 16;
1106		    }
1107		    len -= GHASH_CHUNK;
1108		}
1109		if ((i = (len&(size_t)-16))) {
1110		    GHASH(ctx,in,i);
1111		    while (len>=16) {
1112			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1113			++ctr;
1114			if (is_endian.little)
1115				PUTU32(ctx->Yi.c+12,ctr);
1116			else
1117				ctx->Yi.d[3] = ctr;
1118			for (i=0; i<16; i+=sizeof(size_t))
1119				*(size_t *)(out+i) =
1120				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1121			out += 16;
1122			in  += 16;
1123			len -= 16;
1124		    }
1125		}
1126#else
1127		while (len>=16) {
1128			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1129			++ctr;
1130			if (is_endian.little)
1131				PUTU32(ctx->Yi.c+12,ctr);
1132			else
1133				ctx->Yi.d[3] = ctr;
1134			for (i=0; i<16; i+=sizeof(size_t)) {
1135				size_t c = *(size_t *)(in+i);
1136				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1137				*(size_t *)(ctx->Xi.c+i) ^= c;
1138			}
1139			GCM_MUL(ctx,Xi);
1140			out += 16;
1141			in  += 16;
1142			len -= 16;
1143		}
1144#endif
1145		if (len) {
1146			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1147			++ctr;
1148			if (is_endian.little)
1149				PUTU32(ctx->Yi.c+12,ctr);
1150			else
1151				ctx->Yi.d[3] = ctr;
1152			while (len--) {
1153				u8 c = in[n];
1154				ctx->Xi.c[n] ^= c;
1155				out[n] = c^ctx->EKi.c[n];
1156				++n;
1157			}
1158		}
1159
1160		ctx->mres = n;
1161		return 0;
1162	} while(0);
1163#endif
1164	for (i=0;i<len;++i) {
1165		u8 c;
1166		if (n==0) {
1167			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1168			++ctr;
1169			if (is_endian.little)
1170				PUTU32(ctx->Yi.c+12,ctr);
1171			else
1172				ctx->Yi.d[3] = ctr;
1173		}
1174		c = in[i];
1175		out[i] = c^ctx->EKi.c[n];
1176		ctx->Xi.c[n] ^= c;
1177		n = (n+1)%16;
1178		if (n==0)
1179			GCM_MUL(ctx,Xi);
1180	}
1181
1182	ctx->mres = n;
1183	return 0;
1184}
1185
1186int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1187		const unsigned char *in, unsigned char *out,
1188		size_t len, ctr128_f stream)
1189{
1190	const union { long one; char little; } is_endian = {1};
1191	unsigned int n, ctr;
1192	size_t i;
1193	u64   mlen = ctx->len.u[1];
1194	void *key  = ctx->key;
1195#ifdef GCM_FUNCREF_4BIT
1196	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1197# ifdef GHASH
1198	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1199				const u8 *inp,size_t len)	= ctx->ghash;
1200# endif
1201#endif
1202
1203	mlen += len;
1204	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1205		return -1;
1206	ctx->len.u[1] = mlen;
1207
1208	if (ctx->ares) {
1209		/* First call to encrypt finalizes GHASH(AAD) */
1210		GCM_MUL(ctx,Xi);
1211		ctx->ares = 0;
1212	}
1213
1214	if (is_endian.little)
1215		ctr = GETU32(ctx->Yi.c+12);
1216	else
1217		ctr = ctx->Yi.d[3];
1218
1219	n = ctx->mres;
1220	if (n) {
1221		while (n && len) {
1222			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1223			--len;
1224			n = (n+1)%16;
1225		}
1226		if (n==0) GCM_MUL(ctx,Xi);
1227		else {
1228			ctx->mres = n;
1229			return 0;
1230		}
1231	}
1232#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1233	while (len>=GHASH_CHUNK) {
1234		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1235		ctr += GHASH_CHUNK/16;
1236		if (is_endian.little)
1237			PUTU32(ctx->Yi.c+12,ctr);
1238		else
1239			ctx->Yi.d[3] = ctr;
1240		GHASH(ctx,out,GHASH_CHUNK);
1241		out += GHASH_CHUNK;
1242		in  += GHASH_CHUNK;
1243		len -= GHASH_CHUNK;
1244	}
1245#endif
1246	if ((i = (len&(size_t)-16))) {
1247		size_t j=i/16;
1248
1249		(*stream)(in,out,j,key,ctx->Yi.c);
1250		ctr += (unsigned int)j;
1251		if (is_endian.little)
1252			PUTU32(ctx->Yi.c+12,ctr);
1253		else
1254			ctx->Yi.d[3] = ctr;
1255		in  += i;
1256		len -= i;
1257#if defined(GHASH)
1258		GHASH(ctx,out,i);
1259		out += i;
1260#else
1261		while (j--) {
1262			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1263			GCM_MUL(ctx,Xi);
1264			out += 16;
1265		}
1266#endif
1267	}
1268	if (len) {
1269		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1270		++ctr;
1271		if (is_endian.little)
1272			PUTU32(ctx->Yi.c+12,ctr);
1273		else
1274			ctx->Yi.d[3] = ctr;
1275		while (len--) {
1276			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1277			++n;
1278		}
1279	}
1280
1281	ctx->mres = n;
1282	return 0;
1283}
1284
1285int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1286		const unsigned char *in, unsigned char *out,
1287		size_t len,ctr128_f stream)
1288{
1289	const union { long one; char little; } is_endian = {1};
1290	unsigned int n, ctr;
1291	size_t i;
1292	u64   mlen = ctx->len.u[1];
1293	void *key  = ctx->key;
1294#ifdef GCM_FUNCREF_4BIT
1295	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1296# ifdef GHASH
1297	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1298				const u8 *inp,size_t len)	= ctx->ghash;
1299# endif
1300#endif
1301
1302	mlen += len;
1303	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1304		return -1;
1305	ctx->len.u[1] = mlen;
1306
1307	if (ctx->ares) {
1308		/* First call to decrypt finalizes GHASH(AAD) */
1309		GCM_MUL(ctx,Xi);
1310		ctx->ares = 0;
1311	}
1312
1313	if (is_endian.little)
1314		ctr = GETU32(ctx->Yi.c+12);
1315	else
1316		ctr = ctx->Yi.d[3];
1317
1318	n = ctx->mres;
1319	if (n) {
1320		while (n && len) {
1321			u8 c = *(in++);
1322			*(out++) = c^ctx->EKi.c[n];
1323			ctx->Xi.c[n] ^= c;
1324			--len;
1325			n = (n+1)%16;
1326		}
1327		if (n==0) GCM_MUL (ctx,Xi);
1328		else {
1329			ctx->mres = n;
1330			return 0;
1331		}
1332	}
1333#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1334	while (len>=GHASH_CHUNK) {
1335		GHASH(ctx,in,GHASH_CHUNK);
1336		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1337		ctr += GHASH_CHUNK/16;
1338		if (is_endian.little)
1339			PUTU32(ctx->Yi.c+12,ctr);
1340		else
1341			ctx->Yi.d[3] = ctr;
1342		out += GHASH_CHUNK;
1343		in  += GHASH_CHUNK;
1344		len -= GHASH_CHUNK;
1345	}
1346#endif
1347	if ((i = (len&(size_t)-16))) {
1348		size_t j=i/16;
1349
1350#if defined(GHASH)
1351		GHASH(ctx,in,i);
1352#else
1353		while (j--) {
1354			size_t k;
1355			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1356			GCM_MUL(ctx,Xi);
1357			in += 16;
1358		}
1359		j   = i/16;
1360		in -= i;
1361#endif
1362		(*stream)(in,out,j,key,ctx->Yi.c);
1363		ctr += (unsigned int)j;
1364		if (is_endian.little)
1365			PUTU32(ctx->Yi.c+12,ctr);
1366		else
1367			ctx->Yi.d[3] = ctr;
1368		out += i;
1369		in  += i;
1370		len -= i;
1371	}
1372	if (len) {
1373		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1374		++ctr;
1375		if (is_endian.little)
1376			PUTU32(ctx->Yi.c+12,ctr);
1377		else
1378			ctx->Yi.d[3] = ctr;
1379		while (len--) {
1380			u8 c = in[n];
1381			ctx->Xi.c[n] ^= c;
1382			out[n] = c^ctx->EKi.c[n];
1383			++n;
1384		}
1385	}
1386
1387	ctx->mres = n;
1388	return 0;
1389}
1390
1391int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1392			size_t len)
1393{
1394	const union { long one; char little; } is_endian = {1};
1395	u64 alen = ctx->len.u[0]<<3;
1396	u64 clen = ctx->len.u[1]<<3;
1397#ifdef GCM_FUNCREF_4BIT
1398	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1399#endif
1400
1401	if (ctx->mres || ctx->ares)
1402		GCM_MUL(ctx,Xi);
1403
1404	if (is_endian.little) {
1405#ifdef BSWAP8
1406		alen = BSWAP8(alen);
1407		clen = BSWAP8(clen);
1408#else
1409		u8 *p = ctx->len.c;
1410
1411		ctx->len.u[0] = alen;
1412		ctx->len.u[1] = clen;
1413
1414		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1415		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1416#endif
1417	}
1418
1419	ctx->Xi.u[0] ^= alen;
1420	ctx->Xi.u[1] ^= clen;
1421	GCM_MUL(ctx,Xi);
1422
1423	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1424	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1425
1426	if (tag && len<=sizeof(ctx->Xi))
1427		return memcmp(ctx->Xi.c,tag,len);
1428	else
1429		return -1;
1430}
1431
1432void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1433{
1434	CRYPTO_gcm128_finish(ctx, NULL, 0);
1435	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1436}
1437
1438GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1439{
1440	GCM128_CONTEXT *ret;
1441
1442	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1443		CRYPTO_gcm128_init(ret,key,block);
1444
1445	return ret;
1446}
1447
1448void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1449{
1450	if (ctx) {
1451		OPENSSL_cleanse(ctx,sizeof(*ctx));
1452		OPENSSL_free(ctx);
1453	}
1454}
1455
1456#if defined(SELFTEST)
1457#include <stdio.h>
1458#include <openssl/aes.h>
1459
/* Test Case 1: all-zero 128-bit key and 96-bit IV, no plaintext, no AAD (tag only) */
1461static const u8	K1[16],
1462		*P1=NULL,
1463		*A1=NULL,
1464		IV1[12],
1465		*C1=NULL,
1466		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1467
/* Test Case 2: key, IV and (absent) AAD of Test Case 1; 16 zero bytes of plaintext */
1469#define K2 K1
1470#define A2 A1
1471#define IV2 IV1
1472static const u8	P2[16],
1473		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1474		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1475
/* Test Case 3: 128-bit key, 96-bit IV, 64-byte plaintext, no AAD */
1477#define A3 A2
1478static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1479		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1480			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1481			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1482			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1483		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1484		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1485			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1486			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1487			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1488		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1489
/* Test Case 4: key and IV of Test Case 3; 60-byte plaintext, 20-byte AAD */
1491#define K4 K3
1492#define IV4 IV3
1493static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1494			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1495			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1496			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1497		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1498			0xab,0xad,0xda,0xd2},
1499		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1500			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1501			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1502			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1503		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1504
/* Test Case 5: key, plaintext and AAD of Test Case 4; 8-byte IV */
1506#define K5 K4
1507#define P5 P4
1508#define A5 A4
1509static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1510		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1511			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1512			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1513			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1514		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1515
/* Test Case 6: key, plaintext and AAD of Test Case 5; 60-byte IV */
1517#define K6 K5
1518#define P6 P5
1519#define A6 A5
1520static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1521			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1522			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1523			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1524		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1525			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1526			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1527			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1528		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1529
/* Test Case 7: all-zero 192-bit key and 96-bit IV, no plaintext, no AAD (tag only) */
1531static const u8 K7[24],
1532		*P7=NULL,
1533		*A7=NULL,
1534		IV7[12],
1535		*C7=NULL,
1536		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1537
/* Test Case 8: key, IV and (absent) AAD of Test Case 7; 16 zero bytes of plaintext */
1539#define K8 K7
1540#define IV8 IV7
1541#define A8 A7
1542static const u8	P8[16],
1543		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1544		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1545
/* Test Case 9: 192-bit key, 96-bit IV, 64-byte plaintext, no AAD */
1547#define A9 A8
1548static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1549			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1550		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1551			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1552			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1553			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1554		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1555		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1556			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1557			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1558			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1559		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1560
/* Test Case 10: key and IV of Test Case 9; 60-byte plaintext, 20-byte AAD */
1562#define K10 K9
1563#define IV10 IV9
1564static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1565			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1566			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1567			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1568		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1569			0xab,0xad,0xda,0xd2},
1570		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1571			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1572			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1573			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1574		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1575
/* Test Case 11: key, plaintext and AAD of Test Case 10; 8-byte IV */
1577#define K11 K10
1578#define P11 P10
1579#define A11 A10
1580static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1581		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1582			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1583			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1584			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1585		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1586
/* Test Case 12: key, plaintext and AAD of Test Case 11; 60-byte IV */
1588#define K12 K11
1589#define P12 P11
1590#define A12 A11
1591static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1592			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1593			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1594			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1595		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1596			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1597			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1598			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1599		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1600
/* Test Case 13: all-zero 256-bit key and 96-bit IV, no plaintext, no AAD (tag only) */
1602static const u8	K13[32],
1603		*P13=NULL,
1604		*A13=NULL,
1605		IV13[12],
1606		*C13=NULL,
1607		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1608
/* Test Case 14: key and (absent) AAD of Test Case 13; all-zero 96-bit IV, 16 zero bytes of plaintext */
1610#define K14 K13
1611#define A14 A13
1612static const u8	P14[16],
1613		IV14[12],
1614		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1615		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1616
/* Test Case 15: 256-bit key, 96-bit IV, 64-byte plaintext, no AAD */
1618#define A15 A14
1619static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1620			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1621		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1622			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1623			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1624			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1625		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1626		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1627			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1628			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1629			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1630		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1631
/* Test Case 16: key and IV of Test Case 15; 60-byte plaintext, 20-byte AAD */
1633#define K16 K15
1634#define IV16 IV15
1635static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1636			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1637			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1638			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1639		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1640			0xab,0xad,0xda,0xd2},
1641		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1642			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1643			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1644			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1645		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1646
/* Test Case 17: key, plaintext and AAD of Test Case 16; 8-byte IV */
1648#define K17 K16
1649#define P17 P16
1650#define A17 A16
1651static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1652		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1653			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1654			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1655			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1656		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1657
/* Test Case 18: key, plaintext and AAD of Test Case 17; 60-byte IV */
1659#define K18 K17
1660#define P18 P17
1661#define A18 A17
1662static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1663			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1664			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1665			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1666		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1667			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1668			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1669			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1670		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1671
/*
 * TEST_CASE(n) - run known-answer vector #n in both directions.
 *
 * Keys the cipher with K<n>, then (1) encrypts P<n> and checks the result
 * against C<n> and the tag against T<n>, and (2) decrypts C<n> and checks
 * the result against P<n> and the tag against T<n>.  Each failing direction
 * increments |ret| and prints a message.  AAD, plaintext and ciphertext
 * steps are skipped when the corresponding symbol is a NULL pointer.
 *
 * Expands in a scope that must provide |ctx| (GCM128_CONTEXT), |key|
 * (AES_KEY) and |ret| (int).
 */
#define TEST_CASE(n)	do {						\
	u8 out[sizeof(P##n)];						\
									\
	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);			\
	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);		\
									\
	/* encrypt direction */						\
	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));			\
	memset(out,0,sizeof(out));					\
	if (A##n) {							\
		CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));		\
	}								\
	if (P##n) {							\
		CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));	\
	}								\
	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||			\
	    (C##n && memcmp(out,C##n,sizeof(out)))) {			\
		ret++;							\
		printf ("encrypt test#%d failed.\n",n);			\
	}								\
									\
	/* decrypt direction, same key schedule, IV reset */		\
	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));			\
	memset(out,0,sizeof(out));					\
	if (A##n) {							\
		CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));		\
	}								\
	if (C##n) {							\
		CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));	\
	}								\
	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||			\
	    (P##n && memcmp(out,P##n,sizeof(out)))) {			\
		ret++;							\
		printf ("decrypt test#%d failed.\n",n);			\
	}								\
	} while(0)
1691
1692int main()
1693{
1694	GCM128_CONTEXT ctx;
1695	AES_KEY key;
1696	int ret=0;
1697
1698	TEST_CASE(1);
1699	TEST_CASE(2);
1700	TEST_CASE(3);
1701	TEST_CASE(4);
1702	TEST_CASE(5);
1703	TEST_CASE(6);
1704	TEST_CASE(7);
1705	TEST_CASE(8);
1706	TEST_CASE(9);
1707	TEST_CASE(10);
1708	TEST_CASE(11);
1709	TEST_CASE(12);
1710	TEST_CASE(13);
1711	TEST_CASE(14);
1712	TEST_CASE(15);
1713	TEST_CASE(16);
1714	TEST_CASE(17);
1715	TEST_CASE(18);
1716
1717#ifdef OPENSSL_CPUID_OBJ
1718	{
1719	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1720	union { u64 u; u8 c[1024]; } buf;
1721	int i;
1722
1723	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1724	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1725	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1726
1727	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1728	start = OPENSSL_rdtsc();
1729	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1730	gcm_t = OPENSSL_rdtsc() - start;
1731
1732	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1733			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1734			(block128_f)AES_encrypt);
1735	start = OPENSSL_rdtsc();
1736	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1737			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1738			(block128_f)AES_encrypt);
1739	ctr_t = OPENSSL_rdtsc() - start;
1740
1741	printf("%.2f-%.2f=%.2f\n",
1742			gcm_t/(double)sizeof(buf),
1743			ctr_t/(double)sizeof(buf),
1744			(gcm_t-ctr_t)/(double)sizeof(buf));
1745#ifdef GHASH
1746	GHASH(&ctx,buf.c,sizeof(buf));
1747	start = OPENSSL_rdtsc();
1748	for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1749	gcm_t = OPENSSL_rdtsc() - start;
1750	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1751#endif
1752	}
1753#endif
1754
1755	return ret;
1756}
1757#endif
1758