aes_x86core.c revision 221304ee937bc0910948a8be1320cb8cc4eb6d36
1/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
2/**
3 * rijndael-alg-fst.c
4 *
5 * @version 3.0 (December 2000)
6 *
7 * Optimised ANSI C code for the Rijndael cipher (now AES)
8 *
9 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11 * @author Paulo Barreto <paulo.barreto@terra.com.br>
12 *
13 * This code is hereby placed in the public domain.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
/*
 * This is an experimental x86[_64] derivative. It assumes little-endian
 * byte order and expects the CPU to tolerate unaligned memory references.
 * It is used as a playground for cache-timing attack mitigations and
 * serves as the reference C implementation for the x86[_64] assembler.
 *
 *					<appro@fy.chalmers.se>
 */
36
37
38#ifndef AES_DEBUG
39# ifndef NDEBUG
40#  define NDEBUG
41# endif
42#endif
43#include <assert.h>
44
45#include <stdlib.h>
46#include <openssl/aes.h>
47#include "aes_locl.h"
48
/*
 * AES_COMPACT_IN_OUTER_ROUNDS and AES_COMPACT_IN_INNER_ROUNDS select,
 * for the outer (first and last) and the inner rounds respectively,
 * which table is referenced: the 256-byte table when the macro is
 * defined, the 2KB table when it is not.
 */
#define AES_COMPACT_IN_OUTER_ROUNDS
#ifdef  AES_COMPACT_IN_OUTER_ROUNDS
/*
 * AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
 * additionally enabling AES_COMPACT_IN_INNER_ROUNDS slows the
 * benchmark down by a *further* factor of ~2, so the inner-round
 * option is forced off whenever the outer-round option is on.
 */
# undef  AES_COMPACT_IN_INNER_ROUNDS
#endif
60
#if 1
/*
 * Touch a 256-byte table so that it is pulled into the data cache
 * before it is indexed (part of this file's cache-timing mitigation
 * experiments).
 *
 * table: start of a readable region of at least 256 bytes; no
 *        particular alignment is required.
 *
 * One byte is read from every 32-byte stride.  The reads go through a
 * volatile-qualified pointer, so the compiler has to emit each of
 * them even though the accumulated value is never used.  Byte-wide
 * accesses keep the function free of alignment and aliasing
 * assumptions about the table's element type.
 */
static void prefetch256(const void *table)
{
	const volatile unsigned char *line = table;
	unsigned int sum = 0;
	size_t off;

	/* 32 is common least cache-line size */
	for (off = 0; off < 256; off += 32)
		sum ^= line[off];

	(void)sum;
}
#else
# define prefetch256(t)
#endif
76
/*
 * GETU32(p): load a 32-bit word directly through a u32 pointer.
 * Any earlier definition (presumably the one from aes_locl.h -- TODO
 * confirm) is dropped first.  The word is read in the host's native
 * byte order with no alignment fix-up, which relies on the
 * little-endian, unaligned-access-tolerant target this file assumes.
 */
#undef GETU32
#define GETU32(p) (*((u32*)(p)))
79
/*
 * u64:    unsigned integer type used for the entries of the Te/Td
 *         tables (assumed to be 64 bits wide on every branch).
 * U64(C): pastes the integer-literal suffix matching u64 onto the
 *         constant C.
 */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
typedef unsigned __int64 u64;
#define U64(C)	C##UI64
#elif defined(__arch64__)
typedef unsigned long u64;
#define U64(C)	C##UL
#else
typedef unsigned long long u64;
#define U64(C)	C##ULL
#endif
90
/*
 * ROTATE(a,n): rotate the 32-bit value a left by n bits.
 *
 * It is defined only for MSVC/ICC (through _lrotl) and for GCC-style
 * compilers targeting x86 or x86_64 (through an inline "roll").  On
 * any other toolchain ROTATE is left undefined, and the code below
 * tests defined(ROTATE) to fall back to plain shift/XOR expressions.
 *
 * In the inline-asm variant n is passed with the "I" constraint, so
 * it has to be a compile-time constant; a is tied to the output
 * register through the "0" constraint.
 */
#undef ROTATE
#if defined(_MSC_VER) || defined(__ICC)
# define ROTATE(a,n)	_lrotl(a,n)
#elif defined(__GNUC__) && __GNUC__>=2
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
#   define ROTATE(a,n)	({ register unsigned int ret;	\
				asm (			\
				"roll %1,%0"		\
				: "=r"(ret)		\
				: "I"(n), "0"(a)	\
				: "cc");		\
			   ret;				\
			})
# endif
#endif
/*
Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
Te0[x] = S [x].[02, 01, 01, 03];
Te1[x] = S [x].[03, 02, 01, 01];
Te2[x] = S [x].[01, 03, 02, 01];
Te3[x] = S [x].[01, 01, 03, 02];

TeN is only meaningful when it is immediately subscripted, as in
TeN[i]: the subscript binds to the (u64*) pointer before the (u32)
cast is applied, so TeN[i] performs an 8-byte load that starts k bytes
into entry Te[i] (k = 0, 3, 2, 1 for N = 0, 1, 2, 3) and keeps the low
32 bits of the loaded value.  Because every Te entry repeats its
32-bit word twice, this yields the byte-rotated words listed above on
a little-endian host.

NOTE(review): for i == 255 and k != 0 the 8-byte load extends k bytes
past the end of Te.  The extra bytes are discarded by the (u32)
truncation, but the read itself is out of bounds -- confirm this is
acceptable for the target toolchain.
*/
#define Te0 (u32)((u64*)((u8*)Te+0))
#define Te1 (u32)((u64*)((u8*)Te+3))
#define Te2 (u32)((u64*)((u8*)Te+2))
#define Te3 (u32)((u64*)((u8*)Te+1))
/*
Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
Td0[x] = Si[x].[0e, 09, 0d, 0b];
Td1[x] = Si[x].[0b, 0e, 09, 0d];
Td2[x] = Si[x].[0d, 0b, 0e, 09];
Td3[x] = Si[x].[09, 0d, 0b, 0e];
Td4[x] = Si[x].[01];

TdN (N = 0..3) is only meaningful when it is immediately subscripted,
as in TdN[i]: the subscript binds to the (u64*) pointer before the
(u32) cast is applied, so TdN[i] performs an 8-byte load that starts
k bytes into entry Td[i] (k = 0, 3, 2, 1 for N = 0, 1, 2, 3) and keeps
the low 32 bits of the loaded value.  Because every Td entry repeats
its 32-bit word twice, this yields the byte-rotated words listed above
on a little-endian host.

NOTE(review): for i == 255 and k != 0 the 8-byte load extends k bytes
past the end of Td.  The extra bytes are discarded by the (u32)
truncation, but the read itself is out of bounds -- confirm this is
acceptable for the target toolchain.
*/
#define Td0 (u32)((u64*)((u8*)Td+0))
#define Td1 (u32)((u64*)((u8*)Td+3))
#define Td2 (u32)((u64*)((u8*)Td+2))
#define Td3 (u32)((u64*)((u8*)Td+1))
129
130static const u64 Te[256] = {
131    U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
132    U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
133    U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
134    U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
135    U64(0x5030306050303060), U64(0x0301010203010102),
136    U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
137    U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
138    U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
139    U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
140    U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
141    U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
142    U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
143    U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
144    U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
145    U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
146    U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
147    U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
148    U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
149    U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
150    U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
151    U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
152    U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
153    U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
154    U64(0x5331316253313162), U64(0x3f15152a3f15152a),
155    U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
156    U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
157    U64(0x2818183028181830), U64(0xa1969637a1969637),
158    U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
159    U64(0x0907070e0907070e), U64(0x3612122436121224),
160    U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
161    U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
162    U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
163    U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
164    U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
165    U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
166    U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
167    U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
168    U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
169    U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
170    U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
171    U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
172    U64(0x0000000000000000), U64(0x2cededc12cededc1),
173    U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
174    U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
175    U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
176    U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
177    U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
178    U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
179    U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
180    U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
181    U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
182    U64(0x5533336655333366), U64(0x9485851194858511),
183    U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
184    U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
185    U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
186    U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
187    U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
188    U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
189    U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
190    U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
191    U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
192    U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
193    U64(0x3010102030101020), U64(0x1affffe51affffe5),
194    U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
195    U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
196    U64(0x3513132635131326), U64(0x2fececc32fececc3),
197    U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
198    U64(0xcc444488cc444488), U64(0x3917172e3917172e),
199    U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
200    U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
201    U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
202    U64(0x2b1919322b191932), U64(0x957373e6957373e6),
203    U64(0xa06060c0a06060c0), U64(0x9881811998818119),
204    U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
205    U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
206    U64(0xab90903bab90903b), U64(0x8388880b8388880b),
207    U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
208    U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
209    U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
210    U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
211    U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
212    U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
213    U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
214    U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
215    U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
216    U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
217    U64(0xa8919139a8919139), U64(0xa4959531a4959531),
218    U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
219    U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
220    U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
221    U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
222    U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
223    U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
224    U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
225    U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
226    U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
227    U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
228    U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
229    U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
230    U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
231    U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
232    U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
233    U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
234    U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
235    U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
236    U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
237    U64(0xd8484890d8484890), U64(0x0503030605030306),
238    U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
239    U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
240    U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
241    U64(0x9186861791868617), U64(0x58c1c19958c1c199),
242    U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
243    U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
244    U64(0xb398982bb398982b), U64(0x3311112233111122),
245    U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
246    U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
247    U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
248    U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
249    U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
250    U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
251    U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
252    U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
253    U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
254    U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
255    U64(0xc3414182c3414182), U64(0xb0999929b0999929),
256    U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
257    U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
258    U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
259};
260
261static const u8 Te4[256] = {
262    0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
263    0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
264    0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
265    0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
266    0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
267    0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
268    0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
269    0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
270    0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
271    0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
272    0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
273    0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
274    0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
275    0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
276    0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
277    0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
278    0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
279    0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
280    0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
281    0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
282    0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
283    0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
284    0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
285    0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
286    0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
287    0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
288    0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
289    0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
290    0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
291    0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
292    0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
293    0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
294};
295
296static const u64 Td[256] = {
297    U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
298    U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
299    U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
300    U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
301    U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
302    U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
303    U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
304    U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
305    U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
306    U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
307    U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
308    U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
309    U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
310    U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
311    U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
312    U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
313    U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
314    U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
315    U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
316    U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
317    U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
318    U64(0x6033519760335197), U64(0x457f5362457f5362),
319    U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
320    U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
321    U64(0x5868487058684870), U64(0x19fd458f19fd458f),
322    U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
323    U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
324    U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
325    U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
326    U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
327    U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
328    U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
329    U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
330    U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
331    U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
332    U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
333    U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
334    U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
335    U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
336    U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
337    U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
338    U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
339    U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
340    U64(0x6fd406046fd40604), U64(0xff155060ff155060),
341    U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
342    U64(0xcc434089cc434089), U64(0x779ed967779ed967),
343    U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
344    U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
345    U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
346    U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
347    U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
348    U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
349    U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
350    U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
351    U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
352    U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
353    U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
354    U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
355    U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
356    U64(0x694b775a694b775a), U64(0x161a121c161a121c),
357    U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
358    U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
359    U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
360    U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
361    U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
362    U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
363    U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
364    U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
365    U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
366    U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
367    U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
368    U64(0x4022971340229713), U64(0x2011c6842011c684),
369    U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
370    U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
371    U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
372    U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
373    U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
374    U64(0xfa489411fa489411), U64(0x2264e9472264e947),
375    U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
376    U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
377    U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
378    U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
379    U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
380    U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
381    U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
382    U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
383    U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
384    U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
385    U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
386    U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
387    U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
388    U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
389    U64(0x097826cd097826cd), U64(0xf418596ef418596e),
390    U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
391    U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
392    U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
393    U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
394    U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
395    U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
396    U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
397    U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
398    U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
399    U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
400    U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
401    U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
402    U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
403    U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
404    U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
405    U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
406    U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
407    U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
408    U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
409    U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
410    U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
411    U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
412    U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
413    U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
414    U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
415    U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
416    U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
417    U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
418    U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
419    U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
420    U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
421    U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
422    U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
423    U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
424    U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
425};
426static const u8 Td4[256] = {
427    0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
428    0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
429    0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
430    0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
431    0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
432    0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
433    0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
434    0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
435    0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
436    0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
437    0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
438    0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
439    0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
440    0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
441    0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
442    0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
443    0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
444    0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
445    0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
446    0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
447    0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
448    0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
449    0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
450    0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
451    0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
452    0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
453    0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
454    0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
455    0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
456    0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
457    0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
458    0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
459};
460
461static const u32 rcon[] = {
462    0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
463    0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
464    0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
465};
466
467/**
468 * Expand the cipher key into the encryption key schedule.
469 */
470int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
471			AES_KEY *key) {
472
473	u32 *rk;
474   	int i = 0;
475	u32 temp;
476
477	if (!userKey || !key)
478		return -1;
479	if (bits != 128 && bits != 192 && bits != 256)
480		return -2;
481
482	rk = key->rd_key;
483
484	if (bits==128)
485		key->rounds = 10;
486	else if (bits==192)
487		key->rounds = 12;
488	else
489		key->rounds = 14;
490
491	rk[0] = GETU32(userKey     );
492	rk[1] = GETU32(userKey +  4);
493	rk[2] = GETU32(userKey +  8);
494	rk[3] = GETU32(userKey + 12);
495	if (bits == 128) {
496		while (1) {
497			temp  = rk[3];
498			rk[4] = rk[0] ^
499				(Te4[(temp >>  8) & 0xff]      ) ^
500				(Te4[(temp >> 16) & 0xff] <<  8) ^
501				(Te4[(temp >> 24)       ] << 16) ^
502				(Te4[(temp      ) & 0xff] << 24) ^
503				rcon[i];
504			rk[5] = rk[1] ^ rk[4];
505			rk[6] = rk[2] ^ rk[5];
506			rk[7] = rk[3] ^ rk[6];
507			if (++i == 10) {
508				return 0;
509			}
510			rk += 4;
511		}
512	}
513	rk[4] = GETU32(userKey + 16);
514	rk[5] = GETU32(userKey + 20);
515	if (bits == 192) {
516		while (1) {
517			temp = rk[ 5];
518			rk[ 6] = rk[ 0] ^
519				(Te4[(temp >>  8) & 0xff]      ) ^
520				(Te4[(temp >> 16) & 0xff] <<  8) ^
521				(Te4[(temp >> 24)       ] << 16) ^
522				(Te4[(temp      ) & 0xff] << 24) ^
523				rcon[i];
524			rk[ 7] = rk[ 1] ^ rk[ 6];
525			rk[ 8] = rk[ 2] ^ rk[ 7];
526			rk[ 9] = rk[ 3] ^ rk[ 8];
527			if (++i == 8) {
528				return 0;
529			}
530			rk[10] = rk[ 4] ^ rk[ 9];
531			rk[11] = rk[ 5] ^ rk[10];
532			rk += 6;
533		}
534	}
535	rk[6] = GETU32(userKey + 24);
536	rk[7] = GETU32(userKey + 28);
537	if (bits == 256) {
538		while (1) {
539			temp = rk[ 7];
540			rk[ 8] = rk[ 0] ^
541				(Te4[(temp >>  8) & 0xff]      ) ^
542				(Te4[(temp >> 16) & 0xff] <<  8) ^
543				(Te4[(temp >> 24)       ] << 16) ^
544				(Te4[(temp      ) & 0xff] << 24) ^
545				rcon[i];
546			rk[ 9] = rk[ 1] ^ rk[ 8];
547			rk[10] = rk[ 2] ^ rk[ 9];
548			rk[11] = rk[ 3] ^ rk[10];
549			if (++i == 7) {
550				return 0;
551			}
552			temp = rk[11];
553			rk[12] = rk[ 4] ^
554				(Te4[(temp      ) & 0xff]      ) ^
555				(Te4[(temp >>  8) & 0xff] <<  8) ^
556				(Te4[(temp >> 16) & 0xff] << 16) ^
557				(Te4[(temp >> 24)       ] << 24);
558			rk[13] = rk[ 5] ^ rk[12];
559			rk[14] = rk[ 6] ^ rk[13];
560			rk[15] = rk[ 7] ^ rk[14];
561
562			rk += 8;
563        	}
564	}
565	return 0;
566}
567
568/**
569 * Expand the cipher key into the decryption key schedule.
570 */
571int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
572			 AES_KEY *key) {
573
574        u32 *rk;
575	int i, j, status;
576	u32 temp;
577
578	/* first, start with an encryption schedule */
579	status = AES_set_encrypt_key(userKey, bits, key);
580	if (status < 0)
581		return status;
582
583	rk = key->rd_key;
584
585	/* invert the order of the round keys: */
586	for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
587		temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
588		temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
589		temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
590		temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
591	}
592	/* apply the inverse MixColumn transform to all round keys but the first and the last: */
593	for (i = 1; i < (key->rounds); i++) {
594		rk += 4;
595#if 1
596		for (j = 0; j < 4; j++) {
597			u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
598
599			tp1 = rk[j];
600			m = tp1 & 0x80808080;
601			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
602				((m - (m >> 7)) & 0x1b1b1b1b);
603			m = tp2 & 0x80808080;
604			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
605				((m - (m >> 7)) & 0x1b1b1b1b);
606			m = tp4 & 0x80808080;
607			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
608				((m - (m >> 7)) & 0x1b1b1b1b);
609			tp9 = tp8 ^ tp1;
610			tpb = tp9 ^ tp2;
611			tpd = tp9 ^ tp4;
612			tpe = tp8 ^ tp4 ^ tp2;
613#if defined(ROTATE)
614			rk[j] = tpe ^ ROTATE(tpd,16) ^
615				ROTATE(tp9,8) ^ ROTATE(tpb,24);
616#else
617			rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
618				(tp9 >> 24) ^ (tp9 << 8) ^
619				(tpb >> 8) ^ (tpb << 24);
620#endif
621		}
622#else
623		rk[0] =
624			Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
625			Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
626			Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
627			Td3[Te2[(rk[0] >> 24)       ] & 0xff];
628		rk[1] =
629			Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
630			Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
631			Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
632			Td3[Te2[(rk[1] >> 24)       ] & 0xff];
633		rk[2] =
634			Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
635			Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
636			Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
637			Td3[Te2[(rk[2] >> 24)       ] & 0xff];
638		rk[3] =
639			Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
640			Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
641			Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
642			Td3[Te2[(rk[3] >> 24)       ] & 0xff];
643#endif
644	}
645	return 0;
646}
647
648/*
649 * Encrypt a single block
650 * in and out can overlap
651 */
652void AES_encrypt(const unsigned char *in, unsigned char *out,
653		 const AES_KEY *key) {
654
655	const u32 *rk;
656	u32 s0, s1, s2, s3, t[4];
657	int r;
658
659	assert(in && out && key);
660	rk = key->rd_key;
661
662	/*
663	 * map byte array block to cipher state
664	 * and add initial round key:
665	 */
666	s0 = GETU32(in     ) ^ rk[0];
667	s1 = GETU32(in +  4) ^ rk[1];
668	s2 = GETU32(in +  8) ^ rk[2];
669	s3 = GETU32(in + 12) ^ rk[3];
670
671#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
672	prefetch256(Te4);
673
674	t[0] =	Te4[(s0      ) & 0xff]       ^
675		Te4[(s1 >>  8) & 0xff] <<  8 ^
676		Te4[(s2 >> 16) & 0xff] << 16 ^
677		Te4[(s3 >> 24)       ] << 24;
678	t[1] =	Te4[(s1      ) & 0xff]       ^
679		Te4[(s2 >>  8) & 0xff] <<  8 ^
680		Te4[(s3 >> 16) & 0xff] << 16 ^
681		Te4[(s0 >> 24)       ] << 24;
682	t[2] =	Te4[(s2      ) & 0xff]       ^
683		Te4[(s3 >>  8) & 0xff] <<  8 ^
684		Te4[(s0 >> 16) & 0xff] << 16 ^
685		Te4[(s1 >> 24)       ] << 24;
686	t[3] =	Te4[(s3      ) & 0xff]       ^
687		Te4[(s0 >>  8) & 0xff] <<  8 ^
688		Te4[(s1 >> 16) & 0xff] << 16 ^
689		Te4[(s2 >> 24)       ] << 24;
690
691	/* now do the linear transform using words */
692	{	int i;
693		u32 r0, r1, r2;
694
695		for (i = 0; i < 4; i++) {
696			r0 = t[i];
697			r1 = r0 & 0x80808080;
698			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
699				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
700#if defined(ROTATE)
701			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
702				ROTATE(r0,16) ^ ROTATE(r0,8);
703#else
704			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
705				(r0 << 16) ^ (r0 >> 16) ^
706				(r0 << 8) ^ (r0 >> 24);
707#endif
708			t[i] ^= rk[4+i];
709		}
710	}
711#else
712	t[0] =	Te0[(s0      ) & 0xff] ^
713		Te1[(s1 >>  8) & 0xff] ^
714		Te2[(s2 >> 16) & 0xff] ^
715		Te3[(s3 >> 24)       ] ^
716		rk[4];
717	t[1] =	Te0[(s1      ) & 0xff] ^
718		Te1[(s2 >>  8) & 0xff] ^
719		Te2[(s3 >> 16) & 0xff] ^
720		Te3[(s0 >> 24)       ] ^
721		rk[5];
722	t[2] =	Te0[(s2      ) & 0xff] ^
723		Te1[(s3 >>  8) & 0xff] ^
724		Te2[(s0 >> 16) & 0xff] ^
725		Te3[(s1 >> 24)       ] ^
726		rk[6];
727	t[3] =	Te0[(s3      ) & 0xff] ^
728		Te1[(s0 >>  8) & 0xff] ^
729		Te2[(s1 >> 16) & 0xff] ^
730		Te3[(s2 >> 24)       ] ^
731		rk[7];
732#endif
733	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
734
735    /*
736     * Nr - 2 full rounds:
737     */
738    for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
739#if defined(AES_COMPACT_IN_INNER_ROUNDS)
740	t[0] =	Te4[(s0      ) & 0xff]       ^
741		Te4[(s1 >>  8) & 0xff] <<  8 ^
742		Te4[(s2 >> 16) & 0xff] << 16 ^
743		Te4[(s3 >> 24)       ] << 24;
744	t[1] =	Te4[(s1      ) & 0xff]       ^
745		Te4[(s2 >>  8) & 0xff] <<  8 ^
746		Te4[(s3 >> 16) & 0xff] << 16 ^
747		Te4[(s0 >> 24)       ] << 24;
748	t[2] =	Te4[(s2      ) & 0xff]       ^
749		Te4[(s3 >>  8) & 0xff] <<  8 ^
750		Te4[(s0 >> 16) & 0xff] << 16 ^
751		Te4[(s1 >> 24)       ] << 24;
752	t[3] =	Te4[(s3      ) & 0xff]       ^
753		Te4[(s0 >>  8) & 0xff] <<  8 ^
754		Te4[(s1 >> 16) & 0xff] << 16 ^
755		Te4[(s2 >> 24)       ] << 24;
756
757	/* now do the linear transform using words */
758	{	int i;
759		u32 r0, r1, r2;
760
761		for (i = 0; i < 4; i++) {
762			r0 = t[i];
763			r1 = r0 & 0x80808080;
764			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
765				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
766#if defined(ROTATE)
767			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
768				ROTATE(r0,16) ^ ROTATE(r0,8);
769#else
770			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
771				(r0 << 16) ^ (r0 >> 16) ^
772				(r0 << 8) ^ (r0 >> 24);
773#endif
774			t[i] ^= rk[i];
775		}
776	}
777#else
778	t[0] =	Te0[(s0      ) & 0xff] ^
779		Te1[(s1 >>  8) & 0xff] ^
780		Te2[(s2 >> 16) & 0xff] ^
781		Te3[(s3 >> 24)       ] ^
782		rk[0];
783	t[1] =	Te0[(s1      ) & 0xff] ^
784		Te1[(s2 >>  8) & 0xff] ^
785		Te2[(s3 >> 16) & 0xff] ^
786		Te3[(s0 >> 24)       ] ^
787		rk[1];
788	t[2] =	Te0[(s2      ) & 0xff] ^
789		Te1[(s3 >>  8) & 0xff] ^
790		Te2[(s0 >> 16) & 0xff] ^
791		Te3[(s1 >> 24)       ] ^
792		rk[2];
793	t[3] =	Te0[(s3      ) & 0xff] ^
794		Te1[(s0 >>  8) & 0xff] ^
795		Te2[(s1 >> 16) & 0xff] ^
796		Te3[(s2 >> 24)       ] ^
797		rk[3];
798#endif
799	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
800    }
801    /*
802	 * apply last round and
803	 * map cipher state to byte array block:
804	 */
805#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
806	prefetch256(Te4);
807
808	*(u32*)(out+0) =
809		Te4[(s0      ) & 0xff]       ^
810		Te4[(s1 >>  8) & 0xff] <<  8 ^
811		Te4[(s2 >> 16) & 0xff] << 16 ^
812		Te4[(s3 >> 24)       ] << 24 ^
813		rk[0];
814	*(u32*)(out+4) =
815		Te4[(s1      ) & 0xff]       ^
816		Te4[(s2 >>  8) & 0xff] <<  8 ^
817		Te4[(s3 >> 16) & 0xff] << 16 ^
818		Te4[(s0 >> 24)       ] << 24 ^
819		rk[1];
820	*(u32*)(out+8) =
821		Te4[(s2      ) & 0xff]       ^
822		Te4[(s3 >>  8) & 0xff] <<  8 ^
823		Te4[(s0 >> 16) & 0xff] << 16 ^
824		Te4[(s1 >> 24)       ] << 24 ^
825		rk[2];
826	*(u32*)(out+12) =
827		Te4[(s3      ) & 0xff]       ^
828		Te4[(s0 >>  8) & 0xff] <<  8 ^
829		Te4[(s1 >> 16) & 0xff] << 16 ^
830		Te4[(s2 >> 24)       ] << 24 ^
831		rk[3];
832#else
833	*(u32*)(out+0) =
834		(Te2[(s0      ) & 0xff] & 0x000000ffU) ^
835		(Te3[(s1 >>  8) & 0xff] & 0x0000ff00U) ^
836		(Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
837		(Te1[(s3 >> 24)       ] & 0xff000000U) ^
838		rk[0];
839	*(u32*)(out+4) =
840		(Te2[(s1      ) & 0xff] & 0x000000ffU) ^
841		(Te3[(s2 >>  8) & 0xff] & 0x0000ff00U) ^
842		(Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
843		(Te1[(s0 >> 24)       ] & 0xff000000U) ^
844		rk[1];
845	*(u32*)(out+8) =
846		(Te2[(s2      ) & 0xff] & 0x000000ffU) ^
847		(Te3[(s3 >>  8) & 0xff] & 0x0000ff00U) ^
848		(Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
849		(Te1[(s1 >> 24)       ] & 0xff000000U) ^
850		rk[2];
851	*(u32*)(out+12) =
852		(Te2[(s3      ) & 0xff] & 0x000000ffU) ^
853		(Te3[(s0 >>  8) & 0xff] & 0x0000ff00U) ^
854		(Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
855		(Te1[(s2 >> 24)       ] & 0xff000000U) ^
856		rk[3];
857#endif
858}
859
860/*
861 * Decrypt a single block
862 * in and out can overlap
863 */
864void AES_decrypt(const unsigned char *in, unsigned char *out,
865		 const AES_KEY *key) {
866
867	const u32 *rk;
868	u32 s0, s1, s2, s3, t[4];
869	int r;
870
871	assert(in && out && key);
872	rk = key->rd_key;
873
874	/*
875	 * map byte array block to cipher state
876	 * and add initial round key:
877	 */
878	s0 = GETU32(in     ) ^ rk[0];
879	s1 = GETU32(in +  4) ^ rk[1];
880	s2 = GETU32(in +  8) ^ rk[2];
881	s3 = GETU32(in + 12) ^ rk[3];
882
883#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
884	prefetch256(Td4);
885
886        t[0] =	Td4[(s0      ) & 0xff]       ^
887		Td4[(s3 >>  8) & 0xff] <<  8 ^
888		Td4[(s2 >> 16) & 0xff] << 16 ^
889		Td4[(s1 >> 24)       ] << 24;
890        t[1] =	Td4[(s1      ) & 0xff]       ^
891		Td4[(s0 >>  8) & 0xff] <<  8 ^
892		Td4[(s3 >> 16) & 0xff] << 16 ^
893		Td4[(s2 >> 24)       ] << 24;
894        t[2] =	Td4[(s2      ) & 0xff]       ^
895		Td4[(s1 >>  8) & 0xff] <<  8 ^
896		Td4[(s0 >> 16) & 0xff] << 16 ^
897		Td4[(s3 >> 24)       ] << 24;
898        t[3] =	Td4[(s3      ) & 0xff]       ^
899		Td4[(s2 >>  8) & 0xff] <<  8 ^
900		Td4[(s1 >> 16) & 0xff] << 16 ^
901		Td4[(s0 >> 24)       ] << 24;
902
903	/* now do the linear transform using words */
904	{	int i;
905		u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
906
907		for (i = 0; i < 4; i++) {
908			tp1 = t[i];
909			m = tp1 & 0x80808080;
910			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
911				((m - (m >> 7)) & 0x1b1b1b1b);
912			m = tp2 & 0x80808080;
913			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
914				((m - (m >> 7)) & 0x1b1b1b1b);
915			m = tp4 & 0x80808080;
916			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
917				((m - (m >> 7)) & 0x1b1b1b1b);
918			tp9 = tp8 ^ tp1;
919			tpb = tp9 ^ tp2;
920			tpd = tp9 ^ tp4;
921			tpe = tp8 ^ tp4 ^ tp2;
922#if defined(ROTATE)
923			t[i] = tpe ^ ROTATE(tpd,16) ^
924				ROTATE(tp9,8) ^ ROTATE(tpb,24);
925#else
926			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
927				(tp9 >> 24) ^ (tp9 << 8) ^
928				(tpb >> 8) ^ (tpb << 24);
929#endif
930			t[i] ^= rk[4+i];
931		}
932	}
933#else
934	t[0] =	Td0[(s0      ) & 0xff] ^
935		Td1[(s3 >>  8) & 0xff] ^
936		Td2[(s2 >> 16) & 0xff] ^
937		Td3[(s1 >> 24)       ] ^
938		rk[4];
939	t[1] =	Td0[(s1      ) & 0xff] ^
940		Td1[(s0 >>  8) & 0xff] ^
941		Td2[(s3 >> 16) & 0xff] ^
942		Td3[(s2 >> 24)       ] ^
943		rk[5];
944	t[2] =	Td0[(s2      ) & 0xff] ^
945		Td1[(s1 >>  8) & 0xff] ^
946		Td2[(s0 >> 16) & 0xff] ^
947		Td3[(s3 >> 24)       ] ^
948		rk[6];
949	t[3] =	Td0[(s3      ) & 0xff] ^
950		Td1[(s2 >>  8) & 0xff] ^
951		Td2[(s1 >> 16) & 0xff] ^
952		Td3[(s0 >> 24)       ] ^
953		rk[7];
954#endif
955	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
956
957    /*
958     * Nr - 2 full rounds:
959     */
960    for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
961#if defined(AES_COMPACT_IN_INNER_ROUNDS)
962        t[0] =	Td4[(s0      ) & 0xff]       ^
963		Td4[(s3 >>  8) & 0xff] <<  8 ^
964		Td4[(s2 >> 16) & 0xff] << 16 ^
965		Td4[(s1 >> 24)       ] << 24;
966        t[1] =	Td4[(s1      ) & 0xff]       ^
967		Td4[(s0 >>  8) & 0xff] <<  8 ^
968		Td4[(s3 >> 16) & 0xff] << 16 ^
969		Td4[(s2 >> 24)       ] << 24;
970        t[2] =	Td4[(s2      ) & 0xff]       ^
971		Td4[(s1 >>  8) & 0xff] <<  8 ^
972		Td4[(s0 >> 16) & 0xff] << 16 ^
973		Td4[(s3 >> 24)       ] << 24;
974        t[3] =	Td4[(s3      ) & 0xff]       ^
975		Td4[(s2 >>  8) & 0xff] <<  8 ^
976		Td4[(s1 >> 16) & 0xff] << 16 ^
977		Td4[(s0 >> 24)       ] << 24;
978
979	/* now do the linear transform using words */
980	{	int i;
981		u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
982
983		for (i = 0; i < 4; i++) {
984			tp1 = t[i];
985			m = tp1 & 0x80808080;
986			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
987				((m - (m >> 7)) & 0x1b1b1b1b);
988			m = tp2 & 0x80808080;
989			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
990				((m - (m >> 7)) & 0x1b1b1b1b);
991			m = tp4 & 0x80808080;
992			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
993				((m - (m >> 7)) & 0x1b1b1b1b);
994			tp9 = tp8 ^ tp1;
995			tpb = tp9 ^ tp2;
996			tpd = tp9 ^ tp4;
997			tpe = tp8 ^ tp4 ^ tp2;
998#if defined(ROTATE)
999			t[i] = tpe ^ ROTATE(tpd,16) ^
1000				ROTATE(tp9,8) ^ ROTATE(tpb,24);
1001#else
1002			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1003				(tp9 >> 24) ^ (tp9 << 8) ^
1004				(tpb >> 8) ^ (tpb << 24);
1005#endif
1006			t[i] ^= rk[i];
1007		}
1008	}
1009#else
1010	t[0] =	Td0[(s0      ) & 0xff] ^
1011		Td1[(s3 >>  8) & 0xff] ^
1012		Td2[(s2 >> 16) & 0xff] ^
1013		Td3[(s1 >> 24)       ] ^
1014		rk[0];
1015	t[1] =	Td0[(s1      ) & 0xff] ^
1016		Td1[(s0 >>  8) & 0xff] ^
1017		Td2[(s3 >> 16) & 0xff] ^
1018		Td3[(s2 >> 24)       ] ^
1019		rk[1];
1020	t[2] =	Td0[(s2      ) & 0xff] ^
1021		Td1[(s1 >>  8) & 0xff] ^
1022		Td2[(s0 >> 16) & 0xff] ^
1023		Td3[(s3 >> 24)       ] ^
1024		rk[2];
1025	t[3] =	Td0[(s3      ) & 0xff] ^
1026		Td1[(s2 >>  8) & 0xff] ^
1027		Td2[(s1 >> 16) & 0xff] ^
1028		Td3[(s0 >> 24)       ] ^
1029		rk[3];
1030#endif
1031	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
1032    }
1033    /*
1034	 * apply last round and
1035	 * map cipher state to byte array block:
1036	 */
1037	prefetch256(Td4);
1038
1039	*(u32*)(out+0) =
1040		(Td4[(s0      ) & 0xff])	^
1041		(Td4[(s3 >>  8) & 0xff] <<  8) ^
1042		(Td4[(s2 >> 16) & 0xff] << 16) ^
1043		(Td4[(s1 >> 24)       ] << 24) ^
1044		rk[0];
1045	*(u32*)(out+4) =
1046		(Td4[(s1      ) & 0xff])	 ^
1047		(Td4[(s0 >>  8) & 0xff] <<  8) ^
1048		(Td4[(s3 >> 16) & 0xff] << 16) ^
1049		(Td4[(s2 >> 24)       ] << 24) ^
1050		rk[1];
1051	*(u32*)(out+8) =
1052		(Td4[(s2      ) & 0xff])	 ^
1053		(Td4[(s1 >>  8) & 0xff] <<  8) ^
1054		(Td4[(s0 >> 16) & 0xff] << 16) ^
1055		(Td4[(s3 >> 24)       ] << 24) ^
1056		rk[2];
1057	*(u32*)(out+12) =
1058		(Td4[(s3      ) & 0xff])	 ^
1059		(Td4[(s2 >>  8) & 0xff] <<  8) ^
1060		(Td4[(s1 >> 16) & 0xff] << 16) ^
1061		(Td4[(s0 >> 24)       ] << 24) ^
1062		rk[3];
1063}
1064