1package org.bouncycastle.crypto.engines;
2
3import org.bouncycastle.crypto.BlockCipher;
4import org.bouncycastle.crypto.CipherParameters;
5import org.bouncycastle.crypto.DataLengthException;
6import org.bouncycastle.crypto.params.KeyParameter;
7
8/**
9 * an implementation of the AES (Rijndael), from FIPS-197.
10 * <p>
11 * For further details see: <a href="http://csrc.nist.gov/encryption/aes/">http://csrc.nist.gov/encryption/aes/</a>.
12 *
13 * This implementation is based on optimizations from Dr. Brian Gladman's paper and C code at
14 * <a href="http://fp.gladman.plus.com/cryptography_technology/rijndael/">http://fp.gladman.plus.com/cryptography_technology/rijndael/</a>
15 *
 * There are three levels of tradeoff of speed vs memory.
 * Because Java has no preprocessor, they are written as three separate classes from which to choose.
 *
 * The fastest uses 8Kbytes of static tables to precompute round calculations: four 256-word tables for encryption
 * and four for decryption.
 *
 * The middle performance version uses only one 256-word table for each, for a total of 2Kbytes,
 * adding 12 rotate operations per round to compute the values contained in the other tables from
 * the contents of the first.
 *
 * The slowest version uses no static tables at all and computes the values
 * in each round.
 * <p>
 * This file contains the slowest performance version with no static tables
 * for round precomputation, but it has the smallest footprint.
31 *
32 */
33public class AESLightEngine
34    implements BlockCipher
35{
36    // The S box
37    private static final byte[] S = {
38        (byte)99, (byte)124, (byte)119, (byte)123, (byte)242, (byte)107, (byte)111, (byte)197,
39        (byte)48,   (byte)1, (byte)103,  (byte)43, (byte)254, (byte)215, (byte)171, (byte)118,
40        (byte)202, (byte)130, (byte)201, (byte)125, (byte)250,  (byte)89,  (byte)71, (byte)240,
41        (byte)173, (byte)212, (byte)162, (byte)175, (byte)156, (byte)164, (byte)114, (byte)192,
42        (byte)183, (byte)253, (byte)147,  (byte)38,  (byte)54,  (byte)63, (byte)247, (byte)204,
43        (byte)52, (byte)165, (byte)229, (byte)241, (byte)113, (byte)216,  (byte)49,  (byte)21,
44        (byte)4, (byte)199,  (byte)35, (byte)195,  (byte)24, (byte)150,   (byte)5, (byte)154,
45        (byte)7,  (byte)18, (byte)128, (byte)226, (byte)235,  (byte)39, (byte)178, (byte)117,
46        (byte)9, (byte)131,  (byte)44,  (byte)26,  (byte)27, (byte)110,  (byte)90, (byte)160,
47        (byte)82,  (byte)59, (byte)214, (byte)179,  (byte)41, (byte)227,  (byte)47, (byte)132,
48        (byte)83, (byte)209,   (byte)0, (byte)237,  (byte)32, (byte)252, (byte)177,  (byte)91,
49        (byte)106, (byte)203, (byte)190,  (byte)57,  (byte)74,  (byte)76,  (byte)88, (byte)207,
50        (byte)208, (byte)239, (byte)170, (byte)251,  (byte)67,  (byte)77,  (byte)51, (byte)133,
51        (byte)69, (byte)249,   (byte)2, (byte)127,  (byte)80,  (byte)60, (byte)159, (byte)168,
52        (byte)81, (byte)163,  (byte)64, (byte)143, (byte)146, (byte)157,  (byte)56, (byte)245,
53        (byte)188, (byte)182, (byte)218,  (byte)33,  (byte)16, (byte)255, (byte)243, (byte)210,
54        (byte)205,  (byte)12,  (byte)19, (byte)236,  (byte)95, (byte)151,  (byte)68,  (byte)23,
55        (byte)196, (byte)167, (byte)126,  (byte)61, (byte)100,  (byte)93,  (byte)25, (byte)115,
56        (byte)96, (byte)129,  (byte)79, (byte)220,  (byte)34,  (byte)42, (byte)144, (byte)136,
57        (byte)70, (byte)238, (byte)184,  (byte)20, (byte)222,  (byte)94,  (byte)11, (byte)219,
58        (byte)224,  (byte)50,  (byte)58,  (byte)10,  (byte)73,   (byte)6,  (byte)36,  (byte)92,
59        (byte)194, (byte)211, (byte)172,  (byte)98, (byte)145, (byte)149, (byte)228, (byte)121,
60        (byte)231, (byte)200,  (byte)55, (byte)109, (byte)141, (byte)213,  (byte)78, (byte)169,
61        (byte)108,  (byte)86, (byte)244, (byte)234, (byte)101, (byte)122, (byte)174,   (byte)8,
62        (byte)186, (byte)120,  (byte)37,  (byte)46,  (byte)28, (byte)166, (byte)180, (byte)198,
63        (byte)232, (byte)221, (byte)116,  (byte)31,  (byte)75, (byte)189, (byte)139, (byte)138,
64        (byte)112,  (byte)62, (byte)181, (byte)102,  (byte)72,   (byte)3, (byte)246,  (byte)14,
65        (byte)97,  (byte)53,  (byte)87, (byte)185, (byte)134, (byte)193,  (byte)29, (byte)158,
66        (byte)225, (byte)248, (byte)152,  (byte)17, (byte)105, (byte)217, (byte)142, (byte)148,
67        (byte)155,  (byte)30, (byte)135, (byte)233, (byte)206,  (byte)85,  (byte)40, (byte)223,
68        (byte)140, (byte)161, (byte)137,  (byte)13, (byte)191, (byte)230,  (byte)66, (byte)104,
69        (byte)65, (byte)153,  (byte)45,  (byte)15, (byte)176,  (byte)84, (byte)187,  (byte)22,
70    };
71
72    // The inverse S-box
73    private static final byte[] Si = {
74        (byte)82,   (byte)9, (byte)106, (byte)213,  (byte)48,  (byte)54, (byte)165,  (byte)56,
75        (byte)191,  (byte)64, (byte)163, (byte)158, (byte)129, (byte)243, (byte)215, (byte)251,
76        (byte)124, (byte)227,  (byte)57, (byte)130, (byte)155,  (byte)47, (byte)255, (byte)135,
77        (byte)52, (byte)142,  (byte)67,  (byte)68, (byte)196, (byte)222, (byte)233, (byte)203,
78        (byte)84, (byte)123, (byte)148,  (byte)50, (byte)166, (byte)194,  (byte)35,  (byte)61,
79        (byte)238,  (byte)76, (byte)149,  (byte)11,  (byte)66, (byte)250, (byte)195,  (byte)78,
80        (byte)8,  (byte)46, (byte)161, (byte)102,  (byte)40, (byte)217,  (byte)36, (byte)178,
81        (byte)118,  (byte)91, (byte)162,  (byte)73, (byte)109, (byte)139, (byte)209,  (byte)37,
82        (byte)114, (byte)248, (byte)246, (byte)100, (byte)134, (byte)104, (byte)152,  (byte)22,
83        (byte)212, (byte)164,  (byte)92, (byte)204,  (byte)93, (byte)101, (byte)182, (byte)146,
84        (byte)108, (byte)112,  (byte)72,  (byte)80, (byte)253, (byte)237, (byte)185, (byte)218,
85        (byte)94,  (byte)21,  (byte)70,  (byte)87, (byte)167, (byte)141, (byte)157, (byte)132,
86        (byte)144, (byte)216, (byte)171,   (byte)0, (byte)140, (byte)188, (byte)211,  (byte)10,
87        (byte)247, (byte)228,  (byte)88,   (byte)5, (byte)184, (byte)179,  (byte)69,   (byte)6,
88        (byte)208,  (byte)44,  (byte)30, (byte)143, (byte)202,  (byte)63,  (byte)15,   (byte)2,
89        (byte)193, (byte)175, (byte)189,   (byte)3,   (byte)1,  (byte)19, (byte)138, (byte)107,
90        (byte)58, (byte)145,  (byte)17,  (byte)65,  (byte)79, (byte)103, (byte)220, (byte)234,
91        (byte)151, (byte)242, (byte)207, (byte)206, (byte)240, (byte)180, (byte)230, (byte)115,
92        (byte)150, (byte)172, (byte)116,  (byte)34, (byte)231, (byte)173,  (byte)53, (byte)133,
93        (byte)226, (byte)249,  (byte)55, (byte)232,  (byte)28, (byte)117, (byte)223, (byte)110,
94        (byte)71, (byte)241,  (byte)26, (byte)113,  (byte)29,  (byte)41, (byte)197, (byte)137,
95        (byte)111, (byte)183,  (byte)98,  (byte)14, (byte)170,  (byte)24, (byte)190,  (byte)27,
96        (byte)252,  (byte)86,  (byte)62,  (byte)75, (byte)198, (byte)210, (byte)121,  (byte)32,
97        (byte)154, (byte)219, (byte)192, (byte)254, (byte)120, (byte)205,  (byte)90, (byte)244,
98        (byte)31, (byte)221, (byte)168,  (byte)51, (byte)136,   (byte)7, (byte)199,  (byte)49,
99        (byte)177,  (byte)18,  (byte)16,  (byte)89,  (byte)39, (byte)128, (byte)236,  (byte)95,
100        (byte)96,  (byte)81, (byte)127, (byte)169,  (byte)25, (byte)181,  (byte)74,  (byte)13,
101        (byte)45, (byte)229, (byte)122, (byte)159, (byte)147, (byte)201, (byte)156, (byte)239,
102        (byte)160, (byte)224,  (byte)59,  (byte)77, (byte)174,  (byte)42, (byte)245, (byte)176,
103        (byte)200, (byte)235, (byte)187,  (byte)60, (byte)131,  (byte)83, (byte)153,  (byte)97,
104        (byte)23,  (byte)43,   (byte)4, (byte)126, (byte)186, (byte)119, (byte)214,  (byte)38,
105        (byte)225, (byte)105,  (byte)20,  (byte)99,  (byte)85,  (byte)33,  (byte)12, (byte)125,
106        };
107
108    // vector used in calculating key schedule (powers of x in GF(256))
109    private static final int[] rcon = {
110         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a,
111         0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91 };
112
113    private int shift(
114        int     r,
115        int     shift)
116    {
117        return (r >>> shift) | (r << -shift);
118    }
119
120    /* multiply four bytes in GF(2^8) by 'x' {02} in parallel */
121
122    private static final int m1 = 0x80808080;
123    private static final int m2 = 0x7f7f7f7f;
124    private static final int m3 = 0x0000001b;
125
126    private int FFmulX(int x)
127    {
128        return (((x & m2) << 1) ^ (((x & m1) >>> 7) * m3));
129    }
130
131    /*
132       The following defines provide alternative definitions of FFmulX that might
133       give improved performance if a fast 32-bit multiply is not available.
134
135       private int FFmulX(int x) { int u = x & m1; u |= (u >> 1); return ((x & m2) << 1) ^ ((u >>> 3) | (u >>> 6)); }
136       private static final int  m4 = 0x1b1b1b1b;
137       private int FFmulX(int x) { int u = x & m1; return ((x & m2) << 1) ^ ((u - (u >>> 7)) & m4); }
138
139    */
140
141    private int mcol(int x)
142    {
143        int f2 = FFmulX(x);
144        return f2 ^ shift(x ^ f2, 8) ^ shift(x, 16) ^ shift(x, 24);
145    }
146
147    private int inv_mcol(int x)
148    {
149        int f2 = FFmulX(x);
150        int f4 = FFmulX(f2);
151        int f8 = FFmulX(f4);
152        int f9 = x ^ f8;
153
154        return f2 ^ f4 ^ f8 ^ shift(f2 ^ f9, 8) ^ shift(f4 ^ f9, 16) ^ shift(f9, 24);
155    }
156
157
158    private int subWord(int x)
159    {
160        return (S[x&255]&255 | ((S[(x>>8)&255]&255)<<8) | ((S[(x>>16)&255]&255)<<16) | S[(x>>24)&255]<<24);
161    }
162
163    /**
164     * Calculate the necessary round keys
165     * The number of calculations depends on key size and block size
166     * AES specified a fixed block size of 128 bits and key sizes 128/192/256 bits
167     * This code is written assuming those are the only possible values
168     */
169    private int[][] generateWorkingKey(
170                                    byte[] key,
171                                    boolean forEncryption)
172    {
173        int         KC = key.length / 4;  // key length in words
174        int         t;
175
176        if (((KC != 4) && (KC != 6) && (KC != 8)) || ((KC * 4) != key.length))
177        {
178            throw new IllegalArgumentException("Key length not 128/192/256 bits.");
179        }
180
181        ROUNDS = KC + 6;  // This is not always true for the generalized Rijndael that allows larger block sizes
182        int[][] W = new int[ROUNDS+1][4];   // 4 words in a block
183
184        //
185        // copy the key into the round key array
186        //
187
188        t = 0;
189        int i = 0;
190        while (i < key.length)
191            {
192                W[t >> 2][t & 3] = (key[i]&0xff) | ((key[i+1]&0xff) << 8) | ((key[i+2]&0xff) << 16) | (key[i+3] << 24);
193                i+=4;
194                t++;
195            }
196
197        //
198        // while not enough round key material calculated
199        // calculate new values
200        //
201        int k = (ROUNDS + 1) << 2;
202        for (i = KC; (i < k); i++)
203            {
204                int temp = W[(i-1)>>2][(i-1)&3];
205                if ((i % KC) == 0)
206                {
207                    temp = subWord(shift(temp, 8)) ^ rcon[(i / KC)-1];
208                }
209                else if ((KC > 6) && ((i % KC) == 4))
210                {
211                    temp = subWord(temp);
212                }
213
214                W[i>>2][i&3] = W[(i - KC)>>2][(i-KC)&3] ^ temp;
215            }
216
217        if (!forEncryption)
218        {
219            for (int j = 1; j < ROUNDS; j++)
220            {
221                for (i = 0; i < 4; i++)
222                {
223                    W[j][i] = inv_mcol(W[j][i]);
224                }
225            }
226        }
227
228        return W;
229    }
230
231    private int         ROUNDS;
232    private int[][]     WorkingKey = null;
233    private int         C0, C1, C2, C3;
234    private boolean     forEncryption;
235
236    private static final int BLOCK_SIZE = 16;
237
238    /**
239     * default constructor - 128 bit block size.
240     */
241    public AESLightEngine()
242    {
243    }
244
245    /**
246     * initialise an AES cipher.
247     *
248     * @param forEncryption whether or not we are for encryption.
249     * @param params the parameters required to set up the cipher.
250     * @exception IllegalArgumentException if the params argument is
251     * inappropriate.
252     */
253    public void init(
254        boolean           forEncryption,
255        CipherParameters  params)
256    {
257        if (params instanceof KeyParameter)
258        {
259            WorkingKey = generateWorkingKey(((KeyParameter)params).getKey(), forEncryption);
260            this.forEncryption = forEncryption;
261            return;
262        }
263
264        throw new IllegalArgumentException("invalid parameter passed to AES init - " + params.getClass().getName());
265    }
266
267    public String getAlgorithmName()
268    {
269        return "AES";
270    }
271
272    public int getBlockSize()
273    {
274        return BLOCK_SIZE;
275    }
276
277    public int processBlock(
278        byte[] in,
279        int inOff,
280        byte[] out,
281        int outOff)
282    {
283        if (WorkingKey == null)
284        {
285            throw new IllegalStateException("AES engine not initialised");
286        }
287
288        if ((inOff + (32 / 2)) > in.length)
289        {
290            throw new DataLengthException("input buffer too short");
291        }
292
293        if ((outOff + (32 / 2)) > out.length)
294        {
295            throw new DataLengthException("output buffer too short");
296        }
297
298        if (forEncryption)
299        {
300            unpackBlock(in, inOff);
301            encryptBlock(WorkingKey);
302            packBlock(out, outOff);
303        }
304        else
305        {
306            unpackBlock(in, inOff);
307            decryptBlock(WorkingKey);
308            packBlock(out, outOff);
309        }
310
311        return BLOCK_SIZE;
312    }
313
314    public void reset()
315    {
316    }
317
318    private final void unpackBlock(
319        byte[]      bytes,
320        int         off)
321    {
322        int     index = off;
323
324        C0 = (bytes[index++] & 0xff);
325        C0 |= (bytes[index++] & 0xff) << 8;
326        C0 |= (bytes[index++] & 0xff) << 16;
327        C0 |= bytes[index++] << 24;
328
329        C1 = (bytes[index++] & 0xff);
330        C1 |= (bytes[index++] & 0xff) << 8;
331        C1 |= (bytes[index++] & 0xff) << 16;
332        C1 |= bytes[index++] << 24;
333
334        C2 = (bytes[index++] & 0xff);
335        C2 |= (bytes[index++] & 0xff) << 8;
336        C2 |= (bytes[index++] & 0xff) << 16;
337        C2 |= bytes[index++] << 24;
338
339        C3 = (bytes[index++] & 0xff);
340        C3 |= (bytes[index++] & 0xff) << 8;
341        C3 |= (bytes[index++] & 0xff) << 16;
342        C3 |= bytes[index++] << 24;
343    }
344
345    private final void packBlock(
346        byte[]      bytes,
347        int         off)
348    {
349        int     index = off;
350
351        bytes[index++] = (byte)C0;
352        bytes[index++] = (byte)(C0 >> 8);
353        bytes[index++] = (byte)(C0 >> 16);
354        bytes[index++] = (byte)(C0 >> 24);
355
356        bytes[index++] = (byte)C1;
357        bytes[index++] = (byte)(C1 >> 8);
358        bytes[index++] = (byte)(C1 >> 16);
359        bytes[index++] = (byte)(C1 >> 24);
360
361        bytes[index++] = (byte)C2;
362        bytes[index++] = (byte)(C2 >> 8);
363        bytes[index++] = (byte)(C2 >> 16);
364        bytes[index++] = (byte)(C2 >> 24);
365
366        bytes[index++] = (byte)C3;
367        bytes[index++] = (byte)(C3 >> 8);
368        bytes[index++] = (byte)(C3 >> 16);
369        bytes[index++] = (byte)(C3 >> 24);
370    }
371
372    private void encryptBlock(int[][] KW)
373    {
374        int r, r0, r1, r2, r3;
375
376        C0 ^= KW[0][0];
377        C1 ^= KW[0][1];
378        C2 ^= KW[0][2];
379        C3 ^= KW[0][3];
380
381        for (r = 1; r < ROUNDS - 1;)
382        {
383            r0 = mcol((S[C0&255]&255) ^ ((S[(C1>>8)&255]&255)<<8) ^ ((S[(C2>>16)&255]&255)<<16) ^ (S[(C3>>24)&255]<<24)) ^ KW[r][0];
384            r1 = mcol((S[C1&255]&255) ^ ((S[(C2>>8)&255]&255)<<8) ^ ((S[(C3>>16)&255]&255)<<16) ^ (S[(C0>>24)&255]<<24)) ^ KW[r][1];
385            r2 = mcol((S[C2&255]&255) ^ ((S[(C3>>8)&255]&255)<<8) ^ ((S[(C0>>16)&255]&255)<<16) ^ (S[(C1>>24)&255]<<24)) ^ KW[r][2];
386            r3 = mcol((S[C3&255]&255) ^ ((S[(C0>>8)&255]&255)<<8) ^ ((S[(C1>>16)&255]&255)<<16) ^ (S[(C2>>24)&255]<<24)) ^ KW[r++][3];
387            C0 = mcol((S[r0&255]&255) ^ ((S[(r1>>8)&255]&255)<<8) ^ ((S[(r2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24)) ^ KW[r][0];
388            C1 = mcol((S[r1&255]&255) ^ ((S[(r2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(r0>>24)&255]<<24)) ^ KW[r][1];
389            C2 = mcol((S[r2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(r0>>16)&255]&255)<<16) ^ (S[(r1>>24)&255]<<24)) ^ KW[r][2];
390            C3 = mcol((S[r3&255]&255) ^ ((S[(r0>>8)&255]&255)<<8) ^ ((S[(r1>>16)&255]&255)<<16) ^ (S[(r2>>24)&255]<<24)) ^ KW[r++][3];
391        }
392
393        r0 = mcol((S[C0&255]&255) ^ ((S[(C1>>8)&255]&255)<<8) ^ ((S[(C2>>16)&255]&255)<<16) ^ (S[(C3>>24)&255]<<24)) ^ KW[r][0];
394        r1 = mcol((S[C1&255]&255) ^ ((S[(C2>>8)&255]&255)<<8) ^ ((S[(C3>>16)&255]&255)<<16) ^ (S[(C0>>24)&255]<<24)) ^ KW[r][1];
395        r2 = mcol((S[C2&255]&255) ^ ((S[(C3>>8)&255]&255)<<8) ^ ((S[(C0>>16)&255]&255)<<16) ^ (S[(C1>>24)&255]<<24)) ^ KW[r][2];
396        r3 = mcol((S[C3&255]&255) ^ ((S[(C0>>8)&255]&255)<<8) ^ ((S[(C1>>16)&255]&255)<<16) ^ (S[(C2>>24)&255]<<24)) ^ KW[r++][3];
397
398        // the final round is a simple function of S
399
400        C0 = (S[r0&255]&255) ^ ((S[(r1>>8)&255]&255)<<8) ^ ((S[(r2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24) ^ KW[r][0];
401        C1 = (S[r1&255]&255) ^ ((S[(r2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(r0>>24)&255]<<24) ^ KW[r][1];
402        C2 = (S[r2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(r0>>16)&255]&255)<<16) ^ (S[(r1>>24)&255]<<24) ^ KW[r][2];
403        C3 = (S[r3&255]&255) ^ ((S[(r0>>8)&255]&255)<<8) ^ ((S[(r1>>16)&255]&255)<<16) ^ (S[(r2>>24)&255]<<24) ^ KW[r][3];
404
405    }
406
407    private final void decryptBlock(int[][] KW)
408    {
409        int r, r0, r1, r2, r3;
410
411        C0 ^= KW[ROUNDS][0];
412        C1 ^= KW[ROUNDS][1];
413        C2 ^= KW[ROUNDS][2];
414        C3 ^= KW[ROUNDS][3];
415
416        for (r = ROUNDS-1; r>1;)
417        {
418            r0 = inv_mcol((Si[C0&255]&255) ^ ((Si[(C3>>8)&255]&255)<<8) ^ ((Si[(C2>>16)&255]&255)<<16) ^ (Si[(C1>>24)&255]<<24)) ^ KW[r][0];
419            r1 = inv_mcol((Si[C1&255]&255) ^ ((Si[(C0>>8)&255]&255)<<8) ^ ((Si[(C3>>16)&255]&255)<<16) ^ (Si[(C2>>24)&255]<<24)) ^ KW[r][1];
420            r2 = inv_mcol((Si[C2&255]&255) ^ ((Si[(C1>>8)&255]&255)<<8) ^ ((Si[(C0>>16)&255]&255)<<16) ^ (Si[(C3>>24)&255]<<24)) ^ KW[r][2];
421            r3 = inv_mcol((Si[C3&255]&255) ^ ((Si[(C2>>8)&255]&255)<<8) ^ ((Si[(C1>>16)&255]&255)<<16) ^ (Si[(C0>>24)&255]<<24)) ^ KW[r--][3];
422            C0 = inv_mcol((Si[r0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(r2>>16)&255]&255)<<16) ^ (Si[(r1>>24)&255]<<24)) ^ KW[r][0];
423            C1 = inv_mcol((Si[r1&255]&255) ^ ((Si[(r0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(r2>>24)&255]<<24)) ^ KW[r][1];
424            C2 = inv_mcol((Si[r2&255]&255) ^ ((Si[(r1>>8)&255]&255)<<8) ^ ((Si[(r0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24)) ^ KW[r][2];
425            C3 = inv_mcol((Si[r3&255]&255) ^ ((Si[(r2>>8)&255]&255)<<8) ^ ((Si[(r1>>16)&255]&255)<<16) ^ (Si[(r0>>24)&255]<<24)) ^ KW[r--][3];
426        }
427
428        r0 = inv_mcol((Si[C0&255]&255) ^ ((Si[(C3>>8)&255]&255)<<8) ^ ((Si[(C2>>16)&255]&255)<<16) ^ (Si[(C1>>24)&255]<<24)) ^ KW[r][0];
429        r1 = inv_mcol((Si[C1&255]&255) ^ ((Si[(C0>>8)&255]&255)<<8) ^ ((Si[(C3>>16)&255]&255)<<16) ^ (Si[(C2>>24)&255]<<24)) ^ KW[r][1];
430        r2 = inv_mcol((Si[C2&255]&255) ^ ((Si[(C1>>8)&255]&255)<<8) ^ ((Si[(C0>>16)&255]&255)<<16) ^ (Si[(C3>>24)&255]<<24)) ^ KW[r][2];
431        r3 = inv_mcol((Si[C3&255]&255) ^ ((Si[(C2>>8)&255]&255)<<8) ^ ((Si[(C1>>16)&255]&255)<<16) ^ (Si[(C0>>24)&255]<<24)) ^ KW[r--][3];
432
433        // the final round's table is a simple function of Si
434
435        C0 = (Si[r0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(r2>>16)&255]&255)<<16) ^ (Si[(r1>>24)&255]<<24) ^ KW[0][0];
436        C1 = (Si[r1&255]&255) ^ ((Si[(r0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(r2>>24)&255]<<24) ^ KW[0][1];
437        C2 = (Si[r2&255]&255) ^ ((Si[(r1>>8)&255]&255)<<8) ^ ((Si[(r0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24) ^ KW[0][2];
438        C3 = (Si[r3&255]&255) ^ ((Si[(r2>>8)&255]&255)<<8) ^ ((Si[(r1>>16)&255]&255)<<16) ^ (Si[(r0>>24)&255]<<24) ^ KW[0][3];
439    }
440}
441