1package org.bouncycastle.crypto.engines;
3import org.bouncycastle.crypto.BlockCipher;
4import org.bouncycastle.crypto.CipherParameters;
5import org.bouncycastle.crypto.DataLengthException;
6import org.bouncycastle.crypto.params.KeyParameter;
/**
 * An implementation of the AES (Rijndael) block cipher, from FIPS-197.
 * <p>
 * For further details see: <a href="http://csrc.nist.gov/encryption/aes/">http://csrc.nist.gov/encryption/aes/</a>.
 * <p>
 * This implementation is based on optimizations from Dr. Brian Gladman's paper and C code at
 * <a href="http://fp.gladman.plus.com/cryptography_technology/rijndael/">http://fp.gladman.plus.com/cryptography_technology/rijndael/</a>.
 * <p>
 * There are three levels of tradeoff of speed vs memory.
 * Because Java has no preprocessor, they are written as three separate classes from which to choose.
 * <p>
 * The fastest uses 8Kbytes of static tables to precompute round calculations: four 256-word tables
 * for encryption and four for decryption.
 * <p>
 * The middle performance version uses only one 256-word table for each, for a total of 2Kbytes,
 * adding 12 rotate operations per round to compute the values contained in the other tables from
 * the contents of the first.
 * <p>
 * The slowest version uses no static tables at all and computes the values
 * in each round.
 * <p>
 * This file contains the slowest performance version with no static tables
 * for round precomputation, but it has the smallest footprint.
 */
33public class AESLightEngine
34    implements BlockCipher
36    // The S box
37    private static final byte[] S = {
38        (byte)99, (byte)124, (byte)119, (byte)123, (byte)242, (byte)107, (byte)111, (byte)197,
39        (byte)48,   (byte)1, (byte)103,  (byte)43, (byte)254, (byte)215, (byte)171, (byte)118,
40        (byte)202, (byte)130, (byte)201, (byte)125, (byte)250,  (byte)89,  (byte)71, (byte)240,
41        (byte)173, (byte)212, (byte)162, (byte)175, (byte)156, (byte)164, (byte)114, (byte)192,
42        (byte)183, (byte)253, (byte)147,  (byte)38,  (byte)54,  (byte)63, (byte)247, (byte)204,
43        (byte)52, (byte)165, (byte)229, (byte)241, (byte)113, (byte)216,  (byte)49,  (byte)21,
44        (byte)4, (byte)199,  (byte)35, (byte)195,  (byte)24, (byte)150,   (byte)5, (byte)154,
45        (byte)7,  (byte)18, (byte)128, (byte)226, (byte)235,  (byte)39, (byte)178, (byte)117,
46        (byte)9, (byte)131,  (byte)44,  (byte)26,  (byte)27, (byte)110,  (byte)90, (byte)160,
47        (byte)82,  (byte)59, (byte)214, (byte)179,  (byte)41, (byte)227,  (byte)47, (byte)132,
48        (byte)83, (byte)209,   (byte)0, (byte)237,  (byte)32, (byte)252, (byte)177,  (byte)91,
49        (byte)106, (byte)203, (byte)190,  (byte)57,  (byte)74,  (byte)76,  (byte)88, (byte)207,
50        (byte)208, (byte)239, (byte)170, (byte)251,  (byte)67,  (byte)77,  (byte)51, (byte)133,
51        (byte)69, (byte)249,   (byte)2, (byte)127,  (byte)80,  (byte)60, (byte)159, (byte)168,
52        (byte)81, (byte)163,  (byte)64, (byte)143, (byte)146, (byte)157,  (byte)56, (byte)245,
53        (byte)188, (byte)182, (byte)218,  (byte)33,  (byte)16, (byte)255, (byte)243, (byte)210,
54        (byte)205,  (byte)12,  (byte)19, (byte)236,  (byte)95, (byte)151,  (byte)68,  (byte)23,
55        (byte)196, (byte)167, (byte)126,  (byte)61, (byte)100,  (byte)93,  (byte)25, (byte)115,
56        (byte)96, (byte)129,  (byte)79, (byte)220,  (byte)34,  (byte)42, (byte)144, (byte)136,
57        (byte)70, (byte)238, (byte)184,  (byte)20, (byte)222,  (byte)94,  (byte)11, (byte)219,
58        (byte)224,  (byte)50,  (byte)58,  (byte)10,  (byte)73,   (byte)6,  (byte)36,  (byte)92,
59        (byte)194, (byte)211, (byte)172,  (byte)98, (byte)145, (byte)149, (byte)228, (byte)121,
60        (byte)231, (byte)200,  (byte)55, (byte)109, (byte)141, (byte)213,  (byte)78, (byte)169,
61        (byte)108,  (byte)86, (byte)244, (byte)234, (byte)101, (byte)122, (byte)174,   (byte)8,
62        (byte)186, (byte)120,  (byte)37,  (byte)46,  (byte)28, (byte)166, (byte)180, (byte)198,
63        (byte)232, (byte)221, (byte)116,  (byte)31,  (byte)75, (byte)189, (byte)139, (byte)138,
64        (byte)112,  (byte)62, (byte)181, (byte)102,  (byte)72,   (byte)3, (byte)246,  (byte)14,
65        (byte)97,  (byte)53,  (byte)87, (byte)185, (byte)134, (byte)193,  (byte)29, (byte)158,
66        (byte)225, (byte)248, (byte)152,  (byte)17, (byte)105, (byte)217, (byte)142, (byte)148,
67        (byte)155,  (byte)30, (byte)135, (byte)233, (byte)206,  (byte)85,  (byte)40, (byte)223,
68        (byte)140, (byte)161, (byte)137,  (byte)13, (byte)191, (byte)230,  (byte)66, (byte)104,
69        (byte)65, (byte)153,  (byte)45,  (byte)15, (byte)176,  (byte)84, (byte)187,  (byte)22,
70    };
72    // The inverse S-box
73    private static final byte[] Si = {
74        (byte)82,   (byte)9, (byte)106, (byte)213,  (byte)48,  (byte)54, (byte)165,  (byte)56,
75        (byte)191,  (byte)64, (byte)163, (byte)158, (byte)129, (byte)243, (byte)215, (byte)251,
76        (byte)124, (byte)227,  (byte)57, (byte)130, (byte)155,  (byte)47, (byte)255, (byte)135,
77        (byte)52, (byte)142,  (byte)67,  (byte)68, (byte)196, (byte)222, (byte)233, (byte)203,
78        (byte)84, (byte)123, (byte)148,  (byte)50, (byte)166, (byte)194,  (byte)35,  (byte)61,
79        (byte)238,  (byte)76, (byte)149,  (byte)11,  (byte)66, (byte)250, (byte)195,  (byte)78,
80        (byte)8,  (byte)46, (byte)161, (byte)102,  (byte)40, (byte)217,  (byte)36, (byte)178,
81        (byte)118,  (byte)91, (byte)162,  (byte)73, (byte)109, (byte)139, (byte)209,  (byte)37,
82        (byte)114, (byte)248, (byte)246, (byte)100, (byte)134, (byte)104, (byte)152,  (byte)22,
83        (byte)212, (byte)164,  (byte)92, (byte)204,  (byte)93, (byte)101, (byte)182, (byte)146,
84        (byte)108, (byte)112,  (byte)72,  (byte)80, (byte)253, (byte)237, (byte)185, (byte)218,
85        (byte)94,  (byte)21,  (byte)70,  (byte)87, (byte)167, (byte)141, (byte)157, (byte)132,
86        (byte)144, (byte)216, (byte)171,   (byte)0, (byte)140, (byte)188, (byte)211,  (byte)10,
87        (byte)247, (byte)228,  (byte)88,   (byte)5, (byte)184, (byte)179,  (byte)69,   (byte)6,
88        (byte)208,  (byte)44,  (byte)30, (byte)143, (byte)202,  (byte)63,  (byte)15,   (byte)2,
89        (byte)193, (byte)175, (byte)189,   (byte)3,   (byte)1,  (byte)19, (byte)138, (byte)107,
90        (byte)58, (byte)145,  (byte)17,  (byte)65,  (byte)79, (byte)103, (byte)220, (byte)234,
91        (byte)151, (byte)242, (byte)207, (byte)206, (byte)240, (byte)180, (byte)230, (byte)115,
92        (byte)150, (byte)172, (byte)116,  (byte)34, (byte)231, (byte)173,  (byte)53, (byte)133,
93        (byte)226, (byte)249,  (byte)55, (byte)232,  (byte)28, (byte)117, (byte)223, (byte)110,
94        (byte)71, (byte)241,  (byte)26, (byte)113,  (byte)29,  (byte)41, (byte)197, (byte)137,
95        (byte)111, (byte)183,  (byte)98,  (byte)14, (byte)170,  (byte)24, (byte)190,  (byte)27,
96        (byte)252,  (byte)86,  (byte)62,  (byte)75, (byte)198, (byte)210, (byte)121,  (byte)32,
97        (byte)154, (byte)219, (byte)192, (byte)254, (byte)120, (byte)205,  (byte)90, (byte)244,
98        (byte)31, (byte)221, (byte)168,  (byte)51, (byte)136,   (byte)7, (byte)199,  (byte)49,
99        (byte)177,  (byte)18,  (byte)16,  (byte)89,  (byte)39, (byte)128, (byte)236,  (byte)95,
100        (byte)96,  (byte)81, (byte)127, (byte)169,  (byte)25, (byte)181,  (byte)74,  (byte)13,
101        (byte)45, (byte)229, (byte)122, (byte)159, (byte)147, (byte)201, (byte)156, (byte)239,
102        (byte)160, (byte)224,  (byte)59,  (byte)77, (byte)174,  (byte)42, (byte)245, (byte)176,
103        (byte)200, (byte)235, (byte)187,  (byte)60, (byte)131,  (byte)83, (byte)153,  (byte)97,
104        (byte)23,  (byte)43,   (byte)4, (byte)126, (byte)186, (byte)119, (byte)214,  (byte)38,
105        (byte)225, (byte)105,  (byte)20,  (byte)99,  (byte)85,  (byte)33,  (byte)12, (byte)125,
106        };
108    // vector used in calculating key schedule (powers of x in GF(256))
109    private static final int[] rcon = {
110         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a,
111         0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91 };
113    private int shift(
114        int     r,
115        int     shift)
116    {
117        return (r >>> shift) | (r << -shift);
118    }
120    /* multiply four bytes in GF(2^8) by 'x' {02} in parallel */
122    private static final int m1 = 0x80808080;
123    private static final int m2 = 0x7f7f7f7f;
124    private static final int m3 = 0x0000001b;
126    private int FFmulX(int x)
127    {
128        return (((x & m2) << 1) ^ (((x & m1) >>> 7) * m3));
129    }
131    /*
132       The following defines provide alternative definitions of FFmulX that might
133       give improved performance if a fast 32-bit multiply is not available.
135       private int FFmulX(int x) { int u = x & m1; u |= (u >> 1); return ((x & m2) << 1) ^ ((u >>> 3) | (u >>> 6)); }
136       private static final int  m4 = 0x1b1b1b1b;
137       private int FFmulX(int x) { int u = x & m1; return ((x & m2) << 1) ^ ((u - (u >>> 7)) & m4); }
139    */
141    private int mcol(int x)
142    {
143        int f2 = FFmulX(x);
144        return f2 ^ shift(x ^ f2, 8) ^ shift(x, 16) ^ shift(x, 24);
145    }
147    private int inv_mcol(int x)
148    {
149        int f2 = FFmulX(x);
150        int f4 = FFmulX(f2);
151        int f8 = FFmulX(f4);
152        int f9 = x ^ f8;
154        return f2 ^ f4 ^ f8 ^ shift(f2 ^ f9, 8) ^ shift(f4 ^ f9, 16) ^ shift(f9, 24);
155    }
158    private int subWord(int x)
159    {
160        return (S[x&255]&255 | ((S[(x>>8)&255]&255)<<8) | ((S[(x>>16)&255]&255)<<16) | S[(x>>24)&255]<<24);
161    }
163    /**
164     * Calculate the necessary round keys
165     * The number of calculations depends on key size and block size
166     * AES specified a fixed block size of 128 bits and key sizes 128/192/256 bits
167     * This code is written assuming those are the only possible values
168     */
169    private int[][] generateWorkingKey(
170                                    byte[] key,
171                                    boolean forEncryption)
172    {
173        int         KC = key.length / 4;  // key length in words
174        int         t;
176        if (((KC != 4) && (KC != 6) && (KC != 8)) || ((KC * 4) != key.length))
177        {
178            throw new IllegalArgumentException("Key length not 128/192/256 bits.");
179        }
181        ROUNDS = KC + 6;  // This is not always true for the generalized Rijndael that allows larger block sizes
182        int[][] W = new int[ROUNDS+1][4];   // 4 words in a block
184        //
185        // copy the key into the round key array
186        //
188        t = 0;
189        int i = 0;
190        while (i < key.length)
191            {
192                W[t >> 2][t & 3] = (key[i]&0xff) | ((key[i+1]&0xff) << 8) | ((key[i+2]&0xff) << 16) | (key[i+3] << 24);
193                i+=4;
194                t++;
195            }
197        //
198        // while not enough round key material calculated
199        // calculate new values
200        //
201        int k = (ROUNDS + 1) << 2;
202        for (i = KC; (i < k); i++)
203            {
204                int temp = W[(i-1)>>2][(i-1)&3];
205                if ((i % KC) == 0)
206                {
207                    temp = subWord(shift(temp, 8)) ^ rcon[(i / KC)-1];
208                }
209                else if ((KC > 6) && ((i % KC) == 4))
210                {
211                    temp = subWord(temp);
212                }
214                W[i>>2][i&3] = W[(i - KC)>>2][(i-KC)&3] ^ temp;
215            }
217        if (!forEncryption)
218        {
219            for (int j = 1; j < ROUNDS; j++)
220            {
221                for (i = 0; i < 4; i++)
222                {
223                    W[j][i] = inv_mcol(W[j][i]);
224                }
225            }
226        }
228        return W;
229    }
231    private int         ROUNDS;
232    private int[][]     WorkingKey = null;
233    private int         C0, C1, C2, C3;
234    private boolean     forEncryption;
236    private static final int BLOCK_SIZE = 16;
/**
 * Default constructor - 128 bit block size.
 * <p>
 * The engine holds no key at this point: processBlock throws
 * IllegalStateException until init has been called.
 */
public AESLightEngine()
{
    super();
}
245    /**
246     * initialise an AES cipher.
247     *
248     * @param forEncryption whether or not we are for encryption.
249     * @param params the parameters required to set up the cipher.
250     * @exception IllegalArgumentException if the params argument is
251     * inappropriate.
252     */
253    public void init(
254        boolean           forEncryption,
255        CipherParameters  params)
256    {
257        if (params instanceof KeyParameter)
258        {
259            WorkingKey = generateWorkingKey(((KeyParameter)params).getKey(), forEncryption);
260            this.forEncryption = forEncryption;
261            return;
262        }
264        throw new IllegalArgumentException("invalid parameter passed to AES init - " + params.getClass().getName());
265    }
267    public String getAlgorithmName()
268    {
269        return "AES";
270    }
272    public int getBlockSize()
273    {
274        return BLOCK_SIZE;
275    }
277    public int processBlock(
278        byte[] in,
279        int inOff,
280        byte[] out,
281        int outOff)
282    {
283        if (WorkingKey == null)
284        {
285            throw new IllegalStateException("AES engine not initialised");
286        }
288        if ((inOff + (32 / 2)) > in.length)
289        {
290            throw new DataLengthException("input buffer too short");
291        }
293        if ((outOff + (32 / 2)) > out.length)
294        {
295            throw new DataLengthException("output buffer too short");
296        }
298        if (forEncryption)
299        {
300            unpackBlock(in, inOff);
301            encryptBlock(WorkingKey);
302            packBlock(out, outOff);
303        }
304        else
305        {
306            unpackBlock(in, inOff);
307            decryptBlock(WorkingKey);
308            packBlock(out, outOff);
309        }
311        return BLOCK_SIZE;
312    }
314    public void reset()
315    {
316    }
318    private final void unpackBlock(
319        byte[]      bytes,
320        int         off)
321    {
322        int     index = off;
324        C0 = (bytes[index++] & 0xff);
325        C0 |= (bytes[index++] & 0xff) << 8;
326        C0 |= (bytes[index++] & 0xff) << 16;
327        C0 |= bytes[index++] << 24;
329        C1 = (bytes[index++] & 0xff);
330        C1 |= (bytes[index++] & 0xff) << 8;
331        C1 |= (bytes[index++] & 0xff) << 16;
332        C1 |= bytes[index++] << 24;
334        C2 = (bytes[index++] & 0xff);
335        C2 |= (bytes[index++] & 0xff) << 8;
336        C2 |= (bytes[index++] & 0xff) << 16;
337        C2 |= bytes[index++] << 24;
339        C3 = (bytes[index++] & 0xff);
340        C3 |= (bytes[index++] & 0xff) << 8;
341        C3 |= (bytes[index++] & 0xff) << 16;
342        C3 |= bytes[index++] << 24;
343    }
345    private final void packBlock(
346        byte[]      bytes,
347        int         off)
348    {
349        int     index = off;
351        bytes[index++] = (byte)C0;
352        bytes[index++] = (byte)(C0 >> 8);
353        bytes[index++] = (byte)(C0 >> 16);
354        bytes[index++] = (byte)(C0 >> 24);
356        bytes[index++] = (byte)C1;
357        bytes[index++] = (byte)(C1 >> 8);
358        bytes[index++] = (byte)(C1 >> 16);
359        bytes[index++] = (byte)(C1 >> 24);
361        bytes[index++] = (byte)C2;
362        bytes[index++] = (byte)(C2 >> 8);
363        bytes[index++] = (byte)(C2 >> 16);
364        bytes[index++] = (byte)(C2 >> 24);
366        bytes[index++] = (byte)C3;
367        bytes[index++] = (byte)(C3 >> 8);
368        bytes[index++] = (byte)(C3 >> 16);
369        bytes[index++] = (byte)(C3 >> 24);
370    }
372    private void encryptBlock(int[][] KW)
373    {
374        int r, r0, r1, r2, r3;
376        C0 ^= KW[0][0];
377        C1 ^= KW[0][1];
378        C2 ^= KW[0][2];
379        C3 ^= KW[0][3];
381        for (r = 1; r < ROUNDS - 1;)
382        {
383            r0 = mcol((S[C0&255]&255) ^ ((S[(C1>>8)&255]&255)<<8) ^ ((S[(C2>>16)&255]&255)<<16) ^ (S[(C3>>24)&255]<<24)) ^ KW[r][0];
384            r1 = mcol((S[C1&255]&255) ^ ((S[(C2>>8)&255]&255)<<8) ^ ((S[(C3>>16)&255]&255)<<16) ^ (S[(C0>>24)&255]<<24)) ^ KW[r][1];
385            r2 = mcol((S[C2&255]&255) ^ ((S[(C3>>8)&255]&255)<<8) ^ ((S[(C0>>16)&255]&255)<<16) ^ (S[(C1>>24)&255]<<24)) ^ KW[r][2];
386            r3 = mcol((S[C3&255]&255) ^ ((S[(C0>>8)&255]&255)<<8) ^ ((S[(C1>>16)&255]&255)<<16) ^ (S[(C2>>24)&255]<<24)) ^ KW[r++][3];
387            C0 = mcol((S[r0&255]&255) ^ ((S[(r1>>8)&255]&255)<<8) ^ ((S[(r2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24)) ^ KW[r][0];
388            C1 = mcol((S[r1&255]&255) ^ ((S[(r2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(r0>>24)&255]<<24)) ^ KW[r][1];
389            C2 = mcol((S[r2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(r0>>16)&255]&255)<<16) ^ (S[(r1>>24)&255]<<24)) ^ KW[r][2];
390            C3 = mcol((S[r3&255]&255) ^ ((S[(r0>>8)&255]&255)<<8) ^ ((S[(r1>>16)&255]&255)<<16) ^ (S[(r2>>24)&255]<<24)) ^ KW[r++][3];
391        }
393        r0 = mcol((S[C0&255]&255) ^ ((S[(C1>>8)&255]&255)<<8) ^ ((S[(C2>>16)&255]&255)<<16) ^ (S[(C3>>24)&255]<<24)) ^ KW[r][0];
394        r1 = mcol((S[C1&255]&255) ^ ((S[(C2>>8)&255]&255)<<8) ^ ((S[(C3>>16)&255]&255)<<16) ^ (S[(C0>>24)&255]<<24)) ^ KW[r][1];
395        r2 = mcol((S[C2&255]&255) ^ ((S[(C3>>8)&255]&255)<<8) ^ ((S[(C0>>16)&255]&255)<<16) ^ (S[(C1>>24)&255]<<24)) ^ KW[r][2];
396        r3 = mcol((S[C3&255]&255) ^ ((S[(C0>>8)&255]&255)<<8) ^ ((S[(C1>>16)&255]&255)<<16) ^ (S[(C2>>24)&255]<<24)) ^ KW[r++][3];
398        // the final round is a simple function of S
400        C0 = (S[r0&255]&255) ^ ((S[(r1>>8)&255]&255)<<8) ^ ((S[(r2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24) ^ KW[r][0];
401        C1 = (S[r1&255]&255) ^ ((S[(r2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(r0>>24)&255]<<24) ^ KW[r][1];
402        C2 = (S[r2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(r0>>16)&255]&255)<<16) ^ (S[(r1>>24)&255]<<24) ^ KW[r][2];
403        C3 = (S[r3&255]&255) ^ ((S[(r0>>8)&255]&255)<<8) ^ ((S[(r1>>16)&255]&255)<<16) ^ (S[(r2>>24)&255]<<24) ^ KW[r][3];
405    }
407    private final void decryptBlock(int[][] KW)
408    {
409        int r, r0, r1, r2, r3;
411        C0 ^= KW[ROUNDS][0];
412        C1 ^= KW[ROUNDS][1];
413        C2 ^= KW[ROUNDS][2];
414        C3 ^= KW[ROUNDS][3];
416        for (r = ROUNDS-1; r>1;)
417        {
418            r0 = inv_mcol((Si[C0&255]&255) ^ ((Si[(C3>>8)&255]&255)<<8) ^ ((Si[(C2>>16)&255]&255)<<16) ^ (Si[(C1>>24)&255]<<24)) ^ KW[r][0];
419            r1 = inv_mcol((Si[C1&255]&255) ^ ((Si[(C0>>8)&255]&255)<<8) ^ ((Si[(C3>>16)&255]&255)<<16) ^ (Si[(C2>>24)&255]<<24)) ^ KW[r][1];
420            r2 = inv_mcol((Si[C2&255]&255) ^ ((Si[(C1>>8)&255]&255)<<8) ^ ((Si[(C0>>16)&255]&255)<<16) ^ (Si[(C3>>24)&255]<<24)) ^ KW[r][2];
421            r3 = inv_mcol((Si[C3&255]&255) ^ ((Si[(C2>>8)&255]&255)<<8) ^ ((Si[(C1>>16)&255]&255)<<16) ^ (Si[(C0>>24)&255]<<24)) ^ KW[r--][3];
422            C0 = inv_mcol((Si[r0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(r2>>16)&255]&255)<<16) ^ (Si[(r1>>24)&255]<<24)) ^ KW[r][0];
423            C1 = inv_mcol((Si[r1&255]&255) ^ ((Si[(r0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(r2>>24)&255]<<24)) ^ KW[r][1];
424            C2 = inv_mcol((Si[r2&255]&255) ^ ((Si[(r1>>8)&255]&255)<<8) ^ ((Si[(r0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24)) ^ KW[r][2];
425            C3 = inv_mcol((Si[r3&255]&255) ^ ((Si[(r2>>8)&255]&255)<<8) ^ ((Si[(r1>>16)&255]&255)<<16) ^ (Si[(r0>>24)&255]<<24)) ^ KW[r--][3];
426        }
428        r0 = inv_mcol((Si[C0&255]&255) ^ ((Si[(C3>>8)&255]&255)<<8) ^ ((Si[(C2>>16)&255]&255)<<16) ^ (Si[(C1>>24)&255]<<24)) ^ KW[r][0];
429        r1 = inv_mcol((Si[C1&255]&255) ^ ((Si[(C0>>8)&255]&255)<<8) ^ ((Si[(C3>>16)&255]&255)<<16) ^ (Si[(C2>>24)&255]<<24)) ^ KW[r][1];
430        r2 = inv_mcol((Si[C2&255]&255) ^ ((Si[(C1>>8)&255]&255)<<8) ^ ((Si[(C0>>16)&255]&255)<<16) ^ (Si[(C3>>24)&255]<<24)) ^ KW[r][2];
431        r3 = inv_mcol((Si[C3&255]&255) ^ ((Si[(C2>>8)&255]&255)<<8) ^ ((Si[(C1>>16)&255]&255)<<16) ^ (Si[(C0>>24)&255]<<24)) ^ KW[r--][3];
433        // the final round's table is a simple function of Si
435        C0 = (Si[r0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(r2>>16)&255]&255)<<16) ^ (Si[(r1>>24)&255]<<24) ^ KW[0][0];
436        C1 = (Si[r1&255]&255) ^ ((Si[(r0>>8)&255]&255)<<8) ^ ((Si[(r3>>16)&255]&255)<<16) ^ (Si[(r2>>24)&255]<<24) ^ KW[0][1];
437        C2 = (Si[r2&255]&255) ^ ((Si[(r1>>8)&255]&255)<<8) ^ ((Si[(r0>>16)&255]&255)<<16) ^ (Si[(r3>>24)&255]<<24) ^ KW[0][2];
438        C3 = (Si[r3&255]&255) ^ ((Si[(r2>>8)&255]&255)<<8) ^ ((Si[(r1>>16)&255]&255)<<16) ^ (Si[(r0>>24)&255]<<24) ^ KW[0][3];
439    }