aes-ppc.pl revision 3d3a1b8fcf46ca3bdb3d8f09acd6ef604624a30d
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Needs more work: key setup, page boundaries, CBC routine...
11#
12# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
13# 128-bit key, which is ~40% better than 64-bit code generated by gcc
14# 4.0. But these are not the ones currently used! Their "compact"
15# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
16# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
17# at 1/3 of ppc_AES_decrypt.
18
19# February 2010
20#
21# Rescheduling instructions to favour Power6 pipeline gives 10%
22# performance improvement on the platform in question (and marginal
23# improvement even on others). It should be noted that Power6 fails
24# to process byte in 18 cycles, only in 23, because it fails to issue
25# 4 load instructions in two cycles, only in 3. As a result, non-compact
26# block subroutines are 25% slower than one would expect. Compact
27# functions scale better, because they have pure computational part,
28# which scales perfectly with clock frequency. To be specific
29# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
30# ppc_AES_decrypt_compact - at 55 (in 64-bit build).
31
# Command line: <flavour> [<argument passed on to ppc-xlate.pl>].
# The flavour string selects the pointer size and the matching
# store-with-update / load / store mnemonics that are interpolated into
# the assembly templates below.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Look for ppc-xlate.pl next to this script, then in ../../perlasm.
# The capture is taken only when the match succeeded; a script name
# without a directory part yields an empty prefix, i.e. the lookup is
# relative to the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Send everything printed to STDOUT through the translator.
# "or" (not "||") so that the check applies to open() itself: "||" binds
# tighter than the list operator and therefore only tested the command
# string, which is always true, so a failed open() went unreported.
open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";

# Size of the stack frame allocated by the entry points: 32 slots of
# $SIZE_T bytes each.
$FRAME=32*$SIZE_T;
54
# Append every argument to $code as a ".long" directive in which the
# 32-bit value appears twice, so each table entry occupies 8 bytes.
# Processing stops at the first undefined argument.
#
# Deliberately declared without a prototype: the sub consumes a list, and
# the former "()" prototype was only harmless because every caller uses
# the &-form, which bypasses prototype checking.
sub _data_word
{
    foreach my $word (@_) {
	last unless defined($word);
	$code.=sprintf"\t.long\t0x%08x,0x%08x\n",$word,$word;
    }
}
59
# Symbolic names for the general-purpose registers used by the assembly
# templates.

# r1-r5: $sp, $toc and the three pointers handed to the entry points.
($sp,$toc,$inp,$out,$key)=map("r$_",1..5);

# Table base pointers.  Note that $Tbl0 shares r3 with $inp.
($Tbl0,$Tbl1,$Tbl2,$Tbl3)=("r3","r6","r7","r2");

# State words and round temporaries.
($s0,$s1,$s2,$s3)=map("r$_",8..11);
($t0,$t1,$t2,$t3)=map("r$_",12..15);

# Sixteen scratch accumulators, r16 through r31.
($acc00,$acc01,$acc02,$acc03,
 $acc04,$acc05,$acc06,$acc07,
 $acc08,$acc09,$acc10,$acc11,
 $acc12,$acc13,$acc14,$acc15)=map("r$_",16..31);

# Stay away from the TLS pointer: the 64-bit build trades r13 for r0,
# any other build trades r2 for the register $t0 held and moves $t0 to
# r0.  The "die" checks guard against the assignments above drifting.
if ($SIZE_T!=8) {
	die if ($Tbl3 ne "r2");
	($Tbl3,$t0)=($t0,"r0");
} else {
	die if ($t1 ne "r13");
	$t1="r0";
}

# The compact routines reuse two table registers for their mask constants.
($mask80,$mask1b)=($Tbl2,$Tbl3);
106
# Table locator stubs.  Each one parks the caller's link register in r0,
# executes a branch-and-link to the next instruction to learn its own
# address, adds the distance from that point to the table data, and
# returns the result in $Tbl0 with the link register restored.
# LAES_Te yields the start of the data emitted right after this template;
# LAES_Td adds a further 2048+256 bytes to reach the second table set.
# The .space directives pad the stubs to 32 and 128 bytes respectively,
# which is what the hard-coded distances assume.
107$code.=<<___;
108.machine	"any"
109.text
110
111.align	7
112LAES_Te:
113	mflr	r0
114	bcl	20,31,\$+4
115	mflr	$Tbl0	;    vvvvv "distance" between . and 1st data entry
116	addi	$Tbl0,$Tbl0,`128-8`
117	mtlr	r0
118	blr
119	.space	`32-24`
120LAES_Td:
121	mflr	r0
122	bcl	20,31,\$+4
123	mflr	$Tbl0	;    vvvvvvvv "distance" between . and 1st data entry
124	addi	$Tbl0,$Tbl0,`128-8-32+2048+256`
125	mtlr	r0
126	blr
127	.space	`128-32-24`
128___
# Te: 256 word entries used by the encryption code.  _data_word emits
# every entry twice, so the table occupies 2048 bytes and a word load at
# byte offset 1, 2 or 3 inside an entry returns a byte-rotated copy of it
# (the table-driven routine relies on this through its Tbl0+3/+2/+1
# pointers).
129&_data_word(
130	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
131	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
132	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
133	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
134	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
135	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
136	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
137	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
138	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
139	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
140	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
141	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
142	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
143	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
144	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
145	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
146	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
147	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
148	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
149	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
150	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
151	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
152	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
153	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
154	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
155	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
156	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
157	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
158	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
159	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
160	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
161	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
162	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
163	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
164	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
165	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
166	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
167	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
168	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
169	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
170	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
171	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
172	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
173	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
174	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
175	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
176	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
177	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
178	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
179	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
180	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
181	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
182	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
183	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
184	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
185	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
186	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
187	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
188	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
189	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
190	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
191	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
192	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
193	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Te4: the 256-byte substitution table (the AES S-box) placed directly
# after the 2048 bytes of Te words.  The encryption routines address it
# as table base + 2048 and index it with single state bytes.
194$code.=<<___;
195.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
196.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
197.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
198.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
199.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
200.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
201.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
202.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
203.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
204.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
205.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
206.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
207.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
208.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
209.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
210.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
211.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
212.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
213.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
214.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
215.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
216.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
217.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
218.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
219.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
220.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
221.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
222.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
223.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
224.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
225.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
226.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
227___
# Td: 256 word entries used by the decryption code, laid out like Te
# (each entry emitted twice by _data_word, 2048 bytes in total).  This is
# the address LAES_Td returns.
228&_data_word(
229	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
230	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
231	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
232	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
233	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
234	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
235	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
236	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
237	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
238	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
239	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
240	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
241	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
242	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
243	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
244	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
245	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
246	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
247	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
248	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
249	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
250	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
251	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
252	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
253	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
254	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
255	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
256	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
257	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
258	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
259	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
260	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
261	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
262	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
263	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
264	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
265	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
266	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
267	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
268	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
269	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
270	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
271	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
272	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
273	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
274	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
275	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
276	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
277	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
278	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
279	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
280	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
281	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
282	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
283	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
284	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
285	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
286	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
287	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
288	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
289	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
290	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
291	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
292	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Td4: the 256-byte inverse substitution table (the AES inverse S-box)
# placed directly after the 2048 bytes of Td words; the decryption
# routines address it as table base + 2048.  The same template then
# continues with the code of the cipher routines.
293$code.=<<___;
294.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
295.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
296.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
297.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
298.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
299.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
300.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
301.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
302.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
303.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
304.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
305.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
306.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
307.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
308.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
309.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
310.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
311.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
312.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
313.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
314.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
315.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
316.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
317.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
318.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
319.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
320.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
321.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
322.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
323.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
324.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
325.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
326
327
328.globl	.AES_encrypt
329.align	7
330.AES_encrypt:
	# Public encryption entry point: inp in r3, out in r4, key in r5.
	# Allocates a stack frame and saves the return address (via r0),
	# r2 and r13-r31 in it before any of them is used as scratch.
331	mflr	r0
332	$STU	$sp,-$FRAME($sp)
333
334	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
335	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
336	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
337	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
338	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
339	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
340	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
341	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
342	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
343	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
344	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
345	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
346	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
347	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
348	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
349	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
350	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
351	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
352	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
353	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
354	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
355
	# Load the 16-byte input block as four words.  This has to happen
	# before LAES_Te is called: the table address it returns lands in
	# r3 and so overwrites inp.
356	lwz	$s0,0($inp)
357	lwz	$s1,4($inp)
358	lwz	$s2,8($inp)
359	lwz	$s3,12($inp)
	# Fetch the table address, run the compact cipher body on s0-s3,
	# then store the four result words to out.
360	bl	LAES_Te
361	bl	Lppc_AES_encrypt_compact
362	stw	$s0,0($out)
363	stw	$s1,4($out)
364	stw	$s2,8($out)
365	stw	$s3,12($out)
366
	# Restore everything saved above, release the frame and return.
367	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
368	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
369	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
370	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
371	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
372	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
373	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
374	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
375	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
376	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
377	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
378	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
379	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
380	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
381	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
382	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
383	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
384	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
385	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
386	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
387	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
388	mtlr	r0
389	addi	$sp,$sp,$FRAME
390	blr
391
392.align	5
393Lppc_AES_encrypt:
	# Table-driven encryption body.  AES_encrypt above does not call
	# it; it calls the compact variant instead.
	# In:  s0-s3 = state words, key = round keys with the round count
	#      at offset 240, Tbl0 = address returned by LAES_Te.
	# Out: s0-s3.  Uses t0-t3, acc00-acc15, Tbl1-Tbl3, key and ctr.
	# Tbl1/Tbl2/Tbl3 point 3/2/1 bytes into the first entry, so a word
	# load through them returns a byte-rotated table entry.
394	lwz	$acc00,240($key)
395	lwz	$t0,0($key)
396	lwz	$t1,4($key)
397	lwz	$t2,8($key)
398	lwz	$t3,12($key)
399	addi	$Tbl1,$Tbl0,3
400	addi	$Tbl2,$Tbl0,2
401	addi	$Tbl3,$Tbl0,1
402	addi	$acc00,$acc00,-1
403	addi	$key,$key,16
	# xor in the first round key, then loop rounds-1 times
404	xor	$s0,$s0,$t0
405	xor	$s1,$s1,$t1
406	xor	$s2,$s2,$t2
407	xor	$s3,$s3,$t3
408	mtctr	$acc00
409.align	4
410Lenc_loop:
	# Each rlwinm extracts one state byte already multiplied by 8
	# (rotate includes the extra 3, mask keeps bits 21-28) so that it
	# indexes the 8-byte table entries directly.  Four lookups per
	# output word are xor-ed together with the round key in t0-t3.
411	rlwinm	$acc00,$s0,`32-24+3`,21,28
412	rlwinm	$acc01,$s1,`32-24+3`,21,28
413	rlwinm	$acc02,$s2,`32-24+3`,21,28
414	rlwinm	$acc03,$s3,`32-24+3`,21,28
415	lwz	$t0,0($key)
416	lwz	$t1,4($key)
417	rlwinm	$acc04,$s1,`32-16+3`,21,28
418	rlwinm	$acc05,$s2,`32-16+3`,21,28
419	lwz	$t2,8($key)
420	lwz	$t3,12($key)
421	rlwinm	$acc06,$s3,`32-16+3`,21,28
422	rlwinm	$acc07,$s0,`32-16+3`,21,28
423	lwzx	$acc00,$Tbl0,$acc00
424	lwzx	$acc01,$Tbl0,$acc01
425	rlwinm	$acc08,$s2,`32-8+3`,21,28
426	rlwinm	$acc09,$s3,`32-8+3`,21,28
427	lwzx	$acc02,$Tbl0,$acc02
428	lwzx	$acc03,$Tbl0,$acc03
429	rlwinm	$acc10,$s0,`32-8+3`,21,28
430	rlwinm	$acc11,$s1,`32-8+3`,21,28
431	lwzx	$acc04,$Tbl1,$acc04
432	lwzx	$acc05,$Tbl1,$acc05
433	rlwinm	$acc12,$s3,`0+3`,21,28
434	rlwinm	$acc13,$s0,`0+3`,21,28
435	lwzx	$acc06,$Tbl1,$acc06
436	lwzx	$acc07,$Tbl1,$acc07
437	rlwinm	$acc14,$s1,`0+3`,21,28
438	rlwinm	$acc15,$s2,`0+3`,21,28
439	lwzx	$acc08,$Tbl2,$acc08
440	lwzx	$acc09,$Tbl2,$acc09
441	xor	$t0,$t0,$acc00
442	xor	$t1,$t1,$acc01
443	lwzx	$acc10,$Tbl2,$acc10
444	lwzx	$acc11,$Tbl2,$acc11
445	xor	$t2,$t2,$acc02
446	xor	$t3,$t3,$acc03
447	lwzx	$acc12,$Tbl3,$acc12
448	lwzx	$acc13,$Tbl3,$acc13
449	xor	$t0,$t0,$acc04
450	xor	$t1,$t1,$acc05
451	lwzx	$acc14,$Tbl3,$acc14
452	lwzx	$acc15,$Tbl3,$acc15
453	xor	$t2,$t2,$acc06
454	xor	$t3,$t3,$acc07
455	xor	$t0,$t0,$acc08
456	xor	$t1,$t1,$acc09
457	xor	$t2,$t2,$acc10
458	xor	$t3,$t3,$acc11
459	xor	$s0,$t0,$acc12
460	xor	$s1,$t1,$acc13
461	xor	$s2,$t2,$acc14
462	xor	$s3,$t3,$acc15
463	addi	$key,$key,16
464	bdnz-	Lenc_loop
465
	# Last round: Tbl2 now points at the 256-byte table behind the
	# word table.  Plain byte indices are looked up with lbzx and the
	# results are packed back into s0-s3, then the last round key is
	# xor-ed in.  The eight lwz loads at 32-byte strides only touch
	# that table; their results are overwritten before being used.
466	addi	$Tbl2,$Tbl0,2048
467	nop
468	lwz	$t0,0($key)
469	lwz	$t1,4($key)
470	rlwinm	$acc00,$s0,`32-24`,24,31
471	rlwinm	$acc01,$s1,`32-24`,24,31
472	lwz	$t2,8($key)
473	lwz	$t3,12($key)
474	rlwinm	$acc02,$s2,`32-24`,24,31
475	rlwinm	$acc03,$s3,`32-24`,24,31
476	lwz	$acc08,`2048+0`($Tbl0)	! prefetch Te4
477	lwz	$acc09,`2048+32`($Tbl0)
478	rlwinm	$acc04,$s1,`32-16`,24,31
479	rlwinm	$acc05,$s2,`32-16`,24,31
480	lwz	$acc10,`2048+64`($Tbl0)
481	lwz	$acc11,`2048+96`($Tbl0)
482	rlwinm	$acc06,$s3,`32-16`,24,31
483	rlwinm	$acc07,$s0,`32-16`,24,31
484	lwz	$acc12,`2048+128`($Tbl0)
485	lwz	$acc13,`2048+160`($Tbl0)
486	rlwinm	$acc08,$s2,`32-8`,24,31
487	rlwinm	$acc09,$s3,`32-8`,24,31
488	lwz	$acc14,`2048+192`($Tbl0)
489	lwz	$acc15,`2048+224`($Tbl0)
490	rlwinm	$acc10,$s0,`32-8`,24,31
491	rlwinm	$acc11,$s1,`32-8`,24,31
492	lbzx	$acc00,$Tbl2,$acc00
493	lbzx	$acc01,$Tbl2,$acc01
494	rlwinm	$acc12,$s3,`0`,24,31
495	rlwinm	$acc13,$s0,`0`,24,31
496	lbzx	$acc02,$Tbl2,$acc02
497	lbzx	$acc03,$Tbl2,$acc03
498	rlwinm	$acc14,$s1,`0`,24,31
499	rlwinm	$acc15,$s2,`0`,24,31
500	lbzx	$acc04,$Tbl2,$acc04
501	lbzx	$acc05,$Tbl2,$acc05
502	rlwinm	$s0,$acc00,24,0,7
503	rlwinm	$s1,$acc01,24,0,7
504	lbzx	$acc06,$Tbl2,$acc06
505	lbzx	$acc07,$Tbl2,$acc07
506	rlwinm	$s2,$acc02,24,0,7
507	rlwinm	$s3,$acc03,24,0,7
508	lbzx	$acc08,$Tbl2,$acc08
509	lbzx	$acc09,$Tbl2,$acc09
510	rlwimi	$s0,$acc04,16,8,15
511	rlwimi	$s1,$acc05,16,8,15
512	lbzx	$acc10,$Tbl2,$acc10
513	lbzx	$acc11,$Tbl2,$acc11
514	rlwimi	$s2,$acc06,16,8,15
515	rlwimi	$s3,$acc07,16,8,15
516	lbzx	$acc12,$Tbl2,$acc12
517	lbzx	$acc13,$Tbl2,$acc13
518	rlwimi	$s0,$acc08,8,16,23
519	rlwimi	$s1,$acc09,8,16,23
520	lbzx	$acc14,$Tbl2,$acc14
521	lbzx	$acc15,$Tbl2,$acc15
522	rlwimi	$s2,$acc10,8,16,23
523	rlwimi	$s3,$acc11,8,16,23
524	or	$s0,$s0,$acc12
525	or	$s1,$s1,$acc13
526	or	$s2,$s2,$acc14
527	or	$s3,$s3,$acc15
528	xor	$s0,$s0,$t0
529	xor	$s1,$s1,$t1
530	xor	$s2,$s2,$t2
531	xor	$s3,$s3,$t3
532	blr
533
534.align	4
535Lppc_AES_encrypt_compact:
	# Compact encryption body, the one AES_encrypt calls.  It uses
	# only the 256-byte table at Tbl0+2048 (kept in Tbl1) and does the
	# remaining per-round work with arithmetic on packed bytes.
	# In:  s0-s3 = state words, key = round keys with the round count
	#      at offset 240, Tbl0 = address returned by LAES_Te.
	# Out: s0-s3.  Uses t0-t3, acc00-acc15, Tbl1, mask80, mask1b,
	#      key and ctr.
	# mask80 and mask1b are built as 0x80808080 and 0x1b1b1b1b.
536	lwz	$acc00,240($key)
537	lwz	$t0,0($key)
538	lwz	$t1,4($key)
539	lwz	$t2,8($key)
540	lwz	$t3,12($key)
541	addi	$Tbl1,$Tbl0,2048
542	lis	$mask80,0x8080
543	lis	$mask1b,0x1b1b
544	addi	$key,$key,16
545	ori	$mask80,$mask80,0x8080
546	ori	$mask1b,$mask1b,0x1b1b
	# ctr counts all rounds; the loop is left through bdz below
547	mtctr	$acc00
548.align	4
549Lenc_compact_loop:
	# Add the round key, then replace every state byte through the
	# byte table.  The source word of each byte differs per position,
	# so the repacked s0-s3 are also permuted across words.
550	xor	$s0,$s0,$t0
551	xor	$s1,$s1,$t1
552	xor	$s2,$s2,$t2
553	xor	$s3,$s3,$t3
554	rlwinm	$acc00,$s0,`32-24`,24,31
555	rlwinm	$acc01,$s1,`32-24`,24,31
556	rlwinm	$acc02,$s2,`32-24`,24,31
557	rlwinm	$acc03,$s3,`32-24`,24,31
558	rlwinm	$acc04,$s1,`32-16`,24,31
559	rlwinm	$acc05,$s2,`32-16`,24,31
560	rlwinm	$acc06,$s3,`32-16`,24,31
561	rlwinm	$acc07,$s0,`32-16`,24,31
562	lbzx	$acc00,$Tbl1,$acc00
563	lbzx	$acc01,$Tbl1,$acc01
564	rlwinm	$acc08,$s2,`32-8`,24,31
565	rlwinm	$acc09,$s3,`32-8`,24,31
566	lbzx	$acc02,$Tbl1,$acc02
567	lbzx	$acc03,$Tbl1,$acc03
568	rlwinm	$acc10,$s0,`32-8`,24,31
569	rlwinm	$acc11,$s1,`32-8`,24,31
570	lbzx	$acc04,$Tbl1,$acc04
571	lbzx	$acc05,$Tbl1,$acc05
572	rlwinm	$acc12,$s3,`0`,24,31
573	rlwinm	$acc13,$s0,`0`,24,31
574	lbzx	$acc06,$Tbl1,$acc06
575	lbzx	$acc07,$Tbl1,$acc07
576	rlwinm	$acc14,$s1,`0`,24,31
577	rlwinm	$acc15,$s2,`0`,24,31
578	lbzx	$acc08,$Tbl1,$acc08
579	lbzx	$acc09,$Tbl1,$acc09
580	rlwinm	$s0,$acc00,24,0,7
581	rlwinm	$s1,$acc01,24,0,7
582	lbzx	$acc10,$Tbl1,$acc10
583	lbzx	$acc11,$Tbl1,$acc11
584	rlwinm	$s2,$acc02,24,0,7
585	rlwinm	$s3,$acc03,24,0,7
586	lbzx	$acc12,$Tbl1,$acc12
587	lbzx	$acc13,$Tbl1,$acc13
588	rlwimi	$s0,$acc04,16,8,15
589	rlwimi	$s1,$acc05,16,8,15
590	lbzx	$acc14,$Tbl1,$acc14
591	lbzx	$acc15,$Tbl1,$acc15
592	rlwimi	$s2,$acc06,16,8,15
593	rlwimi	$s3,$acc07,16,8,15
594	rlwimi	$s0,$acc08,8,16,23
595	rlwimi	$s1,$acc09,8,16,23
596	rlwimi	$s2,$acc10,8,16,23
597	rlwimi	$s3,$acc11,8,16,23
	# the next round key is loaded while the bytes are being packed
598	lwz	$t0,0($key)
599	lwz	$t1,4($key)
600	or	$s0,$s0,$acc12
601	or	$s1,$s1,$acc13
602	lwz	$t2,8($key)
603	lwz	$t3,12($key)
604	or	$s2,$s2,$acc14
605	or	$s3,$s3,$acc15
606
	# After the last substitution step leave the loop; the mixing
	# below is skipped and only the final round key is added.
607	addi	$key,$key,16
608	bdz	Lenc_compact_done
609
	# r2: each byte of r0 shifted left by one, with 0x1b xor-ed into
	# the bytes whose top bit was set, computed on four bytes at once
610	and	$acc00,$s0,$mask80	# r1=r0&0x80808080
611	and	$acc01,$s1,$mask80
612	and	$acc02,$s2,$mask80
613	and	$acc03,$s3,$mask80
614	srwi	$acc04,$acc00,7		# r1>>7
615	srwi	$acc05,$acc01,7
616	srwi	$acc06,$acc02,7
617	srwi	$acc07,$acc03,7
618	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
619	andc	$acc09,$s1,$mask80
620	andc	$acc10,$s2,$mask80
621	andc	$acc11,$s3,$mask80
622	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
623	sub	$acc01,$acc01,$acc05
624	sub	$acc02,$acc02,$acc06
625	sub	$acc03,$acc03,$acc07
626	add	$acc08,$acc08,$acc08	# (r0&0x7f7f7f7f)<<1
627	add	$acc09,$acc09,$acc09
628	add	$acc10,$acc10,$acc10
629	add	$acc11,$acc11,$acc11
630	and	$acc00,$acc00,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
631	and	$acc01,$acc01,$mask1b
632	and	$acc02,$acc02,$mask1b
633	and	$acc03,$acc03,$mask1b
634	xor	$acc00,$acc00,$acc08	# r2
635	xor	$acc01,$acc01,$acc09
636	xor	$acc02,$acc02,$acc10
637	xor	$acc03,$acc03,$acc11
638
	# combine r0, r2 and their rotations into the new state word
639	rotlwi	$acc12,$s0,16		# ROTATE(r0,16)
640	rotlwi	$acc13,$s1,16
641	rotlwi	$acc14,$s2,16
642	rotlwi	$acc15,$s3,16
643	xor	$s0,$s0,$acc00		# r0^r2
644	xor	$s1,$s1,$acc01
645	xor	$s2,$s2,$acc02
646	xor	$s3,$s3,$acc03
647	rotrwi	$s0,$s0,24		# ROTATE(r2^r0,24)
648	rotrwi	$s1,$s1,24
649	rotrwi	$s2,$s2,24
650	rotrwi	$s3,$s3,24
651	xor	$s0,$s0,$acc00		# ROTATE(r2^r0,24)^r2
652	xor	$s1,$s1,$acc01
653	xor	$s2,$s2,$acc02
654	xor	$s3,$s3,$acc03
655	rotlwi	$acc08,$acc12,8		# ROTATE(r0,24)
656	rotlwi	$acc09,$acc13,8
657	rotlwi	$acc10,$acc14,8
658	rotlwi	$acc11,$acc15,8
659	xor	$s0,$s0,$acc12		#
660	xor	$s1,$s1,$acc13
661	xor	$s2,$s2,$acc14
662	xor	$s3,$s3,$acc15
663	xor	$s0,$s0,$acc08		#
664	xor	$s1,$s1,$acc09
665	xor	$s2,$s2,$acc10
666	xor	$s3,$s3,$acc11
667
668	b	Lenc_compact_loop
669.align	4
670Lenc_compact_done:
	# add the final round key, already loaded into t0-t3
671	xor	$s0,$s0,$t0
672	xor	$s1,$s1,$t1
673	xor	$s2,$s2,$t2
674	xor	$s3,$s3,$t3
675	blr
676
677.globl	.AES_decrypt
678.align	7
679.AES_decrypt:
	# Public decryption entry point: inp in r3, out in r4, key in r5.
	# Allocates a stack frame and saves the return address (via r0),
	# r2 and r13-r31 in it before any of them is used as scratch.
680	mflr	r0
681	$STU	$sp,-$FRAME($sp)
682
683	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
684	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
685	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
686	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
687	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
688	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
689	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
690	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
691	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
692	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
693	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
694	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
695	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
696	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
697	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
698	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
699	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
700	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
701	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
702	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
703	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
704
	# Load the 16-byte input block as four words.  This has to happen
	# before LAES_Td is called: the table address it returns lands in
	# r3 and so overwrites inp.
705	lwz	$s0,0($inp)
706	lwz	$s1,4($inp)
707	lwz	$s2,8($inp)
708	lwz	$s3,12($inp)
	# Fetch the table address, run the compact cipher body on s0-s3,
	# then store the four result words to out.
709	bl	LAES_Td
710	bl	Lppc_AES_decrypt_compact
711	stw	$s0,0($out)
712	stw	$s1,4($out)
713	stw	$s2,8($out)
714	stw	$s3,12($out)
715
	# Restore everything saved above, release the frame and return.
716	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
717	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
718	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
719	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
720	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
721	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
722	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
723	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
724	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
725	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
726	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
727	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
728	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
729	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
730	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
731	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
732	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
733	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
734	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
735	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
736	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
737	mtlr	r0
738	addi	$sp,$sp,$FRAME
739	blr
740
741.align	5
742Lppc_AES_decrypt:
	# Table-driven decryption body.  AES_decrypt above does not call
	# it; it calls the compact variant instead.
	# In:  s0-s3 = state words, key = round keys with the round count
	#      at offset 240, Tbl0 = address returned by LAES_Td.
	# Out: s0-s3.  Uses t0-t3, acc00-acc15, Tbl1-Tbl3, key and ctr.
	# Tbl1/Tbl2/Tbl3 point 3/2/1 bytes into the first entry, so a word
	# load through them returns a byte-rotated table entry.
743	lwz	$acc00,240($key)
744	lwz	$t0,0($key)
745	lwz	$t1,4($key)
746	lwz	$t2,8($key)
747	lwz	$t3,12($key)
748	addi	$Tbl1,$Tbl0,3
749	addi	$Tbl2,$Tbl0,2
750	addi	$Tbl3,$Tbl0,1
751	addi	$acc00,$acc00,-1
752	addi	$key,$key,16
	# xor in the first round key, then loop rounds-1 times
753	xor	$s0,$s0,$t0
754	xor	$s1,$s1,$t1
755	xor	$s2,$s2,$t2
756	xor	$s3,$s3,$t3
757	mtctr	$acc00
758.align	4
759Ldec_loop:
	# Same structure as the encryption loop: state bytes are extracted
	# pre-multiplied by 8 and used as offsets into the 8-byte table
	# entries.  Only the choice of source word for the second and
	# fourth byte positions differs from the encryption loop.
760	rlwinm	$acc00,$s0,`32-24+3`,21,28
761	rlwinm	$acc01,$s1,`32-24+3`,21,28
762	rlwinm	$acc02,$s2,`32-24+3`,21,28
763	rlwinm	$acc03,$s3,`32-24+3`,21,28
764	lwz	$t0,0($key)
765	lwz	$t1,4($key)
766	rlwinm	$acc04,$s3,`32-16+3`,21,28
767	rlwinm	$acc05,$s0,`32-16+3`,21,28
768	lwz	$t2,8($key)
769	lwz	$t3,12($key)
770	rlwinm	$acc06,$s1,`32-16+3`,21,28
771	rlwinm	$acc07,$s2,`32-16+3`,21,28
772	lwzx	$acc00,$Tbl0,$acc00
773	lwzx	$acc01,$Tbl0,$acc01
774	rlwinm	$acc08,$s2,`32-8+3`,21,28
775	rlwinm	$acc09,$s3,`32-8+3`,21,28
776	lwzx	$acc02,$Tbl0,$acc02
777	lwzx	$acc03,$Tbl0,$acc03
778	rlwinm	$acc10,$s0,`32-8+3`,21,28
779	rlwinm	$acc11,$s1,`32-8+3`,21,28
780	lwzx	$acc04,$Tbl1,$acc04
781	lwzx	$acc05,$Tbl1,$acc05
782	rlwinm	$acc12,$s1,`0+3`,21,28
783	rlwinm	$acc13,$s2,`0+3`,21,28
784	lwzx	$acc06,$Tbl1,$acc06
785	lwzx	$acc07,$Tbl1,$acc07
786	rlwinm	$acc14,$s3,`0+3`,21,28
787	rlwinm	$acc15,$s0,`0+3`,21,28
788	lwzx	$acc08,$Tbl2,$acc08
789	lwzx	$acc09,$Tbl2,$acc09
790	xor	$t0,$t0,$acc00
791	xor	$t1,$t1,$acc01
792	lwzx	$acc10,$Tbl2,$acc10
793	lwzx	$acc11,$Tbl2,$acc11
794	xor	$t2,$t2,$acc02
795	xor	$t3,$t3,$acc03
796	lwzx	$acc12,$Tbl3,$acc12
797	lwzx	$acc13,$Tbl3,$acc13
798	xor	$t0,$t0,$acc04
799	xor	$t1,$t1,$acc05
800	lwzx	$acc14,$Tbl3,$acc14
801	lwzx	$acc15,$Tbl3,$acc15
802	xor	$t2,$t2,$acc06
803	xor	$t3,$t3,$acc07
804	xor	$t0,$t0,$acc08
805	xor	$t1,$t1,$acc09
806	xor	$t2,$t2,$acc10
807	xor	$t3,$t3,$acc11
808	xor	$s0,$t0,$acc12
809	xor	$s1,$t1,$acc13
810	xor	$s2,$t2,$acc14
811	xor	$s3,$t3,$acc15
812	addi	$key,$key,16
813	bdnz-	Ldec_loop
814
	# Last round: Tbl2 now points at the 256-byte table behind the
	# word table.  Plain byte indices are looked up with lbzx and the
	# results are packed back into s0-s3, then the last round key is
	# xor-ed in.  The eight lwz loads at 32-byte strides only touch
	# that table; their results are overwritten before being used.
815	addi	$Tbl2,$Tbl0,2048
816	nop
817	lwz	$t0,0($key)
818	lwz	$t1,4($key)
819	rlwinm	$acc00,$s0,`32-24`,24,31
820	rlwinm	$acc01,$s1,`32-24`,24,31
821	lwz	$t2,8($key)
822	lwz	$t3,12($key)
823	rlwinm	$acc02,$s2,`32-24`,24,31
824	rlwinm	$acc03,$s3,`32-24`,24,31
825	lwz	$acc08,`2048+0`($Tbl0)	! prefetch Td4
826	lwz	$acc09,`2048+32`($Tbl0)
827	rlwinm	$acc04,$s3,`32-16`,24,31
828	rlwinm	$acc05,$s0,`32-16`,24,31
829	lwz	$acc10,`2048+64`($Tbl0)
830	lwz	$acc11,`2048+96`($Tbl0)
831	lbzx	$acc00,$Tbl2,$acc00
832	lbzx	$acc01,$Tbl2,$acc01
833	lwz	$acc12,`2048+128`($Tbl0)
834	lwz	$acc13,`2048+160`($Tbl0)
835	rlwinm	$acc06,$s1,`32-16`,24,31
836	rlwinm	$acc07,$s2,`32-16`,24,31
837	lwz	$acc14,`2048+192`($Tbl0)
838	lwz	$acc15,`2048+224`($Tbl0)
839	rlwinm	$acc08,$s2,`32-8`,24,31
840	rlwinm	$acc09,$s3,`32-8`,24,31
841	lbzx	$acc02,$Tbl2,$acc02
842	lbzx	$acc03,$Tbl2,$acc03
843	rlwinm	$acc10,$s0,`32-8`,24,31
844	rlwinm	$acc11,$s1,`32-8`,24,31
845	lbzx	$acc04,$Tbl2,$acc04
846	lbzx	$acc05,$Tbl2,$acc05
847	rlwinm	$acc12,$s1,`0`,24,31
848	rlwinm	$acc13,$s2,`0`,24,31
849	lbzx	$acc06,$Tbl2,$acc06
850	lbzx	$acc07,$Tbl2,$acc07
851	rlwinm	$acc14,$s3,`0`,24,31
852	rlwinm	$acc15,$s0,`0`,24,31
853	lbzx	$acc08,$Tbl2,$acc08
854	lbzx	$acc09,$Tbl2,$acc09
855	rlwinm	$s0,$acc00,24,0,7
856	rlwinm	$s1,$acc01,24,0,7
857	lbzx	$acc10,$Tbl2,$acc10
858	lbzx	$acc11,$Tbl2,$acc11
859	rlwinm	$s2,$acc02,24,0,7
860	rlwinm	$s3,$acc03,24,0,7
861	lbzx	$acc12,$Tbl2,$acc12
862	lbzx	$acc13,$Tbl2,$acc13
863	rlwimi	$s0,$acc04,16,8,15
864	rlwimi	$s1,$acc05,16,8,15
865	lbzx	$acc14,$Tbl2,$acc14
866	lbzx	$acc15,$Tbl2,$acc15
867	rlwimi	$s2,$acc06,16,8,15
868	rlwimi	$s3,$acc07,16,8,15
869	rlwimi	$s0,$acc08,8,16,23
870	rlwimi	$s1,$acc09,8,16,23
871	rlwimi	$s2,$acc10,8,16,23
872	rlwimi	$s3,$acc11,8,16,23
873	or	$s0,$s0,$acc12
874	or	$s1,$s1,$acc13
875	or	$s2,$s2,$acc14
876	or	$s3,$s3,$acc15
877	xor	$s0,$s0,$t0
878	xor	$s1,$s1,$t1
879	xor	$s2,$s2,$t2
880	xor	$s3,$s3,$t3
881	blr
882
883.align	4
884Lppc_AES_decrypt_compact:
885	lwz	$acc00,240($key)
886	lwz	$t0,0($key)
887	lwz	$t1,4($key)
888	lwz	$t2,8($key)
889	lwz	$t3,12($key)
890	addi	$Tbl1,$Tbl0,2048
891	lis	$mask80,0x8080
892	lis	$mask1b,0x1b1b
893	addi	$key,$key,16
894	ori	$mask80,$mask80,0x8080
895	ori	$mask1b,$mask1b,0x1b1b
896___
897$code.=<<___ if ($SIZE_T==8);
898	insrdi	$mask80,$mask80,32,0
899	insrdi	$mask1b,$mask1b,32,0
900___
901$code.=<<___;
902	mtctr	$acc00
903.align	4
904Ldec_compact_loop:
905	xor	$s0,$s0,$t0
906	xor	$s1,$s1,$t1
907	xor	$s2,$s2,$t2
908	xor	$s3,$s3,$t3
909	rlwinm	$acc00,$s0,`32-24`,24,31
910	rlwinm	$acc01,$s1,`32-24`,24,31
911	rlwinm	$acc02,$s2,`32-24`,24,31
912	rlwinm	$acc03,$s3,`32-24`,24,31
913	rlwinm	$acc04,$s3,`32-16`,24,31
914	rlwinm	$acc05,$s0,`32-16`,24,31
915	rlwinm	$acc06,$s1,`32-16`,24,31
916	rlwinm	$acc07,$s2,`32-16`,24,31
917	lbzx	$acc00,$Tbl1,$acc00
918	lbzx	$acc01,$Tbl1,$acc01
919	rlwinm	$acc08,$s2,`32-8`,24,31
920	rlwinm	$acc09,$s3,`32-8`,24,31
921	lbzx	$acc02,$Tbl1,$acc02
922	lbzx	$acc03,$Tbl1,$acc03
923	rlwinm	$acc10,$s0,`32-8`,24,31
924	rlwinm	$acc11,$s1,`32-8`,24,31
925	lbzx	$acc04,$Tbl1,$acc04
926	lbzx	$acc05,$Tbl1,$acc05
927	rlwinm	$acc12,$s1,`0`,24,31
928	rlwinm	$acc13,$s2,`0`,24,31
929	lbzx	$acc06,$Tbl1,$acc06
930	lbzx	$acc07,$Tbl1,$acc07
931	rlwinm	$acc14,$s3,`0`,24,31
932	rlwinm	$acc15,$s0,`0`,24,31
933	lbzx	$acc08,$Tbl1,$acc08
934	lbzx	$acc09,$Tbl1,$acc09
935	rlwinm	$s0,$acc00,24,0,7
936	rlwinm	$s1,$acc01,24,0,7
937	lbzx	$acc10,$Tbl1,$acc10
938	lbzx	$acc11,$Tbl1,$acc11
939	rlwinm	$s2,$acc02,24,0,7
940	rlwinm	$s3,$acc03,24,0,7
941	lbzx	$acc12,$Tbl1,$acc12
942	lbzx	$acc13,$Tbl1,$acc13
943	rlwimi	$s0,$acc04,16,8,15
944	rlwimi	$s1,$acc05,16,8,15
945	lbzx	$acc14,$Tbl1,$acc14
946	lbzx	$acc15,$Tbl1,$acc15
947	rlwimi	$s2,$acc06,16,8,15
948	rlwimi	$s3,$acc07,16,8,15
949	rlwimi	$s0,$acc08,8,16,23
950	rlwimi	$s1,$acc09,8,16,23
951	rlwimi	$s2,$acc10,8,16,23
952	rlwimi	$s3,$acc11,8,16,23
953	lwz	$t0,0($key)
954	lwz	$t1,4($key)
955	or	$s0,$s0,$acc12
956	or	$s1,$s1,$acc13
957	lwz	$t2,8($key)
958	lwz	$t3,12($key)
959	or	$s2,$s2,$acc14
960	or	$s3,$s3,$acc15
961
962	addi	$key,$key,16
963	bdz	Ldec_compact_done
964___
# Appended only for 64-bit flavours ($SIZE_T==8).
#
# The two insrdi instructions copy the low 32 bits of $s1 into the upper
# half of $s0, and of $s3 into the upper half of $s2, so each following
# and/andc/srdi/sub/add/xor processes two 32-bit state words at once.
# Three identical mask-shift-xor passes then derive the values the inline
# comments call r2, r4 and r8 from r0; the pattern matches byte-wise
# doubling in the AES field with reduction constant 0x1b.  After that the
# packed state is XORed in to form r2^r0 and r4^r0, and extrdi copies the
# upper 32 bits of every even-numbered $accNN into the next odd-numbered
# register.  On exit the low words of $acc00-$acc03 carry r2^r0, of
# $acc04-$acc07 carry r4^r0 and of $acc08-$acc11 carry r8, one state word
# per register; $acc12 and $acc14 are used as scratch.
#
# NOTE(review): $mask80 and $mask1b are loaded outside this chunk; this
# path presumably needs the 0x80/0x1b byte pattern replicated across all
# 64 bits -- confirm where they are set up.
965$code.=<<___ if ($SIZE_T==8);
966	# vectorized permutation improves decrypt performance by 10%
967	insrdi	$s0,$s1,32,0
968	insrdi	$s2,$s3,32,0
969
970	and	$acc00,$s0,$mask80	# r1=r0&0x80808080
971	and	$acc02,$s2,$mask80
972	srdi	$acc04,$acc00,7		# r1>>7
973	srdi	$acc06,$acc02,7
974	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
975	andc	$acc10,$s2,$mask80
976	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
977	sub	$acc02,$acc02,$acc06
978	add	$acc08,$acc08,$acc08	# (r0&0x7f7f7f7f)<<1
979	add	$acc10,$acc10,$acc10
980	and	$acc00,$acc00,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
981	and	$acc02,$acc02,$mask1b
982	xor	$acc00,$acc00,$acc08	# r2
983	xor	$acc02,$acc02,$acc10
984
985	and	$acc04,$acc00,$mask80	# r1=r2&0x80808080
986	and	$acc06,$acc02,$mask80
987	srdi	$acc08,$acc04,7		# r1>>7
988	srdi	$acc10,$acc06,7
989	andc	$acc12,$acc00,$mask80	# r2&0x7f7f7f7f
990	andc	$acc14,$acc02,$mask80
991	sub	$acc04,$acc04,$acc08	# r1-(r1>>7)
992	sub	$acc06,$acc06,$acc10
993	add	$acc12,$acc12,$acc12	# (r2&0x7f7f7f7f)<<1
994	add	$acc14,$acc14,$acc14
995	and	$acc04,$acc04,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
996	and	$acc06,$acc06,$mask1b
997	xor	$acc04,$acc04,$acc12	# r4
998	xor	$acc06,$acc06,$acc14
999
1000	and	$acc08,$acc04,$mask80	# r1=r4&0x80808080
1001	and	$acc10,$acc06,$mask80
1002	srdi	$acc12,$acc08,7		# r1>>7
1003	srdi	$acc14,$acc10,7
1004	sub	$acc08,$acc08,$acc12	# r1-(r1>>7)
1005	sub	$acc10,$acc10,$acc14
1006	andc	$acc12,$acc04,$mask80	# r4&0x7f7f7f7f
1007	andc	$acc14,$acc06,$mask80
1008	add	$acc12,$acc12,$acc12	# (r4&0x7f7f7f7f)<<1
1009	add	$acc14,$acc14,$acc14
1010	and	$acc08,$acc08,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
1011	and	$acc10,$acc10,$mask1b
1012	xor	$acc08,$acc08,$acc12	# r8
1013	xor	$acc10,$acc10,$acc14
1014
1015	xor	$acc00,$acc00,$s0	# r2^r0
1016	xor	$acc02,$acc02,$s2
1017	xor	$acc04,$acc04,$s0	# r4^r0
1018	xor	$acc06,$acc06,$s2
1019
1020	extrdi	$acc01,$acc00,32,0
1021	extrdi	$acc03,$acc02,32,0
1022	extrdi	$acc05,$acc04,32,0
1023	extrdi	$acc07,$acc06,32,0
1024	extrdi	$acc09,$acc08,32,0
1025	extrdi	$acc11,$acc10,32,0
1026___
# Appended only for 32-bit flavours ($SIZE_T==4).
#
# Each of the four state words $s0-$s3 (r0 in the inline comments) goes
# through three identical mask-shift-xor passes, one register per word:
#   r1 = r & 0x80808080
#   r' = ((r & 0x7f7f7f7f) << 1) ^ ((r1 - (r1 >> 7)) & 0x1b1b1b1b)
# which matches byte-wise doubling in the AES field with reduction
# constant 0x1b.  The passes yield r2 in $acc00-$acc03, r4 in
# $acc04-$acc07 and r8 in $acc08-$acc11; $acc12-$acc15 are scratch.  The
# final eight XORs turn the first two groups into r2^r0 and r4^r0.
# The same step is issued for all four words back to back before moving
# on to the next step, so neighbouring instructions are independent.
1027$code.=<<___ if ($SIZE_T==4);
1028	and	$acc00,$s0,$mask80	# r1=r0&0x80808080
1029	and	$acc01,$s1,$mask80
1030	and	$acc02,$s2,$mask80
1031	and	$acc03,$s3,$mask80
1032	srwi	$acc04,$acc00,7		# r1>>7
1033	srwi	$acc05,$acc01,7
1034	srwi	$acc06,$acc02,7
1035	srwi	$acc07,$acc03,7
1036	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
1037	andc	$acc09,$s1,$mask80
1038	andc	$acc10,$s2,$mask80
1039	andc	$acc11,$s3,$mask80
1040	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
1041	sub	$acc01,$acc01,$acc05
1042	sub	$acc02,$acc02,$acc06
1043	sub	$acc03,$acc03,$acc07
1044	add	$acc08,$acc08,$acc08	# (r0&0x7f7f7f7f)<<1
1045	add	$acc09,$acc09,$acc09
1046	add	$acc10,$acc10,$acc10
1047	add	$acc11,$acc11,$acc11
1048	and	$acc00,$acc00,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
1049	and	$acc01,$acc01,$mask1b
1050	and	$acc02,$acc02,$mask1b
1051	and	$acc03,$acc03,$mask1b
1052	xor	$acc00,$acc00,$acc08	# r2
1053	xor	$acc01,$acc01,$acc09
1054	xor	$acc02,$acc02,$acc10
1055	xor	$acc03,$acc03,$acc11
1056
1057	and	$acc04,$acc00,$mask80	# r1=r2&0x80808080
1058	and	$acc05,$acc01,$mask80
1059	and	$acc06,$acc02,$mask80
1060	and	$acc07,$acc03,$mask80
1061	srwi	$acc08,$acc04,7		# r1>>7
1062	srwi	$acc09,$acc05,7
1063	srwi	$acc10,$acc06,7
1064	srwi	$acc11,$acc07,7
1065	andc	$acc12,$acc00,$mask80	# r2&0x7f7f7f7f
1066	andc	$acc13,$acc01,$mask80
1067	andc	$acc14,$acc02,$mask80
1068	andc	$acc15,$acc03,$mask80
1069	sub	$acc04,$acc04,$acc08	# r1-(r1>>7)
1070	sub	$acc05,$acc05,$acc09
1071	sub	$acc06,$acc06,$acc10
1072	sub	$acc07,$acc07,$acc11
1073	add	$acc12,$acc12,$acc12	# (r2&0x7f7f7f7f)<<1
1074	add	$acc13,$acc13,$acc13
1075	add	$acc14,$acc14,$acc14
1076	add	$acc15,$acc15,$acc15
1077	and	$acc04,$acc04,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
1078	and	$acc05,$acc05,$mask1b
1079	and	$acc06,$acc06,$mask1b
1080	and	$acc07,$acc07,$mask1b
1081	xor	$acc04,$acc04,$acc12	# r4
1082	xor	$acc05,$acc05,$acc13
1083	xor	$acc06,$acc06,$acc14
1084	xor	$acc07,$acc07,$acc15
1085
1086	and	$acc08,$acc04,$mask80	# r1=r4&0x80808080
1087	and	$acc09,$acc05,$mask80
1088	and	$acc10,$acc06,$mask80
1089	and	$acc11,$acc07,$mask80
1090	srwi	$acc12,$acc08,7		# r1>>7
1091	srwi	$acc13,$acc09,7
1092	srwi	$acc14,$acc10,7
1093	srwi	$acc15,$acc11,7
1094	sub	$acc08,$acc08,$acc12	# r1-(r1>>7)
1095	sub	$acc09,$acc09,$acc13
1096	sub	$acc10,$acc10,$acc14
1097	sub	$acc11,$acc11,$acc15
1098	andc	$acc12,$acc04,$mask80	# r4&0x7f7f7f7f
1099	andc	$acc13,$acc05,$mask80
1100	andc	$acc14,$acc06,$mask80
1101	andc	$acc15,$acc07,$mask80
1102	add	$acc12,$acc12,$acc12	# (r4&0x7f7f7f7f)<<1
1103	add	$acc13,$acc13,$acc13
1104	add	$acc14,$acc14,$acc14
1105	add	$acc15,$acc15,$acc15
1106	and	$acc08,$acc08,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
1107	and	$acc09,$acc09,$mask1b
1108	and	$acc10,$acc10,$mask1b
1109	and	$acc11,$acc11,$mask1b
1110	xor	$acc08,$acc08,$acc12	# r8
1111	xor	$acc09,$acc09,$acc13
1112	xor	$acc10,$acc10,$acc14
1113	xor	$acc11,$acc11,$acc15
1114
1115	xor	$acc00,$acc00,$s0	# r2^r0
1116	xor	$acc01,$acc01,$s1
1117	xor	$acc02,$acc02,$s2
1118	xor	$acc03,$acc03,$s3
1119	xor	$acc04,$acc04,$s0	# r4^r0
1120	xor	$acc05,$acc05,$s1
1121	xor	$acc06,$acc06,$s2
1122	xor	$acc07,$acc07,$s3
1123___
# Appended unconditionally: the flavour-independent tail of the compact
# decrypt round, followed by the routine's exit path and file trailer.
#
# Inputs are the per-word values prepared by whichever flavour-specific
# fragment precedes this one: $acc00-$acc03 = r2^r0, $acc04-$acc07 =
# r4^r0, $acc08-$acc11 = r8, with r0 still in $s0-$s3.  Following the
# inline comments, each state word is rebuilt as
#   ROTATE(r0,8) ^ (r2^r0) ^ (r4^r0) ^ r8
#     ^ ROTATE(r8^r2^r0,24) ^ ROTATE(r8^r4^r0,16) ^ ROTATE(r8,8)
# where ROTATE is emitted as rotrwi.  The rotates and XORs are interleaved
# across the four words rather than grouped per word.  The round ends with
# an unconditional branch back to Ldec_compact_loop.
#
# Ldec_compact_done is the target of the bdz issued right after the
# S-box/key-load sequence; it XORs the state with $t0-$t3 (loaded from
# 0..12($key) just before that bdz -- presumably the last round key) and
# returns with blr.
#
# The trailer emits a zero word, the identification string and an
# alignment directive.  The \@ keeps Perl from interpolating an array
# inside the heredoc.
1124$code.=<<___;
1125	rotrwi	$s0,$s0,8		# = ROTATE(r0,8)
1126	rotrwi	$s1,$s1,8
1127	rotrwi	$s2,$s2,8
1128	rotrwi	$s3,$s3,8
1129	xor	$s0,$s0,$acc00		# ^= r2^r0
1130	xor	$s1,$s1,$acc01
1131	xor	$s2,$s2,$acc02
1132	xor	$s3,$s3,$acc03
1133	xor	$acc00,$acc00,$acc08
1134	xor	$acc01,$acc01,$acc09
1135	xor	$acc02,$acc02,$acc10
1136	xor	$acc03,$acc03,$acc11
1137	xor	$s0,$s0,$acc04		# ^= r4^r0
1138	xor	$s1,$s1,$acc05
1139	xor	$s2,$s2,$acc06
1140	xor	$s3,$s3,$acc07
1141	rotrwi	$acc00,$acc00,24
1142	rotrwi	$acc01,$acc01,24
1143	rotrwi	$acc02,$acc02,24
1144	rotrwi	$acc03,$acc03,24
1145	xor	$acc04,$acc04,$acc08
1146	xor	$acc05,$acc05,$acc09
1147	xor	$acc06,$acc06,$acc10
1148	xor	$acc07,$acc07,$acc11
1149	xor	$s0,$s0,$acc08		# ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1150	xor	$s1,$s1,$acc09
1151	xor	$s2,$s2,$acc10
1152	xor	$s3,$s3,$acc11
1153	rotrwi	$acc04,$acc04,16
1154	rotrwi	$acc05,$acc05,16
1155	rotrwi	$acc06,$acc06,16
1156	rotrwi	$acc07,$acc07,16
1157	xor	$s0,$s0,$acc00		# ^= ROTATE(r8^r2^r0,24)
1158	xor	$s1,$s1,$acc01
1159	xor	$s2,$s2,$acc02
1160	xor	$s3,$s3,$acc03
1161	rotrwi	$acc08,$acc08,8
1162	rotrwi	$acc09,$acc09,8
1163	rotrwi	$acc10,$acc10,8
1164	rotrwi	$acc11,$acc11,8
1165	xor	$s0,$s0,$acc04		# ^= ROTATE(r8^r4^r0,16)
1166	xor	$s1,$s1,$acc05
1167	xor	$s2,$s2,$acc06
1168	xor	$s3,$s3,$acc07
1169	xor	$s0,$s0,$acc08		# ^= ROTATE(r8,8)
1170	xor	$s1,$s1,$acc09
1171	xor	$s2,$s2,$acc10
1172	xor	$s3,$s3,$acc11
1173
1174	b	Ldec_compact_loop
1175.align	4
1176Ldec_compact_done:
1177	xor	$s0,$s0,$t0
1178	xor	$s1,$s1,$t1
1179	xor	$s2,$s2,$t2
1180	xor	$s3,$s3,$t3
1181	blr
1182.long	0
1183.asciz	"AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1184.align	7
1185___
1186
# Post-process and emit the accumulated assembly text.
#
# Every `...` span in $code is replaced by the result of evaluating its
# contents as a Perl expression, so constant arithmetic written in the
# templates (for example `32-16`) reaches the assembler as a plain number.
1187$code =~ s/\`([^\`]*)\`/eval $1/gem;
1188print $code;
# Buffered write errors only surface when the handle is closed, so the
# result of close must be checked; otherwise a truncated or empty output
# would still look like a successful run to the build.
# NOTE(review): STDOUT is presumably a pipe into the ppc-xlate.pl
# translator located near the top of the file; on a pipe, close also
# returns false when the child exits with a non-zero status -- confirm.
1189close STDOUT or die "error closing STDOUT: $!";
1190