1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# Version 1.1
10#
# The main motivation for this effort was to mitigate the hazard of
# cache-timing attacks. This is [currently and initially!] addressed in
# two ways. 1. S-boxes are compressed from 5KB to 2KB+256B each.
# 2. References to them are scheduled for L2 cache latency, meaning
# that the tables don't have to reside in L1 cache. Once again, this
# is an initial draft and one should expect more countermeasures to
# be implemented...
18#
# Version 1.1 prefetches T[ed]4 in order to mitigate attacks on the
# last round.
21#
# Even though performance was not the primary goal [on the contrary,
# the extra shifts "induced" by the compressed S-box and the longer loop
# epilogue "induced" by scheduling for L2 have a negative effect on
# performance], the code turned out to run in ~23 cycles per processed
# byte en-/decrypted with a 128-bit key. This is a pretty good result
# for code with the mentioned qualities on an UltraSPARC core. Compared
# to Sun C generated code my encrypt procedure runs just a few percent
# faster, while the decrypt one is a whole 50% faster [yes, Sun C failed
# to generate an optimal decrypt procedure]. Compared to GNU C generated
# code both procedures are more than 60% faster:-)
32
# Target ABI selection.  The 32-bit ABI is the default; the 64-bit one is
# chosen when any command-line argument contains "-m64" or "-xarch=v9".
#
#   $bias   - constant added to %sp whenever the templates address the
#             stack (2047 for the 64-bit build, 0 for the 32-bit build)
#   $frame  - size of the frame reserved by every `save` (192 / 112 bytes)
#   $locals - extra bytes reserved by the two internal routines; the slot
#             at offset $frame holds the off-loaded return address
$bits = (grep { /-m64/ || /-xarch=v9/ } @ARGV) ? 64 : 32;
($bias, $frame) = ($bits == 64) ? (2047, 192) : (0, 112);
$locals = 16;
38
# Symbolic register allocation interpolated into the assembly templates.
#
# $acc0-$acc15 receive the sixteen table look-ups of one round; each group
# of four is folded into one 32-bit word of the state.
($acc0,  $acc1,  $acc2,  $acc3)  = ("%l0", "%o0", "%o1", "%o2");
($acc4,  $acc5,  $acc6,  $acc7)  = ("%l1", "%o3", "%o4", "%o5");
($acc8,  $acc9,  $acc10, $acc11) = ("%l2", "%o7", "%g1", "%g2");
($acc12, $acc13, $acc14, $acc15) = ("%l3", "%g3", "%g4", "%g5");

# $s0-$s3 and $t0-$t3 hold the 128-bit state; the round code alternates
# between the two sets (s -> t, then t -> s).
($t0, $t1, $t2, $t3) = ("%l4", "%l5", "%l6", "%l7");
($s0, $s1, $s2, $s3) = ("%i0", "%i1", "%i2", "%i3");

$tbl    = "%i4";	# base address of the look-up table (AES_Te / AES_Td)
$key    = "%i5";	# pointer into the key schedule, advanced as rounds are consumed
$rounds = "%i7";	# aliases with return address, which is off-loaded to stack
71
# _data_word(LIST)
#
# Appends one "\t.long\t0xWWWWWWWW,0xWWWWWWWW\n" line to the global $code
# for every value in LIST, i.e. each 32-bit word is emitted twice so that
# one table entry occupies 8 bytes.  Processing stops at the first
# undefined argument.  Returns nothing.
#
# The sub used to carry an empty "()" prototype although it is always
# called with arguments; that only worked because the call sites use the
# prototype-bypassing "&name(...)" form.  Without the prototype both
# "&_data_word(...)" and a plain "_data_word(...)" are valid.
sub _data_word
{
    for my $word (@_) {
	last unless defined $word;
	$code .= sprintf "\t.long\t0x%08x,0x%08x\n", $word, $word;
    }
    return;
}
76
# For the 64-bit build only, declare %g2 and %g3 (used as $acc11 and
# $acc13) as scratch registers.
$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
# Open the text section and place the AES_Te label on a 256-byte
# boundary; the table data itself is appended by the statements that
# follow.
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	256
AES_Te:
___
# AES_Te: 256 32-bit words of the encryption look-up table.  _data_word
# writes every word twice, so the emitted table is 256 * 8 = 2048 bytes
# and is indexed by the round code with (byte << 3) & 2040.
&_data_word(
	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Encryption template.  Three parts are appended to $code:
#
# 1. The 256-byte table of .byte rows that directly follows the 2048 bytes
#    of AES_Te words; the code addresses it as $tbl+2048 ("te4") and
#    uses it only for the last round.
#
# 2. _sparcv9_AES_encrypt - internal routine.  On entry (after `save`) the
#    state is in $s0-$s3, the table address in $tbl and the key
#    schedule in $key; the round count is read from [$key+240] and
#    halved because each pass of .Lenc_loop performs two rounds
#    (s -> t, then t -> s).  $rounds aliases %i7, so the return address
#    is parked at [%sp+$bias+$frame] and reloaded before `ret`.  The
#    result is left in $s0-$s3.
#
# 3. AES_encrypt - public entry point.  By usage %o0 is the input
#    block, %o1 the output block and %o2 the key schedule.  When both
#    pointers are 4-byte aligned the block is moved with word
#    loads/stores; otherwise .Lunaligned_enc assembles and scatters the
#    four big-endian words byte by byte.  The table address is computed
#    PC-relatively (call .+8 / add %o7,AES_Te-1b).
#
# The template text is emitted verbatim apart from variable interpolation.
$code.=<<___;
	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
.type	AES_Te,#object
.size	AES_Te,(.-AES_Te)

.align	64
.skip	16
_sparcv9_AES_encrypt:
	save	%sp,-$frame-$locals,%sp
	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
	ld	[$key+240],$rounds
	ld	[$key+0],$t0
	ld	[$key+4],$t1			!
	ld	[$key+8],$t2
	srl	$rounds,1,$rounds
	xor	$t0,$s0,$s0
	ld	[$key+12],$t3
	srl	$s0,21,$acc0
	xor	$t1,$s1,$s1
	ld	[$key+16],$t0
	srl	$s1,13,$acc1			!
	xor	$t2,$s2,$s2
	ld	[$key+20],$t1
	xor	$t3,$s3,$s3
	ld	[$key+24],$t2
	and	$acc0,2040,$acc0
	ld	[$key+28],$t3
	nop
.Lenc_loop:
	srl	$s2,5,$acc2			!
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$s3,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	srl	$s1,21,$acc4
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2		!
	srl	$s2,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$s3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4
	fmovs	%f0,%f0
	sll	$s0,3,$acc7			!
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$s2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	srl	$s3,13,$acc9
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7		!
	srl	$s0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$s1,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9
	fmovs	%f0,%f0
	srl	$s3,21,$acc12			!
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$s0,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	srl	$s1,5,$acc14
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12		!
	sll	$s2,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
	and	$acc15,2040,$acc15
	add	$key,32,$key
	ldx	[$tbl+$acc14],$acc14
	fmovs	%f0,%f0
	subcc	$rounds,1,$rounds		!
	ldx	[$tbl+$acc15],$acc15
	bz,a,pn	%icc,.Lenc_last
	add	$tbl,2048,$rounds

		srlx	$acc1,8,$acc1
		xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
	fmovs	%f0,%f0
		srlx	$acc2,16,$acc2		!
		xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
		srlx	$acc3,24,$acc3
		xor	$acc2,$t0,$t0
	ld	[$key+8],$s2
		srlx	$acc5,8,$acc5
		xor	$acc3,$t0,$t0
	ld	[$key+12],$s3			!
		srlx	$acc6,16,$acc6
		xor	$acc4,$t1,$t1
	fmovs	%f0,%f0
		srlx	$acc7,24,$acc7
		xor	$acc5,$t1,$t1
		srlx	$acc9,8,$acc9
		xor	$acc6,$t1,$t1
		srlx	$acc10,16,$acc10	!
		xor	$acc7,$t1,$t1
		srlx	$acc11,24,$acc11
		xor	$acc8,$t2,$t2
		srlx	$acc13,8,$acc13
		xor	$acc9,$t2,$t2
		srlx	$acc14,16,$acc14
		xor	$acc10,$t2,$t2
		srlx	$acc15,24,$acc15	!
		xor	$acc11,$t2,$t2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$t3,$t3
	srl	$t0,21,$acc0
		xor	$acc14,$t3,$t3
	srl	$t1,13,$acc1
		xor	$acc15,$t3,$t3

	and	$acc0,2040,$acc0		!
	srl	$t2,5,$acc2
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$t3,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	fmovs	%f0,%f0
	srl	$t1,21,$acc4			!
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2
	srl	$t2,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$t3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4		!
	sll	$t0,3,$acc7
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$t2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	fmovs	%f0,%f0
	srl	$t3,13,$acc9			!
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7
	srl	$t0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$t1,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9		!
	srl	$t3,21,$acc12
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$t0,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	fmovs	%f0,%f0
	srl	$t1,5,$acc14			!
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12
	sll	$t2,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
		srlx	$acc1,8,$acc1
	and	$acc15,2040,$acc15
	ldx	[$tbl+$acc14],$acc14		!

		srlx	$acc2,16,$acc2
		xor	$acc0,$s0,$s0
	ldx	[$tbl+$acc15],$acc15
		srlx	$acc3,24,$acc3
		xor	$acc1,$s0,$s0
	ld	[$key+16],$t0
	fmovs	%f0,%f0
		srlx	$acc5,8,$acc5		!
		xor	$acc2,$s0,$s0
	ld	[$key+20],$t1
		srlx	$acc6,16,$acc6
		xor	$acc3,$s0,$s0
	ld	[$key+24],$t2
		srlx	$acc7,24,$acc7
		xor	$acc4,$s1,$s1
	ld	[$key+28],$t3			!
		srlx	$acc9,8,$acc9
		xor	$acc5,$s1,$s1
	ldx	[$tbl+2048+0],%g0		! prefetch te4
		srlx	$acc10,16,$acc10
		xor	$acc6,$s1,$s1
	ldx	[$tbl+2048+32],%g0		! prefetch te4
		srlx	$acc11,24,$acc11
		xor	$acc7,$s1,$s1
	ldx	[$tbl+2048+64],%g0		! prefetch te4
		srlx	$acc13,8,$acc13
		xor	$acc8,$s2,$s2
	ldx	[$tbl+2048+96],%g0		! prefetch te4
		srlx	$acc14,16,$acc14	!
		xor	$acc9,$s2,$s2
	ldx	[$tbl+2048+128],%g0		! prefetch te4
		srlx	$acc15,24,$acc15
		xor	$acc10,$s2,$s2
	ldx	[$tbl+2048+160],%g0		! prefetch te4
	srl	$s0,21,$acc0
		xor	$acc11,$s2,$s2
	ldx	[$tbl+2048+192],%g0		! prefetch te4
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$s3,$s3
	ldx	[$tbl+2048+224],%g0		! prefetch te4
	srl	$s1,13,$acc1			!
		xor	$acc14,$s3,$s3
		xor	$acc15,$s3,$s3
	ba	.Lenc_loop
	and	$acc0,2040,$acc0

.align	32
.Lenc_last:
		srlx	$acc1,8,$acc1		!
		xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
		srlx	$acc2,16,$acc2
		xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
		srlx	$acc3,24,$acc3
		xor	$acc2,$t0,$t0
	ld	[$key+8],$s2			!
		srlx	$acc5,8,$acc5
		xor	$acc3,$t0,$t0
	ld	[$key+12],$s3
		srlx	$acc6,16,$acc6
		xor	$acc4,$t1,$t1
		srlx	$acc7,24,$acc7
		xor	$acc5,$t1,$t1
		srlx	$acc9,8,$acc9		!
		xor	$acc6,$t1,$t1
		srlx	$acc10,16,$acc10
		xor	$acc7,$t1,$t1
		srlx	$acc11,24,$acc11
		xor	$acc8,$t2,$t2
		srlx	$acc13,8,$acc13
		xor	$acc9,$t2,$t2
		srlx	$acc14,16,$acc14	!
		xor	$acc10,$t2,$t2
		srlx	$acc15,24,$acc15
		xor	$acc11,$t2,$t2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$t3,$t3
	srl	$t0,24,$acc0
		xor	$acc14,$t3,$t3
	srl	$t1,16,$acc1			!
		xor	$acc15,$t3,$t3

	srl	$t2,8,$acc2
	and	$acc1,255,$acc1
	ldub	[$rounds+$acc0],$acc0
	srl	$t1,24,$acc4
	and	$acc2,255,$acc2
	ldub	[$rounds+$acc1],$acc1
	srl	$t2,16,$acc5			!
	and	$t3,255,$acc3
	ldub	[$rounds+$acc2],$acc2
	ldub	[$rounds+$acc3],$acc3
	srl	$t3,8,$acc6
	and	$acc5,255,$acc5
	ldub	[$rounds+$acc4],$acc4
	fmovs	%f0,%f0
	srl	$t2,24,$acc8			!
	and	$acc6,255,$acc6
	ldub	[$rounds+$acc5],$acc5
	srl	$t3,16,$acc9
	and	$t0,255,$acc7
	ldub	[$rounds+$acc6],$acc6
	ldub	[$rounds+$acc7],$acc7
	fmovs	%f0,%f0
	srl	$t0,8,$acc10			!
	and	$acc9,255,$acc9
	ldub	[$rounds+$acc8],$acc8
	srl	$t3,24,$acc12
	and	$acc10,255,$acc10
	ldub	[$rounds+$acc9],$acc9
	srl	$t0,16,$acc13
	and	$t1,255,$acc11
	ldub	[$rounds+$acc10],$acc10		!
	srl	$t1,8,$acc14
	and	$acc13,255,$acc13
	ldub	[$rounds+$acc11],$acc11
	ldub	[$rounds+$acc12],$acc12
	and	$acc14,255,$acc14
	ldub	[$rounds+$acc13],$acc13
	and	$t2,255,$acc15
	ldub	[$rounds+$acc14],$acc14		!

		sll	$acc0,24,$acc0
		xor	$acc3,$s0,$s0
	ldub	[$rounds+$acc15],$acc15
		sll	$acc1,16,$acc1
		xor	$acc0,$s0,$s0
	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
	fmovs	%f0,%f0
		sll	$acc2,8,$acc2		!
		xor	$acc1,$s0,$s0
		sll	$acc4,24,$acc4
		xor	$acc2,$s0,$s0
		sll	$acc5,16,$acc5
		xor	$acc7,$s1,$s1
		sll	$acc6,8,$acc6
		xor	$acc4,$s1,$s1
		sll	$acc8,24,$acc8		!
		xor	$acc5,$s1,$s1
		sll	$acc9,16,$acc9
		xor	$acc11,$s2,$s2
		sll	$acc10,8,$acc10
		xor	$acc6,$s1,$s1
		sll	$acc12,24,$acc12
		xor	$acc8,$s2,$s2
		sll	$acc13,16,$acc13	!
		xor	$acc9,$s2,$s2
		sll	$acc14,8,$acc14
		xor	$acc10,$s2,$s2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$s3,$s3
		xor	$acc14,$s3,$s3
		xor	$acc15,$s3,$s3

	ret
	restore
.type	_sparcv9_AES_encrypt,#function
.size	_sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)

.align	32
.globl	AES_encrypt
AES_encrypt:
	or	%o0,%o1,%g1
	andcc	%g1,3,%g0
	bnz,pn	%xcc,.Lunaligned_enc
	save	%sp,-$frame,%sp

	ld	[%i0+0],%o0
	ld	[%i0+4],%o1
	ld	[%i0+8],%o2
	ld	[%i0+12],%o3

1:	call	.+8
	add	%o7,AES_Te-1b,%o4
	call	_sparcv9_AES_encrypt
	mov	%i2,%o5

	st	%o0,[%i1+0]
	st	%o1,[%i1+4]
	st	%o2,[%i1+8]
	st	%o3,[%i1+12]

	ret
	restore

.align	32
.Lunaligned_enc:
	ldub	[%i0+0],%l0
	ldub	[%i0+1],%l1
	ldub	[%i0+2],%l2

	sll	%l0,24,%l0
	ldub	[%i0+3],%l3
	sll	%l1,16,%l1
	ldub	[%i0+4],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+5],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+6],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o0
	ldub	[%i0+7],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	ldub	[%i0+8],%l0
	or	%l7,%l6,%l6
	ldub	[%i0+9],%l1
	or	%l4,%l6,%o1
	ldub	[%i0+10],%l2

	sll	%l0,24,%l0
	ldub	[%i0+11],%l3
	sll	%l1,16,%l1
	ldub	[%i0+12],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+13],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+14],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o2
	ldub	[%i0+15],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	or	%l7,%l6,%l6
	or	%l4,%l6,%o3

1:	call	.+8
	add	%o7,AES_Te-1b,%o4
	call	_sparcv9_AES_encrypt
	mov	%i2,%o5

	srl	%o0,24,%l0
	srl	%o0,16,%l1
	stb	%l0,[%i1+0]
	srl	%o0,8,%l2
	stb	%l1,[%i1+1]
	stb	%l2,[%i1+2]
	srl	%o1,24,%l4
	stb	%o0,[%i1+3]

	srl	%o1,16,%l5
	stb	%l4,[%i1+4]
	srl	%o1,8,%l6
	stb	%l5,[%i1+5]
	stb	%l6,[%i1+6]
	srl	%o2,24,%l0
	stb	%o1,[%i1+7]

	srl	%o2,16,%l1
	stb	%l0,[%i1+8]
	srl	%o2,8,%l2
	stb	%l1,[%i1+9]
	stb	%l2,[%i1+10]
	srl	%o3,24,%l4
	stb	%o2,[%i1+11]

	srl	%o3,16,%l5
	stb	%l4,[%i1+12]
	srl	%o3,8,%l6
	stb	%l5,[%i1+13]
	stb	%l6,[%i1+14]
	stb	%o3,[%i1+15]

	ret
	restore
.type	AES_encrypt,#function
.size	AES_encrypt,(.-AES_encrypt)

___
628
# AES_Td: label on a 256-byte boundary followed by the 256 32-bit words
# of the decryption look-up table.  _data_word writes every word twice,
# so the emitted table is 256 * 8 = 2048 bytes and is indexed by the round
# code with (byte << 3) & 2040.
$code.=<<___;
.align	256
AES_Td:
___
&_data_word(
	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Decryption template.  Three parts are appended to $code:
#
# 1. The 256-byte table of .byte rows that directly follows the 2048 bytes
#    of AES_Td words; the code addresses it as $tbl+2048 ("td4") and
#    uses it only for the last round.
#
# 2. _sparcv9_AES_decrypt - internal routine with the same register
#    contract as its encrypt counterpart: state in $s0-$s3, table
#    address in $tbl, key schedule in $key, round count read from
#    [$key+240] and halved because each pass of .Ldec_loop performs
#    two rounds.  The byte selection differs from encryption: output word
#    0 takes its bytes from s0,s3,s2,s1 (encryption uses s0,s1,s2,s3).
#    $rounds aliases %i7, so the return address is parked at
#    [%sp+$bias+$frame] and reloaded before `ret`.
#
# 3. AES_decrypt - public entry point.  By usage %o0 is the input
#    block, %o1 the output block and %o2 the key schedule.  When both
#    pointers are 4-byte aligned the block is moved with word
#    loads/stores; otherwise .Lunaligned_dec works byte by byte.
#
# The template text is emitted verbatim apart from variable interpolation.
$code.=<<___;
	.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
	.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
	.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
	.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
	.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
	.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
	.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
	.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
	.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
	.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
	.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
	.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
	.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
	.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
	.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
	.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
	.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
	.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
	.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
	.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
	.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
	.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
	.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
	.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
	.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
	.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
	.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
	.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
	.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.type	AES_Td,#object
.size	AES_Td,(.-AES_Td)

.align	64
.skip	16
_sparcv9_AES_decrypt:
	save	%sp,-$frame-$locals,%sp
	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
	ld	[$key+240],$rounds
	ld	[$key+0],$t0
	ld	[$key+4],$t1			!
	ld	[$key+8],$t2
	ld	[$key+12],$t3
	srl	$rounds,1,$rounds
	xor	$t0,$s0,$s0
	ld	[$key+16],$t0
	xor	$t1,$s1,$s1
	ld	[$key+20],$t1
	srl	$s0,21,$acc0			!
	xor	$t2,$s2,$s2
	ld	[$key+24],$t2
	xor	$t3,$s3,$s3
	and	$acc0,2040,$acc0
	ld	[$key+28],$t3
	srl	$s3,13,$acc1
	nop
.Ldec_loop:
	srl	$s2,5,$acc2			!
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$s1,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	srl	$s1,21,$acc4
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2		!
	srl	$s0,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$s3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4
	fmovs	%f0,%f0
	sll	$s2,3,$acc7			!
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$s2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	srl	$s1,13,$acc9
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7		!
	srl	$s0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$s3,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9
	fmovs	%f0,%f0
	srl	$s3,21,$acc12			!
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$s2,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	srl	$s1,5,$acc14
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12		!
	sll	$s0,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
	and	$acc15,2040,$acc15
	add	$key,32,$key
	ldx	[$tbl+$acc14],$acc14
	fmovs	%f0,%f0
	subcc	$rounds,1,$rounds		!
	ldx	[$tbl+$acc15],$acc15
	bz,a,pn	%icc,.Ldec_last
	add	$tbl,2048,$rounds

		srlx	$acc1,8,$acc1
		xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
	fmovs	%f0,%f0
		srlx	$acc2,16,$acc2		!
		xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
		srlx	$acc3,24,$acc3
		xor	$acc2,$t0,$t0
	ld	[$key+8],$s2
		srlx	$acc5,8,$acc5
		xor	$acc3,$t0,$t0
	ld	[$key+12],$s3			!
		srlx	$acc6,16,$acc6
		xor	$acc4,$t1,$t1
	fmovs	%f0,%f0
		srlx	$acc7,24,$acc7
		xor	$acc5,$t1,$t1
		srlx	$acc9,8,$acc9
		xor	$acc6,$t1,$t1
		srlx	$acc10,16,$acc10	!
		xor	$acc7,$t1,$t1
		srlx	$acc11,24,$acc11
		xor	$acc8,$t2,$t2
		srlx	$acc13,8,$acc13
		xor	$acc9,$t2,$t2
		srlx	$acc14,16,$acc14
		xor	$acc10,$t2,$t2
		srlx	$acc15,24,$acc15	!
		xor	$acc11,$t2,$t2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$t3,$t3
	srl	$t0,21,$acc0
		xor	$acc14,$t3,$t3
		xor	$acc15,$t3,$t3
	srl	$t3,13,$acc1

	and	$acc0,2040,$acc0		!
	srl	$t2,5,$acc2
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$t1,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	fmovs	%f0,%f0
	srl	$t1,21,$acc4			!
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2
	srl	$t0,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$t3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4		!
	sll	$t2,3,$acc7
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$t2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	fmovs	%f0,%f0
	srl	$t1,13,$acc9			!
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7
	srl	$t0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$t3,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9		!
	srl	$t3,21,$acc12
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$t2,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	fmovs	%f0,%f0
	srl	$t1,5,$acc14			!
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12
	sll	$t0,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
		srlx	$acc1,8,$acc1
	and	$acc15,2040,$acc15
	ldx	[$tbl+$acc14],$acc14		!

		srlx	$acc2,16,$acc2
		xor	$acc0,$s0,$s0
	ldx	[$tbl+$acc15],$acc15
		srlx	$acc3,24,$acc3
		xor	$acc1,$s0,$s0
	ld	[$key+16],$t0
	fmovs	%f0,%f0
		srlx	$acc5,8,$acc5		!
		xor	$acc2,$s0,$s0
	ld	[$key+20],$t1
		srlx	$acc6,16,$acc6
		xor	$acc3,$s0,$s0
	ld	[$key+24],$t2
		srlx	$acc7,24,$acc7
		xor	$acc4,$s1,$s1
	ld	[$key+28],$t3			!
		srlx	$acc9,8,$acc9
		xor	$acc5,$s1,$s1
	ldx	[$tbl+2048+0],%g0		! prefetch td4
		srlx	$acc10,16,$acc10
		xor	$acc6,$s1,$s1
	ldx	[$tbl+2048+32],%g0		! prefetch td4
		srlx	$acc11,24,$acc11
		xor	$acc7,$s1,$s1
	ldx	[$tbl+2048+64],%g0		! prefetch td4
		srlx	$acc13,8,$acc13
		xor	$acc8,$s2,$s2
	ldx	[$tbl+2048+96],%g0		! prefetch td4
		srlx	$acc14,16,$acc14	!
		xor	$acc9,$s2,$s2
	ldx	[$tbl+2048+128],%g0		! prefetch td4
		srlx	$acc15,24,$acc15
		xor	$acc10,$s2,$s2
	ldx	[$tbl+2048+160],%g0		! prefetch td4
	srl	$s0,21,$acc0
		xor	$acc11,$s2,$s2
	ldx	[$tbl+2048+192],%g0		! prefetch td4
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$s3,$s3
	ldx	[$tbl+2048+224],%g0		! prefetch td4
	and	$acc0,2040,$acc0		!
		xor	$acc14,$s3,$s3
		xor	$acc15,$s3,$s3
	ba	.Ldec_loop
	srl	$s3,13,$acc1

.align	32
.Ldec_last:
		srlx	$acc1,8,$acc1		!
		xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
		srlx	$acc2,16,$acc2
		xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
		srlx	$acc3,24,$acc3
		xor	$acc2,$t0,$t0
	ld	[$key+8],$s2			!
		srlx	$acc5,8,$acc5
		xor	$acc3,$t0,$t0
	ld	[$key+12],$s3
		srlx	$acc6,16,$acc6
		xor	$acc4,$t1,$t1
		srlx	$acc7,24,$acc7
		xor	$acc5,$t1,$t1
		srlx	$acc9,8,$acc9		!
		xor	$acc6,$t1,$t1
		srlx	$acc10,16,$acc10
		xor	$acc7,$t1,$t1
		srlx	$acc11,24,$acc11
		xor	$acc8,$t2,$t2
		srlx	$acc13,8,$acc13
		xor	$acc9,$t2,$t2
		srlx	$acc14,16,$acc14	!
		xor	$acc10,$t2,$t2
		srlx	$acc15,24,$acc15
		xor	$acc11,$t2,$t2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$t3,$t3
	srl	$t0,24,$acc0
		xor	$acc14,$t3,$t3
		xor	$acc15,$t3,$t3		!
	srl	$t3,16,$acc1

	srl	$t2,8,$acc2
	and	$acc1,255,$acc1
	ldub	[$rounds+$acc0],$acc0
	srl	$t1,24,$acc4
	and	$acc2,255,$acc2
	ldub	[$rounds+$acc1],$acc1
	srl	$t0,16,$acc5			!
	and	$t1,255,$acc3
	ldub	[$rounds+$acc2],$acc2
	ldub	[$rounds+$acc3],$acc3
	srl	$t3,8,$acc6
	and	$acc5,255,$acc5
	ldub	[$rounds+$acc4],$acc4
	fmovs	%f0,%f0
	srl	$t2,24,$acc8			!
	and	$acc6,255,$acc6
	ldub	[$rounds+$acc5],$acc5
	srl	$t1,16,$acc9
	and	$t2,255,$acc7
	ldub	[$rounds+$acc6],$acc6
	ldub	[$rounds+$acc7],$acc7
	fmovs	%f0,%f0
	srl	$t0,8,$acc10			!
	and	$acc9,255,$acc9
	ldub	[$rounds+$acc8],$acc8
	srl	$t3,24,$acc12
	and	$acc10,255,$acc10
	ldub	[$rounds+$acc9],$acc9
	srl	$t2,16,$acc13
	and	$t3,255,$acc11
	ldub	[$rounds+$acc10],$acc10		!
	srl	$t1,8,$acc14
	and	$acc13,255,$acc13
	ldub	[$rounds+$acc11],$acc11
	ldub	[$rounds+$acc12],$acc12
	and	$acc14,255,$acc14
	ldub	[$rounds+$acc13],$acc13
	and	$t0,255,$acc15
	ldub	[$rounds+$acc14],$acc14		!

		sll	$acc0,24,$acc0
		xor	$acc3,$s0,$s0
	ldub	[$rounds+$acc15],$acc15
		sll	$acc1,16,$acc1
		xor	$acc0,$s0,$s0
	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
	fmovs	%f0,%f0
		sll	$acc2,8,$acc2		!
		xor	$acc1,$s0,$s0
		sll	$acc4,24,$acc4
		xor	$acc2,$s0,$s0
		sll	$acc5,16,$acc5
		xor	$acc7,$s1,$s1
		sll	$acc6,8,$acc6
		xor	$acc4,$s1,$s1
		sll	$acc8,24,$acc8		!
		xor	$acc5,$s1,$s1
		sll	$acc9,16,$acc9
		xor	$acc11,$s2,$s2
		sll	$acc10,8,$acc10
		xor	$acc6,$s1,$s1
		sll	$acc12,24,$acc12
		xor	$acc8,$s2,$s2
		sll	$acc13,16,$acc13	!
		xor	$acc9,$s2,$s2
		sll	$acc14,8,$acc14
		xor	$acc10,$s2,$s2
		xor	$acc12,$acc14,$acc14
		xor	$acc13,$s3,$s3
		xor	$acc14,$s3,$s3
		xor	$acc15,$s3,$s3

	ret
	restore
.type	_sparcv9_AES_decrypt,#function
.size	_sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)

.align	32
.globl	AES_decrypt
AES_decrypt:
	or	%o0,%o1,%g1
	andcc	%g1,3,%g0
	bnz,pn	%xcc,.Lunaligned_dec
	save	%sp,-$frame,%sp

	ld	[%i0+0],%o0
	ld	[%i0+4],%o1
	ld	[%i0+8],%o2
	ld	[%i0+12],%o3

1:	call	.+8
	add	%o7,AES_Td-1b,%o4
	call	_sparcv9_AES_decrypt
	mov	%i2,%o5

	st	%o0,[%i1+0]
	st	%o1,[%i1+4]
	st	%o2,[%i1+8]
	st	%o3,[%i1+12]

	ret
	restore

.align	32
.Lunaligned_dec:
	ldub	[%i0+0],%l0
	ldub	[%i0+1],%l1
	ldub	[%i0+2],%l2

	sll	%l0,24,%l0
	ldub	[%i0+3],%l3
	sll	%l1,16,%l1
	ldub	[%i0+4],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+5],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+6],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o0
	ldub	[%i0+7],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	ldub	[%i0+8],%l0
	or	%l7,%l6,%l6
	ldub	[%i0+9],%l1
	or	%l4,%l6,%o1
	ldub	[%i0+10],%l2

	sll	%l0,24,%l0
	ldub	[%i0+11],%l3
	sll	%l1,16,%l1
	ldub	[%i0+12],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+13],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+14],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o2
	ldub	[%i0+15],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	or	%l7,%l6,%l6
	or	%l4,%l6,%o3

1:	call	.+8
	add	%o7,AES_Td-1b,%o4
	call	_sparcv9_AES_decrypt
	mov	%i2,%o5

	srl	%o0,24,%l0
	srl	%o0,16,%l1
	stb	%l0,[%i1+0]
	srl	%o0,8,%l2
	stb	%l1,[%i1+1]
	stb	%l2,[%i1+2]
	srl	%o1,24,%l4
	stb	%o0,[%i1+3]

	srl	%o1,16,%l5
	stb	%l4,[%i1+4]
	srl	%o1,8,%l6
	stb	%l5,[%i1+5]
	stb	%l6,[%i1+6]
	srl	%o2,24,%l0
	stb	%o1,[%i1+7]

	srl	%o2,16,%l1
	stb	%l0,[%i1+8]
	srl	%o2,8,%l2
	stb	%l1,[%i1+9]
	stb	%l2,[%i1+10]
	srl	%o3,24,%l4
	stb	%o2,[%i1+11]

	srl	%o3,16,%l5
	stb	%l4,[%i1+12]
	srl	%o3,8,%l6
	stb	%l5,[%i1+13]
	stb	%l6,[%i1+14]
	stb	%o3,[%i1+15]

	ret
	restore
.type	AES_decrypt,#function
.size	AES_decrypt,(.-AES_decrypt)
___
1173
# fmovs instructions substituting for FP nops were originally added
# to meet specific instruction alignment requirements to maximize ILP.
# As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
# an undesired effect, so just omit them and sacrifice some fraction of
# a percent in performance...
# (Everything from "fmovs" to the end of its line is blanked out.)
$code =~ s/fmovs.*$//gm;

# Emit the generated assembly.  Buffered write errors (full disk, closed
# pipe) may only surface when the handle is flushed, so both the write
# and the close are checked instead of silently leaving a truncated file.
print $code or die "error writing to STDOUT: $!";
close STDOUT or die "error closing STDOUT: $!";	# ensure flush
1183