aes-s390x.pl revision e45f106cb6b47af1f21efe76e933bdea2f5dd1ca
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70% and
15# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order execution and issued instruction [in this case
18# load value from memory is critical] has to complete before execution
19# flow proceeds. S-boxes are compressed to 2KB[+256B].
20#
21# As for hardware acceleration support. It's basically a "teaser," as
22# it can and should be improved in several ways. Most notably support
23# for CBC is not utilized, nor multiple blocks are ever processed.
24# Then software key schedule can be postponed till hardware support
25# detection... Performance improvement over assembler is reportedly
26# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27# support is implemented.
28
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys, if hardware support is detected.
33
34# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10, because being dual-
40# issue z10 makes it impossible to eliminate the interlock condition:
41# critical path is not long enough. Yet it spends ~24 cycles per byte
42# processed with 128-bit key.
43#
44# Unlike previous version hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done, because deferred key setup can't be made MT-safe, not
47# for key lengths longer than 128 bits.
48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51# because software implementation was optimized.
52
53$softonly=0;	# 0 = also emit the km/kmc hardware paths, non-zero = software only
54
55$t0=$mask='%r0';	# scratch register, also holds the byte-index mask
56$t1='%r1';
57$t2=$inp='%r2';	# scratch register; input pointer on function entry
58$t3=$out=$bits='%r3';	# scratch register; out pointer / bits on function entry
59$key='%r4';		# key schedule pointer
60$i1='%r5';		# index temporaries
61$i2='%r6';
62$i3='%r7';
63$s0='%r8';		# the four 32-bit words of AES state
64$s1='%r9';
65$s2='%r10';
66$s3='%r11';
67$tbl='%r12';		# lookup table base
68$rounds='%r13';	# loop counter
69$ra='%r14';		# return address
70$sp='%r15';		# stack pointer
71
# Append one ".long" line to $code per argument, with the value written twice
# on that line.  Processing stops at the first undefined argument.
72sub _data_word()
73{   foreach my $word (@_)
74    {	last if (!defined($word)); $code.=sprintf(".long\t0x%08x,0x%08x\n",$word,$word); }
75}
76
# AES_Te: encryption lookup table in .text, aligned to 256 bytes.
# _data_word() writes each of the 256 words below twice, so every entry is
# 8 bytes and the word part of the table is 2KB.  The round code loads 4 bytes
# at offset 0, 1, 2 or 3 inside an entry and labels the results Te0, Te3, Te2
# and Te1 respectively.
77$code=<<___;
78.text
79
80.type	AES_Te,\@object
81.align	256
82AES_Te:
83___
84&_data_word(
85	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
86	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
87	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
88	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
89	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
90	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
91	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
92	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
93	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
94	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
95	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
96	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
97	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
98	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
99	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
100	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
101	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
102	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
103	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
104	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
105	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
106	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
107	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
108	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
109	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
110	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
111	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
112	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
113	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
114	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
115	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
116	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
117	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
118	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
119	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
120	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
121	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
122	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
123	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
124	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
125	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
126	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
127	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
128	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
129	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
130	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
131	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
132	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
133	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
134	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
135	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
136	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
137	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
138	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
139	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
140	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
141	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
142	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
143	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
144	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
145	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
146	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
147	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
148	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Remainder of the AES_Te object: the 256 Te4 bytes and the rcon[] words.  The
# key schedule code addresses them as AES_Te+2048 and 256 bytes past that.
# The chunk ends with the AES_encrypt entry label; the function body follows
# in the next chunks.
149$code.=<<___;
150# Te4[256]
151.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
152.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
153.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
154.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
155.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
156.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
157.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
158.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
159.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
160.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
161.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
162.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
163.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
164.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
165.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
166.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
167.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
168.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
169.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
170.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
171.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
172.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
173.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
174.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
175.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
176.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
177.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
178.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
179.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
180.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
181.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
182.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
183# rcon[]
184.long	0x01000000, 0x02000000, 0x04000000, 0x08000000
185.long	0x10000000, 0x20000000, 0x40000000, 0x80000000
186.long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
187.align	256
188.size	AES_Te,.-AES_Te
189
190# void AES_encrypt(const unsigned char *inp, unsigned char *out,
191# 		 const AES_KEY *key) {
192.globl	AES_encrypt
193.type	AES_encrypt,\@function
194AES_encrypt:
195___
# Hardware path of AES_encrypt, emitted unless $softonly is set.
# The word at offset 240 of the key structure is loaded into %r0.  A value
# below 16 means a software key schedule and control goes to .Lesoft.
# Otherwise %r1 is pointed at the key, %r4 at out and %r3 is set to 16, and
# the km instruction (emitted as a raw .long) processes the single block; the
# brc re-issues it while condition code 3 is reported.
196$code.=<<___ if (!$softonly);
197	l	%r0,240($key)
198	lhi	%r1,16
199	clr	%r0,%r1
200	jl	.Lesoft
201
202	la	%r1,0($key)
203	#la	%r2,0($inp)
204	la	%r4,0($out)
205	lghi	%r3,16		# single block length
206	.long	0xb92e0042	# km %r4,%r2
207	brc	1,.-4		# can this happen?
208	br	%r14
209.align	64
210.Lesoft:
211___
# Software AES_encrypt and its worker _s390x_AES_encrypt.
#
# AES_encrypt saves %r3-%r14 at 24($sp), loads the 16 input bytes into
# $s0-$s3, points $tbl at AES_Te, calls the worker, reloads the out pointer
# from 24($sp), stores $s0-$s3 there and restores %r6-%r14.
#
# _s390x_AES_encrypt takes the state in $s0-$s3, the key schedule in $key and
# the table base in $tbl.  It parks the return address at 152($sp) because $ra
# is used as a scratch index inside the rounds.  After the initial key xor it
# runs rounds-1 iterations of .Lenc_loop; $mask is 0xff<<3, so each extracted
# byte is already scaled to the 8-byte table entries.  The last round uses
# single-byte loads (labelled Te4 in the code) and the following round key.
# $key is advanced and $t0-$t3, $i1-$i3 and $rounds are overwritten.
212$code.=<<___;
213	stmg	%r3,$ra,24($sp)
214
215	llgf	$s0,0($inp)
216	llgf	$s1,4($inp)
217	llgf	$s2,8($inp)
218	llgf	$s3,12($inp)
219
220	larl	$tbl,AES_Te
221	bras	$ra,_s390x_AES_encrypt
222
223	lg	$out,24($sp)
224	st	$s0,0($out)
225	st	$s1,4($out)
226	st	$s2,8($out)
227	st	$s3,12($out)
228
229	lmg	%r6,$ra,48($sp)
230	br	$ra
231.size	AES_encrypt,.-AES_encrypt
232
233.type   _s390x_AES_encrypt,\@function
234.align	16
235_s390x_AES_encrypt:
236	stg	$ra,152($sp)
237	x	$s0,0($key)
238	x	$s1,4($key)
239	x	$s2,8($key)
240	x	$s3,12($key)
241	l	$rounds,240($key)
242	llill	$mask,`0xff<<3`
243	aghi	$rounds,-1
244	j	.Lenc_loop
245.align	16
246.Lenc_loop:
247	sllg	$t1,$s0,`0+3`
248	srlg	$t2,$s0,`8-3`
249	srlg	$t3,$s0,`16-3`
250	srl	$s0,`24-3`
251	nr	$s0,$mask
252	ngr	$t1,$mask
253	nr	$t2,$mask
254	nr	$t3,$mask
255
256	srlg	$i1,$s1,`16-3`	# i0
257	sllg	$i2,$s1,`0+3`
258	srlg	$i3,$s1,`8-3`
259	srl	$s1,`24-3`
260	nr	$i1,$mask
261	nr	$s1,$mask
262	ngr	$i2,$mask
263	nr	$i3,$mask
264
265	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
266	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
267	l	$t2,2($t2,$tbl) # Te2[s0>>8]
268	l	$t3,3($t3,$tbl)	# Te1[s0>>16]
269
270	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
271	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
272	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
273	x	$t3,2($i3,$tbl)	# Te2[s1>>8]
274
275	srlg	$i1,$s2,`8-3`	# i0
276	srlg	$i2,$s2,`16-3`	# i1
277	nr	$i1,$mask
278	nr	$i2,$mask
279	sllg	$i3,$s2,`0+3`
280	srl	$s2,`24-3`
281	nr	$s2,$mask
282	ngr	$i3,$mask
283
284	xr	$s1,$t1
285	srlg	$ra,$s3,`8-3`	# i1
286	sllg	$t1,$s3,`0+3`	# i0
287	nr	$ra,$mask
288	la	$key,16($key)
289	ngr	$t1,$mask
290
291	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
292	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
293	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
294	x	$t3,1($i3,$tbl)	# Te3[s2>>0]
295
296	srlg	$i3,$s3,`16-3`	# i2
297	xr	$s2,$t2
298	srl	$s3,`24-3`
299	nr	$i3,$mask
300	nr	$s3,$mask
301
302	x	$s0,0($key)
303	x	$s1,4($key)
304	x	$s2,8($key)
305	x	$t3,12($key)
306
307	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
308	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
309	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
310	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
311	xr	$s3,$t3
312
313	brct	$rounds,.Lenc_loop
314	.align	16
315
316	sllg	$t1,$s0,`0+3`
317	srlg	$t2,$s0,`8-3`
318	ngr	$t1,$mask
319	srlg	$t3,$s0,`16-3`
320	srl	$s0,`24-3`
321	nr	$s0,$mask
322	nr	$t2,$mask
323	nr	$t3,$mask
324
325	srlg	$i1,$s1,`16-3`	# i0
326	sllg	$i2,$s1,`0+3`
327	ngr	$i2,$mask
328	srlg	$i3,$s1,`8-3`
329	srl	$s1,`24-3`
330	nr	$i1,$mask
331	nr	$s1,$mask
332	nr	$i3,$mask
333
334	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
335	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
336	sll	$s0,24
337	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
338	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
339	sll	$t2,8
340	sll	$t3,16
341
342	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
343	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
344	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
345	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
346	sll	$i1,16
347	sll	$s1,24
348	sll	$i3,8
349	or	$s0,$i1
350	or	$s1,$t1
351	or	$t2,$i2
352	or	$t3,$i3
353
354	srlg	$i1,$s2,`8-3`	# i0
355	srlg	$i2,$s2,`16-3`	# i1
356	nr	$i1,$mask
357	nr	$i2,$mask
358	sllg	$i3,$s2,`0+3`
359	srl	$s2,`24-3`
360	ngr	$i3,$mask
361	nr	$s2,$mask
362
363	sllg	$t1,$s3,`0+3`	# i0
364	srlg	$ra,$s3,`8-3`	# i1
365	ngr	$t1,$mask
366
367	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
368	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
369	sll	$i1,8
370	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
371	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
372	sll	$i2,16
373	nr	$ra,$mask
374	sll	$s2,24
375	or	$s0,$i1
376	or	$s1,$i2
377	or	$s2,$t2
378	or	$t3,$i3
379
380	srlg	$i3,$s3,`16-3`	# i2
381	srl	$s3,`24-3`
382	nr	$i3,$mask
383	nr	$s3,$mask
384
385	l	$t0,16($key)
386	l	$t2,20($key)
387
388	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
389	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
390	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
391	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
392	sll	$i2,8
393	sll	$i3,16
394	sll	$s3,24
395	or	$s0,$i1
396	or	$s1,$i2
397	or	$s2,$i3
398	or	$s3,$t3
399
400	lg	$ra,152($sp)
401	xr	$s0,$t0
402	xr	$s1,$t2
403	x	$s2,24($key)
404	x	$s3,28($key)
405
406	br	$ra
407.size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
408___
409
# AES_Td: decryption lookup table, aligned to 256 bytes.  As with AES_Te,
# _data_word() writes each of the 256 words twice, giving 8-byte entries and
# 2KB in total.  The round code loads at offset 0, 3, 2 or 1 inside an entry
# and labels the results Td0, Td1, Td2 and Td3 respectively.
410$code.=<<___;
411.type	AES_Td,\@object
412.align	256
413AES_Td:
414___
415&_data_word(
416	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
417	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
418	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
419	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
420	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
421	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
422	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
423	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
424	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
425	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
426	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
427	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
428	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
429	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
430	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
431	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
432	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
433	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
434	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
435	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
436	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
437	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
438	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
439	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
440	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
441	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
442	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
443	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
444	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
445	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
446	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
447	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
448	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
449	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
450	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
451	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
452	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
453	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
454	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
455	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
456	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
457	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
458	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
459	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
460	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
461	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
462	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
463	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
464	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
465	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
466	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
467	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
468	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
469	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
470	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
471	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
472	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
473	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
474	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
475	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
476	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
477	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
478	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
479	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Remainder of the AES_Td object: the 256 Td4 bytes, which the last decryption
# round addresses at offset 2048 from $tbl.  The chunk ends with the
# AES_decrypt entry label; the function body follows in the next chunks.
480$code.=<<___;
481# Td4[256]
482.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
483.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
484.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
485.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
486.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
487.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
488.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
489.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
490.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
491.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
492.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
493.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
494.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
495.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
496.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
497.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
498.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
499.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
500.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
501.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
502.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
503.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
504.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
505.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
506.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
507.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
508.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
509.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
510.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
511.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
512.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
513.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
514.size	AES_Td,.-AES_Td
515
516# void AES_decrypt(const unsigned char *inp, unsigned char *out,
517# 		 const AES_KEY *key) {
518.globl	AES_decrypt
519.type	AES_decrypt,\@function
520AES_decrypt:
521___
# Hardware path of AES_decrypt, emitted unless $softonly is set.  Same shape
# as the AES_encrypt hardware path: a value below 16 at offset 240 of the key
# structure selects the software code at .Ldsoft; otherwise that value is
# loaded into %r0 and one 16-byte block goes through the km instruction.
# AES_set_decrypt_key ORs 0x80 (its "decrypt" bit) into the stored value.
522$code.=<<___ if (!$softonly);
523	l	%r0,240($key)
524	lhi	%r1,16
525	clr	%r0,%r1
526	jl	.Ldsoft
527
528	la	%r1,0($key)
529	#la	%r2,0($inp)
530	la	%r4,0($out)
531	lghi	%r3,16		# single block length
532	.long	0xb92e0042	# km %r4,%r2
533	brc	1,.-4		# can this happen?
534	br	%r14
535.align	64
536.Ldsoft:
537___
# Software AES_decrypt and its worker _s390x_AES_decrypt.
#
# AES_decrypt mirrors AES_encrypt: it saves %r3-%r14 at 24($sp), loads the
# block into $s0-$s3, points $tbl at AES_Td, calls the worker, stores $s0-$s3
# to the out pointer reloaded from 24($sp) and restores %r6-%r14.
#
# _s390x_AES_decrypt takes the state in $s0-$s3, the key schedule in $key and
# the table base in $tbl, and parks the return address at 152($sp) because $ra
# is used as a scratch index.  It runs rounds-1 iterations of .Ldec_loop with
# $mask = 0xff<<3.  Before the last round it issues four loads from the Td4
# area (marked "prefetch" in the code) and resets $mask to 0xff; the last round
# then reads single bytes at 2048($tbl).  $key is advanced and $t0-$t3,
# $i1-$i3 and $rounds are overwritten.
538$code.=<<___;
539	stmg	%r3,$ra,24($sp)
540
541	llgf	$s0,0($inp)
542	llgf	$s1,4($inp)
543	llgf	$s2,8($inp)
544	llgf	$s3,12($inp)
545
546	larl	$tbl,AES_Td
547	bras	$ra,_s390x_AES_decrypt
548
549	lg	$out,24($sp)
550	st	$s0,0($out)
551	st	$s1,4($out)
552	st	$s2,8($out)
553	st	$s3,12($out)
554
555	lmg	%r6,$ra,48($sp)
556	br	$ra
557.size	AES_decrypt,.-AES_decrypt
558
559.type   _s390x_AES_decrypt,\@function
560.align	16
561_s390x_AES_decrypt:
562	stg	$ra,152($sp)
563	x	$s0,0($key)
564	x	$s1,4($key)
565	x	$s2,8($key)
566	x	$s3,12($key)
567	l	$rounds,240($key)
568	llill	$mask,`0xff<<3`
569	aghi	$rounds,-1
570	j	.Ldec_loop
571.align	16
572.Ldec_loop:
573	srlg	$t1,$s0,`16-3`
574	srlg	$t2,$s0,`8-3`
575	sllg	$t3,$s0,`0+3`
576	srl	$s0,`24-3`
577	nr	$s0,$mask
578	nr	$t1,$mask
579	nr	$t2,$mask
580	ngr	$t3,$mask
581
582	sllg	$i1,$s1,`0+3`	# i0
583	srlg	$i2,$s1,`16-3`
584	srlg	$i3,$s1,`8-3`
585	srl	$s1,`24-3`
586	ngr	$i1,$mask
587	nr	$s1,$mask
588	nr	$i2,$mask
589	nr	$i3,$mask
590
591	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
592	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
593	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
594	l	$t3,1($t3,$tbl)	# Td3[s0>>0]
595
596	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
597	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
598	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
599	x	$t3,2($i3,$tbl)	# Td2[s1>>8]
600
601	srlg	$i1,$s2,`8-3`	# i0
602	sllg	$i2,$s2,`0+3`	# i1
603	srlg	$i3,$s2,`16-3`
604	srl	$s2,`24-3`
605	nr	$i1,$mask
606	ngr	$i2,$mask
607	nr	$s2,$mask
608	nr	$i3,$mask
609
610	xr	$s1,$t1
611	srlg	$ra,$s3,`8-3`	# i1
612	srlg	$t1,$s3,`16-3`	# i0
613	nr	$ra,$mask
614	la	$key,16($key)
615	nr	$t1,$mask
616
617	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
618	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
619	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
620	x	$t3,3($i3,$tbl)	# Td1[s2>>16]
621
622	sllg	$i3,$s3,`0+3`	# i2
623	srl	$s3,`24-3`
624	ngr	$i3,$mask
625	nr	$s3,$mask
626
627	xr	$s2,$t2
628	x	$s0,0($key)
629	x	$s1,4($key)
630	x	$s2,8($key)
631	x	$t3,12($key)
632
633	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
634	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
635	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
636	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
637	xr	$s3,$t3
638
639	brct	$rounds,.Ldec_loop
640	.align	16
641
642	l	$t1,`2048+0`($tbl)	# prefetch Td4
643	l	$t2,`2048+64`($tbl)
644	l	$t3,`2048+128`($tbl)
645	l	$i1,`2048+192`($tbl)
646	llill	$mask,0xff
647
648	srlg	$i3,$s0,24	# i0
649	srlg	$t1,$s0,16
650	srlg	$t2,$s0,8
651	nr	$s0,$mask	# i3
652	nr	$t1,$mask
653
654	srlg	$i1,$s1,24
655	nr	$t2,$mask
656	srlg	$i2,$s1,16
657	srlg	$ra,$s1,8
658	nr	$s1,$mask	# i0
659	nr	$i2,$mask
660	nr	$ra,$mask
661
662	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
663	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
664	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
665	sll	$t1,16
666	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
667	sllg	$s0,$i3,24
668	sll	$t2,8
669
670	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
671	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
672	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
673	sll	$i1,24
674	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
675	sll	$i2,16
676	sll	$i3,8
677	or	$s0,$s1
678	or	$t1,$i1
679	or	$t2,$i2
680	or	$t3,$i3
681
682	srlg	$i1,$s2,8	# i0
683	srlg	$i2,$s2,24
684	srlg	$i3,$s2,16
685	nr	$s2,$mask	# i1
686	nr	$i1,$mask
687	nr	$i3,$mask
688	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
689	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
690	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
691	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
692	sll	$i1,8
693	sll	$i2,24
694	or	$s0,$i1
695	sll	$i3,16
696	or	$t2,$i2
697	or	$t3,$i3
698
699	srlg	$i1,$s3,16	# i0
700	srlg	$i2,$s3,8	# i1
701	srlg	$i3,$s3,24
702	nr	$s3,$mask	# i2
703	nr	$i1,$mask
704	nr	$i2,$mask
705
706	lg	$ra,152($sp)
707	or	$s1,$t1
708	l	$t0,16($key)
709	l	$t1,20($key)
710
711	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
712	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
713	sll	$i1,16
714	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
715	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
716	sll	$i2,8
717	sll	$s3,24
718	or	$s0,$i1
719	or	$s1,$i2
720	or	$s2,$t2
721	or	$s3,$t3
722
723	xr	$s0,$t0
724	xr	$s1,$t1
725	x	$s2,24($key)
726	x	$s3,28($key)
727
728	br	$ra
729.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
730___
731
# AES_set_encrypt_key entry and argument checks.  Branches to .Lminus1 (which
# returns -1 in %r2) when the in or the key pointer is NULL, and returns -2
# when bits is not 128, 192 or 256.  Otherwise execution continues at
# .Lproceed.
732$code.=<<___;
733# void AES_set_encrypt_key(const unsigned char *in, int bits,
734# 		 AES_KEY *key) {
735.globl	AES_set_encrypt_key
736.type	AES_set_encrypt_key,\@function
737.align	16
738AES_set_encrypt_key:
739	lghi	$t0,0
740	clgr	$inp,$t0
741	je	.Lminus1
742	clgr	$key,$t0
743	je	.Lminus1
744
745	lghi	$t0,128
746	clr	$bits,$t0
747	je	.Lproceed
748	lghi	$t0,192
749	clr	$bits,$t0
750	je	.Lproceed
751	lghi	$t0,256
752	clr	$bits,$t0
753	je	.Lproceed
754	lghi	%r2,-2
755	br	%r14
756
757.align	16
758.Lproceed:
759___
# Hardware key setup, emitted unless $softonly is set.
# bits is mapped to the function code 18, 19 or 20 in %r5.  A query (%r0 = 0)
# is then issued through the raw kmc opcode with its result written to
# 16($sp), and the bit selected by %r5 is tested.  If that bit is clear,
# control falls back to .Lekey_internal.  Otherwise the raw 16, 24 or 32 key
# bytes are copied to the start of the key structure, bits is stored at offset
# 236, the function code at offset 240, and 0 is returned.
760$code.=<<___ if (!$softonly);
761	# convert bits to km code, [128,192,256]->[18,19,20]
762	lhi	%r5,-128
763	lhi	%r0,18
764	ar	%r5,$bits
765	srl	%r5,6
766	ar	%r5,%r0
767
768	lghi	%r0,0		# query capability vector
769	la	%r1,16($sp)
770	.long	0xb92f0042	# kmc %r4,%r2
771
772	llihh	%r1,0x8000
773	srlg	%r1,%r1,0(%r5)
774	ng	%r1,16($sp)
775	jz	.Lekey_internal
776
777	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
778	stmg	%r0,%r1,0($key)
779	lhi	%r0,192
780	cr	$bits,%r0
781	jl	1f
782	lg	%r1,16($inp)
783	stg	%r1,16($key)
784	je	1f
785	lg	%r1,24($inp)
786	stg	%r1,24($key)
7871:	st	$bits,236($key)	# save bits
788	st	%r5,240($key)	# save km code
789	lghi	%r2,0
790	br	%r14
791___
# Software key schedule (.Lekey_internal); the hardware chunk above also
# branches here when the capability bit is clear.  It saves %r6-%r13 at
# 48($sp), points $tbl at AES_Te+2048 (the Te4 bytes, with rcon[] 256 bytes
# further on), copies the user key into the schedule and expands it:
#   128-bit key: stores 10 at offset 240, 10 iterations of .L128_loop
#   192-bit key: stores 12 at offset 240,  8 iterations of .L192_loop
#   256-bit key: stores 14 at offset 240,  7 iterations of .L256_loop
# Every path restores %r6-%r13 and returns 0 in %r2.  .Lminus1 returns -1 and
# is the target of the NULL-pointer checks at function entry.
#
# AES_set_decrypt_key begins at the end of this chunk.  It saves $key at
# 32($sp) and $ra at 112($sp), calls AES_set_encrypt_key, reloads both and
# returns immediately when the result in %r2 is non-zero.
792$code.=<<___;
793.align	16
794.Lekey_internal:
795	stmg	%r6,%r13,48($sp)	# all non-volatile regs
796
797	larl	$tbl,AES_Te+2048
798
799	llgf	$s0,0($inp)
800	llgf	$s1,4($inp)
801	llgf	$s2,8($inp)
802	llgf	$s3,12($inp)
803	st	$s0,0($key)
804	st	$s1,4($key)
805	st	$s2,8($key)
806	st	$s3,12($key)
807	lghi	$t0,128
808	cr	$bits,$t0
809	jne	.Lnot128
810
811	llill	$mask,0xff
812	lghi	$t3,0			# i=0
813	lghi	$rounds,10
814	st	$rounds,240($key)
815
816	llgfr	$t2,$s3			# temp=rk[3]
817	srlg	$i1,$s3,8
818	srlg	$i2,$s3,16
819	srlg	$i3,$s3,24
820	nr	$t2,$mask
821	nr	$i1,$mask
822	nr	$i2,$mask
823
824.align	16
825.L128_loop:
826	la	$t2,0($t2,$tbl)
827	la	$i1,0($i1,$tbl)
828	la	$i2,0($i2,$tbl)
829	la	$i3,0($i3,$tbl)
830	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
831	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
832	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
833	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
834	x	$t2,256($t3,$tbl)	# rcon[i]
835	xr	$s0,$t2			# rk[4]=rk[0]^...
836	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
837	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
838	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]
839
840	llgfr	$t2,$s3			# temp=rk[3]
841	srlg	$i1,$s3,8
842	srlg	$i2,$s3,16
843	nr	$t2,$mask
844	nr	$i1,$mask
845	srlg	$i3,$s3,24
846	nr	$i2,$mask
847
848	st	$s0,16($key)
849	st	$s1,20($key)
850	st	$s2,24($key)
851	st	$s3,28($key)
852	la	$key,16($key)		# key+=4
853	la	$t3,4($t3)		# i++
854	brct	$rounds,.L128_loop
855	lghi	%r2,0
856	lmg	%r6,%r13,48($sp)
857	br	$ra
858
859.align	16
860.Lnot128:
861	llgf	$t0,16($inp)
862	llgf	$t1,20($inp)
863	st	$t0,16($key)
864	st	$t1,20($key)
865	lghi	$t0,192
866	cr	$bits,$t0
867	jne	.Lnot192
868
869	llill	$mask,0xff
870	lghi	$t3,0			# i=0
871	lghi	$rounds,12
872	st	$rounds,240($key)
873	lghi	$rounds,8
874
875	srlg	$i1,$t1,8
876	srlg	$i2,$t1,16
877	srlg	$i3,$t1,24
878	nr	$t1,$mask
879	nr	$i1,$mask
880	nr	$i2,$mask
881
882.align	16
883.L192_loop:
884	la	$t1,0($t1,$tbl)
885	la	$i1,0($i1,$tbl)
886	la	$i2,0($i2,$tbl)
887	la	$i3,0($i3,$tbl)
888	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
889	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
890	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
891	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
892	x	$t1,256($t3,$tbl)	# rcon[i]
893	xr	$s0,$t1			# rk[6]=rk[0]^...
894	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
895	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
896	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]
897
898	st	$s0,24($key)
899	st	$s1,28($key)
900	st	$s2,32($key)
901	st	$s3,36($key)
902	brct	$rounds,.L192_continue
903	lghi	%r2,0
904	lmg	%r6,%r13,48($sp)
905	br	$ra
906
907.align	16
908.L192_continue:
909	lgr	$t1,$s3
910	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
911	st	$t1,40($key)
912	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
913	st	$t1,44($key)
914
915	srlg	$i1,$t1,8
916	srlg	$i2,$t1,16
917	srlg	$i3,$t1,24
918	nr	$t1,$mask
919	nr	$i1,$mask
920	nr	$i2,$mask
921
922	la	$key,24($key)		# key+=6
923	la	$t3,4($t3)		# i++
924	j	.L192_loop
925
926.align	16
927.Lnot192:
928	llgf	$t0,24($inp)
929	llgf	$t1,28($inp)
930	st	$t0,24($key)
931	st	$t1,28($key)
932	llill	$mask,0xff
933	lghi	$t3,0			# i=0
934	lghi	$rounds,14
935	st	$rounds,240($key)
936	lghi	$rounds,7
937
938	srlg	$i1,$t1,8
939	srlg	$i2,$t1,16
940	srlg	$i3,$t1,24
941	nr	$t1,$mask
942	nr	$i1,$mask
943	nr	$i2,$mask
944
945.align	16
946.L256_loop:
947	la	$t1,0($t1,$tbl)
948	la	$i1,0($i1,$tbl)
949	la	$i2,0($i2,$tbl)
950	la	$i3,0($i3,$tbl)
951	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
952	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
953	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
954	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
955	x	$t1,256($t3,$tbl)	# rcon[i]
956	xr	$s0,$t1			# rk[8]=rk[0]^...
957	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
958	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
959	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
960	st	$s0,32($key)
961	st	$s1,36($key)
962	st	$s2,40($key)
963	st	$s3,44($key)
964	brct	$rounds,.L256_continue
965	lghi	%r2,0
966	lmg	%r6,%r13,48($sp)
967	br	$ra
968
969.align	16
970.L256_continue:
971	lgr	$t1,$s3			# temp=rk[11]
972	srlg	$i1,$s3,8
973	srlg	$i2,$s3,16
974	srlg	$i3,$s3,24
975	nr	$t1,$mask
976	nr	$i1,$mask
977	nr	$i2,$mask
978	la	$t1,0($t1,$tbl)
979	la	$i1,0($i1,$tbl)
980	la	$i2,0($i2,$tbl)
981	la	$i3,0($i3,$tbl)
982	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
983	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
984	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
985	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
986	x	$t1,16($key)		# rk[12]=rk[4]^...
987	st	$t1,48($key)
988	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
989	st	$t1,52($key)
990	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
991	st	$t1,56($key)
992	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
993	st	$t1,60($key)
994
995	srlg	$i1,$t1,8
996	srlg	$i2,$t1,16
997	srlg	$i3,$t1,24
998	nr	$t1,$mask
999	nr	$i1,$mask
1000	nr	$i2,$mask
1001
1002	la	$key,32($key)		# key+=8
1003	la	$t3,4($t3)		# i++
1004	j	.L256_loop
1005
1006.Lminus1:
1007	lghi	%r2,-1
1008	br	$ra
1009.size	AES_set_encrypt_key,.-AES_set_encrypt_key
1010
1011# void AES_set_decrypt_key(const unsigned char *in, int bits,
1012# 		 AES_KEY *key) {
1013.globl	AES_set_decrypt_key
1014.type	AES_set_decrypt_key,\@function
1015.align	16
1016AES_set_decrypt_key:
1017	stg	$key,32($sp)		# I rely on AES_set_encrypt_key to
1018	stg	$ra,112($sp)		# save non-volatile registers!
1019	bras	$ra,AES_set_encrypt_key
1020	lg	$key,32($sp)
1021	lg	$ra,112($sp)
1022	ltgr	%r2,%r2
1023	bnzr	$ra
1024___
# Hardware tail of AES_set_decrypt_key, emitted unless $softonly is set.
# The value AES_set_encrypt_key stored at offset 240 is reloaded.  Below 16 it
# is a software round count and the schedule is converted at .Lgo.  Otherwise
# it is a hardware function code: the 0x80 "decrypt" bit is ORed in, the value
# is stored back and the function returns with %r2 still 0.
# The former .Ldkey_internal stub that followed the return was removed: no
# branch targeted the label and it sat behind an unconditional "br", so it
# could never execute.
1025$code.=<<___ if (!$softonly);
1026	l	$t0,240($key)
1027	lhi	$t1,16
1028	cr	$t0,$t1
1029	jl	.Lgo
1030	oill	$t0,0x80	# set "decrypt" bit
1031	st	$t0,240($key)
1032	br	$ra
1041___
# Software decrypt schedule, step 1 (.Lgo/.Linv): reverse the order of the
# round keys in place.  $i1 starts at the first round key and moves up by 16,
# $i2 starts at key + rounds*16 and moves down by 16.  Each of the rounds/2
# iterations swaps one 16-byte pair.
1042$code.=<<___;
1043
1044.Lgo:	llgf	$rounds,240($key)
1045	la	$i1,0($key)
1046	sllg	$i2,$rounds,4
1047	la	$i2,0($i2,$key)
1048	srl	$rounds,1
1049	lghi	$t1,-16
1050
1051.align	16
1052.Linv:	lmg	$s0,$s1,0($i1)
1053	lmg	$s2,$s3,0($i2)
1054	stmg	$s0,$s1,0($i2)
1055	stmg	$s2,$s3,0($i1)
1056	la	$i1,16($i1)
1057	la	$i2,0($t1,$i2)
1058	brct	$rounds,.Linv
1059___
# Software decrypt schedule, step 2 (.Lmix).  The three byte-replicated masks
# 0x80808080, 0x1b1b1b1b and 0xfefefefe reuse the $i1-$i3 registers, which are
# free again once .Linv has finished.
# The loop runs (rounds-1)*4 times and rewrites one 32-bit word per iteration,
# starting at offset 16 of the schedule.  For each word (tp1) it derives tp2,
# tp4 and tp8, each from the previous value by the same mask/shift sequence,
# and combines them with rotates as spelled out in the inline comments.
# Finally %r6-%r13 are reloaded from 48($sp) and 0 is returned.
1060$mask80=$i1;
1061$mask1b=$i2;
1062$maskfe=$i3;
1063$code.=<<___;
1064	llgf	$rounds,240($key)
1065	aghi	$rounds,-1
1066	sll	$rounds,2	# (rounds-1)*4
1067	llilh	$mask80,0x8080
1068	llilh	$mask1b,0x1b1b
1069	llilh	$maskfe,0xfefe
1070	oill	$mask80,0x8080
1071	oill	$mask1b,0x1b1b
1072	oill	$maskfe,0xfefe
1073
1074.align	16
1075.Lmix:	l	$s0,16($key)	# tp1
1076	lr	$s1,$s0
1077	ngr	$s1,$mask80
1078	srlg	$t1,$s1,7
1079	slr	$s1,$t1
1080	nr	$s1,$mask1b
1081	sllg	$t1,$s0,1
1082	nr	$t1,$maskfe
1083	xr	$s1,$t1		# tp2
1084
1085	lr	$s2,$s1
1086	ngr	$s2,$mask80
1087	srlg	$t1,$s2,7
1088	slr	$s2,$t1
1089	nr	$s2,$mask1b
1090	sllg	$t1,$s1,1
1091	nr	$t1,$maskfe
1092	xr	$s2,$t1		# tp4
1093
1094	lr	$s3,$s2
1095	ngr	$s3,$mask80
1096	srlg	$t1,$s3,7
1097	slr	$s3,$t1
1098	nr	$s3,$mask1b
1099	sllg	$t1,$s2,1
1100	nr	$t1,$maskfe
1101	xr	$s3,$t1		# tp8
1102
1103	xr	$s1,$s0		# tp2^tp1
1104	xr	$s2,$s0		# tp4^tp1
1105	rll	$s0,$s0,24	# = ROTATE(tp1,8)
1106	xr	$s2,$s3		# ^=tp8
1107	xr	$s0,$s1		# ^=tp2^tp1
1108	xr	$s1,$s3		# tp2^tp1^tp8
1109	xr	$s0,$s2		# ^=tp4^tp1^tp8
1110	rll	$s1,$s1,8
1111	rll	$s2,$s2,16
1112	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
1113	rll	$s3,$s3,24
1114	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
1115	xr	$s0,$s3		# ^= ROTATE(tp8,8)
1116
1117	st	$s0,16($key)
1118	la	$key,4($key)
1119	brct	$rounds,.Lmix
1120
1121	lmg	%r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
1122	lghi	%r2,0
1123	br	$ra
1124.size	AES_set_decrypt_key,.-AES_set_decrypt_key
1125___
1126
1127#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1128#                     size_t length, const AES_KEY *key,
1129#                     unsigned char *ivec, const int enc)
#
# The first three instructions swap %r3 and %r4, so in the code below $len is
# %r3 and $out is %r4; $key is %r5 and $ivp is %r6.
#
# A zero length returns right after the swap without touching out or ivec.
# The tail code cannot handle it: it would compute length-1 = -1 for the
# EXECUTEd mvc, which then moves 256 bytes (encrypt: from in to 128($sp);
# decrypt: from 128($sp) to out).
#
# Hardware path (emitted unless $softonly is set, taken when the word at
# offset 240 of the key is at least 16): ivec and the first 32 key bytes are
# copied to 16($sp) as the kmc parameter block, and the whole-block part of the
# input is processed by kmc.  A trailing partial block is zero-padded to 16
# bytes on encrypt; on decrypt one more block is decrypted to 128($sp) and
# only the residue bytes are copied to out.  The updated ivec is copied back
# from the parameter block.
#
# Software path: saves $key..$ra at 40($sp) and reads enc from 164($sp).
# Encryption xors each input block into $s0-$s3 (seeded from ivec) and calls
# _s390x_AES_encrypt; a partial last block is zero-padded at 128($sp).
# Decryption keeps the previous ciphertext block at 128($sp), calls
# _s390x_AES_decrypt, xors, and for a partial last block copies only the
# residue bytes to out.
1130{
1131my $inp="%r2";
1132my $out="%r4";	# length and out are swapped
1133my $len="%r3";
1134my $key="%r5";
1135my $ivp="%r6";
1136
1137$code.=<<___;
1138.globl	AES_cbc_encrypt
1139.type	AES_cbc_encrypt,\@function
1140.align	16
1141AES_cbc_encrypt:
1142	xgr	%r3,%r4		# flip %r3 and %r4, out and len
1143	xgr	%r4,%r3
1144	xgr	%r3,%r4
	ltgr	$len,$len	# length==0: nothing to process, and the
	bzr	$ra		# tail code below cannot handle it
1145___
1146$code.=<<___ if (!$softonly);
1147	lhi	%r0,16
1148	cl	%r0,240($key)
1149	jh	.Lcbc_software
1150
1151	lg	%r0,0($ivp)	# copy ivec
1152	lg	%r1,8($ivp)
1153	stmg	%r0,%r1,16($sp)
1154	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
1155	stmg	%r0,%r1,32($sp)
1156	lmg	%r0,%r1,16($key)
1157	stmg	%r0,%r1,48($sp)
1158	l	%r0,240($key)	# load kmc code
1159	lghi	$key,15		# res=len%16, len-=res;
1160	ngr	$key,$len
1161	slgr	$len,$key
1162	la	%r1,16($sp)	# parameter block - ivec || key
1163	jz	.Lkmc_truncated
1164	.long	0xb92f0042	# kmc %r4,%r2
1165	brc	1,.-4		# pay attention to "partial completion"
1166	ltr	$key,$key
1167	jnz	.Lkmc_truncated
1168.Lkmc_done:
1169	lmg	%r0,%r1,16($sp)	# copy ivec to caller
1170	stg	%r0,0($ivp)
1171	stg	%r1,8($ivp)
1172	br	$ra
1173.align	16
1174.Lkmc_truncated:
1175	ahi	$key,-1		# it's the way it's encoded in mvc
1176	tmll	%r0,0x80
1177	jnz	.Lkmc_truncated_dec
1178	lghi	%r1,0
1179	stg	%r1,128($sp)
1180	stg	%r1,136($sp)
1181	bras	%r1,1f
1182	mvc	128(1,$sp),0($inp)
11831:	ex	$key,0(%r1)
1184	la	%r1,16($sp)	# restore parameter block
1185	la	$inp,128($sp)
1186	lghi	$len,16
1187	.long	0xb92f0042	# kmc %r4,%r2
1188	j	.Lkmc_done
1189.align	16
1190.Lkmc_truncated_dec:
1191	stg	$out,64($sp)
1192	la	$out,128($sp)
1193	lghi	$len,16
1194	.long	0xb92f0042	# kmc %r4,%r2
1195	lg	$out,64($sp)
1196	bras	%r1,2f
1197	mvc	0(1,$out),128($sp)
11982:	ex	$key,0(%r1)
1199	j	.Lkmc_done
1200.align	16
1201.Lcbc_software:
1202___
1203$code.=<<___;
1204	stmg	$key,$ra,40($sp)
1205	lhi	%r0,0
1206	cl	%r0,164($sp)
1207	je	.Lcbc_decrypt
1208
1209	larl	$tbl,AES_Te
1210
1211	llgf	$s0,0($ivp)
1212	llgf	$s1,4($ivp)
1213	llgf	$s2,8($ivp)
1214	llgf	$s3,12($ivp)
1215
1216	lghi	$t0,16
1217	slgr	$len,$t0
1218	brc	4,.Lcbc_enc_tail	# if borrow
1219.Lcbc_enc_loop:
1220	stmg	$inp,$out,16($sp)
1221	x	$s0,0($inp)
1222	x	$s1,4($inp)
1223	x	$s2,8($inp)
1224	x	$s3,12($inp)
1225	lgr	%r4,$key
1226
1227	bras	$ra,_s390x_AES_encrypt
1228
1229	lmg	$inp,$key,16($sp)
1230	st	$s0,0($out)
1231	st	$s1,4($out)
1232	st	$s2,8($out)
1233	st	$s3,12($out)
1234
1235	la	$inp,16($inp)
1236	la	$out,16($out)
1237	lghi	$t0,16
1238	ltgr	$len,$len
1239	jz	.Lcbc_enc_done
1240	slgr	$len,$t0
1241	brc	4,.Lcbc_enc_tail	# if borrow
1242	j	.Lcbc_enc_loop
1243.align	16
1244.Lcbc_enc_done:
1245	lg	$ivp,48($sp)
1246	st	$s0,0($ivp)
1247	st	$s1,4($ivp)
1248	st	$s2,8($ivp)
1249	st	$s3,12($ivp)
1250
1251	lmg	%r7,$ra,56($sp)
1252	br	$ra
1253
1254.align	16
1255.Lcbc_enc_tail:
1256	aghi	$len,15
1257	lghi	$t0,0
1258	stg	$t0,128($sp)
1259	stg	$t0,136($sp)
1260	bras	$t1,3f
1261	mvc	128(1,$sp),0($inp)
12623:	ex	$len,0($t1)
1263	lghi	$len,0
1264	la	$inp,128($sp)
1265	j	.Lcbc_enc_loop
1266
1267.align	16
1268.Lcbc_decrypt:
1269	larl	$tbl,AES_Td
1270
1271	lg	$t0,0($ivp)
1272	lg	$t1,8($ivp)
1273	stmg	$t0,$t1,128($sp)
1274
1275.Lcbc_dec_loop:
1276	stmg	$inp,$out,16($sp)
1277	llgf	$s0,0($inp)
1278	llgf	$s1,4($inp)
1279	llgf	$s2,8($inp)
1280	llgf	$s3,12($inp)
1281	lgr	%r4,$key
1282
1283	bras	$ra,_s390x_AES_decrypt
1284
1285	lmg	$inp,$key,16($sp)
1286	sllg	$s0,$s0,32
1287	sllg	$s2,$s2,32
1288	lr	$s0,$s1
1289	lr	$s2,$s3
1290
1291	lg	$t0,0($inp)
1292	lg	$t1,8($inp)
1293	xg	$s0,128($sp)
1294	xg	$s2,136($sp)
1295	lghi	$s1,16
1296	slgr	$len,$s1
1297	brc	4,.Lcbc_dec_tail	# if borrow
1298	brc	2,.Lcbc_dec_done	# if zero
1299	stg	$s0,0($out)
1300	stg	$s2,8($out)
1301	stmg	$t0,$t1,128($sp)
1302
1303	la	$inp,16($inp)
1304	la	$out,16($out)
1305	j	.Lcbc_dec_loop
1306
1307.Lcbc_dec_done:
1308	stg	$s0,0($out)
1309	stg	$s2,8($out)
1310.Lcbc_dec_exit:
1311	lmg	$ivp,$ra,48($sp)
1312	stmg	$t0,$t1,0($ivp)
1313
1314	br	$ra
1315
1316.align	16
1317.Lcbc_dec_tail:
1318	aghi	$len,15
1319	stg	$s0,128($sp)
1320	stg	$s2,136($sp)
1321	bras	$s1,4f
1322	mvc	0(1,$out),128($sp)
13234:	ex	$len,0($s1)
1324	j	.Lcbc_dec_exit
1325.size	AES_cbc_encrypt,.-AES_cbc_encrypt
1326___
1327}
# Append the identification string, then post-process the accumulated text:
# every `...` span is replaced by the result of evaluating its contents as
# Perl (this resolves constants such as `0xff<<3` and `2048+64` used in the
# assembly above).  The finished assembly is written to the selected output
# handle.
1328$code.=<<___;
1329.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
1330___
1331
1332$code =~ s{\`([^\`]*)\`}{eval $1}gem;
1333print($code);
1334