1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70% and
15# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order execution and issued instruction [in this case
18# load value from memory is critical] has to complete before execution
19# flow proceeds. S-boxes are compressed to 2KB[+256B].
20#
21# As for hardware acceleration support. It's basically a "teaser," as
22# it can and should be improved in several ways. Most notably support
23# for CBC is not utilized, nor multiple blocks are ever processed.
24# Then software key schedule can be postponed till hardware support
25# detection... Performance improvement over assembler is reportedly
26# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27# support is implemented.
28
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys, if hardware support is detected.
33
# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10, because being dual-
# issue z10 makes it impossible to eliminate the interlock condition:
# critical path is not long enough. Yet it spends ~24 cycles per byte
42# processed with 128-bit key.
43#
44# Unlike previous version hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done, because deferred key setup can't be made MT-safe, not
# for key lengths longer than 128 bits.
48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51# because software implementation was optimized.
52
# Build switch: 0 also emits the hardware-assisted code paths, a true value
# restricts the generated module to the software implementation.
$softonly = 0;

# Symbolic names for the s390x general registers used in the generated
# assembly. Several names intentionally share one register.
($t0, $t1, $t2, $t3) = map { "%r$_" } 0 .. 3;
$mask = $t0;                    # shares %r0 with t0
$inp  = $t2;                    # shares %r2 with t2
$out  = $bits = $t3;            # both share %r3 with t3
$key  = "%r4";
($i1, $i2, $i3)           = map { "%r$_" } 5 .. 7;
($s0, $s1, $s2, $s3)      = map { "%r$_" } 8 .. 11;
($tbl, $rounds, $ra, $sp) = map { "%r$_" } 12 .. 15;
71
# Append one ".long" directive per argument to the global $code buffer.
# Each 32-bit value is written twice, so a table entry occupies 8 bytes and
# the generated code can load the word at byte offsets 0..3 of an entry.
# Processing stops at the first undefined argument.
#
# No prototype is declared: the routine takes a list, and the former empty
# prototype only went unnoticed because all calls use the & sigil.
sub _data_word
{
    for my $word (@_) {
        last unless defined $word;
        $code .= sprintf ".long\t0x%08x,0x%08x\n", $word, $word;
    }
    return;
}
76
# Start the generated text with the encryption lookup table AES_Te.
# The entries that follow are emitted through _data_word, i.e. every 32-bit
# value twice: 256 entries of 8 bytes each, 2KB in total, aligned to 256.
$code=<<___;
.text

.type	AES_Te,\@object
.align	256
AES_Te:
___
84&_data_word(
85	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
86	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
87	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
88	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
89	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
90	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
91	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
92	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
93	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
94	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
95	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
96	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
97	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
98	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
99	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
100	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
101	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
102	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
103	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
104	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
105	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
106	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
107	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
108	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
109	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
110	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
111	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
112	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
113	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
114	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
115	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
116	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
117	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
118	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
119	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
120	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
121	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
122	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
123	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
124	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
125	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
126	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
127	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
128	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
129	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
130	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
131	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
132	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
133	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
134	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
135	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
136	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
137	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
138	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
139	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
140	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
141	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
142	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
143	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
144	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
145	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
146	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
147	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
148	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
149$code.=<<___;
150# Te4[256]
151.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
152.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
153.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
154.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
155.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
156.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
157.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
158.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
159.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
160.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
161.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
162.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
163.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
164.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
165.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
166.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
167.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
168.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
169.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
170.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
171.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
172.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
173.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
174.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
175.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
176.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
177.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
178.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
179.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
180.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
181.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
182.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
183# rcon[]
184.long	0x01000000, 0x02000000, 0x04000000, 0x08000000
185.long	0x10000000, 0x20000000, 0x40000000, 0x80000000
186.long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
187.align	256
188.size	AES_Te,.-AES_Te
189
# void AES_encrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
#
# Encrypts a single 16-byte block. The word at offset 240 of the key
# holds either a software round count (10, 12 or 14) or, if
# AES_set_encrypt_key selected hardware support, a km function code
# (18 and up). Values of 16 and above go to the km instruction, the
# rest to the table-driven _s390x_AES_encrypt.
.globl	AES_encrypt
.type	AES_encrypt,\@function
AES_encrypt:
___
# Hardware path; left out of the output when $softonly is set.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lesoft

	# %r0 = function code, %r1 = parameter block (the key itself)
	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Lesoft:
___
# Software path: save registers, load the block, run the rounds, store.
$code.=<<___;
	stmg	%r3,$ra,24($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt

	lg	$out,24($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lmg	%r6,$ra,48($sp)
	br	$ra
.size	AES_encrypt,.-AES_encrypt
232
# _s390x_AES_encrypt - software encryption of one block.
#
# In:	%r8-%r11 the four state words, %r4 the key schedule (round
#	count at offset 240), %r12 the address of AES_Te.
# Out:	%r8-%r11 the encrypted state words.
# Uses:	%r0-%r3, %r5-%r7 and %r13 as scratch; %r4 is advanced through
#	the key schedule; the return address is parked at 152(%r15)
#	because %r14 doubles as an index register.
#
# Table indices are kept pre-multiplied by 8 (mask 0xff<<3) since
# every table entry is 8 bytes; the displacement 0..3 selects which
# rotation of the entry is loaded.
.type   _s390x_AES_encrypt,\@function
.align	16
_s390x_AES_encrypt:
	stg	$ra,152($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Lenc_loop
.align	16
	# main rounds, executed rounds-1 times
.Lenc_loop:
	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	ngr	$t1,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	ngr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
	l	$t2,2($t2,$tbl) # Te2[s0>>8]
	l	$t3,3($t3,$tbl)	# Te1[s0>>16]

	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
	x	$t3,2($i3,$tbl)	# Te2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	nr	$s2,$mask
	ngr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	sllg	$t1,$s3,`0+3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	ngr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
	x	$t3,1($i3,$tbl)	# Te3[s2>>0]

	srlg	$i3,$s3,`16-3`	# i2
	xr	$s2,$t2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Lenc_loop
	.align	16

	# last round: single bytes are picked from offset 2 of the
	# table entries and shifted into place
	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	ngr	$t1,$mask
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	ngr	$i2,$mask
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	nr	$i3,$mask

	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
	sll	$s0,24
	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
	sll	$t2,8
	sll	$t3,16

	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
	sll	$i1,16
	sll	$s1,24
	sll	$i3,8
	or	$s0,$i1
	or	$s1,$t1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	ngr	$i3,$mask
	nr	$s2,$mask

	sllg	$t1,$s3,`0+3`	# i0
	srlg	$ra,$s3,`8-3`	# i1
	ngr	$t1,$mask

	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
	sll	$i1,8
	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
	sll	$i2,16
	nr	$ra,$mask
	sll	$s2,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$t3,$i3

	srlg	$i3,$s3,`16-3`	# i2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	l	$t0,16($key)
	l	$t2,20($key)

	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
	sll	$i2,8
	sll	$i3,16
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$i3
	or	$s3,$t3

	lg	$ra,152($sp)
	xr	$s0,$t0
	xr	$s1,$t2
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
408___
409
# Decryption lookup table AES_Td. Like AES_Te, the entries that follow are
# emitted through _data_word (each 32-bit value twice, 8 bytes per entry);
# the generated code addresses the Td4 byte table at offset 2048 from here.
$code.=<<___;
.type	AES_Td,\@object
.align	256
AES_Td:
___
415&_data_word(
416	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
417	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
418	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
419	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
420	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
421	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
422	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
423	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
424	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
425	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
426	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
427	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
428	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
429	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
430	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
431	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
432	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
433	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
434	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
435	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
436	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
437	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
438	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
439	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
440	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
441	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
442	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
443	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
444	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
445	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
446	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
447	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
448	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
449	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
450	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
451	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
452	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
453	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
454	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
455	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
456	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
457	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
458	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
459	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
460	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
461	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
462	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
463	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
464	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
465	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
466	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
467	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
468	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
469	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
470	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
471	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
472	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
473	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
474	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
475	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
476	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
477	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
478	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
479	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
480$code.=<<___;
481# Td4[256]
482.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
483.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
484.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
485.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
486.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
487.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
488.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
489.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
490.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
491.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
492.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
493.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
494.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
495.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
496.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
497.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
498.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
499.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
500.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
501.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
502.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
503.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
504.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
505.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
506.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
507.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
508.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
509.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
510.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
511.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
512.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
513.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
514.size	AES_Td,.-AES_Td
515
# void AES_decrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
#
# Decrypts a single 16-byte block. As in AES_encrypt, a value of 16
# or more at offset 240 of the key is taken as a km function code
# (AES_set_decrypt_key sets the 0x80 "decrypt" bit in it) and the
# block is processed by km; smaller values are software round counts
# and lead to _s390x_AES_decrypt.
.globl	AES_decrypt
.type	AES_decrypt,\@function
AES_decrypt:
___
# Hardware path; left out of the output when $softonly is set.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Ldsoft

	# %r0 = function code, %r1 = parameter block (the key itself)
	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Ldsoft:
___
# Software path: save registers, load the block, run the rounds, store.
$code.=<<___;
	stmg	%r3,$ra,24($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Td
	bras	$ra,_s390x_AES_decrypt

	lg	$out,24($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lmg	%r6,$ra,48($sp)
	br	$ra
.size	AES_decrypt,.-AES_decrypt
558
# _s390x_AES_decrypt - software decryption of one block.
#
# In:	%r8-%r11 the four state words, %r4 the key schedule (round
#	count at offset 240), %r12 the address of AES_Td.
# Out:	%r8-%r11 the decrypted state words.
# Uses:	%r0-%r3, %r5-%r7 and %r13 as scratch; %r4 is advanced through
#	the key schedule; the return address is parked at 152(%r15)
#	because %r14 doubles as an index register.
#
# The main rounds index the 8-byte table entries with indices
# pre-multiplied by 8; the last round switches to a plain 0xff mask
# and the Td4 byte table located 2048 bytes into AES_Td.
.type   _s390x_AES_decrypt,\@function
.align	16
_s390x_AES_decrypt:
	stg	$ra,152($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Ldec_loop
.align	16
	# main rounds, executed rounds-1 times
.Ldec_loop:
	srlg	$t1,$s0,`16-3`
	srlg	$t2,$s0,`8-3`
	sllg	$t3,$s0,`0+3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t1,$mask
	nr	$t2,$mask
	ngr	$t3,$mask

	sllg	$i1,$s1,`0+3`	# i0
	srlg	$i2,$s1,`16-3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	ngr	$i1,$mask
	nr	$s1,$mask
	nr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
	l	$t3,1($t3,$tbl)	# Td3[s0>>0]

	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
	x	$t3,2($i3,$tbl)	# Td2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	sllg	$i2,$s2,`0+3`	# i1
	srlg	$i3,$s2,`16-3`
	srl	$s2,`24-3`
	nr	$i1,$mask
	ngr	$i2,$mask
	nr	$s2,$mask
	nr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	srlg	$t1,$s3,`16-3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	nr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
	x	$t3,3($i3,$tbl)	# Td1[s2>>16]

	sllg	$i3,$s3,`0+3`	# i2
	srl	$s3,`24-3`
	ngr	$i3,$mask
	nr	$s3,$mask

	xr	$s2,$t2
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Ldec_loop
	.align	16

	# last round: byte lookups in Td4
	l	$t1,`2048+0`($tbl)	# prefetch Td4
	l	$t2,`2048+64`($tbl)
	l	$t3,`2048+128`($tbl)
	l	$i1,`2048+192`($tbl)
	llill	$mask,0xff

	srlg	$i3,$s0,24	# i0
	srlg	$t1,$s0,16
	srlg	$t2,$s0,8
	nr	$s0,$mask	# i3
	nr	$t1,$mask

	srlg	$i1,$s1,24
	nr	$t2,$mask
	srlg	$i2,$s1,16
	srlg	$ra,$s1,8
	nr	$s1,$mask	# i0
	nr	$i2,$mask
	nr	$ra,$mask

	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
	sll	$t1,16
	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
	sllg	$s0,$i3,24
	sll	$t2,8

	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
	sll	$i1,24
	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
	sll	$i2,16
	sll	$i3,8
	or	$s0,$s1
	or	$t1,$i1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,8	# i0
	srlg	$i2,$s2,24
	srlg	$i3,$s2,16
	nr	$s2,$mask	# i1
	nr	$i1,$mask
	nr	$i3,$mask
	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
	sll	$i1,8
	sll	$i2,24
	or	$s0,$i1
	sll	$i3,16
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s3,16	# i0
	srlg	$i2,$s3,8	# i1
	srlg	$i3,$s3,24
	nr	$s3,$mask	# i2
	nr	$i1,$mask
	nr	$i2,$mask

	lg	$ra,152($sp)
	or	$s1,$t1
	l	$t0,16($key)
	l	$t1,20($key)

	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
	sll	$i1,16
	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
	sll	$i2,8
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$s3,$t3

	xr	$s0,$t0
	xr	$s1,$t1
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
730___
731
# Key setup for encryption: argument checks first, then (unless $softonly)
# the hardware capability probe, then the software key expansion.
$code.=<<___;
# int AES_set_encrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
#
# Returns 0 on success, -1 if in or key is NULL and -2 if bits is
# not one of 128, 192 or 256.
.globl	AES_set_encrypt_key
.type	AES_set_encrypt_key,\@function
.align	16
AES_set_encrypt_key:
	lghi	$t0,0
	clgr	$inp,$t0
	je	.Lminus1
	clgr	$key,$t0
	je	.Lminus1

	lghi	$t0,128
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,192
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,256
	clr	$bits,$t0
	je	.Lproceed
	lghi	%r2,-2
	br	%r14

.align	16
.Lproceed:
___
# Hardware probe: if the facility bit tested below is set and the kmc query
# reports the function code for this key size, the raw key is copied into
# the AES_KEY and the function code is stored where the software path would
# store the round count.
$code.=<<___ if (!$softonly);
	# convert bits to km code, [128,192,256]->[18,19,20]
	lhi	%r5,-128
	lhi	%r0,18
	ar	%r5,$bits
	srl	%r5,6
	ar	%r5,%r0

	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security assist
	jz	.Lekey_internal

	lghi	%r0,0		# query capability vector
	la	%r1,16($sp)
	.long	0xb92f0042	# kmc %r4,%r2

	# test the bit numbered by the function code, counted from
	# the leftmost bit of the query result
	llihh	%r1,0x8000
	srlg	%r1,%r1,0(%r5)
	ng	%r1,16($sp)
	jz	.Lekey_internal

	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
	stmg	%r0,%r1,0($key)
	lhi	%r0,192
	cr	$bits,%r0
	jl	1f
	lg	%r1,16($inp)
	stg	%r1,16($key)
	je	1f
	lg	%r1,24($inp)
	stg	%r1,24($key)
1:	st	$bits,236($key)	# save bits
	st	%r5,240($key)	# save km code
	lghi	%r2,0
	br	%r14
___
# Software key expansion, one loop per key size. $tbl points at the byte
# table 2048 bytes into AES_Te; the rcon words are read 256 bytes past it.
$code.=<<___;
.align	16
.Lekey_internal:
	stmg	%r6,%r13,48($sp)	# all non-volatile regs

	larl	$tbl,AES_Te+2048

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	st	$s0,0($key)
	st	$s1,4($key)
	st	$s2,8($key)
	st	$s3,12($key)
	lghi	$t0,128
	cr	$bits,$t0
	jne	.Lnot128

	# 128-bit key: 10 rounds, 10 iterations of 4 words each
	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,10
	st	$rounds,240($key)

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t2,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L128_loop:
	la	$t2,0($t2,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
	x	$t2,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t2			# rk[4]=rk[0]^...
	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	nr	$t2,$mask
	nr	$i1,$mask
	srlg	$i3,$s3,24
	nr	$i2,$mask

	st	$s0,16($key)
	st	$s1,20($key)
	st	$s2,24($key)
	st	$s3,28($key)
	la	$key,16($key)		# key+=4
	la	$t3,4($t3)		# i++
	brct	$rounds,.L128_loop
	lghi	%r2,0
	lmg	%r6,%r13,48($sp)
	br	$ra

.align	16
.Lnot128:
	llgf	$t0,16($inp)
	llgf	$t1,20($inp)
	st	$t0,16($key)
	st	$t1,20($key)
	lghi	$t0,192
	cr	$bits,$t0
	jne	.Lnot192

	# 192-bit key: 12 rounds; the loop counter of 8 ends the
	# expansion right after the first four words of the 8th pass
	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,12
	st	$rounds,240($key)
	lghi	$rounds,8

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L192_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[6]=rk[0]^...
	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]

	st	$s0,24($key)
	st	$s1,28($key)
	st	$s2,32($key)
	st	$s3,36($key)
	brct	$rounds,.L192_continue
	lghi	%r2,0
	lmg	%r6,%r13,48($sp)
	br	$ra

.align	16
.L192_continue:
	lgr	$t1,$s3
	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
	st	$t1,40($key)
	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
	st	$t1,44($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,24($key)		# key+=6
	la	$t3,4($t3)		# i++
	j	.L192_loop

.align	16
.Lnot192:
	# 256-bit key: 14 rounds; the loop counter of 7 ends the
	# expansion right after the first four words of the 7th pass
	llgf	$t0,24($inp)
	llgf	$t1,28($inp)
	st	$t0,24($key)
	st	$t1,28($key)
	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,14
	st	$rounds,240($key)
	lghi	$rounds,7

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L256_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[8]=rk[0]^...
	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
	st	$s0,32($key)
	st	$s1,36($key)
	st	$s2,40($key)
	st	$s3,44($key)
	brct	$rounds,.L256_continue
	lghi	%r2,0
	lmg	%r6,%r13,48($sp)
	br	$ra

.align	16
.L256_continue:
	lgr	$t1,$s3			# temp=rk[11]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
	x	$t1,16($key)		# rk[12]=rk[4]^...
	st	$t1,48($key)
	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
	st	$t1,52($key)
	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
	st	$t1,56($key)
	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
	st	$t1,60($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,32($key)		# key+=8
	la	$t3,4($t3)		# i++
	j	.L256_loop

.Lminus1:
	lghi	%r2,-1
	br	$ra
.size	AES_set_encrypt_key,.-AES_set_encrypt_key
1015
# int AES_set_decrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
#
# Runs AES_set_encrypt_key and passes its error code through when it
# is non-zero. If a km function code was stored (value of 16 or more
# at offset 240), only the "decrypt" bit is added to it. Otherwise
# the software schedule is converted in place: the round keys are
# swapped end for end (.Linv) and the inner ones are run through the
# .Lmix transform.
.globl	AES_set_decrypt_key
.type	AES_set_decrypt_key,\@function
.align	16
AES_set_decrypt_key:
	stg	$key,32($sp)		# I rely on AES_set_encrypt_key to
	stg	$ra,112($sp)		# save non-volatile registers!
	bras	$ra,AES_set_encrypt_key
	lg	$key,32($sp)
	lg	$ra,112($sp)
	ltgr	%r2,%r2
	bnzr	$ra
___
# Hardware-keyed case; left out of the output when $softonly is set.
# (The unreachable .Ldkey_internal sequence that used to follow the return
# below was referenced nowhere and has been dropped.)
$code.=<<___ if (!$softonly);
	l	$t0,240($key)
	lhi	$t1,16
	cr	$t0,$t1
	jl	.Lgo
	oill	$t0,0x80	# set "decrypt" bit
	st	$t0,240($key)
	br	$ra
___
$code.=<<___;

	# reverse the order of the 16-byte round keys
.Lgo:	llgf	$rounds,240($key)
	la	$i1,0($key)
	sllg	$i2,$rounds,4
	la	$i2,0($i2,$key)
	srl	$rounds,1
	lghi	$t1,-16

.align	16
.Linv:	lmg	$s0,$s1,0($i1)
	lmg	$s2,$s3,0($i2)
	stmg	$s0,$s1,0($i2)
	stmg	$s2,$s3,0($i1)
	la	$i1,16($i1)
	la	$i2,0($t1,$i2)
	brct	$rounds,.Linv
___
# The index registers are free again; reuse them for the byte-wise masks
# needed by the doubling steps (tp2, tp4, tp8) in .Lmix.
$mask80=$i1;
$mask1b=$i2;
$maskfe=$i3;
$code.=<<___;
	llgf	$rounds,240($key)
	aghi	$rounds,-1
	sll	$rounds,2	# (rounds-1)*4
	llilh	$mask80,0x8080
	llilh	$mask1b,0x1b1b
	llilh	$maskfe,0xfefe
	oill	$mask80,0x8080
	oill	$mask1b,0x1b1b
	oill	$maskfe,0xfefe

	# transform every word of round keys 1..rounds-1
.align	16
.Lmix:	l	$s0,16($key)	# tp1
	lr	$s1,$s0
	ngr	$s1,$mask80
	srlg	$t1,$s1,7
	slr	$s1,$t1
	nr	$s1,$mask1b
	sllg	$t1,$s0,1
	nr	$t1,$maskfe
	xr	$s1,$t1		# tp2

	lr	$s2,$s1
	ngr	$s2,$mask80
	srlg	$t1,$s2,7
	slr	$s2,$t1
	nr	$s2,$mask1b
	sllg	$t1,$s1,1
	nr	$t1,$maskfe
	xr	$s2,$t1		# tp4

	lr	$s3,$s2
	ngr	$s3,$mask80
	srlg	$t1,$s3,7
	slr	$s3,$t1
	nr	$s3,$mask1b
	sllg	$t1,$s2,1
	nr	$t1,$maskfe
	xr	$s3,$t1		# tp8

	xr	$s1,$s0		# tp2^tp1
	xr	$s2,$s0		# tp4^tp1
	rll	$s0,$s0,24	# = ROTATE(tp1,8)
	xr	$s2,$s3		# ^=tp8
	xr	$s0,$s1		# ^=tp2^tp1
	xr	$s1,$s3		# tp2^tp1^tp8
	xr	$s0,$s2		# ^=tp4^tp1^tp8
	rll	$s1,$s1,8
	rll	$s2,$s2,16
	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
	rll	$s3,$s3,24
	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
	xr	$s0,$s3		# ^= ROTATE(tp8,8)

	st	$s0,16($key)
	la	$key,4($key)
	brct	$rounds,.Lmix

	lmg	%r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
	lghi	%r2,0
	br	$ra
.size	AES_set_decrypt_key,.-AES_set_decrypt_key
1130___
1131
#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                     size_t length, const AES_KEY *key,
#                     unsigned char *ivec, const int enc)
#
# CBC encryption/decryption of length bytes. The generated routine swaps the
# out and length registers on entry, so the names below describe the state
# after that swap. A length of zero returns immediately: without the guard
# both software paths would enter their tail code with a length register of
# -1, and the EXECUTEd mvc would then copy 256 bytes.
#
# With hardware support the work goes to kmc, using a parameter block of
# ivec || key built at 16(sp); otherwise the blocks are processed one by one
# through _s390x_AES_encrypt/_s390x_AES_decrypt. A trailing partial block is
# zero-padded on encryption and truncated on output for decryption.
{
my $inp="%r2";
my $out="%r4";	# length and out are swapped
my $len="%r3";
my $key="%r5";
my $ivp="%r6";

$code.=<<___;
.globl	AES_cbc_encrypt
.type	AES_cbc_encrypt,\@function
.align	16
AES_cbc_encrypt:
	xgr	%r3,%r4		# flip %r3 and %r4, out and len
	xgr	%r4,%r3
	xgr	%r3,%r4
	ltgr	$len,$len	# zero length: nothing to process and
	bzr	$ra		# the tail code must not be entered
___
# Hardware path; left out of the output when $softonly is set.
$code.=<<___ if (!$softonly);
	lhi	%r0,16
	cl	%r0,240($key)
	jh	.Lcbc_software

	lg	%r0,0($ivp)	# copy ivec
	lg	%r1,8($ivp)
	stmg	%r0,%r1,16($sp)
	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
	stmg	%r0,%r1,32($sp)
	lmg	%r0,%r1,16($key)
	stmg	%r0,%r1,48($sp)
	l	%r0,240($key)	# load kmc code
	lghi	$key,15		# res=len%16, len-=res;
	ngr	$key,$len
	slgr	$len,$key
	la	%r1,16($sp)	# parameter block - ivec || key
	jz	.Lkmc_truncated
	.long	0xb92f0042	# kmc %r4,%r2
	brc	1,.-4		# pay attention to "partial completion"
	ltr	$key,$key
	jnz	.Lkmc_truncated
.Lkmc_done:
	lmg	%r0,%r1,16($sp)	# copy ivec to caller
	stg	%r0,0($ivp)
	stg	%r1,8($ivp)
	br	$ra
.align	16
.Lkmc_truncated:
	ahi	$key,-1		# it's the way it's encoded in mvc
	tmll	%r0,0x80
	jnz	.Lkmc_truncated_dec
	lghi	%r1,0
	stg	%r1,128($sp)
	stg	%r1,136($sp)
	bras	%r1,1f
	mvc	128(1,$sp),0($inp)
1:	ex	$key,0(%r1)
	la	%r1,16($sp)	# restore parameter block
	la	$inp,128($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	j	.Lkmc_done
.align	16
.Lkmc_truncated_dec:
	stg	$out,64($sp)
	la	$out,128($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	lg	$out,64($sp)
	bras	%r1,2f
	mvc	0(1,$out),128($sp)
2:	ex	$key,0(%r1)
	j	.Lkmc_done
.align	16
.Lcbc_software:
___
# Software path. The enc argument is read from 164(sp); zero selects
# decryption.
$code.=<<___;
	stmg	$key,$ra,40($sp)
	lhi	%r0,0
	cl	%r0,164($sp)
	je	.Lcbc_decrypt

	larl	$tbl,AES_Te

	llgf	$s0,0($ivp)
	llgf	$s1,4($ivp)
	llgf	$s2,8($ivp)
	llgf	$s3,12($ivp)

	lghi	$t0,16
	slgr	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
.Lcbc_enc_loop:
	stmg	$inp,$out,16($sp)
	x	$s0,0($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_encrypt

	lmg	$inp,$key,16($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	la	$inp,16($inp)
	la	$out,16($out)
	lghi	$t0,16
	ltgr	$len,$len
	jz	.Lcbc_enc_done
	slgr	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
	j	.Lcbc_enc_loop
.align	16
.Lcbc_enc_done:
	lg	$ivp,48($sp)
	st	$s0,0($ivp)
	st	$s1,4($ivp)
	st	$s2,8($ivp)
	st	$s3,12($ivp)

	lmg	%r7,$ra,56($sp)
	br	$ra

.align	16
.Lcbc_enc_tail:
	aghi	$len,15
	lghi	$t0,0
	stg	$t0,128($sp)
	stg	$t0,136($sp)
	bras	$t1,3f
	mvc	128(1,$sp),0($inp)
3:	ex	$len,0($t1)
	lghi	$len,0
	la	$inp,128($sp)
	j	.Lcbc_enc_loop

.align	16
.Lcbc_decrypt:
	larl	$tbl,AES_Td

	lg	$t0,0($ivp)
	lg	$t1,8($ivp)
	stmg	$t0,$t1,128($sp)

.Lcbc_dec_loop:
	stmg	$inp,$out,16($sp)
	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_decrypt

	lmg	$inp,$key,16($sp)
	sllg	$s0,$s0,32
	sllg	$s2,$s2,32
	lr	$s0,$s1
	lr	$s2,$s3

	lg	$t0,0($inp)
	lg	$t1,8($inp)
	xg	$s0,128($sp)
	xg	$s2,136($sp)
	lghi	$s1,16
	slgr	$len,$s1
	brc	4,.Lcbc_dec_tail	# if borrow
	brc	2,.Lcbc_dec_done	# if zero
	stg	$s0,0($out)
	stg	$s2,8($out)
	stmg	$t0,$t1,128($sp)

	la	$inp,16($inp)
	la	$out,16($out)
	j	.Lcbc_dec_loop

.Lcbc_dec_done:
	stg	$s0,0($out)
	stg	$s2,8($out)
.Lcbc_dec_exit:
	lmg	$ivp,$ra,48($sp)
	stmg	$t0,$t1,0($ivp)

	br	$ra

.align	16
.Lcbc_dec_tail:
	aghi	$len,15
	stg	$s0,128($sp)
	stg	$s2,136($sp)
	bras	$s1,4f
	mvc	0(1,$out),128($sp)
4:	ex	$len,0($s1)
	j	.Lcbc_dec_exit
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
.comm  OPENSSL_s390xcap_P,8,8
___
}
# Identification string placed at the end of the generated module.
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every backtick-quoted expression in the generated text with the
# value of that Perl expression. An expression that fails to evaluate is
# fatal; it used to be replaced silently by an empty string.
$code =~ s{\`([^\`]*)\`}{
    my $expr  = $1;
    my $value = eval $expr;
    die "cannot evaluate \`$expr\`: $@" if $@;
    $value;
}gem;

# Write the result and make sure it really reached its destination, so that
# a truncated assembler file (e.g. on a full disk) fails the build.
print $code  or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";
1340