1#! /usr/bin/env perl
2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. As does it support both 32- and 64-bit modes
20# of operation. Latter is achieved by limiting amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# Performance in cycles per byte processed with 128-bit key:
31#
32#		CBC enc		CBC dec		CTR
33# Apple A7	2.39		1.20		1.20
34# Cortex-A53	1.32		1.29		1.46
35# Cortex-A57(*)	1.95		0.85		0.93
36# Denver	1.96		0.86		0.80
37# Mongoose	1.33		1.20		1.20
38#
39# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
40#	and are still same even for updated module;
41
# Command-line arguments: the assembly "flavour" (e.g. linux64, ios64,
# linux32) and the output file path; both are forwarded to arm-xlate.pl.
$flavour = shift;
$output  = shift;

# Locate the arm-xlate.pl transliteration helper: first next to this
# script, then in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through arm-xlate.pl, which converts the
# flavour-neutral code below into assembler-specific syntax.  Die if
# the helper cannot be started instead of silently emitting nothing.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

# All exported symbols carry this prefix so the module cannot clash
# with other AES implementations linked into the same binary.
$prefix="aes_hw";
54
# Common preamble: the entire module is guarded so it compiles to
# nothing unless the target supports at least ARMv7-level features.
$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
# 64-bit flavours: ask the assembler for the crypto extension (guarded
# for clang versions that reject the .arch directive).
$code.=<<___ if ($flavour =~ /64/);
#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
.arch  armv8-a+crypto
#endif
___
# 32-bit flavours: plain ARM mode with NEON; Thumb-2 is explicitly
# disabled because the raw .byte-encoded AES opcodes below are ARM.
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___
72
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer - mostly 64. The goal is to
# maintain both 32- and 64-bit codes within a single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# Key-schedule generation: ${prefix}_set_encrypt_key(inp, bits, out)
# and ${prefix}_set_decrypt_key(inp, bits, out).  Return value in x0:
# 0 on success, -1 for NULL inp/out, -2 for an unsupported bit length.
# Register/NEON-register assignments; on 32-bit flavours q4-q7 are
# avoided because the ABI requires d8-d15 to be preserved.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table plus the rotate-and-splat byte-permutation mask
# used by vtbl.8 to implement RotWord, followed by the function entry
# and its argument validation (shared by the decrypt-key path via
# .Lenc_key).
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# AArch64 prologue only: establish a minimal frame record.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument checks, then one dedicated expansion loop per key size
# (.Loop128 / .L192 / .Loop256).  Each iteration derives the next
# round key: vtbl.8 rotates/splats the last word, aese with an
# all-zero state applies SubBytes, and the vext/veor ladder performs
# the word-chaining XORs of the AES key schedule.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
# set_decrypt_key prologue: 64-bit saves a frame record, 32-bit must
# preserve r4 and lr because it calls .Lenc_key with bl.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Build the encryption schedule first, then walk it from both ends,
# swapping round keys and applying InverseMixColumns (aesimc) to all
# but the first and last, yielding the decryption schedule in place.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
# Matching epilogues for the two flavours.
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Single-block ECB primitives ${prefix}_encrypt / ${prefix}_decrypt.
# gen_block emits one function; $dir is "en" or "de" and selects the
# aese/aesmc vs. aesd/aesimc instruction pair.
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

# Round count is read from key[240]; the loop consumes two round keys
# per iteration, and the final round pair omits the MixColumns step
# per the AES specification (last aes$e is followed by the key XOR).
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
# Instantiate both directions.
&gen_block("en");
&gen_block("de");
}}}
{{{
# ${prefix}_cbc_encrypt(inp, out, len, key, ivp, enc): CBC-mode bulk
# en/decryption.  Encryption is strictly serial (each block chains on
# the previous ciphertext); decryption processes three blocks in
# parallel to hide aesd/aesimc latency.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
# Prologues: 64-bit sets up a frame record; 32-bit saves callee-saved
# GPRs and d8-d15 and fetches the stack-passed arguments (ivp, enc).
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
# Common setup: round down len, load IV and first block, preload the
# whole key schedule into q8-q15 (plus $in0/$in1 for >128-bit keys),
# then branch to the decrypt path or the dedicated 128-bit encrypt
# path as appropriate.  $step becomes 0 on the last block so the
# final load doesn't run past the input.
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
# Decryption: three registers for the in-flight blocks plus three for
# the corresponding ciphertexts (needed as the next chaining values).
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
# Epilogues: restore saved registers for the respective flavour.
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# ${prefix}_ctr32_encrypt_blocks(inp, out, len, key, ivp): CTR mode
# with a 32-bit big-endian counter in the last word of the IV.  The
# main loop keeps three independent keystream blocks in flight; a
# tail path handles the final one or two blocks.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
# Prologues, as in cbc_encrypt; 32-bit fetches the stack-passed ivp.
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
# Setup: load the IV, the counter word (byte-reversed on little-endian
# so it can be incremented with plain adds), and the key schedule; then
# pre-build three counter blocks and enter the 3x interleaved loop, or
# fall through to the tail for len <= 2.
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
# Epilogues for the two flavours.
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Close the "#if __ARM_MAX_ARCH__>=7" guard opened in the preamble.
$code.=<<___;
#endif
___
########################################
# Post-processing: transliterate the flavour-neutral $code above into
# actual 64- or 32-bit assembler syntax and print it (to the pipe into
# arm-xlate.pl established at the top of the script).  The order of
# the chained s///-or alternatives below is significant.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word, for assemblers
    # without crypto-extension support.  Kept for reference; the call
    # site below is commented out because .arch armv8-a+crypto is
    # requested in the preamble.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;	# expand inline `...` Perl snippets

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Emit an AES instruction as raw .byte data so pre-crypto
    # assemblers can still build the module.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Split a q-register vtbl into the two d-register vtbl.8 ops that
    # 32-bit NEON provides.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Rewrite an AArch64-style lane dup as the d-register vdup.32 form.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Rewrite an AArch64-style lane move as the d-register vmov.32 form.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;	# expand inline `...` Perl snippets

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}
1011
# STDOUT is the pipe into arm-xlate.pl; check the close so a failure
# of the helper (broken pipe / non-zero exit) is reported instead of
# silently leaving a truncated output file behind.
close STDOUT or die "error closing STDOUT: $!";
1013