// aesv8-armx-64.S revision 3f9e6ada2c9f7183a41081263585e6a70bbd9f59
#include "arm_arch.h"

#if __ARM_ARCH__>=7
.text
.arch	armv8-a+crypto
.align	5
// Constant table used by the key-expansion code, which addresses it
// PC-relatively (adr) and therefore needs it close by in .text.
//   +0   initial round-constant vector: 0x01 in each 32-bit lane
//   +16  tbl byte-index mask: picks bytes 13,14,15,12 for every lane,
//        i.e. rotates the last word by one byte and splats it
//   +32  round-constant vector 0x1b in each 32-bit lane
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	aes_v8_set_encrypt_key
.type	aes_v8_set_encrypt_key,%function
.align	5
// aes_v8_set_encrypt_key: expand a raw AES key into an encryption schedule.
//
// In:   x0 = pointer to the raw key (16, 24 or 32 bytes are read)
//       w1 = key size in bits. Only compared against 192: smaller values
//            take the 128-bit path, 192 the 192-bit path, larger values the
//            256-bit path. No other validation is performed.
//       x2 = output buffer. Round keys are written from offset 0 and the
//            round count (10, 12 or 14) as a 32-bit word at offset 240.
// Out:  x0 = 0
//       x2 = entry x2 + 240, w12 = round count. aes_v8_set_decrypt_key
//            calls in through .Lenc_key and relies on both values.
// Uses: w1, x3, w12, v0-v6, condition flags.
//
// Lines indented with an extra leading space are scheduling-interleaved
// instructions that belong to a neighbouring dependency chain.
aes_v8_set_encrypt_key:
.Lenc_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	adr	x3,rcon
	cmp	w1,#192			// flags stay live until b.lt/b.eq below

	eor	v0.16b,v0.16b,v0.16b	// v0 = 0: zero fill for ext, null key for aese
	ld1	{v3.16b},[x0],#16	// v3 = key bytes 0..15
	mov	w1,#8		// reuse w1
	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = round constant, v2 = tbl mask

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

// ---- 128-bit key: 8 looped rounds with constants 0x01..0x80, then two
// ---- unrolled rounds with 0x1b and 0x36.
// Per round: v6 = S-box(rotated last word, splatted) ^ round constant;
// the ext/eor chain folds the previous key into a running word-wise xor.
.align	4
.Loop128:
	tbl	v6.16b,{v3.16b},v2.16b	// rotate last word and splat it
	ext	v5.16b,v0.16b,v3.16b,#12	// v5 = v3 moved up one word, zero filled
	st1	{v3.4s},[x2],#16	// emit current round key
	aese	v6.16b,v0.16b		// zero key, splatted input: pure SubBytes
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	 eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// next round constant (doubling)
	eor	v3.16b,v3.16b,v6.16b
	b.ne	.Loop128

	ld1	{v1.4s},[x3]		// x3 now points at the 0x1b vector

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	 eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// 0x1b -> 0x36
	eor	v3.16b,v3.16b,v6.16b

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	 eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]		// 11th round key at offset 160
	add	x2,x2,#0x50		// 160 + 0x50 = 240: rounds field

	mov	w12,#10
	b	.Ldone

// ---- 192-bit key: v3 holds four key words, the low half of v4 two more.
// ---- Each of the 8 iterations emits 24 bytes of schedule.
.align	4
.L192:
	ld1	{v4.8b},[x0],#8		// key bytes 16..23 (upper half of v4 zeroed)
	movi	v6.16b,#8			// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask

.Loop192:
	tbl	v6.16b,{v4.16b},v2.16b	// mask now selects bytes 5,6,7,4 of v4
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.8b},[x2],#8
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b

	dup	v5.4s,v3.s[3]
	eor	v5.16b,v5.16b,v4.16b
	 eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	.Loop192

	mov	w12,#12
	add	x2,x2,#0x20		// 16 + 8*24 = 208; 208 + 0x20 = 240
	b	.Ldone

// ---- 256-bit key: v3 and v4 hold the two key halves. Seven iterations,
// ---- each emitting 32 bytes; the loop exits from the middle on the last
// ---- one, leaving x2 at offset 240.
.align	4
.L256:
	ld1	{v4.16b},[x0]		// key bytes 16..31
	mov	w1,#7
	mov	w12,#14
	st1	{v3.4s},[x2],#16

.Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	 eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	.Ldone			// flags from the subs above

	// Second half-step: S-box without rotation and without round constant.
	dup	v6.4s,v3.s[3]		// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b

	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b

	eor	v4.16b,v4.16b,v6.16b
	b	.Loop256

.Ldone:
	str	w12,[x2]		// round count at offset 240

	eor	x0,x0,x0		// return value
	ldr	x29,[sp],#16		// x30 was never modified, only x29 is reloaded
	ret
.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key

.globl	aes_v8_set_decrypt_key
.type	aes_v8_set_decrypt_key,%function
.align	5
// aes_v8_set_decrypt_key: build a decryption key schedule.
//
// Takes the same x0/w1/x2 arguments as aes_v8_set_encrypt_key. It first
// runs the encryption key expansion via .Lenc_key, then reverses the order
// of the round keys in place and applies aesimc (inverse MixColumns) to
// every round key except the first and the last.
//
// Out:  x0 = 0
// Uses: everything .Lenc_key uses, plus x4, x12, v0, v1.
aes_v8_set_decrypt_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	.Lenc_key		// returns x2 = schedule + 240, w12 = rounds

	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16			// negative stride for the descending pointer
	add	x0,x2,x12,lsl#4	// end of key schedule

	// Swap the outermost pair of round keys unchanged.
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16

	// Walk inwards from both ends, swapping and transforming each pair.
.Loop_imc:
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	.Loop_imc

	// Pointers have met: transform the middle round key in place.
	ld1	{v0.4s},[x2]
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]

	eor	x0,x0,x0		// return value
	ldp	x29,x30,[sp],#16
	ret
.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
.globl	aes_v8_encrypt
.type	aes_v8_encrypt,%function
.align	5
// aes_v8_encrypt: encrypt a single 16-byte block.
//
// In:   x0 = input block
//       x1 = output block
//       x2 = key schedule (round count read from offset 240)
// Uses: x2 (advanced through the schedule), w3, v0-v2, condition flags.
aes_v8_encrypt:
	ldr	w3,[x2,#240]		// w3 = number of rounds
	ld1	{v0.4s},[x2],#16	// v0 = round key 0
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2
	ld1	{v1.4s},[x2],#16	// v1 = round key 1

	// Two rounds per iteration; the next two round keys are loaded while
	// the current ones are being applied.
.Loop_enc:
	aese	v2.16b,v0.16b
	ld1	{v0.4s},[x2],#16
	aesmc	v2.16b,v2.16b
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	ld1	{v1.4s},[x2],#16
	aesmc	v2.16b,v2.16b
	b.gt	.Loop_enc

	// Last two rounds: the final one has no MixColumns and ends with a
	// plain xor of the last round key.
	aese	v2.16b,v0.16b
	ld1	{v0.4s},[x2]		// last round key
	aesmc	v2.16b,v2.16b
	aese	v2.16b,v1.16b
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_encrypt,.-aes_v8_encrypt
.globl	aes_v8_decrypt
.type	aes_v8_decrypt,%function
.align	5
// aes_v8_decrypt: decrypt a single 16-byte block.
//
// In:   x0 = input block
//       x1 = output block
//       x2 = key schedule (round count read from offset 240)
// Uses: x2 (advanced through the schedule), w3, v0-v2, condition flags.
aes_v8_decrypt:
	ldr	w3,[x2,#240]		// w3 = number of rounds
	ld1	{v0.4s},[x2],#16	// v0 = round key 0
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2
	ld1	{v1.4s},[x2],#16	// v1 = round key 1

	// Two rounds per iteration; the next two round keys are loaded while
	// the current ones are being applied.
.Loop_dec:
	aesd	v2.16b,v0.16b
	ld1	{v0.4s},[x2],#16
	aesimc	v2.16b,v2.16b
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	ld1	{v1.4s},[x2],#16
	aesimc	v2.16b,v2.16b
	b.gt	.Loop_dec

	// Last two rounds: the final one has no inverse MixColumns and ends
	// with a plain xor of the last round key.
	aesd	v2.16b,v0.16b
	ld1	{v0.4s},[x2]		// last round key
	aesimc	v2.16b,v2.16b
	aesd	v2.16b,v1.16b
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_decrypt,.-aes_v8_decrypt
.globl	aes_v8_cbc_encrypt
.type	aes_v8_cbc_encrypt,%function
.align	5
// aes_v8_cbc_encrypt: CBC-mode encryption or decryption of whole blocks.
//
// In:   x0 = input pointer
//       x1 = output pointer
//       x2 = length in bytes. Only whole 16-byte blocks are processed; a
//            length below 16 returns immediately without touching memory.
//       x3 = key schedule (round count read from offset 240)
//       x4 = 16-byte IV; read on entry and overwritten with the chaining
//            value on exit (not written on the early-return path)
//       w5 = 0 to decrypt, non-zero to encrypt
// Uses: x0-x2, x5-x8, x12, v0-v7, v16-v23, condition flags.
//
// Register roles after setup:
//   v16,v17  rolling pair of early round keys, reloaded through x7
//   v18-v23  the six round keys preceding the last one
//   v7       last round key
//   v6       IV / chaining value
//   w5       rounds - 8 (2 for a 10-round key); w6 = per-block copy
//   x8       post-increment for input loads: 16, or 0 once the final block
//            has been fetched, so a look-ahead load re-reads the final
//            block instead of advancing past it
//
// Lines indented with an extra leading space are scheduling-interleaved
// instructions that do not belong to the AES round chain around them.
aes_v8_cbc_encrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	subs	x2,x2,#16
	mov	x8,#16
	b.lo	.Lcbc_abort
	csel	x8,xzr,x8,eq		// exactly one block: do not advance x0

	cmp	w5,#0			// en- or decrypting?
	ldr	w5,[x3,#240]
	and	x2,x2,#-16
	ld1	{v6.16b},[x4]
	ld1	{v0.16b},[x0],x8

	ld1	{v16.4s-v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s-v19.4s},[x7],#32
	ld1	{v20.4s-v21.4s},[x7],#32
	ld1	{v22.4s-v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32
	mov	w6,w5
	b.eq	.Lcbc_dec		// flags still from "cmp w5,#0" above

	// ---- Encryption ----
	// v5 = rndkey[0] ^ last round key. The next plaintext is xored with v5
	// and used as the first-round "key" on the state that has not yet had
	// the last round key applied; the two copies of the last key cancel,
	// leaving plaintext ^ ciphertext ^ rndkey[0].
	cmp	w5,#2
	eor	v0.16b,v0.16b,v6.16b
	eor	v5.16b,v16.16b,v7.16b
	b.eq	.Lcbc_enc128

.Loop_cbc_enc:
	aese	v0.16b,v16.16b
	ld1	{v16.4s},[x7],#16
	aesmc	v0.16b,v0.16b
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	ld1	{v17.4s},[x7],#16
	aesmc	v0.16b,v0.16b
	b.gt	.Loop_cbc_enc

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	 subs	x2,x2,#16
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	 csel	x8,xzr,x8,eq
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	 add	x7,x3,#16
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	 ld1	{v16.16b},[x0],x8
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	 eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	 ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b

	 mov	w6,w5
	eor	v6.16b,v0.16b,v7.16b	// ciphertext; also the next chaining value
	st1	{v6.16b},[x1],#16
	b.hs	.Loop_cbc_enc		// flags from "subs x2" above

	b	.Lcbc_done

	// 10-round key: all round keys fit in registers (v2,v3 take rndkey[2-3]),
	// so no key reloads are needed. The store of each ciphertext block is
	// deferred into the start of the next iteration.
.align	5
.Lcbc_enc128:
	ld1	{v2.4s-v3.4s},[x7]
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	 st1	{v6.16b},[x1],#16
.Lenter_cbc_enc128:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	 subs	x2,x2,#16
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	 csel	x8,xzr,x8,eq
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	 ld1	{v16.16b},[x0],x8
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	 eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b
	b.hs	.Loop_cbc_enc128

	st1	{v6.16b},[x1],#16
	b	.Lcbc_done

	// ---- Decryption, 10-round key, two blocks per iteration ----
	// v6 and v2 carry (previous ciphertext ^ last round key) for the two
	// blocks in flight; v3 keeps a copy of the second ciphertext block.
	// x8 and x12 are the strides of the two look-ahead input loads.
.align	5
.Lcbc_dec128:
	ld1	{v4.4s-v5.4s},[x7]
	eor	v6.16b,v6.16b,v7.16b
	eor	v2.16b,v0.16b,v7.16b
	mov	x12,x8

.Loop2x_cbc_dec128:
	aesd	v0.16b,v16.16b
	aesd	v1.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	 subs	x2,x2,#32
	aesd	v0.16b,v17.16b
	aesd	v1.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	 csel	x8,xzr,x8,lo
	aesd	v0.16b,v4.16b
	aesd	v1.16b,v4.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	 csel	x12,xzr,x12,ls
	aesd	v0.16b,v5.16b
	aesd	v1.16b,v5.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	aesd	v0.16b,v18.16b
	aesd	v1.16b,v18.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	aesd	v0.16b,v19.16b
	aesd	v1.16b,v19.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	aesd	v0.16b,v20.16b
	aesd	v1.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	aesd	v0.16b,v21.16b
	aesd	v1.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	aesd	v0.16b,v22.16b
	aesd	v1.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b

	eor	v6.16b,v6.16b,v0.16b	// plaintext of first block
	ld1	{v0.16b},[x0],x8
	eor	v2.16b,v2.16b,v1.16b	// plaintext of second block
	ld1	{v1.16b},[x0],x12
	st1	{v6.16b},[x1],#16
	eor	v6.16b,v3.16b,v7.16b
	st1	{v2.16b},[x1],#16
	eor	v2.16b,v0.16b,v7.16b
	orr	v3.16b,v1.16b,v1.16b
	b.hs	.Loop2x_cbc_dec128

	adds	x2,x2,#32
	eor	v6.16b,v6.16b,v7.16b	// strip the last round key again
	b.eq	.Lcbc_done
	eor	v2.16b,v2.16b,v7.16b
	b	.Lcbc_dec_tail

	// ---- Decryption, generic key size ----
.align	5
.Lcbc_dec:
	subs	x2,x2,#16
	orr	v2.16b,v0.16b,v0.16b	// keep a copy of the first ciphertext block
	b.lo	.Lcbc_dec_tail		// only one block in total

	csel	x8,xzr,x8,eq
	cmp	w5,#2
	ld1	{v1.16b},[x0],x8
	orr	v3.16b,v1.16b,v1.16b	// copy of the second ciphertext block
	b.eq	.Lcbc_dec128

.Loop2x_cbc_dec:
	aesd	v0.16b,v16.16b
	aesd	v1.16b,v16.16b
	ld1	{v16.4s},[x7],#16
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesd	v1.16b,v17.16b
	ld1	{v17.4s},[x7],#16
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	b.gt	.Loop2x_cbc_dec

	aesd	v0.16b,v16.16b
	aesd	v1.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	 eor	v4.16b,v6.16b,v7.16b
	 eor	v5.16b,v2.16b,v7.16b
	aesd	v0.16b,v17.16b
	aesd	v1.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	 orr	v6.16b,v3.16b,v3.16b
	 subs	x2,x2,#32
	aesd	v0.16b,v18.16b
	aesd	v1.16b,v18.16b
	aesimc	v0.16b,v0.16b
	 csel	x8,xzr,x8,lo
	aesimc	v1.16b,v1.16b
	 mov	x7,x3
	aesd	v0.16b,v19.16b
	aesd	v1.16b,v19.16b
	aesimc	v0.16b,v0.16b
	 ld1	{v2.16b},[x0],x8
	aesimc	v1.16b,v1.16b
	 csel	x8,xzr,x8,ls
	aesd	v0.16b,v20.16b
	aesd	v1.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	 ld1	{v3.16b},[x0],x8
	aesd	v0.16b,v21.16b
	aesd	v1.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	 ld1 {v16.4s},[x7],#16	// re-pre-load rndkey[0]
	aesd	v0.16b,v22.16b
	aesd	v1.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	 ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1]
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b

	 mov	w6,w5
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	 orr	v0.16b,v2.16b,v2.16b
	st1	{v4.16b},[x1],#16
	 orr	v1.16b,v3.16b,v3.16b
	st1	{v5.16b},[x1],#16
	b.hs	.Loop2x_cbc_dec		// flags from "subs x2,x2,#32" above

	adds	x2,x2,#32
	b.eq	.Lcbc_done

	// One remaining block: v0 = ciphertext, v2 = copy of it (becomes the
	// chaining value), v6 = previous ciphertext or IV.
.Lcbc_dec_tail:
	aesd	v0.16b,v16.16b
	ld1	{v16.4s},[x7],#16
	aesimc	v0.16b,v0.16b
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	ld1	{v17.4s},[x7],#16
	aesimc	v0.16b,v0.16b
	b.gt	.Lcbc_dec_tail

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	 eor	v4.16b,v6.16b,v7.16b
	aesd	v0.16b,v18.16b
	aesimc	v0.16b,v0.16b
	 orr	v6.16b,v2.16b,v2.16b
	aesd	v0.16b,v19.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v23.16b

	eor	v4.16b,v4.16b,v0.16b
	st1	{v4.16b},[x1],#16

.Lcbc_done:
	st1	{v6.16b},[x4]		// write the chaining value back over the IV
.Lcbc_abort:
	ldr	x29,[sp],#16		// x30 was never modified, only x29 is reloaded
	ret
.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
.globl	aes_v8_ctr32_encrypt_blocks
.type	aes_v8_ctr32_encrypt_blocks,%function
.align	5
// aes_v8_ctr32_encrypt_blocks: CTR-mode keystream xor with a 32-bit counter.
//
// In:   x0 = input pointer
//       x1 = output pointer
//       x2 = number of 16-byte blocks. A count of 0 is not special-cased:
//            it falls into the single-block tail and still processes one
//            block, so callers must pass at least 1.
//       x3 = key schedule (round count read from offset 240)
//       x4 = 16-byte counter block. The 32-bit word at offset 12 is the
//            counter; it is byte-reversed before and after incrementing
//            unless __ARMEB__ is defined. The block in memory is only
//            read; the advanced counter is not written back.
// Uses: x0-x2, x5-x10, v0-v7, v16-v23, condition flags.
//
// Register roles after setup:
//   v16,v17  rolling pair of early round keys, reloaded through x7
//   v18-v23  the six round keys preceding the last one
//   v7       last round key (folded into the input before the final xor)
//   v6       pristine copy of the counter block
//   w8       running counter value, w9/w10 = byte-reversed next counters
//   w5       rounds - 8 (2 for a 10-round key); w6 = per-block copy
//
// Lines indented with an extra leading space are scheduling-interleaved
// instructions that do not belong to the AES round chain around them.
aes_v8_ctr32_encrypt_blocks:
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
	ldr		w5,[x3,#240]

	ldr		w8, [x4, #12]
	ld1		{v0.4s},[x4]

	ld1		{v16.4s-v17.4s},[x3]		// load key schedule...
	sub		w5,w5,#6
	add		x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub		w5,w5,#2
	ld1		{v18.4s-v19.4s},[x7],#32
	ld1		{v20.4s-v21.4s},[x7],#32
	ld1		{v22.4s-v23.4s},[x7],#32
	ld1		{v7.4s},[x7]

	add		x7,x3,#32
	mov		w6,w5

	subs		x2,x2,#2
	b.lo		.Lctr32_tail		// fewer than two blocks

#ifndef __ARMEB__
	rev		w8, w8
#endif
	orr		v1.16b,v0.16b,v0.16b
	add		w8, w8, #1
	orr		v6.16b,v0.16b,v0.16b
	rev		w10, w8
	cmp		w5,#2
	mov		v1.s[3],w10		// v1 = counter block + 1
	b.eq		.Lctr32_128

	// ---- Generic key size, two blocks per iteration ----
.Loop2x_ctr32:
	aese		v0.16b,v16.16b
	aese		v1.16b,v16.16b
	ld1		{v16.4s},[x7],#16
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	subs		w6,w6,#2
	aese		v0.16b,v17.16b
	aese		v1.16b,v17.16b
	ld1		{v17.4s},[x7],#16
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	b.gt		.Loop2x_ctr32

	// The states move to v4/v5 so v0/v1 can already be rebuilt from the
	// pristine counter block for the next iteration.
	aese		v0.16b,v16.16b
	aese		v1.16b,v16.16b
	aesmc		v4.16b,v0.16b
	 orr		v0.16b,v6.16b,v6.16b
	aesmc		v5.16b,v1.16b
	 orr		v1.16b,v6.16b,v6.16b
	aese		v4.16b,v17.16b
	aese		v5.16b,v17.16b
	 ld1		{v2.16b},[x0],#16
	aesmc		v4.16b,v4.16b
	 ld1		{v3.16b},[x0],#16
	aesmc		v5.16b,v5.16b
	 add		w8,w8,#1
	aese		v4.16b,v18.16b
	aese		v5.16b,v18.16b
	 rev		w9,w8
	aesmc		v4.16b,v4.16b
	aesmc		v5.16b,v5.16b
	 add		w8,w8,#1
	aese		v4.16b,v19.16b
	aese		v5.16b,v19.16b
	 eor		v2.16b,v2.16b,v7.16b
	 rev		w10,w8
	aesmc		v4.16b,v4.16b
	aesmc		v5.16b,v5.16b
	 eor		v3.16b,v3.16b,v7.16b
	 mov		x7,x3
	aese		v4.16b,v20.16b
	aese		v5.16b,v20.16b
	 subs		x2,x2,#2
	aesmc		v4.16b,v4.16b
	aesmc		v5.16b,v5.16b
	 ld1	 {v16.4s-v17.4s},[x7],#32	// re-pre-load rndkey[0-1]
	aese		v4.16b,v21.16b
	aese		v5.16b,v21.16b
	aesmc		v4.16b,v4.16b
	aesmc		v5.16b,v5.16b
	aese		v4.16b,v22.16b
	aese		v5.16b,v22.16b
	 mov	v0.s[3], w9
	aesmc		v4.16b,v4.16b
	 mov	v1.s[3], w10
	aesmc		v5.16b,v5.16b
	aese		v4.16b,v23.16b
	aese		v5.16b,v23.16b

	 mov		w6,w5
	eor		v2.16b,v2.16b,v4.16b
	eor		v3.16b,v3.16b,v5.16b
	st1		{v2.16b},[x1],#16
	st1		{v3.16b},[x1],#16
	b.hs		.Loop2x_ctr32		// flags from "subs x2,x2,#2" above

	adds		x2,x2,#2
	b.eq		.Lctr32_done
	b		.Lctr32_tail

	// ---- 10-round key: rndkey[2-3] live in v4/v5, no key reloads ----
.Lctr32_128:
	ld1		{v4.4s-v5.4s},[x7]

.Loop2x_ctr32_128:
	aese		v0.16b,v16.16b
	aese		v1.16b,v16.16b
	aesmc		v0.16b,v0.16b
	 ld1		{v2.16b},[x0],#16
	aesmc		v1.16b,v1.16b
	 ld1		{v3.16b},[x0],#16
	aese		v0.16b,v17.16b
	aese		v1.16b,v17.16b
	 add		w8,w8,#1
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	 rev		w9,w8
	aese		v0.16b,v4.16b
	aese		v1.16b,v4.16b
	 add		w8,w8,#1
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	 rev		w10,w8
	aese		v0.16b,v5.16b
	aese		v1.16b,v5.16b
	 subs		x2,x2,#2
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	aese		v0.16b,v18.16b
	aese		v1.16b,v18.16b
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	aese		v0.16b,v19.16b
	aese		v1.16b,v19.16b
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	aese		v0.16b,v20.16b
	aese		v1.16b,v20.16b
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	aese		v0.16b,v21.16b
	aese		v1.16b,v21.16b
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	aese		v0.16b,v22.16b
	aese		v1.16b,v22.16b
	aesmc		v0.16b,v0.16b
	aesmc		v1.16b,v1.16b
	 eor		v2.16b,v2.16b,v7.16b
	aese		v0.16b,v23.16b
	 eor		v3.16b,v3.16b,v7.16b
	aese		v1.16b,v23.16b

	eor		v2.16b,v2.16b,v0.16b
	orr		v0.16b,v6.16b,v6.16b
	eor		v3.16b,v3.16b,v1.16b
	orr		v1.16b,v6.16b,v6.16b
	st1		{v2.16b},[x1],#16
	mov		v0.s[3], w9
	st1		{v3.16b},[x1],#16
	mov		v1.s[3], w10
	b.hs		.Loop2x_ctr32_128	// flags from "subs x2,x2,#2" above

	adds		x2,x2,#2
	b.eq		.Lctr32_done

	// ---- Single remaining block; v0 holds its counter block ----
.Lctr32_tail:
	aese		v0.16b,v16.16b
	ld1		{v16.4s},[x7],#16
	aesmc		v0.16b,v0.16b
	subs		w6,w6,#2
	aese		v0.16b,v17.16b
	ld1		{v17.4s},[x7],#16
	aesmc		v0.16b,v0.16b
	b.gt		.Lctr32_tail

	aese		v0.16b,v16.16b
	aesmc		v0.16b,v0.16b
	aese		v0.16b,v17.16b
	aesmc		v0.16b,v0.16b
	 ld1		{v2.16b},[x0]
	aese		v0.16b,v18.16b
	aesmc		v0.16b,v0.16b
	aese		v0.16b,v19.16b
	aesmc		v0.16b,v0.16b
	aese		v0.16b,v20.16b
	aesmc		v0.16b,v0.16b
	aese		v0.16b,v21.16b
	aesmc		v0.16b,v0.16b
	aese		v0.16b,v22.16b
	aesmc		v0.16b,v0.16b
	 eor		v2.16b,v2.16b,v7.16b
	aese		v0.16b,v23.16b

	eor		v2.16b,v2.16b,v0.16b
	st1		{v2.16b},[x1]

.Lctr32_done:
	ldr		x29,[sp],#16		// x30 was never modified, only x29 is reloaded
	ret
.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif