aesv8-armx.S revision 3f9e6ada2c9f7183a41081263585e6a70bbd9f59
1#include "arm_arch.h"
2
3#if __ARM_ARCH__>=7
4.text
5.fpu	neon
6.code	32
@ Constant table used by the key-expansion code (addressed via adr r3,rcon).
@   row 0: 0x01 in every 32-bit lane. Loaded into q1 as the initial round
@          constant; the expansion loops double it with vshl.u8 each pass.
@   row 1: vtbl byte indices 13,14,15,12 in every lane (little-endian view
@          of 0x0c0f0e0d). Applied to a 16-byte table this picks the last
@          32-bit word, rotated by one byte, and copies it to all lanes.
@   row 2: 0x1b in every 32-bit lane. Reloaded into q1 by the 128-bit path
@          once its 8-iteration loop has finished.
7.align	5
8rcon:
9.long	0x01,0x01,0x01,0x01
10.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
11.long	0x1b,0x1b,0x1b,0x1b
12
@ aes_v8_set_encrypt_key: expand a user key into an encryption key schedule.
@
@ In:   r0 = pointer to the user key bytes
@       r1 = key size selector, compared with 192: less-than takes the
@            128-bit path, equal takes the 192-bit path, greater-than takes
@            the 256-bit path. No other validation of r1 is performed and
@            neither pointer is checked.
@       r2 = pointer to the output schedule. Round keys are written from
@            offset 0; the round count (10, 12 or 14) is written at
@            offset 240.
@ Out:  r0 = 0 (always)
@       On return r12 holds the round count and r2 points at offset 240 of
@       the schedule; the decrypt-key routine, which enters through
@       .Lenc_key, depends on both.
@ Uses: r1, r3, r12, q0-q3, q8-q10 and the condition flags.
@
@ The .byte sequences are hand-encoded AES instructions; each carries its
@ mnemonic in the trailing comment.
13.globl	aes_v8_set_encrypt_key
14.type	aes_v8_set_encrypt_key,%function
15.align	5
16aes_v8_set_encrypt_key:
17.Lenc_key:
18	adr	r3,rcon
19	cmp	r1,#192
20
@ q0 = 0: serves as the all-zero key operand of aese and as the zero fill
@ shifted in by vext. q3 = first 16 key bytes, q1 = rcon row 0, q2 = rcon
@ row 1 (vtbl mask). None of these instructions alters the flags set by the
@ cmp above, which the three branches below consume.
21	veor	q0,q0,q0
22	vld1.8	{q3},[r0]!
23	mov	r1,#8		@ reuse r1
24	vld1.32	{q1,q2},[r3]!
25
26	blt	.Loop128
27	beq	.L192
28	b	.L256
29
@ 128-bit key: 8 loop passes followed by two unrolled passes. Each pass
@ stores the current round key (q3) and derives the next one:
@   q10 = rotated/splatted last word of q3, run through aese with zero key,
@         then XORed with the round constant q1
@   q3 ^= (q3 shifted up by 1, 2 and 3 words with zero fill) ^ q10
30.align	4
31.Loop128:
32	vtbl.8	d20,{q3},d4
33	vtbl.8	d21,{q3},d5
34	vext.8	q9,q0,q3,#12
35	vst1.32	{q3},[r2]!
36	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
37	subs	r1,r1,#1
38
39	veor	q3,q3,q9
40	vext.8	q9,q0,q9,#12
41	veor	q3,q3,q9
42	vext.8	q9,q0,q9,#12
43	 veor	q10,q10,q1
44	veor	q3,q3,q9
45	vshl.u8	q1,q1,#1
46	veor	q3,q3,q10
47	bne	.Loop128
48
@ Switch the round constant to rcon row 2 (0x1b) for the last two passes;
@ r3 was advanced past rows 0 and 1 by the earlier post-incremented load.
49	vld1.32	{q1},[r3]
50
51	vtbl.8	d20,{q3},d4
52	vtbl.8	d21,{q3},d5
53	vext.8	q9,q0,q3,#12
54	vst1.32	{q3},[r2]!
55	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
56
57	veor	q3,q3,q9
58	vext.8	q9,q0,q9,#12
59	veor	q3,q3,q9
60	vext.8	q9,q0,q9,#12
61	 veor	q10,q10,q1
62	veor	q3,q3,q9
63	vshl.u8	q1,q1,#1
64	veor	q3,q3,q10
65
66	vtbl.8	d20,{q3},d4
67	vtbl.8	d21,{q3},d5
68	vext.8	q9,q0,q3,#12
69	vst1.32	{q3},[r2]!
70	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
71
72	veor	q3,q3,q9
73	vext.8	q9,q0,q9,#12
74	veor	q3,q3,q9
75	vext.8	q9,q0,q9,#12
76	 veor	q10,q10,q1
77	veor	q3,q3,q9
78	veor	q3,q3,q10
@ Final (11th) round key is stored without write-back, so r2 sits at
@ offset 160 here; adding 0x50 moves it to offset 240 for .Ldone.
79	vst1.32	{q3},[r2]
80	add	r2,r2,#0x50
81
82	mov	r12,#10
83	b	.Ldone
84
@ 192-bit key: d16 receives key bytes 16..23. Subtracting 8 from every mask
@ byte makes the vtbl lookups (whose table is q8 here) pick the word held in
@ the upper half of d16. Each of the 8 passes stores 8 + 16 bytes, so
@ together with the initial 16-byte store r2 ends at offset 208; adding 0x20
@ moves it to offset 240.
85.align	4
86.L192:
87	vld1.8	{d16},[r0]!
88	vmov.i8	q10,#8			@ borrow q10
89	vst1.32	{q3},[r2]!
90	vsub.i8	q2,q2,q10	@ adjust the mask
91
92.Loop192:
93	vtbl.8	d20,{q8},d4
94	vtbl.8	d21,{q8},d5
95	vext.8	q9,q0,q3,#12
96	vst1.32	{d16},[r2]!
97	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
98	subs	r1,r1,#1
99
100	veor	q3,q3,q9
101	vext.8	q9,q0,q9,#12
102	veor	q3,q3,q9
103	vext.8	q9,q0,q9,#12
104	veor	q3,q3,q9
105
106	vdup.32	q9,d7[1]
107	veor	q9,q9,q8
108	 veor	q10,q10,q1
109	vext.8	q8,q0,q8,#12
110	vshl.u8	q1,q1,#1
111	veor	q8,q8,q9
112	veor	q3,q3,q10
113	veor	q8,q8,q10
114	vst1.32	{q3},[r2]!
115	bne	.Loop192
116
117	mov	r12,#12
118	add	r2,r2,#0x20
119	b	.Ldone
120
@ 256-bit key: q8 receives key bytes 16..31 and r1 counts 7 passes. Each
@ pass stores q8 and the newly derived q3; the beq after the second store
@ leaves the loop on the 7th pass with r2 already at offset 240. Other
@ passes go on to derive the next q8 (aese with zero key on the splatted
@ last word of q3, with no rotation and no round constant).
121.align	4
122.L256:
123	vld1.8	{q8},[r0]
124	mov	r1,#7
125	mov	r12,#14
126	vst1.32	{q3},[r2]!
127
128.Loop256:
129	vtbl.8	d20,{q8},d4
130	vtbl.8	d21,{q8},d5
131	vext.8	q9,q0,q3,#12
132	vst1.32	{q8},[r2]!
133	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
134	subs	r1,r1,#1
135
136	veor	q3,q3,q9
137	vext.8	q9,q0,q9,#12
138	veor	q3,q3,q9
139	vext.8	q9,q0,q9,#12
140	 veor	q10,q10,q1
141	veor	q3,q3,q9
142	vshl.u8	q1,q1,#1
143	veor	q3,q3,q10
144	vst1.32	{q3},[r2]!
145	beq	.Ldone
146
147	vdup.32	q10,d7[1]
148	vext.8	q9,q0,q8,#12
149	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
150
151	veor	q8,q8,q9
152	vext.8	q9,q0,q9,#12
153	veor	q8,q8,q9
154	vext.8	q9,q0,q9,#12
155	veor	q8,q8,q9
156
157	veor	q8,q8,q10
158	b	.Loop256
159
@ Common exit: every path arrives with r2 at offset 240 of the schedule and
@ r12 holding the round count, which is stored there.
160.Ldone:
161	str	r12,[r2]
162
163	eor	r0,r0,r0		@ return value
164
165	bx	lr
166.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
167
@ aes_v8_set_decrypt_key: build a decryption key schedule.
@
@ Takes the same r0/r1/r2 arguments as the encrypt-key routine, which it
@ calls through the local .Lenc_key entry. It then reverses the order of the
@ round keys in place and applies aesimc to every round key except the
@ first and the last.
@
@ Out:  r0 = 0 (always; the value returned by .Lenc_key is not examined)
@ Saves r4 and lr on the stack; also uses r12, q0, q1 and the flags, plus
@ whatever .Lenc_key uses.
168.globl	aes_v8_set_decrypt_key
169.type	aes_v8_set_decrypt_key,%function
170.align	5
171aes_v8_set_decrypt_key:
172	stmdb	sp!,{r4,lr}
173	bl	.Lenc_key
174
@ .Lenc_key returns with r2 at offset 240 and r12 = round count.
@ r2 walks up from the first round key, r0 walks down from the last one
@ (r4 = -16 is the post-index step for r0).
175	sub	r2,r2,#240		@ restore original r2
176	mov	r4,#-16
177	add	r0,r2,r12,lsl#4	@ end of key schedule
178
@ Swap the outermost pair unchanged.
179	vld1.32	{q0},[r2]
180	vld1.32	{q1},[r0]
181	vst1.32	{q0},[r0],r4
182	vst1.32	{q1},[r2]!
183
@ Swap the inner pairs, transforming each key with aesimc, until the two
@ pointers meet (unsigned compare).
184.Loop_imc:
185	vld1.32	{q0},[r2]
186	vld1.32	{q1},[r0]
187	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
188	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
189	vst1.32	{q0},[r0],r4
190	vst1.32	{q1},[r2]!
191	cmp	r0,r2
192	bhi	.Loop_imc
193
@ The round count is even, so the number of round keys is odd and the
@ pointers meet on the middle key, which is transformed where it stands.
194	vld1.32	{q0},[r2]
195	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
196	vst1.32	{q0},[r0]
197
198	eor	r0,r0,r0		@ return value
199	ldmia	sp!,{r4,pc}
200.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
@ aes_v8_encrypt: encrypt a single 16-byte block.
@
@ In:   r0 = pointer to the input block
@       r1 = pointer to the output block
@       r2 = pointer to the key schedule (round count read from offset 240)
@ Uses: r2 (advanced through the schedule), r3, q0-q2 and the flags.
@ r0 is not written, so no return value is produced.
201.globl	aes_v8_encrypt
202.type	aes_v8_encrypt,%function
203.align	5
204aes_v8_encrypt:
205	ldr	r3,[r2,#240]
206	vld1.32	{q0},[r2]!
207	vld1.8	{q2},[r0]
208	sub	r3,r3,#2
209	vld1.32	{q1},[r2]!
210
@ Two rounds per pass; q0 and q1 alternate as the current round key while
@ the next one is being loaded. r3 starts at rounds - 2 and drops by 2.
211.Loop_enc:
212	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
213	vld1.32	{q0},[r2]!
214	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
215	subs	r3,r3,#2
216	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
217	vld1.32	{q1},[r2]!
218	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
219	bgt	.Loop_enc
220
@ Last two rounds: the final aese is not followed by aesmc, and the last
@ round key (loaded into q0 without write-back) is applied with veor.
221	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
222	vld1.32	{q0},[r2]
223	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
224	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
225	veor	q2,q2,q0
226
227	vst1.8	{q2},[r1]
228	bx	lr
229.size	aes_v8_encrypt,.-aes_v8_encrypt
@ aes_v8_decrypt: decrypt a single 16-byte block.
@
@ In:   r0 = pointer to the input block
@       r1 = pointer to the output block
@       r2 = pointer to the key schedule (round count read from offset 240);
@            presumably one prepared by the decrypt-key routine - confirm
@            against callers
@ Uses: r2 (advanced through the schedule), r3, q0-q2 and the flags.
@ r0 is not written, so no return value is produced.
230.globl	aes_v8_decrypt
231.type	aes_v8_decrypt,%function
232.align	5
233aes_v8_decrypt:
234	ldr	r3,[r2,#240]
235	vld1.32	{q0},[r2]!
236	vld1.8	{q2},[r0]
237	sub	r3,r3,#2
238	vld1.32	{q1},[r2]!
239
@ Two rounds per pass; q0 and q1 alternate as the current round key while
@ the next one is being loaded. r3 starts at rounds - 2 and drops by 2.
240.Loop_dec:
241	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
242	vld1.32	{q0},[r2]!
243	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
244	subs	r3,r3,#2
245	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
246	vld1.32	{q1},[r2]!
247	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
248	bgt	.Loop_dec
249
@ Last two rounds: the final aesd is not followed by aesimc, and the last
@ round key (loaded into q0 without write-back) is applied with veor.
250	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
251	vld1.32	{q0},[r2]
252	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
253	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
254	veor	q2,q2,q0
255
256	vst1.8	{q2},[r1]
257	bx	lr
258.size	aes_v8_decrypt,.-aes_v8_decrypt
@ aes_v8_cbc_encrypt: CBC-mode encryption or decryption of whole blocks.
@
@ In:   r0 = input pointer
@       r1 = output pointer
@       r2 = length in bytes. A length below 16 returns immediately and
@            leaves the IV untouched; a trailing partial block is ignored
@            (the length is masked down to a multiple of 16).
@       r3 = key schedule (round count read from offset 240)
@       [sp]    = pointer to the 16-byte IV; read on entry and overwritten
@                 at .Lcbc_done with the chaining value for a later call
@       [sp,#4] = direction flag: zero selects decryption, anything else
@                 selects encryption
@ Saves r4-r8, lr and d8-d15; also uses r12 and all of q0-q15.
@
@ Register roles after the common setup:
@       q8,q9    round keys 0 and 1 (overwritten and re-loaded as the middle
@                rounds are streamed in the generic paths)
@       q10-q15  the six round keys preceding the last one
@       q7       the last round key
@       q6       IV / chaining value
@       r5       rounds - 8; r5 == 2 identifies a 10-round schedule, which
@                has dedicated fully unrolled paths
@       r6       working copy of r5, counts the streamed middle rounds
@       r7       cursor into the round keys
@       r8       post-increment applied to r0 by input loads; set to 0 when
@                the block being fetched is the last one, so that the
@                pre-fetching loads do not step past the end of the input
259.globl	aes_v8_cbc_encrypt
260.type	aes_v8_cbc_encrypt,%function
261.align	5
262aes_v8_cbc_encrypt:
263	mov	ip,sp
264	stmdb	sp!,{r4-r8,lr}
265	vstmdb	sp!,{d8-d15}            @ ABI specification says so
266	ldmia	ip,{r4-r5}		@ load remaining args
267	subs	r2,r2,#16
268	mov	r8,#16
269	blo	.Lcbc_abort
270	moveq	r8,#0
271
@ The flags set by this cmp are consumed by "beq .Lcbc_dec" further down;
@ no instruction in between modifies them.
272	cmp	r5,#0			@ en- or decrypting?
273	ldr	r5,[r3,#240]
274	and	r2,r2,#-16
275	vld1.8	{q6},[r4]
276	vld1.8	{q0},[r0],r8
277
278	vld1.32	{q8-q9},[r3]		@ load key schedule...
279	sub	r5,r5,#6
280	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
281	sub	r5,r5,#2
282	vld1.32	{q10-q11},[r7]!
283	vld1.32	{q12-q13},[r7]!
284	vld1.32	{q14-q15},[r7]!
285	vld1.32	{q7},[r7]
286
287	add	r7,r3,#32
288	mov	r6,r5
289	beq	.Lcbc_dec
290
@ ---- Encryption ----
@ q0 = first input block XOR IV. q5 = round key 0 XOR last round key: the
@ next input block is XORed with q5 and then used as the key operand of the
@ first aese of the following block. Since q0 at that point still lacks the
@ final XOR with q7, this applies both the chaining XOR with the previous
@ ciphertext and round key 0 in one step.
291	cmp	r5,#2
292	veor	q0,q0,q6
293	veor	q5,q8,q7
294	beq	.Lcbc_enc128
295
@ Generic encryption. The same label serves as the inner middle-round loop
@ (bgt, counting r6) and as the per-block outer loop (bhs, from the
@ "subs r2,r2,#16" below, whose flags survive to the end of the block).
296.Loop_cbc_enc:
297	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
298	vld1.32	{q8},[r7]!
299	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
300	subs	r6,r6,#2
301	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
302	vld1.32	{q9},[r7]!
303	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
304	bgt	.Loop_cbc_enc
305
306	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
307	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
308	 subs	r2,r2,#16
309	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
310	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
311	 moveq	r8,#0
312	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
313	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
314	 add	r7,r3,#16
315	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
316	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
317	 vld1.8	{q8},[r0],r8
318	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
319	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
320	 veor	q8,q8,q5
321	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
322	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
323	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1]
324	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
325	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
326	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
327
328	 mov	r6,r5
329	veor	q6,q0,q7
330	vst1.8	{q6},[r1]!
331	bhs	.Loop_cbc_enc
332
333	b	.Lcbc_done
334
@ Encryption with a 10-round schedule: round keys 2 and 3 are kept in q2,q3
@ so every round key stays resident and the loop is fully unrolled. The
@ ciphertext store is deferred to the top of the next pass, or to after the
@ loop for the final block.
335.align	5
336.Lcbc_enc128:
337	vld1.32	{q2-q3},[r7]
338	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
339	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
340	b	.Lenter_cbc_enc128
341.Loop_cbc_enc128:
342	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
343	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
344	 vst1.8	{q6},[r1]!
345.Lenter_cbc_enc128:
346	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
347	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
348	 subs	r2,r2,#16
349	.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
350	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
351	 moveq	r8,#0
352	.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
353	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
354	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
355	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
356	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
357	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
358	 vld1.8	{q8},[r0],r8
359	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
360	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
361	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
362	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
363	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
364	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
365	 veor	q8,q8,q5
366	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
367	veor	q6,q0,q7
368	bhs	.Loop_cbc_enc128
369
370	vst1.8	{q6},[r1]!
371	b	.Lcbc_done
372
@ Decryption with a 10-round schedule, two blocks per pass (q0 and q1).
@ Round keys 2 and 3 live in q4,q5. q6 and q2 hold the chaining values for
@ the two blocks with the last round key (q7) already folded in, so one
@ veor after the final aesd produces the plaintext. r8 and r12 are the
@ post-increments for the two pre-fetching loads.
373.align	5
374.Lcbc_dec128:
375	vld1.32	{q4-q5},[r7]
376	veor	q6,q6,q7
377	veor	q2,q0,q7
378	mov	r12,r8
379
380.Loop2x_cbc_dec128:
381	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
382	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
383	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
384	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
385	 subs	r2,r2,#32
386	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
387	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
388	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
389	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
390	 movlo	r8,#0
391	.byte	0x48,0x03,0xb0,0xf3	@ aesd q0,q4
392	.byte	0x48,0x23,0xb0,0xf3	@ aesd q1,q4
393	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
394	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
395	 movls	r12,#0
396	.byte	0x4a,0x03,0xb0,0xf3	@ aesd q0,q5
397	.byte	0x4a,0x23,0xb0,0xf3	@ aesd q1,q5
398	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
399	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
400	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10
401	.byte	0x64,0x23,0xb0,0xf3	@ aesd q1,q10
402	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
403	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
404	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11
405	.byte	0x66,0x23,0xb0,0xf3	@ aesd q1,q11
406	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
407	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
408	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
409	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
410	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
411	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
412	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
413	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
414	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
415	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
416	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
417	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
418	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
419	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
420	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
421	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
422
423	veor	q6,q6,q0
424	vld1.8	{q0},[r0],r8
425	veor	q2,q2,q1
426	vld1.8	{q1},[r0],r12
427	vst1.8	{q6},[r1]!
428	veor	q6,q3,q7
429	vst1.8	{q2},[r1]!
430	veor	q2,q0,q7
431	vorr	q3,q1,q1
432	bhs	.Loop2x_cbc_dec128
433
@ Undo the folded-in q7 before either writing q6 back as the IV
@ (.Lcbc_done) or handing q6 and q2 to the single-block tail.
434	adds	r2,r2,#32
435	veor	q6,q6,q7
436	beq	.Lcbc_done
437	veor	q2,q2,q7
438	b	.Lcbc_dec_tail
439
@ ---- Decryption ----
@ q2 and q3 keep copies of the ciphertext blocks currently in q0 and q1;
@ they are needed later as chaining values. A single-block request goes
@ straight to the tail.
440.align	5
441.Lcbc_dec:
442	subs	r2,r2,#16
443	vorr	q2,q0,q0
444	blo	.Lcbc_dec_tail
445
446	moveq	r8,#0
447	cmp	r5,#2
448	vld1.8	{q1},[r0],r8
449	vorr	q3,q1,q1
450	beq	.Lcbc_dec128
451
@ Generic decryption, two blocks per pass. As on the encrypt side, the
@ label doubles as the middle-round loop (bgt on r6) and the per-pair loop
@ (bhs on the flags of "subs r2,r2,#32" below).
452.Loop2x_cbc_dec:
453	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
454	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
455	vld1.32	{q8},[r7]!
456	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
457	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
458	subs	r6,r6,#2
459	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
460	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
461	vld1.32	{q9},[r7]!
462	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
463	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
464	bgt	.Loop2x_cbc_dec
465
466	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
467	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
468	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
469	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
470	 veor	q4,q6,q7
471	 veor	q5,q2,q7
472	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
473	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
474	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
475	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
476	 vorr	q6,q3,q3
477	 subs	r2,r2,#32
478	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10
479	.byte	0x64,0x23,0xb0,0xf3	@ aesd q1,q10
480	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
481	 movlo	r8,#0
482	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
483	 mov	r7,r3
484	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11
485	.byte	0x66,0x23,0xb0,0xf3	@ aesd q1,q11
486	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
487	 vld1.8	{q2},[r0],r8
488	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
489	 movls	r8,#0
490	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
491	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
492	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
493	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
494	 vld1.8	{q3},[r0],r8
495	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
496	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
497	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
498	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
499	 vld1.32 {q8},[r7]!	@ re-pre-load rndkey[0]
500	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
501	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
502	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
503	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
504	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1]
505	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
506	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
507
508	 mov	r6,r5
509	veor	q4,q4,q0
510	veor	q5,q5,q1
511	 vorr	q0,q2,q2
512	vst1.8	{q4},[r1]!
513	 vorr	q1,q3,q3
514	vst1.8	{q5},[r1]!
515	bhs	.Loop2x_cbc_dec
516
517	adds	r2,r2,#32
518	beq	.Lcbc_done
519
@ Single remaining block: q0 = ciphertext, q2 = copy of it (becomes the new
@ chaining value in q6), q6 = previous chaining value.
520.Lcbc_dec_tail:
521	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
522	vld1.32	{q8},[r7]!
523	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
524	subs	r6,r6,#2
525	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
526	vld1.32	{q9},[r7]!
527	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
528	bgt	.Lcbc_dec_tail
529
530	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
531	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
532	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
533	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
534	 veor	q4,q6,q7
535	.byte	0x64,0x03,0xb0,0xf3	@ aesd q0,q10
536	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
537	 vorr	q6,q2,q2
538	.byte	0x66,0x03,0xb0,0xf3	@ aesd q0,q11
539	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
540	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
541	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
542	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
543	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
544	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
545	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
546	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
547
548	veor	q4,q4,q0
549	vst1.8	{q4},[r1]!
550
@ Write the chaining value back through the IV pointer, then restore the
@ saved registers. .Lcbc_abort skips the IV write-back.
551.Lcbc_done:
552	vst1.8	{q6},[r4]
553.Lcbc_abort:
554	vldmia	sp!,{d8-d15}
555	ldmia	sp!,{r4-r8,pc}
556.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
@ aes_v8_ctr32_encrypt_blocks: CTR mode over whole blocks, with the counter
@ kept in the last 32-bit word of the counter block.
@
@ In:   r0 = input pointer
@       r1 = output pointer
@       r2 = number of 16-byte blocks
@       r3 = key schedule (round count read from offset 240)
@       [sp] = pointer to the 16-byte counter block
@ Saves r4-r10, lr and d8-d15.
@
@ The counter block in memory is only read; the advanced counter is not
@ written back.
@ NOTE(review): a block count of 0 is not special-cased. The initial
@ "subs r2,r2,#2 / blo" sends it to .Lctr32_tail, which reads and writes one
@ block. Confirm that callers never pass 0.
@
@ Register roles after setup:
@       q8,q9    round keys 0 and 1 (re-loaded as needed)
@       q10-q15  the six round keys preceding the last one
@       q7       the last round key (XORed into the input early, so a single
@                veor with the cipher state yields the output)
@       q6       unmodified copy of the counter block, used as a template
@       q0,q1    counter blocks for the two lanes
@       r8       counter word in a form suitable for add; r9,r10 = the
@                rev-ed words inserted into lane 3 of q0 and q1
@       r5       rounds - 8 (r5 == 2 selects the unrolled 10-round path)
@       r6       working copy of r5; r7 = cursor into the round keys
557.globl	aes_v8_ctr32_encrypt_blocks
558.type	aes_v8_ctr32_encrypt_blocks,%function
559.align	5
560aes_v8_ctr32_encrypt_blocks:
561	mov		ip,sp
562	stmdb		sp!,{r4-r10,lr}
563	vstmdb		sp!,{d8-d15}            @ ABI specification says so
564	ldr		r4, [ip]		@ load remaining arg
565	ldr		r5,[r3,#240]
566
567	ldr		r8, [r4, #12]
568	vld1.32		{q0},[r4]
569
570	vld1.32		{q8-q9},[r3]		@ load key schedule...
571	sub		r5,r5,#6
572	add		r7,r3,r5,lsl#4	@ pointer to last 7 round keys
573	sub		r5,r5,#2
574	vld1.32		{q10-q11},[r7]!
575	vld1.32		{q12-q13},[r7]!
576	vld1.32		{q14-q15},[r7]!
577	vld1.32		{q7},[r7]
578
579	add		r7,r3,#32
580	mov		r6,r5
581
@ Fewer than two blocks: go straight to the single-block tail, which uses
@ q0 exactly as loaded from the counter block.
582	subs		r2,r2,#2
583	blo		.Lctr32_tail
584
585#ifndef __ARMEB__
586	rev		r8, r8
587#endif
588	vorr		q1,q0,q0
589	add		r8, r8, #1
590	vorr		q6,q0,q0
591	rev		r10, r8
592	cmp		r5,#2
593	vmov.32	d3[1],r10
594	beq		.Lctr32_128
595
@ Generic path, two counter blocks per pass. The label doubles as the
@ middle-round loop (bgt on r6) and the per-pair loop (bhs on the flags of
@ the "subs r2,r2,#2" issued among the last rounds).
596.Loop2x_ctr32:
597	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
598	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
599	vld1.32		{q8},[r7]!
600	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
601	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
602	subs		r6,r6,#2
603	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
604	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
605	vld1.32		{q9},[r7]!
606	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
607	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
608	bgt		.Loop2x_ctr32
609
@ The cipher state moves to q4,q5 here so that q0,q1 can be rebuilt from
@ the template q6 for the next pair while the remaining rounds run.
610	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
611	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
612	.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
613	 vorr		q0,q6,q6
614	.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
615	 vorr		q1,q6,q6
616	.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
617	.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
618	 vld1.8		{q2},[r0]!
619	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
620	 vld1.8		{q3},[r0]!
621	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
622	 add		r8,r8,#1
623	.byte	0x24,0x83,0xb0,0xf3	@ aese q4,q10
624	.byte	0x24,0xa3,0xb0,0xf3	@ aese q5,q10
625	 rev		r9,r8
626	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
627	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
628	 add		r8,r8,#1
629	.byte	0x26,0x83,0xb0,0xf3	@ aese q4,q11
630	.byte	0x26,0xa3,0xb0,0xf3	@ aese q5,q11
631	 veor		q2,q2,q7
632	 rev		r10,r8
633	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
634	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
635	 veor		q3,q3,q7
636	 mov		r7,r3
637	.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
638	.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
639	 subs		r2,r2,#2
640	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
641	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
642	 vld1.32	 {q8-q9},[r7]!	@ re-pre-load rndkey[0-1]
643	.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
644	.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
645	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
646	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
647	.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
648	.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
649	 vmov.32	d1[1], r9
650	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
651	 vmov.32	d3[1], r10
652	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
653	.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
654	.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
655
656	 mov		r6,r5
657	veor		q2,q2,q4
658	veor		q3,q3,q5
659	vst1.8		{q2},[r1]!
660	vst1.8		{q3},[r1]!
661	bhs		.Loop2x_ctr32
662
663	adds		r2,r2,#2
664	beq		.Lctr32_done
665	b		.Lctr32_tail
666
@ 10-round schedule: round keys 2 and 3 are held in q4,q5 so that all round
@ keys stay resident and the two-block loop is fully unrolled. r7 is left
@ pointing at round key 2, and q8,q9 and r6 are left untouched, which is
@ the state .Lctr32_tail expects.
667.Lctr32_128:
668	vld1.32		{q4-q5},[r7]
669
670.Loop2x_ctr32_128:
671	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
672	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
673	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
674	 vld1.8		{q2},[r0]!
675	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
676	 vld1.8		{q3},[r0]!
677	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
678	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
679	 add		r8,r8,#1
680	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
681	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
682	 rev		r9,r8
683	.byte	0x08,0x03,0xb0,0xf3	@ aese q0,q4
684	.byte	0x08,0x23,0xb0,0xf3	@ aese q1,q4
685	 add		r8,r8,#1
686	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
687	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
688	 rev		r10,r8
689	.byte	0x0a,0x03,0xb0,0xf3	@ aese q0,q5
690	.byte	0x0a,0x23,0xb0,0xf3	@ aese q1,q5
691	 subs		r2,r2,#2
692	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
693	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
694	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
695	.byte	0x24,0x23,0xb0,0xf3	@ aese q1,q10
696	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
697	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
698	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
699	.byte	0x26,0x23,0xb0,0xf3	@ aese q1,q11
700	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
701	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
702	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
703	.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
704	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
705	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
706	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
707	.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
708	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
709	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
710	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
711	.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
712	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
713	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
714	 veor		q2,q2,q7
715	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
716	 veor		q3,q3,q7
717	.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
718
719	veor		q2,q2,q0
720	vorr		q0,q6,q6
721	veor		q3,q3,q1
722	vorr		q1,q6,q6
723	vst1.8		{q2},[r1]!
724	vmov.32	d1[1], r9
725	vst1.8		{q3},[r1]!
726	vmov.32	d3[1], r10
727	bhs		.Loop2x_ctr32_128
728
729	adds		r2,r2,#2
730	beq		.Lctr32_done
731
@ Single remaining block, using the counter block currently in q0.
732.Lctr32_tail:
733	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
734	vld1.32		{q8},[r7]!
735	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
736	subs		r6,r6,#2
737	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
738	vld1.32		{q9},[r7]!
739	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
740	bgt		.Lctr32_tail
741
742	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
743	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
744	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
745	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
746	 vld1.8		{q2},[r0]
747	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
748	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
749	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
750	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
751	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
752	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
753	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
754	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
755	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
756	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
757	 veor		q2,q2,q7
758	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
759
760	veor		q2,q2,q0
761	vst1.8		{q2},[r1]
762
763.Lctr32_done:
764	vldmia		sp!,{d8-d15}
765	ldmia		sp!,{r4-r10,pc}
766.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
767#endif
768