1#include <openssl/arm_arch.h>
2
3#if __ARM_MAX_ARCH__>=7
4.text
5
6
7.code	32
8#undef	__thumb2__
9.align	5
@ Lrcon: constant table read by the key-expansion code (reached through
@ adr r3,Lrcon).  Three rows of four identical 32-bit words each:
@   row 0: 0x01 - starting round-constant value; loaded into q1 and doubled
@                 with vshl.u8 after every expansion step
@   row 1: byte-index pattern used by vtbl.8; loaded into q2
@   row 2: 0x1b - loaded into q1 after the eight looped AES-128 steps
10Lrcon:
11.long	0x01,0x01,0x01,0x01
12.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
13.long	0x1b,0x1b,0x1b,0x1b
14
@ _aes_hw_set_encrypt_key - expand a raw AES key into an encryption key
@ schedule using the ARMv8 AES instructions (emitted as raw .byte opcodes).
@
@ In:   r0 = pointer to the raw key bytes
@       r1 = key length in bits; only 128, 192 and 256 pass the checks
@       r2 = pointer to the output key schedule
@ Out:  r0 =  0 on success
@            -1 if r0 or r2 is zero
@            -2 if r1 is below 128, above 256 or not a multiple of 64
@       On success the round keys are stored from the original r2 upwards
@       and the round count (10, 12 or 14) is stored as a word at offset 240.
@ Also on success: r2 = original r2 + 240 and r12 = round count.  The
@ decrypt-key routine enters at Lenc_key with bl and depends on both.
@ Writes: r1, r3, r12, q0-q3, q8-q10 and the flags; nothing is pushed.
15.globl	_aes_hw_set_encrypt_key
16.private_extern	_aes_hw_set_encrypt_key
17#ifdef __thumb2__
18.thumb_func	_aes_hw_set_encrypt_key
19#endif
20.align	5
21_aes_hw_set_encrypt_key:
22Lenc_key:
@ Argument checks.  r3 carries the pending return code to Lenc_key_abort.
23	mov	r3,#-1
24	cmp	r0,#0
25	beq	Lenc_key_abort
26	cmp	r2,#0
27	beq	Lenc_key_abort
28	mov	r3,#-2
29	cmp	r1,#128
30	blt	Lenc_key_abort
31	cmp	r1,#256
32	bgt	Lenc_key_abort
@ Within 128..256 only multiples of 64 are accepted.
33	tst	r1,#0x3f
34	bne	Lenc_key_abort
35
36	adr	r3,Lrcon
@ The flags from this compare are consumed by the blt/beq further down; the
@ NEON instructions and the plain mov in between do not alter them.
37	cmp	r1,#192
38
@ q0 = 0: zero key for aese and zero fill for the vext shifts.
@ q3 = first 16 key bytes.  q1 = Lrcon row 0, q2 = Lrcon row 1 (vtbl mask);
@ r3 is left pointing at row 2.  r1 becomes the loop counter.
39	veor	q0,q0,q0
40	vld1.8	{q3},[r0]!
41	mov	r1,#8		@ reuse r1
42	vld1.32	{q1,q2},[r3]!
43
44	blt	Loop128
45	beq	L192
46	b	L256
47
@ ---- 128-bit key: eight looped steps, then two unrolled ones ----
@ Per step: q10 = bytes of q3 selected by the q2 mask, pushed through aese
@ with the zero key and XORed with the round constant q1; q3 is XORed with
@ three successively word-shifted copies of itself (vext with zero fill) and
@ then with q10 to give the next round key.  q1 is doubled each step.
48.align	4
49Loop128:
50	vtbl.8	d20,{q3},d4
51	vtbl.8	d21,{q3},d5
52	vext.8	q9,q0,q3,#12
53	vst1.32	{q3},[r2]!
54.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
55	subs	r1,r1,#1
56
57	veor	q3,q3,q9
58	vext.8	q9,q0,q9,#12
59	veor	q3,q3,q9
60	vext.8	q9,q0,q9,#12
61	veor	q10,q10,q1
62	veor	q3,q3,q9
63	vshl.u8	q1,q1,#1
64	veor	q3,q3,q10
65	bne	Loop128
66
@ Eight doublings of 0x01 overflow the byte lanes, so q1 is reloaded with
@ Lrcon row 2 (0x1b) for the remaining two steps.
67	vld1.32	{q1},[r3]
68
69	vtbl.8	d20,{q3},d4
70	vtbl.8	d21,{q3},d5
71	vext.8	q9,q0,q3,#12
72	vst1.32	{q3},[r2]!
73.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
74
75	veor	q3,q3,q9
76	vext.8	q9,q0,q9,#12
77	veor	q3,q3,q9
78	vext.8	q9,q0,q9,#12
79	veor	q10,q10,q1
80	veor	q3,q3,q9
81	vshl.u8	q1,q1,#1
82	veor	q3,q3,q10
83
84	vtbl.8	d20,{q3},d4
85	vtbl.8	d21,{q3},d5
86	vext.8	q9,q0,q3,#12
87	vst1.32	{q3},[r2]!
88.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
89
90	veor	q3,q3,q9
91	vext.8	q9,q0,q9,#12
92	veor	q3,q3,q9
93	vext.8	q9,q0,q9,#12
94	veor	q10,q10,q1
95	veor	q3,q3,q9
96	veor	q3,q3,q10
@ Final round key is stored without writeback at offset 160; adding 0x50
@ moves r2 to offset 240 where Ldone stores the round count.
97	vst1.32	{q3},[r2]
98	add	r2,r2,#0x50
99
100	mov	r12,#10
101	b	Ldone
102
@ ---- 192-bit key ----
@ d16 receives key bytes 16..23.  The vtbl mask is lowered by 8 in every
@ byte so that it indexes into q8 (d16) instead of the top of q3.
103.align	4
104L192:
105	vld1.8	{d16},[r0]!
106	vmov.i8	q10,#8			@ borrow q10
107	vst1.32	{q3},[r2]!
108	vsub.i8	q2,q2,q10	@ adjust the mask
109
@ Each of the eight iterations stores 8 bytes (d16) and then 16 bytes (the
@ new q3), i.e. 24 bytes of schedule per pass.
110Loop192:
111	vtbl.8	d20,{q8},d4
112	vtbl.8	d21,{q8},d5
113	vext.8	q9,q0,q3,#12
114	vst1.32	{d16},[r2]!
115.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
116	subs	r1,r1,#1
117
118	veor	q3,q3,q9
119	vext.8	q9,q0,q9,#12
120	veor	q3,q3,q9
121	vext.8	q9,q0,q9,#12
122	veor	q3,q3,q9
123
@ Carry the running XOR into q8: q9 = top word of q3 broadcast, XORed with
@ q8 and its word-shifted copy; q10 is then folded into both q3 and q8.
124	vdup.32	q9,d7[1]
125	veor	q9,q9,q8
126	veor	q10,q10,q1
127	vext.8	q8,q0,q8,#12
128	vshl.u8	q1,q1,#1
129	veor	q8,q8,q9
130	veor	q3,q3,q10
131	veor	q8,q8,q10
132	vst1.32	{q3},[r2]!
133	bne	Loop192
134
@ 16 + 8*24 = 208 bytes were written; 0x20 more puts r2 at offset 240.
135	mov	r12,#12
136	add	r2,r2,#0x20
137	b	Ldone
138
@ ---- 256-bit key ----
@ q8 receives key bytes 16..31.  Seven iterations; each stores q8 and the
@ new q3, so together with the initial q3 store r2 ends at offset 240.
139.align	4
140L256:
141	vld1.8	{q8},[r0]
142	mov	r1,#7
143	mov	r12,#14
144	vst1.32	{q3},[r2]!
145
146Loop256:
147	vtbl.8	d20,{q8},d4
148	vtbl.8	d21,{q8},d5
149	vext.8	q9,q0,q3,#12
150	vst1.32	{q8},[r2]!
151.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
152	subs	r1,r1,#1
153
154	veor	q3,q3,q9
155	vext.8	q9,q0,q9,#12
156	veor	q3,q3,q9
157	vext.8	q9,q0,q9,#12
158	veor	q10,q10,q1
159	veor	q3,q3,q9
160	vshl.u8	q1,q1,#1
161	veor	q3,q3,q10
162	vst1.32	{q3},[r2]!
@ Z still comes from the subs above (NEON ops leave the flags alone).
163	beq	Ldone
164
@ Second half of the pass: update q8 from the top word of the new q3.  This
@ step uses no rotation mask and no round constant.
165	vdup.32	q10,d7[1]
166	vext.8	q9,q0,q8,#12
167.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
168
169	veor	q8,q8,q9
170	vext.8	q9,q0,q9,#12
171	veor	q8,q8,q9
172	vext.8	q9,q0,q9,#12
173	veor	q8,q8,q9
174
175	veor	q8,q8,q10
176	b	Loop256
177
@ All paths arrive here with r2 = original r2 + 240 and r12 = round count.
178Ldone:
179	str	r12,[r2]
180	mov	r3,#0
181
182Lenc_key_abort:
183	mov	r0,r3			@ return value
184
185	bx	lr
186
187
@ _aes_hw_set_decrypt_key - build a decryption key schedule.
@
@ Runs the encrypt-key expansion (bl Lenc_key) on the unchanged r0/r1/r2
@ and, if that returned 0, converts the schedule in place: the round keys
@ are reversed in order, and every key except the first and the last of the
@ original schedule is passed through aesimc.
@
@ In:   r0, r1, r2 as for the encrypt-key routine
@ Out:  r0 = 0 on success, otherwise the non-zero code from Lenc_key
@ Saves r4 and lr on the stack.  Besides what Lenc_key writes, this routine
@ writes r0, r2, q0, q1 and the flags.
188.globl	_aes_hw_set_decrypt_key
189.private_extern	_aes_hw_set_decrypt_key
190#ifdef __thumb2__
191.thumb_func	_aes_hw_set_decrypt_key
192#endif
193.align	5
194_aes_hw_set_decrypt_key:
195	stmdb	sp!,{r4,lr}
196	bl	Lenc_key
197
198	cmp	r0,#0
199	bne	Ldec_key_abort
200
@ Lenc_key leaves r2 at schedule + 240 and the round count in r12.
@ r2 walks up from the first round key, r0 walks down from the last one
@ (r4 = -16 is the post-index step for r0).
201	sub	r2,r2,#240		@ restore original r2
202	mov	r4,#-16
203	add	r0,r2,r12,lsl#4	@ end of key schedule
204
@ Swap the first and last round keys as they are (no aesimc).
205	vld1.32	{q0},[r2]
206	vld1.32	{q1},[r0]
207	vst1.32	{q0},[r0],r4
208	vst1.32	{q1},[r2]!
209
@ Swap the remaining pairs, applying aesimc to both, until the two
@ pointers meet (unsigned compare).
210Loop_imc:
211	vld1.32	{q0},[r2]
212	vld1.32	{q1},[r0]
213.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
214.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
215	vst1.32	{q0},[r0],r4
216	vst1.32	{q1},[r2]!
217	cmp	r0,r2
218	bhi	Loop_imc
219
@ Middle round key: transformed with aesimc and written back.  With the
@ even round counts stored by Lenc_key (10, 12, 14) r0 equals r2 here.
220	vld1.32	{q0},[r2]
221.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
222	vst1.32	{q0},[r0]
223
224	eor	r0,r0,r0		@ return value
@ On the abort path r0 still holds the error code returned by Lenc_key.
225Ldec_key_abort:
226	ldmia	sp!,{r4,pc}
227
@ _aes_hw_encrypt - encrypt a single 16-byte block.
@
@ In:   r0 = pointer to the input block
@       r1 = pointer to the output block
@       r2 = pointer to the key schedule; the round count is read from the
@            word at offset 240
@ Out:  16 bytes stored at r1.  r0 is not set to a return value.
@ Writes: r2 (advanced through the schedule), r3, q0-q2 and the flags.
228.globl	_aes_hw_encrypt
229.private_extern	_aes_hw_encrypt
230#ifdef __thumb2__
231.thumb_func	_aes_hw_encrypt
232#endif
233.align	5
234_aes_hw_encrypt:
@ r3 = round count, read before r2 is post-incremented.  q0/q1 hold the
@ next two round keys, q2 is the block being processed.
235	ldr	r3,[r2,#240]
236	vld1.32	{q0},[r2]!
237	vld1.8	{q2},[r0]
238	sub	r3,r3,#2
239	vld1.32	{q1},[r2]!
240
@ Two rounds per iteration; runs until r3 (rounds - 2) has counted down to
@ zero, leaving the last two rounds for the code after the loop.
241Loop_enc:
242.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
243.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
244	vld1.32	{q0},[r2]!
245	subs	r3,r3,#2
246.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
247.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
248	vld1.32	{q1},[r2]!
249	bgt	Loop_enc
250
@ Last two rounds: the final aese is not followed by aesmc; the last round
@ key (loaded into q0) is applied with a plain XOR.
251.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
252.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
253	vld1.32	{q0},[r2]
254.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
255	veor	q2,q2,q0
256
257	vst1.8	{q2},[r1]
258	bx	lr
259
@ _aes_hw_decrypt - decrypt a single 16-byte block.
@
@ In:   r0 = pointer to the input block
@       r1 = pointer to the output block
@       r2 = pointer to the key schedule; the round count is read from the
@            word at offset 240
@ Out:  16 bytes stored at r1.  r0 is not set to a return value.
@ Writes: r2 (advanced through the schedule), r3, q0-q2 and the flags.
@ Same structure as the single-block encrypt routine, with aesd/aesimc in
@ place of aese/aesmc.
260.globl	_aes_hw_decrypt
261.private_extern	_aes_hw_decrypt
262#ifdef __thumb2__
263.thumb_func	_aes_hw_decrypt
264#endif
265.align	5
266_aes_hw_decrypt:
@ r3 = round count, read before r2 is post-incremented.  q0/q1 hold the
@ next two round keys, q2 is the block being processed.
267	ldr	r3,[r2,#240]
268	vld1.32	{q0},[r2]!
269	vld1.8	{q2},[r0]
270	sub	r3,r3,#2
271	vld1.32	{q1},[r2]!
272
@ Two rounds per iteration; the last two rounds are done after the loop.
273Loop_dec:
274.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
275.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
276	vld1.32	{q0},[r2]!
277	subs	r3,r3,#2
278.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
279.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
280	vld1.32	{q1},[r2]!
281	bgt	Loop_dec
282
@ Last two rounds: the final aesd is not followed by aesimc; the last round
@ key (loaded into q0) is applied with a plain XOR.
283.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
284.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
285	vld1.32	{q0},[r2]
286.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
287	veor	q2,q2,q0
288
289	vst1.8	{q2},[r1]
290	bx	lr
291
@ _aes_hw_cbc_encrypt - CBC-mode encryption or decryption of whole blocks.
@
@ In:   r0      = input pointer
@       r1      = output pointer
@       r2      = length in bytes; values below 16 return immediately and
@                 any remainder modulo 16 is dropped (and r2,r2,#-16)
@       r3      = key schedule (round count read from offset 240)
@       [sp]    = pointer to the 16-byte IV; read on entry and rewritten at
@                 Lcbc_done (not on the early Lcbc_abort path)
@       [sp,#4] = direction flag: zero selects decryption, non-zero
@                 selects encryption
@ Saves and restores r4-r8, lr and d8-d15.  r0 is not set to a return value.
@
@ Register roles common to both directions:
@   r4 = IV pointer, r5 = round count - 8 (2, 4 or 6), r8 = input stride
@   q6 = IV / previous ciphertext block, q7 = last round key
@   q8, q9 = round keys 0 and 1, q10-q15 = the six keys before the last one
292.globl	_aes_hw_cbc_encrypt
293.private_extern	_aes_hw_cbc_encrypt
294#ifdef __thumb2__
295.thumb_func	_aes_hw_cbc_encrypt
296#endif
297.align	5
298_aes_hw_cbc_encrypt:
299	mov	ip,sp
300	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
301	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
302	ldmia	ip,{r4,r5}		@ load remaining args
@ r2 becomes length - 16.  r8 is the input stride: 16 normally, 0 when the
@ length is exactly 16 so that r0 is not advanced past the only block.
303	subs	r2,r2,#16
304	mov	r8,#16
305	blo	Lcbc_abort
306	moveq	r8,#0
307
@ The flags set by this compare are used by beq Lcbc_dec further down; none
@ of the instructions in between updates the flags.
308	cmp	r5,#0			@ en- or decrypting?
309	ldr	r5,[r3,#240]
310	and	r2,r2,#-16
311	vld1.8	{q6},[r4]
312	vld1.8	{q0},[r0],r8
313
314	vld1.32	{q8,q9},[r3]		@ load key schedule...
315	sub	r5,r5,#6
316	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
317	sub	r5,r5,#2
318	vld1.32	{q10,q11},[r7]!
319	vld1.32	{q12,q13},[r7]!
320	vld1.32	{q14,q15},[r7]!
321	vld1.32	{q7},[r7]
322
@ r7 = address of round key 2; r6 = copy of r5.
323	add	r7,r3,#32
324	mov	r6,r5
325	beq	Lcbc_dec
326
@ ---- Encryption ----
@ q0 = first plaintext block XOR IV.  q5 = round key 0 XOR last round key:
@ each following plaintext block is XORed with q5 into q8, so that the
@ aese q0,q8 at the loop top applies the last round key, the chaining XOR
@ and round key 0 in a single step.
@ r5 == 2 selects the dedicated AES-128 loop.
327	cmp	r5,#2
328	veor	q0,q0,q6
329	veor	q5,q8,q7
330	beq	Lcbc_enc128
331
@ 192/256-bit keys: q2, q3 = round keys 2 and 3.  r7 = address of key 1,
@ r6/r12/r14/r3 = addresses of keys 4/5/6/7, reloaded inside the loop
@ because q8 and q9 are reused.
332	vld1.32	{q2,q3},[r7]
333	add	r7,r3,#16
334	add	r6,r3,#16*4
335	add	r12,r3,#16*5
336.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
337.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
338	add	r14,r3,#16*6
339	add	r3,r3,#16*7
340	b	Lenter_cbc_enc
341
342.align	4
@ Loop top: q8 holds next plaintext XOR q5, q6 holds the ciphertext block
@ just finished, which is stored here.
343Loop_cbc_enc:
344.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
345.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
346	vst1.8	{q6},[r1]!
347Lenter_cbc_enc:
348.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
349.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
350.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
351.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
352	vld1.32	{q8},[r6]
@ r5 == 4 (12 rounds) skips the two extra rounds needed for 14 rounds.
353	cmp	r5,#4
354.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
355.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
356	vld1.32	{q9},[r12]
357	beq	Lcbc_enc192
358
359.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
360.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
361	vld1.32	{q8},[r14]
362.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
363.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
364	vld1.32	{q9},[r3]
365	nop
366
367Lcbc_enc192:
368.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
369.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
@ The flags from this subs drive both the moveq below and the bhs at the
@ bottom of the loop.  When r2 reaches zero one block is left, so the
@ stride is cleared to keep r0 on that block.
370	subs	r2,r2,#16
371.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
372.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
373	moveq	r8,#0
374.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
375.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
376.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
377.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
378	vld1.8	{q8},[r0],r8
379.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
380.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
381	veor	q8,q8,q5
382.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
383.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
384	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
385.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
386.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
387.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
@ q6 = finished ciphertext block (state XOR last round key); q0 itself is
@ left without the last key for the combined step at the loop top.
388	veor	q6,q0,q7
389	bhs	Loop_cbc_enc
390
391	vst1.8	{q6},[r1]!
392	b	Lcbc_done
393
@ AES-128 encryption loop: same scheme as above with all eleven round keys
@ resident (q8, q9, q2, q3, q10-q15, q7), so no key reloads are needed.
394.align	5
395Lcbc_enc128:
396	vld1.32	{q2,q3},[r7]
397.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
398.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
399	b	Lenter_cbc_enc128
400Loop_cbc_enc128:
401.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
402.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
403	vst1.8	{q6},[r1]!
404Lenter_cbc_enc128:
405.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
406.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
407	subs	r2,r2,#16
408.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
409.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
410	moveq	r8,#0
411.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
412.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
413.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
414.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
415.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
416.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
417	vld1.8	{q8},[r0],r8
418.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
419.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
420.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
421.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
422.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
423.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
424	veor	q8,q8,q5
425.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
426	veor	q6,q0,q7
427	bhs	Loop_cbc_enc128
428
429	vst1.8	{q6},[r1]!
430	b	Lcbc_done
@ ---- Decryption ----
@ Up to three blocks are processed in parallel in q0, q1 and q10.  Copies of
@ the ciphertext are kept in q2, q3 and q11 for the chaining XOR, and q6
@ holds the block preceding the current group (initially the IV).
@ r2 is biased by a further -32 and r6 = r5 + 2 counts the looped rounds.
431.align	5
432Lcbc_dec:
433	vld1.8	{q10},[r0]!
434	subs	r2,r2,#32		@ bias
435	add	r6,r5,#2
436	vorr	q3,q0,q0
437	vorr	q1,q0,q0
438	vorr	q11,q10,q10
@ Fewer than three blocks in total: handle them in the tail.
439	blo	Lcbc_dec_tail
440
441	vorr	q1,q10,q10
442	vld1.8	{q10},[r0]!
443	vorr	q2,q0,q0
444	vorr	q3,q1,q1
445	vorr	q11,q10,q10
446
@ Looped rounds, two per iteration, applied to all three blocks while the
@ next pair of round keys is loaded through r7.
447Loop3x_cbc_dec:
448.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
449.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
450.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
451.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
452.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
453.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
454	vld1.32	{q8},[r7]!
455	subs	r6,r6,#2
456.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
457.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
458.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
459.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
460.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
461.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
462	vld1.32	{q9},[r7]!
463	bgt	Loop3x_cbc_dec
464
465.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
466.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
467.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
468.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
469.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
470.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
@ q4, q5 and (below) q9 = chaining value XOR last round key for each of the
@ three blocks.  The flags from this subs are consumed by movlo here and by
@ the bhs that closes the 3x loop.
471	veor	q4,q6,q7
472	subs	r2,r2,#0x30
473	veor	q5,q2,q7
474	movlo	r6,r2			@ r6, r6, is zero at this point
475.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
476.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
477.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
478.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
479.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
480.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
481	veor	q9,q3,q7
482	add	r0,r0,r6		@ r0 is adjusted in such way that
483					@ at exit from the loop q1-q10
484					@ are loaded with last "words"
@ q6 = last ciphertext block of this group, i.e. the next chaining value.
@ r7 is reset to the start of the key schedule.
485	vorr	q6,q11,q11
486	mov	r7,r3
487.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
488.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
489.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
490.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
491.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
492.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
493	vld1.8	{q2},[r0]!
494.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
495.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
496.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
497.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
498.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
499.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
500	vld1.8	{q3},[r0]!
501.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
502.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
503.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
504.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
505.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
506.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
507	vld1.8	{q11},[r0]!
508.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
509.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
510.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
511	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
512	add	r6,r5,#2
@ Finish the three plaintext blocks, store them, and move the three blocks
@ just loaded (q2, q3, q11) into the working registers.
513	veor	q4,q4,q0
514	veor	q5,q5,q1
515	veor	q10,q10,q9
516	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
517	vst1.8	{q4},[r1]!
518	vorr	q0,q2,q2
519	vst1.8	{q5},[r1]!
520	vorr	q1,q3,q3
521	vst1.8	{q10},[r1]!
522	vorr	q10,q11,q11
523	bhs	Loop3x_cbc_dec
524
@ r2 == -0x30 means the input was consumed exactly by the 3x loop.
525	cmn	r2,#0x30
526	beq	Lcbc_done
527	nop
528
@ Tail: one or two remaining blocks, processed in q1 and q10.
529Lcbc_dec_tail:
530.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
531.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
532.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
533.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
534	vld1.32	{q8},[r7]!
535	subs	r6,r6,#2
536.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
537.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
538.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
539.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
540	vld1.32	{q9},[r7]!
541	bgt	Lcbc_dec_tail
542
543.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
544.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
545.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
546.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
547.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
548.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
549.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
550.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
551.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
552.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
553.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
554.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
@ Z is set when r2 == -0x20, i.e. exactly one block remains; it is tested
@ by beq Lcbc_dec_one after the remaining rounds.
555	cmn	r2,#0x20
556.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
557.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
558.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
559.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
560	veor	q5,q6,q7
561.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
562.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
563.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
564.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
565	veor	q9,q3,q7
566.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
567.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
568	beq	Lcbc_dec_one
@ Two blocks: q1 pairs with the previous chaining value (q5), q10 with the
@ first of the two ciphertext blocks (q9).
569	veor	q5,q5,q1
570	veor	q9,q9,q10
571	vorr	q6,q11,q11
572	vst1.8	{q5},[r1]!
573	vst1.8	{q9},[r1]!
574	b	Lcbc_done
575
@ One block: only the q10 result is used.
576Lcbc_dec_one:
577	veor	q5,q5,q10
578	vorr	q6,q11,q11
579	vst1.8	{q5},[r1]!
580
@ Write the final chaining value back through the IV pointer.
581Lcbc_done:
582	vst1.8	{q6},[r4]
583Lcbc_abort:
584	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
585	ldmia	sp!,{r4,r5,r6,r7,r8,pc}
586
@ _aes_hw_ctr32_encrypt_blocks - CTR-mode processing of whole blocks in
@ which only the last 32-bit word of the counter block is incremented.
@
@ In:   r0   = input pointer
@       r1   = output pointer
@       r2   = number of 16-byte blocks; zero returns immediately
@       r3   = key schedule (round count read from offset 240)
@       [sp] = pointer to the 16-byte counter block; it is only read here,
@              the advanced counter is not written back
@ Saves and restores r4-r10, lr and d8-d15.  r0 is not set to a return value.
@
@ Register roles:
@   r8  = counter word in host byte order (loaded from offset 12 of the
@         counter block and byte-reversed unless __ARMEB__ is defined)
@   r5  = round count - 6, r6 = working copy used as round-loop counter
@   r7  = round-key pointer, r12 = input stride in the tail (16 or 0)
@   q6  = copy of the counter block used as template for new counters
@   q8, q9 = round keys 0 and 1 (reloaded as the rounds progress)
@   q12-q15, q7 = last five round keys
587.globl	_aes_hw_ctr32_encrypt_blocks
588.private_extern	_aes_hw_ctr32_encrypt_blocks
589#ifdef __thumb2__
590.thumb_func	_aes_hw_ctr32_encrypt_blocks
591#endif
592.align	5
593_aes_hw_ctr32_encrypt_blocks:
@ Guard against a zero block count: without it the bls below would enter
@ Lctr32_tail, whose cmp r2,#1 does not match zero, so both tail stores
@ would run and 32 bytes would be written for an empty request.  Nothing
@ has been pushed yet, so a plain return is sufficient.
	cmp	r2,#0
	bxeq	lr
594	mov	ip,sp
595	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
596	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
597	ldr	r4, [ip]		@ load remaining arg
598	ldr	r5,[r3,#240]
599
600	ldr	r8, [r4, #12]
601	vld1.32	{q0},[r4]
602
603	vld1.32	{q8,q9},[r3]		@ load key schedule...
604	sub	r5,r5,#4
605	mov	r12,#16
@ The flags from this compare are used by movlo and by bls Lctr32_tail
@ below; nothing in between updates them.
606	cmp	r2,#2
607	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
608	sub	r5,r5,#2
609	vld1.32	{q12,q13},[r7]!
610	vld1.32	{q14,q15},[r7]!
611	vld1.32	{q7},[r7]
612	add	r7,r3,#32
613	mov	r6,r5
@ A single block: stride 0 so the second tail load re-reads the same block.
614	movlo	r12,#0
615#ifndef __ARMEB__
616	rev	r8, r8
617#endif
@ q0 = counter, q1 = counter + 1 (lane 3 replaced), q6 = template copy.
@ r8 is left at counter + 2.
618	vorr	q1,q0,q0
619	add	r10, r8, #1
620	vorr	q10,q0,q0
621	add	r8, r8, #2
622	vorr	q6,q0,q0
623	rev	r10, r10
624	vmov.32	d3[1],r10
@ One or two blocks in total go straight to the tail.
625	bls	Lctr32_tail
@ Three or more: q10 = counter + 2; r2 is biased by -3 for the loop test.
626	rev	r12, r8
627	sub	r2,r2,#3		@ bias
628	vmov.32	d21[1],r12
629	b	Loop3x_ctr32
630
@ Main loop: three counter blocks (q0, q1, q10) are encrypted in parallel.
631.align	4
632Loop3x_ctr32:
633.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
634.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
635.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
636.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
637.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
638.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
639	vld1.32	{q8},[r7]!
640	subs	r6,r6,#2
641.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
642.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
643.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
644.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
645.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
646.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
647	vld1.32	{q9},[r7]!
648	bgt	Loop3x_ctr32
649
@ From here the three states move to q4, q5 and q9 so that q0, q1 and q10
@ can be refilled from the q6 template with the next three counter values
@ while the remaining rounds finish.  Input blocks are loaded into q2, q3
@ and q11 and pre-XORed with the last round key q7.
650.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
651.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
652.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
653.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
654	vld1.8	{q2},[r0]!
655	vorr	q0,q6,q6
656.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
657.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
658	vld1.8	{q3},[r0]!
659	vorr	q1,q6,q6
660.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
661.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
662.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
663.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
664	vld1.8	{q11},[r0]!
665	mov	r7,r3
666.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
667.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
668	vorr	q10,q6,q6
@ Next counters: r9 = r8 + 1, r10 = r8 + 2, and r8 itself advances by 3.
669	add	r9,r8,#1
670.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
671.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
672.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
673.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
674	veor	q2,q2,q7
675	add	r10,r8,#2
676.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
677.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
678	veor	q3,q3,q7
679	add	r8,r8,#3
680.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
681.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
682.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
683.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
684	veor	q11,q11,q7
685	rev	r9,r9
686.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
687.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
688	vmov.32	d1[1], r9
689	rev	r10,r10
690.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
691.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
692.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
693.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
694	vmov.32	d3[1], r10
695	rev	r12,r8
696.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
697.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
698	vmov.32	d21[1], r12
@ The flags from this subs are consumed by the bhs that closes the loop.
699	subs	r2,r2,#3
700.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
701.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
702.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
703
704	veor	q2,q2,q4
705	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
706	vst1.8	{q2},[r1]!
707	veor	q3,q3,q5
708	mov	r6,r5
709	vst1.8	{q3},[r1]!
710	veor	q11,q11,q9
711	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
712	vst1.8	{q11},[r1]!
713	bhs	Loop3x_ctr32
714
@ Undo the bias: r2 = blocks still to do (0, 1 or 2).  r12 becomes the
@ tail input stride: 0 when a single block is left, 16 otherwise.
715	adds	r2,r2,#3
716	beq	Lctr32_done
717	cmp	r2,#1
718	mov	r12,#16
719	moveq	r12,#0
720
@ Tail: one or two blocks, using the counters already prepared in q0, q1.
721Lctr32_tail:
722.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
723.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
724.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
725.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
726	vld1.32	{q8},[r7]!
727	subs	r6,r6,#2
728.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
729.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
730.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
731.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
732	vld1.32	{q9},[r7]!
733	bgt	Lctr32_tail
734
735.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
736.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
737.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
738.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
739.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
740.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
741.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
742.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
@ With r12 == 0 both loads read the same (only) input block.
743	vld1.8	{q2},[r0],r12
744.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
745.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
746.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
747.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
748	vld1.8	{q3},[r0]
749.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
750.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
751.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
752.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
753	veor	q2,q2,q7
754.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
755.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
756.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
757.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
758	veor	q3,q3,q7
759.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
760.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
761
@ The first block is always stored; the second only when r2 is not 1.
762	cmp	r2,#1
763	veor	q2,q2,q0
764	veor	q3,q3,q1
765	vst1.8	{q2},[r1]!
766	beq	Lctr32_done
767	vst1.8	{q3},[r1]
768
769Lctr32_done:
770	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
771	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
772
773#endif
774