1#if defined(__aarch64__)
2#include <openssl/arm_arch.h>
3
4.text
5
6
7
// Constant pool shared by the ChaCha20 routines in this file.
// It is emitted after the .text directive above and is addressed with adr.
8.align	5
// .Lsigma: first 16 bytes of the cipher state. Laid out least-significant
// byte first, the two quads spell the ASCII text: expand 32-byte k
9.Lsigma:
10.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// .Lone: the four 32-bit words {1,0,0,0}. The NEON code adds this vector to
// the counter row to step its first word, and shifts it left to obtain 4.
11.Lone:
12.long	1,0,0,0
// .LOPENSSL_armcap_P: distance from this location to OPENSSL_armcap_P
// (32 bits wide under __ILP32__, 64 bits otherwise). ChaCha20_ctr32 adds the
// stored distance to the address of this label to read the capability word.
13.LOPENSSL_armcap_P:
14#ifdef	__ILP32__
15.long	OPENSSL_armcap_P-.
16#else
17.quad	OPENSSL_armcap_P-.
18#endif
// NUL-terminated ASCII identification string:
// ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>
19.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Restore 4-byte alignment after the odd-length string.
20.align	2
21
// ChaCha20_ctr32: XOR a buffer with ChaCha20 keystream (scalar code path).
//
// Registers as consumed by the code below:
//   x0 - output pointer (written 64 bytes at a time, then byte by byte)
//   x1 - input pointer
//   x2 - length in bytes; zero returns immediately
//   x3 - pointer to the 32-byte key
//   x4 - pointer to the 16-byte counter block
// NOTE(review): presumably the C prototype is (out, in, len, key, counter);
// confirm against the header that declares this symbol.
//
// Lengths of 192 bytes or more are handed to ChaCha20_neon when the
// ARMV7_NEON bit is set in OPENSSL_armcap_P. Everything else is processed
// here, one 64-byte block per .Loop_outer iteration.
//
// Long-lived registers: x22-x23 sigma, x24-x27 key, x28 and x30 counter block.
// x30 is reused as data; the return address is reloaded from the frame
// before ret. Working state lives in w5-w17 and w19-w21 (x18 is not used).
//
// Stack: a 96-byte frame holding x29, x30 and x19-x28, plus 64 bytes of
// scratch below it that the tail code uses for one keystream block.
22.globl	ChaCha20_ctr32
23.hidden	ChaCha20_ctr32
24.type	ChaCha20_ctr32,%function
25.align	5
26ChaCha20_ctr32:
// Zero length: nothing to do.
27	cbz	x2,.Labort
// Short inputs (below 192 bytes) never take the NEON path.
28	adr	x5,.LOPENSSL_armcap_P
29	cmp	x2,#192
30	b.lo	.Lshort
// x6 = stored distance to OPENSSL_armcap_P, x5 = address of that distance,
// so [x6,x5] is the capability word itself.
31#ifdef	__ILP32__
32	ldrsw	x6,[x5]
33#else
34	ldr	x6,[x5]
35#endif
36	ldr	w17,[x6,x5]
37	tst	w17,#ARMV7_NEON
38	b.ne	ChaCha20_neon
39
40.Lshort:
// Prologue: build the 96-byte frame and save x19-x28.
41	stp	x29,x30,[sp,#-96]!
42	add	x29,sp,#0
43
44	adr	x5,.Lsigma
45	stp	x19,x20,[sp,#16]
46	stp	x21,x22,[sp,#32]
47	stp	x23,x24,[sp,#48]
48	stp	x25,x26,[sp,#64]
49	stp	x27,x28,[sp,#80]
// 64 bytes of scratch for the partial-block keystream (see .Less_than_64).
50	sub	sp,sp,#64
51
// Load the 64-byte input state, two 32-bit words per 64-bit register.
52	ldp	x22,x23,[x5]		// load sigma
53	ldp	x24,x25,[x3]		// load key
54	ldp	x26,x27,[x3,#16]
55	ldp	x28,x30,[x4]		// load counter
// Big-endian builds: swap the two 32-bit halves of every loaded pair.
56#ifdef	__ARMEB__
57	ror	x24,x24,#32
58	ror	x25,x25,#32
59	ror	x26,x26,#32
60	ror	x27,x27,#32
61	ror	x28,x28,#32
62	ror	x30,x30,#32
63#endif
64
// One iteration per 64-byte block.
65.Loop_outer:
// Split each packed pair into two 32-bit working words:
//   row a: w5-w8   row b: w9-w12   row c: w13-w16   row d: w17,w19,w20,w21
66	mov	w5,w22			// unpack key block
67	lsr	x6,x22,#32
68	mov	w7,w23
69	lsr	x8,x23,#32
70	mov	w9,w24
71	lsr	x10,x24,#32
72	mov	w11,w25
73	lsr	x12,x25,#32
74	mov	w13,w26
75	lsr	x14,x26,#32
76	mov	w15,w27
77	lsr	x16,x27,#32
78	mov	w17,w28
79	lsr	x19,x28,#32
80	mov	w20,w30
81	lsr	x21,x30,#32
82
// x4 is reused as the round counter: 10 iterations of two rounds each.
83	mov	x4,#10
// The flags set here are consumed by b.lo .Ltail and b.hi .Loop_outer much
// further down. Nothing in between sets flags, and nothing may be added
// there that does.
84	subs	x2,x2,#64
85.Loop:
86	sub	x4,x4,#1
// Column round, four quarter-rounds side by side.
// Step 1: a += b; d ^= a; d = rotate-left(d,16)
87	add	w5,w5,w9
88	add	w6,w6,w10
89	add	w7,w7,w11
90	add	w8,w8,w12
91	eor	w17,w17,w5
92	eor	w19,w19,w6
93	eor	w20,w20,w7
94	eor	w21,w21,w8
95	ror	w17,w17,#16
96	ror	w19,w19,#16
97	ror	w20,w20,#16
98	ror	w21,w21,#16
// Step 2: c += d; b ^= c; b = rotate-left(b,12), written as ror #20
99	add	w13,w13,w17
100	add	w14,w14,w19
101	add	w15,w15,w20
102	add	w16,w16,w21
103	eor	w9,w9,w13
104	eor	w10,w10,w14
105	eor	w11,w11,w15
106	eor	w12,w12,w16
107	ror	w9,w9,#20
108	ror	w10,w10,#20
109	ror	w11,w11,#20
110	ror	w12,w12,#20
// Step 3: a += b; d ^= a; d = rotate-left(d,8), written as ror #24
111	add	w5,w5,w9
112	add	w6,w6,w10
113	add	w7,w7,w11
114	add	w8,w8,w12
115	eor	w17,w17,w5
116	eor	w19,w19,w6
117	eor	w20,w20,w7
118	eor	w21,w21,w8
119	ror	w17,w17,#24
120	ror	w19,w19,#24
121	ror	w20,w20,#24
122	ror	w21,w21,#24
// Step 4: c += d; b ^= c; b = rotate-left(b,7), written as ror #25
123	add	w13,w13,w17
124	add	w14,w14,w19
125	add	w15,w15,w20
126	add	w16,w16,w21
127	eor	w9,w9,w13
128	eor	w10,w10,w14
129	eor	w11,w11,w15
130	eor	w12,w12,w16
131	ror	w9,w9,#25
132	ror	w10,w10,#25
133	ror	w11,w11,#25
134	ror	w12,w12,#25
// Diagonal round: the same four steps, with b, c and d taken one, two and
// three positions further along their rows (a stays in place).
135	add	w5,w5,w10
136	add	w6,w6,w11
137	add	w7,w7,w12
138	add	w8,w8,w9
139	eor	w21,w21,w5
140	eor	w17,w17,w6
141	eor	w19,w19,w7
142	eor	w20,w20,w8
143	ror	w21,w21,#16
144	ror	w17,w17,#16
145	ror	w19,w19,#16
146	ror	w20,w20,#16
147	add	w15,w15,w21
148	add	w16,w16,w17
149	add	w13,w13,w19
150	add	w14,w14,w20
151	eor	w10,w10,w15
152	eor	w11,w11,w16
153	eor	w12,w12,w13
154	eor	w9,w9,w14
155	ror	w10,w10,#20
156	ror	w11,w11,#20
157	ror	w12,w12,#20
158	ror	w9,w9,#20
159	add	w5,w5,w10
160	add	w6,w6,w11
161	add	w7,w7,w12
162	add	w8,w8,w9
163	eor	w21,w21,w5
164	eor	w17,w17,w6
165	eor	w19,w19,w7
166	eor	w20,w20,w8
167	ror	w21,w21,#24
168	ror	w17,w17,#24
169	ror	w19,w19,#24
170	ror	w20,w20,#24
171	add	w15,w15,w21
172	add	w16,w16,w17
173	add	w13,w13,w19
174	add	w14,w14,w20
175	eor	w10,w10,w15
176	eor	w11,w11,w16
177	eor	w12,w12,w13
178	eor	w9,w9,w14
179	ror	w10,w10,#25
180	ror	w11,w11,#25
181	ror	w12,w12,#25
182	ror	w9,w9,#25
183	cbnz	x4,.Loop
184
// Add the input state back in. Even words use a 32-bit add. Odd words use a
// 64-bit add of the shifted-down high half; any carry above bit 31 is
// shifted out again by the lsl#32 in the pack step.
185	add	w5,w5,w22		// accumulate key block
186	add	x6,x6,x22,lsr#32
187	add	w7,w7,w23
188	add	x8,x8,x23,lsr#32
189	add	w9,w9,w24
190	add	x10,x10,x24,lsr#32
191	add	w11,w11,w25
192	add	x12,x12,x25,lsr#32
193	add	w13,w13,w26
194	add	x14,x14,x26,lsr#32
195	add	w15,w15,w27
196	add	x16,x16,x27,lsr#32
197	add	w17,w17,w28
198	add	x19,x19,x28,lsr#32
199	add	w20,w20,w30
200	add	x21,x21,x30,lsr#32
201
// Flags from the subs above: fewer than 64 bytes were left, so take the
// byte-wise tail.
202	b.lo	.Ltail
203
// Full block: merge word pairs into 64-bit registers while loading 64
// input bytes into the registers that are freed up.
204	add	x5,x5,x6,lsl#32	// pack
205	add	x7,x7,x8,lsl#32
206	ldp	x6,x8,[x1,#0]		// load input
207	add	x9,x9,x10,lsl#32
208	add	x11,x11,x12,lsl#32
209	ldp	x10,x12,[x1,#16]
210	add	x13,x13,x14,lsl#32
211	add	x15,x15,x16,lsl#32
212	ldp	x14,x16,[x1,#32]
213	add	x17,x17,x19,lsl#32
214	add	x20,x20,x21,lsl#32
215	ldp	x19,x21,[x1,#48]
216	add	x1,x1,#64
// Big-endian builds: byte-reverse the packed keystream before the XOR.
217#ifdef	__ARMEB__
218	rev	x5,x5
219	rev	x7,x7
220	rev	x9,x9
221	rev	x11,x11
222	rev	x13,x13
223	rev	x15,x15
224	rev	x17,x17
225	rev	x20,x20
226#endif
// output = keystream XOR input
227	eor	x5,x5,x6
228	eor	x7,x7,x8
229	eor	x9,x9,x10
230	eor	x11,x11,x12
231	eor	x13,x13,x14
232	eor	x15,x15,x16
233	eor	x17,x17,x19
234	eor	x20,x20,x21
235
// The counter step below is a 64-bit add on x28, so a wrap of the low word
// would carry into the word held in the upper half.
236	stp	x5,x7,[x0,#0]		// store output
237	add	x28,x28,#1			// increment counter
238	stp	x9,x11,[x0,#16]
239	stp	x13,x15,[x0,#32]
240	stp	x17,x20,[x0,#48]
241	add	x0,x0,#64
242
// Still the flags from subs: loop while bytes remain after this block.
243	b.hi	.Loop_outer
244
// Epilogue: registers are restored relative to x29, so sp can be released
// in between.
245	ldp	x19,x20,[x29,#16]
246	add	sp,sp,#64
247	ldp	x21,x22,[x29,#32]
248	ldp	x23,x24,[x29,#48]
249	ldp	x25,x26,[x29,#64]
250	ldp	x27,x28,[x29,#80]
251	ldp	x29,x30,[sp],#96
252.Labort:
253	ret
254
255.align	4
256.Ltail:
// Undo the subs so that x2 is again the number of bytes left (below 64).
257	add	x2,x2,#64
// Also entered from ChaCha20_neon with the scalar state in w5-w21 and the
// remaining byte count in x2.
258.Less_than_64:
// Bias the pointers so one negative index that counts up to zero serves all
// three buffers: x1 = in+len, x4 = scratch+len, x0 = out+len-1 (the store
// happens after the index has been incremented), x2 = -len.
259	sub	x0,x0,#1
260	add	x1,x1,x2
261	add	x0,x0,x2
262	add	x4,sp,x2
263	neg	x2,x2
264
265	add	x5,x5,x6,lsl#32	// pack
266	add	x7,x7,x8,lsl#32
267	add	x9,x9,x10,lsl#32
268	add	x11,x11,x12,lsl#32
269	add	x13,x13,x14,lsl#32
270	add	x15,x15,x16,lsl#32
271	add	x17,x17,x19,lsl#32
272	add	x20,x20,x21,lsl#32
273#ifdef	__ARMEB__
274	rev	x5,x5
275	rev	x7,x7
276	rev	x9,x9
277	rev	x11,x11
278	rev	x13,x13
279	rev	x15,x15
280	rev	x17,x17
281	rev	x20,x20
282#endif
// Spill the keystream block to the stack scratch for byte access.
283	stp	x5,x7,[sp,#0]
284	stp	x9,x11,[sp,#16]
285	stp	x13,x15,[sp,#32]
286	stp	x17,x20,[sp,#48]
287
// Byte-wise XOR of the remaining input with the spilled keystream.
288.Loop_tail:
289	ldrb	w10,[x1,x2]
290	ldrb	w11,[x4,x2]
291	add	x2,x2,#1
292	eor	w10,w10,w11
293	strb	w10,[x0,x2]
294	cbnz	x2,.Loop_tail
295
// Overwrite the stack copy of the keystream with zeros.
296	stp	xzr,xzr,[sp,#0]
297	stp	xzr,xzr,[sp,#16]
298	stp	xzr,xzr,[sp,#32]
299	stp	xzr,xzr,[sp,#48]
300
// Epilogue, same sequence as on the full-block path.
301	ldp	x19,x20,[x29,#16]
302	add	sp,sp,#64
303	ldp	x21,x22,[x29,#32]
304	ldp	x23,x24,[x29,#48]
305	ldp	x25,x26,[x29,#64]
306	ldp	x27,x28,[x29,#80]
307	ldp	x29,x30,[sp],#96
308	ret
309.size	ChaCha20_ctr32,.-ChaCha20_ctr32
310
// ChaCha20_neon: NEON-assisted path, reached by a branch from ChaCha20_ctr32
// with x0-x4 unchanged (x0 output, x1 input, x2 length, x3 key pointer,
// x4 counter-block pointer). It has no .globl directive, so the symbol is
// local to this file.
//
// Each .Loop_outer_neon iteration produces 256 bytes: one 64-byte block in
// the scalar registers (same layout as in ChaCha20_ctr32) interleaved with
// three blocks in vector registers:
//   block 1: v0-v3    block 2: v4-v7    block 3: v16-v19   (rows a,b,c,d)
//   v20-v23: temporaries and input data
//   v24: sigma   v25,v26: key   v27,v28,v29: counter rows for blocks 1,2,3
//   v31: counter increment vector
// The scalar block uses the counter as loaded; the vector blocks use
// counter+1, +2 and +3. Output order is scalar block, then blocks 1, 2, 3.
//
// Lengths of 512 bytes or more leave this function for .L512_or_more_neon,
// which sits inside ChaCha20_512_neon just after a prologue identical to
// the one executed here.
//
// Only v0-v7 and v16-v31 are written, so d8-d15 need no saving here.
311.type	ChaCha20_neon,%function
312.align	5
313ChaCha20_neon:
// Prologue: 96-byte frame holding x29, x30 and x19-x28.
314	stp	x29,x30,[sp,#-96]!
315	add	x29,sp,#0
316
317	adr	x5,.Lsigma
318	stp	x19,x20,[sp,#16]
319	stp	x21,x22,[sp,#32]
320	stp	x23,x24,[sp,#48]
321	stp	x25,x26,[sp,#64]
322	stp	x27,x28,[sp,#80]
323	cmp	x2,#512
324	b.hs	.L512_or_more_neon
325
// 64 bytes of scratch for the final partial block.
326	sub	sp,sp,#64
327
// Load the state twice: packed into x22-x28 and x30 for the scalar block,
// and as vectors for the NEON blocks. The post-indexed load of sigma leaves
// x5 pointing at .Lone, which is then loaded into v31.
328	ldp	x22,x23,[x5]		// load sigma
329	ld1	{v24.4s},[x5],#16
330	ldp	x24,x25,[x3]		// load key
331	ldp	x26,x27,[x3,#16]
332	ld1	{v25.4s,v26.4s},[x3]
333	ldp	x28,x30,[x4]		// load counter
334	ld1	{v27.4s},[x4]
335	ld1	{v31.4s},[x5]
// Big-endian builds: swap the 32-bit halves of the scalar key and counter
// pairs, and swap the 32-bit words within each half of the sigma vector.
336#ifdef	__ARMEB__
337	rev64	v24.4s,v24.4s
338	ror	x24,x24,#32
339	ror	x25,x25,#32
340	ror	x26,x26,#32
341	ror	x27,x27,#32
342	ror	x28,x28,#32
343	ror	x30,x30,#32
344#endif
// Counter rows for the three vector blocks: +1, +2, +3 relative to the
// scalar block. Afterwards v31 becomes {4,0,0,0}, the per-iteration step.
345	add	v27.4s,v27.4s,v31.4s		// += 1
346	add	v28.4s,v27.4s,v31.4s
347	add	v29.4s,v28.4s,v31.4s
348	shl	v31.4s,v31.4s,#2			// 1 -> 4
349
350.Loop_outer_neon:
// Initialise all four working states from the saved input state. Scalar
// and vector instructions alternate throughout this function.
351	mov	w5,w22			// unpack key block
352	lsr	x6,x22,#32
353	mov	v0.16b,v24.16b
354	mov	w7,w23
355	lsr	x8,x23,#32
356	mov	v4.16b,v24.16b
357	mov	w9,w24
358	lsr	x10,x24,#32
359	mov	v16.16b,v24.16b
360	mov	w11,w25
361	mov	v1.16b,v25.16b
362	lsr	x12,x25,#32
363	mov	v5.16b,v25.16b
364	mov	w13,w26
365	mov	v17.16b,v25.16b
366	lsr	x14,x26,#32
367	mov	v3.16b,v27.16b
368	mov	w15,w27
369	mov	v7.16b,v28.16b
370	lsr	x16,x27,#32
371	mov	v19.16b,v29.16b
372	mov	w17,w28
373	mov	v2.16b,v26.16b
374	lsr	x19,x28,#32
375	mov	v6.16b,v26.16b
376	mov	w20,w30
377	mov	v18.16b,v26.16b
378	lsr	x21,x30,#32
379
// x4 is reused as the round counter: 10 iterations of two rounds each.
380	mov	x4,#10
// The flags set here feed b.lo .Ltail_neon and b.hi .Loop_outer_neon below.
// Nothing in between sets flags.
381	subs	x2,x2,#256
// Round loop. The scalar stream is the same sequence as .Loop in
// ChaCha20_ctr32. The vector stream applies the same steps to whole rows:
//   rev32 on .8h lanes         = rotate each 32-bit lane by 16
//   ushr #20 then sli #12      = rotate left by 12
//   ushr #24 then sli #8       = rotate left by 8
//   ushr #25 then sli #7       = rotate left by 7
// with v20-v22 holding the value being rotated.
382.Loop_neon:
383	sub	x4,x4,#1
// First half: column round.
384	add	v0.4s,v0.4s,v1.4s
385	add	w5,w5,w9
386	add	v4.4s,v4.4s,v5.4s
387	add	w6,w6,w10
388	add	v16.4s,v16.4s,v17.4s
389	add	w7,w7,w11
390	eor	v3.16b,v3.16b,v0.16b
391	add	w8,w8,w12
392	eor	v7.16b,v7.16b,v4.16b
393	eor	w17,w17,w5
394	eor	v19.16b,v19.16b,v16.16b
395	eor	w19,w19,w6
396	rev32	v3.8h,v3.8h
397	eor	w20,w20,w7
398	rev32	v7.8h,v7.8h
399	eor	w21,w21,w8
400	rev32	v19.8h,v19.8h
401	ror	w17,w17,#16
402	add	v2.4s,v2.4s,v3.4s
403	ror	w19,w19,#16
404	add	v6.4s,v6.4s,v7.4s
405	ror	w20,w20,#16
406	add	v18.4s,v18.4s,v19.4s
407	ror	w21,w21,#16
408	eor	v20.16b,v1.16b,v2.16b
409	add	w13,w13,w17
410	eor	v21.16b,v5.16b,v6.16b
411	add	w14,w14,w19
412	eor	v22.16b,v17.16b,v18.16b
413	add	w15,w15,w20
414	ushr	v1.4s,v20.4s,#20
415	add	w16,w16,w21
416	ushr	v5.4s,v21.4s,#20
417	eor	w9,w9,w13
418	ushr	v17.4s,v22.4s,#20
419	eor	w10,w10,w14
420	sli	v1.4s,v20.4s,#12
421	eor	w11,w11,w15
422	sli	v5.4s,v21.4s,#12
423	eor	w12,w12,w16
424	sli	v17.4s,v22.4s,#12
425	ror	w9,w9,#20
426	add	v0.4s,v0.4s,v1.4s
427	ror	w10,w10,#20
428	add	v4.4s,v4.4s,v5.4s
429	ror	w11,w11,#20
430	add	v16.4s,v16.4s,v17.4s
431	ror	w12,w12,#20
432	eor	v20.16b,v3.16b,v0.16b
433	add	w5,w5,w9
434	eor	v21.16b,v7.16b,v4.16b
435	add	w6,w6,w10
436	eor	v22.16b,v19.16b,v16.16b
437	add	w7,w7,w11
438	ushr	v3.4s,v20.4s,#24
439	add	w8,w8,w12
440	ushr	v7.4s,v21.4s,#24
441	eor	w17,w17,w5
442	ushr	v19.4s,v22.4s,#24
443	eor	w19,w19,w6
444	sli	v3.4s,v20.4s,#8
445	eor	w20,w20,w7
446	sli	v7.4s,v21.4s,#8
447	eor	w21,w21,w8
448	sli	v19.4s,v22.4s,#8
449	ror	w17,w17,#24
450	add	v2.4s,v2.4s,v3.4s
451	ror	w19,w19,#24
452	add	v6.4s,v6.4s,v7.4s
453	ror	w20,w20,#24
454	add	v18.4s,v18.4s,v19.4s
455	ror	w21,w21,#24
456	eor	v20.16b,v1.16b,v2.16b
457	add	w13,w13,w17
458	eor	v21.16b,v5.16b,v6.16b
459	add	w14,w14,w19
460	eor	v22.16b,v17.16b,v18.16b
461	add	w15,w15,w20
462	ushr	v1.4s,v20.4s,#25
463	add	w16,w16,w21
464	ushr	v5.4s,v21.4s,#25
465	eor	w9,w9,w13
466	ushr	v17.4s,v22.4s,#25
467	eor	w10,w10,w14
468	sli	v1.4s,v20.4s,#7
469	eor	w11,w11,w15
470	sli	v5.4s,v21.4s,#7
471	eor	w12,w12,w16
472	sli	v17.4s,v22.4s,#7
473	ror	w9,w9,#25
// Rotate the vector rows so the next pass works on diagonals:
// row c by 8 bytes, row d by 12 bytes, row b by 4 bytes.
474	ext	v2.16b,v2.16b,v2.16b,#8
475	ror	w10,w10,#25
476	ext	v6.16b,v6.16b,v6.16b,#8
477	ror	w11,w11,#25
478	ext	v18.16b,v18.16b,v18.16b,#8
479	ror	w12,w12,#25
480	ext	v3.16b,v3.16b,v3.16b,#12
481	ext	v7.16b,v7.16b,v7.16b,#12
482	ext	v19.16b,v19.16b,v19.16b,#12
483	ext	v1.16b,v1.16b,v1.16b,#4
484	ext	v5.16b,v5.16b,v5.16b,#4
485	ext	v17.16b,v17.16b,v17.16b,#4
// Second half: diagonal round. The scalar stream selects the diagonal by
// register choice; the vector stream relies on the ext rotation above.
486	add	v0.4s,v0.4s,v1.4s
487	add	w5,w5,w10
488	add	v4.4s,v4.4s,v5.4s
489	add	w6,w6,w11
490	add	v16.4s,v16.4s,v17.4s
491	add	w7,w7,w12
492	eor	v3.16b,v3.16b,v0.16b
493	add	w8,w8,w9
494	eor	v7.16b,v7.16b,v4.16b
495	eor	w21,w21,w5
496	eor	v19.16b,v19.16b,v16.16b
497	eor	w17,w17,w6
498	rev32	v3.8h,v3.8h
499	eor	w19,w19,w7
500	rev32	v7.8h,v7.8h
501	eor	w20,w20,w8
502	rev32	v19.8h,v19.8h
503	ror	w21,w21,#16
504	add	v2.4s,v2.4s,v3.4s
505	ror	w17,w17,#16
506	add	v6.4s,v6.4s,v7.4s
507	ror	w19,w19,#16
508	add	v18.4s,v18.4s,v19.4s
509	ror	w20,w20,#16
510	eor	v20.16b,v1.16b,v2.16b
511	add	w15,w15,w21
512	eor	v21.16b,v5.16b,v6.16b
513	add	w16,w16,w17
514	eor	v22.16b,v17.16b,v18.16b
515	add	w13,w13,w19
516	ushr	v1.4s,v20.4s,#20
517	add	w14,w14,w20
518	ushr	v5.4s,v21.4s,#20
519	eor	w10,w10,w15
520	ushr	v17.4s,v22.4s,#20
521	eor	w11,w11,w16
522	sli	v1.4s,v20.4s,#12
523	eor	w12,w12,w13
524	sli	v5.4s,v21.4s,#12
525	eor	w9,w9,w14
526	sli	v17.4s,v22.4s,#12
527	ror	w10,w10,#20
528	add	v0.4s,v0.4s,v1.4s
529	ror	w11,w11,#20
530	add	v4.4s,v4.4s,v5.4s
531	ror	w12,w12,#20
532	add	v16.4s,v16.4s,v17.4s
533	ror	w9,w9,#20
534	eor	v20.16b,v3.16b,v0.16b
535	add	w5,w5,w10
536	eor	v21.16b,v7.16b,v4.16b
537	add	w6,w6,w11
538	eor	v22.16b,v19.16b,v16.16b
539	add	w7,w7,w12
540	ushr	v3.4s,v20.4s,#24
541	add	w8,w8,w9
542	ushr	v7.4s,v21.4s,#24
543	eor	w21,w21,w5
544	ushr	v19.4s,v22.4s,#24
545	eor	w17,w17,w6
546	sli	v3.4s,v20.4s,#8
547	eor	w19,w19,w7
548	sli	v7.4s,v21.4s,#8
549	eor	w20,w20,w8
550	sli	v19.4s,v22.4s,#8
551	ror	w21,w21,#24
552	add	v2.4s,v2.4s,v3.4s
553	ror	w17,w17,#24
554	add	v6.4s,v6.4s,v7.4s
555	ror	w19,w19,#24
556	add	v18.4s,v18.4s,v19.4s
557	ror	w20,w20,#24
558	eor	v20.16b,v1.16b,v2.16b
559	add	w15,w15,w21
560	eor	v21.16b,v5.16b,v6.16b
561	add	w16,w16,w17
562	eor	v22.16b,v17.16b,v18.16b
563	add	w13,w13,w19
564	ushr	v1.4s,v20.4s,#25
565	add	w14,w14,w20
566	ushr	v5.4s,v21.4s,#25
567	eor	w10,w10,w15
568	ushr	v17.4s,v22.4s,#25
569	eor	w11,w11,w16
570	sli	v1.4s,v20.4s,#7
571	eor	w12,w12,w13
572	sli	v5.4s,v21.4s,#7
573	eor	w9,w9,w14
574	sli	v17.4s,v22.4s,#7
575	ror	w10,w10,#25
// Rotate the vector rows back to column order (the byte counts for rows b
// and d are swapped relative to the first half).
576	ext	v2.16b,v2.16b,v2.16b,#8
577	ror	w11,w11,#25
578	ext	v6.16b,v6.16b,v6.16b,#8
579	ror	w12,w12,#25
580	ext	v18.16b,v18.16b,v18.16b,#8
581	ror	w9,w9,#25
582	ext	v3.16b,v3.16b,v3.16b,#4
583	ext	v7.16b,v7.16b,v7.16b,#4
584	ext	v19.16b,v19.16b,v19.16b,#4
585	ext	v1.16b,v1.16b,v1.16b,#12
586	ext	v5.16b,v5.16b,v5.16b,#12
587	ext	v17.16b,v17.16b,v17.16b,#12
588	cbnz	x4,.Loop_neon
589
// Add the input state back into all four blocks. Each vector block gets
// its own counter row (v27, v28, v29). The scalar odd words use the same
// 64-bit add trick as in ChaCha20_ctr32.
590	add	w5,w5,w22		// accumulate key block
591	add	v0.4s,v0.4s,v24.4s
592	add	x6,x6,x22,lsr#32
593	add	v4.4s,v4.4s,v24.4s
594	add	w7,w7,w23
595	add	v16.4s,v16.4s,v24.4s
596	add	x8,x8,x23,lsr#32
597	add	v2.4s,v2.4s,v26.4s
598	add	w9,w9,w24
599	add	v6.4s,v6.4s,v26.4s
600	add	x10,x10,x24,lsr#32
601	add	v18.4s,v18.4s,v26.4s
602	add	w11,w11,w25
603	add	v3.4s,v3.4s,v27.4s
604	add	x12,x12,x25,lsr#32
605	add	w13,w13,w26
606	add	v7.4s,v7.4s,v28.4s
607	add	x14,x14,x26,lsr#32
608	add	w15,w15,w27
609	add	v19.4s,v19.4s,v29.4s
610	add	x16,x16,x27,lsr#32
611	add	w17,w17,w28
612	add	v1.4s,v1.4s,v25.4s
613	add	x19,x19,x28,lsr#32
614	add	w20,w20,w30
615	add	v5.4s,v5.4s,v25.4s
616	add	x21,x21,x30,lsr#32
617	add	v17.4s,v17.4s,v25.4s
618
// Flags from the subs above: fewer than 256 bytes were left.
619	b.lo	.Ltail_neon
620
// Full 256-byte group. Scalar block first: pack word pairs while loading
// the first 64 input bytes.
621	add	x5,x5,x6,lsl#32	// pack
622	add	x7,x7,x8,lsl#32
623	ldp	x6,x8,[x1,#0]		// load input
624	add	x9,x9,x10,lsl#32
625	add	x11,x11,x12,lsl#32
626	ldp	x10,x12,[x1,#16]
627	add	x13,x13,x14,lsl#32
628	add	x15,x15,x16,lsl#32
629	ldp	x14,x16,[x1,#32]
630	add	x17,x17,x19,lsl#32
631	add	x20,x20,x21,lsl#32
632	ldp	x19,x21,[x1,#48]
633	add	x1,x1,#64
634#ifdef	__ARMEB__
635	rev	x5,x5
636	rev	x7,x7
637	rev	x9,x9
638	rev	x11,x11
639	rev	x13,x13
640	rev	x15,x15
641	rev	x17,x17
642	rev	x20,x20
643#endif
// Input bytes 64-127 go into v20-v23 and are XORed into vector block 1.
644	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
645	eor	x5,x5,x6
646	eor	x7,x7,x8
647	eor	x9,x9,x10
648	eor	x11,x11,x12
649	eor	x13,x13,x14
650	eor	v0.16b,v0.16b,v20.16b
651	eor	x15,x15,x16
652	eor	v1.16b,v1.16b,v21.16b
653	eor	x17,x17,x19
654	eor	v2.16b,v2.16b,v22.16b
655	eor	x20,x20,x21
656	eor	v3.16b,v3.16b,v23.16b
// Input bytes 128-191, for vector block 2.
657	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
658
// Advance every counter by 4, one per block produced in this iteration.
659	stp	x5,x7,[x0,#0]		// store output
660	add	x28,x28,#4			// increment counter
661	stp	x9,x11,[x0,#16]
662	add	v27.4s,v27.4s,v31.4s		// += 4
663	stp	x13,x15,[x0,#32]
664	add	v28.4s,v28.4s,v31.4s
665	stp	x17,x20,[x0,#48]
666	add	v29.4s,v29.4s,v31.4s
667	add	x0,x0,#64
668
// Store block 1, then reuse v0-v3 for input bytes 192-255.
669	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
670	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
671
672	eor	v4.16b,v4.16b,v20.16b
673	eor	v5.16b,v5.16b,v21.16b
674	eor	v6.16b,v6.16b,v22.16b
675	eor	v7.16b,v7.16b,v23.16b
676	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
677
678	eor	v16.16b,v16.16b,v0.16b
679	eor	v17.16b,v17.16b,v1.16b
680	eor	v18.16b,v18.16b,v2.16b
681	eor	v19.16b,v19.16b,v3.16b
682	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
683
// Still the flags from subs: loop while bytes remain after this group.
684	b.hi	.Loop_outer_neon
685
// Epilogue for an input that ended exactly on a 256-byte boundary.
686	ldp	x19,x20,[x29,#16]
687	add	sp,sp,#64
688	ldp	x21,x22,[x29,#32]
689	ldp	x23,x24,[x29,#48]
690	ldp	x25,x26,[x29,#64]
691	ldp	x27,x28,[x29,#80]
692	ldp	x29,x30,[sp],#96
693	ret
694
// Fewer than 256 bytes remain. Four keystream blocks have already been
// computed; they are consumed in output order, 64 bytes at a time.
695.Ltail_neon:
// Undo the subs so that x2 is again the number of bytes left.
696	add	x2,x2,#256
697	cmp	x2,#64
// Under 64 bytes: the scalar block alone covers it. Reuse the byte-wise
// tail in ChaCha20_ctr32, which also restores registers and returns.
698	b.lo	.Less_than_64
699
700	add	x5,x5,x6,lsl#32	// pack
701	add	x7,x7,x8,lsl#32
702	ldp	x6,x8,[x1,#0]		// load input
703	add	x9,x9,x10,lsl#32
704	add	x11,x11,x12,lsl#32
705	ldp	x10,x12,[x1,#16]
706	add	x13,x13,x14,lsl#32
707	add	x15,x15,x16,lsl#32
708	ldp	x14,x16,[x1,#32]
709	add	x17,x17,x19,lsl#32
710	add	x20,x20,x21,lsl#32
711	ldp	x19,x21,[x1,#48]
712	add	x1,x1,#64
713#ifdef	__ARMEB__
714	rev	x5,x5
715	rev	x7,x7
716	rev	x9,x9
717	rev	x11,x11
718	rev	x13,x13
719	rev	x15,x15
720	rev	x17,x17
721	rev	x20,x20
722#endif
723	eor	x5,x5,x6
724	eor	x7,x7,x8
725	eor	x9,x9,x10
726	eor	x11,x11,x12
727	eor	x13,x13,x14
728	eor	x15,x15,x16
729	eor	x17,x17,x19
730	eor	x20,x20,x21
731
732	stp	x5,x7,[x0,#0]		// store output
733	add	x28,x28,#4			// increment counter
734	stp	x9,x11,[x0,#16]
735	stp	x13,x15,[x0,#32]
736	stp	x17,x20,[x0,#48]
737	add	x0,x0,#64
// Flags are still those of cmp x2,#64: exactly 64 bytes were left.
738	b.eq	.Ldone_neon
739	sub	x2,x2,#64
740	cmp	x2,#64
741	b.lo	.Less_than_128
742
// At least 64 more bytes: consume vector block 1.
743	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
744	eor	v0.16b,v0.16b,v20.16b
745	eor	v1.16b,v1.16b,v21.16b
746	eor	v2.16b,v2.16b,v22.16b
747	eor	v3.16b,v3.16b,v23.16b
748	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
749	b.eq	.Ldone_neon
750	sub	x2,x2,#64
751	cmp	x2,#64
752	b.lo	.Less_than_192
753
// At least 64 more bytes: consume vector block 2.
754	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
755	eor	v4.16b,v4.16b,v20.16b
756	eor	v5.16b,v5.16b,v21.16b
757	eor	v6.16b,v6.16b,v22.16b
758	eor	v7.16b,v7.16b,v23.16b
759	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
760	b.eq	.Ldone_neon
761	sub	x2,x2,#64
762
// The remainder is covered by vector block 3. In every case below, the
// keystream block that covers the partial remainder is spilled to the
// stack scratch for byte access.
763	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
764	b	.Last_neon
765
766.Less_than_128:
767	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
768	b	.Last_neon
769.Less_than_192:
770	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
771	b	.Last_neon
772
773.align	4
774.Last_neon:
// Same biased-pointer setup as .Less_than_64 in ChaCha20_ctr32:
// x1 = in+len, x4 = scratch+len, x0 = out+len-1, x2 = -len.
775	sub	x0,x0,#1
776	add	x1,x1,x2
777	add	x0,x0,x2
778	add	x4,sp,x2
779	neg	x2,x2
780
// Byte-wise XOR of the remaining input with the spilled keystream.
781.Loop_tail_neon:
782	ldrb	w10,[x1,x2]
783	ldrb	w11,[x4,x2]
784	add	x2,x2,#1
785	eor	w10,w10,w11
786	strb	w10,[x0,x2]
787	cbnz	x2,.Loop_tail_neon
788
// Overwrite the stack copy of the keystream with zeros.
789	stp	xzr,xzr,[sp,#0]
790	stp	xzr,xzr,[sp,#16]
791	stp	xzr,xzr,[sp,#32]
792	stp	xzr,xzr,[sp,#48]
793
// Common exit for the tail paths: restore x19-x28, release the scratch
// and the frame, and return.
794.Ldone_neon:
795	ldp	x19,x20,[x29,#16]
796	add	sp,sp,#64
797	ldp	x21,x22,[x29,#32]
798	ldp	x23,x24,[x29,#48]
799	ldp	x25,x26,[x29,#64]
800	ldp	x27,x28,[x29,#80]
801	ldp	x29,x30,[sp],#96
802	ret
803.size	ChaCha20_neon,.-ChaCha20_neon
804.type	ChaCha20_512_neon,%function
805.align	5
806ChaCha20_512_neon:
807	stp	x29,x30,[sp,#-96]!
808	add	x29,sp,#0
809
810	adr	x5,.Lsigma
811	stp	x19,x20,[sp,#16]
812	stp	x21,x22,[sp,#32]
813	stp	x23,x24,[sp,#48]
814	stp	x25,x26,[sp,#64]
815	stp	x27,x28,[sp,#80]
816
817.L512_or_more_neon:
818	sub	sp,sp,#128+64
819
820	ldp	x22,x23,[x5]		// load sigma
821	ld1	{v24.4s},[x5],#16
822	ldp	x24,x25,[x3]		// load key
823	ldp	x26,x27,[x3,#16]
824	ld1	{v25.4s,v26.4s},[x3]
825	ldp	x28,x30,[x4]		// load counter
826	ld1	{v27.4s},[x4]
827	ld1	{v31.4s},[x5]
828#ifdef	__ARMEB__
829	rev64	v24.4s,v24.4s
830	ror	x24,x24,#32
831	ror	x25,x25,#32
832	ror	x26,x26,#32
833	ror	x27,x27,#32
834	ror	x28,x28,#32
835	ror	x30,x30,#32
836#endif
837	add	v27.4s,v27.4s,v31.4s		// += 1
838	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
839	add	v27.4s,v27.4s,v31.4s		// not typo
840	str	q26,[sp,#32]
841	add	v28.4s,v27.4s,v31.4s
842	add	v29.4s,v28.4s,v31.4s
843	add	v30.4s,v29.4s,v31.4s
844	shl	v31.4s,v31.4s,#2			// 1 -> 4
845
846	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
847	stp	d10,d11,[sp,#128+16]
848	stp	d12,d13,[sp,#128+32]
849	stp	d14,d15,[sp,#128+48]
850
851	sub	x2,x2,#512			// not typo
852
853.Loop_outer_512_neon:
854	mov	v0.16b,v24.16b
855	mov	v4.16b,v24.16b
856	mov	v8.16b,v24.16b
857	mov	v12.16b,v24.16b
858	mov	v16.16b,v24.16b
859	mov	v20.16b,v24.16b
860	mov	v1.16b,v25.16b
861	mov	w5,w22			// unpack key block
862	mov	v5.16b,v25.16b
863	lsr	x6,x22,#32
864	mov	v9.16b,v25.16b
865	mov	w7,w23
866	mov	v13.16b,v25.16b
867	lsr	x8,x23,#32
868	mov	v17.16b,v25.16b
869	mov	w9,w24
870	mov	v21.16b,v25.16b
871	lsr	x10,x24,#32
872	mov	v3.16b,v27.16b
873	mov	w11,w25
874	mov	v7.16b,v28.16b
875	lsr	x12,x25,#32
876	mov	v11.16b,v29.16b
877	mov	w13,w26
878	mov	v15.16b,v30.16b
879	lsr	x14,x26,#32
880	mov	v2.16b,v26.16b
881	mov	w15,w27
882	mov	v6.16b,v26.16b
883	lsr	x16,x27,#32
884	add	v19.4s,v3.4s,v31.4s			// +4
885	mov	w17,w28
886	add	v23.4s,v7.4s,v31.4s			// +4
887	lsr	x19,x28,#32
888	mov	v10.16b,v26.16b
889	mov	w20,w30
890	mov	v14.16b,v26.16b
891	lsr	x21,x30,#32
892	mov	v18.16b,v26.16b
893	stp	q27,q28,[sp,#48]		// off-load key block, variable part
894	mov	v22.16b,v26.16b
895	str	q29,[sp,#80]
896
897	mov	x4,#5
898	subs	x2,x2,#512
899.Loop_upper_neon:
900	sub	x4,x4,#1
901	add	v0.4s,v0.4s,v1.4s
902	add	w5,w5,w9
903	add	v4.4s,v4.4s,v5.4s
904	add	w6,w6,w10
905	add	v8.4s,v8.4s,v9.4s
906	add	w7,w7,w11
907	add	v12.4s,v12.4s,v13.4s
908	add	w8,w8,w12
909	add	v16.4s,v16.4s,v17.4s
910	eor	w17,w17,w5
911	add	v20.4s,v20.4s,v21.4s
912	eor	w19,w19,w6
913	eor	v3.16b,v3.16b,v0.16b
914	eor	w20,w20,w7
915	eor	v7.16b,v7.16b,v4.16b
916	eor	w21,w21,w8
917	eor	v11.16b,v11.16b,v8.16b
918	ror	w17,w17,#16
919	eor	v15.16b,v15.16b,v12.16b
920	ror	w19,w19,#16
921	eor	v19.16b,v19.16b,v16.16b
922	ror	w20,w20,#16
923	eor	v23.16b,v23.16b,v20.16b
924	ror	w21,w21,#16
925	rev32	v3.8h,v3.8h
926	add	w13,w13,w17
927	rev32	v7.8h,v7.8h
928	add	w14,w14,w19
929	rev32	v11.8h,v11.8h
930	add	w15,w15,w20
931	rev32	v15.8h,v15.8h
932	add	w16,w16,w21
933	rev32	v19.8h,v19.8h
934	eor	w9,w9,w13
935	rev32	v23.8h,v23.8h
936	eor	w10,w10,w14
937	add	v2.4s,v2.4s,v3.4s
938	eor	w11,w11,w15
939	add	v6.4s,v6.4s,v7.4s
940	eor	w12,w12,w16
941	add	v10.4s,v10.4s,v11.4s
942	ror	w9,w9,#20
943	add	v14.4s,v14.4s,v15.4s
944	ror	w10,w10,#20
945	add	v18.4s,v18.4s,v19.4s
946	ror	w11,w11,#20
947	add	v22.4s,v22.4s,v23.4s
948	ror	w12,w12,#20
949	eor	v24.16b,v1.16b,v2.16b
950	add	w5,w5,w9
951	eor	v25.16b,v5.16b,v6.16b
952	add	w6,w6,w10
953	eor	v26.16b,v9.16b,v10.16b
954	add	w7,w7,w11
955	eor	v27.16b,v13.16b,v14.16b
956	add	w8,w8,w12
957	eor	v28.16b,v17.16b,v18.16b
958	eor	w17,w17,w5
959	eor	v29.16b,v21.16b,v22.16b
960	eor	w19,w19,w6
961	ushr	v1.4s,v24.4s,#20
962	eor	w20,w20,w7
963	ushr	v5.4s,v25.4s,#20
964	eor	w21,w21,w8
965	ushr	v9.4s,v26.4s,#20
966	ror	w17,w17,#24
967	ushr	v13.4s,v27.4s,#20
968	ror	w19,w19,#24
969	ushr	v17.4s,v28.4s,#20
970	ror	w20,w20,#24
971	ushr	v21.4s,v29.4s,#20
972	ror	w21,w21,#24
973	sli	v1.4s,v24.4s,#12
974	add	w13,w13,w17
975	sli	v5.4s,v25.4s,#12
976	add	w14,w14,w19
977	sli	v9.4s,v26.4s,#12
978	add	w15,w15,w20
979	sli	v13.4s,v27.4s,#12
980	add	w16,w16,w21
981	sli	v17.4s,v28.4s,#12
982	eor	w9,w9,w13
983	sli	v21.4s,v29.4s,#12
984	eor	w10,w10,w14
985	add	v0.4s,v0.4s,v1.4s
986	eor	w11,w11,w15
987	add	v4.4s,v4.4s,v5.4s
988	eor	w12,w12,w16
989	add	v8.4s,v8.4s,v9.4s
990	ror	w9,w9,#25
991	add	v12.4s,v12.4s,v13.4s
992	ror	w10,w10,#25
993	add	v16.4s,v16.4s,v17.4s
994	ror	w11,w11,#25
995	add	v20.4s,v20.4s,v21.4s
996	ror	w12,w12,#25
997	eor	v24.16b,v3.16b,v0.16b
998	add	w5,w5,w10
999	eor	v25.16b,v7.16b,v4.16b
1000	add	w6,w6,w11
1001	eor	v26.16b,v11.16b,v8.16b
1002	add	w7,w7,w12
1003	eor	v27.16b,v15.16b,v12.16b
1004	add	w8,w8,w9
1005	eor	v28.16b,v19.16b,v16.16b
1006	eor	w21,w21,w5
1007	eor	v29.16b,v23.16b,v20.16b
1008	eor	w17,w17,w6
1009	ushr	v3.4s,v24.4s,#24
1010	eor	w19,w19,w7
1011	ushr	v7.4s,v25.4s,#24
1012	eor	w20,w20,w8
1013	ushr	v11.4s,v26.4s,#24
1014	ror	w21,w21,#16
1015	ushr	v15.4s,v27.4s,#24
1016	ror	w17,w17,#16
1017	ushr	v19.4s,v28.4s,#24
1018	ror	w19,w19,#16
1019	ushr	v23.4s,v29.4s,#24
1020	ror	w20,w20,#16
1021	sli	v3.4s,v24.4s,#8
1022	add	w15,w15,w21
1023	sli	v7.4s,v25.4s,#8
1024	add	w16,w16,w17
1025	sli	v11.4s,v26.4s,#8
1026	add	w13,w13,w19
1027	sli	v15.4s,v27.4s,#8
1028	add	w14,w14,w20
1029	sli	v19.4s,v28.4s,#8
1030	eor	w10,w10,w15
1031	sli	v23.4s,v29.4s,#8
1032	eor	w11,w11,w16
1033	add	v2.4s,v2.4s,v3.4s
1034	eor	w12,w12,w13
1035	add	v6.4s,v6.4s,v7.4s
1036	eor	w9,w9,w14
1037	add	v10.4s,v10.4s,v11.4s
1038	ror	w10,w10,#20
1039	add	v14.4s,v14.4s,v15.4s
1040	ror	w11,w11,#20
1041	add	v18.4s,v18.4s,v19.4s
1042	ror	w12,w12,#20
1043	add	v22.4s,v22.4s,v23.4s
1044	ror	w9,w9,#20
1045	eor	v24.16b,v1.16b,v2.16b
1046	add	w5,w5,w10
1047	eor	v25.16b,v5.16b,v6.16b
1048	add	w6,w6,w11
1049	eor	v26.16b,v9.16b,v10.16b
1050	add	w7,w7,w12
1051	eor	v27.16b,v13.16b,v14.16b
1052	add	w8,w8,w9
1053	eor	v28.16b,v17.16b,v18.16b
1054	eor	w21,w21,w5
1055	eor	v29.16b,v21.16b,v22.16b
1056	eor	w17,w17,w6
1057	ushr	v1.4s,v24.4s,#25
1058	eor	w19,w19,w7
1059	ushr	v5.4s,v25.4s,#25
1060	eor	w20,w20,w8
1061	ushr	v9.4s,v26.4s,#25
1062	ror	w21,w21,#24
1063	ushr	v13.4s,v27.4s,#25
1064	ror	w17,w17,#24
1065	ushr	v17.4s,v28.4s,#25
1066	ror	w19,w19,#24
1067	ushr	v21.4s,v29.4s,#25
1068	ror	w20,w20,#24
1069	sli	v1.4s,v24.4s,#7
1070	add	w15,w15,w21
1071	sli	v5.4s,v25.4s,#7
1072	add	w16,w16,w17
1073	sli	v9.4s,v26.4s,#7
1074	add	w13,w13,w19
1075	sli	v13.4s,v27.4s,#7
1076	add	w14,w14,w20
1077	sli	v17.4s,v28.4s,#7
1078	eor	w10,w10,w15
1079	sli	v21.4s,v29.4s,#7
1080	eor	w11,w11,w16
1081	ext	v2.16b,v2.16b,v2.16b,#8
1082	eor	w12,w12,w13
1083	ext	v6.16b,v6.16b,v6.16b,#8
1084	eor	w9,w9,w14
1085	ext	v10.16b,v10.16b,v10.16b,#8
1086	ror	w10,w10,#25
1087	ext	v14.16b,v14.16b,v14.16b,#8
1088	ror	w11,w11,#25
1089	ext	v18.16b,v18.16b,v18.16b,#8
1090	ror	w12,w12,#25
1091	ext	v22.16b,v22.16b,v22.16b,#8
1092	ror	w9,w9,#25
1093	ext	v3.16b,v3.16b,v3.16b,#12
1094	ext	v7.16b,v7.16b,v7.16b,#12
1095	ext	v11.16b,v11.16b,v11.16b,#12
1096	ext	v15.16b,v15.16b,v15.16b,#12
1097	ext	v19.16b,v19.16b,v19.16b,#12
1098	ext	v23.16b,v23.16b,v23.16b,#12
1099	ext	v1.16b,v1.16b,v1.16b,#4
1100	ext	v5.16b,v5.16b,v5.16b,#4
1101	ext	v9.16b,v9.16b,v9.16b,#4
1102	ext	v13.16b,v13.16b,v13.16b,#4
1103	ext	v17.16b,v17.16b,v17.16b,#4
1104	ext	v21.16b,v21.16b,v21.16b,#4
1105	add	v0.4s,v0.4s,v1.4s
1106	add	w5,w5,w9
1107	add	v4.4s,v4.4s,v5.4s
1108	add	w6,w6,w10
1109	add	v8.4s,v8.4s,v9.4s
1110	add	w7,w7,w11
1111	add	v12.4s,v12.4s,v13.4s
1112	add	w8,w8,w12
1113	add	v16.4s,v16.4s,v17.4s
1114	eor	w17,w17,w5
1115	add	v20.4s,v20.4s,v21.4s
1116	eor	w19,w19,w6
1117	eor	v3.16b,v3.16b,v0.16b
1118	eor	w20,w20,w7
1119	eor	v7.16b,v7.16b,v4.16b
1120	eor	w21,w21,w8
1121	eor	v11.16b,v11.16b,v8.16b
1122	ror	w17,w17,#16
1123	eor	v15.16b,v15.16b,v12.16b
1124	ror	w19,w19,#16
1125	eor	v19.16b,v19.16b,v16.16b
1126	ror	w20,w20,#16
1127	eor	v23.16b,v23.16b,v20.16b
1128	ror	w21,w21,#16
1129	rev32	v3.8h,v3.8h
1130	add	w13,w13,w17
1131	rev32	v7.8h,v7.8h
1132	add	w14,w14,w19
1133	rev32	v11.8h,v11.8h
1134	add	w15,w15,w20
1135	rev32	v15.8h,v15.8h
1136	add	w16,w16,w21
1137	rev32	v19.8h,v19.8h
1138	eor	w9,w9,w13
1139	rev32	v23.8h,v23.8h
1140	eor	w10,w10,w14
1141	add	v2.4s,v2.4s,v3.4s
1142	eor	w11,w11,w15
1143	add	v6.4s,v6.4s,v7.4s
1144	eor	w12,w12,w16
1145	add	v10.4s,v10.4s,v11.4s
1146	ror	w9,w9,#20
1147	add	v14.4s,v14.4s,v15.4s
1148	ror	w10,w10,#20
1149	add	v18.4s,v18.4s,v19.4s
1150	ror	w11,w11,#20
1151	add	v22.4s,v22.4s,v23.4s
1152	ror	w12,w12,#20
1153	eor	v24.16b,v1.16b,v2.16b
1154	add	w5,w5,w9
1155	eor	v25.16b,v5.16b,v6.16b
1156	add	w6,w6,w10
1157	eor	v26.16b,v9.16b,v10.16b
1158	add	w7,w7,w11
1159	eor	v27.16b,v13.16b,v14.16b
1160	add	w8,w8,w12
1161	eor	v28.16b,v17.16b,v18.16b
1162	eor	w17,w17,w5
1163	eor	v29.16b,v21.16b,v22.16b
1164	eor	w19,w19,w6
1165	ushr	v1.4s,v24.4s,#20
1166	eor	w20,w20,w7
1167	ushr	v5.4s,v25.4s,#20
1168	eor	w21,w21,w8
1169	ushr	v9.4s,v26.4s,#20
1170	ror	w17,w17,#24
1171	ushr	v13.4s,v27.4s,#20
1172	ror	w19,w19,#24
1173	ushr	v17.4s,v28.4s,#20
1174	ror	w20,w20,#24
1175	ushr	v21.4s,v29.4s,#20
1176	ror	w21,w21,#24
1177	sli	v1.4s,v24.4s,#12
1178	add	w13,w13,w17
1179	sli	v5.4s,v25.4s,#12
1180	add	w14,w14,w19
1181	sli	v9.4s,v26.4s,#12
1182	add	w15,w15,w20
1183	sli	v13.4s,v27.4s,#12
1184	add	w16,w16,w21
1185	sli	v17.4s,v28.4s,#12
1186	eor	w9,w9,w13
1187	sli	v21.4s,v29.4s,#12
1188	eor	w10,w10,w14
1189	add	v0.4s,v0.4s,v1.4s
1190	eor	w11,w11,w15
1191	add	v4.4s,v4.4s,v5.4s
1192	eor	w12,w12,w16
1193	add	v8.4s,v8.4s,v9.4s
1194	ror	w9,w9,#25
1195	add	v12.4s,v12.4s,v13.4s
1196	ror	w10,w10,#25
1197	add	v16.4s,v16.4s,v17.4s
1198	ror	w11,w11,#25
1199	add	v20.4s,v20.4s,v21.4s
1200	ror	w12,w12,#25
1201	eor	v24.16b,v3.16b,v0.16b
1202	add	w5,w5,w10
1203	eor	v25.16b,v7.16b,v4.16b
1204	add	w6,w6,w11
1205	eor	v26.16b,v11.16b,v8.16b
1206	add	w7,w7,w12
1207	eor	v27.16b,v15.16b,v12.16b
1208	add	w8,w8,w9
1209	eor	v28.16b,v19.16b,v16.16b
1210	eor	w21,w21,w5
1211	eor	v29.16b,v23.16b,v20.16b
1212	eor	w17,w17,w6
1213	ushr	v3.4s,v24.4s,#24
1214	eor	w19,w19,w7
1215	ushr	v7.4s,v25.4s,#24
1216	eor	w20,w20,w8
1217	ushr	v11.4s,v26.4s,#24
1218	ror	w21,w21,#16
1219	ushr	v15.4s,v27.4s,#24
1220	ror	w17,w17,#16
1221	ushr	v19.4s,v28.4s,#24
1222	ror	w19,w19,#16
1223	ushr	v23.4s,v29.4s,#24
1224	ror	w20,w20,#16
1225	sli	v3.4s,v24.4s,#8
1226	add	w15,w15,w21
1227	sli	v7.4s,v25.4s,#8
1228	add	w16,w16,w17
1229	sli	v11.4s,v26.4s,#8
1230	add	w13,w13,w19
1231	sli	v15.4s,v27.4s,#8
1232	add	w14,w14,w20
1233	sli	v19.4s,v28.4s,#8
1234	eor	w10,w10,w15
1235	sli	v23.4s,v29.4s,#8
1236	eor	w11,w11,w16
1237	add	v2.4s,v2.4s,v3.4s
1238	eor	w12,w12,w13
1239	add	v6.4s,v6.4s,v7.4s
1240	eor	w9,w9,w14
1241	add	v10.4s,v10.4s,v11.4s
1242	ror	w10,w10,#20
1243	add	v14.4s,v14.4s,v15.4s
1244	ror	w11,w11,#20
1245	add	v18.4s,v18.4s,v19.4s
1246	ror	w12,w12,#20
1247	add	v22.4s,v22.4s,v23.4s
1248	ror	w9,w9,#20
1249	eor	v24.16b,v1.16b,v2.16b
1250	add	w5,w5,w10
1251	eor	v25.16b,v5.16b,v6.16b
1252	add	w6,w6,w11
1253	eor	v26.16b,v9.16b,v10.16b
1254	add	w7,w7,w12
1255	eor	v27.16b,v13.16b,v14.16b
1256	add	w8,w8,w9
1257	eor	v28.16b,v17.16b,v18.16b
1258	eor	w21,w21,w5
1259	eor	v29.16b,v21.16b,v22.16b
1260	eor	w17,w17,w6
1261	ushr	v1.4s,v24.4s,#25
1262	eor	w19,w19,w7
1263	ushr	v5.4s,v25.4s,#25
1264	eor	w20,w20,w8
1265	ushr	v9.4s,v26.4s,#25
1266	ror	w21,w21,#24
1267	ushr	v13.4s,v27.4s,#25
1268	ror	w17,w17,#24
1269	ushr	v17.4s,v28.4s,#25
1270	ror	w19,w19,#24
1271	ushr	v21.4s,v29.4s,#25
1272	ror	w20,w20,#24
1273	sli	v1.4s,v24.4s,#7
1274	add	w15,w15,w21
1275	sli	v5.4s,v25.4s,#7
1276	add	w16,w16,w17
1277	sli	v9.4s,v26.4s,#7
1278	add	w13,w13,w19
1279	sli	v13.4s,v27.4s,#7
1280	add	w14,w14,w20
1281	sli	v17.4s,v28.4s,#7
1282	eor	w10,w10,w15
1283	sli	v21.4s,v29.4s,#7
1284	eor	w11,w11,w16
1285	ext	v2.16b,v2.16b,v2.16b,#8
1286	eor	w12,w12,w13
1287	ext	v6.16b,v6.16b,v6.16b,#8
1288	eor	w9,w9,w14
1289	ext	v10.16b,v10.16b,v10.16b,#8
1290	ror	w10,w10,#25
1291	ext	v14.16b,v14.16b,v14.16b,#8
1292	ror	w11,w11,#25
1293	ext	v18.16b,v18.16b,v18.16b,#8
1294	ror	w12,w12,#25
1295	ext	v22.16b,v22.16b,v22.16b,#8
1296	ror	w9,w9,#25
1297	ext	v3.16b,v3.16b,v3.16b,#4
1298	ext	v7.16b,v7.16b,v7.16b,#4
1299	ext	v11.16b,v11.16b,v11.16b,#4
1300	ext	v15.16b,v15.16b,v15.16b,#4
1301	ext	v19.16b,v19.16b,v19.16b,#4
1302	ext	v23.16b,v23.16b,v23.16b,#4
1303	ext	v1.16b,v1.16b,v1.16b,#12
1304	ext	v5.16b,v5.16b,v5.16b,#12
1305	ext	v9.16b,v9.16b,v9.16b,#12
1306	ext	v13.16b,v13.16b,v13.16b,#12
1307	ext	v17.16b,v17.16b,v17.16b,#12
1308	ext	v21.16b,v21.16b,v21.16b,#12
1309	cbnz	x4,.Loop_upper_neon
1310
1311	add	w5,w5,w22		// accumulate key block
1312	add	x6,x6,x22,lsr#32
1313	add	w7,w7,w23
1314	add	x8,x8,x23,lsr#32
1315	add	w9,w9,w24
1316	add	x10,x10,x24,lsr#32
1317	add	w11,w11,w25
1318	add	x12,x12,x25,lsr#32
1319	add	w13,w13,w26
1320	add	x14,x14,x26,lsr#32
1321	add	w15,w15,w27
1322	add	x16,x16,x27,lsr#32
1323	add	w17,w17,w28
1324	add	x19,x19,x28,lsr#32
1325	add	w20,w20,w30
1326	add	x21,x21,x30,lsr#32
1327
1328	add	x5,x5,x6,lsl#32	// pack
1329	add	x7,x7,x8,lsl#32
1330	ldp	x6,x8,[x1,#0]		// load input
1331	add	x9,x9,x10,lsl#32
1332	add	x11,x11,x12,lsl#32
1333	ldp	x10,x12,[x1,#16]
1334	add	x13,x13,x14,lsl#32
1335	add	x15,x15,x16,lsl#32
1336	ldp	x14,x16,[x1,#32]
1337	add	x17,x17,x19,lsl#32
1338	add	x20,x20,x21,lsl#32
1339	ldp	x19,x21,[x1,#48]
1340	add	x1,x1,#64
1341#ifdef	__ARMEB__
1342	rev	x5,x5
1343	rev	x7,x7
1344	rev	x9,x9
1345	rev	x11,x11
1346	rev	x13,x13
1347	rev	x15,x15
1348	rev	x17,x17
1349	rev	x20,x20
1350#endif
1351	eor	x5,x5,x6
1352	eor	x7,x7,x8
1353	eor	x9,x9,x10
1354	eor	x11,x11,x12
1355	eor	x13,x13,x14
1356	eor	x15,x15,x16
1357	eor	x17,x17,x19
1358	eor	x20,x20,x21
1359
1360	stp	x5,x7,[x0,#0]		// store output
1361	add	x28,x28,#1			// increment counter
1362	mov	w5,w22			// unpack key block
1363	lsr	x6,x22,#32
1364	stp	x9,x11,[x0,#16]
1365	mov	w7,w23
1366	lsr	x8,x23,#32
1367	stp	x13,x15,[x0,#32]
1368	mov	w9,w24
1369	lsr	x10,x24,#32
1370	stp	x17,x20,[x0,#48]
1371	add	x0,x0,#64
1372	mov	w11,w25
1373	lsr	x12,x25,#32
1374	mov	w13,w26
1375	lsr	x14,x26,#32
1376	mov	w15,w27
1377	lsr	x16,x27,#32
1378	mov	w17,w28
1379	lsr	x19,x28,#32
1380	mov	w20,w30
1381	lsr	x21,x30,#32
1382
// .Loop_lower_neon: five passes (x4 counts 5 -> 0) over two interleaved,
// independent instruction streams.
//
// Scalar stream - one block in w registers, four rows of four words:
//   a = w5,w6,w7,w8     b = w9,w10,w11,w12
//   c = w13,w14,w15,w16 d = w17,w19,w20,w21
// Rotations use "ror": ror #16/#20/#24/#25 are left-rotates by 16/12/8/7.
// Each half of the loop body runs one column round (a[i] paired with b[i])
// and one diagonal round (a[i] paired with b[i+1], c[i+2], d[i+3]), so one
// pass is two double rounds and the five passes give 20 rounds.
//
// NEON stream - six blocks, one row per vector:
//   a = v0,v4,v8,v12,v16,v20   b = v1,v5,v9,v13,v17,v21
//   c = v2,v6,v10,v14,v18,v22  d = v3,v7,v11,v15,v19,v23
// v24-v29 are scratch. The left-rotate by 16 is "rev32 .8h"; rotates by
// 12/8/7 are built from "ushr #(32-n)" followed by "sli #n".
// Each half of the body runs ONE NEON round; the "ext" lane rotations at the
// end of each half switch the rows between column and diagonal alignment.
// One pass is therefore one NEON double round.
// NOTE(review): presumably the earlier inner loop (outside this view)
// supplies the other NEON rounds needed for a full block - confirm.
1383	mov	x4,#5
1384.Loop_lower_neon:
1385	sub	x4,x4,#1
// ---- first half: NEON round on column-aligned rows; scalar column round ----
1386	add	v0.4s,v0.4s,v1.4s
1387	add	w5,w5,w9
1388	add	v4.4s,v4.4s,v5.4s
1389	add	w6,w6,w10
1390	add	v8.4s,v8.4s,v9.4s
1391	add	w7,w7,w11
1392	add	v12.4s,v12.4s,v13.4s
1393	add	w8,w8,w12
1394	add	v16.4s,v16.4s,v17.4s
1395	eor	w17,w17,w5
1396	add	v20.4s,v20.4s,v21.4s
1397	eor	w19,w19,w6
1398	eor	v3.16b,v3.16b,v0.16b
1399	eor	w20,w20,w7
1400	eor	v7.16b,v7.16b,v4.16b
1401	eor	w21,w21,w8
1402	eor	v11.16b,v11.16b,v8.16b
1403	ror	w17,w17,#16
1404	eor	v15.16b,v15.16b,v12.16b
1405	ror	w19,w19,#16
1406	eor	v19.16b,v19.16b,v16.16b
1407	ror	w20,w20,#16
1408	eor	v23.16b,v23.16b,v20.16b
1409	ror	w21,w21,#16
1410	rev32	v3.8h,v3.8h
1411	add	w13,w13,w17
1412	rev32	v7.8h,v7.8h
1413	add	w14,w14,w19
1414	rev32	v11.8h,v11.8h
1415	add	w15,w15,w20
1416	rev32	v15.8h,v15.8h
1417	add	w16,w16,w21
1418	rev32	v19.8h,v19.8h
1419	eor	w9,w9,w13
1420	rev32	v23.8h,v23.8h
1421	eor	w10,w10,w14
1422	add	v2.4s,v2.4s,v3.4s
1423	eor	w11,w11,w15
1424	add	v6.4s,v6.4s,v7.4s
1425	eor	w12,w12,w16
1426	add	v10.4s,v10.4s,v11.4s
1427	ror	w9,w9,#20
1428	add	v14.4s,v14.4s,v15.4s
1429	ror	w10,w10,#20
1430	add	v18.4s,v18.4s,v19.4s
1431	ror	w11,w11,#20
1432	add	v22.4s,v22.4s,v23.4s
1433	ror	w12,w12,#20
1434	eor	v24.16b,v1.16b,v2.16b
1435	add	w5,w5,w9
1436	eor	v25.16b,v5.16b,v6.16b
1437	add	w6,w6,w10
1438	eor	v26.16b,v9.16b,v10.16b
1439	add	w7,w7,w11
1440	eor	v27.16b,v13.16b,v14.16b
1441	add	w8,w8,w12
1442	eor	v28.16b,v17.16b,v18.16b
1443	eor	w17,w17,w5
1444	eor	v29.16b,v21.16b,v22.16b
1445	eor	w19,w19,w6
1446	ushr	v1.4s,v24.4s,#20
1447	eor	w20,w20,w7
1448	ushr	v5.4s,v25.4s,#20
1449	eor	w21,w21,w8
1450	ushr	v9.4s,v26.4s,#20
1451	ror	w17,w17,#24
1452	ushr	v13.4s,v27.4s,#20
1453	ror	w19,w19,#24
1454	ushr	v17.4s,v28.4s,#20
1455	ror	w20,w20,#24
1456	ushr	v21.4s,v29.4s,#20
1457	ror	w21,w21,#24
1458	sli	v1.4s,v24.4s,#12
1459	add	w13,w13,w17
1460	sli	v5.4s,v25.4s,#12
1461	add	w14,w14,w19
1462	sli	v9.4s,v26.4s,#12
1463	add	w15,w15,w20
1464	sli	v13.4s,v27.4s,#12
1465	add	w16,w16,w21
1466	sli	v17.4s,v28.4s,#12
1467	eor	w9,w9,w13
1468	sli	v21.4s,v29.4s,#12
1469	eor	w10,w10,w14
1470	add	v0.4s,v0.4s,v1.4s
1471	eor	w11,w11,w15
1472	add	v4.4s,v4.4s,v5.4s
1473	eor	w12,w12,w16
1474	add	v8.4s,v8.4s,v9.4s
1475	ror	w9,w9,#25
1476	add	v12.4s,v12.4s,v13.4s
1477	ror	w10,w10,#25
1478	add	v16.4s,v16.4s,v17.4s
1479	ror	w11,w11,#25
1480	add	v20.4s,v20.4s,v21.4s
1481	ror	w12,w12,#25
// Scalar stream switches to its diagonal round from here (w5 pairs with w10,
// w6 with w11, w7 with w12, w8 with w9); the NEON stream is still finishing
// its first round.
1482	eor	v24.16b,v3.16b,v0.16b
1483	add	w5,w5,w10
1484	eor	v25.16b,v7.16b,v4.16b
1485	add	w6,w6,w11
1486	eor	v26.16b,v11.16b,v8.16b
1487	add	w7,w7,w12
1488	eor	v27.16b,v15.16b,v12.16b
1489	add	w8,w8,w9
1490	eor	v28.16b,v19.16b,v16.16b
1491	eor	w21,w21,w5
1492	eor	v29.16b,v23.16b,v20.16b
1493	eor	w17,w17,w6
1494	ushr	v3.4s,v24.4s,#24
1495	eor	w19,w19,w7
1496	ushr	v7.4s,v25.4s,#24
1497	eor	w20,w20,w8
1498	ushr	v11.4s,v26.4s,#24
1499	ror	w21,w21,#16
1500	ushr	v15.4s,v27.4s,#24
1501	ror	w17,w17,#16
1502	ushr	v19.4s,v28.4s,#24
1503	ror	w19,w19,#16
1504	ushr	v23.4s,v29.4s,#24
1505	ror	w20,w20,#16
1506	sli	v3.4s,v24.4s,#8
1507	add	w15,w15,w21
1508	sli	v7.4s,v25.4s,#8
1509	add	w16,w16,w17
1510	sli	v11.4s,v26.4s,#8
1511	add	w13,w13,w19
1512	sli	v15.4s,v27.4s,#8
1513	add	w14,w14,w20
1514	sli	v19.4s,v28.4s,#8
1515	eor	w10,w10,w15
1516	sli	v23.4s,v29.4s,#8
1517	eor	w11,w11,w16
1518	add	v2.4s,v2.4s,v3.4s
1519	eor	w12,w12,w13
1520	add	v6.4s,v6.4s,v7.4s
1521	eor	w9,w9,w14
1522	add	v10.4s,v10.4s,v11.4s
1523	ror	w10,w10,#20
1524	add	v14.4s,v14.4s,v15.4s
1525	ror	w11,w11,#20
1526	add	v18.4s,v18.4s,v19.4s
1527	ror	w12,w12,#20
1528	add	v22.4s,v22.4s,v23.4s
1529	ror	w9,w9,#20
1530	eor	v24.16b,v1.16b,v2.16b
1531	add	w5,w5,w10
1532	eor	v25.16b,v5.16b,v6.16b
1533	add	w6,w6,w11
1534	eor	v26.16b,v9.16b,v10.16b
1535	add	w7,w7,w12
1536	eor	v27.16b,v13.16b,v14.16b
1537	add	w8,w8,w9
1538	eor	v28.16b,v17.16b,v18.16b
1539	eor	w21,w21,w5
1540	eor	v29.16b,v21.16b,v22.16b
1541	eor	w17,w17,w6
1542	ushr	v1.4s,v24.4s,#25
1543	eor	w19,w19,w7
1544	ushr	v5.4s,v25.4s,#25
1545	eor	w20,w20,w8
1546	ushr	v9.4s,v26.4s,#25
1547	ror	w21,w21,#24
1548	ushr	v13.4s,v27.4s,#25
1549	ror	w17,w17,#24
1550	ushr	v17.4s,v28.4s,#25
1551	ror	w19,w19,#24
1552	ushr	v21.4s,v29.4s,#25
1553	ror	w20,w20,#24
1554	sli	v1.4s,v24.4s,#7
1555	add	w15,w15,w21
1556	sli	v5.4s,v25.4s,#7
1557	add	w16,w16,w17
1558	sli	v9.4s,v26.4s,#7
1559	add	w13,w13,w19
1560	sli	v13.4s,v27.4s,#7
1561	add	w14,w14,w20
1562	sli	v17.4s,v28.4s,#7
1563	eor	w10,w10,w15
1564	sli	v21.4s,v29.4s,#7
1565	eor	w11,w11,w16
// NEON: rotate row c by two 32-bit lanes (#8), row d by three (#12) and row b
// by one (#4), so the next NEON round operates on the diagonals.
1566	ext	v2.16b,v2.16b,v2.16b,#8
1567	eor	w12,w12,w13
1568	ext	v6.16b,v6.16b,v6.16b,#8
1569	eor	w9,w9,w14
1570	ext	v10.16b,v10.16b,v10.16b,#8
1571	ror	w10,w10,#25
1572	ext	v14.16b,v14.16b,v14.16b,#8
1573	ror	w11,w11,#25
1574	ext	v18.16b,v18.16b,v18.16b,#8
1575	ror	w12,w12,#25
1576	ext	v22.16b,v22.16b,v22.16b,#8
1577	ror	w9,w9,#25
1578	ext	v3.16b,v3.16b,v3.16b,#12
1579	ext	v7.16b,v7.16b,v7.16b,#12
1580	ext	v11.16b,v11.16b,v11.16b,#12
1581	ext	v15.16b,v15.16b,v15.16b,#12
1582	ext	v19.16b,v19.16b,v19.16b,#12
1583	ext	v23.16b,v23.16b,v23.16b,#12
1584	ext	v1.16b,v1.16b,v1.16b,#4
1585	ext	v5.16b,v5.16b,v5.16b,#4
1586	ext	v9.16b,v9.16b,v9.16b,#4
1587	ext	v13.16b,v13.16b,v13.16b,#4
1588	ext	v17.16b,v17.16b,v17.16b,#4
1589	ext	v21.16b,v21.16b,v21.16b,#4
// ---- second half: NEON round on the rotated (diagonal) rows; the scalar
// stream starts its next column round, followed by another diagonal round ----
1590	add	v0.4s,v0.4s,v1.4s
1591	add	w5,w5,w9
1592	add	v4.4s,v4.4s,v5.4s
1593	add	w6,w6,w10
1594	add	v8.4s,v8.4s,v9.4s
1595	add	w7,w7,w11
1596	add	v12.4s,v12.4s,v13.4s
1597	add	w8,w8,w12
1598	add	v16.4s,v16.4s,v17.4s
1599	eor	w17,w17,w5
1600	add	v20.4s,v20.4s,v21.4s
1601	eor	w19,w19,w6
1602	eor	v3.16b,v3.16b,v0.16b
1603	eor	w20,w20,w7
1604	eor	v7.16b,v7.16b,v4.16b
1605	eor	w21,w21,w8
1606	eor	v11.16b,v11.16b,v8.16b
1607	ror	w17,w17,#16
1608	eor	v15.16b,v15.16b,v12.16b
1609	ror	w19,w19,#16
1610	eor	v19.16b,v19.16b,v16.16b
1611	ror	w20,w20,#16
1612	eor	v23.16b,v23.16b,v20.16b
1613	ror	w21,w21,#16
1614	rev32	v3.8h,v3.8h
1615	add	w13,w13,w17
1616	rev32	v7.8h,v7.8h
1617	add	w14,w14,w19
1618	rev32	v11.8h,v11.8h
1619	add	w15,w15,w20
1620	rev32	v15.8h,v15.8h
1621	add	w16,w16,w21
1622	rev32	v19.8h,v19.8h
1623	eor	w9,w9,w13
1624	rev32	v23.8h,v23.8h
1625	eor	w10,w10,w14
1626	add	v2.4s,v2.4s,v3.4s
1627	eor	w11,w11,w15
1628	add	v6.4s,v6.4s,v7.4s
1629	eor	w12,w12,w16
1630	add	v10.4s,v10.4s,v11.4s
1631	ror	w9,w9,#20
1632	add	v14.4s,v14.4s,v15.4s
1633	ror	w10,w10,#20
1634	add	v18.4s,v18.4s,v19.4s
1635	ror	w11,w11,#20
1636	add	v22.4s,v22.4s,v23.4s
1637	ror	w12,w12,#20
1638	eor	v24.16b,v1.16b,v2.16b
1639	add	w5,w5,w9
1640	eor	v25.16b,v5.16b,v6.16b
1641	add	w6,w6,w10
1642	eor	v26.16b,v9.16b,v10.16b
1643	add	w7,w7,w11
1644	eor	v27.16b,v13.16b,v14.16b
1645	add	w8,w8,w12
1646	eor	v28.16b,v17.16b,v18.16b
1647	eor	w17,w17,w5
1648	eor	v29.16b,v21.16b,v22.16b
1649	eor	w19,w19,w6
1650	ushr	v1.4s,v24.4s,#20
1651	eor	w20,w20,w7
1652	ushr	v5.4s,v25.4s,#20
1653	eor	w21,w21,w8
1654	ushr	v9.4s,v26.4s,#20
1655	ror	w17,w17,#24
1656	ushr	v13.4s,v27.4s,#20
1657	ror	w19,w19,#24
1658	ushr	v17.4s,v28.4s,#20
1659	ror	w20,w20,#24
1660	ushr	v21.4s,v29.4s,#20
1661	ror	w21,w21,#24
1662	sli	v1.4s,v24.4s,#12
1663	add	w13,w13,w17
1664	sli	v5.4s,v25.4s,#12
1665	add	w14,w14,w19
1666	sli	v9.4s,v26.4s,#12
1667	add	w15,w15,w20
1668	sli	v13.4s,v27.4s,#12
1669	add	w16,w16,w21
1670	sli	v17.4s,v28.4s,#12
1671	eor	w9,w9,w13
1672	sli	v21.4s,v29.4s,#12
1673	eor	w10,w10,w14
1674	add	v0.4s,v0.4s,v1.4s
1675	eor	w11,w11,w15
1676	add	v4.4s,v4.4s,v5.4s
1677	eor	w12,w12,w16
1678	add	v8.4s,v8.4s,v9.4s
1679	ror	w9,w9,#25
1680	add	v12.4s,v12.4s,v13.4s
1681	ror	w10,w10,#25
1682	add	v16.4s,v16.4s,v17.4s
1683	ror	w11,w11,#25
1684	add	v20.4s,v20.4s,v21.4s
1685	ror	w12,w12,#25
// Scalar stream: second diagonal round of this pass starts here.
1686	eor	v24.16b,v3.16b,v0.16b
1687	add	w5,w5,w10
1688	eor	v25.16b,v7.16b,v4.16b
1689	add	w6,w6,w11
1690	eor	v26.16b,v11.16b,v8.16b
1691	add	w7,w7,w12
1692	eor	v27.16b,v15.16b,v12.16b
1693	add	w8,w8,w9
1694	eor	v28.16b,v19.16b,v16.16b
1695	eor	w21,w21,w5
1696	eor	v29.16b,v23.16b,v20.16b
1697	eor	w17,w17,w6
1698	ushr	v3.4s,v24.4s,#24
1699	eor	w19,w19,w7
1700	ushr	v7.4s,v25.4s,#24
1701	eor	w20,w20,w8
1702	ushr	v11.4s,v26.4s,#24
1703	ror	w21,w21,#16
1704	ushr	v15.4s,v27.4s,#24
1705	ror	w17,w17,#16
1706	ushr	v19.4s,v28.4s,#24
1707	ror	w19,w19,#16
1708	ushr	v23.4s,v29.4s,#24
1709	ror	w20,w20,#16
1710	sli	v3.4s,v24.4s,#8
1711	add	w15,w15,w21
1712	sli	v7.4s,v25.4s,#8
1713	add	w16,w16,w17
1714	sli	v11.4s,v26.4s,#8
1715	add	w13,w13,w19
1716	sli	v15.4s,v27.4s,#8
1717	add	w14,w14,w20
1718	sli	v19.4s,v28.4s,#8
1719	eor	w10,w10,w15
1720	sli	v23.4s,v29.4s,#8
1721	eor	w11,w11,w16
1722	add	v2.4s,v2.4s,v3.4s
1723	eor	w12,w12,w13
1724	add	v6.4s,v6.4s,v7.4s
1725	eor	w9,w9,w14
1726	add	v10.4s,v10.4s,v11.4s
1727	ror	w10,w10,#20
1728	add	v14.4s,v14.4s,v15.4s
1729	ror	w11,w11,#20
1730	add	v18.4s,v18.4s,v19.4s
1731	ror	w12,w12,#20
1732	add	v22.4s,v22.4s,v23.4s
1733	ror	w9,w9,#20
1734	eor	v24.16b,v1.16b,v2.16b
1735	add	w5,w5,w10
1736	eor	v25.16b,v5.16b,v6.16b
1737	add	w6,w6,w11
1738	eor	v26.16b,v9.16b,v10.16b
1739	add	w7,w7,w12
1740	eor	v27.16b,v13.16b,v14.16b
1741	add	w8,w8,w9
1742	eor	v28.16b,v17.16b,v18.16b
1743	eor	w21,w21,w5
1744	eor	v29.16b,v21.16b,v22.16b
1745	eor	w17,w17,w6
1746	ushr	v1.4s,v24.4s,#25
1747	eor	w19,w19,w7
1748	ushr	v5.4s,v25.4s,#25
1749	eor	w20,w20,w8
1750	ushr	v9.4s,v26.4s,#25
1751	ror	w21,w21,#24
1752	ushr	v13.4s,v27.4s,#25
1753	ror	w17,w17,#24
1754	ushr	v17.4s,v28.4s,#25
1755	ror	w19,w19,#24
1756	ushr	v21.4s,v29.4s,#25
1757	ror	w20,w20,#24
1758	sli	v1.4s,v24.4s,#7
1759	add	w15,w15,w21
1760	sli	v5.4s,v25.4s,#7
1761	add	w16,w16,w17
1762	sli	v9.4s,v26.4s,#7
1763	add	w13,w13,w19
1764	sli	v13.4s,v27.4s,#7
1765	add	w14,w14,w20
1766	sli	v17.4s,v28.4s,#7
1767	eor	w10,w10,w15
1768	sli	v21.4s,v29.4s,#7
1769	eor	w11,w11,w16
// NEON: undo the lane rotation (c by #8 again, d by #4, b by #12) so the rows
// are column-aligned for the next pass.
1770	ext	v2.16b,v2.16b,v2.16b,#8
1771	eor	w12,w12,w13
1772	ext	v6.16b,v6.16b,v6.16b,#8
1773	eor	w9,w9,w14
1774	ext	v10.16b,v10.16b,v10.16b,#8
1775	ror	w10,w10,#25
1776	ext	v14.16b,v14.16b,v14.16b,#8
1777	ror	w11,w11,#25
1778	ext	v18.16b,v18.16b,v18.16b,#8
1779	ror	w12,w12,#25
1780	ext	v22.16b,v22.16b,v22.16b,#8
1781	ror	w9,w9,#25
1782	ext	v3.16b,v3.16b,v3.16b,#4
1783	ext	v7.16b,v7.16b,v7.16b,#4
1784	ext	v11.16b,v11.16b,v11.16b,#4
1785	ext	v15.16b,v15.16b,v15.16b,#4
1786	ext	v19.16b,v19.16b,v19.16b,#4
1787	ext	v23.16b,v23.16b,v23.16b,#4
1788	ext	v1.16b,v1.16b,v1.16b,#12
1789	ext	v5.16b,v5.16b,v5.16b,#12
1790	ext	v9.16b,v9.16b,v9.16b,#12
1791	ext	v13.16b,v13.16b,v13.16b,#12
1792	ext	v17.16b,v17.16b,v17.16b,#12
1793	ext	v21.16b,v21.16b,v21.16b,#12
// cbnz does not touch the condition flags, so flags set before the loop are
// still live after it.
1794	cbnz	x4,.Loop_lower_neon
1795
// Finish the chunk: one scalar block plus six NEON blocks (7 x 64 bytes).
// Scalar and NEON work stay interleaved.
//  - Scalar: add the packed key block (x22-x28, x30), repack, XOR with 64
//    bytes of input, store.
//  - NEON: v24-v29 are reloaded from the stack off-load area at [sp,#0..#95]
//    because the loop above used them as scratch. Row a of every block gets
//    +v24, row b +v25, row c +v26. Row d of blocks 0-3 gets +v27/v28/v29/v30;
//    blocks 4 and 5 get +v27 and +v28 plus v31 (commented "+4").
// NOTE(review): v24-v30 presumably hold the initial state rows, with v27-v30
// the per-block counter rows and v31 the per-lane step constant; they are set
// up outside this view - confirm.
1796	add	w5,w5,w22		// accumulate key block
1797	ldp	q24,q25,[sp,#0]
1798	add	x6,x6,x22,lsr#32
1799	ldp	q26,q27,[sp,#32]
1800	add	w7,w7,w23
1801	ldp	q28,q29,[sp,#64]
1802	add	x8,x8,x23,lsr#32
1803	add	v0.4s,v0.4s,v24.4s
1804	add	w9,w9,w24
1805	add	v4.4s,v4.4s,v24.4s
1806	add	x10,x10,x24,lsr#32
1807	add	v8.4s,v8.4s,v24.4s
1808	add	w11,w11,w25
1809	add	v12.4s,v12.4s,v24.4s
1810	add	x12,x12,x25,lsr#32
1811	add	v16.4s,v16.4s,v24.4s
1812	add	w13,w13,w26
1813	add	v20.4s,v20.4s,v24.4s
1814	add	x14,x14,x26,lsr#32
1815	add	v2.4s,v2.4s,v26.4s
1816	add	w15,w15,w27
1817	add	v6.4s,v6.4s,v26.4s
1818	add	x16,x16,x27,lsr#32
1819	add	v10.4s,v10.4s,v26.4s
1820	add	w17,w17,w28
1821	add	v14.4s,v14.4s,v26.4s
1822	add	x19,x19,x28,lsr#32
1823	add	v18.4s,v18.4s,v26.4s
1824	add	w20,w20,w30
1825	add	v22.4s,v22.4s,v26.4s
1826	add	x21,x21,x30,lsr#32
1827	add	v19.4s,v19.4s,v31.4s			// +4
1828	add	x5,x5,x6,lsl#32	// pack
1829	add	v23.4s,v23.4s,v31.4s			// +4
1830	add	x7,x7,x8,lsl#32
1831	add	v3.4s,v3.4s,v27.4s
1832	ldp	x6,x8,[x1,#0]		// load input
1833	add	v7.4s,v7.4s,v28.4s
1834	add	x9,x9,x10,lsl#32
1835	add	v11.4s,v11.4s,v29.4s
1836	add	x11,x11,x12,lsl#32
1837	add	v15.4s,v15.4s,v30.4s
1838	ldp	x10,x12,[x1,#16]
1839	add	v19.4s,v19.4s,v27.4s
1840	add	x13,x13,x14,lsl#32
1841	add	v23.4s,v23.4s,v28.4s
1842	add	x15,x15,x16,lsl#32
1843	add	v1.4s,v1.4s,v25.4s
1844	ldp	x14,x16,[x1,#32]
1845	add	v5.4s,v5.4s,v25.4s
1846	add	x17,x17,x19,lsl#32
1847	add	v9.4s,v9.4s,v25.4s
1848	add	x20,x20,x21,lsl#32
1849	add	v13.4s,v13.4s,v25.4s
1850	ldp	x19,x21,[x1,#48]
1851	add	v17.4s,v17.4s,v25.4s
1852	add	x1,x1,#64
1853	add	v21.4s,v21.4s,v25.4s
1854
// Big-endian builds only: byte-reverse the packed scalar keystream registers.
1855#ifdef	__ARMEB__
1856	rev	x5,x5
1857	rev	x7,x7
1858	rev	x9,x9
1859	rev	x11,x11
1860	rev	x13,x13
1861	rev	x15,x15
1862	rev	x17,x17
1863	rev	x20,x20
1864#endif
// From here v24-v27 double as input buffers: each ld1 reads the next 64 input
// bytes and post-increments x1. XOR the scalar block and NEON block 0.
1865	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1866	eor	x5,x5,x6
1867	eor	x7,x7,x8
1868	eor	x9,x9,x10
1869	eor	x11,x11,x12
1870	eor	x13,x13,x14
1871	eor	v0.16b,v0.16b,v24.16b
1872	eor	x15,x15,x16
1873	eor	v1.16b,v1.16b,v25.16b
1874	eor	x17,x17,x19
1875	eor	v2.16b,v2.16b,v26.16b
1876	eor	x20,x20,x21
1877	eor	v3.16b,v3.16b,v27.16b
1878	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1879
// Store the scalar block, then NEON block 0. x28 advances by 7 here; together
// with the +1 applied after the earlier scalar block in this function, that
// makes 8 for the two scalar-block stores visible in this view.
1880	stp	x5,x7,[x0,#0]		// store output
1881	add	x28,x28,#7			// increment counter
1882	stp	x9,x11,[x0,#16]
1883	stp	x13,x15,[x0,#32]
1884	stp	x17,x20,[x0,#48]
1885	add	x0,x0,#64
1886	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1887
// Pipeline the remaining blocks: registers of the block just stored are
// reused as the input buffer for the block after next.
1888	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1889	eor	v4.16b,v4.16b,v24.16b
1890	eor	v5.16b,v5.16b,v25.16b
1891	eor	v6.16b,v6.16b,v26.16b
1892	eor	v7.16b,v7.16b,v27.16b
1893	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1894
// v24-v27 are no longer needed as input buffers: reload their saved values
// from the stack off-load area.
1895	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1896	eor	v8.16b,v8.16b,v0.16b
1897	ldp	q24,q25,[sp,#0]
1898	eor	v9.16b,v9.16b,v1.16b
1899	ldp	q26,q27,[sp,#32]
1900	eor	v10.16b,v10.16b,v2.16b
1901	eor	v11.16b,v11.16b,v3.16b
1902	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1903
1904	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1905	eor	v12.16b,v12.16b,v4.16b
1906	eor	v13.16b,v13.16b,v5.16b
1907	eor	v14.16b,v14.16b,v6.16b
1908	eor	v15.16b,v15.16b,v7.16b
1909	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1910
1911	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1912	eor	v16.16b,v16.16b,v8.16b
1913	eor	v17.16b,v17.16b,v9.16b
1914	eor	v18.16b,v18.16b,v10.16b
1915	eor	v19.16b,v19.16b,v11.16b
1916	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1917
// v0 is free now; use it for the step constant doubled (v31 << 1).
1918	shl	v0.4s,v31.4s,#1			// 4 -> 8
1919	eor	v20.16b,v20.16b,v12.16b
1920	eor	v21.16b,v21.16b,v13.16b
1921	eor	v22.16b,v22.16b,v14.16b
1922	eor	v23.16b,v23.16b,v15.16b
1923	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1924
// Advance v27-v30 by 8, matching the scalar counter advance of 1 + 7.
1925	add	v27.4s,v27.4s,v0.4s			// += 8
1926	add	v28.4s,v28.4s,v0.4s
1927	add	v29.4s,v29.4s,v0.4s
1928	add	v30.4s,v30.4s,v0.4s
1929
// No instruction in this view has modified the condition flags up to here.
// NOTE(review): the flags tested are presumably from a length subtraction of
// 512 at the head of the outer loop (outside this view) - confirm.
1930	b.hs	.Loop_outer_512_neon
1931
// Fewer than 512 bytes remain. Add 512 back to x2; Z is set when x2 becomes
// zero. The NEON and load/store instructions below leave the flags intact
// for the b.eq.
1932	adds	x2,x2,#512
1933	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1934
// Restore the callee-saved low halves of v8-v15 from [sp,#128..#191].
1935	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1936	ldp	d10,d11,[sp,#128+16]
1937	ldp	d12,d13,[sp,#128+32]
1938	ldp	d14,d15,[sp,#128+48]
1939
// Overwrite the 96-byte off-load area at [sp,#0..#95] with the current
// contents of v24 and v31 so the values saved there do not linger on the
// stack.
1940	stp	q24,q31,[sp,#0]		// wipe off-load area
1941	stp	q24,q31,[sp,#32]
1942	stp	q24,q31,[sp,#64]
1943
1944	b.eq	.Ldone_512_neon
1945
// Leftover data: release this function's 128-byte area, subtract 1 from
// v27-v29, and hand over to .Loop_outer_neon when at least 192 bytes remain.
// NOTE(review): .Loop_outer_neon is outside this view; the -1 adjustment
// presumably matches the counter layout that loop expects - confirm.
1946	cmp	x2,#192
1947	sub	v27.4s,v27.4s,v0.4s			// -= 1
1948	sub	v28.4s,v28.4s,v0.4s
1949	sub	v29.4s,v29.4s,v0.4s
1950	add	sp,sp,#128
1951	b.hs	.Loop_outer_neon
1952
// Under 192 bytes left: zero v25-v30 and finish in the scalar-only
// .Loop_outer of ChaCha20_ctr32, which reads x0-x2, x22-x28 and x30.
1953	eor	v25.16b,v25.16b,v25.16b
1954	eor	v26.16b,v26.16b,v26.16b
1955	eor	v27.16b,v27.16b,v27.16b
1956	eor	v28.16b,v28.16b,v28.16b
1957	eor	v29.16b,v29.16b,v29.16b
1958	eor	v30.16b,v30.16b,v30.16b
1959	b	.Loop_outer
1960
// .Ldone_512_neon: exit path taken when no input remains (x2 == 0).
// Restores x19-x28 from the frame addressed by x29, releases the 128+64
// bytes of stack still allocated on this path, then pops x29/x30 together
// with the 96-byte register save area and returns.
// NOTE(review): the matching prologue is outside this view; offsets here
// assume it mirrors the 96-byte frame + 64-byte area used by ChaCha20_ctr32,
// plus 128 bytes for NEON off-load and d8-d15 - confirm.
1961.Ldone_512_neon:
1962	ldp	x19,x20,[x29,#16]
1963	add	sp,sp,#128+64
1964	ldp	x21,x22,[x29,#32]
1965	ldp	x23,x24,[x29,#48]
1966	ldp	x25,x26,[x29,#64]
1967	ldp	x27,x28,[x29,#80]
1968	ldp	x29,x30,[sp],#96
1969	ret
1970.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1971#endif
1972