1#if defined(__x86_64__)
2.text
3
4.extern	OPENSSL_ia32cap_P
5.hidden OPENSSL_ia32cap_P
6
7.align	64
8.Lzero:
9.long	0,0,0,0
10.Lone:
11.long	1,0,0,0
12.Linc:
13.long	0,1,2,3
14.Lfour:
15.long	4,4,4,4
16.Lincy:
17.long	0,2,4,6,1,3,5,7
18.Leight:
19.long	8,8,8,8,8,8,8,8
20.Lrot16:
21.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
22.Lrot24:
23.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
24.Lsigma:
25.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
26.align	64
27.Lzeroz:
28.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
29.Lfourz:
30.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
31.Lincz:
32.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
33.Lsixteen:
34.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
35.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
36.globl	ChaCha20_ctr32
37.hidden ChaCha20_ctr32
38.type	ChaCha20_ctr32,@function
39.align	64
40ChaCha20_ctr32:
41	cmpq	$0,%rdx
42	je	.Lno_data
43	movq	OPENSSL_ia32cap_P+4(%rip),%r10
44	testl	$512,%r10d
45	jnz	.LChaCha20_ssse3
46
47	pushq	%rbx
48	pushq	%rbp
49	pushq	%r12
50	pushq	%r13
51	pushq	%r14
52	pushq	%r15
53	subq	$64+24,%rsp
54.Lctr32_body:
55
56
57	movdqu	(%rcx),%xmm1
58	movdqu	16(%rcx),%xmm2
59	movdqu	(%r8),%xmm3
60	movdqa	.Lone(%rip),%xmm4
61
62
63	movdqa	%xmm1,16(%rsp)
64	movdqa	%xmm2,32(%rsp)
65	movdqa	%xmm3,48(%rsp)
66	movq	%rdx,%rbp
67	jmp	.Loop_outer
68
69.align	32
70.Loop_outer:
71	movl	$0x61707865,%eax
72	movl	$0x3320646e,%ebx
73	movl	$0x79622d32,%ecx
74	movl	$0x6b206574,%edx
75	movl	16(%rsp),%r8d
76	movl	20(%rsp),%r9d
77	movl	24(%rsp),%r10d
78	movl	28(%rsp),%r11d
79	movd	%xmm3,%r12d
80	movl	52(%rsp),%r13d
81	movl	56(%rsp),%r14d
82	movl	60(%rsp),%r15d
83
84	movq	%rbp,64+0(%rsp)
85	movl	$10,%ebp
86	movq	%rsi,64+8(%rsp)
87.byte	102,72,15,126,214
88	movq	%rdi,64+16(%rsp)
89	movq	%rsi,%rdi
90	shrq	$32,%rdi
91	jmp	.Loop
92
93.align	32
94.Loop:
95	addl	%r8d,%eax
96	xorl	%eax,%r12d
97	roll	$16,%r12d
98	addl	%r9d,%ebx
99	xorl	%ebx,%r13d
100	roll	$16,%r13d
101	addl	%r12d,%esi
102	xorl	%esi,%r8d
103	roll	$12,%r8d
104	addl	%r13d,%edi
105	xorl	%edi,%r9d
106	roll	$12,%r9d
107	addl	%r8d,%eax
108	xorl	%eax,%r12d
109	roll	$8,%r12d
110	addl	%r9d,%ebx
111	xorl	%ebx,%r13d
112	roll	$8,%r13d
113	addl	%r12d,%esi
114	xorl	%esi,%r8d
115	roll	$7,%r8d
116	addl	%r13d,%edi
117	xorl	%edi,%r9d
118	roll	$7,%r9d
119	movl	%esi,32(%rsp)
120	movl	%edi,36(%rsp)
121	movl	40(%rsp),%esi
122	movl	44(%rsp),%edi
123	addl	%r10d,%ecx
124	xorl	%ecx,%r14d
125	roll	$16,%r14d
126	addl	%r11d,%edx
127	xorl	%edx,%r15d
128	roll	$16,%r15d
129	addl	%r14d,%esi
130	xorl	%esi,%r10d
131	roll	$12,%r10d
132	addl	%r15d,%edi
133	xorl	%edi,%r11d
134	roll	$12,%r11d
135	addl	%r10d,%ecx
136	xorl	%ecx,%r14d
137	roll	$8,%r14d
138	addl	%r11d,%edx
139	xorl	%edx,%r15d
140	roll	$8,%r15d
141	addl	%r14d,%esi
142	xorl	%esi,%r10d
143	roll	$7,%r10d
144	addl	%r15d,%edi
145	xorl	%edi,%r11d
146	roll	$7,%r11d
147	addl	%r9d,%eax
148	xorl	%eax,%r15d
149	roll	$16,%r15d
150	addl	%r10d,%ebx
151	xorl	%ebx,%r12d
152	roll	$16,%r12d
153	addl	%r15d,%esi
154	xorl	%esi,%r9d
155	roll	$12,%r9d
156	addl	%r12d,%edi
157	xorl	%edi,%r10d
158	roll	$12,%r10d
159	addl	%r9d,%eax
160	xorl	%eax,%r15d
161	roll	$8,%r15d
162	addl	%r10d,%ebx
163	xorl	%ebx,%r12d
164	roll	$8,%r12d
165	addl	%r15d,%esi
166	xorl	%esi,%r9d
167	roll	$7,%r9d
168	addl	%r12d,%edi
169	xorl	%edi,%r10d
170	roll	$7,%r10d
171	movl	%esi,40(%rsp)
172	movl	%edi,44(%rsp)
173	movl	32(%rsp),%esi
174	movl	36(%rsp),%edi
175	addl	%r11d,%ecx
176	xorl	%ecx,%r13d
177	roll	$16,%r13d
178	addl	%r8d,%edx
179	xorl	%edx,%r14d
180	roll	$16,%r14d
181	addl	%r13d,%esi
182	xorl	%esi,%r11d
183	roll	$12,%r11d
184	addl	%r14d,%edi
185	xorl	%edi,%r8d
186	roll	$12,%r8d
187	addl	%r11d,%ecx
188	xorl	%ecx,%r13d
189	roll	$8,%r13d
190	addl	%r8d,%edx
191	xorl	%edx,%r14d
192	roll	$8,%r14d
193	addl	%r13d,%esi
194	xorl	%esi,%r11d
195	roll	$7,%r11d
196	addl	%r14d,%edi
197	xorl	%edi,%r8d
198	roll	$7,%r8d
199	decl	%ebp
200	jnz	.Loop
201	movl	%edi,36(%rsp)
202	movl	%esi,32(%rsp)
203	movq	64(%rsp),%rbp
204	movdqa	%xmm2,%xmm1
205	movq	64+8(%rsp),%rsi
206	paddd	%xmm4,%xmm3
207	movq	64+16(%rsp),%rdi
208
209	addl	$0x61707865,%eax
210	addl	$0x3320646e,%ebx
211	addl	$0x79622d32,%ecx
212	addl	$0x6b206574,%edx
213	addl	16(%rsp),%r8d
214	addl	20(%rsp),%r9d
215	addl	24(%rsp),%r10d
216	addl	28(%rsp),%r11d
217	addl	48(%rsp),%r12d
218	addl	52(%rsp),%r13d
219	addl	56(%rsp),%r14d
220	addl	60(%rsp),%r15d
221	paddd	32(%rsp),%xmm1
222
223	cmpq	$64,%rbp
224	jb	.Ltail
225
226	xorl	0(%rsi),%eax
227	xorl	4(%rsi),%ebx
228	xorl	8(%rsi),%ecx
229	xorl	12(%rsi),%edx
230	xorl	16(%rsi),%r8d
231	xorl	20(%rsi),%r9d
232	xorl	24(%rsi),%r10d
233	xorl	28(%rsi),%r11d
234	movdqu	32(%rsi),%xmm0
235	xorl	48(%rsi),%r12d
236	xorl	52(%rsi),%r13d
237	xorl	56(%rsi),%r14d
238	xorl	60(%rsi),%r15d
239	leaq	64(%rsi),%rsi
240	pxor	%xmm1,%xmm0
241
242	movdqa	%xmm2,32(%rsp)
243	movd	%xmm3,48(%rsp)
244
245	movl	%eax,0(%rdi)
246	movl	%ebx,4(%rdi)
247	movl	%ecx,8(%rdi)
248	movl	%edx,12(%rdi)
249	movl	%r8d,16(%rdi)
250	movl	%r9d,20(%rdi)
251	movl	%r10d,24(%rdi)
252	movl	%r11d,28(%rdi)
253	movdqu	%xmm0,32(%rdi)
254	movl	%r12d,48(%rdi)
255	movl	%r13d,52(%rdi)
256	movl	%r14d,56(%rdi)
257	movl	%r15d,60(%rdi)
258	leaq	64(%rdi),%rdi
259
260	subq	$64,%rbp
261	jnz	.Loop_outer
262
263	jmp	.Ldone
264
265.align	16
266.Ltail:
267	movl	%eax,0(%rsp)
268	movl	%ebx,4(%rsp)
269	xorq	%rbx,%rbx
270	movl	%ecx,8(%rsp)
271	movl	%edx,12(%rsp)
272	movl	%r8d,16(%rsp)
273	movl	%r9d,20(%rsp)
274	movl	%r10d,24(%rsp)
275	movl	%r11d,28(%rsp)
276	movdqa	%xmm1,32(%rsp)
277	movl	%r12d,48(%rsp)
278	movl	%r13d,52(%rsp)
279	movl	%r14d,56(%rsp)
280	movl	%r15d,60(%rsp)
281
282.Loop_tail:
283	movzbl	(%rsi,%rbx,1),%eax
284	movzbl	(%rsp,%rbx,1),%edx
285	leaq	1(%rbx),%rbx
286	xorl	%edx,%eax
287	movb	%al,-1(%rdi,%rbx,1)
288	decq	%rbp
289	jnz	.Loop_tail
290
291.Ldone:
292	leaq	64+24+48(%rsp),%rsi
293	movq	-48(%rsi),%r15
294	movq	-40(%rsi),%r14
295	movq	-32(%rsi),%r13
296	movq	-24(%rsi),%r12
297	movq	-16(%rsi),%rbp
298	movq	-8(%rsi),%rbx
299	leaq	(%rsi),%rsp
300.Lno_data:
301	.byte	0xf3,0xc3
302.size	ChaCha20_ctr32,.-ChaCha20_ctr32
303.type	ChaCha20_ssse3,@function
304.align	32
305ChaCha20_ssse3:
306.LChaCha20_ssse3:
307	movq	%rsp,%r9
308	cmpq	$128,%rdx
309	ja	.LChaCha20_4x
310
311.Ldo_sse3_after_all:
312	subq	$64+8,%rsp
313	movdqa	.Lsigma(%rip),%xmm0
314	movdqu	(%rcx),%xmm1
315	movdqu	16(%rcx),%xmm2
316	movdqu	(%r8),%xmm3
317	movdqa	.Lrot16(%rip),%xmm6
318	movdqa	.Lrot24(%rip),%xmm7
319
320	movdqa	%xmm0,0(%rsp)
321	movdqa	%xmm1,16(%rsp)
322	movdqa	%xmm2,32(%rsp)
323	movdqa	%xmm3,48(%rsp)
324	movq	$10,%r8
325	jmp	.Loop_ssse3
326
327.align	32
328.Loop_outer_ssse3:
329	movdqa	.Lone(%rip),%xmm3
330	movdqa	0(%rsp),%xmm0
331	movdqa	16(%rsp),%xmm1
332	movdqa	32(%rsp),%xmm2
333	paddd	48(%rsp),%xmm3
334	movq	$10,%r8
335	movdqa	%xmm3,48(%rsp)
336	jmp	.Loop_ssse3
337
338.align	32
339.Loop_ssse3:
340	paddd	%xmm1,%xmm0
341	pxor	%xmm0,%xmm3
342.byte	102,15,56,0,222
343	paddd	%xmm3,%xmm2
344	pxor	%xmm2,%xmm1
345	movdqa	%xmm1,%xmm4
346	psrld	$20,%xmm1
347	pslld	$12,%xmm4
348	por	%xmm4,%xmm1
349	paddd	%xmm1,%xmm0
350	pxor	%xmm0,%xmm3
351.byte	102,15,56,0,223
352	paddd	%xmm3,%xmm2
353	pxor	%xmm2,%xmm1
354	movdqa	%xmm1,%xmm4
355	psrld	$25,%xmm1
356	pslld	$7,%xmm4
357	por	%xmm4,%xmm1
358	pshufd	$78,%xmm2,%xmm2
359	pshufd	$57,%xmm1,%xmm1
360	pshufd	$147,%xmm3,%xmm3
361	nop
362	paddd	%xmm1,%xmm0
363	pxor	%xmm0,%xmm3
364.byte	102,15,56,0,222
365	paddd	%xmm3,%xmm2
366	pxor	%xmm2,%xmm1
367	movdqa	%xmm1,%xmm4
368	psrld	$20,%xmm1
369	pslld	$12,%xmm4
370	por	%xmm4,%xmm1
371	paddd	%xmm1,%xmm0
372	pxor	%xmm0,%xmm3
373.byte	102,15,56,0,223
374	paddd	%xmm3,%xmm2
375	pxor	%xmm2,%xmm1
376	movdqa	%xmm1,%xmm4
377	psrld	$25,%xmm1
378	pslld	$7,%xmm4
379	por	%xmm4,%xmm1
380	pshufd	$78,%xmm2,%xmm2
381	pshufd	$147,%xmm1,%xmm1
382	pshufd	$57,%xmm3,%xmm3
383	decq	%r8
384	jnz	.Loop_ssse3
385	paddd	0(%rsp),%xmm0
386	paddd	16(%rsp),%xmm1
387	paddd	32(%rsp),%xmm2
388	paddd	48(%rsp),%xmm3
389
390	cmpq	$64,%rdx
391	jb	.Ltail_ssse3
392
393	movdqu	0(%rsi),%xmm4
394	movdqu	16(%rsi),%xmm5
395	pxor	%xmm4,%xmm0
396	movdqu	32(%rsi),%xmm4
397	pxor	%xmm5,%xmm1
398	movdqu	48(%rsi),%xmm5
399	leaq	64(%rsi),%rsi
400	pxor	%xmm4,%xmm2
401	pxor	%xmm5,%xmm3
402
403	movdqu	%xmm0,0(%rdi)
404	movdqu	%xmm1,16(%rdi)
405	movdqu	%xmm2,32(%rdi)
406	movdqu	%xmm3,48(%rdi)
407	leaq	64(%rdi),%rdi
408
409	subq	$64,%rdx
410	jnz	.Loop_outer_ssse3
411
412	jmp	.Ldone_ssse3
413
414.align	16
415.Ltail_ssse3:
416	movdqa	%xmm0,0(%rsp)
417	movdqa	%xmm1,16(%rsp)
418	movdqa	%xmm2,32(%rsp)
419	movdqa	%xmm3,48(%rsp)
420	xorq	%r8,%r8
421
422.Loop_tail_ssse3:
423	movzbl	(%rsi,%r8,1),%eax
424	movzbl	(%rsp,%r8,1),%ecx
425	leaq	1(%r8),%r8
426	xorl	%ecx,%eax
427	movb	%al,-1(%rdi,%r8,1)
428	decq	%rdx
429	jnz	.Loop_tail_ssse3
430
431.Ldone_ssse3:
432	leaq	(%r9),%rsp
433.Lssse3_epilogue:
434	.byte	0xf3,0xc3
435.size	ChaCha20_ssse3,.-ChaCha20_ssse3
436.type	ChaCha20_4x,@function
437.align	32
438ChaCha20_4x:
439.LChaCha20_4x:
440	movq	%rsp,%r9
441	movq	%r10,%r11
442	shrq	$32,%r10
443	testq	$32,%r10
444	jnz	.LChaCha20_8x
445	cmpq	$192,%rdx
446	ja	.Lproceed4x
447
448	andq	$71303168,%r11
449	cmpq	$4194304,%r11
450	je	.Ldo_sse3_after_all
451
452.Lproceed4x:
453	subq	$0x140+8,%rsp
454	movdqa	.Lsigma(%rip),%xmm11
455	movdqu	(%rcx),%xmm15
456	movdqu	16(%rcx),%xmm7
457	movdqu	(%r8),%xmm3
458	leaq	256(%rsp),%rcx
459	leaq	.Lrot16(%rip),%r10
460	leaq	.Lrot24(%rip),%r11
461
462	pshufd	$0x00,%xmm11,%xmm8
463	pshufd	$0x55,%xmm11,%xmm9
464	movdqa	%xmm8,64(%rsp)
465	pshufd	$0xaa,%xmm11,%xmm10
466	movdqa	%xmm9,80(%rsp)
467	pshufd	$0xff,%xmm11,%xmm11
468	movdqa	%xmm10,96(%rsp)
469	movdqa	%xmm11,112(%rsp)
470
471	pshufd	$0x00,%xmm15,%xmm12
472	pshufd	$0x55,%xmm15,%xmm13
473	movdqa	%xmm12,128-256(%rcx)
474	pshufd	$0xaa,%xmm15,%xmm14
475	movdqa	%xmm13,144-256(%rcx)
476	pshufd	$0xff,%xmm15,%xmm15
477	movdqa	%xmm14,160-256(%rcx)
478	movdqa	%xmm15,176-256(%rcx)
479
480	pshufd	$0x00,%xmm7,%xmm4
481	pshufd	$0x55,%xmm7,%xmm5
482	movdqa	%xmm4,192-256(%rcx)
483	pshufd	$0xaa,%xmm7,%xmm6
484	movdqa	%xmm5,208-256(%rcx)
485	pshufd	$0xff,%xmm7,%xmm7
486	movdqa	%xmm6,224-256(%rcx)
487	movdqa	%xmm7,240-256(%rcx)
488
489	pshufd	$0x00,%xmm3,%xmm0
490	pshufd	$0x55,%xmm3,%xmm1
491	paddd	.Linc(%rip),%xmm0
492	pshufd	$0xaa,%xmm3,%xmm2
493	movdqa	%xmm1,272-256(%rcx)
494	pshufd	$0xff,%xmm3,%xmm3
495	movdqa	%xmm2,288-256(%rcx)
496	movdqa	%xmm3,304-256(%rcx)
497
498	jmp	.Loop_enter4x
499
500.align	32
501.Loop_outer4x:
502	movdqa	64(%rsp),%xmm8
503	movdqa	80(%rsp),%xmm9
504	movdqa	96(%rsp),%xmm10
505	movdqa	112(%rsp),%xmm11
506	movdqa	128-256(%rcx),%xmm12
507	movdqa	144-256(%rcx),%xmm13
508	movdqa	160-256(%rcx),%xmm14
509	movdqa	176-256(%rcx),%xmm15
510	movdqa	192-256(%rcx),%xmm4
511	movdqa	208-256(%rcx),%xmm5
512	movdqa	224-256(%rcx),%xmm6
513	movdqa	240-256(%rcx),%xmm7
514	movdqa	256-256(%rcx),%xmm0
515	movdqa	272-256(%rcx),%xmm1
516	movdqa	288-256(%rcx),%xmm2
517	movdqa	304-256(%rcx),%xmm3
518	paddd	.Lfour(%rip),%xmm0
519
520.Loop_enter4x:
521	movdqa	%xmm6,32(%rsp)
522	movdqa	%xmm7,48(%rsp)
523	movdqa	(%r10),%xmm7
524	movl	$10,%eax
525	movdqa	%xmm0,256-256(%rcx)
526	jmp	.Loop4x
527
528.align	32
529.Loop4x:
530	paddd	%xmm12,%xmm8
531	paddd	%xmm13,%xmm9
532	pxor	%xmm8,%xmm0
533	pxor	%xmm9,%xmm1
534.byte	102,15,56,0,199
535.byte	102,15,56,0,207
536	paddd	%xmm0,%xmm4
537	paddd	%xmm1,%xmm5
538	pxor	%xmm4,%xmm12
539	pxor	%xmm5,%xmm13
540	movdqa	%xmm12,%xmm6
541	pslld	$12,%xmm12
542	psrld	$20,%xmm6
543	movdqa	%xmm13,%xmm7
544	pslld	$12,%xmm13
545	por	%xmm6,%xmm12
546	psrld	$20,%xmm7
547	movdqa	(%r11),%xmm6
548	por	%xmm7,%xmm13
549	paddd	%xmm12,%xmm8
550	paddd	%xmm13,%xmm9
551	pxor	%xmm8,%xmm0
552	pxor	%xmm9,%xmm1
553.byte	102,15,56,0,198
554.byte	102,15,56,0,206
555	paddd	%xmm0,%xmm4
556	paddd	%xmm1,%xmm5
557	pxor	%xmm4,%xmm12
558	pxor	%xmm5,%xmm13
559	movdqa	%xmm12,%xmm7
560	pslld	$7,%xmm12
561	psrld	$25,%xmm7
562	movdqa	%xmm13,%xmm6
563	pslld	$7,%xmm13
564	por	%xmm7,%xmm12
565	psrld	$25,%xmm6
566	movdqa	(%r10),%xmm7
567	por	%xmm6,%xmm13
568	movdqa	%xmm4,0(%rsp)
569	movdqa	%xmm5,16(%rsp)
570	movdqa	32(%rsp),%xmm4
571	movdqa	48(%rsp),%xmm5
572	paddd	%xmm14,%xmm10
573	paddd	%xmm15,%xmm11
574	pxor	%xmm10,%xmm2
575	pxor	%xmm11,%xmm3
576.byte	102,15,56,0,215
577.byte	102,15,56,0,223
578	paddd	%xmm2,%xmm4
579	paddd	%xmm3,%xmm5
580	pxor	%xmm4,%xmm14
581	pxor	%xmm5,%xmm15
582	movdqa	%xmm14,%xmm6
583	pslld	$12,%xmm14
584	psrld	$20,%xmm6
585	movdqa	%xmm15,%xmm7
586	pslld	$12,%xmm15
587	por	%xmm6,%xmm14
588	psrld	$20,%xmm7
589	movdqa	(%r11),%xmm6
590	por	%xmm7,%xmm15
591	paddd	%xmm14,%xmm10
592	paddd	%xmm15,%xmm11
593	pxor	%xmm10,%xmm2
594	pxor	%xmm11,%xmm3
595.byte	102,15,56,0,214
596.byte	102,15,56,0,222
597	paddd	%xmm2,%xmm4
598	paddd	%xmm3,%xmm5
599	pxor	%xmm4,%xmm14
600	pxor	%xmm5,%xmm15
601	movdqa	%xmm14,%xmm7
602	pslld	$7,%xmm14
603	psrld	$25,%xmm7
604	movdqa	%xmm15,%xmm6
605	pslld	$7,%xmm15
606	por	%xmm7,%xmm14
607	psrld	$25,%xmm6
608	movdqa	(%r10),%xmm7
609	por	%xmm6,%xmm15
610	paddd	%xmm13,%xmm8
611	paddd	%xmm14,%xmm9
612	pxor	%xmm8,%xmm3
613	pxor	%xmm9,%xmm0
614.byte	102,15,56,0,223
615.byte	102,15,56,0,199
616	paddd	%xmm3,%xmm4
617	paddd	%xmm0,%xmm5
618	pxor	%xmm4,%xmm13
619	pxor	%xmm5,%xmm14
620	movdqa	%xmm13,%xmm6
621	pslld	$12,%xmm13
622	psrld	$20,%xmm6
623	movdqa	%xmm14,%xmm7
624	pslld	$12,%xmm14
625	por	%xmm6,%xmm13
626	psrld	$20,%xmm7
627	movdqa	(%r11),%xmm6
628	por	%xmm7,%xmm14
629	paddd	%xmm13,%xmm8
630	paddd	%xmm14,%xmm9
631	pxor	%xmm8,%xmm3
632	pxor	%xmm9,%xmm0
633.byte	102,15,56,0,222
634.byte	102,15,56,0,198
635	paddd	%xmm3,%xmm4
636	paddd	%xmm0,%xmm5
637	pxor	%xmm4,%xmm13
638	pxor	%xmm5,%xmm14
639	movdqa	%xmm13,%xmm7
640	pslld	$7,%xmm13
641	psrld	$25,%xmm7
642	movdqa	%xmm14,%xmm6
643	pslld	$7,%xmm14
644	por	%xmm7,%xmm13
645	psrld	$25,%xmm6
646	movdqa	(%r10),%xmm7
647	por	%xmm6,%xmm14
648	movdqa	%xmm4,32(%rsp)
649	movdqa	%xmm5,48(%rsp)
650	movdqa	0(%rsp),%xmm4
651	movdqa	16(%rsp),%xmm5
652	paddd	%xmm15,%xmm10
653	paddd	%xmm12,%xmm11
654	pxor	%xmm10,%xmm1
655	pxor	%xmm11,%xmm2
656.byte	102,15,56,0,207
657.byte	102,15,56,0,215
658	paddd	%xmm1,%xmm4
659	paddd	%xmm2,%xmm5
660	pxor	%xmm4,%xmm15
661	pxor	%xmm5,%xmm12
662	movdqa	%xmm15,%xmm6
663	pslld	$12,%xmm15
664	psrld	$20,%xmm6
665	movdqa	%xmm12,%xmm7
666	pslld	$12,%xmm12
667	por	%xmm6,%xmm15
668	psrld	$20,%xmm7
669	movdqa	(%r11),%xmm6
670	por	%xmm7,%xmm12
671	paddd	%xmm15,%xmm10
672	paddd	%xmm12,%xmm11
673	pxor	%xmm10,%xmm1
674	pxor	%xmm11,%xmm2
675.byte	102,15,56,0,206
676.byte	102,15,56,0,214
677	paddd	%xmm1,%xmm4
678	paddd	%xmm2,%xmm5
679	pxor	%xmm4,%xmm15
680	pxor	%xmm5,%xmm12
681	movdqa	%xmm15,%xmm7
682	pslld	$7,%xmm15
683	psrld	$25,%xmm7
684	movdqa	%xmm12,%xmm6
685	pslld	$7,%xmm12
686	por	%xmm7,%xmm15
687	psrld	$25,%xmm6
688	movdqa	(%r10),%xmm7
689	por	%xmm6,%xmm12
690	decl	%eax
691	jnz	.Loop4x
692
693	paddd	64(%rsp),%xmm8
694	paddd	80(%rsp),%xmm9
695	paddd	96(%rsp),%xmm10
696	paddd	112(%rsp),%xmm11
697
698	movdqa	%xmm8,%xmm6
699	punpckldq	%xmm9,%xmm8
700	movdqa	%xmm10,%xmm7
701	punpckldq	%xmm11,%xmm10
702	punpckhdq	%xmm9,%xmm6
703	punpckhdq	%xmm11,%xmm7
704	movdqa	%xmm8,%xmm9
705	punpcklqdq	%xmm10,%xmm8
706	movdqa	%xmm6,%xmm11
707	punpcklqdq	%xmm7,%xmm6
708	punpckhqdq	%xmm10,%xmm9
709	punpckhqdq	%xmm7,%xmm11
710	paddd	128-256(%rcx),%xmm12
711	paddd	144-256(%rcx),%xmm13
712	paddd	160-256(%rcx),%xmm14
713	paddd	176-256(%rcx),%xmm15
714
715	movdqa	%xmm8,0(%rsp)
716	movdqa	%xmm9,16(%rsp)
717	movdqa	32(%rsp),%xmm8
718	movdqa	48(%rsp),%xmm9
719
720	movdqa	%xmm12,%xmm10
721	punpckldq	%xmm13,%xmm12
722	movdqa	%xmm14,%xmm7
723	punpckldq	%xmm15,%xmm14
724	punpckhdq	%xmm13,%xmm10
725	punpckhdq	%xmm15,%xmm7
726	movdqa	%xmm12,%xmm13
727	punpcklqdq	%xmm14,%xmm12
728	movdqa	%xmm10,%xmm15
729	punpcklqdq	%xmm7,%xmm10
730	punpckhqdq	%xmm14,%xmm13
731	punpckhqdq	%xmm7,%xmm15
732	paddd	192-256(%rcx),%xmm4
733	paddd	208-256(%rcx),%xmm5
734	paddd	224-256(%rcx),%xmm8
735	paddd	240-256(%rcx),%xmm9
736
737	movdqa	%xmm6,32(%rsp)
738	movdqa	%xmm11,48(%rsp)
739
740	movdqa	%xmm4,%xmm14
741	punpckldq	%xmm5,%xmm4
742	movdqa	%xmm8,%xmm7
743	punpckldq	%xmm9,%xmm8
744	punpckhdq	%xmm5,%xmm14
745	punpckhdq	%xmm9,%xmm7
746	movdqa	%xmm4,%xmm5
747	punpcklqdq	%xmm8,%xmm4
748	movdqa	%xmm14,%xmm9
749	punpcklqdq	%xmm7,%xmm14
750	punpckhqdq	%xmm8,%xmm5
751	punpckhqdq	%xmm7,%xmm9
752	paddd	256-256(%rcx),%xmm0
753	paddd	272-256(%rcx),%xmm1
754	paddd	288-256(%rcx),%xmm2
755	paddd	304-256(%rcx),%xmm3
756
757	movdqa	%xmm0,%xmm8
758	punpckldq	%xmm1,%xmm0
759	movdqa	%xmm2,%xmm7
760	punpckldq	%xmm3,%xmm2
761	punpckhdq	%xmm1,%xmm8
762	punpckhdq	%xmm3,%xmm7
763	movdqa	%xmm0,%xmm1
764	punpcklqdq	%xmm2,%xmm0
765	movdqa	%xmm8,%xmm3
766	punpcklqdq	%xmm7,%xmm8
767	punpckhqdq	%xmm2,%xmm1
768	punpckhqdq	%xmm7,%xmm3
769	cmpq	$256,%rdx
770	jb	.Ltail4x
771
772	movdqu	0(%rsi),%xmm6
773	movdqu	16(%rsi),%xmm11
774	movdqu	32(%rsi),%xmm2
775	movdqu	48(%rsi),%xmm7
776	pxor	0(%rsp),%xmm6
777	pxor	%xmm12,%xmm11
778	pxor	%xmm4,%xmm2
779	pxor	%xmm0,%xmm7
780
781	movdqu	%xmm6,0(%rdi)
782	movdqu	64(%rsi),%xmm6
783	movdqu	%xmm11,16(%rdi)
784	movdqu	80(%rsi),%xmm11
785	movdqu	%xmm2,32(%rdi)
786	movdqu	96(%rsi),%xmm2
787	movdqu	%xmm7,48(%rdi)
788	movdqu	112(%rsi),%xmm7
789	leaq	128(%rsi),%rsi
790	pxor	16(%rsp),%xmm6
791	pxor	%xmm13,%xmm11
792	pxor	%xmm5,%xmm2
793	pxor	%xmm1,%xmm7
794
795	movdqu	%xmm6,64(%rdi)
796	movdqu	0(%rsi),%xmm6
797	movdqu	%xmm11,80(%rdi)
798	movdqu	16(%rsi),%xmm11
799	movdqu	%xmm2,96(%rdi)
800	movdqu	32(%rsi),%xmm2
801	movdqu	%xmm7,112(%rdi)
802	leaq	128(%rdi),%rdi
803	movdqu	48(%rsi),%xmm7
804	pxor	32(%rsp),%xmm6
805	pxor	%xmm10,%xmm11
806	pxor	%xmm14,%xmm2
807	pxor	%xmm8,%xmm7
808
809	movdqu	%xmm6,0(%rdi)
810	movdqu	64(%rsi),%xmm6
811	movdqu	%xmm11,16(%rdi)
812	movdqu	80(%rsi),%xmm11
813	movdqu	%xmm2,32(%rdi)
814	movdqu	96(%rsi),%xmm2
815	movdqu	%xmm7,48(%rdi)
816	movdqu	112(%rsi),%xmm7
817	leaq	128(%rsi),%rsi
818	pxor	48(%rsp),%xmm6
819	pxor	%xmm15,%xmm11
820	pxor	%xmm9,%xmm2
821	pxor	%xmm3,%xmm7
822	movdqu	%xmm6,64(%rdi)
823	movdqu	%xmm11,80(%rdi)
824	movdqu	%xmm2,96(%rdi)
825	movdqu	%xmm7,112(%rdi)
826	leaq	128(%rdi),%rdi
827
828	subq	$256,%rdx
829	jnz	.Loop_outer4x
830
831	jmp	.Ldone4x
832
833.Ltail4x:
834	cmpq	$192,%rdx
835	jae	.L192_or_more4x
836	cmpq	$128,%rdx
837	jae	.L128_or_more4x
838	cmpq	$64,%rdx
839	jae	.L64_or_more4x
840
841
842	xorq	%r10,%r10
843
844	movdqa	%xmm12,16(%rsp)
845	movdqa	%xmm4,32(%rsp)
846	movdqa	%xmm0,48(%rsp)
847	jmp	.Loop_tail4x
848
849.align	32
850.L64_or_more4x:
851	movdqu	0(%rsi),%xmm6
852	movdqu	16(%rsi),%xmm11
853	movdqu	32(%rsi),%xmm2
854	movdqu	48(%rsi),%xmm7
855	pxor	0(%rsp),%xmm6
856	pxor	%xmm12,%xmm11
857	pxor	%xmm4,%xmm2
858	pxor	%xmm0,%xmm7
859	movdqu	%xmm6,0(%rdi)
860	movdqu	%xmm11,16(%rdi)
861	movdqu	%xmm2,32(%rdi)
862	movdqu	%xmm7,48(%rdi)
863	je	.Ldone4x
864
865	movdqa	16(%rsp),%xmm6
866	leaq	64(%rsi),%rsi
867	xorq	%r10,%r10
868	movdqa	%xmm6,0(%rsp)
869	movdqa	%xmm13,16(%rsp)
870	leaq	64(%rdi),%rdi
871	movdqa	%xmm5,32(%rsp)
872	subq	$64,%rdx
873	movdqa	%xmm1,48(%rsp)
874	jmp	.Loop_tail4x
875
876.align	32
877.L128_or_more4x:
878	movdqu	0(%rsi),%xmm6
879	movdqu	16(%rsi),%xmm11
880	movdqu	32(%rsi),%xmm2
881	movdqu	48(%rsi),%xmm7
882	pxor	0(%rsp),%xmm6
883	pxor	%xmm12,%xmm11
884	pxor	%xmm4,%xmm2
885	pxor	%xmm0,%xmm7
886
887	movdqu	%xmm6,0(%rdi)
888	movdqu	64(%rsi),%xmm6
889	movdqu	%xmm11,16(%rdi)
890	movdqu	80(%rsi),%xmm11
891	movdqu	%xmm2,32(%rdi)
892	movdqu	96(%rsi),%xmm2
893	movdqu	%xmm7,48(%rdi)
894	movdqu	112(%rsi),%xmm7
895	pxor	16(%rsp),%xmm6
896	pxor	%xmm13,%xmm11
897	pxor	%xmm5,%xmm2
898	pxor	%xmm1,%xmm7
899	movdqu	%xmm6,64(%rdi)
900	movdqu	%xmm11,80(%rdi)
901	movdqu	%xmm2,96(%rdi)
902	movdqu	%xmm7,112(%rdi)
903	je	.Ldone4x
904
905	movdqa	32(%rsp),%xmm6
906	leaq	128(%rsi),%rsi
907	xorq	%r10,%r10
908	movdqa	%xmm6,0(%rsp)
909	movdqa	%xmm10,16(%rsp)
910	leaq	128(%rdi),%rdi
911	movdqa	%xmm14,32(%rsp)
912	subq	$128,%rdx
913	movdqa	%xmm8,48(%rsp)
914	jmp	.Loop_tail4x
915
916.align	32
917.L192_or_more4x:
918	movdqu	0(%rsi),%xmm6
919	movdqu	16(%rsi),%xmm11
920	movdqu	32(%rsi),%xmm2
921	movdqu	48(%rsi),%xmm7
922	pxor	0(%rsp),%xmm6
923	pxor	%xmm12,%xmm11
924	pxor	%xmm4,%xmm2
925	pxor	%xmm0,%xmm7
926
927	movdqu	%xmm6,0(%rdi)
928	movdqu	64(%rsi),%xmm6
929	movdqu	%xmm11,16(%rdi)
930	movdqu	80(%rsi),%xmm11
931	movdqu	%xmm2,32(%rdi)
932	movdqu	96(%rsi),%xmm2
933	movdqu	%xmm7,48(%rdi)
934	movdqu	112(%rsi),%xmm7
935	leaq	128(%rsi),%rsi
936	pxor	16(%rsp),%xmm6
937	pxor	%xmm13,%xmm11
938	pxor	%xmm5,%xmm2
939	pxor	%xmm1,%xmm7
940
941	movdqu	%xmm6,64(%rdi)
942	movdqu	0(%rsi),%xmm6
943	movdqu	%xmm11,80(%rdi)
944	movdqu	16(%rsi),%xmm11
945	movdqu	%xmm2,96(%rdi)
946	movdqu	32(%rsi),%xmm2
947	movdqu	%xmm7,112(%rdi)
948	leaq	128(%rdi),%rdi
949	movdqu	48(%rsi),%xmm7
950	pxor	32(%rsp),%xmm6
951	pxor	%xmm10,%xmm11
952	pxor	%xmm14,%xmm2
953	pxor	%xmm8,%xmm7
954	movdqu	%xmm6,0(%rdi)
955	movdqu	%xmm11,16(%rdi)
956	movdqu	%xmm2,32(%rdi)
957	movdqu	%xmm7,48(%rdi)
958	je	.Ldone4x
959
960	movdqa	48(%rsp),%xmm6
961	leaq	64(%rsi),%rsi
962	xorq	%r10,%r10
963	movdqa	%xmm6,0(%rsp)
964	movdqa	%xmm15,16(%rsp)
965	leaq	64(%rdi),%rdi
966	movdqa	%xmm9,32(%rsp)
967	subq	$192,%rdx
968	movdqa	%xmm3,48(%rsp)
969
970.Loop_tail4x:
971	movzbl	(%rsi,%r10,1),%eax
972	movzbl	(%rsp,%r10,1),%ecx
973	leaq	1(%r10),%r10
974	xorl	%ecx,%eax
975	movb	%al,-1(%rdi,%r10,1)
976	decq	%rdx
977	jnz	.Loop_tail4x
978
979.Ldone4x:
980	leaq	(%r9),%rsp
981.L4x_epilogue:
982	.byte	0xf3,0xc3
983.size	ChaCha20_4x,.-ChaCha20_4x
984.type	ChaCha20_8x,@function
985.align	32
986ChaCha20_8x:
987.LChaCha20_8x:
988	movq	%rsp,%r9
989	subq	$0x280+8,%rsp
990	andq	$-32,%rsp
991	vzeroupper
992
993
994
995
996
997
998
999
1000
1001
1002	vbroadcasti128	.Lsigma(%rip),%ymm11
1003	vbroadcasti128	(%rcx),%ymm3
1004	vbroadcasti128	16(%rcx),%ymm15
1005	vbroadcasti128	(%r8),%ymm7
1006	leaq	256(%rsp),%rcx
1007	leaq	512(%rsp),%rax
1008	leaq	.Lrot16(%rip),%r10
1009	leaq	.Lrot24(%rip),%r11
1010
1011	vpshufd	$0x00,%ymm11,%ymm8
1012	vpshufd	$0x55,%ymm11,%ymm9
1013	vmovdqa	%ymm8,128-256(%rcx)
1014	vpshufd	$0xaa,%ymm11,%ymm10
1015	vmovdqa	%ymm9,160-256(%rcx)
1016	vpshufd	$0xff,%ymm11,%ymm11
1017	vmovdqa	%ymm10,192-256(%rcx)
1018	vmovdqa	%ymm11,224-256(%rcx)
1019
1020	vpshufd	$0x00,%ymm3,%ymm0
1021	vpshufd	$0x55,%ymm3,%ymm1
1022	vmovdqa	%ymm0,256-256(%rcx)
1023	vpshufd	$0xaa,%ymm3,%ymm2
1024	vmovdqa	%ymm1,288-256(%rcx)
1025	vpshufd	$0xff,%ymm3,%ymm3
1026	vmovdqa	%ymm2,320-256(%rcx)
1027	vmovdqa	%ymm3,352-256(%rcx)
1028
1029	vpshufd	$0x00,%ymm15,%ymm12
1030	vpshufd	$0x55,%ymm15,%ymm13
1031	vmovdqa	%ymm12,384-512(%rax)
1032	vpshufd	$0xaa,%ymm15,%ymm14
1033	vmovdqa	%ymm13,416-512(%rax)
1034	vpshufd	$0xff,%ymm15,%ymm15
1035	vmovdqa	%ymm14,448-512(%rax)
1036	vmovdqa	%ymm15,480-512(%rax)
1037
1038	vpshufd	$0x00,%ymm7,%ymm4
1039	vpshufd	$0x55,%ymm7,%ymm5
1040	vpaddd	.Lincy(%rip),%ymm4,%ymm4
1041	vpshufd	$0xaa,%ymm7,%ymm6
1042	vmovdqa	%ymm5,544-512(%rax)
1043	vpshufd	$0xff,%ymm7,%ymm7
1044	vmovdqa	%ymm6,576-512(%rax)
1045	vmovdqa	%ymm7,608-512(%rax)
1046
1047	jmp	.Loop_enter8x
1048
1049.align	32
1050.Loop_outer8x:
1051	vmovdqa	128-256(%rcx),%ymm8
1052	vmovdqa	160-256(%rcx),%ymm9
1053	vmovdqa	192-256(%rcx),%ymm10
1054	vmovdqa	224-256(%rcx),%ymm11
1055	vmovdqa	256-256(%rcx),%ymm0
1056	vmovdqa	288-256(%rcx),%ymm1
1057	vmovdqa	320-256(%rcx),%ymm2
1058	vmovdqa	352-256(%rcx),%ymm3
1059	vmovdqa	384-512(%rax),%ymm12
1060	vmovdqa	416-512(%rax),%ymm13
1061	vmovdqa	448-512(%rax),%ymm14
1062	vmovdqa	480-512(%rax),%ymm15
1063	vmovdqa	512-512(%rax),%ymm4
1064	vmovdqa	544-512(%rax),%ymm5
1065	vmovdqa	576-512(%rax),%ymm6
1066	vmovdqa	608-512(%rax),%ymm7
1067	vpaddd	.Leight(%rip),%ymm4,%ymm4
1068
1069.Loop_enter8x:
1070	vmovdqa	%ymm14,64(%rsp)
1071	vmovdqa	%ymm15,96(%rsp)
1072	vbroadcasti128	(%r10),%ymm15
1073	vmovdqa	%ymm4,512-512(%rax)
1074	movl	$10,%eax
1075	jmp	.Loop8x
1076
1077.align	32
1078.Loop8x:
1079	vpaddd	%ymm0,%ymm8,%ymm8
1080	vpxor	%ymm4,%ymm8,%ymm4
1081	vpshufb	%ymm15,%ymm4,%ymm4
1082	vpaddd	%ymm1,%ymm9,%ymm9
1083	vpxor	%ymm5,%ymm9,%ymm5
1084	vpshufb	%ymm15,%ymm5,%ymm5
1085	vpaddd	%ymm4,%ymm12,%ymm12
1086	vpxor	%ymm0,%ymm12,%ymm0
1087	vpslld	$12,%ymm0,%ymm14
1088	vpsrld	$20,%ymm0,%ymm0
1089	vpor	%ymm0,%ymm14,%ymm0
1090	vbroadcasti128	(%r11),%ymm14
1091	vpaddd	%ymm5,%ymm13,%ymm13
1092	vpxor	%ymm1,%ymm13,%ymm1
1093	vpslld	$12,%ymm1,%ymm15
1094	vpsrld	$20,%ymm1,%ymm1
1095	vpor	%ymm1,%ymm15,%ymm1
1096	vpaddd	%ymm0,%ymm8,%ymm8
1097	vpxor	%ymm4,%ymm8,%ymm4
1098	vpshufb	%ymm14,%ymm4,%ymm4
1099	vpaddd	%ymm1,%ymm9,%ymm9
1100	vpxor	%ymm5,%ymm9,%ymm5
1101	vpshufb	%ymm14,%ymm5,%ymm5
1102	vpaddd	%ymm4,%ymm12,%ymm12
1103	vpxor	%ymm0,%ymm12,%ymm0
1104	vpslld	$7,%ymm0,%ymm15
1105	vpsrld	$25,%ymm0,%ymm0
1106	vpor	%ymm0,%ymm15,%ymm0
1107	vbroadcasti128	(%r10),%ymm15
1108	vpaddd	%ymm5,%ymm13,%ymm13
1109	vpxor	%ymm1,%ymm13,%ymm1
1110	vpslld	$7,%ymm1,%ymm14
1111	vpsrld	$25,%ymm1,%ymm1
1112	vpor	%ymm1,%ymm14,%ymm1
1113	vmovdqa	%ymm12,0(%rsp)
1114	vmovdqa	%ymm13,32(%rsp)
1115	vmovdqa	64(%rsp),%ymm12
1116	vmovdqa	96(%rsp),%ymm13
1117	vpaddd	%ymm2,%ymm10,%ymm10
1118	vpxor	%ymm6,%ymm10,%ymm6
1119	vpshufb	%ymm15,%ymm6,%ymm6
1120	vpaddd	%ymm3,%ymm11,%ymm11
1121	vpxor	%ymm7,%ymm11,%ymm7
1122	vpshufb	%ymm15,%ymm7,%ymm7
1123	vpaddd	%ymm6,%ymm12,%ymm12
1124	vpxor	%ymm2,%ymm12,%ymm2
1125	vpslld	$12,%ymm2,%ymm14
1126	vpsrld	$20,%ymm2,%ymm2
1127	vpor	%ymm2,%ymm14,%ymm2
1128	vbroadcasti128	(%r11),%ymm14
1129	vpaddd	%ymm7,%ymm13,%ymm13
1130	vpxor	%ymm3,%ymm13,%ymm3
1131	vpslld	$12,%ymm3,%ymm15
1132	vpsrld	$20,%ymm3,%ymm3
1133	vpor	%ymm3,%ymm15,%ymm3
1134	vpaddd	%ymm2,%ymm10,%ymm10
1135	vpxor	%ymm6,%ymm10,%ymm6
1136	vpshufb	%ymm14,%ymm6,%ymm6
1137	vpaddd	%ymm3,%ymm11,%ymm11
1138	vpxor	%ymm7,%ymm11,%ymm7
1139	vpshufb	%ymm14,%ymm7,%ymm7
1140	vpaddd	%ymm6,%ymm12,%ymm12
1141	vpxor	%ymm2,%ymm12,%ymm2
1142	vpslld	$7,%ymm2,%ymm15
1143	vpsrld	$25,%ymm2,%ymm2
1144	vpor	%ymm2,%ymm15,%ymm2
1145	vbroadcasti128	(%r10),%ymm15
1146	vpaddd	%ymm7,%ymm13,%ymm13
1147	vpxor	%ymm3,%ymm13,%ymm3
1148	vpslld	$7,%ymm3,%ymm14
1149	vpsrld	$25,%ymm3,%ymm3
1150	vpor	%ymm3,%ymm14,%ymm3
1151	vpaddd	%ymm1,%ymm8,%ymm8
1152	vpxor	%ymm7,%ymm8,%ymm7
1153	vpshufb	%ymm15,%ymm7,%ymm7
1154	vpaddd	%ymm2,%ymm9,%ymm9
1155	vpxor	%ymm4,%ymm9,%ymm4
1156	vpshufb	%ymm15,%ymm4,%ymm4
1157	vpaddd	%ymm7,%ymm12,%ymm12
1158	vpxor	%ymm1,%ymm12,%ymm1
1159	vpslld	$12,%ymm1,%ymm14
1160	vpsrld	$20,%ymm1,%ymm1
1161	vpor	%ymm1,%ymm14,%ymm1
1162	vbroadcasti128	(%r11),%ymm14
1163	vpaddd	%ymm4,%ymm13,%ymm13
1164	vpxor	%ymm2,%ymm13,%ymm2
1165	vpslld	$12,%ymm2,%ymm15
1166	vpsrld	$20,%ymm2,%ymm2
1167	vpor	%ymm2,%ymm15,%ymm2
1168	vpaddd	%ymm1,%ymm8,%ymm8
1169	vpxor	%ymm7,%ymm8,%ymm7
1170	vpshufb	%ymm14,%ymm7,%ymm7
1171	vpaddd	%ymm2,%ymm9,%ymm9
1172	vpxor	%ymm4,%ymm9,%ymm4
1173	vpshufb	%ymm14,%ymm4,%ymm4
1174	vpaddd	%ymm7,%ymm12,%ymm12
1175	vpxor	%ymm1,%ymm12,%ymm1
1176	vpslld	$7,%ymm1,%ymm15
1177	vpsrld	$25,%ymm1,%ymm1
1178	vpor	%ymm1,%ymm15,%ymm1
1179	vbroadcasti128	(%r10),%ymm15
1180	vpaddd	%ymm4,%ymm13,%ymm13
1181	vpxor	%ymm2,%ymm13,%ymm2
1182	vpslld	$7,%ymm2,%ymm14
1183	vpsrld	$25,%ymm2,%ymm2
1184	vpor	%ymm2,%ymm14,%ymm2
1185	vmovdqa	%ymm12,64(%rsp)
1186	vmovdqa	%ymm13,96(%rsp)
1187	vmovdqa	0(%rsp),%ymm12
1188	vmovdqa	32(%rsp),%ymm13
1189	vpaddd	%ymm3,%ymm10,%ymm10
1190	vpxor	%ymm5,%ymm10,%ymm5
1191	vpshufb	%ymm15,%ymm5,%ymm5
1192	vpaddd	%ymm0,%ymm11,%ymm11
1193	vpxor	%ymm6,%ymm11,%ymm6
1194	vpshufb	%ymm15,%ymm6,%ymm6
1195	vpaddd	%ymm5,%ymm12,%ymm12
1196	vpxor	%ymm3,%ymm12,%ymm3
1197	vpslld	$12,%ymm3,%ymm14
1198	vpsrld	$20,%ymm3,%ymm3
1199	vpor	%ymm3,%ymm14,%ymm3
1200	vbroadcasti128	(%r11),%ymm14
1201	vpaddd	%ymm6,%ymm13,%ymm13
1202	vpxor	%ymm0,%ymm13,%ymm0
1203	vpslld	$12,%ymm0,%ymm15
1204	vpsrld	$20,%ymm0,%ymm0
1205	vpor	%ymm0,%ymm15,%ymm0
1206	vpaddd	%ymm3,%ymm10,%ymm10
1207	vpxor	%ymm5,%ymm10,%ymm5
1208	vpshufb	%ymm14,%ymm5,%ymm5
1209	vpaddd	%ymm0,%ymm11,%ymm11
1210	vpxor	%ymm6,%ymm11,%ymm6
1211	vpshufb	%ymm14,%ymm6,%ymm6
1212	vpaddd	%ymm5,%ymm12,%ymm12
1213	vpxor	%ymm3,%ymm12,%ymm3
1214	vpslld	$7,%ymm3,%ymm15
1215	vpsrld	$25,%ymm3,%ymm3
1216	vpor	%ymm3,%ymm15,%ymm3
1217	vbroadcasti128	(%r10),%ymm15
1218	vpaddd	%ymm6,%ymm13,%ymm13
1219	vpxor	%ymm0,%ymm13,%ymm0
1220	vpslld	$7,%ymm0,%ymm14
1221	vpsrld	$25,%ymm0,%ymm0
1222	vpor	%ymm0,%ymm14,%ymm0
1223	decl	%eax
1224	jnz	.Loop8x
1225
1226	leaq	512(%rsp),%rax
1227	vpaddd	128-256(%rcx),%ymm8,%ymm8
1228	vpaddd	160-256(%rcx),%ymm9,%ymm9
1229	vpaddd	192-256(%rcx),%ymm10,%ymm10
1230	vpaddd	224-256(%rcx),%ymm11,%ymm11
1231
1232	vpunpckldq	%ymm9,%ymm8,%ymm14
1233	vpunpckldq	%ymm11,%ymm10,%ymm15
1234	vpunpckhdq	%ymm9,%ymm8,%ymm8
1235	vpunpckhdq	%ymm11,%ymm10,%ymm10
1236	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1237	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1238	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1239	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1240	vpaddd	256-256(%rcx),%ymm0,%ymm0
1241	vpaddd	288-256(%rcx),%ymm1,%ymm1
1242	vpaddd	320-256(%rcx),%ymm2,%ymm2
1243	vpaddd	352-256(%rcx),%ymm3,%ymm3
1244
1245	vpunpckldq	%ymm1,%ymm0,%ymm10
1246	vpunpckldq	%ymm3,%ymm2,%ymm15
1247	vpunpckhdq	%ymm1,%ymm0,%ymm0
1248	vpunpckhdq	%ymm3,%ymm2,%ymm2
1249	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1250	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1251	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1252	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1253	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1254	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1255	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1256	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1257	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1258	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1259	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1260	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1261	vmovdqa	%ymm15,0(%rsp)
1262	vmovdqa	%ymm9,32(%rsp)
1263	vmovdqa	64(%rsp),%ymm15
1264	vmovdqa	96(%rsp),%ymm9
1265
1266	vpaddd	384-512(%rax),%ymm12,%ymm12
1267	vpaddd	416-512(%rax),%ymm13,%ymm13
1268	vpaddd	448-512(%rax),%ymm15,%ymm15
1269	vpaddd	480-512(%rax),%ymm9,%ymm9
1270
1271	vpunpckldq	%ymm13,%ymm12,%ymm2
1272	vpunpckldq	%ymm9,%ymm15,%ymm8
1273	vpunpckhdq	%ymm13,%ymm12,%ymm12
1274	vpunpckhdq	%ymm9,%ymm15,%ymm15
1275	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1276	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1277	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1278	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1279	vpaddd	512-512(%rax),%ymm4,%ymm4
1280	vpaddd	544-512(%rax),%ymm5,%ymm5
1281	vpaddd	576-512(%rax),%ymm6,%ymm6
1282	vpaddd	608-512(%rax),%ymm7,%ymm7
1283
1284	vpunpckldq	%ymm5,%ymm4,%ymm15
1285	vpunpckldq	%ymm7,%ymm6,%ymm8
1286	vpunpckhdq	%ymm5,%ymm4,%ymm4
1287	vpunpckhdq	%ymm7,%ymm6,%ymm6
1288	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1289	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1290	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1291	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1292	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1293	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1294	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1295	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1296	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1297	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1298	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1299	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1300	vmovdqa	0(%rsp),%ymm6
1301	vmovdqa	32(%rsp),%ymm12
1302
1303	cmpq	$512,%rdx
1304	jb	.Ltail8x
1305
1306	vpxor	0(%rsi),%ymm6,%ymm6
1307	vpxor	32(%rsi),%ymm8,%ymm8
1308	vpxor	64(%rsi),%ymm1,%ymm1
1309	vpxor	96(%rsi),%ymm5,%ymm5
1310	leaq	128(%rsi),%rsi
1311	vmovdqu	%ymm6,0(%rdi)
1312	vmovdqu	%ymm8,32(%rdi)
1313	vmovdqu	%ymm1,64(%rdi)
1314	vmovdqu	%ymm5,96(%rdi)
1315	leaq	128(%rdi),%rdi
1316
1317	vpxor	0(%rsi),%ymm12,%ymm12
1318	vpxor	32(%rsi),%ymm13,%ymm13
1319	vpxor	64(%rsi),%ymm10,%ymm10
1320	vpxor	96(%rsi),%ymm15,%ymm15
1321	leaq	128(%rsi),%rsi
1322	vmovdqu	%ymm12,0(%rdi)
1323	vmovdqu	%ymm13,32(%rdi)
1324	vmovdqu	%ymm10,64(%rdi)
1325	vmovdqu	%ymm15,96(%rdi)
1326	leaq	128(%rdi),%rdi
1327
1328	vpxor	0(%rsi),%ymm14,%ymm14
1329	vpxor	32(%rsi),%ymm2,%ymm2
1330	vpxor	64(%rsi),%ymm3,%ymm3
1331	vpxor	96(%rsi),%ymm7,%ymm7
1332	leaq	128(%rsi),%rsi
1333	vmovdqu	%ymm14,0(%rdi)
1334	vmovdqu	%ymm2,32(%rdi)
1335	vmovdqu	%ymm3,64(%rdi)
1336	vmovdqu	%ymm7,96(%rdi)
1337	leaq	128(%rdi),%rdi
1338
1339	vpxor	0(%rsi),%ymm11,%ymm11
1340	vpxor	32(%rsi),%ymm9,%ymm9
1341	vpxor	64(%rsi),%ymm0,%ymm0
1342	vpxor	96(%rsi),%ymm4,%ymm4
1343	leaq	128(%rsi),%rsi
1344	vmovdqu	%ymm11,0(%rdi)
1345	vmovdqu	%ymm9,32(%rdi)
1346	vmovdqu	%ymm0,64(%rdi)
1347	vmovdqu	%ymm4,96(%rdi)
1348	leaq	128(%rdi),%rdi
1349
1350	subq	$512,%rdx
1351	jnz	.Loop_outer8x
1352
1353	jmp	.Ldone8x
1354
1355.Ltail8x:
1356	cmpq	$448,%rdx
1357	jae	.L448_or_more8x
1358	cmpq	$384,%rdx
1359	jae	.L384_or_more8x
1360	cmpq	$320,%rdx
1361	jae	.L320_or_more8x
1362	cmpq	$256,%rdx
1363	jae	.L256_or_more8x
1364	cmpq	$192,%rdx
1365	jae	.L192_or_more8x
1366	cmpq	$128,%rdx
1367	jae	.L128_or_more8x
1368	cmpq	$64,%rdx
1369	jae	.L64_or_more8x
1370
1371	xorq	%r10,%r10
1372	vmovdqa	%ymm6,0(%rsp)
1373	vmovdqa	%ymm8,32(%rsp)
1374	jmp	.Loop_tail8x
1375
1376.align	32
1377.L64_or_more8x:
1378	vpxor	0(%rsi),%ymm6,%ymm6
1379	vpxor	32(%rsi),%ymm8,%ymm8
1380	vmovdqu	%ymm6,0(%rdi)
1381	vmovdqu	%ymm8,32(%rdi)
1382	je	.Ldone8x
1383
1384	leaq	64(%rsi),%rsi
1385	xorq	%r10,%r10
1386	vmovdqa	%ymm1,0(%rsp)
1387	leaq	64(%rdi),%rdi
1388	subq	$64,%rdx
1389	vmovdqa	%ymm5,32(%rsp)
1390	jmp	.Loop_tail8x
1391
1392.align	32
1393.L128_or_more8x:
1394	vpxor	0(%rsi),%ymm6,%ymm6
1395	vpxor	32(%rsi),%ymm8,%ymm8
1396	vpxor	64(%rsi),%ymm1,%ymm1
1397	vpxor	96(%rsi),%ymm5,%ymm5
1398	vmovdqu	%ymm6,0(%rdi)
1399	vmovdqu	%ymm8,32(%rdi)
1400	vmovdqu	%ymm1,64(%rdi)
1401	vmovdqu	%ymm5,96(%rdi)
1402	je	.Ldone8x
1403
1404	leaq	128(%rsi),%rsi
1405	xorq	%r10,%r10
1406	vmovdqa	%ymm12,0(%rsp)
1407	leaq	128(%rdi),%rdi
1408	subq	$128,%rdx
1409	vmovdqa	%ymm13,32(%rsp)
1410	jmp	.Loop_tail8x
1411
1412.align	32
1413.L192_or_more8x:
1414	vpxor	0(%rsi),%ymm6,%ymm6
1415	vpxor	32(%rsi),%ymm8,%ymm8
1416	vpxor	64(%rsi),%ymm1,%ymm1
1417	vpxor	96(%rsi),%ymm5,%ymm5
1418	vpxor	128(%rsi),%ymm12,%ymm12
1419	vpxor	160(%rsi),%ymm13,%ymm13
1420	vmovdqu	%ymm6,0(%rdi)
1421	vmovdqu	%ymm8,32(%rdi)
1422	vmovdqu	%ymm1,64(%rdi)
1423	vmovdqu	%ymm5,96(%rdi)
1424	vmovdqu	%ymm12,128(%rdi)
1425	vmovdqu	%ymm13,160(%rdi)
1426	je	.Ldone8x
1427
1428	leaq	192(%rsi),%rsi
1429	xorq	%r10,%r10
1430	vmovdqa	%ymm10,0(%rsp)
1431	leaq	192(%rdi),%rdi
1432	subq	$192,%rdx
1433	vmovdqa	%ymm15,32(%rsp)
1434	jmp	.Loop_tail8x
1435
1436.align	32
1437.L256_or_more8x:
1438	vpxor	0(%rsi),%ymm6,%ymm6
1439	vpxor	32(%rsi),%ymm8,%ymm8
1440	vpxor	64(%rsi),%ymm1,%ymm1
1441	vpxor	96(%rsi),%ymm5,%ymm5
1442	vpxor	128(%rsi),%ymm12,%ymm12
1443	vpxor	160(%rsi),%ymm13,%ymm13
1444	vpxor	192(%rsi),%ymm10,%ymm10
1445	vpxor	224(%rsi),%ymm15,%ymm15
1446	vmovdqu	%ymm6,0(%rdi)
1447	vmovdqu	%ymm8,32(%rdi)
1448	vmovdqu	%ymm1,64(%rdi)
1449	vmovdqu	%ymm5,96(%rdi)
1450	vmovdqu	%ymm12,128(%rdi)
1451	vmovdqu	%ymm13,160(%rdi)
1452	vmovdqu	%ymm10,192(%rdi)
1453	vmovdqu	%ymm15,224(%rdi)
1454	je	.Ldone8x
1455
1456	leaq	256(%rsi),%rsi
1457	xorq	%r10,%r10
1458	vmovdqa	%ymm14,0(%rsp)
1459	leaq	256(%rdi),%rdi
1460	subq	$256,%rdx
1461	vmovdqa	%ymm2,32(%rsp)
1462	jmp	.Loop_tail8x
1463
1464.align	32
1465.L320_or_more8x:
1466	vpxor	0(%rsi),%ymm6,%ymm6
1467	vpxor	32(%rsi),%ymm8,%ymm8
1468	vpxor	64(%rsi),%ymm1,%ymm1
1469	vpxor	96(%rsi),%ymm5,%ymm5
1470	vpxor	128(%rsi),%ymm12,%ymm12
1471	vpxor	160(%rsi),%ymm13,%ymm13
1472	vpxor	192(%rsi),%ymm10,%ymm10
1473	vpxor	224(%rsi),%ymm15,%ymm15
1474	vpxor	256(%rsi),%ymm14,%ymm14
1475	vpxor	288(%rsi),%ymm2,%ymm2
1476	vmovdqu	%ymm6,0(%rdi)
1477	vmovdqu	%ymm8,32(%rdi)
1478	vmovdqu	%ymm1,64(%rdi)
1479	vmovdqu	%ymm5,96(%rdi)
1480	vmovdqu	%ymm12,128(%rdi)
1481	vmovdqu	%ymm13,160(%rdi)
1482	vmovdqu	%ymm10,192(%rdi)
1483	vmovdqu	%ymm15,224(%rdi)
1484	vmovdqu	%ymm14,256(%rdi)
1485	vmovdqu	%ymm2,288(%rdi)
1486	je	.Ldone8x
1487
1488	leaq	320(%rsi),%rsi
1489	xorq	%r10,%r10
1490	vmovdqa	%ymm3,0(%rsp)
1491	leaq	320(%rdi),%rdi
1492	subq	$320,%rdx
1493	vmovdqa	%ymm7,32(%rsp)
1494	jmp	.Loop_tail8x
1495
1496.align	32
1497.L384_or_more8x:
1498	vpxor	0(%rsi),%ymm6,%ymm6
1499	vpxor	32(%rsi),%ymm8,%ymm8
1500	vpxor	64(%rsi),%ymm1,%ymm1
1501	vpxor	96(%rsi),%ymm5,%ymm5
1502	vpxor	128(%rsi),%ymm12,%ymm12
1503	vpxor	160(%rsi),%ymm13,%ymm13
1504	vpxor	192(%rsi),%ymm10,%ymm10
1505	vpxor	224(%rsi),%ymm15,%ymm15
1506	vpxor	256(%rsi),%ymm14,%ymm14
1507	vpxor	288(%rsi),%ymm2,%ymm2
1508	vpxor	320(%rsi),%ymm3,%ymm3
1509	vpxor	352(%rsi),%ymm7,%ymm7
1510	vmovdqu	%ymm6,0(%rdi)
1511	vmovdqu	%ymm8,32(%rdi)
1512	vmovdqu	%ymm1,64(%rdi)
1513	vmovdqu	%ymm5,96(%rdi)
1514	vmovdqu	%ymm12,128(%rdi)
1515	vmovdqu	%ymm13,160(%rdi)
1516	vmovdqu	%ymm10,192(%rdi)
1517	vmovdqu	%ymm15,224(%rdi)
1518	vmovdqu	%ymm14,256(%rdi)
1519	vmovdqu	%ymm2,288(%rdi)
1520	vmovdqu	%ymm3,320(%rdi)
1521	vmovdqu	%ymm7,352(%rdi)
1522	je	.Ldone8x
1523
1524	leaq	384(%rsi),%rsi
1525	xorq	%r10,%r10
1526	vmovdqa	%ymm11,0(%rsp)
1527	leaq	384(%rdi),%rdi
1528	subq	$384,%rdx
1529	vmovdqa	%ymm9,32(%rsp)
1530	jmp	.Loop_tail8x
1531
1532.align	32
1533.L448_or_more8x:
1534	vpxor	0(%rsi),%ymm6,%ymm6
1535	vpxor	32(%rsi),%ymm8,%ymm8
1536	vpxor	64(%rsi),%ymm1,%ymm1
1537	vpxor	96(%rsi),%ymm5,%ymm5
1538	vpxor	128(%rsi),%ymm12,%ymm12
1539	vpxor	160(%rsi),%ymm13,%ymm13
1540	vpxor	192(%rsi),%ymm10,%ymm10
1541	vpxor	224(%rsi),%ymm15,%ymm15
1542	vpxor	256(%rsi),%ymm14,%ymm14
1543	vpxor	288(%rsi),%ymm2,%ymm2
1544	vpxor	320(%rsi),%ymm3,%ymm3
1545	vpxor	352(%rsi),%ymm7,%ymm7
1546	vpxor	384(%rsi),%ymm11,%ymm11
1547	vpxor	416(%rsi),%ymm9,%ymm9
1548	vmovdqu	%ymm6,0(%rdi)
1549	vmovdqu	%ymm8,32(%rdi)
1550	vmovdqu	%ymm1,64(%rdi)
1551	vmovdqu	%ymm5,96(%rdi)
1552	vmovdqu	%ymm12,128(%rdi)
1553	vmovdqu	%ymm13,160(%rdi)
1554	vmovdqu	%ymm10,192(%rdi)
1555	vmovdqu	%ymm15,224(%rdi)
1556	vmovdqu	%ymm14,256(%rdi)
1557	vmovdqu	%ymm2,288(%rdi)
1558	vmovdqu	%ymm3,320(%rdi)
1559	vmovdqu	%ymm7,352(%rdi)
1560	vmovdqu	%ymm11,384(%rdi)
1561	vmovdqu	%ymm9,416(%rdi)
1562	je	.Ldone8x
1563
1564	leaq	448(%rsi),%rsi
1565	xorq	%r10,%r10
1566	vmovdqa	%ymm0,0(%rsp)
1567	leaq	448(%rdi),%rdi
1568	subq	$448,%rdx
1569	vmovdqa	%ymm4,32(%rsp)
1570
1571.Loop_tail8x:
1572	movzbl	(%rsi,%r10,1),%eax
1573	movzbl	(%rsp,%r10,1),%ecx
1574	leaq	1(%r10),%r10
1575	xorl	%ecx,%eax
1576	movb	%al,-1(%rdi,%r10,1)
1577	decq	%rdx
1578	jnz	.Loop_tail8x
1579
1580.Ldone8x:
1581	vzeroall
1582	leaq	(%r9),%rsp
1583.L8x_epilogue:
1584	.byte	0xf3,0xc3
1585.size	ChaCha20_8x,.-ChaCha20_8x
1586#endif
1587