1default	rel
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section	.text code align=64
6
7
8EXTERN	OPENSSL_ia32cap_P
9
;-----------------------------------------------------------------------
; Read-only constant tables used by the ChaCha20 code paths in this file.
; DD emits 32-bit little-endian dwords, DB emits raw bytes.
;-----------------------------------------------------------------------
10ALIGN	64
; Four zero dwords. Not referenced in the part of the file reviewed here.
11$L$zero:
12	DD	0,0,0,0
; {1,0,0,0}: added with paddd to the counter row, once per 64-byte block,
; by the scalar and SSSE3 paths (bumps only the lowest dword).
13$L$one:
14	DD	1,0,0,0
; {0,1,2,3}: added to the broadcast counter dword in the 4x set-up so the
; four lanes carry four consecutive counter values.
15$L$inc:
16	DD	0,1,2,3
; {4,4,4,4}: per-iteration counter step of the 4x outer loop.
17$L$four:
18	DD	4,4,4,4
; Lane offsets added to the broadcast counter dword in the 8x set-up.
19$L$incy:
20	DD	0,2,4,6,1,3,5,7
; {8 x 8}: per-iteration counter step of the 8x outer loop.
21$L$eight:
22	DD	8,8,8,8,8,8,8,8
; pshufb mask: within every dword, result byte i takes source byte
; (i+2) mod 4, i.e. each 32-bit lane is rotated by 16 bits.
23$L$rot16:
24DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; pshufb mask: within every dword, result byte i takes source byte
; (i+3) mod 4, i.e. each 32-bit lane is rotated left by 8 bits
; (equivalently right by 24, hence the label).
25$L$rot24:
26DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
; ASCII "expand 32-byte k" followed by a NUL byte; loaded as the first
; 16-byte state row by the SIMD paths.
27$L$sigma:
28DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
29DB	0
30ALIGN	64
; The four 16-dword tables below are not referenced in the part of the
; file reviewed here.
; NOTE(review): presumably used by a wider-vector path later in the file
; -- confirm before changing.
31$L$zeroz:
32	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
33$L$fourz:
34	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
35$L$incz:
36	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
37$L$sixteen:
38	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
; Embedded NUL-terminated identification string:
; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
39DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
40DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
41DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
42DB	108,46,111,114,103,62,0
;-----------------------------------------------------------------------
; ChaCha20_ctr32 -- exported entry point and scalar (integer-only) path.
;
; Entry ABI: Microsoft x64. The prologue stores rdi/rsi at [8+rsp] and
; [16+rsp] (reloaded on exit) and copies the arguments into the registers
; the body works with:
;   rcx      -> rdi   output pointer (keystream XOR input is stored here)
;   rdx      -> rsi   input pointer
;   r8       -> rdx   length in bytes; 0 returns immediately
;   r9       -> rcx   key pointer, 32 bytes are read
;   [40+rsp] -> r8    counter block pointer, 16 bytes are read
;
; Dispatch: if bit 9 (value 512) of the dword at OPENSSL_ia32cap_P+4 is
; set, control transfers to $L$ChaCha20_ssse3 with r10 still holding the
; qword read from OPENSSL_ia32cap_P+4.
; NOTE(review): bit 9 is presumably the SSSE3 capability flag -- confirm
; against the definition of OPENSSL_ia32cap_P.
;
; Scalar path frame after "sub rsp,64+24" (rsp is 16-byte aligned here:
; return address + 6 pushes + 88 bytes):
;   [ 0..15+rsp]  scratch; holds keystream words 0-3 in the tail
;   [16..31+rsp]  key bytes 0-15   (state words 4-7)
;   [32..47+rsp]  key bytes 16-31  (state words 8-11); reused inside the
;                 round loop as the spill area for the "c" words and
;                 restored from xmm2 after each full block
;   [48..63+rsp]  counter block    (state words 12-15)
;   [64+rsp]      remaining length, [72+rsp] input ptr, [80+rsp] output ptr
;
; Register roles inside the round loop:
;   eax,ebx,ecx,edx  state words 0-3
;   r8d..r11d        state words 4-7
;   esi,edi          two of state words 8-11 (the other two live on stack)
;   r12d..r15d       state words 12-15
;   ebp              double-round counter (10)
;   xmm2 = key bytes 16-31, xmm3 = running counter block, xmm4 = $L$one
;-----------------------------------------------------------------------
43global	ChaCha20_ctr32
44
45ALIGN	64
46ChaCha20_ctr32:
47	mov	QWORD[8+rsp],rdi	;WIN64 prologue
48	mov	QWORD[16+rsp],rsi
49	mov	rax,rsp
50$L$SEH_begin_ChaCha20_ctr32:
51	mov	rdi,rcx
52	mov	rsi,rdx
53	mov	rdx,r8
54	mov	rcx,r9
55	mov	r8,QWORD[40+rsp]
56
57
; Zero-length request: nothing has been pushed yet, so $L$no_data only
; has to reload rdi/rsi.
58	cmp	rdx,0
59	je	NEAR $L$no_data
; Capability dispatch (see header).
60	mov	r10,QWORD[((OPENSSL_ia32cap_P+4))]
61	test	r10d,512
62	jnz	NEAR $L$ChaCha20_ssse3
63
; Save the callee-saved integer registers used below and build the frame.
64	push	rbx
65	push	rbp
66	push	r12
67	push	r13
68	push	r14
69	push	r15
70	sub	rsp,64+24
71$L$ctr32_body:
72
73
; xmm1 = key[0..15], xmm2 = key[16..31], xmm3 = counter block,
; xmm4 = {1,0,0,0} used as the per-block counter increment.
74	movdqu	xmm1,XMMWORD[rcx]
75	movdqu	xmm2,XMMWORD[16+rcx]
76	movdqu	xmm3,XMMWORD[r8]
77	movdqa	xmm4,XMMWORD[$L$one]
78
79
; Keep an addressable copy of state words 4-15 on the stack.
80	movdqa	XMMWORD[16+rsp],xmm1
81	movdqa	XMMWORD[32+rsp],xmm2
82	movdqa	XMMWORD[48+rsp],xmm3
; rbp = bytes remaining (rdx is reused as state word 3 below).
83	mov	rbp,rdx
84	jmp	NEAR $L$oop_outer
85
86ALIGN	32
; ---- one iteration per 64-byte keystream block -------------------------
87$L$oop_outer:
; State words 0-3: the same four dwords as the $L$sigma string
; ("expa", "nd 3", "2-by", "te k" read little-endian).
88	mov	eax,0x61707865
89	mov	ebx,0x3320646e
90	mov	ecx,0x79622d32
91	mov	edx,0x6b206574
; State words 4-7 from the stacked key copy.
92	mov	r8d,DWORD[16+rsp]
93	mov	r9d,DWORD[20+rsp]
94	mov	r10d,DWORD[24+rsp]
95	mov	r11d,DWORD[28+rsp]
; State word 12 comes from the running counter in xmm3; words 13-15 from
; the stack copy.
96	movd	r12d,xmm3
97	mov	r13d,DWORD[52+rsp]
98	mov	r14d,DWORD[56+rsp]
99	mov	r15d,DWORD[60+rsp]
100
; Park length and both pointers: rbp, rsi and rdi are needed as working
; registers for the rounds.
101	mov	QWORD[((64+0))+rsp],rbp
102	mov	ebp,10
103	mov	QWORD[((64+8))+rsp],rsi
; The next DB encodes "movq rsi,xmm2" (66 48 0F 7E D6): rsi = low 64 bits
; of key[16..31], so esi = state word 8.
104DB	102,72,15,126,214
105	mov	QWORD[((64+16))+rsp],rdi
; edi = state word 9 (upper half of the qword just moved into rsi).
106	mov	rdi,rsi
107	shr	rdi,32
108	jmp	NEAR $L$oop
109
110ALIGN	32
; ---- double round, executed 10 times -----------------------------------
; Every quarter-round is: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12;
;                         a+=b; d^=a; d<<<=8;  c+=d; b^=c; b<<<=7
; Two quarter-rounds are interleaved at a time.
111$L$oop:
; Column round, columns 0 and 1: (eax,r8d,esi,r12d) and (ebx,r9d,edi,r13d).
112	add	eax,r8d
113	xor	r12d,eax
114	rol	r12d,16
115	add	ebx,r9d
116	xor	r13d,ebx
117	rol	r13d,16
118	add	esi,r12d
119	xor	r8d,esi
120	rol	r8d,12
121	add	edi,r13d
122	xor	r9d,edi
123	rol	r9d,12
124	add	eax,r8d
125	xor	r12d,eax
126	rol	r12d,8
127	add	ebx,r9d
128	xor	r13d,ebx
129	rol	r13d,8
130	add	esi,r12d
131	xor	r8d,esi
132	rol	r8d,7
133	add	edi,r13d
134	xor	r9d,edi
135	rol	r9d,7
; Swap the "c" pair: spill words 8,9 and bring in words 10,11.
136	mov	DWORD[32+rsp],esi
137	mov	DWORD[36+rsp],edi
138	mov	esi,DWORD[40+rsp]
139	mov	edi,DWORD[44+rsp]
; Column round, columns 2 and 3: (ecx,r10d,esi,r14d) and (edx,r11d,edi,r15d).
140	add	ecx,r10d
141	xor	r14d,ecx
142	rol	r14d,16
143	add	edx,r11d
144	xor	r15d,edx
145	rol	r15d,16
146	add	esi,r14d
147	xor	r10d,esi
148	rol	r10d,12
149	add	edi,r15d
150	xor	r11d,edi
151	rol	r11d,12
152	add	ecx,r10d
153	xor	r14d,ecx
154	rol	r14d,8
155	add	edx,r11d
156	xor	r15d,edx
157	rol	r15d,8
158	add	esi,r14d
159	xor	r10d,esi
160	rol	r10d,7
161	add	edi,r15d
162	xor	r11d,edi
163	rol	r11d,7
; Diagonal round, first pair: words (0,5,10,15) and (1,6,11,12);
; esi/edi still hold words 10 and 11.
164	add	eax,r9d
165	xor	r15d,eax
166	rol	r15d,16
167	add	ebx,r10d
168	xor	r12d,ebx
169	rol	r12d,16
170	add	esi,r15d
171	xor	r9d,esi
172	rol	r9d,12
173	add	edi,r12d
174	xor	r10d,edi
175	rol	r10d,12
176	add	eax,r9d
177	xor	r15d,eax
178	rol	r15d,8
179	add	ebx,r10d
180	xor	r12d,ebx
181	rol	r12d,8
182	add	esi,r15d
183	xor	r9d,esi
184	rol	r9d,7
185	add	edi,r12d
186	xor	r10d,edi
187	rol	r10d,7
; Swap the "c" pair back: spill words 10,11 and reload words 8,9.
188	mov	DWORD[40+rsp],esi
189	mov	DWORD[44+rsp],edi
190	mov	esi,DWORD[32+rsp]
191	mov	edi,DWORD[36+rsp]
; Diagonal round, second pair: words (2,7,8,13) and (3,4,9,14).
192	add	ecx,r11d
193	xor	r13d,ecx
194	rol	r13d,16
195	add	edx,r8d
196	xor	r14d,edx
197	rol	r14d,16
198	add	esi,r13d
199	xor	r11d,esi
200	rol	r11d,12
201	add	edi,r14d
202	xor	r8d,edi
203	rol	r8d,12
204	add	ecx,r11d
205	xor	r13d,ecx
206	rol	r13d,8
207	add	edx,r8d
208	xor	r14d,edx
209	rol	r14d,8
210	add	esi,r13d
211	xor	r11d,esi
212	rol	r11d,7
213	add	edi,r14d
214	xor	r8d,edi
215	rol	r8d,7
216	dec	ebp
217	jnz	NEAR $L$oop
; ---- finalisation: add the input state back in --------------------------
; Flush words 8,9 so [32..47+rsp] holds all four worked "c" words, then
; recover length and pointers.
218	mov	DWORD[36+rsp],edi
219	mov	DWORD[32+rsp],esi
220	mov	rbp,QWORD[64+rsp]
; xmm1 = original key[16..31]; the worked words are added at the paddd below.
221	movdqa	xmm1,xmm2
222	mov	rsi,QWORD[((64+8))+rsp]
; Advance the running counter for the next block.
223	paddd	xmm3,xmm4
224	mov	rdi,QWORD[((64+16))+rsp]
225
226	add	eax,0x61707865
227	add	ebx,0x3320646e
228	add	ecx,0x79622d32
229	add	edx,0x6b206574
230	add	r8d,DWORD[16+rsp]
231	add	r9d,DWORD[20+rsp]
232	add	r10d,DWORD[24+rsp]
233	add	r11d,DWORD[28+rsp]
; [48+rsp] still holds this block's counter value; it is only refreshed
; from xmm3 after the block has been written out.
234	add	r12d,DWORD[48+rsp]
235	add	r13d,DWORD[52+rsp]
236	add	r14d,DWORD[56+rsp]
237	add	r15d,DWORD[60+rsp]
; xmm1 = keystream words 8-11.
238	paddd	xmm1,XMMWORD[32+rsp]
239
; Fewer than 64 bytes left: finish byte-by-byte in $L$tail.
240	cmp	rbp,64
241	jb	NEAR $L$tail
242
; ---- full block: out[0..63] = in[0..63] XOR keystream --------------------
243	xor	eax,DWORD[rsi]
244	xor	ebx,DWORD[4+rsi]
245	xor	ecx,DWORD[8+rsi]
246	xor	edx,DWORD[12+rsi]
247	xor	r8d,DWORD[16+rsi]
248	xor	r9d,DWORD[20+rsi]
249	xor	r10d,DWORD[24+rsi]
250	xor	r11d,DWORD[28+rsi]
251	movdqu	xmm0,XMMWORD[32+rsi]
252	xor	r12d,DWORD[48+rsi]
253	xor	r13d,DWORD[52+rsi]
254	xor	r14d,DWORD[56+rsi]
255	xor	r15d,DWORD[60+rsi]
256	lea	rsi,[64+rsi]
257	pxor	xmm0,xmm1
258
; Restore the stacked state for the next block: key[16..31] (clobbered by
; the "c" spills) and the low counter dword.
259	movdqa	XMMWORD[32+rsp],xmm2
260	movd	DWORD[48+rsp],xmm3
261
262	mov	DWORD[rdi],eax
263	mov	DWORD[4+rdi],ebx
264	mov	DWORD[8+rdi],ecx
265	mov	DWORD[12+rdi],edx
266	mov	DWORD[16+rdi],r8d
267	mov	DWORD[20+rdi],r9d
268	mov	DWORD[24+rdi],r10d
269	mov	DWORD[28+rdi],r11d
270	movdqu	XMMWORD[32+rdi],xmm0
271	mov	DWORD[48+rdi],r12d
272	mov	DWORD[52+rdi],r13d
273	mov	DWORD[56+rdi],r14d
274	mov	DWORD[60+rdi],r15d
275	lea	rdi,[64+rdi]
276
277	sub	rbp,64
278	jnz	NEAR $L$oop_outer
279
280	jmp	NEAR $L$done
281
282ALIGN	16
; ---- partial final block (1..63 bytes) ----------------------------------
; Lay the 64 keystream bytes out at [0..63+rsp] (overwriting the stacked
; state, which is no longer needed) and XOR one byte at a time.
; rbx = byte index, rbp = bytes remaining.
283$L$tail:
284	mov	DWORD[rsp],eax
285	mov	DWORD[4+rsp],ebx
286	xor	rbx,rbx
287	mov	DWORD[8+rsp],ecx
288	mov	DWORD[12+rsp],edx
289	mov	DWORD[16+rsp],r8d
290	mov	DWORD[20+rsp],r9d
291	mov	DWORD[24+rsp],r10d
292	mov	DWORD[28+rsp],r11d
293	movdqa	XMMWORD[32+rsp],xmm1
294	mov	DWORD[48+rsp],r12d
295	mov	DWORD[52+rsp],r13d
296	mov	DWORD[56+rsp],r14d
297	mov	DWORD[60+rsp],r15d
298
299$L$oop_tail:
300	movzx	eax,BYTE[rbx*1+rsi]
301	movzx	edx,BYTE[rbx*1+rsp]
302	lea	rbx,[1+rbx]
303	xor	eax,edx
; rbx was already incremented, hence the -1 displacement.
304	mov	BYTE[((-1))+rbx*1+rdi],al
305	dec	rbp
306	jnz	NEAR $L$oop_tail
307
; ---- epilogue ------------------------------------------------------------
; rsi = rsp value at function entry (frame of 64+24 bytes plus six 8-byte
; pushes); reload the callee-saved registers relative to it.
308$L$done:
309	lea	rsi,[((64+24+48))+rsp]
310	mov	r15,QWORD[((-48))+rsi]
311	mov	r14,QWORD[((-40))+rsi]
312	mov	r13,QWORD[((-32))+rsi]
313	mov	r12,QWORD[((-24))+rsi]
314	mov	rbp,QWORD[((-16))+rsi]
315	mov	rbx,QWORD[((-8))+rsi]
316	lea	rsp,[rsi]
; Also the early-exit target for a zero length: reload the rdi/rsi values
; stored by the prologue and return.
317$L$no_data:
318	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
319	mov	rsi,QWORD[16+rsp]
320	DB	0F3h,0C3h		;repret
321$L$SEH_end_ChaCha20_ctr32:
322
;-----------------------------------------------------------------------
; ChaCha20_ssse3 -- one 64-byte block per iteration, whole state in
; xmm0..xmm3 (one 16-byte state row per register).
;
; Ways in:
;   ChaCha20_ssse3          own Win64 prologue, same argument shuffle as
;                           ChaCha20_ctr32 (rcx->rdi out, rdx->rsi in,
;                           r8->rdx len, r9->rcx key, [40+rsp]->r8 counter)
;   $L$ChaCha20_ssse3       jumped to from ChaCha20_ctr32 with the
;                           arguments already shuffled and r10 loaded
;   $L$do_sse3_after_all    jumped to from the 4x path with r9 = entry rsp
;
; NOTE(review): the 4x code reached through "ja $L$ChaCha20_4x" tests r10,
; but the ChaCha20_ssse3 prologue itself never loads r10. Only the
; ChaCha20_ctr32 route sets it in the code visible here. The label is not
; declared global in the visible source -- confirm nothing calls it
; directly.
;
; Register roles:
;   xmm0..xmm3  working state rows a,b,c,d
;   xmm4,xmm5   temporaries (rotate helper / input data)
;   xmm6,xmm7   pshufb masks $L$rot16 / $L$rot24 (callee-saved on Win64,
;               so they are stored at [-40+r9] and [-24+r9] and restored)
;   r8          double-round counter (10), rdx = bytes remaining
;   r9          rsp at entry, used to address the save area and to unwind
; Frame: [0..63+rsp] = input state rows (sigma, key lo, key hi, counter).
;-----------------------------------------------------------------------
323ALIGN	32
324ChaCha20_ssse3:
325	mov	QWORD[8+rsp],rdi	;WIN64 prologue
326	mov	QWORD[16+rsp],rsi
327	mov	rax,rsp
328$L$SEH_begin_ChaCha20_ssse3:
329	mov	rdi,rcx
330	mov	rsi,rdx
331	mov	rdx,r8
332	mov	rcx,r9
333	mov	r8,QWORD[40+rsp]
334
335
336$L$ChaCha20_ssse3:
337	mov	r9,rsp
; More than 128 bytes: hand over to the multi-block path.
338	cmp	rdx,128
339	ja	NEAR $L$ChaCha20_4x
340
341$L$do_sse3_after_all:
; 64 bytes of state + 40 bytes holding the two xmm saves (and padding that
; brings rsp to 16-byte alignment).
342	sub	rsp,64+40
343	movaps	XMMWORD[(-40)+r9],xmm6
344	movaps	XMMWORD[(-24)+r9],xmm7
345$L$ssse3_body:
346	movdqa	xmm0,XMMWORD[$L$sigma]
347	movdqu	xmm1,XMMWORD[rcx]
348	movdqu	xmm2,XMMWORD[16+rcx]
349	movdqu	xmm3,XMMWORD[r8]
350	movdqa	xmm6,XMMWORD[$L$rot16]
351	movdqa	xmm7,XMMWORD[$L$rot24]
352
; Keep the input state on the stack: it is added back after the rounds
; and reloaded for every further block.
353	movdqa	XMMWORD[rsp],xmm0
354	movdqa	XMMWORD[16+rsp],xmm1
355	movdqa	XMMWORD[32+rsp],xmm2
356	movdqa	XMMWORD[48+rsp],xmm3
357	mov	r8,10
358	jmp	NEAR $L$oop_ssse3
359
360ALIGN	32
; ---- second and later blocks: reload state, counter row += {1,0,0,0} ----
361$L$oop_outer_ssse3:
362	movdqa	xmm3,XMMWORD[$L$one]
363	movdqa	xmm0,XMMWORD[rsp]
364	movdqa	xmm1,XMMWORD[16+rsp]
365	movdqa	xmm2,XMMWORD[32+rsp]
366	paddd	xmm3,XMMWORD[48+rsp]
367	mov	r8,10
368	movdqa	XMMWORD[48+rsp],xmm3
369	jmp	NEAR $L$oop_ssse3
370
371ALIGN	32
; ---- double round, executed 10 times ------------------------------------
; All four quarter-rounds of a round run in parallel across the lanes.
; Rotations by 16 and 8 use pshufb; rotations by 12 and 7 use a
; shift-left / shift-right / or triple through xmm4.
372$L$oop_ssse3:
; Column round.
373	paddd	xmm0,xmm1
374	pxor	xmm3,xmm0
; pshufb xmm3,xmm6  (d <<<= 16)
375DB	102,15,56,0,222
376	paddd	xmm2,xmm3
377	pxor	xmm1,xmm2
378	movdqa	xmm4,xmm1
379	psrld	xmm1,20
380	pslld	xmm4,12
381	por	xmm1,xmm4
382	paddd	xmm0,xmm1
383	pxor	xmm3,xmm0
; pshufb xmm3,xmm7  (d <<<= 8)
384DB	102,15,56,0,223
385	paddd	xmm2,xmm3
386	pxor	xmm1,xmm2
387	movdqa	xmm4,xmm1
388	psrld	xmm1,25
389	pslld	xmm4,7
390	por	xmm1,xmm4
; Rotate the lanes of rows c, b and d (by 2, 1 and 3 positions) so the
; diagonals line up as columns.
391	pshufd	xmm2,xmm2,78
392	pshufd	xmm1,xmm1,57
393	pshufd	xmm3,xmm3,147
394	nop
; Diagonal round (same sequence on the re-aligned rows).
395	paddd	xmm0,xmm1
396	pxor	xmm3,xmm0
; pshufb xmm3,xmm6  (d <<<= 16)
397DB	102,15,56,0,222
398	paddd	xmm2,xmm3
399	pxor	xmm1,xmm2
400	movdqa	xmm4,xmm1
401	psrld	xmm1,20
402	pslld	xmm4,12
403	por	xmm1,xmm4
404	paddd	xmm0,xmm1
405	pxor	xmm3,xmm0
; pshufb xmm3,xmm7  (d <<<= 8)
406DB	102,15,56,0,223
407	paddd	xmm2,xmm3
408	pxor	xmm1,xmm2
409	movdqa	xmm4,xmm1
410	psrld	xmm1,25
411	pslld	xmm4,7
412	por	xmm1,xmm4
; Undo the lane rotation (b and d use the opposite immediates).
413	pshufd	xmm2,xmm2,78
414	pshufd	xmm1,xmm1,147
415	pshufd	xmm3,xmm3,57
416	dec	r8
417	jnz	NEAR $L$oop_ssse3
; Keystream = worked state + input state.
418	paddd	xmm0,XMMWORD[rsp]
419	paddd	xmm1,XMMWORD[16+rsp]
420	paddd	xmm2,XMMWORD[32+rsp]
421	paddd	xmm3,XMMWORD[48+rsp]
422
423	cmp	rdx,64
424	jb	NEAR $L$tail_ssse3
425
; ---- full block: XOR 64 input bytes with the keystream and store --------
426	movdqu	xmm4,XMMWORD[rsi]
427	movdqu	xmm5,XMMWORD[16+rsi]
428	pxor	xmm0,xmm4
429	movdqu	xmm4,XMMWORD[32+rsi]
430	pxor	xmm1,xmm5
431	movdqu	xmm5,XMMWORD[48+rsi]
432	lea	rsi,[64+rsi]
433	pxor	xmm2,xmm4
434	pxor	xmm3,xmm5
435
436	movdqu	XMMWORD[rdi],xmm0
437	movdqu	XMMWORD[16+rdi],xmm1
438	movdqu	XMMWORD[32+rdi],xmm2
439	movdqu	XMMWORD[48+rdi],xmm3
440	lea	rdi,[64+rdi]
441
442	sub	rdx,64
443	jnz	NEAR $L$oop_outer_ssse3
444
445	jmp	NEAR $L$done_ssse3
446
447ALIGN	16
; ---- partial final block: spill keystream to the stack, XOR bytewise ----
; r8 = byte index, rdx = bytes remaining (1..63).
448$L$tail_ssse3:
449	movdqa	XMMWORD[rsp],xmm0
450	movdqa	XMMWORD[16+rsp],xmm1
451	movdqa	XMMWORD[32+rsp],xmm2
452	movdqa	XMMWORD[48+rsp],xmm3
453	xor	r8,r8
454
455$L$oop_tail_ssse3:
456	movzx	eax,BYTE[r8*1+rsi]
457	movzx	ecx,BYTE[r8*1+rsp]
458	lea	r8,[1+r8]
459	xor	eax,ecx
; r8 was already incremented, hence the -1 displacement.
460	mov	BYTE[((-1))+r8*1+rdi],al
461	dec	rdx
462	jnz	NEAR $L$oop_tail_ssse3
463
; ---- epilogue: restore xmm6/xmm7, unwind to the entry rsp, reload rdi/rsi
464$L$done_ssse3:
465	movaps	xmm6,XMMWORD[((-40))+r9]
466	movaps	xmm7,XMMWORD[((-24))+r9]
467	lea	rsp,[r9]
468$L$ssse3_epilogue:
469	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
470	mov	rsi,QWORD[16+rsp]
471	DB	0F3h,0C3h		;repret
472$L$SEH_end_ChaCha20_ssse3:
473
;-----------------------------------------------------------------------
; ChaCha20_4x -- four 64-byte blocks per iteration using SSE registers.
; Each xmm register holds ONE state word for FOUR consecutive blocks
; (one block per 32-bit lane); the result is transposed before output.
;
; Ways in:
;   ChaCha20_4x      own Win64 prologue (rcx->rdi out, rdx->rsi in,
;                    r8->rdx len, r9->rcx key, [40+rsp]->r8 counter)
;   $L$ChaCha20_4x   jumped to from the SSSE3 path when len > 128, with
;                    r10 expected to hold the qword read from
;                    OPENSSL_ia32cap_P+4
;
; NOTE(review): the ChaCha20_4x prologue does not load r10, yet the
; dispatch below reads it; only the ChaCha20_ctr32 route sets r10 in the
; code visible here. The label is not declared global in the visible
; source -- confirm nothing calls it directly.
;
; Frame after "sub rsp,0x140+168" (rsp 16-byte aligned, rcx = rsp+256):
;   [  0.. 63+rsp]  scratch: spill slots for the "c" words during the
;                   rounds, transposed words 0-3 afterwards, keystream
;                   block for the byte-wise tail
;   [ 64..127+rsp]  sigma words 0-3, each broadcast to four lanes
;   [128..191+rsp]  key words 0-3, broadcast   (state words 4-7)
;   [192..255+rsp]  key words 4-7, broadcast   (state words 8-11)
;   [256..271+rsp]  block counters for the four lanes (state word 12)
;   [272..319+rsp]  state words 13-15, broadcast
;   [-168..-9+r9]   saved xmm6..xmm15 (callee-saved on Win64)
;
; Register roles inside $L$oop4x:
;   xmm8..xmm11   state words 0-3      xmm12..xmm15  state words 4-7
;   xmm4,xmm5     two of words 8-11    (other two spilled to the stack)
;   xmm0..xmm3    state words 12-15
;   xmm6,xmm7     rotate temporaries / $L$rot24 and $L$rot16 pshufb masks
;   r10,r11       addresses of $L$rot16 / $L$rot24;  eax = round counter
;-----------------------------------------------------------------------
474ALIGN	32
475ChaCha20_4x:
476	mov	QWORD[8+rsp],rdi	;WIN64 prologue
477	mov	QWORD[16+rsp],rsi
478	mov	rax,rsp
479$L$SEH_begin_ChaCha20_4x:
480	mov	rdi,rcx
481	mov	rsi,rdx
482	mov	rdx,r8
483	mov	rcx,r9
484	mov	r8,QWORD[40+rsp]
485
486
487$L$ChaCha20_4x:
488	mov	r9,rsp
; r11 keeps the full capability qword; r10 becomes its upper dword.
489	mov	r11,r10
490	shr	r10,32
; Bit 5 (value 32) of the upper dword selects the 8x path.
; NOTE(review): presumably the AVX2 capability flag -- confirm against the
; definition of OPENSSL_ia32cap_P.
491	test	r10,32
492	jnz	NEAR $L$ChaCha20_8x
; More than 192 bytes always uses the 4x code.
493	cmp	rdx,192
494	ja	NEAR $L$proceed4x
495
; For 129..192 bytes: if, of bits 22 and 26 (mask 71303168 = 0x4400000),
; exactly bit 22 (4194304 = 0x400000) is set, fall back to the
; one-block-at-a-time SSSE3 loop.
; NOTE(review): presumably a CPU-model heuristic (MOVBE without XSAVE) --
; confirm against the capability-vector documentation.
496	and	r11,71303168
497	cmp	r11,4194304
498	je	NEAR $L$do_sse3_after_all
499
500$L$proceed4x:
501	sub	rsp,0x140+168
502	movaps	XMMWORD[(-168)+r9],xmm6
503	movaps	XMMWORD[(-152)+r9],xmm7
504	movaps	XMMWORD[(-136)+r9],xmm8
505	movaps	XMMWORD[(-120)+r9],xmm9
506	movaps	XMMWORD[(-104)+r9],xmm10
507	movaps	XMMWORD[(-88)+r9],xmm11
508	movaps	XMMWORD[(-72)+r9],xmm12
509	movaps	XMMWORD[(-56)+r9],xmm13
510	movaps	XMMWORD[(-40)+r9],xmm14
511	movaps	XMMWORD[(-24)+r9],xmm15
512$L$4x_body:
; Load the four state rows, then point rcx at the middle of the frame
; (the key pointer is no longer needed) and r10/r11 at the pshufb masks.
513	movdqa	xmm11,XMMWORD[$L$sigma]
514	movdqu	xmm15,XMMWORD[rcx]
515	movdqu	xmm7,XMMWORD[16+rcx]
516	movdqu	xmm3,XMMWORD[r8]
517	lea	rcx,[256+rsp]
518	lea	r10,[$L$rot16]
519	lea	r11,[$L$rot24]
520
; "Smash" each row by lanes: pshufd 0x00/0x55/0xaa/0xff broadcasts dword
; 0/1/2/3 of the row to all four lanes. Sigma row -> xmm8..xmm11.
521	pshufd	xmm8,xmm11,0x00
522	pshufd	xmm9,xmm11,0x55
523	movdqa	XMMWORD[64+rsp],xmm8
524	pshufd	xmm10,xmm11,0xaa
525	movdqa	XMMWORD[80+rsp],xmm9
526	pshufd	xmm11,xmm11,0xff
527	movdqa	XMMWORD[96+rsp],xmm10
528	movdqa	XMMWORD[112+rsp],xmm11
529
; Key bytes 0-15 -> xmm12..xmm15.
530	pshufd	xmm12,xmm15,0x00
531	pshufd	xmm13,xmm15,0x55
532	movdqa	XMMWORD[(128-256)+rcx],xmm12
533	pshufd	xmm14,xmm15,0xaa
534	movdqa	XMMWORD[(144-256)+rcx],xmm13
535	pshufd	xmm15,xmm15,0xff
536	movdqa	XMMWORD[(160-256)+rcx],xmm14
537	movdqa	XMMWORD[(176-256)+rcx],xmm15
538
; Key bytes 16-31 -> xmm4..xmm7.
539	pshufd	xmm4,xmm7,0x00
540	pshufd	xmm5,xmm7,0x55
541	movdqa	XMMWORD[(192-256)+rcx],xmm4
542	pshufd	xmm6,xmm7,0xaa
543	movdqa	XMMWORD[(208-256)+rcx],xmm5
544	pshufd	xmm7,xmm7,0xff
545	movdqa	XMMWORD[(224-256)+rcx],xmm6
546	movdqa	XMMWORD[(240-256)+rcx],xmm7
547
; Counter row -> xmm0..xmm3; the lanes of xmm0 get counter+0..counter+3.
; xmm0 itself is stored at $L$oop_enter4x.
548	pshufd	xmm0,xmm3,0x00
549	pshufd	xmm1,xmm3,0x55
550	paddd	xmm0,XMMWORD[$L$inc]
551	pshufd	xmm2,xmm3,0xaa
552	movdqa	XMMWORD[(272-256)+rcx],xmm1
553	pshufd	xmm3,xmm3,0xff
554	movdqa	XMMWORD[(288-256)+rcx],xmm2
555	movdqa	XMMWORD[(304-256)+rcx],xmm3
556
557	jmp	NEAR $L$oop_enter4x
558
559ALIGN	32
; ---- later iterations: reload the lane-sliced input state, counters += 4
560$L$oop_outer4x:
561	movdqa	xmm8,XMMWORD[64+rsp]
562	movdqa	xmm9,XMMWORD[80+rsp]
563	movdqa	xmm10,XMMWORD[96+rsp]
564	movdqa	xmm11,XMMWORD[112+rsp]
565	movdqa	xmm12,XMMWORD[((128-256))+rcx]
566	movdqa	xmm13,XMMWORD[((144-256))+rcx]
567	movdqa	xmm14,XMMWORD[((160-256))+rcx]
568	movdqa	xmm15,XMMWORD[((176-256))+rcx]
569	movdqa	xmm4,XMMWORD[((192-256))+rcx]
570	movdqa	xmm5,XMMWORD[((208-256))+rcx]
571	movdqa	xmm6,XMMWORD[((224-256))+rcx]
572	movdqa	xmm7,XMMWORD[((240-256))+rcx]
573	movdqa	xmm0,XMMWORD[((256-256))+rcx]
574	movdqa	xmm1,XMMWORD[((272-256))+rcx]
575	movdqa	xmm2,XMMWORD[((288-256))+rcx]
576	movdqa	xmm3,XMMWORD[((304-256))+rcx]
577	paddd	xmm0,XMMWORD[$L$four]
578
579$L$oop_enter4x:
; Spill state words 10 and 11 to free xmm6/xmm7 as temporaries, preload
; the rot16 mask, and save this iteration's counters.
580	movdqa	XMMWORD[32+rsp],xmm6
581	movdqa	XMMWORD[48+rsp],xmm7
582	movdqa	xmm7,XMMWORD[r10]
583	mov	eax,10
584	movdqa	XMMWORD[(256-256)+rcx],xmm0
585	jmp	NEAR $L$oop4x
586
587ALIGN	32
; ---- double round, executed 10 times ------------------------------------
; Two quarter-rounds are interleaved at a time. Rotations by 16 and 8 are
; pshufb with the masks at [r10] / [r11]; rotations by 12 and 7 are
; pslld/psrld/por through xmm6/xmm7, so the masks are reloaded after use.
588$L$oop4x:
; Column round, columns 0 and 1: (xmm8,xmm12,xmm4,xmm0), (xmm9,xmm13,xmm5,xmm1).
589	paddd	xmm8,xmm12
590	paddd	xmm9,xmm13
591	pxor	xmm0,xmm8
592	pxor	xmm1,xmm9
; pshufb xmm0,xmm7 / pshufb xmm1,xmm7  (rotate by 16)
593DB	102,15,56,0,199
594DB	102,15,56,0,207
595	paddd	xmm4,xmm0
596	paddd	xmm5,xmm1
597	pxor	xmm12,xmm4
598	pxor	xmm13,xmm5
599	movdqa	xmm6,xmm12
600	pslld	xmm12,12
601	psrld	xmm6,20
602	movdqa	xmm7,xmm13
603	pslld	xmm13,12
604	por	xmm12,xmm6
605	psrld	xmm7,20
606	movdqa	xmm6,XMMWORD[r11]
607	por	xmm13,xmm7
608	paddd	xmm8,xmm12
609	paddd	xmm9,xmm13
610	pxor	xmm0,xmm8
611	pxor	xmm1,xmm9
; pshufb xmm0,xmm6 / pshufb xmm1,xmm6  (rotate left by 8)
612DB	102,15,56,0,198
613DB	102,15,56,0,206
614	paddd	xmm4,xmm0
615	paddd	xmm5,xmm1
616	pxor	xmm12,xmm4
617	pxor	xmm13,xmm5
618	movdqa	xmm7,xmm12
619	pslld	xmm12,7
620	psrld	xmm7,25
621	movdqa	xmm6,xmm13
622	pslld	xmm13,7
623	por	xmm12,xmm7
624	psrld	xmm6,25
625	movdqa	xmm7,XMMWORD[r10]
626	por	xmm13,xmm6
; Swap the "c" pair: spill words 8,9 and bring in words 10,11.
627	movdqa	XMMWORD[rsp],xmm4
628	movdqa	XMMWORD[16+rsp],xmm5
629	movdqa	xmm4,XMMWORD[32+rsp]
630	movdqa	xmm5,XMMWORD[48+rsp]
; Column round, columns 2 and 3: (xmm10,xmm14,xmm4,xmm2), (xmm11,xmm15,xmm5,xmm3).
631	paddd	xmm10,xmm14
632	paddd	xmm11,xmm15
633	pxor	xmm2,xmm10
634	pxor	xmm3,xmm11
; pshufb xmm2,xmm7 / pshufb xmm3,xmm7  (rotate by 16)
635DB	102,15,56,0,215
636DB	102,15,56,0,223
637	paddd	xmm4,xmm2
638	paddd	xmm5,xmm3
639	pxor	xmm14,xmm4
640	pxor	xmm15,xmm5
641	movdqa	xmm6,xmm14
642	pslld	xmm14,12
643	psrld	xmm6,20
644	movdqa	xmm7,xmm15
645	pslld	xmm15,12
646	por	xmm14,xmm6
647	psrld	xmm7,20
648	movdqa	xmm6,XMMWORD[r11]
649	por	xmm15,xmm7
650	paddd	xmm10,xmm14
651	paddd	xmm11,xmm15
652	pxor	xmm2,xmm10
653	pxor	xmm3,xmm11
; pshufb xmm2,xmm6 / pshufb xmm3,xmm6  (rotate left by 8)
654DB	102,15,56,0,214
655DB	102,15,56,0,222
656	paddd	xmm4,xmm2
657	paddd	xmm5,xmm3
658	pxor	xmm14,xmm4
659	pxor	xmm15,xmm5
660	movdqa	xmm7,xmm14
661	pslld	xmm14,7
662	psrld	xmm7,25
663	movdqa	xmm6,xmm15
664	pslld	xmm15,7
665	por	xmm14,xmm7
666	psrld	xmm6,25
667	movdqa	xmm7,XMMWORD[r10]
668	por	xmm15,xmm6
; Diagonal round, first pair: words (0,5,10,15) and (1,6,11,12);
; xmm4/xmm5 still hold words 10 and 11.
669	paddd	xmm8,xmm13
670	paddd	xmm9,xmm14
671	pxor	xmm3,xmm8
672	pxor	xmm0,xmm9
; pshufb xmm3,xmm7 / pshufb xmm0,xmm7  (rotate by 16)
673DB	102,15,56,0,223
674DB	102,15,56,0,199
675	paddd	xmm4,xmm3
676	paddd	xmm5,xmm0
677	pxor	xmm13,xmm4
678	pxor	xmm14,xmm5
679	movdqa	xmm6,xmm13
680	pslld	xmm13,12
681	psrld	xmm6,20
682	movdqa	xmm7,xmm14
683	pslld	xmm14,12
684	por	xmm13,xmm6
685	psrld	xmm7,20
686	movdqa	xmm6,XMMWORD[r11]
687	por	xmm14,xmm7
688	paddd	xmm8,xmm13
689	paddd	xmm9,xmm14
690	pxor	xmm3,xmm8
691	pxor	xmm0,xmm9
; pshufb xmm3,xmm6 / pshufb xmm0,xmm6  (rotate left by 8)
692DB	102,15,56,0,222
693DB	102,15,56,0,198
694	paddd	xmm4,xmm3
695	paddd	xmm5,xmm0
696	pxor	xmm13,xmm4
697	pxor	xmm14,xmm5
698	movdqa	xmm7,xmm13
699	pslld	xmm13,7
700	psrld	xmm7,25
701	movdqa	xmm6,xmm14
702	pslld	xmm14,7
703	por	xmm13,xmm7
704	psrld	xmm6,25
705	movdqa	xmm7,XMMWORD[r10]
706	por	xmm14,xmm6
; Swap the "c" pair back: spill words 10,11 and reload words 8,9.
707	movdqa	XMMWORD[32+rsp],xmm4
708	movdqa	XMMWORD[48+rsp],xmm5
709	movdqa	xmm4,XMMWORD[rsp]
710	movdqa	xmm5,XMMWORD[16+rsp]
; Diagonal round, second pair: words (2,7,8,13) and (3,4,9,14).
711	paddd	xmm10,xmm15
712	paddd	xmm11,xmm12
713	pxor	xmm1,xmm10
714	pxor	xmm2,xmm11
; pshufb xmm1,xmm7 / pshufb xmm2,xmm7  (rotate by 16)
715DB	102,15,56,0,207
716DB	102,15,56,0,215
717	paddd	xmm4,xmm1
718	paddd	xmm5,xmm2
719	pxor	xmm15,xmm4
720	pxor	xmm12,xmm5
721	movdqa	xmm6,xmm15
722	pslld	xmm15,12
723	psrld	xmm6,20
724	movdqa	xmm7,xmm12
725	pslld	xmm12,12
726	por	xmm15,xmm6
727	psrld	xmm7,20
728	movdqa	xmm6,XMMWORD[r11]
729	por	xmm12,xmm7
730	paddd	xmm10,xmm15
731	paddd	xmm11,xmm12
732	pxor	xmm1,xmm10
733	pxor	xmm2,xmm11
; pshufb xmm1,xmm6 / pshufb xmm2,xmm6  (rotate left by 8)
734DB	102,15,56,0,206
735DB	102,15,56,0,214
736	paddd	xmm4,xmm1
737	paddd	xmm5,xmm2
738	pxor	xmm15,xmm4
739	pxor	xmm12,xmm5
740	movdqa	xmm7,xmm15
741	pslld	xmm15,7
742	psrld	xmm7,25
743	movdqa	xmm6,xmm12
744	pslld	xmm12,7
745	por	xmm15,xmm7
746	psrld	xmm6,25
747	movdqa	xmm7,XMMWORD[r10]
748	por	xmm12,xmm6
749	dec	eax
750	jnz	NEAR $L$oop4x
751
; ---- finalisation: add the input state, then transpose ------------------
; After each 4x4 dword transpose (punpck{l,h}dq + punpck{l,h}qdq) a
; register holds 16 consecutive keystream bytes of ONE block instead of
; one word of four blocks.
; Words 0-3: result in xmm8, xmm9, xmm6, xmm11 (blocks 0..3).
752	paddd	xmm8,XMMWORD[64+rsp]
753	paddd	xmm9,XMMWORD[80+rsp]
754	paddd	xmm10,XMMWORD[96+rsp]
755	paddd	xmm11,XMMWORD[112+rsp]
756
757	movdqa	xmm6,xmm8
758	punpckldq	xmm8,xmm9
759	movdqa	xmm7,xmm10
760	punpckldq	xmm10,xmm11
761	punpckhdq	xmm6,xmm9
762	punpckhdq	xmm7,xmm11
763	movdqa	xmm9,xmm8
764	punpcklqdq	xmm8,xmm10
765	movdqa	xmm11,xmm6
766	punpcklqdq	xmm6,xmm7
767	punpckhqdq	xmm9,xmm10
768	punpckhqdq	xmm11,xmm7
; Words 4-7: result in xmm12, xmm13, xmm10, xmm15.
769	paddd	xmm12,XMMWORD[((128-256))+rcx]
770	paddd	xmm13,XMMWORD[((144-256))+rcx]
771	paddd	xmm14,XMMWORD[((160-256))+rcx]
772	paddd	xmm15,XMMWORD[((176-256))+rcx]
773
; Offload rows 0-3 of blocks 0 and 1, and fetch the spilled words 10,11
; into xmm8/xmm9.
774	movdqa	XMMWORD[rsp],xmm8
775	movdqa	XMMWORD[16+rsp],xmm9
776	movdqa	xmm8,XMMWORD[32+rsp]
777	movdqa	xmm9,XMMWORD[48+rsp]
778
779	movdqa	xmm10,xmm12
780	punpckldq	xmm12,xmm13
781	movdqa	xmm7,xmm14
782	punpckldq	xmm14,xmm15
783	punpckhdq	xmm10,xmm13
784	punpckhdq	xmm7,xmm15
785	movdqa	xmm13,xmm12
786	punpcklqdq	xmm12,xmm14
787	movdqa	xmm15,xmm10
788	punpcklqdq	xmm10,xmm7
789	punpckhqdq	xmm13,xmm14
790	punpckhqdq	xmm15,xmm7
; Words 8-11: result in xmm4, xmm5, xmm14, xmm9.
791	paddd	xmm4,XMMWORD[((192-256))+rcx]
792	paddd	xmm5,XMMWORD[((208-256))+rcx]
793	paddd	xmm8,XMMWORD[((224-256))+rcx]
794	paddd	xmm9,XMMWORD[((240-256))+rcx]
795
; Offload rows 0-3 of blocks 2 and 3.
796	movdqa	XMMWORD[32+rsp],xmm6
797	movdqa	XMMWORD[48+rsp],xmm11
798
799	movdqa	xmm14,xmm4
800	punpckldq	xmm4,xmm5
801	movdqa	xmm7,xmm8
802	punpckldq	xmm8,xmm9
803	punpckhdq	xmm14,xmm5
804	punpckhdq	xmm7,xmm9
805	movdqa	xmm5,xmm4
806	punpcklqdq	xmm4,xmm8
807	movdqa	xmm9,xmm14
808	punpcklqdq	xmm14,xmm7
809	punpckhqdq	xmm5,xmm8
810	punpckhqdq	xmm9,xmm7
; Words 12-15: result in xmm0, xmm1, xmm8, xmm3.
811	paddd	xmm0,XMMWORD[((256-256))+rcx]
812	paddd	xmm1,XMMWORD[((272-256))+rcx]
813	paddd	xmm2,XMMWORD[((288-256))+rcx]
814	paddd	xmm3,XMMWORD[((304-256))+rcx]
815
816	movdqa	xmm8,xmm0
817	punpckldq	xmm0,xmm1
818	movdqa	xmm7,xmm2
819	punpckldq	xmm2,xmm3
820	punpckhdq	xmm8,xmm1
821	punpckhdq	xmm7,xmm3
822	movdqa	xmm1,xmm0
823	punpcklqdq	xmm0,xmm2
824	movdqa	xmm3,xmm8
825	punpcklqdq	xmm8,xmm7
826	punpckhqdq	xmm1,xmm2
827	punpckhqdq	xmm3,xmm7
; Keystream layout at this point (16 bytes each, in output order):
;   block 0: [rsp],    xmm12, xmm4,  xmm0
;   block 1: [16+rsp], xmm13, xmm5,  xmm1
;   block 2: [32+rsp], xmm10, xmm14, xmm8
;   block 3: [48+rsp], xmm15, xmm9,  xmm3
828	cmp	rdx,64*4
829	jb	NEAR $L$tail4x
830
; ---- 256 or more bytes left: XOR and store all four blocks --------------
; xmm6, xmm11, xmm2, xmm7 are free here and carry the input data.
831	movdqu	xmm6,XMMWORD[rsi]
832	movdqu	xmm11,XMMWORD[16+rsi]
833	movdqu	xmm2,XMMWORD[32+rsi]
834	movdqu	xmm7,XMMWORD[48+rsi]
835	pxor	xmm6,XMMWORD[rsp]
836	pxor	xmm11,xmm12
837	pxor	xmm2,xmm4
838	pxor	xmm7,xmm0
839
840	movdqu	XMMWORD[rdi],xmm6
841	movdqu	xmm6,XMMWORD[64+rsi]
842	movdqu	XMMWORD[16+rdi],xmm11
843	movdqu	xmm11,XMMWORD[80+rsi]
844	movdqu	XMMWORD[32+rdi],xmm2
845	movdqu	xmm2,XMMWORD[96+rsi]
846	movdqu	XMMWORD[48+rdi],xmm7
847	movdqu	xmm7,XMMWORD[112+rsi]
848	lea	rsi,[128+rsi]
849	pxor	xmm6,XMMWORD[16+rsp]
850	pxor	xmm11,xmm13
851	pxor	xmm2,xmm5
852	pxor	xmm7,xmm1
853
854	movdqu	XMMWORD[64+rdi],xmm6
855	movdqu	xmm6,XMMWORD[rsi]
856	movdqu	XMMWORD[80+rdi],xmm11
857	movdqu	xmm11,XMMWORD[16+rsi]
858	movdqu	XMMWORD[96+rdi],xmm2
859	movdqu	xmm2,XMMWORD[32+rsi]
860	movdqu	XMMWORD[112+rdi],xmm7
861	lea	rdi,[128+rdi]
862	movdqu	xmm7,XMMWORD[48+rsi]
863	pxor	xmm6,XMMWORD[32+rsp]
864	pxor	xmm11,xmm10
865	pxor	xmm2,xmm14
866	pxor	xmm7,xmm8
867
868	movdqu	XMMWORD[rdi],xmm6
869	movdqu	xmm6,XMMWORD[64+rsi]
870	movdqu	XMMWORD[16+rdi],xmm11
871	movdqu	xmm11,XMMWORD[80+rsi]
872	movdqu	XMMWORD[32+rdi],xmm2
873	movdqu	xmm2,XMMWORD[96+rsi]
874	movdqu	XMMWORD[48+rdi],xmm7
875	movdqu	xmm7,XMMWORD[112+rsi]
876	lea	rsi,[128+rsi]
877	pxor	xmm6,XMMWORD[48+rsp]
878	pxor	xmm11,xmm15
879	pxor	xmm2,xmm9
880	pxor	xmm7,xmm3
881	movdqu	XMMWORD[64+rdi],xmm6
882	movdqu	XMMWORD[80+rdi],xmm11
883	movdqu	XMMWORD[96+rdi],xmm2
884	movdqu	XMMWORD[112+rdi],xmm7
885	lea	rdi,[128+rdi]
886
887	sub	rdx,64*4
888	jnz	NEAR $L$oop_outer4x
889
890	jmp	NEAR $L$done4x
891
; ---- fewer than 256 bytes left ------------------------------------------
; Whole 64-byte blocks are XORed directly; the keystream block covering
; the remaining 1..63 bytes is then laid out at [0..63+rsp] for the
; byte-wise loop. r10 is reused as the byte index.
892$L$tail4x:
893	cmp	rdx,192
894	jae	NEAR $L$192_or_more4x
895	cmp	rdx,128
896	jae	NEAR $L$128_or_more4x
897	cmp	rdx,64
898	jae	NEAR $L$64_or_more4x
899
900
; Under 64 bytes: block 0's first 16 bytes already sit at [rsp]; store
; the other three rows next to them.
901	xor	r10,r10
902
903	movdqa	XMMWORD[16+rsp],xmm12
904	movdqa	XMMWORD[32+rsp],xmm4
905	movdqa	XMMWORD[48+rsp],xmm0
906	jmp	NEAR $L$oop_tail4x
907
908ALIGN	32
; 64..127 bytes: emit block 0, then stage block 1 for the byte loop.
909$L$64_or_more4x:
910	movdqu	xmm6,XMMWORD[rsi]
911	movdqu	xmm11,XMMWORD[16+rsi]
912	movdqu	xmm2,XMMWORD[32+rsi]
913	movdqu	xmm7,XMMWORD[48+rsi]
914	pxor	xmm6,XMMWORD[rsp]
915	pxor	xmm11,xmm12
916	pxor	xmm2,xmm4
917	pxor	xmm7,xmm0
918	movdqu	XMMWORD[rdi],xmm6
919	movdqu	XMMWORD[16+rdi],xmm11
920	movdqu	XMMWORD[32+rdi],xmm2
921	movdqu	XMMWORD[48+rdi],xmm7
; ZF is still the result of "cmp rdx,64" in $L$tail4x (the SSE moves and
; pxor above do not write flags): exactly 64 bytes means we are done.
922	je	NEAR $L$done4x
923
924	movdqa	xmm6,XMMWORD[16+rsp]
925	lea	rsi,[64+rsi]
926	xor	r10,r10
927	movdqa	XMMWORD[rsp],xmm6
928	movdqa	XMMWORD[16+rsp],xmm13
929	lea	rdi,[64+rdi]
930	movdqa	XMMWORD[32+rsp],xmm5
931	sub	rdx,64
932	movdqa	XMMWORD[48+rsp],xmm1
933	jmp	NEAR $L$oop_tail4x
934
935ALIGN	32
; 128..191 bytes: emit blocks 0 and 1, then stage block 2.
936$L$128_or_more4x:
937	movdqu	xmm6,XMMWORD[rsi]
938	movdqu	xmm11,XMMWORD[16+rsi]
939	movdqu	xmm2,XMMWORD[32+rsi]
940	movdqu	xmm7,XMMWORD[48+rsi]
941	pxor	xmm6,XMMWORD[rsp]
942	pxor	xmm11,xmm12
943	pxor	xmm2,xmm4
944	pxor	xmm7,xmm0
945
946	movdqu	XMMWORD[rdi],xmm6
947	movdqu	xmm6,XMMWORD[64+rsi]
948	movdqu	XMMWORD[16+rdi],xmm11
949	movdqu	xmm11,XMMWORD[80+rsi]
950	movdqu	XMMWORD[32+rdi],xmm2
951	movdqu	xmm2,XMMWORD[96+rsi]
952	movdqu	XMMWORD[48+rdi],xmm7
953	movdqu	xmm7,XMMWORD[112+rsi]
954	pxor	xmm6,XMMWORD[16+rsp]
955	pxor	xmm11,xmm13
956	pxor	xmm2,xmm5
957	pxor	xmm7,xmm1
958	movdqu	XMMWORD[64+rdi],xmm6
959	movdqu	XMMWORD[80+rdi],xmm11
960	movdqu	XMMWORD[96+rdi],xmm2
961	movdqu	XMMWORD[112+rdi],xmm7
; ZF still comes from "cmp rdx,128" in $L$tail4x.
962	je	NEAR $L$done4x
963
964	movdqa	xmm6,XMMWORD[32+rsp]
965	lea	rsi,[128+rsi]
966	xor	r10,r10
967	movdqa	XMMWORD[rsp],xmm6
968	movdqa	XMMWORD[16+rsp],xmm10
969	lea	rdi,[128+rdi]
970	movdqa	XMMWORD[32+rsp],xmm14
971	sub	rdx,128
972	movdqa	XMMWORD[48+rsp],xmm8
973	jmp	NEAR $L$oop_tail4x
974
975ALIGN	32
; 192..255 bytes: emit blocks 0, 1 and 2, then stage block 3.
976$L$192_or_more4x:
977	movdqu	xmm6,XMMWORD[rsi]
978	movdqu	xmm11,XMMWORD[16+rsi]
979	movdqu	xmm2,XMMWORD[32+rsi]
980	movdqu	xmm7,XMMWORD[48+rsi]
981	pxor	xmm6,XMMWORD[rsp]
982	pxor	xmm11,xmm12
983	pxor	xmm2,xmm4
984	pxor	xmm7,xmm0
985
986	movdqu	XMMWORD[rdi],xmm6
987	movdqu	xmm6,XMMWORD[64+rsi]
988	movdqu	XMMWORD[16+rdi],xmm11
989	movdqu	xmm11,XMMWORD[80+rsi]
990	movdqu	XMMWORD[32+rdi],xmm2
991	movdqu	xmm2,XMMWORD[96+rsi]
992	movdqu	XMMWORD[48+rdi],xmm7
993	movdqu	xmm7,XMMWORD[112+rsi]
994	lea	rsi,[128+rsi]
995	pxor	xmm6,XMMWORD[16+rsp]
996	pxor	xmm11,xmm13
997	pxor	xmm2,xmm5
998	pxor	xmm7,xmm1
999
1000	movdqu	XMMWORD[64+rdi],xmm6
1001	movdqu	xmm6,XMMWORD[rsi]
1002	movdqu	XMMWORD[80+rdi],xmm11
1003	movdqu	xmm11,XMMWORD[16+rsi]
1004	movdqu	XMMWORD[96+rdi],xmm2
1005	movdqu	xmm2,XMMWORD[32+rsi]
1006	movdqu	XMMWORD[112+rdi],xmm7
1007	lea	rdi,[128+rdi]
1008	movdqu	xmm7,XMMWORD[48+rsi]
1009	pxor	xmm6,XMMWORD[32+rsp]
1010	pxor	xmm11,xmm10
1011	pxor	xmm2,xmm14
1012	pxor	xmm7,xmm8
1013	movdqu	XMMWORD[rdi],xmm6
1014	movdqu	XMMWORD[16+rdi],xmm11
1015	movdqu	XMMWORD[32+rdi],xmm2
1016	movdqu	XMMWORD[48+rdi],xmm7
; ZF still comes from "cmp rdx,192" in $L$tail4x (lea does not write flags).
1017	je	NEAR $L$done4x
1018
; rsi/rdi were already advanced by 128 above, so +64 totals 192.
1019	movdqa	xmm6,XMMWORD[48+rsp]
1020	lea	rsi,[64+rsi]
1021	xor	r10,r10
1022	movdqa	XMMWORD[rsp],xmm6
1023	movdqa	XMMWORD[16+rsp],xmm15
1024	lea	rdi,[64+rdi]
1025	movdqa	XMMWORD[32+rsp],xmm9
1026	sub	rdx,192
1027	movdqa	XMMWORD[48+rsp],xmm3
1028
; ---- byte-wise XOR of the last 1..63 bytes against [0..63+rsp] ----------
1029$L$oop_tail4x:
1030	movzx	eax,BYTE[r10*1+rsi]
1031	movzx	ecx,BYTE[r10*1+rsp]
1032	lea	r10,[1+r10]
1033	xor	eax,ecx
; r10 was already incremented, hence the -1 displacement.
1034	mov	BYTE[((-1))+r10*1+rdi],al
1035	dec	rdx
1036	jnz	NEAR $L$oop_tail4x
1037
; ---- epilogue: restore xmm6..xmm15, unwind to the entry rsp, reload
; ---- rdi/rsi
1038$L$done4x:
1039	movaps	xmm6,XMMWORD[((-168))+r9]
1040	movaps	xmm7,XMMWORD[((-152))+r9]
1041	movaps	xmm8,XMMWORD[((-136))+r9]
1042	movaps	xmm9,XMMWORD[((-120))+r9]
1043	movaps	xmm10,XMMWORD[((-104))+r9]
1044	movaps	xmm11,XMMWORD[((-88))+r9]
1045	movaps	xmm12,XMMWORD[((-72))+r9]
1046	movaps	xmm13,XMMWORD[((-56))+r9]
1047	movaps	xmm14,XMMWORD[((-40))+r9]
1048	movaps	xmm15,XMMWORD[((-24))+r9]
1049	lea	rsp,[r9]
1050$L$4x_epilogue:
1051	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1052	mov	rsi,QWORD[16+rsp]
1053	DB	0F3h,0C3h		;repret
1054$L$SEH_end_ChaCha20_4x:
1055
1056ALIGN	32
1057ChaCha20_8x:
1058	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1059	mov	QWORD[16+rsp],rsi
1060	mov	rax,rsp
1061$L$SEH_begin_ChaCha20_8x:
1062	mov	rdi,rcx
1063	mov	rsi,rdx
1064	mov	rdx,r8
1065	mov	rcx,r9
1066	mov	r8,QWORD[40+rsp]
1067
1068
1069$L$ChaCha20_8x:
1070	mov	r9,rsp
1071	sub	rsp,0x280+168
1072	and	rsp,-32
1073	movaps	XMMWORD[(-168)+r9],xmm6
1074	movaps	XMMWORD[(-152)+r9],xmm7
1075	movaps	XMMWORD[(-136)+r9],xmm8
1076	movaps	XMMWORD[(-120)+r9],xmm9
1077	movaps	XMMWORD[(-104)+r9],xmm10
1078	movaps	XMMWORD[(-88)+r9],xmm11
1079	movaps	XMMWORD[(-72)+r9],xmm12
1080	movaps	XMMWORD[(-56)+r9],xmm13
1081	movaps	XMMWORD[(-40)+r9],xmm14
1082	movaps	XMMWORD[(-24)+r9],xmm15
1083$L$8x_body:
1084	vzeroupper
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095	vbroadcasti128	ymm11,XMMWORD[$L$sigma]
1096	vbroadcasti128	ymm3,XMMWORD[rcx]
1097	vbroadcasti128	ymm15,XMMWORD[16+rcx]
1098	vbroadcasti128	ymm7,XMMWORD[r8]
1099	lea	rcx,[256+rsp]
1100	lea	rax,[512+rsp]
1101	lea	r10,[$L$rot16]
1102	lea	r11,[$L$rot24]
1103
1104	vpshufd	ymm8,ymm11,0x00
1105	vpshufd	ymm9,ymm11,0x55
1106	vmovdqa	YMMWORD[(128-256)+rcx],ymm8
1107	vpshufd	ymm10,ymm11,0xaa
1108	vmovdqa	YMMWORD[(160-256)+rcx],ymm9
1109	vpshufd	ymm11,ymm11,0xff
1110	vmovdqa	YMMWORD[(192-256)+rcx],ymm10
1111	vmovdqa	YMMWORD[(224-256)+rcx],ymm11
1112
1113	vpshufd	ymm0,ymm3,0x00
1114	vpshufd	ymm1,ymm3,0x55
1115	vmovdqa	YMMWORD[(256-256)+rcx],ymm0
1116	vpshufd	ymm2,ymm3,0xaa
1117	vmovdqa	YMMWORD[(288-256)+rcx],ymm1
1118	vpshufd	ymm3,ymm3,0xff
1119	vmovdqa	YMMWORD[(320-256)+rcx],ymm2
1120	vmovdqa	YMMWORD[(352-256)+rcx],ymm3
1121
1122	vpshufd	ymm12,ymm15,0x00
1123	vpshufd	ymm13,ymm15,0x55
1124	vmovdqa	YMMWORD[(384-512)+rax],ymm12
1125	vpshufd	ymm14,ymm15,0xaa
1126	vmovdqa	YMMWORD[(416-512)+rax],ymm13
1127	vpshufd	ymm15,ymm15,0xff
1128	vmovdqa	YMMWORD[(448-512)+rax],ymm14
1129	vmovdqa	YMMWORD[(480-512)+rax],ymm15
1130
1131	vpshufd	ymm4,ymm7,0x00
1132	vpshufd	ymm5,ymm7,0x55
1133	vpaddd	ymm4,ymm4,YMMWORD[$L$incy]
1134	vpshufd	ymm6,ymm7,0xaa
1135	vmovdqa	YMMWORD[(544-512)+rax],ymm5
1136	vpshufd	ymm7,ymm7,0xff
1137	vmovdqa	YMMWORD[(576-512)+rax],ymm6
1138	vmovdqa	YMMWORD[(608-512)+rax],ymm7
1139
1140	jmp	NEAR $L$oop_enter8x
1141
1142ALIGN	32
1143$L$oop_outer8x:
1144	vmovdqa	ymm8,YMMWORD[((128-256))+rcx]
1145	vmovdqa	ymm9,YMMWORD[((160-256))+rcx]
1146	vmovdqa	ymm10,YMMWORD[((192-256))+rcx]
1147	vmovdqa	ymm11,YMMWORD[((224-256))+rcx]
1148	vmovdqa	ymm0,YMMWORD[((256-256))+rcx]
1149	vmovdqa	ymm1,YMMWORD[((288-256))+rcx]
1150	vmovdqa	ymm2,YMMWORD[((320-256))+rcx]
1151	vmovdqa	ymm3,YMMWORD[((352-256))+rcx]
1152	vmovdqa	ymm12,YMMWORD[((384-512))+rax]
1153	vmovdqa	ymm13,YMMWORD[((416-512))+rax]
1154	vmovdqa	ymm14,YMMWORD[((448-512))+rax]
1155	vmovdqa	ymm15,YMMWORD[((480-512))+rax]
1156	vmovdqa	ymm4,YMMWORD[((512-512))+rax]
1157	vmovdqa	ymm5,YMMWORD[((544-512))+rax]
1158	vmovdqa	ymm6,YMMWORD[((576-512))+rax]
1159	vmovdqa	ymm7,YMMWORD[((608-512))+rax]
1160	vpaddd	ymm4,ymm4,YMMWORD[$L$eight]
1161
1162$L$oop_enter8x:
1163	vmovdqa	YMMWORD[64+rsp],ymm14
1164	vmovdqa	YMMWORD[96+rsp],ymm15
1165	vbroadcasti128	ymm15,XMMWORD[r10]
1166	vmovdqa	YMMWORD[(512-512)+rax],ymm4
1167	mov	eax,10
1168	jmp	NEAR $L$oop8x
1169
1170ALIGN	32
1171$L$oop8x:
1172	vpaddd	ymm8,ymm8,ymm0
1173	vpxor	ymm4,ymm8,ymm4
1174	vpshufb	ymm4,ymm4,ymm15
1175	vpaddd	ymm9,ymm9,ymm1
1176	vpxor	ymm5,ymm9,ymm5
1177	vpshufb	ymm5,ymm5,ymm15
1178	vpaddd	ymm12,ymm12,ymm4
1179	vpxor	ymm0,ymm12,ymm0
1180	vpslld	ymm14,ymm0,12
1181	vpsrld	ymm0,ymm0,20
1182	vpor	ymm0,ymm14,ymm0
1183	vbroadcasti128	ymm14,XMMWORD[r11]
1184	vpaddd	ymm13,ymm13,ymm5
1185	vpxor	ymm1,ymm13,ymm1
1186	vpslld	ymm15,ymm1,12
1187	vpsrld	ymm1,ymm1,20
1188	vpor	ymm1,ymm15,ymm1
1189	vpaddd	ymm8,ymm8,ymm0
1190	vpxor	ymm4,ymm8,ymm4
1191	vpshufb	ymm4,ymm4,ymm14
1192	vpaddd	ymm9,ymm9,ymm1
1193	vpxor	ymm5,ymm9,ymm5
1194	vpshufb	ymm5,ymm5,ymm14
1195	vpaddd	ymm12,ymm12,ymm4
1196	vpxor	ymm0,ymm12,ymm0
1197	vpslld	ymm15,ymm0,7
1198	vpsrld	ymm0,ymm0,25
1199	vpor	ymm0,ymm15,ymm0
1200	vbroadcasti128	ymm15,XMMWORD[r10]
1201	vpaddd	ymm13,ymm13,ymm5
1202	vpxor	ymm1,ymm13,ymm1
1203	vpslld	ymm14,ymm1,7
1204	vpsrld	ymm1,ymm1,25
1205	vpor	ymm1,ymm14,ymm1
1206	vmovdqa	YMMWORD[rsp],ymm12
1207	vmovdqa	YMMWORD[32+rsp],ymm13
1208	vmovdqa	ymm12,YMMWORD[64+rsp]
1209	vmovdqa	ymm13,YMMWORD[96+rsp]
1210	vpaddd	ymm10,ymm10,ymm2
1211	vpxor	ymm6,ymm10,ymm6
1212	vpshufb	ymm6,ymm6,ymm15
1213	vpaddd	ymm11,ymm11,ymm3
1214	vpxor	ymm7,ymm11,ymm7
1215	vpshufb	ymm7,ymm7,ymm15
1216	vpaddd	ymm12,ymm12,ymm6
1217	vpxor	ymm2,ymm12,ymm2
1218	vpslld	ymm14,ymm2,12
1219	vpsrld	ymm2,ymm2,20
1220	vpor	ymm2,ymm14,ymm2
1221	vbroadcasti128	ymm14,XMMWORD[r11]
1222	vpaddd	ymm13,ymm13,ymm7
1223	vpxor	ymm3,ymm13,ymm3
1224	vpslld	ymm15,ymm3,12
1225	vpsrld	ymm3,ymm3,20
1226	vpor	ymm3,ymm15,ymm3
1227	vpaddd	ymm10,ymm10,ymm2
1228	vpxor	ymm6,ymm10,ymm6
1229	vpshufb	ymm6,ymm6,ymm14
1230	vpaddd	ymm11,ymm11,ymm3
1231	vpxor	ymm7,ymm11,ymm7
1232	vpshufb	ymm7,ymm7,ymm14
1233	vpaddd	ymm12,ymm12,ymm6
1234	vpxor	ymm2,ymm12,ymm2
1235	vpslld	ymm15,ymm2,7
1236	vpsrld	ymm2,ymm2,25
1237	vpor	ymm2,ymm15,ymm2
1238	vbroadcasti128	ymm15,XMMWORD[r10]
1239	vpaddd	ymm13,ymm13,ymm7
1240	vpxor	ymm3,ymm13,ymm3
1241	vpslld	ymm14,ymm3,7
1242	vpsrld	ymm3,ymm3,25
1243	vpor	ymm3,ymm14,ymm3
1244	vpaddd	ymm8,ymm8,ymm1
1245	vpxor	ymm7,ymm8,ymm7
1246	vpshufb	ymm7,ymm7,ymm15
1247	vpaddd	ymm9,ymm9,ymm2
1248	vpxor	ymm4,ymm9,ymm4
1249	vpshufb	ymm4,ymm4,ymm15
1250	vpaddd	ymm12,ymm12,ymm7
1251	vpxor	ymm1,ymm12,ymm1
1252	vpslld	ymm14,ymm1,12
1253	vpsrld	ymm1,ymm1,20
1254	vpor	ymm1,ymm14,ymm1
1255	vbroadcasti128	ymm14,XMMWORD[r11]
1256	vpaddd	ymm13,ymm13,ymm4
1257	vpxor	ymm2,ymm13,ymm2
1258	vpslld	ymm15,ymm2,12
1259	vpsrld	ymm2,ymm2,20
1260	vpor	ymm2,ymm15,ymm2
1261	vpaddd	ymm8,ymm8,ymm1
1262	vpxor	ymm7,ymm8,ymm7
1263	vpshufb	ymm7,ymm7,ymm14
1264	vpaddd	ymm9,ymm9,ymm2
1265	vpxor	ymm4,ymm9,ymm4
1266	vpshufb	ymm4,ymm4,ymm14
1267	vpaddd	ymm12,ymm12,ymm7
1268	vpxor	ymm1,ymm12,ymm1
1269	vpslld	ymm15,ymm1,7
1270	vpsrld	ymm1,ymm1,25
1271	vpor	ymm1,ymm15,ymm1
1272	vbroadcasti128	ymm15,XMMWORD[r10]
1273	vpaddd	ymm13,ymm13,ymm4
1274	vpxor	ymm2,ymm13,ymm2
1275	vpslld	ymm14,ymm2,7
1276	vpsrld	ymm2,ymm2,25
1277	vpor	ymm2,ymm14,ymm2
1278	vmovdqa	YMMWORD[64+rsp],ymm12
1279	vmovdqa	YMMWORD[96+rsp],ymm13
1280	vmovdqa	ymm12,YMMWORD[rsp]
1281	vmovdqa	ymm13,YMMWORD[32+rsp]
1282	vpaddd	ymm10,ymm10,ymm3
1283	vpxor	ymm5,ymm10,ymm5
1284	vpshufb	ymm5,ymm5,ymm15
1285	vpaddd	ymm11,ymm11,ymm0
1286	vpxor	ymm6,ymm11,ymm6
1287	vpshufb	ymm6,ymm6,ymm15
1288	vpaddd	ymm12,ymm12,ymm5
1289	vpxor	ymm3,ymm12,ymm3
1290	vpslld	ymm14,ymm3,12
1291	vpsrld	ymm3,ymm3,20
1292	vpor	ymm3,ymm14,ymm3
1293	vbroadcasti128	ymm14,XMMWORD[r11]
1294	vpaddd	ymm13,ymm13,ymm6
1295	vpxor	ymm0,ymm13,ymm0
1296	vpslld	ymm15,ymm0,12
1297	vpsrld	ymm0,ymm0,20
1298	vpor	ymm0,ymm15,ymm0
1299	vpaddd	ymm10,ymm10,ymm3
1300	vpxor	ymm5,ymm10,ymm5
1301	vpshufb	ymm5,ymm5,ymm14
1302	vpaddd	ymm11,ymm11,ymm0
1303	vpxor	ymm6,ymm11,ymm6
1304	vpshufb	ymm6,ymm6,ymm14
1305	vpaddd	ymm12,ymm12,ymm5
1306	vpxor	ymm3,ymm12,ymm3
1307	vpslld	ymm15,ymm3,7
1308	vpsrld	ymm3,ymm3,25
1309	vpor	ymm3,ymm15,ymm3
1310	vbroadcasti128	ymm15,XMMWORD[r10]
1311	vpaddd	ymm13,ymm13,ymm6
1312	vpxor	ymm0,ymm13,ymm0
1313	vpslld	ymm14,ymm0,7
1314	vpsrld	ymm0,ymm0,25
1315	vpor	ymm0,ymm14,ymm0
1316	dec	eax
1317	jnz	NEAR $L$oop8x
1318
1319	lea	rax,[512+rsp]
1320	vpaddd	ymm8,ymm8,YMMWORD[((128-256))+rcx]
1321	vpaddd	ymm9,ymm9,YMMWORD[((160-256))+rcx]
1322	vpaddd	ymm10,ymm10,YMMWORD[((192-256))+rcx]
1323	vpaddd	ymm11,ymm11,YMMWORD[((224-256))+rcx]
1324
1325	vpunpckldq	ymm14,ymm8,ymm9
1326	vpunpckldq	ymm15,ymm10,ymm11
1327	vpunpckhdq	ymm8,ymm8,ymm9
1328	vpunpckhdq	ymm10,ymm10,ymm11
1329	vpunpcklqdq	ymm9,ymm14,ymm15
1330	vpunpckhqdq	ymm14,ymm14,ymm15
1331	vpunpcklqdq	ymm11,ymm8,ymm10
1332	vpunpckhqdq	ymm8,ymm8,ymm10
1333	vpaddd	ymm0,ymm0,YMMWORD[((256-256))+rcx]
1334	vpaddd	ymm1,ymm1,YMMWORD[((288-256))+rcx]
1335	vpaddd	ymm2,ymm2,YMMWORD[((320-256))+rcx]
1336	vpaddd	ymm3,ymm3,YMMWORD[((352-256))+rcx]
1337
1338	vpunpckldq	ymm10,ymm0,ymm1
1339	vpunpckldq	ymm15,ymm2,ymm3
1340	vpunpckhdq	ymm0,ymm0,ymm1
1341	vpunpckhdq	ymm2,ymm2,ymm3
1342	vpunpcklqdq	ymm1,ymm10,ymm15
1343	vpunpckhqdq	ymm10,ymm10,ymm15
1344	vpunpcklqdq	ymm3,ymm0,ymm2
1345	vpunpckhqdq	ymm0,ymm0,ymm2
1346	vperm2i128	ymm15,ymm9,ymm1,0x20
1347	vperm2i128	ymm1,ymm9,ymm1,0x31
1348	vperm2i128	ymm9,ymm14,ymm10,0x20
1349	vperm2i128	ymm10,ymm14,ymm10,0x31
1350	vperm2i128	ymm14,ymm11,ymm3,0x20
1351	vperm2i128	ymm3,ymm11,ymm3,0x31
1352	vperm2i128	ymm11,ymm8,ymm0,0x20
1353	vperm2i128	ymm0,ymm8,ymm0,0x31
1354	vmovdqa	YMMWORD[rsp],ymm15
1355	vmovdqa	YMMWORD[32+rsp],ymm9
1356	vmovdqa	ymm15,YMMWORD[64+rsp]
1357	vmovdqa	ymm9,YMMWORD[96+rsp]
1358
1359	vpaddd	ymm12,ymm12,YMMWORD[((384-512))+rax]
1360	vpaddd	ymm13,ymm13,YMMWORD[((416-512))+rax]
1361	vpaddd	ymm15,ymm15,YMMWORD[((448-512))+rax]
1362	vpaddd	ymm9,ymm9,YMMWORD[((480-512))+rax]
1363
1364	vpunpckldq	ymm2,ymm12,ymm13
1365	vpunpckldq	ymm8,ymm15,ymm9
1366	vpunpckhdq	ymm12,ymm12,ymm13
1367	vpunpckhdq	ymm15,ymm15,ymm9
1368	vpunpcklqdq	ymm13,ymm2,ymm8
1369	vpunpckhqdq	ymm2,ymm2,ymm8
1370	vpunpcklqdq	ymm9,ymm12,ymm15
1371	vpunpckhqdq	ymm12,ymm12,ymm15
1372	vpaddd	ymm4,ymm4,YMMWORD[((512-512))+rax]
1373	vpaddd	ymm5,ymm5,YMMWORD[((544-512))+rax]
1374	vpaddd	ymm6,ymm6,YMMWORD[((576-512))+rax]
1375	vpaddd	ymm7,ymm7,YMMWORD[((608-512))+rax]
1376
1377	vpunpckldq	ymm15,ymm4,ymm5
1378	vpunpckldq	ymm8,ymm6,ymm7
1379	vpunpckhdq	ymm4,ymm4,ymm5
1380	vpunpckhdq	ymm6,ymm6,ymm7
1381	vpunpcklqdq	ymm5,ymm15,ymm8
1382	vpunpckhqdq	ymm15,ymm15,ymm8
1383	vpunpcklqdq	ymm7,ymm4,ymm6
1384	vpunpckhqdq	ymm4,ymm4,ymm6
1385	vperm2i128	ymm8,ymm13,ymm5,0x20
1386	vperm2i128	ymm5,ymm13,ymm5,0x31
1387	vperm2i128	ymm13,ymm2,ymm15,0x20
1388	vperm2i128	ymm15,ymm2,ymm15,0x31
1389	vperm2i128	ymm2,ymm9,ymm7,0x20
1390	vperm2i128	ymm7,ymm9,ymm7,0x31
1391	vperm2i128	ymm9,ymm12,ymm4,0x20
1392	vperm2i128	ymm4,ymm12,ymm4,0x31
1393	vmovdqa	ymm6,YMMWORD[rsp]
1394	vmovdqa	ymm12,YMMWORD[32+rsp]
1395
1396	cmp	rdx,64*8
1397	jb	NEAR $L$tail8x
1398
1399	vpxor	ymm6,ymm6,YMMWORD[rsi]
1400	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1401	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1402	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1403	lea	rsi,[128+rsi]
1404	vmovdqu	YMMWORD[rdi],ymm6
1405	vmovdqu	YMMWORD[32+rdi],ymm8
1406	vmovdqu	YMMWORD[64+rdi],ymm1
1407	vmovdqu	YMMWORD[96+rdi],ymm5
1408	lea	rdi,[128+rdi]
1409
1410	vpxor	ymm12,ymm12,YMMWORD[rsi]
1411	vpxor	ymm13,ymm13,YMMWORD[32+rsi]
1412	vpxor	ymm10,ymm10,YMMWORD[64+rsi]
1413	vpxor	ymm15,ymm15,YMMWORD[96+rsi]
1414	lea	rsi,[128+rsi]
1415	vmovdqu	YMMWORD[rdi],ymm12
1416	vmovdqu	YMMWORD[32+rdi],ymm13
1417	vmovdqu	YMMWORD[64+rdi],ymm10
1418	vmovdqu	YMMWORD[96+rdi],ymm15
1419	lea	rdi,[128+rdi]
1420
1421	vpxor	ymm14,ymm14,YMMWORD[rsi]
1422	vpxor	ymm2,ymm2,YMMWORD[32+rsi]
1423	vpxor	ymm3,ymm3,YMMWORD[64+rsi]
1424	vpxor	ymm7,ymm7,YMMWORD[96+rsi]
1425	lea	rsi,[128+rsi]
1426	vmovdqu	YMMWORD[rdi],ymm14
1427	vmovdqu	YMMWORD[32+rdi],ymm2
1428	vmovdqu	YMMWORD[64+rdi],ymm3
1429	vmovdqu	YMMWORD[96+rdi],ymm7
1430	lea	rdi,[128+rdi]
1431
1432	vpxor	ymm11,ymm11,YMMWORD[rsi]
1433	vpxor	ymm9,ymm9,YMMWORD[32+rsi]
1434	vpxor	ymm0,ymm0,YMMWORD[64+rsi]
1435	vpxor	ymm4,ymm4,YMMWORD[96+rsi]
1436	lea	rsi,[128+rsi]
1437	vmovdqu	YMMWORD[rdi],ymm11
1438	vmovdqu	YMMWORD[32+rdi],ymm9
1439	vmovdqu	YMMWORD[64+rdi],ymm0
1440	vmovdqu	YMMWORD[96+rdi],ymm4
1441	lea	rdi,[128+rdi]
1442
1443	sub	rdx,64*8
1444	jnz	NEAR $L$oop_outer8x
1445
1446	jmp	NEAR $L$done8x
1447
1448$L$tail8x:
1449	cmp	rdx,448
1450	jae	NEAR $L$448_or_more8x
1451	cmp	rdx,384
1452	jae	NEAR $L$384_or_more8x
1453	cmp	rdx,320
1454	jae	NEAR $L$320_or_more8x
1455	cmp	rdx,256
1456	jae	NEAR $L$256_or_more8x
1457	cmp	rdx,192
1458	jae	NEAR $L$192_or_more8x
1459	cmp	rdx,128
1460	jae	NEAR $L$128_or_more8x
1461	cmp	rdx,64
1462	jae	NEAR $L$64_or_more8x
1463
1464	xor	r10,r10
1465	vmovdqa	YMMWORD[rsp],ymm6
1466	vmovdqa	YMMWORD[32+rsp],ymm8
1467	jmp	NEAR $L$oop_tail8x
1468
1469ALIGN	32
1470$L$64_or_more8x:
1471	vpxor	ymm6,ymm6,YMMWORD[rsi]
1472	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1473	vmovdqu	YMMWORD[rdi],ymm6
1474	vmovdqu	YMMWORD[32+rdi],ymm8
1475	je	NEAR $L$done8x
1476
1477	lea	rsi,[64+rsi]
1478	xor	r10,r10
1479	vmovdqa	YMMWORD[rsp],ymm1
1480	lea	rdi,[64+rdi]
1481	sub	rdx,64
1482	vmovdqa	YMMWORD[32+rsp],ymm5
1483	jmp	NEAR $L$oop_tail8x
1484
1485ALIGN	32
1486$L$128_or_more8x:
1487	vpxor	ymm6,ymm6,YMMWORD[rsi]
1488	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1489	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1490	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1491	vmovdqu	YMMWORD[rdi],ymm6
1492	vmovdqu	YMMWORD[32+rdi],ymm8
1493	vmovdqu	YMMWORD[64+rdi],ymm1
1494	vmovdqu	YMMWORD[96+rdi],ymm5
1495	je	NEAR $L$done8x
1496
1497	lea	rsi,[128+rsi]
1498	xor	r10,r10
1499	vmovdqa	YMMWORD[rsp],ymm12
1500	lea	rdi,[128+rdi]
1501	sub	rdx,128
1502	vmovdqa	YMMWORD[32+rsp],ymm13
1503	jmp	NEAR $L$oop_tail8x
1504
1505ALIGN	32
1506$L$192_or_more8x:
1507	vpxor	ymm6,ymm6,YMMWORD[rsi]
1508	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1509	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1510	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1511	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1512	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1513	vmovdqu	YMMWORD[rdi],ymm6
1514	vmovdqu	YMMWORD[32+rdi],ymm8
1515	vmovdqu	YMMWORD[64+rdi],ymm1
1516	vmovdqu	YMMWORD[96+rdi],ymm5
1517	vmovdqu	YMMWORD[128+rdi],ymm12
1518	vmovdqu	YMMWORD[160+rdi],ymm13
1519	je	NEAR $L$done8x
1520
1521	lea	rsi,[192+rsi]
1522	xor	r10,r10
1523	vmovdqa	YMMWORD[rsp],ymm10
1524	lea	rdi,[192+rdi]
1525	sub	rdx,192
1526	vmovdqa	YMMWORD[32+rsp],ymm15
1527	jmp	NEAR $L$oop_tail8x
1528
1529ALIGN	32
1530$L$256_or_more8x:
1531	vpxor	ymm6,ymm6,YMMWORD[rsi]
1532	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1533	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1534	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1535	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1536	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1537	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1538	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1539	vmovdqu	YMMWORD[rdi],ymm6
1540	vmovdqu	YMMWORD[32+rdi],ymm8
1541	vmovdqu	YMMWORD[64+rdi],ymm1
1542	vmovdqu	YMMWORD[96+rdi],ymm5
1543	vmovdqu	YMMWORD[128+rdi],ymm12
1544	vmovdqu	YMMWORD[160+rdi],ymm13
1545	vmovdqu	YMMWORD[192+rdi],ymm10
1546	vmovdqu	YMMWORD[224+rdi],ymm15
1547	je	NEAR $L$done8x
1548
1549	lea	rsi,[256+rsi]
1550	xor	r10,r10
1551	vmovdqa	YMMWORD[rsp],ymm14
1552	lea	rdi,[256+rdi]
1553	sub	rdx,256
1554	vmovdqa	YMMWORD[32+rsp],ymm2
1555	jmp	NEAR $L$oop_tail8x
1556
1557ALIGN	32
1558$L$320_or_more8x:
1559	vpxor	ymm6,ymm6,YMMWORD[rsi]
1560	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1561	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1562	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1563	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1564	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1565	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1566	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1567	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1568	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1569	vmovdqu	YMMWORD[rdi],ymm6
1570	vmovdqu	YMMWORD[32+rdi],ymm8
1571	vmovdqu	YMMWORD[64+rdi],ymm1
1572	vmovdqu	YMMWORD[96+rdi],ymm5
1573	vmovdqu	YMMWORD[128+rdi],ymm12
1574	vmovdqu	YMMWORD[160+rdi],ymm13
1575	vmovdqu	YMMWORD[192+rdi],ymm10
1576	vmovdqu	YMMWORD[224+rdi],ymm15
1577	vmovdqu	YMMWORD[256+rdi],ymm14
1578	vmovdqu	YMMWORD[288+rdi],ymm2
1579	je	NEAR $L$done8x
1580
1581	lea	rsi,[320+rsi]
1582	xor	r10,r10
1583	vmovdqa	YMMWORD[rsp],ymm3
1584	lea	rdi,[320+rdi]
1585	sub	rdx,320
1586	vmovdqa	YMMWORD[32+rsp],ymm7
1587	jmp	NEAR $L$oop_tail8x
1588
1589ALIGN	32
1590$L$384_or_more8x:
1591	vpxor	ymm6,ymm6,YMMWORD[rsi]
1592	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1593	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1594	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1595	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1596	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1597	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1598	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1599	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1600	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1601	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1602	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1603	vmovdqu	YMMWORD[rdi],ymm6
1604	vmovdqu	YMMWORD[32+rdi],ymm8
1605	vmovdqu	YMMWORD[64+rdi],ymm1
1606	vmovdqu	YMMWORD[96+rdi],ymm5
1607	vmovdqu	YMMWORD[128+rdi],ymm12
1608	vmovdqu	YMMWORD[160+rdi],ymm13
1609	vmovdqu	YMMWORD[192+rdi],ymm10
1610	vmovdqu	YMMWORD[224+rdi],ymm15
1611	vmovdqu	YMMWORD[256+rdi],ymm14
1612	vmovdqu	YMMWORD[288+rdi],ymm2
1613	vmovdqu	YMMWORD[320+rdi],ymm3
1614	vmovdqu	YMMWORD[352+rdi],ymm7
1615	je	NEAR $L$done8x
1616
1617	lea	rsi,[384+rsi]
1618	xor	r10,r10
1619	vmovdqa	YMMWORD[rsp],ymm11
1620	lea	rdi,[384+rdi]
1621	sub	rdx,384
1622	vmovdqa	YMMWORD[32+rsp],ymm9
1623	jmp	NEAR $L$oop_tail8x
1624
1625ALIGN	32
1626$L$448_or_more8x:
1627	vpxor	ymm6,ymm6,YMMWORD[rsi]
1628	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1629	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1630	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1631	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1632	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1633	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1634	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1635	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1636	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1637	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1638	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1639	vpxor	ymm11,ymm11,YMMWORD[384+rsi]
1640	vpxor	ymm9,ymm9,YMMWORD[416+rsi]
1641	vmovdqu	YMMWORD[rdi],ymm6
1642	vmovdqu	YMMWORD[32+rdi],ymm8
1643	vmovdqu	YMMWORD[64+rdi],ymm1
1644	vmovdqu	YMMWORD[96+rdi],ymm5
1645	vmovdqu	YMMWORD[128+rdi],ymm12
1646	vmovdqu	YMMWORD[160+rdi],ymm13
1647	vmovdqu	YMMWORD[192+rdi],ymm10
1648	vmovdqu	YMMWORD[224+rdi],ymm15
1649	vmovdqu	YMMWORD[256+rdi],ymm14
1650	vmovdqu	YMMWORD[288+rdi],ymm2
1651	vmovdqu	YMMWORD[320+rdi],ymm3
1652	vmovdqu	YMMWORD[352+rdi],ymm7
1653	vmovdqu	YMMWORD[384+rdi],ymm11
1654	vmovdqu	YMMWORD[416+rdi],ymm9
1655	je	NEAR $L$done8x
1656
1657	lea	rsi,[448+rsi]
1658	xor	r10,r10
1659	vmovdqa	YMMWORD[rsp],ymm0
1660	lea	rdi,[448+rdi]
1661	sub	rdx,448
1662	vmovdqa	YMMWORD[32+rsp],ymm4
1663
1664$L$oop_tail8x:
1665	movzx	eax,BYTE[r10*1+rsi]
1666	movzx	ecx,BYTE[r10*1+rsp]
1667	lea	r10,[1+r10]
1668	xor	eax,ecx
1669	mov	BYTE[((-1))+r10*1+rdi],al
1670	dec	rdx
1671	jnz	NEAR $L$oop_tail8x
1672
1673$L$done8x:
1674	vzeroall
1675	movaps	xmm6,XMMWORD[((-168))+r9]
1676	movaps	xmm7,XMMWORD[((-152))+r9]
1677	movaps	xmm8,XMMWORD[((-136))+r9]
1678	movaps	xmm9,XMMWORD[((-120))+r9]
1679	movaps	xmm10,XMMWORD[((-104))+r9]
1680	movaps	xmm11,XMMWORD[((-88))+r9]
1681	movaps	xmm12,XMMWORD[((-72))+r9]
1682	movaps	xmm13,XMMWORD[((-56))+r9]
1683	movaps	xmm14,XMMWORD[((-40))+r9]
1684	movaps	xmm15,XMMWORD[((-24))+r9]
1685	lea	rsp,[r9]
1686$L$8x_epilogue:
1687	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1688	mov	rsi,QWORD[16+rsp]
1689	DB	0F3h,0C3h		;repret
1690$L$SEH_end_ChaCha20_8x:
1691EXTERN	__imp_RtlVirtualUnwind
1692
1693ALIGN	16
;-----------------------------------------------------------------------
; se_handler
;
; Language-specific exception handler attached (through .xdata) to
; ChaCha20_ctr32.  It patches the CONTEXT record so that it describes
; the caller of the interrupted routine, copies that record into the
; dispatcher context and lets RtlVirtualUnwind continue the unwind.
;
; ABI:   Microsoft x64.  Only two of the incoming arguments are used:
;          r8 = CONTEXT record
;          r9 = DISPATCHER_CONTEXT
; Out:   eax = 1
; Saves: rsi, rdi, rbx, rbp, r12-r15 and RFLAGS are pushed on entry and
;        popped before returning.
;
; The numeric offsets below are the AMD64 CONTEXT / DISPATCHER_CONTEXT
; field offsets -- NOTE(review): confirm against winnt.h.
;
; $L$common_seh_tail is entered from ssse3_handler and full_handler as
; well, so the push sequence and the 64-byte local area must stay
; identical in all three handlers.
;-----------------------------------------------------------------------
%define SEH_CTX_RAX		120
%define SEH_CTX_RBX		144
%define SEH_CTX_RSP		152
%define SEH_CTX_RBP		160
%define SEH_CTX_RSI		168
%define SEH_CTX_RDI		176
%define SEH_CTX_R12		216
%define SEH_CTX_R13		224
%define SEH_CTX_R14		232
%define SEH_CTX_R15		240
%define SEH_CTX_RIP		248
%define SEH_CTX_QWORDS		154
%define SEH_DISP_IMAGEBASE	8
%define SEH_DISP_FUNCENTRY	16
%define SEH_DISP_ESTFRAME	24
%define SEH_DISP_CONTEXT	40
%define SEH_DISP_HANDLERDATA	56
se_handler:
	; Save every register this handler modifies, plus RFLAGS (the block
	; copy further down executes cld).
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64				; shadow space + 4 stack arguments

	mov	rbx,QWORD[SEH_CTX_RIP+r8]	; rbx = faulting instruction address
	mov	rax,QWORD[SEH_CTX_RAX+r8]	; rax = interrupted rax (entry code does mov rax,rsp)
	mov	r11,QWORD[SEH_DISP_HANDLERDATA+r9]
	mov	rsi,QWORD[SEH_DISP_IMAGEBASE+r9]

	; Fault before the frame was built: rax already is the entry rsp.
	lea	r10,[$L$ctr32_body]
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[SEH_CTX_RSP+r8]	; rax = interrupted rsp

	; Fault at or after $L$no_data: use the interrupted rsp unchanged.
	lea	r10,[$L$no_data]
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	; Inside the body: step over the 64+24 byte local area and the six
	; pushed registers to reach the entry rsp.
	lea	rax,[(64+24+48)+rax]

	; Fetch the six saved registers from the frame ...
	mov	r15,QWORD[(-48)+rax]
	mov	r14,QWORD[(-40)+rax]
	mov	r13,QWORD[(-32)+rax]
	mov	r12,QWORD[(-24)+rax]
	mov	rbp,QWORD[(-16)+rax]
	mov	rbx,QWORD[(-8)+rax]
	; ... and publish them in the CONTEXT record.
	mov	QWORD[SEH_CTX_R15+r8],r15
	mov	QWORD[SEH_CTX_R14+r8],r14
	mov	QWORD[SEH_CTX_R13+r8],r13
	mov	QWORD[SEH_CTX_R12+r8],r12
	mov	QWORD[SEH_CTX_RBP+r8],rbp
	mov	QWORD[SEH_CTX_RBX+r8],rbx

$L$common_seh_tail:
	; rax = entry rsp.  The WIN64 prologue parked rdi/rsi at 8 and 16
	; bytes above it; move them and the stack pointer into the CONTEXT.
	mov	rsi,QWORD[16+rax]
	mov	rdi,QWORD[8+rax]
	mov	QWORD[SEH_CTX_RSP+r8],rax
	mov	QWORD[SEH_CTX_RDI+r8],rdi
	mov	QWORD[SEH_CTX_RSI+r8],rsi

	; Copy the whole CONTEXT record (154 qwords) from r8 to the record
	; referenced by the dispatcher context.
	mov	ecx,SEH_CTX_QWORDS
	mov	rsi,r8
	mov	rdi,QWORD[SEH_DISP_CONTEXT+r9]
	cld
	rep movsq				; same bytes as the raw DD 0xa548f3fc

	; Marshal the eight arguments of RtlVirtualUnwind: four on the stack
	; above the shadow space, four in rcx/rdx/r8/r9.
	mov	rsi,r9				; rsi = DISPATCHER_CONTEXT
	mov	r10,QWORD[SEH_DISP_CONTEXT+rsi]
	mov	QWORD[32+rsp],r10		; arg5
	lea	r11,[SEH_DISP_HANDLERDATA+rsi]
	mov	QWORD[40+rsp],r11		; arg6 = address of the field at +56
	lea	r12,[SEH_DISP_ESTFRAME+rsi]
	mov	QWORD[48+rsp],r12		; arg7 = address of the field at +24
	xor	ecx,ecx				; arg1 = 0
	mov	QWORD[56+rsp],rcx		; arg8 = 0
	mov	rdx,QWORD[SEH_DISP_IMAGEBASE+rsi]	; arg2
	mov	r8,QWORD[rsi]			; arg3 = field at offset 0
	mov	r9,QWORD[SEH_DISP_FUNCENTRY+rsi]	; arg4
	call	QWORD[__imp_RtlVirtualUnwind]

	add	rsp,64
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	mov	eax,1				; presumably ExceptionContinueSearch -- confirm
	DB	0F3h,0C3h		;repret
1775
1776
1777
1778ALIGN	16
;-----------------------------------------------------------------------
; ssse3_handler
;
; Language-specific exception handler used by the ChaCha20_ssse3 unwind
; record.  The handler data supplies two image-relative addresses (body
; start, epilogue start).  When the fault lies between them, 4 qwords
; stored at [frame-40] are copied to offset 512 of the CONTEXT record.
; In every case control then joins $L$common_seh_tail.
;
; ABI:   Microsoft x64.  r8 = CONTEXT record, r9 = DISPATCHER_CONTEXT.
; Out:   eax = 1 (set by the shared tail).
;
; Offsets are AMD64 CONTEXT / DISPATCHER_CONTEXT field offsets --
; NOTE(review): confirm against winnt.h.  Offset 512 is presumably
; CONTEXT.Xmm6 and the 32 bytes copied the saved xmm6/xmm7 of
; ChaCha20_ssse3, which is not visible from this handler.
;
; The push sequence and local area must match se_handler, whose
; epilogue (reached through $L$common_seh_tail) undoes them.
;-----------------------------------------------------------------------
%define SEH_CTX_RAX		120
%define SEH_CTX_R9		192
%define SEH_CTX_RIP		248
%define SEH_CTX_XMM6		512
%define SEH_DISP_IMAGEBASE	8
%define SEH_DISP_HANDLERDATA	56
ssse3_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rbx,QWORD[SEH_CTX_RIP+r8]	; rbx = faulting instruction address
	mov	rax,QWORD[SEH_CTX_RAX+r8]	; rax = interrupted rax
	mov	r11,QWORD[SEH_DISP_HANDLERDATA+r9]	; r11 -> {body RVA, epilogue RVA}
	mov	rsi,QWORD[SEH_DISP_IMAGEBASE+r9]	; rsi = image base

	; Before the body label: nothing saved yet, rax is used as the frame.
	mov	r10d,DWORD[r11]
	add	r10,rsi				; absolute address of the body label
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[SEH_CTX_R9+r8]	; rax = interrupted r9 (frame anchor)

	; At or past the epilogue label: registers are already restored.
	mov	r10d,DWORD[4+r11]
	add	r10,rsi				; absolute address of the epilogue label
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	; Inside the body: copy 4 qwords from the frame into the CONTEXT.
	mov	ecx,4
	lea	rdi,[SEH_CTX_XMM6+r8]
	lea	rsi,[(-40)+rax]
	cld
	rep movsq				; same bytes as the raw DD 0xa548f3fc

	jmp	NEAR $L$common_seh_tail
1815
1816
1817
1818ALIGN	16
;-----------------------------------------------------------------------
; full_handler
;
; Language-specific exception handler shared by the ChaCha20_4x and
; ChaCha20_8x unwind records.  The handler data supplies two
; image-relative addresses (body start, epilogue start).  When the fault
; lies between them, 20 qwords stored at [frame-168] are copied to
; offset 512 of the CONTEXT record; the 8x epilogue reloads xmm6..xmm15
; from exactly that range ([-168+r9] .. [-24+r9]).  Control then joins
; $L$common_seh_tail in every case.
;
; ABI:   Microsoft x64.  r8 = CONTEXT record, r9 = DISPATCHER_CONTEXT.
; Out:   eax = 1 (set by the shared tail).
;
; Offsets are AMD64 CONTEXT / DISPATCHER_CONTEXT field offsets --
; NOTE(review): confirm against winnt.h (512 is presumably CONTEXT.Xmm6).
;
; The push sequence and local area must match se_handler, whose
; epilogue (reached through $L$common_seh_tail) undoes them.
;-----------------------------------------------------------------------
%define SEH_CTX_RAX		120
%define SEH_CTX_R9		192
%define SEH_CTX_RIP		248
%define SEH_CTX_XMM6		512
%define SEH_DISP_IMAGEBASE	8
%define SEH_DISP_HANDLERDATA	56
full_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rbx,QWORD[SEH_CTX_RIP+r8]	; rbx = faulting instruction address
	mov	rax,QWORD[SEH_CTX_RAX+r8]	; rax = interrupted rax
	mov	r11,QWORD[SEH_DISP_HANDLERDATA+r9]	; r11 -> {body RVA, epilogue RVA}
	mov	rsi,QWORD[SEH_DISP_IMAGEBASE+r9]	; rsi = image base

	; Before the body label: nothing saved yet, rax is used as the frame.
	mov	r10d,DWORD[r11]
	add	r10,rsi				; absolute address of the body label
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[SEH_CTX_R9+r8]	; rax = interrupted r9 (frame anchor)

	; At or past the epilogue label: xmm registers are already restored.
	mov	r10d,DWORD[4+r11]
	add	r10,rsi				; absolute address of the epilogue label
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	; Inside the body: copy the ten saved 16-byte slots (20 qwords).
	mov	ecx,20
	lea	rdi,[SEH_CTX_XMM6+r8]
	lea	rsi,[(-168)+rax]
	cld
	rep movsq				; same bytes as the raw DD 0xa548f3fc

	jmp	NEAR $L$common_seh_tail
1855
1856
; Function table: one row of three image-relative dwords per routine
; (begin label, end label, unwind-info label).  Rows are kept in their
; original order.
section	.pdata rdata align=4
ALIGN	4
	; ChaCha20_ctr32
	DD	$L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase,$L$SEH_end_ChaCha20_ctr32 wrt ..imagebase,$L$SEH_info_ChaCha20_ctr32 wrt ..imagebase

	; ChaCha20_ssse3
	DD	$L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase,$L$SEH_end_ChaCha20_ssse3 wrt ..imagebase,$L$SEH_info_ChaCha20_ssse3 wrt ..imagebase

	; ChaCha20_4x
	DD	$L$SEH_begin_ChaCha20_4x wrt ..imagebase,$L$SEH_end_ChaCha20_4x wrt ..imagebase,$L$SEH_info_ChaCha20_4x wrt ..imagebase

	; ChaCha20_8x
	DD	$L$SEH_begin_ChaCha20_8x wrt ..imagebase,$L$SEH_end_ChaCha20_8x wrt ..imagebase,$L$SEH_info_ChaCha20_8x wrt ..imagebase
; Unwind-info records referenced from .pdata.  Every record starts with
; the four header bytes 9,0,0,0 followed by the image-relative address
; of its handler.  The header byte 9 is written as 1|(1<<3); presumably
; that is UNWIND_INFO version 1 with the "has exception handler" flag --
; NOTE(review): confirm against the Microsoft x64 unwind documentation.
; Records for ssse3_handler / full_handler carry two extra dwords that
; those handlers read as {body label, epilogue label}.
section	.xdata rdata align=8
ALIGN	8
$L$SEH_info_ChaCha20_ctr32:
	DB	1|(1<<3),0,0,0
	DD	se_handler wrt ..imagebase

$L$SEH_info_ChaCha20_ssse3:
	DB	1|(1<<3),0,0,0
	DD	ssse3_handler wrt ..imagebase
	DD	$L$ssse3_body wrt ..imagebase		; handler data [0]
	DD	$L$ssse3_epilogue wrt ..imagebase	; handler data [1]

$L$SEH_info_ChaCha20_4x:
	DB	1|(1<<3),0,0,0
	DD	full_handler wrt ..imagebase
	DD	$L$4x_body wrt ..imagebase		; handler data [0]
	DD	$L$4x_epilogue wrt ..imagebase		; handler data [1]

$L$SEH_info_ChaCha20_8x:
	DB	1|(1<<3),0,0,0
	DD	full_handler wrt ..imagebase
	DD	$L$8x_body wrt ..imagebase		; handler data [0]
	DD	$L$8x_epilogue wrt ..imagebase		; handler data [1]
1892