; ChaCha20 for 32-bit x86 (NASM/Yasm syntax, Intel operand order).
;
; Section setup: choose the code-section declaration that matches the
; object format the assembler was asked to produce.
%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically includes @feat.00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
; Any assembler that is not Yasm gets the symbol defined explicitly here.
$@feat.00 equ 1
%endif
section	.text	code align=64
%else
section	.text	code
%endif
;-----------------------------------------------------------------------
; _ChaCha20_ctr32
;
; Encrypts/decrypts a buffer by XORing it with a ChaCha20 keystream.
; Dispatches to the SSSE3 code (L$ssse3_shortcut) when the capability
; bits allow it, otherwise runs the integer-only implementation below.
;
; Stack arguments at entry (32-bit, above the return address):
;   [esp+4]   destination pointer
;   [esp+8]   source pointer
;   [esp+12]  length in bytes (0 => return immediately)
;   [esp+16]  pointer to 8 dwords loaded into state words 4..11 (key)
;   [esp+20]  pointer to 4 dwords loaded into state words 12..15; the
;             first dword is the block counter, advanced by 1 per
;             64-byte block.  The caller's copy is only read.
;
; Preserves ebp, ebx, esi, edi.  Clobbers eax, ecx, edx and flags.
;
; Frame of the integer path after "sub esp,132":
;   [  0.. 60+esp]  working state words 0..15 (several live in regs)
;   [ 80..108+esp]  copy of the 8 key dwords
;   [112..124+esp]  copy of the 4 counter-block dwords
;   [128+esp]       remaining iterations of L$004loop
;   [152+esp]       destination pointer  (caller's argument slot, reused)
;   [156+esp]       source pointer       (caller's argument slot, reused)
;   [160+esp]       bytes remaining      (caller's argument slot, reused)
;-----------------------------------------------------------------------
global	_ChaCha20_ctr32
align	16
_ChaCha20_ctr32:
L$_ChaCha20_ctr32_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	; Nothing to do for a zero length.
	xor	eax,eax
	cmp	eax,DWORD [28+esp]
	je	NEAR L$000no_data
	; eax = run-time address of L$pic_point.  The SSSE3 path adds
	; (L$ssse3_data - L$pic_point) to it to find its constant table.
	call	L$pic_point
L$pic_point:
	pop	eax
	; Take the SSSE3 path only if bit 24 of capability word 0 and
	; bit 9 of capability word 1 are both set.
	; NOTE(review): presumably the FXSR and SSSE3 CPUID bits -- confirm
	; against the definition of OPENSSL_ia32cap_P.
	lea	ebp,[_OPENSSL_ia32cap_P]
	test	DWORD [ebp],16777216
	jz	NEAR L$001x86
	test	DWORD [4+ebp],512
	jz	NEAR L$001x86
	jmp	NEAR L$ssse3_shortcut
L$001x86:
	; ---- integer-only path -------------------------------------------
	mov	esi,DWORD [32+esp]
	mov	edi,DWORD [36+esp]
	sub	esp,132
	; Copy the 8 key dwords to [80..108+esp].
	mov	eax,DWORD [esi]
	mov	ebx,DWORD [4+esi]
	mov	ecx,DWORD [8+esi]
	mov	edx,DWORD [12+esi]
	mov	DWORD [80+esp],eax
	mov	DWORD [84+esp],ebx
	mov	DWORD [88+esp],ecx
	mov	DWORD [92+esp],edx
	mov	eax,DWORD [16+esi]
	mov	ebx,DWORD [20+esi]
	mov	ecx,DWORD [24+esi]
	mov	edx,DWORD [28+esi]
	mov	DWORD [96+esp],eax
	mov	DWORD [100+esp],ebx
	mov	DWORD [104+esp],ecx
	mov	DWORD [108+esp],edx
	; Copy the counter block to [112..124+esp].  The counter dword is
	; stored minus one because L$002entry increments it before use.
	mov	eax,DWORD [edi]
	mov	ebx,DWORD [4+edi]
	mov	ecx,DWORD [8+edi]
	mov	edx,DWORD [12+edi]
	sub	eax,1
	mov	DWORD [112+esp],eax
	mov	DWORD [116+esp],ebx
	mov	DWORD [120+esp],ecx
	mov	DWORD [124+esp],edx
	jmp	NEAR L$002entry
align	16
L$003outer_loop:
	; Reached after a full 64-byte block: save the advanced source (ebx)
	; and destination (eax) pointers and the remaining length (ecx).
	mov	DWORD [156+esp],ebx
	mov	DWORD [152+esp],eax
	mov	DWORD [160+esp],ecx
L$002entry:
	; Build the initial state for this block.  Words 0..3 are the
	; constants "expa" "nd 3" "2-by" "te k" (little-endian ASCII).
	mov	eax,1634760805
	mov	DWORD [4+esp],857760878
	mov	DWORD [8+esp],2036477234
	mov	DWORD [12+esp],1797285236
	mov	ebx,DWORD [84+esp]
	mov	ebp,DWORD [88+esp]
	mov	ecx,DWORD [104+esp]
	mov	esi,DWORD [108+esp]
	mov	edx,DWORD [116+esp]
	mov	edi,DWORD [120+esp]
	mov	DWORD [20+esp],ebx
	mov	DWORD [24+esp],ebp
	mov	DWORD [40+esp],ecx
	mov	DWORD [44+esp],esi
	mov	DWORD [52+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebx,DWORD [92+esp]
	mov	edi,DWORD [124+esp]
	mov	edx,DWORD [112+esp]
	mov	ebp,DWORD [80+esp]
	mov	ecx,DWORD [96+esp]
	mov	esi,DWORD [100+esp]
	; Advance the block counter and remember the value used.
	add	edx,1
	mov	DWORD [28+esp],ebx
	mov	DWORD [60+esp],edi
	mov	DWORD [112+esp],edx
	; 10 iterations; each performs eight quarter-round sequences
	; (add / xor / rotate-left by 16, 12, 8 and 7).
	mov	ebx,10
	jmp	NEAR L$004loop
align	16
L$004loop:
	; ebx is needed as a scratch register, so the iteration count is
	; parked at [128+esp] and reloaded just before "dec ebx".
	add	eax,ebp
	mov	DWORD [128+esp],ebx
	mov	ebx,ebp
	xor	edx,eax
	rol	edx,16
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [52+esp]
	rol	ebx,12
	mov	ebp,DWORD [20+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [48+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [32+esp],ecx
	rol	edi,16
	mov	DWORD [16+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [40+esp]
	xor	ebp,esi
	mov	edx,DWORD [56+esp]
	rol	ebp,12
	mov	ebx,DWORD [24+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [52+esp],edi
	xor	ebp,esi
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [36+esp],esi
	rol	edx,16
	mov	DWORD [20+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [44+esp]
	xor	ebx,ecx
	mov	edi,DWORD [60+esp]
	rol	ebx,12
	mov	ebp,DWORD [28+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [56+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [24+esp],ebx
	add	esi,edi
	xor	ebp,esi
	rol	ebp,12
	mov	ebx,DWORD [20+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	edx,edi
	xor	ebp,esi
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	rol	edx,16
	mov	DWORD [28+esp],ebp
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [48+esp]
	rol	ebx,12
	mov	ebp,DWORD [24+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [60+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [40+esp],ecx
	rol	edi,16
	mov	DWORD [20+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [32+esp]
	xor	ebp,esi
	mov	edx,DWORD [52+esp]
	rol	ebp,12
	mov	ebx,DWORD [28+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [48+esp],edi
	xor	ebp,esi
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [44+esp],esi
	rol	edx,16
	mov	DWORD [24+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [36+esp]
	xor	ebx,ecx
	mov	edi,DWORD [56+esp]
	rol	ebx,12
	mov	ebp,DWORD [16+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [52+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [28+esp],ebx
	add	esi,edi
	xor	ebp,esi
	mov	edx,DWORD [48+esp]
	rol	ebp,12
	mov	ebx,DWORD [128+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	DWORD [56+esp],edi
	xor	ebp,esi
	rol	ebp,7
	dec	ebx
	jnz	NEAR L$004loop
	; Rounds done.  Add the initial state back in, starting with the
	; six words still held in registers (state words 0, 4, 8, 9, 12, 14).
	mov	ebx,DWORD [160+esp]
	add	eax,1634760805
	add	ebp,DWORD [80+esp]
	add	ecx,DWORD [96+esp]
	add	esi,DWORD [100+esp]
	; Fewer than 64 bytes left: finish byte by byte in L$005tail.
	cmp	ebx,64
	jb	NEAR L$005tail
	; Full block: ebx = source, eax = destination.  XOR the keystream
	; with the source a few words at a time and store the result.
	mov	ebx,DWORD [156+esp]
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	xor	eax,DWORD [ebx]
	xor	ebp,DWORD [16+ebx]
	; Output word 0 is parked at [esp] because eax is needed as the
	; destination pointer; it is written out last.
	mov	DWORD [esp],eax
	mov	eax,DWORD [152+esp]
	xor	ecx,DWORD [32+ebx]
	xor	esi,DWORD [36+ebx]
	xor	edx,DWORD [48+ebx]
	xor	edi,DWORD [56+ebx]
	mov	DWORD [16+eax],ebp
	mov	DWORD [32+eax],ecx
	mov	DWORD [36+eax],esi
	mov	DWORD [48+eax],edx
	mov	DWORD [56+eax],edi
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	xor	ebp,DWORD [4+ebx]
	xor	ecx,DWORD [8+ebx]
	xor	esi,DWORD [12+ebx]
	xor	edx,DWORD [20+ebx]
	xor	edi,DWORD [24+ebx]
	mov	DWORD [4+eax],ebp
	mov	DWORD [8+eax],ecx
	mov	DWORD [12+eax],esi
	mov	DWORD [20+eax],edx
	mov	DWORD [24+eax],edi
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	xor	ebp,DWORD [28+ebx]
	xor	ecx,DWORD [40+ebx]
	xor	esi,DWORD [44+ebx]
	xor	edx,DWORD [52+ebx]
	xor	edi,DWORD [60+ebx]
	; Advance source, store the last words, advance destination and
	; subtract the 64 bytes just produced from the remaining length.
	lea	ebx,[64+ebx]
	mov	DWORD [28+eax],ebp
	mov	ebp,DWORD [esp]
	mov	DWORD [40+eax],ecx
	mov	ecx,DWORD [160+esp]
	mov	DWORD [44+eax],esi
	mov	DWORD [52+eax],edx
	mov	DWORD [60+eax],edi
	mov	DWORD [eax],ebp
	lea	eax,[64+eax]
	sub	ecx,64
	jnz	NEAR L$003outer_loop
	jmp	NEAR L$006done
L$005tail:
	; Partial final block (ebx = 1..63 bytes remaining).  Materialise
	; the complete 64-byte keystream block at [0..60+esp] first.
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	mov	DWORD [esp],eax
	mov	DWORD [16+esp],ebp
	mov	DWORD [32+esp],ecx
	mov	DWORD [36+esp],esi
	mov	DWORD [48+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	mov	DWORD [4+esp],ebp
	mov	DWORD [8+esp],ecx
	mov	DWORD [12+esp],esi
	mov	DWORD [20+esp],edx
	mov	DWORD [24+esp],edi
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	; From here: ebp = source, ecx = destination, esi = byte index.
	mov	DWORD [28+esp],ebp
	mov	ebp,DWORD [156+esp]
	mov	DWORD [40+esp],ecx
	mov	ecx,DWORD [152+esp]
	mov	DWORD [44+esp],esi
	xor	esi,esi
	mov	DWORD [52+esp],edx
	mov	DWORD [60+esp],edi
	xor	eax,eax
	xor	edx,edx
L$007tail_loop:
	; dst[i] = src[i] ^ keystream[i]; esi is bumped before the store,
	; hence the -1 in the destination address.
	mov	al,BYTE [ebp*1+esi]
	mov	dl,BYTE [esi*1+esp]
	lea	esi,[1+esi]
	xor	al,dl
	mov	BYTE [esi*1+ecx-1],al
	dec	ebx
	jnz	NEAR L$007tail_loop
L$006done:
	add	esp,132
L$000no_data:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
;-----------------------------------------------------------------------
; _ChaCha20_ssse3
;
; SSSE3 implementation.  Takes the same five stack arguments as
; _ChaCha20_ctr32 (destination, source, length, key pointer, counter
; block pointer) and is reached in two ways:
;   * directly through the exported symbol, or
;   * from _ChaCha20_ctr32 via L$ssse3_shortcut, with the four
;     callee-saved registers already pushed and eax = address of
;     L$pic_point.
; The direct entry performs the same zero-length check and sets up the
; same eax value before falling into L$ssse3_shortcut, so both routes
; see identical state there.
;
; Preserves ebp, ebx, esi, edi.  Clobbers eax, ecx, edx, xmm0-xmm7 and
; flags.
;
; Register roles after the prologue:
;   edi = destination, esi = source, ecx = bytes remaining,
;   edx = key pointer, ebx = counter block pointer,
;   eax = address of L$ssse3_data.
;
; Frame (esp is rounded down to a 64-byte boundary):
;   [512+esp]  original esp, restored at L$011done
;   [516+esp]  key pointer      (4x path only)
;   [520+esp]  counter pointer  (4x path only)
;   4x path:  ebp = esp+384, [ebp-128 .. ebp+127] = input state, each of
;             the 16 words broadcast to four lanes (counter lanes differ
;             by 0,1,2,3); ebx = esp+128, [ebx-128 .. ebx+127] = working
;             state for the rounds.
;   1x path:  [0..63+esp] = input state of the current block, reused as
;             the keystream buffer in L$014tail.
;-----------------------------------------------------------------------
global	_ChaCha20_ssse3
align	16
_ChaCha20_ssse3:
L$_ChaCha20_ssse3_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	; Direct callers only: return at once for a zero length.  Without
	; this check ecx = 0 would reach "dec ecx / jnz" in L$015tail_loop
	; and wrap around.
	xor	eax,eax
	cmp	eax,DWORD [28+esp]
	je	NEAR L$ssse3_no_data
	; Direct callers only: fetch our own run-time address and rebase it
	; to L$pic_point, the anchor the table address below is relative to.
	call	L$ssse3_pic_point
L$ssse3_pic_point:
	pop	eax
	lea	eax,[(L$pic_point-L$ssse3_pic_point)+eax]
L$ssse3_shortcut:
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [24+esp]
	mov	ecx,DWORD [28+esp]
	mov	edx,DWORD [32+esp]
	mov	ebx,DWORD [36+esp]
	; Build a 64-byte aligned frame and remember the old esp in it.
	mov	ebp,esp
	sub	esp,524
	and	esp,-64
	mov	DWORD [512+esp],ebp
	; eax: address of L$pic_point -> address of L$ssse3_data.
	lea	eax,[(L$ssse3_data-L$pic_point)+eax]
	movdqu	xmm3,[ebx]
	; Fewer than 256 bytes: go straight to the one-block-at-a-time code.
	cmp	ecx,256
	jb	NEAR L$0081x
	; ---- 4x path: four 64-byte blocks per outer iteration --------------
	mov	DWORD [516+esp],edx
	mov	DWORD [520+esp],ebx
	; ecx is kept biased by -256 so that "sub ecx,256 / jnc" at the
	; bottom of the loop doubles as the "another 256 bytes?" test.
	sub	ecx,256
	lea	ebp,[384+esp]
	movdqu	xmm7,[edx]
	; Broadcast the four counter-block words.
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
	; Counter lanes = counter + {0,1,2,3}; then pre-subtract 4 because
	; L$009outer_loop adds 4 before every use, including the first.
	paddd	xmm0,[48+eax]
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	psubd	xmm0,[64+eax]
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[64+ebp],xmm0
	movdqa	[80+ebp],xmm1
	movdqa	[96+ebp],xmm2
	movdqa	[112+ebp],xmm3
	movdqu	xmm3,[16+edx]
	; Broadcast key words 0..3.
	movdqa	[ebp-64],xmm4
	movdqa	[ebp-48],xmm5
	movdqa	[ebp-32],xmm6
	movdqa	[ebp-16],xmm7
	movdqa	xmm7,[32+eax]
	lea	ebx,[128+esp]
	; Broadcast key words 4..7 (xmm0-3) and the four constants (xmm4-7).
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[ebp],xmm0
	movdqa	[16+ebp],xmm1
	movdqa	[32+ebp],xmm2
	movdqa	[48+ebp],xmm3
	movdqa	[ebp-128],xmm4
	movdqa	[ebp-112],xmm5
	movdqa	[ebp-96],xmm6
	movdqa	[ebp-80],xmm7
	; Bias both data pointers by +128 so the four blocks can be reached
	; with displacements -128, -64, 0 and +64.
	lea	esi,[128+esi]
	lea	edi,[128+edi]
	jmp	NEAR L$009outer_loop
align	16
L$009outer_loop:
	; Copy the input state (ebp) into the working area (ebx).  Rows that
	; the round loop starts with in registers are loaded afterwards.
	movdqa	xmm1,[ebp-112]
	movdqa	xmm2,[ebp-96]
	movdqa	xmm3,[ebp-80]
	movdqa	xmm5,[ebp-48]
	movdqa	xmm6,[ebp-32]
	movdqa	xmm7,[ebp-16]
	movdqa	[ebx-112],xmm1
	movdqa	[ebx-96],xmm2
	movdqa	[ebx-80],xmm3
	movdqa	[ebx-48],xmm5
	movdqa	[ebx-32],xmm6
	movdqa	[ebx-16],xmm7
	movdqa	xmm2,[32+ebp]
	movdqa	xmm3,[48+ebp]
	movdqa	xmm4,[64+ebp]
	movdqa	xmm5,[80+ebp]
	movdqa	xmm6,[96+ebp]
	movdqa	xmm7,[112+ebp]
	; Advance all four counter lanes by 4 and write them back.
	paddd	xmm4,[64+eax]
	movdqa	[32+ebx],xmm2
	movdqa	[48+ebx],xmm3
	movdqa	[64+ebx],xmm4
	movdqa	[80+ebx],xmm5
	movdqa	[96+ebx],xmm6
	movdqa	[112+ebx],xmm7
	movdqa	[64+ebp],xmm4
	movdqa	xmm0,[ebp-128]
	movdqa	xmm6,xmm4
	movdqa	xmm3,[ebp-64]
	movdqa	xmm4,[ebp]
	movdqa	xmm5,[16+ebp]
	; 10 iterations of the round loop.
	mov	edx,10
	nop
align	16
L$010loop:
	; Rotations by 16 and 8 use pshufb with the masks at [eax] and
	; [16+eax]; rotations by 12 and 7 use a shift-left/shift-right/or.
	paddd	xmm0,xmm3
	movdqa	xmm2,xmm3
	pxor	xmm6,xmm0
	pshufb	xmm6,[eax]
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-48]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[80+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[64+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-64],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[32+ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-32]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[96+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[80+ebx],xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[16+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-48],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[48+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-16]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[112+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[96+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-32],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-48]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	xmm6,xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	pshufb	xmm6,[eax]
	movdqa	[ebx-16],xmm3
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-32]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[64+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[112+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[32+ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-48],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-16]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[80+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[64+ebx],xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[48+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-32],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[16+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-64]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[96+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[80+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-16],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	movdqa	xmm6,[64+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[96+ebx],xmm7
	pxor	xmm3,xmm5
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	por	xmm3,xmm1
	dec	edx
	jnz	NEAR L$010loop
	; Flush the rows still held in registers to the working area.
	movdqa	[ebx-64],xmm3
	movdqa	[ebx],xmm4
	movdqa	[16+ebx],xmm5
	movdqa	[64+ebx],xmm6
	movdqa	[96+ebx],xmm7
	; Output stage, four state rows at a time: add the input state,
	; transpose the 4x4 dword matrix so each register holds 16
	; consecutive bytes of one block, XOR with the source and store.
	; The displacements -128/-64/0/+64 address blocks 0..3, and esi/edi
	; advance by 16 after each group.
	movdqa	xmm1,[ebx-112]
	movdqa	xmm2,[ebx-96]
	movdqa	xmm3,[ebx-80]
	paddd	xmm0,[ebp-128]
	paddd	xmm1,[ebp-112]
	paddd	xmm2,[ebp-96]
	paddd	xmm3,[ebp-80]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx-64]
	pxor	xmm5,xmm1
	movdqa	xmm1,[ebx-48]
	pxor	xmm6,xmm2
	movdqa	xmm2,[ebx-32]
	pxor	xmm7,xmm3
	movdqa	xmm3,[ebx-16]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; Rows 4..7.
	paddd	xmm0,[ebp-64]
	paddd	xmm1,[ebp-48]
	paddd	xmm2,[ebp-32]
	paddd	xmm3,[ebp-16]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[16+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[32+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[48+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; Rows 8..11.
	paddd	xmm0,[ebp]
	paddd	xmm1,[16+ebp]
	paddd	xmm2,[32+ebp]
	paddd	xmm3,[48+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[64+ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[80+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[96+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[112+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; Rows 12..15.
	paddd	xmm0,[64+ebp]
	paddd	xmm1,[80+ebp]
	paddd	xmm2,[96+ebp]
	paddd	xmm3,[112+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	; 3*16 + 208 = 256: the pointers end up one 256-byte group further.
	lea	esi,[208+esi]
	pxor	xmm4,xmm0
	pxor	xmm5,xmm1
	pxor	xmm6,xmm2
	pxor	xmm7,xmm3
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[208+edi]
	sub	ecx,256
	jnc	NEAR L$009outer_loop
	; Undo the -256 bias; ecx is now the 0..255 bytes still to do.
	add	ecx,256
	jz	NEAR L$011done
	; Hand over to the 1x code: restore the argument pointers, remove
	; the +128 pointer bias, and rebuild the counter block in xmm3 as
	; (lane 0 of the last 4x counter row + 4) merged with the caller's
	; remaining three dwords.
	mov	ebx,DWORD [520+esp]
	lea	esi,[esi-128]
	mov	edx,DWORD [516+esp]
	lea	edi,[edi-128]
	movd	xmm2,DWORD [64+ebp]
	movdqu	xmm3,[ebx]
	paddd	xmm2,[96+eax]
	pand	xmm3,[112+eax]
	por	xmm3,xmm2
L$0081x:
	; ---- 1x path: one 64-byte block per iteration ----------------------
	; xmm0 = constants, xmm1/xmm2 = key, xmm3 = counter block,
	; xmm6/xmm7 = pshufb masks for rotate-left by 16 / by 8.
	movdqa	xmm0,[32+eax]
	movdqu	xmm1,[edx]
	movdqu	xmm2,[16+edx]
	movdqa	xmm6,[eax]
	movdqa	xmm7,[16+eax]
	; This dword is overwritten by the movdqa to [48+esp] just below.
	mov	DWORD [48+esp],ebp
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	mov	edx,10
	jmp	NEAR L$012loop1x
align	16
L$013outer1x:
	; Next block: reload the input state and add 1 to the counter dword.
	movdqa	xmm3,[80+eax]
	movdqa	xmm0,[esp]
	movdqa	xmm1,[16+esp]
	movdqa	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	mov	edx,10
	movdqa	[48+esp],xmm3
	jmp	NEAR L$012loop1x
align	16
L$012loop1x:
	; First half of the iteration, operating on the rows as loaded.
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm6
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm7
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	; Rotate the lanes of rows 1..3 so the second half works on the
	; diagonals.
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,57
	pshufd	xmm3,xmm3,147
	nop
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm6
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm7
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	; Rotate the lanes back (inverse of the shuffles above).
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,147
	pshufd	xmm3,xmm3,57
	dec	edx
	jnz	NEAR L$012loop1x
	; Add the input state to obtain the keystream block.
	paddd	xmm0,[esp]
	paddd	xmm1,[16+esp]
	paddd	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	cmp	ecx,64
	jb	NEAR L$014tail
	; Full block: XOR with the source and store 64 bytes.
	movdqu	xmm4,[esi]
	movdqu	xmm5,[16+esi]
	pxor	xmm0,xmm4
	movdqu	xmm4,[32+esi]
	pxor	xmm1,xmm5
	movdqu	xmm5,[48+esi]
	pxor	xmm2,xmm4
	pxor	xmm3,xmm5
	lea	esi,[64+esi]
	movdqu	[edi],xmm0
	movdqu	[16+edi],xmm1
	movdqu	[32+edi],xmm2
	movdqu	[48+edi],xmm3
	lea	edi,[64+edi]
	sub	ecx,64
	jnz	NEAR L$013outer1x
	jmp	NEAR L$011done
L$014tail:
	; Partial final block (ecx = 1..63): spill the keystream to
	; [0..63+esp] and XOR byte by byte.  ebp = byte index.
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	xor	eax,eax
	xor	edx,edx
	xor	ebp,ebp
L$015tail_loop:
	; dst[i] = keystream[i] ^ src[i]; ebp is bumped before the store,
	; hence the -1 in the destination address.
	mov	al,BYTE [ebp*1+esp]
	mov	dl,BYTE [ebp*1+esi]
	lea	ebp,[1+ebp]
	xor	al,dl
	mov	BYTE [ebp*1+edi-1],al
	dec	ecx
	jnz	NEAR L$015tail_loop
L$011done:
	; Drop the aligned frame by restoring the esp saved in the prologue.
	mov	esp,DWORD [512+esp]
L$ssse3_no_data:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
; Constant table for the SSSE3 code, which addresses it through eax.
; The offsets below are the displacements used at the points of use.
align	64
L$ssse3_data:
; +0:   pshufb mask that rotates every dword left by 16 bits
db	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
; +16:  pshufb mask that rotates every dword left by 8 bits
db	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
; +32:  state words 0..3, "expa" "nd 3" "2-by" "te k" in little-endian ASCII
dd	1634760805,857760878,2036477234,1797285236
; +48:  per-lane counter offsets for the four parallel blocks
dd	0,1,2,3
; +64:  counter step per iteration of the 4x loop
dd	4,4,4,4
; +80:  counter step per block in the 1x loop
dd	1,0,0,0
; +96:  added to the last 4x counter when handing over to the 1x code
dd	4,0,0,0
; +112: mask that clears the counter dword and keeps the other three
dd	0,-1,-1,-1
align	64
; ASCII banner: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
db	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
db	114,103,62,0
; 16-byte capability vector tested by _ChaCha20_ctr32, declared as a
; common symbol so the linker merges it with any definition elsewhere.
segment	.bss
common	_OPENSSL_ia32cap_P 16
978