1default	rel
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section	.text code align=64
6
7EXTERN	OPENSSL_ia32cap_P
8
9
; Shared constants (emitted into the code section declared at the top of the file).
10ALIGN	64
; $L$poly: the modulus p used by every routine below, as four 64-bit limbs,
; least-significant limb first:
;   p = 2^256 - 2^224 + 2^192 + 2^96 - 1
; Code below hard-codes limb 0 as -1 and limb 2 as 0 and only loads
; limbs 1 and 3 ($L$poly+8 and $L$poly+24).
11$L$poly:
12	DQ	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
13
; Eight dwords each, every lane holding 1, 2 or 3.  $L$One is the counter
; increment used by the select_w5/select_w7 loops; $L$Two and $L$Three are
; not referenced in the visible part of the file.
14$L$One:
15	DD	1,1,1,1,1,1,1,1
16$L$Two:
17	DD	2,2,2,2,2,2,2,2
18$L$Three:
19	DD	3,3,3,3,3,3,3,3
; $L$ONE_mont: 2^256 - p (i.e. 2^256 mod p), limbs least-significant first.
; Not referenced in the visible part of the file.
20$L$ONE_mont:
21	DQ	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
22
23
;-----------------------------------------------------------------------
; ecp_nistz256_mul_by_2 -- res = 2*a mod p          (p = $L$poly)
; ABI:   Win64 entry: rcx -> res[4], rdx -> a[4] (64-bit limbs, low first).
;        rdi/rsi are spilled to the caller's home slots and reloaded on
;        exit; r12/r13 are saved on the stack.
; Clobb: rax, rcx, rdx, r8-r11, flags
; Note:  no `global` directive for this label is visible in this file.
;
; Reduction: the doubled value is a 257-bit quantity (carry : r11..r8).
; p is subtracted once and the subtraction is undone only when the
; 257-bit value was below p.  r13 holds 0 / -1 for carry clear / set;
; `sbb r13,0` then yields CF=1 exactly when the carry was clear AND the
; 256-bit subtraction borrowed (-1 minus a borrow never wraps).  Testing
; the carry alone, as this routine used to, left sums in [p, 2^256)
; unreduced (e.g. a = (p+1)/2 returned p+1 instead of 1).
;-----------------------------------------------------------------------
24ALIGN	64
25ecp_nistz256_mul_by_2:
26	mov	QWORD[8+rsp],rdi	;WIN64 prologue
27	mov	QWORD[16+rsp],rsi
28	mov	rax,rsp
29$L$SEH_begin_ecp_nistz256_mul_by_2:
30	mov	rdi,rcx
31	mov	rsi,rdx
32
33
34	push	r12
35	push	r13
36
37	mov	r8,QWORD[rsi]
38	mov	r9,QWORD[8+rsi]
39	add	r8,r8	; r8..r11 = a + a, carry chain low -> high
40	mov	r10,QWORD[16+rsi]
41	adc	r9,r9
42	mov	r11,QWORD[24+rsi]
43	lea	rsi,[$L$poly]
44	mov	rax,r8	; rax,rdx,rcx,r12 keep the unsubtracted sum
45	adc	r10,r10
46	adc	r11,r11
47	mov	rdx,r9
48	sbb	r13,r13	; r13 = -(carry out of bit 255)
49
50	sub	r8,QWORD[rsi]	; r8..r11 = sum - p
51	mov	rcx,r10
52	sbb	r9,QWORD[8+rsi]
53	sbb	r10,QWORD[16+rsi]
54	mov	r12,r11
55	sbb	r11,QWORD[24+rsi]
56	sbb	r13,0	; CF = 1 iff (carry:sum) < p
57
58	cmovc	r8,rax	; sum was already < p: undo the subtraction
59	cmovc	r9,rdx
60	mov	QWORD[rdi],r8
61	cmovc	r10,rcx
62	mov	QWORD[8+rdi],r9
63	cmovc	r11,r12
64	mov	QWORD[16+rdi],r10
65	mov	QWORD[24+rdi],r11
66
67	pop	r13
68	pop	r12
69	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
70	mov	rsi,QWORD[16+rsp]
71	DB	0F3h,0C3h		;repret
72$L$SEH_end_ecp_nistz256_mul_by_2:
73
74
75
;-----------------------------------------------------------------------
; ecp_nistz256_neg -- res = (0 - a) mod p           (p = $L$poly)
; ABI:   Win64 entry: rcx -> res[4], rdx -> a[4] (64-bit limbs, low first).
;        rdi/rsi are spilled to the caller's home slots and reloaded on
;        exit; r12/r13 are saved on the stack.
; Clobb: rax, rcx, rdx, r8-r11, flags
; Method: compute 0 - a, remember the borrow in r13, add p, and keep the
;         un-added difference only when there was no borrow (a == 0).
;-----------------------------------------------------------------------
76global	ecp_nistz256_neg
77
78ALIGN	32
79ecp_nistz256_neg:
80	mov	QWORD[8+rsp],rdi	;WIN64 prologue
81	mov	QWORD[16+rsp],rsi
82	mov	rax,rsp
83$L$SEH_begin_ecp_nistz256_neg:
84	mov	rdi,rcx
85	mov	rsi,rdx
86
87
88	push	r12
89	push	r13
90
91	xor	r8,r8
92	xor	r9,r9
93	xor	r10,r10
94	xor	r11,r11
95	xor	r13,r13
96
97	sub	r8,QWORD[rsi]	; r8..r11 = 0 - a
98	sbb	r9,QWORD[8+rsi]
99	sbb	r10,QWORD[16+rsi]
100	mov	rax,r8	; rax,rdx,rcx,r12 keep the raw difference
101	sbb	r11,QWORD[24+rsi]
102	lea	rsi,[$L$poly]
103	mov	rdx,r9
104	sbb	r13,0	; r13 = -1 if the subtraction borrowed, else 0
105
106	add	r8,QWORD[rsi]	; r8..r11 = (0 - a) + p
107	mov	rcx,r10
108	adc	r9,QWORD[8+rsi]
109	adc	r10,QWORD[16+rsi]
110	mov	r12,r11
111	adc	r11,QWORD[24+rsi]
112	test	r13,r13
113
114	cmovz	r8,rax	; no borrow: keep the raw difference
115	cmovz	r9,rdx
116	mov	QWORD[rdi],r8
117	cmovz	r10,rcx
118	mov	QWORD[8+rdi],r9
119	cmovz	r11,r12
120	mov	QWORD[16+rdi],r10
121	mov	QWORD[24+rdi],r11
122
123	pop	r13
124	pop	r12
125	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
126	mov	rsi,QWORD[16+rsp]
127	DB	0F3h,0C3h		;repret
128$L$SEH_end_ecp_nistz256_neg:
129
130
131
132
133
134
;-----------------------------------------------------------------------
; ecp_nistz256_mul_mont -- res = a*b*2^-256 mod p (Montgomery product)
; ABI:   Win64 entry: rcx -> res[4], rdx -> a[4], r8 -> b[4].
;        Arguments are moved to rdi/rsi/rdx; rdi/rsi are spilled to the
;        caller's home slots and reloaded on exit.
; Saves: rbp, rbx, r12-r15 (pushed/popped here around the helper call).
; Work is done by __ecp_nistz256_mul_montq; this wrapper only loads its
; register inputs: rbx -> b, rax = b[0], r9..r12 = a[0..3].
;-----------------------------------------------------------------------
135global	ecp_nistz256_mul_mont
136
137ALIGN	32
138ecp_nistz256_mul_mont:
139	mov	QWORD[8+rsp],rdi	;WIN64 prologue
140	mov	QWORD[16+rsp],rsi
141	mov	rax,rsp
142$L$SEH_begin_ecp_nistz256_mul_mont:
143	mov	rdi,rcx
144	mov	rsi,rdx
145	mov	rdx,r8
146
147
148$L$mul_mont:
149	push	rbp
150	push	rbx
151	push	r12
152	push	r13
153	push	r14
154	push	r15
155	mov	rbx,rdx	; rbx -> b (the helper reads b[1..3] through rbx)
156	mov	rax,QWORD[rdx]	; rax = b[0]
157	mov	r9,QWORD[rsi]	; r9..r12 = a[0..3]
158	mov	r10,QWORD[8+rsi]
159	mov	r11,QWORD[16+rsi]
160	mov	r12,QWORD[24+rsi]
161
162	call	__ecp_nistz256_mul_montq
163$L$mul_mont_done:
164	pop	r15
165	pop	r14
166	pop	r13
167	pop	r12
168	pop	rbx
169	pop	rbp
170	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
171	mov	rsi,QWORD[16+rsp]
172	DB	0F3h,0C3h		;repret
173$L$SEH_end_ecp_nistz256_mul_mont:
174
175
;-----------------------------------------------------------------------
; __ecp_nistz256_mul_montq -- internal: res = a*b*2^-256 mod p
; In:    rax = b[0], rbx -> b, rsi -> a, r9..r12 = a[0..3], rdi -> res
; Out:   r12,r13,r8,r9 = res[0..3], also stored at [rdi];
;        r14 = $L$poly[1], r15 = $L$poly[3]
; Clobb: rax, rbx, rcx, rdx, rbp, r10, r11, flags
;
; Structure: four rounds of "accumulate a*b[i]" each followed by one
; reduction step that adds acc0*p and drops the (now zero) low limb.
; Because p[0] = 2^64-1 the multiplier is acc0 itself, and
;   acc0*p = acc0*p[3]*2^192 + acc0*2^96 - acc0
; so a step needs only shl/shr by 32 (the 2^96 term) and one mul by p[3].
; The accumulator rotates through r8..r13 from round to round.
;-----------------------------------------------------------------------
176ALIGN	32
177__ecp_nistz256_mul_montq:
178
179
; ---- acc = a * b[0] ----
180	mov	rbp,rax
181	mul	r9
182	mov	r14,QWORD[(($L$poly+8))]	; r14 = p[1]
183	mov	r8,rax
184	mov	rax,rbp
185	mov	r9,rdx
186
187	mul	r10
188	mov	r15,QWORD[(($L$poly+24))]	; r15 = p[3]
189	add	r9,rax
190	mov	rax,rbp
191	adc	rdx,0
192	mov	r10,rdx
193
194	mul	r11
195	add	r10,rax
196	mov	rax,rbp
197	adc	rdx,0
198	mov	r11,rdx
199
200	mul	r12
201	add	r11,rax
202	mov	rax,r8
203	adc	rdx,0
204	xor	r13,r13
205	mov	r12,rdx
206
207
208
209
210
211
212
213
214
215
; ---- reduction step 1: fold acc0 = r8 into r9..r13 ----
216	mov	rbp,r8
217	shl	r8,32
218	mul	r15	; rdx:rax = acc0 * p[3]
219	shr	rbp,32
220	add	r9,r8	; += acc0 << 96 (split across two limbs)
221	adc	r10,rbp
222	adc	r11,rax
223	mov	rax,QWORD[8+rbx]	; preload b[1]
224	adc	r12,rdx
225	adc	r13,0
226	xor	r8,r8
227
228
229
; ---- acc += a * b[1] ----
230	mov	rbp,rax
231	mul	QWORD[rsi]
232	add	r9,rax
233	mov	rax,rbp
234	adc	rdx,0
235	mov	rcx,rdx
236
237	mul	QWORD[8+rsi]
238	add	r10,rcx
239	adc	rdx,0
240	add	r10,rax
241	mov	rax,rbp
242	adc	rdx,0
243	mov	rcx,rdx
244
245	mul	QWORD[16+rsi]
246	add	r11,rcx
247	adc	rdx,0
248	add	r11,rax
249	mov	rax,rbp
250	adc	rdx,0
251	mov	rcx,rdx
252
253	mul	QWORD[24+rsi]
254	add	r12,rcx
255	adc	rdx,0
256	add	r12,rax
257	mov	rax,r9
258	adc	r13,rdx
259	adc	r8,0
260
261
262
; ---- reduction step 2: fold acc0 = r9 ----
263	mov	rbp,r9
264	shl	r9,32
265	mul	r15
266	shr	rbp,32
267	add	r10,r9
268	adc	r11,rbp
269	adc	r12,rax
270	mov	rax,QWORD[16+rbx]	; preload b[2]
271	adc	r13,rdx
272	adc	r8,0
273	xor	r9,r9
274
275
276
; ---- acc += a * b[2] ----
277	mov	rbp,rax
278	mul	QWORD[rsi]
279	add	r10,rax
280	mov	rax,rbp
281	adc	rdx,0
282	mov	rcx,rdx
283
284	mul	QWORD[8+rsi]
285	add	r11,rcx
286	adc	rdx,0
287	add	r11,rax
288	mov	rax,rbp
289	adc	rdx,0
290	mov	rcx,rdx
291
292	mul	QWORD[16+rsi]
293	add	r12,rcx
294	adc	rdx,0
295	add	r12,rax
296	mov	rax,rbp
297	adc	rdx,0
298	mov	rcx,rdx
299
300	mul	QWORD[24+rsi]
301	add	r13,rcx
302	adc	rdx,0
303	add	r13,rax
304	mov	rax,r10
305	adc	r8,rdx
306	adc	r9,0
307
308
309
; ---- reduction step 3: fold acc0 = r10 ----
310	mov	rbp,r10
311	shl	r10,32
312	mul	r15
313	shr	rbp,32
314	add	r11,r10
315	adc	r12,rbp
316	adc	r13,rax
317	mov	rax,QWORD[24+rbx]	; preload b[3]
318	adc	r8,rdx
319	adc	r9,0
320	xor	r10,r10
321
322
323
; ---- acc += a * b[3] ----
324	mov	rbp,rax
325	mul	QWORD[rsi]
326	add	r11,rax
327	mov	rax,rbp
328	adc	rdx,0
329	mov	rcx,rdx
330
331	mul	QWORD[8+rsi]
332	add	r12,rcx
333	adc	rdx,0
334	add	r12,rax
335	mov	rax,rbp
336	adc	rdx,0
337	mov	rcx,rdx
338
339	mul	QWORD[16+rsi]
340	add	r13,rcx
341	adc	rdx,0
342	add	r13,rax
343	mov	rax,rbp
344	adc	rdx,0
345	mov	rcx,rdx
346
347	mul	QWORD[24+rsi]
348	add	r8,rcx
349	adc	rdx,0
350	add	r8,rax
351	mov	rax,r11
352	adc	r9,rdx
353	adc	r10,0
354
355
356
; ---- reduction step 4: fold acc0 = r11; result is r10:r9:r8:r13:r12 ----
357	mov	rbp,r11
358	shl	r11,32
359	mul	r15
360	shr	rbp,32
361	add	r12,r11
362	adc	r13,rbp
363	mov	rcx,r12	; rcx,rbp,rbx,rdx keep the unsubtracted value
364	adc	r8,rax
365	adc	r9,rdx
366	mov	rbp,r13
367	adc	r10,0
368
369
370
; ---- subtract p from the 5-limb value; undo if that borrowed ----
371	sub	r12,-1	; p[0] = 2^64-1
372	mov	rbx,r8
373	sbb	r13,r14	; p[1]
374	sbb	r8,0	; p[2] = 0
375	mov	rdx,r9
376	sbb	r9,r15	; p[3]
377	sbb	r10,0	; top limb; CF = 1 iff value < p
378
379	cmovc	r12,rcx
380	cmovc	r13,rbp
381	mov	QWORD[rdi],r12
382	cmovc	r8,rbx
383	mov	QWORD[8+rdi],r13
384	cmovc	r9,rdx
385	mov	QWORD[16+rdi],r8
386	mov	QWORD[24+rdi],r9
387
388	DB	0F3h,0C3h		;repret
389
390
391
392
393
394
395
396
397
;-----------------------------------------------------------------------
; ecp_nistz256_sqr_mont -- res = a*a*2^-256 mod p (Montgomery square)
; ABI:   Win64 entry: rcx -> res[4], rdx -> a[4].
;        rdi/rsi are spilled to the caller's home slots and reloaded on
;        exit.
; Saves: rbp, rbx, r12-r15 (pushed/popped here around the helper call).
; Work is done by __ecp_nistz256_sqr_montq; this wrapper loads its
; register inputs: rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3].
;-----------------------------------------------------------------------
398global	ecp_nistz256_sqr_mont
399
400ALIGN	32
401ecp_nistz256_sqr_mont:
402	mov	QWORD[8+rsp],rdi	;WIN64 prologue
403	mov	QWORD[16+rsp],rsi
404	mov	rax,rsp
405$L$SEH_begin_ecp_nistz256_sqr_mont:
406	mov	rdi,rcx
407	mov	rsi,rdx
408
409
410	push	rbp
411	push	rbx
412	push	r12
413	push	r13
414	push	r14
415	push	r15
416	mov	rax,QWORD[rsi]
417	mov	r14,QWORD[8+rsi]
418	mov	r15,QWORD[16+rsi]
419	mov	r8,QWORD[24+rsi]
420
421	call	__ecp_nistz256_sqr_montq
422$L$sqr_mont_done:
423	pop	r15
424	pop	r14
425	pop	r13
426	pop	r12
427	pop	rbx
428	pop	rbp
429	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
430	mov	rsi,QWORD[16+rsp]
431	DB	0F3h,0C3h		;repret
432$L$SEH_end_ecp_nistz256_sqr_mont:
433
434
;-----------------------------------------------------------------------
; __ecp_nistz256_sqr_montq -- internal: res = a*a*2^-256 mod p
; In:    rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3],
;        rsi -> a (re-read for the diagonal terms), rdi -> res
; Out:   r12,r13,r14,r15 = res[0..3], also stored at [rdi];
;        rsi = $L$poly[1], rbp = $L$poly[3]  (callers rely on this)
; Clobb: rax, rcx, rdx, r8-r11, flags
;
; Structure: off-diagonal products a[i]*a[j] (i<j) are summed into
; r9..r14, doubled, the squares a[i]^2 are added to give the 512-bit
; product in r8..r15, then four reduction steps fold the low four limbs
; and a final conditional subtraction of p is applied.
;-----------------------------------------------------------------------
435ALIGN	32
436__ecp_nistz256_sqr_montq:
437	mov	r13,rax
438	mul	r14	; a[0]*a[1]
439	mov	r9,rax
440	mov	rax,r15
441	mov	r10,rdx
442
443	mul	r13	; a[0]*a[2]
444	add	r10,rax
445	mov	rax,r8
446	adc	rdx,0
447	mov	r11,rdx
448
449	mul	r13	; a[0]*a[3]
450	add	r11,rax
451	mov	rax,r15
452	adc	rdx,0
453	mov	r12,rdx
454
455
456	mul	r14	; a[1]*a[2]
457	add	r11,rax
458	mov	rax,r8
459	adc	rdx,0
460	mov	rbp,rdx
461
462	mul	r14	; a[1]*a[3]
463	add	r12,rax
464	mov	rax,r8
465	adc	rdx,0
466	add	r12,rbp
467	mov	r13,rdx
468	adc	r13,0
469
470
471	mul	r15	; a[2]*a[3]
472	xor	r15,r15
473	add	r13,rax
474	mov	rax,QWORD[rsi]
475	mov	r14,rdx
476	adc	r14,0
477
; double the off-diagonal sum: r15:r14..r9 <<= 1
478	add	r9,r9
479	adc	r10,r10
480	adc	r11,r11
481	adc	r12,r12
482	adc	r13,r13
483	adc	r14,r14
484	adc	r15,0
485
; add the diagonal squares a[i]^2
486	mul	rax	; a[0]^2
487	mov	r8,rax
488	mov	rax,QWORD[8+rsi]
489	mov	rcx,rdx
490
491	mul	rax	; a[1]^2
492	add	r9,rcx
493	adc	r10,rax
494	mov	rax,QWORD[16+rsi]
495	adc	rdx,0
496	mov	rcx,rdx
497
498	mul	rax	; a[2]^2
499	add	r11,rcx
500	adc	r12,rax
501	mov	rax,QWORD[24+rsi]
502	adc	rdx,0
503	mov	rcx,rdx
504
505	mul	rax	; a[3]^2
506	add	r13,rcx
507	adc	r14,rax
508	mov	rax,r8
509	adc	r15,rdx
510
511	mov	rsi,QWORD[(($L$poly+8))]	; rsi = p[1] (input pointer no longer needed)
512	mov	rbp,QWORD[(($L$poly+24))]	; rbp = p[3]
513
514
515
516
; ---- reduction step 1: fold r8 ----
517	mov	rcx,r8
518	shl	r8,32
519	mul	rbp
520	shr	rcx,32
521	add	r9,r8
522	adc	r10,rcx
523	adc	r11,rax
524	mov	rax,r9
525	adc	rdx,0
526
527
528
; ---- reduction step 2: fold r9 ----
529	mov	rcx,r9
530	shl	r9,32
531	mov	r8,rdx
532	mul	rbp
533	shr	rcx,32
534	add	r10,r9
535	adc	r11,rcx
536	adc	r8,rax
537	mov	rax,r10
538	adc	rdx,0
539
540
541
; ---- reduction step 3: fold r10 ----
542	mov	rcx,r10
543	shl	r10,32
544	mov	r9,rdx
545	mul	rbp
546	shr	rcx,32
547	add	r11,r10
548	adc	r8,rcx
549	adc	r9,rax
550	mov	rax,r11
551	adc	rdx,0
552
553
554
; ---- reduction step 4: fold r11 ----
555	mov	rcx,r11
556	shl	r11,32
557	mov	r10,rdx
558	mul	rbp
559	shr	rcx,32
560	add	r8,r11
561	adc	r9,rcx
562	adc	r10,rax
563	adc	rdx,0
564	xor	r11,r11
565
566
567
; add the folded low half (rdx:r10:r9:r8) to the high half r12..r15
568	add	r12,r8
569	adc	r13,r9
570	mov	r8,r12	; r8,r9,r10,rcx keep the unsubtracted value
571	adc	r14,r10
572	adc	r15,rdx
573	mov	r9,r13
574	adc	r11,0	; r11 = carry limb
575
; subtract p from (r11 : r15..r12); undo if that borrowed
576	sub	r12,-1
577	mov	r10,r14
578	sbb	r13,rsi
579	sbb	r14,0
580	mov	rcx,r15
581	sbb	r15,rbp
582	sbb	r11,0
583
584	cmovc	r12,r8
585	cmovc	r13,r9
586	mov	QWORD[rdi],r12
587	cmovc	r14,r10
588	mov	QWORD[8+rdi],r13
589	cmovc	r15,rcx
590	mov	QWORD[16+rdi],r14
591	mov	QWORD[24+rdi],r15
592
593	DB	0F3h,0C3h		;repret
594
595
596
597
598
599
600
;-----------------------------------------------------------------------
; ecp_nistz256_from_mont -- res = a*2^-256 mod p
; ABI:   Win64 entry: rcx -> res[4], rdx -> a[4].
;        rdi/rsi are spilled to the caller's home slots and reloaded on
;        exit; r12/r13 are saved on the stack.
; Clobb: rax, rcx, rdx, r8-r11, flags
; Method: the same four reduction steps used by the multiply/square
;         helpers, applied directly to a (no second operand), followed
;         by one conditional subtraction of p.
;-----------------------------------------------------------------------
601global	ecp_nistz256_from_mont
602
603ALIGN	32
604ecp_nistz256_from_mont:
605	mov	QWORD[8+rsp],rdi	;WIN64 prologue
606	mov	QWORD[16+rsp],rsi
607	mov	rax,rsp
608$L$SEH_begin_ecp_nistz256_from_mont:
609	mov	rdi,rcx
610	mov	rsi,rdx
611
612
613	push	r12
614	push	r13
615
616	mov	rax,QWORD[rsi]
617	mov	r13,QWORD[(($L$poly+24))]	; r13 = p[3]
618	mov	r9,QWORD[8+rsi]
619	mov	r10,QWORD[16+rsi]
620	mov	r11,QWORD[24+rsi]
621	mov	r8,rax
622	mov	r12,QWORD[(($L$poly+8))]	; r12 = p[1]
623
624
625
; ---- reduction step 1: fold a[0] ----
626	mov	rcx,rax
627	shl	r8,32
628	mul	r13
629	shr	rcx,32
630	add	r9,r8
631	adc	r10,rcx
632	adc	r11,rax
633	mov	rax,r9
634	adc	rdx,0
635
636
637
; ---- reduction step 2 ----
638	mov	rcx,r9
639	shl	r9,32
640	mov	r8,rdx
641	mul	r13
642	shr	rcx,32
643	add	r10,r9
644	adc	r11,rcx
645	adc	r8,rax
646	mov	rax,r10
647	adc	rdx,0
648
649
650
; ---- reduction step 3 ----
651	mov	rcx,r10
652	shl	r10,32
653	mov	r9,rdx
654	mul	r13
655	shr	rcx,32
656	add	r11,r10
657	adc	r8,rcx
658	adc	r9,rax
659	mov	rax,r11
660	adc	rdx,0
661
662
663
; ---- reduction step 4; value is rdx:r10:r9:r8 ----
664	mov	rcx,r11
665	shl	r11,32
666	mov	r10,rdx
667	mul	r13
668	shr	rcx,32
669	add	r8,r11
670	adc	r9,rcx
671	mov	rcx,r8	; rcx,rsi,rax,r11 keep the unsubtracted value
672	adc	r10,rax
673	mov	rsi,r9
674	adc	rdx,0
675
; subtract p; r13 becomes -1 if that borrowed (value was < p), else 0
676	sub	r8,-1
677	mov	rax,r10
678	sbb	r9,r12
679	sbb	r10,0
680	mov	r11,rdx
681	sbb	rdx,r13
682	sbb	r13,r13	; also leaves ZF = 1 iff no borrow
683
684	cmovnz	r8,rcx	; borrowed: restore unsubtracted limbs
685	cmovnz	r9,rsi
686	mov	QWORD[rdi],r8
687	cmovnz	r10,rax
688	mov	QWORD[8+rdi],r9
689	cmovz	r11,rdx	; top limb lives in r11: take subtracted rdx when no borrow
690	mov	QWORD[16+rdi],r10
691	mov	QWORD[24+rdi],r11
692
693	pop	r13
694	pop	r12
695	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
696	mov	rsi,QWORD[16+rsp]
697	DB	0F3h,0C3h		;repret
698$L$SEH_end_ecp_nistz256_from_mont:
699
700
;-----------------------------------------------------------------------
; ecp_nistz256_select_w5 -- branch-free table lookup, 96-byte entries
; ABI:   Win64, arguments used in place: rcx -> out (96 bytes, written
;        with unaligned stores), rdx -> table (read with movdqa, so it
;        must be 16-byte aligned), r8d = index.
; Behaviour: walks all 16 entries; entry k (k = 1..16) is ANDed with an
;        all-ones mask when k == index and with zero otherwise, and the
;        results are ORed together.  index 0 (or any value outside
;        1..16) therefore yields an all-zero output.  Every entry is
;        read regardless of index.
; Saves: xmm6-xmm15 in a 168-byte stack frame (hand-encoded prologue).
; Clobb: rax, rdx, xmm0-xmm5, flags
;-----------------------------------------------------------------------
701global	ecp_nistz256_select_w5
702
703ALIGN	32
704ecp_nistz256_select_w5:
705	lea	rax,[((-136))+rsp]
706$L$SEH_begin_ecp_nistz256_select_w5:
707DB	0x48,0x8d,0x60,0xe0	; lea rsp,[rax-0x20]
708DB	0x0f,0x29,0x70,0xe0	; movaps [rax-0x20],xmm6
709DB	0x0f,0x29,0x78,0xf0	; movaps [rax-0x10],xmm7
710DB	0x44,0x0f,0x29,0x00	; movaps [rax],xmm8
711DB	0x44,0x0f,0x29,0x48,0x10	; movaps [rax+0x10],xmm9
712DB	0x44,0x0f,0x29,0x50,0x20	; movaps [rax+0x20],xmm10
713DB	0x44,0x0f,0x29,0x58,0x30	; movaps [rax+0x30],xmm11
714DB	0x44,0x0f,0x29,0x60,0x40	; movaps [rax+0x40],xmm12
715DB	0x44,0x0f,0x29,0x68,0x50	; movaps [rax+0x50],xmm13
716DB	0x44,0x0f,0x29,0x70,0x60	; movaps [rax+0x60],xmm14
717DB	0x44,0x0f,0x29,0x78,0x70	; movaps [rax+0x70],xmm15
718	movdqa	xmm0,XMMWORD[$L$One]	; xmm0 = per-iteration increment (1 in each lane)
719	movd	xmm1,r8d
720
721	pxor	xmm2,xmm2	; xmm2..xmm7 = 96-byte accumulator, starts at zero
722	pxor	xmm3,xmm3
723	pxor	xmm4,xmm4
724	pxor	xmm5,xmm5
725	pxor	xmm6,xmm6
726	pxor	xmm7,xmm7
727
728	movdqa	xmm8,xmm0	; xmm8 = running entry number, starts at 1
729	pshufd	xmm1,xmm1,0	; broadcast index to all four lanes
730
731	mov	rax,16	; entry count
732$L$select_loop_sse_w5:
733
734	movdqa	xmm15,xmm8
735	paddd	xmm8,xmm0
736	pcmpeqd	xmm15,xmm1	; xmm15 = all-ones iff this entry number == index
737
738	movdqa	xmm9,XMMWORD[rdx]
739	movdqa	xmm10,XMMWORD[16+rdx]
740	movdqa	xmm11,XMMWORD[32+rdx]
741	movdqa	xmm12,XMMWORD[48+rdx]
742	movdqa	xmm13,XMMWORD[64+rdx]
743	movdqa	xmm14,XMMWORD[80+rdx]
744	lea	rdx,[96+rdx]
745
746	pand	xmm9,xmm15
747	pand	xmm10,xmm15
748	por	xmm2,xmm9
749	pand	xmm11,xmm15
750	por	xmm3,xmm10
751	pand	xmm12,xmm15
752	por	xmm4,xmm11
753	pand	xmm13,xmm15
754	por	xmm5,xmm12
755	pand	xmm14,xmm15
756	por	xmm6,xmm13
757	por	xmm7,xmm14
758
759	dec	rax
760	jnz	NEAR $L$select_loop_sse_w5
761
762	movdqu	XMMWORD[rcx],xmm2
763	movdqu	XMMWORD[16+rcx],xmm3
764	movdqu	XMMWORD[32+rcx],xmm4
765	movdqu	XMMWORD[48+rcx],xmm5
766	movdqu	XMMWORD[64+rcx],xmm6
767	movdqu	XMMWORD[80+rcx],xmm7
; restore the callee-saved vector registers and release the frame
768	movaps	xmm6,XMMWORD[rsp]
769	movaps	xmm7,XMMWORD[16+rsp]
770	movaps	xmm8,XMMWORD[32+rsp]
771	movaps	xmm9,XMMWORD[48+rsp]
772	movaps	xmm10,XMMWORD[64+rsp]
773	movaps	xmm11,XMMWORD[80+rsp]
774	movaps	xmm12,XMMWORD[96+rsp]
775	movaps	xmm13,XMMWORD[112+rsp]
776	movaps	xmm14,XMMWORD[128+rsp]
777	movaps	xmm15,XMMWORD[144+rsp]
778	lea	rsp,[168+rsp]
779$L$SEH_end_ecp_nistz256_select_w5:
780	DB	0F3h,0C3h		;repret
781
782
783
784
;-----------------------------------------------------------------------
; ecp_nistz256_select_w7 -- branch-free table lookup, 64-byte entries
; ABI:   Win64, arguments used in place: rcx -> out (64 bytes, written
;        with unaligned stores), rdx -> table (read with movdqa, so it
;        must be 16-byte aligned), r8d = index.
; Behaviour: walks all 64 entries; entry k (k = 1..64) is kept only when
;        k == index, everything is ORed into the accumulator.  index 0
;        (or any value outside 1..64) yields an all-zero output.  Every
;        entry is read regardless of index.
; Saves: xmm6-xmm15 in a 168-byte stack frame (hand-encoded prologue,
;        same layout as ecp_nistz256_select_w5).
; Clobb: rax, rdx, xmm0-xmm5, flags
;-----------------------------------------------------------------------
785global	ecp_nistz256_select_w7
786
787ALIGN	32
788ecp_nistz256_select_w7:
789	lea	rax,[((-136))+rsp]
790$L$SEH_begin_ecp_nistz256_select_w7:
791DB	0x48,0x8d,0x60,0xe0	; lea rsp,[rax-0x20]
792DB	0x0f,0x29,0x70,0xe0	; movaps [rax-0x20],xmm6
793DB	0x0f,0x29,0x78,0xf0	; movaps [rax-0x10],xmm7
794DB	0x44,0x0f,0x29,0x00	; movaps [rax],xmm8
795DB	0x44,0x0f,0x29,0x48,0x10	; movaps [rax+0x10],xmm9
796DB	0x44,0x0f,0x29,0x50,0x20	; movaps [rax+0x20],xmm10
797DB	0x44,0x0f,0x29,0x58,0x30	; movaps [rax+0x30],xmm11
798DB	0x44,0x0f,0x29,0x60,0x40	; movaps [rax+0x40],xmm12
799DB	0x44,0x0f,0x29,0x68,0x50	; movaps [rax+0x50],xmm13
800DB	0x44,0x0f,0x29,0x70,0x60	; movaps [rax+0x60],xmm14
801DB	0x44,0x0f,0x29,0x78,0x70	; movaps [rax+0x70],xmm15
802	movdqa	xmm8,XMMWORD[$L$One]	; xmm8 = running entry number, starts at 1
803	movd	xmm1,r8d
804
805	pxor	xmm2,xmm2	; xmm2..xmm5 = 64-byte accumulator, starts at zero
806	pxor	xmm3,xmm3
807	pxor	xmm4,xmm4
808	pxor	xmm5,xmm5
809
810	movdqa	xmm0,xmm8	; xmm0 = per-iteration increment (1 in each lane)
811	pshufd	xmm1,xmm1,0	; broadcast index to all four lanes
812	mov	rax,64	; entry count
813
814$L$select_loop_sse_w7:
815	movdqa	xmm15,xmm8
816	paddd	xmm8,xmm0
817	movdqa	xmm9,XMMWORD[rdx]
818	movdqa	xmm10,XMMWORD[16+rdx]
819	pcmpeqd	xmm15,xmm1	; xmm15 = all-ones iff this entry number == index
820	movdqa	xmm11,XMMWORD[32+rdx]
821	movdqa	xmm12,XMMWORD[48+rdx]
822	lea	rdx,[64+rdx]
823
824	pand	xmm9,xmm15
825	pand	xmm10,xmm15
826	por	xmm2,xmm9
827	pand	xmm11,xmm15
828	por	xmm3,xmm10
829	pand	xmm12,xmm15
830	por	xmm4,xmm11
831	prefetcht0	[255+rdx]	; prefetch hint for data ahead of the (already advanced) pointer
832	por	xmm5,xmm12
833
834	dec	rax
835	jnz	NEAR $L$select_loop_sse_w7
836
837	movdqu	XMMWORD[rcx],xmm2
838	movdqu	XMMWORD[16+rcx],xmm3
839	movdqu	XMMWORD[32+rcx],xmm4
840	movdqu	XMMWORD[48+rcx],xmm5
; restore the callee-saved vector registers and release the frame
841	movaps	xmm6,XMMWORD[rsp]
842	movaps	xmm7,XMMWORD[16+rsp]
843	movaps	xmm8,XMMWORD[32+rsp]
844	movaps	xmm9,XMMWORD[48+rsp]
845	movaps	xmm10,XMMWORD[64+rsp]
846	movaps	xmm11,XMMWORD[80+rsp]
847	movaps	xmm12,XMMWORD[96+rsp]
848	movaps	xmm13,XMMWORD[112+rsp]
849	movaps	xmm14,XMMWORD[128+rsp]
850	movaps	xmm15,XMMWORD[144+rsp]
851	lea	rsp,[168+rsp]
852$L$SEH_end_ecp_nistz256_select_w7:
853	DB	0F3h,0C3h		;repret
854
;-----------------------------------------------------------------------
; ecp_nistz256_avx2_select_w7 -- stub: traps unconditionally
; The body is the two-byte UD2 opcode, so calling this raises an
; invalid-opcode exception; the epilogue after it is never reached on
; a normal path.  NOTE(review): presumably the build has the AVX2
; variant disabled and callers never dispatch here -- confirm.
;-----------------------------------------------------------------------
855global	ecp_nistz256_avx2_select_w7
856
857ALIGN	32
858ecp_nistz256_avx2_select_w7:
859	mov	QWORD[8+rsp],rdi	;WIN64 prologue
860	mov	QWORD[16+rsp],rsi
861	mov	rax,rsp
862$L$SEH_begin_ecp_nistz256_avx2_select_w7:
863	mov	rdi,rcx
864	mov	rsi,rdx
865	mov	rdx,r8
866
867
868DB	0x0f,0x0b	; ud2
869	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
870	mov	rsi,QWORD[16+rsp]
871	DB	0F3h,0C3h		;repret
872$L$SEH_end_ecp_nistz256_avx2_select_w7:
873
;-----------------------------------------------------------------------
; __ecp_nistz256_add_toq -- internal: res = (a + b) mod p
; In:    r12,r13,r8,r9 = a[0..3], rbx -> b, rdi -> res,
;        r14 = $L$poly[1], r15 = $L$poly[3]
; Out:   r12,r13,r8,r9 = res[0..3], also stored at [rdi]
; Clobb: rax, rbp, rcx, r10, r11, flags
;
; Reduction: a + b is a 257-bit value (carry : r9,r8,r13,r12).  p is
; subtracted once and the subtraction is undone only when that 257-bit
; value was below p.  r11 holds 0 / -1 for carry clear / set; the final
; `sbb r11,0` produces CF=1 exactly when the carry was clear AND the
; 256-bit subtraction borrowed (-1 minus a borrow never wraps).
; Testing the carry alone, as this routine used to, returned sums in
; [p, 2^256) unreduced -- e.g. (p-1) + 1 came back as p instead of 0.
;-----------------------------------------------------------------------
874ALIGN	32
875__ecp_nistz256_add_toq:
876	add	r12,QWORD[rbx]
877	adc	r13,QWORD[8+rbx]
878	mov	rax,r12	; rax,rbp,rcx,r10 keep the unsubtracted sum
879	adc	r8,QWORD[16+rbx]
880	adc	r9,QWORD[24+rbx]
881	mov	rbp,r13
882	sbb	r11,r11	; r11 = -(carry out of bit 255)
883
884	sub	r12,-1	; subtract p: p[0] = 2^64-1
885	mov	rcx,r8
886	sbb	r13,r14	; p[1]
887	sbb	r8,0	; p[2] = 0
888	mov	r10,r9
889	sbb	r9,r15	; p[3]
890	sbb	r11,0	; CF = 1 iff (carry:sum) < p
891
892	cmovc	r12,rax	; sum was already < p: undo the subtraction
893	cmovc	r13,rbp
894	mov	QWORD[rdi],r12
895	cmovc	r8,rcx
896	mov	QWORD[8+rdi],r13
897	cmovc	r9,r10
898	mov	QWORD[16+rdi],r8
899	mov	QWORD[24+rdi],r9
900
901	DB	0F3h,0C3h		;repret
902
903
904
;-----------------------------------------------------------------------
; __ecp_nistz256_sub_fromq -- internal: res = (a - b) mod p
; In:    r12,r13,r8,r9 = a[0..3], rbx -> b, rdi -> res,
;        r14 = $L$poly[1], r15 = $L$poly[3]
; Out:   r12,r13,r8,r9 = res[0..3], also stored at [rdi]
; Clobb: rax, rbp, rcx, r10, r11, flags
; Method: subtract, remember the borrow as a mask in r11, add p, and
;         keep the un-added difference when there was no borrow.
;         The result is below p provided both inputs are below p.
;-----------------------------------------------------------------------
905ALIGN	32
906__ecp_nistz256_sub_fromq:
907	sub	r12,QWORD[rbx]
908	sbb	r13,QWORD[8+rbx]
909	mov	rax,r12	; rax,rbp,rcx,r10 keep the raw difference
910	sbb	r8,QWORD[16+rbx]
911	sbb	r9,QWORD[24+rbx]
912	mov	rbp,r13
913	sbb	r11,r11	; r11 = -1 if a < b, else 0
914
915	add	r12,-1	; add p: p[0] = 2^64-1
916	mov	rcx,r8
917	adc	r13,r14	; p[1]
918	adc	r8,0	; p[2] = 0
919	mov	r10,r9
920	adc	r9,r15	; p[3]
921	test	r11,r11
922
923	cmovz	r12,rax	; no borrow: keep the raw difference
924	cmovz	r13,rbp
925	mov	QWORD[rdi],r12
926	cmovz	r8,rcx
927	mov	QWORD[8+rdi],r13
928	cmovz	r9,r10
929	mov	QWORD[16+rdi],r8
930	mov	QWORD[24+rdi],r9
931
932	DB	0F3h,0C3h		;repret
933
934
935
;-----------------------------------------------------------------------
; __ecp_nistz256_subq -- internal: (a - b) mod p, registers only
; In:    rax,rbp,rcx,r10 = a[0..3] (minuend),
;        r12,r13,r8,r9   = b[0..3] (subtrahend),
;        r14 = $L$poly[1], r15 = $L$poly[3]
; Out:   r12,r13,r8,r9 = result.  Nothing is written to memory; callers
;        that need the value stored do so themselves.
; Clobb: rax, rbp, rcx, r10, r11, flags
; The result is below p provided both inputs are below p.
;-----------------------------------------------------------------------
936ALIGN	32
937__ecp_nistz256_subq:
938	sub	rax,r12
939	sbb	rbp,r13
940	mov	r12,rax	; r12,r13,r8,r9 receive the raw difference
941	sbb	rcx,r8
942	sbb	r10,r9
943	mov	r13,rbp
944	sbb	r11,r11	; r11 = -1 if a < b, else 0
945
946	add	rax,-1	; rax,rbp,rcx,r10 = raw difference + p
947	mov	r8,rcx
948	adc	rbp,r14
949	adc	rcx,0
950	mov	r9,r10
951	adc	r10,r15
952	test	r11,r11
953
954	cmovnz	r12,rax	; borrowed: take the p-corrected value
955	cmovnz	r13,rbp
956	cmovnz	r8,rcx
957	cmovnz	r9,r10
958
959	DB	0F3h,0C3h		;repret
960
961
962
;-----------------------------------------------------------------------
; __ecp_nistz256_mul_by_2q -- internal: res = 2*a mod p
; In:    r12,r13,r8,r9 = a[0..3], rdi -> res,
;        r14 = $L$poly[1], r15 = $L$poly[3]
; Out:   r12,r13,r8,r9 = res[0..3], also stored at [rdi]
; Clobb: rax, rbp, rcx, r10, r11, flags
;
; Reduction: 2*a is a 257-bit value (carry : r9,r8,r13,r12).  p is
; subtracted once and the subtraction is undone only when that 257-bit
; value was below p.  r11 holds 0 / -1 for carry clear / set; the final
; `sbb r11,0` produces CF=1 exactly when the carry was clear AND the
; 256-bit subtraction borrowed (-1 minus a borrow never wraps).
; Testing the carry alone, as this routine used to, returned doubled
; values in [p, 2^256) unreduced.
;-----------------------------------------------------------------------
963ALIGN	32
964__ecp_nistz256_mul_by_2q:
965	add	r12,r12
966	adc	r13,r13
967	mov	rax,r12	; rax,rbp,rcx,r10 keep the unsubtracted sum
968	adc	r8,r8
969	adc	r9,r9
970	mov	rbp,r13
971	sbb	r11,r11	; r11 = -(carry out of bit 255)
972
973	sub	r12,-1	; subtract p: p[0] = 2^64-1
974	mov	rcx,r8
975	sbb	r13,r14	; p[1]
976	sbb	r8,0	; p[2] = 0
977	mov	r10,r9
978	sbb	r9,r15	; p[3]
979	sbb	r11,0	; CF = 1 iff (carry:sum) < p
980
981	cmovc	r12,rax	; sum was already < p: undo the subtraction
982	cmovc	r13,rbp
983	mov	QWORD[rdi],r12
984	cmovc	r8,rcx
985	mov	QWORD[8+rdi],r13
986	cmovc	r9,r10
987	mov	QWORD[16+rdi],r8
988	mov	QWORD[24+rdi],r9
989
990	DB	0F3h,0C3h		;repret
991
;-----------------------------------------------------------------------
; ecp_nistz256_point_double -- res = 2 * in
; ABI:   Win64 entry: rcx -> res, rdx -> in.  Each point is three
;        32-byte field elements at offsets 0 (x), 32 (y), 64 (z).
;        NOTE(review): presumably Jacobian coordinates with elements in
;        Montgomery form -- confirm against the C caller.
; Saves: rbp, rbx, r12-r15 (stack); rdi/rsi via the caller's home slots.
; Clobb: rax, rcx, rdx, r8-r11, xmm0-xmm2, flags
;
; Stack frame (32*5+8 bytes), names used in the comments below:
;   S    = [rsp+0]    M    = [rsp+32]   Zsqr = [rsp+64]
;   in_x = [rsp+96]   tmp0 = [rsp+128]
; The three output pointers are parked in xmm0 (res_x), xmm1 (res_y) and
; xmm2 (res_z) because every general register is needed by the helpers.
; r14/r15 must hold $L$poly[1]/[3] for the add/sub/mul_by_2 helpers;
; the square helper returns them in rsi/rbp instead, hence the
; "harmonize" moves after it.
;-----------------------------------------------------------------------
992global	ecp_nistz256_point_double
993
994ALIGN	32
995ecp_nistz256_point_double:
996	mov	QWORD[8+rsp],rdi	;WIN64 prologue
997	mov	QWORD[16+rsp],rsi
998	mov	rax,rsp
999$L$SEH_begin_ecp_nistz256_point_double:
1000	mov	rdi,rcx
1001	mov	rsi,rdx
1002
1003
1004	push	rbp
1005	push	rbx
1006	push	r12
1007	push	r13
1008	push	r14
1009	push	r15
1010	sub	rsp,32*5+8
1011
1012	movdqu	xmm0,XMMWORD[rsi]	; copy in->x to the frame (in_x)
1013	mov	rbx,rsi	; rbx keeps the input pointer
1014	movdqu	xmm1,XMMWORD[16+rsi]
1015	mov	r12,QWORD[((32+0))+rsi]	; r12,r13,r8,r9 = in->y
1016	mov	r13,QWORD[((32+8))+rsi]
1017	mov	r8,QWORD[((32+16))+rsi]
1018	mov	r9,QWORD[((32+24))+rsi]
1019	mov	r14,QWORD[(($L$poly+8))]
1020	mov	r15,QWORD[(($L$poly+24))]
1021	movdqa	XMMWORD[96+rsp],xmm0
1022	movdqa	XMMWORD[(96+16)+rsp],xmm1
1023	lea	r10,[32+rdi]
1024	lea	r11,[64+rdi]
1025DB	102,72,15,110,199	; movq xmm0,rdi  (res_x pointer)
1026DB	102,73,15,110,202	; movq xmm1,r10  (res_y pointer)
1027DB	102,73,15,110,211	; movq xmm2,r11  (res_z pointer)
1028
; S = 2 * in_y
1029	lea	rdi,[rsp]
1030	call	__ecp_nistz256_mul_by_2q
1031
; Zsqr = in_z^2
1032	mov	rax,QWORD[((64+0))+rsi]
1033	mov	r14,QWORD[((64+8))+rsi]
1034	mov	r15,QWORD[((64+16))+rsi]
1035	mov	r8,QWORD[((64+24))+rsi]
1036	lea	rsi,[((64-0))+rsi]
1037	lea	rdi,[64+rsp]
1038	call	__ecp_nistz256_sqr_montq
1039
; S = S^2
1040	mov	rax,QWORD[((0+0))+rsp]
1041	mov	r14,QWORD[((8+0))+rsp]
1042	lea	rsi,[((0+0))+rsp]
1043	mov	r15,QWORD[((16+0))+rsp]
1044	mov	r8,QWORD[((24+0))+rsp]
1045	lea	rdi,[rsp]
1046	call	__ecp_nistz256_sqr_montq
1047
; res_z = 2 * (in_z * in_y)
1048	mov	rax,QWORD[32+rbx]
1049	mov	r9,QWORD[((64+0))+rbx]
1050	mov	r10,QWORD[((64+8))+rbx]
1051	mov	r11,QWORD[((64+16))+rbx]
1052	mov	r12,QWORD[((64+24))+rbx]
1053	lea	rsi,[((64-0))+rbx]
1054	lea	rbx,[32+rbx]
1055DB	102,72,15,126,215	; movq rdi,xmm2  (res_z pointer)
1056	call	__ecp_nistz256_mul_montq
1057	call	__ecp_nistz256_mul_by_2q
1058
; M = in_x + Zsqr
1059	mov	r12,QWORD[((96+0))+rsp]
1060	mov	r13,QWORD[((96+8))+rsp]
1061	lea	rbx,[64+rsp]
1062	mov	r8,QWORD[((96+16))+rsp]
1063	mov	r9,QWORD[((96+24))+rsp]
1064	lea	rdi,[32+rsp]
1065	call	__ecp_nistz256_add_toq
1066
; Zsqr = in_x - Zsqr
1067	mov	r12,QWORD[((96+0))+rsp]
1068	mov	r13,QWORD[((96+8))+rsp]
1069	lea	rbx,[64+rsp]
1070	mov	r8,QWORD[((96+16))+rsp]
1071	mov	r9,QWORD[((96+24))+rsp]
1072	lea	rdi,[64+rsp]
1073	call	__ecp_nistz256_sub_fromq
1074
; res_y = S^2  (result also left in r12..r15, p[1]/p[3] in rsi/rbp)
1075	mov	rax,QWORD[((0+0))+rsp]
1076	mov	r14,QWORD[((8+0))+rsp]
1077	lea	rsi,[((0+0))+rsp]
1078	mov	r15,QWORD[((16+0))+rsp]
1079	mov	r8,QWORD[((24+0))+rsp]
1080DB	102,72,15,126,207	; movq rdi,xmm1  (res_y pointer)
1081	call	__ecp_nistz256_sqr_montq
; res_y = res_y / 2 mod p: if the value is odd add p first (making it
; even, possibly 257 bits with the extra bit in r9), then shift right.
1082	xor	r9,r9
1083	mov	rax,r12
1084	add	r12,-1
1085	mov	r10,r13
1086	adc	r13,rsi
1087	mov	rcx,r14
1088	adc	r14,0
1089	mov	r8,r15
1090	adc	r15,rbp
1091	adc	r9,0
1092	xor	rsi,rsi
1093	test	rax,1
1094
1095	cmovz	r12,rax	; even: discard the "+ p" version
1096	cmovz	r13,r10
1097	cmovz	r14,rcx
1098	cmovz	r15,r8
1099	cmovz	r9,rsi
1100
1101	mov	rax,r13	; 257-bit right shift by one across the limbs
1102	shr	r12,1
1103	shl	rax,63
1104	mov	r10,r14
1105	shr	r13,1
1106	or	r12,rax
1107	shl	r10,63
1108	mov	rcx,r15
1109	shr	r14,1
1110	or	r13,r10
1111	shl	rcx,63
1112	mov	QWORD[rdi],r12
1113	shr	r15,1
1114	mov	QWORD[8+rdi],r13
1115	shl	r9,63
1116	or	r14,rcx
1117	or	r15,r9
1118	mov	QWORD[16+rdi],r14
1119	mov	QWORD[24+rdi],r15
; M = M * Zsqr
1120	mov	rax,QWORD[64+rsp]
1121	lea	rbx,[64+rsp]
1122	mov	r9,QWORD[((0+32))+rsp]
1123	mov	r10,QWORD[((8+32))+rsp]
1124	lea	rsi,[((0+32))+rsp]
1125	mov	r11,QWORD[((16+32))+rsp]
1126	mov	r12,QWORD[((24+32))+rsp]
1127	lea	rdi,[32+rsp]
1128	call	__ecp_nistz256_mul_montq
1129
; M = 3 * M, computed as (2*M, parked in tmp0) + M
1130	lea	rdi,[128+rsp]
1131	call	__ecp_nistz256_mul_by_2q
1132
1133	lea	rbx,[32+rsp]
1134	lea	rdi,[32+rsp]
1135	call	__ecp_nistz256_add_toq
1136
; S = S * in_x
1137	mov	rax,QWORD[96+rsp]
1138	lea	rbx,[96+rsp]
1139	mov	r9,QWORD[((0+0))+rsp]
1140	mov	r10,QWORD[((8+0))+rsp]
1141	lea	rsi,[((0+0))+rsp]
1142	mov	r11,QWORD[((16+0))+rsp]
1143	mov	r12,QWORD[((24+0))+rsp]
1144	lea	rdi,[rsp]
1145	call	__ecp_nistz256_mul_montq
1146
; tmp0 = 2 * S
1147	lea	rdi,[128+rsp]
1148	call	__ecp_nistz256_mul_by_2q
1149
; res_x = M^2
1150	mov	rax,QWORD[((0+32))+rsp]
1151	mov	r14,QWORD[((8+32))+rsp]
1152	lea	rsi,[((0+32))+rsp]
1153	mov	r15,QWORD[((16+32))+rsp]
1154	mov	r8,QWORD[((24+32))+rsp]
1155DB	102,72,15,126,199	; movq rdi,xmm0  (res_x pointer)
1156	call	__ecp_nistz256_sqr_montq
1157
; res_x = res_x - tmp0; first move the square's outputs into the
; registers the subtract helper expects
1158	lea	rbx,[128+rsp]
1159	mov	r8,r14
1160	mov	r9,r15
1161	mov	r14,rsi	; p[1]
1162	mov	r15,rbp	; p[3]
1163	call	__ecp_nistz256_sub_fromq
1164
; S = S - res_x  (register-only helper; stored further down)
1165	mov	rax,QWORD[((0+0))+rsp]
1166	mov	rbp,QWORD[((0+8))+rsp]
1167	mov	rcx,QWORD[((0+16))+rsp]
1168	mov	r10,QWORD[((0+24))+rsp]
1169	lea	rdi,[rsp]
1170	call	__ecp_nistz256_subq
1171
; S = S * M: store the difference to S while shuffling it into the
; multiply helper's input registers (r9..r12)
1172	mov	rax,QWORD[32+rsp]
1173	lea	rbx,[32+rsp]
1174	mov	r14,r12
1175	xor	ecx,ecx	; sets ZF, so the two cmovz below always move
1176	mov	QWORD[((0+0))+rsp],r12
1177	mov	r10,r13
1178	mov	QWORD[((0+8))+rsp],r13
1179	cmovz	r11,r8
1180	mov	QWORD[((0+16))+rsp],r8
1181	lea	rsi,[((0-0))+rsp]
1182	cmovz	r12,r9
1183	mov	QWORD[((0+24))+rsp],r9
1184	mov	r9,r14
1185	lea	rdi,[rsp]
1186	call	__ecp_nistz256_mul_montq
1187
; res_y = S - res_y
1188DB	102,72,15,126,203	; movq rbx,xmm1  (res_y pointer, subtrahend)
1189DB	102,72,15,126,207	; movq rdi,xmm1  (res_y pointer, destination)
1190	call	__ecp_nistz256_sub_fromq
1191
1192	add	rsp,32*5+8
1193	pop	r15
1194	pop	r14
1195	pop	r13
1196	pop	r12
1197	pop	rbx
1198	pop	rbp
1199	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1200	mov	rsi,QWORD[16+rsp]
1201	DB	0F3h,0C3h		;repret
1202$L$SEH_end_ecp_nistz256_point_double:
;-----------------------------------------------------------------------
; ecp_nistz256_point_add -- res = in1 + in2
; ABI:   Win64 entry: rcx -> res, rdx -> in1, r8 -> in2.  Each point is
;        three 32-byte field elements at offsets 0 (x), 32 (y), 64 (z).
;        NOTE(review): presumably Jacobian coordinates with elements in
;        Montgomery form -- confirm against the C caller.
; Saves: rbp, rbx, r12-r15 (stack); rdi/rsi via the caller's home slots.
; Clobb: rax, rcx, rdx, r8-r11, xmm0-xmm5, flags
;
; Stack frame (32*18+8 bytes), names used in the comments below:
;   H    =   0   Hsqr/Z1sqr =  32   R     =  64   Rsqr/Z2sqr =  96
;   Hcub = 128   U1         = 160   U2    = 192   S1         = 224
;   S2   = 256   res_x      = 288   res_y = 320   res_z      = 352
;   in1_x= 384   in1_y      = 416   in1_z = 448
;   in2_x= 480   in2_y      = 512   in2_z = 544
; xmm0 parks the result pointer.  xmm5 / xmm4 are all-ones masks that
; are set when the x and y coordinates of in1 / in2 are all zero; they
; steer the final selection so that an input flagged this way makes the
; other input the result.
;
; Special cases taken from the code:
;   * H == 0, neither mask set, R != 0: the 96-byte result is zeroed.
;   * H == 0, neither mask set, R == 0: control continues into the
;     generic path, where res_z = H*in1_z*in2_z = 0.
;     NOTE(review): this is the in1 == in2 (doubling) situation --
;     confirm callers route doubling to ecp_nistz256_point_double.
;
; Fix in this revision: the inlined "2 * U2" now performs a full
; reduction.  It subtracts p and undoes that only when the 257-bit
; doubled value was below p (`sbb r11,0` / `cmovc`); previously it
; reduced only on carry-out, so a value in [p, 2^256) was handed to
; __ecp_nistz256_subq, which assumes operands below p.
;-----------------------------------------------------------------------
1203global	ecp_nistz256_point_add
1204
1205ALIGN	32
1206ecp_nistz256_point_add:
1207	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1208	mov	QWORD[16+rsp],rsi
1209	mov	rax,rsp
1210$L$SEH_begin_ecp_nistz256_point_add:
1211	mov	rdi,rcx
1212	mov	rsi,rdx
1213	mov	rdx,r8
1214
1215
1216	push	rbp
1217	push	rbx
1218	push	r12
1219	push	r13
1220	push	r14
1221	push	r15
1222	sub	rsp,32*18+8
1223
; copy in1 to the frame and start OR-folding its x|y for the zero test
1224	movdqu	xmm0,XMMWORD[rsi]
1225	movdqu	xmm1,XMMWORD[16+rsi]
1226	movdqu	xmm2,XMMWORD[32+rsi]
1227	movdqu	xmm3,XMMWORD[48+rsi]
1228	movdqu	xmm4,XMMWORD[64+rsi]
1229	movdqu	xmm5,XMMWORD[80+rsi]
1230	mov	rbx,rsi	; rbx -> in1
1231	mov	rsi,rdx	; rsi -> in2
1232	movdqa	XMMWORD[384+rsp],xmm0
1233	movdqa	XMMWORD[(384+16)+rsp],xmm1
1234	por	xmm1,xmm0
1235	movdqa	XMMWORD[416+rsp],xmm2
1236	movdqa	XMMWORD[(416+16)+rsp],xmm3
1237	por	xmm3,xmm2
1238	movdqa	XMMWORD[448+rsp],xmm4
1239	movdqa	XMMWORD[(448+16)+rsp],xmm5
1240	por	xmm3,xmm1
1241
; copy in2 x,y to the frame (same OR-fold) and load in2_z for squaring
1242	movdqu	xmm0,XMMWORD[rsi]
1243	pshufd	xmm5,xmm3,0xb1
1244	movdqu	xmm1,XMMWORD[16+rsi]
1245	movdqu	xmm2,XMMWORD[32+rsi]
1246	por	xmm5,xmm3
1247	movdqu	xmm3,XMMWORD[48+rsi]
1248	mov	rax,QWORD[((64+0))+rsi]
1249	mov	r14,QWORD[((64+8))+rsi]
1250	mov	r15,QWORD[((64+16))+rsi]
1251	mov	r8,QWORD[((64+24))+rsi]
1252	movdqa	XMMWORD[480+rsp],xmm0
1253	pshufd	xmm4,xmm5,0x1e
1254	movdqa	XMMWORD[(480+16)+rsp],xmm1
1255	por	xmm1,xmm0
1256DB	102,72,15,110,199	; movq xmm0,rdi  (park result pointer)
1257	movdqa	XMMWORD[512+rsp],xmm2
1258	movdqa	XMMWORD[(512+16)+rsp],xmm3
1259	por	xmm3,xmm2
1260	por	xmm5,xmm4
1261	pxor	xmm4,xmm4
1262	por	xmm3,xmm1
1263
; Z2sqr = in2_z^2 (in2_z is also copied to the frame)
1264	lea	rsi,[((64-0))+rsi]
1265	mov	QWORD[((544+0))+rsp],rax
1266	mov	QWORD[((544+8))+rsp],r14
1267	mov	QWORD[((544+16))+rsp],r15
1268	mov	QWORD[((544+24))+rsp],r8
1269	lea	rdi,[96+rsp]
1270	call	__ecp_nistz256_sqr_montq
1271
; finish the masks: xmm5 for in1, xmm4 for in2 (all-ones when x|y == 0)
1272	pcmpeqd	xmm5,xmm4
1273	pshufd	xmm4,xmm3,0xb1
1274	por	xmm4,xmm3
1275	pshufd	xmm5,xmm5,0
1276	pshufd	xmm3,xmm4,0x1e
1277	por	xmm4,xmm3
1278	pxor	xmm3,xmm3
1279	pcmpeqd	xmm4,xmm3
1280	pshufd	xmm4,xmm4,0
; Z1sqr = in1_z^2
1281	mov	rax,QWORD[((64+0))+rbx]
1282	mov	r14,QWORD[((64+8))+rbx]
1283	mov	r15,QWORD[((64+16))+rbx]
1284	mov	r8,QWORD[((64+24))+rbx]
1285
1286	lea	rsi,[((64-0))+rbx]
1287	lea	rdi,[32+rsp]
1288	call	__ecp_nistz256_sqr_montq
1289
; S1 = Z2sqr * in2_z
1290	mov	rax,QWORD[544+rsp]
1291	lea	rbx,[544+rsp]
1292	mov	r9,QWORD[((0+96))+rsp]
1293	mov	r10,QWORD[((8+96))+rsp]
1294	lea	rsi,[((0+96))+rsp]
1295	mov	r11,QWORD[((16+96))+rsp]
1296	mov	r12,QWORD[((24+96))+rsp]
1297	lea	rdi,[224+rsp]
1298	call	__ecp_nistz256_mul_montq
1299
; S2 = Z1sqr * in1_z
1300	mov	rax,QWORD[448+rsp]
1301	lea	rbx,[448+rsp]
1302	mov	r9,QWORD[((0+32))+rsp]
1303	mov	r10,QWORD[((8+32))+rsp]
1304	lea	rsi,[((0+32))+rsp]
1305	mov	r11,QWORD[((16+32))+rsp]
1306	mov	r12,QWORD[((24+32))+rsp]
1307	lea	rdi,[256+rsp]
1308	call	__ecp_nistz256_mul_montq
1309
; S1 = S1 * in1_y
1310	mov	rax,QWORD[416+rsp]
1311	lea	rbx,[416+rsp]
1312	mov	r9,QWORD[((0+224))+rsp]
1313	mov	r10,QWORD[((8+224))+rsp]
1314	lea	rsi,[((0+224))+rsp]
1315	mov	r11,QWORD[((16+224))+rsp]
1316	mov	r12,QWORD[((24+224))+rsp]
1317	lea	rdi,[224+rsp]
1318	call	__ecp_nistz256_mul_montq
1319
; S2 = S2 * in2_y
1320	mov	rax,QWORD[512+rsp]
1321	lea	rbx,[512+rsp]
1322	mov	r9,QWORD[((0+256))+rsp]
1323	mov	r10,QWORD[((8+256))+rsp]
1324	lea	rsi,[((0+256))+rsp]
1325	mov	r11,QWORD[((16+256))+rsp]
1326	mov	r12,QWORD[((24+256))+rsp]
1327	lea	rdi,[256+rsp]
1328	call	__ecp_nistz256_mul_montq
1329
; R = S2 - S1
1330	lea	rbx,[224+rsp]
1331	lea	rdi,[64+rsp]
1332	call	__ecp_nistz256_sub_fromq
1333
; xmm3 = OR of R's limbs (zero iff R == 0); xmm2 = either-input mask
1334	or	r12,r13
1335	movdqa	xmm2,xmm4
1336	or	r12,r8
1337	or	r12,r9
1338	por	xmm2,xmm5
1339DB	102,73,15,110,220	; movq xmm3,r12
1340
; U1 = Z2sqr * in1_x
1341	mov	rax,QWORD[384+rsp]
1342	lea	rbx,[384+rsp]
1343	mov	r9,QWORD[((0+96))+rsp]
1344	mov	r10,QWORD[((8+96))+rsp]
1345	lea	rsi,[((0+96))+rsp]
1346	mov	r11,QWORD[((16+96))+rsp]
1347	mov	r12,QWORD[((24+96))+rsp]
1348	lea	rdi,[160+rsp]
1349	call	__ecp_nistz256_mul_montq
1350
; U2 = Z1sqr * in2_x
1351	mov	rax,QWORD[480+rsp]
1352	lea	rbx,[480+rsp]
1353	mov	r9,QWORD[((0+32))+rsp]
1354	mov	r10,QWORD[((8+32))+rsp]
1355	lea	rsi,[((0+32))+rsp]
1356	mov	r11,QWORD[((16+32))+rsp]
1357	mov	r12,QWORD[((24+32))+rsp]
1358	lea	rdi,[192+rsp]
1359	call	__ecp_nistz256_mul_montq
1360
; H = U2 - U1
1361	lea	rbx,[160+rsp]
1362	lea	rdi,[rsp]
1363	call	__ecp_nistz256_sub_fromq
1364
1365	or	r12,r13	; r12 = OR of H's limbs
1366	or	r12,r8
1367	or	r12,r9
1368
1369DB	0x3e	; prefix byte on the following jnz (legacy branch-taken hint)
1370	jnz	NEAR $L$add_proceedq	; H != 0: generic addition
1371DB	102,73,15,126,208	; movq r8,xmm2  (either-input mask)
1372DB	102,73,15,126,217	; movq r9,xmm3  (R limbs ORed)
1373	test	r8,r8
1374	jnz	NEAR $L$add_proceedq	; an input is flagged: final selection handles it
1375	test	r9,r9
1376	jz	NEAR $L$add_proceedq	; R == 0 as well: see header note
1377
; H == 0 and R != 0: write an all-zero 96-byte result
1378DB	102,72,15,126,199	; movq rdi,xmm0  (result pointer)
1379	pxor	xmm0,xmm0
1380	movdqu	XMMWORD[rdi],xmm0
1381	movdqu	XMMWORD[16+rdi],xmm0
1382	movdqu	XMMWORD[32+rdi],xmm0
1383	movdqu	XMMWORD[48+rdi],xmm0
1384	movdqu	XMMWORD[64+rdi],xmm0
1385	movdqu	XMMWORD[80+rdi],xmm0
1386	jmp	NEAR $L$add_doneq
1387
1388ALIGN	32
1389$L$add_proceedq:
; Rsqr = R^2
1390	mov	rax,QWORD[((0+64))+rsp]
1391	mov	r14,QWORD[((8+64))+rsp]
1392	lea	rsi,[((0+64))+rsp]
1393	mov	r15,QWORD[((16+64))+rsp]
1394	mov	r8,QWORD[((24+64))+rsp]
1395	lea	rdi,[96+rsp]
1396	call	__ecp_nistz256_sqr_montq
1397
; res_z = H * in1_z
1398	mov	rax,QWORD[448+rsp]
1399	lea	rbx,[448+rsp]
1400	mov	r9,QWORD[((0+0))+rsp]
1401	mov	r10,QWORD[((8+0))+rsp]
1402	lea	rsi,[((0+0))+rsp]
1403	mov	r11,QWORD[((16+0))+rsp]
1404	mov	r12,QWORD[((24+0))+rsp]
1405	lea	rdi,[352+rsp]
1406	call	__ecp_nistz256_mul_montq
1407
; Hsqr = H^2
1408	mov	rax,QWORD[((0+0))+rsp]
1409	mov	r14,QWORD[((8+0))+rsp]
1410	lea	rsi,[((0+0))+rsp]
1411	mov	r15,QWORD[((16+0))+rsp]
1412	mov	r8,QWORD[((24+0))+rsp]
1413	lea	rdi,[32+rsp]
1414	call	__ecp_nistz256_sqr_montq
1415
; res_z = res_z * in2_z
1416	mov	rax,QWORD[544+rsp]
1417	lea	rbx,[544+rsp]
1418	mov	r9,QWORD[((0+352))+rsp]
1419	mov	r10,QWORD[((8+352))+rsp]
1420	lea	rsi,[((0+352))+rsp]
1421	mov	r11,QWORD[((16+352))+rsp]
1422	mov	r12,QWORD[((24+352))+rsp]
1423	lea	rdi,[352+rsp]
1424	call	__ecp_nistz256_mul_montq
1425
; Hcub = Hsqr * H
1426	mov	rax,QWORD[rsp]
1427	lea	rbx,[rsp]
1428	mov	r9,QWORD[((0+32))+rsp]
1429	mov	r10,QWORD[((8+32))+rsp]
1430	lea	rsi,[((0+32))+rsp]
1431	mov	r11,QWORD[((16+32))+rsp]
1432	mov	r12,QWORD[((24+32))+rsp]
1433	lea	rdi,[128+rsp]
1434	call	__ecp_nistz256_mul_montq
1435
; U2 = U1 * Hsqr  (result also left in r12,r13,r8,r9)
1436	mov	rax,QWORD[160+rsp]
1437	lea	rbx,[160+rsp]
1438	mov	r9,QWORD[((0+32))+rsp]
1439	mov	r10,QWORD[((8+32))+rsp]
1440	lea	rsi,[((0+32))+rsp]
1441	mov	r11,QWORD[((16+32))+rsp]
1442	mov	r12,QWORD[((24+32))+rsp]
1443	lea	rdi,[192+rsp]
1444	call	__ecp_nistz256_mul_montq
1445
1446
1447
1448
; inlined 2*U2 mod p in registers, interleaved with loading Rsqr into
; rax,rbp,rcx,r10 as the minuend for the subtraction that follows
1449	add	r12,r12
1450	lea	rsi,[96+rsp]
1451	adc	r13,r13
1452	mov	rax,r12
1453	adc	r8,r8
1454	adc	r9,r9
1455	mov	rbp,r13
1456	sbb	r11,r11	; r11 = -(carry out of bit 255)
1457
1458	sub	r12,-1	; subtract p
1459	mov	rcx,r8
1460	sbb	r13,r14
1461	sbb	r8,0
1462	mov	r10,r9
1463	sbb	r9,r15
1464	sbb	r11,0	; CF = 1 iff (carry:sum) < p
1465
1466	cmovc	r12,rax	; sum was already < p: undo the subtraction
1467	mov	rax,QWORD[rsi]
1468	cmovc	r13,rbp
1469	mov	rbp,QWORD[8+rsi]
1470	cmovc	r8,rcx
1471	mov	rcx,QWORD[16+rsi]
1472	cmovc	r9,r10
1473	mov	r10,QWORD[24+rsi]
1474
; regs = Rsqr - 2*U2
1475	call	__ecp_nistz256_subq
1476
; res_x = regs - Hcub
1477	lea	rbx,[128+rsp]
1478	lea	rdi,[288+rsp]
1479	call	__ecp_nistz256_sub_fromq
1480
; res_y = U2 - res_x  (register-only helper, so store it explicitly)
1481	mov	rax,QWORD[((192+0))+rsp]
1482	mov	rbp,QWORD[((192+8))+rsp]
1483	mov	rcx,QWORD[((192+16))+rsp]
1484	mov	r10,QWORD[((192+24))+rsp]
1485	lea	rdi,[320+rsp]
1486
1487	call	__ecp_nistz256_subq
1488
1489	mov	QWORD[rdi],r12
1490	mov	QWORD[8+rdi],r13
1491	mov	QWORD[16+rdi],r8
1492	mov	QWORD[24+rdi],r9
; S2 = S1 * Hcub
1493	mov	rax,QWORD[128+rsp]
1494	lea	rbx,[128+rsp]
1495	mov	r9,QWORD[((0+224))+rsp]
1496	mov	r10,QWORD[((8+224))+rsp]
1497	lea	rsi,[((0+224))+rsp]
1498	mov	r11,QWORD[((16+224))+rsp]
1499	mov	r12,QWORD[((24+224))+rsp]
1500	lea	rdi,[256+rsp]
1501	call	__ecp_nistz256_mul_montq
1502
; res_y = R * res_y
1503	mov	rax,QWORD[320+rsp]
1504	lea	rbx,[320+rsp]
1505	mov	r9,QWORD[((0+64))+rsp]
1506	mov	r10,QWORD[((8+64))+rsp]
1507	lea	rsi,[((0+64))+rsp]
1508	mov	r11,QWORD[((16+64))+rsp]
1509	mov	r12,QWORD[((24+64))+rsp]
1510	lea	rdi,[320+rsp]
1511	call	__ecp_nistz256_mul_montq
1512
; res_y = res_y - S2
1513	lea	rbx,[256+rsp]
1514	lea	rdi,[320+rsp]
1515	call	__ecp_nistz256_sub_fromq
1516
1517DB	102,72,15,126,199	; movq rdi,xmm0  (result pointer)
1518
; out_z = xmm5 ? in2_z : res_z, then xmm4 ? in1_z : that
1519	movdqa	xmm0,xmm5
1520	movdqa	xmm1,xmm5
1521	pandn	xmm0,XMMWORD[352+rsp]
1522	movdqa	xmm2,xmm5
1523	pandn	xmm1,XMMWORD[((352+16))+rsp]
1524	movdqa	xmm3,xmm5
1525	pand	xmm2,XMMWORD[544+rsp]
1526	pand	xmm3,XMMWORD[((544+16))+rsp]
1527	por	xmm2,xmm0
1528	por	xmm3,xmm1
1529
1530	movdqa	xmm0,xmm4
1531	movdqa	xmm1,xmm4
1532	pandn	xmm0,xmm2
1533	movdqa	xmm2,xmm4
1534	pandn	xmm1,xmm3
1535	movdqa	xmm3,xmm4
1536	pand	xmm2,XMMWORD[448+rsp]
1537	pand	xmm3,XMMWORD[((448+16))+rsp]
1538	por	xmm2,xmm0
1539	por	xmm3,xmm1
1540	movdqu	XMMWORD[64+rdi],xmm2
1541	movdqu	XMMWORD[80+rdi],xmm3
1542
; out_x = xmm5 ? in2_x : res_x, then xmm4 ? in1_x : that
1543	movdqa	xmm0,xmm5
1544	movdqa	xmm1,xmm5
1545	pandn	xmm0,XMMWORD[288+rsp]
1546	movdqa	xmm2,xmm5
1547	pandn	xmm1,XMMWORD[((288+16))+rsp]
1548	movdqa	xmm3,xmm5
1549	pand	xmm2,XMMWORD[480+rsp]
1550	pand	xmm3,XMMWORD[((480+16))+rsp]
1551	por	xmm2,xmm0
1552	por	xmm3,xmm1
1553
1554	movdqa	xmm0,xmm4
1555	movdqa	xmm1,xmm4
1556	pandn	xmm0,xmm2
1557	movdqa	xmm2,xmm4
1558	pandn	xmm1,xmm3
1559	movdqa	xmm3,xmm4
1560	pand	xmm2,XMMWORD[384+rsp]
1561	pand	xmm3,XMMWORD[((384+16))+rsp]
1562	por	xmm2,xmm0
1563	por	xmm3,xmm1
1564	movdqu	XMMWORD[rdi],xmm2
1565	movdqu	XMMWORD[16+rdi],xmm3
1566
; out_y = xmm5 ? in2_y : res_y, then xmm4 ? in1_y : that
1567	movdqa	xmm0,xmm5
1568	movdqa	xmm1,xmm5
1569	pandn	xmm0,XMMWORD[320+rsp]
1570	movdqa	xmm2,xmm5
1571	pandn	xmm1,XMMWORD[((320+16))+rsp]
1572	movdqa	xmm3,xmm5
1573	pand	xmm2,XMMWORD[512+rsp]
1574	pand	xmm3,XMMWORD[((512+16))+rsp]
1575	por	xmm2,xmm0
1576	por	xmm3,xmm1
1577
1578	movdqa	xmm0,xmm4
1579	movdqa	xmm1,xmm4
1580	pandn	xmm0,xmm2
1581	movdqa	xmm2,xmm4
1582	pandn	xmm1,xmm3
1583	movdqa	xmm3,xmm4
1584	pand	xmm2,XMMWORD[416+rsp]
1585	pand	xmm3,XMMWORD[((416+16))+rsp]
1586	por	xmm2,xmm0
1587	por	xmm3,xmm1
1588	movdqu	XMMWORD[32+rdi],xmm2
1589	movdqu	XMMWORD[48+rdi],xmm3
1590
1591$L$add_doneq:
1592	add	rsp,32*18+8
1593	pop	r15
1594	pop	r14
1595	pop	r13
1596	pop	r12
1597	pop	rbx
1598	pop	rbp
1599	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1600	mov	rsi,QWORD[16+rsp]
1601	DB	0F3h,0C3h		;repret
1602$L$SEH_end_ecp_nistz256_point_add:
;-----------------------------------------------------------------------
; ecp_nistz256_point_add_affine
;
; ABI:   entered with the Microsoft x64 argument registers (rcx, rdx, r8).
;        The prologue saves rdi/rsi in the caller-provided slots at
;        [8+rsp]/[16+rsp] and moves the arguments into rdi, rsi, rdx,
;        which is what the body uses.
; In:    rcx -> rdi = result, 96 bytes written (three 32-byte values)
;        rdx -> rsi = in1,    96 bytes read    (three 32-byte values,
;                             referred to below as X1, Y1, Z1)
;        r8  -> rdx = in2,    64 bytes read    (two 32-byte values,
;                             referred to below as X2, Y2)
; Out:   result = (X3, Y3, Z3) at offsets 0, 32, 64 of the result buffer.
;        If all 64 bytes of in2 are zero, in1 is copied to the result.
;        Otherwise, if the first 64 bytes of in1 are zero, the result is
;        (X2, Y2, $L$ONE_mont).  The selection is done with masks; the
;        function contains no branches.
; Clobb: rax, rcx, rdx, r8-r11, xmm0-xmm5, flags.  rbx, rbp, r12-r15 are
;        pushed/popped; rdi and rsi are restored in the epilogue.
; Stack: 6 pushes + 32*15+8 bytes.  With the usual rsp = 16n+8 on entry
;        this leaves rsp 16-byte aligned, which the movdqa accesses to
;        the frame require.
;
; Frame layout (offsets from rsp, 32 bytes each):
;     0  X2*Z1^2, later X1*H^2        160  H^3
;    32  Z1^2, Z1^3, Y2*Z1^3, Y1*H^3  192  R^2
;    64  H, later X1*H^2 - X3         224  X3 (computed)
;    96  R                            256  Y3 (computed)
;   128  H^2                          288  Z3 (computed)
;   320/352/384  copy of X1/Y1/Z1     416/448  copy of X2/Y2
;
; NOTE(review): the __ecp_nistz256_* helpers are defined outside this
; block.  The step comments below infer their behaviour from their names
; and from the registers set up before each call (sqr: operand limbs in
; rax,r14,r15,r8, rsi = operand, rdi = result; mul: rbx = b, rax = b[0],
; rsi = a, a's limbs in r9-r12, rdi = result, product also left in
; r12,r13,r8,r9; sub_from: subtracts [rbx] from r12,r13,r8,r9 and stores
; to [rdi]; sub: (rax,rbp,rcx,r10) - (r12,r13,r8,r9) left in registers).
; The helpers are also assumed to preserve xmm0, xmm3, xmm4 and xmm5.
; Confirm against their definitions.
;-----------------------------------------------------------------------
1603global	ecp_nistz256_point_add_affine
1604
1605ALIGN	32
1606ecp_nistz256_point_add_affine:
1607	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1608	mov	QWORD[16+rsp],rsi
1609	mov	rax,rsp
1610$L$SEH_begin_ecp_nistz256_point_add_affine:
1611	mov	rdi,rcx
1612	mov	rsi,rdx
1613	mov	rdx,r8
1614
1615
1616	push	rbp
1617	push	rbx
1618	push	r12
1619	push	r13
1620	push	r14
1621	push	r15
1622	sub	rsp,32*15+8
1623
; Copy in1 to the frame (320..415) and start OR-ing X1|Y1 together in
; xmm3.  Z1's four limbs are also loaded into rax,r14,r15,r8 for the
; squaring call below.  rbx keeps the in2 pointer.
1624	movdqu	xmm0,XMMWORD[rsi]
1625	mov	rbx,rdx
1626	movdqu	xmm1,XMMWORD[16+rsi]
1627	movdqu	xmm2,XMMWORD[32+rsi]
1628	movdqu	xmm3,XMMWORD[48+rsi]
1629	movdqu	xmm4,XMMWORD[64+rsi]
1630	movdqu	xmm5,XMMWORD[80+rsi]
1631	mov	rax,QWORD[((64+0))+rsi]
1632	mov	r14,QWORD[((64+8))+rsi]
1633	mov	r15,QWORD[((64+16))+rsi]
1634	mov	r8,QWORD[((64+24))+rsi]
1635	movdqa	XMMWORD[320+rsp],xmm0
1636	movdqa	XMMWORD[(320+16)+rsp],xmm1
1637	por	xmm1,xmm0
1638	movdqa	XMMWORD[352+rsp],xmm2
1639	movdqa	XMMWORD[(352+16)+rsp],xmm3
1640	por	xmm3,xmm2
1641	movdqa	XMMWORD[384+rsp],xmm4
1642	movdqa	XMMWORD[(384+16)+rsp],xmm5
1643	por	xmm3,xmm1
1644
; Copy in2 to the frame (416..479) and OR X2|Y2 together in xmm3, while
; folding the in1 OR (moved to xmm5) down so that dword 0 of xmm5 holds
; the OR of all sixteen dwords of X1|Y1.
1645	movdqu	xmm0,XMMWORD[rbx]
1646	pshufd	xmm5,xmm3,0xb1
1647	movdqu	xmm1,XMMWORD[16+rbx]
1648	movdqu	xmm2,XMMWORD[32+rbx]
1649	por	xmm5,xmm3
1650	movdqu	xmm3,XMMWORD[48+rbx]
1651	movdqa	XMMWORD[416+rsp],xmm0
1652	pshufd	xmm4,xmm5,0x1e
1653	movdqa	XMMWORD[(416+16)+rsp],xmm1
1654	por	xmm1,xmm0
; The bytes below encode "movq xmm0,rdi": the result pointer is parked in
; xmm0 because rdi is reused as the destination argument of every helper
; call.  It is restored by the matching "movq rdi,xmm0" further down.
1655DB	102,72,15,110,199
1656	movdqa	XMMWORD[448+rsp],xmm2
1657	movdqa	XMMWORD[(448+16)+rsp],xmm3
1658	por	xmm3,xmm2
1659	por	xmm5,xmm4
1660	pxor	xmm4,xmm4
1661	por	xmm3,xmm1
1662
; [32] = Z1^2
1663	lea	rsi,[((64-0))+rsi]
1664	lea	rdi,[32+rsp]
1665	call	__ecp_nistz256_sqr_montq
1666
; Finish the two selection masks, interleaved with setting up the next
; multiplication (rax = X2[0], r9..r12 = limbs taken from r12..r15):
;   xmm5 = all-ones iff X1|Y1 is entirely zero
;   xmm4 = all-ones iff X2|Y2 is entirely zero
1667	pcmpeqd	xmm5,xmm4
1668	pshufd	xmm4,xmm3,0xb1
1669	mov	rax,QWORD[rbx]
1670
1671	mov	r9,r12
1672	por	xmm4,xmm3
1673	pshufd	xmm5,xmm5,0
1674	pshufd	xmm3,xmm4,0x1e
1675	mov	r10,r13
1676	por	xmm4,xmm3
1677	pxor	xmm3,xmm3
1678	mov	r11,r14
1679	pcmpeqd	xmm4,xmm3
1680	pshufd	xmm4,xmm4,0
1681
; [0] = X2 * Z1^2   (rbx still points at in2)
1682	lea	rsi,[((32-0))+rsp]
1683	mov	r12,r15
1684	lea	rdi,[rsp]
1685	call	__ecp_nistz256_mul_montq
1686
; [64] = H = X2*Z1^2 - X1
1687	lea	rbx,[320+rsp]
1688	lea	rdi,[64+rsp]
1689	call	__ecp_nistz256_sub_fromq
1690
; [32] = Z1^2 * Z1 = Z1^3
1691	mov	rax,QWORD[384+rsp]
1692	lea	rbx,[384+rsp]
1693	mov	r9,QWORD[((0+32))+rsp]
1694	mov	r10,QWORD[((8+32))+rsp]
1695	lea	rsi,[((0+32))+rsp]
1696	mov	r11,QWORD[((16+32))+rsp]
1697	mov	r12,QWORD[((24+32))+rsp]
1698	lea	rdi,[32+rsp]
1699	call	__ecp_nistz256_mul_montq
1700
; [288] = Z3 = H * Z1
1701	mov	rax,QWORD[384+rsp]
1702	lea	rbx,[384+rsp]
1703	mov	r9,QWORD[((0+64))+rsp]
1704	mov	r10,QWORD[((8+64))+rsp]
1705	lea	rsi,[((0+64))+rsp]
1706	mov	r11,QWORD[((16+64))+rsp]
1707	mov	r12,QWORD[((24+64))+rsp]
1708	lea	rdi,[288+rsp]
1709	call	__ecp_nistz256_mul_montq
1710
; [32] = Y2 * Z1^3
1711	mov	rax,QWORD[448+rsp]
1712	lea	rbx,[448+rsp]
1713	mov	r9,QWORD[((0+32))+rsp]
1714	mov	r10,QWORD[((8+32))+rsp]
1715	lea	rsi,[((0+32))+rsp]
1716	mov	r11,QWORD[((16+32))+rsp]
1717	mov	r12,QWORD[((24+32))+rsp]
1718	lea	rdi,[32+rsp]
1719	call	__ecp_nistz256_mul_montq
1720
; [96] = R = Y2*Z1^3 - Y1
1721	lea	rbx,[352+rsp]
1722	lea	rdi,[96+rsp]
1723	call	__ecp_nistz256_sub_fromq
1724
; [128] = H^2
1725	mov	rax,QWORD[((0+64))+rsp]
1726	mov	r14,QWORD[((8+64))+rsp]
1727	lea	rsi,[((0+64))+rsp]
1728	mov	r15,QWORD[((16+64))+rsp]
1729	mov	r8,QWORD[((24+64))+rsp]
1730	lea	rdi,[128+rsp]
1731	call	__ecp_nistz256_sqr_montq
1732
; [192] = R^2
1733	mov	rax,QWORD[((0+96))+rsp]
1734	mov	r14,QWORD[((8+96))+rsp]
1735	lea	rsi,[((0+96))+rsp]
1736	mov	r15,QWORD[((16+96))+rsp]
1737	mov	r8,QWORD[((24+96))+rsp]
1738	lea	rdi,[192+rsp]
1739	call	__ecp_nistz256_sqr_montq
1740
; [160] = H^2 * H = H^3
1741	mov	rax,QWORD[128+rsp]
1742	lea	rbx,[128+rsp]
1743	mov	r9,QWORD[((0+64))+rsp]
1744	mov	r10,QWORD[((8+64))+rsp]
1745	lea	rsi,[((0+64))+rsp]
1746	mov	r11,QWORD[((16+64))+rsp]
1747	mov	r12,QWORD[((24+64))+rsp]
1748	lea	rdi,[160+rsp]
1749	call	__ecp_nistz256_mul_montq
1750
; [0] = X1 * H^2
1751	mov	rax,QWORD[320+rsp]
1752	lea	rbx,[320+rsp]
1753	mov	r9,QWORD[((0+128))+rsp]
1754	mov	r10,QWORD[((8+128))+rsp]
1755	lea	rsi,[((0+128))+rsp]
1756	mov	r11,QWORD[((16+128))+rsp]
1757	mov	r12,QWORD[((24+128))+rsp]
1758	lea	rdi,[rsp]
1759	call	__ecp_nistz256_mul_montq
1760
1761
1762
; Inline modular doubling of the product in r12,r13,r8,r9.
; The carry out of the 256-bit add is kept as a fifth limb in r11 and the
; modulus (limbs -1, r14, 0, r15; r14/r15 are presumably $L$poly[1] and
; $L$poly[3] as left by the helper) is subtracted from the full 257-bit
; value.  A final borrow (CF) means the doubled value was already below
; the modulus, so the saved copy in rax,rbp,rcx,r10 is restored.  This
; always yields a fully reduced result; testing only the add's carry
; would leave values in [modulus, 2^256) unreduced.
; The loads interleaved with the cmovc's fetch R^2 ([192]) as the minuend
; for the subtraction that follows; mov does not disturb CF.
1763	xor	r11,r11
1764	add	r12,r12
1765	lea	rsi,[192+rsp]
1766	adc	r13,r13
1767	mov	rax,r12
1768	adc	r8,r8
1769	adc	r9,r9
1770	mov	rbp,r13
1771	adc	r11,0
1772
1773	sub	r12,-1
1774	mov	rcx,r8
1775	sbb	r13,r14
1776	sbb	r8,0
1777	mov	r10,r9
1778	sbb	r9,r15
1779	sbb	r11,0
1780
1781	cmovc	r12,rax
1782	mov	rax,QWORD[rsi]
1783	cmovc	r13,rbp
1784	mov	rbp,QWORD[8+rsi]
1785	cmovc	r8,rcx
1786	mov	rcx,QWORD[16+rsi]
1787	cmovc	r9,r10
1788	mov	r10,QWORD[24+rsi]
1789
; registers = R^2 - 2*X1*H^2
1790	call	__ecp_nistz256_subq
1791
; [224] = X3 = R^2 - 2*X1*H^2 - H^3
1792	lea	rbx,[160+rsp]
1793	lea	rdi,[224+rsp]
1794	call	__ecp_nistz256_sub_fromq
1795
; [64] = X1*H^2 - X3   (the H slot is reused; stored explicitly below)
1796	mov	rax,QWORD[((0+0))+rsp]
1797	mov	rbp,QWORD[((0+8))+rsp]
1798	mov	rcx,QWORD[((0+16))+rsp]
1799	mov	r10,QWORD[((0+24))+rsp]
1800	lea	rdi,[64+rsp]
1801
1802	call	__ecp_nistz256_subq
1803
1804	mov	QWORD[rdi],r12
1805	mov	QWORD[8+rdi],r13
1806	mov	QWORD[16+rdi],r8
1807	mov	QWORD[24+rdi],r9
; [32] = Y1 * H^3
1808	mov	rax,QWORD[352+rsp]
1809	lea	rbx,[352+rsp]
1810	mov	r9,QWORD[((0+160))+rsp]
1811	mov	r10,QWORD[((8+160))+rsp]
1812	lea	rsi,[((0+160))+rsp]
1813	mov	r11,QWORD[((16+160))+rsp]
1814	mov	r12,QWORD[((24+160))+rsp]
1815	lea	rdi,[32+rsp]
1816	call	__ecp_nistz256_mul_montq
1817
; [64] = R * (X1*H^2 - X3)
1818	mov	rax,QWORD[96+rsp]
1819	lea	rbx,[96+rsp]
1820	mov	r9,QWORD[((0+64))+rsp]
1821	mov	r10,QWORD[((8+64))+rsp]
1822	lea	rsi,[((0+64))+rsp]
1823	mov	r11,QWORD[((16+64))+rsp]
1824	mov	r12,QWORD[((24+64))+rsp]
1825	lea	rdi,[64+rsp]
1826	call	__ecp_nistz256_mul_montq
1827
; [256] = Y3 = R*(X1*H^2 - X3) - Y1*H^3
1828	lea	rbx,[32+rsp]
1829	lea	rdi,[256+rsp]
1830	call	__ecp_nistz256_sub_fromq
1831
; The bytes below encode "movq rdi,xmm0": recover the result pointer.
1832DB	102,72,15,126,199
1833
; Masked selection of each output coordinate, 16 bytes at a time:
;   tmp = xmm5 ? alternative : computed
;   out = xmm4 ? in1 copy    : tmp
;
; Z3 -> [64+rdi]: computed [288], alternative $L$ONE_mont, in1 copy [384]
1834	movdqa	xmm0,xmm5
1835	movdqa	xmm1,xmm5
1836	pandn	xmm0,XMMWORD[288+rsp]
1837	movdqa	xmm2,xmm5
1838	pandn	xmm1,XMMWORD[((288+16))+rsp]
1839	movdqa	xmm3,xmm5
1840	pand	xmm2,XMMWORD[$L$ONE_mont]
1841	pand	xmm3,XMMWORD[(($L$ONE_mont+16))]
1842	por	xmm2,xmm0
1843	por	xmm3,xmm1
1844
1845	movdqa	xmm0,xmm4
1846	movdqa	xmm1,xmm4
1847	pandn	xmm0,xmm2
1848	movdqa	xmm2,xmm4
1849	pandn	xmm1,xmm3
1850	movdqa	xmm3,xmm4
1851	pand	xmm2,XMMWORD[384+rsp]
1852	pand	xmm3,XMMWORD[((384+16))+rsp]
1853	por	xmm2,xmm0
1854	por	xmm3,xmm1
1855	movdqu	XMMWORD[64+rdi],xmm2
1856	movdqu	XMMWORD[80+rdi],xmm3
1857
; X3 -> [rdi]: computed [224], alternative X2 [416], in1 copy X1 [320]
1858	movdqa	xmm0,xmm5
1859	movdqa	xmm1,xmm5
1860	pandn	xmm0,XMMWORD[224+rsp]
1861	movdqa	xmm2,xmm5
1862	pandn	xmm1,XMMWORD[((224+16))+rsp]
1863	movdqa	xmm3,xmm5
1864	pand	xmm2,XMMWORD[416+rsp]
1865	pand	xmm3,XMMWORD[((416+16))+rsp]
1866	por	xmm2,xmm0
1867	por	xmm3,xmm1
1868
1869	movdqa	xmm0,xmm4
1870	movdqa	xmm1,xmm4
1871	pandn	xmm0,xmm2
1872	movdqa	xmm2,xmm4
1873	pandn	xmm1,xmm3
1874	movdqa	xmm3,xmm4
1875	pand	xmm2,XMMWORD[320+rsp]
1876	pand	xmm3,XMMWORD[((320+16))+rsp]
1877	por	xmm2,xmm0
1878	por	xmm3,xmm1
1879	movdqu	XMMWORD[rdi],xmm2
1880	movdqu	XMMWORD[16+rdi],xmm3
1881
; Y3 -> [32+rdi]: computed [256], alternative Y2 [448], in1 copy Y1 [352]
1882	movdqa	xmm0,xmm5
1883	movdqa	xmm1,xmm5
1884	pandn	xmm0,XMMWORD[256+rsp]
1885	movdqa	xmm2,xmm5
1886	pandn	xmm1,XMMWORD[((256+16))+rsp]
1887	movdqa	xmm3,xmm5
1888	pand	xmm2,XMMWORD[448+rsp]
1889	pand	xmm3,XMMWORD[((448+16))+rsp]
1890	por	xmm2,xmm0
1891	por	xmm3,xmm1
1892
1893	movdqa	xmm0,xmm4
1894	movdqa	xmm1,xmm4
1895	pandn	xmm0,xmm2
1896	movdqa	xmm2,xmm4
1897	pandn	xmm1,xmm3
1898	movdqa	xmm3,xmm4
1899	pand	xmm2,XMMWORD[352+rsp]
1900	pand	xmm3,XMMWORD[((352+16))+rsp]
1901	por	xmm2,xmm0
1902	por	xmm3,xmm1
1903	movdqu	XMMWORD[32+rdi],xmm2
1904	movdqu	XMMWORD[48+rdi],xmm3
1905
; Release the frame and restore callee-saved registers in reverse order.
1906	add	rsp,32*15+8
1907	pop	r15
1908	pop	r14
1909	pop	r13
1910	pop	r12
1911	pop	rbx
1912	pop	rbp
1913	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1914	mov	rsi,QWORD[16+rsp]
1915	DB	0F3h,0C3h		;repret
1916$L$SEH_end_ecp_nistz256_point_add_affine:
1917