p256-x86_64-asm.asm revision bb1ceac29bc7a18b94e3da78057dc41aa7071784
1default	rel
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section	.text code align=64
6
7EXTERN	OPENSSL_ia32cap_P
8
9
; Read-only constants used by the field-arithmetic and table-select routines.
10ALIGN	64
; $L$poly: the modulus p = 2^256 - 2^224 + 2^192 + 2^96 - 1 as four 64-bit
; limbs, least-significant limb first. Routines in this file load limbs 1 and
; 3 from here (offsets +8 and +24) and hard-code limb 0 as -1 and limb 2 as 0.
11$L$poly:
12	DQ	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
13
; $L$One: eight dwords of 1. The SSE select routines load its first 16 bytes as
; both the initial per-lane entry counter and the per-iteration increment.
14$L$One:
15	DD	1,1,1,1,1,1,1,1
; $L$Two / $L$Three: eight dwords of 2 / 3. Not referenced in the visible part
; of this file.
16$L$Two:
17	DD	2,2,2,2,2,2,2,2
18$L$Three:
19	DD	3,3,3,3,3,3,3,3
; $L$ONE_mont: the value 2^256 - p (= 2^256 mod p), limbs low to high;
; presumably the constant 1 in Montgomery form for R = 2^256. Not referenced in
; the visible part of this file.
20$L$ONE_mont:
21	DQ	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
22
23
;-----------------------------------------------------------------------
; ecp_nistz256_mul_by_2 -- res = (a + a) mod p, with p = $L$poly.
; ABI:    Win64 entry: rcx = res, rdx = a (four 64-bit limbs each, low limb
;         first). The arguments are copied to rdi/rsi; because rdi/rsi are
;         callee-saved on Win64 they are spilled to the caller's home space
;         and reloaded before returning.
; Out:    res[0..3] = 2*a, reduced below p when a < p.
; Clobb:  rax, rcx, rdx, r8-r11, flags (r12/r13 are pushed and popped).
;
; r13 holds the carry out of the 256-bit doubling as 0/1. The borrow of the
; trial subtraction of p is then folded into it with "sbb r13,0", so CF is
; set exactly when a + a < p, and only then is the unsubtracted sum kept.
; A sum in [p, 2^256) that produces no carry is therefore still reduced.
;-----------------------------------------------------------------------
ALIGN	64
ecp_nistz256_mul_by_2:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_ecp_nistz256_mul_by_2:
	mov	rdi,rcx
	mov	rsi,rdx


	push	r12
	push	r13

	mov	r8,QWORD[rsi]
	xor	r13,r13			; carry accumulator; must precede the add (xor clears CF)
	mov	r9,QWORD[8+rsi]
	add	r8,r8			; r8..r11 = a + a, carry propagated limb by limb
	mov	r10,QWORD[16+rsi]
	adc	r9,r9
	mov	r11,QWORD[24+rsi]
	lea	rsi,[$L$poly]		; rsi now points at the modulus
	mov	rax,r8			; rax,rdx,rcx,r12 = copy of the unreduced sum
	adc	r10,r10
	adc	r11,r11
	mov	rdx,r9
	adc	r13,0			; r13 = carry out of bit 255 (0 or 1)

	sub	r8,QWORD[rsi]		; trial subtraction: sum - p
	mov	rcx,r10
	sbb	r9,QWORD[8+rsi]
	sbb	r10,QWORD[16+rsi]
	mov	r12,r11
	sbb	r11,QWORD[24+rsi]
	sbb	r13,0			; CF = 1 iff the 257-bit sum is below p

	cmovc	r8,rax			; sum < p: keep the unsubtracted sum
	cmovc	r9,rdx
	mov	QWORD[rdi],r8
	cmovc	r10,rcx
	mov	QWORD[8+rdi],r9
	cmovc	r11,r12
	mov	QWORD[16+rdi],r10
	mov	QWORD[24+rdi],r11

	pop	r13
	pop	r12
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret
$L$SEH_end_ecp_nistz256_mul_by_2:
73
74
75
; ecp_nistz256_neg -- res = (0 - a) mod p, with p = $L$poly.
; Win64 entry: rcx = res, rdx = a (four 64-bit limbs, low limb first); the
; arguments are moved to rdi/rsi, whose caller values are spilled to the home
; space and reloaded on exit. Branch-free: both 0 - a and 0 - a + p are
; computed and one is selected with cmov.
76global	ecp_nistz256_neg
77
78ALIGN	32
79ecp_nistz256_neg:
80	mov	QWORD[8+rsp],rdi	;WIN64 prologue
81	mov	QWORD[16+rsp],rsi
82	mov	rax,rsp
83$L$SEH_begin_ecp_nistz256_neg:
84	mov	rdi,rcx
85	mov	rsi,rdx
86
87
88	push	r12
89	push	r13
90
91	xor	r8,r8
92	xor	r9,r9
93	xor	r10,r10
94	xor	r11,r11
95	xor	r13,r13
96
97	sub	r8,QWORD[rsi]	; r8..r11 = 0 - a
98	sbb	r9,QWORD[8+rsi]
99	sbb	r10,QWORD[16+rsi]
100	mov	rax,r8	; rax,rdx,rcx,r12 keep the un-adjusted difference
101	sbb	r11,QWORD[24+rsi]
102	lea	rsi,[$L$poly]
103	mov	rdx,r9
104	sbb	r13,0	; r13 = -1 if the subtraction borrowed (a != 0), else 0
105
106	add	r8,QWORD[rsi]	; r8..r11 = (0 - a) + p
107	mov	rcx,r10
108	adc	r9,QWORD[8+rsi]
109	adc	r10,QWORD[16+rsi]
110	mov	r12,r11
111	adc	r11,QWORD[24+rsi]
112	test	r13,r13
113
114	cmovz	r8,rax	; no borrow (a == 0): keep the plain difference, i.e. 0
115	cmovz	r9,rdx
116	mov	QWORD[rdi],r8
117	cmovz	r10,rcx
118	mov	QWORD[8+rdi],r9
119	cmovz	r11,r12
120	mov	QWORD[16+rdi],r10
121	mov	QWORD[24+rdi],r11
122
123	pop	r13
124	pop	r12
125	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
126	mov	rsi,QWORD[16+rsp]
127	DB	0F3h,0C3h		;repret
128$L$SEH_end_ecp_nistz256_neg:
129
130
131
132
133
134
; ecp_nistz256_mul_mont -- public wrapper around __ecp_nistz256_mul_montq.
; Win64 entry: rcx = res, rdx = a, r8 = b; these are moved to rdi, rsi, rdx.
; Saves the six callee-saved GPRs the worker overwrites, loads b[0] into rax
; and a[0..3] into r9..r12 as the worker expects, and keeps b in rbx.
135global	ecp_nistz256_mul_mont
136
137ALIGN	32
138ecp_nistz256_mul_mont:
139	mov	QWORD[8+rsp],rdi	;WIN64 prologue
140	mov	QWORD[16+rsp],rsi
141	mov	rax,rsp
142$L$SEH_begin_ecp_nistz256_mul_mont:
143	mov	rdi,rcx
144	mov	rsi,rdx
145	mov	rdx,r8
146
147
148$L$mul_mont:
149	push	rbp
150	push	rbx
151	push	r12
152	push	r13
153	push	r14
154	push	r15
155	mov	rbx,rdx	; rbx = b (the worker reads b[1..3] through rbx)
156	mov	rax,QWORD[rdx]	; rax = b[0]
157	mov	r9,QWORD[rsi]	; r9..r12 = a[0..3]
158	mov	r10,QWORD[8+rsi]
159	mov	r11,QWORD[16+rsi]
160	mov	r12,QWORD[24+rsi]
161
162	call	__ecp_nistz256_mul_montq
163$L$mul_mont_done:
164	pop	r15
165	pop	r14
166	pop	r13
167	pop	r12
168	pop	rbx
169	pop	rbp
170	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
171	mov	rsi,QWORD[16+rsp]
172	DB	0F3h,0C3h		;repret
173$L$SEH_end_ecp_nistz256_mul_mont:
174
175
; __ecp_nistz256_mul_montq -- internal Montgomery multiplication worker.
; Private register convention (not a C ABI):
;   In:   rax = b[0], rbx = &b, rsi = &a, r9..r12 = a[0..3], rdi = &res
;   Out:  res[0..3] stored at [rdi] and also left in r12, r13, r8, r9;
;         r14 = $L$poly[1], r15 = $L$poly[3] (callers reuse these)
;   Clobbers: rax, rbx, rcx, rdx, rbp, r8-r13, flags
; Structure: four rounds, each multiplying a by one limb of b and accumulating,
; interleaved with four reduction steps. Since p = -1 mod 2^64 the reduction
; multiplier is the low accumulator limb itself: each step adds
; (low << 96) + low * poly[3] * 2^192 and retires the low limb, which is then
; re-zeroed and reused as the new top limb.
176ALIGN	32
177__ecp_nistz256_mul_montq:
178
179
; Round 0: acc = a * b[0]  (r8 low .. r12 high, r13 = 0)
180	mov	rbp,rax
181	mul	r9
182	mov	r14,QWORD[(($L$poly+8))]
183	mov	r8,rax
184	mov	rax,rbp
185	mov	r9,rdx
186
187	mul	r10
188	mov	r15,QWORD[(($L$poly+24))]
189	add	r9,rax
190	mov	rax,rbp
191	adc	rdx,0
192	mov	r10,rdx
193
194	mul	r11
195	add	r10,rax
196	mov	rax,rbp
197	adc	rdx,0
198	mov	r11,rdx
199
200	mul	r12
201	add	r11,rax
202	mov	rax,r8
203	adc	rdx,0
204	xor	r13,r13
205	mov	r12,rdx
206
207
208
209
210
211
212
213
214
215
; Reduction step 1: retire r8; rax (= r8) is multiplied by poly[3]
216	mov	rbp,r8
217	shl	r8,32
218	mul	r15
219	shr	rbp,32
220	add	r9,r8
221	adc	r10,rbp
222	adc	r11,rax
223	mov	rax,QWORD[8+rbx]	; prefetch b[1] for the next round
224	adc	r12,rdx
225	adc	r13,0
226	xor	r8,r8
227
228
229
; Round 1: acc += a * b[1]  (r9 low .. r13, carry into r8)
230	mov	rbp,rax
231	mul	QWORD[rsi]
232	add	r9,rax
233	mov	rax,rbp
234	adc	rdx,0
235	mov	rcx,rdx
236
237	mul	QWORD[8+rsi]
238	add	r10,rcx
239	adc	rdx,0
240	add	r10,rax
241	mov	rax,rbp
242	adc	rdx,0
243	mov	rcx,rdx
244
245	mul	QWORD[16+rsi]
246	add	r11,rcx
247	adc	rdx,0
248	add	r11,rax
249	mov	rax,rbp
250	adc	rdx,0
251	mov	rcx,rdx
252
253	mul	QWORD[24+rsi]
254	add	r12,rcx
255	adc	rdx,0
256	add	r12,rax
257	mov	rax,r9
258	adc	r13,rdx
259	adc	r8,0
260
261
262
; Reduction step 2: retire r9
263	mov	rbp,r9
264	shl	r9,32
265	mul	r15
266	shr	rbp,32
267	add	r10,r9
268	adc	r11,rbp
269	adc	r12,rax
270	mov	rax,QWORD[16+rbx]	; prefetch b[2]
271	adc	r13,rdx
272	adc	r8,0
273	xor	r9,r9
274
275
276
; Round 2: acc += a * b[2]  (r10 low .. r8, carry into r9)
277	mov	rbp,rax
278	mul	QWORD[rsi]
279	add	r10,rax
280	mov	rax,rbp
281	adc	rdx,0
282	mov	rcx,rdx
283
284	mul	QWORD[8+rsi]
285	add	r11,rcx
286	adc	rdx,0
287	add	r11,rax
288	mov	rax,rbp
289	adc	rdx,0
290	mov	rcx,rdx
291
292	mul	QWORD[16+rsi]
293	add	r12,rcx
294	adc	rdx,0
295	add	r12,rax
296	mov	rax,rbp
297	adc	rdx,0
298	mov	rcx,rdx
299
300	mul	QWORD[24+rsi]
301	add	r13,rcx
302	adc	rdx,0
303	add	r13,rax
304	mov	rax,r10
305	adc	r8,rdx
306	adc	r9,0
307
308
309
; Reduction step 3: retire r10
310	mov	rbp,r10
311	shl	r10,32
312	mul	r15
313	shr	rbp,32
314	add	r11,r10
315	adc	r12,rbp
316	adc	r13,rax
317	mov	rax,QWORD[24+rbx]	; prefetch b[3]
318	adc	r8,rdx
319	adc	r9,0
320	xor	r10,r10
321
322
323
; Round 3: acc += a * b[3]  (r11 low .. r9, carry into r10)
324	mov	rbp,rax
325	mul	QWORD[rsi]
326	add	r11,rax
327	mov	rax,rbp
328	adc	rdx,0
329	mov	rcx,rdx
330
331	mul	QWORD[8+rsi]
332	add	r12,rcx
333	adc	rdx,0
334	add	r12,rax
335	mov	rax,rbp
336	adc	rdx,0
337	mov	rcx,rdx
338
339	mul	QWORD[16+rsi]
340	add	r13,rcx
341	adc	rdx,0
342	add	r13,rax
343	mov	rax,rbp
344	adc	rdx,0
345	mov	rcx,rdx
346
347	mul	QWORD[24+rsi]
348	add	r8,rcx
349	adc	rdx,0
350	add	r8,rax
351	mov	rax,r11
352	adc	r9,rdx
353	adc	r10,0
354
355
356
; Reduction step 4: retire r11; result candidate is r12,r13,r8,r9 with r10 as
; the overflow limb. rcx/rbp (and rbx/rdx below) snapshot it before subtracting.
357	mov	rbp,r11
358	shl	r11,32
359	mul	r15
360	shr	rbp,32
361	add	r12,r11
362	adc	r13,rbp
363	mov	rcx,r12
364	adc	r8,rax
365	adc	r9,rdx
366	mov	rbp,r13
367	adc	r10,0
368
369
370
; Branch-free final subtraction of p (limbs: -1, r14, 0, r15); the borrow runs
; through the overflow limb r10, so CF = 1 means the candidate was below p.
371	sub	r12,-1
372	mov	rbx,r8	; rbx is reused as scratch here; the b pointer is lost
373	sbb	r13,r14
374	sbb	r8,0
375	mov	rdx,r9
376	sbb	r9,r15
377	sbb	r10,0
378
379	cmovc	r12,rcx	; borrow: restore the unsubtracted candidate
380	cmovc	r13,rbp
381	mov	QWORD[rdi],r12
382	cmovc	r8,rbx
383	mov	QWORD[8+rdi],r13
384	cmovc	r9,rdx
385	mov	QWORD[16+rdi],r8
386	mov	QWORD[24+rdi],r9
387
388	DB	0F3h,0C3h		;repret
389
390
391
392
393
394
395
396
397
; ecp_nistz256_sqr_mont -- public wrapper around __ecp_nistz256_sqr_montq.
; Win64 entry: rcx = res, rdx = a; these are moved to rdi, rsi.
; Saves the six callee-saved GPRs the worker overwrites and loads a[0..3]
; into rax, r14, r15, r8 as the worker expects.
398global	ecp_nistz256_sqr_mont
399
400ALIGN	32
401ecp_nistz256_sqr_mont:
402	mov	QWORD[8+rsp],rdi	;WIN64 prologue
403	mov	QWORD[16+rsp],rsi
404	mov	rax,rsp
405$L$SEH_begin_ecp_nistz256_sqr_mont:
406	mov	rdi,rcx
407	mov	rsi,rdx
408
409
410	push	rbp
411	push	rbx
412	push	r12
413	push	r13
414	push	r14
415	push	r15
416	mov	rax,QWORD[rsi]	; rax, r14, r15, r8 = a[0], a[1], a[2], a[3]
417	mov	r14,QWORD[8+rsi]
418	mov	r15,QWORD[16+rsi]
419	mov	r8,QWORD[24+rsi]
420
421	call	__ecp_nistz256_sqr_montq
422$L$sqr_mont_done:
423	pop	r15
424	pop	r14
425	pop	r13
426	pop	r12
427	pop	rbx
428	pop	rbp
429	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
430	mov	rsi,QWORD[16+rsp]
431	DB	0F3h,0C3h		;repret
432$L$SEH_end_ecp_nistz256_sqr_mont:
433
434
; __ecp_nistz256_sqr_montq -- internal Montgomery squaring worker.
; Private register convention (not a C ABI):
;   In:   rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3], rsi = &a, rdi = &res
;   Out:  res[0..3] stored at [rdi] and also left in r12, r13, r14, r15;
;         rsi = $L$poly[1], rbp = $L$poly[3] (the input pointer in rsi is lost;
;         ecp_nistz256_point_double reads both values after the call)
;   Clobbers: rax, rcx, rdx, rbp, rsi, r8-r15, flags
; Structure: the six cross products a[i]*a[j] (i < j) are accumulated once and
; doubled, the four squares a[i]^2 are added, then four reduction steps are
; applied to the low half and the high half is added in before the final
; conditional subtraction of p.
435ALIGN	32
436__ecp_nistz256_sqr_montq:
437	mov	r13,rax
438	mul	r14	; a[0]*a[1]
439	mov	r9,rax
440	mov	rax,r15
441	mov	r10,rdx
442
443	mul	r13	; a[0]*a[2]
444	add	r10,rax
445	mov	rax,r8
446	adc	rdx,0
447	mov	r11,rdx
448
449	mul	r13	; a[0]*a[3]
450	add	r11,rax
451	mov	rax,r15
452	adc	rdx,0
453	mov	r12,rdx
454
455
456	mul	r14	; a[1]*a[2]
457	add	r11,rax
458	mov	rax,r8
459	adc	rdx,0
460	mov	rbp,rdx
461
462	mul	r14	; a[1]*a[3]
463	add	r12,rax
464	mov	rax,r8
465	adc	rdx,0
466	add	r12,rbp
467	mov	r13,rdx
468	adc	r13,0
469
470
471	mul	r15	; a[2]*a[3]
472	xor	r15,r15
473	add	r13,rax
474	mov	rax,QWORD[rsi]
475	mov	r14,rdx
476	adc	r14,0
477
; Double the cross products: r9..r14 <<= 1, top bit into r15
478	add	r9,r9
479	adc	r10,r10
480	adc	r11,r11
481	adc	r12,r12
482	adc	r13,r13
483	adc	r14,r14
484	adc	r15,0
485
; Add the squares a[i]^2; r8 receives the lowest limb of the 512-bit product
486	mul	rax
487	mov	r8,rax
488	mov	rax,QWORD[8+rsi]
489	mov	rcx,rdx
490
491	mul	rax
492	add	r9,rcx
493	adc	r10,rax
494	mov	rax,QWORD[16+rsi]
495	adc	rdx,0
496	mov	rcx,rdx
497
498	mul	rax
499	add	r11,rcx
500	adc	r12,rax
501	mov	rax,QWORD[24+rsi]
502	adc	rdx,0
503	mov	rcx,rdx
504
505	mul	rax
506	add	r13,rcx
507	adc	r14,rax
508	mov	rax,r8
509	adc	r15,rdx
510
511	mov	rsi,QWORD[(($L$poly+8))]	; rsi is repurposed: poly[1]
512	mov	rbp,QWORD[(($L$poly+24))]	; rbp = poly[3]
513
514
515
516
; Reduction step 1: retire r8 (adds r8 << 96 and r8 * poly[3] << 192)
517	mov	rcx,r8
518	shl	r8,32
519	mul	rbp
520	shr	rcx,32
521	add	r9,r8
522	adc	r10,rcx
523	adc	r11,rax
524	mov	rax,r9
525	adc	rdx,0
526
527
528
; Reduction step 2: retire r9; r8 takes the carry-out limb of step 1
529	mov	rcx,r9
530	shl	r9,32
531	mov	r8,rdx
532	mul	rbp
533	shr	rcx,32
534	add	r10,r9
535	adc	r11,rcx
536	adc	r8,rax
537	mov	rax,r10
538	adc	rdx,0
539
540
541
; Reduction step 3: retire r10
542	mov	rcx,r10
543	shl	r10,32
544	mov	r9,rdx
545	mul	rbp
546	shr	rcx,32
547	add	r11,r10
548	adc	r8,rcx
549	adc	r9,rax
550	mov	rax,r11
551	adc	rdx,0
552
553
554
; Reduction step 4: retire r11; reduced low half is r8, r9, r10, rdx
555	mov	rcx,r11
556	shl	r11,32
557	mov	r10,rdx
558	mul	rbp
559	shr	rcx,32
560	add	r8,r11
561	adc	r9,rcx
562	adc	r10,rax
563	adc	rdx,0
564	xor	r11,r11
565
566
567
; Add the reduced low half to the high half r12..r15; r11 = overflow limb.
; r8, r9 (and r10, rcx below) snapshot the sum before the trial subtraction.
568	add	r12,r8
569	adc	r13,r9
570	mov	r8,r12
571	adc	r14,r10
572	adc	r15,rdx
573	mov	r9,r13
574	adc	r11,0
575
; Branch-free final subtraction of p (limbs: -1, rsi, 0, rbp)
576	sub	r12,-1
577	mov	r10,r14
578	sbb	r13,rsi
579	sbb	r14,0
580	mov	rcx,r15
581	sbb	r15,rbp
582	sbb	r11,0
583
584	cmovc	r12,r8	; borrow: the sum was below p, keep it unsubtracted
585	cmovc	r13,r9
586	mov	QWORD[rdi],r12
587	cmovc	r14,r10
588	mov	QWORD[8+rdi],r13
589	cmovc	r15,rcx
590	mov	QWORD[16+rdi],r14
591	mov	QWORD[24+rdi],r15
592
593	DB	0F3h,0C3h		;repret
594
595
596
597
598
599
600
; ecp_nistz256_from_mont -- applies four Montgomery reduction steps to a
; 256-bit input (equivalent to a Montgomery multiplication by 1) followed by
; one conditional subtraction of p.
; Win64 entry: rcx = res, rdx = in (four 64-bit limbs, low limb first); the
; arguments are moved to rdi/rsi, whose caller values are spilled to the home
; space and reloaded on exit.
601global	ecp_nistz256_from_mont
602
603ALIGN	32
604ecp_nistz256_from_mont:
605	mov	QWORD[8+rsp],rdi	;WIN64 prologue
606	mov	QWORD[16+rsp],rsi
607	mov	rax,rsp
608$L$SEH_begin_ecp_nistz256_from_mont:
609	mov	rdi,rcx
610	mov	rsi,rdx
611
612
613	push	r12
614	push	r13
615
616	mov	rax,QWORD[rsi]
617	mov	r13,QWORD[(($L$poly+24))]	; r13 = poly[3]
618	mov	r9,QWORD[8+rsi]
619	mov	r10,QWORD[16+rsi]
620	mov	r11,QWORD[24+rsi]
621	mov	r8,rax
622	mov	r12,QWORD[(($L$poly+8))]	; r12 = poly[1]
623
624
625
; Reduction step 1: retire in[0] (adds low << 96 and low * poly[3] << 192)
626	mov	rcx,rax
627	shl	r8,32
628	mul	r13
629	shr	rcx,32
630	add	r9,r8
631	adc	r10,rcx
632	adc	r11,rax
633	mov	rax,r9
634	adc	rdx,0
635
636
637
; Reduction step 2: retire r9; r8 takes the carry-out limb of step 1
638	mov	rcx,r9
639	shl	r9,32
640	mov	r8,rdx
641	mul	r13
642	shr	rcx,32
643	add	r10,r9
644	adc	r11,rcx
645	adc	r8,rax
646	mov	rax,r10
647	adc	rdx,0
648
649
650
; Reduction step 3: retire r10
651	mov	rcx,r10
652	shl	r10,32
653	mov	r9,rdx
654	mul	r13
655	shr	rcx,32
656	add	r11,r10
657	adc	r8,rcx
658	adc	r9,rax
659	mov	rax,r11
660	adc	rdx,0
661
662
663
; Reduction step 4: retire r11; candidate result is r8, r9, r10, rdx.
; rcx / rsi snapshot the two low limbs (the input pointer in rsi is dead).
664	mov	rcx,r11
665	shl	r11,32
666	mov	r10,rdx
667	mul	r13
668	shr	rcx,32
669	add	r8,r11
670	adc	r9,rcx
671	mov	rcx,r8
672	adc	r10,rax
673	mov	rsi,r9
674	adc	rdx,0
675
676
677
; Branch-free conditional subtraction of p (limbs: -1, r12, 0, r13)
678	sub	r8,-1
679	mov	rax,r10
680	sbb	r9,r12
681	sbb	r10,0
682	mov	r11,rdx	; r11 = unsubtracted top limb
683	sbb	rdx,r13
684	sbb	r13,r13	; r13 = -borrow; ZF = 1 when there was no borrow
685
686	cmovnz	r8,rcx	; borrow (candidate < p): restore the unsubtracted limbs
687	cmovnz	r9,rsi
688	mov	QWORD[rdi],r8
689	cmovnz	r10,rax
690	mov	QWORD[8+rdi],r9
691	cmovz	r11,rdx	; top limb uses the opposite sense: take the subtracted one when no borrow
692	mov	QWORD[16+rdi],r10
693	mov	QWORD[24+rdi],r11
694
695	pop	r13
696	pop	r12
697	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
698	mov	rsi,QWORD[16+rsp]
699	DB	0F3h,0C3h		;repret
700$L$SEH_end_ecp_nistz256_from_mont:
701
702
; ecp_nistz256_select_w5 -- copies one 96-byte entry out of a 16-entry table
; without an index-dependent memory access pattern.
; Win64 entry (registers used directly): rcx = out (96 bytes, unaligned
; stores), rdx = table (read with movdqa, so it must be 16-byte aligned),
; r8d = index.
; Every entry is loaded; entry k (1-based) is AND-ed with an all-ones mask when
; k == index and with zero otherwise, and the results are OR-ed together. The
; counter starts at 1, so index 0 matches no entry and yields 96 zero bytes.
; xmm6-xmm15 are callee-saved on Win64 and are spilled/restored around the body.
703global	ecp_nistz256_select_w5
704
705ALIGN	32
706ecp_nistz256_select_w5:
707	lea	rax,[((-136))+rsp]
708$L$SEH_begin_ecp_nistz256_select_w5:
; Byte-encoded prologue (fixed encodings): lea rsp,[rax-0x20] then
; movaps [rax-0x20..rax+0x70],xmm6..xmm15, i.e. a 168-byte frame with
; xmm6 at [rsp] and xmm15 at [rsp+144].
709DB	0x48,0x8d,0x60,0xe0
710DB	0x0f,0x29,0x70,0xe0
711DB	0x0f,0x29,0x78,0xf0
712DB	0x44,0x0f,0x29,0x00
713DB	0x44,0x0f,0x29,0x48,0x10
714DB	0x44,0x0f,0x29,0x50,0x20
715DB	0x44,0x0f,0x29,0x58,0x30
716DB	0x44,0x0f,0x29,0x60,0x40
717DB	0x44,0x0f,0x29,0x68,0x50
718DB	0x44,0x0f,0x29,0x70,0x60
719DB	0x44,0x0f,0x29,0x78,0x70
720	movdqa	xmm0,XMMWORD[$L$One]	; xmm0 = per-lane increment (1)
721	movd	xmm1,r8d
722
723	pxor	xmm2,xmm2	; xmm2..xmm7 = 96-byte accumulator
724	pxor	xmm3,xmm3
725	pxor	xmm4,xmm4
726	pxor	xmm5,xmm5
727	pxor	xmm6,xmm6
728	pxor	xmm7,xmm7
729
730	movdqa	xmm8,xmm0	; xmm8 = running entry number, starts at 1
731	pshufd	xmm1,xmm1,0	; broadcast index to all four lanes
732
733	mov	rax,16	; 16 table entries
734$L$select_loop_sse_w5:
735
736	movdqa	xmm15,xmm8
737	paddd	xmm8,xmm0
738	pcmpeqd	xmm15,xmm1	; xmm15 = all ones iff this entry is the requested one
739
740	movdqa	xmm9,XMMWORD[rdx]
741	movdqa	xmm10,XMMWORD[16+rdx]
742	movdqa	xmm11,XMMWORD[32+rdx]
743	movdqa	xmm12,XMMWORD[48+rdx]
744	movdqa	xmm13,XMMWORD[64+rdx]
745	movdqa	xmm14,XMMWORD[80+rdx]
746	lea	rdx,[96+rdx]
747
748	pand	xmm9,xmm15
749	pand	xmm10,xmm15
750	por	xmm2,xmm9
751	pand	xmm11,xmm15
752	por	xmm3,xmm10
753	pand	xmm12,xmm15
754	por	xmm4,xmm11
755	pand	xmm13,xmm15
756	por	xmm5,xmm12
757	pand	xmm14,xmm15
758	por	xmm6,xmm13
759	por	xmm7,xmm14
760
761	dec	rax
762	jnz	NEAR $L$select_loop_sse_w5
763
764	movdqu	XMMWORD[rcx],xmm2
765	movdqu	XMMWORD[16+rcx],xmm3
766	movdqu	XMMWORD[32+rcx],xmm4
767	movdqu	XMMWORD[48+rcx],xmm5
768	movdqu	XMMWORD[64+rcx],xmm6
769	movdqu	XMMWORD[80+rcx],xmm7
770	movaps	xmm6,XMMWORD[rsp]	; restore callee-saved xmm6..xmm15
771	movaps	xmm7,XMMWORD[16+rsp]
772	movaps	xmm8,XMMWORD[32+rsp]
773	movaps	xmm9,XMMWORD[48+rsp]
774	movaps	xmm10,XMMWORD[64+rsp]
775	movaps	xmm11,XMMWORD[80+rsp]
776	movaps	xmm12,XMMWORD[96+rsp]
777	movaps	xmm13,XMMWORD[112+rsp]
778	movaps	xmm14,XMMWORD[128+rsp]
779	movaps	xmm15,XMMWORD[144+rsp]
780	lea	rsp,[168+rsp]
781$L$SEH_end_ecp_nistz256_select_w5:
782	DB	0F3h,0C3h		;repret
783
784
785
786
; ecp_nistz256_select_w7 -- copies one 64-byte entry out of a 64-entry table
; without an index-dependent memory access pattern.
; Win64 entry (registers used directly): rcx = out (64 bytes, unaligned
; stores), rdx = table (read with movdqa, so it must be 16-byte aligned),
; r8d = index.
; Same masking scheme as ecp_nistz256_select_w5: all entries are read, entry k
; (1-based) is kept only when k == index; index 0 yields 64 zero bytes.
; The full xmm6-xmm15 set is spilled/restored although only xmm8-xmm12 and
; xmm15 of that range are written here.
787global	ecp_nistz256_select_w7
788
789ALIGN	32
790ecp_nistz256_select_w7:
791	lea	rax,[((-136))+rsp]
792$L$SEH_begin_ecp_nistz256_select_w7:
; Byte-encoded prologue (fixed encodings): lea rsp,[rax-0x20] then
; movaps [rax-0x20..rax+0x70],xmm6..xmm15 (168-byte frame).
793DB	0x48,0x8d,0x60,0xe0
794DB	0x0f,0x29,0x70,0xe0
795DB	0x0f,0x29,0x78,0xf0
796DB	0x44,0x0f,0x29,0x00
797DB	0x44,0x0f,0x29,0x48,0x10
798DB	0x44,0x0f,0x29,0x50,0x20
799DB	0x44,0x0f,0x29,0x58,0x30
800DB	0x44,0x0f,0x29,0x60,0x40
801DB	0x44,0x0f,0x29,0x68,0x50
802DB	0x44,0x0f,0x29,0x70,0x60
803DB	0x44,0x0f,0x29,0x78,0x70
804	movdqa	xmm8,XMMWORD[$L$One]	; xmm8 = running entry number, starts at 1
805	movd	xmm1,r8d
806
807	pxor	xmm2,xmm2	; xmm2..xmm5 = 64-byte accumulator
808	pxor	xmm3,xmm3
809	pxor	xmm4,xmm4
810	pxor	xmm5,xmm5
811
812	movdqa	xmm0,xmm8	; xmm0 = per-lane increment (1)
813	pshufd	xmm1,xmm1,0	; broadcast index to all four lanes
814	mov	rax,64	; 64 table entries
815
816$L$select_loop_sse_w7:
817	movdqa	xmm15,xmm8
818	paddd	xmm8,xmm0
819	movdqa	xmm9,XMMWORD[rdx]
820	movdqa	xmm10,XMMWORD[16+rdx]
821	pcmpeqd	xmm15,xmm1	; xmm15 = all ones iff this entry is the requested one
822	movdqa	xmm11,XMMWORD[32+rdx]
823	movdqa	xmm12,XMMWORD[48+rdx]
824	lea	rdx,[64+rdx]
825
826	pand	xmm9,xmm15
827	pand	xmm10,xmm15
828	por	xmm2,xmm9
829	pand	xmm11,xmm15
830	por	xmm3,xmm10
831	pand	xmm12,xmm15
832	por	xmm4,xmm11
833	prefetcht0	[255+rdx]	; prefetch ahead of the (already advanced) table pointer
834	por	xmm5,xmm12
835
836	dec	rax
837	jnz	NEAR $L$select_loop_sse_w7
838
839	movdqu	XMMWORD[rcx],xmm2
840	movdqu	XMMWORD[16+rcx],xmm3
841	movdqu	XMMWORD[32+rcx],xmm4
842	movdqu	XMMWORD[48+rcx],xmm5
843	movaps	xmm6,XMMWORD[rsp]	; restore callee-saved xmm6..xmm15
844	movaps	xmm7,XMMWORD[16+rsp]
845	movaps	xmm8,XMMWORD[32+rsp]
846	movaps	xmm9,XMMWORD[48+rsp]
847	movaps	xmm10,XMMWORD[64+rsp]
848	movaps	xmm11,XMMWORD[80+rsp]
849	movaps	xmm12,XMMWORD[96+rsp]
850	movaps	xmm13,XMMWORD[112+rsp]
851	movaps	xmm14,XMMWORD[128+rsp]
852	movaps	xmm15,XMMWORD[144+rsp]
853	lea	rsp,[168+rsp]
854$L$SEH_end_ecp_nistz256_select_w7:
855	DB	0F3h,0C3h		;repret
856
; ecp_nistz256_avx2_select_w7 -- stub. After the Win64 argument shuffle the
; body is the two bytes 0F 0B, the encoding of UD2, so calling this symbol
; raises an invalid-opcode exception. The epilogue that follows is not reached
; unless execution is resumed past the trap.
857global	ecp_nistz256_avx2_select_w7
858
859ALIGN	32
860ecp_nistz256_avx2_select_w7:
861	mov	QWORD[8+rsp],rdi	;WIN64 prologue
862	mov	QWORD[16+rsp],rsi
863	mov	rax,rsp
864$L$SEH_begin_ecp_nistz256_avx2_select_w7:
865	mov	rdi,rcx
866	mov	rsi,rdx
867	mov	rdx,r8
868
869
870DB	0x0f,0x0b
871	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
872	mov	rsi,QWORD[16+rsp]
873	DB	0F3h,0C3h		;repret
874$L$SEH_end_ecp_nistz256_avx2_select_w7:
875
;-----------------------------------------------------------------------
; __ecp_nistz256_add_toq -- res = (a + b) mod p.
; Private register convention (not a C ABI):
;   In:   r12, r13, r8, r9 = a[0..3]; rbx = &b; rdi = &res;
;         r14 = $L$poly[1], r15 = $L$poly[3]
;   Out:  res[0..3] stored at [rdi] and also left in r12, r13, r8, r9
;   Clobbers: rax, rbp, rcx, r10, r11, flags
;
; r11 holds the carry out of the 256-bit addition as 0/1. The borrow of the
; trial subtraction of p is folded into it with "sbb r11,0", so CF is set
; exactly when a + b < p, and only then is the unsubtracted sum kept. A sum
; in [p, 2^256) that produces no carry is therefore still reduced.
;-----------------------------------------------------------------------
ALIGN	32
__ecp_nistz256_add_toq:
	xor	r11,r11			; carry accumulator; must precede the add (xor clears CF)
	add	r12,QWORD[rbx]
	adc	r13,QWORD[8+rbx]
	mov	rax,r12			; rax,rbp,rcx,r10 = copy of the unreduced sum
	adc	r8,QWORD[16+rbx]
	adc	r9,QWORD[24+rbx]
	mov	rbp,r13
	adc	r11,0			; r11 = carry out of bit 255 (0 or 1)

	sub	r12,-1			; trial subtraction of p (limbs: -1, r14, 0, r15)
	mov	rcx,r8
	sbb	r13,r14
	sbb	r8,0
	mov	r10,r9
	sbb	r9,r15
	sbb	r11,0			; CF = 1 iff the 257-bit sum is below p

	cmovc	r12,rax			; sum < p: keep the unsubtracted sum
	cmovc	r13,rbp
	mov	QWORD[rdi],r12
	cmovc	r8,rcx
	mov	QWORD[8+rdi],r13
	cmovc	r9,r10
	mov	QWORD[16+rdi],r8
	mov	QWORD[24+rdi],r9

	DB	0F3h,0C3h		;repret
904
905
906
; __ecp_nistz256_sub_fromq -- res = (a - b) mod p.
; Private register convention (not a C ABI):
;   In:   r12, r13, r8, r9 = a[0..3]; rbx = &b; rdi = &res;
;         r14 = $L$poly[1], r15 = $L$poly[3]
;   Out:  res[0..3] stored at [rdi] and also left in r12, r13, r8, r9
;   Clobbers: rax, rbp, rcx, r10, r11, flags
907ALIGN	32
908__ecp_nistz256_sub_fromq:
909	sub	r12,QWORD[rbx]
910	sbb	r13,QWORD[8+rbx]
911	mov	rax,r12	; rax,rbp,rcx,r10 = copy of the raw difference
912	sbb	r8,QWORD[16+rbx]
913	sbb	r9,QWORD[24+rbx]
914	mov	rbp,r13
915	sbb	r11,r11	; r11 = -1 if a < b (borrow), else 0
916
917	add	r12,-1	; raw difference + p (limbs: -1, r14, 0, r15)
918	mov	rcx,r8
919	adc	r13,r14
920	adc	r8,0
921	mov	r10,r9
922	adc	r9,r15
923	test	r11,r11
924
925	cmovz	r12,rax	; no borrow: keep the raw difference
926	cmovz	r13,rbp
927	mov	QWORD[rdi],r12
928	cmovz	r8,rcx
929	mov	QWORD[8+rdi],r13
930	cmovz	r9,r10
931	mov	QWORD[16+rdi],r8
932	mov	QWORD[24+rdi],r9
933
934	DB	0F3h,0C3h		;repret
935
936
937
; __ecp_nistz256_subq -- (a - b) mod p, register-to-register.
; Private register convention (not a C ABI):
;   In:   rax, rbp, rcx, r10 = a[0..3] (minuend);
;         r12, r13, r8, r9 = b[0..3] (subtrahend);
;         r14 = $L$poly[1], r15 = $L$poly[3]
;   Out:  r12, r13, r8, r9 = result. Nothing is written to memory and rdi is
;         not used, so callers store the result themselves.
;   Clobbers: rax, rbp, rcx, r10, r11, flags
938ALIGN	32
939__ecp_nistz256_subq:
940	sub	rax,r12
941	sbb	rbp,r13
942	mov	r12,rax	; r12,r13,r8,r9 = raw difference
943	sbb	rcx,r8
944	sbb	r10,r9
945	mov	r13,rbp
946	sbb	r11,r11	; r11 = -1 if a < b (borrow), else 0
947
948	add	rax,-1	; rax,rbp,rcx,r10 = raw difference + p
949	mov	r8,rcx
950	adc	rbp,r14
951	adc	rcx,0
952	mov	r9,r10
953	adc	r10,r15
954	test	r11,r11
955
956	cmovnz	r12,rax	; borrow: take the p-corrected value
957	cmovnz	r13,rbp
958	cmovnz	r8,rcx
959	cmovnz	r9,r10
960
961	DB	0F3h,0C3h		;repret
962
963
964
;-----------------------------------------------------------------------
; __ecp_nistz256_mul_by_2q -- res = (a + a) mod p.
; Private register convention (not a C ABI):
;   In:   r12, r13, r8, r9 = a[0..3]; rdi = &res;
;         r14 = $L$poly[1], r15 = $L$poly[3]
;   Out:  res[0..3] stored at [rdi] and also left in r12, r13, r8, r9
;   Clobbers: rax, rbp, rcx, r10, r11, flags
;
; r11 holds the carry out of the 256-bit doubling as 0/1. The borrow of the
; trial subtraction of p is folded into it with "sbb r11,0", so CF is set
; exactly when a + a < p, and only then is the unsubtracted sum kept. A sum
; in [p, 2^256) that produces no carry is therefore still reduced.
;-----------------------------------------------------------------------
ALIGN	32
__ecp_nistz256_mul_by_2q:
	xor	r11,r11			; carry accumulator; must precede the add (xor clears CF)
	add	r12,r12
	adc	r13,r13
	mov	rax,r12			; rax,rbp,rcx,r10 = copy of the unreduced sum
	adc	r8,r8
	adc	r9,r9
	mov	rbp,r13
	adc	r11,0			; r11 = carry out of bit 255 (0 or 1)

	sub	r12,-1			; trial subtraction of p (limbs: -1, r14, 0, r15)
	mov	rcx,r8
	sbb	r13,r14
	sbb	r8,0
	mov	r10,r9
	sbb	r9,r15
	sbb	r11,0			; CF = 1 iff the 257-bit sum is below p

	cmovc	r12,rax			; sum < p: keep the unsubtracted sum
	cmovc	r13,rbp
	mov	QWORD[rdi],r12
	cmovc	r8,rcx
	mov	QWORD[8+rdi],r13
	cmovc	r9,r10
	mov	QWORD[16+rdi],r8
	mov	QWORD[24+rdi],r9

	DB	0F3h,0C3h		;repret
993
; ecp_nistz256_point_double -- doubles a point stored as three consecutive
; 32-byte field elements (offsets 0, 32, 64; presumably Jacobian X, Y, Z in
; Montgomery form -- confirm against the C caller).
; Win64 entry: rcx = result point, rdx = input point; moved to rdi, rsi.
; Stack frame (32*5+8 bytes; offsets from rsp, names are descriptive only):
;   [0]   S       [32]  M       [64]  Zsqr     [96]  in_x copy    [128] tmp0
; The three output coordinate pointers are parked in xmm0 (res+0),
; xmm1 (res+32) and xmm2 (res+64) via byte-encoded movq instructions.
; $L$point_double_shortcutq is also entered from ecp_nistz256_point_add with
; rsi/rdi restored and rsp adjusted to this frame size.
994global	ecp_nistz256_point_double
995
996ALIGN	32
997ecp_nistz256_point_double:
998	mov	QWORD[8+rsp],rdi	;WIN64 prologue
999	mov	QWORD[16+rsp],rsi
1000	mov	rax,rsp
1001$L$SEH_begin_ecp_nistz256_point_double:
1002	mov	rdi,rcx
1003	mov	rsi,rdx
1004
1005
1006	push	rbp
1007	push	rbx
1008	push	r12
1009	push	r13
1010	push	r14
1011	push	r15
1012	sub	rsp,32*5+8
1013
1014$L$point_double_shortcutq:
1015	movdqu	xmm0,XMMWORD[rsi]
1016	mov	rbx,rsi	; rbx = backup of the input pointer
1017	movdqu	xmm1,XMMWORD[16+rsi]
1018	mov	r12,QWORD[((32+0))+rsi]	; r12,r13,r8,r9 = second coordinate (in_y)
1019	mov	r13,QWORD[((32+8))+rsi]
1020	mov	r8,QWORD[((32+16))+rsi]
1021	mov	r9,QWORD[((32+24))+rsi]
1022	mov	r14,QWORD[(($L$poly+8))]
1023	mov	r15,QWORD[(($L$poly+24))]
1024	movdqa	XMMWORD[96+rsp],xmm0	; in_x copy at [rsp+96]
1025	movdqa	XMMWORD[(96+16)+rsp],xmm1
1026	lea	r10,[32+rdi]
1027	lea	r11,[64+rdi]
1028DB	102,72,15,110,199	; movq xmm0,rdi  (res+0)
1029DB	102,73,15,110,202	; movq xmm1,r10  (res+32)
1030DB	102,73,15,110,211	; movq xmm2,r11  (res+64)
1031
1032	lea	rdi,[rsp]
1033	call	__ecp_nistz256_mul_by_2q	; S = 2*in_y
1034
1035	mov	rax,QWORD[((64+0))+rsi]
1036	mov	r14,QWORD[((64+8))+rsi]
1037	mov	r15,QWORD[((64+16))+rsi]
1038	mov	r8,QWORD[((64+24))+rsi]
1039	lea	rsi,[((64-0))+rsi]
1040	lea	rdi,[64+rsp]
1041	call	__ecp_nistz256_sqr_montq	; Zsqr = in_z^2
1042
1043	mov	rax,QWORD[((0+0))+rsp]
1044	mov	r14,QWORD[((8+0))+rsp]
1045	lea	rsi,[((0+0))+rsp]
1046	mov	r15,QWORD[((16+0))+rsp]
1047	mov	r8,QWORD[((24+0))+rsp]
1048	lea	rdi,[rsp]
1049	call	__ecp_nistz256_sqr_montq	; S = S^2
1050
1051	mov	rax,QWORD[32+rbx]
1052	mov	r9,QWORD[((64+0))+rbx]
1053	mov	r10,QWORD[((64+8))+rbx]
1054	mov	r11,QWORD[((64+16))+rbx]
1055	mov	r12,QWORD[((64+24))+rbx]
1056	lea	rsi,[((64-0))+rbx]
1057	lea	rbx,[32+rbx]
1058DB	102,72,15,126,215	; movq rdi,xmm2  (res+64)
1059	call	__ecp_nistz256_mul_montq	; res_z = in_z * in_y
1060	call	__ecp_nistz256_mul_by_2q	; res_z = 2 * res_z (operates on the registers left by the multiply)
1061
1062	mov	r12,QWORD[((96+0))+rsp]
1063	mov	r13,QWORD[((96+8))+rsp]
1064	lea	rbx,[64+rsp]
1065	mov	r8,QWORD[((96+16))+rsp]
1066	mov	r9,QWORD[((96+24))+rsp]
1067	lea	rdi,[32+rsp]
1068	call	__ecp_nistz256_add_toq	; M = in_x + Zsqr
1069
1070	mov	r12,QWORD[((96+0))+rsp]
1071	mov	r13,QWORD[((96+8))+rsp]
1072	lea	rbx,[64+rsp]
1073	mov	r8,QWORD[((96+16))+rsp]
1074	mov	r9,QWORD[((96+24))+rsp]
1075	lea	rdi,[64+rsp]
1076	call	__ecp_nistz256_sub_fromq	; Zsqr = in_x - Zsqr
1077
1078	mov	rax,QWORD[((0+0))+rsp]
1079	mov	r14,QWORD[((8+0))+rsp]
1080	lea	rsi,[((0+0))+rsp]
1081	mov	r15,QWORD[((16+0))+rsp]
1082	mov	r8,QWORD[((24+0))+rsp]
1083DB	102,72,15,126,207	; movq rdi,xmm1  (res+32)
1084	call	__ecp_nistz256_sqr_montq	; res_y = S^2 (also left in r12..r15; rsi/rbp = poly[1]/poly[3])
; Inline halving of res_y modulo p: if the value is odd, add p first so the
; sum is even, then shift the 257-bit value (r9 = carry limb) right by one.
1085	xor	r9,r9
1086	mov	rax,r12
1087	add	r12,-1
1088	mov	r10,r13
1089	adc	r13,rsi
1090	mov	rcx,r14
1091	adc	r14,0
1092	mov	r8,r15
1093	adc	r15,rbp
1094	adc	r9,0
1095	xor	rsi,rsi
1096	test	rax,1
1097
1098	cmovz	r12,rax	; even input: discard the +p version
1099	cmovz	r13,r10
1100	cmovz	r14,rcx
1101	cmovz	r15,r8
1102	cmovz	r9,rsi
1103
1104	mov	rax,r13
1105	shr	r12,1
1106	shl	rax,63
1107	mov	r10,r14
1108	shr	r13,1
1109	or	r12,rax
1110	shl	r10,63
1111	mov	rcx,r15
1112	shr	r14,1
1113	or	r13,r10
1114	shl	rcx,63
1115	mov	QWORD[rdi],r12
1116	shr	r15,1
1117	mov	QWORD[8+rdi],r13
1118	shl	r9,63
1119	or	r14,rcx
1120	or	r15,r9
1121	mov	QWORD[16+rdi],r14
1122	mov	QWORD[24+rdi],r15
1123	mov	rax,QWORD[64+rsp]
1124	lea	rbx,[64+rsp]
1125	mov	r9,QWORD[((0+32))+rsp]
1126	mov	r10,QWORD[((8+32))+rsp]
1127	lea	rsi,[((0+32))+rsp]
1128	mov	r11,QWORD[((16+32))+rsp]
1129	mov	r12,QWORD[((24+32))+rsp]
1130	lea	rdi,[32+rsp]
1131	call	__ecp_nistz256_mul_montq	; M = M * Zsqr
1132
1133	lea	rdi,[128+rsp]
1134	call	__ecp_nistz256_mul_by_2q	; tmp0 = 2*M
1135
1136	lea	rbx,[32+rsp]
1137	lea	rdi,[32+rsp]
1138	call	__ecp_nistz256_add_toq	; M = tmp0 + M = 3*M
1139
1140	mov	rax,QWORD[96+rsp]
1141	lea	rbx,[96+rsp]
1142	mov	r9,QWORD[((0+0))+rsp]
1143	mov	r10,QWORD[((8+0))+rsp]
1144	lea	rsi,[((0+0))+rsp]
1145	mov	r11,QWORD[((16+0))+rsp]
1146	mov	r12,QWORD[((24+0))+rsp]
1147	lea	rdi,[rsp]
1148	call	__ecp_nistz256_mul_montq	; S = S * in_x
1149
1150	lea	rdi,[128+rsp]
1151	call	__ecp_nistz256_mul_by_2q	; tmp0 = 2*S
1152
1153	mov	rax,QWORD[((0+32))+rsp]
1154	mov	r14,QWORD[((8+32))+rsp]
1155	lea	rsi,[((0+32))+rsp]
1156	mov	r15,QWORD[((16+32))+rsp]
1157	mov	r8,QWORD[((24+32))+rsp]
1158DB	102,72,15,126,199	; movq rdi,xmm0  (res+0)
1159	call	__ecp_nistz256_sqr_montq	; res_x = M^2
1160
; Re-map the squaring outputs (r12..r15, rsi, rbp) onto the add/sub helper
; convention (r12,r13,r8,r9 with poly[1]/poly[3] in r14/r15).
1161	lea	rbx,[128+rsp]
1162	mov	r8,r14
1163	mov	r9,r15
1164	mov	r14,rsi
1165	mov	r15,rbp
1166	call	__ecp_nistz256_sub_fromq	; res_x = res_x - tmp0
1167
1168	mov	rax,QWORD[((0+0))+rsp]
1169	mov	rbp,QWORD[((0+8))+rsp]
1170	mov	rcx,QWORD[((0+16))+rsp]
1171	mov	r10,QWORD[((0+24))+rsp]
1172	lea	rdi,[rsp]
1173	call	__ecp_nistz256_subq	; r12,r13,r8,r9 = S - res_x (not stored by the helper)
1174
; Store that difference into S and at the same time shuffle it into the
; multiply convention (r9..r12). "xor ecx,ecx" sets ZF, and only mov/lea
; follow, so both cmovz instructions below always perform the move.
1175	mov	rax,QWORD[32+rsp]
1176	lea	rbx,[32+rsp]
1177	mov	r14,r12
1178	xor	ecx,ecx
1179	mov	QWORD[((0+0))+rsp],r12
1180	mov	r10,r13
1181	mov	QWORD[((0+8))+rsp],r13
1182	cmovz	r11,r8
1183	mov	QWORD[((0+16))+rsp],r8
1184	lea	rsi,[((0-0))+rsp]
1185	cmovz	r12,r9
1186	mov	QWORD[((0+24))+rsp],r9
1187	mov	r9,r14
1188	lea	rdi,[rsp]
1189	call	__ecp_nistz256_mul_montq	; S = S * M
1190
1191DB	102,72,15,126,203	; movq rbx,xmm1  (res+32)
1192DB	102,72,15,126,207	; movq rdi,xmm1  (res+32)
1193	call	__ecp_nistz256_sub_fromq	; res_y = S - res_y
1194
1195	add	rsp,32*5+8
1196	pop	r15
1197	pop	r14
1198	pop	r13
1199	pop	r12
1200	pop	rbx
1201	pop	rbp
1202	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1203	mov	rsi,QWORD[16+rsp]
1204	DB	0F3h,0C3h		;repret
1205$L$SEH_end_ecp_nistz256_point_double:
;-----------------------------------------------------------------------
; ecp_nistz256_point_add -- adds two points, each stored as three consecutive
; 32-byte field elements (offsets 0, 32, 64; presumably Jacobian X, Y, Z in
; Montgomery form -- confirm against the C caller).
; ABI:   Win64 entry: rcx = result, rdx = in1, r8 = in2; moved to
;        rdi, rsi, rdx. rdi/rsi are spilled to the home space and restored.
; Stack frame (32*18+8 bytes; offsets from rsp, names are descriptive only):
;   [0]   H      [32]  Hsqr / Z1sqr   [64]  R       [96]  Rsqr / Z2sqr
;   [128] Hcub   [160] U1             [192] U2      [224] S1
;   [256] S2     [288] res_x          [320] res_y   [352] res_z
;   [384] in1_x  [416] in1_y          [448] in1_z
;   [480] in2_x  [512] in2_y          [544] in2_z
; Live XMM state across the body:
;   xmm0 = result pointer, xmm1 = in1 pointer,
;   xmm5 = all-ones mask when in1's X and Y words are all zero,
;   xmm4 = all-ones mask when in2's X and Y words are all zero,
;   xmm2 = xmm4 | xmm5, xmm3 = OR of the limbs of R = S2 - S1.
; Special cases: when H == 0, neither mask is set and R == 0, control jumps to
; $L$point_double_shortcutq in ecp_nistz256_point_double; when H == 0, neither
; mask is set and R != 0, the result is written as 96 zero bytes.
;
; The inline doubling of U1*H^2 keeps the carry of the addition as 0/1 in r11
; and folds the borrow of the trial subtraction of p into it ("sbb r11,0" /
; cmovc), so a doubled value in [p, 2^256) that does not carry is still
; reduced before it is passed to __ecp_nistz256_subq.
;-----------------------------------------------------------------------
global	ecp_nistz256_point_add

ALIGN	32
ecp_nistz256_point_add:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_ecp_nistz256_point_add:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8


	push	rbp
	push	rbx
	push	r12
	push	r13
	push	r14
	push	r15
	sub	rsp,32*18+8

	movdqu	xmm0,XMMWORD[rsi]		; copy in1 to the frame
	movdqu	xmm1,XMMWORD[16+rsi]
	movdqu	xmm2,XMMWORD[32+rsi]
	movdqu	xmm3,XMMWORD[48+rsi]
	movdqu	xmm4,XMMWORD[64+rsi]
	movdqu	xmm5,XMMWORD[80+rsi]
	mov	rbx,rsi				; rbx = in1
	mov	rsi,rdx				; rsi = in2
	movdqa	XMMWORD[384+rsp],xmm0
	movdqa	XMMWORD[(384+16)+rsp],xmm1
	por	xmm1,xmm0
	movdqa	XMMWORD[416+rsp],xmm2
	movdqa	XMMWORD[(416+16)+rsp],xmm3
	por	xmm3,xmm2
	movdqa	XMMWORD[448+rsp],xmm4
	movdqa	XMMWORD[(448+16)+rsp],xmm5
	por	xmm3,xmm1			; xmm3 = OR of in1 X and Y words

	movdqu	xmm0,XMMWORD[rsi]		; copy in2 X and Y to the frame
	pshufd	xmm5,xmm3,0xb1
	movdqu	xmm1,XMMWORD[16+rsi]
	movdqu	xmm2,XMMWORD[32+rsi]
	por	xmm5,xmm3
	movdqu	xmm3,XMMWORD[48+rsi]
	mov	rax,QWORD[((64+0))+rsi]		; rax,r14,r15,r8 = in2_z
	mov	r14,QWORD[((64+8))+rsi]
	mov	r15,QWORD[((64+16))+rsi]
	mov	r8,QWORD[((64+24))+rsi]
	movdqa	XMMWORD[480+rsp],xmm0
	pshufd	xmm4,xmm5,0x1e
	movdqa	XMMWORD[(480+16)+rsp],xmm1
	por	xmm1,xmm0
DB	102,72,15,110,199			; movq xmm0,rdi  (save result pointer)
	movdqa	XMMWORD[512+rsp],xmm2
	movdqa	XMMWORD[(512+16)+rsp],xmm3
	por	xmm3,xmm2
	por	xmm5,xmm4
	pxor	xmm4,xmm4
	por	xmm3,xmm1			; xmm3 = OR of in2 X and Y words

	lea	rsi,[((64-0))+rsi]
	mov	QWORD[((544+0))+rsp],rax	; in2_z copy
	mov	QWORD[((544+8))+rsp],r14
	mov	QWORD[((544+16))+rsp],r15
	mov	QWORD[((544+24))+rsp],r8
	lea	rdi,[96+rsp]
	call	__ecp_nistz256_sqr_montq	; Z2sqr = in2_z^2

	pcmpeqd	xmm5,xmm4
	pshufd	xmm4,xmm3,0xb1
	por	xmm4,xmm3
	pshufd	xmm5,xmm5,0			; xmm5 = in1 "all zero" mask
	pshufd	xmm3,xmm4,0x1e
	por	xmm4,xmm3
	pxor	xmm3,xmm3
	pcmpeqd	xmm4,xmm3
	pshufd	xmm4,xmm4,0			; xmm4 = in2 "all zero" mask
	mov	rax,QWORD[((64+0))+rbx]		; rax,r14,r15,r8 = in1_z
	mov	r14,QWORD[((64+8))+rbx]
	mov	r15,QWORD[((64+16))+rbx]
	mov	r8,QWORD[((64+24))+rbx]
DB	102,72,15,110,203			; movq xmm1,rbx  (save in1 pointer)

	lea	rsi,[((64-0))+rbx]
	lea	rdi,[32+rsp]
	call	__ecp_nistz256_sqr_montq	; Z1sqr = in1_z^2

	mov	rax,QWORD[544+rsp]
	lea	rbx,[544+rsp]
	mov	r9,QWORD[((0+96))+rsp]
	mov	r10,QWORD[((8+96))+rsp]
	lea	rsi,[((0+96))+rsp]
	mov	r11,QWORD[((16+96))+rsp]
	mov	r12,QWORD[((24+96))+rsp]
	lea	rdi,[224+rsp]
	call	__ecp_nistz256_mul_montq	; S1 = Z2sqr * in2_z

	mov	rax,QWORD[448+rsp]
	lea	rbx,[448+rsp]
	mov	r9,QWORD[((0+32))+rsp]
	mov	r10,QWORD[((8+32))+rsp]
	lea	rsi,[((0+32))+rsp]
	mov	r11,QWORD[((16+32))+rsp]
	mov	r12,QWORD[((24+32))+rsp]
	lea	rdi,[256+rsp]
	call	__ecp_nistz256_mul_montq	; S2 = Z1sqr * in1_z

	mov	rax,QWORD[416+rsp]
	lea	rbx,[416+rsp]
	mov	r9,QWORD[((0+224))+rsp]
	mov	r10,QWORD[((8+224))+rsp]
	lea	rsi,[((0+224))+rsp]
	mov	r11,QWORD[((16+224))+rsp]
	mov	r12,QWORD[((24+224))+rsp]
	lea	rdi,[224+rsp]
	call	__ecp_nistz256_mul_montq	; S1 = S1 * in1_y

	mov	rax,QWORD[512+rsp]
	lea	rbx,[512+rsp]
	mov	r9,QWORD[((0+256))+rsp]
	mov	r10,QWORD[((8+256))+rsp]
	lea	rsi,[((0+256))+rsp]
	mov	r11,QWORD[((16+256))+rsp]
	mov	r12,QWORD[((24+256))+rsp]
	lea	rdi,[256+rsp]
	call	__ecp_nistz256_mul_montq	; S2 = S2 * in2_y

	lea	rbx,[224+rsp]
	lea	rdi,[64+rsp]
	call	__ecp_nistz256_sub_fromq	; R = S2 - S1

	or	r12,r13				; r12 = OR of the limbs of R
	movdqa	xmm2,xmm4
	or	r12,r8
	or	r12,r9
	por	xmm2,xmm5			; xmm2 = either input mask set
DB	102,73,15,110,220			; movq xmm3,r12

	mov	rax,QWORD[384+rsp]
	lea	rbx,[384+rsp]
	mov	r9,QWORD[((0+96))+rsp]
	mov	r10,QWORD[((8+96))+rsp]
	lea	rsi,[((0+96))+rsp]
	mov	r11,QWORD[((16+96))+rsp]
	mov	r12,QWORD[((24+96))+rsp]
	lea	rdi,[160+rsp]
	call	__ecp_nistz256_mul_montq	; U1 = in1_x * Z2sqr

	mov	rax,QWORD[480+rsp]
	lea	rbx,[480+rsp]
	mov	r9,QWORD[((0+32))+rsp]
	mov	r10,QWORD[((8+32))+rsp]
	lea	rsi,[((0+32))+rsp]
	mov	r11,QWORD[((16+32))+rsp]
	mov	r12,QWORD[((24+32))+rsp]
	lea	rdi,[192+rsp]
	call	__ecp_nistz256_mul_montq	; U2 = in2_x * Z1sqr

	lea	rbx,[160+rsp]
	lea	rdi,[rsp]
	call	__ecp_nistz256_sub_fromq	; H = U2 - U1

	or	r12,r13				; ZF = 1 iff H == 0
	or	r12,r8
	or	r12,r9

DB	0x3e					; prefix byte (branch hint) for the jnz below
	jnz	NEAR $L$add_proceedq		; H != 0: generic addition
DB	102,73,15,126,208			; movq r8,xmm2
DB	102,73,15,126,217			; movq r9,xmm3
	test	r8,r8
	jnz	NEAR $L$add_proceedq		; an input mask is set: generic path, fixed up by the selects
	test	r9,r9
	jz	NEAR $L$add_doubleq		; H == 0 and R == 0: inputs are equal, double instead

DB	102,72,15,126,199			; movq rdi,xmm0  (restore result pointer)
	pxor	xmm0,xmm0			; H == 0, R != 0: write an all-zero result
	movdqu	XMMWORD[rdi],xmm0
	movdqu	XMMWORD[16+rdi],xmm0
	movdqu	XMMWORD[32+rdi],xmm0
	movdqu	XMMWORD[48+rdi],xmm0
	movdqu	XMMWORD[64+rdi],xmm0
	movdqu	XMMWORD[80+rdi],xmm0
	jmp	NEAR $L$add_doneq

ALIGN	32
$L$add_doubleq:
DB	102,72,15,126,206			; movq rsi,xmm1  (in1 pointer)
DB	102,72,15,126,199			; movq rdi,xmm0  (result pointer)
	add	rsp,416				; 32*(18-5): shrink to point_double's frame size
	jmp	NEAR $L$point_double_shortcutq

ALIGN	32
$L$add_proceedq:
	mov	rax,QWORD[((0+64))+rsp]
	mov	r14,QWORD[((8+64))+rsp]
	lea	rsi,[((0+64))+rsp]
	mov	r15,QWORD[((16+64))+rsp]
	mov	r8,QWORD[((24+64))+rsp]
	lea	rdi,[96+rsp]
	call	__ecp_nistz256_sqr_montq	; Rsqr = R^2

	mov	rax,QWORD[448+rsp]
	lea	rbx,[448+rsp]
	mov	r9,QWORD[((0+0))+rsp]
	mov	r10,QWORD[((8+0))+rsp]
	lea	rsi,[((0+0))+rsp]
	mov	r11,QWORD[((16+0))+rsp]
	mov	r12,QWORD[((24+0))+rsp]
	lea	rdi,[352+rsp]
	call	__ecp_nistz256_mul_montq	; res_z = H * in1_z

	mov	rax,QWORD[((0+0))+rsp]
	mov	r14,QWORD[((8+0))+rsp]
	lea	rsi,[((0+0))+rsp]
	mov	r15,QWORD[((16+0))+rsp]
	mov	r8,QWORD[((24+0))+rsp]
	lea	rdi,[32+rsp]
	call	__ecp_nistz256_sqr_montq	; Hsqr = H^2

	mov	rax,QWORD[544+rsp]
	lea	rbx,[544+rsp]
	mov	r9,QWORD[((0+352))+rsp]
	mov	r10,QWORD[((8+352))+rsp]
	lea	rsi,[((0+352))+rsp]
	mov	r11,QWORD[((16+352))+rsp]
	mov	r12,QWORD[((24+352))+rsp]
	lea	rdi,[352+rsp]
	call	__ecp_nistz256_mul_montq	; res_z = res_z * in2_z

	mov	rax,QWORD[rsp]
	lea	rbx,[rsp]
	mov	r9,QWORD[((0+32))+rsp]
	mov	r10,QWORD[((8+32))+rsp]
	lea	rsi,[((0+32))+rsp]
	mov	r11,QWORD[((16+32))+rsp]
	mov	r12,QWORD[((24+32))+rsp]
	lea	rdi,[128+rsp]
	call	__ecp_nistz256_mul_montq	; Hcub = Hsqr * H

	mov	rax,QWORD[160+rsp]
	lea	rbx,[160+rsp]
	mov	r9,QWORD[((0+32))+rsp]
	mov	r10,QWORD[((8+32))+rsp]
	lea	rsi,[((0+32))+rsp]
	mov	r11,QWORD[((16+32))+rsp]
	mov	r12,QWORD[((24+32))+rsp]
	lea	rdi,[192+rsp]
	call	__ecp_nistz256_mul_montq	; U2 = U1 * Hsqr (also left in r12,r13,r8,r9)



; Inline doubling of the product just computed: r12,r13,r8,r9 = 2*U2 mod p.
; The loads of Rsqr into rax,rbp,rcx,r10 are interleaved with the selects to
; set up the minuend for __ecp_nistz256_subq.
	xor	r11,r11				; carry accumulator; must precede the add (xor clears CF)
	add	r12,r12
	lea	rsi,[96+rsp]			; rsi = &Rsqr
	adc	r13,r13
	mov	rax,r12
	adc	r8,r8
	adc	r9,r9
	mov	rbp,r13
	adc	r11,0				; r11 = carry out of bit 255 (0 or 1)

	sub	r12,-1				; trial subtraction of p (limbs: -1, r14, 0, r15)
	mov	rcx,r8
	sbb	r13,r14
	sbb	r8,0
	mov	r10,r9
	sbb	r9,r15
	sbb	r11,0				; CF = 1 iff the 257-bit sum is below p

	cmovc	r12,rax				; sum < p: keep the unsubtracted sum
	mov	rax,QWORD[rsi]
	cmovc	r13,rbp
	mov	rbp,QWORD[8+rsi]
	cmovc	r8,rcx
	mov	rcx,QWORD[16+rsi]
	cmovc	r9,r10
	mov	r10,QWORD[24+rsi]

	call	__ecp_nistz256_subq		; r12.. = Rsqr - 2*U2

	lea	rbx,[128+rsp]
	lea	rdi,[288+rsp]
	call	__ecp_nistz256_sub_fromq	; res_x = (Rsqr - 2*U2) - Hcub

	mov	rax,QWORD[((192+0))+rsp]
	mov	rbp,QWORD[((192+8))+rsp]
	mov	rcx,QWORD[((192+16))+rsp]
	mov	r10,QWORD[((192+24))+rsp]
	lea	rdi,[320+rsp]

	call	__ecp_nistz256_subq		; r12.. = U2 - res_x

	mov	QWORD[rdi],r12			; res_y = U2 - res_x (the helper does not store)
	mov	QWORD[8+rdi],r13
	mov	QWORD[16+rdi],r8
	mov	QWORD[24+rdi],r9
	mov	rax,QWORD[128+rsp]
	lea	rbx,[128+rsp]
	mov	r9,QWORD[((0+224))+rsp]
	mov	r10,QWORD[((8+224))+rsp]
	lea	rsi,[((0+224))+rsp]
	mov	r11,QWORD[((16+224))+rsp]
	mov	r12,QWORD[((24+224))+rsp]
	lea	rdi,[256+rsp]
	call	__ecp_nistz256_mul_montq	; S2 = S1 * Hcub

	mov	rax,QWORD[320+rsp]
	lea	rbx,[320+rsp]
	mov	r9,QWORD[((0+64))+rsp]
	mov	r10,QWORD[((8+64))+rsp]
	lea	rsi,[((0+64))+rsp]
	mov	r11,QWORD[((16+64))+rsp]
	mov	r12,QWORD[((24+64))+rsp]
	lea	rdi,[320+rsp]
	call	__ecp_nistz256_mul_montq	; res_y = R * res_y

	lea	rbx,[256+rsp]
	lea	rdi,[320+rsp]
	call	__ecp_nistz256_sub_fromq	; res_y = res_y - S2

DB	102,72,15,126,199			; movq rdi,xmm0  (restore result pointer)

; Branch-free output selection, repeated for Z, X and Y:
;   t   = xmm5 ? in2 coordinate : computed coordinate
;   out = xmm4 ? in1 coordinate : t
	movdqa	xmm0,xmm5
	movdqa	xmm1,xmm5
	pandn	xmm0,XMMWORD[352+rsp]
	movdqa	xmm2,xmm5
	pandn	xmm1,XMMWORD[((352+16))+rsp]
	movdqa	xmm3,xmm5
	pand	xmm2,XMMWORD[544+rsp]
	pand	xmm3,XMMWORD[((544+16))+rsp]
	por	xmm2,xmm0
	por	xmm3,xmm1

	movdqa	xmm0,xmm4
	movdqa	xmm1,xmm4
	pandn	xmm0,xmm2
	movdqa	xmm2,xmm4
	pandn	xmm1,xmm3
	movdqa	xmm3,xmm4
	pand	xmm2,XMMWORD[448+rsp]
	pand	xmm3,XMMWORD[((448+16))+rsp]
	por	xmm2,xmm0
	por	xmm3,xmm1
	movdqu	XMMWORD[64+rdi],xmm2		; third output coordinate
	movdqu	XMMWORD[80+rdi],xmm3

	movdqa	xmm0,xmm5
	movdqa	xmm1,xmm5
	pandn	xmm0,XMMWORD[288+rsp]
	movdqa	xmm2,xmm5
	pandn	xmm1,XMMWORD[((288+16))+rsp]
	movdqa	xmm3,xmm5
	pand	xmm2,XMMWORD[480+rsp]
	pand	xmm3,XMMWORD[((480+16))+rsp]
	por	xmm2,xmm0
	por	xmm3,xmm1

	movdqa	xmm0,xmm4
	movdqa	xmm1,xmm4
	pandn	xmm0,xmm2
	movdqa	xmm2,xmm4
	pandn	xmm1,xmm3
	movdqa	xmm3,xmm4
	pand	xmm2,XMMWORD[384+rsp]
	pand	xmm3,XMMWORD[((384+16))+rsp]
	por	xmm2,xmm0
	por	xmm3,xmm1
	movdqu	XMMWORD[rdi],xmm2		; first output coordinate
	movdqu	XMMWORD[16+rdi],xmm3

	movdqa	xmm0,xmm5
	movdqa	xmm1,xmm5
	pandn	xmm0,XMMWORD[320+rsp]
	movdqa	xmm2,xmm5
	pandn	xmm1,XMMWORD[((320+16))+rsp]
	movdqa	xmm3,xmm5
	pand	xmm2,XMMWORD[512+rsp]
	pand	xmm3,XMMWORD[((512+16))+rsp]
	por	xmm2,xmm0
	por	xmm3,xmm1

	movdqa	xmm0,xmm4
	movdqa	xmm1,xmm4
	pandn	xmm0,xmm2
	movdqa	xmm2,xmm4
	pandn	xmm1,xmm3
	movdqa	xmm3,xmm4
	pand	xmm2,XMMWORD[416+rsp]
	pand	xmm3,XMMWORD[((416+16))+rsp]
	por	xmm2,xmm0
	por	xmm3,xmm1
	movdqu	XMMWORD[32+rdi],xmm2		; second output coordinate
	movdqu	XMMWORD[48+rdi],xmm3

$L$add_doneq:
	add	rsp,32*18+8
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbx
	pop	rbp
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret
$L$SEH_end_ecp_nistz256_point_add:
;-----------------------------------------------------------------------
; ecp_nistz256_point_add_affine  (Win64 entry point)
;
; In:   rcx -> output, 96 bytes written  (32-byte values at +0, +32, +64)
;       rdx -> first input, 96 bytes read (32-byte values at +0, +32, +64)
;       r8  -> second input, 64 bytes read (32-byte values at +0, +32)
;       The prologue moves these into rdi, rsi, rdx; the body uses those.
; Out:  the 96 bytes at the output pointer.
; Regs: rbp, rbx, r12-r15 are pushed/popped; rdi and rsi are saved to and
;       restored from the caller's shadow space. Only xmm0-xmm5 are used.
;       There are no conditional jumps in this function.
;
; NOTE(review): presumably the 32-byte values are X,Y,Z of a Jacobian
; point and x,y of an affine point, in Montgomery form modulo $L$poly
; (inferred from the function name and the use of $L$ONE_mont) - confirm
; against the C prototype.
;
; Stack frame after the prologue: fifteen 32-byte slots. The labels are
; this review's shorthand and are used in the comments below:
;   [  0+rsp] U2      [ 32+rsp] S2      [ 64+rsp] H       [ 96+rsp] R
;   [128+rsp] Hsqr    [160+rsp] Hcub    [192+rsp] Rsqr
;   [224+rsp] res_x   [256+rsp] res_y   [288+rsp] res_z
;   [320+rsp] in1_x   [352+rsp] in1_y   [384+rsp] in1_z  (copy of input 1)
;   [416+rsp] in2_x   [448+rsp] in2_y                    (copy of input 2)
;
; NOTE(review): the __ecp_nistz256_* helpers are defined outside this
; block. What is visible here is only how their operands are set up:
;   sqr_montq : rsi -> a, rax,r14,r15,r8 = a[0..3], rdi -> destination
;   mul_montq : rbx -> b, rax = b[0], rsi -> a, r9,r10,r11,r12 = a[0..3],
;               rdi -> destination
;   sub_fromq : rbx -> memory operand, rdi -> destination
;   subq      : rax,rbp,rcx,r10 = one operand, r12,r13,r8,r9 = the other
; Every "presumably" line below (what a helper computes, where it leaves
; its result) is inferred from the helper names and from which registers
; this function reads afterwards - TODO confirm in the helper bodies.
; The code also relies on the helpers leaving xmm0, xmm4 and xmm5 intact.
;-----------------------------------------------------------------------
1614global	ecp_nistz256_point_add_affine
1615
1616ALIGN	32
1617ecp_nistz256_point_add_affine:
; rdi/rsi are callee-saved on Win64; park them in the first two slots of
; the caller-allocated shadow space (just above the return address).
1618	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1619	mov	QWORD[16+rsp],rsi
; rax = rsp at entry. It is overwritten below without being read in this
; function (NOTE(review): presumably kept for the generator's SEH scheme).
1620	mov	rax,rsp
1621$L$SEH_begin_ecp_nistz256_point_add_affine:
; Remap the Win64 argument registers (rcx, rdx, r8) to rdi, rsi, rdx.
1622	mov	rdi,rcx
1623	mov	rsi,rdx
1624	mov	rdx,r8
1625
1626
; Save callee-saved GPRs and reserve the frame. Six pushes (48 bytes) plus
; 32*15+8 = 488 bytes: assuming rsp % 16 == 8 at entry (per the ABI), rsp
; is 16-byte aligned from here on, which the movdqa/pand/pandn accesses to
; the stack slots require.
1627	push	rbp
1628	push	rbx
1629	push	r12
1630	push	r13
1631	push	r14
1632	push	r15
1633	sub	rsp,32*15+8
1634
; Three interleaved streams:
;  * xmm0..xmm5 = the 96 bytes of input 1, copied to in1_x/in1_y/in1_z;
;  * rax,r14,r15,r8 = the four qwords at +64 of input 1 (operand set-up
;    for the squaring call below);
;  * xmm3 accumulates the OR of bytes 0..63 of input 1.
; rbx keeps the second-input pointer (rdx is not used again in this
; function).
1635	movdqu	xmm0,XMMWORD[rsi]
1636	mov	rbx,rdx
1637	movdqu	xmm1,XMMWORD[16+rsi]
1638	movdqu	xmm2,XMMWORD[32+rsi]
1639	movdqu	xmm3,XMMWORD[48+rsi]
1640	movdqu	xmm4,XMMWORD[64+rsi]
1641	movdqu	xmm5,XMMWORD[80+rsi]
1642	mov	rax,QWORD[((64+0))+rsi]
1643	mov	r14,QWORD[((64+8))+rsi]
1644	mov	r15,QWORD[((64+16))+rsi]
1645	mov	r8,QWORD[((64+24))+rsi]
1646	movdqa	XMMWORD[320+rsp],xmm0
1647	movdqa	XMMWORD[(320+16)+rsp],xmm1
1648	por	xmm1,xmm0
1649	movdqa	XMMWORD[352+rsp],xmm2
1650	movdqa	XMMWORD[(352+16)+rsp],xmm3
1651	por	xmm3,xmm2
1652	movdqa	XMMWORD[384+rsp],xmm4
1653	movdqa	XMMWORD[(384+16)+rsp],xmm5
1654	por	xmm3,xmm1
1655
; Same pattern for the 64 bytes of input 2 (copied to in2_x/in2_y, OR
; accumulated into xmm3), interleaved with folding the input-1 OR:
; pshufd 0xb1 swaps the dwords inside each qword, pshufd 0x1e swaps the
; qword halves, so after the two shuffle+por steps dword 0 of xmm5 is the
; OR of all 16 dwords in bytes 0..63 of input 1.
; NOTE(review): the 32-byte value at +64 of input 1 is not part of this
; test.
1656	movdqu	xmm0,XMMWORD[rbx]
1657	pshufd	xmm5,xmm3,0xb1
1658	movdqu	xmm1,XMMWORD[16+rbx]
1659	movdqu	xmm2,XMMWORD[32+rbx]
1660	por	xmm5,xmm3
1661	movdqu	xmm3,XMMWORD[48+rbx]
1662	movdqa	XMMWORD[416+rsp],xmm0
1663	pshufd	xmm4,xmm5,0x1e
1664	movdqa	XMMWORD[(416+16)+rsp],xmm1
1665	por	xmm1,xmm0
; The bytes below encode "movq xmm0,rdi" (66 48 0F 6E C7): the output
; pointer is stashed in xmm0 because rdi is reused as the destination
; pointer for every helper call. It is recovered just before the output
; selection.
1666DB	102,72,15,110,199
1667	movdqa	XMMWORD[448+rsp],xmm2
1668	movdqa	XMMWORD[(448+16)+rsp],xmm3
1669	por	xmm3,xmm2
1670	por	xmm5,xmm4
1671	pxor	xmm4,xmm4
1672	por	xmm3,xmm1
1673
; rsi -> +64 of input 1; limbs already in rax,r14,r15,r8.
; presumably: S2 = sqr_mont(in1_z)
1674	lea	rsi,[((64-0))+rsi]
1675	lea	rdi,[32+rsp]
1676	call	__ecp_nistz256_sqr_montq
1677
; Finish both masks while setting up the multiply:
;   xmm5 = all-ones if bytes 0..63 of input 1 are all zero, else 0
;   xmm4 = all-ones if the 64 bytes of input 2 are all zero, else 0
; (pcmpeqd against zero, then pshufd 0 broadcasts dword 0).
; r9,r10,r11,r12 <- r12,r13,r14,r15: presumably the squaring result is
; returned in r12..r15 and is passed on as operand a of the multiply.
1678	pcmpeqd	xmm5,xmm4
1679	pshufd	xmm4,xmm3,0xb1
1680	mov	rax,QWORD[rbx]
1681
1682	mov	r9,r12
1683	por	xmm4,xmm3
1684	pshufd	xmm5,xmm5,0
1685	pshufd	xmm3,xmm4,0x1e
1686	mov	r10,r13
1687	por	xmm4,xmm3
1688	pxor	xmm3,xmm3
1689	mov	r11,r14
1690	pcmpeqd	xmm4,xmm3
1691	pshufd	xmm4,xmm4,0
1692
; b = first 32 bytes of input 2 (rbx still points at it, rax = b[0]).
; presumably: U2 = mul_mont(S2, in2_x)
1693	lea	rsi,[((32-0))+rsp]
1694	mov	r12,r15
1695	lea	rdi,[rsp]
1696	call	__ecp_nistz256_mul_montq
1697
; presumably: H = U2 - in1_x   (minuend taken from the registers left by
; the multiply)
1698	lea	rbx,[320+rsp]
1699	lea	rdi,[64+rsp]
1700	call	__ecp_nistz256_sub_fromq
1701
; presumably: S2 = mul_mont(S2, in1_z)
1702	mov	rax,QWORD[384+rsp]
1703	lea	rbx,[384+rsp]
1704	mov	r9,QWORD[((0+32))+rsp]
1705	mov	r10,QWORD[((8+32))+rsp]
1706	lea	rsi,[((0+32))+rsp]
1707	mov	r11,QWORD[((16+32))+rsp]
1708	mov	r12,QWORD[((24+32))+rsp]
1709	lea	rdi,[32+rsp]
1710	call	__ecp_nistz256_mul_montq
1711
; presumably: res_z = mul_mont(H, in1_z)
1712	mov	rax,QWORD[384+rsp]
1713	lea	rbx,[384+rsp]
1714	mov	r9,QWORD[((0+64))+rsp]
1715	mov	r10,QWORD[((8+64))+rsp]
1716	lea	rsi,[((0+64))+rsp]
1717	mov	r11,QWORD[((16+64))+rsp]
1718	mov	r12,QWORD[((24+64))+rsp]
1719	lea	rdi,[288+rsp]
1720	call	__ecp_nistz256_mul_montq
1721
; presumably: S2 = mul_mont(S2, in2_y)
1722	mov	rax,QWORD[448+rsp]
1723	lea	rbx,[448+rsp]
1724	mov	r9,QWORD[((0+32))+rsp]
1725	mov	r10,QWORD[((8+32))+rsp]
1726	lea	rsi,[((0+32))+rsp]
1727	mov	r11,QWORD[((16+32))+rsp]
1728	mov	r12,QWORD[((24+32))+rsp]
1729	lea	rdi,[32+rsp]
1730	call	__ecp_nistz256_mul_montq
1731
; presumably: R = S2 - in1_y
1732	lea	rbx,[352+rsp]
1733	lea	rdi,[96+rsp]
1734	call	__ecp_nistz256_sub_fromq
1735
; presumably: Hsqr = sqr_mont(H)
1736	mov	rax,QWORD[((0+64))+rsp]
1737	mov	r14,QWORD[((8+64))+rsp]
1738	lea	rsi,[((0+64))+rsp]
1739	mov	r15,QWORD[((16+64))+rsp]
1740	mov	r8,QWORD[((24+64))+rsp]
1741	lea	rdi,[128+rsp]
1742	call	__ecp_nistz256_sqr_montq
1743
; presumably: Rsqr = sqr_mont(R)
1744	mov	rax,QWORD[((0+96))+rsp]
1745	mov	r14,QWORD[((8+96))+rsp]
1746	lea	rsi,[((0+96))+rsp]
1747	mov	r15,QWORD[((16+96))+rsp]
1748	mov	r8,QWORD[((24+96))+rsp]
1749	lea	rdi,[192+rsp]
1750	call	__ecp_nistz256_sqr_montq
1751
; presumably: Hcub = mul_mont(H, Hsqr)
1752	mov	rax,QWORD[128+rsp]
1753	lea	rbx,[128+rsp]
1754	mov	r9,QWORD[((0+64))+rsp]
1755	mov	r10,QWORD[((8+64))+rsp]
1756	lea	rsi,[((0+64))+rsp]
1757	mov	r11,QWORD[((16+64))+rsp]
1758	mov	r12,QWORD[((24+64))+rsp]
1759	lea	rdi,[160+rsp]
1760	call	__ecp_nistz256_mul_montq
1761
; presumably: U2 = mul_mont(Hsqr, in1_x)
1762	mov	rax,QWORD[320+rsp]
1763	lea	rbx,[320+rsp]
1764	mov	r9,QWORD[((0+128))+rsp]
1765	mov	r10,QWORD[((8+128))+rsp]
1766	lea	rsi,[((0+128))+rsp]
1767	mov	r11,QWORD[((16+128))+rsp]
1768	mov	r12,QWORD[((24+128))+rsp]
1769	lea	rdi,[rsp]
1770	call	__ecp_nistz256_mul_montq
1771
1772
1773
1774
; Inline doubling of the 256-bit value in r12,r13,r8,r9 (limbs 0..3;
; presumably the product just computed). Nothing is written to memory.
;   - add/adc doubles the value; sbb r11,r11 turns the carry out of the
;     top limb into a mask (all-ones on carry, else 0). The mov/lea in
;     between do not touch the flags.
;   - rax,rbp,rcx,r10 snapshot the doubled limbs before the subtraction.
;   - sub/sbb subtract the constant with limbs (-1, r14, 0, r15). Limbs 0
;     and 2 match $L$poly; NOTE(review): r14/r15 presumably still hold
;     $L$poly limbs 1 and 3 as left by the multiply helper - confirm.
;   - test/cmovz: when r11 == 0 (no carry) the snapshot is restored, so
;     the subtraction only takes effect when the doubling carried out.
; Interleaved: rsi -> Rsqr, and as each snapshot register is released by
; its cmovz it is reloaded with the matching limb of Rsqr for the call.
1775	add	r12,r12
1776	lea	rsi,[192+rsp]
1777	adc	r13,r13
1778	mov	rax,r12
1779	adc	r8,r8
1780	adc	r9,r9
1781	mov	rbp,r13
1782	sbb	r11,r11
1783
1784	sub	r12,-1
1785	mov	rcx,r8
1786	sbb	r13,r14
1787	sbb	r8,0
1788	mov	r10,r9
1789	sbb	r9,r15
1790	test	r11,r11
1791
1792	cmovz	r12,rax
1793	mov	rax,QWORD[rsi]
1794	cmovz	r13,rbp
1795	mov	rbp,QWORD[8+rsi]
1796	cmovz	r8,rcx
1797	mov	rcx,QWORD[16+rsi]
1798	cmovz	r9,r10
1799	mov	r10,QWORD[24+rsi]
1800
; presumably: (r12,r13,r8,r9) = Rsqr - 2*U2, result left in registers only
; (U2 at [0+rsp] is read again further down, and rdi still points at it).
1801	call	__ecp_nistz256_subq
1802
; presumably: res_x = (r12,r13,r8,r9) - Hcub
1803	lea	rbx,[160+rsp]
1804	lea	rdi,[224+rsp]
1805	call	__ecp_nistz256_sub_fromq
1806
; rax,rbp,rcx,r10 = U2; presumably r12,r13,r8,r9 still hold res_x.
1807	mov	rax,QWORD[((0+0))+rsp]
1808	mov	rbp,QWORD[((0+8))+rsp]
1809	mov	rcx,QWORD[((0+16))+rsp]
1810	mov	r10,QWORD[((0+24))+rsp]
1811	lea	rdi,[64+rsp]
1812
; presumably: (r12,r13,r8,r9) = U2 - res_x
1813	call	__ecp_nistz256_subq
1814
; Store that difference into the H slot explicitly (rdi = &[64+rsp] was
; set before the call), then set up the next multiply.
; presumably: S2 = mul_mont(Hcub, in1_y)
1815	mov	QWORD[rdi],r12
1816	mov	QWORD[8+rdi],r13
1817	mov	QWORD[16+rdi],r8
1818	mov	QWORD[24+rdi],r9
1819	mov	rax,QWORD[352+rsp]
1820	lea	rbx,[352+rsp]
1821	mov	r9,QWORD[((0+160))+rsp]
1822	mov	r10,QWORD[((8+160))+rsp]
1823	lea	rsi,[((0+160))+rsp]
1824	mov	r11,QWORD[((16+160))+rsp]
1825	mov	r12,QWORD[((24+160))+rsp]
1826	lea	rdi,[32+rsp]
1827	call	__ecp_nistz256_mul_montq
1828
; presumably: H = mul_mont(H, R)   (source and destination are both the
; [64+rsp] slot)
1829	mov	rax,QWORD[96+rsp]
1830	lea	rbx,[96+rsp]
1831	mov	r9,QWORD[((0+64))+rsp]
1832	mov	r10,QWORD[((8+64))+rsp]
1833	lea	rsi,[((0+64))+rsp]
1834	mov	r11,QWORD[((16+64))+rsp]
1835	mov	r12,QWORD[((24+64))+rsp]
1836	lea	rdi,[64+rsp]
1837	call	__ecp_nistz256_mul_montq
1838
; presumably: res_y = H - S2
1839	lea	rbx,[32+rsp]
1840	lea	rdi,[256+rsp]
1841	call	__ecp_nistz256_sub_fromq
1842
; The bytes below encode "movq rdi,xmm0" (66 48 0F 7E C7): recover the
; output pointer stashed before the first helper call.
1843DB	102,72,15,126,199
1844
; Output selection with masks instead of branches. With m1 = xmm5 and
; m2 = xmm4 (each all-ones or all-zero), every 16-byte half is computed as
;     t   = (m1 & A) | (~m1 & computed)
;     out = (m2 & B) | (~m2 & t)
; i.e. B if input 2 tested as all-zero, else A if bytes 0..63 of input 1
; tested as all-zero, else the computed value.
;
; Output +64: computed = res_z, A = $L$ONE_mont, B = in1_z.
1845	movdqa	xmm0,xmm5
1846	movdqa	xmm1,xmm5
1847	pandn	xmm0,XMMWORD[288+rsp]
1848	movdqa	xmm2,xmm5
1849	pandn	xmm1,XMMWORD[((288+16))+rsp]
1850	movdqa	xmm3,xmm5
1851	pand	xmm2,XMMWORD[$L$ONE_mont]
1852	pand	xmm3,XMMWORD[(($L$ONE_mont+16))]
1853	por	xmm2,xmm0
1854	por	xmm3,xmm1
1855
1856	movdqa	xmm0,xmm4
1857	movdqa	xmm1,xmm4
1858	pandn	xmm0,xmm2
1859	movdqa	xmm2,xmm4
1860	pandn	xmm1,xmm3
1861	movdqa	xmm3,xmm4
1862	pand	xmm2,XMMWORD[384+rsp]
1863	pand	xmm3,XMMWORD[((384+16))+rsp]
1864	por	xmm2,xmm0
1865	por	xmm3,xmm1
1866	movdqu	XMMWORD[64+rdi],xmm2
1867	movdqu	XMMWORD[80+rdi],xmm3
1868
; Output +0: computed = res_x, A = in2_x, B = in1_x.
1869	movdqa	xmm0,xmm5
1870	movdqa	xmm1,xmm5
1871	pandn	xmm0,XMMWORD[224+rsp]
1872	movdqa	xmm2,xmm5
1873	pandn	xmm1,XMMWORD[((224+16))+rsp]
1874	movdqa	xmm3,xmm5
1875	pand	xmm2,XMMWORD[416+rsp]
1876	pand	xmm3,XMMWORD[((416+16))+rsp]
1877	por	xmm2,xmm0
1878	por	xmm3,xmm1
1879
1880	movdqa	xmm0,xmm4
1881	movdqa	xmm1,xmm4
1882	pandn	xmm0,xmm2
1883	movdqa	xmm2,xmm4
1884	pandn	xmm1,xmm3
1885	movdqa	xmm3,xmm4
1886	pand	xmm2,XMMWORD[320+rsp]
1887	pand	xmm3,XMMWORD[((320+16))+rsp]
1888	por	xmm2,xmm0
1889	por	xmm3,xmm1
1890	movdqu	XMMWORD[rdi],xmm2
1891	movdqu	XMMWORD[16+rdi],xmm3
1892
; Output +32: computed = res_y, A = in2_y, B = in1_y.
1893	movdqa	xmm0,xmm5
1894	movdqa	xmm1,xmm5
1895	pandn	xmm0,XMMWORD[256+rsp]
1896	movdqa	xmm2,xmm5
1897	pandn	xmm1,XMMWORD[((256+16))+rsp]
1898	movdqa	xmm3,xmm5
1899	pand	xmm2,XMMWORD[448+rsp]
1900	pand	xmm3,XMMWORD[((448+16))+rsp]
1901	por	xmm2,xmm0
1902	por	xmm3,xmm1
1903
1904	movdqa	xmm0,xmm4
1905	movdqa	xmm1,xmm4
1906	pandn	xmm0,xmm2
1907	movdqa	xmm2,xmm4
1908	pandn	xmm1,xmm3
1909	movdqa	xmm3,xmm4
1910	pand	xmm2,XMMWORD[352+rsp]
1911	pand	xmm3,XMMWORD[((352+16))+rsp]
1912	por	xmm2,xmm0
1913	por	xmm3,xmm1
1914	movdqu	XMMWORD[32+rdi],xmm2
1915	movdqu	XMMWORD[48+rdi],xmm3
1916
; Epilogue: release the frame and pop in reverse push order. After the
; pops rsp is back at its entry value, so [8+rsp]/[16+rsp] are the
; shadow-space slots where rdi/rsi were parked by the prologue.
1917	add	rsp,32*15+8
1918	pop	r15
1919	pop	r14
1920	pop	r13
1921	pop	r12
1922	pop	rbx
1923	pop	rbp
1924	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1925	mov	rsi,QWORD[16+rsp]
1926	DB	0F3h,0C3h		;repret
1927$L$SEH_end_ecp_nistz256_point_add_affine:
1928