1default	rel
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section	.text code align=64
6
7
8EXTERN	OPENSSL_ia32cap_P
9
global	rsaz_512_sqr

;-----------------------------------------------------------------------
; rsaz_512_sqr -- repeated 512-bit squaring followed by reduction
;                 (Win64 entry point).
;
; The Win64 arguments are first moved into the registers the body uses:
;   rdi       <- rcx       destination, eight qwords (written by
;                          __rsaz_512_subtract)
;   rsi       <- rdx       operand, eight qwords, least-significant first
;   rbp       <- r8        eight qwords read by __rsaz_512_reduce and
;                          __rsaz_512_subtract (presumably the modulus)
;   [128+rsp] <- r9        word multiplied into the low limb by
;                          __rsaz_512_reduce (presumably Montgomery n0)
;   r8d       <- [40+rsp]  pass count; the loop is do/while shaped, so the
;                          count must be non-zero
;
; Frame: six pushes plus 128+24 bytes.  [rsp..rsp+127] holds the 16-limb
; square, [rsp+128] the reduce word, [rsp+136] the remaining pass count.
; se_handler depends on this exact layout.
;
; Squaring scheme.  With C = sum over i<j of a[i]*a[j]*2^(64*(i+j)),
; a^2 = 2*C + sum over k of a[k]^2 * 2^(128*k).  Pass k (k = 0..7) adds the
; products a[k]*a[k+1..7] into the running limbs of C, after which limbs 2k
; and 2k+1 of C are final.  Those two limbs are then doubled, a[k]^2 and the
; carry from the previous pass are added, and the carry-out (0..2) is kept in
; rcx/rbx alternately for the next pass.
;
; Earlier revisions doubled one limb ahead and finished each pass with
; `adc reg,0`, discarding that instruction's carry-out; a doubled limb of
; all ones plus an incoming carry therefore lost a bit (CVE-2019-1551).
;
; Clobbers: rax, rcx, rdx, r8-r11, xmm-free; callee-saved registers are
; restored on exit.
;-----------------------------------------------------------------------
ALIGN	32
rsaz_512_sqr:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_rsaz_512_sqr:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]


	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15

	sub	rsp,128+24
$L$sqr_body:
	mov	rbp,rdx
	mov	rdx,QWORD[rsi]		; a[0], preloaded for the first pass
	mov	rax,QWORD[8+rsi]	; a[1]
	mov	QWORD[128+rsp],rcx	; word read by __rsaz_512_reduce
	jmp	NEAR $L$oop_sqr

ALIGN	32
$L$oop_sqr:
	mov	DWORD[((128+8))+rsp],r8d	; remaining pass count

	; ---- pass 0: a[0]*a[1..7] -> C limbs 1..8 in r8..r15 ----
	mov	rbx,rdx
	mul	rdx
	mov	r8,rax
	mov	rax,QWORD[16+rsi]
	mov	r9,rdx

	mul	rbx
	add	r9,rax
	mov	rax,QWORD[24+rsi]
	mov	r10,rdx
	adc	r10,0

	mul	rbx
	add	r10,rax
	mov	rax,QWORD[32+rsi]
	mov	r11,rdx
	adc	r11,0

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[40+rsi]
	mov	r12,rdx
	adc	r12,0

	mul	rbx
	add	r12,rax
	mov	rax,QWORD[48+rsi]
	mov	r13,rdx
	adc	r13,0

	mul	rbx
	add	r13,rax
	mov	rax,QWORD[56+rsi]
	mov	r14,rdx
	adc	r14,0

	mul	rbx
	add	r14,rax
	mov	rax,rbx
	mov	r15,rdx
	adc	r15,0

	; result limbs 0,1 = a[0]^2 + 2*C[1]*2^64, carry-out in rcx
	xor	ecx,ecx
	add	r8,r8			; rcx:r8 = C[1] << 1
	adc	rcx,0

	mul	rax			; rdx:rax = a[0]^2
	mov	QWORD[rsp],rax
	add	r8,rdx
	adc	rcx,0			; rcx = carry into limb 2 (0..2)

	mov	QWORD[8+rsp],r8

	; ---- pass 1: a[1]*a[2..7] -> C limbs 3..9 (r10..r15, r8) ----
	mov	r8,QWORD[8+rsi]
	mov	rax,QWORD[16+rsi]
	mul	r8
	add	r10,rax
	mov	rax,QWORD[24+rsi]
	mov	rbx,rdx
	adc	rbx,0

	mul	r8
	add	r11,rax
	mov	rax,QWORD[32+rsi]
	adc	rdx,0
	add	r11,rbx
	mov	rbx,rdx
	adc	rbx,0

	mul	r8
	add	r12,rax
	mov	rax,QWORD[40+rsi]
	adc	rdx,0
	add	r12,rbx
	mov	rbx,rdx
	adc	rbx,0

	mul	r8
	add	r13,rax
	mov	rax,QWORD[48+rsi]
	adc	rdx,0
	add	r13,rbx
	mov	rbx,rdx
	adc	rbx,0

	mul	r8
	add	r14,rax
	mov	rax,QWORD[56+rsi]
	adc	rdx,0
	add	r14,rbx
	mov	rbx,rdx
	adc	rbx,0

	mul	r8
	add	r15,rax
	mov	rax,r8
	adc	rdx,0
	add	r15,rbx
	mov	r8,rdx
	adc	r8,0

	; result limbs 2,3 = 2*C[3:2] + a[1]^2 + rcx, carry-out in rbx
	xor	ebx,ebx
	add	r9,r9			; rbx:r10:r9 = r10:r9 << 1
	adc	r10,r10
	adc	rbx,0

	mul	rax			; rdx:rax = a[1]^2
	; The low word of a square is at most 2^64-7 and rcx <= 2,
	; so this addition cannot carry.
	add	rax,rcx
	add	r9,rax
	adc	r10,rdx
	adc	rbx,0

	mov	QWORD[16+rsp],r9
	mov	QWORD[24+rsp],r10

	; ---- pass 2: a[2]*a[3..7] -> C limbs 5..10 (r12..r15, r8, r9) ----
	mov	r9,QWORD[16+rsi]
	mov	rax,QWORD[24+rsi]
	mul	r9
	add	r12,rax
	mov	rax,QWORD[32+rsi]
	mov	rcx,rdx
	adc	rcx,0

	mul	r9
	add	r13,rax
	mov	rax,QWORD[40+rsi]
	adc	rdx,0
	add	r13,rcx
	mov	rcx,rdx
	adc	rcx,0

	mul	r9
	add	r14,rax
	mov	rax,QWORD[48+rsi]
	adc	rdx,0
	add	r14,rcx
	mov	rcx,rdx
	adc	rcx,0

	mul	r9
	add	r15,rax
	mov	rax,QWORD[56+rsi]
	adc	rdx,0
	add	r15,rcx
	mov	rcx,rdx
	adc	rcx,0

	mul	r9
	add	r8,rax
	mov	rax,r9
	adc	rdx,0
	add	r8,rcx
	mov	r9,rdx
	adc	r9,0

	; result limbs 4,5 = 2*C[5:4] + a[2]^2 + rbx, carry-out in rcx
	xor	ecx,ecx
	add	r11,r11			; rcx:r12:r11 = r12:r11 << 1
	adc	r12,r12
	adc	rcx,0

	mul	rax			; rdx:rax = a[2]^2
	add	rax,rbx			; cannot carry (see pass 1)
	add	r11,rax
	adc	r12,rdx
	adc	rcx,0

	mov	QWORD[32+rsp],r11
	mov	QWORD[40+rsp],r12

	; ---- pass 3: a[3]*a[4..7] -> C limbs 7..11 (r14, r15, r8, r9, r10) ----
	mov	r10,QWORD[24+rsi]
	mov	rax,QWORD[32+rsi]
	mul	r10
	add	r14,rax
	mov	rax,QWORD[40+rsi]
	mov	rbx,rdx
	adc	rbx,0

	mul	r10
	add	r15,rax
	mov	rax,QWORD[48+rsi]
	adc	rdx,0
	add	r15,rbx
	mov	rbx,rdx
	adc	rbx,0

	mul	r10
	add	r8,rax
	mov	rax,QWORD[56+rsi]
	adc	rdx,0
	add	r8,rbx
	mov	rbx,rdx
	adc	rbx,0

	mul	r10
	add	r9,rax
	mov	rax,r10
	adc	rdx,0
	add	r9,rbx
	mov	r10,rdx
	adc	r10,0

	; result limbs 6,7 = 2*C[7:6] + a[3]^2 + rcx, carry-out in rbx
	xor	ebx,ebx
	add	r13,r13			; rbx:r14:r13 = r14:r13 << 1
	adc	r14,r14
	adc	rbx,0

	mul	rax			; rdx:rax = a[3]^2
	add	rax,rcx			; cannot carry (see pass 1)
	add	r13,rax
	adc	r14,rdx
	adc	rbx,0

	mov	QWORD[48+rsp],r13
	mov	QWORD[56+rsp],r14

	; ---- pass 4: a[4]*a[5..7] -> C limbs 9..12 (r8, r9, r10, r11) ----
	mov	r11,QWORD[32+rsi]
	mov	rax,QWORD[40+rsi]
	mul	r11
	add	r8,rax
	mov	rax,QWORD[48+rsi]
	mov	rcx,rdx
	adc	rcx,0

	mul	r11
	add	r9,rax
	mov	rax,QWORD[56+rsi]
	adc	rdx,0
	add	r9,rcx
	mov	rcx,rdx
	adc	rcx,0

	mul	r11
	add	r10,rax
	mov	rax,r11
	adc	rdx,0
	add	r10,rcx
	mov	r11,rdx
	adc	r11,0

	; result limbs 8,9 = 2*C[9:8] + a[4]^2 + rbx, carry-out in rcx
	xor	ecx,ecx
	add	r15,r15			; rcx:r8:r15 = r8:r15 << 1
	adc	r8,r8
	adc	rcx,0

	mul	rax			; rdx:rax = a[4]^2
	add	rax,rbx			; cannot carry (see pass 1)
	add	r15,rax
	adc	r8,rdx
	adc	rcx,0

	mov	QWORD[64+rsp],r15
	mov	QWORD[72+rsp],r8

	; ---- pass 5: a[5]*a[6..7] -> C limbs 11..13 (r10, r11, r12) ----
	mov	r12,QWORD[40+rsi]
	mov	rax,QWORD[48+rsi]
	mul	r12
	add	r10,rax
	mov	rax,QWORD[56+rsi]
	mov	rbx,rdx
	adc	rbx,0

	mul	r12
	add	r11,rax
	mov	rax,r12
	adc	rdx,0
	add	r11,rbx
	mov	r12,rdx
	adc	r12,0

	; result limbs 10,11 = 2*C[11:10] + a[5]^2 + rcx, carry-out in rbx
	xor	ebx,ebx
	add	r9,r9			; rbx:r10:r9 = r10:r9 << 1
	adc	r10,r10
	adc	rbx,0

	mul	rax			; rdx:rax = a[5]^2
	add	rax,rcx			; cannot carry (see pass 1)
	add	r9,rax
	adc	r10,rdx
	adc	rbx,0

	mov	QWORD[80+rsp],r9
	mov	QWORD[88+rsp],r10

	; ---- pass 6: a[6]*a[7] -> C limbs 13,14 (r12, r13) ----
	mov	r13,QWORD[48+rsi]
	mov	rax,QWORD[56+rsi]
	mul	r13
	add	r12,rax
	mov	rax,r13
	mov	r13,rdx
	adc	r13,0

	; result limbs 12,13 = 2*C[13:12] + a[6]^2 + rbx, carry-out in rcx
	xor	ecx,ecx
	add	r11,r11			; rcx:r12:r11 = r12:r11 << 1
	adc	r12,r12
	adc	rcx,0

	mul	rax			; rdx:rax = a[6]^2
	add	rax,rbx			; cannot carry (see pass 1)
	add	r11,rax
	adc	r12,rdx
	adc	rcx,0

	mov	QWORD[96+rsp],r11
	mov	QWORD[104+rsp],r12

	; ---- pass 7: result limbs 14,15 = 2*C[14] + a[7]^2 + rcx ----
	mov	rax,QWORD[56+rsi]
	xor	ebx,ebx
	add	r13,r13			; rbx:r13 = C[14] << 1
	adc	rbx,0

	mul	rax			; rdx:rax = a[7]^2
	add	rax,rcx			; cannot carry (see pass 1)
	add	r13,rax
	adc	rdx,rbx			; a^2 < 2^1024, so no carry leaves limb 15

	mov	QWORD[112+rsp],r13
	mov	QWORD[120+rsp],rdx

	; Reduce the low eight limbs, then fold in the high eight.
	mov	r8,QWORD[rsp]
	mov	r9,QWORD[8+rsp]
	mov	r10,QWORD[16+rsp]
	mov	r11,QWORD[24+rsp]
	mov	r12,QWORD[32+rsp]
	mov	r13,QWORD[40+rsp]
	mov	r14,QWORD[48+rsp]
	mov	r15,QWORD[56+rsp]

	call	__rsaz_512_reduce

	add	r8,QWORD[64+rsp]
	adc	r9,QWORD[72+rsp]
	adc	r10,QWORD[80+rsp]
	adc	r11,QWORD[88+rsp]
	adc	r12,QWORD[96+rsp]
	adc	r13,QWORD[104+rsp]
	adc	r14,QWORD[112+rsp]
	adc	r15,QWORD[120+rsp]
	sbb	rcx,rcx			; rcx = all-ones if the sum carried, else 0

	call	__rsaz_512_subtract	; stores to [rdi]; leaves the result in r8..r15

	; Set up the next pass: the result just written becomes the operand.
	mov	rdx,r8			; a[0]
	mov	rax,r9			; a[1]
	mov	r8d,DWORD[((128+8))+rsp]
	mov	rsi,rdi

	dec	r8d
	jnz	NEAR $L$oop_sqr

	lea	rax,[((128+24+48))+rsp]
	mov	r15,QWORD[((-48))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	rbp,QWORD[((-16))+rax]
	mov	rbx,QWORD[((-8))+rax]
	lea	rsp,[rax]
$L$sqr_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret
$L$SEH_end_rsaz_512_sqr:
415global	rsaz_512_mul
416
; rsaz_512_mul -- 512-bit multiply followed by reduction (Win64 entry point).
;
; The Win64 arguments are moved into the registers the body uses:
;   rdi <- rcx       destination, eight qwords (written by __rsaz_512_subtract)
;   rsi <- rdx       first operand, eight qwords (read by __rsaz_512_mul)
;   rdx <- r8        second operand, eight qwords (walked via rbp by __rsaz_512_mul)
;   rcx <- r9        eight qwords used by reduce/subtract (presumably the modulus)
;   r8  <- [40+rsp]  word stored at [128+rsp] for __rsaz_512_reduce
;                    (presumably Montgomery n0 -- confirm with callers)
;
; Frame: six pushes plus 128+24 bytes; [rsp..rsp+127] receives the 16-limb
; product.  se_handler relies on this layout.
417ALIGN	32
418rsaz_512_mul:
419	mov	QWORD[8+rsp],rdi	;WIN64 prologue
420	mov	QWORD[16+rsp],rsi
421	mov	rax,rsp
422$L$SEH_begin_rsaz_512_mul:
423	mov	rdi,rcx
424	mov	rsi,rdx
425	mov	rdx,r8
426	mov	rcx,r9
427	mov	r8,QWORD[40+rsp]
428
429
430	push	rbx
431	push	rbp
432	push	r12
433	push	r13
434	push	r14
435	push	r15
436
437	sub	rsp,128+24
438$L$mul_body:
; rdi and rcx are parked in xmm0/xmm1 because __rsaz_512_mul overwrites
; rdi, rbp and rcx.
439DB	102,72,15,110,199	; movq xmm0,rdi
440DB	102,72,15,110,201	; movq xmm1,rcx
441	mov	QWORD[128+rsp],r8
442	mov	rbx,QWORD[rdx]	; first limb of the second operand
443	mov	rbp,rdx
444	call	__rsaz_512_mul
445
446DB	102,72,15,126,199	; movq rdi,xmm0
447DB	102,72,15,126,205	; movq rbp,xmm1
448
; Low eight limbs of the product go through __rsaz_512_reduce.
449	mov	r8,QWORD[rsp]
450	mov	r9,QWORD[8+rsp]
451	mov	r10,QWORD[16+rsp]
452	mov	r11,QWORD[24+rsp]
453	mov	r12,QWORD[32+rsp]
454	mov	r13,QWORD[40+rsp]
455	mov	r14,QWORD[48+rsp]
456	mov	r15,QWORD[56+rsp]
457
458	call	__rsaz_512_reduce
; Fold in the high eight limbs of the product.
459	add	r8,QWORD[64+rsp]
460	adc	r9,QWORD[72+rsp]
461	adc	r10,QWORD[80+rsp]
462	adc	r11,QWORD[88+rsp]
463	adc	r12,QWORD[96+rsp]
464	adc	r13,QWORD[104+rsp]
465	adc	r14,QWORD[112+rsp]
466	adc	r15,QWORD[120+rsp]
467	sbb	rcx,rcx	; rcx = all-ones if the addition carried, else 0
468
469	call	__rsaz_512_subtract
470
; Restore callee-saved registers from above the 128+24-byte local area.
471	lea	rax,[((128+24+48))+rsp]
472	mov	r15,QWORD[((-48))+rax]
473	mov	r14,QWORD[((-40))+rax]
474	mov	r13,QWORD[((-32))+rax]
475	mov	r12,QWORD[((-24))+rax]
476	mov	rbp,QWORD[((-16))+rax]
477	mov	rbx,QWORD[((-8))+rax]
478	lea	rsp,[rax]
479$L$mul_epilogue:
480	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
481	mov	rsi,QWORD[16+rsp]
482	DB	0F3h,0C3h		;repret
483$L$SEH_end_rsaz_512_mul:
484global	rsaz_512_mul_gather4
485
; rsaz_512_mul_gather4 -- multiply by a value gathered from an interleaved
; table, then reduce (Win64 entry point).
;
; The Win64 arguments are moved into the registers the body uses:
;   rdi <- rcx       destination, eight qwords (written by __rsaz_512_subtract)
;   rsi <- rdx       first operand, eight qwords
;   rdx <- r8        table base
;   rcx <- r9        eight qwords used by reduce/subtract (presumably the modulus)
;   r8  <- [40+rsp]  word stored at [128+rsp] for __rsaz_512_reduce
;   r9  <- [48+rsp]  table index, zero-extended from 32 bits
;
; Limb i of the second operand is assembled from two dwords:
;   low  half at table + 4*index + 128*i
;   high half at table + 4*index + 128*i + 64
;
; NOTE(review): the load addresses depend on the index, so the cache lines
; touched vary with it.  If the index is secret this may be a timing
; side channel -- confirm against the threat model.
486ALIGN	32
487rsaz_512_mul_gather4:
488	mov	QWORD[8+rsp],rdi	;WIN64 prologue
489	mov	QWORD[16+rsp],rsi
490	mov	rax,rsp
491$L$SEH_begin_rsaz_512_mul_gather4:
492	mov	rdi,rcx
493	mov	rsi,rdx
494	mov	rdx,r8
495	mov	rcx,r9
496	mov	r8,QWORD[40+rsp]
497	mov	r9,QWORD[48+rsp]
498
499
500	push	rbx
501	push	rbp
502	push	r12
503	push	r13
504	push	r14
505	push	r15
506
507	mov	r9d,r9d	; zero-extend the 32-bit index
508	sub	rsp,128+24
509$L$mul_gather4_body:
510	mov	eax,DWORD[64+r9*4+rdx]	; high half of limb 0
511DB	102,72,15,110,199	; movq xmm0,rdi
512	mov	ebx,DWORD[r9*4+rdx]	; low half of limb 0
513DB	102,72,15,110,201	; movq xmm1,rcx
514	mov	QWORD[128+rsp],r8
515
516	shl	rax,32
517	or	rbx,rax	; rbx = limb 0 of the gathered operand
518	mov	rax,QWORD[rsi]
519	mov	rcx,QWORD[8+rsi]
520	lea	rbp,[128+r9*4+rdx]	; rbp -> low half of limb 1
521	mul	rbx
522	mov	QWORD[rsp],rax	; product limb 0
523	mov	rax,rcx
524	mov	r8,rdx
525
; First row: the gather of limb 1 into xmm4 is interleaved with the multiplies.
526	mul	rbx
527	movd	xmm4,DWORD[rbp]
528	add	r8,rax
529	mov	rax,QWORD[16+rsi]
530	mov	r9,rdx
531	adc	r9,0
532
533	mul	rbx
534	movd	xmm5,DWORD[64+rbp]
535	add	r9,rax
536	mov	rax,QWORD[24+rsi]
537	mov	r10,rdx
538	adc	r10,0
539
540	mul	rbx
541	pslldq	xmm5,4	; move the high half into bits 32..63
542	add	r10,rax
543	mov	rax,QWORD[32+rsi]
544	mov	r11,rdx
545	adc	r11,0
546
547	mul	rbx
548	por	xmm4,xmm5	; xmm4 = next 64-bit limb
549	add	r11,rax
550	mov	rax,QWORD[40+rsi]
551	mov	r12,rdx
552	adc	r12,0
553
554	mul	rbx
555	add	r12,rax
556	mov	rax,QWORD[48+rsi]
557	mov	r13,rdx
558	adc	r13,0
559
560	mul	rbx
561	lea	rbp,[128+rbp]	; advance to the following limb's slot
562	add	r13,rax
563	mov	rax,QWORD[56+rsi]
564	mov	r14,rdx
565	adc	r14,0
566
567	mul	rbx
568DB	102,72,15,126,227	; movq rbx,xmm4 -- multiplier for the next row
569	add	r14,rax
570	mov	rax,QWORD[rsi]
571	mov	r15,rdx
572	adc	r15,0
573
574	lea	rdi,[8+rsp]	; rdi -> product limb 1
575	mov	ecx,7	; seven remaining rows
576	jmp	NEAR $L$oop_mul_gather
577
578ALIGN	32
; Each pass multiplies the first operand by the limb in rbx, adds it to the
; running limbs r8..r15, stores the finished low limb at [rdi], and gathers
; the following limb into xmm4/rbx.
; NOTE(review): the last pass also loads a limb slot beyond the eighth
; (the value is never used); confirm the table has readable bytes there.
579$L$oop_mul_gather:
580	mul	rbx
581	add	r8,rax
582	mov	rax,QWORD[8+rsi]
583	mov	QWORD[rdi],r8
584	mov	r8,rdx
585	adc	r8,0
586
587	mul	rbx
588	movd	xmm4,DWORD[rbp]
589	add	r9,rax
590	mov	rax,QWORD[16+rsi]
591	adc	rdx,0
592	add	r8,r9
593	mov	r9,rdx
594	adc	r9,0
595
596	mul	rbx
597	movd	xmm5,DWORD[64+rbp]
598	add	r10,rax
599	mov	rax,QWORD[24+rsi]
600	adc	rdx,0
601	add	r9,r10
602	mov	r10,rdx
603	adc	r10,0
604
605	mul	rbx
606	pslldq	xmm5,4
607	add	r11,rax
608	mov	rax,QWORD[32+rsi]
609	adc	rdx,0
610	add	r10,r11
611	mov	r11,rdx
612	adc	r11,0
613
614	mul	rbx
615	por	xmm4,xmm5
616	add	r12,rax
617	mov	rax,QWORD[40+rsi]
618	adc	rdx,0
619	add	r11,r12
620	mov	r12,rdx
621	adc	r12,0
622
623	mul	rbx
624	add	r13,rax
625	mov	rax,QWORD[48+rsi]
626	adc	rdx,0
627	add	r12,r13
628	mov	r13,rdx
629	adc	r13,0
630
631	mul	rbx
632	add	r14,rax
633	mov	rax,QWORD[56+rsi]
634	adc	rdx,0
635	add	r13,r14
636	mov	r14,rdx
637	adc	r14,0
638
639	mul	rbx
640DB	102,72,15,126,227	; movq rbx,xmm4
641	add	r15,rax
642	mov	rax,QWORD[rsi]
643	adc	rdx,0
644	add	r14,r15
645	mov	r15,rdx
646	adc	r15,0
647
648	lea	rbp,[128+rbp]
649	lea	rdi,[8+rdi]
650
651	dec	ecx
652	jnz	NEAR $L$oop_mul_gather
653
; Store the high eight limbs of the product.
654	mov	QWORD[rdi],r8
655	mov	QWORD[8+rdi],r9
656	mov	QWORD[16+rdi],r10
657	mov	QWORD[24+rdi],r11
658	mov	QWORD[32+rdi],r12
659	mov	QWORD[40+rdi],r13
660	mov	QWORD[48+rdi],r14
661	mov	QWORD[56+rdi],r15
662
663DB	102,72,15,126,199	; movq rdi,xmm0
664DB	102,72,15,126,205	; movq rbp,xmm1
665
666	mov	r8,QWORD[rsp]
667	mov	r9,QWORD[8+rsp]
668	mov	r10,QWORD[16+rsp]
669	mov	r11,QWORD[24+rsp]
670	mov	r12,QWORD[32+rsp]
671	mov	r13,QWORD[40+rsp]
672	mov	r14,QWORD[48+rsp]
673	mov	r15,QWORD[56+rsp]
674
675	call	__rsaz_512_reduce
676	add	r8,QWORD[64+rsp]
677	adc	r9,QWORD[72+rsp]
678	adc	r10,QWORD[80+rsp]
679	adc	r11,QWORD[88+rsp]
680	adc	r12,QWORD[96+rsp]
681	adc	r13,QWORD[104+rsp]
682	adc	r14,QWORD[112+rsp]
683	adc	r15,QWORD[120+rsp]
684	sbb	rcx,rcx	; rcx = all-ones if the addition carried, else 0
685
686	call	__rsaz_512_subtract
687
688	lea	rax,[((128+24+48))+rsp]
689	mov	r15,QWORD[((-48))+rax]
690	mov	r14,QWORD[((-40))+rax]
691	mov	r13,QWORD[((-32))+rax]
692	mov	r12,QWORD[((-24))+rax]
693	mov	rbp,QWORD[((-16))+rax]
694	mov	rbx,QWORD[((-8))+rax]
695	lea	rsp,[rax]
696$L$mul_gather4_epilogue:
697	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
698	mov	rsi,QWORD[16+rsp]
699	DB	0F3h,0C3h		;repret
700$L$SEH_end_rsaz_512_mul_gather4:
701global	rsaz_512_mul_scatter4
702
; rsaz_512_mul_scatter4 -- multiply the destination buffer by an operand,
; reduce, write the result back to the destination and also scatter it into
; an interleaved table (Win64 entry point).
;
; The Win64 arguments are moved into the registers the body uses:
;   rdi <- rcx       in/out buffer, eight qwords (one multiplicand and the result)
;   rsi <- rdx       other multiplicand, eight qwords
;   rdx <- r8        eight qwords used by reduce/subtract (presumably the modulus)
;   rcx <- r9        word stored at [128+rsp] for __rsaz_512_reduce
;   r8  <- [40+rsp]  table base
;   r9  <- [48+rsp]  table index, zero-extended from 32 bits
;
; Table layout written here: limb i has its low dword at
; table + 4*index + 128*i and its high dword 64 bytes after that.
703ALIGN	32
704rsaz_512_mul_scatter4:
705	mov	QWORD[8+rsp],rdi	;WIN64 prologue
706	mov	QWORD[16+rsp],rsi
707	mov	rax,rsp
708$L$SEH_begin_rsaz_512_mul_scatter4:
709	mov	rdi,rcx
710	mov	rsi,rdx
711	mov	rdx,r8
712	mov	rcx,r9
713	mov	r8,QWORD[40+rsp]
714	mov	r9,QWORD[48+rsp]
715
716
717	push	rbx
718	push	rbp
719	push	r12
720	push	r13
721	push	r14
722	push	r15
723
724	mov	r9d,r9d	; zero-extend the 32-bit index
725	sub	rsp,128+24
726$L$mul_scatter4_body:
727	lea	r8,[r9*4+r8]	; r8 = table + 4*index
728DB	102,72,15,110,199	; movq xmm0,rdi
729DB	102,72,15,110,202	; movq xmm1,rdx
730DB	102,73,15,110,208	; movq xmm2,r8
731	mov	QWORD[128+rsp],rcx
732
733	mov	rbp,rdi	; the destination buffer doubles as the second multiplicand
734	mov	rbx,QWORD[rdi]
735	call	__rsaz_512_mul
736
737DB	102,72,15,126,199	; movq rdi,xmm0
738DB	102,72,15,126,205	; movq rbp,xmm1
739
740	mov	r8,QWORD[rsp]
741	mov	r9,QWORD[8+rsp]
742	mov	r10,QWORD[16+rsp]
743	mov	r11,QWORD[24+rsp]
744	mov	r12,QWORD[32+rsp]
745	mov	r13,QWORD[40+rsp]
746	mov	r14,QWORD[48+rsp]
747	mov	r15,QWORD[56+rsp]
748
749	call	__rsaz_512_reduce
750	add	r8,QWORD[64+rsp]
751	adc	r9,QWORD[72+rsp]
752	adc	r10,QWORD[80+rsp]
753	adc	r11,QWORD[88+rsp]
754	adc	r12,QWORD[96+rsp]
755	adc	r13,QWORD[104+rsp]
756	adc	r14,QWORD[112+rsp]
757	adc	r15,QWORD[120+rsp]
758DB	102,72,15,126,214	; movq rsi,xmm2 -- leaves the carry flag intact for the sbb below
759	sbb	rcx,rcx	; rcx = all-ones if the addition carried, else 0
760
761	call	__rsaz_512_subtract
762
; r8..r15 still hold the result stored by __rsaz_512_subtract; scatter the
; low dwords first, then the high dwords 64 bytes further on.
763	mov	DWORD[rsi],r8d
764	shr	r8,32
765	mov	DWORD[128+rsi],r9d
766	shr	r9,32
767	mov	DWORD[256+rsi],r10d
768	shr	r10,32
769	mov	DWORD[384+rsi],r11d
770	shr	r11,32
771	mov	DWORD[512+rsi],r12d
772	shr	r12,32
773	mov	DWORD[640+rsi],r13d
774	shr	r13,32
775	mov	DWORD[768+rsi],r14d
776	shr	r14,32
777	mov	DWORD[896+rsi],r15d
778	shr	r15,32
779	mov	DWORD[64+rsi],r8d
780	mov	DWORD[192+rsi],r9d
781	mov	DWORD[320+rsi],r10d
782	mov	DWORD[448+rsi],r11d
783	mov	DWORD[576+rsi],r12d
784	mov	DWORD[704+rsi],r13d
785	mov	DWORD[832+rsi],r14d
786	mov	DWORD[960+rsi],r15d
787
788	lea	rax,[((128+24+48))+rsp]
789	mov	r15,QWORD[((-48))+rax]
790	mov	r14,QWORD[((-40))+rax]
791	mov	r13,QWORD[((-32))+rax]
792	mov	r12,QWORD[((-24))+rax]
793	mov	rbp,QWORD[((-16))+rax]
794	mov	rbx,QWORD[((-8))+rax]
795	lea	rsp,[rax]
796$L$mul_scatter4_epilogue:
797	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
798	mov	rsi,QWORD[16+rsp]
799	DB	0F3h,0C3h		;repret
800$L$SEH_end_rsaz_512_mul_scatter4:
801global	rsaz_512_mul_by_one
802
; rsaz_512_mul_by_one -- run __rsaz_512_reduce on an eight-limb input and
; store the eight-limb result (Win64 entry point).  No final subtraction is
; performed here.
;
; The Win64 arguments are moved into the registers the body uses:
;   rdi <- rcx   destination, eight qwords
;   rsi <- rdx   input, eight qwords
;   rdx <- r8    eight qwords read by __rsaz_512_reduce via rbp
;                (presumably the modulus)
;   rcx <- r9    word stored at [128+rsp] for __rsaz_512_reduce
803ALIGN	32
804rsaz_512_mul_by_one:
805	mov	QWORD[8+rsp],rdi	;WIN64 prologue
806	mov	QWORD[16+rsp],rsi
807	mov	rax,rsp
808$L$SEH_begin_rsaz_512_mul_by_one:
809	mov	rdi,rcx
810	mov	rsi,rdx
811	mov	rdx,r8
812	mov	rcx,r9
813
814
815	push	rbx
816	push	rbp
817	push	r12
818	push	r13
819	push	r14
820	push	r15
821
822	sub	rsp,128+24
823$L$mul_by_one_body:
824	mov	rbp,rdx
825	mov	QWORD[128+rsp],rcx
826
827	mov	r8,QWORD[rsi]
828	pxor	xmm0,xmm0
829	mov	r9,QWORD[8+rsi]
830	mov	r10,QWORD[16+rsi]
831	mov	r11,QWORD[24+rsi]
832	mov	r12,QWORD[32+rsi]
833	mov	r13,QWORD[40+rsi]
834	mov	r14,QWORD[48+rsi]
835	mov	r15,QWORD[56+rsi]
836
; Zero the first 112 bytes of the scratch area.  The aligned stores need
; rsp to be 16-byte aligned here.
837	movdqa	XMMWORD[rsp],xmm0
838	movdqa	XMMWORD[16+rsp],xmm0
839	movdqa	XMMWORD[32+rsp],xmm0
840	movdqa	XMMWORD[48+rsp],xmm0
841	movdqa	XMMWORD[64+rsp],xmm0
842	movdqa	XMMWORD[80+rsp],xmm0
843	movdqa	XMMWORD[96+rsp],xmm0
844	call	__rsaz_512_reduce
845	mov	QWORD[rdi],r8
846	mov	QWORD[8+rdi],r9
847	mov	QWORD[16+rdi],r10
848	mov	QWORD[24+rdi],r11
849	mov	QWORD[32+rdi],r12
850	mov	QWORD[40+rdi],r13
851	mov	QWORD[48+rdi],r14
852	mov	QWORD[56+rdi],r15
853
854	lea	rax,[((128+24+48))+rsp]
855	mov	r15,QWORD[((-48))+rax]
856	mov	r14,QWORD[((-40))+rax]
857	mov	r13,QWORD[((-32))+rax]
858	mov	r12,QWORD[((-24))+rax]
859	mov	rbp,QWORD[((-16))+rax]
860	mov	rbx,QWORD[((-8))+rax]
861	lea	rsp,[rax]
862$L$mul_by_one_epilogue:
863	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
864	mov	rsi,QWORD[16+rsp]
865	DB	0F3h,0C3h		;repret
866$L$SEH_end_rsaz_512_mul_by_one:
867
; __rsaz_512_reduce -- internal helper: eight word-by-word reduction steps.
;
; In:   r8..r15       eight limbs, least-significant in r8
;       rbp           -> eight qwords m[0..7] (presumably the modulus)
;       [128+8+rsp]   multiplier word k; the +8 skips this call's return
;                     address, so callers store it at their [128+rsp]
; Out:  r8..r15       reduced limbs
; Clobbers: rax, rbx, rcx, rdx, rsi, flags
;
; Each pass computes q = r8*k, adds q*m to the limbs and drops the lowest
; limb, shifting the rest down by one register.
868ALIGN	32
869__rsaz_512_reduce:
870	mov	rbx,r8
871	imul	rbx,QWORD[((128+8))+rsp]	; q for the first pass
872	mov	rax,QWORD[rbp]
873	mov	ecx,8
874	jmp	NEAR $L$reduction_loop
875
876ALIGN	32
877$L$reduction_loop:
878	mul	rbx
879	mov	rax,QWORD[8+rbp]
; The low product word is discarded.  neg sets CF when r8 is non-zero; that
; equals the carry of r8 + low(q*m[0]) provided the sum vanishes modulo 2^64
; (presumably k = -1/m[0] mod 2^64 -- confirm with callers).
880	neg	r8
881	mov	r8,rdx
882	adc	r8,0
883
884	mul	rbx
885	add	r9,rax
886	mov	rax,QWORD[16+rbp]
887	adc	rdx,0
888	add	r8,r9	; r8 is now the new lowest limb
889	mov	r9,rdx
890	adc	r9,0
891
892	mul	rbx
893	add	r10,rax
894	mov	rax,QWORD[24+rbp]
895	adc	rdx,0
896	add	r9,r10
897	mov	r10,rdx
898	adc	r10,0
899
900	mul	rbx
901	add	r11,rax
902	mov	rax,QWORD[32+rbp]
903	adc	rdx,0
904	add	r10,r11
905	mov	rsi,QWORD[((128+8))+rsp]	; reload k; mov leaves CF from the add intact
906
907
908	adc	rdx,0
909	mov	r11,rdx
910
911	mul	rbx
912	add	r12,rax
913	mov	rax,QWORD[40+rbp]
914	adc	rdx,0
915	imul	rsi,r8	; q for the next pass, from the new lowest limb
916	add	r11,r12
917	mov	r12,rdx
918	adc	r12,0
919
920	mul	rbx
921	add	r13,rax
922	mov	rax,QWORD[48+rbp]
923	adc	rdx,0
924	add	r12,r13
925	mov	r13,rdx
926	adc	r13,0
927
928	mul	rbx
929	add	r14,rax
930	mov	rax,QWORD[56+rbp]
931	adc	rdx,0
932	add	r13,r14
933	mov	r14,rdx
934	adc	r14,0
935
936	mul	rbx
937	mov	rbx,rsi	; switch to the next q; mul has already consumed the old one
938	add	r15,rax
939	mov	rax,QWORD[rbp]
940	adc	rdx,0
941	add	r14,r15
942	mov	r15,rdx
943	adc	r15,0
944
945	dec	ecx
946	jne	NEAR $L$reduction_loop
947
948	DB	0F3h,0C3h		;repret
949
950
; __rsaz_512_subtract -- internal helper: store r8..r15 to [rdi], then
; conditionally subtract the eight qwords at [rbp], without branching.
;
; In:   r8..r15  eight limbs, least-significant in r8
;       rdi      destination, eight qwords
;       rbp      -> eight qwords m[0..7]
;       rcx      mask: 0 leaves the value unchanged, all-ones subtracts m
; Out:  [rdi] and r8..r15 hold the final value
; Clobbers: flags
;
; The subtraction adds the masked negation of m: neg on the low limb and
; not on the others.  That is the two's complement of m only when m[0] is
; non-zero (presumably guaranteed because the modulus is odd -- confirm).
951ALIGN	32
952__rsaz_512_subtract:
953	mov	QWORD[rdi],r8
954	mov	QWORD[8+rdi],r9
955	mov	QWORD[16+rdi],r10
956	mov	QWORD[24+rdi],r11
957	mov	QWORD[32+rdi],r12
958	mov	QWORD[40+rdi],r13
959	mov	QWORD[48+rdi],r14
960	mov	QWORD[56+rdi],r15
961
962	mov	r8,QWORD[rbp]
963	mov	r9,QWORD[8+rbp]
964	neg	r8
965	not	r9
966	and	r8,rcx
967	mov	r10,QWORD[16+rbp]
968	and	r9,rcx
969	not	r10
970	mov	r11,QWORD[24+rbp]
971	and	r10,rcx
972	not	r11
973	mov	r12,QWORD[32+rbp]
974	and	r11,rcx
975	not	r12
976	mov	r13,QWORD[40+rbp]
977	and	r12,rcx
978	not	r13
979	mov	r14,QWORD[48+rbp]
980	and	r13,rcx
981	not	r14
982	mov	r15,QWORD[56+rbp]
983	and	r14,rcx
984	not	r15
985	and	r15,rcx
986
; Add the masked negation to the value stored above.
987	add	r8,QWORD[rdi]
988	adc	r9,QWORD[8+rdi]
989	adc	r10,QWORD[16+rdi]
990	adc	r11,QWORD[24+rdi]
991	adc	r12,QWORD[32+rdi]
992	adc	r13,QWORD[40+rdi]
993	adc	r14,QWORD[48+rdi]
994	adc	r15,QWORD[56+rdi]
995
996	mov	QWORD[rdi],r8
997	mov	QWORD[8+rdi],r9
998	mov	QWORD[16+rdi],r10
999	mov	QWORD[24+rdi],r11
1000	mov	QWORD[32+rdi],r12
1001	mov	QWORD[40+rdi],r13
1002	mov	QWORD[48+rdi],r14
1003	mov	QWORD[56+rdi],r15
1004
1005	DB	0F3h,0C3h		;repret
1006
1007
; __rsaz_512_mul -- internal helper: 8x8-limb schoolbook multiplication.
;
; In:   rsi  -> first operand a[0..7]
;       rbp  -> second operand b[0..7]
;       rbx  = b[0], preloaded by the caller
; Out:  sixteen product limbs at [8+rsp .. 8+rsp+127] as seen on entry,
;       which is the caller's [rsp .. rsp+127]
; Clobbers: rax, rbx, rcx, rdx, rdi, rbp, r8..r15, flags
1008ALIGN	32
1009__rsaz_512_mul:
1010	lea	rdi,[8+rsp]	; skip the return address: rdi -> caller's scratch area
1011
; First row: a * b[0].  Limb 0 is stored; limbs 1..8 stay in r8..r15.
1012	mov	rax,QWORD[rsi]
1013	mul	rbx
1014	mov	QWORD[rdi],rax
1015	mov	rax,QWORD[8+rsi]
1016	mov	r8,rdx
1017
1018	mul	rbx
1019	add	r8,rax
1020	mov	rax,QWORD[16+rsi]
1021	mov	r9,rdx
1022	adc	r9,0
1023
1024	mul	rbx
1025	add	r9,rax
1026	mov	rax,QWORD[24+rsi]
1027	mov	r10,rdx
1028	adc	r10,0
1029
1030	mul	rbx
1031	add	r10,rax
1032	mov	rax,QWORD[32+rsi]
1033	mov	r11,rdx
1034	adc	r11,0
1035
1036	mul	rbx
1037	add	r11,rax
1038	mov	rax,QWORD[40+rsi]
1039	mov	r12,rdx
1040	adc	r12,0
1041
1042	mul	rbx
1043	add	r12,rax
1044	mov	rax,QWORD[48+rsi]
1045	mov	r13,rdx
1046	adc	r13,0
1047
1048	mul	rbx
1049	add	r13,rax
1050	mov	rax,QWORD[56+rsi]
1051	mov	r14,rdx
1052	adc	r14,0
1053
1054	mul	rbx
1055	add	r14,rax
1056	mov	rax,QWORD[rsi]
1057	mov	r15,rdx
1058	adc	r15,0
1059
1060	lea	rbp,[8+rbp]
1061	lea	rdi,[8+rdi]
1062
1063	mov	ecx,7	; seven remaining rows
1064	jmp	NEAR $L$oop_mul
1065
1066ALIGN	32
; Each pass adds a * b[i] to the running limbs, stores the finished low limb
; at [rdi] and shifts the remaining limbs down by one register.
1067$L$oop_mul:
1068	mov	rbx,QWORD[rbp]
1069	mul	rbx
1070	add	r8,rax
1071	mov	rax,QWORD[8+rsi]
1072	mov	QWORD[rdi],r8
1073	mov	r8,rdx
1074	adc	r8,0
1075
1076	mul	rbx
1077	add	r9,rax
1078	mov	rax,QWORD[16+rsi]
1079	adc	rdx,0
1080	add	r8,r9
1081	mov	r9,rdx
1082	adc	r9,0
1083
1084	mul	rbx
1085	add	r10,rax
1086	mov	rax,QWORD[24+rsi]
1087	adc	rdx,0
1088	add	r9,r10
1089	mov	r10,rdx
1090	adc	r10,0
1091
1092	mul	rbx
1093	add	r11,rax
1094	mov	rax,QWORD[32+rsi]
1095	adc	rdx,0
1096	add	r10,r11
1097	mov	r11,rdx
1098	adc	r11,0
1099
1100	mul	rbx
1101	add	r12,rax
1102	mov	rax,QWORD[40+rsi]
1103	adc	rdx,0
1104	add	r11,r12
1105	mov	r12,rdx
1106	adc	r12,0
1107
1108	mul	rbx
1109	add	r13,rax
1110	mov	rax,QWORD[48+rsi]
1111	adc	rdx,0
1112	add	r12,r13
1113	mov	r13,rdx
1114	adc	r13,0
1115
1116	mul	rbx
1117	add	r14,rax
1118	mov	rax,QWORD[56+rsi]
1119	adc	rdx,0
1120	add	r13,r14
1121	mov	r14,rdx
1122	lea	rbp,[8+rbp]	; advance to the next b limb; lea leaves CF intact for the adc
1123	adc	r14,0
1124
1125	mul	rbx
1126	add	r15,rax
1127	mov	rax,QWORD[rsi]
1128	adc	rdx,0
1129	add	r14,r15
1130	mov	r15,rdx
1131	adc	r15,0
1132
1133	lea	rdi,[8+rdi]
1134
1135	dec	ecx
1136	jnz	NEAR $L$oop_mul
1137
; Store the high eight limbs of the product.
1138	mov	QWORD[rdi],r8
1139	mov	QWORD[8+rdi],r9
1140	mov	QWORD[16+rdi],r10
1141	mov	QWORD[24+rdi],r11
1142	mov	QWORD[32+rdi],r12
1143	mov	QWORD[40+rdi],r13
1144	mov	QWORD[48+rdi],r14
1145	mov	QWORD[56+rdi],r15
1146
1147	DB	0F3h,0C3h		;repret
1148
global	rsaz_512_scatter4

;-----------------------------------------------------------------------
; rsaz_512_scatter4 -- copy eight qwords into an interleaved table
;                      (Win64 leaf function, no stack frame).
;
; In:   rcx  table base
;       rdx  source, eight qwords
;       r8d  table index (32-bit)
;
; Limb i is written as two dwords: the low half at
; table + 4*index + 128*i and the high half 64 bytes after that.
;
; The index is zero-extended before it is used in address arithmetic.
; The Microsoft x64 convention leaves the upper half of the register
; undefined for a 32-bit argument, and the sibling entry points
; rsaz_512_mul_gather4 / rsaz_512_mul_scatter4 zero-extend theirs too.
;
; Clobbers: rax, rcx, rdx, r8, r9, flags
;-----------------------------------------------------------------------
ALIGN	16
rsaz_512_scatter4:
	mov	r8d,r8d			; zero-extend the 32-bit index
	lea	rcx,[r8*4+rcx]		; rcx -> low half of limb 0
	mov	r9d,8			; eight limbs
	jmp	NEAR $L$oop_scatter
ALIGN	16
$L$oop_scatter:
	mov	rax,QWORD[rdx]
	lea	rdx,[8+rdx]
	mov	DWORD[rcx],eax		; low half
	shr	rax,32
	mov	DWORD[64+rcx],eax	; high half
	lea	rcx,[128+rcx]		; next limb's slot
	dec	r9d
	jnz	NEAR $L$oop_scatter
	DB	0F3h,0C3h		;repret
1167
1168
global	rsaz_512_gather4

;-----------------------------------------------------------------------
; rsaz_512_gather4 -- read eight qwords back out of an interleaved table
;                     (Win64 leaf function, no stack frame).
;
; In:   rcx  destination, eight qwords
;       rdx  table base
;       r8d  table index (32-bit)
;
; Limb i is assembled from two dwords: the low half at
; table + 4*index + 128*i and the high half 64 bytes after that.
;
; The index is zero-extended before it is used in address arithmetic.
; The Microsoft x64 convention leaves the upper half of the register
; undefined for a 32-bit argument, and the sibling entry points
; rsaz_512_mul_gather4 / rsaz_512_mul_scatter4 zero-extend theirs too.
;
; NOTE(review): the load addresses depend on the index, so the cache lines
; touched vary with it.  If the index is secret this may be a timing
; side channel -- confirm against the threat model.
;
; Clobbers: rax, rcx, rdx, r8, r9, flags
;-----------------------------------------------------------------------
ALIGN	16
rsaz_512_gather4:
	mov	r8d,r8d			; zero-extend the 32-bit index
	lea	rdx,[r8*4+rdx]		; rdx -> low half of limb 0
	mov	r9d,8			; eight limbs
	jmp	NEAR $L$oop_gather
ALIGN	16
$L$oop_gather:
	mov	eax,DWORD[rdx]		; low half (upper 32 bits of rax cleared)
	mov	r8d,DWORD[64+rdx]	; high half; the index in r8 is no longer needed
	lea	rdx,[128+rdx]		; next limb's slot
	shl	r8,32
	or	rax,r8
	mov	QWORD[rcx],rax
	lea	rcx,[8+rcx]
	dec	r9d
	jnz	NEAR $L$oop_gather
	DB	0F3h,0C3h		;repret
1188
1189EXTERN	__imp_RtlVirtualUnwind
1190
; se_handler -- language-specific exception handler shared by the five entry
; points listed in the .xdata section of this file.
;
; In (as used below): r8 = context record, r9 = dispatcher context.
; The structure offsets follow the Windows x64 CONTEXT and
; DISPATCHER_CONTEXT layouts (see the Microsoft x64 exception-handling
; documentation); they are not defined anywhere in this file.
;
; The handler works out where the faulting rip lies relative to the
; function's body/epilogue labels, recovers the caller's rsp and the
; callee-saved registers accordingly, patches them into the context record
; and then calls RtlVirtualUnwind.  It returns 1 in eax.
1191ALIGN	16
1192se_handler:
1193	push	rsi
1194	push	rdi
1195	push	rbx
1196	push	rbp
1197	push	r12
1198	push	r13
1199	push	r14
1200	push	r15
1201	pushfq
1202	sub	rsp,64
1203
1204	mov	rax,QWORD[120+r8]	; context Rax (the prologues copy the entry rsp into rax)
1205	mov	rbx,QWORD[248+r8]	; context Rip
1206
1207	mov	rsi,QWORD[8+r9]	; dispatcher ImageBase
1208	mov	r11,QWORD[56+r9]	; dispatcher HandlerData -> body/epilogue label pair
1209
1210	mov	r10d,DWORD[r11]	; first handler-data dword: body label offset
1211	lea	r10,[r10*1+rsi]
1212	cmp	rbx,r10
1213	jb	NEAR $L$common_seh_tail	; rip before the body label: frame not built yet
1214
1215	mov	rax,QWORD[152+r8]	; context Rsp
1216
1217	mov	r10d,DWORD[4+r11]	; second handler-data dword: epilogue label offset
1218	lea	r10,[r10*1+rsi]
1219	cmp	rbx,r10
1220	jae	NEAR $L$common_seh_tail	; rip at or past the epilogue label: frame already torn down
1221
; Inside the body: step over the 128+24-byte local area and the six pushes,
; then reload the saved registers exactly as the epilogues do.
1222	lea	rax,[((128+24+48))+rax]
1223
1224	mov	rbx,QWORD[((-8))+rax]
1225	mov	rbp,QWORD[((-16))+rax]
1226	mov	r12,QWORD[((-24))+rax]
1227	mov	r13,QWORD[((-32))+rax]
1228	mov	r14,QWORD[((-40))+rax]
1229	mov	r15,QWORD[((-48))+rax]
1230	mov	QWORD[144+r8],rbx	; context Rbx
1231	mov	QWORD[160+r8],rbp	; context Rbp
1232	mov	QWORD[216+r8],r12	; context R12
1233	mov	QWORD[224+r8],r13	; context R13
1234	mov	QWORD[232+r8],r14	; context R14
1235	mov	QWORD[240+r8],r15	; context R15
1236
1237$L$common_seh_tail:
; rax is the rsp value at function entry; the prologues saved rdi/rsi at
; [8+rsp] and [16+rsp] relative to it.
1238	mov	rdi,QWORD[8+rax]
1239	mov	rsi,QWORD[16+rax]
1240	mov	QWORD[152+r8],rax	; context Rsp
1241	mov	QWORD[168+r8],rsi	; context Rsi
1242	mov	QWORD[176+r8],rdi	; context Rdi
1243
; Copy 154 qwords from the context record in r8 to the one at [40+r9]
; (dispatcher ContextRecord).
1244	mov	rdi,QWORD[40+r9]
1245	mov	rsi,r8
1246	mov	ecx,154
1247	DD	0xa548f3fc	; bytes FC F3 48 A5: cld / rep movsq
1248
; Build the RtlVirtualUnwind call: four register arguments plus four stack
; arguments above the 32-byte shadow space.
1249	mov	rsi,r9
1250	xor	rcx,rcx	; first argument: 0
1251	mov	rdx,QWORD[8+rsi]	; ImageBase
1252	mov	r8,QWORD[rsi]	; ControlPc
1253	mov	r9,QWORD[16+rsi]	; FunctionEntry
1254	mov	r10,QWORD[40+rsi]	; ContextRecord
1255	lea	r11,[56+rsi]	; address of the HandlerData field
1256	lea	r12,[24+rsi]	; address of the EstablisherFrame field
1257	mov	QWORD[32+rsp],r10
1258	mov	QWORD[40+rsp],r11
1259	mov	QWORD[48+rsp],r12
1260	mov	QWORD[56+rsp],rcx	; last argument: 0
1261	call	QWORD[__imp_RtlVirtualUnwind]
1262
1263	mov	eax,1	; return value 1 (presumably ExceptionContinueSearch -- confirm)
1264	add	rsp,64
1265	popfq
1266	pop	r15
1267	pop	r14
1268	pop	r13
1269	pop	r12
1270	pop	rbp
1271	pop	rbx
1272	pop	rdi
1273	pop	rsi
1274	DB	0F3h,0C3h		;repret
1275
1276
; .pdata: one three-dword entry per exported function that builds a frame --
; image-relative begin address, end address and the address of its unwind
; info record in .xdata (the Win64 RUNTIME_FUNCTION layout).
; rsaz_512_scatter4 and rsaz_512_gather4 have no entry here.
1277section	.pdata rdata align=4
1278ALIGN	4
1279	DD	$L$SEH_begin_rsaz_512_sqr wrt ..imagebase
1280	DD	$L$SEH_end_rsaz_512_sqr wrt ..imagebase
1281	DD	$L$SEH_info_rsaz_512_sqr wrt ..imagebase
1282
1283	DD	$L$SEH_begin_rsaz_512_mul wrt ..imagebase
1284	DD	$L$SEH_end_rsaz_512_mul wrt ..imagebase
1285	DD	$L$SEH_info_rsaz_512_mul wrt ..imagebase
1286
1287	DD	$L$SEH_begin_rsaz_512_mul_gather4 wrt ..imagebase
1288	DD	$L$SEH_end_rsaz_512_mul_gather4 wrt ..imagebase
1289	DD	$L$SEH_info_rsaz_512_mul_gather4 wrt ..imagebase
1290
1291	DD	$L$SEH_begin_rsaz_512_mul_scatter4 wrt ..imagebase
1292	DD	$L$SEH_end_rsaz_512_mul_scatter4 wrt ..imagebase
1293	DD	$L$SEH_info_rsaz_512_mul_scatter4 wrt ..imagebase
1294
1295	DD	$L$SEH_begin_rsaz_512_mul_by_one wrt ..imagebase
1296	DD	$L$SEH_end_rsaz_512_mul_by_one wrt ..imagebase
1297	DD	$L$SEH_info_rsaz_512_mul_by_one wrt ..imagebase
1298
; .xdata: unwind info records referenced from .pdata.  Each record is a
; four-byte header (9,0,0,0), the image-relative address of se_handler, and
; two handler-data dwords: the function's body label and epilogue label,
; which se_handler reads as [r11] and [4+r11].
; NOTE(review): header byte 9 presumably encodes UNWIND_INFO version 1 with
; the exception-handler flag and no unwind codes -- confirm against the
; Win64 UNWIND_INFO definition.
1299section	.xdata rdata align=8
1300ALIGN	8
1301$L$SEH_info_rsaz_512_sqr:
1302DB	9,0,0,0
1303	DD	se_handler wrt ..imagebase
1304	DD	$L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase
1305$L$SEH_info_rsaz_512_mul:
1306DB	9,0,0,0
1307	DD	se_handler wrt ..imagebase
1308	DD	$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
1309$L$SEH_info_rsaz_512_mul_gather4:
1310DB	9,0,0,0
1311	DD	se_handler wrt ..imagebase
1312	DD	$L$mul_gather4_body wrt ..imagebase,$L$mul_gather4_epilogue wrt ..imagebase
1313$L$SEH_info_rsaz_512_mul_scatter4:
1314DB	9,0,0,0
1315	DD	se_handler wrt ..imagebase
1316	DD	$L$mul_scatter4_body wrt ..imagebase,$L$mul_scatter4_epilogue wrt ..imagebase
1317$L$SEH_info_rsaz_512_mul_by_one:
1318DB	9,0,0,0
1319	DD	se_handler wrt ..imagebase
1320	DD	$L$mul_by_one_body wrt ..imagebase,$L$mul_by_one_epilogue wrt ..imagebase
1321