1default	rel
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section	.text code align=64
6
7
8EXTERN	OPENSSL_ia32cap_P
9
10global	bn_mul_mont_gather5
11
12ALIGN	64
; bn_mul_mont_gather5 -- Win64 entry point.
;
; Incoming Win64 arguments (rcx, rdx, r8, r9, [rsp+40], [rsp+48], [rsp+56])
; are moved into rdi, rsi, rdx, rcx, r8, r9 and r10d respectively.
; Roles as used by the code below:
;   rdi  destination vector (written in $L$sub / $L$copy)
;   rsi  first operand vector
;   rdx  table from which the multiplier words are gathered
;   rcx  vector subtracted in $L$sub (presumably the modulus -- confirm)
;   r8   pointer to a single qword that is dereferenced once
;        (presumably n0 -- confirm)
;   r9   word count
;   r10d table index (seventh argument)
; Returns rax = 1.  rdi/rsi are parked in the caller's home area on entry
; and reloaded at $L$mul_epilogue.
;
; Dispatch: if the low three bits of the word count are non-zero the
; word-at-a-time path at $L$mul_enter is used, otherwise control goes to
; $L$mul4x_enter.
13bn_mul_mont_gather5:
14	mov	QWORD[8+rsp],rdi	;WIN64 prologue
15	mov	QWORD[16+rsp],rsi
16	mov	rax,rsp
17$L$SEH_begin_bn_mul_mont_gather5:
18	mov	rdi,rcx
19	mov	rsi,rdx
20	mov	rdx,r8
21	mov	rcx,r9
22	mov	r8,QWORD[40+rsp]
23	mov	r9,QWORD[48+rsp]
24
25
26	test	r9d,7
27	jnz	NEAR $L$mul_enter
28	jmp	NEAR $L$mul4x_enter
29
30ALIGN	16
31$L$mul_enter:
; Zero-extend the 32-bit count, remember the entry rsp in rax and fetch the
; seventh argument before the pushes move rsp.
32	mov	r9d,r9d
33	mov	rax,rsp
34	mov	r10d,DWORD[56+rsp]
35	push	rbx
36	push	rbp
37	push	r12
38	push	r13
39	push	r14
40	push	r15
; 40-byte area below the pushes holds xmm6 and xmm7.
41	lea	rsp,[((-40))+rsp]
42	movaps	XMMWORD[rsp],xmm6
43	movaps	XMMWORD[16+rsp],xmm7
; Reserve (count+2) qwords of scratch and round rsp down to a multiple of 1024.
44	lea	r11,[2+r9]
45	neg	r11
46	lea	rsp,[r11*8+rsp]
47	and	rsp,-1024
48
; Entry rsp is kept just above the scratch vector, at [rsp + 8 + count*8].
49	mov	QWORD[8+r9*8+rsp],rax
50$L$mul_body:
; Gather setup:
;   r11 = index & 7            -> qword offset inside a 64-byte line
;   r10 = ~(index >> 3) & 3    -> starting qword in $L$magic_masks
; xmm4..xmm7 receive four consecutive mask qwords; with the table layout at
; $L$magic_masks exactly one of them is all-ones.
; r12 = table + 96 + (index & 7)*8; each gathered word reads the four
; candidates at -96, -32, +32, +96 from r12 and r12 advances by 256 per word.
51	mov	r12,rdx
52	mov	r11,r10
53	shr	r10,3
54	and	r11,7
55	not	r10
56	lea	rax,[$L$magic_masks]
57	and	r10,3
58	lea	r12,[96+r11*8+r12]
59	movq	xmm4,QWORD[r10*8+rax]
60	movq	xmm5,QWORD[8+r10*8+rax]
61	movq	xmm6,QWORD[16+r10*8+rax]
62	movq	xmm7,QWORD[24+r10*8+rax]
63
64	movq	xmm0,QWORD[(((-96)))+r12]
65	movq	xmm1,QWORD[((-32))+r12]
66	pand	xmm0,xmm4
67	movq	xmm2,QWORD[32+r12]
68	pand	xmm1,xmm5
69	movq	xmm3,QWORD[96+r12]
70	pand	xmm2,xmm6
71	por	xmm0,xmm1
72	pand	xmm3,xmm7
73	por	xmm0,xmm2
74	lea	r12,[256+r12]
75	por	xmm0,xmm3
76
; Bytes 66 48 0F 7E C3 = movq rbx,xmm0 (first gathered multiplier word).
77DB	102,72,15,126,195
78
; r8 = qword pointed to by the fifth argument; rax = first word at rsi.
79	mov	r8,QWORD[r8]
80	mov	rax,QWORD[rsi]
81
; r14 = outer index, r15 = inner index.
82	xor	r14,r14
83	xor	r15,r15
84
; The gather of the next multiplier word is interleaved with the arithmetic.
85	movq	xmm0,QWORD[(((-96)))+r12]
86	movq	xmm1,QWORD[((-32))+r12]
87	pand	xmm0,xmm4
88	movq	xmm2,QWORD[32+r12]
89	pand	xmm1,xmm5
90
91	mov	rbp,r8
92	mul	rbx
93	mov	r10,rax
94	mov	rax,QWORD[rcx]
95
96	movq	xmm3,QWORD[96+r12]
97	pand	xmm2,xmm6
98	por	xmm0,xmm1
99	pand	xmm3,xmm7
100
; rbp = r8 * (low product word): the per-row multiplier applied to the
; vector at rcx.
101	imul	rbp,r10
102	mov	r11,rdx
103
104	por	xmm0,xmm2
105	lea	r12,[256+r12]
106	por	xmm0,xmm3
107
108	mul	rbp
109	add	r10,rax
110	mov	rax,QWORD[8+rsi]
111	adc	rdx,0
112	mov	r13,rdx
113
114	lea	r15,[1+r15]
115	jmp	NEAR $L$1st_enter
116
117ALIGN	16
; First pass: no previous scratch contents are added; results are stored
; at [rsp + (r15-2)*8].
118$L$1st:
119	add	r13,rax
120	mov	rax,QWORD[r15*8+rsi]
121	adc	rdx,0
122	add	r13,r11
123	mov	r11,r10
124	adc	rdx,0
125	mov	QWORD[((-16))+r15*8+rsp],r13
126	mov	r13,rdx
127
128$L$1st_enter:
129	mul	rbx
130	add	r11,rax
131	mov	rax,QWORD[r15*8+rcx]
132	adc	rdx,0
133	lea	r15,[1+r15]
134	mov	r10,rdx
135
136	mul	rbp
137	cmp	r15,r9
138	jne	NEAR $L$1st
139
; movq rbx,xmm0 -- multiplier word for the next outer iteration.
140DB	102,72,15,126,195
141
142	add	r13,rax
143	mov	rax,QWORD[rsi]
144	adc	rdx,0
145	add	r13,r11
146	adc	rdx,0
147	mov	QWORD[((-16))+r15*8+rsp],r13
148	mov	r13,rdx
149	mov	r11,r10
150
; Fold the two carries into the top scratch words [count-1] and [count].
151	xor	rdx,rdx
152	add	r13,r11
153	adc	rdx,0
154	mov	QWORD[((-8))+r9*8+rsp],r13
155	mov	QWORD[r9*8+rsp],rdx
156
157	lea	r14,[1+r14]
158	jmp	NEAR $L$outer
159ALIGN	16
; Remaining passes: same shape as the first pass, but each step also adds
; the scratch word produced by the previous pass.
160$L$outer:
161	xor	r15,r15
162	mov	rbp,r8
163	mov	r10,QWORD[rsp]
164
165	movq	xmm0,QWORD[(((-96)))+r12]
166	movq	xmm1,QWORD[((-32))+r12]
167	pand	xmm0,xmm4
168	movq	xmm2,QWORD[32+r12]
169	pand	xmm1,xmm5
170
171	mul	rbx
172	add	r10,rax
173	mov	rax,QWORD[rcx]
174	adc	rdx,0
175
176	movq	xmm3,QWORD[96+r12]
177	pand	xmm2,xmm6
178	por	xmm0,xmm1
179	pand	xmm3,xmm7
180
181	imul	rbp,r10
182	mov	r11,rdx
183
184	por	xmm0,xmm2
185	lea	r12,[256+r12]
186	por	xmm0,xmm3
187
188	mul	rbp
189	add	r10,rax
190	mov	rax,QWORD[8+rsi]
191	adc	rdx,0
192	mov	r10,QWORD[8+rsp]
193	mov	r13,rdx
194
195	lea	r15,[1+r15]
196	jmp	NEAR $L$inner_enter
197
198ALIGN	16
199$L$inner:
200	add	r13,rax
201	mov	rax,QWORD[r15*8+rsi]
202	adc	rdx,0
203	add	r13,r10
204	mov	r10,QWORD[r15*8+rsp]
205	adc	rdx,0
206	mov	QWORD[((-16))+r15*8+rsp],r13
207	mov	r13,rdx
208
209$L$inner_enter:
210	mul	rbx
211	add	r11,rax
212	mov	rax,QWORD[r15*8+rcx]
213	adc	rdx,0
214	add	r10,r11
215	mov	r11,rdx
216	adc	r11,0
217	lea	r15,[1+r15]
218
219	mul	rbp
220	cmp	r15,r9
221	jne	NEAR $L$inner
222
; movq rbx,xmm0 -- multiplier word for the next outer iteration.
223DB	102,72,15,126,195
224
225	add	r13,rax
226	mov	rax,QWORD[rsi]
227	adc	rdx,0
228	add	r13,r10
229	mov	r10,QWORD[r15*8+rsp]
230	adc	rdx,0
231	mov	QWORD[((-16))+r15*8+rsp],r13
232	mov	r13,rdx
233
234	xor	rdx,rdx
235	add	r13,r11
236	adc	rdx,0
237	add	r13,r10
238	adc	rdx,0
239	mov	QWORD[((-8))+r9*8+rsp],r13
240	mov	QWORD[r9*8+rsp],rdx
241
242	lea	r14,[1+r14]
243	cmp	r14,r9
244	jb	NEAR $L$outer
245
; Final subtraction: dest[i] = scratch[i] - rcx[i] with borrow chained by
; sbb.  xor clears CF before the loop; lea and dec leave CF untouched.
246	xor	r14,r14
247	mov	rax,QWORD[rsp]
248	lea	rsi,[rsp]
249	mov	r15,r9
250	jmp	NEAR $L$sub
251ALIGN	16
252$L$sub:	sbb	rax,QWORD[r14*8+rcx]
253	mov	QWORD[r14*8+rdi],rax
254	mov	rax,QWORD[8+r14*8+rsi]
255	lea	r14,[1+r14]
256	dec	r15
257	jnz	NEAR $L$sub
258
; rax holds scratch[count] here; subtracting the final borrow turns it
; into the select mask used by $L$copy.
259	sbb	rax,0
260	xor	r14,r14
261	mov	r15,r9
262ALIGN	16
; Branch-free select per word: dest[i] = ((scratch[i] ^ dest[i]) & rax) ^ dest[i].
; Each scratch word is overwritten with its own index on the way.
263$L$copy:
264	mov	rsi,QWORD[r14*8+rsp]
265	mov	rcx,QWORD[r14*8+rdi]
266	xor	rsi,rcx
267	and	rsi,rax
268	xor	rsi,rcx
269	mov	QWORD[r14*8+rsp],r14
270	mov	QWORD[r14*8+rdi],rsi
271	lea	r14,[1+r14]
272	sub	r15,1
273	jnz	NEAR $L$copy
274
; rsi = entry rsp saved at $L$mul_enter; restore xmm6/xmm7 and the six
; pushed registers relative to it, then return 1.
275	mov	rsi,QWORD[8+r9*8+rsp]
276	mov	rax,1
277	movaps	xmm6,XMMWORD[((-88))+rsi]
278	movaps	xmm7,XMMWORD[((-72))+rsi]
279	mov	r15,QWORD[((-48))+rsi]
280	mov	r14,QWORD[((-40))+rsi]
281	mov	r13,QWORD[((-32))+rsi]
282	mov	r12,QWORD[((-24))+rsi]
283	mov	rbp,QWORD[((-16))+rsi]
284	mov	rbx,QWORD[((-8))+rsi]
285	lea	rsp,[rsi]
286$L$mul_epilogue:
287	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
288	mov	rsi,QWORD[16+rsp]
289	DB	0F3h,0C3h		;repret
290$L$SEH_end_bn_mul_mont_gather5:
291
292ALIGN	32
; bn_mul4x_mont_gather5 -- 4-way unrolled variant.
;
; Same Win64 argument shuffle as bn_mul_mont_gather5; that routine also
; jumps straight to $L$mul4x_enter with the registers already shuffled.
; Builds a 64-byte-aligned frame, stores the entry rsp at [rsp+40], calls
; mul4x_internal for the arithmetic, restores xmm6/xmm7 and the pushed
; registers, and returns rax = 1.
293bn_mul4x_mont_gather5:
294	mov	QWORD[8+rsp],rdi	;WIN64 prologue
295	mov	QWORD[16+rsp],rsi
296	mov	rax,rsp
297$L$SEH_begin_bn_mul4x_mont_gather5:
298	mov	rdi,rcx
299	mov	rsi,rdx
300	mov	rdx,r8
301	mov	rcx,r9
302	mov	r8,QWORD[40+rsp]
303	mov	r9,QWORD[48+rsp]
304
305
306$L$mul4x_enter:
; The DB 0x67 bytes are address-size prefixes placed in front of the
; following instruction (presumably code-alignment padding -- confirm).
307DB	0x67
308	mov	rax,rsp
309	push	rbx
310	push	rbp
311	push	r12
312	push	r13
313	push	r14
314	push	r15
315	lea	rsp,[((-40))+rsp]
316	movaps	XMMWORD[rsp],xmm6
317	movaps	XMMWORD[16+rsp],xmm7
318DB	0x67
; r9 = -(count*8), r10 = count*32.
319	mov	r10d,r9d
320	shl	r9d,3
321	shl	r10d,3+2
322	neg	r9
323
324
325
326
327
328
329
330
; r11 = (prospective frame base - rsi) mod 4096.  The two branches below
; pick the frame position from that distance (presumably so that the frame
; and the input at rsi do not alias modulo 4096 -- confirm).
331	lea	r11,[((-64))+r9*2+rsp]
332	sub	r11,rsi
333	and	r11,4095
334	cmp	r10,r11
335	jb	NEAR $L$mul4xsp_alt
336	sub	rsp,r11
337	lea	rsp,[((-64))+r9*2+rsp]
338	jmp	NEAR $L$mul4xsp_done
339
340ALIGN	32
341$L$mul4xsp_alt:
; Alternative placement: move down by max(r11 - (4096-64-2*bytes), 0)
; in addition to the basic frame size.
342	lea	r10,[((4096-64))+r9*2]
343	lea	rsp,[((-64))+r9*2+rsp]
344	sub	r11,r10
345	mov	r10,0
346	cmovc	r11,r10
347	sub	rsp,r11
348$L$mul4xsp_done:
; Align the frame to 64 bytes and make r9 positive again (count*8).
349	and	rsp,-64
350	neg	r9
351
; Entry rsp is kept at [rsp+40]; rax still holds it for mul4x_internal.
352	mov	QWORD[40+rsp],rax
353$L$mul4x_body:
354
355	call	mul4x_internal
356
357	mov	rsi,QWORD[40+rsp]
358	mov	rax,1
359	movaps	xmm6,XMMWORD[((-88))+rsi]
360	movaps	xmm7,XMMWORD[((-72))+rsi]
361	mov	r15,QWORD[((-48))+rsi]
362	mov	r14,QWORD[((-40))+rsi]
363	mov	r13,QWORD[((-32))+rsi]
364	mov	r12,QWORD[((-24))+rsi]
365	mov	rbp,QWORD[((-16))+rsi]
366	mov	rbx,QWORD[((-8))+rsi]
367	lea	rsp,[rsi]
368$L$mul4x_epilogue:
369	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
370	mov	rsi,QWORD[16+rsp]
371	DB	0F3h,0C3h		;repret
372$L$SEH_end_bn_mul4x_mont_gather5:
373
374
375ALIGN	32
; mul4x_internal -- shared body of the 4-way multiplication.
;
; Called from bn_mul4x_mont_gather5 and bn_power5.  Expected on entry:
;   rax  caller's entry rsp (the table index is read from [rax+56])
;   rdi  destination, rsi first operand, rdx gather table
;   rcx  second vector, read at a 16-byte stride ([rcx], [16+rcx], ...)
;   r8   pointer to a qword that is dereferenced once
;   r9   length in bytes
; rsp offsets below are written as (N+8), presumably to account for the
; return address pushed by the call.
; Does not return by itself: it ends with a jump to $L$sqr4x_sub, which
; executes the final subtraction and the ret.
; Clobbers xmm0-xmm7 and every general-purpose register it names.
376mul4x_internal:
; r13 = rdx + 256 + bytes*32: table position at which the outer loop stops.
377	shl	r9,5
378	mov	r10d,DWORD[56+rax]
379	lea	r13,[256+r9*1+rdx]
380	shr	r9,5
; r11 = index & 7, r10 = ~(index >> 3) & 3; xmm4..xmm7 = four consecutive
; mask qwords from $L$magic_masks, r12 = table + 96 + (index & 7)*8.
381	mov	r11,r10
382	shr	r10,3
383	and	r11,7
384	not	r10
385	lea	rax,[$L$magic_masks]
386	and	r10,3
387	lea	r12,[96+r11*8+rdx]
388	movq	xmm4,QWORD[r10*8+rax]
389	movq	xmm5,QWORD[8+r10*8+rax]
; r11 = ((index & 7) + 7) & 7, used below to position r14 inside the frame.
390	add	r11,7
391	movq	xmm6,QWORD[16+r10*8+rax]
392	movq	xmm7,QWORD[24+r10*8+rax]
393	and	r11,7
394
; Two multiplier words are gathered up front: the first via r12, the second
; via r14 = r12 + 256.
395	movq	xmm0,QWORD[(((-96)))+r12]
396	lea	r14,[256+r12]
397	movq	xmm1,QWORD[((-32))+r12]
398	pand	xmm0,xmm4
399	movq	xmm2,QWORD[32+r12]
400	pand	xmm1,xmm5
401	movq	xmm3,QWORD[96+r12]
402	pand	xmm2,xmm6
403DB	0x67
404	por	xmm0,xmm1
405	movq	xmm1,QWORD[((-96))+r14]
406DB	0x67
407	pand	xmm3,xmm7
408DB	0x67
409	por	xmm0,xmm2
410	movq	xmm2,QWORD[((-32))+r14]
411DB	0x67
412	pand	xmm1,xmm4
413DB	0x67
414	por	xmm0,xmm3
415	movq	xmm3,QWORD[32+r14]
416
; Bytes 66 48 0F 7E C3 = movq rbx,xmm0 (first multiplier word).
417DB	102,72,15,126,195
418	movq	xmm0,QWORD[96+r14]
; Frame slots: [rsp+24] = end-of-table sentinel, [rsp+64] = destination.
419	mov	QWORD[((16+8))+rsp],r13
420	mov	QWORD[((56+8))+rsp],rdi
421
; rsi is advanced to the end of its vector and r9 negated, so the operand
; is addressed as [rsi + negative offset].
422	mov	r8,QWORD[r8]
423	mov	rax,QWORD[rsi]
424	lea	rsi,[r9*1+rsi]
425	neg	r9
426
427	mov	rbp,r8
428	mul	rbx
429	mov	r10,rax
430	mov	rax,QWORD[rcx]
431
432	pand	xmm2,xmm5
433	pand	xmm3,xmm6
434	por	xmm1,xmm2
435
436	imul	rbp,r10
437
438
439
440
441
442
443
; r14 = scratch write pointer inside the frame, offset by r11*8.
444	lea	r14,[((64+8))+r11*8+rsp]
445	mov	r11,rdx
446
447	pand	xmm0,xmm7
448	por	xmm1,xmm3
449	lea	r12,[512+r12]
450	por	xmm0,xmm1
451
452	mul	rbp
453	add	r10,rax
454	mov	rax,QWORD[8+r9*1+rsi]
455	adc	rdx,0
456	mov	rdi,rdx
457
458	mul	rbx
459	add	r11,rax
460	mov	rax,QWORD[16+rcx]
461	adc	rdx,0
462	mov	r10,rdx
463
464	mul	rbp
465	add	rdi,rax
466	mov	rax,QWORD[16+r9*1+rsi]
467	adc	rdx,0
468	add	rdi,r11
; r15 = inner byte counter, starts at 32 - bytes and counts up to zero.
469	lea	r15,[32+r9]
470	lea	rcx,[64+rcx]
471	adc	rdx,0
472	mov	QWORD[r14],rdi
473	mov	r13,rdx
474	jmp	NEAR $L$1st4x
475
476ALIGN	32
; First pass, four words per iteration; nothing is added from the scratch
; vector yet.
477$L$1st4x:
478	mul	rbx
479	add	r10,rax
480	mov	rax,QWORD[((-32))+rcx]
481	lea	r14,[32+r14]
482	adc	rdx,0
483	mov	r11,rdx
484
485	mul	rbp
486	add	r13,rax
487	mov	rax,QWORD[((-8))+r15*1+rsi]
488	adc	rdx,0
489	add	r13,r10
490	adc	rdx,0
491	mov	QWORD[((-24))+r14],r13
492	mov	rdi,rdx
493
494	mul	rbx
495	add	r11,rax
496	mov	rax,QWORD[((-16))+rcx]
497	adc	rdx,0
498	mov	r10,rdx
499
500	mul	rbp
501	add	rdi,rax
502	mov	rax,QWORD[r15*1+rsi]
503	adc	rdx,0
504	add	rdi,r11
505	adc	rdx,0
506	mov	QWORD[((-16))+r14],rdi
507	mov	r13,rdx
508
509	mul	rbx
510	add	r10,rax
511	mov	rax,QWORD[rcx]
512	adc	rdx,0
513	mov	r11,rdx
514
515	mul	rbp
516	add	r13,rax
517	mov	rax,QWORD[8+r15*1+rsi]
518	adc	rdx,0
519	add	r13,r10
520	adc	rdx,0
521	mov	QWORD[((-8))+r14],r13
522	mov	rdi,rdx
523
524	mul	rbx
525	add	r11,rax
526	mov	rax,QWORD[16+rcx]
527	adc	rdx,0
528	mov	r10,rdx
529
530	mul	rbp
531	add	rdi,rax
532	mov	rax,QWORD[16+r15*1+rsi]
533	adc	rdx,0
534	add	rdi,r11
535	lea	rcx,[64+rcx]
536	adc	rdx,0
537	mov	QWORD[r14],rdi
538	mov	r13,rdx
539
540	add	r15,32
541	jnz	NEAR $L$1st4x
542
; Tail of the first pass: last two words, then reload rax with the first
; operand word ([rsi + r9]) for the next pass.
543	mul	rbx
544	add	r10,rax
545	mov	rax,QWORD[((-32))+rcx]
546	lea	r14,[32+r14]
547	adc	rdx,0
548	mov	r11,rdx
549
550	mul	rbp
551	add	r13,rax
552	mov	rax,QWORD[((-8))+rsi]
553	adc	rdx,0
554	add	r13,r10
555	adc	rdx,0
556	mov	QWORD[((-24))+r14],r13
557	mov	rdi,rdx
558
559	mul	rbx
560	add	r11,rax
561	mov	rax,QWORD[((-16))+rcx]
562	adc	rdx,0
563	mov	r10,rdx
564
565	mul	rbp
566	add	rdi,rax
567	mov	rax,QWORD[r9*1+rsi]
568	adc	rdx,0
569	add	rdi,r11
570	adc	rdx,0
571	mov	QWORD[((-16))+r14],rdi
572	mov	r13,rdx
573
; movq rbx,xmm0 -- next multiplier word; rcx is rewound by 2*bytes.
574DB	102,72,15,126,195
575	lea	rcx,[r9*2+rcx]
576
; rdi collects the top carry of this pass.
577	xor	rdi,rdi
578	add	r13,r10
579	adc	rdi,0
580	mov	QWORD[((-8))+r14],r13
581
582	jmp	NEAR $L$outer4x
583
584ALIGN	32
; Remaining passes: identical structure, but every step also adds the
; scratch word left by the previous pass.
585$L$outer4x:
586	mov	r10,QWORD[r9*1+r14]
587	mov	rbp,r8
588	mul	rbx
589	add	r10,rax
590	mov	rax,QWORD[rcx]
591	adc	rdx,0
592
593	movq	xmm0,QWORD[(((-96)))+r12]
594	movq	xmm1,QWORD[((-32))+r12]
595	pand	xmm0,xmm4
596	movq	xmm2,QWORD[32+r12]
597	pand	xmm1,xmm5
598	movq	xmm3,QWORD[96+r12]
599
600	imul	rbp,r10
601DB	0x67
602	mov	r11,rdx
; Park the previous pass's top carry at the top of the scratch vector.
603	mov	QWORD[r14],rdi
604
605	pand	xmm2,xmm6
606	por	xmm0,xmm1
607	pand	xmm3,xmm7
608	por	xmm0,xmm2
; Rewind the scratch pointer to the start of the vector.
609	lea	r14,[r9*1+r14]
610	lea	r12,[256+r12]
611	por	xmm0,xmm3
612
613	mul	rbp
614	add	r10,rax
615	mov	rax,QWORD[8+r9*1+rsi]
616	adc	rdx,0
617	mov	rdi,rdx
618
619	mul	rbx
620	add	r11,rax
621	mov	rax,QWORD[16+rcx]
622	adc	rdx,0
623	add	r11,QWORD[8+r14]
624	adc	rdx,0
625	mov	r10,rdx
626
627	mul	rbp
628	add	rdi,rax
629	mov	rax,QWORD[16+r9*1+rsi]
630	adc	rdx,0
631	add	rdi,r11
632	lea	r15,[32+r9]
633	lea	rcx,[64+rcx]
634	adc	rdx,0
635	mov	r13,rdx
636	jmp	NEAR $L$inner4x
637
638ALIGN	32
639$L$inner4x:
640	mul	rbx
641	add	r10,rax
642	mov	rax,QWORD[((-32))+rcx]
643	adc	rdx,0
644	add	r10,QWORD[16+r14]
645	lea	r14,[32+r14]
646	adc	rdx,0
647	mov	r11,rdx
648
649	mul	rbp
650	add	r13,rax
651	mov	rax,QWORD[((-8))+r15*1+rsi]
652	adc	rdx,0
653	add	r13,r10
654	adc	rdx,0
655	mov	QWORD[((-32))+r14],rdi
656	mov	rdi,rdx
657
658	mul	rbx
659	add	r11,rax
660	mov	rax,QWORD[((-16))+rcx]
661	adc	rdx,0
662	add	r11,QWORD[((-8))+r14]
663	adc	rdx,0
664	mov	r10,rdx
665
666	mul	rbp
667	add	rdi,rax
668	mov	rax,QWORD[r15*1+rsi]
669	adc	rdx,0
670	add	rdi,r11
671	adc	rdx,0
672	mov	QWORD[((-24))+r14],r13
673	mov	r13,rdx
674
675	mul	rbx
676	add	r10,rax
677	mov	rax,QWORD[rcx]
678	adc	rdx,0
679	add	r10,QWORD[r14]
680	adc	rdx,0
681	mov	r11,rdx
682
683	mul	rbp
684	add	r13,rax
685	mov	rax,QWORD[8+r15*1+rsi]
686	adc	rdx,0
687	add	r13,r10
688	adc	rdx,0
689	mov	QWORD[((-16))+r14],rdi
690	mov	rdi,rdx
691
692	mul	rbx
693	add	r11,rax
694	mov	rax,QWORD[16+rcx]
695	adc	rdx,0
696	add	r11,QWORD[8+r14]
697	adc	rdx,0
698	mov	r10,rdx
699
700	mul	rbp
701	add	rdi,rax
702	mov	rax,QWORD[16+r15*1+rsi]
703	adc	rdx,0
704	add	rdi,r11
705	lea	rcx,[64+rcx]
706	adc	rdx,0
707	mov	QWORD[((-8))+r14],r13
708	mov	r13,rdx
709
710	add	r15,32
711	jnz	NEAR $L$inner4x
712
713	mul	rbx
714	add	r10,rax
715	mov	rax,QWORD[((-32))+rcx]
716	adc	rdx,0
717	add	r10,QWORD[16+r14]
718	lea	r14,[32+r14]
719	adc	rdx,0
720	mov	r11,rdx
721
722	mul	rbp
723	add	r13,rax
724	mov	rax,QWORD[((-8))+rsi]
725	adc	rdx,0
726	add	r13,r10
727	adc	rdx,0
728	mov	QWORD[((-32))+r14],rdi
729	mov	rdi,rdx
730
; Operands are swapped for the last product: rax takes the old rbp and rbp
; is loaded with the top word of the rcx vector, so rbp still holds that
; word when the loop exits (it is compared against r13 after the loop).
731	mul	rbx
732	add	r11,rax
733	mov	rax,rbp
734	mov	rbp,QWORD[((-16))+rcx]
735	adc	rdx,0
736	add	r11,QWORD[((-8))+r14]
737	adc	rdx,0
738	mov	r10,rdx
739
740	mul	rbp
741	add	rdi,rax
742	mov	rax,QWORD[r9*1+rsi]
743	adc	rdx,0
744	add	rdi,r11
745	adc	rdx,0
746	mov	QWORD[((-24))+r14],r13
747	mov	r13,rdx
748
; movq rbx,xmm0 -- next multiplier word.
749DB	102,72,15,126,195
750	mov	QWORD[((-16))+r14],rdi
751	lea	rcx,[r9*2+rcx]
752
; New top word = carry chain + previous top carry stored at [r14].
753	xor	rdi,rdi
754	add	r13,r10
755	adc	rdi,0
756	add	r13,QWORD[r14]
757	adc	rdi,0
758	mov	QWORD[((-8))+r14],r13
759
; Loop until the gather pointer reaches the sentinel saved at [rsp+24].
760	cmp	r12,QWORD[((16+8))+rsp]
761	jb	NEAR $L$outer4x
; Set up $L$sqr4x_sub.  r15 is zero after the inner loop, so adc turns it
; into the borrow of (top rcx-vector word - top result word); rdi becomes
; (carry | borrow) ^ 1 and selects rcx + 0 or rcx + 8 as the subtrahend
; base.  rbx = start of the scratch vector, rcx = -(bytes/32) loop counter,
; rdi = destination reloaded from [rsp+64].
762	sub	rbp,r13
763	adc	r15,r15
764	or	rdi,r15
765	xor	rdi,1
766	lea	rbx,[r9*1+r14]
767	lea	rbp,[rdi*8+rcx]
768	mov	rcx,r9
769	sar	rcx,3+2
770	mov	rdi,QWORD[((56+8))+rsp]
771	jmp	NEAR $L$sqr4x_sub
772
773global	bn_power5
774
775ALIGN	32
; bn_power5 -- five back-to-back squarings followed by one gathered
; multiplication.
;
; Incoming Win64 arguments (rcx, rdx, r8, r9, [rsp+40], [rsp+48]) are moved
; into rdi, rsi, rdx, rcx, r8, r9; the seventh argument stays at [rsp+56]
; of the entry frame and is read by mul4x_internal through rax.
;   rdi  destination, rsi operand, rdx gather table, rcx second vector,
;   r8   pointer to a qword that is dereferenced once, r9 word count.
; Returns rax = 1.
;
; Frame: [rsp+32] = value loaded through r8, [rsp+40] = entry rsp.
; xmm6/xmm7 are callee-saved under the Microsoft x64 convention and are
; overwritten by mul4x_internal (gather masks), so they are saved in the
; prologue and reloaded in the epilogue.
776bn_power5:
777	mov	QWORD[8+rsp],rdi	;WIN64 prologue
778	mov	QWORD[16+rsp],rsi
779	mov	rax,rsp
780$L$SEH_begin_bn_power5:
781	mov	rdi,rcx
782	mov	rsi,rdx
783	mov	rdx,r8
784	mov	rcx,r9
785	mov	r8,QWORD[40+rsp]
786	mov	r9,QWORD[48+rsp]
787
788
789	mov	rax,rsp
790	push	rbx
791	push	rbp
792	push	r12
793	push	r13
794	push	r14
795	push	r15
; xmm6 lands at (entry rsp - 88), xmm7 at (entry rsp - 72).
796	lea	rsp,[((-40))+rsp]
797	movaps	XMMWORD[rsp],xmm6
798	movaps	XMMWORD[16+rsp],xmm7
; r9 = -(count*8), r10 = count*32, r8 = dereferenced fifth argument.
799	mov	r10d,r9d
800	shl	r9d,3
801	shl	r10d,3+2
802	neg	r9
803	mov	r8,QWORD[r8]
804
805
806
807
808
809
810
; r11 = (prospective frame base - rsi) mod 4096; the two branches choose the
; frame position from that distance.
811	lea	r11,[((-64))+r9*2+rsp]
812	sub	r11,rsi
813	and	r11,4095
814	cmp	r10,r11
815	jb	NEAR $L$pwr_sp_alt
816	sub	rsp,r11
817	lea	rsp,[((-64))+r9*2+rsp]
818	jmp	NEAR $L$pwr_sp_done
819
820ALIGN	32
821$L$pwr_sp_alt:
822	lea	r10,[((4096-64))+r9*2]
823	lea	rsp,[((-64))+r9*2+rsp]
824	sub	r11,r10
825	mov	r10,0
826	cmovc	r11,r10
827	sub	rsp,r11
828$L$pwr_sp_done:
; 64-byte aligned frame; r10 = -bytes, r9 = +bytes for the squaring code.
829	and	rsp,-64
830	mov	r10,r9
831	neg	r9
832
833
834
835
836
837
838
839
840
841
842	mov	QWORD[32+rsp],r8
843	mov	QWORD[40+rsp],rax
844$L$power5_body:
; Hand-encoded moves (66 REX.W 0F 6E /r) that park values in xmm registers:
;   movq xmm1,rdi ; movq xmm2,rcx ; movq xmm3,r10 ; movq xmm4,rdx
845DB	102,72,15,110,207
846DB	102,72,15,110,209
847DB	102,73,15,110,218
848DB	102,72,15,110,226
849
850	call	__bn_sqr8x_internal
851	call	__bn_sqr8x_internal
852	call	__bn_sqr8x_internal
853	call	__bn_sqr8x_internal
854	call	__bn_sqr8x_internal
855
; movq rcx,xmm2 ; movq rdx,xmm4 -- recover the vector and table pointers.
856DB	102,72,15,126,209
857DB	102,72,15,126,226
; mul4x_internal expects rax = entry rsp and r8 = pointer to the saved qword.
858	mov	rdi,rsi
859	mov	rax,QWORD[40+rsp]
860	lea	r8,[32+rsp]
861
862	call	mul4x_internal
863
864	mov	rsi,QWORD[40+rsp]
865	mov	rax,1
; Reload the callee-saved xmm6/xmm7 from the slots written in the prologue;
; mul4x_internal has overwritten both registers.
	movaps	xmm6,XMMWORD[((-88))+rsi]
	movaps	xmm7,XMMWORD[((-72))+rsi]
866	mov	r15,QWORD[((-48))+rsi]
867	mov	r14,QWORD[((-40))+rsi]
868	mov	r13,QWORD[((-32))+rsi]
869	mov	r12,QWORD[((-24))+rsi]
870	mov	rbp,QWORD[((-16))+rsi]
871	mov	rbx,QWORD[((-8))+rsi]
872	lea	rsp,[rsi]
873$L$power5_epilogue:
874	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
875	mov	rsi,QWORD[16+rsp]
876	DB	0F3h,0C3h		;repret
877$L$SEH_end_bn_power5:
878
879global	bn_sqr8x_internal
880
881
882ALIGN	32
; bn_sqr8x_internal / __bn_sqr8x_internal -- squaring body.
;
; Expected on entry (as set up by bn_power5):
;   rsi  operand, r9 = length in bytes, r10 = -r9
;   xmm2 holds a pointer that is moved into rbp at the very end
; The double-width product is built in the frame starting at [rsp+48+8]
; (the +8 presumably accounts for the return address -- confirm).
; Phases:
;   1. $L$sqr4x_1st / $L$sqr4x_outer / $L$sqr4x_inner: cross products
;      a[i]*a[j] accumulated into the frame;
;   2. $L$sqr4x_shift_n_add: every frame word is doubled (shift left by one
;      across words) and the squares a[i]*a[i] are added.
; Execution then falls through into sqr8x_reduction.
883bn_sqr8x_internal:
884__bn_sqr8x_internal:
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
; rbp = 32 - bytes (outer offset, negative), rsi -> end of operand.
958	lea	rbp,[32+r10]
959	lea	rsi,[r9*1+rsi]
960
961	mov	rcx,r9
962
963
; r14 and r15 hold the two operand words multiplied against the rest of
; the row; rbx holds the running word; rdi points into the product frame.
964	mov	r14,QWORD[((-32))+rbp*1+rsi]
965	lea	rdi,[((48+8))+r9*2+rsp]
966	mov	rax,QWORD[((-24))+rbp*1+rsi]
967	lea	rdi,[((-32))+rbp*1+rdi]
968	mov	rbx,QWORD[((-16))+rbp*1+rsi]
969	mov	r15,rax
970
971	mul	r14
972	mov	r10,rax
973	mov	rax,rbx
974	mov	r11,rdx
975	mov	QWORD[((-24))+rbp*1+rdi],r10
976
977	mul	r14
978	add	r11,rax
979	mov	rax,rbx
980	adc	rdx,0
981	mov	QWORD[((-16))+rbp*1+rdi],r11
982	mov	r10,rdx
983
984
985	mov	rbx,QWORD[((-8))+rbp*1+rsi]
986	mul	r15
987	mov	r12,rax
988	mov	rax,rbx
989	mov	r13,rdx
990
; rcx = inner offset, starts equal to rbp and counts up to zero.
991	lea	rcx,[rbp]
992	mul	r14
993	add	r10,rax
994	mov	rax,rbx
995	mov	r11,rdx
996	adc	r11,0
997	add	r10,r12
998	adc	r11,0
999	mov	QWORD[((-8))+rcx*1+rdi],r10
1000	jmp	NEAR $L$sqr4x_1st
1001
1002ALIGN	32
; First row: products are stored, not accumulated (frame not yet written).
1003$L$sqr4x_1st:
1004	mov	rbx,QWORD[rcx*1+rsi]
1005	mul	r15
1006	add	r13,rax
1007	mov	rax,rbx
1008	mov	r12,rdx
1009	adc	r12,0
1010
1011	mul	r14
1012	add	r11,rax
1013	mov	rax,rbx
1014	mov	rbx,QWORD[8+rcx*1+rsi]
1015	mov	r10,rdx
1016	adc	r10,0
1017	add	r11,r13
1018	adc	r10,0
1019
1020
1021	mul	r15
1022	add	r12,rax
1023	mov	rax,rbx
1024	mov	QWORD[rcx*1+rdi],r11
1025	mov	r13,rdx
1026	adc	r13,0
1027
1028	mul	r14
1029	add	r10,rax
1030	mov	rax,rbx
1031	mov	rbx,QWORD[16+rcx*1+rsi]
1032	mov	r11,rdx
1033	adc	r11,0
1034	add	r10,r12
1035	adc	r11,0
1036
1037	mul	r15
1038	add	r13,rax
1039	mov	rax,rbx
1040	mov	QWORD[8+rcx*1+rdi],r10
1041	mov	r12,rdx
1042	adc	r12,0
1043
1044	mul	r14
1045	add	r11,rax
1046	mov	rax,rbx
1047	mov	rbx,QWORD[24+rcx*1+rsi]
1048	mov	r10,rdx
1049	adc	r10,0
1050	add	r11,r13
1051	adc	r10,0
1052
1053
1054	mul	r15
1055	add	r12,rax
1056	mov	rax,rbx
1057	mov	QWORD[16+rcx*1+rdi],r11
1058	mov	r13,rdx
1059	adc	r13,0
1060	lea	rcx,[32+rcx]
1061
1062	mul	r14
1063	add	r10,rax
1064	mov	rax,rbx
1065	mov	r11,rdx
1066	adc	r11,0
1067	add	r10,r12
1068	adc	r11,0
1069	mov	QWORD[((-8))+rcx*1+rdi],r10
1070
1071	cmp	rcx,0
1072	jne	NEAR $L$sqr4x_1st
1073
1074	mul	r15
1075	add	r13,rax
1076	lea	rbp,[16+rbp]
1077	adc	rdx,0
1078	add	r13,r11
1079	adc	rdx,0
1080
1081	mov	QWORD[rdi],r13
1082	mov	r12,rdx
1083	mov	QWORD[8+rdi],rdx
1084	jmp	NEAR $L$sqr4x_outer
1085
1086ALIGN	32
; Subsequent rows: rbp advances by 16 bytes (two words) per row and each
; product is added to the word already in the frame.
1087$L$sqr4x_outer:
1088	mov	r14,QWORD[((-32))+rbp*1+rsi]
1089	lea	rdi,[((48+8))+r9*2+rsp]
1090	mov	rax,QWORD[((-24))+rbp*1+rsi]
1091	lea	rdi,[((-32))+rbp*1+rdi]
1092	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1093	mov	r15,rax
1094
1095	mul	r14
1096	mov	r10,QWORD[((-24))+rbp*1+rdi]
1097	add	r10,rax
1098	mov	rax,rbx
1099	adc	rdx,0
1100	mov	QWORD[((-24))+rbp*1+rdi],r10
1101	mov	r11,rdx
1102
1103	mul	r14
1104	add	r11,rax
1105	mov	rax,rbx
1106	adc	rdx,0
1107	add	r11,QWORD[((-16))+rbp*1+rdi]
1108	mov	r10,rdx
1109	adc	r10,0
1110	mov	QWORD[((-16))+rbp*1+rdi],r11
1111
1112	xor	r12,r12
1113
1114	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1115	mul	r15
1116	add	r12,rax
1117	mov	rax,rbx
1118	adc	rdx,0
1119	add	r12,QWORD[((-8))+rbp*1+rdi]
1120	mov	r13,rdx
1121	adc	r13,0
1122
1123	mul	r14
1124	add	r10,rax
1125	mov	rax,rbx
1126	adc	rdx,0
1127	add	r10,r12
1128	mov	r11,rdx
1129	adc	r11,0
1130	mov	QWORD[((-8))+rbp*1+rdi],r10
1131
1132	lea	rcx,[rbp]
1133	jmp	NEAR $L$sqr4x_inner
1134
1135ALIGN	32
1136$L$sqr4x_inner:
1137	mov	rbx,QWORD[rcx*1+rsi]
1138	mul	r15
1139	add	r13,rax
1140	mov	rax,rbx
1141	mov	r12,rdx
1142	adc	r12,0
1143	add	r13,QWORD[rcx*1+rdi]
1144	adc	r12,0
1145
1146DB	0x67
1147	mul	r14
1148	add	r11,rax
1149	mov	rax,rbx
1150	mov	rbx,QWORD[8+rcx*1+rsi]
1151	mov	r10,rdx
1152	adc	r10,0
1153	add	r11,r13
1154	adc	r10,0
1155
1156	mul	r15
1157	add	r12,rax
1158	mov	QWORD[rcx*1+rdi],r11
1159	mov	rax,rbx
1160	mov	r13,rdx
1161	adc	r13,0
1162	add	r12,QWORD[8+rcx*1+rdi]
1163	lea	rcx,[16+rcx]
1164	adc	r13,0
1165
1166	mul	r14
1167	add	r10,rax
1168	mov	rax,rbx
1169	adc	rdx,0
1170	add	r10,r12
1171	mov	r11,rdx
1172	adc	r11,0
1173	mov	QWORD[((-8))+rcx*1+rdi],r10
1174
1175	cmp	rcx,0
1176	jne	NEAR $L$sqr4x_inner
1177
1178DB	0x67
1179	mul	r15
1180	add	r13,rax
1181	adc	rdx,0
1182	add	r13,r11
1183	adc	rdx,0
1184
1185	mov	QWORD[rdi],r13
1186	mov	r12,rdx
1187	mov	QWORD[8+rdi],rdx
1188
1189	add	rbp,16
1190	jnz	NEAR $L$sqr4x_outer
1191
1192
; Last four operand words (rbp is zero here).  r10, r13 and r12 still carry
; the last three values written by the preceding loop, so they are added
; from registers instead of being reloaded from the frame.
1193	mov	r14,QWORD[((-32))+rsi]
1194	lea	rdi,[((48+8))+r9*2+rsp]
1195	mov	rax,QWORD[((-24))+rsi]
1196	lea	rdi,[((-32))+rbp*1+rdi]
1197	mov	rbx,QWORD[((-16))+rsi]
1198	mov	r15,rax
1199
1200	mul	r14
1201	add	r10,rax
1202	mov	rax,rbx
1203	mov	r11,rdx
1204	adc	r11,0
1205
1206	mul	r14
1207	add	r11,rax
1208	mov	rax,rbx
1209	mov	QWORD[((-24))+rdi],r10
1210	mov	r10,rdx
1211	adc	r10,0
1212	add	r11,r13
1213	mov	rbx,QWORD[((-8))+rsi]
1214	adc	r10,0
1215
1216	mul	r15
1217	add	r12,rax
1218	mov	rax,rbx
1219	mov	QWORD[((-16))+rdi],r11
1220	mov	r13,rdx
1221	adc	r13,0
1222
1223	mul	r14
1224	add	r10,rax
1225	mov	rax,rbx
1226	mov	r11,rdx
1227	adc	r11,0
1228	add	r10,r12
1229	adc	r11,0
1230	mov	QWORD[((-8))+rdi],r10
1231
1232	mul	r15
1233	add	r13,rax
1234	mov	rax,QWORD[((-16))+rsi]
1235	adc	rdx,0
1236	add	r13,r11
1237	adc	rdx,0
1238
1239	mov	QWORD[rdi],r13
1240	mov	r12,rdx
1241	mov	QWORD[8+rdi],rdx
1242
; Final cross product, then set up the doubling pass:
;   rbp = 16 - bytes (operand offset), r14 = bit shifted in from the
;   previous word, r15 = saved carry (0 or all-ones), rcx = 0 here.
1243	mul	rbx
1244	add	rbp,16
1245	xor	r14,r14
1246	sub	rbp,r9
1247	xor	r15,r15
1248
1249	add	rax,r12
1250	adc	rdx,0
1251	mov	QWORD[8+rdi],rax
1252	mov	QWORD[16+rdi],rdx
1253	mov	QWORD[24+rdi],r15
1254
1255	mov	rax,QWORD[((-16))+rbp*1+rsi]
1256	lea	rdi,[((48+8))+rsp]
1257	xor	r10,r10
1258	mov	r11,QWORD[8+rdi]
1259
; Pattern repeated for each operand word from here on:
;   lea  x,[t*2 + shift_in]   doubled frame word plus carried-in top bit
;   shr  t,63                 top bit carried to the next word
;   mul  rax                  square of the current operand word
;   neg  r15                  restore the saved carry into CF
;   adc  ...                  add the square into the doubled pair
;   sbb  r15,r15              save CF back into r15
1260	lea	r12,[r10*2+r14]
1261	shr	r10,63
1262	lea	r13,[r11*2+rcx]
1263	shr	r11,63
1264	or	r13,r10
1265	mov	r10,QWORD[16+rdi]
1266	mov	r14,r11
1267	mul	rax
1268	neg	r15
1269	mov	r11,QWORD[24+rdi]
1270	adc	r12,rax
1271	mov	rax,QWORD[((-8))+rbp*1+rsi]
1272	mov	QWORD[rdi],r12
1273	adc	r13,rdx
1274
1275	lea	rbx,[r10*2+r14]
1276	mov	QWORD[8+rdi],r13
1277	sbb	r15,r15
1278	shr	r10,63
1279	lea	r8,[r11*2+rcx]
1280	shr	r11,63
1281	or	r8,r10
1282	mov	r10,QWORD[32+rdi]
1283	mov	r14,r11
1284	mul	rax
1285	neg	r15
1286	mov	r11,QWORD[40+rdi]
1287	adc	rbx,rax
1288	mov	rax,QWORD[rbp*1+rsi]
1289	mov	QWORD[16+rdi],rbx
1290	adc	r8,rdx
1291	lea	rbp,[16+rbp]
1292	mov	QWORD[24+rdi],r8
1293	sbb	r15,r15
1294	lea	rdi,[64+rdi]
1295	jmp	NEAR $L$sqr4x_shift_n_add
1296
1297ALIGN	32
; Four operand words (eight frame words) per iteration.
1298$L$sqr4x_shift_n_add:
1299	lea	r12,[r10*2+r14]
1300	shr	r10,63
1301	lea	r13,[r11*2+rcx]
1302	shr	r11,63
1303	or	r13,r10
1304	mov	r10,QWORD[((-16))+rdi]
1305	mov	r14,r11
1306	mul	rax
1307	neg	r15
1308	mov	r11,QWORD[((-8))+rdi]
1309	adc	r12,rax
1310	mov	rax,QWORD[((-8))+rbp*1+rsi]
1311	mov	QWORD[((-32))+rdi],r12
1312	adc	r13,rdx
1313
1314	lea	rbx,[r10*2+r14]
1315	mov	QWORD[((-24))+rdi],r13
1316	sbb	r15,r15
1317	shr	r10,63
1318	lea	r8,[r11*2+rcx]
1319	shr	r11,63
1320	or	r8,r10
1321	mov	r10,QWORD[rdi]
1322	mov	r14,r11
1323	mul	rax
1324	neg	r15
1325	mov	r11,QWORD[8+rdi]
1326	adc	rbx,rax
1327	mov	rax,QWORD[rbp*1+rsi]
1328	mov	QWORD[((-16))+rdi],rbx
1329	adc	r8,rdx
1330
1331	lea	r12,[r10*2+r14]
1332	mov	QWORD[((-8))+rdi],r8
1333	sbb	r15,r15
1334	shr	r10,63
1335	lea	r13,[r11*2+rcx]
1336	shr	r11,63
1337	or	r13,r10
1338	mov	r10,QWORD[16+rdi]
1339	mov	r14,r11
1340	mul	rax
1341	neg	r15
1342	mov	r11,QWORD[24+rdi]
1343	adc	r12,rax
1344	mov	rax,QWORD[8+rbp*1+rsi]
1345	mov	QWORD[rdi],r12
1346	adc	r13,rdx
1347
1348	lea	rbx,[r10*2+r14]
1349	mov	QWORD[8+rdi],r13
1350	sbb	r15,r15
1351	shr	r10,63
1352	lea	r8,[r11*2+rcx]
1353	shr	r11,63
1354	or	r8,r10
1355	mov	r10,QWORD[32+rdi]
1356	mov	r14,r11
1357	mul	rax
1358	neg	r15
1359	mov	r11,QWORD[40+rdi]
1360	adc	rbx,rax
1361	mov	rax,QWORD[16+rbp*1+rsi]
1362	mov	QWORD[16+rdi],rbx
1363	adc	r8,rdx
1364	mov	QWORD[24+rdi],r8
1365	sbb	r15,r15
1366	lea	rdi,[64+rdi]
1367	add	rbp,32
1368	jnz	NEAR $L$sqr4x_shift_n_add
1369
; Last two operand words.
1370	lea	r12,[r10*2+r14]
1371DB	0x67
1372	shr	r10,63
1373	lea	r13,[r11*2+rcx]
1374	shr	r11,63
1375	or	r13,r10
1376	mov	r10,QWORD[((-16))+rdi]
1377	mov	r14,r11
1378	mul	rax
1379	neg	r15
1380	mov	r11,QWORD[((-8))+rdi]
1381	adc	r12,rax
1382	mov	rax,QWORD[((-8))+rsi]
1383	mov	QWORD[((-32))+rdi],r12
1384	adc	r13,rdx
1385
1386	lea	rbx,[r10*2+r14]
1387	mov	QWORD[((-24))+rdi],r13
1388	sbb	r15,r15
1389	shr	r10,63
1390	lea	r8,[r11*2+rcx]
1391	shr	r11,63
1392	or	r8,r10
1393	mul	rax
1394	neg	r15
1395	adc	rbx,rax
1396	adc	r8,rdx
1397	mov	QWORD[((-16))+rdi],rbx
1398	mov	QWORD[((-8))+rdi],r8
; Bytes 66 48 0F 7E D5 = movq rbp,xmm2; execution continues into the
; code that follows this block.
1399DB	102,72,15,126,213
; sqr8x_reduction -- reduction of the double-width value in the frame,
; eight words at a time.
;
; Reached by fall-through from bn_sqr8x_internal and by a direct call from
; bn_from_mont8x.  Expected on entry:
;   rbp  vector read at a 16-byte stride ([rbp], [16+rbp], ... [112+rbp])
;   r9   length in bytes
;   [rsp+32+8]  multiplier used to derive the per-word factor (imul)
;   xmm1 destination pointer, xmm2 copy of rbp, xmm3 = -(length in bytes)
; Frame slots used: [rsp+0+8] = end of the rbp vector, [rsp+8+8] = end of
; the double-width value, [rsp+48+8 ...] = eight saved per-word factors.
; Leaves by jumping to $L$sqr4x_sub, which performs the ret.
1400sqr8x_reduction:
1401	xor	rax,rax
1402	lea	rcx,[r9*2+rbp]
1403	lea	rdx,[((48+8))+r9*2+rsp]
1404	mov	QWORD[((0+8))+rsp],rcx
1405	lea	rdi,[((48+8))+r9*1+rsp]
1406	mov	QWORD[((8+8))+rsp],rdx
1407	neg	r9
1408	jmp	NEAR $L$8x_reduction_loop
1409
1410ALIGN	32
; One iteration per 8-word window: load the window into rbx, r9..r15, store
; the carry from the previous window at [rdx], derive the first factor.
1411$L$8x_reduction_loop:
1412	lea	rdi,[r9*1+rdi]
1413DB	0x66
1414	mov	rbx,QWORD[rdi]
1415	mov	r9,QWORD[8+rdi]
1416	mov	r10,QWORD[16+rdi]
1417	mov	r11,QWORD[24+rdi]
1418	mov	r12,QWORD[32+rdi]
1419	mov	r13,QWORD[40+rdi]
1420	mov	r14,QWORD[48+rdi]
1421	mov	r15,QWORD[56+rdi]
1422	mov	QWORD[rdx],rax
1423	lea	rdi,[64+rdi]
1424
1425DB	0x67
1426	mov	r8,rbx
1427	imul	rbx,QWORD[((32+8))+rsp]
1428	mov	rax,QWORD[rbp]
1429	mov	ecx,8
1430	jmp	NEAR $L$8x_reduce
1431
1432ALIGN	32
; Eight rounds; each multiplies the first eight words of the rbp vector by
; the current factor rbx and shifts the window down by one word.
1433$L$8x_reduce:
; The low product word is not added: neg r8 only sets CF = (r8 != 0),
; which adc then folds into the high word.
1434	mul	rbx
1435	mov	rax,QWORD[16+rbp]
1436	neg	r8
1437	mov	r8,rdx
1438	adc	r8,0
1439
1440	mul	rbx
1441	add	r9,rax
1442	mov	rax,QWORD[32+rbp]
1443	adc	rdx,0
1444	add	r8,r9
; Save this round's factor; $L$8x_tail reads the saved factors back.
1445	mov	QWORD[((48-8+8))+rcx*8+rsp],rbx
1446	mov	r9,rdx
1447	adc	r9,0
1448
1449	mul	rbx
1450	add	r10,rax
1451	mov	rax,QWORD[48+rbp]
1452	adc	rdx,0
1453	add	r9,r10
1454	mov	rsi,QWORD[((32+8))+rsp]
1455	mov	r10,rdx
1456	adc	r10,0
1457
1458	mul	rbx
1459	add	r11,rax
1460	mov	rax,QWORD[64+rbp]
1461	adc	rdx,0
; Next round's factor is computed early from the new low word r8.
1462	imul	rsi,r8
1463	add	r10,r11
1464	mov	r11,rdx
1465	adc	r11,0
1466
1467	mul	rbx
1468	add	r12,rax
1469	mov	rax,QWORD[80+rbp]
1470	adc	rdx,0
1471	add	r11,r12
1472	mov	r12,rdx
1473	adc	r12,0
1474
1475	mul	rbx
1476	add	r13,rax
1477	mov	rax,QWORD[96+rbp]
1478	adc	rdx,0
1479	add	r12,r13
1480	mov	r13,rdx
1481	adc	r13,0
1482
1483	mul	rbx
1484	add	r14,rax
1485	mov	rax,QWORD[112+rbp]
1486	adc	rdx,0
1487	add	r13,r14
1488	mov	r14,rdx
1489	adc	r14,0
1490
1491	mul	rbx
1492	mov	rbx,rsi
1493	add	r15,rax
1494	mov	rax,QWORD[rbp]
1495	adc	rdx,0
1496	add	r14,r15
1497	mov	r15,rdx
1498	adc	r15,0
1499
1500	dec	ecx
1501	jnz	NEAR $L$8x_reduce
1502
; Advance to the next eight words of the rbp vector; if that is already
; its end there is no tail to process.
1503	lea	rbp,[128+rbp]
1504	xor	rax,rax
1505	mov	rdx,QWORD[((8+8))+rsp]
1506	cmp	rbp,QWORD[((0+8))+rsp]
1507	jae	NEAR $L$8x_no_tail
1508
1509DB	0x66
1510	add	r8,QWORD[rdi]
1511	adc	r9,QWORD[8+rdi]
1512	adc	r10,QWORD[16+rdi]
1513	adc	r11,QWORD[24+rdi]
1514	adc	r12,QWORD[32+rdi]
1515	adc	r13,QWORD[40+rdi]
1516	adc	r14,QWORD[48+rdi]
1517	adc	r15,QWORD[56+rdi]
; rsi = 0 or all-ones: carry out of the eight additions above.
1518	sbb	rsi,rsi
1519
1520	mov	rbx,QWORD[((48+56+8))+rsp]
1521	mov	ecx,8
1522	mov	rax,QWORD[rbp]
1523	jmp	NEAR $L$8x_tail
1524
1525ALIGN	32
; Tail: apply the eight saved factors to the following words of the rbp
; vector, storing one finished word per round.
1526$L$8x_tail:
1527	mul	rbx
1528	add	r8,rax
1529	mov	rax,QWORD[16+rbp]
1530	mov	QWORD[rdi],r8
1531	mov	r8,rdx
1532	adc	r8,0
1533
1534	mul	rbx
1535	add	r9,rax
1536	mov	rax,QWORD[32+rbp]
1537	adc	rdx,0
1538	add	r8,r9
1539	lea	rdi,[8+rdi]
1540	mov	r9,rdx
1541	adc	r9,0
1542
1543	mul	rbx
1544	add	r10,rax
1545	mov	rax,QWORD[48+rbp]
1546	adc	rdx,0
1547	add	r9,r10
1548	mov	r10,rdx
1549	adc	r10,0
1550
1551	mul	rbx
1552	add	r11,rax
1553	mov	rax,QWORD[64+rbp]
1554	adc	rdx,0
1555	add	r10,r11
1556	mov	r11,rdx
1557	adc	r11,0
1558
1559	mul	rbx
1560	add	r12,rax
1561	mov	rax,QWORD[80+rbp]
1562	adc	rdx,0
1563	add	r11,r12
1564	mov	r12,rdx
1565	adc	r12,0
1566
1567	mul	rbx
1568	add	r13,rax
1569	mov	rax,QWORD[96+rbp]
1570	adc	rdx,0
1571	add	r12,r13
1572	mov	r13,rdx
1573	adc	r13,0
1574
1575	mul	rbx
1576	add	r14,rax
1577	mov	rax,QWORD[112+rbp]
1578	adc	rdx,0
1579	add	r13,r14
1580	mov	r14,rdx
1581	adc	r14,0
1582
1583	mul	rbx
; Fetch the next saved factor.
1584	mov	rbx,QWORD[((48-16+8))+rcx*8+rsp]
1585	add	r15,rax
1586	adc	rdx,0
1587	add	r14,r15
1588	mov	rax,QWORD[rbp]
1589	mov	r15,rdx
1590	adc	r15,0
1591
1592	dec	ecx
1593	jnz	NEAR $L$8x_tail
1594
1595	lea	rbp,[128+rbp]
1596	mov	rdx,QWORD[((8+8))+rsp]
1597	cmp	rbp,QWORD[((0+8))+rsp]
1598	jae	NEAR $L$8x_tail_done
1599
; More tail blocks remain: neg rsi puts the saved carry back into CF for
; the adc chain, and sbb saves the new carry.
1600	mov	rbx,QWORD[((48+56+8))+rsp]
1601	neg	rsi
1602	mov	rax,QWORD[rbp]
1603	adc	r8,QWORD[rdi]
1604	adc	r9,QWORD[8+rdi]
1605	adc	r10,QWORD[16+rdi]
1606	adc	r11,QWORD[24+rdi]
1607	adc	r12,QWORD[32+rdi]
1608	adc	r13,QWORD[40+rdi]
1609	adc	r14,QWORD[48+rdi]
1610	adc	r15,QWORD[56+rdi]
1611	sbb	rsi,rsi
1612
1613	mov	ecx,8
1614	jmp	NEAR $L$8x_tail
1615
1616ALIGN	32
; Add the carry word stored at [rdx] by $L$8x_reduction_loop and ripple it.
1617$L$8x_tail_done:
1618	add	r8,QWORD[rdx]
1619	adc	r9,0
1620	adc	r10,0
1621	adc	r11,0
1622	adc	r12,0
1623	adc	r13,0
1624	adc	r14,0
1625	adc	r15,0
1626
1627
1628	xor	rax,rax
1629
1630	neg	rsi
; Add the top eight frame words; rax collects the outgoing carry and rcx
; receives the last word of the rbp vector ([rbp-16]).
1631$L$8x_no_tail:
1632	adc	r8,QWORD[rdi]
1633	adc	r9,QWORD[8+rdi]
1634	adc	r10,QWORD[16+rdi]
1635	adc	r11,QWORD[24+rdi]
1636	adc	r12,QWORD[32+rdi]
1637	adc	r13,QWORD[40+rdi]
1638	adc	r14,QWORD[48+rdi]
1639	adc	r15,QWORD[56+rdi]
1640	adc	rax,0
1641	mov	rcx,QWORD[((-16))+rbp]
1642	xor	rsi,rsi
1643
; movq rbp,xmm2 -- rewind rbp to the start of its vector.
1644DB	102,72,15,126,213
1645
1646	mov	QWORD[rdi],r8
1647	mov	QWORD[8+rdi],r9
; Bytes 66 49 0F 7E D9 = movq r9,xmm3 (r9 was used as a data register above).
1648DB	102,73,15,126,217
1649	mov	QWORD[16+rdi],r10
1650	mov	QWORD[24+rdi],r11
1651	mov	QWORD[32+rdi],r12
1652	mov	QWORD[40+rdi],r13
1653	mov	QWORD[48+rdi],r14
1654	mov	QWORD[56+rdi],r15
1655	lea	rdi,[64+rdi]
1656
1657	cmp	rdi,rdx
1658	jb	NEAR $L$8x_reduction_loop
1659
; Set up $L$sqr4x_sub: rsi = borrow of (last rbp-vector word - top result
; word); rax = (carry | borrow) ^ 1 selects rbp + 0 or rbp + 8 as the
; subtrahend base; rbx = rdi + r9 = start of the upper half of the frame
; value; rcx = r9 >> 5 = negative loop counter.
1660	sub	rcx,r15
1661	lea	rbx,[r9*1+rdi]
1662	adc	rsi,rsi
1663	mov	rcx,r9
1664	or	rax,rsi
; movq rdi,xmm1 -- destination pointer.
1665DB	102,72,15,126,207
1666	xor	rax,1
; movq rsi,xmm1 -- rsi also receives the destination pointer.
1667DB	102,72,15,126,206
1668	lea	rbp,[rax*8+rbp]
1669	sar	rcx,3+2
1670	jmp	NEAR $L$sqr4x_sub
1671
1672ALIGN	32
; $L$sqr4x_sub -- shared final subtraction, entered by jmp from
; mul4x_internal and sqr8x_reduction; its ret returns to their caller.
;
; Expected on entry:
;   rbx  source words (8-byte stride)
;   rbp  subtrahend words (16-byte stride)
;   rdi  destination
;   rcx  negative count of 4-word groups, incremented up to zero
;   CF   initial borrow (both callers execute sar rcx,5 immediately before
;        jumping here; presumably that leaves CF clear -- confirm)
; The borrow is chained through all iterations: mov, lea and inc do not
; modify CF.
1673$L$sqr4x_sub:
1674DB	0x66
1675	mov	r12,QWORD[rbx]
1676	mov	r13,QWORD[8+rbx]
1677	sbb	r12,QWORD[rbp]
1678	mov	r14,QWORD[16+rbx]
1679	sbb	r13,QWORD[16+rbp]
1680	mov	r15,QWORD[24+rbx]
1681	lea	rbx,[32+rbx]
1682	sbb	r14,QWORD[32+rbp]
1683	mov	QWORD[rdi],r12
1684	sbb	r15,QWORD[48+rbp]
1685	lea	rbp,[64+rbp]
1686	mov	QWORD[8+rdi],r13
1687	mov	QWORD[16+rdi],r14
1688	mov	QWORD[24+rdi],r15
1689	lea	rdi,[32+rdi]
1690
1691	inc	rcx
1692	jnz	NEAR $L$sqr4x_sub
; Leave r10 = r9 and r9 = -r9 for the caller (bn_power5 calls the squaring
; routine several times in a row with r9 / r10 in this form).
1693	mov	r10,r9
1694	neg	r9
1695	DB	0F3h,0C3h		;repret
1696
1697global	bn_from_montgomery
1698
1699ALIGN	32
; bn_from_montgomery -- dispatcher.
; Tests the dword at [rsp+48] (the sixth Win64 argument slot): if its low
; three bits are all zero, tail-jumps to bn_from_mont8x with the arguments
; untouched; otherwise returns eax = 0 without doing any work.
1700bn_from_montgomery:
1701	test	DWORD[48+rsp],7
1702	jz	NEAR bn_from_mont8x
1703	xor	eax,eax
1704	DB	0F3h,0C3h		;repret
1705
1706
1707
1708ALIGN	32
; bn_from_mont8x -- reached from bn_from_montgomery.
;
; Win64 arguments are shuffled into rdi, rsi, rdx, rcx, r8, r9 as in the
; other entry points.  As used below: rdi = destination, rsi = source,
; rcx = vector handed to sqr8x_reduction in rbp, r8 = pointer to a qword
; that is dereferenced once, r9 = word count.
; Steps: build an aligned frame, copy the source into the lower half of the
; frame while zeroing the upper half ($L$mul_by_1), call sqr8x_reduction,
; then overwrite the frame with zeros ($L$from_mont_zero).  Returns rax = 1.
;
; NOTE(review): xmm6/xmm7 are stored in the prologue but not reloaded in the
; epilogue.  The code visible on this path ($L$mul_by_1, sqr8x_reduction,
; $L$sqr4x_sub) only touches xmm0-xmm4, so the registers appear to be left
; unmodified -- confirm before relying on it.
1709bn_from_mont8x:
1710	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1711	mov	QWORD[16+rsp],rsi
1712	mov	rax,rsp
1713$L$SEH_begin_bn_from_mont8x:
1714	mov	rdi,rcx
1715	mov	rsi,rdx
1716	mov	rdx,r8
1717	mov	rcx,r9
1718	mov	r8,QWORD[40+rsp]
1719	mov	r9,QWORD[48+rsp]
1720
1721
1722DB	0x67
1723	mov	rax,rsp
1724	push	rbx
1725	push	rbp
1726	push	r12
1727	push	r13
1728	push	r14
1729	push	r15
1730	lea	rsp,[((-40))+rsp]
1731	movaps	XMMWORD[rsp],xmm6
1732	movaps	XMMWORD[16+rsp],xmm7
1733DB	0x67
; r9 = -(count*8), r10 = count*32, r8 = dereferenced fifth argument.
1734	mov	r10d,r9d
1735	shl	r9d,3
1736	shl	r10d,3+2
1737	neg	r9
1738	mov	r8,QWORD[r8]
1739
1740
1741
1742
1743
1744
1745
; Frame placement based on (prospective frame base - rsi) mod 4096.
1746	lea	r11,[((-64))+r9*2+rsp]
1747	sub	r11,rsi
1748	and	r11,4095
1749	cmp	r10,r11
1750	jb	NEAR $L$from_sp_alt
1751	sub	rsp,r11
1752	lea	rsp,[((-64))+r9*2+rsp]
1753	jmp	NEAR $L$from_sp_done
1754
1755ALIGN	32
1756$L$from_sp_alt:
1757	lea	r10,[((4096-64))+r9*2]
1758	lea	rsp,[((-64))+r9*2+rsp]
1759	sub	r11,r10
1760	mov	r10,0
1761	cmovc	r11,r10
1762	sub	rsp,r11
1763$L$from_sp_done:
1764	and	rsp,-64
1765	mov	r10,r9
1766	neg	r9
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
; Frame: [rsp+32] = value loaded through r8, [rsp+40] = entry rsp.
1777	mov	QWORD[32+rsp],r8
1778	mov	QWORD[40+rsp],rax
1779$L$from_body:
1780	mov	r11,r9
1781	lea	rax,[48+rsp]
1782	pxor	xmm0,xmm0
1783	jmp	NEAR $L$mul_by_1
1784
1785ALIGN	32
; 64 bytes per iteration: copy from rsi (unaligned loads) to [rax] and
; store zeros at [rax + r9], i.e. one operand length further up.
1786$L$mul_by_1:
1787	movdqu	xmm1,XMMWORD[rsi]
1788	movdqu	xmm2,XMMWORD[16+rsi]
1789	movdqu	xmm3,XMMWORD[32+rsi]
1790	movdqa	XMMWORD[r9*1+rax],xmm0
1791	movdqu	xmm4,XMMWORD[48+rsi]
1792	movdqa	XMMWORD[16+r9*1+rax],xmm0
; Bytes 48 8D B6 40 00 00 00 = lea rsi,[rsi+64] with a 32-bit displacement.
1793DB	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
1794	movdqa	XMMWORD[rax],xmm1
1795	movdqa	XMMWORD[32+r9*1+rax],xmm0
1796	movdqa	XMMWORD[16+rax],xmm2
1797	movdqa	XMMWORD[48+r9*1+rax],xmm0
1798	movdqa	XMMWORD[32+rax],xmm3
1799	movdqa	XMMWORD[48+rax],xmm4
1800	lea	rax,[64+rax]
1801	sub	r11,64
1802	jnz	NEAR $L$mul_by_1
1803
; Hand-encoded moves: movq xmm1,rdi ; movq xmm2,rcx ; (mov rbp,rcx) ;
; movq xmm3,r10 -- the register/xmm state sqr8x_reduction reads.
1804DB	102,72,15,110,207
1805DB	102,72,15,110,209
1806DB	0x67
1807	mov	rbp,rcx
1808DB	102,73,15,110,218
1809	call	sqr8x_reduction
1810
1811	pxor	xmm0,xmm0
1812	lea	rax,[48+rsp]
1813	mov	rsi,QWORD[40+rsp]
1814	jmp	NEAR $L$from_mont_zero
1815
1816ALIGN	32
; Zero the frame: 64 bytes are cleared for every 32 subtracted from r9, so
; 2*r9 bytes in total starting at [rsp+48].
1817$L$from_mont_zero:
1818	movdqa	XMMWORD[rax],xmm0
1819	movdqa	XMMWORD[16+rax],xmm0
1820	movdqa	XMMWORD[32+rax],xmm0
1821	movdqa	XMMWORD[48+rax],xmm0
1822	lea	rax,[64+rax]
1823	sub	r9,32
1824	jnz	NEAR $L$from_mont_zero
1825
1826	mov	rax,1
1827	mov	r15,QWORD[((-48))+rsi]
1828	mov	r14,QWORD[((-40))+rsi]
1829	mov	r13,QWORD[((-32))+rsi]
1830	mov	r12,QWORD[((-24))+rsi]
1831	mov	rbp,QWORD[((-16))+rsi]
1832	mov	rbx,QWORD[((-8))+rsi]
1833	lea	rsp,[rsi]
1834$L$from_epilogue:
1835	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1836	mov	rsi,QWORD[16+rsp]
1837	DB	0F3h,0C3h		;repret
1838$L$SEH_end_bn_from_mont8x:
; bn_scatter5: copy edx 64-bit words from [rcx] into a strided table.
; Register use as seen below (matches the Win64 argument order):
;   rcx = source pointer, read 8 bytes at a time
;   edx = number of 64-bit words to copy; zero returns immediately
;   r8  = table base
;   r9  = slot index (full 64-bit register is used as the scale index)
; Word i of the source is stored at r8 + r9*8 + i*256.
; Overwrites rax, rcx, rdx, r8 and flags; leaf routine, no stack use.
1839global	bn_scatter5
1840
1841ALIGN	16
1842bn_scatter5:
1843	cmp	edx,0			; nothing to copy?
1844	jz	NEAR $L$scatter_epilogue
1845	lea	r8,[r9*8+r8]		; r8 -> selected slot in the first 256-byte row
1846$L$scatter:
1847	mov	rax,QWORD[rcx]		; fetch next source word
1848	lea	rcx,[8+rcx]		; advance source (lea leaves flags alone)
1849	mov	QWORD[r8],rax		; store into the current row
1850	lea	r8,[256+r8]		; step to the same slot in the next row
1851	sub	edx,1			; one word done; ZF drives the loop
1852	jnz	NEAR $L$scatter
1853$L$scatter_epilogue:
1854	DB	0F3h,0C3h		;repret
1855
1856
; bn_gather5: read edx 64-bit words out of a strided table into [rcx].
; Register use as seen below (matches the Win64 argument order):
;   rcx = destination pointer, written 8 bytes at a time
;   edx = number of 64-bit words to produce
;   r8  = table base
;   r9d = slot index; bits 0..2 pick the qword inside a 64-byte group,
;         bits 3..4 pick one of four 64-byte groups through the masks
; Word i is taken from r8 + (r9d & 7)*8 + ((r9d >> 3) & 3)*64 + i*256.
; Every iteration loads all four 64-byte-spaced candidates and keeps one
; by AND/OR with $L$magic_masks, so the load addresses depend only on the
; low three index bits (presumably deliberate; confirm with the
; generator script).
; xmm6/xmm7 are callee-saved on Win64, hence the save/restore around the
; loop. Overwrites rax, rcx, rdx, r8, r9, r11, xmm0-xmm5 and flags.
; NOTE(review): unlike bn_scatter5 there is no test for edx == 0 before
; the first iteration; callers presumably always pass edx > 0 - confirm.
1857global	bn_gather5
1858
1859ALIGN	16
1860bn_gather5:
1861$L$SEH_begin_bn_gather5:
1862
; Prologue is emitted as raw bytes so the instruction lengths are fixed:
; the unwind codes in $L$SEH_info_bn_gather5 record prologue offsets
; 0x04, 0x08 and 0x0d, which are the end offsets of these three lines.
1863DB	0x48,0x83,0xec,0x28	; sub rsp,0x28
1864DB	0x0f,0x29,0x34,0x24	; movaps [rsp],xmm6
1865DB	0x0f,0x29,0x7c,0x24,0x10	; movaps [rsp+0x10],xmm7
1866	mov	r11d,r9d		; r11 = index (zero-extended copy)
1867	shr	r9d,3			; r9d = index >> 3
1868	and	r11,7			; r11 = index & 7: qword within a 64-byte group
1869	not	r9d
1870	lea	rax,[$L$magic_masks]
1871	and	r9d,3			; r9 = 3 - ((index >> 3) & 3)
1872	lea	r8,[128+r11*8+r8]	; bias by 128 so the four loads use -128..+64
; Load four consecutive qwords of the mask table starting at entry r9.
; The single all-ones qword sits at entry 3, so exactly one of
; xmm4..xmm7 becomes all-ones: xmm(4 + ((index >> 3) & 3)).
1873	movq	xmm4,QWORD[r9*8+rax]
1874	movq	xmm5,QWORD[8+r9*8+rax]
1875	movq	xmm6,QWORD[16+r9*8+rax]
1876	movq	xmm7,QWORD[24+r9*8+rax]
1877	jmp	NEAR $L$gather
1878ALIGN	16
1879$L$gather:
1880	movq	xmm0,QWORD[(((-128)))+r8]	; candidate from group 0
1881	movq	xmm1,QWORD[((-64))+r8]	; candidate from group 1
1882	pand	xmm0,xmm4
1883	movq	xmm2,QWORD[r8]		; candidate from group 2
1884	pand	xmm1,xmm5
1885	movq	xmm3,QWORD[64+r8]	; candidate from group 3
1886	pand	xmm2,xmm6
1887	por	xmm0,xmm1
1888	pand	xmm3,xmm7
; Two 0x67 prefixes attach to the next instruction, which has no memory
; operand - presumably padding only; confirm against the generator.
1889DB	0x67,0x67
1890	por	xmm0,xmm2
1891	lea	r8,[256+r8]		; next 256-byte row
1892	por	xmm0,xmm3		; xmm0 = the one candidate whose mask is all-ones
1893
1894	movq	QWORD[rcx],xmm0		; emit selected word
1895	lea	rcx,[8+rcx]
1896	sub	edx,1
1897	jnz	NEAR $L$gather
1898	movaps	xmm6,XMMWORD[rsp]	; restore callee-saved xmm6/xmm7
1899	movaps	xmm7,XMMWORD[16+rsp]
1900	lea	rsp,[40+rsp]		; release the 0x28-byte frame
1901	DB	0F3h,0C3h		;repret
1902$L$SEH_end_bn_gather5:
1903
; $L$magic_masks: sixteen dwords = eight qwords, all zero except qword 3,
; which is all-ones. Code in this file loads four consecutive qwords
; starting at entry 0..3, so exactly one of the four loaded masks is
; all-ones. The table lives in the current (.text) section.
1904ALIGN	64
1905$L$magic_masks:
1906	DD	0,0,0,0,0,0,-1,-1
1907	DD	0,0,0,0,0,0,0,0
; NUL-terminated ASCII identification string:
; "Montgomery Multiplication with scatter/gather for x86_64,
;  CRYPTOGAMS by <appro@openssl.org>"
1908DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
1909DB	112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
1910DB	99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
1911DB	114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
1912DB	71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
1913DB	112,101,110,115,115,108,46,111,114,103,62,0
; mul_handler: Win64 language-specific exception handler named by the
; .xdata records of bn_mul_mont_gather5, bn_mul4x_mont_gather5,
; bn_power5 and bn_from_mont8x.
; In (as used below):
;   r8 = CONTEXT record of the interrupted frame
;   r9 = DISPATCHER_CONTEXT
; Field names in the comments follow the Win64 CONTEXT and
; DISPATCHER_CONTEXT layouts (verify against winnt.h).
; HandlerData holds two image-relative offsets: the function's body
; label and its epilogue label. Three cases:
;   Rip <  body label     : frame not relocated yet; context->Rax is used
;                           as the entry rsp (bn_mul_mont_gather5 does
;                           mov rax,rsp before its body label).
;   Rip >= epilogue label : context->Rsp already is the entry rsp.
;   otherwise             : fetch the entry rsp that the function parked
;                           on its own stack, then recover the pushed
;                           non-volatile registers from below it.
; Finally the patched CONTEXT is copied to disp->ContextRecord,
; RtlVirtualUnwind is called, and 1 (ExceptionContinueSearch) is returned.
1914EXTERN	__imp_RtlVirtualUnwind
1915
1916ALIGN	16
1917mul_handler:
1918	push	rsi
1919	push	rdi
1920	push	rbx
1921	push	rbp
1922	push	r12
1923	push	r13
1924	push	r14
1925	push	r15
1926	pushfq
1927	sub	rsp,64			; 32-byte shadow space + four stack arguments
1928
1929	mov	rax,QWORD[120+r8]	; context->Rax
1930	mov	rbx,QWORD[248+r8]	; context->Rip
1931
1932	mov	rsi,QWORD[8+r9]		; disp->ImageBase
1933	mov	r11,QWORD[56+r9]	; disp->HandlerData
1934
1935	mov	r10d,DWORD[r11]		; HandlerData[0]: body label (image-relative)
1936	lea	r10,[r10*1+rsi]
1937	cmp	rbx,r10			; Rip still inside the prologue?
1938	jb	NEAR $L$common_seh_tail
1939
1940	mov	rax,QWORD[152+r8]	; context->Rsp
1941
1942	mov	r10d,DWORD[4+r11]	; HandlerData[1]: epilogue label (image-relative)
1943	lea	r10,[r10*1+rsi]
1944	cmp	rbx,r10			; Rip at or past the epilogue label?
1945	jae	NEAR $L$common_seh_tail
1946
; Inside a function body. bn_mul_mont_gather5 is the only user of this
; handler located below $L$mul_epilogue, and it parks its entry rsp at
; [8+num*8+rsp]. Functions placed after that label (bn_from_mont8x, for
; one) park it at [40+rsp], so only addresses ABOVE the label may take
; $L$body_40. The previous jb had the two cases swapped.
1947	lea	r10,[$L$mul_epilogue]
1948	cmp	rbx,r10
1949	ja	NEAR $L$body_40
1950
1951	mov	r10,QWORD[192+r8]	; context->R9, which holds num in that function
1952	mov	rax,QWORD[8+r10*8+rax]	; entry rsp saved at [8+num*8+rsp]
1953	jmp	NEAR $L$body_proceed
1954
1955$L$body_40:
1956	mov	rax,QWORD[40+rax]	; entry rsp saved at [40+rsp]
1957$L$body_proceed:
1958
; rax = entry rsp. Below it sit the six pushed GPRs (-8..-48) and the
; xmm6/xmm7 save area (-88, -72), as laid out by the prologues.
1959	movaps	xmm0,XMMWORD[((-88))+rax]
1960	movaps	xmm1,XMMWORD[((-72))+rax]
1961
1962	mov	rbx,QWORD[((-8))+rax]
1963	mov	rbp,QWORD[((-16))+rax]
1964	mov	r12,QWORD[((-24))+rax]
1965	mov	r13,QWORD[((-32))+rax]
1966	mov	r14,QWORD[((-40))+rax]
1967	mov	r15,QWORD[((-48))+rax]
1968	mov	QWORD[144+r8],rbx	; context->Rbx
1969	mov	QWORD[160+r8],rbp	; context->Rbp
1970	mov	QWORD[216+r8],r12	; context->R12
1971	mov	QWORD[224+r8],r13	; context->R13
1972	mov	QWORD[232+r8],r14	; context->R14
1973	mov	QWORD[240+r8],r15	; context->R15
1974	movups	XMMWORD[512+r8],xmm0	; context->Xmm6
1975	movups	XMMWORD[528+r8],xmm1	; context->Xmm7
1976
1977$L$common_seh_tail:
1978	mov	rdi,QWORD[8+rax]	; rdi/rsi were stored at [8+rsp]/[16+rsp] on entry
1979	mov	rsi,QWORD[16+rax]
1980	mov	QWORD[152+r8],rax	; context->Rsp = entry rsp
1981	mov	QWORD[168+r8],rsi	; context->Rsi
1982	mov	QWORD[176+r8],rdi	; context->Rdi
1983
1984	mov	rdi,QWORD[40+r9]	; destination: disp->ContextRecord
1985	mov	rsi,r8			; source: the patched context
1986	mov	ecx,154			; 154 qwords = 1232 bytes
1987	DD	0xa548f3fc		; bytes fc f3 48 a5: cld; rep movsq
1988
1989	mov	rsi,r9
1990	xor	rcx,rcx			; arg1 = 0 (UNW_FLAG_NHANDLER)
1991	mov	rdx,QWORD[8+rsi]	; arg2 = disp->ImageBase
1992	mov	r8,QWORD[rsi]		; arg3 = disp->ControlPc
1993	mov	r9,QWORD[16+rsi]	; arg4 = disp->FunctionEntry
1994	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
1995	lea	r11,[56+rsi]		; &disp->HandlerData
1996	lea	r12,[24+rsi]		; &disp->EstablisherFrame
1997	mov	QWORD[32+rsp],r10	; arg5
1998	mov	QWORD[40+rsp],r11	; arg6
1999	mov	QWORD[48+rsp],r12	; arg7
2000	mov	QWORD[56+rsp],rcx	; arg8 = NULL
2001	call	QWORD[__imp_RtlVirtualUnwind]
2002
2003	mov	eax,1			; ExceptionContinueSearch
2004	add	rsp,64
2005	popfq
2006	pop	r15
2007	pop	r14
2008	pop	r13
2009	pop	r12
2010	pop	rbp
2011	pop	rbx
2012	pop	rdi
2013	pop	rsi
2014	DB	0F3h,0C3h		;repret
2015
2016
; .pdata: one three-dword entry per function, each an image-relative
; address: start label, end label, and the matching unwind-info record
; in .xdata.
2017section	.pdata rdata align=4
2018ALIGN	4
2019	DD	$L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase
2020	DD	$L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase
2021	DD	$L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase
2022
2023	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
2024	DD	$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
2025	DD	$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
2026
2027	DD	$L$SEH_begin_bn_power5 wrt ..imagebase
2028	DD	$L$SEH_end_bn_power5 wrt ..imagebase
2029	DD	$L$SEH_info_bn_power5 wrt ..imagebase
2030
2031	DD	$L$SEH_begin_bn_from_mont8x wrt ..imagebase
2032	DD	$L$SEH_end_bn_from_mont8x wrt ..imagebase
2033	DD	$L$SEH_info_bn_from_mont8x wrt ..imagebase
2034	DD	$L$SEH_begin_bn_gather5 wrt ..imagebase
2035	DD	$L$SEH_end_bn_gather5 wrt ..imagebase
2036	DD	$L$SEH_info_bn_gather5 wrt ..imagebase
2037
; .xdata: unwind-info records referenced from .pdata.
; The first four records share one shape:
;   DB 9,0,0,0  - version 1 with the exception-handler flag set, no
;                 prologue bytes and no unwind codes
;   DD handler  - image-relative address of mul_handler
;   DD body,epi - handler data: mul_handler reads these two dwords as
;                 the function's body label and epilogue label
2038section	.xdata rdata align=8
2039ALIGN	8
2040$L$SEH_info_bn_mul_mont_gather5:
2041DB	9,0,0,0
2042	DD	mul_handler wrt ..imagebase
2043	DD	$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
2044ALIGN	8
2045$L$SEH_info_bn_mul4x_mont_gather5:
2046DB	9,0,0,0
2047	DD	mul_handler wrt ..imagebase
2048	DD	$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
2049ALIGN	8
2050$L$SEH_info_bn_power5:
2051DB	9,0,0,0
2052	DD	mul_handler wrt ..imagebase
2053	DD	$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
2054ALIGN	8
2055$L$SEH_info_bn_from_mont8x:
2056DB	9,0,0,0
2057	DD	mul_handler wrt ..imagebase
2058	DD	$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
2059ALIGN	8
; bn_gather5 has no handler; its fixed prologue is described with plain
; unwind codes, listed from the last prologue instruction to the first.
; The offsets must match the raw prologue bytes at bn_gather5.
2060$L$SEH_info_bn_gather5:
2061DB	0x01,0x0d,0x05,0x00	; version 1, prologue 13 bytes, 5 code slots, no frame register
2062DB	0x0d,0x78,0x01,0x00	; at offset 0x0d: xmm7 saved at [rsp+0x10]
2063DB	0x08,0x68,0x00,0x00	; at offset 0x08: xmm6 saved at [rsp]
2064DB	0x04,0x42,0x00,0x00	; at offset 0x04: sub rsp,0x28; trailing 0,0 pads the slot count
2065ALIGN	8
2066