1#if defined(__x86_64__)
2.text
3
4.extern	OPENSSL_ia32cap_P
5.hidden OPENSSL_ia32cap_P
6
/*
 * bn_mul_mont_gather5 -- exported (hidden-visibility) entry point.
 * Per the banner string at the end of this file: Montgomery multiplication
 * whose multiplier words are gathered from a scattered table.
 *
 * Register use that is visible in this block (System V AMD64 arguments):
 *   %rdi     destination words, written by .Lsub / .Lcopy
 *   %rsi     multiplicand words, read as (%rsi,idx,8)
 *   %rdx     base of the scattered table (256 bytes between successive words)
 *   %rcx     modulus words, read as (%rcx,idx,8)
 *   %r8      pointer that is dereferenced once; the loaded word multiplies the
 *            low accumulator word to form the per-row reduction factor
 *            (presumably the Montgomery n0 constant -- confirm with the C prototype)
 *   %r9d     word count
 *   8(%rsp)  at entry: 32-bit table index
 * Returns 1 in %rax; %rbx, %rbp and %r12-%r15 are restored before returning.
 */
7.globl	bn_mul_mont_gather5
8.hidden bn_mul_mont_gather5
9.type	bn_mul_mont_gather5,@function
10.align	64
11bn_mul_mont_gather5:
/* Word counts with any of the low three bits set use the generic loop below;
   multiples of 8 are handed to the 4x-unrolled code at .Lmul4x_enter. */
12	testl	$7,%r9d
13	jnz	.Lmul_enter
14	jmp	.Lmul4x_enter
15
16.align	16
17.Lmul_enter:
/* Zero-extend the count, remember the entry %rsp in %rax and fetch the
   stack-passed index before the pushes move %rsp. */
18	movl	%r9d,%r9d
19	movq	%rsp,%rax
20	movl	8(%rsp),%r10d
21	pushq	%rbx
22	pushq	%rbp
23	pushq	%r12
24	pushq	%r13
25	pushq	%r14
26	pushq	%r15
/* Reserve count+2 quadwords of scratch and round %rsp down to a 1024-byte
   boundary; the entry %rsp is parked in slot count+1 of that scratch area. */
27	leaq	2(%r9),%r11
28	negq	%r11
29	leaq	(%rsp,%r11,8),%rsp
30	andq	$-1024,%rsp
31
32	movq	%rax,8(%rsp,%r9,8)
33.Lmul_body:
/* Split the index: %r11 = index & 7 selects the column inside each 64-byte
   group, %r10 = ~(index >> 3) & 3 selects which four consecutive quadwords of
   .Lmagic_masks land in %xmm4-%xmm7 (exactly one of them is all-ones). */
34	movq	%rdx,%r12
35	movq	%r10,%r11
36	shrq	$3,%r10
37	andq	$7,%r11
38	notq	%r10
39	leaq	.Lmagic_masks(%rip),%rax
40	andq	$3,%r10
41	leaq	96(%r12,%r11,8),%r12
42	movq	0(%rax,%r10,8),%xmm4
43	movq	8(%rax,%r10,8),%xmm5
44	movq	16(%rax,%r10,8),%xmm6
45	movq	24(%rax,%r10,8),%xmm7
46
/* Gather the first multiplier word: load the four candidates that sit 64
   bytes apart, mask each one, OR them together, then step to the next row
   (256 bytes further on). */
47	movq	-96(%r12),%xmm0
48	movq	-32(%r12),%xmm1
49	pand	%xmm4,%xmm0
50	movq	32(%r12),%xmm2
51	pand	%xmm5,%xmm1
52	movq	96(%r12),%xmm3
53	pand	%xmm6,%xmm2
54	por	%xmm1,%xmm0
55	pand	%xmm7,%xmm3
56	por	%xmm2,%xmm0
57	leaq	256(%r12),%r12
58	por	%xmm3,%xmm0
59
/* Encoded instruction: movq %xmm0,%rbx (current multiplier word). */
60.byte	102,72,15,126,195
61
62	movq	(%r8),%r8
63	movq	(%rsi),%rax
64
/* %r14 counts outer rows, %r15 indexes words inside a row. */
65	xorq	%r14,%r14
66	xorq	%r15,%r15
67
/* From here the gather of the NEXT multiplier word is interleaved with the
   integer multiplies of the current row. */
68	movq	-96(%r12),%xmm0
69	movq	-32(%r12),%xmm1
70	pand	%xmm4,%xmm0
71	movq	32(%r12),%xmm2
72	pand	%xmm5,%xmm1
73
74	movq	%r8,%rbp
75	mulq	%rbx
76	movq	%rax,%r10
77	movq	(%rcx),%rax
78
79	movq	96(%r12),%xmm3
80	pand	%xmm6,%xmm2
81	por	%xmm1,%xmm0
82	pand	%xmm7,%xmm3
83
/* %rbp = (word loaded from %r8) * low product word: the reduction factor
   for this row. */
84	imulq	%r10,%rbp
85	movq	%rdx,%r11
86
87	por	%xmm2,%xmm0
88	leaq	256(%r12),%r12
89	por	%xmm3,%xmm0
90
/* Adding modulus[0]*%rbp to the low word: only the carry (%r13) is kept. */
91	mulq	%rbp
92	addq	%rax,%r10
93	movq	8(%rsi),%rax
94	adcq	$0,%rdx
95	movq	%rdx,%r13
96
97	leaq	1(%r15),%r15
98	jmp	.L1st_enter
99
100.align	16
/* First row: for each remaining word accumulate multiplicand[j]*%rbx and
   modulus[j]*%rbp, storing the running sum one slot lower in the scratch
   area (-16(%rsp,%r15,8) with %r15 already incremented). */
101.L1st:
102	addq	%rax,%r13
103	movq	(%rsi,%r15,8),%rax
104	adcq	$0,%rdx
105	addq	%r11,%r13
106	movq	%r10,%r11
107	adcq	$0,%rdx
108	movq	%r13,-16(%rsp,%r15,8)
109	movq	%rdx,%r13
110
111.L1st_enter:
112	mulq	%rbx
113	addq	%rax,%r11
114	movq	(%rcx,%r15,8),%rax
115	adcq	$0,%rdx
116	leaq	1(%r15),%r15
117	movq	%rdx,%r10
118
119	mulq	%rbp
120	cmpq	%r9,%r15
121	jne	.L1st
122
/* Encoded instruction: movq %xmm0,%rbx (multiplier word for the next row). */
123.byte	102,72,15,126,195
124
125	addq	%rax,%r13
126	movq	(%rsi),%rax
127	adcq	$0,%rdx
128	addq	%r11,%r13
129	adcq	$0,%rdx
130	movq	%r13,-16(%rsp,%r15,8)
131	movq	%rdx,%r13
132	movq	%r10,%r11
133
/* Top two scratch words of the row: slot count-1 and the carry word in
   slot count. */
134	xorq	%rdx,%rdx
135	addq	%r11,%r13
136	adcq	$0,%rdx
137	movq	%r13,-8(%rsp,%r9,8)
138	movq	%rdx,(%rsp,%r9,8)
139
140	leaq	1(%r14),%r14
141	jmp	.Louter
142.align	16
/* Remaining rows: same shape as the first row, except that the previous
   contents of the scratch area are added in as well. */
143.Louter:
144	xorq	%r15,%r15
145	movq	%r8,%rbp
146	movq	(%rsp),%r10
147
148	movq	-96(%r12),%xmm0
149	movq	-32(%r12),%xmm1
150	pand	%xmm4,%xmm0
151	movq	32(%r12),%xmm2
152	pand	%xmm5,%xmm1
153
154	mulq	%rbx
155	addq	%rax,%r10
156	movq	(%rcx),%rax
157	adcq	$0,%rdx
158
159	movq	96(%r12),%xmm3
160	pand	%xmm6,%xmm2
161	por	%xmm1,%xmm0
162	pand	%xmm7,%xmm3
163
164	imulq	%r10,%rbp
165	movq	%rdx,%r11
166
167	por	%xmm2,%xmm0
168	leaq	256(%r12),%r12
169	por	%xmm3,%xmm0
170
171	mulq	%rbp
172	addq	%rax,%r10
173	movq	8(%rsi),%rax
174	adcq	$0,%rdx
175	movq	8(%rsp),%r10
176	movq	%rdx,%r13
177
178	leaq	1(%r15),%r15
179	jmp	.Linner_enter
180
181.align	16
182.Linner:
183	addq	%rax,%r13
184	movq	(%rsi,%r15,8),%rax
185	adcq	$0,%rdx
186	addq	%r10,%r13
187	movq	(%rsp,%r15,8),%r10
188	adcq	$0,%rdx
189	movq	%r13,-16(%rsp,%r15,8)
190	movq	%rdx,%r13
191
192.Linner_enter:
193	mulq	%rbx
194	addq	%rax,%r11
195	movq	(%rcx,%r15,8),%rax
196	adcq	$0,%rdx
197	addq	%r11,%r10
198	movq	%rdx,%r11
199	adcq	$0,%r11
200	leaq	1(%r15),%r15
201
202	mulq	%rbp
203	cmpq	%r9,%r15
204	jne	.Linner
205
/* Encoded instruction: movq %xmm0,%rbx (multiplier word for the next row). */
206.byte	102,72,15,126,195
207
208	addq	%rax,%r13
209	movq	(%rsi),%rax
210	adcq	$0,%rdx
211	addq	%r10,%r13
212	movq	(%rsp,%r15,8),%r10
213	adcq	$0,%rdx
214	movq	%r13,-16(%rsp,%r15,8)
215	movq	%rdx,%r13
216
217	xorq	%rdx,%rdx
218	addq	%r11,%r13
219	adcq	$0,%rdx
220	addq	%r10,%r13
221	adcq	$0,%rdx
222	movq	%r13,-8(%rsp,%r9,8)
223	movq	%rdx,(%rsp,%r9,8)
224
225	leaq	1(%r14),%r14
226	cmpq	%r9,%r14
227	jb	.Louter
228
/* Final step, part 1: destination = scratch - modulus, word by word.
   The xorq clears CF for the first sbbq; leaq and decq leave CF alone, so
   the borrow chains across iterations. */
229	xorq	%r14,%r14
230	movq	(%rsp),%rax
231	leaq	(%rsp),%rsi
232	movq	%r9,%r15
233	jmp	.Lsub
234.align	16
235.Lsub:	sbbq	(%rcx,%r14,8),%rax
236	movq	%rax,(%rdi,%r14,8)
237	movq	8(%rsi,%r14,8),%rax
238	leaq	1(%r14),%r14
239	decq	%r15
240	jnz	.Lsub
241
/* %rax now holds the top carry word minus the final borrow; it is used as
   the select mask in .Lcopy. */
242	sbbq	$0,%rax
243	xorq	%r14,%r14
244	movq	%r9,%r15
245.align	16
/* Final step, part 2: branch-free select.  Each destination word becomes
   ((scratch ^ dest) & mask) ^ dest, i.e. the scratch word where the mask is
   all-ones and the subtracted word where it is zero.  The scratch slot is
   overwritten with the loop index on the way. */
246.Lcopy:
247	movq	(%rsp,%r14,8),%rsi
248	movq	(%rdi,%r14,8),%rcx
249	xorq	%rcx,%rsi
250	andq	%rax,%rsi
251	xorq	%rcx,%rsi
252	movq	%r14,(%rsp,%r14,8)
253	movq	%rsi,(%rdi,%r14,8)
254	leaq	1(%r14),%r14
255	subq	$1,%r15
256	jnz	.Lcopy
257
/* Epilogue: reload the entry %rsp saved in the scratch area, return 1 and
   restore the callee-saved registers from just below that address. */
258	movq	8(%rsp,%r9,8),%rsi
259	movq	$1,%rax
260	movq	-48(%rsi),%r15
261	movq	-40(%rsi),%r14
262	movq	-32(%rsi),%r13
263	movq	-24(%rsi),%r12
264	movq	-16(%rsi),%rbp
265	movq	-8(%rsi),%rbx
266	leaq	(%rsi),%rsp
267.Lmul_epilogue:
/* Encoded instruction: rep ret. */
268	.byte	0xf3,0xc3
269.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
/*
 * bn_mul4x_mont_gather5 -- file-local routine, reached through the
 * .Lmul4x_enter label from bn_mul_mont_gather5 when the word count in %r9d is
 * a multiple of 8.  It builds a stack frame, calls mul4x_internal for the
 * arithmetic, then returns 1 with the callee-saved registers restored.
 * Argument registers are untouched on arrival, so they have the same
 * meaning as at bn_mul_mont_gather5.
 */
270.type	bn_mul4x_mont_gather5,@function
271.align	32
272bn_mul4x_mont_gather5:
273.Lmul4x_enter:
/* The lone 0x67 bytes in this routine are address-size prefixes glued onto
   the following instruction (presumably padding -- confirm against the
   generator script). */
274.byte	0x67
275	movq	%rsp,%rax
276	pushq	%rbx
277	pushq	%rbp
278	pushq	%r12
279	pushq	%r13
280	pushq	%r14
281	pushq	%r15
282.byte	0x67
/* %r9 = -(count*8) (negated byte length), %r10d = count*32. */
283	movl	%r9d,%r10d
284	shll	$3,%r9d
285	shll	$3+2,%r10d
286	negq	%r9
287
288
289
290
291
292
293
294
/* Frame placement: %r11 = (candidate frame bottom - %rsi) mod 4096.  When
   count*32 is not below that distance the frame is simply moved down by it;
   otherwise the alternative path below applies a different (possibly zero)
   adjustment.  NOTE(review): presumably this keeps the frame and the input
   at %rsi from aliasing modulo 4096 -- confirm against the generator script. */
295	leaq	-64(%rsp,%r9,2),%r11
296	subq	%rsi,%r11
297	andq	$4095,%r11
298	cmpq	%r11,%r10
299	jb	.Lmul4xsp_alt
300	subq	%r11,%rsp
301	leaq	-64(%rsp,%r9,2),%rsp
302	jmp	.Lmul4xsp_done
303
304.align	32
305.Lmul4xsp_alt:
306	leaq	4096-64(,%r9,2),%r10
307	leaq	-64(%rsp,%r9,2),%rsp
308	subq	%r10,%r11
309	movq	$0,%r10
310	cmovcq	%r10,%r11
311	subq	%r11,%rsp
312.Lmul4xsp_done:
/* 64-byte align the frame, make %r9 the positive byte length again and keep
   the entry %rsp at 40(%rsp) for the epilogue. */
313	andq	$-64,%rsp
314	negq	%r9
315
316	movq	%rax,40(%rsp)
317.Lmul4x_body:
318
/* %rax still holds the entry %rsp; mul4x_internal reads the stack-passed
   table index through it. */
319	call	mul4x_internal
320
/* Epilogue: return 1, restore callee-saved registers and the entry %rsp. */
321	movq	40(%rsp),%rsi
322	movq	$1,%rax
323	movq	-48(%rsi),%r15
324	movq	-40(%rsi),%r14
325	movq	-32(%rsi),%r13
326	movq	-24(%rsi),%r12
327	movq	-16(%rsi),%rbp
328	movq	-8(%rsi),%rbx
329	leaq	(%rsi),%rsp
330.Lmul4x_epilogue:
/* Encoded instruction: rep ret. */
331	.byte	0xf3,0xc3
332.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
333
/*
 * mul4x_internal -- file-local worker, called from bn_mul4x_mont_gather5 and
 * bn_power5.  Four-way unrolled multiply-and-reduce with the multiplier words
 * gathered from the scattered table.
 *
 * State expected on entry, as established by the visible callers:
 *   %rax  caller's entry %rsp; 8(%rax) is read as the 32-bit table index
 *   %rdx  table base          %rsi  multiplicand words
 *   %rcx  modulus words, read at a 16-byte stride (0, 16, 32, 48 ...)
 *   %r8   pointer to the reduction constant (dereferenced once)
 *   %r9   byte length (word count * 8)
 *   %rdi  destination, parked in the frame until the final subtraction
 * Frame slots are addressed with an extra +8 because the call pushed a
 * return address.  The routine does not return by itself: it ends by jumping
 * to .Lsqr4x_sub, whose ret goes back to the caller.
 */
334.type	mul4x_internal,@function
335.align	32
336mul4x_internal:
/* %r13 = table + 256 + byte length*32: the value %r12 is compared against
   to terminate the outer loop. */
337	shlq	$5,%r9
338	movl	8(%rax),%r10d
339	leaq	256(%rdx,%r9,1),%r13
340	shrq	$5,%r9
/* Index split and mask selection, same scheme as in bn_mul_mont_gather5:
   %r11 = index & 7, %r10 = ~(index >> 3) & 3 picks the mask quadruple. */
341	movq	%r10,%r11
342	shrq	$3,%r10
343	andq	$7,%r11
344	notq	%r10
345	leaq	.Lmagic_masks(%rip),%rax
346	andq	$3,%r10
347	leaq	96(%rdx,%r11,8),%r12
348	movq	0(%rax,%r10,8),%xmm4
349	movq	8(%rax,%r10,8),%xmm5
/* %r11 becomes (index + 7) & 7; it offsets the scratch pointer %r14 below. */
350	addq	$7,%r11
351	movq	16(%rax,%r10,8),%xmm6
352	movq	24(%rax,%r10,8),%xmm7
353	andq	$7,%r11
354
/* Gather multiplier word 0 (via %r12) and start gathering word 1 (via
   %r14 = %r12 + 256). */
355	movq	-96(%r12),%xmm0
356	leaq	256(%r12),%r14
357	movq	-32(%r12),%xmm1
358	pand	%xmm4,%xmm0
359	movq	32(%r12),%xmm2
360	pand	%xmm5,%xmm1
361	movq	96(%r12),%xmm3
362	pand	%xmm6,%xmm2
363.byte	0x67
364	por	%xmm1,%xmm0
365	movq	-96(%r14),%xmm1
366.byte	0x67
367	pand	%xmm7,%xmm3
368.byte	0x67
369	por	%xmm2,%xmm0
370	movq	-32(%r14),%xmm2
371.byte	0x67
372	pand	%xmm4,%xmm1
373.byte	0x67
374	por	%xmm3,%xmm0
375	movq	32(%r14),%xmm3
376
/* Encoded instruction: movq %xmm0,%rbx (multiplier word 0). */
377.byte	102,72,15,126,195
378	movq	96(%r14),%xmm0
/* Park the outer-loop end address and the destination pointer in the frame. */
379	movq	%r13,16+8(%rsp)
380	movq	%rdi,56+8(%rsp)
381
/* %rsi is advanced to the end of the multiplicand and %r9 negated, so that
   (%rsi,%r9) and (%rsi,%r15) index upwards from the start. */
382	movq	(%r8),%r8
383	movq	(%rsi),%rax
384	leaq	(%rsi,%r9,1),%rsi
385	negq	%r9
386
387	movq	%r8,%rbp
388	mulq	%rbx
389	movq	%rax,%r10
390	movq	(%rcx),%rax
391
392	pand	%xmm5,%xmm2
393	pand	%xmm6,%xmm3
394	por	%xmm2,%xmm1
395
/* %rbp = reduction constant * low product word: this row's reduction factor. */
396	imulq	%r10,%rbp
397
398
399
400
401
402
403
/* %r14 = scratch write pointer, offset by the (index+7)&7 value computed
   above. */
404	leaq	64+8(%rsp,%r11,8),%r14
405	movq	%rdx,%r11
406
407	pand	%xmm7,%xmm0
408	por	%xmm3,%xmm1
409	leaq	512(%r12),%r12
410	por	%xmm1,%xmm0
411
412	mulq	%rbp
413	addq	%rax,%r10
414	movq	8(%rsi,%r9,1),%rax
415	adcq	$0,%rdx
416	movq	%rdx,%rdi
417
418	mulq	%rbx
419	addq	%rax,%r11
420	movq	16(%rcx),%rax
421	adcq	$0,%rdx
422	movq	%rdx,%r10
423
424	mulq	%rbp
425	addq	%rax,%rdi
426	movq	16(%rsi,%r9,1),%rax
427	adcq	$0,%rdx
428	addq	%r11,%rdi
429	leaq	32(%r9),%r15
430	leaq	64(%rcx),%rcx
431	adcq	$0,%rdx
432	movq	%rdi,(%r14)
433	movq	%rdx,%r13
434	jmp	.L1st4x
435
436.align	32
/* First row, four words per iteration.  %r15 is a negative byte offset that
   climbs by 32 until it reaches zero; multiplicand*%rbx and modulus*%rbp
   products alternate, with %r10/%r11 and %rdi/%r13 carrying between them. */
437.L1st4x:
438	mulq	%rbx
439	addq	%rax,%r10
440	movq	-32(%rcx),%rax
441	leaq	32(%r14),%r14
442	adcq	$0,%rdx
443	movq	%rdx,%r11
444
445	mulq	%rbp
446	addq	%rax,%r13
447	movq	-8(%rsi,%r15,1),%rax
448	adcq	$0,%rdx
449	addq	%r10,%r13
450	adcq	$0,%rdx
451	movq	%r13,-24(%r14)
452	movq	%rdx,%rdi
453
454	mulq	%rbx
455	addq	%rax,%r11
456	movq	-16(%rcx),%rax
457	adcq	$0,%rdx
458	movq	%rdx,%r10
459
460	mulq	%rbp
461	addq	%rax,%rdi
462	movq	(%rsi,%r15,1),%rax
463	adcq	$0,%rdx
464	addq	%r11,%rdi
465	adcq	$0,%rdx
466	movq	%rdi,-16(%r14)
467	movq	%rdx,%r13
468
469	mulq	%rbx
470	addq	%rax,%r10
471	movq	0(%rcx),%rax
472	adcq	$0,%rdx
473	movq	%rdx,%r11
474
475	mulq	%rbp
476	addq	%rax,%r13
477	movq	8(%rsi,%r15,1),%rax
478	adcq	$0,%rdx
479	addq	%r10,%r13
480	adcq	$0,%rdx
481	movq	%r13,-8(%r14)
482	movq	%rdx,%rdi
483
484	mulq	%rbx
485	addq	%rax,%r11
486	movq	16(%rcx),%rax
487	adcq	$0,%rdx
488	movq	%rdx,%r10
489
490	mulq	%rbp
491	addq	%rax,%rdi
492	movq	16(%rsi,%r15,1),%rax
493	adcq	$0,%rdx
494	addq	%r11,%rdi
495	leaq	64(%rcx),%rcx
496	adcq	$0,%rdx
497	movq	%rdi,(%r14)
498	movq	%rdx,%r13
499
500	addq	$32,%r15
501	jnz	.L1st4x
502
/* Tail of the first row: the last two words, then reload %rax with the
   first multiplicand word for the next row. */
503	mulq	%rbx
504	addq	%rax,%r10
505	movq	-32(%rcx),%rax
506	leaq	32(%r14),%r14
507	adcq	$0,%rdx
508	movq	%rdx,%r11
509
510	mulq	%rbp
511	addq	%rax,%r13
512	movq	-8(%rsi),%rax
513	adcq	$0,%rdx
514	addq	%r10,%r13
515	adcq	$0,%rdx
516	movq	%r13,-24(%r14)
517	movq	%rdx,%rdi
518
519	mulq	%rbx
520	addq	%rax,%r11
521	movq	-16(%rcx),%rax
522	adcq	$0,%rdx
523	movq	%rdx,%r10
524
525	mulq	%rbp
526	addq	%rax,%rdi
527	movq	(%rsi,%r9,1),%rax
528	adcq	$0,%rdx
529	addq	%r11,%rdi
530	adcq	$0,%rdx
531	movq	%rdi,-16(%r14)
532	movq	%rdx,%r13
533
/* Encoded instruction: movq %xmm0,%rbx (multiplier word for the next row).
   %rcx is then rewound to the start of the modulus (%r9 is negative). */
534.byte	102,72,15,126,195
535	leaq	(%rcx,%r9,2),%rcx
536
/* %rdi = carry out of the row's top word. */
537	xorq	%rdi,%rdi
538	addq	%r10,%r13
539	adcq	$0,%rdi
540	movq	%r13,-8(%r14)
541
542	jmp	.Louter4x
543
544.align	32
/* Remaining rows: like the first row, but the previous scratch contents
   are added in, and the gather of the following multiplier word is
   interleaved with the first multiplies. */
545.Louter4x:
546	movq	(%r14,%r9,1),%r10
547	movq	%r8,%rbp
548	mulq	%rbx
549	addq	%rax,%r10
550	movq	(%rcx),%rax
551	adcq	$0,%rdx
552
553	movq	-96(%r12),%xmm0
554	movq	-32(%r12),%xmm1
555	pand	%xmm4,%xmm0
556	movq	32(%r12),%xmm2
557	pand	%xmm5,%xmm1
558	movq	96(%r12),%xmm3
559
560	imulq	%r10,%rbp
561.byte	0x67
562	movq	%rdx,%r11
/* Store the previous row's top carry, then rewind %r14 to the row start. */
563	movq	%rdi,(%r14)
564
565	pand	%xmm6,%xmm2
566	por	%xmm1,%xmm0
567	pand	%xmm7,%xmm3
568	por	%xmm2,%xmm0
569	leaq	(%r14,%r9,1),%r14
570	leaq	256(%r12),%r12
571	por	%xmm3,%xmm0
572
573	mulq	%rbp
574	addq	%rax,%r10
575	movq	8(%rsi,%r9,1),%rax
576	adcq	$0,%rdx
577	movq	%rdx,%rdi
578
579	mulq	%rbx
580	addq	%rax,%r11
581	movq	16(%rcx),%rax
582	adcq	$0,%rdx
583	addq	8(%r14),%r11
584	adcq	$0,%rdx
585	movq	%rdx,%r10
586
587	mulq	%rbp
588	addq	%rax,%rdi
589	movq	16(%rsi,%r9,1),%rax
590	adcq	$0,%rdx
591	addq	%r11,%rdi
592	leaq	32(%r9),%r15
593	leaq	64(%rcx),%rcx
594	adcq	$0,%rdx
595	movq	%rdx,%r13
596	jmp	.Linner4x
597
598.align	32
599.Linner4x:
600	mulq	%rbx
601	addq	%rax,%r10
602	movq	-32(%rcx),%rax
603	adcq	$0,%rdx
604	addq	16(%r14),%r10
605	leaq	32(%r14),%r14
606	adcq	$0,%rdx
607	movq	%rdx,%r11
608
609	mulq	%rbp
610	addq	%rax,%r13
611	movq	-8(%rsi,%r15,1),%rax
612	adcq	$0,%rdx
613	addq	%r10,%r13
614	adcq	$0,%rdx
615	movq	%rdi,-32(%r14)
616	movq	%rdx,%rdi
617
618	mulq	%rbx
619	addq	%rax,%r11
620	movq	-16(%rcx),%rax
621	adcq	$0,%rdx
622	addq	-8(%r14),%r11
623	adcq	$0,%rdx
624	movq	%rdx,%r10
625
626	mulq	%rbp
627	addq	%rax,%rdi
628	movq	(%rsi,%r15,1),%rax
629	adcq	$0,%rdx
630	addq	%r11,%rdi
631	adcq	$0,%rdx
632	movq	%r13,-24(%r14)
633	movq	%rdx,%r13
634
635	mulq	%rbx
636	addq	%rax,%r10
637	movq	0(%rcx),%rax
638	adcq	$0,%rdx
639	addq	(%r14),%r10
640	adcq	$0,%rdx
641	movq	%rdx,%r11
642
643	mulq	%rbp
644	addq	%rax,%r13
645	movq	8(%rsi,%r15,1),%rax
646	adcq	$0,%rdx
647	addq	%r10,%r13
648	adcq	$0,%rdx
649	movq	%rdi,-16(%r14)
650	movq	%rdx,%rdi
651
652	mulq	%rbx
653	addq	%rax,%r11
654	movq	16(%rcx),%rax
655	adcq	$0,%rdx
656	addq	8(%r14),%r11
657	adcq	$0,%rdx
658	movq	%rdx,%r10
659
660	mulq	%rbp
661	addq	%rax,%rdi
662	movq	16(%rsi,%r15,1),%rax
663	adcq	$0,%rdx
664	addq	%r11,%rdi
665	leaq	64(%rcx),%rcx
666	adcq	$0,%rdx
667	movq	%r13,-8(%r14)
668	movq	%rdx,%r13
669
670	addq	$32,%r15
671	jnz	.Linner4x
672
673	mulq	%rbx
674	addq	%rax,%r10
675	movq	-32(%rcx),%rax
676	adcq	$0,%rdx
677	addq	16(%r14),%r10
678	leaq	32(%r14),%r14
679	adcq	$0,%rdx
680	movq	%rdx,%r11
681
682	mulq	%rbp
683	addq	%rax,%r13
684	movq	-8(%rsi),%rax
685	adcq	$0,%rdx
686	addq	%r10,%r13
687	adcq	$0,%rdx
688	movq	%rdi,-32(%r14)
689	movq	%rdx,%rdi
690
/* Operands are swapped for the last modulus product: %rax takes the
   reduction factor and %rbp the top modulus word, so that word is still in
   %rbp for the comparison after the outer loop. */
691	mulq	%rbx
692	addq	%rax,%r11
693	movq	%rbp,%rax
694	movq	-16(%rcx),%rbp
695	adcq	$0,%rdx
696	addq	-8(%r14),%r11
697	adcq	$0,%rdx
698	movq	%rdx,%r10
699
700	mulq	%rbp
701	addq	%rax,%rdi
702	movq	(%rsi,%r9,1),%rax
703	adcq	$0,%rdx
704	addq	%r11,%rdi
705	adcq	$0,%rdx
706	movq	%r13,-24(%r14)
707	movq	%rdx,%r13
708
/* Encoded instruction: movq %xmm0,%rbx (multiplier word for the next row). */
709.byte	102,72,15,126,195
710	movq	%rdi,-16(%r14)
711	leaq	(%rcx,%r9,2),%rcx
712
713	xorq	%rdi,%rdi
714	addq	%r10,%r13
715	adcq	$0,%rdi
716	addq	(%r14),%r13
717	adcq	$0,%rdi
718	movq	%r13,-8(%r14)
719
/* Loop until the table pointer reaches the end address saved on entry. */
720	cmpq	16+8(%rsp),%r12
721	jb	.Louter4x
/* Hand-off to the shared subtraction loop.  The borrow from (top modulus
   word - top result word) is ORed with the top carry and inverted, and the
   resulting 0/1 flag selects %rbp = modulus base + 0 or + 8.
   NOTE(review): presumably the +8 slots of the 16-byte-stride modulus hold
   zeros so the subtraction becomes a no-op -- confirm with the caller's
   modulus layout.  %rbx = start of the scratch result, %rcx = -(count/4),
   %rdi = destination reloaded from the frame. */
722	subq	%r13,%rbp
723	adcq	%r15,%r15
724	orq	%r15,%rdi
725	xorq	$1,%rdi
726	leaq	(%r14,%r9,1),%rbx
727	leaq	(%rcx,%rdi,8),%rbp
728	movq	%r9,%rcx
729	sarq	$3+2,%rcx
730	movq	56+8(%rsp),%rdi
731	jmp	.Lsqr4x_sub
732.size	mul4x_internal,.-mul4x_internal
/*
 * bn_power5 -- exported (hidden-visibility) entry point.
 * Runs __bn_sqr8x_internal five times in a row and then mul4x_internal once
 * with a multiplier gathered from the table; i.e. five squarings followed by
 * one table multiplication.  Returns 1 in %rax.
 *
 * Arguments as consumed here and by the callees: %rdi destination, %rsi
 * input words, %rdx table base, %rcx modulus, %r8 pointer to the reduction
 * constant, %r9d word count, 8(%rsp) table index (read later by
 * mul4x_internal through the saved entry %rsp).
 */
733.globl	bn_power5
734.hidden bn_power5
735.type	bn_power5,@function
736.align	32
737bn_power5:
738	movq	%rsp,%rax
739	pushq	%rbx
740	pushq	%rbp
741	pushq	%r12
742	pushq	%r13
743	pushq	%r14
744	pushq	%r15
/* %r9 = -(count*8), %r10d = count*32, %r8 = the reduction constant itself. */
745	movl	%r9d,%r10d
746	shll	$3,%r9d
747	shll	$3+2,%r10d
748	negq	%r9
749	movq	(%r8),%r8
750
751
752
753
754
755
756
/* Frame placement relative to the input pointer modulo 4096; the same
   sequence appears in bn_mul4x_mont_gather5 and bn_from_mont8x. */
757	leaq	-64(%rsp,%r9,2),%r11
758	subq	%rsi,%r11
759	andq	$4095,%r11
760	cmpq	%r11,%r10
761	jb	.Lpwr_sp_alt
762	subq	%r11,%rsp
763	leaq	-64(%rsp,%r9,2),%rsp
764	jmp	.Lpwr_sp_done
765
766.align	32
767.Lpwr_sp_alt:
768	leaq	4096-64(,%r9,2),%r10
769	leaq	-64(%rsp,%r9,2),%rsp
770	subq	%r10,%r11
771	movq	$0,%r10
772	cmovcq	%r10,%r11
773	subq	%r11,%rsp
774.Lpwr_sp_done:
/* After this: %r10 = -(byte length), %r9 = +(byte length). */
775	andq	$-64,%rsp
776	movq	%r9,%r10
777	negq	%r9
778
779
780
781
782
783
784
785
786
787
/* Frame slots: 32(%rsp) = reduction constant, 40(%rsp) = entry %rsp. */
788	movq	%r8,32(%rsp)
789	movq	%rax,40(%rsp)
790.Lpower5_body:
/* Encoded instructions, in order: movq %rdi,%xmm1 ; movq %rcx,%xmm2 ;
   movq %r10,%xmm3 ; movq %rdx,%xmm4.  The values are stashed in xmm
   registers because the integer registers are reused by the callee. */
791.byte	102,72,15,110,207
792.byte	102,72,15,110,209
793.byte	102,73,15,110,218
794.byte	102,72,15,110,226
795
796	call	__bn_sqr8x_internal
797	call	__bn_sqr8x_internal
798	call	__bn_sqr8x_internal
799	call	__bn_sqr8x_internal
800	call	__bn_sqr8x_internal
801
/* Encoded instructions: movq %xmm2,%rcx ; movq %xmm4,%rdx (modulus and table
   base back into the registers mul4x_internal expects).  %rsi was reloaded
   from %xmm1 by the squaring code, so %rdi = %rsi makes the multiplication
   write where the squarings wrote.  %rax = entry %rsp (for the index at
   8(%rax)), %r8 = address of the saved reduction constant. */
802.byte	102,72,15,126,209
803.byte	102,72,15,126,226
804	movq	%rsi,%rdi
805	movq	40(%rsp),%rax
806	leaq	32(%rsp),%r8
807
808	call	mul4x_internal
809
/* Epilogue: return 1, restore callee-saved registers and the entry %rsp. */
810	movq	40(%rsp),%rsi
811	movq	$1,%rax
812	movq	-48(%rsi),%r15
813	movq	-40(%rsi),%r14
814	movq	-32(%rsi),%r13
815	movq	-24(%rsi),%r12
816	movq	-16(%rsi),%rbp
817	movq	-8(%rsi),%rbx
818	leaq	(%rsi),%rsp
819.Lpower5_epilogue:
/* Encoded instruction: rep ret. */
820	.byte	0xf3,0xc3
821.size	bn_power5,.-bn_power5
822
/*
 * bn_sqr8x_internal / __bn_sqr8x_internal -- squaring half of the
 * square-and-reduce routine; execution falls through into sqr8x_reduction
 * right after this block.  Called five times in a row by bn_power5.
 *
 * State expected on entry, as set up by bn_power5:
 *   %rsi  input words          %r9   byte length (word count * 8)
 *   %r10  -(byte length)       %xmm2 modulus pointer (reloaded at the end)
 *   frame built by the caller; slots are addressed with an extra +8 for the
 *   return address, the double-width product lives from 48+8(%rsp) upwards.
 *
 * Three phases are visible below:
 *   1. .Lsqr4x_1st / .Lsqr4x_outer / .Lsqr4x_inner and the straight-line tail
 *      accumulate products of distinct word pairs (two multiplier words,
 *      %r14 and %r15, per pass).
 *   2. The word after the last pass multiplies the two top words.
 *   3. .Lsqr4x_shift_n_add doubles that partial result and adds the squares
 *      of the individual words.
 */
823.globl	bn_sqr8x_internal
824.hidden bn_sqr8x_internal
825.hidden	bn_sqr8x_internal
826.type	bn_sqr8x_internal,@function
827.align	32
828bn_sqr8x_internal:
829__bn_sqr8x_internal:
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
/* %rbp = negative byte offset + 32 (pass cursor), %rsi moved to the end of
   the input so that negative offsets index from the start. */
903	leaq	32(%r10),%rbp
904	leaq	(%rsi,%r9,1),%rsi
905
906	movq	%r9,%rcx
907
908
/* First pass: %r14 and %r15 are the first two input words, %rdi the matching
   position in the product area. */
909	movq	-32(%rsi,%rbp,1),%r14
910	leaq	48+8(%rsp,%r9,2),%rdi
911	movq	-24(%rsi,%rbp,1),%rax
912	leaq	-32(%rdi,%rbp,1),%rdi
913	movq	-16(%rsi,%rbp,1),%rbx
914	movq	%rax,%r15
915
916	mulq	%r14
917	movq	%rax,%r10
918	movq	%rbx,%rax
919	movq	%rdx,%r11
920	movq	%r10,-24(%rdi,%rbp,1)
921
922	mulq	%r14
923	addq	%rax,%r11
924	movq	%rbx,%rax
925	adcq	$0,%rdx
926	movq	%r11,-16(%rdi,%rbp,1)
927	movq	%rdx,%r10
928
929
930	movq	-8(%rsi,%rbp,1),%rbx
931	mulq	%r15
932	movq	%rax,%r12
933	movq	%rbx,%rax
934	movq	%rdx,%r13
935
936	leaq	(%rbp),%rcx
937	mulq	%r14
938	addq	%rax,%r10
939	movq	%rbx,%rax
940	movq	%rdx,%r11
941	adcq	$0,%r11
942	addq	%r12,%r10
943	adcq	$0,%r11
944	movq	%r10,-8(%rdi,%rcx,1)
945	jmp	.Lsqr4x_1st
946
947.align	32
/* First-pass loop: nothing has been accumulated yet, so results are stored
   without reading the product area.  %rcx climbs by 32 bytes per iteration
   and the loop ends when it reaches zero. */
948.Lsqr4x_1st:
949	movq	(%rsi,%rcx,1),%rbx
950	mulq	%r15
951	addq	%rax,%r13
952	movq	%rbx,%rax
953	movq	%rdx,%r12
954	adcq	$0,%r12
955
956	mulq	%r14
957	addq	%rax,%r11
958	movq	%rbx,%rax
959	movq	8(%rsi,%rcx,1),%rbx
960	movq	%rdx,%r10
961	adcq	$0,%r10
962	addq	%r13,%r11
963	adcq	$0,%r10
964
965
966	mulq	%r15
967	addq	%rax,%r12
968	movq	%rbx,%rax
969	movq	%r11,(%rdi,%rcx,1)
970	movq	%rdx,%r13
971	adcq	$0,%r13
972
973	mulq	%r14
974	addq	%rax,%r10
975	movq	%rbx,%rax
976	movq	16(%rsi,%rcx,1),%rbx
977	movq	%rdx,%r11
978	adcq	$0,%r11
979	addq	%r12,%r10
980	adcq	$0,%r11
981
982	mulq	%r15
983	addq	%rax,%r13
984	movq	%rbx,%rax
985	movq	%r10,8(%rdi,%rcx,1)
986	movq	%rdx,%r12
987	adcq	$0,%r12
988
989	mulq	%r14
990	addq	%rax,%r11
991	movq	%rbx,%rax
992	movq	24(%rsi,%rcx,1),%rbx
993	movq	%rdx,%r10
994	adcq	$0,%r10
995	addq	%r13,%r11
996	adcq	$0,%r10
997
998
999	mulq	%r15
1000	addq	%rax,%r12
1001	movq	%rbx,%rax
1002	movq	%r11,16(%rdi,%rcx,1)
1003	movq	%rdx,%r13
1004	adcq	$0,%r13
1005	leaq	32(%rcx),%rcx
1006
1007	mulq	%r14
1008	addq	%rax,%r10
1009	movq	%rbx,%rax
1010	movq	%rdx,%r11
1011	adcq	$0,%r11
1012	addq	%r12,%r10
1013	adcq	$0,%r11
1014	movq	%r10,-8(%rdi,%rcx,1)
1015
1016	cmpq	$0,%rcx
1017	jne	.Lsqr4x_1st
1018
1019	mulq	%r15
1020	addq	%rax,%r13
1021	leaq	16(%rbp),%rbp
1022	adcq	$0,%rdx
1023	addq	%r11,%r13
1024	adcq	$0,%rdx
1025
1026	movq	%r13,(%rdi)
1027	movq	%rdx,%r12
1028	movq	%rdx,8(%rdi)
1029	jmp	.Lsqr4x_outer
1030
1031.align	32
/* Later passes: the pass cursor %rbp advances 16 bytes (two words) per
   pass; each pass takes the next pair of multiplier words and adds its
   products onto what is already in the product area. */
1032.Lsqr4x_outer:
1033	movq	-32(%rsi,%rbp,1),%r14
1034	leaq	48+8(%rsp,%r9,2),%rdi
1035	movq	-24(%rsi,%rbp,1),%rax
1036	leaq	-32(%rdi,%rbp,1),%rdi
1037	movq	-16(%rsi,%rbp,1),%rbx
1038	movq	%rax,%r15
1039
1040	mulq	%r14
1041	movq	-24(%rdi,%rbp,1),%r10
1042	addq	%rax,%r10
1043	movq	%rbx,%rax
1044	adcq	$0,%rdx
1045	movq	%r10,-24(%rdi,%rbp,1)
1046	movq	%rdx,%r11
1047
1048	mulq	%r14
1049	addq	%rax,%r11
1050	movq	%rbx,%rax
1051	adcq	$0,%rdx
1052	addq	-16(%rdi,%rbp,1),%r11
1053	movq	%rdx,%r10
1054	adcq	$0,%r10
1055	movq	%r11,-16(%rdi,%rbp,1)
1056
1057	xorq	%r12,%r12
1058
1059	movq	-8(%rsi,%rbp,1),%rbx
1060	mulq	%r15
1061	addq	%rax,%r12
1062	movq	%rbx,%rax
1063	adcq	$0,%rdx
1064	addq	-8(%rdi,%rbp,1),%r12
1065	movq	%rdx,%r13
1066	adcq	$0,%r13
1067
1068	mulq	%r14
1069	addq	%rax,%r10
1070	movq	%rbx,%rax
1071	adcq	$0,%rdx
1072	addq	%r12,%r10
1073	movq	%rdx,%r11
1074	adcq	$0,%r11
1075	movq	%r10,-8(%rdi,%rbp,1)
1076
1077	leaq	(%rbp),%rcx
1078	jmp	.Lsqr4x_inner
1079
1080.align	32
/* Inner loop of a later pass: two input words per iteration, %rcx climbs
   by 16 bytes and the loop ends when it reaches zero. */
1081.Lsqr4x_inner:
1082	movq	(%rsi,%rcx,1),%rbx
1083	mulq	%r15
1084	addq	%rax,%r13
1085	movq	%rbx,%rax
1086	movq	%rdx,%r12
1087	adcq	$0,%r12
1088	addq	(%rdi,%rcx,1),%r13
1089	adcq	$0,%r12
1090
1091.byte	0x67
1092	mulq	%r14
1093	addq	%rax,%r11
1094	movq	%rbx,%rax
1095	movq	8(%rsi,%rcx,1),%rbx
1096	movq	%rdx,%r10
1097	adcq	$0,%r10
1098	addq	%r13,%r11
1099	adcq	$0,%r10
1100
1101	mulq	%r15
1102	addq	%rax,%r12
1103	movq	%r11,(%rdi,%rcx,1)
1104	movq	%rbx,%rax
1105	movq	%rdx,%r13
1106	adcq	$0,%r13
1107	addq	8(%rdi,%rcx,1),%r12
1108	leaq	16(%rcx),%rcx
1109	adcq	$0,%r13
1110
1111	mulq	%r14
1112	addq	%rax,%r10
1113	movq	%rbx,%rax
1114	adcq	$0,%rdx
1115	addq	%r12,%r10
1116	movq	%rdx,%r11
1117	adcq	$0,%r11
1118	movq	%r10,-8(%rdi,%rcx,1)
1119
1120	cmpq	$0,%rcx
1121	jne	.Lsqr4x_inner
1122
1123.byte	0x67
1124	mulq	%r15
1125	addq	%rax,%r13
1126	adcq	$0,%rdx
1127	addq	%r11,%r13
1128	adcq	$0,%rdx
1129
1130	movq	%r13,(%rdi)
1131	movq	%rdx,%r12
1132	movq	%rdx,8(%rdi)
1133
1134	addq	$16,%rbp
1135	jnz	.Lsqr4x_outer
1136
1137
/* Last pass, straight-line: only the top four input words remain
   (%rbp is zero here). */
1138	movq	-32(%rsi),%r14
1139	leaq	48+8(%rsp,%r9,2),%rdi
1140	movq	-24(%rsi),%rax
1141	leaq	-32(%rdi,%rbp,1),%rdi
1142	movq	-16(%rsi),%rbx
1143	movq	%rax,%r15
1144
1145	mulq	%r14
1146	addq	%rax,%r10
1147	movq	%rbx,%rax
1148	movq	%rdx,%r11
1149	adcq	$0,%r11
1150
1151	mulq	%r14
1152	addq	%rax,%r11
1153	movq	%rbx,%rax
1154	movq	%r10,-24(%rdi)
1155	movq	%rdx,%r10
1156	adcq	$0,%r10
1157	addq	%r13,%r11
1158	movq	-8(%rsi),%rbx
1159	adcq	$0,%r10
1160
1161	mulq	%r15
1162	addq	%rax,%r12
1163	movq	%rbx,%rax
1164	movq	%r11,-16(%rdi)
1165	movq	%rdx,%r13
1166	adcq	$0,%r13
1167
1168	mulq	%r14
1169	addq	%rax,%r10
1170	movq	%rbx,%rax
1171	movq	%rdx,%r11
1172	adcq	$0,%r11
1173	addq	%r12,%r10
1174	adcq	$0,%r11
1175	movq	%r10,-8(%rdi)
1176
1177	mulq	%r15
1178	addq	%rax,%r13
1179	movq	-16(%rsi),%rax
1180	adcq	$0,%rdx
1181	addq	%r11,%r13
1182	adcq	$0,%rdx
1183
1184	movq	%r13,(%rdi)
1185	movq	%rdx,%r12
1186	movq	%rdx,8(%rdi)
1187
/* Product of the two top words; %rbp becomes -(byte length) + 16 for the
   shift-and-add phase, %r14/%r15 are cleared as its carry state. */
1188	mulq	%rbx
1189	addq	$16,%rbp
1190	xorq	%r14,%r14
1191	subq	%r9,%rbp
1192	xorq	%r15,%r15
1193
1194	addq	%r12,%rax
1195	adcq	$0,%rdx
1196	movq	%rax,8(%rdi)
1197	movq	%rdx,16(%rdi)
1198	movq	%r15,24(%rdi)
1199
/* Shift-and-add prologue.  Each step forms 2*word + bit carried from the
   previous word (leaq (x,y,2) / shrq $63 / orq), squares one input word with
   mulq %rax and adds the 128-bit square in.  %rcx is zero here because the
   loops above exit only when it reaches zero.  negq %r15 / sbbq %r15,%r15
   restore and save CF around the flag-clobbering shifts. */
1200	movq	-16(%rsi,%rbp,1),%rax
1201	leaq	48+8(%rsp),%rdi
1202	xorq	%r10,%r10
1203	movq	8(%rdi),%r11
1204
1205	leaq	(%r14,%r10,2),%r12
1206	shrq	$63,%r10
1207	leaq	(%rcx,%r11,2),%r13
1208	shrq	$63,%r11
1209	orq	%r10,%r13
1210	movq	16(%rdi),%r10
1211	movq	%r11,%r14
1212	mulq	%rax
1213	negq	%r15
1214	movq	24(%rdi),%r11
1215	adcq	%rax,%r12
1216	movq	-8(%rsi,%rbp,1),%rax
1217	movq	%r12,(%rdi)
1218	adcq	%rdx,%r13
1219
1220	leaq	(%r14,%r10,2),%rbx
1221	movq	%r13,8(%rdi)
1222	sbbq	%r15,%r15
1223	shrq	$63,%r10
1224	leaq	(%rcx,%r11,2),%r8
1225	shrq	$63,%r11
1226	orq	%r10,%r8
1227	movq	32(%rdi),%r10
1228	movq	%r11,%r14
1229	mulq	%rax
1230	negq	%r15
1231	movq	40(%rdi),%r11
1232	adcq	%rax,%rbx
1233	movq	0(%rsi,%rbp,1),%rax
1234	movq	%rbx,16(%rdi)
1235	adcq	%rdx,%r8
1236	leaq	16(%rbp),%rbp
1237	movq	%r8,24(%rdi)
1238	sbbq	%r15,%r15
1239	leaq	64(%rdi),%rdi
1240	jmp	.Lsqr4x_shift_n_add
1241
1242.align	32
/* Main shift-and-add loop: four input words (eight product words) per
   iteration; %rbp climbs by 32 bytes until it reaches zero. */
1243.Lsqr4x_shift_n_add:
1244	leaq	(%r14,%r10,2),%r12
1245	shrq	$63,%r10
1246	leaq	(%rcx,%r11,2),%r13
1247	shrq	$63,%r11
1248	orq	%r10,%r13
1249	movq	-16(%rdi),%r10
1250	movq	%r11,%r14
1251	mulq	%rax
1252	negq	%r15
1253	movq	-8(%rdi),%r11
1254	adcq	%rax,%r12
1255	movq	-8(%rsi,%rbp,1),%rax
1256	movq	%r12,-32(%rdi)
1257	adcq	%rdx,%r13
1258
1259	leaq	(%r14,%r10,2),%rbx
1260	movq	%r13,-24(%rdi)
1261	sbbq	%r15,%r15
1262	shrq	$63,%r10
1263	leaq	(%rcx,%r11,2),%r8
1264	shrq	$63,%r11
1265	orq	%r10,%r8
1266	movq	0(%rdi),%r10
1267	movq	%r11,%r14
1268	mulq	%rax
1269	negq	%r15
1270	movq	8(%rdi),%r11
1271	adcq	%rax,%rbx
1272	movq	0(%rsi,%rbp,1),%rax
1273	movq	%rbx,-16(%rdi)
1274	adcq	%rdx,%r8
1275
1276	leaq	(%r14,%r10,2),%r12
1277	movq	%r8,-8(%rdi)
1278	sbbq	%r15,%r15
1279	shrq	$63,%r10
1280	leaq	(%rcx,%r11,2),%r13
1281	shrq	$63,%r11
1282	orq	%r10,%r13
1283	movq	16(%rdi),%r10
1284	movq	%r11,%r14
1285	mulq	%rax
1286	negq	%r15
1287	movq	24(%rdi),%r11
1288	adcq	%rax,%r12
1289	movq	8(%rsi,%rbp,1),%rax
1290	movq	%r12,0(%rdi)
1291	adcq	%rdx,%r13
1292
1293	leaq	(%r14,%r10,2),%rbx
1294	movq	%r13,8(%rdi)
1295	sbbq	%r15,%r15
1296	shrq	$63,%r10
1297	leaq	(%rcx,%r11,2),%r8
1298	shrq	$63,%r11
1299	orq	%r10,%r8
1300	movq	32(%rdi),%r10
1301	movq	%r11,%r14
1302	mulq	%rax
1303	negq	%r15
1304	movq	40(%rdi),%r11
1305	adcq	%rax,%rbx
1306	movq	16(%rsi,%rbp,1),%rax
1307	movq	%rbx,16(%rdi)
1308	adcq	%rdx,%r8
1309	movq	%r8,24(%rdi)
1310	sbbq	%r15,%r15
1311	leaq	64(%rdi),%rdi
1312	addq	$32,%rbp
1313	jnz	.Lsqr4x_shift_n_add
1314
/* Shift-and-add epilogue: the two top input words. */
1315	leaq	(%r14,%r10,2),%r12
1316.byte	0x67
1317	shrq	$63,%r10
1318	leaq	(%rcx,%r11,2),%r13
1319	shrq	$63,%r11
1320	orq	%r10,%r13
1321	movq	-16(%rdi),%r10
1322	movq	%r11,%r14
1323	mulq	%rax
1324	negq	%r15
1325	movq	-8(%rdi),%r11
1326	adcq	%rax,%r12
1327	movq	-8(%rsi),%rax
1328	movq	%r12,-32(%rdi)
1329	adcq	%rdx,%r13
1330
1331	leaq	(%r14,%r10,2),%rbx
1332	movq	%r13,-24(%rdi)
1333	sbbq	%r15,%r15
1334	shrq	$63,%r10
1335	leaq	(%rcx,%r11,2),%r8
1336	shrq	$63,%r11
1337	orq	%r10,%r8
1338	mulq	%rax
1339	negq	%r15
1340	adcq	%rax,%rbx
1341	adcq	%rdx,%r8
1342	movq	%rbx,-16(%rdi)
1343	movq	%r8,-8(%rdi)
/* Encoded instruction: movq %xmm2,%rbp (modulus pointer stashed by the
   caller); execution continues into sqr8x_reduction. */
1344.byte	102,72,15,126,213
/*
 * sqr8x_reduction -- reduction half of bn_sqr8x_internal.  Reached by
 * fall-through from the squaring code and by a direct call from
 * bn_from_mont8x.  Reduces the double-width value in the frame eight words
 * at a time, then jumps into .Lsqr4x_sub for the final conditional
 * subtraction and the ret.
 *
 * State expected on entry (established by the code that precedes it or by
 * bn_from_mont8x):
 *   %rbp          modulus words, read at a 16-byte stride (0,16,...,112)
 *   %r9           byte length (word count * 8)
 *   32+8(%rsp)    reduction constant
 *   48+8(%rsp)..  double-width input
 *   %xmm1 destination pointer, %xmm2 modulus pointer, %xmm3 -(byte length)
 * Frame slots written here: 0+8(%rsp) = end of modulus, 8+8(%rsp) = end of
 * the double-width area (also holds the carry word between windows), and
 * 48+8(%rsp) .. 48+56+8(%rsp) = the eight reduction factors of a window.
 * On return %r9 = byte length and %r10 = -(byte length).
 *
 * Fix relative to the previous revision: at .L8x_tail_done the carry out of
 * "addq (%rdx),%r8" used to be discarded (the following xorq clears CF), so
 * the result was wrong whenever that addition wrapped.  The carry is now
 * rippled through %r9..%r15 first.  This is the carry-propagation bug
 * published as CVE-2015-3193.
 */
1345sqr8x_reduction:
1346	xorq	%rax,%rax
1347	leaq	(%rbp,%r9,2),%rcx
1348	leaq	48+8(%rsp,%r9,2),%rdx
1349	movq	%rcx,0+8(%rsp)
1350	leaq	48+8(%rsp,%r9,1),%rdi
1351	movq	%rdx,8+8(%rsp)
1352	negq	%r9
1353	jmp	.L8x_reduction_loop
1354
1355.align	32
/* One window: step %rdi back to the window start (%r9 is negative), load
   eight words into %rbx,%r9..%r15 and park the carry from the previous
   window (%rax) at the end of the double-width area. */
1356.L8x_reduction_loop:
1357	leaq	(%rdi,%r9,1),%rdi
1358.byte	0x66
1359	movq	0(%rdi),%rbx
1360	movq	8(%rdi),%r9
1361	movq	16(%rdi),%r10
1362	movq	24(%rdi),%r11
1363	movq	32(%rdi),%r12
1364	movq	40(%rdi),%r13
1365	movq	48(%rdi),%r14
1366	movq	56(%rdi),%r15
1367	movq	%rax,(%rdx)
1368	leaq	64(%rdi),%rdi
1369
1370.byte	0x67
/* %rbx = reduction constant * lowest word: the first reduction factor. */
1371	movq	%rbx,%r8
1372	imulq	32+8(%rsp),%rbx
1373	movq	0(%rbp),%rax
1374	movl	$8,%ecx
1375	jmp	.L8x_reduce
1376
1377.align	32
/* Eight rounds.  Each round adds factor * (eight low modulus words) to the
   window, shifts the window down one word, saves the factor in the frame
   for .L8x_tail and computes the next factor into %rsi. */
1378.L8x_reduce:
1379	mulq	%rbx
1380	movq	16(%rbp),%rax
/* negq sets CF exactly when %r8 is non-zero, i.e. when cancelling the low
   word against the low product word generates a carry. */
1381	negq	%r8
1382	movq	%rdx,%r8
1383	adcq	$0,%r8
1384
1385	mulq	%rbx
1386	addq	%rax,%r9
1387	movq	32(%rbp),%rax
1388	adcq	$0,%rdx
1389	addq	%r9,%r8
1390	movq	%rbx,48-8+8(%rsp,%rcx,8)
1391	movq	%rdx,%r9
1392	adcq	$0,%r9
1393
1394	mulq	%rbx
1395	addq	%rax,%r10
1396	movq	48(%rbp),%rax
1397	adcq	$0,%rdx
1398	addq	%r10,%r9
1399	movq	32+8(%rsp),%rsi
1400	movq	%rdx,%r10
1401	adcq	$0,%r10
1402
1403	mulq	%rbx
1404	addq	%rax,%r11
1405	movq	64(%rbp),%rax
1406	adcq	$0,%rdx
1407	imulq	%r8,%rsi
1408	addq	%r11,%r10
1409	movq	%rdx,%r11
1410	adcq	$0,%r11
1411
1412	mulq	%rbx
1413	addq	%rax,%r12
1414	movq	80(%rbp),%rax
1415	adcq	$0,%rdx
1416	addq	%r12,%r11
1417	movq	%rdx,%r12
1418	adcq	$0,%r12
1419
1420	mulq	%rbx
1421	addq	%rax,%r13
1422	movq	96(%rbp),%rax
1423	adcq	$0,%rdx
1424	addq	%r13,%r12
1425	movq	%rdx,%r13
1426	adcq	$0,%r13
1427
1428	mulq	%rbx
1429	addq	%rax,%r14
1430	movq	112(%rbp),%rax
1431	adcq	$0,%rdx
1432	addq	%r14,%r13
1433	movq	%rdx,%r14
1434	adcq	$0,%r14
1435
1436	mulq	%rbx
1437	movq	%rsi,%rbx
1438	addq	%rax,%r15
1439	movq	0(%rbp),%rax
1440	adcq	$0,%rdx
1441	addq	%r15,%r14
1442	movq	%rdx,%r15
1443	adcq	$0,%r15
1444
1445	decl	%ecx
1446	jnz	.L8x_reduce
1447
/* Advance to the next eight modulus words; if there are none (eight-word
   modulus) go straight to the window wrap-up. */
1448	leaq	128(%rbp),%rbp
1449	xorq	%rax,%rax
1450	movq	8+8(%rsp),%rdx
1451	cmpq	0+8(%rsp),%rbp
1452	jae	.L8x_no_tail
1453
1454.byte	0x66
/* Fold in the next eight words of the value; sbbq keeps the carry out of
   the chain as 0 / all-ones in %rsi. */
1455	addq	0(%rdi),%r8
1456	adcq	8(%rdi),%r9
1457	adcq	16(%rdi),%r10
1458	adcq	24(%rdi),%r11
1459	adcq	32(%rdi),%r12
1460	adcq	40(%rdi),%r13
1461	adcq	48(%rdi),%r14
1462	adcq	56(%rdi),%r15
1463	sbbq	%rsi,%rsi
1464
1465	movq	48+56+8(%rsp),%rbx
1466	movl	$8,%ecx
1467	movq	0(%rbp),%rax
1468	jmp	.L8x_tail
1469
1470.align	32
/* Tail rounds: multiply the next eight modulus words by each of the eight
   saved factors in turn (reloaded from the frame), storing one finished
   word per round. */
1471.L8x_tail:
1472	mulq	%rbx
1473	addq	%rax,%r8
1474	movq	16(%rbp),%rax
1475	movq	%r8,(%rdi)
1476	movq	%rdx,%r8
1477	adcq	$0,%r8
1478
1479	mulq	%rbx
1480	addq	%rax,%r9
1481	movq	32(%rbp),%rax
1482	adcq	$0,%rdx
1483	addq	%r9,%r8
1484	leaq	8(%rdi),%rdi
1485	movq	%rdx,%r9
1486	adcq	$0,%r9
1487
1488	mulq	%rbx
1489	addq	%rax,%r10
1490	movq	48(%rbp),%rax
1491	adcq	$0,%rdx
1492	addq	%r10,%r9
1493	movq	%rdx,%r10
1494	adcq	$0,%r10
1495
1496	mulq	%rbx
1497	addq	%rax,%r11
1498	movq	64(%rbp),%rax
1499	adcq	$0,%rdx
1500	addq	%r11,%r10
1501	movq	%rdx,%r11
1502	adcq	$0,%r11
1503
1504	mulq	%rbx
1505	addq	%rax,%r12
1506	movq	80(%rbp),%rax
1507	adcq	$0,%rdx
1508	addq	%r12,%r11
1509	movq	%rdx,%r12
1510	adcq	$0,%r12
1511
1512	mulq	%rbx
1513	addq	%rax,%r13
1514	movq	96(%rbp),%rax
1515	adcq	$0,%rdx
1516	addq	%r13,%r12
1517	movq	%rdx,%r13
1518	adcq	$0,%r13
1519
1520	mulq	%rbx
1521	addq	%rax,%r14
1522	movq	112(%rbp),%rax
1523	adcq	$0,%rdx
1524	addq	%r14,%r13
1525	movq	%rdx,%r14
1526	adcq	$0,%r14
1527
1528	mulq	%rbx
1529	movq	48-16+8(%rsp,%rcx,8),%rbx
1530	addq	%rax,%r15
1531	adcq	$0,%rdx
1532	addq	%r15,%r14
1533	movq	0(%rbp),%rax
1534	movq	%rdx,%r15
1535	adcq	$0,%r15
1536
1537	decl	%ecx
1538	jnz	.L8x_tail
1539
1540	leaq	128(%rbp),%rbp
1541	movq	8+8(%rsp),%rdx
1542	cmpq	0+8(%rsp),%rbp
1543	jae	.L8x_tail_done
1544
/* More modulus words remain: restore the saved carry with negq (CF is set
   exactly when %rsi is non-zero), fold in the next eight words of the
   value, save the new carry and run another tail pass. */
1545	movq	48+56+8(%rsp),%rbx
1546	negq	%rsi
1547	movq	0(%rbp),%rax
1548	adcq	0(%rdi),%r8
1549	adcq	8(%rdi),%r9
1550	adcq	16(%rdi),%r10
1551	adcq	24(%rdi),%r11
1552	adcq	32(%rdi),%r12
1553	adcq	40(%rdi),%r13
1554	adcq	48(%rdi),%r14
1555	adcq	56(%rdi),%r15
1556	sbbq	%rsi,%rsi
1557
1558	movl	$8,%ecx
1559	jmp	.L8x_tail
1560
1561.align	32
1562.L8x_tail_done:
/* Add the carry word parked at the end of the double-width area.  This
   addition can wrap, so its carry has to be rippled through the rest of the
   window BEFORE the xorq below clears CF. */
1563	addq	(%rdx),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
1564	xorq	%rax,%rax
1565
/* Re-create the carry saved in %rsi (0 or all-ones) as CF for the chain
   below. */
1566	negq	%rsi
1567.L8x_no_tail:
/* Window wrap-up: add the upper eight words of the value, collect the
   final carry in %rax and fetch the top modulus word into %rcx. */
1568	adcq	0(%rdi),%r8
1569	adcq	8(%rdi),%r9
1570	adcq	16(%rdi),%r10
1571	adcq	24(%rdi),%r11
1572	adcq	32(%rdi),%r12
1573	adcq	40(%rdi),%r13
1574	adcq	48(%rdi),%r14
1575	adcq	56(%rdi),%r15
1576	adcq	$0,%rax
1577	movq	-16(%rbp),%rcx
1578	xorq	%rsi,%rsi
1579
/* Encoded instruction: movq %xmm2,%rbp (rewind to the start of the modulus). */
1580.byte	102,72,15,126,213
1581
1582	movq	%r8,0(%rdi)
1583	movq	%r9,8(%rdi)
/* Encoded instruction: movq %xmm3,%r9 (negative byte length, for the
   window step at the top of the loop). */
1584.byte	102,73,15,126,217
1585	movq	%r10,16(%rdi)
1586	movq	%r11,24(%rdi)
1587	movq	%r12,32(%rdi)
1588	movq	%r13,40(%rdi)
1589	movq	%r14,48(%rdi)
1590	movq	%r15,56(%rdi)
1591	leaq	64(%rdi),%rdi
1592
1593	cmpq	%rdx,%rdi
1594	jb	.L8x_reduction_loop
1595
/* Hand-off to the subtraction loop.  The borrow from (top modulus word -
   top result word) is ORed with the final carry and inverted; the 0/1 flag
   selects %rbp = modulus base + 0 or + 8.
   NOTE(review): presumably the +8 slots of the 16-byte-stride modulus hold
   zeros so the subtraction becomes a no-op -- confirm with the caller's
   modulus layout.  %rbx = start of the upper half of the double-width area,
   %rcx = -(count/4). */
1596	subq	%r15,%rcx
1597	leaq	(%rdi,%r9,1),%rbx
1598	adcq	%rsi,%rsi
1599	movq	%r9,%rcx
1600	orq	%rsi,%rax
/* Encoded instruction: movq %xmm1,%rdi (destination pointer). */
1601.byte	102,72,15,126,207
1602	xorq	$1,%rax
/* Encoded instruction: movq %xmm1,%rsi (destination doubles as the input of
   a following squaring). */
1603.byte	102,72,15,126,206
1604	leaq	(%rbp,%rax,8),%rbp
1605	sarq	$3+2,%rcx
1606	jmp	.Lsqr4x_sub
1607
1608.align	32
/* Shared final subtraction, also entered from mul4x_internal.
   destination[i] = (%rbx)[i] - (%rbp)[2*i], four words per iteration.  The
   sarq before entry leaves CF clear (the shifted-out bits are zero for the
   lengths handled here), and leaq/movq/incq never write CF, so the borrow
   chains across iterations.  %rcx counts up to zero. */
1609.Lsqr4x_sub:
1610.byte	0x66
1611	movq	0(%rbx),%r12
1612	movq	8(%rbx),%r13
1613	sbbq	0(%rbp),%r12
1614	movq	16(%rbx),%r14
1615	sbbq	16(%rbp),%r13
1616	movq	24(%rbx),%r15
1617	leaq	32(%rbx),%rbx
1618	sbbq	32(%rbp),%r14
1619	movq	%r12,0(%rdi)
1620	sbbq	48(%rbp),%r15
1621	leaq	64(%rbp),%rbp
1622	movq	%r13,8(%rdi)
1623	movq	%r14,16(%rdi)
1624	movq	%r15,24(%rdi)
1625	leaq	32(%rdi),%rdi
1626
1627	incq	%rcx
1628	jnz	.Lsqr4x_sub
/* Leave %r10 = -(byte length) and %r9 = +(byte length), the state the
   squaring entry point expects, so calls can be chained back to back. */
1629	movq	%r9,%r10
1630	negq	%r9
/* Encoded instruction: rep ret. */
1631	.byte	0xf3,0xc3
1632.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * bn_from_montgomery -- exported (hidden-visibility) entry point.
 * If the word count in %r9d is a multiple of 8, control is transferred to
 * bn_from_mont8x with all argument registers untouched (a tail jump, so
 * that routine returns directly to our caller).  Otherwise nothing is done
 * and 0 is returned in %eax.
 */
1633.globl	bn_from_montgomery
1634.hidden bn_from_montgomery
1635.type	bn_from_montgomery,@function
1636.align	32
1637bn_from_montgomery:
1638	testl	$7,%r9d
1639	jz	bn_from_mont8x
1640	xorl	%eax,%eax
/* Encoded instruction: rep ret. */
1641	.byte	0xf3,0xc3
1642.size	bn_from_montgomery,.-bn_from_montgomery
1643
/*
 * bn_from_mont8x -- file-local routine reached from bn_from_montgomery for
 * word counts that are multiples of 8.  It copies the input into the low
 * half of a double-width frame area, zeroes the high half, runs
 * sqr8x_reduction on it, wipes the frame area and returns 1.
 *
 * Registers used here: %rdi destination (stashed in %xmm1), %rsi input words,
 * %rcx modulus (stashed in %xmm2 and copied to %rbp), %r8 pointer to the
 * reduction constant, %r9d word count.  %rdx is not read in this routine.
 */
1644.type	bn_from_mont8x,@function
1645.align	32
1646bn_from_mont8x:
1647.byte	0x67
1648	movq	%rsp,%rax
1649	pushq	%rbx
1650	pushq	%rbp
1651	pushq	%r12
1652	pushq	%r13
1653	pushq	%r14
1654	pushq	%r15
1655.byte	0x67
/* %r9 = -(count*8), %r10d = count*32, %r8 = the reduction constant itself. */
1656	movl	%r9d,%r10d
1657	shll	$3,%r9d
1658	shll	$3+2,%r10d
1659	negq	%r9
1660	movq	(%r8),%r8
1661
1662
1663
1664
1665
1666
1667
/* Frame placement relative to the input pointer modulo 4096; the same
   sequence appears in bn_mul4x_mont_gather5 and bn_power5. */
1668	leaq	-64(%rsp,%r9,2),%r11
1669	subq	%rsi,%r11
1670	andq	$4095,%r11
1671	cmpq	%r11,%r10
1672	jb	.Lfrom_sp_alt
1673	subq	%r11,%rsp
1674	leaq	-64(%rsp,%r9,2),%rsp
1675	jmp	.Lfrom_sp_done
1676
1677.align	32
1678.Lfrom_sp_alt:
1679	leaq	4096-64(,%r9,2),%r10
1680	leaq	-64(%rsp,%r9,2),%rsp
1681	subq	%r10,%r11
1682	movq	$0,%r10
1683	cmovcq	%r10,%r11
1684	subq	%r11,%rsp
1685.Lfrom_sp_done:
/* After this: %r10 = -(byte length), %r9 = +(byte length). */
1686	andq	$-64,%rsp
1687	movq	%r9,%r10
1688	negq	%r9
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
/* Frame slots: 32(%rsp) = reduction constant, 40(%rsp) = entry %rsp. */
1699	movq	%r8,32(%rsp)
1700	movq	%rax,40(%rsp)
1701.Lfrom_body:
1702	movq	%r9,%r11
1703	leaq	48(%rsp),%rax
1704	pxor	%xmm0,%xmm0
1705	jmp	.Lmul_by_1
1706
1707.align	32
/* Per iteration: copy 64 input bytes (unaligned loads, aligned stores) to
   the low half at (%rax) and store 64 zero bytes to the high half at
   (%rax,%r9).  %r11 counts the remaining bytes down to zero. */
1708.Lmul_by_1:
1709	movdqu	(%rsi),%xmm1
1710	movdqu	16(%rsi),%xmm2
1711	movdqu	32(%rsi),%xmm3
1712	movdqa	%xmm0,(%rax,%r9,1)
1713	movdqu	48(%rsi),%xmm4
1714	movdqa	%xmm0,16(%rax,%r9,1)
/* Encoded instruction: leaq 64(%rsi),%rsi in its 32-bit-displacement form. */
1715.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
1716	movdqa	%xmm1,(%rax)
1717	movdqa	%xmm0,32(%rax,%r9,1)
1718	movdqa	%xmm2,16(%rax)
1719	movdqa	%xmm0,48(%rax,%r9,1)
1720	movdqa	%xmm3,32(%rax)
1721	movdqa	%xmm4,48(%rax)
1722	leaq	64(%rax),%rax
1723	subq	$64,%r11
1724	jnz	.Lmul_by_1
1725
/* Encoded instructions: movq %rdi,%xmm1 ; movq %rcx,%xmm2 ; (prefix byte) ;
   movq %r10,%xmm3.  Together with %rbp = modulus this is the state
   sqr8x_reduction expects. */
1726.byte	102,72,15,110,207
1727.byte	102,72,15,110,209
1728.byte	0x67
1729	movq	%rcx,%rbp
1730.byte	102,73,15,110,218
1731	call	sqr8x_reduction
1732
1733	pxor	%xmm0,%xmm0
1734	leaq	48(%rsp),%rax
1735	movq	40(%rsp),%rsi
1736	jmp	.Lfrom_mont_zero
1737
1738.align	32
/* Wipe the frame area: 64 bytes are cleared for every 32 subtracted from
   %r9 (the byte length on return from the reduction), so twice the byte
   length is cleared in total. */
1739.Lfrom_mont_zero:
1740	movdqa	%xmm0,0(%rax)
1741	movdqa	%xmm0,16(%rax)
1742	movdqa	%xmm0,32(%rax)
1743	movdqa	%xmm0,48(%rax)
1744	leaq	64(%rax),%rax
1745	subq	$32,%r9
1746	jnz	.Lfrom_mont_zero
1747
/* Epilogue: return 1, restore callee-saved registers and the entry %rsp. */
1748	movq	$1,%rax
1749	movq	-48(%rsi),%r15
1750	movq	-40(%rsi),%r14
1751	movq	-32(%rsi),%r13
1752	movq	-24(%rsi),%r12
1753	movq	-16(%rsi),%rbp
1754	movq	-8(%rsi),%rbx
1755	leaq	(%rsi),%rsp
1756.Lfrom_epilogue:
/* Encoded instruction: rep ret. */
1757	.byte	0xf3,0xc3
1758.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_scatter5 -- exported (hidden-visibility) entry point.
 * Copies %esi quadwords from the contiguous buffer at %rdi into the table at
 * %rdx: word i goes to table + 8*%rcx + 256*i, so successive words of one
 * entry sit 256 bytes apart.  Does nothing when %esi is zero.
 * Uses only %rax and the argument registers; no return value is set.
 */
1759.globl	bn_scatter5
1760.hidden bn_scatter5
1761.type	bn_scatter5,@function
1762.align	16
1763bn_scatter5:
1764	cmpl	$0,%esi
1765	jz	.Lscatter_epilogue
1766	leaq	(%rdx,%rcx,8),%rdx
1767.Lscatter:
1768	movq	(%rdi),%rax
1769	leaq	8(%rdi),%rdi
1770	movq	%rax,(%rdx)
1771	leaq	256(%rdx),%rdx
1772	subl	$1,%esi
1773	jnz	.Lscatter
1774.Lscatter_epilogue:
/* Encoded instruction: rep ret. */
1775	.byte	0xf3,0xc3
1776.size	bn_scatter5,.-bn_scatter5
1777
/*
 * bn_gather5 -- exported (hidden-visibility) entry point; inverse of
 * bn_scatter5.  Writes %esi quadwords to the contiguous buffer at %rdi, taking
 * word i of table entry %ecx from the table at %rdx (rows 256 bytes apart).
 *
 * For every word the four candidates that lie 64 bytes apart are all loaded
 * and combined through the masks in %xmm4-%xmm7; the masks come from
 * .Lmagic_masks indexed by ~(index >> 3) & 3, so exactly one is all-ones.
 *
 * NOTE(review): the column offset (index & 7) * 8 goes straight into the load
 * address, so only the choice among the four candidates is masked -- the
 * address itself still depends on the low three index bits.
 * NOTE(review): unlike bn_scatter5 there is no zero-count guard; the loop
 * body runs once before %esi is tested.
 */
1778.globl	bn_gather5
1779.hidden bn_gather5
1780.type	bn_gather5,@function
1781.align	16
1782bn_gather5:
1783	movl	%ecx,%r11d
1784	shrl	$3,%ecx
1785	andq	$7,%r11
1786	notl	%ecx
1787	leaq	.Lmagic_masks(%rip),%rax
1788	andl	$3,%ecx
1789	leaq	128(%rdx,%r11,8),%rdx
1790	movq	0(%rax,%rcx,8),%xmm4
1791	movq	8(%rax,%rcx,8),%xmm5
1792	movq	16(%rax,%rcx,8),%xmm6
1793	movq	24(%rax,%rcx,8),%xmm7
1794	jmp	.Lgather
1795.align	16
1796.Lgather:
1797	movq	-128(%rdx),%xmm0
1798	movq	-64(%rdx),%xmm1
1799	pand	%xmm4,%xmm0
1800	movq	0(%rdx),%xmm2
1801	pand	%xmm5,%xmm1
1802	movq	64(%rdx),%xmm3
1803	pand	%xmm6,%xmm2
1804	por	%xmm1,%xmm0
1805	pand	%xmm7,%xmm3
/* Two address-size prefix bytes glued onto the next instruction (presumably
   padding -- confirm against the generator script). */
1806.byte	0x67,0x67
1807	por	%xmm2,%xmm0
1808	leaq	256(%rdx),%rdx
1809	por	%xmm3,%xmm0
1810
1811	movq	%xmm0,(%rdi)
1812	leaq	8(%rdi),%rdi
1813	subl	$1,%esi
1814	jnz	.Lgather
/* Encoded instruction: rep ret. */
1815	.byte	0xf3,0xc3
1816.LSEH_end_bn_gather5:
1817.size	bn_gather5,.-bn_gather5
/*
 * .Lmagic_masks -- eight quadwords (each written as a pair of .long values):
 * 0, 0, 0, all-ones, 0, 0, 0, 0.  The gather code reads four consecutive
 * quadwords starting at index k = 0..3, so exactly one of the four masks it
 * gets is all-ones and the others are zero.
 * The .byte line after the table is the NUL-terminated ASCII banner
 * "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
 * <appro@openssl.org>".
 */
1818.align	64
1819.Lmagic_masks:
1820.long	0,0, 0,0, 0,0, -1,-1
1821.long	0,0, 0,0, 0,0,  0,0
1822.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1823#endif
1824