#if defined(__x86_64__)
.text



.globl	_bn_mul_mont_gather5
.private_extern _bn_mul_mont_gather5

.p2align	6
_bn_mul_mont_gather5:
	testl	$7,%r9d
	jnz	L$mul_enter
	jmp	L$mul4x_enter

.p2align	4
L$mul_enter:
	movl	%r9d,%r9d
	movq	%rsp,%rax
	movl	8(%rsp),%r10d
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	leaq	2(%r9),%r11
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp
	andq	$-1024,%rsp

	movq	%rax,8(%rsp,%r9,8)
L$mul_body:
	movq	%rdx,%r12
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	L$magic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%r12,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

.byte	102,72,15,126,195

	movq	(%r8),%r8
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$1st_enter

.p2align	4
L$1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$1st

.byte	102,72,15,126,195

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	L$outer
.p2align	4
L$outer:
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$inner_enter

.p2align	4
L$inner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$inner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$inner

.byte	102,72,15,126,195

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	L$outer

	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	L$sub
.p2align	4
L$sub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	L$sub

	sbbq	$0,%rax
	xorq	%r14,%r14
	movq	%r9,%r15
.p2align	4
L$copy:
	movq	(%rsp,%r14,8),%rsi
	movq	(%rdi,%r14,8),%rcx
	xorq	%rcx,%rsi
	andq	%rax,%rsi
	xorq	%rcx,%rsi
	movq	%r14,(%rsp,%r14,8)
	movq	%rsi,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	L$copy

	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$mul_epilogue:
	.byte	0xf3,0xc3

.p2align	5
bn_mul4x_mont_gather5:
L$mul4x_enter:
.byte	0x67
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d
	shll	$3+2,%r10d
	negq	%r9

	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$mul4xsp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	L$mul4xsp_done

.p2align	5
L$mul4xsp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
L$mul4xsp_done:
	andq	$-64,%rsp
	negq	%r9

	movq	%rax,40(%rsp)
L$mul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$mul4x_epilogue:
	.byte	0xf3,0xc3

.p2align	5
mul4x_internal:
	shlq	$5,%r9
	movl	8(%rax),%r10d
	leaq	256(%rdx,%r9,1),%r13
	shrq	$5,%r9
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	L$magic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%rdx,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	addq	$7,%r11
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7
	andq	$7,%r11

	movq	-96(%r12),%xmm0
	leaq	256(%r12),%r14
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
.byte	0x67
	por	%xmm1,%xmm0
	movq	-96(%r14),%xmm1
.byte	0x67
	pand	%xmm7,%xmm3
.byte	0x67
	por	%xmm2,%xmm0
	movq	-32(%r14),%xmm2
.byte	0x67
	pand	%xmm4,%xmm1
.byte	0x67
	por	%xmm3,%xmm0
	movq	32(%r14),%xmm3

.byte	102,72,15,126,195
	movq	96(%r14),%xmm0
	movq	%r13,16+8(%rsp)
	movq	%rdi,56+8(%rsp)

	movq	(%r8),%r8
	movq	(%rsi),%rax
	leaq	(%rsi,%r9,1),%rsi
	negq	%r9

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	pand	%xmm5,%xmm2
	pand	%xmm6,%xmm3
	por	%xmm2,%xmm1

	imulq	%r10,%rbp

	leaq	64+8(%rsp,%r11,8),%r14
	movq	%rdx,%r11

	pand	%xmm7,%xmm0
	por	%xmm3,%xmm1
	leaq	512(%r12),%r12
	por	%xmm1,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	L$1st4x

.p2align	5
L$1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	L$1st4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195
	leaq	(%rcx,%r9,2),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	jmp	L$outer4x

.p2align	5
L$outer4x:
	movq	(%r14,%r9,1),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3

	imulq	%r10,%rbp
.byte	0x67
	movq	%rdx,%r11
	movq	%rdi,(%r14)

	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	(%r14,%r9,1),%r14
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	L$inner4x

.p2align	5
L$inner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	L$inner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax
	movq	-16(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195
	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,2),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12
	jb	L$outer4x
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	xorq	$1,%rdi
	leaq	(%r14,%r9,1),%rbx
	leaq	(%rcx,%rdi,8),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx
	movq	56+8(%rsp),%rdi
	jmp	L$sqr4x_sub

.globl	_bn_power5
.private_extern _bn_power5

.p2align	5
_bn_power5:
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movl	%r9d,%r10d
	shll	$3,%r9d
	shll	$3+2,%r10d
	negq	%r9
	movq	(%r8),%r8

	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$pwr_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	L$pwr_sp_done

.p2align	5
L$pwr_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
L$pwr_sp_done:
	andq	$-64,%rsp
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
L$power5_body:
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226

	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal

.byte	102,72,15,126,209
.byte	102,72,15,126,226
	movq	%rsi,%rdi
	movq	40(%rsp),%rax
	leaq	32(%rsp),%r8

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$power5_epilogue:
	.byte	0xf3,0xc3


.globl	_bn_sqr8x_internal
.private_extern _bn_sqr8x_internal
.private_extern	_bn_sqr8x_internal

.p2align	5
_bn_sqr8x_internal:
__bn_sqr8x_internal:

	leaq	32(%r10),%rbp
	leaq	(%rsi,%r9,1),%rsi

	movq	%r9,%rcx


	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r11,-16(%rdi,%rbp,1)
	movq	%rdx,%r10


	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	leaq	(%rbp),%rcx
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	L$sqr4x_1st

.p2align	5
L$sqr4x_1st:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	16(%rsi,%rcx,1),%rbx
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11

	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%r10,8(%rdi,%rcx,1)
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	24(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,16(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13
	leaq	32(%rcx),%rcx

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	L$sqr4x_1st

	mulq	%r15
	addq	%rax,%r13
	leaq	16(%rbp),%rbp
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)
	jmp	L$sqr4x_outer

.p2align	5
L$sqr4x_outer:
	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	-24(%rdi,%rbp,1),%r10
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r10,-24(%rdi,%rbp,1)
	movq	%rdx,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-16(%rdi,%rbp,1),%r11
	movq	%rdx,%r10
	adcq	$0,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	xorq	%r12,%r12

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-8(%rdi,%rbp,1),%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rbp,1)

	leaq	(%rbp),%rcx
	jmp	L$sqr4x_inner

.p2align	5
L$sqr4x_inner:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12

.byte	0x67
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%r11,(%rdi,%rcx,1)
	movq	%rbx,%rax
	movq	%rdx,%r13
	adcq	$0,%r13
	addq	8(%rdi,%rcx,1),%r12
	leaq	16(%rcx),%rcx
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	L$sqr4x_inner

.byte	0x67
	mulq	%r15
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	addq	$16,%rbp
	jnz	L$sqr4x_outer


	movq	-32(%rsi),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	%r10,-24(%rdi)
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	movq	-8(%rsi),%rbx
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,-16(%rdi)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi)

	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	mulq	%rbx
	addq	$16,%rbp
	xorq	%r14,%r14
	subq	%r9,%rbp
	xorq	%r15,%r15

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)

	movq	-16(%rsi,%rbp,1),%rax
	leaq	48+8(%rsp),%rdi
	xorq	%r10,%r10
	movq	8(%rdi),%r11

	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	L$sqr4x_shift_n_add

.p2align	5
L$sqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp
	jnz	L$sqr4x_shift_n_add

	leaq	(%r14,%r10,2),%r12
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213
sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%rbp,%r9,2),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	L$8x_reduction_loop

.p2align	5
L$8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	L$8x_reduce

.p2align	5
L$8x_reduce:
	mulq	%rbx
	movq	16(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	64(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	80(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	96(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	112(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	L$8x_reduce

	leaq	128(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	L$8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movq	48+56+8(%rsp),%rbx
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	L$8x_tail

.p2align	5
L$8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	64(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	80(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	96(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	112(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	L$8x_tail

	leaq	128(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	L$8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	L$8x_tail

.p2align	5
L$8x_tail_done:
	addq	(%rdx),%r8
	xorq	%rax,%rax

	negq	%rsi
L$8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax
	movq	-16(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	L$8x_reduction_loop

	subq	%r15,%rcx
	leaq	(%rdi,%r9,1),%rbx
	adcq	%rsi,%rsi
	movq	%r9,%rcx
	orq	%rsi,%rax
.byte	102,72,15,126,207
	xorq	$1,%rax
.byte	102,72,15,126,206
	leaq	(%rbp,%rax,8),%rbp
	sarq	$3+2,%rcx
	jmp	L$sqr4x_sub

.p2align	5
L$sqr4x_sub:
.byte	0x66
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	sbbq	0(%rbp),%r12
	movq	16(%rbx),%r14
	sbbq	16(%rbp),%r13
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	32(%rbp),%r14
	movq	%r12,0(%rdi)
	sbbq	48(%rbp),%r15
	leaq	64(%rbp),%rbp
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	L$sqr4x_sub
	movq	%r9,%r10
	negq	%r9
	.byte	0xf3,0xc3

.globl	_bn_from_montgomery
.private_extern _bn_from_montgomery

.p2align	5
_bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax
	.byte	0xf3,0xc3



.p2align	5
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d
	shll	$3+2,%r10d
	negq	%r9
	movq	(%r8),%r8

	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$from_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	L$from_sp_done

.p2align	5
L$from_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
L$from_sp_done:
	andq	$-64,%rsp
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
L$from_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	L$mul_by_1

.p2align	5
L$mul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	L$mul_by_1

.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218
	call	sqr8x_reduction

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	L$from_mont_zero

.p2align	5
L$from_mont_zero:
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	L$from_mont_zero

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$from_epilogue:
	.byte	0xf3,0xc3

.globl	_bn_scatter5
.private_extern _bn_scatter5

.p2align	4
_bn_scatter5:
	cmpl	$0,%esi
	jz	L$scatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx
L$scatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx
	subl	$1,%esi
	jnz	L$scatter
L$scatter_epilogue:
	.byte	0xf3,0xc3


.globl	_bn_gather5
.private_extern _bn_gather5

.p2align	4
_bn_gather5:
	movl	%ecx,%r11d
	shrl	$3,%ecx
	andq	$7,%r11
	notl	%ecx
	leaq	L$magic_masks(%rip),%rax
	andl	$3,%ecx
	leaq	128(%rdx,%r11,8),%rdx
	movq	0(%rax,%rcx,8),%xmm4
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	L$gather
.p2align	4
L$gather:
	movq	-128(%rdx),%xmm0
	movq	-64(%rdx),%xmm1
	pand	%xmm4,%xmm0
	movq	0(%rdx),%xmm2
	pand	%xmm5,%xmm1
	movq	64(%rdx),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
.byte	0x67,0x67
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx
	por	%xmm3,%xmm0

	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	L$gather
	.byte	0xf3,0xc3
L$SEH_end_bn_gather5:

.p2align	6
L$magic_masks:
.long	0,0, 0,0, 0,0, -1,-1
.long	0,0, 0,0, 0,0,  0,0
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#endif