1#if defined(__x86_64__)
2.text
3
4.extern	OPENSSL_ia32cap_P
5.hidden OPENSSL_ia32cap_P
6
/*
 * rsaz_512_sqr: repeated squaring-with-reduction of an 8-limb (8 x 64-bit)
 * value, limbs stored least-significant first.
 *
 * Register inputs, as used by the code below:
 *   rdi  output buffer (8 limbs); every pass stores its result here and the
 *        following pass squares that result (rsi is re-pointed at rdi)
 *   rsi  input value, 8 limbs
 *   rdx  pointer handed to __rsaz_512_reduce / __rsaz_512_subtract in rbp
 *        (presumably the modulus - confirm against the caller)
 *   rcx  64-bit constant kept at 128(%rsp) for __rsaz_512_reduce
 *        (presumably -1/mod[0] mod 2^64 - confirm against the caller)
 *   r8d  number of passes; it is decremented and tested only after a pass has
 *        run, so a count of 0 wraps around instead of meaning "do nothing"
 *
 * Stack frame after the prologue:
 *   0..127(%rsp)  16-limb square of the current value
 *   128(%rsp)     copy of rcx
 *   136(%rsp)     remaining pass count (32-bit)
 *
 * The square is built one row at a time: row i adds a[i]*a[i+1..7] into a
 * sliding window of registers, then the two lowest window limbs are doubled,
 * a[i]^2 is added and the two finished limbs are written to the stack.
 *
 * NOTE(review): the lea/shrq based doubling and the trailing `adcq $0` of each
 * row look like the upstream code that predates the rsaz_512_sqr overflow fix
 * (CVE-2019-1551) - confirm whether this copy needs that fix.
 */
7.globl	rsaz_512_sqr
8.hidden rsaz_512_sqr
9.type	rsaz_512_sqr,@function
10.align	32
11rsaz_512_sqr:
/* Save the callee-saved registers that are overwritten below. */
12	pushq	%rbx
13	pushq	%rbp
14	pushq	%r12
15	pushq	%r13
16	pushq	%r14
17	pushq	%r15
18
19	subq	$128+24,%rsp	/* 128-byte product buffer + 24 bytes of locals */
20.Lsqr_body:
21	movq	%rdx,%rbp	/* rbp = 3rd argument, consumed by the helpers */
22	movq	(%rsi),%rdx	/* rdx = a[0] */
23	movq	8(%rsi),%rax	/* rax = a[1] */
24	movq	%rcx,128(%rsp)	/* park the reduction constant */
25	jmp	.Loop_sqr
26
27.align	32
28.Loop_sqr:
/* On entry to every pass: rsi -> a, rdx = a[0], rax = a[1], r8d = passes left. */
29	movl	%r8d,128+8(%rsp)	/* r8 is reused as an accumulator below */
30
/* Row 0: r8..r15 = a[0] * a[1..7] (cross products, not yet doubled). */
31	movq	%rdx,%rbx	/* rbx = a[0] */
32	mulq	%rdx	/* rdx:rax = a[1] * a[0] */
33	movq	%rax,%r8
34	movq	16(%rsi),%rax
35	movq	%rdx,%r9
36
37	mulq	%rbx
38	addq	%rax,%r9
39	movq	24(%rsi),%rax
40	movq	%rdx,%r10
41	adcq	$0,%r10
42
43	mulq	%rbx
44	addq	%rax,%r10
45	movq	32(%rsi),%rax
46	movq	%rdx,%r11
47	adcq	$0,%r11
48
49	mulq	%rbx
50	addq	%rax,%r11
51	movq	40(%rsi),%rax
52	movq	%rdx,%r12
53	adcq	$0,%r12
54
55	mulq	%rbx
56	addq	%rax,%r12
57	movq	48(%rsi),%rax
58	movq	%rdx,%r13
59	adcq	$0,%r13
60
61	mulq	%rbx
62	addq	%rax,%r13
63	movq	56(%rsi),%rax
64	movq	%rdx,%r14
65	adcq	$0,%r14
66
67	mulq	%rbx
68	addq	%rax,%r14
69	movq	%rbx,%rax	/* rax = a[0] for the squaring below */
70	movq	%rdx,%r15
71	adcq	$0,%r15
72
/* Double r8:r9 and add a[0]^2.  rcx keeps the undoubled r9 so that the bit the
   doubling shifts out of it can be fed into the next limb. */
73	addq	%r8,%r8
74	movq	%r9,%rcx
75	adcq	%r9,%r9
76
77	mulq	%rax	/* rdx:rax = a[0]^2 */
78	movq	%rax,(%rsp)	/* result limb 0 */
79	addq	%rdx,%r8
80	adcq	$0,%r9
81
82	movq	%r8,8(%rsp)	/* result limb 1 */
83	shrq	$63,%rcx	/* rcx = top bit of the undoubled r9 */
84
85
/* Row 1: add a[1] * a[2..7] into r10..r15; the final carry becomes the new r8. */
86	movq	8(%rsi),%r8
87	movq	16(%rsi),%rax
88	mulq	%r8
89	addq	%rax,%r10
90	movq	24(%rsi),%rax
91	movq	%rdx,%rbx
92	adcq	$0,%rbx
93
94	mulq	%r8
95	addq	%rax,%r11
96	movq	32(%rsi),%rax
97	adcq	$0,%rdx
98	addq	%rbx,%r11
99	movq	%rdx,%rbx
100	adcq	$0,%rbx
101
102	mulq	%r8
103	addq	%rax,%r12
104	movq	40(%rsi),%rax
105	adcq	$0,%rdx
106	addq	%rbx,%r12
107	movq	%rdx,%rbx
108	adcq	$0,%rbx
109
110	mulq	%r8
111	addq	%rax,%r13
112	movq	48(%rsi),%rax
113	adcq	$0,%rdx
114	addq	%rbx,%r13
115	movq	%rdx,%rbx
116	adcq	$0,%rbx
117
118	mulq	%r8
119	addq	%rax,%r14
120	movq	56(%rsi),%rax
121	adcq	$0,%rdx
122	addq	%rbx,%r14
123	movq	%rdx,%rbx
124	adcq	$0,%rbx
125
126	mulq	%r8
127	addq	%rax,%r15
128	movq	%r8,%rax	/* rax = a[1] for the squaring below */
129	adcq	$0,%rdx
130	addq	%rbx,%r15
131	movq	%rdx,%r8
132	movq	%r10,%rdx	/* scratch copy of r10, see below */
133	adcq	$0,%r8
134
/* Double r10:r11.  The addq on the scratch copy only exists to put r10's top
   bit into CF; lea performs the real doubling of r10 (plus the bit saved in
   rcx) without touching flags, and adcq doubles r11 using that CF. */
135	addq	%rdx,%rdx
136	leaq	(%rcx,%r10,2),%r10
137	movq	%r11,%rbx	/* keep the undoubled r11 for its top bit */
138	adcq	%r11,%r11
139
140	mulq	%rax	/* rdx:rax = a[1]^2 */
141	addq	%rax,%r9
142	adcq	%rdx,%r10
143	adcq	$0,%r11
144
145	movq	%r9,16(%rsp)	/* result limb 2 */
146	movq	%r10,24(%rsp)	/* result limb 3 */
147	shrq	$63,%rbx	/* rbx = top bit of the undoubled r11 */
148
149
/* Row 2: add a[2] * a[3..7] into r12,r13,r14,r15,r8; the final carry becomes
   the new r9.  The doubling of r12 and r13 is interleaved with the multiplies. */
150	movq	16(%rsi),%r9
151	movq	24(%rsi),%rax
152	mulq	%r9
153	addq	%rax,%r12
154	movq	32(%rsi),%rax
155	movq	%rdx,%rcx
156	adcq	$0,%rcx
157
158	mulq	%r9
159	addq	%rax,%r13
160	movq	40(%rsi),%rax
161	adcq	$0,%rdx
162	addq	%rcx,%r13
163	movq	%rdx,%rcx
164	adcq	$0,%rcx
165
166	mulq	%r9
167	addq	%rax,%r14
168	movq	48(%rsi),%rax
169	adcq	$0,%rdx
170	addq	%rcx,%r14
171	movq	%rdx,%rcx
172	adcq	$0,%rcx
173
174	mulq	%r9
175	movq	%r12,%r10	/* undoubled r12, for its top bit */
176	leaq	(%rbx,%r12,2),%r12	/* r12 = 2*r12 + bit carried from r11 */
177	addq	%rax,%r15
178	movq	56(%rsi),%rax
179	adcq	$0,%rdx
180	addq	%rcx,%r15
181	movq	%rdx,%rcx
182	adcq	$0,%rcx
183
184	mulq	%r9
185	shrq	$63,%r10	/* r10 = top bit of the undoubled r12 */
186	addq	%rax,%r8
187	movq	%r9,%rax	/* rax = a[2] for the squaring below */
188	adcq	$0,%rdx
189	addq	%rcx,%r8
190	movq	%rdx,%r9
191	adcq	$0,%r9
192
193	movq	%r13,%rcx	/* undoubled r13, for its top bit */
194	leaq	(%r10,%r13,2),%r13	/* r13 = 2*r13 + bit carried from r12 */
195
196	mulq	%rax	/* rdx:rax = a[2]^2 */
197	addq	%rax,%r11
198	adcq	%rdx,%r12
199	adcq	$0,%r13
200
201	movq	%r11,32(%rsp)	/* result limb 4 */
202	movq	%r12,40(%rsp)	/* result limb 5 */
203	shrq	$63,%rcx	/* rcx = top bit of the undoubled r13 */
204
205
/* Row 3: add a[3] * a[4..7] into r14,r15,r8,r9; the final carry becomes the
   new r10.  r14 and r15 are doubled along the way. */
206	movq	24(%rsi),%r10
207	movq	32(%rsi),%rax
208	mulq	%r10
209	addq	%rax,%r14
210	movq	40(%rsi),%rax
211	movq	%rdx,%rbx
212	adcq	$0,%rbx
213
214	mulq	%r10
215	addq	%rax,%r15
216	movq	48(%rsi),%rax
217	adcq	$0,%rdx
218	addq	%rbx,%r15
219	movq	%rdx,%rbx
220	adcq	$0,%rbx
221
222	mulq	%r10
223	movq	%r14,%r12	/* undoubled r14, for its top bit */
224	leaq	(%rcx,%r14,2),%r14	/* r14 = 2*r14 + bit carried from r13 */
225	addq	%rax,%r8
226	movq	56(%rsi),%rax
227	adcq	$0,%rdx
228	addq	%rbx,%r8
229	movq	%rdx,%rbx
230	adcq	$0,%rbx
231
232	mulq	%r10
233	shrq	$63,%r12	/* r12 = top bit of the undoubled r14 */
234	addq	%rax,%r9
235	movq	%r10,%rax	/* rax = a[3] for the squaring below */
236	adcq	$0,%rdx
237	addq	%rbx,%r9
238	movq	%rdx,%r10
239	adcq	$0,%r10
240
241	movq	%r15,%rbx	/* undoubled r15, for its top bit */
242	leaq	(%r12,%r15,2),%r15	/* r15 = 2*r15 + bit carried from r14 */
243
244	mulq	%rax	/* rdx:rax = a[3]^2 */
245	addq	%rax,%r13
246	adcq	%rdx,%r14
247	adcq	$0,%r15
248
249	movq	%r13,48(%rsp)	/* result limb 6 */
250	movq	%r14,56(%rsp)	/* result limb 7 */
251	shrq	$63,%rbx	/* rbx = top bit of the undoubled r15 */
252
253
/* Row 4: add a[4] * a[5..7] into r8,r9,r10; the final carry becomes the new
   r11.  r8 and r9 are doubled along the way. */
254	movq	32(%rsi),%r11
255	movq	40(%rsi),%rax
256	mulq	%r11
257	addq	%rax,%r8
258	movq	48(%rsi),%rax
259	movq	%rdx,%rcx
260	adcq	$0,%rcx
261
262	mulq	%r11
263	addq	%rax,%r9
264	movq	56(%rsi),%rax
265	adcq	$0,%rdx
266	movq	%r8,%r12	/* undoubled r8, for its top bit */
267	leaq	(%rbx,%r8,2),%r8	/* r8 = 2*r8 + bit carried from r15 */
268	addq	%rcx,%r9
269	movq	%rdx,%rcx
270	adcq	$0,%rcx
271
272	mulq	%r11
273	shrq	$63,%r12	/* r12 = top bit of the undoubled r8 */
274	addq	%rax,%r10
275	movq	%r11,%rax	/* rax = a[4] for the squaring below */
276	adcq	$0,%rdx
277	addq	%rcx,%r10
278	movq	%rdx,%r11
279	adcq	$0,%r11
280
281	movq	%r9,%rcx	/* undoubled r9, for its top bit */
282	leaq	(%r12,%r9,2),%r9	/* r9 = 2*r9 + bit carried from r8 */
283
284	mulq	%rax	/* rdx:rax = a[4]^2 */
285	addq	%rax,%r15
286	adcq	%rdx,%r8
287	adcq	$0,%r9
288
289	movq	%r15,64(%rsp)	/* result limb 8 */
290	movq	%r8,72(%rsp)	/* result limb 9 */
291	shrq	$63,%rcx	/* rcx = top bit of the undoubled r9 */
292
293
/* Row 5: add a[5] * a[6..7] into r10,r11; the final carry becomes the new r12.
   r10 and r11 are doubled along the way. */
294	movq	40(%rsi),%r12
295	movq	48(%rsi),%rax
296	mulq	%r12
297	addq	%rax,%r10
298	movq	56(%rsi),%rax
299	movq	%rdx,%rbx
300	adcq	$0,%rbx
301
302	mulq	%r12
303	addq	%rax,%r11
304	movq	%r12,%rax	/* rax = a[5] for the squaring below */
305	movq	%r10,%r15	/* undoubled r10, for its top bit */
306	leaq	(%rcx,%r10,2),%r10	/* r10 = 2*r10 + bit carried from r9 */
307	adcq	$0,%rdx	/* still consumes CF from the addq above (mov/lea keep flags) */
308	shrq	$63,%r15	/* r15 = top bit of the undoubled r10 */
309	addq	%rbx,%r11
310	movq	%rdx,%r12
311	adcq	$0,%r12
312
313	movq	%r11,%rbx	/* undoubled r11; its top bit is extracted in row 6 */
314	leaq	(%r15,%r11,2),%r11	/* r11 = 2*r11 + bit carried from r10 */
315
316	mulq	%rax	/* rdx:rax = a[5]^2 */
317	addq	%rax,%r9
318	adcq	%rdx,%r10
319	adcq	$0,%r11
320
321	movq	%r9,80(%rsp)	/* result limb 10 */
322	movq	%r10,88(%rsp)	/* result limb 11 */
323
324
/* Row 6: add a[6] * a[7] into r12, high half plus carry into r13. */
325	movq	48(%rsi),%r13
326	movq	56(%rsi),%rax
327	mulq	%r13
328	addq	%rax,%r12
329	movq	%r13,%rax	/* rax = a[6] for the squaring below */
330	movq	%rdx,%r13
331	adcq	$0,%r13
332
/* Double r12:r13 into r12:r13:r14; the shlq moves the top bit of the undoubled
   r11 (saved in rbx) into CF so that it enters the chain. */
333	xorq	%r14,%r14
334	shlq	$1,%rbx
335	adcq	%r12,%r12
336	adcq	%r13,%r13
337	adcq	%r14,%r14
338
339	mulq	%rax	/* rdx:rax = a[6]^2 */
340	addq	%rax,%r11
341	adcq	%rdx,%r12
342	adcq	$0,%r13
343
344	movq	%r11,96(%rsp)	/* result limb 12 */
345	movq	%r12,104(%rsp)	/* result limb 13 */
346
347
/* Row 7: only the square term a[7]^2 is left. */
348	movq	56(%rsi),%rax
349	mulq	%rax
350	addq	%rax,%r13
351	adcq	$0,%rdx
352
353	addq	%rdx,%r14
354
355	movq	%r13,112(%rsp)	/* result limb 14 */
356	movq	%r14,120(%rsp)	/* result limb 15 */
357
/* Reduce: feed the low eight limbs to __rsaz_512_reduce in r8..r15. */
358	movq	(%rsp),%r8
359	movq	8(%rsp),%r9
360	movq	16(%rsp),%r10
361	movq	24(%rsp),%r11
362	movq	32(%rsp),%r12
363	movq	40(%rsp),%r13
364	movq	48(%rsp),%r14
365	movq	56(%rsp),%r15
366
367	call	__rsaz_512_reduce
368
/* Add the high eight limbs of the square to the reduced value. */
369	addq	64(%rsp),%r8
370	adcq	72(%rsp),%r9
371	adcq	80(%rsp),%r10
372	adcq	88(%rsp),%r11
373	adcq	96(%rsp),%r12
374	adcq	104(%rsp),%r13
375	adcq	112(%rsp),%r14
376	adcq	120(%rsp),%r15
377	sbbq	%rcx,%rcx	/* rcx = all ones if the sum carried out, else 0 */
378
379	call	__rsaz_512_subtract	/* stores r8..r15 (masked-corrected) at rdi */
380
/* Set up the next pass: the helper leaves the stored result in r8..r15. */
381	movq	%r8,%rdx	/* rdx = new a[0] */
382	movq	%r9,%rax	/* rax = new a[1] */
383	movl	128+8(%rsp),%r8d	/* reload the pass counter */
384	movq	%rdi,%rsi	/* square the value just written */
385
386	decl	%r8d
387	jnz	.Loop_sqr
388
/* Epilogue: rax = stack pointer value at function entry; reload the saved
   registers relative to it and drop the frame. */
389	leaq	128+24+48(%rsp),%rax
390	movq	-48(%rax),%r15
391	movq	-40(%rax),%r14
392	movq	-32(%rax),%r13
393	movq	-24(%rax),%r12
394	movq	-16(%rax),%rbp
395	movq	-8(%rax),%rbx
396	leaq	(%rax),%rsp
397.Lsqr_epilogue:
398	.byte	0xf3,0xc3	/* encoded "rep ret" */
399.size	rsaz_512_sqr,.-rsaz_512_sqr
/*
 * rsaz_512_mul: 8-limb by 8-limb multiplication followed by reduction.
 *
 * Register inputs, as used by the code below:
 *   rdi  output buffer, 8 limbs (written by __rsaz_512_subtract)
 *   rsi  first multiplicand, 8 limbs
 *   rdx  second multiplicand, 8 limbs (walked limb by limb in __rsaz_512_mul)
 *   rcx  pointer handed to the reduce/subtract helpers in rbp
 *        (presumably the modulus - confirm against the caller)
 *   r8   64-bit constant kept at 128(%rsp) for __rsaz_512_reduce
 *
 * rdi and rcx are parked in xmm0/xmm1 because __rsaz_512_mul overwrites rdi
 * and advances rbp.  The 16-limb product lives at 0..127(%rsp).
 */
400.globl	rsaz_512_mul
401.hidden rsaz_512_mul
402.type	rsaz_512_mul,@function
403.align	32
404rsaz_512_mul:
/* Save the callee-saved registers that are overwritten below. */
405	pushq	%rbx
406	pushq	%rbp
407	pushq	%r12
408	pushq	%r13
409	pushq	%r14
410	pushq	%r15
411
412	subq	$128+24,%rsp	/* 128-byte product buffer + 24 bytes of locals */
413.Lmul_body:
414.byte	102,72,15,110,199	/* movq %rdi,%xmm0 */
415.byte	102,72,15,110,201	/* movq %rcx,%xmm1 */
416	movq	%r8,128(%rsp)	/* park the reduction constant */
417	movq	(%rdx),%rbx	/* rbx = b[0], first multiplier limb */
418	movq	%rdx,%rbp	/* rbp -> b for __rsaz_512_mul */
419	call	__rsaz_512_mul
420
421.byte	102,72,15,126,199	/* movq %xmm0,%rdi */
422.byte	102,72,15,126,205	/* movq %xmm1,%rbp */
423
/* Low half of the product goes to the reducer in r8..r15. */
424	movq	(%rsp),%r8
425	movq	8(%rsp),%r9
426	movq	16(%rsp),%r10
427	movq	24(%rsp),%r11
428	movq	32(%rsp),%r12
429	movq	40(%rsp),%r13
430	movq	48(%rsp),%r14
431	movq	56(%rsp),%r15
432
433	call	__rsaz_512_reduce
/* Add the high half of the product to the reduced value. */
434	addq	64(%rsp),%r8
435	adcq	72(%rsp),%r9
436	adcq	80(%rsp),%r10
437	adcq	88(%rsp),%r11
438	adcq	96(%rsp),%r12
439	adcq	104(%rsp),%r13
440	adcq	112(%rsp),%r14
441	adcq	120(%rsp),%r15
442	sbbq	%rcx,%rcx	/* rcx = all ones if the sum carried out, else 0 */
443
444	call	__rsaz_512_subtract	/* stores the corrected result at rdi */
445
/* Epilogue: rax = stack pointer value at function entry. */
446	leaq	128+24+48(%rsp),%rax
447	movq	-48(%rax),%r15
448	movq	-40(%rax),%r14
449	movq	-32(%rax),%r13
450	movq	-24(%rax),%r12
451	movq	-16(%rax),%rbp
452	movq	-8(%rax),%rbx
453	leaq	(%rax),%rsp
454.Lmul_epilogue:
455	.byte	0xf3,0xc3	/* encoded "rep ret" */
456.size	rsaz_512_mul,.-rsaz_512_mul
/*
 * rsaz_512_mul_gather4: like rsaz_512_mul, but the second multiplicand is
 * fetched limb by limb from a scattered table instead of a flat buffer.
 *
 * Register inputs, as used by the code below:
 *   rdi  output buffer, 8 limbs (written by __rsaz_512_subtract)
 *   rsi  multiplicand a, 8 limbs
 *   rdx  table base
 *   rcx  pointer handed to the reduce/subtract helpers in rbp
 *        (presumably the modulus - confirm against the caller)
 *   r8   64-bit constant kept at 128(%rsp) for __rsaz_512_reduce
 *   r9d  table index (zero-extended to 64 bits before use)
 *
 * Table layout as read here: limb i of entry idx has its low 32 bits at
 * table + 128*i + 4*idx and its high 32 bits 64 bytes above that, which is
 * the layout rsaz_512_scatter4 writes.
 *
 * NOTE(review): the load addresses depend directly on the index.  If the
 * index is secret this is a cache-timing channel (cf. CVE-2016-0702) -
 * confirm against upstream's masked gather.
 * NOTE(review): every loop pass prefetches the limb for the next pass, so the
 * seventh pass reads table row 8 (offsets 1024 and 1088 from table + 4*idx)
 * although only rows 0..7 are consumed - confirm the caller's buffer allows it.
 */
457.globl	rsaz_512_mul_gather4
458.hidden rsaz_512_mul_gather4
459.type	rsaz_512_mul_gather4,@function
460.align	32
461rsaz_512_mul_gather4:
/* Save the callee-saved registers that are overwritten below. */
462	pushq	%rbx
463	pushq	%rbp
464	pushq	%r12
465	pushq	%r13
466	pushq	%r14
467	pushq	%r15
468
469	movl	%r9d,%r9d	/* zero-extend the index */
470	subq	$128+24,%rsp	/* 128-byte product buffer + 24 bytes of locals */
471.Lmul_gather4_body:
472	movl	64(%rdx,%r9,4),%eax	/* high 32 bits of b[0] */
473.byte	102,72,15,110,199	/* movq %rdi,%xmm0 */
474	movl	(%rdx,%r9,4),%ebx	/* low 32 bits of b[0] */
475.byte	102,72,15,110,201	/* movq %rcx,%xmm1 */
476	movq	%r8,128(%rsp)	/* park the reduction constant */
477
478	shlq	$32,%rax
479	orq	%rax,%rbx	/* rbx = b[0] */
480	movq	(%rsi),%rax
481	movq	8(%rsi),%rcx
482	leaq	128(%rdx,%r9,4),%rbp	/* rbp -> table row of b[1] */
/* First row: limb 0 of a*b[0] goes to (%rsp), limbs 1..8 stay in r8..r15.
   The SSE instructions mixed in assemble b[1] in xmm4. */
483	mulq	%rbx
484	movq	%rax,(%rsp)
485	movq	%rcx,%rax
486	movq	%rdx,%r8
487
488	mulq	%rbx
489	movd	(%rbp),%xmm4	/* low 32 bits of the next multiplier limb */
490	addq	%rax,%r8
491	movq	16(%rsi),%rax
492	movq	%rdx,%r9
493	adcq	$0,%r9
494
495	mulq	%rbx
496	movd	64(%rbp),%xmm5	/* high 32 bits of the next multiplier limb */
497	addq	%rax,%r9
498	movq	24(%rsi),%rax
499	movq	%rdx,%r10
500	adcq	$0,%r10
501
502	mulq	%rbx
503	pslldq	$4,%xmm5	/* move the high half up by 4 bytes */
504	addq	%rax,%r10
505	movq	32(%rsi),%rax
506	movq	%rdx,%r11
507	adcq	$0,%r11
508
509	mulq	%rbx
510	por	%xmm5,%xmm4	/* xmm4 = complete next multiplier limb */
511	addq	%rax,%r11
512	movq	40(%rsi),%rax
513	movq	%rdx,%r12
514	adcq	$0,%r12
515
516	mulq	%rbx
517	addq	%rax,%r12
518	movq	48(%rsi),%rax
519	movq	%rdx,%r13
520	adcq	$0,%r13
521
522	mulq	%rbx
523	leaq	128(%rbp),%rbp	/* advance to the following table row */
524	addq	%rax,%r13
525	movq	56(%rsi),%rax
526	movq	%rdx,%r14
527	adcq	$0,%r14
528
529	mulq	%rbx
530.byte	102,72,15,126,227	/* movq %xmm4,%rbx : rbx = next multiplier limb */
531	addq	%rax,%r14
532	movq	(%rsi),%rax
533	movq	%rdx,%r15
534	adcq	$0,%r15
535
536	leaq	8(%rsp),%rdi	/* rdi -> product limb 1 */
537	movl	$7,%ecx	/* seven more rows, for b[1]..b[7] */
538	jmp	.Loop_mul_gather
539
540.align	32
541.Loop_mul_gather:
/* One row per pass: add a * rbx to the window r8..r15, emit the lowest limb at
   (rdi) and slide the window up by one limb.  rax = a[0] on entry. */
542	mulq	%rbx
543	addq	%rax,%r8
544	movq	8(%rsi),%rax
545	movq	%r8,(%rdi)	/* finished product limb */
546	movq	%rdx,%r8
547	adcq	$0,%r8
548
549	mulq	%rbx
550	movd	(%rbp),%xmm4	/* low 32 bits of the limb for the next pass */
551	addq	%rax,%r9
552	movq	16(%rsi),%rax
553	adcq	$0,%rdx
554	addq	%r9,%r8
555	movq	%rdx,%r9
556	adcq	$0,%r9
557
558	mulq	%rbx
559	movd	64(%rbp),%xmm5	/* high 32 bits of the limb for the next pass */
560	addq	%rax,%r10
561	movq	24(%rsi),%rax
562	adcq	$0,%rdx
563	addq	%r10,%r9
564	movq	%rdx,%r10
565	adcq	$0,%r10
566
567	mulq	%rbx
568	pslldq	$4,%xmm5
569	addq	%rax,%r11
570	movq	32(%rsi),%rax
571	adcq	$0,%rdx
572	addq	%r11,%r10
573	movq	%rdx,%r11
574	adcq	$0,%r11
575
576	mulq	%rbx
577	por	%xmm5,%xmm4
578	addq	%rax,%r12
579	movq	40(%rsi),%rax
580	adcq	$0,%rdx
581	addq	%r12,%r11
582	movq	%rdx,%r12
583	adcq	$0,%r12
584
585	mulq	%rbx
586	addq	%rax,%r13
587	movq	48(%rsi),%rax
588	adcq	$0,%rdx
589	addq	%r13,%r12
590	movq	%rdx,%r13
591	adcq	$0,%r13
592
593	mulq	%rbx
594	addq	%rax,%r14
595	movq	56(%rsi),%rax
596	adcq	$0,%rdx
597	addq	%r14,%r13
598	movq	%rdx,%r14
599	adcq	$0,%r14
600
601	mulq	%rbx
602.byte	102,72,15,126,227	/* movq %xmm4,%rbx : multiplier for the next pass */
603	addq	%rax,%r15
604	movq	(%rsi),%rax	/* rax = a[0] again for the next pass */
605	adcq	$0,%rdx
606	addq	%r15,%r14
607	movq	%rdx,%r15
608	adcq	$0,%r15
609
610	leaq	128(%rbp),%rbp	/* next table row */
611	leaq	8(%rdi),%rdi	/* next product limb */
612
613	decl	%ecx
614	jnz	.Loop_mul_gather
615
/* Flush the window: product limbs 8..15. */
616	movq	%r8,(%rdi)
617	movq	%r9,8(%rdi)
618	movq	%r10,16(%rdi)
619	movq	%r11,24(%rdi)
620	movq	%r12,32(%rdi)
621	movq	%r13,40(%rdi)
622	movq	%r14,48(%rdi)
623	movq	%r15,56(%rdi)
624
625.byte	102,72,15,126,199	/* movq %xmm0,%rdi : restore the output pointer */
626.byte	102,72,15,126,205	/* movq %xmm1,%rbp : rbp = original rcx argument */
627
/* Low half of the product goes to the reducer in r8..r15. */
628	movq	(%rsp),%r8
629	movq	8(%rsp),%r9
630	movq	16(%rsp),%r10
631	movq	24(%rsp),%r11
632	movq	32(%rsp),%r12
633	movq	40(%rsp),%r13
634	movq	48(%rsp),%r14
635	movq	56(%rsp),%r15
636
637	call	__rsaz_512_reduce
/* Add the high half of the product to the reduced value. */
638	addq	64(%rsp),%r8
639	adcq	72(%rsp),%r9
640	adcq	80(%rsp),%r10
641	adcq	88(%rsp),%r11
642	adcq	96(%rsp),%r12
643	adcq	104(%rsp),%r13
644	adcq	112(%rsp),%r14
645	adcq	120(%rsp),%r15
646	sbbq	%rcx,%rcx	/* rcx = all ones if the sum carried out, else 0 */
647
648	call	__rsaz_512_subtract	/* stores the corrected result at rdi */
649
/* Epilogue: rax = stack pointer value at function entry. */
650	leaq	128+24+48(%rsp),%rax
651	movq	-48(%rax),%r15
652	movq	-40(%rax),%r14
653	movq	-32(%rax),%r13
654	movq	-24(%rax),%r12
655	movq	-16(%rax),%rbp
656	movq	-8(%rax),%rbx
657	leaq	(%rax),%rsp
658.Lmul_gather4_epilogue:
659	.byte	0xf3,0xc3	/* encoded "rep ret" */
660.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
/*
 * rsaz_512_mul_scatter4: multiply-and-reduce whose result is written both to
 * the output buffer and, in scattered form, into a table slot.
 *
 * Register inputs, as used by the code below:
 *   rdi  output buffer, which is also read as one multiplicand (its limbs
 *        are walked by __rsaz_512_mul through rbp)
 *   rsi  other multiplicand, 8 limbs
 *   rdx  pointer handed to the reduce/subtract helpers in rbp
 *        (presumably the modulus - confirm against the caller)
 *   rcx  64-bit constant kept at 128(%rsp) for __rsaz_512_reduce
 *   r8   table base
 *   r9d  table index (zero-extended to 64 bits before use)
 *
 * Scatter layout written at the end: limb i has its low 32 bits at
 * table + 4*idx + 128*i and its high 32 bits 64 bytes above that.
 */
661.globl	rsaz_512_mul_scatter4
662.hidden rsaz_512_mul_scatter4
663.type	rsaz_512_mul_scatter4,@function
664.align	32
665rsaz_512_mul_scatter4:
/* Save the callee-saved registers that are overwritten below. */
666	pushq	%rbx
667	pushq	%rbp
668	pushq	%r12
669	pushq	%r13
670	pushq	%r14
671	pushq	%r15
672
673	movl	%r9d,%r9d	/* zero-extend the index */
674	subq	$128+24,%rsp	/* 128-byte product buffer + 24 bytes of locals */
675.Lmul_scatter4_body:
676	leaq	(%r8,%r9,4),%r8	/* r8 -> table slot for this index */
677.byte	102,72,15,110,199	/* movq %rdi,%xmm0 */
678.byte	102,72,15,110,202	/* movq %rdx,%xmm1 */
679.byte	102,73,15,110,208	/* movq %r8,%xmm2 */
680	movq	%rcx,128(%rsp)	/* park the reduction constant */
681
682	movq	%rdi,%rbp	/* multiplier limbs are read from the output buffer */
683	movq	(%rdi),%rbx	/* rbx = first multiplier limb */
684	call	__rsaz_512_mul
685
686.byte	102,72,15,126,199	/* movq %xmm0,%rdi : restore the output pointer */
687.byte	102,72,15,126,205	/* movq %xmm1,%rbp : rbp = original rdx argument */
688
/* Low half of the product goes to the reducer in r8..r15. */
689	movq	(%rsp),%r8
690	movq	8(%rsp),%r9
691	movq	16(%rsp),%r10
692	movq	24(%rsp),%r11
693	movq	32(%rsp),%r12
694	movq	40(%rsp),%r13
695	movq	48(%rsp),%r14
696	movq	56(%rsp),%r15
697
698	call	__rsaz_512_reduce
/* Add the high half of the product to the reduced value. */
699	addq	64(%rsp),%r8
700	adcq	72(%rsp),%r9
701	adcq	80(%rsp),%r10
702	adcq	88(%rsp),%r11
703	adcq	96(%rsp),%r12
704	adcq	104(%rsp),%r13
705	adcq	112(%rsp),%r14
706	adcq	120(%rsp),%r15
707.byte	102,72,15,126,214	/* movq %xmm2,%rsi : table slot; leaves CF intact for the sbbq */
708	sbbq	%rcx,%rcx	/* rcx = all ones if the sum carried out, else 0 */
709
710	call	__rsaz_512_subtract	/* stores the result at rdi and leaves it in r8..r15 */
711
/* Scatter: low halves first, then shift each limb down and store high halves. */
712	movl	%r8d,0(%rsi)
713	shrq	$32,%r8
714	movl	%r9d,128(%rsi)
715	shrq	$32,%r9
716	movl	%r10d,256(%rsi)
717	shrq	$32,%r10
718	movl	%r11d,384(%rsi)
719	shrq	$32,%r11
720	movl	%r12d,512(%rsi)
721	shrq	$32,%r12
722	movl	%r13d,640(%rsi)
723	shrq	$32,%r13
724	movl	%r14d,768(%rsi)
725	shrq	$32,%r14
726	movl	%r15d,896(%rsi)
727	shrq	$32,%r15
728	movl	%r8d,64(%rsi)
729	movl	%r9d,192(%rsi)
730	movl	%r10d,320(%rsi)
731	movl	%r11d,448(%rsi)
732	movl	%r12d,576(%rsi)
733	movl	%r13d,704(%rsi)
734	movl	%r14d,832(%rsi)
735	movl	%r15d,960(%rsi)
736
/* Epilogue: rax = stack pointer value at function entry. */
737	leaq	128+24+48(%rsp),%rax
738	movq	-48(%rax),%r15
739	movq	-40(%rax),%r14
740	movq	-32(%rax),%r13
741	movq	-24(%rax),%r12
742	movq	-16(%rax),%rbp
743	movq	-8(%rax),%rbx
744	leaq	(%rax),%rsp
745.Lmul_scatter4_epilogue:
746	.byte	0xf3,0xc3	/* encoded "rep ret" */
747.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
/*
 * rsaz_512_mul_by_one: run the 8-limb input straight through
 * __rsaz_512_reduce and store what comes back.
 *
 * Register inputs, as used by the code below:
 *   rdi  output buffer, 8 limbs
 *   rsi  input value, 8 limbs
 *   rdx  pointer handed to __rsaz_512_reduce in rbp
 *        (presumably the modulus - confirm against the caller)
 *   rcx  64-bit constant kept at 128(%rsp) for __rsaz_512_reduce
 *
 * Unlike the multiply routines there is no high half to add and no call to
 * __rsaz_512_subtract: r8..r15 are stored exactly as the reducer returns them.
 */
748.globl	rsaz_512_mul_by_one
749.hidden rsaz_512_mul_by_one
750.type	rsaz_512_mul_by_one,@function
751.align	32
752rsaz_512_mul_by_one:
/* Save the callee-saved registers that are overwritten below. */
753	pushq	%rbx
754	pushq	%rbp
755	pushq	%r12
756	pushq	%r13
757	pushq	%r14
758	pushq	%r15
759
760	subq	$128+24,%rsp	/* same frame size as the other entry points */
761.Lmul_by_one_body:
762	movq	%rdx,%rbp
763	movq	%rcx,128(%rsp)	/* park the reduction constant */
764
/* Load the input into r8..r15; xmm0 is cleared for the stores below. */
765	movq	(%rsi),%r8
766	pxor	%xmm0,%xmm0
767	movq	8(%rsi),%r9
768	movq	16(%rsi),%r10
769	movq	24(%rsi),%r11
770	movq	32(%rsi),%r12
771	movq	40(%rsi),%r13
772	movq	48(%rsi),%r14
773	movq	56(%rsi),%r15
774
/* Zero 0..111(%rsp).  movdqa requires rsp to be 16-byte aligned here.  The
   reducer as written in this file only reads the constant slot, not this area. */
775	movdqa	%xmm0,(%rsp)
776	movdqa	%xmm0,16(%rsp)
777	movdqa	%xmm0,32(%rsp)
778	movdqa	%xmm0,48(%rsp)
779	movdqa	%xmm0,64(%rsp)
780	movdqa	%xmm0,80(%rsp)
781	movdqa	%xmm0,96(%rsp)
782	call	__rsaz_512_reduce
/* Store the reduced limbs. */
783	movq	%r8,(%rdi)
784	movq	%r9,8(%rdi)
785	movq	%r10,16(%rdi)
786	movq	%r11,24(%rdi)
787	movq	%r12,32(%rdi)
788	movq	%r13,40(%rdi)
789	movq	%r14,48(%rdi)
790	movq	%r15,56(%rdi)
791
/* Epilogue: rax = stack pointer value at function entry. */
792	leaq	128+24+48(%rsp),%rax
793	movq	-48(%rax),%r15
794	movq	-40(%rax),%r14
795	movq	-32(%rax),%r13
796	movq	-24(%rax),%r12
797	movq	-16(%rax),%rbp
798	movq	-8(%rax),%rbx
799	leaq	(%rax),%rsp
800.Lmul_by_one_epilogue:
801	.byte	0xf3,0xc3	/* encoded "rep ret" */
802.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
/*
 * __rsaz_512_reduce: file-local helper (no .globl); eight rounds of
 * word-by-word reduction on the value held in r8..r15.
 *
 * In:   r8..r15      8 limbs, least significant in r8
 *       rbp          pointer to 8 limbs m[0..7]
 *       128+8(%rsp)  64-bit constant k; the +8 skips the return address, so
 *                    this is the slot the callers address as 128(%rsp)
 * Out:  r8..r15      8 limbs after the eight rounds
 * Overwrites rax, rbx, rcx, rdx, rsi and the flags.
 *
 * Each round takes q = low64(r8 * k), adds q * m[0..7] to the window and
 * slides the window up by one limb.  The lowest limb itself is never summed:
 * only its carry is needed, and `negq %r8` supplies it as CF = (r8 != 0).
 * That shortcut is valid only when low64(q * m[0]) == -r8, i.e. when
 * k == -1/m[0] mod 2^64 - presumably guaranteed by the caller (Montgomery
 * reduction); confirm.
 */
803.type	__rsaz_512_reduce,@function
804.align	32
805__rsaz_512_reduce:
806	movq	%r8,%rbx
807	imulq	128+8(%rsp),%rbx	/* rbx = q for the first round */
808	movq	0(%rbp),%rax
809	movl	$8,%ecx	/* eight rounds, one per limb */
810	jmp	.Lreduction_loop
811
812.align	32
813.Lreduction_loop:
/* rax = m[0] and rbx = q on entry to every round. */
814	mulq	%rbx	/* rdx:rax = q * m[0]; only the high half is used */
815	movq	8(%rbp),%rax
816	negq	%r8	/* CF = (r8 != 0): carry out of the dropped lowest limb */
817	movq	%rdx,%r8
818	adcq	$0,%r8
819
820	mulq	%rbx
821	addq	%rax,%r9
822	movq	16(%rbp),%rax
823	adcq	$0,%rdx
824	addq	%r9,%r8	/* new r8 = old limb 1 + q*m[1] + carry */
825	movq	%rdx,%r9
826	adcq	$0,%r9
827
828	mulq	%rbx
829	addq	%rax,%r10
830	movq	24(%rbp),%rax
831	adcq	$0,%rdx
832	addq	%r10,%r9
833	movq	%rdx,%r10
834	adcq	$0,%r10
835
836	mulq	%rbx
837	addq	%rax,%r11
838	movq	32(%rbp),%rax
839	adcq	$0,%rdx
840	addq	%r11,%r10
841	movq	128+8(%rsp),%rsi	/* reload k; mov leaves CF for the adcq below */
842
843
844	adcq	$0,%rdx
845	movq	%rdx,%r11
846
847	mulq	%rbx
848	addq	%rax,%r12
849	movq	40(%rbp),%rax
850	adcq	$0,%rdx
851	imulq	%r8,%rsi	/* rsi = q for the next round, from the new r8 */
852	addq	%r12,%r11
853	movq	%rdx,%r12
854	adcq	$0,%r12
855
856	mulq	%rbx
857	addq	%rax,%r13
858	movq	48(%rbp),%rax
859	adcq	$0,%rdx
860	addq	%r13,%r12
861	movq	%rdx,%r13
862	adcq	$0,%r13
863
864	mulq	%rbx
865	addq	%rax,%r14
866	movq	56(%rbp),%rax
867	adcq	$0,%rdx
868	addq	%r14,%r13
869	movq	%rdx,%r14
870	adcq	$0,%r14
871
872	mulq	%rbx
873	movq	%rsi,%rbx	/* switch to the next round's q */
874	addq	%rax,%r15
875	movq	0(%rbp),%rax	/* rax = m[0] for the next round */
876	adcq	$0,%rdx
877	addq	%r15,%r14
878	movq	%rdx,%r15
879	adcq	$0,%r15
880
881	decl	%ecx
882	jne	.Lreduction_loop
883
884	.byte	0xf3,0xc3	/* encoded "rep ret" */
885.size	__rsaz_512_reduce,.-__rsaz_512_reduce
/*
 * __rsaz_512_subtract: file-local helper (no .globl); branch-free masked
 * subtraction of m from the value in r8..r15, result stored at rdi.
 *
 * In:   r8..r15  8-limb value
 *       rdi      output buffer, 8 limbs
 *       rbp      pointer to 8 limbs m[0..7]
 *       rcx      mask; the callers pass 0 or all ones (from sbbq)
 * Out:  (rdi)    value + (mask & -m), i.e. value - m mod 2^512 when the mask
 *                is all ones and the unchanged value when it is 0
 *       r8..r15  the same limbs that were stored
 * Overwrites the flags.
 *
 * -m is formed as neg(m[0]) for the lowest limb and not(m[i]) for the rest;
 * that equals the two's complement only when m[0] != 0 (no carry out of the
 * lowest limb) - presumably m is odd; confirm against the caller.
 */
886.type	__rsaz_512_subtract,@function
887.align	32
888__rsaz_512_subtract:
/* Park the unmodified value in the output buffer; it is read back below. */
889	movq	%r8,(%rdi)
890	movq	%r9,8(%rdi)
891	movq	%r10,16(%rdi)
892	movq	%r11,24(%rdi)
893	movq	%r12,32(%rdi)
894	movq	%r13,40(%rdi)
895	movq	%r14,48(%rdi)
896	movq	%r15,56(%rdi)
897
/* r8..r15 = mask & (-m), built limb by limb. */
898	movq	0(%rbp),%r8
899	movq	8(%rbp),%r9
900	negq	%r8
901	notq	%r9
902	andq	%rcx,%r8
903	movq	16(%rbp),%r10
904	andq	%rcx,%r9
905	notq	%r10
906	movq	24(%rbp),%r11
907	andq	%rcx,%r10
908	notq	%r11
909	movq	32(%rbp),%r12
910	andq	%rcx,%r11
911	notq	%r12
912	movq	40(%rbp),%r13
913	andq	%rcx,%r12
914	notq	%r13
915	movq	48(%rbp),%r14
916	andq	%rcx,%r13
917	notq	%r14
918	movq	56(%rbp),%r15
919	andq	%rcx,%r14
920	notq	%r15
921	andq	%rcx,%r15
922
/* Add the parked value; the carry out of the top limb is discarded. */
923	addq	(%rdi),%r8
924	adcq	8(%rdi),%r9
925	adcq	16(%rdi),%r10
926	adcq	24(%rdi),%r11
927	adcq	32(%rdi),%r12
928	adcq	40(%rdi),%r13
929	adcq	48(%rdi),%r14
930	adcq	56(%rdi),%r15
931
/* Store the final result. */
932	movq	%r8,(%rdi)
933	movq	%r9,8(%rdi)
934	movq	%r10,16(%rdi)
935	movq	%r11,24(%rdi)
936	movq	%r12,32(%rdi)
937	movq	%r13,40(%rdi)
938	movq	%r14,48(%rdi)
939	movq	%r15,56(%rdi)
940
941	.byte	0xf3,0xc3	/* encoded "rep ret" */
942.size	__rsaz_512_subtract,.-__rsaz_512_subtract
/*
 * __rsaz_512_mul: file-local helper (no .globl); schoolbook 8x8-limb
 * multiplication producing a 16-limb product.
 *
 * In:   rsi  pointer to multiplicand a, 8 limbs
 *       rbp  pointer to multiplier b, 8 limbs
 *       rbx  b[0], preloaded by the caller
 * Out:  16 product limbs starting at 8(%rsp) as seen from here, which is
 *       0(%rsp) in the caller's frame (the extra 8 is the return address)
 * Overwrites rax, rbx, rcx, rdx, rdi, rbp, r8..r15 and the flags.
 */
943.type	__rsaz_512_mul,@function
944.align	32
945__rsaz_512_mul:
946	leaq	8(%rsp),%rdi	/* rdi -> product limb 0 in the caller's frame */
947
/* First row: limb 0 of a*b[0] is stored, limbs 1..8 stay in r8..r15. */
948	movq	(%rsi),%rax
949	mulq	%rbx
950	movq	%rax,(%rdi)
951	movq	8(%rsi),%rax
952	movq	%rdx,%r8
953
954	mulq	%rbx
955	addq	%rax,%r8
956	movq	16(%rsi),%rax
957	movq	%rdx,%r9
958	adcq	$0,%r9
959
960	mulq	%rbx
961	addq	%rax,%r9
962	movq	24(%rsi),%rax
963	movq	%rdx,%r10
964	adcq	$0,%r10
965
966	mulq	%rbx
967	addq	%rax,%r10
968	movq	32(%rsi),%rax
969	movq	%rdx,%r11
970	adcq	$0,%r11
971
972	mulq	%rbx
973	addq	%rax,%r11
974	movq	40(%rsi),%rax
975	movq	%rdx,%r12
976	adcq	$0,%r12
977
978	mulq	%rbx
979	addq	%rax,%r12
980	movq	48(%rsi),%rax
981	movq	%rdx,%r13
982	adcq	$0,%r13
983
984	mulq	%rbx
985	addq	%rax,%r13
986	movq	56(%rsi),%rax
987	movq	%rdx,%r14
988	adcq	$0,%r14
989
990	mulq	%rbx
991	addq	%rax,%r14
992	movq	(%rsi),%rax	/* rax = a[0] for the first loop pass */
993	movq	%rdx,%r15
994	adcq	$0,%r15
995
996	leaq	8(%rbp),%rbp	/* rbp -> b[1] */
997	leaq	8(%rdi),%rdi	/* rdi -> product limb 1 */
998
999	movl	$7,%ecx	/* seven more rows, for b[1]..b[7] */
1000	jmp	.Loop_mul
1001
1002.align	32
1003.Loop_mul:
/* One row per pass: add a * b[i] to the window r8..r15, emit the lowest limb
   at (rdi) and slide the window up by one limb.  rax = a[0] on entry. */
1004	movq	(%rbp),%rbx	/* rbx = b[i] */
1005	mulq	%rbx
1006	addq	%rax,%r8
1007	movq	8(%rsi),%rax
1008	movq	%r8,(%rdi)	/* finished product limb */
1009	movq	%rdx,%r8
1010	adcq	$0,%r8
1011
1012	mulq	%rbx
1013	addq	%rax,%r9
1014	movq	16(%rsi),%rax
1015	adcq	$0,%rdx
1016	addq	%r9,%r8
1017	movq	%rdx,%r9
1018	adcq	$0,%r9
1019
1020	mulq	%rbx
1021	addq	%rax,%r10
1022	movq	24(%rsi),%rax
1023	adcq	$0,%rdx
1024	addq	%r10,%r9
1025	movq	%rdx,%r10
1026	adcq	$0,%r10
1027
1028	mulq	%rbx
1029	addq	%rax,%r11
1030	movq	32(%rsi),%rax
1031	adcq	$0,%rdx
1032	addq	%r11,%r10
1033	movq	%rdx,%r11
1034	adcq	$0,%r11
1035
1036	mulq	%rbx
1037	addq	%rax,%r12
1038	movq	40(%rsi),%rax
1039	adcq	$0,%rdx
1040	addq	%r12,%r11
1041	movq	%rdx,%r12
1042	adcq	$0,%r12
1043
1044	mulq	%rbx
1045	addq	%rax,%r13
1046	movq	48(%rsi),%rax
1047	adcq	$0,%rdx
1048	addq	%r13,%r12
1049	movq	%rdx,%r13
1050	adcq	$0,%r13
1051
1052	mulq	%rbx
1053	addq	%rax,%r14
1054	movq	56(%rsi),%rax
1055	adcq	$0,%rdx
1056	addq	%r14,%r13
1057	movq	%rdx,%r14
1058	leaq	8(%rbp),%rbp	/* advance to the next b limb; lea keeps CF for the adcq */
1059	adcq	$0,%r14
1060
1061	mulq	%rbx
1062	addq	%rax,%r15
1063	movq	(%rsi),%rax	/* rax = a[0] again for the next pass */
1064	adcq	$0,%rdx
1065	addq	%r15,%r14
1066	movq	%rdx,%r15
1067	adcq	$0,%r15
1068
1069	leaq	8(%rdi),%rdi	/* next product limb */
1070
1071	decl	%ecx
1072	jnz	.Loop_mul
1073
/* Flush the window: product limbs 8..15. */
1074	movq	%r8,(%rdi)
1075	movq	%r9,8(%rdi)
1076	movq	%r10,16(%rdi)
1077	movq	%r11,24(%rdi)
1078	movq	%r12,32(%rdi)
1079	movq	%r13,40(%rdi)
1080	movq	%r14,48(%rdi)
1081	movq	%r15,56(%rdi)
1082
1083	.byte	0xf3,0xc3	/* encoded "rep ret" */
1084.size	__rsaz_512_mul,.-__rsaz_512_mul
/*
 * rsaz_512_scatter4: copy an 8-limb value into a table slot in scattered form.
 *
 * Register inputs, as used by the code below:
 *   rdi  table base
 *   rsi  source value, 8 limbs
 *   rdx  slot index; all 64 bits are used in the address computation
 *
 * Limb i is split in two: its low 32 bits go to table + 4*idx + 128*i and its
 * high 32 bits 64 bytes above that.  Overwrites rax, rsi, rdi, r9 and flags.
 */
1085.globl	rsaz_512_scatter4
1086.hidden rsaz_512_scatter4
1087.type	rsaz_512_scatter4,@function
1088.align	16
1089rsaz_512_scatter4:
1090	leaq	(%rdi,%rdx,4),%rdi	/* rdi -> slot for this index */
1091	movl	$8,%r9d	/* eight limbs to copy */
1092	jmp	.Loop_scatter
1093.align	16
1094.Loop_scatter:
1095	movq	(%rsi),%rax	/* next source limb */
1096	leaq	8(%rsi),%rsi
1097	movl	%eax,(%rdi)	/* low 32 bits */
1098	shrq	$32,%rax
1099	movl	%eax,64(%rdi)	/* high 32 bits */
1100	leaq	128(%rdi),%rdi	/* rows are 128 bytes apart */
1101	decl	%r9d
1102	jnz	.Loop_scatter
1103	.byte	0xf3,0xc3	/* encoded "rep ret" */
1104.size	rsaz_512_scatter4,.-rsaz_512_scatter4
1105
/*
 * rsaz_512_gather4: rebuild an 8-limb value from a scattered table slot.
 *
 * Register inputs, as used by the code below:
 *   rdi  destination buffer, 8 limbs
 *   rsi  table base
 *   rdx  slot index; all 64 bits are used in the address computation
 *
 * Limb i is read as two 32-bit halves, the low one from table + 4*idx + 128*i
 * and the high one 64 bytes above that - the layout rsaz_512_scatter4 writes.
 * Overwrites rax, rsi, rdi, r8, r9 and flags.
 *
 * NOTE(review): the load addresses depend directly on the index.  If the
 * index is secret this is a cache-timing channel (cf. CVE-2016-0702) -
 * confirm against upstream's masked gather.
 */
1106.globl	rsaz_512_gather4
1107.hidden rsaz_512_gather4
1108.type	rsaz_512_gather4,@function
1109.align	16
1110rsaz_512_gather4:
1111	leaq	(%rsi,%rdx,4),%rsi	/* rsi -> slot for this index */
1112	movl	$8,%r9d	/* eight limbs to rebuild */
1113	jmp	.Loop_gather
1114.align	16
1115.Loop_gather:
1116	movl	(%rsi),%eax	/* low 32 bits (upper half of rax is cleared) */
1117	movl	64(%rsi),%r8d	/* high 32 bits */
1118	leaq	128(%rsi),%rsi	/* rows are 128 bytes apart */
1119	shlq	$32,%r8
1120	orq	%r8,%rax	/* rax = complete limb */
1121	movq	%rax,(%rdi)
1122	leaq	8(%rdi),%rdi
1123	decl	%r9d
1124	jnz	.Loop_gather
1125	.byte	0xf3,0xc3	/* encoded "rep ret" */
1126.size	rsaz_512_gather4,.-rsaz_512_gather4
1127#endif
1128