1#if defined(__x86_64__)
2.text
3
4
5
6.globl	_bn_mul_mont
7.private_extern _bn_mul_mont
8
9.p2align	4
// _bn_mul_mont: entry point for Montgomery multiplication. It inspects the
// word count and the two multiplicand pointers and dispatches to one of
// three code paths: the generic path below (L$mul_enter), the 4x-unrolled
// path (L$mul4x_enter) or the squaring path (L$sqr8x_enter).
//
// Register use as seen in this code (System V AMD64 argument order):
//   %rdi  result pointer, written in L$sub and L$copy
//   %rsi  first multiplicand, read word by word
//   %rdx  second multiplicand, copied to %r12
//   %rcx  modulus
//   %r8   pointer to one word that is loaded once and used as the
//         per-iteration multiplier seed (called n0 in the comments below)
//   %r9d  number of 64-bit words (called num below)
// Returns 1 in %rax. %rbx, %rbp and %r12-%r15 are saved and restored.
// NOTE(review): presumably the C prototype is
// bn_mul_mont(rp, ap, bp, np, n0, num) -- confirm against the C header.
10_bn_mul_mont:
// num not a multiple of 4, or num < 8: use the generic path.
11	testl	$3,%r9d
12	jnz	L$mul_enter
13	cmpl	$8,%r9d
14	jb	L$mul_enter
// Different multiplicand pointers: use the 4x-unrolled multiply.
15	cmpq	%rsi,%rdx
16	jne	L$mul4x_enter
// Same pointer (squaring): use the 8x squaring path only when num is a
// multiple of 8, otherwise fall back to the 4x multiply.
17	testl	$7,%r9d
18	jz	L$sqr8x_enter
19	jmp	L$mul4x_enter
20
21.p2align	4
22L$mul_enter:
// Save the callee-saved registers this path uses.
23	pushq	%rbx
24	pushq	%rbp
25	pushq	%r12
26	pushq	%r13
27	pushq	%r14
28	pushq	%r15
29
// Zero-extend num, then carve out num+2 words below the saved registers and
// round %rsp down to a 1024-byte boundary. The scratch vector at (%rsp) is
// called tp below; tp[0..num] holds the running sum.
30	movl	%r9d,%r9d
31	leaq	2(%r9),%r10
32	movq	%rsp,%r11
33	negq	%r10
34	leaq	(%rsp,%r10,8),%rsp
35	andq	$-1024,%rsp
36
// tp[num+1] = stack pointer as it was right after the pushes.
37	movq	%r11,8(%rsp,%r9,8)
38L$mul_body:
// %r12 = b, %r8 = n0 value, %rbx = b[0], %rax = a[0].
39	movq	%rdx,%r12
40	movq	(%r8),%r8
41	movq	(%r12),%rbx
42	movq	(%rsi),%rax
43
// %r14 = outer index i, %r15 = inner index j.
44	xorq	%r14,%r14
45	xorq	%r15,%r15
46
// First pass (i = 0): tp is not read yet.
// %r10 = low(a[0]*b[0]), %r11 = high(a[0]*b[0]).
47	movq	%r8,%rbp
48	mulq	%rbx
49	movq	%rax,%r10
50	movq	(%rcx),%rax
51
// %rbp = m = n0 * low word (truncated to 64 bits).
52	imulq	%r10,%rbp
53	movq	%rdx,%r11
54
// n[0]*m is added to the low word; the sum in %r10 is never stored, only
// the carry into %rdx is kept (%r13).
55	mulq	%rbp
56	addq	%rax,%r10
57	movq	8(%rsi),%rax
58	adcq	$0,%rdx
59	movq	%rdx,%r13
60
61	leaq	1(%r15),%r15
62	jmp	L$1st_enter
63
64.p2align	4
65L$1st:
// Second half of an iteration: fold low(n[j]*m) and the a[j]*b[0] word in
// %r11 into %r13, then store it one slot lower than j (%r15 is already
// j+1 here, so -16(...) addresses tp[j-1]).
66	addq	%rax,%r13
67	movq	(%rsi,%r15,8),%rax
68	adcq	$0,%rdx
69	addq	%r11,%r13
70	movq	%r10,%r11
71	adcq	$0,%rdx
72	movq	%r13,-16(%rsp,%r15,8)
73	movq	%rdx,%r13
74
75L$1st_enter:
// First half: a[j]*b[0] accumulated into %r11 (high word to %r10), then
// n[j]*m is started; j is advanced in between.
76	mulq	%rbx
77	addq	%rax,%r11
78	movq	(%rcx,%r15,8),%rax
79	adcq	$0,%rdx
80	leaq	1(%r15),%r15
81	movq	%rdx,%r10
82
83	mulq	%rbp
84	cmpq	%r9,%r15
85	jne	L$1st
86
// Loop tail for the last word; %rax is reloaded with a[0] for the next pass.
87	addq	%rax,%r13
88	movq	(%rsi),%rax
89	adcq	$0,%rdx
90	addq	%r11,%r13
91	adcq	$0,%rdx
92	movq	%r13,-16(%rsp,%r15,8)
93	movq	%rdx,%r13
94	movq	%r10,%r11
95
// Top two words: tp[num-1] = sum of the two pending high words,
// tp[num] = carry out of that sum.
96	xorq	%rdx,%rdx
97	addq	%r11,%r13
98	adcq	$0,%rdx
99	movq	%r13,-8(%rsp,%r9,8)
100	movq	%rdx,(%rsp,%r9,8)
101
// i = 1; the jump skips any alignment padding.
102	leaq	1(%r14),%r14
103	jmp	L$outer
104.p2align	4
105L$outer:
// Outer pass i: same structure as the first pass, but the previous contents
// of tp are added in as well.
106	movq	(%r12,%r14,8),%rbx
107	xorq	%r15,%r15
108	movq	%r8,%rbp
109	movq	(%rsp),%r10
110	mulq	%rbx
111	addq	%rax,%r10
112	movq	(%rcx),%rax
113	adcq	$0,%rdx
114
// %rbp = m = n0 * (tp[0] + low(a[0]*b[i])), truncated to 64 bits.
115	imulq	%r10,%rbp
116	movq	%rdx,%r11
117
// As in the first pass, the low-word sum is discarded and %r10 is reloaded
// with tp[1].
118	mulq	%rbp
119	addq	%rax,%r10
120	movq	8(%rsi),%rax
121	adcq	$0,%rdx
122	movq	8(%rsp),%r10
123	movq	%rdx,%r13
124
125	leaq	1(%r15),%r15
126	jmp	L$inner_enter
127
128.p2align	4
129L$inner:
// Fold low(n[j]*m) and the a[j]*b[i]+tp[j] word (%r10) into %r13, preload
// the next tp word and store the result one slot lower.
130	addq	%rax,%r13
131	movq	(%rsi,%r15,8),%rax
132	adcq	$0,%rdx
133	addq	%r10,%r13
134	movq	(%rsp,%r15,8),%r10
135	adcq	$0,%rdx
136	movq	%r13,-16(%rsp,%r15,8)
137	movq	%rdx,%r13
138
139L$inner_enter:
// %r10 = tp[j] + low(a[j]*b[i]) + previous high word; the new high word
// plus the carry of that addition ends up in %r11.
140	mulq	%rbx
141	addq	%rax,%r11
142	movq	(%rcx,%r15,8),%rax
143	adcq	$0,%rdx
144	addq	%r11,%r10
145	movq	%rdx,%r11
146	adcq	$0,%r11
147	leaq	1(%r15),%r15
148
149	mulq	%rbp
150	cmpq	%r9,%r15
151	jne	L$inner
152
// Loop tail for the last word; %r10 picks up the old top word tp[num] and
// %rax is reloaded with a[0] for the next pass.
153	addq	%rax,%r13
154	movq	(%rsi),%rax
155	adcq	$0,%rdx
156	addq	%r10,%r13
157	movq	(%rsp,%r15,8),%r10
158	adcq	$0,%rdx
159	movq	%r13,-16(%rsp,%r15,8)
160	movq	%rdx,%r13
161
// tp[num-1] = pending high words + old tp[num]; tp[num] = carry out.
162	xorq	%rdx,%rdx
163	addq	%r11,%r13
164	adcq	$0,%rdx
165	addq	%r10,%r13
166	adcq	$0,%rdx
167	movq	%r13,-8(%rsp,%r9,8)
168	movq	%rdx,(%rsp,%r9,8)
169
170	leaq	1(%r14),%r14
171	cmpq	%r9,%r14
172	jb	L$outer
173
// Final subtraction: result = tp - n, written to the result buffer.
// xorq clears CF; mov, lea and dec leave CF untouched, so the borrow is
// carried through the whole loop by sbbq.
174	xorq	%r14,%r14
175	movq	(%rsp),%rax
176	leaq	(%rsp),%rsi
177	movq	%r9,%r15
178	jmp	L$sub
179.p2align	4
180L$sub:	sbbq	(%rcx,%r14,8),%rax
181	movq	%rax,(%rdi,%r14,8)
182	movq	8(%rsi,%r14,8),%rax
183	leaq	1(%r14),%r14
184	decq	%r15
185	jnz	L$sub
186
// %rax holds tp[num] here; subtracting the final borrow turns it into the
// select mask used by L$copy.
187	sbbq	$0,%rax
188	xorq	%r14,%r14
189	movq	%r9,%r15
190.p2align	4
191L$copy:
// Branch-free select: result[i] = ((tp[i] ^ result[i]) & mask) ^ result[i],
// i.e. bits set in the mask take tp[i], clear bits keep tp[i]-n[i].
// Each tp word is overwritten with the loop index after it has been read.
192	movq	(%rsp,%r14,8),%rsi
193	movq	(%rdi,%r14,8),%rcx
194	xorq	%rcx,%rsi
195	andq	%rax,%rsi
196	xorq	%rcx,%rsi
197	movq	%r14,(%rsp,%r14,8)
198	movq	%rsi,(%rdi,%r14,8)
199	leaq	1(%r14),%r14
200	subq	$1,%r15
201	jnz	L$copy
202
// Epilogue: fetch the saved stack pointer from tp[num+1], return 1, reload
// the six pushed registers and release the frame.
203	movq	8(%rsp,%r9,8),%rsi
204	movq	$1,%rax
205	movq	(%rsi),%r15
206	movq	8(%rsi),%r14
207	movq	16(%rsi),%r13
208	movq	24(%rsi),%r12
209	movq	32(%rsi),%rbp
210	movq	40(%rsi),%rbx
211	leaq	48(%rsi),%rsp
212L$mul_epilogue:
213	.byte	0xf3,0xc3	// encoding of "rep ret"
214
215
216.p2align	4
// bn_mul4x_mont: Montgomery multiplication with the inner loops unrolled to
// four words per iteration. Entered through L$mul4x_enter with the same
// register arguments as _bn_mul_mont:
//   %rdi result, %rsi first multiplicand (a), %rdx second multiplicand (b),
//   %rcx modulus (n), %r8 pointer to the n0 word, %r9d word count (num).
// The loop structure (first pass at four-word stride, inner-loop exit on
// index >= num) only works out when num is a multiple of 4.
// %rdi doubles as an accumulator inside the loops, so the result pointer is
// parked in the stack frame. Returns 1 in %rax.
217bn_mul4x_mont:
218L$mul4x_enter:
219	pushq	%rbx
220	pushq	%rbp
221	pushq	%r12
222	pushq	%r13
223	pushq	%r14
224	pushq	%r15
225
// Zero-extend num, reserve num+4 words and align %rsp down to 1024 bytes.
// The scratch vector at (%rsp) is called tp below.
226	movl	%r9d,%r9d
227	leaq	4(%r9),%r10
228	movq	%rsp,%r11
229	negq	%r10
230	leaq	(%rsp,%r10,8),%rsp
231	andq	$-1024,%rsp
232
// tp[num+1] = stack pointer as it was right after the pushes.
233	movq	%r11,8(%rsp,%r9,8)
234L$mul4x_body:
// tp[num+2] = result pointer; %r12 = b, %r8 = n0 value, %rbx = b[0],
// %rax = a[0].
235	movq	%rdi,16(%rsp,%r9,8)
236	movq	%rdx,%r12
237	movq	(%r8),%r8
238	movq	(%r12),%rbx
239	movq	(%rsi),%rax
240
// %r14 = outer index i, %r15 = inner index j.
241	xorq	%r14,%r14
242	xorq	%r15,%r15
243
// First pass (i = 0), words 0 and 1 handled ahead of the unrolled loop.
244	movq	%r8,%rbp
245	mulq	%rbx
246	movq	%rax,%r10
247	movq	(%rcx),%rax
248
// %rbp = m = n0 * low(a[0]*b[0]), truncated to 64 bits.
249	imulq	%r10,%rbp
250	movq	%rdx,%r11
251
// The low-word sum in %r10 is not stored; only its carry is kept.
252	mulq	%rbp
253	addq	%rax,%r10
254	movq	8(%rsi),%rax
255	adcq	$0,%rdx
256	movq	%rdx,%rdi
257
258	mulq	%rbx
259	addq	%rax,%r11
260	movq	8(%rcx),%rax
261	adcq	$0,%rdx
262	movq	%rdx,%r10
263
// tp[0] receives the word computed for index 1 (results are stored one slot
// lower than the index that produced them); j jumps to 4.
264	mulq	%rbp
265	addq	%rax,%rdi
266	movq	16(%rsi),%rax
267	adcq	$0,%rdx
268	addq	%r11,%rdi
269	leaq	4(%r15),%r15
270	adcq	$0,%rdx
271	movq	%rdi,(%rsp)
272	movq	%rdx,%r13
273	jmp	L$1st4x
274.p2align	4
275L$1st4x:
// Unrolled first-pass loop. Each quarter multiplies one a word by b[0]
// (accumulating in %r10/%r11 alternately) and one n word by m (accumulating
// in %r13/%rdi alternately), then stores one tp word.
276	mulq	%rbx
277	addq	%rax,%r10
278	movq	-16(%rcx,%r15,8),%rax
279	adcq	$0,%rdx
280	movq	%rdx,%r11
281
282	mulq	%rbp
283	addq	%rax,%r13
284	movq	-8(%rsi,%r15,8),%rax
285	adcq	$0,%rdx
286	addq	%r10,%r13
287	adcq	$0,%rdx
288	movq	%r13,-24(%rsp,%r15,8)
289	movq	%rdx,%rdi
290
291	mulq	%rbx
292	addq	%rax,%r11
293	movq	-8(%rcx,%r15,8),%rax
294	adcq	$0,%rdx
295	movq	%rdx,%r10
296
297	mulq	%rbp
298	addq	%rax,%rdi
299	movq	(%rsi,%r15,8),%rax
300	adcq	$0,%rdx
301	addq	%r11,%rdi
302	adcq	$0,%rdx
303	movq	%rdi,-16(%rsp,%r15,8)
304	movq	%rdx,%r13
305
306	mulq	%rbx
307	addq	%rax,%r10
308	movq	(%rcx,%r15,8),%rax
309	adcq	$0,%rdx
310	movq	%rdx,%r11
311
312	mulq	%rbp
313	addq	%rax,%r13
314	movq	8(%rsi,%r15,8),%rax
315	adcq	$0,%rdx
316	addq	%r10,%r13
317	adcq	$0,%rdx
318	movq	%r13,-8(%rsp,%r15,8)
319	movq	%rdx,%rdi
320
// j += 4 happens here, so the remaining offsets in this iteration are
// relative to the new j.
321	mulq	%rbx
322	addq	%rax,%r11
323	movq	8(%rcx,%r15,8),%rax
324	adcq	$0,%rdx
325	leaq	4(%r15),%r15
326	movq	%rdx,%r10
327
328	mulq	%rbp
329	addq	%rax,%rdi
330	movq	-16(%rsi,%r15,8),%rax
331	adcq	$0,%rdx
332	addq	%r11,%rdi
333	adcq	$0,%rdx
334	movq	%rdi,-32(%rsp,%r15,8)
335	movq	%rdx,%r13
336	cmpq	%r9,%r15
337	jb	L$1st4x
338
// First-pass tail: the last two words.
339	mulq	%rbx
340	addq	%rax,%r10
341	movq	-16(%rcx,%r15,8),%rax
342	adcq	$0,%rdx
343	movq	%rdx,%r11
344
345	mulq	%rbp
346	addq	%rax,%r13
347	movq	-8(%rsi,%r15,8),%rax
348	adcq	$0,%rdx
349	addq	%r10,%r13
350	adcq	$0,%rdx
351	movq	%r13,-24(%rsp,%r15,8)
352	movq	%rdx,%rdi
353
354	mulq	%rbx
355	addq	%rax,%r11
356	movq	-8(%rcx,%r15,8),%rax
357	adcq	$0,%rdx
358	movq	%rdx,%r10
359
// %rax is reloaded with a[0] for the next outer pass.
360	mulq	%rbp
361	addq	%rax,%rdi
362	movq	(%rsi),%rax
363	adcq	$0,%rdx
364	addq	%r11,%rdi
365	adcq	$0,%rdx
366	movq	%rdi,-16(%rsp,%r15,8)
367	movq	%rdx,%r13
368
// Top two words of the pass: the sum of the pending high words, and the
// carry out of that sum.
369	xorq	%rdi,%rdi
370	addq	%r10,%r13
371	adcq	$0,%rdi
372	movq	%r13,-8(%rsp,%r15,8)
373	movq	%rdi,(%rsp,%r15,8)
374
// i = 1.
375	leaq	1(%r14),%r14
376.p2align	2
377L$outer4x:
// Outer pass i: same shape as the first pass, with the previous tp contents
// added into every a[j]*b[i] word.
378	movq	(%r12,%r14,8),%rbx
379	xorq	%r15,%r15
380	movq	(%rsp),%r10
381	movq	%r8,%rbp
382	mulq	%rbx
383	addq	%rax,%r10
384	movq	(%rcx),%rax
385	adcq	$0,%rdx
386
// %rbp = m = n0 * (tp[0] + low(a[0]*b[i])), truncated to 64 bits.
387	imulq	%r10,%rbp
388	movq	%rdx,%r11
389
390	mulq	%rbp
391	addq	%rax,%r10
392	movq	8(%rsi),%rax
393	adcq	$0,%rdx
394	movq	%rdx,%rdi
395
396	mulq	%rbx
397	addq	%rax,%r11
398	movq	8(%rcx),%rax
399	adcq	$0,%rdx
400	addq	8(%rsp),%r11
401	adcq	$0,%rdx
402	movq	%rdx,%r10
403
404	mulq	%rbp
405	addq	%rax,%rdi
406	movq	16(%rsi),%rax
407	adcq	$0,%rdx
408	addq	%r11,%rdi
409	leaq	4(%r15),%r15
410	adcq	$0,%rdx
411	movq	%rdi,(%rsp)
412	movq	%rdx,%r13
413	jmp	L$inner4x
414.p2align	4
415L$inner4x:
// Unrolled inner loop: as L$1st4x, plus an add of the old tp word after
// each a[j]*b[i] product.
416	mulq	%rbx
417	addq	%rax,%r10
418	movq	-16(%rcx,%r15,8),%rax
419	adcq	$0,%rdx
420	addq	-16(%rsp,%r15,8),%r10
421	adcq	$0,%rdx
422	movq	%rdx,%r11
423
424	mulq	%rbp
425	addq	%rax,%r13
426	movq	-8(%rsi,%r15,8),%rax
427	adcq	$0,%rdx
428	addq	%r10,%r13
429	adcq	$0,%rdx
430	movq	%r13,-24(%rsp,%r15,8)
431	movq	%rdx,%rdi
432
433	mulq	%rbx
434	addq	%rax,%r11
435	movq	-8(%rcx,%r15,8),%rax
436	adcq	$0,%rdx
437	addq	-8(%rsp,%r15,8),%r11
438	adcq	$0,%rdx
439	movq	%rdx,%r10
440
441	mulq	%rbp
442	addq	%rax,%rdi
443	movq	(%rsi,%r15,8),%rax
444	adcq	$0,%rdx
445	addq	%r11,%rdi
446	adcq	$0,%rdx
447	movq	%rdi,-16(%rsp,%r15,8)
448	movq	%rdx,%r13
449
450	mulq	%rbx
451	addq	%rax,%r10
452	movq	(%rcx,%r15,8),%rax
453	adcq	$0,%rdx
454	addq	(%rsp,%r15,8),%r10
455	adcq	$0,%rdx
456	movq	%rdx,%r11
457
458	mulq	%rbp
459	addq	%rax,%r13
460	movq	8(%rsi,%r15,8),%rax
461	adcq	$0,%rdx
462	addq	%r10,%r13
463	adcq	$0,%rdx
464	movq	%r13,-8(%rsp,%r15,8)
465	movq	%rdx,%rdi
466
// j += 4 happens here; later offsets in this iteration use the new j.
467	mulq	%rbx
468	addq	%rax,%r11
469	movq	8(%rcx,%r15,8),%rax
470	adcq	$0,%rdx
471	addq	8(%rsp,%r15,8),%r11
472	adcq	$0,%rdx
473	leaq	4(%r15),%r15
474	movq	%rdx,%r10
475
476	mulq	%rbp
477	addq	%rax,%rdi
478	movq	-16(%rsi,%r15,8),%rax
479	adcq	$0,%rdx
480	addq	%r11,%rdi
481	adcq	$0,%rdx
482	movq	%rdi,-32(%rsp,%r15,8)
483	movq	%rdx,%r13
484	cmpq	%r9,%r15
485	jb	L$inner4x
486
// Inner-loop tail: the last two words of this pass.
487	mulq	%rbx
488	addq	%rax,%r10
489	movq	-16(%rcx,%r15,8),%rax
490	adcq	$0,%rdx
491	addq	-16(%rsp,%r15,8),%r10
492	adcq	$0,%rdx
493	movq	%rdx,%r11
494
495	mulq	%rbp
496	addq	%rax,%r13
497	movq	-8(%rsi,%r15,8),%rax
498	adcq	$0,%rdx
499	addq	%r10,%r13
500	adcq	$0,%rdx
501	movq	%r13,-24(%rsp,%r15,8)
502	movq	%rdx,%rdi
503
// i++ is interleaved here (lea does not touch the flags).
504	mulq	%rbx
505	addq	%rax,%r11
506	movq	-8(%rcx,%r15,8),%rax
507	adcq	$0,%rdx
508	addq	-8(%rsp,%r15,8),%r11
509	adcq	$0,%rdx
510	leaq	1(%r14),%r14
511	movq	%rdx,%r10
512
// %rax is reloaded with a[0] for the next outer pass.
513	mulq	%rbp
514	addq	%rax,%rdi
515	movq	(%rsi),%rax
516	adcq	$0,%rdx
517	addq	%r11,%rdi
518	adcq	$0,%rdx
519	movq	%rdi,-16(%rsp,%r15,8)
520	movq	%rdx,%r13
521
// Top two words: pending high words plus the old tp[num]; the carries are
// collected in %rdi and become the new tp[num].
522	xorq	%rdi,%rdi
523	addq	%r10,%r13
524	adcq	$0,%rdi
525	addq	(%rsp,%r9,8),%r13
526	adcq	$0,%rdi
527	movq	%r13,-8(%rsp,%r15,8)
528	movq	%rdi,(%rsp,%r15,8)
529
530	cmpq	%r9,%r14
531	jb	L$outer4x
// Final subtraction, four words per iteration. %rdi = saved result pointer,
// %r9 = num/4, %rsi = tp. subq starts the borrow chain; everything between
// the sbbq instructions (mov, lea, dec) leaves CF untouched.
532	movq	16(%rsp,%r9,8),%rdi
533	movq	0(%rsp),%rax
534	movq	8(%rsp),%rdx
535	shrq	$2,%r9
536	leaq	(%rsp),%rsi
537	xorq	%r14,%r14
538
539	subq	0(%rcx),%rax
540	movq	16(%rsi),%rbx
541	movq	24(%rsi),%rbp
542	sbbq	8(%rcx),%rdx
// %r15 = num/4 - 1 loop iterations; the last group is finished after the
// loop.
543	leaq	-1(%r9),%r15
544	jmp	L$sub4x
545.p2align	4
546L$sub4x:
547	movq	%rax,0(%rdi,%r14,8)
548	movq	%rdx,8(%rdi,%r14,8)
549	sbbq	16(%rcx,%r14,8),%rbx
550	movq	32(%rsi,%r14,8),%rax
551	movq	40(%rsi,%r14,8),%rdx
552	sbbq	24(%rcx,%r14,8),%rbp
553	movq	%rbx,16(%rdi,%r14,8)
554	movq	%rbp,24(%rdi,%r14,8)
555	sbbq	32(%rcx,%r14,8),%rax
556	movq	48(%rsi,%r14,8),%rbx
557	movq	56(%rsi,%r14,8),%rbp
558	sbbq	40(%rcx,%r14,8),%rdx
559	leaq	4(%r14),%r14
560	decq	%r15
561	jnz	L$sub4x
562
// Last group. %r14 = num-4 here, so 32(%rsi,%r14,8) is tp[num].
563	movq	%rax,0(%rdi,%r14,8)
564	movq	32(%rsi,%r14,8),%rax
565	sbbq	16(%rcx,%r14,8),%rbx
566	movq	%rdx,8(%rdi,%r14,8)
567	sbbq	24(%rcx,%r14,8),%rbp
568	movq	%rbx,16(%rdi,%r14,8)
569
// %rax = tp[num] minus the final borrow = select mask, broadcast to both
// halves of %xmm0.
570	sbbq	$0,%rax
571	movq	%rax,%xmm0
572	punpcklqdq	%xmm0,%xmm0
573	movq	%rbp,24(%rdi,%r14,8)
574	xorq	%r14,%r14
575
// %r15 = num/4 iterations of 32 bytes each; %xmm5 = 0 for wiping tp.
576	movq	%r9,%r15
577	pxor	%xmm5,%xmm5
578	jmp	L$copy4x
579.p2align	4
580L$copy4x:
// Branch-free select, 32 bytes at a time:
// result = ((tp ^ result) & mask) ^ result. tp is zeroed after it is read;
// the aligned stores to tp rely on %rsp being 1024-byte aligned and the
// byte offset in %r14 being a multiple of 32.
581	movdqu	(%rsp,%r14,1),%xmm2
582	movdqu	16(%rsp,%r14,1),%xmm4
583	movdqu	(%rdi,%r14,1),%xmm1
584	movdqu	16(%rdi,%r14,1),%xmm3
585	pxor	%xmm1,%xmm2
586	pxor	%xmm3,%xmm4
587	pand	%xmm0,%xmm2
588	pand	%xmm0,%xmm4
589	pxor	%xmm1,%xmm2
590	pxor	%xmm3,%xmm4
591	movdqu	%xmm2,(%rdi,%r14,1)
592	movdqu	%xmm4,16(%rdi,%r14,1)
593	movdqa	%xmm5,(%rsp,%r14,1)
594	movdqa	%xmm5,16(%rsp,%r14,1)
595
596	leaq	32(%r14),%r14
597	decq	%r15
598	jnz	L$copy4x
599
// Epilogue: restore num in %r9, fetch the saved stack pointer from
// tp[num+1], return 1, reload the pushed registers and release the frame.
600	shlq	$2,%r9
601	movq	8(%rsp,%r9,8),%rsi
602	movq	$1,%rax
603	movq	(%rsi),%r15
604	movq	8(%rsi),%r14
605	movq	16(%rsi),%r13
606	movq	24(%rsi),%r12
607	movq	32(%rsi),%rbp
608	movq	40(%rsi),%rbx
609	leaq	48(%rsi),%rsp
610L$mul4x_epilogue:
611	.byte	0xf3,0xc3	// encoding of "rep ret"
612
613
614
615
616.p2align	5
// bn_sqr8x_mont: wrapper for the Montgomery squaring path. It builds a stack
// frame, copies the modulus into the frame, hands the real work to
// _bn_sqr8x_internal (defined outside this view), then wipes the frame and
// returns 1 in %rax.
// Register arguments on entry are the same as for _bn_mul_mont:
//   %rdi result, %rsi input (a), %rcx modulus (n), %r8 pointer to the n0
//   word, %r9d word count (num). %rdx is not read on this path.
// NOTE(review): what _bn_sqr8x_internal reads, writes and preserves cannot
// be verified from this file section; comments about it below are hedged.
617bn_sqr8x_mont:
618L$sqr8x_enter:
// %rax = stack pointer before the pushes; it is stored in the frame and used
// by the epilogue.
619	movq	%rsp,%rax
620	pushq	%rbx
621	pushq	%rbp
622	pushq	%r12
623	pushq	%r13
624	pushq	%r14
625	pushq	%r15
626
// %r10 = 32*num, %r9 = -(8*num). The 32-bit shift also zero-extends %r9.
627	movl	%r9d,%r10d
628	shll	$3,%r9d
629	shlq	$3+2,%r10
630	negq	%r9
631
632
633
634
635
636
// Choose where to put a frame of 64 + 32*num bytes. %r11 is the distance
// from a to the tentative frame base, modulo 4096; the frame is pushed
// further down depending on how that distance compares with 32*num.
// NOTE(review): presumably this keeps the frame from aliasing the input
// modulo 4096 -- confirm against the generator script.
637	leaq	-64(%rsp,%r9,4),%r11
638	movq	(%r8),%r8
639	subq	%rsi,%r11
640	andq	$4095,%r11
641	cmpq	%r11,%r10
642	jb	L$sqr8x_sp_alt
643	subq	%r11,%rsp
644	leaq	-64(%rsp,%r9,4),%rsp
645	jmp	L$sqr8x_sp_done
646
647.p2align	5
648L$sqr8x_sp_alt:
// Alternative placement: lower %rsp by max(0, %r11 - (4032 - 32*num)) extra
// bytes. "movq $0" is used for zeroing because it leaves CF from the subq
// intact for cmovc.
649	leaq	4096-64(,%r9,4),%r10
650	leaq	-64(%rsp,%r9,4),%rsp
651	subq	%r10,%r11
652	movq	$0,%r10
653	cmovcq	%r10,%r11
654	subq	%r11,%rsp
655L$sqr8x_sp_done:
// Align the frame to 64 bytes. %r10 = -(8*num), %r9 = 8*num.
656	andq	$-64,%rsp
657	movq	%r9,%r10
658	negq	%r9
659
// %r11 = rsp + 64 + 16*num, destination of the modulus copy.
// 32(%rsp) = n0 value, 40(%rsp) = stack pointer from before the pushes.
660	leaq	64(%rsp,%r9,2),%r11
661	movq	%r8,32(%rsp)
662	movq	%rax,40(%rsp)
663L$sqr8x_body:
664
// %rbp = num/4 copy iterations.
665	movq	%r9,%rbp
// Hand-encoded "movq %r11,%xmm2": keeps the modulus-copy address in %xmm2.
666.byte	102,73,15,110,211
667	shrq	$3+2,%rbp
// Loads the third dword of the CPU capability vector. %eax is not consumed
// in the visible code before the call.
// NOTE(review): confirm whether _bn_sqr8x_internal reads it.
668	movl	_OPENSSL_ia32cap_P+8(%rip),%eax
669	jmp	L$sqr8x_copy_n
670
671.p2align	5
672L$sqr8x_copy_n:
// Copy the modulus four words per iteration. Each 64-bit word is loaded into
// the low half of an XMM register (upper half zeroed by movq) and stored as
// a full 16 bytes, so every word occupies a 16-byte slot in the frame.
673	movq	0(%rcx),%xmm0
674	movq	8(%rcx),%xmm1
675	movq	16(%rcx),%xmm3
676	movq	24(%rcx),%xmm4
677	leaq	32(%rcx),%rcx
678	movdqa	%xmm0,0(%r11)
679	movdqa	%xmm1,16(%r11)
680	movdqa	%xmm3,32(%r11)
681	movdqa	%xmm4,48(%r11)
682	leaq	64(%r11),%r11
683	decq	%rbp
684	jnz	L$sqr8x_copy_n
685
// Set up XMM values for the callee: %xmm0 = 0, then hand-encoded
// "movq %rdi,%xmm1" (result pointer) and "movq %r10,%xmm3" (-(8*num)).
686	pxor	%xmm0,%xmm0
687.byte	102,72,15,110,207
688.byte	102,73,15,110,218
689	call	_bn_sqr8x_internal
690
// Wipe the frame: %rax walks up from rsp+48 and %rdx from rsp+64+2*%r9,
// 64 bytes each per iteration, for %r9/32 iterations.
// NOTE(review): this relies on %r9 holding 8*num after the call -- confirm
// against the callee.
691	pxor	%xmm0,%xmm0
692	leaq	48(%rsp),%rax
693	leaq	64(%rsp,%r9,2),%rdx
694	shrq	$3+2,%r9
// %rsi = stack pointer from before the pushes, saved at 40(%rsp).
695	movq	40(%rsp),%rsi
696	jmp	L$sqr8x_zero
697
698.p2align	5
699L$sqr8x_zero:
700	movdqa	%xmm0,0(%rax)
701	movdqa	%xmm0,16(%rax)
702	movdqa	%xmm0,32(%rax)
703	movdqa	%xmm0,48(%rax)
704	leaq	64(%rax),%rax
705	movdqa	%xmm0,0(%rdx)
706	movdqa	%xmm0,16(%rdx)
707	movdqa	%xmm0,32(%rdx)
708	movdqa	%xmm0,48(%rdx)
709	leaq	64(%rdx),%rdx
710	decq	%r9
711	jnz	L$sqr8x_zero
712
// Epilogue: return 1, reload the six pushed registers from just below the
// original stack pointer and restore %rsp.
713	movq	$1,%rax
714	movq	-48(%rsi),%r15
715	movq	-40(%rsi),%r14
716	movq	-32(%rsi),%r13
717	movq	-24(%rsi),%r12
718	movq	-16(%rsi),%rbp
719	movq	-8(%rsi),%rbx
720	leaq	(%rsi),%rsp
721L$sqr8x_epilogue:
722	.byte	0xf3,0xc3	// encoding of "rep ret"
723
724.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
725.p2align	4
726#endif
727