1#if defined(__x86_64__)
2.text
3
4.extern	OPENSSL_ia32cap_P
5.hidden OPENSSL_ia32cap_P
6
/*
 * bn_mul_mont -- public entry point for Montgomery multiplication (the
 * identification string emitted near the end of this file reads
 * "Montgomery Multiplication for x86_64").
 *
 * Registers as the code below consumes them (System V AMD64 argument order;
 * presumably the C prototype is
 *   int bn_mul_mont(rp, ap, bp, np, n0, num)  -- confirm against the header):
 *   %rdi  rp   result vector, written by .Lsub / .Lcopy
 *   %rsi  ap   first operand vector
 *   %rdx  bp   second operand vector
 *   %rcx  np   modulus vector
 *   %r8   n0   pointer; only the single word at (%r8) is loaded
 *   %r9d  num  number of 64-bit words in each vector
 * Every path visible here returns 1 in %rax.
 *
 * The first instructions only dispatch on num and on ap == bp; the generic
 * one-word-per-step implementation starts at .Lmul_enter.  It keeps a
 * (num + 2)-word scratch vector tp[] on the stack: tp[0..num] is the running
 * accumulator and tp[num+1] holds the stack pointer to restore on exit.
 */
7.globl	bn_mul_mont
8.hidden bn_mul_mont
9.type	bn_mul_mont,@function
10.align	16
11bn_mul_mont:
12	testl	$3,%r9d	/* num not a multiple of 4 ... */
13	jnz	.Lmul_enter	/* ... -> generic path */
14	cmpl	$8,%r9d
15	jb	.Lmul_enter	/* num < 8 -> generic path */
16	cmpq	%rsi,%rdx
17	jne	.Lmul4x_enter	/* bp != ap -> 4x-unrolled multiply */
18	testl	$7,%r9d
19	jz	.Lsqr8x_enter	/* bp == ap and num % 8 == 0 -> squaring path */
20	jmp	.Lmul4x_enter	/* bp == ap but num % 8 != 0 -> 4x multiply */
21
22.align	16
23.Lmul_enter:
24	pushq	%rbx	/* save the six callee-saved registers used below */
25	pushq	%rbp
26	pushq	%r12
27	pushq	%r13
28	pushq	%r14
29	pushq	%r15
30
31	movl	%r9d,%r9d	/* zero-extend num to 64 bits */
32	leaq	2(%r9),%r10	/* r10 = num + 2 scratch words */
33	movq	%rsp,%r11	/* r11 = stack pointer after the pushes */
34	negq	%r10
35	leaq	(%rsp,%r10,8),%rsp	/* reserve (num + 2) * 8 bytes for tp[] */
36	andq	$-1024,%rsp	/* round tp down to a 1024-byte boundary */
37
38	movq	%r11,8(%rsp,%r9,8)	/* tp[num+1] = stack pointer to restore */
39.Lmul_body:
40	movq	%rdx,%r12	/* r12 = bp (mulq overwrites rdx) */
41	movq	(%r8),%r8	/* r8 = the n0 word itself */
42	movq	(%r12),%rbx	/* rbx = bp[0] */
43	movq	(%rsi),%rax	/* rax = ap[0] */
44
45	xorq	%r14,%r14	/* r14 = i, index into bp[] (outer loop) */
46	xorq	%r15,%r15	/* r15 = j, index into ap[] / np[] (inner loop) */
47
48	movq	%r8,%rbp
49	mulq	%rbx	/* rdx:rax = ap[0] * bp[0] */
50	movq	%rax,%r10
51	movq	(%rcx),%rax	/* rax = np[0] */
52
53	imulq	%r10,%rbp	/* rbp = m = low 64 bits of n0 * (low word of product) */
54	movq	%rdx,%r11	/* r11 = high word of ap[0] * bp[0] */
55
56	mulq	%rbp	/* rdx:rax = np[0] * m */
57	addq	%rax,%r10	/* this sum is never stored; only its carry is kept */
58	movq	8(%rsi),%rax	/* rax = ap[1] */
59	adcq	$0,%rdx
60	movq	%rdx,%r13	/* r13 = carry word of the np[] * m chain */
61
62	leaq	1(%r15),%r15	/* j = 1 */
63	jmp	.L1st_enter
64
/*
 * First pass (i = 0): tp[] = (ap[] * bp[0] + np[] * m) shifted down one word.
 * The loop is software-pipelined: .L1st_enter issues both multiplies for
 * column j, and the top of .L1st on the next trip folds them and stores
 * tp[j-1].  r11/r10 carry the ap*bp chain, r13 the np*m chain.
 */
65.align	16
66.L1st:
67	addq	%rax,%r13	/* r13 += low word of np[j] * m */
68	movq	(%rsi,%r15,8),%rax	/* rax = next ap[] word (r15 is already j + 1) */
69	adcq	$0,%rdx
70	addq	%r11,%r13	/* fold in the ap[] * bp[0] partial sum */
71	movq	%r10,%r11
72	adcq	$0,%rdx
73	movq	%r13,-16(%rsp,%r15,8)	/* tp[j-1] = finished column */
74	movq	%rdx,%r13
75
76.L1st_enter:
77	mulq	%rbx	/* rdx:rax = ap[j] * bp[0] */
78	addq	%rax,%r11
79	movq	(%rcx,%r15,8),%rax	/* rax = np[j] */
80	adcq	$0,%rdx
81	leaq	1(%r15),%r15	/* j++ */
82	movq	%rdx,%r10
83
84	mulq	%rbp	/* rdx:rax = np[j] * m */
85	cmpq	%r9,%r15
86	jne	.L1st
87
88	addq	%rax,%r13	/* drain the pipeline: last column */
89	movq	(%rsi),%rax	/* rax = ap[0], preloaded for the outer loop */
90	adcq	$0,%rdx
91	addq	%r11,%r13
92	adcq	$0,%rdx
93	movq	%r13,-16(%rsp,%r15,8)	/* tp[num-2] */
94	movq	%rdx,%r13
95	movq	%r10,%r11
96
97	xorq	%rdx,%rdx
98	addq	%r11,%r13	/* add the two remaining carry words */
99	adcq	$0,%rdx
100	movq	%r13,-8(%rsp,%r9,8)	/* tp[num-1] */
101	movq	%rdx,(%rsp,%r9,8)	/* tp[num] = top carry */
102
103	leaq	1(%r14),%r14	/* i = 1 */
104	jmp	.Louter
/*
 * Remaining passes (i = 1 .. num-1):
 *   tp[] = (tp[] + ap[] * bp[i] + np[] * m) shifted down one word,
 * with m recomputed for each i from the low word of tp[0] + ap[0] * bp[i].
 */
105.align	16
106.Louter:
107	movq	(%r12,%r14,8),%rbx	/* rbx = bp[i] */
108	xorq	%r15,%r15	/* j = 0 */
109	movq	%r8,%rbp
110	movq	(%rsp),%r10	/* r10 = tp[0] */
111	mulq	%rbx	/* rdx:rax = ap[0] * bp[i] */
112	addq	%rax,%r10
113	movq	(%rcx),%rax	/* rax = np[0] */
114	adcq	$0,%rdx
115
116	imulq	%r10,%rbp	/* rbp = m = low 64 bits of n0 * r10 */
117	movq	%rdx,%r11
118
119	mulq	%rbp	/* rdx:rax = np[0] * m */
120	addq	%rax,%r10	/* sum is dropped (r10 is reloaded below); carry kept */
121	movq	8(%rsi),%rax	/* rax = ap[1] */
122	adcq	$0,%rdx
123	movq	8(%rsp),%r10	/* r10 = tp[1] */
124	movq	%rdx,%r13
125
126	leaq	1(%r15),%r15	/* j = 1 */
127	jmp	.Linner_enter
128
129.align	16
130.Linner:
131	addq	%rax,%r13	/* r13 += low word of np[j] * m */
132	movq	(%rsi,%r15,8),%rax	/* rax = next ap[] word */
133	adcq	$0,%rdx
134	addq	%r10,%r13	/* fold in tp[j] + ap[j] * bp[i] */
135	movq	(%rsp,%r15,8),%r10	/* r10 = next tp[] word */
136	adcq	$0,%rdx
137	movq	%r13,-16(%rsp,%r15,8)	/* tp[j-1] = finished column */
138	movq	%rdx,%r13
139
140.Linner_enter:
141	mulq	%rbx	/* rdx:rax = ap[j] * bp[i] */
142	addq	%rax,%r11
143	movq	(%rcx,%r15,8),%rax	/* rax = np[j] */
144	adcq	$0,%rdx
145	addq	%r11,%r10	/* r10 = tp[j] + partial product */
146	movq	%rdx,%r11
147	adcq	$0,%r11	/* r11 = carry into the next column */
148	leaq	1(%r15),%r15	/* j++ */
149
150	mulq	%rbp	/* rdx:rax = np[j] * m */
151	cmpq	%r9,%r15
152	jne	.Linner
153
154	addq	%rax,%r13	/* drain the pipeline: last column */
155	movq	(%rsi),%rax	/* rax = ap[0], preloaded for the next outer pass */
156	adcq	$0,%rdx
157	addq	%r10,%r13
158	movq	(%rsp,%r15,8),%r10	/* r10 = tp[num], previous top carry */
159	adcq	$0,%rdx
160	movq	%r13,-16(%rsp,%r15,8)	/* tp[num-2] */
161	movq	%rdx,%r13
162
163	xorq	%rdx,%rdx
164	addq	%r11,%r13
165	adcq	$0,%rdx
166	addq	%r10,%r13	/* include the previous top carry */
167	adcq	$0,%rdx
168	movq	%r13,-8(%rsp,%r9,8)	/* tp[num-1] */
169	movq	%rdx,(%rsp,%r9,8)	/* tp[num] = new top carry */
170
171	leaq	1(%r14),%r14	/* i++ */
172	cmpq	%r9,%r14
173	jb	.Louter
174
/*
 * Final reduction: rp[] = tp[] - np[] is computed unconditionally, then
 * .Lcopy picks tp[] or the difference with a mask instead of a branch.
 */
175	xorq	%r14,%r14	/* i = 0; xor also clears CF for the sbb chain */
176	movq	(%rsp),%rax	/* rax = tp[0] */
177	leaq	(%rsp),%rsi	/* rsi = tp (ap is no longer needed) */
178	movq	%r9,%r15	/* r15 = words remaining */
179	jmp	.Lsub
180.align	16
181.Lsub:	sbbq	(%rcx,%r14,8),%rax	/* rax = tp[i] - np[i] - borrow */
182	movq	%rax,(%rdi,%r14,8)	/* rp[i] = difference */
183	movq	8(%rsi,%r14,8),%rax	/* rax = tp[i+1] */
184	leaq	1(%r14),%r14	/* i++ with lea, which leaves CF alone */
185	decq	%r15	/* dec does not modify CF either */
186	jnz	.Lsub
187
188	sbbq	$0,%rax	/* rax = tp[num] - final borrow: the select mask */
189	xorq	%r14,%r14
190	movq	%r9,%r15
191.align	16
192.Lcopy:
193	movq	(%rsp,%r14,8),%rsi	/* rsi = tp[i] */
194	movq	(%rdi,%r14,8),%rcx	/* rcx = rp[i] = tp[i] - np[i] */
195	xorq	%rcx,%rsi
196	andq	%rax,%rsi
197	xorq	%rcx,%rsi	/* all-ones mask -> tp[i]; zero mask -> rp[i] */
198	movq	%r14,(%rsp,%r14,8)	/* overwrite the scratch word with the loop index */
199	movq	%rsi,(%rdi,%r14,8)	/* rp[i] = selected word */
200	leaq	1(%r14),%r14
201	subq	$1,%r15
202	jnz	.Lcopy
203
204	movq	8(%rsp,%r9,8),%rsi	/* rsi = stack pointer saved in tp[num+1] */
205	movq	$1,%rax	/* return value 1 */
206	movq	(%rsi),%r15	/* reload callee-saved registers in reverse push order */
207	movq	8(%rsi),%r14
208	movq	16(%rsi),%r13
209	movq	24(%rsi),%r12
210	movq	32(%rsi),%rbp
211	movq	40(%rsi),%rbx
212	leaq	48(%rsi),%rsp	/* drop the scratch area and the six saved slots */
213.Lmul_epilogue:
214	.byte	0xf3,0xc3	/* bytes F3 C3: "rep ret" */
215.size	bn_mul_mont,.-bn_mul_mont
/*
 * bn_mul4x_mont -- Montgomery multiplication with the inner loops unrolled
 * four words per trip.  In the visible code it is reached only through
 * .Lmul4x_enter from bn_mul_mont's dispatcher, which sends num here only when
 * num is a multiple of 4 and at least 8; the loop bounds below depend on that.
 *
 * Incoming registers are the same as for bn_mul_mont:
 *   %rdi rp, %rsi ap, %rdx bp, %rcx np, %r8 pointer to n0, %r9d num.
 * Returns 1 in %rax.
 *
 * Stack scratch is (num + 4) words, 1024-byte aligned:
 *   tp[0..num]  accumulator and top carry
 *   tp[num+1]   stack pointer to restore on exit
 *   tp[num+2]   saved rp, because %rdi doubles as an accumulator register
 *
 * Working registers: rbx = bp[i], rbp = m, r14 = i, r15 = j.
 * r10/r11 alternate as the ap*bp carry chain and rdi/r13 alternate as the
 * np*m chain from one column to the next.
 */
216.type	bn_mul4x_mont,@function
217.align	16
218bn_mul4x_mont:
219.Lmul4x_enter:
220	pushq	%rbx	/* save the six callee-saved registers used below */
221	pushq	%rbp
222	pushq	%r12
223	pushq	%r13
224	pushq	%r14
225	pushq	%r15
226
227	movl	%r9d,%r9d	/* zero-extend num to 64 bits */
228	leaq	4(%r9),%r10	/* r10 = num + 4 scratch words */
229	movq	%rsp,%r11	/* r11 = stack pointer after the pushes */
230	negq	%r10
231	leaq	(%rsp,%r10,8),%rsp	/* reserve (num + 4) * 8 bytes for tp[] */
232	andq	$-1024,%rsp	/* round tp down to a 1024-byte boundary */
233
234	movq	%r11,8(%rsp,%r9,8)	/* tp[num+1] = stack pointer to restore */
235.Lmul4x_body:
236	movq	%rdi,16(%rsp,%r9,8)	/* tp[num+2] = rp; rdi becomes scratch */
237	movq	%rdx,%r12	/* r12 = bp (mulq overwrites rdx) */
238	movq	(%r8),%r8	/* r8 = the n0 word itself */
239	movq	(%r12),%rbx	/* rbx = bp[0] */
240	movq	(%rsi),%rax	/* rax = ap[0] */
241
242	xorq	%r14,%r14	/* i = 0 */
243	xorq	%r15,%r15	/* j = 0 */
244
245	movq	%r8,%rbp
246	mulq	%rbx	/* rdx:rax = ap[0] * bp[0] */
247	movq	%rax,%r10
248	movq	(%rcx),%rax	/* rax = np[0] */
249
250	imulq	%r10,%rbp	/* rbp = m = low 64 bits of n0 * (low word of product) */
251	movq	%rdx,%r11
252
253	mulq	%rbp	/* rdx:rax = np[0] * m */
254	addq	%rax,%r10	/* sum itself is never stored; only the carry is kept */
255	movq	8(%rsi),%rax	/* rax = ap[1] */
256	adcq	$0,%rdx
257	movq	%rdx,%rdi
258
259	mulq	%rbx	/* rdx:rax = ap[1] * bp[0] */
260	addq	%rax,%r11
261	movq	8(%rcx),%rax	/* rax = np[1] */
262	adcq	$0,%rdx
263	movq	%rdx,%r10
264
265	mulq	%rbp	/* rdx:rax = np[1] * m */
266	addq	%rax,%rdi
267	movq	16(%rsi),%rax	/* rax = ap[2] */
268	adcq	$0,%rdx
269	addq	%r11,%rdi
270	leaq	4(%r15),%r15	/* j = 4 */
271	adcq	$0,%rdx
272	movq	%rdi,(%rsp)	/* tp[0] */
273	movq	%rdx,%r13
274	jmp	.L1st4x
/*
 * First pass (i = 0), four columns per trip.  With r15 = j on entry the
 * stores below fill tp[j-3], tp[j-2], tp[j-1] and tp[j]; each column is
 * low(ap[k] * bp[0]) + low(np[k] * m) + the carries from the previous column.
 */
275.align	16
276.L1st4x:
277	mulq	%rbx	/* ap[j-2] * bp[0] */
278	addq	%rax,%r10
279	movq	-16(%rcx,%r15,8),%rax	/* rax = np[j-2] */
280	adcq	$0,%rdx
281	movq	%rdx,%r11
282
283	mulq	%rbp	/* np[j-2] * m */
284	addq	%rax,%r13
285	movq	-8(%rsi,%r15,8),%rax	/* rax = ap[j-1] */
286	adcq	$0,%rdx
287	addq	%r10,%r13
288	adcq	$0,%rdx
289	movq	%r13,-24(%rsp,%r15,8)	/* tp[j-3] */
290	movq	%rdx,%rdi
291
292	mulq	%rbx	/* ap[j-1] * bp[0] */
293	addq	%rax,%r11
294	movq	-8(%rcx,%r15,8),%rax	/* rax = np[j-1] */
295	adcq	$0,%rdx
296	movq	%rdx,%r10
297
298	mulq	%rbp	/* np[j-1] * m */
299	addq	%rax,%rdi
300	movq	(%rsi,%r15,8),%rax	/* rax = ap[j] */
301	adcq	$0,%rdx
302	addq	%r11,%rdi
303	adcq	$0,%rdx
304	movq	%rdi,-16(%rsp,%r15,8)	/* tp[j-2] */
305	movq	%rdx,%r13
306
307	mulq	%rbx	/* ap[j] * bp[0] */
308	addq	%rax,%r10
309	movq	(%rcx,%r15,8),%rax	/* rax = np[j] */
310	adcq	$0,%rdx
311	movq	%rdx,%r11
312
313	mulq	%rbp	/* np[j] * m */
314	addq	%rax,%r13
315	movq	8(%rsi,%r15,8),%rax	/* rax = ap[j+1] */
316	adcq	$0,%rdx
317	addq	%r10,%r13
318	adcq	$0,%rdx
319	movq	%r13,-8(%rsp,%r15,8)	/* tp[j-1] */
320	movq	%rdx,%rdi
321
322	mulq	%rbx	/* ap[j+1] * bp[0] */
323	addq	%rax,%r11
324	movq	8(%rcx,%r15,8),%rax	/* rax = np[j+1] */
325	adcq	$0,%rdx
326	leaq	4(%r15),%r15	/* j += 4 */
327	movq	%rdx,%r10
328
329	mulq	%rbp	/* np[j-3] * m, in terms of the advanced j */
330	addq	%rax,%rdi
331	movq	-16(%rsi,%r15,8),%rax	/* rax = ap[j-2] for the next trip or the tail */
332	adcq	$0,%rdx
333	addq	%r11,%rdi
334	adcq	$0,%rdx
335	movq	%rdi,-32(%rsp,%r15,8)	/* tp[j-4], i.e. tp[old j] */
336	movq	%rdx,%r13
337	cmpq	%r9,%r15
338	jb	.L1st4x
339
	/* Tail of the first pass: the last two columns, then the carry words. */
340	mulq	%rbx
341	addq	%rax,%r10
342	movq	-16(%rcx,%r15,8),%rax
343	adcq	$0,%rdx
344	movq	%rdx,%r11
345
346	mulq	%rbp
347	addq	%rax,%r13
348	movq	-8(%rsi,%r15,8),%rax
349	adcq	$0,%rdx
350	addq	%r10,%r13
351	adcq	$0,%rdx
352	movq	%r13,-24(%rsp,%r15,8)	/* tp[j-3] */
353	movq	%rdx,%rdi
354
355	mulq	%rbx
356	addq	%rax,%r11
357	movq	-8(%rcx,%r15,8),%rax
358	adcq	$0,%rdx
359	movq	%rdx,%r10
360
361	mulq	%rbp
362	addq	%rax,%rdi
363	movq	(%rsi),%rax	/* rax = ap[0], preloaded for the outer loop */
364	adcq	$0,%rdx
365	addq	%r11,%rdi
366	adcq	$0,%rdx
367	movq	%rdi,-16(%rsp,%r15,8)	/* tp[j-2] */
368	movq	%rdx,%r13
369
370	xorq	%rdi,%rdi
371	addq	%r10,%r13	/* add the two remaining carry words */
372	adcq	$0,%rdi
373	movq	%r13,-8(%rsp,%r15,8)	/* tp[j-1] */
374	movq	%rdi,(%rsp,%r15,8)	/* tp[j] = top carry */
375
376	leaq	1(%r14),%r14	/* i = 1 */
/*
 * Remaining passes: same column pattern as the first pass, except that every
 * column also adds the tp[] word left there by the previous pass.
 */
377.align	4
378.Louter4x:
379	movq	(%r12,%r14,8),%rbx	/* rbx = bp[i] */
380	xorq	%r15,%r15	/* j = 0 */
381	movq	(%rsp),%r10	/* r10 = tp[0] */
382	movq	%r8,%rbp
383	mulq	%rbx	/* rdx:rax = ap[0] * bp[i] */
384	addq	%rax,%r10
385	movq	(%rcx),%rax	/* rax = np[0] */
386	adcq	$0,%rdx
387
388	imulq	%r10,%rbp	/* rbp = m = low 64 bits of n0 * r10 */
389	movq	%rdx,%r11
390
391	mulq	%rbp	/* rdx:rax = np[0] * m */
392	addq	%rax,%r10	/* sum is dropped; only the carry is kept */
393	movq	8(%rsi),%rax	/* rax = ap[1] */
394	adcq	$0,%rdx
395	movq	%rdx,%rdi
396
397	mulq	%rbx	/* rdx:rax = ap[1] * bp[i] */
398	addq	%rax,%r11
399	movq	8(%rcx),%rax	/* rax = np[1] */
400	adcq	$0,%rdx
401	addq	8(%rsp),%r11	/* + tp[1] */
402	adcq	$0,%rdx
403	movq	%rdx,%r10
404
405	mulq	%rbp	/* rdx:rax = np[1] * m */
406	addq	%rax,%rdi
407	movq	16(%rsi),%rax	/* rax = ap[2] */
408	adcq	$0,%rdx
409	addq	%r11,%rdi
410	leaq	4(%r15),%r15	/* j = 4 */
411	adcq	$0,%rdx
412	movq	%rdi,(%rsp)	/* tp[0] */
413	movq	%rdx,%r13
414	jmp	.Linner4x
415.align	16
416.Linner4x:
417	mulq	%rbx	/* ap[j-2] * bp[i] */
418	addq	%rax,%r10
419	movq	-16(%rcx,%r15,8),%rax	/* rax = np[j-2] */
420	adcq	$0,%rdx
421	addq	-16(%rsp,%r15,8),%r10	/* + tp[j-2] */
422	adcq	$0,%rdx
423	movq	%rdx,%r11
424
425	mulq	%rbp	/* np[j-2] * m */
426	addq	%rax,%r13
427	movq	-8(%rsi,%r15,8),%rax	/* rax = ap[j-1] */
428	adcq	$0,%rdx
429	addq	%r10,%r13
430	adcq	$0,%rdx
431	movq	%r13,-24(%rsp,%r15,8)	/* tp[j-3] */
432	movq	%rdx,%rdi
433
434	mulq	%rbx	/* ap[j-1] * bp[i] */
435	addq	%rax,%r11
436	movq	-8(%rcx,%r15,8),%rax	/* rax = np[j-1] */
437	adcq	$0,%rdx
438	addq	-8(%rsp,%r15,8),%r11	/* + tp[j-1] */
439	adcq	$0,%rdx
440	movq	%rdx,%r10
441
442	mulq	%rbp	/* np[j-1] * m */
443	addq	%rax,%rdi
444	movq	(%rsi,%r15,8),%rax	/* rax = ap[j] */
445	adcq	$0,%rdx
446	addq	%r11,%rdi
447	adcq	$0,%rdx
448	movq	%rdi,-16(%rsp,%r15,8)	/* tp[j-2] */
449	movq	%rdx,%r13
450
451	mulq	%rbx	/* ap[j] * bp[i] */
452	addq	%rax,%r10
453	movq	(%rcx,%r15,8),%rax	/* rax = np[j] */
454	adcq	$0,%rdx
455	addq	(%rsp,%r15,8),%r10	/* + tp[j] */
456	adcq	$0,%rdx
457	movq	%rdx,%r11
458
459	mulq	%rbp	/* np[j] * m */
460	addq	%rax,%r13
461	movq	8(%rsi,%r15,8),%rax	/* rax = ap[j+1] */
462	adcq	$0,%rdx
463	addq	%r10,%r13
464	adcq	$0,%rdx
465	movq	%r13,-8(%rsp,%r15,8)	/* tp[j-1] */
466	movq	%rdx,%rdi
467
468	mulq	%rbx	/* ap[j+1] * bp[i] */
469	addq	%rax,%r11
470	movq	8(%rcx,%r15,8),%rax	/* rax = np[j+1] */
471	adcq	$0,%rdx
472	addq	8(%rsp,%r15,8),%r11	/* + tp[j+1] */
473	adcq	$0,%rdx
474	leaq	4(%r15),%r15	/* j += 4 */
475	movq	%rdx,%r10
476
477	mulq	%rbp	/* np[j-3] * m, in terms of the advanced j */
478	addq	%rax,%rdi
479	movq	-16(%rsi,%r15,8),%rax	/* rax = ap[j-2] for the next trip or the tail */
480	adcq	$0,%rdx
481	addq	%r11,%rdi
482	adcq	$0,%rdx
483	movq	%rdi,-32(%rsp,%r15,8)	/* tp[j-4], i.e. tp[old j] */
484	movq	%rdx,%r13
485	cmpq	%r9,%r15
486	jb	.Linner4x
487
	/* Tail of the pass: last two columns, then the carry words. */
488	mulq	%rbx
489	addq	%rax,%r10
490	movq	-16(%rcx,%r15,8),%rax
491	adcq	$0,%rdx
492	addq	-16(%rsp,%r15,8),%r10	/* + tp[j-2] */
493	adcq	$0,%rdx
494	movq	%rdx,%r11
495
496	mulq	%rbp
497	addq	%rax,%r13
498	movq	-8(%rsi,%r15,8),%rax
499	adcq	$0,%rdx
500	addq	%r10,%r13
501	adcq	$0,%rdx
502	movq	%r13,-24(%rsp,%r15,8)	/* tp[j-3] */
503	movq	%rdx,%rdi
504
505	mulq	%rbx
506	addq	%rax,%r11
507	movq	-8(%rcx,%r15,8),%rax
508	adcq	$0,%rdx
509	addq	-8(%rsp,%r15,8),%r11	/* + tp[j-1] */
510	adcq	$0,%rdx
511	leaq	1(%r14),%r14	/* i++ (lea keeps the flags intact) */
512	movq	%rdx,%r10
513
514	mulq	%rbp
515	addq	%rax,%rdi
516	movq	(%rsi),%rax	/* rax = ap[0], preloaded for the next pass */
517	adcq	$0,%rdx
518	addq	%r11,%rdi
519	adcq	$0,%rdx
520	movq	%rdi,-16(%rsp,%r15,8)	/* tp[j-2] */
521	movq	%rdx,%r13
522
523	xorq	%rdi,%rdi
524	addq	%r10,%r13
525	adcq	$0,%rdi
526	addq	(%rsp,%r9,8),%r13	/* + tp[num], the previous top carry */
527	adcq	$0,%rdi
528	movq	%r13,-8(%rsp,%r15,8)	/* tp[j-1] */
529	movq	%rdi,(%rsp,%r15,8)	/* tp[j] = new top carry */
530
531	cmpq	%r9,%r14
532	jb	.Louter4x
/*
 * Final reduction, four words per trip: rp[] = tp[] - np[] is computed
 * unconditionally.  Nothing between the subq below and the closing
 * sbbq $0 modifies CF (only mov, lea and dec are used in between).
 */
533	movq	16(%rsp,%r9,8),%rdi	/* rdi = rp, reloaded from tp[num+2] */
534	movq	0(%rsp),%rax	/* rax = tp[0] */
535	movq	8(%rsp),%rdx	/* rdx = tp[1] */
536	shrq	$2,%r9	/* r9 = num / 4 */
537	leaq	(%rsp),%rsi	/* rsi = tp */
538	xorq	%r14,%r14	/* word index = 0 */
539
540	subq	0(%rcx),%rax	/* tp[0] - np[0] starts the borrow chain */
541	movq	16(%rsi),%rbx	/* rbx = tp[2] */
542	movq	24(%rsi),%rbp	/* rbp = tp[3] */
543	sbbq	8(%rcx),%rdx	/* tp[1] - np[1] - borrow */
544	leaq	-1(%r9),%r15	/* num/4 - 1 loop trips; the last group is finished after the loop */
545	jmp	.Lsub4x
546.align	16
547.Lsub4x:
548	movq	%rax,0(%rdi,%r14,8)	/* rp[k] */
549	movq	%rdx,8(%rdi,%r14,8)	/* rp[k+1] */
550	sbbq	16(%rcx,%r14,8),%rbx
551	movq	32(%rsi,%r14,8),%rax	/* rax = tp[k+4] */
552	movq	40(%rsi,%r14,8),%rdx	/* rdx = tp[k+5] */
553	sbbq	24(%rcx,%r14,8),%rbp
554	movq	%rbx,16(%rdi,%r14,8)	/* rp[k+2] */
555	movq	%rbp,24(%rdi,%r14,8)	/* rp[k+3] */
556	sbbq	32(%rcx,%r14,8),%rax
557	movq	48(%rsi,%r14,8),%rbx	/* rbx = tp[k+6] */
558	movq	56(%rsi,%r14,8),%rbp	/* rbp = tp[k+7] */
559	sbbq	40(%rcx,%r14,8),%rdx
560	leaq	4(%r14),%r14	/* k += 4 */
561	decq	%r15
562	jnz	.Lsub4x
563
564	movq	%rax,0(%rdi,%r14,8)	/* finish the last group of four */
565	movq	32(%rsi,%r14,8),%rax	/* rax = tp[num], the top carry word */
566	sbbq	16(%rcx,%r14,8),%rbx
567	movq	%rdx,8(%rdi,%r14,8)
568	sbbq	24(%rcx,%r14,8),%rbp
569	movq	%rbx,16(%rdi,%r14,8)
570
571	sbbq	$0,%rax	/* rax = tp[num] - final borrow: the select mask */
572	movq	%rax,%xmm0
573	punpcklqdq	%xmm0,%xmm0	/* copy the mask into both 64-bit halves of xmm0 */
574	movq	%rbp,24(%rdi,%r14,8)
575	xorq	%r14,%r14	/* r14 = byte offset 0 for the copy loop */
576
577	movq	%r9,%r15	/* num/4 trips of 32 bytes each cover num words */
578	pxor	%xmm5,%xmm5	/* xmm5 = 0, used to wipe tp[] */
579	jmp	.Lcopy4x
/*
 * Branch-free select, 32 bytes per trip: rp = ((tp ^ rp) & mask) ^ rp, so an
 * all-ones mask keeps tp[] and a zero mask keeps the difference already in
 * rp[].  Each tp[] chunk is zeroed right after it has been read.
 */
580.align	16
581.Lcopy4x:
582	movdqu	(%rsp,%r14,1),%xmm2	/* two 16-byte chunks of tp[] */
583	movdqu	16(%rsp,%r14,1),%xmm4
584	movdqu	(%rdi,%r14,1),%xmm1	/* the matching chunks of rp[] */
585	movdqu	16(%rdi,%r14,1),%xmm3
586	pxor	%xmm1,%xmm2
587	pxor	%xmm3,%xmm4
588	pand	%xmm0,%xmm2
589	pand	%xmm0,%xmm4
590	pxor	%xmm1,%xmm2
591	pxor	%xmm3,%xmm4
592	movdqu	%xmm2,(%rdi,%r14,1)	/* unaligned stores: rp alignment is not known here */
593	movdqu	%xmm4,16(%rdi,%r14,1)
594	movdqa	%xmm5,(%rsp,%r14,1)	/* aligned stores: tp is 1024-byte aligned */
595	movdqa	%xmm5,16(%rsp,%r14,1)
596
597	leaq	32(%r14),%r14
598	decq	%r15
599	jnz	.Lcopy4x
600
601	shlq	$2,%r9	/* r9 = num again, for the index below */
602	movq	8(%rsp,%r9,8),%rsi	/* rsi = stack pointer saved in tp[num+1] */
603	movq	$1,%rax	/* return value 1 */
604	movq	(%rsi),%r15	/* reload callee-saved registers in reverse push order */
605	movq	8(%rsi),%r14
606	movq	16(%rsi),%r13
607	movq	24(%rsi),%r12
608	movq	32(%rsi),%rbp
609	movq	40(%rsi),%rbx
610	leaq	48(%rsi),%rsp	/* drop the scratch area and the six saved slots */
611.Lmul4x_epilogue:
612	.byte	0xf3,0xc3	/* bytes F3 C3: "rep ret" */
613.size	bn_mul4x_mont,.-bn_mul4x_mont
614.extern	bn_sqr8x_internal
615.hidden bn_sqr8x_internal
616
/*
 * bn_sqr8x_mont -- squaring path.  In the visible code it is reached only
 * through .Lsqr8x_enter from bn_mul_mont's dispatcher, when bp == ap and num
 * is a multiple of 8.  It builds a stack frame, copies the modulus into it,
 * calls the external bn_sqr8x_internal, then wipes the frame and returns 1.
 *
 * Incoming registers: %rdi rp, %rsi ap, %rcx np, %r8 pointer to n0,
 * %r9d num (%rdx is not read here).
 *
 * Frame layout after .Lsqr8x_sp_done (%rsp is 64-byte aligned):
 *   32(%rsp)               the n0 word
 *   40(%rsp)               %rsp as it was on entry, before the pushes
 *   64 + 16*num(%rsp) ...  copy of np[], one word per 16-byte slot with the
 *                          upper 8 bytes of each slot zero
 * The frame is 64 + 32*num bytes before alignment.  How bn_sqr8x_internal
 * uses the rest of the frame and which registers it expects or preserves is
 * not visible in this file -- NOTE(review): confirm against its definition.
 */
617.type	bn_sqr8x_mont,@function
618.align	32
619bn_sqr8x_mont:
620.Lsqr8x_enter:
621	movq	%rsp,%rax	/* rax = stack pointer on entry, restored in the epilogue */
622	pushq	%rbx	/* save the six callee-saved registers */
623	pushq	%rbp
624	pushq	%r12
625	pushq	%r13
626	pushq	%r14
627	pushq	%r15
628
629	movl	%r9d,%r10d	/* r10 = num, zero-extended */
630	shll	$3,%r9d	/* r9 = num * 8, vector length in bytes */
631	shlq	$3+2,%r10	/* r10 = num * 32 */
632	negq	%r9	/* r9 = -(num * 8) */
633
634
635
636
637
638
	/*
	 * Choose where to put the frame.  r11 is the distance from ap up to
	 * the candidate frame base, taken modulo 4096; the two branches below
	 * move %rsp down by a different extra amount depending on it.
	 * Presumably this keeps the frame from sharing page offsets with ap
	 * for performance reasons -- the intent is not stated in this file.
	 */
639	leaq	-64(%rsp,%r9,4),%r11	/* candidate base = rsp - 64 - 32*num */
640	movq	(%r8),%r8	/* r8 = the n0 word itself */
641	subq	%rsi,%r11
642	andq	$4095,%r11	/* r11 = (candidate - ap) mod 4096 */
643	cmpq	%r11,%r10
644	jb	.Lsqr8x_sp_alt	/* 32*num < r11 -> alternative placement */
645	subq	%r11,%rsp	/* lower rsp by the whole page-offset distance */
646	leaq	-64(%rsp,%r9,4),%rsp	/* then allocate 64 + 32*num bytes */
647	jmp	.Lsqr8x_sp_done
648
649.align	32
650.Lsqr8x_sp_alt:
651	leaq	4096-64(,%r9,4),%r10	/* r10 = 4096 - 64 - 32*num */
652	leaq	-64(%rsp,%r9,4),%rsp	/* allocate 64 + 32*num bytes */
653	subq	%r10,%r11	/* r11 -= r10; CF set when that goes below zero */
654	movq	$0,%r10	/* mov rather than xor: the flags from subq must survive */
655	cmovcq	%r10,%r11	/* clamp a negative adjustment to 0 */
656	subq	%r11,%rsp
657.Lsqr8x_sp_done:
658	andq	$-64,%rsp	/* align the frame base to 64 bytes */
659	movq	%r9,%r10	/* r10 = -(num * 8) */
660	negq	%r9	/* r9 = num * 8 */
661
662	leaq	64(%rsp,%r9,2),%r11	/* r11 = rsp + 64 + 16*num, where the np[] copy goes */
663	movq	%r8,32(%rsp)	/* stash the n0 word */
664	movq	%rax,40(%rsp)	/* stash the entry stack pointer */
665.Lsqr8x_body:
666
667	movq	%r9,%rbp
668.byte	102,73,15,110,211	/* hand-encoded: movq %r11,%xmm2 (address of the np[] copy) */
669	shrq	$3+2,%rbp	/* rbp = num*8 / 32 = num / 4 loop trips */
670	movl	OPENSSL_ia32cap_P+8(%rip),%eax	/* third dword of the capability vector; not read again in the visible code */
671	jmp	.Lsqr8x_copy_n
672
	/*
	 * Copy np[] into the frame, four words per trip.  The movq loads
	 * clear the upper 64 bits of each xmm register, so every word ends
	 * up in its own 16-byte slot.
	 */
673.align	32
674.Lsqr8x_copy_n:
675	movq	0(%rcx),%xmm0
676	movq	8(%rcx),%xmm1
677	movq	16(%rcx),%xmm3
678	movq	24(%rcx),%xmm4
679	leaq	32(%rcx),%rcx	/* advance the source by 4 words */
680	movdqa	%xmm0,0(%r11)
681	movdqa	%xmm1,16(%r11)
682	movdqa	%xmm3,32(%r11)
683	movdqa	%xmm4,48(%r11)
684	leaq	64(%r11),%r11	/* advance the destination by 4 slots */
685	decq	%rbp
686	jnz	.Lsqr8x_copy_n
687
688	pxor	%xmm0,%xmm0
689.byte	102,72,15,110,207	/* hand-encoded: movq %rdi,%xmm1 (rp) */
690.byte	102,73,15,110,218	/* hand-encoded: movq %r10,%xmm3 (-(num * 8)) */
691	call	bn_sqr8x_internal
692
	/*
	 * Wipe the frame.  The two pointers below rely on %r9 still holding
	 * num * 8 after the call -- NOTE(review): confirm that
	 * bn_sqr8x_internal leaves %r9 in that state.
	 */
693	pxor	%xmm0,%xmm0	/* xmm0 = 0 */
694	leaq	48(%rsp),%rax	/* first region starts at rsp + 48 */
695	leaq	64(%rsp,%r9,2),%rdx	/* second region starts at rsp + 64 + 16*num */
696	shrq	$3+2,%r9	/* r9 = num / 4 trips, 64 bytes per region per trip */
697	movq	40(%rsp),%rsi	/* rsi = entry stack pointer; 40(%rsp) lies below the wiped range */
698	jmp	.Lsqr8x_zero
699
700.align	32
701.Lsqr8x_zero:
702	movdqa	%xmm0,0(%rax)
703	movdqa	%xmm0,16(%rax)
704	movdqa	%xmm0,32(%rax)
705	movdqa	%xmm0,48(%rax)
706	leaq	64(%rax),%rax
707	movdqa	%xmm0,0(%rdx)
708	movdqa	%xmm0,16(%rdx)
709	movdqa	%xmm0,32(%rdx)
710	movdqa	%xmm0,48(%rdx)
711	leaq	64(%rdx),%rdx
712	decq	%r9
713	jnz	.Lsqr8x_zero
714
715	movq	$1,%rax	/* return value 1 */
716	movq	-48(%rsi),%r15	/* saved registers sit just below the entry stack pointer */
717	movq	-40(%rsi),%r14
718	movq	-32(%rsi),%r13
719	movq	-24(%rsi),%r12
720	movq	-16(%rsi),%rbp
721	movq	-8(%rsi),%rbx
722	leaq	(%rsi),%rsp	/* restore the entry stack pointer */
723.Lsqr8x_epilogue:
724	.byte	0xf3,0xc3	/* bytes F3 C3: "rep ret" */
725.size	bn_sqr8x_mont,.-bn_sqr8x_mont
726.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
727.align	16
728#endif
729