#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives only a
# modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
# more than twice as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can provide further
# improvement...

# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. The average performance improvement in comparison
# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in the squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable this option after testing. $addx goes up to 1.
$addx = 0;
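
# When $addx is enabled, the generated code probes OPENSSL_ia32cap_P before
# taking the MULX/ADCX/ADOX path.  As a rough reading of the checks emitted
# further down: the third 32-bit word of the capability vector holds
# CPUID(7).EBX, where bit 8 is BMI2 and bit 19 is ADX, so the mask 0x80100
# requires both features.  Schematically:
#
#	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
#	and	$0x80100,%r11d		# keep only BMI2|ADX
#	cmp	$0x80100,%r11d		# both present?
#	je	.Lmulx4x_enter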

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

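# The loops below implement the textbook word-serial Montgomery
# multiplication.  A rough reference sketch, in illustrative pseudocode only
# (tp[] is the on-stack temporary, taken as zero on the first pass; the
# names here do not correspond to the register assignments above):
#
#	for (i=0; i<num; i++) {
#		m1 = (tp[0] + ap[0]*bp[i]) * n0 mod 2^64;
#		cA = cN = 0;
#		for (j=0; j<num; j++) {
#			(cA, a) = ap[j]*bp[i] + tp[j] + cA;	# double-word
#			(cN, s) = np[j]*m1    + a     + cN;	# double-word
#			if (j) tp[j-1] = s;	# j==0 word is zero by choice of m1
#		}
#		(tp[num], tp[num-1]) = tp[num] + cA + cN;	# overflow word
#	}
#	# then rp[] = tp[]-np[] if that does not borrow, else tp[],
#	# selected without branches (see .Lsub/.Lcopy below).
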
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (notably Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a stack
	# allocation spans several pages, a reference to the farthest one
	# can be punished with a SEGV. But walking the pages can do good
	# even on other OSes, because it guarantees that a misbehaving
	# thread hits the guard page before it can damage an innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
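
# bn_mul4x_mont below is the same algorithm unrolled four-fold: each pass of
# the .L1st4x/.Linner4x loops consumes four words of ap[]/np[], and the
# @A/@N register pairs ping-pong between adjacent words so the result of one
# mulq can be folded in while the next multiply is already in flight
# (modulo scheduling).  Very schematically, per step:
#
#	A[1] <- ap[j]*bp[i]   (+carry);  N[1] <- np[j]*m1   + A[1] (+tp[j])
#	A[0] <- ap[j+1]*bp[i] (+carry);  N[0] <- np[j+1]*m1 + A[0] (+tp[j+1])
#	... two more such pairs, then the index register advances by 4.
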
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=2
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	lea	-4($num),$j
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$j			# j=num/4-1
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-4($num),$j
	or	$np,$ap			# ap=borrow?tp:rp
	shr	\$2,$j			# j=num/4-1

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:					# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi, 8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

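# Squaring gets its own kernel because the cross products a[i]*a[j] (i != j)
# appear twice in the expansion and only need to be computed once:
#
#	(sum_i a[i]*2^(64*i))^2 =
#		sum_i a[i]^2*2^(128*i) + 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j))
#
# which roughly halves the number of multiplications.  The wrapper below only
# sets up the stack frame, calls bn_sqr8x_internal (or bn_sqrx8x_internal on
# ADX-capable parts) from the x86_64-mont5 module, and then performs the
# final conditional subtraction and constant-time copy.
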
$code.=<<___	if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# Ensure that the stack frame doesn't alias with $aptr modulo
	# 4096. This is done to allow the memory disambiguation logic
	# to do its job.
	#
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}

if ($addx) {{{
my $bp="%rdx";	# original value

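# The MULX/ADCX/ADOX path relies on three properties: mulx produces a full
# 128-bit product without touching the flags, adcx adds using only CF, and
# adox adds using only OF.  That lets the code keep two independent carry
# chains in flight at once (one for the a[]*b[i] products, one for folding
# in the n[]*m1 products and the temporaries) without ever saving or
# restoring flags.  Very schematically (not a literal excerpt):
#
#	mulx	0*8($aptr),%rax,%r10	# low -> %rax, high -> %r10, flags kept
#	adcx	%rax,%r11		# chain #1 advances via CF
#	adox	%r10,%r12		# chain #2 advances via OF
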
$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 .byte	0x67,0x67
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

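# As read from the code below: mul_handler recovers the original stack
# pointer from the copy that bn_mul_mont/bn_mul4x_mont store at tp[num+1]
# (context->R9 still holds num, so 8(Rsp,R9,8) is the saved %rsp), while
# sqr_handler reads it from the fixed 40(%rsp) slot used by the sqr8x and
# mulx4x frames.  Both then fall into .Lcommon_pop_regs to restore the
# non-volatile registers saved just below that pointer.
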
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT or die "error closing STDOUT: $!";