1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to powers table computed in BN_mod_exp_mont_consttime.
16# In addition subroutine that scatters elements of the powers table
17# is implemented, so that scatter-/gathering can be tuned without
18# bn_exp.c modifications.
19
20# August 2013.
21#
22# Add MULX/AD*X code paths and additional interfaces to optimize for
23# branch prediction unit. For input lengths that are multiples of 8
24# the np argument is not just modulus value, but one interleaved
25# with 0. This is to optimize post-condition...
26
27$flavour = shift;
28$output  = shift;
29if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
30
31$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
32
33$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
34( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
35( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
36die "can't locate x86_64-xlate.pl";
37
38open OUT,"| \"$^X\" $xlate $flavour $output";
39*STDOUT=*OUT;
40
41if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
42		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
43	$addx = ($1>=2.23);
44}
45
46if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
47	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
48	$addx = ($1>=2.10);
49}
50
51if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
52	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
53	$addx = ($1>=12);
54}
55
56# int bn_mul_mont_gather5(
57$rp="%rdi";	# BN_ULONG *rp,
58$ap="%rsi";	# const BN_ULONG *ap,
59$bp="%rdx";	# const BN_ULONG *bp,
60$np="%rcx";	# const BN_ULONG *np,
61$n0="%r8";	# const BN_ULONG *n0,
62$num="%r9";	# int num,
63		# int idx);	# 0 to 2^5-1, "index" in $bp holding
64				# pre-computed powers of a', interlaced
65				# in such manner that b[0] is $bp[idx],
66				# b[1] is [2^5+idx], etc.
67$lo0="%r10";
68$hi0="%r11";
69$hi1="%r13";
70$i="%r14";
71$j="%r15";
72$m0="%rbx";
73$m1="%rbp";
74
75$code=<<___;
76.text
77
78.extern	OPENSSL_ia32cap_P
79
80.globl	bn_mul_mont_gather5
81.type	bn_mul_mont_gather5,\@function,6
82.align	64
83bn_mul_mont_gather5:
84	test	\$7,${num}d
85	jnz	.Lmul_enter
86___
87$code.=<<___ if ($addx);
88	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
89___
90$code.=<<___;
91	jmp	.Lmul4x_enter
92
93.align	16
94.Lmul_enter:
95	mov	${num}d,${num}d
96	mov	%rsp,%rax
97	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
98	push	%rbx
99	push	%rbp
100	push	%r12
101	push	%r13
102	push	%r14
103	push	%r15
104___
105$code.=<<___ if ($win64);
106	lea	-0x28(%rsp),%rsp
107	movaps	%xmm6,(%rsp)
108	movaps	%xmm7,0x10(%rsp)
109___
110$code.=<<___;
111	lea	2($num),%r11
112	neg	%r11
113	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
114	and	\$-1024,%rsp		# minimize TLB usage
115
116	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
117.Lmul_body:
118	mov	$bp,%r12		# reassign $bp
119___
120		$bp="%r12";
121		$STRIDE=2**5*8;		# 5 is "window size"
122		$N=$STRIDE/4;		# should match cache line size
123$code.=<<___;
124	mov	%r10,%r11
125	shr	\$`log($N/8)/log(2)`,%r10
126	and	\$`$N/8-1`,%r11
127	not	%r10
128	lea	.Lmagic_masks(%rip),%rax
129	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
130	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
131	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
132	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
133	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
134	movq	24(%rax,%r10,8),%xmm7
135
136	movq	`0*$STRIDE/4-96`($bp),%xmm0
137	movq	`1*$STRIDE/4-96`($bp),%xmm1
138	pand	%xmm4,%xmm0
139	movq	`2*$STRIDE/4-96`($bp),%xmm2
140	pand	%xmm5,%xmm1
141	movq	`3*$STRIDE/4-96`($bp),%xmm3
142	pand	%xmm6,%xmm2
143	por	%xmm1,%xmm0
144	pand	%xmm7,%xmm3
145	por	%xmm2,%xmm0
146	lea	$STRIDE($bp),$bp
147	por	%xmm3,%xmm0
148
149	movq	%xmm0,$m0		# m0=bp[0]
150
151	mov	($n0),$n0		# pull n0[0] value
152	mov	($ap),%rax
153
154	xor	$i,$i			# i=0
155	xor	$j,$j			# j=0
156
157	movq	`0*$STRIDE/4-96`($bp),%xmm0
158	movq	`1*$STRIDE/4-96`($bp),%xmm1
159	pand	%xmm4,%xmm0
160	movq	`2*$STRIDE/4-96`($bp),%xmm2
161	pand	%xmm5,%xmm1
162
163	mov	$n0,$m1
164	mulq	$m0			# ap[0]*bp[0]
165	mov	%rax,$lo0
166	mov	($np),%rax
167
168	movq	`3*$STRIDE/4-96`($bp),%xmm3
169	pand	%xmm6,%xmm2
170	por	%xmm1,%xmm0
171	pand	%xmm7,%xmm3
172
173	imulq	$lo0,$m1		# "tp[0]"*n0
174	mov	%rdx,$hi0
175
176	por	%xmm2,%xmm0
177	lea	$STRIDE($bp),$bp
178	por	%xmm3,%xmm0
179
180	mulq	$m1			# np[0]*m1
181	add	%rax,$lo0		# discarded
182	mov	8($ap),%rax
183	adc	\$0,%rdx
184	mov	%rdx,$hi1
185
186	lea	1($j),$j		# j++
187	jmp	.L1st_enter
188
189.align	16
190.L1st:
191	add	%rax,$hi1
192	mov	($ap,$j,8),%rax
193	adc	\$0,%rdx
194	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
195	mov	$lo0,$hi0
196	adc	\$0,%rdx
197	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
198	mov	%rdx,$hi1
199
200.L1st_enter:
201	mulq	$m0			# ap[j]*bp[0]
202	add	%rax,$hi0
203	mov	($np,$j,8),%rax
204	adc	\$0,%rdx
205	lea	1($j),$j		# j++
206	mov	%rdx,$lo0
207
208	mulq	$m1			# np[j]*m1
209	cmp	$num,$j
210	jne	.L1st
211
212	movq	%xmm0,$m0		# bp[1]
213
214	add	%rax,$hi1
215	mov	($ap),%rax		# ap[0]
216	adc	\$0,%rdx
217	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
218	adc	\$0,%rdx
219	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
220	mov	%rdx,$hi1
221	mov	$lo0,$hi0
222
223	xor	%rdx,%rdx
224	add	$hi0,$hi1
225	adc	\$0,%rdx
226	mov	$hi1,-8(%rsp,$num,8)
227	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
228
229	lea	1($i),$i		# i++
230	jmp	.Louter
231.align	16
232.Louter:
233	xor	$j,$j			# j=0
234	mov	$n0,$m1
235	mov	(%rsp),$lo0
236
237	movq	`0*$STRIDE/4-96`($bp),%xmm0
238	movq	`1*$STRIDE/4-96`($bp),%xmm1
239	pand	%xmm4,%xmm0
240	movq	`2*$STRIDE/4-96`($bp),%xmm2
241	pand	%xmm5,%xmm1
242
243	mulq	$m0			# ap[0]*bp[i]
244	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
245	mov	($np),%rax
246	adc	\$0,%rdx
247
248	movq	`3*$STRIDE/4-96`($bp),%xmm3
249	pand	%xmm6,%xmm2
250	por	%xmm1,%xmm0
251	pand	%xmm7,%xmm3
252
253	imulq	$lo0,$m1		# tp[0]*n0
254	mov	%rdx,$hi0
255
256	por	%xmm2,%xmm0
257	lea	$STRIDE($bp),$bp
258	por	%xmm3,%xmm0
259
260	mulq	$m1			# np[0]*m1
261	add	%rax,$lo0		# discarded
262	mov	8($ap),%rax
263	adc	\$0,%rdx
264	mov	8(%rsp),$lo0		# tp[1]
265	mov	%rdx,$hi1
266
267	lea	1($j),$j		# j++
268	jmp	.Linner_enter
269
270.align	16
271.Linner:
272	add	%rax,$hi1
273	mov	($ap,$j,8),%rax
274	adc	\$0,%rdx
275	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
276	mov	(%rsp,$j,8),$lo0
277	adc	\$0,%rdx
278	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
279	mov	%rdx,$hi1
280
281.Linner_enter:
282	mulq	$m0			# ap[j]*bp[i]
283	add	%rax,$hi0
284	mov	($np,$j,8),%rax
285	adc	\$0,%rdx
286	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
287	mov	%rdx,$hi0
288	adc	\$0,$hi0
289	lea	1($j),$j		# j++
290
291	mulq	$m1			# np[j]*m1
292	cmp	$num,$j
293	jne	.Linner
294
295	movq	%xmm0,$m0		# bp[i+1]
296
297	add	%rax,$hi1
298	mov	($ap),%rax		# ap[0]
299	adc	\$0,%rdx
300	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
301	mov	(%rsp,$j,8),$lo0
302	adc	\$0,%rdx
303	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
304	mov	%rdx,$hi1
305
306	xor	%rdx,%rdx
307	add	$hi0,$hi1
308	adc	\$0,%rdx
309	add	$lo0,$hi1		# pull upmost overflow bit
310	adc	\$0,%rdx
311	mov	$hi1,-8(%rsp,$num,8)
312	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
313
314	lea	1($i),$i		# i++
315	cmp	$num,$i
316	jb	.Louter
317
318	xor	$i,$i			# i=0 and clear CF!
319	mov	(%rsp),%rax		# tp[0]
320	lea	(%rsp),$ap		# borrow ap for tp
321	mov	$num,$j			# j=num
322	jmp	.Lsub
323.align	16
324.Lsub:	sbb	($np,$i,8),%rax
325	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
326	mov	8($ap,$i,8),%rax	# tp[i+1]
327	lea	1($i),$i		# i++
328	dec	$j			# doesnn't affect CF!
329	jnz	.Lsub
330
331	sbb	\$0,%rax		# handle upmost overflow bit
332	xor	$i,$i
333	mov	$num,$j			# j=num
334.align	16
335.Lcopy:					# copy or in-place refresh
336	mov	(%rsp,$i,8),$ap
337	mov	($rp,$i,8),$np
338	xor	$np,$ap			# conditional select:
339	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
340	xor	$np,$ap			# ap = borrow?tp:rp
341	mov	$i,(%rsp,$i,8)		# zap temporary vector
342	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
343	lea	1($i),$i
344	sub	\$1,$j
345	jnz	.Lcopy
346
347	mov	8(%rsp,$num,8),%rsi	# restore %rsp
348	mov	\$1,%rax
349___
350$code.=<<___ if ($win64);
351	movaps	-88(%rsi),%xmm6
352	movaps	-72(%rsi),%xmm7
353___
354$code.=<<___;
355	mov	-48(%rsi),%r15
356	mov	-40(%rsi),%r14
357	mov	-32(%rsi),%r13
358	mov	-24(%rsi),%r12
359	mov	-16(%rsi),%rbp
360	mov	-8(%rsi),%rbx
361	lea	(%rsi),%rsp
362.Lmul_epilogue:
363	ret
364.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
365___
366{{{
367my @A=("%r10","%r11");
368my @N=("%r13","%rdi");
369$code.=<<___;
370.type	bn_mul4x_mont_gather5,\@function,6
371.align	32
372bn_mul4x_mont_gather5:
373.Lmul4x_enter:
374___
375$code.=<<___ if ($addx);
376	and	\$0x80100,%r11d
377	cmp	\$0x80100,%r11d
378	je	.Lmulx4x_enter
379___
380$code.=<<___;
381	.byte	0x67
382	mov	%rsp,%rax
383	push	%rbx
384	push	%rbp
385	push	%r12
386	push	%r13
387	push	%r14
388	push	%r15
389___
390$code.=<<___ if ($win64);
391	lea	-0x28(%rsp),%rsp
392	movaps	%xmm6,(%rsp)
393	movaps	%xmm7,0x10(%rsp)
394___
395$code.=<<___;
396	.byte	0x67
397	mov	${num}d,%r10d
398	shl	\$3,${num}d
399	shl	\$3+2,%r10d		# 4*$num
400	neg	$num			# -$num
401
402	##############################################################
403	# ensure that stack frame doesn't alias with $aptr+4*$num
404	# modulo 4096, which covers ret[num], am[num] and n[2*num]
405	# (see bn_exp.c). this is done to allow memory disambiguation
406	# logic do its magic. [excessive frame is allocated in order
407	# to allow bn_from_mont8x to clear it.]
408	#
409	lea	-64(%rsp,$num,2),%r11
410	sub	$ap,%r11
411	and	\$4095,%r11
412	cmp	%r11,%r10
413	jb	.Lmul4xsp_alt
414	sub	%r11,%rsp		# align with $ap
415	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
416	jmp	.Lmul4xsp_done
417
418.align	32
419.Lmul4xsp_alt:
420	lea	4096-64(,$num,2),%r10
421	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
422	sub	%r10,%r11
423	mov	\$0,%r10
424	cmovc	%r10,%r11
425	sub	%r11,%rsp
426.Lmul4xsp_done:
427	and	\$-64,%rsp
428	neg	$num
429
430	mov	%rax,40(%rsp)
431.Lmul4x_body:
432
433	call	mul4x_internal
434
435	mov	40(%rsp),%rsi		# restore %rsp
436	mov	\$1,%rax
437___
438$code.=<<___ if ($win64);
439	movaps	-88(%rsi),%xmm6
440	movaps	-72(%rsi),%xmm7
441___
442$code.=<<___;
443	mov	-48(%rsi),%r15
444	mov	-40(%rsi),%r14
445	mov	-32(%rsi),%r13
446	mov	-24(%rsi),%r12
447	mov	-16(%rsi),%rbp
448	mov	-8(%rsi),%rbx
449	lea	(%rsi),%rsp
450.Lmul4x_epilogue:
451	ret
452.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
453
454.type	mul4x_internal,\@abi-omnipotent
455.align	32
456mul4x_internal:
457	shl	\$5,$num
458	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
459	lea	256(%rdx,$num),%r13
460	shr	\$5,$num		# restore $num
461___
462		$bp="%r12";
463		$STRIDE=2**5*8;		# 5 is "window size"
464		$N=$STRIDE/4;		# should match cache line size
465		$tp=$i;
466$code.=<<___;
467	mov	%r10,%r11
468	shr	\$`log($N/8)/log(2)`,%r10
469	and	\$`$N/8-1`,%r11
470	not	%r10
471	lea	.Lmagic_masks(%rip),%rax
472	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
473	lea	96(%rdx,%r11,8),$bp	# pointer within 1st cache line
474	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
475	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
476	add	\$7,%r11
477	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
478	movq	24(%rax,%r10,8),%xmm7
479	and	\$7,%r11
480
481	movq	`0*$STRIDE/4-96`($bp),%xmm0
482	lea	$STRIDE($bp),$tp	# borrow $tp
483	movq	`1*$STRIDE/4-96`($bp),%xmm1
484	pand	%xmm4,%xmm0
485	movq	`2*$STRIDE/4-96`($bp),%xmm2
486	pand	%xmm5,%xmm1
487	movq	`3*$STRIDE/4-96`($bp),%xmm3
488	pand	%xmm6,%xmm2
489	.byte	0x67
490	por	%xmm1,%xmm0
491	movq	`0*$STRIDE/4-96`($tp),%xmm1
492	.byte	0x67
493	pand	%xmm7,%xmm3
494	.byte	0x67
495	por	%xmm2,%xmm0
496	movq	`1*$STRIDE/4-96`($tp),%xmm2
497	.byte	0x67
498	pand	%xmm4,%xmm1
499	.byte	0x67
500	por	%xmm3,%xmm0
501	movq	`2*$STRIDE/4-96`($tp),%xmm3
502
503	movq	%xmm0,$m0		# m0=bp[0]
504	movq	`3*$STRIDE/4-96`($tp),%xmm0
505	mov	%r13,16+8(%rsp)		# save end of b[num]
506	mov	$rp, 56+8(%rsp)		# save $rp
507
508	mov	($n0),$n0		# pull n0[0] value
509	mov	($ap),%rax
510	lea	($ap,$num),$ap		# end of a[num]
511	neg	$num
512
513	mov	$n0,$m1
514	mulq	$m0			# ap[0]*bp[0]
515	mov	%rax,$A[0]
516	mov	($np),%rax
517
518	pand	%xmm5,%xmm2
519	pand	%xmm6,%xmm3
520	por	%xmm2,%xmm1
521
522	imulq	$A[0],$m1		# "tp[0]"*n0
523	##############################################################
524	# $tp is chosen so that writing to top-most element of the
525	# vector occurs just "above" references to powers table,
526	# "above" modulo cache-line size, which effectively precludes
527	# possibility of memory disambiguation logic failure when
528	# accessing the table.
529	#
530	lea	64+8(%rsp,%r11,8),$tp
531	mov	%rdx,$A[1]
532
533	pand	%xmm7,%xmm0
534	por	%xmm3,%xmm1
535	lea	2*$STRIDE($bp),$bp
536	por	%xmm1,%xmm0
537
538	mulq	$m1			# np[0]*m1
539	add	%rax,$A[0]		# discarded
540	mov	8($ap,$num),%rax
541	adc	\$0,%rdx
542	mov	%rdx,$N[1]
543
544	mulq	$m0
545	add	%rax,$A[1]
546	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
547	adc	\$0,%rdx
548	mov	%rdx,$A[0]
549
550	mulq	$m1
551	add	%rax,$N[1]
552	mov	16($ap,$num),%rax
553	adc	\$0,%rdx
554	add	$A[1],$N[1]
555	lea	4*8($num),$j		# j=4
556	lea	16*4($np),$np
557	adc	\$0,%rdx
558	mov	$N[1],($tp)
559	mov	%rdx,$N[0]
560	jmp	.L1st4x
561
562.align	32
563.L1st4x:
564	mulq	$m0			# ap[j]*bp[0]
565	add	%rax,$A[0]
566	mov	-16*2($np),%rax
567	lea	32($tp),$tp
568	adc	\$0,%rdx
569	mov	%rdx,$A[1]
570
571	mulq	$m1			# np[j]*m1
572	add	%rax,$N[0]
573	mov	-8($ap,$j),%rax
574	adc	\$0,%rdx
575	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
576	adc	\$0,%rdx
577	mov	$N[0],-24($tp)		# tp[j-1]
578	mov	%rdx,$N[1]
579
580	mulq	$m0			# ap[j]*bp[0]
581	add	%rax,$A[1]
582	mov	-16*1($np),%rax
583	adc	\$0,%rdx
584	mov	%rdx,$A[0]
585
586	mulq	$m1			# np[j]*m1
587	add	%rax,$N[1]
588	mov	($ap,$j),%rax
589	adc	\$0,%rdx
590	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
591	adc	\$0,%rdx
592	mov	$N[1],-16($tp)		# tp[j-1]
593	mov	%rdx,$N[0]
594
595	mulq	$m0			# ap[j]*bp[0]
596	add	%rax,$A[0]
597	mov	16*0($np),%rax
598	adc	\$0,%rdx
599	mov	%rdx,$A[1]
600
601	mulq	$m1			# np[j]*m1
602	add	%rax,$N[0]
603	mov	8($ap,$j),%rax
604	adc	\$0,%rdx
605	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
606	adc	\$0,%rdx
607	mov	$N[0],-8($tp)		# tp[j-1]
608	mov	%rdx,$N[1]
609
610	mulq	$m0			# ap[j]*bp[0]
611	add	%rax,$A[1]
612	mov	16*1($np),%rax
613	adc	\$0,%rdx
614	mov	%rdx,$A[0]
615
616	mulq	$m1			# np[j]*m1
617	add	%rax,$N[1]
618	mov	16($ap,$j),%rax
619	adc	\$0,%rdx
620	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
621	lea	16*4($np),$np
622	adc	\$0,%rdx
623	mov	$N[1],($tp)		# tp[j-1]
624	mov	%rdx,$N[0]
625
626	add	\$32,$j			# j+=4
627	jnz	.L1st4x
628
629	mulq	$m0			# ap[j]*bp[0]
630	add	%rax,$A[0]
631	mov	-16*2($np),%rax
632	lea	32($tp),$tp
633	adc	\$0,%rdx
634	mov	%rdx,$A[1]
635
636	mulq	$m1			# np[j]*m1
637	add	%rax,$N[0]
638	mov	-8($ap),%rax
639	adc	\$0,%rdx
640	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
641	adc	\$0,%rdx
642	mov	$N[0],-24($tp)		# tp[j-1]
643	mov	%rdx,$N[1]
644
645	mulq	$m0			# ap[j]*bp[0]
646	add	%rax,$A[1]
647	mov	-16*1($np),%rax
648	adc	\$0,%rdx
649	mov	%rdx,$A[0]
650
651	mulq	$m1			# np[j]*m1
652	add	%rax,$N[1]
653	mov	($ap,$num),%rax		# ap[0]
654	adc	\$0,%rdx
655	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
656	adc	\$0,%rdx
657	mov	$N[1],-16($tp)		# tp[j-1]
658	mov	%rdx,$N[0]
659
660	movq	%xmm0,$m0		# bp[1]
661	lea	($np,$num,2),$np	# rewind $np
662
663	xor	$N[1],$N[1]
664	add	$A[0],$N[0]
665	adc	\$0,$N[1]
666	mov	$N[0],-8($tp)
667
668	jmp	.Louter4x
669
670.align	32
671.Louter4x:
672	mov	($tp,$num),$A[0]
673	mov	$n0,$m1
674	mulq	$m0			# ap[0]*bp[i]
675	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
676	mov	($np),%rax
677	adc	\$0,%rdx
678
679	movq	`0*$STRIDE/4-96`($bp),%xmm0
680	movq	`1*$STRIDE/4-96`($bp),%xmm1
681	pand	%xmm4,%xmm0
682	movq	`2*$STRIDE/4-96`($bp),%xmm2
683	pand	%xmm5,%xmm1
684	movq	`3*$STRIDE/4-96`($bp),%xmm3
685
686	imulq	$A[0],$m1		# tp[0]*n0
687	.byte	0x67
688	mov	%rdx,$A[1]
689	mov	$N[1],($tp)		# store upmost overflow bit
690
691	pand	%xmm6,%xmm2
692	por	%xmm1,%xmm0
693	pand	%xmm7,%xmm3
694	por	%xmm2,%xmm0
695	lea	($tp,$num),$tp		# rewind $tp
696	lea	$STRIDE($bp),$bp
697	por	%xmm3,%xmm0
698
699	mulq	$m1			# np[0]*m1
700	add	%rax,$A[0]		# "$N[0]", discarded
701	mov	8($ap,$num),%rax
702	adc	\$0,%rdx
703	mov	%rdx,$N[1]
704
705	mulq	$m0			# ap[j]*bp[i]
706	add	%rax,$A[1]
707	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
708	adc	\$0,%rdx
709	add	8($tp),$A[1]		# +tp[1]
710	adc	\$0,%rdx
711	mov	%rdx,$A[0]
712
713	mulq	$m1			# np[j]*m1
714	add	%rax,$N[1]
715	mov	16($ap,$num),%rax
716	adc	\$0,%rdx
717	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
718	lea	4*8($num),$j		# j=4
719	lea	16*4($np),$np
720	adc	\$0,%rdx
721	mov	%rdx,$N[0]
722	jmp	.Linner4x
723
724.align	32
725.Linner4x:
726	mulq	$m0			# ap[j]*bp[i]
727	add	%rax,$A[0]
728	mov	-16*2($np),%rax
729	adc	\$0,%rdx
730	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
731	lea	32($tp),$tp
732	adc	\$0,%rdx
733	mov	%rdx,$A[1]
734
735	mulq	$m1			# np[j]*m1
736	add	%rax,$N[0]
737	mov	-8($ap,$j),%rax
738	adc	\$0,%rdx
739	add	$A[0],$N[0]
740	adc	\$0,%rdx
741	mov	$N[1],-32($tp)		# tp[j-1]
742	mov	%rdx,$N[1]
743
744	mulq	$m0			# ap[j]*bp[i]
745	add	%rax,$A[1]
746	mov	-16*1($np),%rax
747	adc	\$0,%rdx
748	add	-8($tp),$A[1]
749	adc	\$0,%rdx
750	mov	%rdx,$A[0]
751
752	mulq	$m1			# np[j]*m1
753	add	%rax,$N[1]
754	mov	($ap,$j),%rax
755	adc	\$0,%rdx
756	add	$A[1],$N[1]
757	adc	\$0,%rdx
758	mov	$N[0],-24($tp)		# tp[j-1]
759	mov	%rdx,$N[0]
760
761	mulq	$m0			# ap[j]*bp[i]
762	add	%rax,$A[0]
763	mov	16*0($np),%rax
764	adc	\$0,%rdx
765	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
766	adc	\$0,%rdx
767	mov	%rdx,$A[1]
768
769	mulq	$m1			# np[j]*m1
770	add	%rax,$N[0]
771	mov	8($ap,$j),%rax
772	adc	\$0,%rdx
773	add	$A[0],$N[0]
774	adc	\$0,%rdx
775	mov	$N[1],-16($tp)		# tp[j-1]
776	mov	%rdx,$N[1]
777
778	mulq	$m0			# ap[j]*bp[i]
779	add	%rax,$A[1]
780	mov	16*1($np),%rax
781	adc	\$0,%rdx
782	add	8($tp),$A[1]
783	adc	\$0,%rdx
784	mov	%rdx,$A[0]
785
786	mulq	$m1			# np[j]*m1
787	add	%rax,$N[1]
788	mov	16($ap,$j),%rax
789	adc	\$0,%rdx
790	add	$A[1],$N[1]
791	lea	16*4($np),$np
792	adc	\$0,%rdx
793	mov	$N[0],-8($tp)		# tp[j-1]
794	mov	%rdx,$N[0]
795
796	add	\$32,$j			# j+=4
797	jnz	.Linner4x
798
799	mulq	$m0			# ap[j]*bp[i]
800	add	%rax,$A[0]
801	mov	-16*2($np),%rax
802	adc	\$0,%rdx
803	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
804	lea	32($tp),$tp
805	adc	\$0,%rdx
806	mov	%rdx,$A[1]
807
808	mulq	$m1			# np[j]*m1
809	add	%rax,$N[0]
810	mov	-8($ap),%rax
811	adc	\$0,%rdx
812	add	$A[0],$N[0]
813	adc	\$0,%rdx
814	mov	$N[1],-32($tp)		# tp[j-1]
815	mov	%rdx,$N[1]
816
817	mulq	$m0			# ap[j]*bp[i]
818	add	%rax,$A[1]
819	mov	$m1,%rax
820	mov	-16*1($np),$m1
821	adc	\$0,%rdx
822	add	-8($tp),$A[1]
823	adc	\$0,%rdx
824	mov	%rdx,$A[0]
825
826	mulq	$m1			# np[j]*m1
827	add	%rax,$N[1]
828	mov	($ap,$num),%rax		# ap[0]
829	adc	\$0,%rdx
830	add	$A[1],$N[1]
831	adc	\$0,%rdx
832	mov	$N[0],-24($tp)		# tp[j-1]
833	mov	%rdx,$N[0]
834
835	movq	%xmm0,$m0		# bp[i+1]
836	mov	$N[1],-16($tp)		# tp[j-1]
837	lea	($np,$num,2),$np	# rewind $np
838
839	xor	$N[1],$N[1]
840	add	$A[0],$N[0]
841	adc	\$0,$N[1]
842	add	($tp),$N[0]		# pull upmost overflow bit
843	adc	\$0,$N[1]		# upmost overflow bit
844	mov	$N[0],-8($tp)
845
846	cmp	16+8(%rsp),$bp
847	jb	.Louter4x
848___
849if (1) {
850$code.=<<___;
851	sub	$N[0],$m1		# compare top-most words
852	adc	$j,$j			# $j is zero
853	or	$j,$N[1]
854	xor	\$1,$N[1]
855	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
856	lea	($np,$N[1],8),%rbp	# nptr in .sqr4x_sub
857	mov	%r9,%rcx
858	sar	\$3+2,%rcx		# cf=0
859	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
860	jmp	.Lsqr4x_sub
861___
862} else {
863my @ri=("%rax",$bp,$m0,$m1);
864my $rp="%rdx";
865$code.=<<___
866	xor	\$1,$N[1]
867	lea	($tp,$num),$tp		# rewind $tp
868	sar	\$5,$num		# cf=0
869	lea	($np,$N[1],8),$np
870	mov	56+8(%rsp),$rp		# restore $rp
871	jmp	.Lsub4x
872
873.align	32
874.Lsub4x:
875	.byte	0x66
876	mov	8*0($tp),@ri[0]
877	mov	8*1($tp),@ri[1]
878	.byte	0x66
879	sbb	16*0($np),@ri[0]
880	mov	8*2($tp),@ri[2]
881	sbb	16*1($np),@ri[1]
882	mov	3*8($tp),@ri[3]
883	lea	4*8($tp),$tp
884	sbb	16*2($np),@ri[2]
885	mov	@ri[0],8*0($rp)
886	sbb	16*3($np),@ri[3]
887	lea	16*4($np),$np
888	mov	@ri[1],8*1($rp)
889	mov	@ri[2],8*2($rp)
890	mov	@ri[3],8*3($rp)
891	lea	8*4($rp),$rp
892
893	inc	$num
894	jnz	.Lsub4x
895
896	ret
897___
898}
899$code.=<<___;
900.size	mul4x_internal,.-mul4x_internal
901___
902}}}
903{{{
904######################################################################
905# void bn_power5(
906my $rptr="%rdi";	# BN_ULONG *rptr,
907my $aptr="%rsi";	# const BN_ULONG *aptr,
908my $bptr="%rdx";	# const void *table,
909my $nptr="%rcx";	# const BN_ULONG *nptr,
910my $n0  ="%r8";		# const BN_ULONG *n0);
911my $num ="%r9";		# int num, has to be divisible by 8
912			# int pwr
913
914my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
915my @A0=("%r10","%r11");
916my @A1=("%r12","%r13");
917my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
918
919$code.=<<___;
920.globl	bn_power5
921.type	bn_power5,\@function,6
922.align	32
923bn_power5:
924___
925$code.=<<___ if ($addx);
926	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
927	and	\$0x80100,%r11d
928	cmp	\$0x80100,%r11d
929	je	.Lpowerx5_enter
930___
931$code.=<<___;
932	mov	%rsp,%rax
933	push	%rbx
934	push	%rbp
935	push	%r12
936	push	%r13
937	push	%r14
938	push	%r15
939___
940$code.=<<___ if ($win64);
941	lea	-0x28(%rsp),%rsp
942	movaps	%xmm6,(%rsp)
943	movaps	%xmm7,0x10(%rsp)
944___
945$code.=<<___;
946	mov	${num}d,%r10d
947	shl	\$3,${num}d		# convert $num to bytes
948	shl	\$3+2,%r10d		# 4*$num
949	neg	$num
950	mov	($n0),$n0		# *n0
951
952	##############################################################
953	# ensure that stack frame doesn't alias with $aptr+4*$num
954	# modulo 4096, which covers ret[num], am[num] and n[2*num]
955	# (see bn_exp.c). this is done to allow memory disambiguation
956	# logic do its magic.
957	#
958	lea	-64(%rsp,$num,2),%r11
959	sub	$aptr,%r11
960	and	\$4095,%r11
961	cmp	%r11,%r10
962	jb	.Lpwr_sp_alt
963	sub	%r11,%rsp		# align with $aptr
964	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
965	jmp	.Lpwr_sp_done
966
967.align	32
968.Lpwr_sp_alt:
969	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
970	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
971	sub	%r10,%r11
972	mov	\$0,%r10
973	cmovc	%r10,%r11
974	sub	%r11,%rsp
975.Lpwr_sp_done:
976	and	\$-64,%rsp
977	mov	$num,%r10
978	neg	$num
979
980	##############################################################
981	# Stack layout
982	#
983	# +0	saved $num, used in reduction section
984	# +8	&t[2*$num], used in reduction section
985	# +32	saved *n0
986	# +40	saved %rsp
987	# +48	t[2*$num]
988	#
989	mov	$n0,  32(%rsp)
990	mov	%rax, 40(%rsp)		# save original %rsp
991.Lpower5_body:
992	movq	$rptr,%xmm1		# save $rptr
993	movq	$nptr,%xmm2		# save $nptr
994	movq	%r10, %xmm3		# -$num
995	movq	$bptr,%xmm4
996
997	call	__bn_sqr8x_internal
998	call	__bn_sqr8x_internal
999	call	__bn_sqr8x_internal
1000	call	__bn_sqr8x_internal
1001	call	__bn_sqr8x_internal
1002
1003	movq	%xmm2,$nptr
1004	movq	%xmm4,$bptr
1005	mov	$aptr,$rptr
1006	mov	40(%rsp),%rax
1007	lea	32(%rsp),$n0
1008
1009	call	mul4x_internal
1010
1011	mov	40(%rsp),%rsi		# restore %rsp
1012	mov	\$1,%rax
1013	mov	-48(%rsi),%r15
1014	mov	-40(%rsi),%r14
1015	mov	-32(%rsi),%r13
1016	mov	-24(%rsi),%r12
1017	mov	-16(%rsi),%rbp
1018	mov	-8(%rsi),%rbx
1019	lea	(%rsi),%rsp
1020.Lpower5_epilogue:
1021	ret
1022.size	bn_power5,.-bn_power5
1023
1024.globl	bn_sqr8x_internal
1025.hidden	bn_sqr8x_internal
1026.type	bn_sqr8x_internal,\@abi-omnipotent
1027.align	32
1028bn_sqr8x_internal:
1029__bn_sqr8x_internal:
1030	##############################################################
1031	# Squaring part:
1032	#
1033	# a) multiply-n-add everything but a[i]*a[i];
1034	# b) shift result of a) by 1 to the left and accumulate
1035	#    a[i]*a[i] products;
1036	#
1037	##############################################################
1038	#                                                     a[1]a[0]
1039	#                                                 a[2]a[0]
1040	#                                             a[3]a[0]
1041	#                                             a[2]a[1]
1042	#                                         a[4]a[0]
1043	#                                         a[3]a[1]
1044	#                                     a[5]a[0]
1045	#                                     a[4]a[1]
1046	#                                     a[3]a[2]
1047	#                                 a[6]a[0]
1048	#                                 a[5]a[1]
1049	#                                 a[4]a[2]
1050	#                             a[7]a[0]
1051	#                             a[6]a[1]
1052	#                             a[5]a[2]
1053	#                             a[4]a[3]
1054	#                         a[7]a[1]
1055	#                         a[6]a[2]
1056	#                         a[5]a[3]
1057	#                     a[7]a[2]
1058	#                     a[6]a[3]
1059	#                     a[5]a[4]
1060	#                 a[7]a[3]
1061	#                 a[6]a[4]
1062	#             a[7]a[4]
1063	#             a[6]a[5]
1064	#         a[7]a[5]
1065	#     a[7]a[6]
1066	#                                                     a[1]a[0]
1067	#                                                 a[2]a[0]
1068	#                                             a[3]a[0]
1069	#                                         a[4]a[0]
1070	#                                     a[5]a[0]
1071	#                                 a[6]a[0]
1072	#                             a[7]a[0]
1073	#                                             a[2]a[1]
1074	#                                         a[3]a[1]
1075	#                                     a[4]a[1]
1076	#                                 a[5]a[1]
1077	#                             a[6]a[1]
1078	#                         a[7]a[1]
1079	#                                     a[3]a[2]
1080	#                                 a[4]a[2]
1081	#                             a[5]a[2]
1082	#                         a[6]a[2]
1083	#                     a[7]a[2]
1084	#                             a[4]a[3]
1085	#                         a[5]a[3]
1086	#                     a[6]a[3]
1087	#                 a[7]a[3]
1088	#                     a[5]a[4]
1089	#                 a[6]a[4]
1090	#             a[7]a[4]
1091	#             a[6]a[5]
1092	#         a[7]a[5]
1093	#     a[7]a[6]
1094	#                                                         a[0]a[0]
1095	#                                                 a[1]a[1]
1096	#                                         a[2]a[2]
1097	#                                 a[3]a[3]
1098	#                         a[4]a[4]
1099	#                 a[5]a[5]
1100	#         a[6]a[6]
1101	# a[7]a[7]
1102
1103	lea	32(%r10),$i		# $i=-($num-32)
1104	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
1105
1106	mov	$num,$j			# $j=$num
1107
1108					# comments apply to $num==8 case
1109	mov	-32($aptr,$i),$a0	# a[0]
1110	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1111	mov	-24($aptr,$i),%rax	# a[1]
1112	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1113	mov	-16($aptr,$i),$ai	# a[2]
1114	mov	%rax,$a1
1115
1116	mul	$a0			# a[1]*a[0]
1117	mov	%rax,$A0[0]		# a[1]*a[0]
1118	 mov	$ai,%rax		# a[2]
1119	mov	%rdx,$A0[1]
1120	mov	$A0[0],-24($tptr,$i)	# t[1]
1121
1122	mul	$a0			# a[2]*a[0]
1123	add	%rax,$A0[1]
1124	 mov	$ai,%rax
1125	adc	\$0,%rdx
1126	mov	$A0[1],-16($tptr,$i)	# t[2]
1127	mov	%rdx,$A0[0]
1128
1129
1130	 mov	-8($aptr,$i),$ai	# a[3]
1131	mul	$a1			# a[2]*a[1]
1132	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
1133	 mov	$ai,%rax
1134	mov	%rdx,$A1[1]
1135
1136	 lea	($i),$j
1137	mul	$a0			# a[3]*a[0]
1138	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1139	 mov	$ai,%rax
1140	mov	%rdx,$A0[1]
1141	adc	\$0,$A0[1]
1142	add	$A1[0],$A0[0]
1143	adc	\$0,$A0[1]
1144	mov	$A0[0],-8($tptr,$j)	# t[3]
1145	jmp	.Lsqr4x_1st
1146
1147.align	32
1148.Lsqr4x_1st:
1149	 mov	($aptr,$j),$ai		# a[4]
1150	mul	$a1			# a[3]*a[1]
1151	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1152	 mov	$ai,%rax
1153	mov	%rdx,$A1[0]
1154	adc	\$0,$A1[0]
1155
1156	mul	$a0			# a[4]*a[0]
1157	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1158	 mov	$ai,%rax		# a[3]
1159	 mov	8($aptr,$j),$ai		# a[5]
1160	mov	%rdx,$A0[0]
1161	adc	\$0,$A0[0]
1162	add	$A1[1],$A0[1]
1163	adc	\$0,$A0[0]
1164
1165
1166	mul	$a1			# a[4]*a[3]
1167	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1168	 mov	$ai,%rax
1169	 mov	$A0[1],($tptr,$j)	# t[4]
1170	mov	%rdx,$A1[1]
1171	adc	\$0,$A1[1]
1172
1173	mul	$a0			# a[5]*a[2]
1174	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1175	 mov	$ai,%rax
1176	 mov	16($aptr,$j),$ai	# a[6]
1177	mov	%rdx,$A0[1]
1178	adc	\$0,$A0[1]
1179	add	$A1[0],$A0[0]
1180	adc	\$0,$A0[1]
1181
1182	mul	$a1			# a[5]*a[3]
1183	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
1184	 mov	$ai,%rax
1185	 mov	$A0[0],8($tptr,$j)	# t[5]
1186	mov	%rdx,$A1[0]
1187	adc	\$0,$A1[0]
1188
1189	mul	$a0			# a[6]*a[2]
1190	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
1191	 mov	$ai,%rax		# a[3]
1192	 mov	24($aptr,$j),$ai	# a[7]
1193	mov	%rdx,$A0[0]
1194	adc	\$0,$A0[0]
1195	add	$A1[1],$A0[1]
1196	adc	\$0,$A0[0]
1197
1198
1199	mul	$a1			# a[6]*a[5]
1200	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
1201	 mov	$ai,%rax
1202	 mov	$A0[1],16($tptr,$j)	# t[6]
1203	mov	%rdx,$A1[1]
1204	adc	\$0,$A1[1]
1205	 lea	32($j),$j
1206
1207	mul	$a0			# a[7]*a[4]
1208	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
1209	 mov	$ai,%rax
1210	mov	%rdx,$A0[1]
1211	adc	\$0,$A0[1]
1212	add	$A1[0],$A0[0]
1213	adc	\$0,$A0[1]
1214	mov	$A0[0],-8($tptr,$j)	# t[7]
1215
1216	cmp	\$0,$j
1217	jne	.Lsqr4x_1st
1218
1219	mul	$a1			# a[7]*a[5]
1220	add	%rax,$A1[1]
1221	lea	16($i),$i
1222	adc	\$0,%rdx
1223	add	$A0[1],$A1[1]
1224	adc	\$0,%rdx
1225
1226	mov	$A1[1],($tptr)		# t[8]
1227	mov	%rdx,$A1[0]
1228	mov	%rdx,8($tptr)		# t[9]
1229	jmp	.Lsqr4x_outer
1230
1231.align	32
1232.Lsqr4x_outer:				# comments apply to $num==6 case
1233	mov	-32($aptr,$i),$a0	# a[0]
1234	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1235	mov	-24($aptr,$i),%rax	# a[1]
1236	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1237	mov	-16($aptr,$i),$ai	# a[2]
1238	mov	%rax,$a1
1239
1240	mul	$a0			# a[1]*a[0]
1241	mov	-24($tptr,$i),$A0[0]	# t[1]
1242	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
1243	 mov	$ai,%rax		# a[2]
1244	adc	\$0,%rdx
1245	mov	$A0[0],-24($tptr,$i)	# t[1]
1246	mov	%rdx,$A0[1]
1247
1248	mul	$a0			# a[2]*a[0]
1249	add	%rax,$A0[1]
1250	 mov	$ai,%rax
1251	adc	\$0,%rdx
1252	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
1253	mov	%rdx,$A0[0]
1254	adc	\$0,$A0[0]
1255	mov	$A0[1],-16($tptr,$i)	# t[2]
1256
1257	xor	$A1[0],$A1[0]
1258
1259	 mov	-8($aptr,$i),$ai	# a[3]
1260	mul	$a1			# a[2]*a[1]
1261	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
1262	 mov	$ai,%rax
1263	adc	\$0,%rdx
1264	add	-8($tptr,$i),$A1[0]
1265	mov	%rdx,$A1[1]
1266	adc	\$0,$A1[1]
1267
1268	mul	$a0			# a[3]*a[0]
1269	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1270	 mov	$ai,%rax
1271	adc	\$0,%rdx
1272	add	$A1[0],$A0[0]
1273	mov	%rdx,$A0[1]
1274	adc	\$0,$A0[1]
1275	mov	$A0[0],-8($tptr,$i)	# t[3]
1276
1277	lea	($i),$j
1278	jmp	.Lsqr4x_inner
1279
1280.align	32
1281.Lsqr4x_inner:
1282	 mov	($aptr,$j),$ai		# a[4]
1283	mul	$a1			# a[3]*a[1]
1284	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1285	 mov	$ai,%rax
1286	mov	%rdx,$A1[0]
1287	adc	\$0,$A1[0]
1288	add	($tptr,$j),$A1[1]
1289	adc	\$0,$A1[0]
1290
1291	.byte	0x67
1292	mul	$a0			# a[4]*a[0]
1293	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1294	 mov	$ai,%rax		# a[3]
1295	 mov	8($aptr,$j),$ai		# a[5]
1296	mov	%rdx,$A0[0]
1297	adc	\$0,$A0[0]
1298	add	$A1[1],$A0[1]
1299	adc	\$0,$A0[0]
1300
1301	mul	$a1			# a[4]*a[3]
1302	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1303	mov	$A0[1],($tptr,$j)	# t[4]
1304	 mov	$ai,%rax
1305	mov	%rdx,$A1[1]
1306	adc	\$0,$A1[1]
1307	add	8($tptr,$j),$A1[0]
1308	lea	16($j),$j		# j++
1309	adc	\$0,$A1[1]
1310
1311	mul	$a0			# a[5]*a[2]
1312	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1313	 mov	$ai,%rax
1314	adc	\$0,%rdx
1315	add	$A1[0],$A0[0]
1316	mov	%rdx,$A0[1]
1317	adc	\$0,$A0[1]
1318	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
1319
1320	cmp	\$0,$j
1321	jne	.Lsqr4x_inner
1322
1323	.byte	0x67
1324	mul	$a1			# a[5]*a[3]
1325	add	%rax,$A1[1]
1326	adc	\$0,%rdx
1327	add	$A0[1],$A1[1]
1328	adc	\$0,%rdx
1329
1330	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
1331	mov	%rdx,$A1[0]
1332	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
1333
1334	add	\$16,$i
1335	jnz	.Lsqr4x_outer
1336
1337					# comments apply to $num==4 case
1338	mov	-32($aptr),$a0		# a[0]
1339	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1340	mov	-24($aptr),%rax		# a[1]
1341	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1342	mov	-16($aptr),$ai		# a[2]
1343	mov	%rax,$a1
1344
1345	mul	$a0			# a[1]*a[0]
1346	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
1347	 mov	$ai,%rax		# a[2]
1348	mov	%rdx,$A0[1]
1349	adc	\$0,$A0[1]
1350
1351	mul	$a0			# a[2]*a[0]
1352	add	%rax,$A0[1]
1353	 mov	$ai,%rax
1354	 mov	$A0[0],-24($tptr)	# t[1]
1355	mov	%rdx,$A0[0]
1356	adc	\$0,$A0[0]
1357	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1358	 mov	-8($aptr),$ai		# a[3]
1359	adc	\$0,$A0[0]
1360
1361	mul	$a1			# a[2]*a[1]
1362	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1363	 mov	$ai,%rax
1364	 mov	$A0[1],-16($tptr)	# t[2]
1365	mov	%rdx,$A1[1]
1366	adc	\$0,$A1[1]
1367
1368	mul	$a0			# a[3]*a[0]
1369	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1370	 mov	$ai,%rax
1371	mov	%rdx,$A0[1]
1372	adc	\$0,$A0[1]
1373	add	$A1[0],$A0[0]
1374	adc	\$0,$A0[1]
1375	mov	$A0[0],-8($tptr)	# t[3]
1376
1377	mul	$a1			# a[3]*a[1]
1378	add	%rax,$A1[1]
1379	 mov	-16($aptr),%rax		# a[2]
1380	adc	\$0,%rdx
1381	add	$A0[1],$A1[1]
1382	adc	\$0,%rdx
1383
1384	mov	$A1[1],($tptr)		# t[4]
1385	mov	%rdx,$A1[0]
1386	mov	%rdx,8($tptr)		# t[5]
1387
1388	mul	$ai			# a[2]*a[3]
1389___
1390{
1391my ($shift,$carry)=($a0,$a1);
1392my @S=(@A1,$ai,$n0);
1393$code.=<<___;
1394	 add	\$16,$i
1395	 xor	$shift,$shift
1396	 sub	$num,$i			# $i=16-$num
1397	 xor	$carry,$carry
1398
1399	add	$A1[0],%rax		# t[5]
1400	adc	\$0,%rdx
1401	mov	%rax,8($tptr)		# t[5]
1402	mov	%rdx,16($tptr)		# t[6]
1403	mov	$carry,24($tptr)	# t[7]
1404
1405	 mov	-16($aptr,$i),%rax	# a[0]
1406	lea	48+8(%rsp),$tptr
1407	 xor	$A0[0],$A0[0]		# t[0]
1408	 mov	8($tptr),$A0[1]		# t[1]
1409
1410	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1411	shr	\$63,$A0[0]
1412	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1413	shr	\$63,$A0[1]
1414	or	$A0[0],$S[1]		# | t[2*i]>>63
1415	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1416	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1417	mul	%rax			# a[i]*a[i]
1418	neg	$carry			# mov $carry,cf
1419	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1420	adc	%rax,$S[0]
1421	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1422	mov	$S[0],($tptr)
1423	adc	%rdx,$S[1]
1424
1425	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1426	 mov	$S[1],8($tptr)
1427	 sbb	$carry,$carry		# mov cf,$carry
1428	shr	\$63,$A0[0]
1429	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1430	shr	\$63,$A0[1]
1431	or	$A0[0],$S[3]		# | t[2*i]>>63
1432	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1433	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1434	mul	%rax			# a[i]*a[i]
1435	neg	$carry			# mov $carry,cf
1436	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1437	adc	%rax,$S[2]
1438	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1439	mov	$S[2],16($tptr)
1440	adc	%rdx,$S[3]
1441	lea	16($i),$i
1442	mov	$S[3],24($tptr)
1443	sbb	$carry,$carry		# mov cf,$carry
1444	lea	64($tptr),$tptr
1445	jmp	.Lsqr4x_shift_n_add
1446
1447.align	32
1448.Lsqr4x_shift_n_add:
1449	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1450	shr	\$63,$A0[0]
1451	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1452	shr	\$63,$A0[1]
1453	or	$A0[0],$S[1]		# | t[2*i]>>63
1454	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1455	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1456	mul	%rax			# a[i]*a[i]
1457	neg	$carry			# mov $carry,cf
1458	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1459	adc	%rax,$S[0]
1460	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1461	mov	$S[0],-32($tptr)
1462	adc	%rdx,$S[1]
1463
1464	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1465	 mov	$S[1],-24($tptr)
1466	 sbb	$carry,$carry		# mov cf,$carry
1467	shr	\$63,$A0[0]
1468	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1469	shr	\$63,$A0[1]
1470	or	$A0[0],$S[3]		# | t[2*i]>>63
1471	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
1472	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1473	mul	%rax			# a[i]*a[i]
1474	neg	$carry			# mov $carry,cf
1475	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
1476	adc	%rax,$S[2]
1477	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1478	mov	$S[2],-16($tptr)
1479	adc	%rdx,$S[3]
1480
1481	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1482	 mov	$S[3],-8($tptr)
1483	 sbb	$carry,$carry		# mov cf,$carry
1484	shr	\$63,$A0[0]
1485	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1486	shr	\$63,$A0[1]
1487	or	$A0[0],$S[1]		# | t[2*i]>>63
1488	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1489	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1490	mul	%rax			# a[i]*a[i]
1491	neg	$carry			# mov $carry,cf
1492	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1493	adc	%rax,$S[0]
1494	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1495	mov	$S[0],0($tptr)
1496	adc	%rdx,$S[1]
1497
1498	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1499	 mov	$S[1],8($tptr)
1500	 sbb	$carry,$carry		# mov cf,$carry
1501	shr	\$63,$A0[0]
1502	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1503	shr	\$63,$A0[1]
1504	or	$A0[0],$S[3]		# | t[2*i]>>63
1505	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1506	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1507	mul	%rax			# a[i]*a[i]
1508	neg	$carry			# mov $carry,cf
1509	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1510	adc	%rax,$S[2]
1511	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1512	mov	$S[2],16($tptr)
1513	adc	%rdx,$S[3]
1514	mov	$S[3],24($tptr)
1515	sbb	$carry,$carry		# mov cf,$carry
1516	lea	64($tptr),$tptr
1517	add	\$32,$i
1518	jnz	.Lsqr4x_shift_n_add
1519
1520	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1521	.byte	0x67
1522	shr	\$63,$A0[0]
1523	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1524	shr	\$63,$A0[1]
1525	or	$A0[0],$S[1]		# | t[2*i]>>63
1526	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1527	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1528	mul	%rax			# a[i]*a[i]
1529	neg	$carry			# mov $carry,cf
1530	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1531	adc	%rax,$S[0]
1532	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1533	mov	$S[0],-32($tptr)
1534	adc	%rdx,$S[1]
1535
1536	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1537	 mov	$S[1],-24($tptr)
1538	 sbb	$carry,$carry		# mov cf,$carry
1539	shr	\$63,$A0[0]
1540	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1541	shr	\$63,$A0[1]
1542	or	$A0[0],$S[3]		# | t[2*i]>>63
1543	mul	%rax			# a[i]*a[i]
1544	neg	$carry			# mov $carry,cf
1545	adc	%rax,$S[2]
1546	adc	%rdx,$S[3]
1547	mov	$S[2],-16($tptr)
1548	mov	$S[3],-8($tptr)
1549___
1550}
1551######################################################################
1552# Montgomery reduction part, "word-by-word" algorithm.
1553#
1554# This new path is inspired by multiple submissions from Intel, by
1555# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1556# Vinodh Gopal...
1557{
1558my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1559
1560$code.=<<___;
1561	movq	%xmm2,$nptr
1562sqr8x_reduction:
1563	xor	%rax,%rax
1564	lea	($nptr,$num,2),%rcx	# end of n[]
1565	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
1566	mov	%rcx,0+8(%rsp)
1567	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
1568	mov	%rdx,8+8(%rsp)
1569	neg	$num
1570	jmp	.L8x_reduction_loop
1571
1572.align	32
1573.L8x_reduction_loop:
1574	lea	($tptr,$num),$tptr	# start of current t[] window
1575	.byte	0x66
1576	mov	8*0($tptr),$m0
1577	mov	8*1($tptr),%r9
1578	mov	8*2($tptr),%r10
1579	mov	8*3($tptr),%r11
1580	mov	8*4($tptr),%r12
1581	mov	8*5($tptr),%r13
1582	mov	8*6($tptr),%r14
1583	mov	8*7($tptr),%r15
1584	mov	%rax,(%rdx)		# store top-most carry bit
1585	lea	8*8($tptr),$tptr
1586
1587	.byte	0x67
1588	mov	$m0,%r8
1589	imulq	32+8(%rsp),$m0		# n0*a[0]
1590	mov	16*0($nptr),%rax	# n[0]
1591	mov	\$8,%ecx
1592	jmp	.L8x_reduce
1593
1594.align	32
1595.L8x_reduce:
1596	mulq	$m0
1597	 mov	16*1($nptr),%rax	# n[1]
1598	neg	%r8
1599	mov	%rdx,%r8
1600	adc	\$0,%r8
1601
1602	mulq	$m0
1603	add	%rax,%r9
1604	 mov	16*2($nptr),%rax
1605	adc	\$0,%rdx
1606	add	%r9,%r8
1607	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
1608	mov	%rdx,%r9
1609	adc	\$0,%r9
1610
1611	mulq	$m0
1612	add	%rax,%r10
1613	 mov	16*3($nptr),%rax
1614	adc	\$0,%rdx
1615	add	%r10,%r9
1616	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
1617	mov	%rdx,%r10
1618	adc	\$0,%r10
1619
1620	mulq	$m0
1621	add	%rax,%r11
1622	 mov	16*4($nptr),%rax
1623	adc	\$0,%rdx
1624	 imulq	%r8,$carry		# modulo-scheduled
1625	add	%r11,%r10
1626	mov	%rdx,%r11
1627	adc	\$0,%r11
1628
1629	mulq	$m0
1630	add	%rax,%r12
1631	 mov	16*5($nptr),%rax
1632	adc	\$0,%rdx
1633	add	%r12,%r11
1634	mov	%rdx,%r12
1635	adc	\$0,%r12
1636
1637	mulq	$m0
1638	add	%rax,%r13
1639	 mov	16*6($nptr),%rax
1640	adc	\$0,%rdx
1641	add	%r13,%r12
1642	mov	%rdx,%r13
1643	adc	\$0,%r13
1644
1645	mulq	$m0
1646	add	%rax,%r14
1647	 mov	16*7($nptr),%rax
1648	adc	\$0,%rdx
1649	add	%r14,%r13
1650	mov	%rdx,%r14
1651	adc	\$0,%r14
1652
1653	mulq	$m0
1654	 mov	$carry,$m0		# n0*a[i]
1655	add	%rax,%r15
1656	 mov	16*0($nptr),%rax	# n[0]
1657	adc	\$0,%rdx
1658	add	%r15,%r14
1659	mov	%rdx,%r15
1660	adc	\$0,%r15
1661
1662	dec	%ecx
1663	jnz	.L8x_reduce
1664
1665	lea	16*8($nptr),$nptr
1666	xor	%rax,%rax
1667	mov	8+8(%rsp),%rdx		# pull end of t[]
1668	cmp	0+8(%rsp),$nptr		# end of n[]?
1669	jae	.L8x_no_tail
1670
1671	.byte	0x66
1672	add	8*0($tptr),%r8
1673	adc	8*1($tptr),%r9
1674	adc	8*2($tptr),%r10
1675	adc	8*3($tptr),%r11
1676	adc	8*4($tptr),%r12
1677	adc	8*5($tptr),%r13
1678	adc	8*6($tptr),%r14
1679	adc	8*7($tptr),%r15
1680	sbb	$carry,$carry		# top carry
1681
1682	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1683	mov	\$8,%ecx
1684	mov	16*0($nptr),%rax
1685	jmp	.L8x_tail
1686
1687.align	32
1688.L8x_tail:
1689	mulq	$m0
1690	add	%rax,%r8
1691	 mov	16*1($nptr),%rax
1692	 mov	%r8,($tptr)		# save result
1693	mov	%rdx,%r8
1694	adc	\$0,%r8
1695
1696	mulq	$m0
1697	add	%rax,%r9
1698	 mov	16*2($nptr),%rax
1699	adc	\$0,%rdx
1700	add	%r9,%r8
1701	 lea	8($tptr),$tptr		# $tptr++
1702	mov	%rdx,%r9
1703	adc	\$0,%r9
1704
1705	mulq	$m0
1706	add	%rax,%r10
1707	 mov	16*3($nptr),%rax
1708	adc	\$0,%rdx
1709	add	%r10,%r9
1710	mov	%rdx,%r10
1711	adc	\$0,%r10
1712
1713	mulq	$m0
1714	add	%rax,%r11
1715	 mov	16*4($nptr),%rax
1716	adc	\$0,%rdx
1717	add	%r11,%r10
1718	mov	%rdx,%r11
1719	adc	\$0,%r11
1720
1721	mulq	$m0
1722	add	%rax,%r12
1723	 mov	16*5($nptr),%rax
1724	adc	\$0,%rdx
1725	add	%r12,%r11
1726	mov	%rdx,%r12
1727	adc	\$0,%r12
1728
1729	mulq	$m0
1730	add	%rax,%r13
1731	 mov	16*6($nptr),%rax
1732	adc	\$0,%rdx
1733	add	%r13,%r12
1734	mov	%rdx,%r13
1735	adc	\$0,%r13
1736
1737	mulq	$m0
1738	add	%rax,%r14
1739	 mov	16*7($nptr),%rax
1740	adc	\$0,%rdx
1741	add	%r14,%r13
1742	mov	%rdx,%r14
1743	adc	\$0,%r14
1744
1745	mulq	$m0
1746	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
1747	add	%rax,%r15
1748	adc	\$0,%rdx
1749	add	%r15,%r14
1750	 mov	16*0($nptr),%rax	# pull n[0]
1751	mov	%rdx,%r15
1752	adc	\$0,%r15
1753
1754	dec	%ecx
1755	jnz	.L8x_tail
1756
1757	lea	16*8($nptr),$nptr
1758	mov	8+8(%rsp),%rdx		# pull end of t[]
1759	cmp	0+8(%rsp),$nptr		# end of n[]?
1760	jae	.L8x_tail_done		# break out of loop
1761
1762	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1763	neg	$carry
1764	 mov	8*0($nptr),%rax		# pull n[0]
1765	adc	8*0($tptr),%r8
1766	adc	8*1($tptr),%r9
1767	adc	8*2($tptr),%r10
1768	adc	8*3($tptr),%r11
1769	adc	8*4($tptr),%r12
1770	adc	8*5($tptr),%r13
1771	adc	8*6($tptr),%r14
1772	adc	8*7($tptr),%r15
1773	sbb	$carry,$carry		# top carry
1774
1775	mov	\$8,%ecx
1776	jmp	.L8x_tail
1777
1778.align	32
1779.L8x_tail_done:
1780	add	(%rdx),%r8		# can this overflow?
1781	xor	%rax,%rax
1782
1783	neg	$carry
1784.L8x_no_tail:
1785	adc	8*0($tptr),%r8
1786	adc	8*1($tptr),%r9
1787	adc	8*2($tptr),%r10
1788	adc	8*3($tptr),%r11
1789	adc	8*4($tptr),%r12
1790	adc	8*5($tptr),%r13
1791	adc	8*6($tptr),%r14
1792	adc	8*7($tptr),%r15
1793	adc	\$0,%rax		# top-most carry
1794	 mov	-16($nptr),%rcx		# np[num-1]
1795	 xor	$carry,$carry
1796
1797	movq	%xmm2,$nptr		# restore $nptr
1798
1799	mov	%r8,8*0($tptr)		# store top 512 bits
1800	mov	%r9,8*1($tptr)
1801	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
1802	mov	%r10,8*2($tptr)
1803	mov	%r11,8*3($tptr)
1804	mov	%r12,8*4($tptr)
1805	mov	%r13,8*5($tptr)
1806	mov	%r14,8*6($tptr)
1807	mov	%r15,8*7($tptr)
1808	lea	8*8($tptr),$tptr
1809
1810	cmp	%rdx,$tptr		# end of t[]?
1811	jb	.L8x_reduction_loop
1812___
1813}
1814##############################################################
1815# Post-condition, 4x unrolled
1816#
1817{
1818my ($tptr,$nptr)=("%rbx","%rbp");
1819$code.=<<___;
1820	#xor	%rsi,%rsi		# %rsi was $carry above
1821	sub	%r15,%rcx		# compare top-most words
1822	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
1823	adc	%rsi,%rsi
1824	mov	$num,%rcx
1825	or	%rsi,%rax
1826	movq	%xmm1,$rptr		# restore $rptr
1827	xor	\$1,%rax
1828	movq	%xmm1,$aptr		# prepare for back-to-back call
1829	lea	($nptr,%rax,8),$nptr
1830	sar	\$3+2,%rcx		# cf=0
1831	jmp	.Lsqr4x_sub
1832
1833.align	32
1834.Lsqr4x_sub:
1835	.byte	0x66
1836	mov	8*0($tptr),%r12
1837	mov	8*1($tptr),%r13
1838	sbb	16*0($nptr),%r12
1839	mov	8*2($tptr),%r14
1840	sbb	16*1($nptr),%r13
1841	mov	8*3($tptr),%r15
1842	lea	8*4($tptr),$tptr
1843	sbb	16*2($nptr),%r14
1844	mov	%r12,8*0($rptr)
1845	sbb	16*3($nptr),%r15
1846	lea	16*4($nptr),$nptr
1847	mov	%r13,8*1($rptr)
1848	mov	%r14,8*2($rptr)
1849	mov	%r15,8*3($rptr)
1850	lea	8*4($rptr),$rptr
1851
1852	inc	%rcx			# pass %cf
1853	jnz	.Lsqr4x_sub
1854___
1855}
1856$code.=<<___;
1857	mov	$num,%r10		# prepare for back-to-back call
1858	neg	$num			# restore $num
1859	ret
1860.size	bn_sqr8x_internal,.-bn_sqr8x_internal
1861___
1862{
1863$code.=<<___;
1864.globl	bn_from_montgomery
1865.type	bn_from_montgomery,\@abi-omnipotent
1866.align	32
1867bn_from_montgomery:
1868	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
1869	jz	bn_from_mont8x
1870	xor	%eax,%eax
1871	ret
1872.size	bn_from_montgomery,.-bn_from_montgomery
1873
1874.type	bn_from_mont8x,\@function,6
1875.align	32
1876bn_from_mont8x:
1877	.byte	0x67
1878	mov	%rsp,%rax
1879	push	%rbx
1880	push	%rbp
1881	push	%r12
1882	push	%r13
1883	push	%r14
1884	push	%r15
1885___
1886$code.=<<___ if ($win64);
1887	lea	-0x28(%rsp),%rsp
1888	movaps	%xmm6,(%rsp)
1889	movaps	%xmm7,0x10(%rsp)
1890___
1891$code.=<<___;
1892	.byte	0x67
1893	mov	${num}d,%r10d
1894	shl	\$3,${num}d		# convert $num to bytes
1895	shl	\$3+2,%r10d		# 4*$num
1896	neg	$num
1897	mov	($n0),$n0		# *n0
1898
1899	##############################################################
1900	# ensure that stack frame doesn't alias with $aptr+4*$num
1901	# modulo 4096, which covers ret[num], am[num] and n[2*num]
1902	# (see bn_exp.c). this is done to allow memory disambiguation
1903	# logic do its magic.
1904	#
1905	lea	-64(%rsp,$num,2),%r11
1906	sub	$aptr,%r11
1907	and	\$4095,%r11
1908	cmp	%r11,%r10
1909	jb	.Lfrom_sp_alt
1910	sub	%r11,%rsp		# align with $aptr
1911	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
1912	jmp	.Lfrom_sp_done
1913
1914.align	32
1915.Lfrom_sp_alt:
1916	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
1917	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
1918	sub	%r10,%r11
1919	mov	\$0,%r10
1920	cmovc	%r10,%r11
1921	sub	%r11,%rsp
1922.Lfrom_sp_done:
1923	and	\$-64,%rsp
1924	mov	$num,%r10
1925	neg	$num
1926
1927	##############################################################
1928	# Stack layout
1929	#
1930	# +0	saved $num, used in reduction section
1931	# +8	&t[2*$num], used in reduction section
1932	# +32	saved *n0
1933	# +40	saved %rsp
1934	# +48	t[2*$num]
1935	#
1936	mov	$n0,  32(%rsp)
1937	mov	%rax, 40(%rsp)		# save original %rsp
1938.Lfrom_body:
1939	mov	$num,%r11
1940	lea	48(%rsp),%rax
1941	pxor	%xmm0,%xmm0
1942	jmp	.Lmul_by_1
1943
1944.align	32
1945.Lmul_by_1:
1946	movdqu	($aptr),%xmm1
1947	movdqu	16($aptr),%xmm2
1948	movdqu	32($aptr),%xmm3
1949	movdqa	%xmm0,(%rax,$num)
1950	movdqu	48($aptr),%xmm4
1951	movdqa	%xmm0,16(%rax,$num)
1952	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
1953	movdqa	%xmm1,(%rax)
1954	movdqa	%xmm0,32(%rax,$num)
1955	movdqa	%xmm2,16(%rax)
1956	movdqa	%xmm0,48(%rax,$num)
1957	movdqa	%xmm3,32(%rax)
1958	movdqa	%xmm4,48(%rax)
1959	lea	64(%rax),%rax
1960	sub	\$64,%r11
1961	jnz	.Lmul_by_1
1962
1963	movq	$rptr,%xmm1
1964	movq	$nptr,%xmm2
1965	.byte	0x67
1966	mov	$nptr,%rbp
1967	movq	%r10, %xmm3		# -num
1968___
1969$code.=<<___ if ($addx);
1970	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
1971	and	\$0x80100,%r11d
1972	cmp	\$0x80100,%r11d
1973	jne	.Lfrom_mont_nox
1974
1975	lea	(%rax,$num),$rptr
1976	call	sqrx8x_reduction
1977
1978	pxor	%xmm0,%xmm0
1979	lea	48(%rsp),%rax
1980	mov	40(%rsp),%rsi		# restore %rsp
1981	jmp	.Lfrom_mont_zero
1982
1983.align	32
1984.Lfrom_mont_nox:
1985___
1986$code.=<<___;
1987	call	sqr8x_reduction
1988
1989	pxor	%xmm0,%xmm0
1990	lea	48(%rsp),%rax
1991	mov	40(%rsp),%rsi		# restore %rsp
1992	jmp	.Lfrom_mont_zero
1993
1994.align	32
1995.Lfrom_mont_zero:
1996	movdqa	%xmm0,16*0(%rax)
1997	movdqa	%xmm0,16*1(%rax)
1998	movdqa	%xmm0,16*2(%rax)
1999	movdqa	%xmm0,16*3(%rax)
2000	lea	16*4(%rax),%rax
2001	sub	\$32,$num
2002	jnz	.Lfrom_mont_zero
2003
2004	mov	\$1,%rax
2005	mov	-48(%rsi),%r15
2006	mov	-40(%rsi),%r14
2007	mov	-32(%rsi),%r13
2008	mov	-24(%rsi),%r12
2009	mov	-16(%rsi),%rbp
2010	mov	-8(%rsi),%rbx
2011	lea	(%rsi),%rsp
2012.Lfrom_epilogue:
2013	ret
2014.size	bn_from_mont8x,.-bn_from_mont8x
2015___
2016}
2017}}}
2018
2019if ($addx) {{{
2020my $bp="%rdx";	# restore original value
2021
2022$code.=<<___;
2023.type	bn_mulx4x_mont_gather5,\@function,6
2024.align	32
2025bn_mulx4x_mont_gather5:
2026.Lmulx4x_enter:
2027	.byte	0x67
2028	mov	%rsp,%rax
2029	push	%rbx
2030	push	%rbp
2031	push	%r12
2032	push	%r13
2033	push	%r14
2034	push	%r15
2035___
2036$code.=<<___ if ($win64);
2037	lea	-0x28(%rsp),%rsp
2038	movaps	%xmm6,(%rsp)
2039	movaps	%xmm7,0x10(%rsp)
2040___
2041$code.=<<___;
2042	.byte	0x67
2043	mov	${num}d,%r10d
2044	shl	\$3,${num}d		# convert $num to bytes
2045	shl	\$3+2,%r10d		# 4*$num
2046	neg	$num			# -$num
2047	mov	($n0),$n0		# *n0
2048
2049	##############################################################
2050	# ensure that stack frame doesn't alias with $aptr+4*$num
2051	# modulo 4096, which covers a[num], ret[num] and n[2*num]
2052	# (see bn_exp.c). this is done to allow memory disambiguation
2053	# logic do its magic. [excessive frame is allocated in order
2054	# to allow bn_from_mont8x to clear it.]
2055	#
2056	lea	-64(%rsp,$num,2),%r11
2057	sub	$ap,%r11
2058	and	\$4095,%r11
2059	cmp	%r11,%r10
2060	jb	.Lmulx4xsp_alt
2061	sub	%r11,%rsp		# align with $aptr
2062	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
2063	jmp	.Lmulx4xsp_done
2064
2065.align	32
2066.Lmulx4xsp_alt:
2067	lea	4096-64(,$num,2),%r10	# 4096-frame-$num
2068	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
2069	sub	%r10,%r11
2070	mov	\$0,%r10
2071	cmovc	%r10,%r11
2072	sub	%r11,%rsp
2073.Lmulx4xsp_done:
2074	and	\$-64,%rsp		# ensure alignment
2075	##############################################################
2076	# Stack layout
2077	# +0	-num
2078	# +8	off-loaded &b[i]
2079	# +16	end of b[num]
2080	# +24	inner counter
2081	# +32	saved n0
2082	# +40	saved %rsp
2083	# +48
2084	# +56	saved rp
2085	# +64	tmp[num+1]
2086	#
2087	mov	$n0, 32(%rsp)		# save *n0
2088	mov	%rax,40(%rsp)		# save original %rsp
2089.Lmulx4x_body:
2090	call	mulx4x_internal
2091
2092	mov	40(%rsp),%rsi		# restore %rsp
2093	mov	\$1,%rax
2094___
2095$code.=<<___ if ($win64);
2096	movaps	-88(%rsi),%xmm6
2097	movaps	-72(%rsi),%xmm7
2098___
2099$code.=<<___;
2100	mov	-48(%rsi),%r15
2101	mov	-40(%rsi),%r14
2102	mov	-32(%rsi),%r13
2103	mov	-24(%rsi),%r12
2104	mov	-16(%rsi),%rbp
2105	mov	-8(%rsi),%rbx
2106	lea	(%rsi),%rsp
2107.Lmulx4x_epilogue:
2108	ret
2109.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2110
2111.type	mulx4x_internal,\@abi-omnipotent
2112.align	32
2113mulx4x_internal:
2114	.byte	0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00	# mov	$num,8(%rsp)		# save -$num
2115	.byte	0x67
2116	neg	$num			# restore $num
2117	shl	\$5,$num
2118	lea	256($bp,$num),%r13
2119	shr	\$5+5,$num
2120	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
2121	sub	\$1,$num
2122	mov	%r13,16+8(%rsp)		# end of b[num]
2123	mov	$num,24+8(%rsp)		# inner counter
2124	mov	$rp, 56+8(%rsp)		# save $rp
2125___
2126my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
2127   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2128my $rptr=$bptr;
2129my $STRIDE=2**5*8;		# 5 is "window size"
2130my $N=$STRIDE/4;		# should match cache line size
2131$code.=<<___;
2132	mov	%r10,%r11
2133	shr	\$`log($N/8)/log(2)`,%r10
2134	and	\$`$N/8-1`,%r11
2135	not	%r10
2136	lea	.Lmagic_masks(%rip),%rax
2137	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
2138	lea	96($bp,%r11,8),$bptr	# pointer within 1st cache line
2139	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
2140	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
2141	add	\$7,%r11
2142	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
2143	movq	24(%rax,%r10,8),%xmm7
2144	and	\$7,%r11
2145
2146	movq	`0*$STRIDE/4-96`($bptr),%xmm0
2147	lea	$STRIDE($bptr),$tptr	# borrow $tptr
2148	movq	`1*$STRIDE/4-96`($bptr),%xmm1
2149	pand	%xmm4,%xmm0
2150	movq	`2*$STRIDE/4-96`($bptr),%xmm2
2151	pand	%xmm5,%xmm1
2152	movq	`3*$STRIDE/4-96`($bptr),%xmm3
2153	pand	%xmm6,%xmm2
2154	por	%xmm1,%xmm0
2155	movq	`0*$STRIDE/4-96`($tptr),%xmm1
2156	pand	%xmm7,%xmm3
2157	por	%xmm2,%xmm0
2158	movq	`1*$STRIDE/4-96`($tptr),%xmm2
2159	por	%xmm3,%xmm0
2160	.byte	0x67,0x67
2161	pand	%xmm4,%xmm1
2162	movq	`2*$STRIDE/4-96`($tptr),%xmm3
2163
2164	movq	%xmm0,%rdx		# bp[0]
2165	movq	`3*$STRIDE/4-96`($tptr),%xmm0
2166	lea	2*$STRIDE($bptr),$bptr	# next &b[i]
2167	pand	%xmm5,%xmm2
2168	.byte	0x67,0x67
2169	pand	%xmm6,%xmm3
2170	##############################################################
2171	# $tptr is chosen so that writing to top-most element of the
2172	# vector occurs just "above" references to powers table,
2173	# "above" modulo cache-line size, which effectively precludes
2174	# possibility of memory disambiguation logic failure when
2175	# accessing the table.
2176	#
2177	lea	64+8*4+8(%rsp,%r11,8),$tptr
2178
2179	mov	%rdx,$bi
2180	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
2181	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
2182	add	%rax,%r11
2183	mulx	2*8($aptr),%rax,%r13	# ...
2184	adc	%rax,%r12
2185	adc	\$0,%r13
2186	mulx	3*8($aptr),%rax,%r14
2187
2188	mov	$mi,%r15
2189	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2190	xor	$zero,$zero		# cf=0, of=0
2191	mov	$mi,%rdx
2192
2193	por	%xmm2,%xmm1
2194	pand	%xmm7,%xmm0
2195	por	%xmm3,%xmm1
2196	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2197	por	%xmm1,%xmm0
2198
2199	.byte	0x48,0x8d,0xb6,0x20,0x00,0x00,0x00	# lea	4*8($aptr),$aptr
2200	adcx	%rax,%r13
2201	adcx	$zero,%r14		# cf=0
2202
2203	mulx	0*16($nptr),%rax,%r10
2204	adcx	%rax,%r15		# discarded
2205	adox	%r11,%r10
2206	mulx	1*16($nptr),%rax,%r11
2207	adcx	%rax,%r10
2208	adox	%r12,%r11
2209	mulx	2*16($nptr),%rax,%r12
2210	mov	24+8(%rsp),$bptr	# counter value
2211	.byte	0x66
2212	mov	%r10,-8*4($tptr)
2213	adcx	%rax,%r11
2214	adox	%r13,%r12
2215	mulx	3*16($nptr),%rax,%r15
2216	 .byte	0x67,0x67
2217	 mov	$bi,%rdx
2218	mov	%r11,-8*3($tptr)
2219	adcx	%rax,%r12
2220	adox	$zero,%r15		# of=0
2221	.byte	0x48,0x8d,0x89,0x40,0x00,0x00,0x00	# lea	4*16($nptr),$nptr
2222	mov	%r12,-8*2($tptr)
2223	#jmp	.Lmulx4x_1st
2224
2225.align	32
2226.Lmulx4x_1st:
2227	adcx	$zero,%r15		# cf=0, modulo-scheduled
2228	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
2229	adcx	%r14,%r10
2230	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
2231	adcx	%rax,%r11
2232	mulx	2*8($aptr),%r12,%rax	# ...
2233	adcx	%r14,%r12
2234	mulx	3*8($aptr),%r13,%r14
2235	 .byte	0x67,0x67
2236	 mov	$mi,%rdx
2237	adcx	%rax,%r13
2238	adcx	$zero,%r14		# cf=0
2239	lea	4*8($aptr),$aptr
2240	lea	4*8($tptr),$tptr
2241
2242	adox	%r15,%r10
2243	mulx	0*16($nptr),%rax,%r15
2244	adcx	%rax,%r10
2245	adox	%r15,%r11
2246	mulx	1*16($nptr),%rax,%r15
2247	adcx	%rax,%r11
2248	adox	%r15,%r12
2249	mulx	2*16($nptr),%rax,%r15
2250	mov	%r10,-5*8($tptr)
2251	adcx	%rax,%r12
2252	mov	%r11,-4*8($tptr)
2253	adox	%r15,%r13
2254	mulx	3*16($nptr),%rax,%r15
2255	 mov	$bi,%rdx
2256	mov	%r12,-3*8($tptr)
2257	adcx	%rax,%r13
2258	adox	$zero,%r15
2259	lea	4*16($nptr),$nptr
2260	mov	%r13,-2*8($tptr)
2261
2262	dec	$bptr			# of=0, pass cf
2263	jnz	.Lmulx4x_1st
2264
2265	mov	8(%rsp),$num		# load -num
2266	movq	%xmm0,%rdx		# bp[1]
2267	adc	$zero,%r15		# modulo-scheduled
2268	lea	($aptr,$num),$aptr	# rewind $aptr
2269	add	%r15,%r14
2270	mov	8+8(%rsp),$bptr		# re-load &b[i]
2271	adc	$zero,$zero		# top-most carry
2272	mov	%r14,-1*8($tptr)
2273	jmp	.Lmulx4x_outer
2274
2275.align	32
2276.Lmulx4x_outer:
2277	mov	$zero,($tptr)		# save top-most carry
2278	lea	4*8($tptr,$num),$tptr	# rewind $tptr
2279	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
2280	xor	$zero,$zero		# cf=0, of=0
2281	mov	%rdx,$bi
2282	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
2283	adox	-4*8($tptr),$mi		# +t[0]
2284	adcx	%r14,%r11
2285	mulx	2*8($aptr),%r15,%r13	# ...
2286	adox	-3*8($tptr),%r11
2287	adcx	%r15,%r12
2288	mulx	3*8($aptr),%rdx,%r14
2289	adox	-2*8($tptr),%r12
2290	adcx	%rdx,%r13
2291	lea	($nptr,$num,2),$nptr	# rewind $nptr
2292	lea	4*8($aptr),$aptr
2293	adox	-1*8($tptr),%r13
2294	adcx	$zero,%r14
2295	adox	$zero,%r14
2296
2297	.byte	0x67
2298	mov	$mi,%r15
2299	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2300
2301	movq	`0*$STRIDE/4-96`($bptr),%xmm0
2302	.byte	0x67,0x67
2303	mov	$mi,%rdx
2304	movq	`1*$STRIDE/4-96`($bptr),%xmm1
2305	.byte	0x67
2306	pand	%xmm4,%xmm0
2307	movq	`2*$STRIDE/4-96`($bptr),%xmm2
2308	.byte	0x67
2309	pand	%xmm5,%xmm1
2310	movq	`3*$STRIDE/4-96`($bptr),%xmm3
2311	add	\$$STRIDE,$bptr		# next &b[i]
2312	.byte	0x67
2313	pand	%xmm6,%xmm2
2314	por	%xmm1,%xmm0
2315	pand	%xmm7,%xmm3
2316	xor	$zero,$zero		# cf=0, of=0
2317	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2318
2319	mulx	0*16($nptr),%rax,%r10
2320	adcx	%rax,%r15		# discarded
2321	adox	%r11,%r10
2322	mulx	1*16($nptr),%rax,%r11
2323	adcx	%rax,%r10
2324	adox	%r12,%r11
2325	mulx	2*16($nptr),%rax,%r12
2326	adcx	%rax,%r11
2327	adox	%r13,%r12
2328	mulx	3*16($nptr),%rax,%r15
2329	 mov	$bi,%rdx
2330	 por	%xmm2,%xmm0
2331	mov	24+8(%rsp),$bptr	# counter value
2332	mov	%r10,-8*4($tptr)
2333	 por	%xmm3,%xmm0
2334	adcx	%rax,%r12
2335	mov	%r11,-8*3($tptr)
2336	adox	$zero,%r15		# of=0
2337	mov	%r12,-8*2($tptr)
2338	lea	4*16($nptr),$nptr
2339	jmp	.Lmulx4x_inner
2340
2341.align	32
2342.Lmulx4x_inner:
2343	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
2344	adcx	$zero,%r15		# cf=0, modulo-scheduled
2345	adox	%r14,%r10
2346	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
2347	adcx	0*8($tptr),%r10
2348	adox	%rax,%r11
2349	mulx	2*8($aptr),%r12,%rax	# ...
2350	adcx	1*8($tptr),%r11
2351	adox	%r14,%r12
2352	mulx	3*8($aptr),%r13,%r14
2353	 mov	$mi,%rdx
2354	adcx	2*8($tptr),%r12
2355	adox	%rax,%r13
2356	adcx	3*8($tptr),%r13
2357	adox	$zero,%r14		# of=0
2358	lea	4*8($aptr),$aptr
2359	lea	4*8($tptr),$tptr
2360	adcx	$zero,%r14		# cf=0
2361
2362	adox	%r15,%r10
2363	mulx	0*16($nptr),%rax,%r15
2364	adcx	%rax,%r10
2365	adox	%r15,%r11
2366	mulx	1*16($nptr),%rax,%r15
2367	adcx	%rax,%r11
2368	adox	%r15,%r12
2369	mulx	2*16($nptr),%rax,%r15
2370	mov	%r10,-5*8($tptr)
2371	adcx	%rax,%r12
2372	adox	%r15,%r13
2373	mov	%r11,-4*8($tptr)
2374	mulx	3*16($nptr),%rax,%r15
2375	 mov	$bi,%rdx
2376	lea	4*16($nptr),$nptr
2377	mov	%r12,-3*8($tptr)
2378	adcx	%rax,%r13
2379	adox	$zero,%r15
2380	mov	%r13,-2*8($tptr)
2381
2382	dec	$bptr			# of=0, pass cf
2383	jnz	.Lmulx4x_inner
2384
2385	mov	0+8(%rsp),$num		# load -num
2386	movq	%xmm0,%rdx		# bp[i+1]
2387	adc	$zero,%r15		# modulo-scheduled
2388	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
2389	mov	8+8(%rsp),$bptr		# re-load &b[i]
2390	mov	16+8(%rsp),%r10
2391	adc	%r15,%r14
2392	lea	($aptr,$num),$aptr	# rewind $aptr
2393	adc	$zero,$zero		# top-most carry
2394	mov	%r14,-1*8($tptr)
2395
2396	cmp	%r10,$bptr
2397	jb	.Lmulx4x_outer
2398
2399	mov	-16($nptr),%r10
2400	xor	%r15,%r15
2401	sub	%r14,%r10		# compare top-most words
2402	adc	%r15,%r15
2403	or	%r15,$zero
2404	xor	\$1,$zero
2405	lea	($tptr,$num),%rdi	# rewind $tptr
2406	lea	($nptr,$num,2),$nptr	# rewind $nptr
2407	.byte	0x67,0x67
2408	sar	\$3+2,$num		# cf=0
2409	lea	($nptr,$zero,8),%rbp
2410	mov	56+8(%rsp),%rdx		# restore rp
2411	mov	$num,%rcx
2412	jmp	.Lsqrx4x_sub		# common post-condition
2413.size	mulx4x_internal,.-mulx4x_internal
2414___
2415}{
2416######################################################################
2417# void bn_power5(
2418my $rptr="%rdi";	# BN_ULONG *rptr,
2419my $aptr="%rsi";	# const BN_ULONG *aptr,
2420my $bptr="%rdx";	# const void *table,
2421my $nptr="%rcx";	# const BN_ULONG *nptr,
2422my $n0  ="%r8";		# const BN_ULONG *n0);
2423my $num ="%r9";		# int num, has to be divisible by 8
2424			# int pwr);
2425
2426my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2427my @A0=("%r10","%r11");
2428my @A1=("%r12","%r13");
2429my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2430
2431$code.=<<___;
2432.type	bn_powerx5,\@function,6
2433.align	32
2434bn_powerx5:
2435.Lpowerx5_enter:
2436	.byte	0x67
2437	mov	%rsp,%rax
2438	push	%rbx
2439	push	%rbp
2440	push	%r12
2441	push	%r13
2442	push	%r14
2443	push	%r15
2444___
2445$code.=<<___ if ($win64);
2446	lea	-0x28(%rsp),%rsp
2447	movaps	%xmm6,(%rsp)
2448	movaps	%xmm7,0x10(%rsp)
2449___
2450$code.=<<___;
2451	.byte	0x67
2452	mov	${num}d,%r10d
2453	shl	\$3,${num}d		# convert $num to bytes
2454	shl	\$3+2,%r10d		# 4*$num
2455	neg	$num
2456	mov	($n0),$n0		# *n0
2457
2458	##############################################################
2459	# ensure that stack frame doesn't alias with $aptr+4*$num
2460	# modulo 4096, which covers ret[num], am[num] and n[2*num]
2461	# (see bn_exp.c). this is done to allow memory disambiguation
2462	# logic do its magic.
2463	#
2464	lea	-64(%rsp,$num,2),%r11
2465	sub	$aptr,%r11
2466	and	\$4095,%r11
2467	cmp	%r11,%r10
2468	jb	.Lpwrx_sp_alt
2469	sub	%r11,%rsp		# align with $aptr
2470	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
2471	jmp	.Lpwrx_sp_done
2472
2473.align	32
2474.Lpwrx_sp_alt:
2475	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
2476	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
2477	sub	%r10,%r11
2478	mov	\$0,%r10
2479	cmovc	%r10,%r11
2480	sub	%r11,%rsp
2481.Lpwrx_sp_done:
2482	and	\$-64,%rsp
2483	mov	$num,%r10
2484	neg	$num
2485
2486	##############################################################
2487	# Stack layout
2488	#
2489	# +0	saved $num, used in reduction section
2490	# +8	&t[2*$num], used in reduction section
2491	# +16	intermediate carry bit
2492	# +24	top-most carry bit, used in reduction section
2493	# +32	saved *n0
2494	# +40	saved %rsp
2495	# +48	t[2*$num]
2496	#
2497	pxor	%xmm0,%xmm0
2498	movq	$rptr,%xmm1		# save $rptr
2499	movq	$nptr,%xmm2		# save $nptr
2500	movq	%r10, %xmm3		# -$num
2501	movq	$bptr,%xmm4
2502	mov	$n0,  32(%rsp)
2503	mov	%rax, 40(%rsp)		# save original %rsp
2504.Lpowerx5_body:
2505
2506	call	__bn_sqrx8x_internal
2507	call	__bn_sqrx8x_internal
2508	call	__bn_sqrx8x_internal
2509	call	__bn_sqrx8x_internal
2510	call	__bn_sqrx8x_internal
2511
2512	mov	%r10,$num		# -num
2513	mov	$aptr,$rptr
2514	movq	%xmm2,$nptr
2515	movq	%xmm4,$bptr
2516	mov	40(%rsp),%rax
2517
2518	call	mulx4x_internal
2519
2520	mov	40(%rsp),%rsi		# restore %rsp
2521	mov	\$1,%rax
2522___
2523$code.=<<___ if ($win64);
2524	movaps	-88(%rsi),%xmm6
2525	movaps	-72(%rsi),%xmm7
2526___
2527$code.=<<___;
2528	mov	-48(%rsi),%r15
2529	mov	-40(%rsi),%r14
2530	mov	-32(%rsi),%r13
2531	mov	-24(%rsi),%r12
2532	mov	-16(%rsi),%rbp
2533	mov	-8(%rsi),%rbx
2534	lea	(%rsi),%rsp
2535.Lpowerx5_epilogue:
2536	ret
2537.size	bn_powerx5,.-bn_powerx5
2538
2539.globl	bn_sqrx8x_internal
2540.hidden	bn_sqrx8x_internal
2541.type	bn_sqrx8x_internal,\@abi-omnipotent
2542.align	32
2543bn_sqrx8x_internal:
2544__bn_sqrx8x_internal:
2545	##################################################################
2546	# Squaring part:
2547	#
2548	# a) multiply-n-add everything but a[i]*a[i];
2549	# b) shift result of a) by 1 to the left and accumulate
2550	#    a[i]*a[i] products;
2551	#
2552	##################################################################
2553	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2554	#                                                     a[1]a[0]
2555	#                                                 a[2]a[0]
2556	#                                             a[3]a[0]
2557	#                                             a[2]a[1]
2558	#                                         a[3]a[1]
2559	#                                     a[3]a[2]
2560	#
2561	#                                         a[4]a[0]
2562	#                                     a[5]a[0]
2563	#                                 a[6]a[0]
2564	#                             a[7]a[0]
2565	#                                     a[4]a[1]
2566	#                                 a[5]a[1]
2567	#                             a[6]a[1]
2568	#                         a[7]a[1]
2569	#                                 a[4]a[2]
2570	#                             a[5]a[2]
2571	#                         a[6]a[2]
2572	#                     a[7]a[2]
2573	#                             a[4]a[3]
2574	#                         a[5]a[3]
2575	#                     a[6]a[3]
2576	#                 a[7]a[3]
2577	#
2578	#                     a[5]a[4]
2579	#                 a[6]a[4]
2580	#             a[7]a[4]
2581	#             a[6]a[5]
2582	#         a[7]a[5]
2583	#     a[7]a[6]
2584	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2585___
2586{
2587my ($zero,$carry)=("%rbp","%rcx");
2588my $aaptr=$zero;
2589$code.=<<___;
2590	lea	48+8(%rsp),$tptr
2591	lea	($aptr,$num),$aaptr
2592	mov	$num,0+8(%rsp)			# save $num
2593	mov	$aaptr,8+8(%rsp)		# save end of $aptr
2594	jmp	.Lsqr8x_zero_start
2595
2596.align	32
2597.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2598.Lsqrx8x_zero:
2599	.byte	0x3e
2600	movdqa	%xmm0,0*8($tptr)
2601	movdqa	%xmm0,2*8($tptr)
2602	movdqa	%xmm0,4*8($tptr)
2603	movdqa	%xmm0,6*8($tptr)
2604.Lsqr8x_zero_start:			# aligned at 32
2605	movdqa	%xmm0,8*8($tptr)
2606	movdqa	%xmm0,10*8($tptr)
2607	movdqa	%xmm0,12*8($tptr)
2608	movdqa	%xmm0,14*8($tptr)
2609	lea	16*8($tptr),$tptr
2610	sub	\$64,$num
2611	jnz	.Lsqrx8x_zero
2612
2613	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
2614	#xor	%r9,%r9			# t[1], ex-$num, zero already
2615	xor	%r10,%r10
2616	xor	%r11,%r11
2617	xor	%r12,%r12
2618	xor	%r13,%r13
2619	xor	%r14,%r14
2620	xor	%r15,%r15
2621	lea	48+8(%rsp),$tptr
2622	xor	$zero,$zero		# cf=0, cf=0
2623	jmp	.Lsqrx8x_outer_loop
2624
2625.align	32
2626.Lsqrx8x_outer_loop:
2627	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
2628	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
2629	adox	%rax,%r10
2630	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
2631	adcx	%r10,%r9
2632	adox	%rax,%r11
2633	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
2634	adcx	%r11,%r10
2635	adox	%rax,%r12
2636	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
2637	adcx	%r12,%r11
2638	adox	%rax,%r13
2639	mulx	5*8($aptr),%r12,%rax
2640	adcx	%r13,%r12
2641	adox	%rax,%r14
2642	mulx	6*8($aptr),%r13,%rax
2643	adcx	%r14,%r13
2644	adox	%r15,%rax
2645	mulx	7*8($aptr),%r14,%r15
2646	 mov	1*8($aptr),%rdx		# a[1]
2647	adcx	%rax,%r14
2648	adox	$zero,%r15
2649	adc	8*8($tptr),%r15
2650	mov	%r8,1*8($tptr)		# t[1]
2651	mov	%r9,2*8($tptr)		# t[2]
2652	sbb	$carry,$carry		# mov %cf,$carry
2653	xor	$zero,$zero		# cf=0, of=0
2654
2655
2656	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
2657	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
2658	adcx	%r10,%r8
2659	adox	%rbx,%r9
2660	mulx	4*8($aptr),%r10,%rbx	# ...
2661	adcx	%r11,%r9
2662	adox	%rax,%r10
2663	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
2664	adcx	%r12,%r10
2665	adox	%rbx,%r11
2666	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
2667	adcx	%r13,%r11
2668	adox	%r14,%r12
2669	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
2670	 mov	2*8($aptr),%rdx		# a[2]
2671	adcx	%rax,%r12
2672	adox	%rbx,%r13
2673	adcx	%r15,%r13
2674	adox	$zero,%r14		# of=0
2675	adcx	$zero,%r14		# cf=0
2676
2677	mov	%r8,3*8($tptr)		# t[3]
2678	mov	%r9,4*8($tptr)		# t[4]
2679
2680	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
2681	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
2682	adcx	%r10,%r8
2683	adox	%rbx,%r9
2684	mulx	5*8($aptr),%r10,%rbx	# ...
2685	adcx	%r11,%r9
2686	adox	%rax,%r10
2687	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
2688	adcx	%r12,%r10
2689	adox	%r13,%r11
2690	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
2691	.byte	0x3e
2692	 mov	3*8($aptr),%rdx		# a[3]
2693	adcx	%rbx,%r11
2694	adox	%rax,%r12
2695	adcx	%r14,%r12
2696	mov	%r8,5*8($tptr)		# t[5]
2697	mov	%r9,6*8($tptr)		# t[6]
2698	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
2699	adox	$zero,%r13		# of=0
2700	adcx	$zero,%r13		# cf=0
2701
2702	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
2703	adcx	%r10,%r8
2704	adox	%rax,%r9
2705	mulx	6*8($aptr),%r10,%rax	# ...
2706	adcx	%r11,%r9
2707	adox	%r12,%r10
2708	mulx	7*8($aptr),%r11,%r12
2709	 mov	4*8($aptr),%rdx		# a[4]
2710	 mov	5*8($aptr),%r14		# a[5]
2711	adcx	%rbx,%r10
2712	adox	%rax,%r11
2713	 mov	6*8($aptr),%r15		# a[6]
2714	adcx	%r13,%r11
2715	adox	$zero,%r12		# of=0
2716	adcx	$zero,%r12		# cf=0
2717
2718	mov	%r8,7*8($tptr)		# t[7]
2719	mov	%r9,8*8($tptr)		# t[8]
2720
2721	mulx	%r14,%r9,%rax		# a[5]*a[4]
2722	 mov	7*8($aptr),%r8		# a[7]
2723	adcx	%r10,%r9
2724	mulx	%r15,%r10,%rbx		# a[6]*a[4]
2725	adox	%rax,%r10
2726	adcx	%r11,%r10
2727	mulx	%r8,%r11,%rax		# a[7]*a[4]
2728	 mov	%r14,%rdx		# a[5]
2729	adox	%rbx,%r11
2730	adcx	%r12,%r11
2731	#adox	$zero,%rax		# of=0
2732	adcx	$zero,%rax		# cf=0
2733
2734	mulx	%r15,%r14,%rbx		# a[6]*a[5]
2735	mulx	%r8,%r12,%r13		# a[7]*a[5]
2736	 mov	%r15,%rdx		# a[6]
2737	 lea	8*8($aptr),$aptr
2738	adcx	%r14,%r11
2739	adox	%rbx,%r12
2740	adcx	%rax,%r12
2741	adox	$zero,%r13
2742
2743	.byte	0x67,0x67
2744	mulx	%r8,%r8,%r14		# a[7]*a[6]
2745	adcx	%r8,%r13
2746	adcx	$zero,%r14
2747
2748	cmp	8+8(%rsp),$aptr
2749	je	.Lsqrx8x_outer_break
2750
2751	neg	$carry			# mov $carry,%cf
2752	mov	\$-8,%rcx
2753	mov	$zero,%r15
2754	mov	8*8($tptr),%r8
2755	adcx	9*8($tptr),%r9		# +=t[9]
2756	adcx	10*8($tptr),%r10	# ...
2757	adcx	11*8($tptr),%r11
2758	adc	12*8($tptr),%r12
2759	adc	13*8($tptr),%r13
2760	adc	14*8($tptr),%r14
2761	adc	15*8($tptr),%r15
2762	lea	($aptr),$aaptr
2763	lea	2*64($tptr),$tptr
2764	sbb	%rax,%rax		# mov %cf,$carry
2765
2766	mov	-64($aptr),%rdx		# a[0]
2767	mov	%rax,16+8(%rsp)		# offload $carry
2768	mov	$tptr,24+8(%rsp)
2769
2770	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
2771	xor	%eax,%eax		# cf=0, of=0
2772	jmp	.Lsqrx8x_loop
2773
2774.align	32
2775.Lsqrx8x_loop:
2776	mov	%r8,%rbx
2777	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
2778	adcx	%rax,%rbx		# +=t[8]
2779	adox	%r9,%r8
2780
2781	mulx	1*8($aaptr),%rax,%r9	# ...
2782	adcx	%rax,%r8
2783	adox	%r10,%r9
2784
2785	mulx	2*8($aaptr),%rax,%r10
2786	adcx	%rax,%r9
2787	adox	%r11,%r10
2788
2789	mulx	3*8($aaptr),%rax,%r11
2790	adcx	%rax,%r10
2791	adox	%r12,%r11
2792
2793	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
2794	adcx	%rax,%r11
2795	adox	%r13,%r12
2796
2797	mulx	5*8($aaptr),%rax,%r13
2798	adcx	%rax,%r12
2799	adox	%r14,%r13
2800
2801	mulx	6*8($aaptr),%rax,%r14
2802	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
2803	 mov	\$0,%ebx
2804	adcx	%rax,%r13
2805	adox	%r15,%r14
2806
2807	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
2808	 mov	8($aptr,%rcx,8),%rdx	# a[i]
2809	adcx	%rax,%r14
2810	adox	%rbx,%r15		# %rbx is 0, of=0
2811	adcx	%rbx,%r15		# cf=0
2812
2813	.byte	0x67
2814	inc	%rcx			# of=0
2815	jnz	.Lsqrx8x_loop
2816
2817	lea	8*8($aaptr),$aaptr
2818	mov	\$-8,%rcx
2819	cmp	8+8(%rsp),$aaptr	# done?
2820	je	.Lsqrx8x_break
2821
2822	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
2823	.byte	0x66
2824	mov	-64($aptr),%rdx
2825	adcx	0*8($tptr),%r8
2826	adcx	1*8($tptr),%r9
2827	adc	2*8($tptr),%r10
2828	adc	3*8($tptr),%r11
2829	adc	4*8($tptr),%r12
2830	adc	5*8($tptr),%r13
2831	adc	6*8($tptr),%r14
2832	adc	7*8($tptr),%r15
2833	lea	8*8($tptr),$tptr
2834	.byte	0x67
2835	sbb	%rax,%rax		# mov %cf,%rax
2836	xor	%ebx,%ebx		# cf=0, of=0
2837	mov	%rax,16+8(%rsp)		# offload carry
2838	jmp	.Lsqrx8x_loop
2839
2840.align	32
2841.Lsqrx8x_break:
2842	sub	16+8(%rsp),%r8		# consume last carry
2843	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
2844	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
2845	xor	%ebp,%ebp		# xor	$zero,$zero
2846	mov	%r8,0*8($tptr)
2847	cmp	$carry,$tptr		# cf=0, of=0
2848	je	.Lsqrx8x_outer_loop
2849
2850	mov	%r9,1*8($tptr)
2851	 mov	1*8($carry),%r9
2852	mov	%r10,2*8($tptr)
2853	 mov	2*8($carry),%r10
2854	mov	%r11,3*8($tptr)
2855	 mov	3*8($carry),%r11
2856	mov	%r12,4*8($tptr)
2857	 mov	4*8($carry),%r12
2858	mov	%r13,5*8($tptr)
2859	 mov	5*8($carry),%r13
2860	mov	%r14,6*8($tptr)
2861	 mov	6*8($carry),%r14
2862	mov	%r15,7*8($tptr)
2863	 mov	7*8($carry),%r15
2864	mov	$carry,$tptr
2865	jmp	.Lsqrx8x_outer_loop
2866
2867.align	32
2868.Lsqrx8x_outer_break:
2869	mov	%r9,9*8($tptr)		# t[9]
2870	 movq	%xmm3,%rcx		# -$num
2871	mov	%r10,10*8($tptr)	# ...
2872	mov	%r11,11*8($tptr)
2873	mov	%r12,12*8($tptr)
2874	mov	%r13,13*8($tptr)
2875	mov	%r14,14*8($tptr)
2876___
2877}{
2878my $i="%rcx";
2879$code.=<<___;
2880	lea	48+8(%rsp),$tptr
2881	mov	($aptr,$i),%rdx		# a[0]
2882
2883	mov	8($tptr),$A0[1]		# t[1]
2884	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
2885	mov	0+8(%rsp),$num		# restore $num
2886	adox	$A0[1],$A0[1]
2887	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
2888	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
2889	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
2890
2891.align	32
2892.Lsqrx4x_shift_n_add:
2893	mulx	%rdx,%rax,%rbx
2894	 adox	$A1[0],$A1[0]
2895	adcx	$A0[0],%rax
2896	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
2897	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
2898	 adox	$A1[1],$A1[1]
2899	adcx	$A0[1],%rbx
2900	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
2901	mov	%rax,0($tptr)
2902	mov	%rbx,8($tptr)
2903
2904	mulx	%rdx,%rax,%rbx
2905	 adox	$A0[0],$A0[0]
2906	adcx	$A1[0],%rax
2907	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
2908	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
2909	 adox	$A0[1],$A0[1]
2910	adcx	$A1[1],%rbx
2911	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
2912	mov	%rax,16($tptr)
2913	mov	%rbx,24($tptr)
2914
2915	mulx	%rdx,%rax,%rbx
2916	 adox	$A1[0],$A1[0]
2917	adcx	$A0[0],%rax
2918	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
2919	 lea	32($i),$i
2920	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
2921	 adox	$A1[1],$A1[1]
2922	adcx	$A0[1],%rbx
2923	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
2924	mov	%rax,32($tptr)
2925	mov	%rbx,40($tptr)
2926
2927	mulx	%rdx,%rax,%rbx
2928	 adox	$A0[0],$A0[0]
2929	adcx	$A1[0],%rax
2930	jrcxz	.Lsqrx4x_shift_n_add_break
2931	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
2932	 adox	$A0[1],$A0[1]
2933	adcx	$A1[1],%rbx
2934	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
2935	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
2936	mov	%rax,48($tptr)
2937	mov	%rbx,56($tptr)
2938	lea	64($tptr),$tptr
2939	nop
2940	jmp	.Lsqrx4x_shift_n_add
2941
2942.align	32
2943.Lsqrx4x_shift_n_add_break:
2944	adcx	$A1[1],%rbx
2945	mov	%rax,48($tptr)
2946	mov	%rbx,56($tptr)
2947	lea	64($tptr),$tptr		# end of t[] buffer
2948___
2949}
2950######################################################################
2951# Montgomery reduction part, "word-by-word" algorithm.
2952#
2953# This new path is inspired by multiple submissions from Intel, by
2954# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
2955# Vinodh Gopal...
2956{
2957my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
2958
2959$code.=<<___;
2960	movq	%xmm2,$nptr
2961sqrx8x_reduction:
2962	xor	%eax,%eax		# initial top-most carry bit
2963	mov	32+8(%rsp),%rbx		# n0
2964	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
2965	lea	-128($nptr,$num,2),%rcx	# end of n[]
2966	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
2967	mov	%rcx, 0+8(%rsp)		# save end of n[]
2968	mov	$tptr,8+8(%rsp)		# save end of t[]
2969
2970	lea	48+8(%rsp),$tptr		# initial t[] window
2971	jmp	.Lsqrx8x_reduction_loop
2972
2973.align	32
2974.Lsqrx8x_reduction_loop:
2975	mov	8*1($tptr),%r9
2976	mov	8*2($tptr),%r10
2977	mov	8*3($tptr),%r11
2978	mov	8*4($tptr),%r12
2979	mov	%rdx,%r8
2980	imulq	%rbx,%rdx		# n0*a[i]
2981	mov	8*5($tptr),%r13
2982	mov	8*6($tptr),%r14
2983	mov	8*7($tptr),%r15
2984	mov	%rax,24+8(%rsp)		# store top-most carry bit
2985
2986	lea	8*8($tptr),$tptr
2987	xor	$carry,$carry		# cf=0,of=0
2988	mov	\$-8,%rcx
2989	jmp	.Lsqrx8x_reduce
2990
2991.align	32
2992.Lsqrx8x_reduce:
2993	mov	%r8, %rbx
2994	mulx	16*0($nptr),%rax,%r8	# n[0]
2995	adcx	%rbx,%rax		# discarded
2996	adox	%r9,%r8
2997
2998	mulx	16*1($nptr),%rbx,%r9	# n[1]
2999	adcx	%rbx,%r8
3000	adox	%r10,%r9
3001
3002	mulx	16*2($nptr),%rbx,%r10
3003	adcx	%rbx,%r9
3004	adox	%r11,%r10
3005
3006	mulx	16*3($nptr),%rbx,%r11
3007	adcx	%rbx,%r10
3008	adox	%r12,%r11
3009
3010	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rbx,%r12
3011	 mov	%rdx,%rax
3012	 mov	%r8,%rdx
3013	adcx	%rbx,%r11
3014	adox	%r13,%r12
3015
3016	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
3017	 mov	%rax,%rdx
3018	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
3019
3020	mulx	16*5($nptr),%rax,%r13
3021	adcx	%rax,%r12
3022	adox	%r14,%r13
3023
3024	mulx	16*6($nptr),%rax,%r14
3025	adcx	%rax,%r13
3026	adox	%r15,%r14
3027
3028	mulx	16*7($nptr),%rax,%r15
3029	 mov	%rbx,%rdx
3030	adcx	%rax,%r14
3031	adox	$carry,%r15		# $carry is 0
3032	adcx	$carry,%r15		# cf=0
3033
3034	.byte	0x67,0x67,0x67
3035	inc	%rcx			# of=0
3036	jnz	.Lsqrx8x_reduce
3037
3038	mov	$carry,%rax		# xor	%rax,%rax
3039	cmp	0+8(%rsp),$nptr		# end of n[]?
3040	jae	.Lsqrx8x_no_tail
3041
3042	mov	48+8(%rsp),%rdx		# pull n0*a[0]
3043	add	8*0($tptr),%r8
3044	lea	16*8($nptr),$nptr
3045	mov	\$-8,%rcx
3046	adcx	8*1($tptr),%r9
3047	adcx	8*2($tptr),%r10
3048	adc	8*3($tptr),%r11
3049	adc	8*4($tptr),%r12
3050	adc	8*5($tptr),%r13
3051	adc	8*6($tptr),%r14
3052	adc	8*7($tptr),%r15
3053	lea	8*8($tptr),$tptr
3054	sbb	%rax,%rax		# top carry
3055
3056	xor	$carry,$carry		# of=0, cf=0
3057	mov	%rax,16+8(%rsp)
3058	jmp	.Lsqrx8x_tail
3059
3060.align	32
3061.Lsqrx8x_tail:
3062	mov	%r8,%rbx
3063	mulx	16*0($nptr),%rax,%r8
3064	adcx	%rax,%rbx
3065	adox	%r9,%r8
3066
3067	mulx	16*1($nptr),%rax,%r9
3068	adcx	%rax,%r8
3069	adox	%r10,%r9
3070
3071	mulx	16*2($nptr),%rax,%r10
3072	adcx	%rax,%r9
3073	adox	%r11,%r10
3074
3075	mulx	16*3($nptr),%rax,%r11
3076	adcx	%rax,%r10
3077	adox	%r12,%r11
3078
3079	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rax,%r12
3080	adcx	%rax,%r11
3081	adox	%r13,%r12
3082
3083	mulx	16*5($nptr),%rax,%r13
3084	adcx	%rax,%r12
3085	adox	%r14,%r13
3086
3087	mulx	16*6($nptr),%rax,%r14
3088	adcx	%rax,%r13
3089	adox	%r15,%r14
3090
3091	mulx	16*7($nptr),%rax,%r15
3092	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
3093	adcx	%rax,%r14
3094	adox	$carry,%r15
3095	 mov	%rbx,($tptr,%rcx,8)	# save result
3096	 mov	%r8,%rbx
3097	adcx	$carry,%r15		# cf=0
3098
3099	inc	%rcx			# of=0
3100	jnz	.Lsqrx8x_tail
3101
3102	cmp	0+8(%rsp),$nptr		# end of n[]?
3103	jae	.Lsqrx8x_tail_done	# break out of loop
3104
3105	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3106	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
3107	 lea	16*8($nptr),$nptr
3108	adc	8*0($tptr),%r8
3109	adc	8*1($tptr),%r9
3110	adc	8*2($tptr),%r10
3111	adc	8*3($tptr),%r11
3112	adc	8*4($tptr),%r12
3113	adc	8*5($tptr),%r13
3114	adc	8*6($tptr),%r14
3115	adc	8*7($tptr),%r15
3116	lea	8*8($tptr),$tptr
3117	sbb	%rax,%rax
3118	sub	\$8,%rcx		# mov	\$-8,%rcx
3119
3120	xor	$carry,$carry		# of=0, cf=0
3121	mov	%rax,16+8(%rsp)
3122	jmp	.Lsqrx8x_tail
3123
3124.align	32
3125.Lsqrx8x_tail_done:
3126	add	24+8(%rsp),%r8		# can this overflow?
3127	mov	$carry,%rax		# xor	%rax,%rax
3128
3129	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3130.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
3131	adc	8*0($tptr),%r8
3132	 movq	%xmm3,%rcx
3133	adc	8*1($tptr),%r9
3134	 mov	16*7($nptr),$carry
3135	 movq	%xmm2,$nptr		# restore $nptr
3136	adc	8*2($tptr),%r10
3137	adc	8*3($tptr),%r11
3138	adc	8*4($tptr),%r12
3139	adc	8*5($tptr),%r13
3140	adc	8*6($tptr),%r14
3141	adc	8*7($tptr),%r15
3142	adc	%rax,%rax		# top-most carry
3143
3144	mov	32+8(%rsp),%rbx		# n0
3145	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
3146
3147	mov	%r8,8*0($tptr)		# store top 512 bits
3148	 lea	8*8($tptr),%r8		# borrow %r8
3149	mov	%r9,8*1($tptr)
3150	mov	%r10,8*2($tptr)
3151	mov	%r11,8*3($tptr)
3152	mov	%r12,8*4($tptr)
3153	mov	%r13,8*5($tptr)
3154	mov	%r14,8*6($tptr)
3155	mov	%r15,8*7($tptr)
3156
3157	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
3158	cmp	8+8(%rsp),%r8		# end of t[]?
3159	jb	.Lsqrx8x_reduction_loop
3160___
3161}
3162##############################################################
3163# Post-condition, 4x unrolled
3164#
3165{
3166my ($rptr,$nptr)=("%rdx","%rbp");
3167my @ri=map("%r$_",(10..13));
3168my @ni=map("%r$_",(14..15));
3169$code.=<<___;
3170	xor	%rbx,%rbx
3171	sub	%r15,%rsi		# compare top-most words
3172	adc	%rbx,%rbx
3173	mov	%rcx,%r10		# -$num
3174	.byte	0x67
3175	or	%rbx,%rax
3176	.byte	0x67
3177	mov	%rcx,%r9		# -$num
3178	xor	\$1,%rax
3179	sar	\$3+2,%rcx		# cf=0
3180	#lea	48+8(%rsp,%r9),$tptr
3181	lea	($nptr,%rax,8),$nptr
3182	movq	%xmm1,$rptr		# restore $rptr
3183	movq	%xmm1,$aptr		# prepare for back-to-back call
3184	jmp	.Lsqrx4x_sub
3185
3186.align	32
3187.Lsqrx4x_sub:
3188	.byte	0x66
3189	mov	8*0($tptr),%r12
3190	mov	8*1($tptr),%r13
3191	sbb	16*0($nptr),%r12
3192	mov	8*2($tptr),%r14
3193	sbb	16*1($nptr),%r13
3194	mov	8*3($tptr),%r15
3195	lea	8*4($tptr),$tptr
3196	sbb	16*2($nptr),%r14
3197	mov	%r12,8*0($rptr)
3198	sbb	16*3($nptr),%r15
3199	lea	16*4($nptr),$nptr
3200	mov	%r13,8*1($rptr)
3201	mov	%r14,8*2($rptr)
3202	mov	%r15,8*3($rptr)
3203	lea	8*4($rptr),$rptr
3204
3205	inc	%rcx
3206	jnz	.Lsqrx4x_sub
3207___
3208}
3209$code.=<<___;
3210	neg	%r9			# restore $num
3211
3212	ret
3213.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
3214___
3215}}}
3216{
3217my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3218				("%rdi","%esi","%rdx","%ecx");  # Unix order
3219my $out=$inp;
3220my $STRIDE=2**5*8;
3221my $N=$STRIDE/4;
3222
3223$code.=<<___;
3224.globl	bn_scatter5
3225.type	bn_scatter5,\@abi-omnipotent
3226.align	16
3227bn_scatter5:
3228	cmp	\$0, $num
3229	jz	.Lscatter_epilogue
3230	lea	($tbl,$idx,8),$tbl
3231.Lscatter:
3232	mov	($inp),%rax
3233	lea	8($inp),$inp
3234	mov	%rax,($tbl)
3235	lea	32*8($tbl),$tbl
3236	sub	\$1,$num
3237	jnz	.Lscatter
3238.Lscatter_epilogue:
3239	ret
3240.size	bn_scatter5,.-bn_scatter5
3241
3242.globl	bn_gather5
3243.type	bn_gather5,\@abi-omnipotent
3244.align	16
3245bn_gather5:
3246___
3247$code.=<<___ if ($win64);
3248.LSEH_begin_bn_gather5:
3249	# I can't trust assembler to use specific encoding:-(
3250	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
3251	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
3252	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
3253___
3254$code.=<<___;
3255	mov	$idx,%r11d
3256	shr	\$`log($N/8)/log(2)`,$idx
3257	and	\$`$N/8-1`,%r11
3258	not	$idx
3259	lea	.Lmagic_masks(%rip),%rax
3260	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
3261	lea	128($tbl,%r11,8),$tbl	# pointer within 1st cache line
3262	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
3263	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
3264	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
3265	movq	24(%rax,$idx,8),%xmm7
3266	jmp	.Lgather
3267.align	16
3268.Lgather:
3269	movq	`0*$STRIDE/4-128`($tbl),%xmm0
3270	movq	`1*$STRIDE/4-128`($tbl),%xmm1
3271	pand	%xmm4,%xmm0
3272	movq	`2*$STRIDE/4-128`($tbl),%xmm2
3273	pand	%xmm5,%xmm1
3274	movq	`3*$STRIDE/4-128`($tbl),%xmm3
3275	pand	%xmm6,%xmm2
3276	por	%xmm1,%xmm0
3277	pand	%xmm7,%xmm3
3278	.byte	0x67,0x67
3279	por	%xmm2,%xmm0
3280	lea	$STRIDE($tbl),$tbl
3281	por	%xmm3,%xmm0
3282
3283	movq	%xmm0,($out)		# m0=bp[0]
3284	lea	8($out),$out
3285	sub	\$1,$num
3286	jnz	.Lgather
3287___
3288$code.=<<___ if ($win64);
3289	movaps	(%rsp),%xmm6
3290	movaps	0x10(%rsp),%xmm7
3291	lea	0x28(%rsp),%rsp
3292___
3293$code.=<<___;
3294	ret
3295.LSEH_end_bn_gather5:
3296.size	bn_gather5,.-bn_gather5
3297___
3298}
3299$code.=<<___;
3300.align	64
3301.Lmagic_masks:
3302	.long	0,0, 0,0, 0,0, -1,-1
3303	.long	0,0, 0,0, 0,0,  0,0
3304.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3305___
3306
3307# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3308#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
3309if ($win64) {
3310$rec="%rcx";
3311$frame="%rdx";
3312$context="%r8";
3313$disp="%r9";
3314
3315$code.=<<___;
3316.extern	__imp_RtlVirtualUnwind
3317.type	mul_handler,\@abi-omnipotent
3318.align	16
3319mul_handler:
3320	push	%rsi
3321	push	%rdi
3322	push	%rbx
3323	push	%rbp
3324	push	%r12
3325	push	%r13
3326	push	%r14
3327	push	%r15
3328	pushfq
3329	sub	\$64,%rsp
3330
3331	mov	120($context),%rax	# pull context->Rax
3332	mov	248($context),%rbx	# pull context->Rip
3333
3334	mov	8($disp),%rsi		# disp->ImageBase
3335	mov	56($disp),%r11		# disp->HandlerData
3336
3337	mov	0(%r11),%r10d		# HandlerData[0]
3338	lea	(%rsi,%r10),%r10	# end of prologue label
3339	cmp	%r10,%rbx		# context->Rip<end of prologue label
3340	jb	.Lcommon_seh_tail
3341
3342	mov	152($context),%rax	# pull context->Rsp
3343
3344	mov	4(%r11),%r10d		# HandlerData[1]
3345	lea	(%rsi,%r10),%r10	# epilogue label
3346	cmp	%r10,%rbx		# context->Rip>=epilogue label
3347	jae	.Lcommon_seh_tail
3348
3349	lea	.Lmul_epilogue(%rip),%r10
3350	cmp	%r10,%rbx
3351	jb	.Lbody_40
3352
3353	mov	192($context),%r10	# pull $num
3354	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
3355	jmp	.Lbody_proceed
3356
3357.Lbody_40:
3358	mov	40(%rax),%rax		# pull saved stack pointer
3359.Lbody_proceed:
3360
3361	movaps	-88(%rax),%xmm0
3362	movaps	-72(%rax),%xmm1
3363
3364	mov	-8(%rax),%rbx
3365	mov	-16(%rax),%rbp
3366	mov	-24(%rax),%r12
3367	mov	-32(%rax),%r13
3368	mov	-40(%rax),%r14
3369	mov	-48(%rax),%r15
3370	mov	%rbx,144($context)	# restore context->Rbx
3371	mov	%rbp,160($context)	# restore context->Rbp
3372	mov	%r12,216($context)	# restore context->R12
3373	mov	%r13,224($context)	# restore context->R13
3374	mov	%r14,232($context)	# restore context->R14
3375	mov	%r15,240($context)	# restore context->R15
3376	movups	%xmm0,512($context)	# restore context->Xmm6
3377	movups	%xmm1,528($context)	# restore context->Xmm7
3378
3379.Lcommon_seh_tail:
3380	mov	8(%rax),%rdi
3381	mov	16(%rax),%rsi
3382	mov	%rax,152($context)	# restore context->Rsp
3383	mov	%rsi,168($context)	# restore context->Rsi
3384	mov	%rdi,176($context)	# restore context->Rdi
3385
3386	mov	40($disp),%rdi		# disp->ContextRecord
3387	mov	$context,%rsi		# context
3388	mov	\$154,%ecx		# sizeof(CONTEXT)
3389	.long	0xa548f3fc		# cld; rep movsq
3390
3391	mov	$disp,%rsi
3392	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3393	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3394	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3395	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3396	mov	40(%rsi),%r10		# disp->ContextRecord
3397	lea	56(%rsi),%r11		# &disp->HandlerData
3398	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3399	mov	%r10,32(%rsp)		# arg5
3400	mov	%r11,40(%rsp)		# arg6
3401	mov	%r12,48(%rsp)		# arg7
3402	mov	%rcx,56(%rsp)		# arg8, (NULL)
3403	call	*__imp_RtlVirtualUnwind(%rip)
3404
3405	mov	\$1,%eax		# ExceptionContinueSearch
3406	add	\$64,%rsp
3407	popfq
3408	pop	%r15
3409	pop	%r14
3410	pop	%r13
3411	pop	%r12
3412	pop	%rbp
3413	pop	%rbx
3414	pop	%rdi
3415	pop	%rsi
3416	ret
3417.size	mul_handler,.-mul_handler
3418
3419.section	.pdata
3420.align	4
3421	.rva	.LSEH_begin_bn_mul_mont_gather5
3422	.rva	.LSEH_end_bn_mul_mont_gather5
3423	.rva	.LSEH_info_bn_mul_mont_gather5
3424
3425	.rva	.LSEH_begin_bn_mul4x_mont_gather5
3426	.rva	.LSEH_end_bn_mul4x_mont_gather5
3427	.rva	.LSEH_info_bn_mul4x_mont_gather5
3428
3429	.rva	.LSEH_begin_bn_power5
3430	.rva	.LSEH_end_bn_power5
3431	.rva	.LSEH_info_bn_power5
3432
3433	.rva	.LSEH_begin_bn_from_mont8x
3434	.rva	.LSEH_end_bn_from_mont8x
3435	.rva	.LSEH_info_bn_from_mont8x
3436___
3437$code.=<<___ if ($addx);
3438	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
3439	.rva	.LSEH_end_bn_mulx4x_mont_gather5
3440	.rva	.LSEH_info_bn_mulx4x_mont_gather5
3441
3442	.rva	.LSEH_begin_bn_powerx5
3443	.rva	.LSEH_end_bn_powerx5
3444	.rva	.LSEH_info_bn_powerx5
3445___
3446$code.=<<___;
3447	.rva	.LSEH_begin_bn_gather5
3448	.rva	.LSEH_end_bn_gather5
3449	.rva	.LSEH_info_bn_gather5
3450
3451.section	.xdata
3452.align	8
3453.LSEH_info_bn_mul_mont_gather5:
3454	.byte	9,0,0,0
3455	.rva	mul_handler
3456	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
3457.align	8
3458.LSEH_info_bn_mul4x_mont_gather5:
3459	.byte	9,0,0,0
3460	.rva	mul_handler
3461	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
3462.align	8
3463.LSEH_info_bn_power5:
3464	.byte	9,0,0,0
3465	.rva	mul_handler
3466	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
3467.align	8
3468.LSEH_info_bn_from_mont8x:
3469	.byte	9,0,0,0
3470	.rva	mul_handler
3471	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
3472___
3473$code.=<<___ if ($addx);
3474.align	8
3475.LSEH_info_bn_mulx4x_mont_gather5:
3476	.byte	9,0,0,0
3477	.rva	mul_handler
3478	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
3479.align	8
3480.LSEH_info_bn_powerx5:
3481	.byte	9,0,0,0
3482	.rva	mul_handler
3483	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
3484___
3485$code.=<<___;
3486.align	8
3487.LSEH_info_bn_gather5:
3488        .byte   0x01,0x0d,0x05,0x00
3489        .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
3490        .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
3491        .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
3492.align	8
3493___
3494}
3495
3496$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3497
3498print $code;
3499close STDOUT;
3500