#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
9656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project
# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
# than twice, >2x, as fast. The most common rsa1024 sign is improved by
# a respectable 50%. It remains to be seen if loop unrolling and a
# dedicated squaring routine can provide further improvement...
17656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project
# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# August 2011.
#
# Unroll and modulo-schedule inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Command line: [flavour] [output].  A first argument that contains a dot
# is taken to be the output file name, i.e. no flavour was supplied.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# $win64 is set when the flavour names nasm/masm/mingw64 or when the
# output file name ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's own path, including the trailing
# separator.  Falls back to the empty string explicitly instead of relying
# on whatever $1 happens to hold when $0 carries no directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Re-open STDOUT as a pipe into the translator, run by the same perl
# binary ($^X).  Interpreter, translator and output paths are quoted so
# that paths containing spaces survive the shell; arguments that were not
# supplied are left out rather than passed as empty strings.  A failure to
# start the translator is now fatal instead of being silently ignored.
my $xlate_cmd = "| \"$^X\" \"$xlate\"";
$xlate_cmd .= " $flavour"    if defined $flavour;
$xlate_cmd .= " \"$output\"" if defined $output;
open STDOUT,$xlate_cmd or die "can't call $xlate: $!";
44656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project
# Register map for the generated code.  The first six entries are the
# argument registers of
#
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# These are deliberately package globals: they are interpolated into the
# assembly heredocs below, and $bp is re-pointed at another register once
# the generated prologue has moved the argument out of %rdx.
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);

# Scratch registers used by the generic (non-unrolled) loops.
$lo0="%r10";	# low word of the ap[j]*bp[i] accumulation / tp[j]
$hi0="%r11";	# high word (carry) of the ap[j]*bp[i] accumulation
$hi1="%r13";	# carry of the np[j]*m1 accumulation
$i="%r14";	# outer loop index
$j="%r15";	# inner loop index
$m0="%rbx";	# bp[i]
$m1="%rbp";	# tp[0]*n0
59656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project
# bn_mul_mont(rp, ap, bp, np, n0, num): public entry point plus the generic
# word-by-word Montgomery multiplication path.
#
# Dispatch performed at the entry label:
#   - num is not a multiple of 4, or num < 8  -> .Lmul_enter (generic path)
#   - otherwise, ap != bp                     -> .Lmul4x_enter
#   - otherwise (ap == bp)                    -> .Lsqr4x_enter
#
# The generic path pushes %rbx, %rbp and %r12-%r15, carves a temporary
# vector tp of num+2 qwords out of the stack (rounded down to a 1024-byte
# boundary) and stores the pre-allocation %rsp in tp[num+1] so that the
# epilogue can find the pushed registers again.  Once the outer loop is
# done, .Lsub writes tp-np into rp with a borrow chain, the and/not/or
# sequence picks tp or rp as the copy source according to the final borrow
# without branching, and .Lcopy moves the chosen vector into rp while
# overwriting tp.  The routine returns 1 in %rax.
$code=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	jmp	.Lsqr4x_enter

.align	16
.Lmul_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	2($num),%r10
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		# The generated code has just copied bp into %r12 (mulq writes
		# %rdx); every later heredoc must therefore see $bp as %r12.
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{{{
275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @A=("%r10","%r11");
276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @N=("%r13","%rdi");
277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bn_mul4x_mont,\@function,6
279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_mul4x_mont:
281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_enter:
282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	${num}d,${num}d
290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	4($num),%r10
291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp,%r11
292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	neg	%r10
293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$-1024,%rsp		# minimize TLB usage
295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_body:
298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,%r12		# reassign $bp
300392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
301392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom		$bp="%r12";
302392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($n0),$n0		# pull n0[0] value
304392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($bp),$m0		# m0=bp[0]
305392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($ap),%rax
306392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
307392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$i,$i			# i=0
308392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$j,$j			# j=0
309392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
310392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$n0,$m1
311392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[0]*bp[0]
312392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,$A[0]
313392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($np),%rax
314392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
315392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	imulq	$A[0],$m1		# "tp[0]"*n0
316392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[1]
317392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
318392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[0]*m1
319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[0]		# discarded
320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($ap),%rax
321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[1]
323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0
325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[1]
326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($np),%rax
327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[0]
329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1
331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[1]
332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16($ap),%rax
333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[1],$N[1]
335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	4($j),$j		# j++
336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],(%rsp)
338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[0]
339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.L1st4x
340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L1st4x:
342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[0]
343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[0]
344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16($np,$j,8),%rax
345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[1]
347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[0]
350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-8($ap,$j,8),%rax
351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[1]
356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[0]
358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[1]
359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-8($np,$j,8),%rax
360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[0]
362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[1]
365656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	mov	($ap,$j,8),%rax
366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[0]
371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[0]
373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[0]
374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($np,$j,8),%rax
375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[1]
377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[0]
380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($ap,$j,8),%rax
381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[1]
386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[0]
388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[1]
389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($np,$j,8),%rax
390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	4($j),$j		# j+=4
392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[0]
393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[1]
396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16($ap,$j,8),%rax
397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[0]
402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	$num,$j
403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jl	.L1st4x
404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[0]
406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[0]
407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16($np,$j,8),%rax
408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[1]
410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[0]
413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-8($ap,$j,8),%rax
414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[1]
419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[0]
421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[1]
422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-8($np,$j,8),%rax
423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[0]
425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[1]
428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($ap),%rax		# ap[0]
429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[0]
434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$N[1],$N[1]
436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[0],$N[0]
437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$N[1]
438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[0],-8(%rsp,$j,8)
439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($i),$i		# i++
442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	4
443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Louter4x:
444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($bp,$i,8),$m0		# m0=bp[i]
445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$j,$j			# j=0
446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	(%rsp),$A[0]
447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$n0,$m1
448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[0]*bp[i]
449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($np),%rax
451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	imulq	$A[0],$m1		# tp[0]*n0
454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[1]
455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[0]*m1
457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[0]		# "$N[0]", discarded
458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($ap),%rax
459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[1]
461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[i]
463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[1]
464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($np),%rax
465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	8(%rsp),$A[1]		# +tp[1]
467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[0]
469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[1]
472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16($ap),%rax
473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	4($j),$j		# j+=4
476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],(%rsp)		# tp[j-1]
478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[0]
479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Linner4x
480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Linner4x:
482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[i]
483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[0]
484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16($np,$j,8),%rax
485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[1]
489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[0]
492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-8($ap,$j,8),%rax
493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[0],$N[0]
495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[1]
498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[i]
500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[1]
501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-8($np,$j,8),%rax
502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	-8(%rsp,$j,8),$A[1]
504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[0]
506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[1]
509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($ap,$j,8),%rax
510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[1],$N[1]
512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[0]
515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[i]
517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[0]
518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($np,$j,8),%rax
519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[1]
523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[0]
526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($ap,$j,8),%rax
527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[0],$N[0]
529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[1]
532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[i]
534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[1]
535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($np,$j,8),%rax
536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	8(%rsp,$j,8),$A[1]
538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	4($j),$j		# j+=4
540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[0]
541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[1]
544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16($ap,$j,8),%rax
545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[1],$N[1]
547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[0]
550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	$num,$j
551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jl	.Linner4x
552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[i]
554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[0]
555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16($np,$j,8),%rax
556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[1]
560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[0]
563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-8($ap,$j,8),%rax
564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[0],$N[0]
566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[1]
569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m0			# ap[j]*bp[i]
571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A[1]
572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-8($np,$j,8),%rax
573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	-8(%rsp,$j,8),$A[1]
575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($i),$i		# i++
577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A[0]
578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mulq	$m1			# np[j]*m1
580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$N[1]
581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($ap),%rax		# ap[0]
582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[1],$N[1]
584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$N[0]
587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$N[1],$N[1]
589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A[0],$N[0]
590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$N[1]
591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$N[1]
593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[0],-8(%rsp,$j,8)
594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	$num,$i
597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jl	.Louter4x
598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Final step of bn_mul4x_mont, appended to $code:
#  1. .Lsub4x computes rp[] = tp[] - np[], four words per pass.  The
#     borrow lives in CF for the whole chain, which is why only
#     mov/lea/dec/jnz (none of which write CF) sit between the sbb's.
#  2. The word at tp[num] (the "upmost overflow bit") minus the final
#     borrow becomes a mask that selects tp[] or rp[] as the copy source.
#  3. .Lcopy4x copies the selected buffer to rp[] 16 bytes at a time and
#     overwrites tp[] on the stack with zeros from %xmm0.
# @ri names the four scratch registers rotated through the subtraction.
# NOTE(review): both loops run their body before $j is tested, so they
# assume num/4-1 >= 1, i.e. num >= 8 -- confirm against the entry check,
# which is outside this chunk.
# NOTE(review): the movdqa stores need 16-byte-aligned addresses, so %rsp
# is presumably 16-byte aligned by the prologue -- confirm.
599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @ri=("%rax","%rdx",$m0,$m1);
601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16(%rsp,$num,8),$rp	# restore $rp
603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0(%rsp),@ri[0]		# tp[0]
604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm0,%xmm0		# zero, used below to wipe tp[]
605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8(%rsp),@ri[1]		# tp[1]
606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$2,$num		# num/=4
607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsp),$ap		# borrow ap for tp
608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$i,$i			# i=0 and clear CF!
609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	0($np),@ri[0]		# tp[0]-np[0], starts the borrow chain
611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16($ap),@ri[2]		# tp[2]
612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	24($ap),@ri[3]		# tp[3]
613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	8($np),@ri[1]		# tp[1]-np[1]-borrow
614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-1($num),$j		# j=num/4-1
615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lsub4x
616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsub4x:
618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	16($np,$i,8),@ri[2]
621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	40($ap,$i,8),@ri[1]
623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	24($np,$i,8),@ri[3]
624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	32($np,$i,8),@ri[0]
627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	48($ap,$i,8),@ri[2]
628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	56($ap,$i,8),@ri[3]
629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	40($np,$i,8),@ri[1]
630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	4($i),$i		# i+=4
631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$j			# doesn't affect CF!
632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lsub4x
633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	32($ap,$i,8),@ri[0]	# load overflow bit
636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	16($np,$i,8),@ri[2]
637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	24($np,$i,8),@ri[3]
639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	\$0,@ri[0]		# handle upmost overflow bit
642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$i,$i			# i=0
644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	@ri[0],$ap		# tp address, kept only if mask is set
645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	not	@ri[0]			# inverted mask for the rp side
646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rp,$np			# np reused as scratch from here on
647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	@ri[0],$np
648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-1($num),$j		# j=num/4-1
649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$np,$ap			# ap=borrow?tp:rp
650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($ap),%xmm1		# first 16 bytes of the selected source
652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,(%rsp)		# wipe the first 16 bytes of tp[]
653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	%xmm1,($rp)
654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcopy4x
655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcopy4x:					# copy or in-place refresh
657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16($ap,$i),%xmm2
658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	32($ap,$i),%xmm1
659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,16(%rsp,$i)
660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	%xmm2,16($rp,$i)
661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,32(%rsp,$i)
662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	%xmm1,32($rp,$i)
663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($i),$i		# i+=32, a byte offset in this loop
664656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	dec	$j
665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lcopy4x
666656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project
667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$2,$num		# num*=4, undoes the shr above
668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16($ap,$i),%xmm2	# last 16 bytes
669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,16(%rsp,$i)
670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	%xmm2,16($rp,$i)
671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
# Epilogue of bn_mul4x_mont: fetch the stack pointer stored at
# 8(%rsp,$num,8), set %rax to 1, reload %r15, %r14, %r13, %r12, %rbp and
# %rbx from the six 8-byte slots found there, and return with %rsp
# pointing just past those slots.
# NOTE(review): the slot order presumably mirrors the pushes in the
# prologue, which is outside this chunk -- confirm.
673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
674221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	8(%rsp,$num,8),%rsi	# fetch saved %rsp
675656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	mov	\$1,%rax		# value left in %rax at ret
676221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	(%rsi),%r15
677221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	8(%rsi),%r14
678221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	16(%rsi),%r13
679221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	24(%rsi),%r12
680221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	32(%rsi),%rbp
681221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	40(%rsi),%rbx
682221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	lea	48(%rsi),%rsp		# step over the six reloaded slots
683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lmul4x_epilogue:
684221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	ret
685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bn_mul4x_mont,.-bn_mul4x_mont
686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}}}
688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{{{
689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom######################################################################
690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# void bn_sqr4x_mont(
691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $rptr="%rdi";	# const BN_ULONG *rptr,
692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $aptr="%rsi";	# const BN_ULONG *aptr,
693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $bptr="%rdx";	# not used
694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $nptr="%rcx";	# const BN_ULONG *nptr,
695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $n0  ="%r8";		# const BN_ULONG *n0);
696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $num ="%r9";		# int num, has to be divisible by 4 and
697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom			# not less than 8
698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @A0=("%r10","%r11");
701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @A1=("%r12","%r13");
702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($a0,$a1,$ai)=("%r14","%r15","%rbx");
703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bn_sqr4x_mont,\@function,6
706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombn_sqr4x_mont:
708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_enter:
709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$3,${num}d		# convert $num to bytes
717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	%r10,%r10
718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp,%r11		# put aside %rsp
719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	$num,%r10		# -$num
720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($n0),$n0		# *n0
721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$-1024,%rsp		# minimize TLB usage
723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	##############################################################
724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# Stack layout
725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#
726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# +0	saved $num, used in reduction section
727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# +8	&t[2*$num], used in reduction section
728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# +32	saved $rptr
729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# +40	saved $nptr
730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# +48	saved *n0
731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# +56	saved %rsp
732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# +64	t[2*$num]
733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#
734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rptr,32(%rsp)		# save $rptr
735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$nptr,40(%rsp)
736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$n0,  48(%rsp)
737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r11, 56(%rsp)		# save original %rsp
738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_body:
739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	##############################################################
740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# Squaring part:
741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#
742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# a) multiply-n-add everything but a[i]*a[i];
743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# b) shift result of a) by 1 to the left and accumulate
744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#    a[i]*a[i] products;
745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#
746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32(%r10),$i		# $i=-($num-32)
747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$num,$j			# $j=$num
750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					# comments apply to $num==8 case
752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-32($aptr,$i),$a0	# a[0]
753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-24($aptr,$i),%rax	# a[1]
755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16($aptr,$i),$ai	# a[2]
757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,$a1
758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[1]*a[0]
760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,$A0[0]		# a[1]*a[0]
761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax		# a[2]
762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A0[1]
763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[0],-24($tptr,$i)	# t[1]
764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[2]*a[0]
767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]
768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],-16($tptr,$i)	# t[2]
771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-16($i),$j		# j=-16
773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	8($aptr,$j),$ai		# a[3]
776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[2]*a[1]
777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,$A1[1]
780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[0],$A0[0]
783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 lea	16($j),$j
784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[3]*a[0]
786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[0],-8($tptr,$j)	# t[3]
790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lsqr4x_1st
791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_1st:
794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	($aptr,$j),$ai		# a[4]
795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[3]*a[1]
797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[1],$A0[1]
803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[0]
804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[4]*a[0]
805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax		# a[3]
807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],($tptr,$j)	# t[4]
809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	8($aptr,$j),$ai		# a[5]
812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[1],$A1[1]
813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[4]*a[3]
814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[1]
817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[0],$A0[0]
820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[5]*a[2]
822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[0],8($tptr,$j)	# t[5]
826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	16($aptr,$j),$ai	# a[6]
828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[5]*a[3]
830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[1],$A0[1]
836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[0]
837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[6]*a[2]
838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax		# a[3]
840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],16($tptr,$j)	# t[6]
842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	24($aptr,$j),$ai	# a[7]
845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[1],$A1[1]
846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[6]*a[5]
847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[1]
850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[0],$A0[0]
853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 lea	32($j),$j
854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[7]*a[4]
856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[0],-8($tptr,$j)	# t[7]
860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
861392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0,$j
862392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jne	.Lsqr4x_1st
863392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
864392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
865392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[1],$A1[1]
866392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[0]
867392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[7]*a[5]
868392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]
869392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
870392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
871392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[1],($tptr)		# t[8]
872392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($i),$i
873392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[0],8($tptr)		# t[9]
874392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lsqr4x_outer
875392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
876392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
877392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_outer:				# comments apply to $num==6 case
878392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-32($aptr,$i),$a0	# a[0]
879392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
880392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-24($aptr,$i),%rax	# a[1]
881392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
882392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16($aptr,$i),$ai	# a[2]
883392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,$a1
884392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
885392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-24($tptr,$i),$A0[0]	# t[1]
886392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
887392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[1]*a[0]
888392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
889392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax		# a[2]
890392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
891392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[0],-24($tptr,$i)	# t[1]
892392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
893392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
894392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
895392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[0]
896392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[2]*a[0]
897392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]
898392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
899392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
900392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],-16($tptr,$i)	# t[2]
901392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
902392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-16($i),$j		# j=-16
903392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
904392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
905392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
906392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	8($aptr,$j),$ai		# a[3]
907392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[1],$A1[1]
908392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	8($tptr,$j),$A1[0]
909392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[1]
910392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[2]*a[1]
911392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
912392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
913392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[1]
914392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
915392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
916392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[0],$A0[0]
917392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
918392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[3]*a[0]
919392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
920392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
921392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
922392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[0],8($tptr,$j)	# t[3]
923392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
924392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($j),$j
925392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lsqr4x_inner
926392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
927392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
928392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_inner:
929392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	($aptr,$j),$ai		# a[4]
930392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
931392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	($tptr,$j),$A1[1]
932392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[0]
933392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[3]*a[1]
934392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
935392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
936392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
937392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
938392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
939392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[1],$A0[1]
940392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[0]
941392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[4]*a[0]
942392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
943392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax		# a[3]
944392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
945392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],($tptr,$j)	# t[4]
946392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
947392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	8($aptr,$j),$ai		# a[5]
948392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[1],$A1[1]
949392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	8($tptr,$j),$A1[0]
950392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[1]
951392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[4]*a[3]
952392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
953392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
954392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[1]
955392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
956392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
957392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[0],$A0[0]
958392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($j),$j		# j++
959392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
960392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[5]*a[2]
961392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
962392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
963392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
964392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
965392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
966392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0,$j
967392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jne	.Lsqr4x_inner
968392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
969392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
970392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[1],$A1[1]
971392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[0]
972392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[5]*a[3]
973392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]
974392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
975392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
976392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
977392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
978392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
979392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$16,$i
980392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lsqr4x_outer
981392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
982392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom					# comments apply to $num==4 case
983392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-32($aptr),$a0		# a[0]
984392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
985392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-24($aptr),%rax		# a[1]
986392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
987392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16($aptr),$ai		# a[2]
988392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,$a1
989392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
990392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
991392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[1]*a[0]
992392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
993392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax		# a[2]
994392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
995392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[0],-24($tptr)	# t[1]
996392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
997392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
998392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
999392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[0]
1000392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[2]*a[0]
1001392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]
1002392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
1003392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
1004392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],-16($tptr)	# t[2]
1005392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1006392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-8($aptr),$ai		# a[3]
1007392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[2]*a[1]
1008392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1009392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
1010392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
1011392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1012392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
1013392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[0],$A0[0]
1014392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	%rdx,$A1[1]
1015392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
1016392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a0			# a[3]*a[0]
1017392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1018392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$ai,%rax
1019392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
1020392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[0],-8($tptr)	# t[3]
1021392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1022392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
1023392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[1],$A1[1]
1024392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[0]
1025392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$a1			# a[3]*a[1]
1026392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]
1027392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-16($aptr),%rax		# a[2]
1028392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
1029392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1030392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[1],($tptr)		# t[4]
1031392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[0],8($tptr)		# t[5]
1032392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1033392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$ai			# a[2]*a[3]
1034392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1035392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
##############################################################
# Shift-and-add part of the 4x squaring code.
#
# The heredoc below first folds the a[2]*a[3] product, left in
# %rax:%rdx by the "mul $ai" emitted just before this block, into
# the top of the t[] window (t[5], t[6], and t[7] which receives
# the freshly zeroed $carry; indices follow the $num==4 comments).
# It then walks t[] from the bottom: every pair of words is shifted
# left by one bit (doubling the accumulated cross products) and
# a[i]*a[i] is added on top, with the result stored back in place.
#
# Layout of the emitted code:
#   - prologue: handles two a[] words, then advances $i by 16;
#   - .Lsqr4x_shift_n_add: handles four a[] words per iteration,
#     advancing $i by 32 until it reaches zero;
#   - epilogue: handles the last two a[] words using fixed negative
#     offsets from $tptr/$aptr.
# $tptr is reset to 64(%rsp,$num,2) and t[] is addressed as
# ($tptr,$i,2), i.e. the a[] byte offset in $i scaled by two.
#
# Register roles (all are aliases of names declared outside this
# block, so the concrete registers are not visible here):
#   $shift - bit shifted out of the previous t[] word
#            (t[2*i+1]>>63), merged into the next word by lea;
#   $carry - carry flag parked between additions: "sbb $carry,$carry"
#            saves CF, "neg $carry" puts it back before the next adc;
#   @S     - four scratch registers holding the shifted words while
#            the square is added.
#
# NOTE(review): the "lea ($j,$A0[1],2),..." forms use $j as the base
# register, so they presumably rely on $j being zero at this point
# (the preceding loops exit on "cmp \$0,$j") - confirm before
# reusing $j in the code above.
1036392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($shift,$carry)=($a0,$a1);
1037392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @S=(@A1,$ai,$n0);
1038392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1039392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 add	\$16,$i
1040392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 xor	$shift,$shift
1041392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 sub	$num,$i			# $i=16-$num
1042392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 xor	$carry,$carry
1043392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1044392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A1[0],%rax		# t[5]
1045392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,%rdx
1046392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,8($tptr)		# t[5]
1047392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdx,16($tptr)		# t[6]
1048392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$carry,24($tptr)	# t[7]
1049392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1050392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-16($aptr,$i),%rax	# a[0]
1051392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	64(%rsp,$num,2),$tptr
1052392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 xor	$A0[0],$A0[0]		# t[0]
1053392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
1054392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1055392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1056392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[0]
1057392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1058392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[1]
1059392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$A0[0],$S[1]		# | t[2*i]>>63
1060392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1061392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1062392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	%rax			# a[i]*a[i]
1063392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	neg	$carry			# mov $carry,cf
1064392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1065392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rax,$S[0]
1066392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1067392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[0],-32($tptr,$i,2)
1068392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$S[1]
1069392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1070392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1071392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$S[1],-24($tptr,$i,2)
1072392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 sbb	$carry,$carry		# mov cf,$carry
1073392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[0]
1074392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1075392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[1]
1076392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$A0[0],$S[3]		# | t[2*i]>>63
1077392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1078392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1079392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	%rax			# a[i]*a[i]
1080392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	neg	$carry			# mov $carry,cf
1081392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1082392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rax,$S[2]
1083392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1084392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[2],-16($tptr,$i,2)
1085392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$S[3]
1086392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($i),$i
1087392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[3],-40($tptr,$i,2)
1088392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	$carry,$carry		# mov cf,$carry
1089392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lsqr4x_shift_n_add
1090392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1091392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1092392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_shift_n_add:
1093392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1094392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[0]
1095392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1096392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[1]
1097392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$A0[0],$S[1]		# | t[2*i]>>63
1098392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1099392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1100392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	%rax			# a[i]*a[i]
1101392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	neg	$carry			# mov $carry,cf
1102392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1103392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rax,$S[0]
1104392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1105392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[0],-32($tptr,$i,2)
1106392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$S[1]
1107392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1108392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1109392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$S[1],-24($tptr,$i,2)
1110392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 sbb	$carry,$carry		# mov cf,$carry
1111392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[0]
1112392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[1]
1114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$A0[0],$S[3]		# | t[2*i]>>63
1115392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1116392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1117392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	%rax			# a[i]*a[i]
1118392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	neg	$carry			# mov $carry,cf
1119392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1120392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rax,$S[2]
1121392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1122392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[2],-16($tptr,$i,2)
1123392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$S[3]
1124392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1125392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1126392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$S[3],-8($tptr,$i,2)
1127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 sbb	$carry,$carry		# mov cf,$carry
1128392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[0]
1129392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1130392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[1]
1131392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$A0[0],$S[1]		# | t[2*i]>>63
1132392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1133392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1134392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	%rax			# a[i]*a[i]
1135392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	neg	$carry			# mov $carry,cf
1136392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1137392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rax,$S[0]
1138392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1139392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[0],0($tptr,$i,2)
1140392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$S[1]
1141392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1142392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1143392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$S[1],8($tptr,$i,2)
1144392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 sbb	$carry,$carry		# mov cf,$carry
1145392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[0]
1146392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1147392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[1]
1148392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$A0[0],$S[3]		# | t[2*i]>>63
1149392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1150392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1151392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	%rax			# a[i]*a[i]
1152392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	neg	$carry			# mov $carry,cf
1153392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1154392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rax,$S[2]
1155392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1156392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[2],16($tptr,$i,2)
1157392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$S[3]
1158392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[3],24($tptr,$i,2)
1159392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	$carry,$carry		# mov cf,$carry
1160392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$32,$i
1161392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lsqr4x_shift_n_add
1162392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1163392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1164392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[0]
1165392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1166392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[1]
1167392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$A0[0],$S[1]		# | t[2*i]>>63
1168392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1169392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1170392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	%rax			# a[i]*a[i]
1171392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	neg	$carry			# mov $carry,cf
1172392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1173392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rax,$S[0]
1174392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1175392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[0],-32($tptr)
1176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$S[1]
1177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1178392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$S[1],-24($tptr)
1180392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 sbb	$carry,$carry		# mov cf,$carry
1181392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[0]
1182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$63,$A0[1]
1184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$A0[0],$S[3]		# | t[2*i]>>63
1185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	%rax			# a[i]*a[i]
1186392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	neg	$carry			# mov $carry,cf
1187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rax,$S[2]
1188392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$S[3]
1189392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[2],-16($tptr)
1190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$S[3],-8($tptr)
1191392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1192392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
1193392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom##############################################################
1194392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Montgomery reduction part, "word-by-word" algorithm.
1195392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
# The code emitted below reduces the t[] vector that lives on the stack
# starting at 64(%rsp).  Every pass of .Lsqr4x_mont_outer derives two
# multipliers: m0 is the lowest live t[] word times $n0, and m1 is $n0
# times the updated next word obtained after n[0]*m0 and n[1]*m0 have
# been folded in.  The n[]*m0 and n[]*m1 products are then accumulated
# into t[] in interleaved fashion; .Lsqr4x_mont_inner is unrolled so
# that one iteration consumes four n[] words ($j advances by 32 bytes
# and the loop ends when $j reaches zero).  After each outer pass $tptr
# is moved up by 16 bytes, and the outer loop repeats while $tptr is
# below the end-of-buffer address kept at 8(%rsp).  Carries leaving the
# top of the current window are collected in $topbit, which is written
# to ($tptr) once the outer loop has finished.
#
# Stack slots touched here: 0(%rsp) holds $num while the loops run (it
# is reloaded at the end, presumably because one of the @Ni registers
# overlaps $num -- confirm against the register assignments outside this
# section), 8(%rsp) holds the end of t[], 40(%rsp) and 48(%rsp) supply
# the n[] pointer and the n0 value according to the inline comments;
# they are assumed to be stored by code outside this section.
1196392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
1197392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($topbit,$nptr)=("%rbp",$aptr);	# $nptr shares the register of $aptr
1198392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($m0,$m1)=($a0,$a1);	# m0/m1 are aliases for $a0/$a1
1199392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @Ni=("%rbx","%r9");	# hold the n[] words loaded ahead of each mul
1200392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	40(%rsp),$nptr		# restore $nptr
1202392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	48(%rsp),$n0		# restore *n0
1203392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$j,$j
1204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$num,0(%rsp)		# save $num
1205392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	$num,$j			# $j=-$num
1206392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
1207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$n0,$m0			#		# modsched #
1208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
1209392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	64(%rsp,$num),$tptr	# end of t[] window
1210392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,8(%rsp)		# save end of t[] buffer
1211392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($nptr,$num),$nptr	# end of n[] buffer
1212392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$topbit,$topbit		# $topbit=0
1213392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1214392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0($nptr,$j),%rax	# n[0]		# modsched #
1215392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1216392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
1217392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	%rax,$Ni[0]		#		# modsched #
1218392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lsqr4x_mont_outer
1219392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1220392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1221392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_mont_outer:
1222392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
1223392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m0			# n[0]*m0
1224392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# n[0]*m0+t[0]
1225392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[1],%rax
1226392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
1227392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$n0,$m1
1228392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1229392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
1230392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	8($tptr,$j),$A0[1]
1231392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[0]
1232392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m0			# n[1]*m0
1233392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]		# n[1]*m0+t[1]
1234392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[0],%rax
1235392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
1236392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1237392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	imulq	$A0[1],$m1
1238392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1239392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16($nptr,$j),$Ni[0]	# n[2]
1240392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[1],$A1[1]
1241392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[1],$A1[0]
1242392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[1]
1243392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m1			# n[0]*m1
1244392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
1245392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[0],%rax
1246392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[1]
1247392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[0],8($tptr,$j)	# "t[1]"
1248392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1249392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
1250392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	16($tptr,$j),$A0[0]
1251392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
1252392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m0			# n[2]*m0
1253392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# n[2]*m0+t[2]
1254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[1],%rax
1255392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
1256392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1257392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	24($nptr,$j),$Ni[1]	# n[3]
1258392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
1259392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[0],$A1[1]
1260392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[0]
1261392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m1			# n[1]*m1
1262392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
1263392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[1],%rax
1264392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
1265392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[1],16($tptr,$j)	# "t[2]"
1266392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1267392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
1268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	24($tptr,$j),$A0[1]
1269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($j),$j
1270392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[0]
1271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m0			# n[3]*m0
1272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]		# n[3]*m0+t[3]
1273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[0],%rax
1274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
1275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lsqr4x_mont_inner
1276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_mont_inner:
1279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	($nptr,$j),$Ni[0]	# n[4]
1280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[1],$A1[1]
1281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[1],$A1[0]
1282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[1]
1283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m1			# n[2]*m1
1284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
1285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[0],%rax
1286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[1]
1287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[0],-8($tptr,$j)	# "t[3]"
1288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
1290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	($tptr,$j),$A0[0]
1291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
1292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m0			# n[4]*m0
1293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# n[4]*m0+t[4]
1294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[1],%rax
1295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
1296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($nptr,$j),$Ni[1]	# n[5]
1298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
1299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[0],$A1[1]
1300392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[0]
1301392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m1			# n[3]*m1
1302392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
1303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[1],%rax
1304392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
1305392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[1],($tptr,$j)	# "t[4]"
1306392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1307392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
1308392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	8($tptr,$j),$A0[1]
1309392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[0]
1310392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m0			# n[5]*m0
1311392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]		# n[5]*m0+t[5]
1312392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[0],%rax
1313392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
1314392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1315392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1316392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16($nptr,$j),$Ni[0]	# n[6]
1317392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[1],$A1[1]
1318392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[1],$A1[0]
1319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[1]
1320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m1			# n[4]*m1
1321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
1322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[0],%rax
1323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[1]
1324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[0],8($tptr,$j)	# "t[5]"
1325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
1327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	16($tptr,$j),$A0[0]
1328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
1329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m0			# n[6]*m0
1330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[0]		# n[6]*m0+t[6]
1331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[1],%rax
1332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[1]
1333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	24($nptr,$j),$Ni[1]	# n[7]
1335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
1336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[0],$A1[1]
1337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[0]
1338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m1			# n[5]*m1
1339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
1340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[1],%rax
1341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
1342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[1],16($tptr,$j)	# "t[6]"
1343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[0],$A0[0]
1345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	24($tptr,$j),$A0[1]
1346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($j),$j
1347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[0]
1348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m0			# n[7]*m0
1349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A0[1]		# n[7]*m0+t[7]
1350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[0],%rax
1351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A0[0]
1352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0,$j
1353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jne	.Lsqr4x_mont_inner
1354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 sub	0(%rsp),$j		# $j=-$num	# modsched #
1356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$n0,$m0			#		# modsched #
1357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[1],$A1[1]
1359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[1],$A1[0]
1360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[1]
1361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m1			# n[6]*m1
1362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
1363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$Ni[1],%rax
1364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[1]
1365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[0],-8($tptr)	# "t[7]"
1366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A0[1],$A0[1]
1368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	($tptr),$A0[0]		# +t[8]
1369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
1370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
1371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$topbit,$A0[0]
1372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A0[1]
1373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
1375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$A1[0],$A1[0]
1376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[0],$A1[1]
1378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
1379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$A1[0]
1380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mul	$m1			# n[7]*m1
1381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
1382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 mov	$Ni[0],%rax		#		# modsched #
1383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	%rdx,$A1[0]
1384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[1],($tptr)		# "t[8]"
1385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$topbit,$topbit
1387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	8($tptr),$A1[0]		# +t[9]
1388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	$topbit,$topbit
1389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$A0[1],$A1[0]
1390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($tptr),$tptr		# "t[$num]>>128"
1391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	adc	\$0,$topbit
1392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$A1[0],-8($tptr)	# "t[9]"
1393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	8(%rsp),$tptr		# are we done?
1394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lsqr4x_mont_outer
1395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0(%rsp),$num		# restore $num
1397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$topbit,($tptr)		# save $topbit
1398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
1400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom##############################################################
1401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Post-condition, 4x unrolled copy from bn_mul_mont
1402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
# The code emitted below first computes rp[] = tp[] - np[] with a
# sub/sbb borrow chain, where tp[] is the upper half of the temporary
# vector (64(%rsp,$num)); .Lsqr4x_sub stores four words per iteration
# and relies on lea/dec leaving CF untouched between the sbb steps.
# The word stored just past tp[] (the "overflow bit") minus the final
# borrow is then used as a mask to choose the copy source: tp when the
# subtraction borrowed, rp otherwise.  Finally .Lsqr4x_copy moves the
# selected data to rp[] 32 bytes per iteration with unaligned loads and
# stores, while both halves of the temporary vector are overwritten
# with the zero held in %xmm0.
1403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
1404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($tptr,$nptr)=("%rbx",$aptr);	# $tptr is re-bound to %rbx inside this scope
1405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @ri=("%rax","%rdx","%r10","%r11");	# four words in flight during the subtraction
1406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	64(%rsp,$num),@ri[0]	# tp[0]
1408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
1409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	40(%rsp),$nptr		# restore $nptr
1410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$5,$num		# num/4
1411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($tptr),@ri[1]		# t[1]
1412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$i,$i			# i=0 and clear CF!
1413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	32(%rsp),$rptr		# restore $rptr
1415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	0($nptr),@ri[0]
1416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16($tptr),@ri[2]	# t[2]
1417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	24($tptr),@ri[3]	# t[3]
1418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	8($nptr),@ri[1]
1419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-1($num),$j		# j=num/4-1
1420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lsqr4x_sub
1421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_sub:
1423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	16($nptr,$i,8),@ri[2]
1426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
1427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	40($tptr,$i,8),@ri[1]
1428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	24($nptr,$i,8),@ri[3]
1429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	32($nptr,$i,8),@ri[0]
1432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	48($tptr,$i,8),@ri[2]
1433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	56($tptr,$i,8),@ri[3]
1434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	40($nptr,$i,8),@ri[1]
1435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	4($i),$i		# i++
1436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$j			# doesn't affect CF!
1437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lsqr4x_sub
1438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
1441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	16($nptr,$i,8),@ri[2]
1442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	24($nptr,$i,8),@ri[3]
1444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sbb	\$0,@ri[0]		# handle upmost overflow bit
1447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$i,$i			# i=0
1449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	@ri[0],$tptr
1450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	not	@ri[0]
1451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rptr,$nptr
1452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	@ri[0],$nptr
1453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-1($num),$j
1454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	or	$nptr,$tptr		# tp=borrow?tp:rp
1455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm0,%xmm0
1457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	64(%rsp,$num,8),$nptr
1458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($tptr),%xmm1
1459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($nptr,$num,8),$nptr
1460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
1461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
1462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	%xmm1,($rptr)
1463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lsqr4x_copy
1464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_copy:				# copy or in-place refresh
1466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16($tptr,$i),%xmm2
1467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	32($tptr,$i),%xmm1
1468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
1470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
1472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	%xmm2,16($rptr,$i)
1473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	%xmm1,32($rptr,$i)
1474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($i),$i
1475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$j
1476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lsqr4x_copy
1477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16($tptr,$i),%xmm2
1479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	%xmm2,16($rptr,$i)
1482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
# Function tail of bn_sqr4x_mont: fetch the pointer kept at 56(%rsp),
# set the return value in %rax to 1, reload %r15, %r14, %r13, %r12, %rbp
# and %rbx from the six 8-byte slots at that pointer, then set %rsp to
# 48 bytes above it and return.
1484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	56(%rsp),%rsi		# restore %rsp
1486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$1,%rax
1487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0(%rsi),%r15
1488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8(%rsi),%r14
1489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16(%rsi),%r13
1490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	24(%rsi),%r12
1491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	32(%rsi),%rbp
1492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	40(%rsi),%rbx
1493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	48(%rsi),%rsp
1494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lsqr4x_epilogue:
1495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
1496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bn_sqr4x_mont,.-bn_sqr4x_mont
1497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}}}
# Append the module identification string to the generated assembly and
# re-align to 16 bytes afterwards.  The \@ keeps Perl from interpolating
# an array inside the heredoc.
1499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1500221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1501221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.align	16
1502221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___
1503221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1504221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1505221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1506221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromif ($win64) {
1507221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$rec="%rcx";
1508221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$frame="%rdx";
1509221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$context="%r8";
1510221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$disp="%r9";
1511221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1512221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=<<___;
1513221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.extern	__imp_RtlVirtualUnwind
1514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	mul_handler,\@abi-omnipotent
1515221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.align	16
1516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommul_handler:
1517221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	push	%rsi
1518221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	push	%rdi
1519221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	push	%rbx
1520221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	push	%rbp
1521221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	push	%r12
1522221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	push	%r13
1523221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	push	%r14
1524221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	push	%r15
1525221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	pushfq
1526221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	sub	\$64,%rsp
1527221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1528221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	120($context),%rax	# pull context->Rax
1529221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	248($context),%rbx	# pull context->Rip
1530221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($disp),%rsi		# disp->ImageBase
1532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	56($disp),%r11		# disp->HandlerData
1533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0(%r11),%r10d		# HandlerData[0]
1535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsi,%r10),%r10	# end of prologue label
1536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip<end of prologue label
1537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcommon_seh_tail
1538221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1539221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	152($context),%rax	# pull context->Rsp
1540221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	4(%r11),%r10d		# HandlerData[1]
1542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsi,%r10),%r10	# epilogue label
1543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip>=epilogue label
1544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jae	.Lcommon_seh_tail
1545221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1546221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	192($context),%r10	# pull $num
1547221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
1548221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	lea	48(%rax),%rax
1549221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1550221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	-8(%rax),%rbx
1551221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	-16(%rax),%rbp
1552221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	-24(%rax),%r12
1553221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	-32(%rax),%r13
1554221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	-40(%rax),%r14
1555221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	-48(%rax),%r15
1556221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%rbx,144($context)	# restore context->Rbx
1557221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%rbp,160($context)	# restore context->Rbp
1558221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%r12,216($context)	# restore context->R12
1559221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%r13,224($context)	# restore context->R13
1560221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%r14,232($context)	# restore context->R14
1561221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%r15,240($context)	# restore context->R15
1562221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcommon_seh_tail
1564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	mul_handler,.-mul_handler
1565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# sqr_handler -- exception handler referenced from .LSEH_info_bn_sqr4x_mont.
#
# Picks the stack pointer value to report for the interrupted frame based
# on where context->Rip lies relative to two hard-coded labels:
#   Rip <  .Lsqr4x_body      use context->Rax as is
#   Rip >= .Lsqr4x_epilogue  use context->Rsp as is
#   otherwise                load the pointer saved at 56(context->Rsp),
#                            step 48 bytes past it, and copy the six
#                            quadwords just below (rbx, rbp, r12-r15)
#                            into the context record
# All three paths meet at .Lcommon_seh_tail, which mul_handler also jumps
# to.  The tail fixes up Rsp/Rsi/Rdi in the context, copies the context to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
#
# NOTE(review): the context and disp operands are interpolated Perl
# variables set outside this section; presumably they name the registers
# carrying the handler's 3rd and 4th arguments -- confirm.
1566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	sqr_handler,\@abi-omnipotent
1567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromsqr_handler:
# Preserve every register and the flags this handler touches; the pops at
# the end of the common tail undo these pushes in reverse order.
1569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rsi
1570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rdi
1571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
1572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
1573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
1574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
1575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
1576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
1577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pushfq
# 64 bytes of scratch: the RtlVirtualUnwind call below writes its stack
# arguments at 32..56(%rsp).
1578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$64,%rsp
1579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	120($context),%rax	# pull context->Rax
1581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	248($context),%rbx	# pull context->Rip
1582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Fault before the body label: go to the tail with rax = context->Rax.
1583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.Lsqr4x_body(%rip),%r10
1584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip<.Lsqr4x_body
1585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcommon_seh_tail
1586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	152($context),%rax	# pull context->Rsp
1588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Fault at or after the epilogue label: go to the tail with
# rax = context->Rsp.
1589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.Lsqr4x_epilogue(%rip),%r10
1590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip>=.Lsqr4x_epilogue
1591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jae	.Lcommon_seh_tail
1592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Fault inside the body.
# NOTE(review): offset 56 has to match the slot in which the function body
# stores its original stack pointer; that store is not visible in this
# section -- confirm against the function prologue.
1593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	56(%rax),%rax		# pull saved stack pointer
# Step past the six 8-byte register slots read at -8..-48 below.
1594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	48(%rax),%rax
1595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-8(%rax),%rbx
1597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-16(%rax),%rbp
1598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-24(%rax),%r12
1599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-32(%rax),%r13
1600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-40(%rax),%r14
1601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	-48(%rax),%r15
1602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rbx,144($context)	# restore context->Rbx
1603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rbp,160($context)	# restore context->Rbp
1604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r12,216($context)	# restore context->R12
1605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r13,224($context)	# restore context->R13
1606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r14,232($context)	# restore context->R14
1607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r15,240($context)	# restore context->R15
1608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Shared tail (also the jump target of mul_handler).  On entry rax holds
# the stack pointer value selected above.
# NOTE(review): 8(%rax) and 16(%rax) are presumably the slots in which the
# function entry sequence parked rdi and rsi -- confirm against the
# generated prologue.
1609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcommon_seh_tail:
1610221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	8(%rax),%rdi
1611221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	16(%rax),%rsi
1612221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%rax,152($context)	# restore context->Rsp
1613221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%rsi,168($context)	# restore context->Rsi
1614221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%rdi,176($context)	# restore context->Rdi
1615221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# Copy the patched context record over disp->ContextRecord.
1616221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	40($disp),%rdi		# disp->ContextRecord
1617221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	$context,%rsi		# context
1618221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords (154*8 = 1232 bytes)
1619221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.long	0xa548f3fc		# cld; rep movsq
1620221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# Set up the eight RtlVirtualUnwind arguments: four in rcx, rdx, r8, r9 and
# four on the stack.  rcx is zeroed once and doubles as the NULL stored
# for arg8.
1621221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	$disp,%rsi
1622221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1623221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1624221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1625221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1626221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	40(%rsi),%r10		# disp->ContextRecord
1627221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	lea	56(%rsi),%r11		# &disp->HandlerData
1628221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1629221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%r10,32(%rsp)		# arg5
1630221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%r11,40(%rsp)		# arg6
1631221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%r12,48(%rsp)		# arg7
1632221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	%rcx,56(%rsp)		# arg8, (NULL)
1633221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	call	*__imp_RtlVirtualUnwind(%rip)
1634221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# Return value 1 and unwind this handler's own frame: release the scratch
# area, then pop flags and registers in the reverse of the push order.
1635221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	mov	\$1,%eax		# ExceptionContinueSearch
1636221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	add	\$64,%rsp
1637221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	popfq
1638656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	pop	%r15
1639656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	pop	%r14
1640656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	pop	%r13
1641656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	pop	%r12
1642656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	pop	%rbp
1643656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	pop	%rbx
1644221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	pop	%rdi
1645221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	pop	%rsi
1646656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project	ret
1647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	sqr_handler,.-sqr_handler
1648221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# Function table: one (begin, end, unwind-info) triple of image-relative
# addresses for each of bn_mul_mont, bn_mul4x_mont and bn_sqr4x_mont.
1649221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.section	.pdata
1650221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.align	4
1651221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.rva	.LSEH_begin_bn_mul_mont
1652221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.rva	.LSEH_end_bn_mul_mont
1653221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.rva	.LSEH_info_bn_mul_mont
1654221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
1655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_begin_bn_mul4x_mont
1656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_bn_mul4x_mont
1657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_bn_mul4x_mont
1658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_begin_bn_sqr4x_mont
1660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_bn_sqr4x_mont
1661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_bn_sqr4x_mont
1662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Unwind info records, one per .pdata entry above.  Each starts with the
# four header bytes 9,0,0,0 followed by the RVA of the handler to invoke.
# The two mul records share mul_handler and append a (body, epilogue) label
# pair as HandlerData[]; the sqr record carries no HandlerData because
# sqr_handler compares against .Lsqr4x_body and .Lsqr4x_epilogue directly.
# NOTE(review): per the Windows x64 UNWIND_INFO layout the leading 9 should
# decode as version 1 with the exception-handler flag set, and the three
# zero bytes as no prolog, no unwind codes and no frame register -- confirm
# against the platform documentation.
1663221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.section	.xdata
1664221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.align	8
1665221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.LSEH_info_bn_mul_mont:
1666221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.byte	9,0,0,0
1667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	mul_handler
1668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
1669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_bn_mul4x_mont:
1670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
1671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	mul_handler
1672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
1673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_bn_sqr4x_mont:
1674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
1675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	sqr_handler
1676656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project___
1677221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom}
1678656d9c7f52f88b3a3daccafa7655dec086c4756eThe Android Open Source Project
# Emit the accumulated assembly text.
print $code;
# STDOUT is buffered, so a failed write (full disk, or a downstream filter
# that exited with an error if STDOUT was reopened as a pipe earlier in the
# script) may only be reported when the handle is closed.  Abort with a
# non-zero status instead of silently leaving truncated output behind.
close STDOUT or die "error closing STDOUT: $!";
1681