1480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#!/usr/bin/env perl
2480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
3480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# ====================================================================
4480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# project. The module is, however, dual licensed under OpenSSL and
6480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# CRYPTOGAMS licenses depending on where you obtain it. For further
7480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# details see http://www.openssl.org/~appro/cryptogams/.
8480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# ====================================================================
9480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
10480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# December 2005
11480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#
12480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# for undertaking this effort are multiple. First of all, UltraSPARC is not
14480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# the whole SPARCv9 universe and other VIS-free implementations deserve
15480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# optimized code as much. Secondly, newly introduced UltraSPARC T1,
16480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
17480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# several integrated RSA/DSA accelerator circuits accessible through
19480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# kernel driver [only(*)], but having decent user-land software
20480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# implementation is important too. Finally, reasons like desire to
21480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# experiment with dedicated squaring procedure. Yes, this module
22480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# implements one, because it was easiest to draft it in SPARCv9
23480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# instructions...
24480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
25480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# (*)	Engine accessing the driver in question is on my TODO list.
26480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	For reference, accelerator is estimated to give 6 to 10 times
27480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	improvement on single-threaded RSA sign. It should be noted
28480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	that 6-10x improvement coefficient does not actually mean
29480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	something extraordinary in terms of absolute [single-threaded]
30480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	performance, as SPARCv9 instruction set is by all means least
31480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	suitable for high performance crypto among other 64 bit
32480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	platforms. 6-10x factor simply places T1 in same performance
33480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
34480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	appear impressive at all, but it's the sign operation which is
35480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#	far more critical/interesting.
36480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
37480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# You might notice that inner loops are modulo-scheduled:-) This has
38480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# essentially negligible impact on UltraSPARC performance, it's
39480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# Fujitsu SPARC64 V users who should notice and hopefully appreciate
40480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# the advantage... Currently this module surpasses sparcv9a-mont.pl
41480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# module still has hidden potential [see TODO list there], which is
43480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# estimated to be larger than 20%...
44480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
# Register names for the six bn_mul_mont arguments as seen after the `save`
# instruction in the generated code. The entry check that runs before `save`
# tests the last argument as %o5; from `.Lenter` on it is addressed as $num.
# In the generated code $n0 is overwritten with the word it points to
# (`ld [$n0],$n0`) and $num is scaled to a byte count (`sll $num,2,$num`).
45480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# int bn_mul_mont(
46480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$rp="%i0";	# BN_ULONG *rp,
47480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$ap="%i1";	# const BN_ULONG *ap,
48480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$bp="%i2";	# const BN_ULONG *bp,
49480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$np="%i3";	# const BN_ULONG *np,
50480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$n0="%i4";	# const BN_ULONG *n0,
51480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$num="%i5";	# int num);
52480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
# Pick the target ABI from the command line. The default is 32-bit; any
# argument containing "-m64" or "-xarch=v9" (the patterns are unanchored)
# switches to 64-bit.
#   $bias  - added to %sp in the generated code to obtain the "real top of
#            stack", and subtracted again when the new %sp is written back.
#   $frame - size handed to `save`; also the offset (together with $bias)
#            from %sp to the temporary vector tp.
53480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$bits=32;
54480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.orgfor (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.orgif ($bits==64)	{ $bias=2047; $frame=192; }
56480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.orgelse		{ $bias=0;    $frame=128; }
57480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
# Symbolic names for the working registers used by the generated code.
#
# $car0/$car1 - running carries of the two product chains (ap[j]*$mul0 and
#               np[j]*$mul1); each step keeps the part above bit 31 via
#               `srlx ...,32`.
# $car2       - top carry kept between passes and consumed in the tail.
# $acc0/$acc1 - products handed from one loop iteration to the next.
# $mask       - the constant 0xffffffff, built with sethi/or at entry.
# $tmp0/$tmp1 - freshly computed products, moved into $acc0/$acc1.
58480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$car0="%o0";
59480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$car1="%o1";
60480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$car2="%o2";	# 1 bit
61480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$acc0="%o3";
62480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$acc1="%o4";
63480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$mask="%g1";	# 32 bits, what a waste...
64480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$tmp0="%g4";
65480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$tmp1="%g5";
66480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
# $i/$j       - outer/inner loop counters, kept as byte offsets (stepped by 4
#               and compared against the byte-scaled $num).
# $mul0       - current multiplier word (bp[i]; ap[i] on the squaring path).
# $mul1       - ("t[0]"*n0) masked to 32 bits, the multiplier for np[j].
# $tp         - pointer into the temporary vector on the stack.
# $apj/$npj/$tpj - words of ap, np and tp loaded ahead of their use.
67480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$i="%l0";
68480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$j="%l1";
69480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$mul0="%l2";
70480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$mul1="%l3";
71480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$tp="%l4";
72480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$apj="%l5";
73480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$npj="%l6";
74480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$tpj="%l7";
75480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
# Name of the global symbol emitted for the routine.
76480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$fname="bn_mul_mont_int";
77480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
# General (ap != bp) Montgomery multiplication path. Everything between
# `$code=<<___;` and the closing `___` is assembly text with the Perl
# variables defined above interpolated into it.
#
#   $fname:   when num (%o5) is below 4, returns 0 through `retl` without
#             executing `save`; otherwise continues at .Lenter. The `sethi`
#             in the branch delay slot starts building $mask.
#   .Lenter:  executes `save`, scales num to bytes, replaces the n0 pointer
#             by the word it points to, lowers %sp by num bytes (rounded
#             down to a multiple of 1024) to make room for the temporary
#             vector tp, and branches to .Lbn_sqr_mont when ap == bp.
#   .L1st:    first pass: ap[j]*bp[0] and np[j]*$mul1 are summed word by
#             word and stored to tp.
#   .Louter / .Linner: one pass per remaining bp[i], adding the previous
#             contents of tp into the sums and storing them back.
#   .Lsub:    stores tp[j]-np[j] (with borrow) to rp for all words.
#   .Lcopy:   `subc` folds the final borrow into $car2; the result is used
#             as a bit mask to pick tp or rp as the source pointer. The
#             loop then copies that source to rp and stores zero over tp.
#             The routine returns 1.
#
# The inner loops are modulo-scheduled (see the file header and the
# !prologue!/!epilogue! markers), so the instruction order is deliberate.
# NOTE(review): the back-quoted expression on the `be,pt` line is presumably
# evaluated by a later pass that is outside this view -- confirm.
78480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$code=<<___;
79480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.section	".text",#alloc,#execinstr
80480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
81480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.global	$fname
82480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.align	32
83480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$fname:
84480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	%o5,4			! 128 bits minimum
85480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bge,pt	%icc,.Lenter
86480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	sethi	%hi(0xffffffff),$mask
87480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	retl
88480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	clr	%o0
89480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.align	32
90480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lenter:
91480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	save	%sp,-$frame,%sp
92480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	sll	$num,2,$num		! num*=4
93480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$mask,%lo(0xffffffff),$mask
94480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$n0],$n0
95480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$ap,$bp
96480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$num,$mask,$num
97480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$bp],$mul0		! bp[0]
98480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	nop
99480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
100480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	%sp,$bias,%o7		! real top of stack
101480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
102480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	sub	%o7,$num,%o7
103480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+4],$apj		! ap[1]
104480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	%o7,-1024,%o7
105480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np],$car1		! np[0]
106480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	sub	%o7,$bias,%sp		! alloca
107480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+4],$npj		! np[1]
108480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
109480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	12,$j
110480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
111480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
112480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
113480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
114480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	%sp,$bias+$frame,$tp
115480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+8],$apj		!prologue!
116480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
117480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
118480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$mul1,$mask,$mul1
119480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
120480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
121480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
122480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
123480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
124480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+8],$npj		!prologue!
125480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
126480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tmp0,$acc0		!prologue!
127480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
128480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.L1st:
129480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$tmp0
130480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$tmp1
131480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0
132480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+$j],$apj		! ap[j]
133480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
134480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
135480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+$j],$npj		! np[j]
136480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
137480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
138480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$j,4,$j			! j++
139480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tmp0,$acc0
140480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]
141480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$j,$num
142480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tmp1,$acc1
143480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
144480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bl	%icc,.L1st
145480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,4,$tp		! tp++
146480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org!.L1st
147480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
148480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$tmp0	!epilogue!
149480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$tmp1
150480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0
151480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
152480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
153480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
154480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
155480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]
156480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
157480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
158480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp0,$car0,$car0
159480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
160480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp1,$car1,$car1
161480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
162480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
163480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp+4]
164480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
165480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
166480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car1,$car1
167480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp+8]
168480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car2
169480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
170480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	4,$i			! i++
171480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$bp+4],$mul0		! bp[1]
172480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Louter:
173480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	%sp,$bias+$frame,$tp
174480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap],$car0		! ap[0]
175480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+4],$apj		! ap[1]
176480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np],$car1		! np[0]
177480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+4],$npj		! np[1]
178480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp],$tmp1		! tp[0]
179480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp+4],$tpj		! tp[1]
180480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	12,$j
181480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
182480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$car0,$mul0,$car0
183480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$tmp0	!prologue!
184480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp1,$car0,$car0
185480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+8],$apj		!prologue!
186480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
187480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
188480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$n0,$acc0,$mul1
189480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$mul1,$mask,$mul1
190480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
191480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$car1,$mul1,$car1
192480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1	!prologue!
193480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
194480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
195480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+8],$npj		!prologue!
196480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
197480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tmp0,$acc0		!prologue!
198480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
199480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Linner:
200480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$tmp0
201480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$tmp1
202480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car0,$car0
203480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+$j],$apj		! ap[j]
204480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0
205480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
206480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+$j],$npj		! np[j]
207480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
208480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp+8],$tpj		! tp[j]
209480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
210480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
211480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$j,4,$j			! j++
212480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tmp0,$acc0
213480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]		! tp[j-1]
214480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
215480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tmp1,$acc1
216480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$j,$num
217480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bl	%icc,.Linner
218480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,4,$tp		! tp++
219480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org!.Linner
220480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
221480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$tmp0	!epilogue!
222480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$tmp1
223480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car0,$car0
224480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0
225480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp+8],$tpj		! tp[j]
226480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
227480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
228480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
229480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
230480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]		! tp[j-1]
231480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
232480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
233480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car0,$car0
234480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp0,$car0,$car0
235480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
236480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp1,$car1,$car1
237480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
238480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp+4]		! tp[j-1]
239480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
240480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$i,4,$i			! i++
241480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
242480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
243480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car1,$car1
244480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$i,$num
245480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car2,$car1,$car1
246480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp+8]
247480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
248480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car2
249480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bl,a	%icc,.Louter
250480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$bp+$i],$mul0		! bp[i]
251480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org!.Louter
252480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
253480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,12,$tp
254480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
255480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Ltail:
256480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$np,$num,$np
257480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$rp,$num,$rp
258480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tp,$ap
259480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	sub	%g0,$num,%o7		! k=-num
260480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ba	.Lsub
261480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	subcc	%g0,%g0,%g0		! clear %icc.c
262480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.align	16
263480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lsub:
264480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp+%o7],%o0
265480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+%o7],%o1
266480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	subccc	%o0,%o1,%o1		! tp[j]-np[j]
267480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$rp,%o7,$i
268480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	%o7,4,%o7
269480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	brnz	%o7,.Lsub
270480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	%o1,[$i]
271480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	subc	$car2,0,$car2		! handle upmost overflow bit
272480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$tp,$car2,$ap
273480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	andn	$rp,$car2,$np
274480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$ap,$np,$ap
275480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	sub	%g0,$num,%o7
276480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
277480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lcopy:
278480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+%o7],%o0		! copy or in-place refresh
279480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	%g0,[$tp+%o7]		! zap tp
280480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	%o0,[$rp+%o7]
281480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	%o7,4,%o7
282480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	brnz	%o7,.Lcopy
283480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	nop
284480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	1,%i0
285480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ret
286480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	restore
287480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org___
288480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
289480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org########
290480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org######## code without following dedicated squaring procedure.
292480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org########
# $sbit carries the bit that falls out of a 32-bit word when a product is
# doubled (`add x,x,x`), so it can be OR-ed into the next doubled word.
# It shares %i2 with $bp: the squaring path is entered only when ap == bp,
# and the visible part of that path reads its operands through $ap.
293480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$sbit="%i2";		# re-use $bp!
294480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
295480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$code.=<<___;
296480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.align	32
297480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lbn_sqr_mont:
298480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
299480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$tmp0		!prologue!
300480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
301480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	%sp,$bias+$frame,$tp
302480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+8],$apj			!prologue!
303480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
304480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
305480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
306480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$mul1,$mask,$mul1
307480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
308480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
309480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1		!prologue!
310480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,1,$sbit
311480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+8],$npj			!prologue!
312480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,1,$car0
313480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
314480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
315480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tmp0,$acc0			!prologue!
316480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
317480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lsqr_1st:
318480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$tmp0
319480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$tmp1
320480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0		! ap[j]*a0+c0
321480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
322480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+$j],$apj			! ap[j]
323480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
324480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+$j],$npj			! np[j]
325480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
326480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$acc0,$acc0
327480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$acc0,$acc0
328480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tmp1,$acc1
329480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$acc0,32,$sbit
330480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$j,4,$j				! j++
331480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$acc0,$mask,$acc0
332480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$j,$num
333480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
334480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]
335480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	$tmp0,$acc0
336480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
337480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bl	%icc,.Lsqr_1st
338480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,4,$tp			! tp++
339480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org!.Lsqr_1st
340480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
341480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$tmp0		! epilogue
342480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$tmp1
343480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0		! ap[j]*a0+c0
344480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
345480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
346480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
347480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$acc0,$acc0
348480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$acc0,$acc0
349480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$acc0,32,$sbit
350480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$acc0,$mask,$acc0
351480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
352480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]
353480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
354480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
355480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
356480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp1,$car1,$car1
357480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
358480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
359480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$acc0,$acc0
360480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$acc0,$acc0
361480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$acc0,32,$sbit
362480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$acc0,$mask,$acc0
363480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
364480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp+4]
365480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
366480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
367480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car0,$car0
368480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$car0,$car0
369480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car1,$car1
370480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp+8]
371480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car2
372480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
373480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
374480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
375480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
376480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+4],$mul0			! ap[1]
377480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+8],$apj			! ap[2]
378480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np],$car1			! np[0]
379480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+4],$npj			! np[1]
380480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$n0,$tmp0,$mul1
381480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
382480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$mul0,$mul0,$car0
383480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$mul1,$mask,$mul1
384480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
385480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$car1,$mul1,$car1
386480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1
387480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp0,$car1,$car1
388480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
389480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+8],$npj			! np[2]
390480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
391480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp1,$car1,$car1
392480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
393480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
394480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,1,$sbit
395480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
396480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,1,$car0
397480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	12,$j
398480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[%sp+$bias+$frame]	! tp[0]=
399480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
400480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	%sp,$bias+$frame+4,$tp
401480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
402480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lsqr_2nd:
403480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$acc0
404480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1
405480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0
406480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car1,$car1
407480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+$j],$apj			! ap[j]
408480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
409480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+$j],$npj			! np[j]
410480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
411480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
412480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp+8],$tpj			! tp[j]
413480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$acc0,$acc0
414480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$j,4,$j				! j++
415480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$acc0,$acc0
416480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$acc0,32,$sbit
417480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$acc0,$mask,$acc0
418480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$j,$num
419480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
420480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]			! tp[j-1]
421480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
422480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bl	%icc,.Lsqr_2nd
423480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,4,$tp			! tp++
424480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org!.Lsqr_2nd
425480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
426480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$acc0
427480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1
428480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0
429480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car1,$car1
430480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
431480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
432480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
433480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$acc0,$acc0
434480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$acc0,$acc0
435480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$acc0,32,$sbit
436480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$acc0,$mask,$acc0
437480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
438480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]			! tp[j-1]
439480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
440480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
441480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car0,$car0
442480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$car0,$car0
443480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car1,$car1
444480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car2,$car1,$car1
445480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp+4]
446480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car2
447480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
448480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
449480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
450480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+8],$mul0			! ap[2]
451480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np],$car1			! np[0]
452480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+4],$npj			! np[1]
453480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$n0,$tmp1,$mul1
454480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$mul1,$mask,$mul1
455480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	8,$i
456480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
457480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$mul0,$mul0,$car0
458480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$car1,$mul1,$car1
459480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
460480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp1,$car1,$car1
461480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
462480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	%sp,$bias+$frame,$tp
463480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
464480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,1,$sbit
465480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,1,$car0
466480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	4,$j
467480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
468480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lsqr_outer:
469480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lsqr_inner1:
470480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1
471480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car1,$car1
472480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$j,4,$j
473480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp+8],$tpj
474480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$j,$i
475480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
476480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+$j],$npj
477480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]
478480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
479480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bl	%icc,.Lsqr_inner1
480480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,4,$tp
481480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org!.Lsqr_inner1
482480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
483480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$j,4,$j
484480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+$j],$apj			! ap[j]
485480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1
486480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car1,$car1
487480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+$j],$npj			! np[j]
488480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
489480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp+8],$tpj			! tp[j]
490480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
491480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]
492480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
493480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
494480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$j,4,$j
495480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$j,$num
496480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	be,pn	%icc,.Lsqr_no_inner2
497480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,4,$tp
498480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
499480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lsqr_inner2:
500480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$acc0
501480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1
502480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car1,$car1
503480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0
504480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+$j],$apj			! ap[j]
505480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
506480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+$j],$npj			! np[j]
507480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
508480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$acc0,$acc0
509480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp+8],$tpj			! tp[j]
510480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$acc0,$acc0
511480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$j,4,$j				! j++
512480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$acc0,32,$sbit
513480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$acc0,$mask,$acc0
514480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$j,$num
515480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
516480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
517480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]			! tp[j-1]
518480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
519480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bl	%icc,.Lsqr_inner2
520480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,4,$tp			! tp++
521480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
522480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lsqr_no_inner2:
523480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$apj,$mul0,$acc0
524480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1
525480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car1,$car1
526480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car0,$car0
527480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
528480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
529480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$acc0,$acc0
530480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$acc0,$acc0
531480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$acc0,32,$sbit
532480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$acc0,$mask,$acc0
533480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
534480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
535480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]			! tp[j-1]
536480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
537480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
538480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car0,$car0
539480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$car0,$car0
540480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car1,$car1
541480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car2,$car1,$car1
542480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp+4]
543480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car2
544480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
545480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$i,4,$i				! i++
546480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
547480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
548480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$ap+$i],$mul0			! ap[j]
549480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np],$car1			! np[0]
550480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+4],$npj			! np[1]
551480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$n0,$tmp1,$mul1
552480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$mul1,$mask,$mul1
553480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$i,4,$tmp0
554480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
555480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$mul0,$mul0,$car0
556480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$car1,$mul1,$car1
557480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,$mask,$acc0
558480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tmp1,$car1,$car1
559480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,32,$car0
560480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	%sp,$bias+$frame,$tp
561480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
562480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	and	$car0,1,$sbit
563480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car0,1,$car0
564480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
565480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$tmp0,$num			! i<num-1
566480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bl	%icc,.Lsqr_outer
567480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mov	4,$j
568480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
569480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.Lsqr_last:
570480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1
571480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car1,$car1
572480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$j,4,$j
573480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$tp+8],$tpj
574480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	cmp	$j,$i
575480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
576480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ld	[$np+$j],$npj
577480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]
578480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
579480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	bl	%icc,.Lsqr_last
580480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,4,$tp
581480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org!.Lsqr_last
582480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
583480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	mulx	$npj,$mul1,$acc1
584480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tpj,$car1,$car1
585480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc0,$car1,$car1
586480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$acc1,$car1,$car1
587480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp]
588480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car1
589480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
590480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car0,$car0		! recover $car0
591480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	or	$sbit,$car0,$car0
592480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car0,$car1,$car1
593480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$car2,$car1,$car1
594480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	st	$car1,[$tp+4]
595480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	srlx	$car1,32,$car2
596480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org
597480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	ba	.Ltail
598480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org	add	$tp,8,$tp
599480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.type	$fname,#function
600480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.size	$fname,(.-$fname)
601480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
602480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org.align	32
603480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org___
# Post-process the generated assembly text held in $code: each span enclosed
# in backticks is replaced by the result of evaluating its contents as a Perl
# expression (/e evaluates the replacement, /g handles every occurrence).
604480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Write the finished assembly source to standard output.
605480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.orgprint $code;
# close() flushes any buffered output; abort with a non-zero exit status if
# that fails so a truncated output file is never mistaken for a good one.
606480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.orgclose STDOUT or die "error closing STDOUT: $!";
607