#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling the outer loop, then a
# dedicated squaring procedure should give a further 20%, and the code
# can be adapted for a 32-bit application running on a 64-bit CPU. As
# for the latter, it won't be able to achieve "native" 64-bit
# performance, because in 32-bit application context every addc
# instruction will have to be expanded as addc, twice right shift by 32
# and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit	+65%
# 1024-bit	+35%
# 2048-bit	+18%
# 4096-bit	+4%

# The first command-line argument is the target "flavour". Only the word
# size embedded in it matters here: a flavour containing "32" selects the
# 32-bit mnemonics, otherwise one containing "64" selects the 64-bit ones,
# and anything else is rejected.
$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;	# bytes per bignum word
	$SIZE_T=4;		# bytes per saved register / pointer
	$RZONE=	224;		# NOTE(review): presumably the ABI red-zone size -- confirm

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$PUSH=	$ST;		# register save uses the plain store
	$POP=	$LD;		# register restore uses the plain load
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;	# bytes per bignum word
	$SIZE_T=8;		# bytes per saved register / pointer
	$RZONE=	288;		# NOTE(review): presumably the ABI red-zone size -- confirm

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$PUSH=	$ST;		# register save uses the plain store
	$POP=	$LD;		# register restore uses the plain load
} else { die "nonsense $flavour"; }

# $FRAME is the fixed part of the stack allocation; the generated code adds
# num*$BNSZ to it when sizing the frame. $LOCALS is the offset from the new
# stack pointer at which the tp[] scratch vector starts.
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;

# Locate the perlasm translator: first in the directory this script was
# invoked from, then in ../../perlasm relative to it. If $0 carries no
# directory part the match fails and $dir stays empty, so the lookup falls
# back to paths relative to the current directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# The next command-line argument is the output file, handed on to the
# translator. The original one-liner wrote `"...".shift || die`, where the
# `||` bound to `shift`: it died only when that argument was missing and
# never noticed a failed open. Both conditions are now checked explicitly.
my $output = shift;
$output or die "can't call $xlate: no output file given";

# Everything printed to STDOUT from here on is piped through the translator.
open(STDOUT,"| $^X $xlate $flavour $output") or die "can't call $xlate: $!";

# Symbolic names for the registers used by the assembly template. They are
# interpolated into the template text, so renaming a register here renames
# it everywhere in the generated code.
$sp="r1";
$toc="r2";
# r3..r8 carry the incoming arguments. r3 arrives holding the result
# pointer; the template copies it to r9 on entry, after which r3 is reused
# as the overflow flag ($ovf) and finally as the return value.
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
# scratch values for the current ap[j], np[j] and tp[j] words
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers (r20-r31 are saved and restored by the template)
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";

# ---------------------------------------------------------------------
# Assembly template for .bn_mul_mont_int.
#
# Arguments arrive in r3..r8: result pointer (copied to $rp on entry),
# $ap, $bp, $np, $n0 (a pointer; the word it points at is loaded) and
# $num (number of words).
#
# Return value in r3:
#   0  when $num < 4, and in the 32-bit build also when $num >= 32;
#      nothing is written in that case;
#   1  after the result has been stored through $rp.
#
# Structure of the generated code:
#   prologue  allocates num*BNSZ+FRAME bytes of stack with the new stack
#             pointer rounded down to a 4096-byte boundary, and saves
#             r20-r31 just below the caller's stack pointer;
#   L1st      first pass, ap[]*bp[0] + np[]*m1, written to tp[];
#   Louter /  remaining passes over bp[i], accumulating into tp[] and
#   Linner    carrying the top overflow bit in $ovf;
#   Lsub      rp[] = tp[] - np[] with borrow;
#   Lcopy     picks tp[] or rp[] as source according to the final
#             borrow/overflow, copies it to rp[] and zeroes tp[];
#   epilogue  restores r20-r31 and the stack pointer.
#
# The text below is an interpolating heredoc: Perl variables (including
# those inside ';' assembly comments) are expanded, and backquoted
# expressions are evaluated by the substitution at the end of the script.
# ---------------------------------------------------------------------
$code=<<___;
.machine "any"
.text

.globl	.bn_mul_mont_int
.align	4
.bn_mul_mont_int:
	cmpwi	$num,4
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
	bltlr
___
# Only the 32-bit build refuses long operands.
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble-	Louter

	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
	and	$ap,$tp,$ovf
	andc	$np,$rp,$ovf
	or	$ap,$ap,$np	; ap=borrow?tp:rp

.align	4
Lcopy:				; copy or in-place refresh
	$LDX	$tj,$ap,$j
	$STX	$tj,$rp,$j
	$STX	$j,$tp,$j	; zap at once
	addi	$j,$j,$BNSZ
	bdnz-	Lcopy

	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0

.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `...` span in the template with the value of the Perl
# expression it contains (register-save offsets, shift counts and so on).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to the translator, so close() is where a failed write or
# a non-zero translator exit finally becomes visible. Dying here keeps a
# truncated or missing output file from being mistaken for a good one.
# $! is 0 when the only problem was the child's exit status, hence $? too.
close STDOUT or die "error closing STDOUT: $! (status $?)";
335