ppc-mont.pl revision 221304ee937bc0910948a8be1320cb8cc4eb6d36
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# April 2006
11
12# "Teaser" Montgomery multiplication module for PowerPC. It's possible
13# to gain a bit more by modulo-scheduling outer loop, then dedicated
14# squaring procedure should give further 20% and code can be adapted
15# for 32-bit application running on 64-bit CPU. As for the latter.
16# It won't be able to achieve "native" 64-bit performance, because in
17# 32-bit application context every addc instruction will have to be
18# expanded as addc, twice right shift by 32 and finally adde, etc.
19# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
20# for 64-bit application running on PPC970/G5 is:
21#
22# 512-bit	+65%
23# 1024-bit	+35%
24# 2048-bit	+18%
25# 4096-bit	+4%
26
27$flavour = shift;
28
29if ($flavour =~ /32/) {
30	$BITS=	32;
31	$BNSZ=	$BITS/8;
32	$SIZE_T=4;
33	$RZONE=	224;
34	$FRAME=	$SIZE_T*16;
35
36	$LD=	"lwz";		# load
37	$LDU=	"lwzu";		# load and update
38	$LDX=	"lwzx";		# load indexed
39	$ST=	"stw";		# store
40	$STU=	"stwu";		# store and update
41	$STX=	"stwx";		# store indexed
42	$STUX=	"stwux";	# store indexed and update
43	$UMULL=	"mullw";	# unsigned multiply low
44	$UMULH=	"mulhwu";	# unsigned multiply high
45	$UCMP=	"cmplw";	# unsigned compare
46	$SHRI=	"srwi";		# unsigned shift right by immediate
47	$PUSH=	$ST;
48	$POP=	$LD;
49} elsif ($flavour =~ /64/) {
50	$BITS=	64;
51	$BNSZ=	$BITS/8;
52	$SIZE_T=8;
53	$RZONE=	288;
54	$FRAME=	$SIZE_T*16;
55
56	# same as above, but 64-bit mnemonics...
57	$LD=	"ld";		# load
58	$LDU=	"ldu";		# load and update
59	$LDX=	"ldx";		# load indexed
60	$ST=	"std";		# store
61	$STU=	"stdu";		# store and update
62	$STX=	"stdx";		# store indexed
63	$STUX=	"stdux";	# store indexed and update
64	$UMULL=	"mulld";	# unsigned multiply low
65	$UMULH=	"mulhdu";	# unsigned multiply high
66	$UCMP=	"cmpld";	# unsigned compare
67	$SHRI=	"srdi";		# unsigned shift right by immediate
68	$PUSH=	$ST;
69	$POP=	$LD;
70} else { die "nonsense $flavour"; }
71
72$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
73( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
74( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
75die "can't locate ppc-xlate.pl";
76
77open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
78
79$sp="r1";
80$toc="r2";
81$rp="r3";	$ovf="r3";
82$ap="r4";
83$bp="r5";
84$np="r6";
85$n0="r7";
86$num="r8";
87$rp="r9";	# $rp is reassigned
88$aj="r10";
89$nj="r11";
90$tj="r12";
91# non-volatile registers
92$i="r14";
93$j="r15";
94$tp="r16";
95$m0="r17";
96$m1="r18";
97$lo0="r19";
98$hi0="r20";
99$lo1="r21";
100$hi1="r22";
101$alo="r23";
102$ahi="r24";
103$nlo="r25";
104#
105$nhi="r0";
106
107$code=<<___;
108.machine "any"
109.text
110
111.globl	.bn_mul_mont
112.align	4
113.bn_mul_mont:
114	cmpwi	$num,4
115	mr	$rp,r3		; $rp is reassigned
116	li	r3,0
117	bltlr
118
119	slwi	$num,$num,`log($BNSZ)/log(2)`
120	li	$tj,-4096
121	addi	$ovf,$num,`$FRAME+$RZONE`
122	subf	$ovf,$ovf,$sp	; $sp-$ovf
123	and	$ovf,$ovf,$tj	; minimize TLB usage
124	subf	$ovf,$sp,$ovf	; $ovf-$sp
125	srwi	$num,$num,`log($BNSZ)/log(2)`
126	$STUX	$sp,$sp,$ovf
127
128	$PUSH	r14,`4*$SIZE_T`($sp)
129	$PUSH	r15,`5*$SIZE_T`($sp)
130	$PUSH	r16,`6*$SIZE_T`($sp)
131	$PUSH	r17,`7*$SIZE_T`($sp)
132	$PUSH	r18,`8*$SIZE_T`($sp)
133	$PUSH	r19,`9*$SIZE_T`($sp)
134	$PUSH	r20,`10*$SIZE_T`($sp)
135	$PUSH	r21,`11*$SIZE_T`($sp)
136	$PUSH	r22,`12*$SIZE_T`($sp)
137	$PUSH	r23,`13*$SIZE_T`($sp)
138	$PUSH	r24,`14*$SIZE_T`($sp)
139	$PUSH	r25,`15*$SIZE_T`($sp)
140
141	$LD	$n0,0($n0)	; pull n0[0] value
142	addi	$num,$num,-2	; adjust $num for counter register
143
144	$LD	$m0,0($bp)	; m0=bp[0]
145	$LD	$aj,0($ap)	; ap[0]
146	addi	$tp,$sp,$FRAME
147	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
148	$UMULH	$hi0,$aj,$m0
149
150	$LD	$aj,$BNSZ($ap)	; ap[1]
151	$LD	$nj,0($np)	; np[0]
152
153	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0
154
155	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
156	$UMULH	$ahi,$aj,$m0
157
158	$UMULL	$lo1,$nj,$m1	; np[0]*m1
159	$UMULH	$hi1,$nj,$m1
160	$LD	$nj,$BNSZ($np)	; np[1]
161	addc	$lo1,$lo1,$lo0
162	addze	$hi1,$hi1
163
164	$UMULL	$nlo,$nj,$m1	; np[1]*m1
165	$UMULH	$nhi,$nj,$m1
166
167	mtctr	$num
168	li	$j,`2*$BNSZ`
169.align	4
170L1st:
171	$LDX	$aj,$ap,$j	; ap[j]
172	addc	$lo0,$alo,$hi0
173	$LDX	$nj,$np,$j	; np[j]
174	addze	$hi0,$ahi
175	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
176	addc	$lo1,$nlo,$hi1
177	$UMULH	$ahi,$aj,$m0
178	addze	$hi1,$nhi
179	$UMULL	$nlo,$nj,$m1	; np[j]*m1
180	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
181	$UMULH	$nhi,$nj,$m1
182	addze	$hi1,$hi1
183	$ST	$lo1,0($tp)	; tp[j-1]
184
185	addi	$j,$j,$BNSZ	; j++
186	addi	$tp,$tp,$BNSZ	; tp++
187	bdnz-	L1st
188;L1st
189	addc	$lo0,$alo,$hi0
190	addze	$hi0,$ahi
191
192	addc	$lo1,$nlo,$hi1
193	addze	$hi1,$nhi
194	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
195	addze	$hi1,$hi1
196	$ST	$lo1,0($tp)	; tp[j-1]
197
198	li	$ovf,0
199	addc	$hi1,$hi1,$hi0
200	addze	$ovf,$ovf	; upmost overflow bit
201	$ST	$hi1,$BNSZ($tp)
202
203	li	$i,$BNSZ
204.align	4
205Louter:
206	$LDX	$m0,$bp,$i	; m0=bp[i]
207	$LD	$aj,0($ap)	; ap[0]
208	addi	$tp,$sp,$FRAME
209	$LD	$tj,$FRAME($sp)	; tp[0]
210	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
211	$UMULH	$hi0,$aj,$m0
212	$LD	$aj,$BNSZ($ap)	; ap[1]
213	$LD	$nj,0($np)	; np[0]
214	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
215	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
216	addze	$hi0,$hi0
217	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
218	$UMULH	$ahi,$aj,$m0
219	$UMULL	$lo1,$nj,$m1	; np[0]*m1
220	$UMULH	$hi1,$nj,$m1
221	$LD	$nj,$BNSZ($np)	; np[1]
222	addc	$lo1,$lo1,$lo0
223	$UMULL	$nlo,$nj,$m1	; np[1]*m1
224	addze	$hi1,$hi1
225	$UMULH	$nhi,$nj,$m1
226
227	mtctr	$num
228	li	$j,`2*$BNSZ`
229.align	4
230Linner:
231	$LDX	$aj,$ap,$j	; ap[j]
232	addc	$lo0,$alo,$hi0
233	$LD	$tj,$BNSZ($tp)	; tp[j]
234	addze	$hi0,$ahi
235	$LDX	$nj,$np,$j	; np[j]
236	addc	$lo1,$nlo,$hi1
237	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
238	addze	$hi1,$nhi
239	$UMULH	$ahi,$aj,$m0
240	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
241	$UMULL	$nlo,$nj,$m1	; np[j]*m1
242	addze	$hi0,$hi0
243	$UMULH	$nhi,$nj,$m1
244	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
245	addi	$j,$j,$BNSZ	; j++
246	addze	$hi1,$hi1
247	$ST	$lo1,0($tp)	; tp[j-1]
248	addi	$tp,$tp,$BNSZ	; tp++
249	bdnz-	Linner
250;Linner
251	$LD	$tj,$BNSZ($tp)	; tp[j]
252	addc	$lo0,$alo,$hi0
253	addze	$hi0,$ahi
254	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
255	addze	$hi0,$hi0
256
257	addc	$lo1,$nlo,$hi1
258	addze	$hi1,$nhi
259	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
260	addze	$hi1,$hi1
261	$ST	$lo1,0($tp)	; tp[j-1]
262
263	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
264	li	$ovf,0
265	adde	$hi1,$hi1,$hi0
266	addze	$ovf,$ovf
267	$ST	$hi1,$BNSZ($tp)
268;
269	slwi	$tj,$num,`log($BNSZ)/log(2)`
270	$UCMP	$i,$tj
271	addi	$i,$i,$BNSZ
272	ble-	Louter
273
274	addi	$num,$num,2	; restore $num
275	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
276	addi	$tp,$sp,$FRAME
277	mtctr	$num
278
279.align	4
280Lsub:	$LDX	$tj,$tp,$j
281	$LDX	$nj,$np,$j
282	subfe	$aj,$nj,$tj	; tp[j]-np[j]
283	$STX	$aj,$rp,$j
284	addi	$j,$j,$BNSZ
285	bdnz-	Lsub
286
287	li	$j,0
288	mtctr	$num
289	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
290	and	$ap,$tp,$ovf
291	andc	$np,$rp,$ovf
292	or	$ap,$ap,$np	; ap=borrow?tp:rp
293
294.align	4
295Lcopy:				; copy or in-place refresh
296	$LDX	$tj,$ap,$j
297	$STX	$tj,$rp,$j
298	$STX	$j,$tp,$j	; zap at once
299	addi	$j,$j,$BNSZ
300	bdnz-	Lcopy
301
302	$POP	r14,`4*$SIZE_T`($sp)
303	$POP	r15,`5*$SIZE_T`($sp)
304	$POP	r16,`6*$SIZE_T`($sp)
305	$POP	r17,`7*$SIZE_T`($sp)
306	$POP	r18,`8*$SIZE_T`($sp)
307	$POP	r19,`9*$SIZE_T`($sp)
308	$POP	r20,`10*$SIZE_T`($sp)
309	$POP	r21,`11*$SIZE_T`($sp)
310	$POP	r22,`12*$SIZE_T`($sp)
311	$POP	r23,`13*$SIZE_T`($sp)
312	$POP	r24,`14*$SIZE_T`($sp)
313	$POP	r25,`15*$SIZE_T`($sp)
314	$POP	$sp,0($sp)
315	li	r3,1
316	blr
317	.long	0
318.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
319___
320
321$code =~ s/\`([^\`]*)\`/eval $1/gem;
322print $code;
323close STDOUT;
324