1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# April 2006
11
12# "Teaser" Montgomery multiplication module for PowerPC. It's possible
13# to gain a bit more by modulo-scheduling outer loop, then dedicated
14# squaring procedure should give further 20% and code can be adapted
15# for 32-bit application running on 64-bit CPU. As for the latter.
16# It won't be able to achieve "native" 64-bit performance, because in
17# 32-bit application context every addc instruction will have to be
18# expanded as addc, twice right shift by 32 and finally adde, etc.
19# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
20# for 64-bit application running on PPC970/G5 is:
21#
22# 512-bit	+65%
23# 1024-bit	+35%
24# 2048-bit	+18%
25# 4096-bit	+4%
26
27$flavour = shift;
28
29if ($flavour =~ /32/) {
30	$BITS=	32;
31	$BNSZ=	$BITS/8;
32	$SIZE_T=4;
33	$RZONE=	224;
34
35	$LD=	"lwz";		# load
36	$LDU=	"lwzu";		# load and update
37	$LDX=	"lwzx";		# load indexed
38	$ST=	"stw";		# store
39	$STU=	"stwu";		# store and update
40	$STX=	"stwx";		# store indexed
41	$STUX=	"stwux";	# store indexed and update
42	$UMULL=	"mullw";	# unsigned multiply low
43	$UMULH=	"mulhwu";	# unsigned multiply high
44	$UCMP=	"cmplw";	# unsigned compare
45	$SHRI=	"srwi";		# unsigned shift right by immediate
46	$PUSH=	$ST;
47	$POP=	$LD;
48} elsif ($flavour =~ /64/) {
49	$BITS=	64;
50	$BNSZ=	$BITS/8;
51	$SIZE_T=8;
52	$RZONE=	288;
53
54	# same as above, but 64-bit mnemonics...
55	$LD=	"ld";		# load
56	$LDU=	"ldu";		# load and update
57	$LDX=	"ldx";		# load indexed
58	$ST=	"std";		# store
59	$STU=	"stdu";		# store and update
60	$STX=	"stdx";		# store indexed
61	$STUX=	"stdux";	# store indexed and update
62	$UMULL=	"mulld";	# unsigned multiply low
63	$UMULH=	"mulhdu";	# unsigned multiply high
64	$UCMP=	"cmpld";	# unsigned compare
65	$SHRI=	"srdi";		# unsigned shift right by immediate
66	$PUSH=	$ST;
67	$POP=	$LD;
68} else { die "nonsense $flavour"; }
69
70$FRAME=8*$SIZE_T+$RZONE;
71$LOCALS=8*$SIZE_T;
72
73$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
74( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
75( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
76die "can't locate ppc-xlate.pl";
77
78open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
79
80$sp="r1";
81$toc="r2";
82$rp="r3";	$ovf="r3";
83$ap="r4";
84$bp="r5";
85$np="r6";
86$n0="r7";
87$num="r8";
88$rp="r9";	# $rp is reassigned
89$aj="r10";
90$nj="r11";
91$tj="r12";
92# non-volatile registers
93$i="r20";
94$j="r21";
95$tp="r22";
96$m0="r23";
97$m1="r24";
98$lo0="r25";
99$hi0="r26";
100$lo1="r27";
101$hi1="r28";
102$alo="r29";
103$ahi="r30";
104$nlo="r31";
105#
106$nhi="r0";
107
108$code=<<___;
109.machine "any"
110.text
111
112.globl	.bn_mul_mont_int
113.align	4
114.bn_mul_mont_int:
115	cmpwi	$num,4
116	mr	$rp,r3		; $rp is reassigned
117	li	r3,0
118	bltlr
119___
120$code.=<<___ if ($BNSZ==4);
121	cmpwi	$num,32		; longer key performance is not better
122	bgelr
123___
124$code.=<<___;
125	slwi	$num,$num,`log($BNSZ)/log(2)`
126	li	$tj,-4096
127	addi	$ovf,$num,$FRAME
128	subf	$ovf,$ovf,$sp	; $sp-$ovf
129	and	$ovf,$ovf,$tj	; minimize TLB usage
130	subf	$ovf,$sp,$ovf	; $ovf-$sp
131	mr	$tj,$sp
132	srwi	$num,$num,`log($BNSZ)/log(2)`
133	$STUX	$sp,$sp,$ovf
134
135	$PUSH	r20,`-12*$SIZE_T`($tj)
136	$PUSH	r21,`-11*$SIZE_T`($tj)
137	$PUSH	r22,`-10*$SIZE_T`($tj)
138	$PUSH	r23,`-9*$SIZE_T`($tj)
139	$PUSH	r24,`-8*$SIZE_T`($tj)
140	$PUSH	r25,`-7*$SIZE_T`($tj)
141	$PUSH	r26,`-6*$SIZE_T`($tj)
142	$PUSH	r27,`-5*$SIZE_T`($tj)
143	$PUSH	r28,`-4*$SIZE_T`($tj)
144	$PUSH	r29,`-3*$SIZE_T`($tj)
145	$PUSH	r30,`-2*$SIZE_T`($tj)
146	$PUSH	r31,`-1*$SIZE_T`($tj)
147
148	$LD	$n0,0($n0)	; pull n0[0] value
149	addi	$num,$num,-2	; adjust $num for counter register
150
151	$LD	$m0,0($bp)	; m0=bp[0]
152	$LD	$aj,0($ap)	; ap[0]
153	addi	$tp,$sp,$LOCALS
154	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
155	$UMULH	$hi0,$aj,$m0
156
157	$LD	$aj,$BNSZ($ap)	; ap[1]
158	$LD	$nj,0($np)	; np[0]
159
160	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0
161
162	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
163	$UMULH	$ahi,$aj,$m0
164
165	$UMULL	$lo1,$nj,$m1	; np[0]*m1
166	$UMULH	$hi1,$nj,$m1
167	$LD	$nj,$BNSZ($np)	; np[1]
168	addc	$lo1,$lo1,$lo0
169	addze	$hi1,$hi1
170
171	$UMULL	$nlo,$nj,$m1	; np[1]*m1
172	$UMULH	$nhi,$nj,$m1
173
174	mtctr	$num
175	li	$j,`2*$BNSZ`
176.align	4
177L1st:
178	$LDX	$aj,$ap,$j	; ap[j]
179	addc	$lo0,$alo,$hi0
180	$LDX	$nj,$np,$j	; np[j]
181	addze	$hi0,$ahi
182	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
183	addc	$lo1,$nlo,$hi1
184	$UMULH	$ahi,$aj,$m0
185	addze	$hi1,$nhi
186	$UMULL	$nlo,$nj,$m1	; np[j]*m1
187	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
188	$UMULH	$nhi,$nj,$m1
189	addze	$hi1,$hi1
190	$ST	$lo1,0($tp)	; tp[j-1]
191
192	addi	$j,$j,$BNSZ	; j++
193	addi	$tp,$tp,$BNSZ	; tp++
194	bdnz-	L1st
195;L1st
196	addc	$lo0,$alo,$hi0
197	addze	$hi0,$ahi
198
199	addc	$lo1,$nlo,$hi1
200	addze	$hi1,$nhi
201	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
202	addze	$hi1,$hi1
203	$ST	$lo1,0($tp)	; tp[j-1]
204
205	li	$ovf,0
206	addc	$hi1,$hi1,$hi0
207	addze	$ovf,$ovf	; upmost overflow bit
208	$ST	$hi1,$BNSZ($tp)
209
210	li	$i,$BNSZ
211.align	4
212Louter:
213	$LDX	$m0,$bp,$i	; m0=bp[i]
214	$LD	$aj,0($ap)	; ap[0]
215	addi	$tp,$sp,$LOCALS
216	$LD	$tj,$LOCALS($sp); tp[0]
217	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
218	$UMULH	$hi0,$aj,$m0
219	$LD	$aj,$BNSZ($ap)	; ap[1]
220	$LD	$nj,0($np)	; np[0]
221	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
222	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
223	addze	$hi0,$hi0
224	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
225	$UMULH	$ahi,$aj,$m0
226	$UMULL	$lo1,$nj,$m1	; np[0]*m1
227	$UMULH	$hi1,$nj,$m1
228	$LD	$nj,$BNSZ($np)	; np[1]
229	addc	$lo1,$lo1,$lo0
230	$UMULL	$nlo,$nj,$m1	; np[1]*m1
231	addze	$hi1,$hi1
232	$UMULH	$nhi,$nj,$m1
233
234	mtctr	$num
235	li	$j,`2*$BNSZ`
236.align	4
237Linner:
238	$LDX	$aj,$ap,$j	; ap[j]
239	addc	$lo0,$alo,$hi0
240	$LD	$tj,$BNSZ($tp)	; tp[j]
241	addze	$hi0,$ahi
242	$LDX	$nj,$np,$j	; np[j]
243	addc	$lo1,$nlo,$hi1
244	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
245	addze	$hi1,$nhi
246	$UMULH	$ahi,$aj,$m0
247	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
248	$UMULL	$nlo,$nj,$m1	; np[j]*m1
249	addze	$hi0,$hi0
250	$UMULH	$nhi,$nj,$m1
251	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
252	addi	$j,$j,$BNSZ	; j++
253	addze	$hi1,$hi1
254	$ST	$lo1,0($tp)	; tp[j-1]
255	addi	$tp,$tp,$BNSZ	; tp++
256	bdnz-	Linner
257;Linner
258	$LD	$tj,$BNSZ($tp)	; tp[j]
259	addc	$lo0,$alo,$hi0
260	addze	$hi0,$ahi
261	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
262	addze	$hi0,$hi0
263
264	addc	$lo1,$nlo,$hi1
265	addze	$hi1,$nhi
266	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
267	addze	$hi1,$hi1
268	$ST	$lo1,0($tp)	; tp[j-1]
269
270	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
271	li	$ovf,0
272	adde	$hi1,$hi1,$hi0
273	addze	$ovf,$ovf
274	$ST	$hi1,$BNSZ($tp)
275;
276	slwi	$tj,$num,`log($BNSZ)/log(2)`
277	$UCMP	$i,$tj
278	addi	$i,$i,$BNSZ
279	ble-	Louter
280
281	addi	$num,$num,2	; restore $num
282	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
283	addi	$tp,$sp,$LOCALS
284	mtctr	$num
285
286.align	4
287Lsub:	$LDX	$tj,$tp,$j
288	$LDX	$nj,$np,$j
289	subfe	$aj,$nj,$tj	; tp[j]-np[j]
290	$STX	$aj,$rp,$j
291	addi	$j,$j,$BNSZ
292	bdnz-	Lsub
293
294	li	$j,0
295	mtctr	$num
296	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
297	and	$ap,$tp,$ovf
298	andc	$np,$rp,$ovf
299	or	$ap,$ap,$np	; ap=borrow?tp:rp
300
301.align	4
302Lcopy:				; copy or in-place refresh
303	$LDX	$tj,$ap,$j
304	$STX	$tj,$rp,$j
305	$STX	$j,$tp,$j	; zap at once
306	addi	$j,$j,$BNSZ
307	bdnz-	Lcopy
308
309	$POP	$tj,0($sp)
310	li	r3,1
311	$POP	r20,`-12*$SIZE_T`($tj)
312	$POP	r21,`-11*$SIZE_T`($tj)
313	$POP	r22,`-10*$SIZE_T`($tj)
314	$POP	r23,`-9*$SIZE_T`($tj)
315	$POP	r24,`-8*$SIZE_T`($tj)
316	$POP	r25,`-7*$SIZE_T`($tj)
317	$POP	r26,`-6*$SIZE_T`($tj)
318	$POP	r27,`-5*$SIZE_T`($tj)
319	$POP	r28,`-4*$SIZE_T`($tj)
320	$POP	r29,`-3*$SIZE_T`($tj)
321	$POP	r30,`-2*$SIZE_T`($tj)
322	$POP	r31,`-1*$SIZE_T`($tj)
323	mr	$sp,$tj
324	blr
325	.long	0
326	.byte	0,12,4,0,0x80,12,6,0
327	.long	0
328
329.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
330___
331
332$code =~ s/\`([^\`]*)\`/eval $1/gem;
333print $code;
334close STDOUT;
335