#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling the outer loop; a dedicated
# squaring procedure should then give a further 20%, and the code can
# be adapted for a 32-bit application running on a 64-bit CPU. As for
# the latter, it won't be able to achieve "native" 64-bit performance,
# because in a 32-bit application context every addc instruction has
# to be expanded into addc, two right shifts by 32 and finally adde,
# etc. So far the RSA *sign* performance improvement over the
# pre-bn_mul_mont assembler for a 64-bit application running on
# PPC970/G5 is:
#
#	512-bit		+65%
#	1024-bit	+35%
#	2048-bit	+18%
#	4096-bit	+4%
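# For orientation, the code below implements the usual word-by-word
# Montgomery multiplication, rp[] = ap[]*bp[]*R^-1 mod np[] with
# R = 2^(num*BITS), where n0 is expected to hold -np[0]^-1 mod 2^BITS.
# An informal sketch (not part of the generator; the limb temporaries
# t, u and carries ca, cn are our own notation) of what the assembly
# computes:
#
#	for (i = 0; i < num; i++) {
#		m = (tp[0] + ap[0]*bp[i]) * n0 mod 2^BITS;	# tp[] starts out zero
#		ca = cn = 0;
#		for (j = 0; j < num; j++) {
#			(ca, t) = tp[j] + ap[j]*bp[i] + ca;	# double-wide sums
#			(cn, u) = np[j]*m + t + cn;
#			if (j > 0) tp[j-1] = u;			# u is 0 when j == 0
#		}
#		(ovf, tp[num-1]) = ovf + ca + cn;		# ovf is a single bit
#	}
#	if (ovf || tp[] >= np[]) tp[] -= np[];		# final conditional subtraction
#	rp[] = tp[];
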
$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$SIZE_T=4;
	$RZONE=	224;

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$SIZE_T=8;
	$RZONE=	288;

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $flavour"; }

$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";

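# Rough layout of the generated code (informal roadmap, label names as
# they appear below): the prologue carves out a stack frame holding a
# num-word scratch vector tp[] at offset $LOCALS and saves the
# non-volatile registers r20-r31 just below the caller's stack pointer;
# L1st performs the first outer iteration, fusing ap[]*bp[0] with the
# reduction by m1*np[]; Louter/Linner repeat the fused multiply-and-
# reduce for bp[1]..bp[num-1], carrying the top overflow bit in $ovf;
# Lsub computes tp[]-np[] with borrow into rp[], and Lcopy stores the
# selected result (tp[] or tp[]-np[]) to rp[] while zeroing the
# scratch area.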
$code=<<___;
.machine "any"
.text

.globl	.bn_mul_mont_int
.align	4
.bn_mul_mont_int:
	cmpwi	$num,4
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
	bltlr
___
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble-	Louter

	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
	and	$ap,$tp,$ovf
	andc	$np,$rp,$ovf
	or	$ap,$ap,$np	; ap=borrow?tp:rp

.align	4
Lcopy:				; copy or in-place refresh
	$LDX	$tj,$ap,$j
	$STX	$tj,$rp,$j
	$STX	$j,$tp,$j	; zap at once
	addi	$j,$j,$BNSZ
	bdnz-	Lcopy

	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0

.asciz	"Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;