#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter.
# It won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit	+65%
# 1024-bit	+35%
# 2048-bit	+18%
# 4096-bit	+4%

# Usage: perl <this script> <flavour> [<output>]
#
# <flavour> must contain "32" or "64"; it selects the word size and the
# load/store/multiply mnemonics below.  Both arguments are handed on
# unchanged to ppc-xlate.pl, which receives the generated text on its
# standard input.

$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$SIZE_T=4;
	$RZONE=	224;

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$SIZE_T=8;
	$RZONE=	288;

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $flavour"; }

# Stack frame layout used by the generated code: $LOCALS bytes at the
# bottom of the frame precede the temporary vector tp[], and $FRAME is
# the fixed part of the allocation that is added to num*$BNSZ.
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;

# Locate ppc-xlate.pl next to this script or in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Optional output argument; an absent argument yields the same command
# line as before (trailing blank after the flavour).
$output = shift;
$output = "" unless defined $output;

# Low-precedence "or" is required here: with "||" the die would bind to
# the (always true) command string and a failed open would go unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Register allocation.  r3..r8 carry the incoming arguments in the order
# rp, ap, bp, np, n0, num.  r3 is reused as $ovf once the result pointer
# has been moved to r9, which is why $rp is assigned twice.
$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";

# Entry: .bn_mul_mont_int
#   r3=rp (result), r4=ap, r5=bp, r6=np, r7=n0 (pointer, word 0 is
#   loaded), r8=num (length in words).
#   Returns r3=0 without touching rp when num<4, and additionally when
#   num>=32 in the 32-bit flavour; returns r3=1 after writing rp.
$code=<<___;
.machine	"any"
.text

.globl	.bn_mul_mont_int
.align	4
.bn_mul_mont_int:
	cmpwi	$num,4
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
	bltlr
___
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
# Prologue: allocate num*$BNSZ+$FRAME bytes below the stack pointer,
# rounded with a -4096 mask, link the new frame to the old one with a
# store-with-update, and save r20-r31 just below the caller's stack
# pointer (kept in $tj).
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

___
# First pass (i=0): tp[] = ap[]*bp[0] + np[]*m1 with m1 = lo(ap[0]*bp[0])*n0.
# $num is lowered by 2 because words 0 and 1 are handled outside the
# counted loop; the carry out of the top word is left in $ovf.
$code.=<<___;
	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

___
# Outer loop over the remaining words of bp (byte offset $i): each pass
# accumulates ap[]*bp[i] + np[]*m1 into tp[], shifted down by one word,
# and carries the previous pass's overflow bit in through XER[CA].
$code.=<<___;
	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble-	Louter

___
# Final reduction: Lsub writes tp[]-np[] to rp[] while propagating the
# borrow; the borrow combined with $ovf becomes an all-ones/all-zeros
# mask that selects tp or rp as the copy source without branching.
# Lcopy then stores the selected words to rp[] and overwrites every
# tp[j] with the loop offset $j so the temporary does not keep the
# intermediate value.
$code.=<<___;
	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
	and	$ap,$tp,$ovf
	andc	$np,$rp,$ovf
	or	$ap,$ap,$np	; ap=borrow?tp:rp

.align	4
Lcopy:				; copy or in-place refresh
	$LDX	$tj,$ap,$j
	$STX	$tj,$rp,$j
	$STX	$j,$tp,$j	; zap at once
	addi	$j,$j,$BNSZ
	bdnz-	Lcopy

___
# Epilogue: reload the caller's stack pointer from the word stored at
# 0(sp) by the prologue, restore r20-r31 relative to it, and return 1.
# NOTE(review): the .long/.byte words after blr look like a traceback
# tag; their exact meaning is not established by this file.
$code.=<<___;
	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0

.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate the `...` arithmetic snippets embedded in the template.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing the pipe is where a failing translator or a flush error is
# reported; for a child that merely exited non-zero $! is 0 and the
# status is in $?.
close STDOUT or die "error closing STDOUT: $! (child status $?)";