#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
#   stalls after ldf8, xma and getf.sig outside inner loop and
#   improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
#   once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
#   acute interest, because upcoming Tukwila's individual cores are
#   reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
#                   sign    verify    sign/s verify/s
# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
#
# As it can be seen, RSA sign performance improves by 130-30%,
# hereafter less for longer keys, while verify - by 74-13%.
# DSA performance improves by 115-30%.

# Usage: perl <this script> [output-file [compiler-flags...]]
#
# The first command-line argument, if present and true, names the file
# the generated assembly is written to; without it the assembly goes to
# the inherited STDOUT. On HP-UX every argument is additionally scanned
# for a 64-bit flag (see below).

use strict;
use warnings;

# Instruction used to form pointers from incoming arguments. On HP-UX
# the default is "addp4"; it is replaced by plain "add" as soon as any
# argument contains "+DD64" or "-mlp64". Everywhere else "add" is used.
# The alternation must be a group: the former character class
# [\+DD|\-mlp] matched any single one of "+D|-mlp" in front of "64",
# so unrelated arguments such as "-m64" or a path containing "p64"
# selected "add" by accident.
my $ADDP = "add";
if ($^O eq "hpux") {
    $ADDP = "addp4";
    for my $arg (@ARGV) {
        $ADDP = "add" if $arg =~ /(?:\+DD|-mlp)64/;
    }
}

my $code=<<___;
.explicit
.text

// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
//                  const BN_ULONG *bp,const BN_ULONG *np,
//                  const BN_ULONG *n0p,int num);
.align  64
.global bn_mul_mont#
.proc   bn_mul_mont#
bn_mul_mont:
        .prologue
        .body
{ .mmi; cmp4.le         p6,p7=2,r37;;
(p6)    cmp4.lt.unc     p8,p9=8,r37
        mov             ret0=r0                 };;
{ .bbb;
(p9)    br.cond.dptk.many       bn_mul_mont_8
(p8)    br.cond.dpnt.many       bn_mul_mont_general
(p7)    br.ret.spnt.many        b0              };;
.endp   bn_mul_mont#

prevfs=r2;      prevpr=r3;      prevlc=r10;     prevsp=r11;

rptr=r8;        aptr=r9;        bptr=r14;       nptr=r15;
tptr=r16;       // &tp[0]
tp_1=r17;       // &tp[-1]
num=r18;        len=r19;        lc=r20;
topbit=r21;     // carry bit from tmp[num]

n0=f6;
m0=f7;
bi=f8;

.align  64
.local  bn_mul_mont_general#
.proc   bn_mul_mont_general#
bn_mul_mont_general:
        .prologue
{ .mmi; .save           ar.pfs,prevfs
        alloc           prevfs=ar.pfs,6,2,0,8
        $ADDP           aptr=0,in1
        .save           ar.lc,prevlc
        mov             prevlc=ar.lc            }
{ .mmi; .vframe         prevsp
        mov             prevsp=sp
        $ADDP           bptr=0,in2
        .save           pr,prevpr
        mov             prevpr=pr               };;

        .body
        .rotf           alo[6],nlo[4],ahi[8],nhi[6]
        .rotr           a[3],n[3],t[2]

{ .mmi; ldf8            bi=[bptr],8             // (*bp++)
        ldf8            alo[4]=[aptr],16        // ap[0]
        $ADDP           r30=8,in1               };;
{ .mmi; ldf8            alo[3]=[r30],16         // ap[1]
        ldf8            alo[2]=[aptr],16        // ap[2]
        $ADDP           in4=0,in4               };;
{ .mmi; ldf8            alo[1]=[r30]            // ap[3]
        ldf8            n0=[in4]                // n0
        $ADDP           rptr=0,in0              }
{ .mmi; $ADDP           nptr=0,in3
        mov             r31=16
        zxt4            num=in5                 };;
{ .mmi; ldf8            nlo[2]=[nptr],8         // np[0]
        shladd          len=num,3,r0
        shladd          r31=num,3,r31           };;
{ .mmi; ldf8            nlo[1]=[nptr],8         // np[1]
        add             lc=-5,num
        sub             r31=sp,r31              };;
{ .mfb; and             sp=-16,r31              // alloca
        xmpy.hu         ahi[2]=alo[4],bi        // ap[0]*bp[0]
        nop.b           0                       }
{ .mfb; nop.m           0
        xmpy.lu         alo[4]=alo[4],bi
        brp.loop.imp    .L1st_ctop,.L1st_cend-16
                                                };;
{ .mfi; nop.m           0
        xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
        add             tp_1=8,sp               }
{ .mfi; nop.m           0
        xma.lu          alo[3]=alo[3],bi,ahi[2]
        mov             pr.rot=0x20001f<<16
                        // ------^----- (p40) at first (p23)
                        // ----------^^ p[16:20]=1
                                                };;
{ .mfi; nop.m           0
        xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[0])*n0
        mov             ar.lc=lc                }
{ .mfi; nop.m           0
        fcvt.fxu.s1     nhi[1]=f0
        mov             ar.ec=8                 };;

.align  32
.L1st_ctop:
.pred.rel       "mutex",p40,p42
{ .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
        (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
        (p40)   add             n[2]=n[2],a[2]          }   // (p23)                                        }
{ .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)(p16)
        (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
        (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
{ .mfi; (p21)   getf.sig        a[0]=alo[5]
        (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
        (p42)   cmp.leu         p41,p39=n[2],a[2]       }   // (p23)
{ .mfi; (p23)   st8             [tp_1]=n[2],8
        (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
        (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
{ .mmb; (p21)   getf.sig        n[0]=nlo[3]
        (p16)   nop.m           0
        br.ctop.sptk    .L1st_ctop                      };;
.L1st_cend:

{ .mmi; getf.sig        a[0]=ahi[6]             // (p24)
        getf.sig        n[0]=nhi[4]
        add             num=-1,num              };;     // num--
{ .mmi; .pred.rel       "mutex",p40,p42
(p40)   add             n[0]=n[0],a[0]
(p42)   add             n[0]=n[0],a[0],1
        sub             aptr=aptr,len           };;     // rewind
{ .mmi; .pred.rel       "mutex",p40,p42
(p40)   cmp.ltu         p41,p39=n[0],a[0]
(p42)   cmp.leu         p41,p39=n[0],a[0]
        sub             nptr=nptr,len           };;
{ .mmi; .pred.rel       "mutex",p39,p41
(p39)   add             topbit=r0,r0
(p41)   add             topbit=r0,r0,1
        nop.i           0                       }
{ .mmi; st8             [tp_1]=n[0]
        add             tptr=16,sp
        add             tp_1=8,sp               };;

.Louter:
{ .mmi; ldf8            bi=[bptr],8             // (*bp++)
        ldf8            ahi[3]=[tptr]           // tp[0]
        add             r30=8,aptr              };;
{ .mmi; ldf8            alo[4]=[aptr],16        // ap[0]
        ldf8            alo[3]=[r30],16         // ap[1]
        add             r31=8,nptr              };;
{ .mfb; ldf8            alo[2]=[aptr],16        // ap[2]
        xma.hu          ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
        brp.loop.imp    .Linner_ctop,.Linner_cend-16
                                                }
{ .mfb; ldf8            alo[1]=[r30]            // ap[3]
        xma.lu          alo[4]=alo[4],bi,ahi[3]
        clrrrb.pr                               };;
{ .mfi; ldf8            nlo[2]=[nptr],16        // np[0]
        xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
        nop.i           0                       }
{ .mfi; ldf8            nlo[1]=[r31]            // np[1]
        xma.lu          alo[3]=alo[3],bi,ahi[2]
        mov             pr.rot=0x20101f<<16
                        // ------^----- (p40) at first (p23)
                        // --------^--- (p30) at first (p22)
                        // ----------^^ p[16:20]=1
                                                };;
{ .mfi; st8             [tptr]=r0               // tp[0] is already accounted
        xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[i]+tp[0])*n0
        mov             ar.lc=lc                }
{ .mfi;
        fcvt.fxu.s1     nhi[1]=f0
        mov             ar.ec=8                 };;

// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
// in latter case accounts for two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal one. No
// attempt was made to address this, because original Itanium is
// hardly represented out in the wild...
.align  32
.Linner_ctop:
.pred.rel       "mutex",p40,p42
.pred.rel       "mutex",p30,p32
{ .mfi; (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
        (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
        (p40)   add             n[2]=n[2],a[2]          }   // (p23)
{ .mfi; (p16)   nop.m           0
        (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
        (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
{ .mfi; (p21)   getf.sig        a[0]=alo[5]
        (p16)   nop.f           0
        (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
{ .mfi; (p21)   ld8             t[0]=[tptr],8
        (p16)   nop.f           0
        (p42)   cmp.leu         p41,p39=n[2],a[2]       };; // (p23)
{ .mfi; (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)
        (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
        (p30)   add             a[1]=a[1],t[1]          }   // (p22)
{ .mfi; (p16)   nop.m           0
        (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
        (p32)   add             a[1]=a[1],t[1],1        };; // (p22)
{ .mmi; (p21)   getf.sig        n[0]=nlo[3]
        (p16)   nop.m           0
        (p30)   cmp.ltu         p31,p29=a[1],t[1]       }   // (p22)
{ .mmb; (p23)   st8             [tp_1]=n[2],8
        (p32)   cmp.leu         p31,p29=a[1],t[1]           // (p22)
        br.ctop.sptk    .Linner_ctop                    };;
.Linner_cend:

{ .mmi; getf.sig        a[0]=ahi[6]             // (p24)
        getf.sig        n[0]=nhi[4]
        nop.i           0                       };;

{ .mmi; .pred.rel       "mutex",p31,p33
(p31)   add             a[0]=a[0],topbit
(p33)   add             a[0]=a[0],topbit,1
        mov             topbit=r0               };;
{ .mfi; .pred.rel       "mutex",p31,p33
(p31)   cmp.ltu         p32,p30=a[0],topbit
(p33)   cmp.leu         p32,p30=a[0],topbit
                                                }
{ .mfi; .pred.rel       "mutex",p40,p42
(p40)   add             n[0]=n[0],a[0]
(p42)   add             n[0]=n[0],a[0],1
                                                };;
{ .mmi; .pred.rel       "mutex",p44,p46
(p40)   cmp.ltu         p41,p39=n[0],a[0]
(p42)   cmp.leu         p41,p39=n[0],a[0]
(p32)   add             topbit=r0,r0,1          }

{ .mmi; st8             [tp_1]=n[0],8
        cmp4.ne         p6,p0=1,num
        sub             aptr=aptr,len           };;     // rewind
{ .mmi; sub             nptr=nptr,len
(p41)   add             topbit=r0,r0,1
        add             tptr=16,sp              }
{ .mmb; add             tp_1=8,sp
        add             num=-1,num              // num--
(p6)    br.cond.sptk.many       .Louter         };;

{ .mbb; add             lc=4,lc
        brp.loop.imp    .Lsub_ctop,.Lsub_cend-16
        clrrrb.pr                               };;
{ .mii; nop.m           0
        mov             pr.rot=0x10001<<16
                        // ------^---- (p33) at first (p17)
        mov             ar.lc=lc                }
{ .mii; nop.m           0
        mov             ar.ec=3
        nop.i           0                       };;

.Lsub_ctop:
.pred.rel       "mutex",p33,p35
{ .mfi; (p16)   ld8             t[0]=[tptr],8               // t=*(tp++)
        (p16)   nop.f           0
        (p33)   sub             n[1]=t[1],n[1]          }   // (p17)
{ .mfi; (p16)   ld8             n[0]=[nptr],8               // n=*(np++)
        (p16)   nop.f           0
        (p35)   sub             n[1]=t[1],n[1],1        };; // (p17)
{ .mib; (p18)   st8             [rptr]=n[2],8               // *(rp++)=r
        (p33)   cmp.gtu         p34,p32=n[1],t[1]           // (p17)
        (p18)   nop.b           0                       }
{ .mib; (p18)   nop.m           0
        (p35)   cmp.geu         p34,p32=n[1],t[1]           // (p17)
        br.ctop.sptk    .Lsub_ctop                      };;
.Lsub_cend:

{ .mmb; .pred.rel       "mutex",p34,p36
(p34)   sub     topbit=topbit,r0        // (p19)
(p36)   sub     topbit=topbit,r0,1
        brp.loop.imp    .Lcopy_ctop,.Lcopy_cend-16
                                                }
{ .mmb; sub             rptr=rptr,len           // rewind
        sub             tptr=tptr,len
        clrrrb.pr                               };;
{ .mmi; and             aptr=tptr,topbit
        andcm           bptr=rptr,topbit
        mov             pr.rot=1<<16            };;
{ .mii; or              nptr=aptr,bptr
        mov             ar.lc=lc
        mov             ar.ec=3                 };;

.Lcopy_ctop:
{ .mmb; (p16)   ld8             n[0]=[nptr],8
        (p18)   st8             [tptr]=r0,8
        (p16)   nop.b           0               }
{ .mmb; (p16)   nop.m           0
        (p18)   st8             [rptr]=n[2],8
        br.ctop.sptk    .Lcopy_ctop             };;
.Lcopy_cend:

{ .mmi; mov             ret0=1                  // signal "handled"
        rum             1<<5                    // clear um.mfh
        mov             ar.lc=prevlc            }
{ .mib; .restore        sp
        mov             sp=prevsp
        mov             pr=prevpr,0x1ffff
        br.ret.sptk.many        b0              };;
.endp   bn_mul_mont_general#

a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
t0=r15;

ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;

.align  64
.skip   48              // aligns loop body
.local  bn_mul_mont_8#
.proc   bn_mul_mont_8#
bn_mul_mont_8:
        .prologue
{ .mmi; .save           ar.pfs,prevfs
        alloc           prevfs=ar.pfs,6,2,0,8
        .vframe         prevsp
        mov             prevsp=sp
        .save           ar.lc,prevlc
        mov             prevlc=ar.lc            }
{ .mmi; add             r17=-6*16,sp
        add             sp=-7*16,sp
        .save           pr,prevpr
        mov             prevpr=pr               };;

{ .mmi; .save.gf        0,0x10
        stf.spill       [sp]=f16,-16
        .save.gf        0,0x20
        stf.spill       [r17]=f17,32
        add             r16=-5*16,prevsp};;
{ .mmi; .save.gf        0,0x40
        stf.spill       [r16]=f18,32
        .save.gf        0,0x80
        stf.spill       [r17]=f19,32
        $ADDP           aptr=0,in1              };;
{ .mmi; .save.gf        0,0x100
        stf.spill       [r16]=f20,32
        .save.gf        0,0x200
        stf.spill       [r17]=f21,32
        $ADDP           r29=8,in1               };;
{ .mmi; .save.gf        0,0x400
        stf.spill       [r16]=f22
        .save.gf        0,0x800
        stf.spill       [r17]=f23
        $ADDP           rptr=0,in0              };;

        .body
        .rotf           bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
        .rotr           t[8]

// load input vectors padding them to 8 elements
{ .mmi; ldf8            ai0=[aptr],16           // ap[0]
        ldf8            ai1=[r29],16            // ap[1]
        $ADDP           bptr=0,in2              }
{ .mmi; $ADDP           r30=8,in2
        $ADDP           nptr=0,in3
        $ADDP           r31=8,in3               };;
{ .mmi; ldf8            bj[7]=[bptr],16         // bp[0]
        ldf8            bj[6]=[r30],16          // bp[1]
        cmp4.le         p4,p5=3,in5             }
{ .mmi; ldf8            ni0=[nptr],16           // np[0]
        ldf8            ni1=[r31],16            // np[1]
        cmp4.le         p6,p7=4,in5             };;

{ .mfi; (p4)ldf8        ai2=[aptr],16           // ap[2]
        (p5)fcvt.fxu    ai2=f0
        cmp4.le         p8,p9=5,in5             }
{ .mfi; (p6)ldf8        ai3=[r29],16            // ap[3]
        (p7)fcvt.fxu    ai3=f0
        cmp4.le         p10,p11=6,in5           }
{ .mfi; (p4)ldf8        bj[5]=[bptr],16         // bp[2]
        (p5)fcvt.fxu    bj[5]=f0
        cmp4.le         p12,p13=7,in5           }
{ .mfi; (p6)ldf8        bj[4]=[r30],16          // bp[3]
        (p7)fcvt.fxu    bj[4]=f0
        cmp4.le         p14,p15=8,in5           }
{ .mfi; (p4)ldf8        ni2=[nptr],16           // np[2]
        (p5)fcvt.fxu    ni2=f0
        addp4           r28=-1,in5              }
{ .mfi; (p6)ldf8        ni3=[r31],16            // np[3]
        (p7)fcvt.fxu    ni3=f0
        $ADDP           in4=0,in4               };;

{ .mfi; ldf8            n0=[in4]
        fcvt.fxu        tf[1]=f0
        nop.i           0                       }

{ .mfi; (p8)ldf8        ai4=[aptr],16           // ap[4]
        (p9)fcvt.fxu    ai4=f0
        mov             t[0]=r0                 }
{ .mfi; (p10)ldf8       ai5=[r29],16            // ap[5]
        (p11)fcvt.fxu   ai5=f0
        mov             t[1]=r0                 }
{ .mfi; (p8)ldf8        bj[3]=[bptr],16         // bp[4]
        (p9)fcvt.fxu    bj[3]=f0
        mov             t[2]=r0                 }
{ .mfi; (p10)ldf8       bj[2]=[r30],16          // bp[5]
        (p11)fcvt.fxu   bj[2]=f0
        mov             t[3]=r0                 }
{ .mfi; (p8)ldf8        ni4=[nptr],16           // np[4]
        (p9)fcvt.fxu    ni4=f0
        mov             t[4]=r0                 }
{ .mfi; (p10)ldf8       ni5=[r31],16            // np[5]
        (p11)fcvt.fxu   ni5=f0
        mov             t[5]=r0                 };;

{ .mfi; (p12)ldf8       ai6=[aptr],16           // ap[6]
        (p13)fcvt.fxu   ai6=f0
        mov             t[6]=r0                 }
{ .mfi; (p14)ldf8       ai7=[r29],16            // ap[7]
        (p15)fcvt.fxu   ai7=f0
        mov             t[7]=r0                 }
{ .mfi; (p12)ldf8       bj[1]=[bptr],16         // bp[6]
        (p13)fcvt.fxu   bj[1]=f0
        mov             ar.lc=r28               }
{ .mfi; (p14)ldf8       bj[0]=[r30],16          // bp[7]
        (p15)fcvt.fxu   bj[0]=f0
        mov             ar.ec=1                 }
{ .mfi; (p12)ldf8       ni6=[nptr],16           // np[6]
        (p13)fcvt.fxu   ni6=f0
        mov             pr.rot=1<<16            }
{ .mfb; (p14)ldf8       ni7=[r31],16            // np[7]
        (p15)fcvt.fxu   ni7=f0
        brp.loop.imp    .Louter_8_ctop,.Louter_8_cend-16
                                                };;

// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
// to measure with help of Interval Time Counter indicated that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue is problematic, because I don't have access
// to platform-specific instruction-level profiler. On Itanium it
// should run in 56*n ticks, because of higher xma latency...
.Louter_8_ctop:
        .pred.rel       "mutex",p40,p42
        .pred.rel       "mutex",p48,p50
{ .mfi; (p16)   nop.m           0                       // 0:
        (p16)   xma.hu          ahi[0]=ai0,bj[7],tf[1]  //      ap[0]*b[i]+t[0]
        (p40)   add             a3=a3,n3        }       //      (p17) a3+=n3
{ .mfi; (p42)   add             a3=a3,n3,1
        (p16)   xma.lu          alo[0]=ai0,bj[7],tf[1]
        (p16)   nop.i           0               };;
{ .mii; (p17)   getf.sig        a7=alo[8]               // 1:
        (p48)   add             t[6]=t[6],a3            //      (p17) t[6]+=a3
        (p50)   add             t[6]=t[6],a3,1  };;
{ .mfi; (p17)   getf.sig        a8=ahi[8]               // 2:
        (p17)   xma.hu          nhi[7]=ni6,mj[1],nhi[6] //      np[6]*m0
        (p40)   cmp.ltu         p43,p41=a3,n3   }
{ .mfi; (p42)   cmp.leu         p43,p41=a3,n3
        (p17)   xma.lu          nlo[7]=ni6,mj[1],nhi[6]
        (p16)   nop.i           0               };;
{ .mii; (p17)   getf.sig        n5=nlo[6]               // 3:
        (p48)   cmp.ltu         p51,p49=t[6],a3
        (p50)   cmp.leu         p51,p49=t[6],a3 };;
        .pred.rel       "mutex",p41,p43
        .pred.rel       "mutex",p49,p51
{ .mfi; (p16)   nop.m           0                       // 4:
        (p16)   xma.hu          ahi[1]=ai1,bj[7],ahi[0] //      ap[1]*b[i]
        (p41)   add             a4=a4,n4        }       //      (p17) a4+=n4
{ .mfi; (p43)   add             a4=a4,n4,1
        (p16)   xma.lu          alo[1]=ai1,bj[7],ahi[0]
        (p16)   nop.i           0               };;
{ .mfi; (p49)   add             t[5]=t[5],a4            // 5:   (p17) t[5]+=a4
        (p16)   xmpy.lu         mj[0]=alo[0],n0         //      (ap[0]*b[i]+t[0])*n0
        (p51)   add             t[5]=t[5],a4,1  };;
{ .mfi; (p16)   nop.m           0                       // 6:
        (p17)   xma.hu          nhi[8]=ni7,mj[1],nhi[7] //      np[7]*m0
        (p41)   cmp.ltu         p42,p40=a4,n4   }
{ .mfi; (p43)   cmp.leu         p42,p40=a4,n4
        (p17)   xma.lu          nlo[8]=ni7,mj[1],nhi[7]
        (p16)   nop.i           0               };;
{ .mii; (p17)   getf.sig        n6=nlo[7]               // 7:
        (p49)   cmp.ltu         p50,p48=t[5],a4
        (p51)   cmp.leu         p50,p48=t[5],a4 };;
        .pred.rel       "mutex",p40,p42
        .pred.rel       "mutex",p48,p50
{ .mfi; (p16)   nop.m           0                       // 8:
        (p16)   xma.hu          ahi[2]=ai2,bj[7],ahi[1] //      ap[2]*b[i]
        (p40)   add             a5=a5,n5        }       //      (p17) a5+=n5
{ .mfi; (p42)   add             a5=a5,n5,1
        (p16)   xma.lu          alo[2]=ai2,bj[7],ahi[1]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a1=alo[1]               // 9:
        (p48)   add             t[4]=t[4],a5            //      p(17) t[4]+=a5
        (p50)   add             t[4]=t[4],a5,1  };;
{ .mfi; (p16)   nop.m           0                       // 10:
        (p16)   xma.hu          nhi[0]=ni0,mj[0],alo[0] //      np[0]*m0
        (p40)   cmp.ltu         p43,p41=a5,n5   }
{ .mfi; (p42)   cmp.leu         p43,p41=a5,n5
        (p16)   xma.lu          nlo[0]=ni0,mj[0],alo[0]
        (p16)   nop.i           0               };;
{ .mii; (p17)   getf.sig        n7=nlo[8]               // 11:
        (p48)   cmp.ltu         p51,p49=t[4],a5
        (p50)   cmp.leu         p51,p49=t[4],a5 };;
        .pred.rel       "mutex",p41,p43
        .pred.rel       "mutex",p49,p51
{ .mfi; (p17)   getf.sig        n8=nhi[8]               // 12:
        (p16)   xma.hu          ahi[3]=ai3,bj[7],ahi[2] //      ap[3]*b[i]
        (p41)   add             a6=a6,n6        }       //      (p17) a6+=n6
{ .mfi; (p43)   add             a6=a6,n6,1
        (p16)   xma.lu          alo[3]=ai3,bj[7],ahi[2]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a2=alo[2]               // 13:
        (p49)   add             t[3]=t[3],a6            //      (p17) t[3]+=a6
        (p51)   add             t[3]=t[3],a6,1  };;
{ .mfi; (p16)   nop.m           0                       // 14:
        (p16)   xma.hu          nhi[1]=ni1,mj[0],nhi[0] //      np[1]*m0
        (p41)   cmp.ltu         p42,p40=a6,n6   }
{ .mfi; (p43)   cmp.leu         p42,p40=a6,n6
        (p16)   xma.lu          nlo[1]=ni1,mj[0],nhi[0]
        (p16)   nop.i           0               };;
{ .mii; (p16)   nop.m           0                       // 15:
        (p49)   cmp.ltu         p50,p48=t[3],a6
        (p51)   cmp.leu         p50,p48=t[3],a6 };;
        .pred.rel       "mutex",p40,p42
        .pred.rel       "mutex",p48,p50
{ .mfi; (p16)   nop.m           0                       // 16:
        (p16)   xma.hu          ahi[4]=ai4,bj[7],ahi[3] //      ap[4]*b[i]
        (p40)   add             a7=a7,n7        }       //      (p17) a7+=n7
{ .mfi; (p42)   add             a7=a7,n7,1
        (p16)   xma.lu          alo[4]=ai4,bj[7],ahi[3]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a3=alo[3]               // 17:
        (p48)   add             t[2]=t[2],a7            //      (p17) t[2]+=a7
        (p50)   add             t[2]=t[2],a7,1  };;
{ .mfi; (p16)   nop.m           0                       // 18:
        (p16)   xma.hu          nhi[2]=ni2,mj[0],nhi[1] //      np[2]*m0
        (p40)   cmp.ltu         p43,p41=a7,n7   }
{ .mfi; (p42)   cmp.leu         p43,p41=a7,n7
        (p16)   xma.lu          nlo[2]=ni2,mj[0],nhi[1]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        n1=nlo[1]               // 19:
        (p48)   cmp.ltu         p51,p49=t[2],a7
        (p50)   cmp.leu         p51,p49=t[2],a7 };;
        .pred.rel       "mutex",p41,p43
        .pred.rel       "mutex",p49,p51
{ .mfi; (p16)   nop.m           0                       // 20:
        (p16)   xma.hu          ahi[5]=ai5,bj[7],ahi[4] //      ap[5]*b[i]
        (p41)   add             a8=a8,n8        }       //      (p17) a8+=n8
{ .mfi; (p43)   add             a8=a8,n8,1
        (p16)   xma.lu          alo[5]=ai5,bj[7],ahi[4]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a4=alo[4]               // 21:
        (p49)   add             t[1]=t[1],a8            //      (p17) t[1]+=a8
        (p51)   add             t[1]=t[1],a8,1  };;
{ .mfi; (p16)   nop.m           0                       // 22:
        (p16)   xma.hu          nhi[3]=ni3,mj[0],nhi[2] //      np[3]*m0
        (p41)   cmp.ltu         p42,p40=a8,n8   }
{ .mfi; (p43)   cmp.leu         p42,p40=a8,n8
        (p16)   xma.lu          nlo[3]=ni3,mj[0],nhi[2]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        n2=nlo[2]               // 23:
        (p49)   cmp.ltu         p50,p48=t[1],a8
        (p51)   cmp.leu         p50,p48=t[1],a8 };;
{ .mfi; (p16)   nop.m           0                       // 24:
        (p16)   xma.hu          ahi[6]=ai6,bj[7],ahi[5] //      ap[6]*b[i]
        (p16)   add             a1=a1,n1        }       //      (p16) a1+=n1
{ .mfi; (p16)   nop.m           0
        (p16)   xma.lu          alo[6]=ai6,bj[7],ahi[5]
        (p17)   mov             t[0]=r0         };;
{ .mii; (p16)   getf.sig        a5=alo[5]               // 25:
        (p16)   add             t0=t[7],a1              //      (p16) t[7]+=a1
        (p42)   add             t[0]=t[0],r0,1  };;
{ .mfi; (p16)   setf.sig        tf[0]=t0                // 26:
        (p16)   xma.hu          nhi[4]=ni4,mj[0],nhi[3] //      np[4]*m0
        (p50)   add             t[0]=t[0],r0,1  }
{ .mfi; (p16)   cmp.ltu.unc     p42,p40=a1,n1
        (p16)   xma.lu          nlo[4]=ni4,mj[0],nhi[3]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        n3=nlo[3]               // 27:
        (p16)   cmp.ltu.unc     p50,p48=t0,a1
        (p16)   nop.i           0               };;
        .pred.rel       "mutex",p40,p42
        .pred.rel       "mutex",p48,p50
{ .mfi; (p16)   nop.m           0                       // 28:
        (p16)   xma.hu          ahi[7]=ai7,bj[7],ahi[6] //      ap[7]*b[i]
        (p40)   add             a2=a2,n2        }       //      (p16) a2+=n2
{ .mfi; (p42)   add             a2=a2,n2,1
        (p16)   xma.lu          alo[7]=ai7,bj[7],ahi[6]
        (p16)   nop.i           0               };;
{ .mii; (p16)   getf.sig        a6=alo[6]               // 29:
        (p48)   add             t[6]=t[6],a2            //      (p16) t[6]+=a2
        (p50)   add             t[6]=t[6],a2,1  };;
{ .mfi; (p16)   nop.m           0                       // 30:
        (p16)   xma.hu          nhi[5]=ni5,mj[0],nhi[4] //      np[5]*m0
        (p40)   cmp.ltu         p41,p39=a2,n2   }
{ .mfi; (p42)   cmp.leu         p41,p39=a2,n2
        (p16)   xma.lu          nlo[5]=ni5,mj[0],nhi[4]
        (p16)   nop.i           0               };;
{ .mfi; (p16)   getf.sig        n4=nlo[4]               // 31:
        (p16)   nop.f           0
        (p48)   cmp.ltu         p49,p47=t[6],a2 }
{ .mfb; (p50)   cmp.leu         p49,p47=t[6],a2
        (p16)   nop.f           0
        br.ctop.sptk.many       .Louter_8_ctop  };;
.Louter_8_cend:

// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
        .pred.rel       "mutex",p40,p42
        .pred.rel       "mutex",p48,p50
{ .mmi; (p0)    getf.sig        n1=ni0                  // 0:
        (p40)   add             a3=a3,n3                //      (p17) a3+=n3
        (p42)   add             a3=a3,n3,1      };;
{ .mii; (p17)   getf.sig        a7=alo[8]               // 1:
        (p48)   add             t[6]=t[6],a3            //      (p17) t[6]+=a3
        (p50)   add             t[6]=t[6],a3,1  };;
{ .mfi; (p17)   getf.sig        a8=ahi[8]               // 2:
        (p17)   xma.hu          nhi[7]=ni6,mj[1],nhi[6] //      np[6]*m0
        (p40)   cmp.ltu         p43,p41=a3,n3   }
{ .mfi; (p42)   cmp.leu         p43,p41=a3,n3
        (p17)   xma.lu          nlo[7]=ni6,mj[1],nhi[6]
        (p0)    nop.i           0               };;
{ .mii; (p17)   getf.sig        n5=nlo[6]               // 3:
        (p48)   cmp.ltu         p51,p49=t[6],a3
        (p50)   cmp.leu         p51,p49=t[6],a3 };;
        .pred.rel       "mutex",p41,p43
        .pred.rel       "mutex",p49,p51
{ .mmi; (p0)    getf.sig        n2=ni1                  // 4:
        (p41)   add             a4=a4,n4                //      (p17) a4+=n4
        (p43)   add             a4=a4,n4,1      };;
{ .mfi; (p49)   add             t[5]=t[5],a4            // 5:   (p17) t[5]+=a4
        (p0)    nop.f           0
        (p51)   add             t[5]=t[5],a4,1  };;
{ .mfi; (p0)    getf.sig        n3=ni2                  // 6:
        (p17)   xma.hu          nhi[8]=ni7,mj[1],nhi[7] //      np[7]*m0
        (p41)   cmp.ltu         p42,p40=a4,n4   }
{ .mfi; (p43)   cmp.leu         p42,p40=a4,n4
        (p17)   xma.lu          nlo[8]=ni7,mj[1],nhi[7]
        (p0)    nop.i           0               };;
{ .mii; (p17)   getf.sig        n6=nlo[7]               // 7:
        (p49)   cmp.ltu         p50,p48=t[5],a4
        (p51)   cmp.leu         p50,p48=t[5],a4 };;
        .pred.rel       "mutex",p40,p42
        .pred.rel       "mutex",p48,p50
{ .mii; (p0)    getf.sig        n4=ni3                  // 8:
        (p40)   add             a5=a5,n5                //      (p17) a5+=n5
        (p42)   add             a5=a5,n5,1      };;
{ .mii; (p0)    nop.m           0                       // 9:
        (p48)   add             t[4]=t[4],a5            //      p(17) t[4]+=a5
        (p50)   add             t[4]=t[4],a5,1  };;
{ .mii; (p0)    nop.m           0                       // 10:
        (p40)   cmp.ltu         p43,p41=a5,n5
        (p42)   cmp.leu         p43,p41=a5,n5   };;
{ .mii; (p17)   getf.sig        n7=nlo[8]               // 11:
        (p48)   cmp.ltu         p51,p49=t[4],a5
        (p50)   cmp.leu         p51,p49=t[4],a5 };;
        .pred.rel       "mutex",p41,p43
        .pred.rel       "mutex",p49,p51
{ .mii; (p17)   getf.sig        n8=nhi[8]               // 12:
        (p41)   add             a6=a6,n6                //      (p17) a6+=n6
        (p43)   add             a6=a6,n6,1      };;
{ .mii; (p0)    getf.sig        n5=ni4                  // 13:
        (p49)   add             t[3]=t[3],a6            //      (p17) t[3]+=a6
        (p51)   add             t[3]=t[3],a6,1  };;
{ .mii; (p0)    nop.m           0                       // 14:
        (p41)   cmp.ltu         p42,p40=a6,n6
        (p43)   cmp.leu         p42,p40=a6,n6   };;
{ .mii; (p0)    getf.sig        n6=ni5                  // 15:
        (p49)   cmp.ltu         p50,p48=t[3],a6
        (p51)   cmp.leu         p50,p48=t[3],a6 };;
        .pred.rel       "mutex",p40,p42
        .pred.rel       "mutex",p48,p50
{ .mii; (p0)    nop.m           0                       // 16:
        (p40)   add             a7=a7,n7                //      (p17) a7+=n7
        (p42)   add             a7=a7,n7,1      };;
{ .mii; (p0)    nop.m           0                       // 17:
        (p48)   add             t[2]=t[2],a7            //      (p17) t[2]+=a7
        (p50)   add             t[2]=t[2],a7,1  };;
{ .mii; (p0)    nop.m           0                       // 18:
        (p40)   cmp.ltu         p43,p41=a7,n7
        (p42)   cmp.leu         p43,p41=a7,n7   };;
{ .mii; (p0)    getf.sig        n7=ni6                  // 19:
        (p48)   cmp.ltu         p51,p49=t[2],a7
        (p50)   cmp.leu         p51,p49=t[2],a7 };;
        .pred.rel       "mutex",p41,p43
        .pred.rel       "mutex",p49,p51
{ .mii; (p0)    nop.m           0                       // 20:
        (p41)   add             a8=a8,n8                //      (p17) a8+=n8
        (p43)   add             a8=a8,n8,1      };;
{ .mmi; (p0)    nop.m           0                       // 21:
        (p49)   add             t[1]=t[1],a8            //      (p17) t[1]+=a8
        (p51)   add             t[1]=t[1],a8,1  }
{ .mmi; (p17)   mov             t[0]=r0
        (p41)   cmp.ltu         p42,p40=a8,n8
        (p43)   cmp.leu         p42,p40=a8,n8   };;
{ .mmi; (p0)    getf.sig        n8=ni7                  // 22:
        (p49)   cmp.ltu         p50,p48=t[1],a8
        (p51)   cmp.leu         p50,p48=t[1],a8 }
{ .mmi; (p42)   add             t[0]=t[0],r0,1
        (p0)    add             r16=-7*16,prevsp
        (p0)    add             r17=-6*16,prevsp        };;

// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
//      t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi; (p50)add        t[0]=t[0],r0,1
        add             r18=-5*16,prevsp
        sub             n1=t0,n1        };;
{ .mmi; cmp.gtu         p34,p32=n1,t0;;
        .pred.rel       "mutex",p32,p34
        (p32)sub        n2=t[7],n2
        (p34)sub        n2=t[7],n2,1    };;
{ .mii; (p32)cmp.gtu    p35,p33=n2,t[7]
        (p34)cmp.geu    p35,p33=n2,t[7];;
        .pred.rel       "mutex",p33,p35
        (p33)sub        n3=t[6],n3      }
{ .mmi; (p35)sub        n3=t[6],n3,1;;
        (p33)cmp.gtu    p34,p32=n3,t[6]
        (p35)cmp.geu    p34,p32=n3,t[6] };;
        .pred.rel       "mutex",p32,p34
{ .mii; (p32)sub        n4=t[5],n4
        (p34)sub        n4=t[5],n4,1;;
        (p32)cmp.gtu    p35,p33=n4,t[5] }
{ .mmi; (p34)cmp.geu    p35,p33=n4,t[5];;
        .pred.rel       "mutex",p33,p35
        (p33)sub        n5=t[4],n5
        (p35)sub        n5=t[4],n5,1    };;
{ .mii; (p33)cmp.gtu    p34,p32=n5,t[4]
        (p35)cmp.geu    p34,p32=n5,t[4];;
        .pred.rel       "mutex",p32,p34
        (p32)sub        n6=t[3],n6      }
{ .mmi; (p34)sub        n6=t[3],n6,1;;
        (p32)cmp.gtu    p35,p33=n6,t[3]
        (p34)cmp.geu    p35,p33=n6,t[3] };;
        .pred.rel       "mutex",p33,p35
{ .mii; (p33)sub        n7=t[2],n7
        (p35)sub        n7=t[2],n7,1;;
        (p33)cmp.gtu    p34,p32=n7,t[2] }
{ .mmi; (p35)cmp.geu    p34,p32=n7,t[2];;
        .pred.rel       "mutex",p32,p34
        (p32)sub        n8=t[1],n8
        (p34)sub        n8=t[1],n8,1    };;
{ .mii; (p32)cmp.gtu    p35,p33=n8,t[1]
        (p34)cmp.geu    p35,p33=n8,t[1];;
        .pred.rel       "mutex",p33,p35
        (p33)sub        a8=t[0],r0      }
{ .mmi; (p35)sub        a8=t[0],r0,1;;
        (p33)cmp.gtu    p34,p32=a8,t[0]
        (p35)cmp.geu    p34,p32=a8,t[0] };;

// save the result, either tmp[num] or tmp[num]-np[num]
        .pred.rel       "mutex",p32,p34
{ .mmi; (p32)st8        [rptr]=n1,8
        (p34)st8        [rptr]=t0,8
        add             r19=-4*16,prevsp};;
{ .mmb; (p32)st8        [rptr]=n2,8
        (p34)st8        [rptr]=t[7],8
        (p5)br.cond.dpnt.few    .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n3,8
        (p34)st8        [rptr]=t[6],8
        (p7)br.cond.dpnt.few    .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n4,8
        (p34)st8        [rptr]=t[5],8
        (p9)br.cond.dpnt.few    .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n5,8
        (p34)st8        [rptr]=t[4],8
        (p11)br.cond.dpnt.few   .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n6,8
        (p34)st8        [rptr]=t[3],8
        (p13)br.cond.dpnt.few   .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n7,8
        (p34)st8        [rptr]=t[2],8
        (p15)br.cond.dpnt.few   .Ldone  };;
{ .mmb; (p32)st8        [rptr]=n8,8
        (p34)st8        [rptr]=t[1],8
        nop.b           0               };;
.Ldone:                                         // epilogue
{ .mmi; ldf.fill        f16=[r16],64
        ldf.fill        f17=[r17],64
        nop.i           0               }
{ .mmi; ldf.fill        f18=[r18],64
        ldf.fill        f19=[r19],64
        mov             pr=prevpr,0x1ffff       };;
{ .mmi; ldf.fill        f20=[r16]
        ldf.fill        f21=[r17]
        mov             ar.lc=prevlc    }
{ .mmi; ldf.fill        f22=[r18]
        ldf.fill        f23=[r19]
        mov             ret0=1          }       // signal "handled"
{ .mib; rum             1<<5
        .restore        sp
        mov             sp=prevsp
        br.ret.sptk.many        b0      };;
.endp   bn_mul_mont_8#

.type   copyright#,\@object
copyright:
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___

# The first argument, when present and true, is the output file name.
# The three-argument open keeps mode characters in the name from being
# interpreted, and a failed open now aborts instead of silently writing
# the assembly to the inherited STDOUT.
my $output = shift;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output for writing: $!";
}

print $code;

# Buffered write errors (e.g. disk full) only surface at close time;
# dying here keeps a truncated assembly file from going unnoticed.
close STDOUT or die "error closing STDOUT: $!";