;
; PA-RISC 64-bit implementation of bn_asm code
;
; This code is approximately 2x faster than the C version
; for RSA/DSA.
;
; See http://devresource.hp.com/ for more details on the PA-RISC
; architecture.  Also see the book "PA-RISC 2.0 Architecture"
; by Gerry Kane for information on the instruction set architecture.
;
; Code written by Chris Ruemmler (with some help from the HP C
; compiler).
;
; The code compiles with HP's assembler
;

	.level	2.0W
	.space	$TEXT$
	.subspa	$CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY

;
; Global Register definitions used for the routines.
;
; Some information about HP's runtime architecture for 64-bits.
;
; "Caller save" means the calling function must save the register
; if it wants the register to be preserved.
; "Callee save" means if a function uses the register, it must save
; the value before using it.
;
; For the floating point registers
;
;    "caller save" registers: fr4-fr11, fr22-fr31
;    "callee save" registers: fr12-fr21
;    "special" registers    : fr0-fr3 (status and exception registers)
;
; For the integer registers
;     value zero             : r0
;     "caller save" registers: r1,r19-r26
;     "callee save" registers: r3-r18
;     return register        : r2  (rp)
;     return values          : r28 (ret0,ret1)
;     Stack pointer          : r30 (sp)
;     global data pointer    : r27 (dp)
;     argument pointer       : r29 (ap)
;     millicode return ptr   : r31 (also a caller save register)
;
; NOTE(review): ret1 is conventionally %r29 rather than %r28 - confirm
; against the runtime architecture document.  Only %ret0 is used as a
; return value in the routines below.
;


;
; Arguments to the routines
;
; Several names alias the same argument register because the routines
; have different signatures: b_ptr/num share %r24 and w/n share %r23.
;
r_ptr		.reg %r26
a_ptr		.reg %r25
b_ptr		.reg %r24
num		.reg %r24
w		.reg %r23
n		.reg %r23


;
; Globals used in some routines
;

top_overflow	.reg %r29
high_mask	.reg %r22	; value 0xffffffff80000000L


;------------------------------------------------------------------------------
;
; bn_mul_add_words
;
;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
;								int num, BN_ULONG w)
;
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = num
; arg3 = w
;
; Local register definitions
;
; The 64x64-bit product is built from four 32x32-bit XMPYU products
; (ht*w_h, ht*w_l, lt*w_h, lt*w_l).  The "_1" names are the second copy
; used by the 2x unrolled loop.
;

fm1		.reg %fr22
fm		.reg %fr23
ht_temp		.reg %fr24
ht_temp_1	.reg %fr25
lt_temp		.reg %fr26
lt_temp_1	.reg %fr27
fm1_1		.reg %fr28
fm_1		.reg %fr29

fw_h		.reg %fr7L
fw_l		.reg %fr7R
fw		.reg %fr7

fht_0		.reg %fr8L
flt_0		.reg %fr8R
t_float_0	.reg %fr8

fht_1		.reg %fr9L
flt_1		.reg %fr9R
t_float_1	.reg %fr9

tmp_0		.reg %r31
tmp_1		.reg %r21
m_0		.reg %r20
m_1		.reg %r19
ht_0		.reg %r1
ht_1		.reg %r3
lt_0		.reg %r4
lt_1		.reg %r5
m1_0		.reg %r6
m1_1		.reg %r7
rp_val		.reg %r8
rp_val_1	.reg %r9

bn_mul_add_words
	.export	bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
	.proc
	.callinfo frame=128
	.entry
	.align 64

	;
	; Frame layout.  Offsets are relative to the entry %sp; once %sp
	; has been bumped by 128 each slot is addressed at (offset-128):
	;    0..48 : saved r3-r9      (restored from -128..-80(%sp))
	;    56    : copy of w        (reloaded from -72(%sp) into fw)
	;    64..120 : scratch used to move XMPYU results from the FP
	;              registers to the integer registers (-64..-8(%sp))
	;
	STD	%r3,0(%sp)		; save r3
	STD	%r4,8(%sp)		; save r4
	NOP				; Needed to make the loop 16-byte aligned
	NOP				; Needed to make the loop 16-byte aligned

	STD	%r5,16(%sp)		; save r5
	STD	%r6,24(%sp)		; save r6
	STD	%r7,32(%sp)		; save r7
	STD	%r8,40(%sp)		; save r8

	STD	%r9,48(%sp)		; save r9
	COPY	%r0,%ret0		; return 0 by default
	DEPDI,Z	1,31,1,top_overflow	; top_overflow = 1 << 32
	STD	w,56(%sp)		; store w on stack

	CMPIB,>=	0,num,bn_mul_add_words_exit	; if (num <= 0) then exit
	LDO	128(%sp),%sp		; bump stack (delay slot, always executed)

	;
	; The loop is unrolled twice, so if there is only 1 number
	; then go straight to the cleanup code.
	;
	CMPIB,=	1,num,bn_mul_add_words_single_top
	FLDD	-72(%sp),fw		; load up w into fp register fw (fw_h/fw_l)

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
	; two 32-bit multiplies can be issued per cycle.
;
; Per word:  (ht:lt) = a[i]*w + rp[i] + carry;  rp[i] = lt;  carry = ht.
; The running carry lives in %ret0 and is the function's return value.
;
bn_mul_add_words_unroll2

	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)
	FLDD	8(a_ptr),t_float_1	; load up 64-bit value (fr9) ht(L)/lt(R)
	LDD	0(r_ptr),rp_val		; rp[0]
	LDD	8(r_ptr),rp_val_1	; rp[1]

	XMPYU	fht_0,fw_l,fm1		; m1[0] = fht_0*fw_l
	XMPYU	fht_1,fw_l,fm1_1	; m1[1] = fht_1*fw_l
	FSTD	fm1,-16(%sp)		; -16(sp) = m1[0]
	FSTD	fm1_1,-48(%sp)		; -48(sp) = m1[1]

	XMPYU	flt_0,fw_h,fm		; m[0] = flt_0*fw_h
	XMPYU	flt_1,fw_h,fm_1		; m[1] = flt_1*fw_h
	FSTD	fm,-8(%sp)		; -8(sp) = m[0]
	FSTD	fm_1,-40(%sp)		; -40(sp) = m[1]

	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = fht_0*fw_h
	XMPYU	fht_1,fw_h,ht_temp_1	; ht_temp_1 = fht_1*fw_h
	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht_temp
	FSTD	ht_temp_1,-56(%sp)	; -56(sp) = ht_temp_1

	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = flt_0*fw_l
	XMPYU	flt_1,fw_l,lt_temp_1	; lt_temp_1 = flt_1*fw_l
	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt_temp
	FSTD	lt_temp_1,-64(%sp)	; -64(sp) = lt_temp_1

	LDD	-8(%sp),m_0		; m[0]
	LDD	-40(%sp),m_1		; m[1]
	LDD	-16(%sp),m1_0		; m1[0]
	LDD	-48(%sp),m1_1		; m1[1]

	LDD	-24(%sp),ht_0		; ht[0]
	LDD	-56(%sp),ht_1		; ht[1]
	ADD,L	m1_0,m_0,tmp_0		; tmp_0 = m[0] + m1[0];
	ADD,L	m1_1,m_1,tmp_1		; tmp_1 = m[1] + m1[1];

	LDD	-32(%sp),lt_0
	LDD	-64(%sp),lt_1
	CMPCLR,*>>=	tmp_0,m1_0, %r0	; if (tmp_0 < m1[0]), i.e. the sum wrapped
	ADD,L	ht_0,top_overflow,ht_0	; ht[0] += (1<<32)

	CMPCLR,*>>=	tmp_1,m1_1,%r0	; if (tmp_1 < m1[1]), i.e. the sum wrapped
	ADD,L	ht_1,top_overflow,ht_1	; ht[1] += (1<<32)
	EXTRD,U	tmp_0,31,32,m_0		; m[0]>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1[0] = m[0]<<32

	EXTRD,U	tmp_1,31,32,m_1		; m[1]>>32
	DEPD,Z	tmp_1,31,32,m1_1	; m1[1] = m[1]<<32
	ADD,L	ht_0,m_0,ht_0		; ht[0]+= (m[0]>>32)
	ADD,L	ht_1,m_1,ht_1		; ht[1]+= (m[1]>>32)

	ADD	lt_0,m1_0,lt_0		; lt[0] = lt[0]+m1[0];
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry
	ADD	lt_1,m1_1,lt_1		; lt[1] = lt[1]+m1[1];
	ADD,DC	ht_1,%r0,ht_1		; ht[1] += carry

	ADD	%ret0,lt_0,lt_0		; lt[0] = lt[0] + c;
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry
	ADD	lt_0,rp_val,lt_0	; lt[0] = lt[0]+rp[0]
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry

	LDO	-2(num),num		; num = num - 2;
	ADD	ht_0,lt_1,lt_1		; lt[1] = lt[1] + ht_0 (c);
	ADD,DC	ht_1,%r0,ht_1		; ht[1] += carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt[0]

	ADD	lt_1,rp_val_1,lt_1	; lt[1] = lt[1]+rp[1]
	ADD,DC	ht_1,%r0,%ret0		; c = ht[1] + carry
	LDO	16(a_ptr),a_ptr		; a_ptr += 2

	STD	lt_1,8(r_ptr)		; rp[1] = lt[1]
	CMPIB,<=	2,num,bn_mul_add_words_unroll2	; go again if more to do
	LDO	16(r_ptr),r_ptr		; r_ptr += 2 (delay slot)

	CMPIB,=,N	0,num,bn_mul_add_words_exit	; are we done, or cleanup last one

	;
	; Top of loop aligned on 64-byte boundary
	;
	; Handles the single remaining word (odd num, or num == 1).
	;
bn_mul_add_words_single_top
	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)
	LDD	0(r_ptr),rp_val		; rp[0]
	LDO	8(a_ptr),a_ptr		; a_ptr++
	XMPYU	fht_0,fw_l,fm1		; m1 = ht*fw_l
	FSTD	fm1,-16(%sp)		; -16(sp) = m1
	XMPYU	flt_0,fw_h,fm		; m = lt*fw_h
	FSTD	fm,-8(%sp)		; -8(sp) = m
	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = ht*fw_h
	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht
	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = lt*fw_l
	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt

	LDD	-8(%sp),m_0
	LDD	-16(%sp),m1_0		; m1 = temp1
	ADD,L	m_0,m1_0,tmp_0		; tmp_0 = m + m1;
	LDD	-24(%sp),ht_0
	LDD	-32(%sp),lt_0

	CMPCLR,*>>=	tmp_0,m1_0,%r0	; if (tmp_0 < m1), i.e. the sum wrapped
	ADD,L	ht_0,top_overflow,ht_0	; ht += (1<<32)

	EXTRD,U	tmp_0,31,32,m_0		; m>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1 = m<<32

	ADD,L	ht_0,m_0,ht_0		; ht+= (m>>32)
	ADD	lt_0,m1_0,tmp_0		; tmp_0 = lt+m1;
	ADD,DC	ht_0,%r0,ht_0		; ht += carry
	ADD	%ret0,tmp_0,lt_0	; lt = lt + c;
	ADD,DC	ht_0,%r0,ht_0		; ht += carry
	ADD	lt_0,rp_val,lt_0	; lt = lt+rp[0]
	ADD,DC	ht_0,%r0,%ret0		; c = ht + carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt

bn_mul_add_words_exit
	.EXIT
	LDD	-80(%sp),%r9		; restore r9
	LDD	-88(%sp),%r8		; restore r8
	LDD	-96(%sp),%r7		; restore r7
; bn_mul_add_words_exit (continued): restore the remaining callee-save
; registers and return; the final restore also pops the 128-byte frame.
	LDD	-104(%sp),%r6		; restore r6
	LDD	-112(%sp),%r5		; restore r5
	LDD	-120(%sp),%r4		; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3		; restore r3 (delay slot; sp -= 128)
	.PROCEND	;in=23,24,25,26,29;out=28;

;----------------------------------------------------------------------------
;
;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
;
; arg0 = rp
; arg1 = ap
; arg2 = num
; arg3 = w
;
; Per word: (ht:lt) = ap[i]*w + carry;  rp[i] = lt;  carry = ht.
; The final carry is returned in %ret0.
;

bn_mul_words
	.proc
	.callinfo frame=128
	.entry
	.EXPORT	bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.align 64

	STD	%r3,0(%sp)		; save r3
	STD	%r4,8(%sp)		; save r4
	STD	%r5,16(%sp)		; save r5
	STD	%r6,24(%sp)		; save r6

	STD	%r7,32(%sp)		; save r7
	COPY	%r0,%ret0		; return 0 by default
	DEPDI,Z	1,31,1,top_overflow	; top_overflow = 1 << 32
	STD	w,56(%sp)		; w on stack

	CMPIB,>=	0,num,bn_mul_words_exit	; if (num <= 0) then exit
	LDO	128(%sp),%sp		; bump stack (delay slot, always executed)

	;
	; See if only 1 word to do, thus just do cleanup
	;
	CMPIB,=	1,num,bn_mul_words_single_top
	FLDD	-72(%sp),fw		; load up w into fp register fw (fw_h/fw_l)

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
	; two 32-bit multiplies can be issued per cycle.
;
bn_mul_words_unroll2

	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)
	FLDD	8(a_ptr),t_float_1	; load up 64-bit value (fr9) ht(L)/lt(R)
	XMPYU	fht_0,fw_l,fm1		; m1[0] = fht_0*fw_l
	XMPYU	fht_1,fw_l,fm1_1	; m1[1] = fht_1*fw_l

	FSTD	fm1,-16(%sp)		; -16(sp) = m1[0]
	FSTD	fm1_1,-48(%sp)		; -48(sp) = m1[1]
	XMPYU	flt_0,fw_h,fm		; m[0] = flt_0*fw_h
	XMPYU	flt_1,fw_h,fm_1		; m[1] = flt_1*fw_h

	FSTD	fm,-8(%sp)		; -8(sp) = m[0]
	FSTD	fm_1,-40(%sp)		; -40(sp) = m[1]
	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = fht_0*fw_h
	XMPYU	fht_1,fw_h,ht_temp_1	; ht_temp_1 = fht_1*fw_h

	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht[0]
	FSTD	ht_temp_1,-56(%sp)	; -56(sp) = ht[1]
	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = flt_0*fw_l
	XMPYU	flt_1,fw_l,lt_temp_1	; lt_temp_1 = flt_1*fw_l

	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt[0]
	FSTD	lt_temp_1,-64(%sp)	; -64(sp) = lt[1]
	LDD	-8(%sp),m_0
	LDD	-40(%sp),m_1

	LDD	-16(%sp),m1_0
	LDD	-48(%sp),m1_1
	LDD	-24(%sp),ht_0
	LDD	-56(%sp),ht_1

	ADD,L	m1_0,m_0,tmp_0		; tmp_0 = m + m1;
	ADD,L	m1_1,m_1,tmp_1		; tmp_1 = m + m1;
	LDD	-32(%sp),lt_0
	LDD	-64(%sp),lt_1

	CMPCLR,*>>=	tmp_0,m1_0, %r0	; if (tmp_0 < m1), i.e. the sum wrapped
	ADD,L	ht_0,top_overflow,ht_0	; ht += (1<<32)
	CMPCLR,*>>=	tmp_1,m1_1,%r0	; if (tmp_1 < m1), i.e. the sum wrapped
	ADD,L	ht_1,top_overflow,ht_1	; ht += (1<<32)

	EXTRD,U	tmp_0,31,32,m_0		; m>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1 = m<<32
	EXTRD,U	tmp_1,31,32,m_1		; m>>32
	DEPD,Z	tmp_1,31,32,m1_1	; m1 = m<<32

	ADD,L	ht_0,m_0,ht_0		; ht+= (m>>32)
	ADD,L	ht_1,m_1,ht_1		; ht+= (m>>32)
	ADD	lt_0,m1_0,lt_0		; lt = lt+m1;
	ADD,DC	ht_0,%r0,ht_0		; ht += carry

	ADD	lt_1,m1_1,lt_1		; lt = lt+m1;
	ADD,DC	ht_1,%r0,ht_1		; ht += carry
	ADD	%ret0,lt_0,lt_0		; lt = lt + c (ret0);
	ADD,DC	ht_0,%r0,ht_0		; ht += carry

	ADD	ht_0,lt_1,lt_1		; lt = lt + c (ht_0)
	ADD,DC	ht_1,%r0,ht_1		; ht += carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt
	STD	lt_1,8(r_ptr)		; rp[1] = lt

	COPY	ht_1,%ret0		; carry = ht
	LDO	-2(num),num		; num = num - 2;
	LDO	16(a_ptr),a_ptr		; ap += 2
	CMPIB,<=	2,num,bn_mul_words_unroll2
	LDO	16(r_ptr),r_ptr		; rp += 2 (delay slot)

	CMPIB,=,N	0,num,bn_mul_words_exit	; are we done?

	;
	; Top of loop aligned on 64-byte boundary
	;
	; Handles the single remaining word (odd num, or num == 1).
	;
bn_mul_words_single_top
	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)

	XMPYU	fht_0,fw_l,fm1		; m1 = ht*fw_l
	FSTD	fm1,-16(%sp)		; -16(sp) = m1
	XMPYU	flt_0,fw_h,fm		; m = lt*fw_h
	FSTD	fm,-8(%sp)		; -8(sp) = m
	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = ht*fw_h
	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht
	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = lt*fw_l
	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt

	LDD	-8(%sp),m_0
	LDD	-16(%sp),m1_0
	ADD,L	m_0,m1_0,tmp_0		; tmp_0 = m + m1;
	LDD	-24(%sp),ht_0
	LDD	-32(%sp),lt_0

	CMPCLR,*>>=	tmp_0,m1_0,%r0	; if (tmp_0 < m1), i.e. the sum wrapped
	ADD,L	ht_0,top_overflow,ht_0	; ht += (1<<32)

	EXTRD,U	tmp_0,31,32,m_0		; m>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1 = m<<32

	ADD,L	ht_0,m_0,ht_0		; ht+= (m>>32)
	ADD	lt_0,m1_0,lt_0		; lt= lt+m1;
	ADD,DC	ht_0,%r0,ht_0		; ht += carry

	ADD	%ret0,lt_0,lt_0		; lt = lt + c;
	ADD,DC	ht_0,%r0,ht_0		; ht += carry

	COPY	ht_0,%ret0		; copy carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt

bn_mul_words_exit
	.EXIT
	LDD	-96(%sp),%r7		; restore r7
	LDD	-104(%sp),%r6		; restore r6
	LDD	-112(%sp),%r5		; restore r5
	LDD	-120(%sp),%r4		; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3		; restore r3 (delay slot; sp -= 128)
	.PROCEND	;in=23,24,25,26,29;out=28;

;----------------------------------------------------------------------------
;
;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
;
; arg0 = rp
; arg1 = ap
; arg2 = num
;
; For each input word a = ht:lt (32-bit halves) two result words are
; written: a^2 = ht^2 * 2^64 + 2*(ht*lt) * 2^32 + lt^2.
; With m = ht*lt, the middle term is added as (m << 33) into the low
; word and (m >> 31) into the high word; high_mask selects the top 33
; bits of m that land in the high word.
;

bn_sqr_words
	.proc
	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64

	STD	%r3,0(%sp)		; save r3
	STD	%r4,8(%sp)		; save r4
	NOP
	STD	%r5,16(%sp)		; save r5

	CMPIB,>=	0,num,bn_sqr_words_exit	; if (num <= 0) then exit
	LDO	128(%sp),%sp		; bump stack (delay slot, always executed)

	;
	; If only 1, then go straight to cleanup
	;
	CMPIB,=	1,num,bn_sqr_words_single_top
	DEPDI,Z	-1,32,33,high_mask	; Create Mask 0xffffffff80000000L

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;

bn_sqr_words_unroll2
	FLDD	0(a_ptr),t_float_0	; a[0]
	FLDD	8(a_ptr),t_float_1	; a[1]
	XMPYU	fht_0,flt_0,fm		; m[0]
	XMPYU	fht_1,flt_1,fm_1	; m[1]

	FSTD	fm,-24(%sp)		; store m[0]
	FSTD	fm_1,-56(%sp)		; store m[1]
	XMPYU	flt_0,flt_0,lt_temp	; lt[0]
	XMPYU	flt_1,flt_1,lt_temp_1	; lt[1]

	FSTD	lt_temp,-16(%sp)	; store lt[0]
	FSTD	lt_temp_1,-48(%sp)	; store lt[1]
	XMPYU	fht_0,fht_0,ht_temp	; ht[0]
	XMPYU	fht_1,fht_1,ht_temp_1	; ht[1]

	FSTD	ht_temp,-8(%sp)		; store ht[0]
	FSTD	ht_temp_1,-40(%sp)	; store ht[1]
	LDD	-24(%sp),m_0
	LDD	-56(%sp),m_1

	AND	m_0,high_mask,tmp_0	; m[0] & Mask
	AND	m_1,high_mask,tmp_1	; m[1] & Mask
	DEPD,Z	m_0,30,31,m_0		; m[0] << 32+1
	DEPD,Z	m_1,30,31,m_1		; m[1] << 32+1

	LDD	-16(%sp),lt_0
	LDD	-48(%sp),lt_1
	EXTRD,U	tmp_0,32,33,tmp_0	; tmp_0 = m[0]&Mask >> 32-1
	EXTRD,U	tmp_1,32,33,tmp_1	; tmp_1 = m[1]&Mask >> 32-1

	LDD	-8(%sp),ht_0
	LDD	-40(%sp),ht_1
	ADD,L	ht_0,tmp_0,ht_0		; ht[0] += tmp_0
	ADD,L	ht_1,tmp_1,ht_1		; ht[1] += tmp_1

	ADD	lt_0,m_0,lt_0		; lt = lt+m
	ADD,DC	ht_0,%r0,ht_0		; ht[0] += carry
	STD	lt_0,0(r_ptr)		; rp[0] = lt[0]
	STD	ht_0,8(r_ptr)		; rp[1] = ht[0]

	ADD	lt_1,m_1,lt_1		; lt = lt+m
	ADD,DC	ht_1,%r0,ht_1		; ht[1] += carry
	STD	lt_1,16(r_ptr)		; rp[2] = lt[1]
	STD	ht_1,24(r_ptr)		; rp[3] = ht[1]

	LDO	-2(num),num		; num = num - 2;
	LDO	16(a_ptr),a_ptr		; ap += 2
	CMPIB,<=	2,num,bn_sqr_words_unroll2
	LDO	32(r_ptr),r_ptr		; rp += 4 (delay slot)

	CMPIB,=,N	0,num,bn_sqr_words_exit	; are we done?
;
; Top of loop aligned on 64-byte boundary
;
; bn_sqr_words: square the single remaining word (odd num, or num == 1).
;
bn_sqr_words_single_top
	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8) ht(L)/lt(R)

	XMPYU	fht_0,flt_0,fm		; m
	FSTD	fm,-24(%sp)		; store m

	XMPYU	flt_0,flt_0,lt_temp	; lt
	FSTD	lt_temp,-16(%sp)	; store lt

	XMPYU	fht_0,fht_0,ht_temp	; ht
	FSTD	ht_temp,-8(%sp)		; store ht

	LDD	-24(%sp),m_0		; load m
	AND	m_0,high_mask,tmp_0	; m & Mask
	DEPD,Z	m_0,30,31,m_0		; m << 32+1
	LDD	-16(%sp),lt_0		; lt

	LDD	-8(%sp),ht_0		; ht
	EXTRD,U	tmp_0,32,33,tmp_0	; tmp_0 = m&Mask >> 32-1
	ADD	m_0,lt_0,lt_0		; lt = lt+m (sets carry)
	ADD,L	ht_0,tmp_0,ht_0		; ht += tmp_0 (ADD,L leaves carry alone)
	ADD,DC	ht_0,%r0,ht_0		; ht += carry from lt+m

	STD	lt_0,0(r_ptr)		; rp[0] = lt
	STD	ht_0,8(r_ptr)		; rp[1] = ht

bn_sqr_words_exit
	.EXIT
	LDD	-112(%sp),%r5		; restore r5
	LDD	-120(%sp),%r4		; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3		; restore r3 (delay slot; sp -= 128)
	.PROCEND	;in=23,24,25,26,29;out=28;


;----------------------------------------------------------------------------
;
;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; arg0 = rp
; arg1 = ap
; arg2 = bp
; arg3 = n
;
; r[i] = a[i] + b[i] + carry for i in [0,n); the final carry (0 or 1)
; is returned in %ret0.  Uses only caller-save registers, no stack frame.
;

t	.reg %r22
b	.reg %r21
l	.reg %r20

bn_add_words
	.proc
	.entry
	.callinfo
	.EXPORT	bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.align 64

	CMPIB,>=	0,n,bn_add_words_exit	; if (n <= 0) then exit
	COPY	%r0,%ret0		; return 0 by default (delay slot)

	;
	; If 2 or more numbers do the loop
	;
	CMPIB,=	1,n,bn_add_words_single_top
	NOP

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
bn_add_words_unroll2
	LDD	0(a_ptr),t
	LDD	0(b_ptr),b
	ADD	t,%ret0,t		; t = t+c;
	ADD,DC	%r0,%r0,%ret0		; set c to carry
	ADD	t,b,l			; l = t + b[0]
	ADD,DC	%ret0,%r0,%ret0		; c+= carry
	STD	l,0(r_ptr)

	LDD	8(a_ptr),t
	LDD	8(b_ptr),b
	ADD	t,%ret0,t		; t = t+c;
	ADD,DC	%r0,%r0,%ret0		; set c to carry
	ADD	t,b,l			; l = t + b[1]
; bn_add_words_unroll2 (continued): finish the second word of the iteration.
	ADD,DC	%ret0,%r0,%ret0		; c+= carry
	STD	l,8(r_ptr)

	LDO	-2(n),n
	LDO	16(a_ptr),a_ptr
	LDO	16(b_ptr),b_ptr

	CMPIB,<=	2,n,bn_add_words_unroll2
	LDO	16(r_ptr),r_ptr		; delay slot

	CMPIB,=,N	0,n,bn_add_words_exit	; are we done?

bn_add_words_single_top
	LDD	0(a_ptr),t
	LDD	0(b_ptr),b

	ADD	t,%ret0,t		; t = t+c;
	ADD,DC	%r0,%r0,%ret0		; set c to carry (could use CMPCLR??)
	ADD	t,b,l			; l = t + b[0]
	ADD,DC	%ret0,%r0,%ret0		; c+= carry
	STD	l,0(r_ptr)

bn_add_words_exit
	.EXIT
	BVE	(%rp)
	NOP
	.PROCEND	;in=23,24,25,26,29;out=28;

;----------------------------------------------------------------------------
;
;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; arg0 = rp
; arg1 = ap
; arg2 = bp
; arg3 = n
;
; r[i] = a[i] - b[i] - borrow for i in [0,n); the final borrow (0 or 1)
; is returned in %ret0.  The borrow update per word is:
;     if (t1 != t2) c = (t1 < t2);      (unsigned compare)
; i.e. when the two words are equal the incoming borrow is kept.
;

t1		.reg %r22
t2		.reg %r21
sub_tmp1	.reg %r20
sub_tmp2	.reg %r19


bn_sub_words
	.proc
	.callinfo
	.EXPORT	bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64

	CMPIB,>=	0,n,bn_sub_words_exit	; if (n <= 0) then exit
	COPY	%r0,%ret0		; return 0 by default (delay slot)

	;
	; If 2 or more numbers do the loop
	;
	CMPIB,=	1,n,bn_sub_words_single_top
	NOP

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
bn_sub_words_unroll2
	LDD	0(a_ptr),t1
	LDD	0(b_ptr),t2
	SUB	t1,t2,sub_tmp1		; t3 = t1-t2;
	SUB	sub_tmp1,%ret0,sub_tmp1	; t3 = t3- c;

	CMPCLR,*>>	t1,t2,sub_tmp2	; clear if t1 > t2
	LDO	1(%r0),sub_tmp2		; else sub_tmp2 = 1

	CMPCLR,*=	t1,t2,%r0	; if (t1 == t2) keep the old borrow
	COPY	sub_tmp2,%ret0		; else c = (t1 < t2)
	STD	sub_tmp1,0(r_ptr)

	LDD	8(a_ptr),t1
	LDD	8(b_ptr),t2
	SUB	t1,t2,sub_tmp1		; t3 = t1-t2;
	SUB	sub_tmp1,%ret0,sub_tmp1	; t3 = t3- c;
	CMPCLR,*>>	t1,t2,sub_tmp2	; clear if t1 > t2
	LDO	1(%r0),sub_tmp2		; else sub_tmp2 = 1

	CMPCLR,*=	t1,t2,%r0	; if (t1 == t2) keep the old borrow
	COPY	sub_tmp2,%ret0		; else c = (t1 < t2)
	STD	sub_tmp1,8(r_ptr)

	LDO	-2(n),n
	LDO	16(a_ptr),a_ptr
	LDO	16(b_ptr),b_ptr

	CMPIB,<=	2,n,bn_sub_words_unroll2
; bn_sub_words_unroll2 (continued): the next instruction sits in the
; delay slot of the loop-back CMPIB.
	LDO	16(r_ptr),r_ptr

	CMPIB,=,N	0,n,bn_sub_words_exit	; are we done?

bn_sub_words_single_top
	LDD	0(a_ptr),t1
	LDD	0(b_ptr),t2
	SUB	t1,t2,sub_tmp1		; t3 = t1-t2;
	SUB	sub_tmp1,%ret0,sub_tmp1	; t3 = t3- c;
	CMPCLR,*>>	t1,t2,sub_tmp2	; clear if t1 > t2
	LDO	1(%r0),sub_tmp2		; else sub_tmp2 = 1

	CMPCLR,*=	t1,t2,%r0	; if (t1 == t2) keep the old borrow
	COPY	sub_tmp2,%ret0		; else c = (t1 < t2)

	STD	sub_tmp1,0(r_ptr)

bn_sub_words_exit
	.EXIT
	BVE	(%rp)
	NOP
	.PROCEND	;in=23,24,25,26,29;out=28;

;------------------------------------------------------------------------------
;
; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
;
; arg0 = h
; arg1 = l
; arg2 = d
;
; This is mainly just modified assembly from the compiler, thus the
; lack of variable names.
;
; Register roles as set up below: %r3 = d, %r4 = h, %r5 = l, %r6 = dh,
; %r8 = dl, %r9 = loop count (2), %r10 = ret, %r29 = q.
; Frame: %rp is stored at -16(%r30) and %r3 at 0(%r30) before %r30 is
; advanced by 352, so they are reloaded from -368 and -352 respectively.
;
;------------------------------------------------------------------------------
bn_div_words
	.proc
	.callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.IMPORT	BN_num_bits_word,CODE,NO_RELOCATION
	.IMPORT	__iob,DATA
	.IMPORT	fprintf,CODE,NO_RELOCATION
	.IMPORT	abort,CODE,NO_RELOCATION
	.IMPORT	$$div2U,MILLICODE
	.entry
	STD	%r2,-16(%r30)		; save rp
	STD,MA	%r3,352(%r30)		; save r3, then sp += 352
	STD	%r4,-344(%r30)
	STD	%r5,-336(%r30)
	STD	%r6,-328(%r30)
	STD	%r7,-320(%r30)
	STD	%r8,-312(%r30)
	STD	%r9,-304(%r30)
	STD	%r10,-296(%r30)

	STD	%r27,-288(%r30)		; save gp

	COPY	%r24,%r3		; save d
	COPY	%r26,%r4		; save h (high 64-bits)
	LDO	-1(%r0),%ret0		; return -1 by default

	CMPB,*=	%r0,%arg2,$D3		; if (d == 0)
	COPY	%r25,%r5		; save l (low 64-bits)

	LDO	-48(%r30),%r29		; create ap
	.CALL	;in=26,29;out=28;
	B,L	BN_num_bits_word,%r2
	COPY	%r3,%r26		; arg0 = d (delay slot)
	LDD	-288(%r30),%r27		; restore gp
	LDI	64,%r21

	CMPB,=	%r21,%ret0,$00000012	;if (i == 64) (forward)
	COPY	%ret0,%r24		; i
	MTSARCM	%r24
	DEPDI,Z	-1,%sar,1,%r29
	CMPB,*<<,N	%r29,%r4,bn_div_err_case	; if (h > 1<<i) (forward)

$00000012
	SUBI	64,%r24,%r31		; i = 64 - i;
	CMPCLR,*<<	%r4,%r3,%r0	; if (h >= d)
	SUB	%r4,%r3,%r4		;   h -= d
	CMPB,=	%r31,%r0,$0000001A	; if (i)
	COPY	%r0,%r10		; ret = 0
	MTSARCM	%r31			; i to shift
	DEPD,Z	%r3,%sar,64,%r3		; d <<= i;
	SUBI	64,%r31,%r19		; 64 - i; redundant
	MTSAR	%r19			; (64 -i) to shift
	SHRPD	%r4,%r5,%sar,%r4	; l>> (64-i)
	MTSARCM	%r31			; i to shift
	DEPD,Z	%r5,%sar,64,%r5		; l <<= i;

$0000001A
	DEPDI,Z	-1,31,32,%r19		; r19 = 0xffffffff00000000 (upper-half mask)
	EXTRD,U	%r3,31,32,%r6		; dh = d >> 32        (upper 32 bits)
	EXTRD,U	%r3,63,32,%r8		; dl = d & 0xffffffff (lower 32 bits)
	LDO	2(%r0),%r9		; count = 2
	STD	%r3,-280(%r30)		; "d" to stack

$0000001C
	DEPDI,Z	-1,63,32,%r29		; q = 0xffffffff by default
	EXTRD,U	%r4,31,32,%r31		; h >> 32
	CMPB,*=,N	%r31,%r6,$D2	; if ((h>>32) != dh)(forward) div
	COPY	%r4,%r26
	EXTRD,U	%r4,31,32,%r25
	COPY	%r6,%r24
	.CALL	;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
	B,L	$$div2U,%r2
	EXTRD,U	%r6,31,32,%r23
	DEPD	%r28,31,32,%r29
$D2
	STD	%r29,-272(%r30)		; q
	AND	%r5,%r19,%r24		; t & 0xffffffff00000000;
	EXTRD,U	%r24,31,32,%r24		; ??? (moves the upper half of l down 32 bits)
; bn_div_words (continued): multiply the halves of q and d that were
; stored at -272/-280(%r30), then correct the quotient estimate.
; NOTE(review): compiler-derived code; the th/tl naming below follows the
; original comments and the C bn_div_words reference - confirm.
	FLDD	-272(%r30),%fr7		; q
	FLDD	-280(%r30),%fr8		; d
	XMPYU	%fr8L,%fr7L,%fr10
	FSTD	%fr10,-256(%r30)
	XMPYU	%fr8L,%fr7R,%fr22
	FSTD	%fr22,-264(%r30)
	XMPYU	%fr8R,%fr7L,%fr11
	XMPYU	%fr8R,%fr7R,%fr23
	FSTD	%fr11,-232(%r30)
	FSTD	%fr23,-240(%r30)
	LDD	-256(%r30),%r28
	DEPD,Z	%r28,31,32,%r2
	LDD	-264(%r30),%r20
	ADD,L	%r20,%r2,%r31
	LDD	-232(%r30),%r22
	DEPD,Z	%r22,31,32,%r22
	LDD	-240(%r30),%r21
	B	$00000024		; enter loop
	ADD,L	%r21,%r22,%r23		; (delay slot)

$0000002A
	LDO	-1(%r29),%r29		; q--
	SUB	%r23,%r8,%r23		; r23 -= dl
$00000024
	SUB	%r4,%r31,%r25
	AND	%r25,%r19,%r26
	CMPB,*<>,N	%r0,%r26,$00000046	; (forward)
	DEPD,Z	%r25,31,32,%r20
	OR	%r20,%r24,%r21
	CMPB,*<<,N	%r21,%r23,$0000002A	;(backward)
	SUB	%r31,%r6,%r31		; r31 -= dh (delay slot, nullified if not taken)
;-------------Break path---------------------

$00000046
	DEPD,Z	%r23,31,32,%r25		;tl
	EXTRD,U	%r23,31,32,%r26		;t
	AND	%r25,%r19,%r24		;tl = (tl<<32)&0xffffffff00000000L
	ADD,L	%r31,%r26,%r31		;th += t;
	CMPCLR,*>>=	%r5,%r24,%r0	;if (l<tl)
	LDO	1(%r31),%r31		; th++;
	CMPB,*<<=,N	%r31,%r4,$00000036	;if (n < th) (forward)
	LDO	-1(%r29),%r29		;q--;
	ADD,L	%r4,%r3,%r4		;h += d;
$00000036
	ADDIB,=,N	-1,%r9,$D1	;if (--count == 0) break (forward)
	SUB	%r5,%r24,%r28		; l -= tl;
	SUB	%r4,%r31,%r24		; h -= th;
	SHRPD	%r24,%r28,32,%r4	; h = ((h<<32)|(l>>32));
	DEPD,Z	%r29,31,32,%r10		; ret = q<<32
	b	$0000001C
	DEPD,Z	%r28,31,32,%r5		; l = l << 32

$D1
	OR	%r10,%r29,%r28		; ret |= q
$D3
	LDD	-368(%r30),%r2		; reload rp
$D0
	LDD	-296(%r30),%r10
	LDD	-304(%r30),%r9
	LDD	-312(%r30),%r8
	LDD	-320(%r30),%r7
	LDD	-328(%r30),%r6
	LDD	-336(%r30),%r5
	LDD	-344(%r30),%r4
	BVE	(%r2)
	.EXIT
	LDD,MB	-352(%r30),%r3		; restore r3 (delay slot; sp -= 352)

;
; Error path, reached from the range check after BN_num_bits_word:
; calls fprintf and then abort.  The format string C$4 is addressed
; relative to bn_div_words; it is not defined in this part of the file.
;
bn_div_err_case
	MFIA	%r6
	ADDIL	L'bn_div_words-bn_div_err_case,%r6,%r1
	LDO	R'bn_div_words-bn_div_err_case(%r1),%r6
	ADDIL	LT'__iob,%r27,%r1
	LDD	RT'__iob(%r1),%r26
	ADDIL	L'C$4-bn_div_words,%r6,%r1
; bn_div_err_case (continued): finish the fprintf arguments, report the
; error, call abort, then fall back to the common epilogue at $D0.
	LDO	R'C$4-bn_div_words(%r1),%r25
	LDO	64(%r26),%r26
	.CALL	;in=24,25,26,29;out=28;
	B,L	fprintf,%r2
	LDO	-48(%r30),%r29
	LDD	-288(%r30),%r27		; restore gp
	.CALL	;in=29;
	B,L	abort,%r2
	LDO	-48(%r30),%r29
	LDD	-288(%r30),%r27		; restore gp
	B	$D0
	LDD	-368(%r30),%r2		; reload rp (delay slot)
	.PROCEND	;in=24,25,26,29;out=28;

;----------------------------------------------------------------------------
;
; Registers to hold 64-bit values to manipulate.  The "L" part
; of the register corresponds to the upper 32-bits, while the "R"
; part corresponds to the lower 32-bits
;
; Note, that when using b6 and b7, the code must save these before
; using them because they are callee save registers
;
;
; Floating point registers to use to save values that
; are manipulated.  These don't collide with ftemp1-4 and
; are all caller save registers
;
a0	.reg %fr22
a0L	.reg %fr22L
a0R	.reg %fr22R

a1	.reg %fr23
a1L	.reg %fr23L
a1R	.reg %fr23R

a2	.reg %fr24
a2L	.reg %fr24L
a2R	.reg %fr24R

a3	.reg %fr25
a3L	.reg %fr25L
a3R	.reg %fr25R

a4	.reg %fr26
a4L	.reg %fr26L
a4R	.reg %fr26R

a5	.reg %fr27
a5L	.reg %fr27L
a5R	.reg %fr27R

a6	.reg %fr28
a6L	.reg %fr28L
a6R	.reg %fr28R

a7	.reg %fr29
a7L	.reg %fr29L
a7R	.reg %fr29R

b0	.reg %fr30
b0L	.reg %fr30L
b0R	.reg %fr30R

b1	.reg %fr31
b1L	.reg %fr31L
b1R	.reg %fr31R

;
; Temporary floating point variables, these are all caller save
; registers
;
ftemp1	.reg %fr4
ftemp2	.reg %fr5
ftemp3	.reg %fr6
ftemp4	.reg %fr7

;
; The B set of registers when used.
;

b2	.reg %fr8
b2L	.reg %fr8L
b2R	.reg %fr8R

b3	.reg %fr9
b3L	.reg %fr9L
b3R	.reg %fr9R

b4	.reg %fr10
b4L	.reg %fr10L
b4R	.reg %fr10R

b5	.reg %fr11
b5L	.reg %fr11L
b5R	.reg %fr11R

b6	.reg %fr12
b6L	.reg %fr12L
b6R	.reg %fr12R

b7	.reg %fr13
b7L	.reg %fr13L
b7R	.reg %fr13R

c1		.reg %r21	; only reg
temp1		.reg %r20	; only reg
temp2		.reg %r19	; only reg
temp3		.reg %r31	; only reg

m1		.reg %r28
c2		.reg %r23
high_one	.reg %r1
ht		.reg %r6
lt		.reg %r5
m		.reg %r4
c3		.reg %r3

;
; SQR_ADD_C  A0L,A0R,C1,C2,C3
;
; (C3:C2:C1) += a^2, where A0L/A0R are the upper/lower 32-bit halves of a.
; Requires high_mask = 0xffffffff80000000 and a bumped stack: the
; 8-byte slots at -24, -16 and -8(%sp) are used as scratch.
; Clobbers ftemp1-ftemp3, m, lt, ht, temp1-temp3 and the carry bit.
;
SQR_ADD_C	.macro	A0L,A0R,C1,C2,C3
	XMPYU	A0L,A0R,ftemp1		; m
	FSTD	ftemp1,-24(%sp)		; store m

	XMPYU	A0R,A0R,ftemp2		; lt
	FSTD	ftemp2,-16(%sp)		; store lt

	XMPYU	A0L,A0L,ftemp3		; ht
	FSTD	ftemp3,-8(%sp)		; store ht

	LDD	-24(%sp),m		; load m
	AND	m,high_mask,temp2	; m & Mask
	DEPD,Z	m,30,31,temp3		; m << 32+1
	LDD	-16(%sp),lt		; lt

	LDD	-8(%sp),ht		; ht
	EXTRD,U	temp2,32,33,temp1	; temp1 = m&Mask >> 32-1
	ADD	temp3,lt,lt		; lt = lt+m
	ADD,L	ht,temp1,ht		; ht += temp1
	ADD,DC	ht,%r0,ht		; ht += carry

	ADD	C1,lt,C1		; c1=c1+lt
	ADD,DC	ht,%r0,ht		; ht += carry

	ADD	C2,ht,C2		; c2=c2+ht
	ADD,DC	C3,%r0,C3		; c3 += carry
.endm

;
; SQR_ADD_C2  A0L,A0R,A1L,A1R,C1,C2,C3
;
; (C3:C2:C1) += 2*a0*a1, with each operand given as upper/lower halves.
; Requires high_one = 1<<32 and a bumped stack: the 8-byte slots at
; -32, -24, -16 and -8(%sp) are used as scratch.
; Clobbers ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the carry bit.
;
SQR_ADD_C2	.macro	A0L,A0R,A1L,A1R,C1,C2,C3
	XMPYU	A0L,A1R,ftemp1		; m1 = bl*ht
	FSTD	ftemp1,-16(%sp)		;
	XMPYU	A0R,A1L,ftemp2		; m = bh*lt
	FSTD	ftemp2,-8(%sp)		;
	XMPYU	A0R,A1R,ftemp3		; lt = bl*lt
	FSTD	ftemp3,-32(%sp)
	XMPYU	A0L,A1L,ftemp4		; ht = bh*ht
	FSTD	ftemp4,-24(%sp)		;

	LDD	-8(%sp),m		; m
	LDD	-16(%sp),m1		; m1
	ADD,L	m,m1,m			; m+m1

	DEPD,Z	m,31,32,temp3		; (m+m1<<32)
	LDD	-24(%sp),ht		; ht

	CMPCLR,*>>=	m,m1,%r0	; if (m < m1), i.e. m+m1 wrapped
	ADD,L	ht,high_one,ht		; ht+=high_one

	EXTRD,U	m,31,32,temp1		; m >> 32
	LDD	-32(%sp),lt		; lt
	ADD,L	ht,temp1,ht		; ht+= m>>32
	ADD	lt,temp3,lt		; lt = lt+m1
	ADD,DC	ht,%r0,ht		; ht += carry

	ADD	ht,ht,ht		; ht=ht+ht;
	ADD,DC	C3,%r0,C3		; add in carry (c3++)

	ADD	lt,lt,lt		; lt=lt+lt;
	ADD,DC	ht,%r0,ht		; add in carry (ht++)

	ADD	C1,lt,C1		; c1=c1+lt
	ADD,DC,*NUV	ht,%r0,ht	; add in carry (ht++)
	LDO	1(C3),C3		; bump c3 if overflow,nullify otherwise

	ADD	C2,ht,C2		; c2 = c2 + ht
	ADD,DC	C3,%r0,C3		; add in carry (c3++)
.endm

;
;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
;	arg0 = r_ptr
;	arg1 = a_ptr
;
; r[0..15] = a[0..7]^2, computed one result column at a time.  The three
; accumulators c1,c2,c3 rotate roles after each stored word.
;

bn_sqr_comba8
	.PROC
	.CALLINFO	FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.ENTRY
	.align 64

	STD	%r3,0(%sp)		; save r3
	STD	%r4,8(%sp)		; save r4
	STD	%r5,16(%sp)		; save r5
	STD	%r6,24(%sp)		; save r6

	;
	; Zero out carries
	;
	COPY	%r0,c1
	COPY	%r0,c2
	COPY	%r0,c3

	LDO	128(%sp),%sp		; bump stack
	DEPDI,Z	-1,32,33,high_mask	; Create Mask 0xffffffff80000000L
	DEPDI,Z	1,31,1,high_one		; Create Value  1 << 32

	;
	; Load up all of the values we are going to use
	;
	FLDD	0(a_ptr),a0
	FLDD	8(a_ptr),a1
	FLDD	16(a_ptr),a2
	FLDD	24(a_ptr),a3
	FLDD	32(a_ptr),a4
	FLDD	40(a_ptr),a5
	FLDD	48(a_ptr),a6
	FLDD	56(a_ptr),a7

	SQR_ADD_C	a0L,a0R,c1,c2,c3
	STD	c1,0(r_ptr)		; r[0] = c1;
	COPY	%r0,c1

	SQR_ADD_C2	a1L,a1R,a0L,a0R,c2,c3,c1
	STD	c2,8(r_ptr)		; r[1] = c2;
	COPY	%r0,c2

	SQR_ADD_C	a1L,a1R,c3,c1,c2
	SQR_ADD_C2	a2L,a2R,a0L,a0R,c3,c1,c2
	STD	c3,16(r_ptr)		; r[2] = c3;
	COPY	%r0,c3

	SQR_ADD_C2	a3L,a3R,a0L,a0R,c1,c2,c3
	SQR_ADD_C2	a2L,a2R,a1L,a1R,c1,c2,c3
	STD	c1,24(r_ptr)		; r[3] = c1;
	COPY	%r0,c1

	SQR_ADD_C	a2L,a2R,c2,c3,c1
	SQR_ADD_C2	a3L,a3R,a1L,a1R,c2,c3,c1
	SQR_ADD_C2	a4L,a4R,a0L,a0R,c2,c3,c1
	STD	c2,32(r_ptr)		; r[4] = c2;
	COPY	%r0,c2

	SQR_ADD_C2	a5L,a5R,a0L,a0R,c3,c1,c2
	SQR_ADD_C2	a4L,a4R,a1L,a1R,c3,c1,c2
	SQR_ADD_C2	a3L,a3R,a2L,a2R,c3,c1,c2
	STD	c3,40(r_ptr)		; r[5] = c3;
	COPY	%r0,c3

	SQR_ADD_C	a3L,a3R,c1,c2,c3
	SQR_ADD_C2	a4L,a4R,a2L,a2R,c1,c2,c3
	SQR_ADD_C2	a5L,a5R,a1L,a1R,c1,c2,c3
	SQR_ADD_C2	a6L,a6R,a0L,a0R,c1,c2,c3
	STD	c1,48(r_ptr)		; r[6] = c1;
	COPY	%r0,c1

	SQR_ADD_C2	a7L,a7R,a0L,a0R,c2,c3,c1
	SQR_ADD_C2	a6L,a6R,a1L,a1R,c2,c3,c1
	SQR_ADD_C2	a5L,a5R,a2L,a2R,c2,c3,c1
	SQR_ADD_C2	a4L,a4R,a3L,a3R,c2,c3,c1
	STD	c2,56(r_ptr)		; r[7] = c2;
	COPY	%r0,c2

	SQR_ADD_C	a4L,a4R,c3,c1,c2
	SQR_ADD_C2	a5L,a5R,a3L,a3R,c3,c1,c2
	SQR_ADD_C2	a6L,a6R,a2L,a2R,c3,c1,c2
	SQR_ADD_C2	a7L,a7R,a1L,a1R,c3,c1,c2
	STD	c3,64(r_ptr)		; r[8] = c3;
	COPY	%r0,c3

	SQR_ADD_C2	a7L,a7R,a2L,a2R,c1,c2,c3
	SQR_ADD_C2	a6L,a6R,a3L,a3R,c1,c2,c3
	SQR_ADD_C2	a5L,a5R,a4L,a4R,c1,c2,c3
	STD	c1,72(r_ptr)		; r[9] = c1;
	COPY	%r0,c1

	SQR_ADD_C	a5L,a5R,c2,c3,c1
	SQR_ADD_C2	a6L,a6R,a4L,a4R,c2,c3,c1
	SQR_ADD_C2	a7L,a7R,a3L,a3R,c2,c3,c1
	STD	c2,80(r_ptr)		; r[10] = c2;
	COPY	%r0,c2

	SQR_ADD_C2	a7L,a7R,a4L,a4R,c3,c1,c2
	SQR_ADD_C2	a6L,a6R,a5L,a5R,c3,c1,c2
	STD	c3,88(r_ptr)		; r[11] = c3;
	COPY	%r0,c3

	SQR_ADD_C	a6L,a6R,c1,c2,c3
	SQR_ADD_C2	a7L,a7R,a5L,a5R,c1,c2,c3
	STD	c1,96(r_ptr)		; r[12] = c1;
	COPY	%r0,c1

	SQR_ADD_C2	a7L,a7R,a6L,a6R,c2,c3,c1
	STD	c2,104(r_ptr)		; r[13] = c2;
	COPY	%r0,c2

	SQR_ADD_C	a7L,a7R,c3,c1,c2
	STD	c3, 112(r_ptr)		; r[14] = c3
	STD	c1, 120(r_ptr)		; r[15] = c1

	.EXIT
	LDD	-104(%sp),%r6		; restore r6
	LDD	-112(%sp),%r5		; restore r5
	LDD	-120(%sp),%r4		; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3		; restore r3 (delay slot; sp -= 128)

	.PROCEND

;-----------------------------------------------------------------------------
;
;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
;
;	arg0 = r_ptr
;	arg1 = a_ptr
;
; r[0..7] = a[0..3]^2, computed one result column at a time with the
; SQR_ADD_C / SQR_ADD_C2 macros.  The three accumulators c1,c2,c3 rotate
; roles after each stored word.
;
; Only a[0..3] are read.  Earlier revisions also issued FLDD for
; 32..56(a_ptr) into a4-a7; those values were never used and the loads
; read 32 bytes past the end of the 4-word input, so they are omitted.
;
; Uses callee-save r3-r6 (saved in the 128-byte frame) plus caller-save
; integer and floating point registers; the macros use -32..-8(%sp) as
; scratch once the stack has been bumped.
;

bn_sqr_comba4
	.proc
	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64
	STD	%r3,0(%sp)		; save r3
	STD	%r4,8(%sp)		; save r4
	STD	%r5,16(%sp)		; save r5
	STD	%r6,24(%sp)		; save r6

	;
	; Zero out carries
	;
	COPY	%r0,c1
	COPY	%r0,c2
	COPY	%r0,c3

	LDO	128(%sp),%sp		; bump stack
	DEPDI,Z	-1,32,33,high_mask	; Create Mask 0xffffffff80000000L
	DEPDI,Z	1,31,1,high_one		; Create Value  1 << 32

	;
	; Load up all of the values we are going to use
	; (exactly the four input words)
	;
	FLDD	0(a_ptr),a0
	FLDD	8(a_ptr),a1
	FLDD	16(a_ptr),a2
	FLDD	24(a_ptr),a3

	SQR_ADD_C	a0L,a0R,c1,c2,c3

	STD	c1,0(r_ptr)		; r[0] = c1;
	COPY	%r0,c1

	SQR_ADD_C2	a1L,a1R,a0L,a0R,c2,c3,c1

	STD	c2,8(r_ptr)		; r[1] = c2;
	COPY	%r0,c2

	SQR_ADD_C	a1L,a1R,c3,c1,c2
	SQR_ADD_C2	a2L,a2R,a0L,a0R,c3,c1,c2

	STD	c3,16(r_ptr)		; r[2] = c3;
	COPY	%r0,c3

	SQR_ADD_C2	a3L,a3R,a0L,a0R,c1,c2,c3
	SQR_ADD_C2	a2L,a2R,a1L,a1R,c1,c2,c3

	STD	c1,24(r_ptr)		; r[3] = c1;
	COPY	%r0,c1

	SQR_ADD_C	a2L,a2R,c2,c3,c1
	SQR_ADD_C2	a3L,a3R,a1L,a1R,c2,c3,c1

	STD	c2,32(r_ptr)		; r[4] = c2;
	COPY	%r0,c2

	SQR_ADD_C2	a3L,a3R,a2L,a2R,c3,c1,c2
	STD	c3,40(r_ptr)		; r[5] = c3;
	COPY	%r0,c3

	SQR_ADD_C	a3L,a3R,c1,c2,c3
	STD	c1,48(r_ptr)		; r[6] = c1;
	STD	c2,56(r_ptr)		; r[7] = c2;

	.EXIT
	LDD	-104(%sp),%r6		; restore r6
	LDD	-112(%sp),%r5		; restore r5
	LDD	-120(%sp),%r4		; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3		; restore r3 (delay slot; sp -= 128)

	.PROCEND


;---------------------------------------------------------------------------

MUL_ADD_C	.macro
A0L,A0R,B0L,B0R,C1,C2,C3 1296 XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht 1297 FSTD ftemp1,-16(%sp) ; 1298 XMPYU A0R,B0L,ftemp2 ; m = bh*lt 1299 FSTD ftemp2,-8(%sp) ; 1300 XMPYU A0R,B0R,ftemp3 ; lt = bl*lt 1301 FSTD ftemp3,-32(%sp) 1302 XMPYU A0L,B0L,ftemp4 ; ht = bh*ht 1303 FSTD ftemp4,-24(%sp) ; 1304 1305 LDD -8(%sp),m ; r21 = m 1306 LDD -16(%sp),m1 ; r19 = m1 1307 ADD,L m,m1,m ; m+m1 1308 1309 DEPD,Z m,31,32,temp3 ; (m+m1<<32) 1310 LDD -24(%sp),ht ; r24 = ht 1311 1312 CMPCLR,*>>= m,m1,%r0 ; if (m < m1) 1313 ADD,L ht,high_one,ht ; ht+=high_one 1314 1315 EXTRD,U m,31,32,temp1 ; m >> 32 1316 LDD -32(%sp),lt ; lt 1317 ADD,L ht,temp1,ht ; ht+= m>>32 1318 ADD lt,temp3,lt ; lt = lt+m1 1319 ADD,DC ht,%r0,ht ; ht++ 1320 1321 ADD C1,lt,C1 ; c1=c1+lt 1322 ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise 1323 1324 ADD C2,ht,C2 ; c2 = c2 + ht 1325 ADD,DC C3,%r0,C3 ; add in carry (c3++) 1326.endm 1327 1328 1329; 1330;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1331; arg0 = r_ptr 1332; arg1 = a_ptr 1333; arg2 = b_ptr 1334; 1335 1336bn_mul_comba8 1337 .proc 1338 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE 1339 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN 1340 .entry 1341 .align 64 1342 1343 STD %r3,0(%sp) ; save r3 1344 STD %r4,8(%sp) ; save r4 1345 STD %r5,16(%sp) ; save r5 1346 STD %r6,24(%sp) ; save r6 1347 FSTD %fr12,32(%sp) ; save r6 1348 FSTD %fr13,40(%sp) ; save r7 1349 1350 ; 1351 ; Zero out carries 1352 ; 1353 COPY %r0,c1 1354 COPY %r0,c2 1355 COPY %r0,c3 1356 1357 LDO 128(%sp),%sp ; bump stack 1358 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 1359 1360 ; 1361 ; Load up all of the values we are going to use 1362 ; 1363 FLDD 0(a_ptr),a0 1364 FLDD 8(a_ptr),a1 1365 FLDD 16(a_ptr),a2 1366 FLDD 24(a_ptr),a3 1367 FLDD 32(a_ptr),a4 1368 FLDD 40(a_ptr),a5 1369 FLDD 48(a_ptr),a6 1370 FLDD 56(a_ptr),a7 1371 1372 FLDD 0(b_ptr),b0 1373 FLDD 8(b_ptr),b1 1374 FLDD 16(b_ptr),b2 1375 FLDD 24(b_ptr),b3 1376 FLDD 32(b_ptr),b4 1377 FLDD 
40(b_ptr),b5 1378 FLDD 48(b_ptr),b6 1379 FLDD 56(b_ptr),b7 1380 1381 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 1382 STD c1,0(r_ptr) 1383 COPY %r0,c1 1384 1385 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 1386 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 1387 STD c2,8(r_ptr) 1388 COPY %r0,c2 1389 1390 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 1391 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 1392 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 1393 STD c3,16(r_ptr) 1394 COPY %r0,c3 1395 1396 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 1397 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 1398 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 1399 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 1400 STD c1,24(r_ptr) 1401 COPY %r0,c1 1402 1403 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 1404 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 1405 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 1406 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 1407 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 1408 STD c2,32(r_ptr) 1409 COPY %r0,c2 1410 1411 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 1412 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 1413 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 1414 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 1415 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 1416 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 1417 STD c3,40(r_ptr) 1418 COPY %r0,c3 1419 1420 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 1421 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 1422 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 1423 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 1424 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 1425 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 1426 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 1427 STD c1,48(r_ptr) 1428 COPY %r0,c1 1429 1430 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 1431 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 1432 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 1433 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 1434 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 1435 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 1436 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 1437 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 1438 STD c2,56(r_ptr) 1439 COPY %r0,c2 1440 1441 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 1442 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 1443 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 1444 MUL_ADD_C 
a4L,a4R,b4L,b4R,c3,c1,c2 1445 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 1446 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 1447 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 1448 STD c3,64(r_ptr) 1449 COPY %r0,c3 1450 1451 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 1452 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 1453 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 1454 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 1455 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 1456 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 1457 STD c1,72(r_ptr) 1458 COPY %r0,c1 1459 1460 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 1461 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 1462 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 1463 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 1464 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 1465 STD c2,80(r_ptr) 1466 COPY %r0,c2 1467 1468 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 1469 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 1470 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 1471 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 1472 STD c3,88(r_ptr) 1473 COPY %r0,c3 1474 1475 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 1476 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 1477 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 1478 STD c1,96(r_ptr) 1479 COPY %r0,c1 1480 1481 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 1482 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 1483 STD c2,104(r_ptr) 1484 COPY %r0,c2 1485 1486 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 1487 STD c3,112(r_ptr) 1488 STD c1,120(r_ptr) 1489 1490 .EXIT 1491 FLDD -88(%sp),%fr13 1492 FLDD -96(%sp),%fr12 1493 LDD -104(%sp),%r6 ; restore r6 1494 LDD -112(%sp),%r5 ; restore r5 1495 LDD -120(%sp),%r4 ; restore r4 1496 BVE (%rp) 1497 LDD,MB -128(%sp),%r3 1498 1499 .PROCEND 1500 1501;----------------------------------------------------------------------------- 1502; 1503;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1504; arg0 = r_ptr 1505; arg1 = a_ptr 1506; arg2 = b_ptr 1507; 1508 1509bn_mul_comba4 1510 .proc 1511 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE 1512 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN 1513 .entry 1514 .align 64 1515 1516 STD %r3,0(%sp) ; save r3 1517 STD %r4,8(%sp) 
; save r4 1518 STD %r5,16(%sp) ; save r5 1519 STD %r6,24(%sp) ; save r6 1520 FSTD %fr12,32(%sp) ; save r6 1521 FSTD %fr13,40(%sp) ; save r7 1522 1523 ; 1524 ; Zero out carries 1525 ; 1526 COPY %r0,c1 1527 COPY %r0,c2 1528 COPY %r0,c3 1529 1530 LDO 128(%sp),%sp ; bump stack 1531 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 1532 1533 ; 1534 ; Load up all of the values we are going to use 1535 ; 1536 FLDD 0(a_ptr),a0 1537 FLDD 8(a_ptr),a1 1538 FLDD 16(a_ptr),a2 1539 FLDD 24(a_ptr),a3 1540 1541 FLDD 0(b_ptr),b0 1542 FLDD 8(b_ptr),b1 1543 FLDD 16(b_ptr),b2 1544 FLDD 24(b_ptr),b3 1545 1546 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 1547 STD c1,0(r_ptr) 1548 COPY %r0,c1 1549 1550 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 1551 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 1552 STD c2,8(r_ptr) 1553 COPY %r0,c2 1554 1555 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 1556 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 1557 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 1558 STD c3,16(r_ptr) 1559 COPY %r0,c3 1560 1561 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 1562 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 1563 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 1564 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 1565 STD c1,24(r_ptr) 1566 COPY %r0,c1 1567 1568 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 1569 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 1570 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 1571 STD c2,32(r_ptr) 1572 COPY %r0,c2 1573 1574 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 1575 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 1576 STD c3,40(r_ptr) 1577 COPY %r0,c3 1578 1579 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 1580 STD c1,48(r_ptr) 1581 STD c2,56(r_ptr) 1582 1583 .EXIT 1584 FLDD -88(%sp),%fr13 1585 FLDD -96(%sp),%fr12 1586 LDD -104(%sp),%r6 ; restore r6 1587 LDD -112(%sp),%r5 ; restore r5 1588 LDD -120(%sp),%r4 ; restore r4 1589 BVE (%rp) 1590 LDD,MB -128(%sp),%r3 1591 1592 .PROCEND 1593 1594 1595 .SPACE $TEXT$ 1596 .SUBSPA $CODE$ 1597 .SPACE $PRIVATE$,SORT=16 1598 .IMPORT $global$,DATA 1599 .SPACE $TEXT$ 1600 .SUBSPA $CODE$ 1601 .SUBSPA $LIT$,ACCESS=0x2c 1602C$4 1603 .ALIGN 8 1604 .STRINGZ "Division would 
overflow (%d)\n" 1605 .END 1606