/* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */

//	Target: IA-64 assembly, run through the C preprocessor (the
//	register names below are plain #define aliases).
//
//	Common registers are assigned as follows:
//
//	COMMON
//
//	t0		Const Tbl Ptr	TPtr
//	t1		Round Constant	TRound
//	t4		Block residual	LenResid
//	t5		Residual Data	DTmp
//
//	{in,out}0	Block 0 Cycle	RotateM0
//	{in,out}1	Block Value 12	M12
//	{in,out}2	Block Value 8	M8
//	{in,out}3	Block Value 4	M4
//	{in,out}4	Block Value 0	M0
//	{in,out}5	Block 1 Cycle	RotateM1
//	{in,out}6	Block Value 13	M13
//	{in,out}7	Block Value 9	M9
//	{in,out}8	Block Value 5	M5
//	{in,out}9	Block Value 1	M1
//	{in,out}10	Block 2 Cycle	RotateM2
//	{in,out}11	Block Value 14	M14
//	{in,out}12	Block Value 10	M10
//	{in,out}13	Block Value 6	M6
//	{in,out}14	Block Value 2	M2
//	{in,out}15	Block 3 Cycle	RotateM3
//	{in,out}16	Block Value 15	M15
//	{in,out}17	Block Value 11	M11
//	{in,out}18	Block Value 7	M7
//	{in,out}19	Block Value 3	M3
//	{in,out}20	Scratch		Z
//	{in,out}21	Scratch		Y
//	{in,out}22	Scratch		X
//	{in,out}23	Scratch		W
//	{in,out}24	Digest A	A
//	{in,out}25	Digest B	B
//	{in,out}26	Digest C	C
//	{in,out}27	Digest D	D
//	{in,out}28	Active Data Ptr	DPtr
//	in28		Dummy Value	-
//	out28		Dummy Value	-
//	bt0		Coroutine Link	QUICK_RTN
//
///	These predicates are used for computing the padding block(s) and
///	are shared between the driver and digest co-routines
//
//	pt0		Extra Pad Block	pExtra
//	pt1		Load next word	pLoad
//	pt2		Skip next word	pSkip
//	pt3		Search for Pad	pNoPad
//	pt4		Pad Word 0	pPad0
//	pt5		Pad Word 1	pPad1
//	pt6		Pad Word 2	pPad2
//	pt7		Pad Word 3	pPad3

#define	DTmp		r19
#define	LenResid	r18
#define	QUICK_RTN	b6
#define	TPtr		r14
#define	TRound		r15
#define	pExtra		p6
#define	pLoad		p7
#define	pNoPad		p9
#define	pPad0		p10
#define	pPad1		p11
#define	pPad2		p12
#define	pPad3		p13
#define	pSkip		p8

//	Driver-side (caller) view: the same slots named as "out" registers.

#define	A_		out24
#define	B_		out25
#define	C_		out26
#define	D_		out27
#define	DPtr_		out28
#define	M0_		out4
#define	M1_		out9
#define	M10_		out12
#define	M11_		out17
#define	M12_		out1
#define	M13_		out6
#define	M14_		out11
#define	M15_		out16
#define	M2_		out14
#define	M3_		out19
#define	M4_		out3
#define	M5_		out8
#define	M6_		out13
#define	M7_		out18
#define	M8_		out2
#define	M9_		out7
#define	RotateM0_	out0
#define	RotateM1_	out5
#define	RotateM2_	out10
#define	RotateM3_	out15
#define	W_		out23
#define	X_		out22
#define	Y_		out21
#define	Z_		out20

//	Digest-side (callee) view: the same slots named as "in" registers.

#define	A		in24
#define	B		in25
#define	C		in26
#define	D		in27
#define	DPtr		in28
#define	M0		in4
#define	M1		in9
#define	M10		in12
#define	M11		in17
#define	M12		in1
#define	M13		in6
#define	M14		in11
#define	M15		in16
#define	M2		in14
#define	M3		in19
#define	M4		in3
#define	M5		in8
#define	M6		in13
#define	M7		in18
#define	M8		in2
#define	M9		in7
#define	RotateM0	in0
#define	RotateM1	in5
#define	RotateM2	in10
#define	RotateM3	in15
#define	W		in23
#define	X		in22
#define	Y		in21
#define	Z		in20

/* register stack configuration for md5_block_asm_data_order(): */
#define	MD5_NINP	3
#define	MD5_NLOC	0
#define	MD5_NOUT	29
#define	MD5_NROT	0

/* register stack configuration for helpers: */
#define	_NINPUTS	MD5_NOUT
#define	_NLOCALS	0
#define	_NOUTPUT	0
#define	_NROTATE	24	/* this must be <= _NINPUTS */

/* ADDP is used on incoming pointers; it is addp4 for HP-UX builds
   without _LP64 and a plain add otherwise. */
#if defined(_HPUX_SOURCE) && !defined(_LP64)
#define	ADDP	addp4
#else
#define	ADDP	add
#endif

#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
#define	HOST_IS_BIG_ENDIAN
#endif

//	Macros for getting the left and right portions of little-endian words
//
//	GETLW(dst, src, align): dst = low (8 * align) bits of src, placed at
//				bit position (32 - 8 * align); other bits zero.
//	GETRW(dst, src, align): dst = the (32 - 8 * align) bits of src that
//				start at bit (8 * align), right-justified.

#define	GETLW(dst, src, align)	dep.z dst = src, 32 - 8 * align, 8 * align
#define	GETRW(dst, src, align)	extr.u dst = src, 8 * align, 32 - 8 * align

//	MD5 driver
//
//		Reads an input block, then calls the digest block
//		subroutine and adds the results to the accumulated
//		digest.  It allocates 29 outs (MD5_NOUT) which the
//		subroutine uses as its inputs and rotating
//		registers. Initializes the round constant pointer and
//		takes care of saving/restoring ar.lc
//
///	INPUT
//
//	in0		Context Ptr	CtxPtr0
//	in1		Input Data Ptr	DPtrIn
//	in2		Integral Blocks	BlockCount
//	rp		Return Address	-
//
///	CODE
//
//	v2		Input Align	InAlign
//	t0		Shared w/digest	-
//	t1		Shared w/digest	-
//	t2		Shared w/digest	-
//	t3		Shared w/digest	-
//	t4		Shared w/digest	-
//	t5		Shared w/digest	-
//	t6		PFS Save	PFSSave
//	t7		ar.lc Save	LCSave
//	t8		Saved PR	PRSave
//	t9		2nd CtxPtr	CtxPtr1
//	t10		Table Base	CTable
//	t11		Table[0]	CTable0
//	t13		Accumulator A	AccumA
//	t14		Accumulator B	AccumB
//	t15		Accumulator C	AccumC
//	t16		Accumulator D	AccumD
//	pt0		Shared w/digest	-
//	pt1		Shared w/digest	-
//	pt2		Shared w/digest	-
//	pt3		Shared w/digest	-
//	pt4		Shared w/digest	-
//	pt5		Shared w/digest	-
//	pt6		Shared w/digest	-
//	pt7		Shared w/digest	-
//	pt8		Not Aligned	pOff
//	pt8		Blocks Left	pAgain

#define	AccumA		r27
#define	AccumB		r28
#define	AccumC		r29
#define	AccumD		r30
#define	CTable		r24
#define	CTable0		r25
#define	CtxPtr0		in0
#define	CtxPtr1		r23
#define	DPtrIn		in1
#define	BlockCount	in2
#define	InAlign		r10
#define	LCSave		r21
#define	PFSSave		r20
#define	PRSave		r22
#define	pAgain		p63
#define	pOff		p63

	.text

/* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)

     where:
      c: a pointer to a structure of this type:

	   typedef struct MD5state_st
	     {
	       MD5_LONG A,B,C,D;
	       MD5_LONG Nl,Nh;
	       MD5_LONG data[MD5_LBLOCK];
	       unsigned int num;
	     }
	   MD5_CTX;

      data: a pointer to the input data (may be misaligned)
      num:  the number of 64-byte blocks to hash (i.e., the length
	    of DATA is 64*NUM bytes; each block is sixteen 32-bit words).
*/

//	Entry sequence of md5_block_asm_data_order:
//	  - saves ar.pfs, ar.lc and the predicates (PFSSave, LCSave, PRSave);
//	  - forms the address of the constant table relative to ip;
//	  - loads A,B,C,D from the context into AccumA..AccumD;
//	  - splits the data pointer into a 4-byte-aligned pointer (DPtr_)
//	    and its low two bits (InAlign);
//	  - on big-endian hosts clears psr.be so that ld4 reads the input
//	    little-endian (restored at .md5_exit);
//	  - returns immediately, leaving the context values unchanged, when
//	    the block count is zero;
//	  - dispatches to the aligned loop or to one of the three
//	    misaligned variants.

	.type	md5_block_asm_data_order, @function
	.global	md5_block_asm_data_order
	.align	32
	.proc	md5_block_asm_data_order
md5_block_asm_data_order:
.md5_block:
	.prologue
{	.mmi
	.save	ar.pfs, PFSSave
	alloc	PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
	ADDP	CtxPtr1 = 8, CtxPtr0
	mov	CTable = ip
}
{	.mmi
	ADDP	DPtrIn = 0, DPtrIn
	ADDP	CtxPtr0 = 0, CtxPtr0
	.save	ar.lc, LCSave
	mov	LCSave = ar.lc
}
;;
{	.mmi
	add	CTable = .md5_tbl_data_order#-.md5_block#, CTable
	and	InAlign = 0x3, DPtrIn
}

{	.mmi
	ld4	AccumA = [CtxPtr0], 4
	ld4	AccumC = [CtxPtr1], 4
	.save	pr, PRSave
	mov	PRSave = pr
	.body
}
;;
{	.mmi
	ld4	AccumB = [CtxPtr0]
	ld4	AccumD = [CtxPtr1]
	dep	DPtr_ = 0, DPtrIn, 0, 2
} ;;
#ifdef HOST_IS_BIG_ENDIAN
	rum	psr.be;;	// switch to little-endian
#endif

//	The block loops below decrement BlockCount before testing it, so a
//	count of zero would wrap around instead of terminating.  Leave
//	through the common exit in that case; all state that .md5_exit
//	restores or stores has been set up by this point.  The count is
//	compared as a full 64-bit register, exactly as the loop-exit test
//	does.

{	.mib
	cmp.eq	pAgain, p0 = 0, BlockCount
	nop	0x0
(pAgain) br.cond.spnt.many .md5_exit
} ;;
{	.mmb
	ld4	CTable0 = [CTable], 4
	cmp.ne	pOff, p0 = 0, InAlign
(pOff)	br.cond.spnt.many .md5_unaligned
} ;;

//	The FF load/compute loop rotates values three times, so that
//	loading into M12 here produces the M0 value, M13 -> M1, etc.

.md5_block_loop0:
{	.mmi
	ld4	M12_ = [DPtr_], 4
	mov	TPtr = CTable
	mov	TRound = CTable0
} ;;
{	.mmi
	ld4	M13_ = [DPtr_], 4
	mov	A_ = AccumA
	mov	B_ = AccumB
} ;;
{	.mmi
	ld4	M14_ = [DPtr_], 4
	mov	C_ = AccumC
	mov	D_ = AccumD
} ;;
{	.mmb
	ld4	M15_ = [DPtr_], 4
	add	BlockCount = -1, BlockCount
	br.call.sptk.many QUICK_RTN = md5_digest_block0
} ;;

//	Now, we add the new digest values and do some clean-up
//	before checking if there is another full block to process

{	.mmi
	add	AccumA = AccumA, A_
	add	AccumB = AccumB, B_
	cmp.ne	pAgain, p0 = 0, BlockCount
}
{	.mib
	add	AccumC = AccumC, C_
	add	AccumD = AccumD, D_
(pAgain) br.cond.dptk.many .md5_block_loop0
} ;;

//	Common exit: write the accumulators back to the context and restore
//	psr.be (big-endian hosts), the predicates, ar.lc and ar.pfs.

.md5_exit:
#ifdef HOST_IS_BIG_ENDIAN
	sum	psr.be;;	// switch back to big-endian mode
#endif
{	.mmi
	st4	[CtxPtr0] = AccumB, -4
	st4	[CtxPtr1] = AccumD, -4
	mov	pr = PRSave, 0x1ffff ;;
}
{	.mmi
	st4	[CtxPtr0] = AccumA
	st4	[CtxPtr1] = AccumC
	mov	ar.lc = LCSave
} ;;
{	.mib
	mov	ar.pfs = PFSSave
	br.ret.sptk.few	rp
} ;;

//	MD5UNALIGNED(offset) expands to the block loop for input whose
//	address has (address & 3) == offset.  On entry at
//	.md5_process<offset>, DTmp holds the first aligned word; GETRW keeps
//	only its upper part.  Each iteration loads the next aligned word
//	into Y_, builds the first message word as GETLW(Y_) | DTmp, carries
//	GETRW(Y_) forward in DTmp, and calls md5_digest_block<offset>.
//	The accumulate / loop / exit tail mirrors the aligned loop.
//	(No // comments inside the macro: the line continuations would
//	extend them over the following lines.)

#define MD5UNALIGNED(offset) \
.md5_process##offset: \
{	.mib ; \
	nop	0x0 ; \
	GETRW(DTmp, DTmp, offset) ; \
} ;; \
.md5_block_loop##offset: \
{	.mmi ; \
	ld4	Y_ = [DPtr_], 4 ; \
	mov	TPtr = CTable ; \
	mov	TRound = CTable0 ; \
} ;; \
{	.mmi ; \
	ld4	M13_ = [DPtr_], 4 ; \
	mov	A_ = AccumA ; \
	mov	B_ = AccumB ; \
} ;; \
{	.mii ; \
	ld4	M14_ = [DPtr_], 4 ; \
	GETLW(W_, Y_, offset) ; \
	mov	C_ = AccumC ; \
} \
{	.mmi ; \
	mov	D_ = AccumD ;; \
	or	M12_ = W_, DTmp ; \
	GETRW(DTmp, Y_, offset) ; \
} \
{	.mib ; \
	ld4	M15_ = [DPtr_], 4 ; \
	add	BlockCount = -1, BlockCount ; \
	br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \
} ;; \
{	.mmi ; \
	add	AccumA = AccumA, A_ ; \
	add	AccumB = AccumB, B_ ; \
	cmp.ne	pAgain, p0 = 0, BlockCount ; \
} \
{	.mib ; \
	add	AccumC = AccumC, C_ ; \
	add	AccumD = AccumD, D_ ; \
(pAgain) br.cond.dptk.many .md5_block_loop##offset ; \
} ;; \
{	.mib ; \
	nop	0x0 ; \
	nop	0x0 ; \
	br.cond.sptk.many .md5_exit ; \
} ;;

	.align	32
.md5_unaligned:
//
//	Because variable shifts are expensive, we special case each of
//	the four alignments. In practice, this will not hurt too much
//	since only one working set of code will be loaded.
//
//	InAlign is 1, 2 or 3 here; 1 and 2 branch away, 3 falls through
//	into the first expansion below.
//
{	.mib
	ld4	DTmp = [DPtr_], 4
	cmp.eq	pOff, p0 = 1, InAlign
(pOff)	br.cond.dpnt.many .md5_process1
} ;;
{	.mib
	cmp.eq	pOff, p0 = 2, InAlign
	nop	0x0
(pOff)	br.cond.dpnt.many .md5_process2
} ;;
	MD5UNALIGNED(3)
	MD5UNALIGNED(1)
	MD5UNALIGNED(2)

	.endp md5_block_asm_data_order


// MD5 Perform the F function and load
//
// Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
// computes the FF() round of functions, then branches to the common
// digest code to finish up with GG(), HH(), and II().
//
// INPUT
//
// rp			Return Address	-
//
// CODE
//
// v0			PFS bit bucket	PFS
// v1			Loop Trip Count	LTrip
// pt0			Load next word	pMore

/* For F round: */
#define LTrip	r9
#define PFS	r8
#define pMore	p6

/* For GHI rounds: */
#define T	r9
#define U	r10
#define V	r11

// None of the multi-line macros below may contain // comments: the
// trailing backslashes would splice the following lines into them.

// COMPUTE(a, b, s, M, R): second half of a G/H/I step.  Z holds the sum
// built by G()/H()/I().  Fetches the next round constant into TRound,
// rotates the low 32 bits of Z left by s (dep.z copies them into the
// upper half of Y, shrp then funnel-shifts Z:Y right by 64 - s), sets
// a = Z + b and copies M into R.

#define COMPUTE(a, b, s, M, R) \
{ \
	.mii ; \
	ld4 TRound = [TPtr], 4 ; \
	dep.z Y = Z, 32, 32 ;; \
	shrp Z = Z, Y, 64 - s ; \
} ;; \
{ \
	.mmi ; \
	add a = Z, b ; \
	mov R = M ; \
	nop 0x0 ; \
} ;;

// LOOP(a, b, s, M, R, label): same as COMPUTE, but the second bundle
// ends with br.ctop to label (the counted-loop branch).

#define LOOP(a, b, s, M, R, label) \
{	.mii ; \
	ld4 TRound = [TPtr], 4 ; \
	dep.z Y = Z, 32, 32 ;; \
	shrp Z = Z, Y, 64 - s ; \
} ;; \
{	.mib ; \
	add a = Z, b ; \
	mov R = M ; \
	br.ctop.sptk.many label ; \
} ;;

// G(B, C, D) = (B & D) | (C & ~D)
//
// Leaves Z = M + TRound + a + G(b, c, d).

#define G(a, b, c, d, M) \
{	.mmi ; \
	add Z = M, TRound ; \
	and Y = b, d ; \
	andcm X = c, d ; \
} ;; \
{	.mii ; \
	add Z = Z, a ; \
	or Y = Y, X ;; \
	add Z = Z, Y ; \
} ;;

// H(B, C, D) = B ^ C ^ D
//
// Leaves Z = M + TRound + a + H(b, c, d).

#define H(a, b, c, d, M) \
{	.mmi ; \
	add Z = M, TRound ; \
	xor Y = b, c ; \
	nop 0x0 ; \
} ;; \
{	.mii ; \
	add Z = Z, a ; \
	xor Y = Y, d ;; \
	add Z = Z, Y ; \
} ;;

// I(B, C, D) = C ^ (B | ~D)
//
// However, since we have an andcm operator, we use the fact that
//
// Y ^ Z == ~Y ^ ~Z
//
// to rewrite the expression as
//
// I(B, C, D) = ~C ^ (~B & D)
//
// Leaves Z = M + TRound + a + I(b, c, d).

#define I(a, b, c, d, M) \
{	.mmi ; \
	add Z = M, TRound ; \
	andcm Y = d, b ; \
	andcm X = -1, c ; \
} ;; \
{	.mii ; \
	add Z = Z, a ; \
	xor Y = Y, X ;; \
	add Z = Z, Y ; \
} ;;

// GG4 / HH4 / II4: four consecutive steps of the respective round,
// updating A, D, C, B in turn with that round's shift amounts; the
// fourth step ends in LOOP(), which branches back to label.

#define GG4(label) \
	G(A, B, C, D, M0) \
	COMPUTE(A, B, 5, M0, RotateM0) \
	G(D, A, B, C, M1) \
	COMPUTE(D, A, 9, M1, RotateM1) \
	G(C, D, A, B, M2) \
	COMPUTE(C, D, 14, M2, RotateM2) \
	G(B, C, D, A, M3) \
	LOOP(B, C, 20, M3, RotateM3, label)

#define HH4(label) \
	H(A, B, C, D, M0) \
	COMPUTE(A, B, 4, M0, RotateM0) \
	H(D, A, B, C, M1) \
	COMPUTE(D, A, 11, M1, RotateM1) \
	H(C, D, A, B, M2) \
	COMPUTE(C, D, 16, M2, RotateM2) \
	H(B, C, D, A, M3) \
	LOOP(B, C, 23, M3, RotateM3, label)

#define II4(label) \
	I(A, B, C, D, M0) \
	COMPUTE(A, B, 6, M0, RotateM0) \
	I(D, A, B, C, M1) \
	COMPUTE(D, A, 10, M1, RotateM1) \
	I(C, D, A, B, M2) \
	COMPUTE(C, D, 15, M2, RotateM2) \
	I(B, C, D, A, M3) \
	LOOP(B, C, 21, M3, RotateM3, label)

// FFLOAD(a, b, c, d, M, N, s): one F-round step.  While pMore is set it
// also loads the next input word through DPtr into N.  Computes
// Z = M + TRound + a + ((c & b) | (d & ~b)), fetches the next round
// constant, rotates the low 32 bits of Z left by s and sets a = Z + b.

#define FFLOAD(a, b, c, d, M, N, s) \
{	.mii ; \
(pMore)	ld4 N = [DPtr], 4 ; \
	add Z = M, TRound ; \
	and Y = c, b ; \
} \
{	.mmi ; \
	andcm X = d, b ;; \
	add Z = Z, a ; \
	or Y = Y, X ; \
} ;; \
{	.mii ; \
	ld4 TRound = [TPtr], 4 ; \
	add Z = Z, Y ;; \
	dep.z Y = Z, 32, 32 ; \
} ;; \
{	.mii ; \
	nop 0x0 ; \
	shrp Z = Z, Y, 64 - s ;; \
	add a = Z, b ; \
} ;;

// FFLOOP(a, b, c, d, M, N, s, dest): FFLOAD followed by the loop tail:
// pMore = (LTrip != 0), LTrip = LTrip - 1, br.ctop to dest.

#define FFLOOP(a, b, c, d, M, N, s, dest) \
{	.mii ; \
(pMore)	ld4 N = [DPtr], 4 ; \
	add Z = M, TRound ; \
	and Y = c, b ; \
} \
{	.mmi ; \
	andcm X = d, b ;; \
	add Z = Z, a ; \
	or Y = Y, X ; \
} ;; \
{	.mii ; \
	ld4 TRound = [TPtr], 4 ; \
	add Z = Z, Y ;; \
	dep.z Y = Z, 32, 32 ; \
} ;; \
{	.mii ; \
	nop 0x0 ; \
	shrp Z = Z, Y, 64 - s ;; \
	add a = Z, b ; \
} \
{	.mib ; \
	cmp.ne pMore, p0 = 0, LTrip ; \
	add LTrip = -1, LTrip ; \
	br.ctop.dptk.many dest ; \
} ;;

// md5_digest_block0: F round for 4-byte-aligned input.  Entered by
// br.call with the return link in QUICK_RTN.  Runs the four-step F body
// four times (ar.lc = 3); pMore starts true and LTrip starts at 2, so
// the conditional loads are performed on the first three passes only.
// Falls through into md5_digest_GHI.

	.type md5_digest_block0, @function
	.align 32

	.proc md5_digest_block0
	.prologue
md5_digest_block0:
	.altrp QUICK_RTN
	.body
{	.mmi
	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
	mov LTrip = 2
	mov ar.lc = 3
} ;;
{	.mii
	cmp.eq pMore, p0 = r0, r0
	mov ar.ec = 0
	nop 0x0
} ;;

.md5_FF_round0:
	FFLOAD(A, B, C, D, M12, RotateM0, 7)
	FFLOAD(D, A, B, C, M13, RotateM1, 12)
	FFLOAD(C, D, A, B, M14, RotateM2, 17)
	FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
	//
	// !!! Fall through to md5_digest_GHI
	//
	.endp md5_digest_block0

// md5_digest_GHI: the G, H and I rounds, shared by all F-round
// variants.  Before each round the message registers are permuted with
// plain moves (through the scratch registers Z, Y, X, W, V, U, T) and
// ar.lc / ar.ec are reloaded; each round then runs its four-step body
// four times.  Returns through QUICK_RTN.

	.type md5_digest_GHI, @function
	.align 32

	.proc md5_digest_GHI
	.prologue
	.regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
md5_digest_GHI:
	.altrp QUICK_RTN
	.body
//
// The following sequence shuffles the block constants round for the
// next round:
//
// 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
// 1  6 11  0  5 10 15  4  9 14  3  8 13  2  7 12
//
{	.mmi
	mov Z = M0
	mov Y = M15
	mov ar.lc = 3
}
{	.mmi
	mov X = M2
	mov W = M9
	mov V = M4
} ;;

{	.mmi
	mov M0 = M1
	mov M15 = M12
	mov ar.ec = 1
}
{	.mmi
	mov M2 = M11
	mov M9 = M14
	mov M4 = M5
} ;;

{	.mmi
	mov M1 = M6
	mov M12 = M13
	mov U = M3
}
{	.mmi
	mov M11 = M8
	mov M14 = M7
	mov M5 = M10
} ;;

{	.mmi
	mov M6 = Y
	mov M13 = X
	mov M3 = Z
}
{	.mmi
	mov M8 = W
	mov M7 = V
	mov M10 = U
} ;;

.md5_GG_round:
	GG4(.md5_GG_round)

// The following sequence shuffles the block constants round for the
// next round:
//
// 1  6 11  0  5 10 15  4  9 14  3  8 13  2  7 12
// 5  8 11 14  1  4  7 10 13  0  3  6  9 12 15  2

{	.mmi
	mov Z = M0
	mov Y = M1
	mov ar.lc = 3
}
{	.mmi
	mov X = M3
	mov W = M5
	mov V = M6
} ;;

{	.mmi
	mov M0 = M4
	mov M1 = M11
	mov ar.ec = 1
}
{	.mmi
	mov M3 = M9
	mov U = M8
	mov T = M13
} ;;

{	.mmi
	mov M4 = Z
	mov M11 = Y
	mov M5 = M7
}
{	.mmi
	mov M6 = M14
	mov M8 = M12
	mov M13 = M15
} ;;

{	.mmi
	mov M7 = W
	mov M14 = V
	nop 0x0
}
{	.mmi
	mov M9 = X
	mov M12 = U
	mov M15 = T
} ;;

.md5_HH_round:
	HH4(.md5_HH_round)

// The following sequence shuffles the block constants round for the
// next round:
//
// 5  8 11 14  1  4  7 10 13  0  3  6  9 12 15  2
// 0  7 14  5 12  3 10  1  8 15  6 13  4 11  2  9

{	.mmi
	mov Z = M0
	mov Y = M15
	mov ar.lc = 3
}
{	.mmi
	mov X = M10
	mov W = M1
	mov V = M4
} ;;

{	.mmi
	mov M0 = M9
	mov M15 = M12
	mov ar.ec = 1
}
{	.mmi
	mov M10 = M11
	mov M1 = M6
	mov M4 = M13
} ;;

{	.mmi
	mov M9 = M14
	mov M12 = M5
	mov U = M3
}
{	.mmi
	mov M11 = M8
	mov M6 = M7
	mov M13 = M2
} ;;

{	.mmi
	mov M14 = Y
	mov M5 = X
	mov M3 = Z
}
{	.mmi
	mov M8 = W
	mov M7 = V
	mov M2 = U
} ;;

.md5_II_round:
	II4(.md5_II_round)

{	.mib
	nop 0x0
	nop 0x0
	br.ret.sptk.many QUICK_RTN
} ;;

	.endp md5_digest_GHI

// FFLOADU(a, b, c, d, M, P, N, s, offset): FFLOAD for misaligned input.
// In addition to the F step it realigns the raw word in P:
// W = GETLW(P) | DTmp, DTmp = GETRW(P), P = W.

#define FFLOADU(a, b, c, d, M, P, N, s, offset) \
{	.mii ; \
(pMore)	ld4 N = [DPtr], 4 ; \
	add Z = M, TRound ; \
	and Y = c, b ; \
} \
{	.mmi ; \
	andcm X = d, b ;; \
	add Z = Z, a ; \
	or Y = Y, X ; \
} ;; \
{	.mii ; \
	ld4 TRound = [TPtr], 4 ; \
	GETLW(W, P, offset) ; \
	add Z = Z, Y ; \
} ;; \
{	.mii ; \
	or W = W, DTmp ; \
	dep.z Y = Z, 32, 32 ;; \
	shrp Z = Z, Y, 64 - s ; \
} ;; \
{	.mii ; \
	add a = Z, b ; \
	GETRW(DTmp, P, offset) ; \
	mov P = W ; \
} ;;

// FFLOOPU(a, b, c, d, M, P, N, s, offset): FFLOADU with the realignment
// of P predicated on pMore, followed by the loop tail (pMore update,
// LTrip decrement, br.ctop to .md5_FF_round<offset>).

#define FFLOOPU(a, b, c, d, M, P, N, s, offset) \
{	.mii ; \
(pMore)	ld4 N = [DPtr], 4 ; \
	add Z = M, TRound ; \
	and Y = c, b ; \
} \
{	.mmi ; \
	andcm X = d, b ;; \
	add Z = Z, a ; \
	or Y = Y, X ; \
} ;; \
{	.mii ; \
	ld4 TRound = [TPtr], 4 ; \
(pMore)	GETLW(W, P, offset) ; \
	add Z = Z, Y ; \
} ;; \
{	.mii ; \
(pMore)	or W = W, DTmp ; \
	dep.z Y = Z, 32, 32 ;; \
	shrp Z = Z, Y, 64 - s ; \
} ;; \
{	.mii ; \
	add a = Z, b ; \
(pMore)	GETRW(DTmp, P, offset) ; \
(pMore)	mov P = W ; \
} \
{	.mib ; \
	cmp.ne pMore, p0 = 0, LTrip ; \
	add LTrip = -1, LTrip ; \
	br.ctop.sptk.many .md5_FF_round##offset ; \
} ;;

// MD5FBLOCK(offset): defines md5_digest_block<offset>, the F round for
// input misaligned by offset bytes.  Same setup as md5_digest_block0,
// using the realigning step macros; ends with a branch to
// md5_digest_GHI.

#define MD5FBLOCK(offset) \
	.type md5_digest_block##offset, @function ; \
 \
	.align 32 ; \
	.proc md5_digest_block##offset ; \
	.prologue ; \
	.altrp QUICK_RTN ; \
	.body ; \
md5_digest_block##offset: \
{	.mmi ; \
	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \
	mov LTrip = 2 ; \
	mov ar.lc = 3 ; \
} ;; \
{	.mii ; \
	cmp.eq pMore, p0 = r0, r0 ; \
	mov ar.ec = 0 ; \
	nop 0x0 ; \
} ;; \
 \
	.pred.rel "mutex", pLoad, pSkip ; \
.md5_FF_round##offset: \
	FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \
	FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \
	FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \
	FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \
 \
{	.mib ; \
	nop 0x0 ; \
	nop 0x0 ; \
	br.cond.sptk.many md5_digest_GHI ; \
} ;; \
	.endp md5_digest_block##offset

MD5FBLOCK(1)
MD5FBLOCK(2)
MD5FBLOCK(3)

	.align 64
	.type md5_constants, @object
md5_constants:
.md5_tbl_data_order:			// To ensure little-endian data
					// order, code as bytes.
// 64 round constants, one per row, each emitted as four bytes with the
// least-significant byte first.  The trailing comment gives the row
// index and the 32-bit value those bytes form when read little-endian.
	data1	0x78, 0xa4, 0x6a, 0xd7	//  0: 0xd76aa478
	data1	0x56, 0xb7, 0xc7, 0xe8	//  1: 0xe8c7b756
	data1	0xdb, 0x70, 0x20, 0x24	//  2: 0x242070db
	data1	0xee, 0xce, 0xbd, 0xc1	//  3: 0xc1bdceee
	data1	0xaf, 0x0f, 0x7c, 0xf5	//  4: 0xf57c0faf
	data1	0x2a, 0xc6, 0x87, 0x47	//  5: 0x4787c62a
	data1	0x13, 0x46, 0x30, 0xa8	//  6: 0xa8304613
	data1	0x01, 0x95, 0x46, 0xfd	//  7: 0xfd469501
	data1	0xd8, 0x98, 0x80, 0x69	//  8: 0x698098d8
	data1	0xaf, 0xf7, 0x44, 0x8b	//  9: 0x8b44f7af
	data1	0xb1, 0x5b, 0xff, 0xff	// 10: 0xffff5bb1
	data1	0xbe, 0xd7, 0x5c, 0x89	// 11: 0x895cd7be
	data1	0x22, 0x11, 0x90, 0x6b	// 12: 0x6b901122
	data1	0x93, 0x71, 0x98, 0xfd	// 13: 0xfd987193
	data1	0x8e, 0x43, 0x79, 0xa6	// 14: 0xa679438e
	data1	0x21, 0x08, 0xb4, 0x49	// 15: 0x49b40821
	data1	0x62, 0x25, 0x1e, 0xf6	// 16: 0xf61e2562
	data1	0x40, 0xb3, 0x40, 0xc0	// 17: 0xc040b340
	data1	0x51, 0x5a, 0x5e, 0x26	// 18: 0x265e5a51
	data1	0xaa, 0xc7, 0xb6, 0xe9	// 19: 0xe9b6c7aa
	data1	0x5d, 0x10, 0x2f, 0xd6	// 20: 0xd62f105d
	data1	0x53, 0x14, 0x44, 0x02	// 21: 0x02441453
	data1	0x81, 0xe6, 0xa1, 0xd8	// 22: 0xd8a1e681
	data1	0xc8, 0xfb, 0xd3, 0xe7	// 23: 0xe7d3fbc8
	data1	0xe6, 0xcd, 0xe1, 0x21	// 24: 0x21e1cde6
	data1	0xd6, 0x07, 0x37, 0xc3	// 25: 0xc33707d6
	data1	0x87, 0x0d, 0xd5, 0xf4	// 26: 0xf4d50d87
	data1	0xed, 0x14, 0x5a, 0x45	// 27: 0x455a14ed
	data1	0x05, 0xe9, 0xe3, 0xa9	// 28: 0xa9e3e905
	data1	0xf8, 0xa3, 0xef, 0xfc	// 29: 0xfcefa3f8
	data1	0xd9, 0x02, 0x6f, 0x67	// 30: 0x676f02d9
	data1	0x8a, 0x4c, 0x2a, 0x8d	// 31: 0x8d2a4c8a
	data1	0x42, 0x39, 0xfa, 0xff	// 32: 0xfffa3942
	data1	0x81, 0xf6, 0x71, 0x87	// 33: 0x8771f681
	data1	0x22, 0x61, 0x9d, 0x6d	// 34: 0x6d9d6122
	data1	0x0c, 0x38, 0xe5, 0xfd	// 35: 0xfde5380c
	data1	0x44, 0xea, 0xbe, 0xa4	// 36: 0xa4beea44
	data1	0xa9, 0xcf, 0xde, 0x4b	// 37: 0x4bdecfa9
	data1	0x60, 0x4b, 0xbb, 0xf6	// 38: 0xf6bb4b60
	data1	0x70, 0xbc, 0xbf, 0xbe	// 39: 0xbebfbc70
	data1	0xc6, 0x7e, 0x9b, 0x28	// 40: 0x289b7ec6
	data1	0xfa, 0x27, 0xa1, 0xea	// 41: 0xeaa127fa
	data1	0x85, 0x30, 0xef, 0xd4	// 42: 0xd4ef3085
	data1	0x05, 0x1d, 0x88, 0x04	// 43: 0x04881d05
	data1	0x39, 0xd0, 0xd4, 0xd9	// 44: 0xd9d4d039
	data1	0xe5, 0x99, 0xdb, 0xe6	// 45: 0xe6db99e5
	data1	0xf8, 0x7c, 0xa2, 0x1f	// 46: 0x1fa27cf8
	data1	0x65, 0x56, 0xac, 0xc4	// 47: 0xc4ac5665
	data1	0x44, 0x22, 0x29, 0xf4	// 48: 0xf4292244
	data1	0x97, 0xff, 0x2a, 0x43	// 49: 0x432aff97
	data1	0xa7, 0x23, 0x94, 0xab	// 50: 0xab9423a7
	data1	0x39, 0xa0, 0x93, 0xfc	// 51: 0xfc93a039
	data1	0xc3, 0x59, 0x5b, 0x65	// 52: 0x655b59c3
	data1	0x92, 0xcc, 0x0c, 0x8f	// 53: 0x8f0ccc92
	data1	0x7d, 0xf4, 0xef, 0xff	// 54: 0xffeff47d
	data1	0xd1, 0x5d, 0x84, 0x85	// 55: 0x85845dd1
	data1	0x4f, 0x7e, 0xa8, 0x6f	// 56: 0x6fa87e4f
	data1	0xe0, 0xe6, 0x2c, 0xfe	// 57: 0xfe2ce6e0
	data1	0x14, 0x43, 0x01, 0xa3	// 58: 0xa3014314
	data1	0xa1, 0x11, 0x08, 0x4e	// 59: 0x4e0811a1
	data1	0x82, 0x7e, 0x53, 0xf7	// 60: 0xf7537e82
	data1	0x35, 0xf2, 0x3a, 0xbd	// 61: 0xbd3af235
	data1	0xbb, 0xd2, 0xd7, 0x2a	// 62: 0x2ad7d2bb
	data1	0x91, 0xd3, 0x86, 0xeb	// 63: 0xeb86d391
.size	md5_constants#,64*4