#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*)	Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
#	in-order, i.e. load instruction has to complete prior next
#	instruction in given thread is executed, even if the latter is
#	not dependent on load result! This means that on T1 two 32-bit
#	loads are always slower than one 64-bit load. Once again this
#	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
#	2x32-bit loads can be as fast as 1x64-bit ones.

# --------------------------------------------------------------------
# Usage:  perl sha512-sparcv9.pl <output-file> [compiler flags ...]
#
#  * The first argument names the file the assembly is written to.  If
#    that name contains "512" the SHA-512 variant is generated,
#    otherwise the SHA-256 variant.  With no output argument (or "-")
#    the SHA-256 variant is written to the existing STDOUT.
#  * If any argument matches -m64 or -xarch=v9 the 64-bit ABI variant is
#    generated ($bias=2047, $frame=192), otherwise the 32-bit one
#    ($bias=0, $frame=112).
#
# How generation works: all assembly text is appended to $code.  Perl
# variables are interpolated when each heredoc is appended; anything
# left between backticks is evaluated as a Perl expression by the final
# s///e pass just before printing.
#
# NOTE(review): the script relies on package globals throughout and
# does not enable `use strict`.
# --------------------------------------------------------------------

# ABI selection.  The whole of @ARGV (output name included) is scanned
# because the scan runs before the output name is shifted off.
$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# $bias+$frame is the offset from %sp at which X[16] is addressed.
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

# Redirect STDOUT to the requested file.  Three-argument open with an
# explicit check: a failed open must abort the build instead of
# silently producing no output.
$output=shift;
if (defined($output) && $output ne '' && $output ne '-') {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

if (defined($output) && $output =~ /512/) {
	$label="512";
	$SZ=8;			# bytes per word
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;		# low 12 bits of the last K512 entry, loop sentinel
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;			# bytes per word
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;		# low 12 bits of the last K256 entry, loop sentinel
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
# Scratch registers shared by both variants.
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

# Incoming arguments and per-call temporaries.
$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";		# input misalignment, in bits (set in the prologue)
$tmp32="%i5";

########### SHA256
# $Xload->($i,$a..$h): emit the code that makes X[$i] available for
# round $i (0..15) and leaves h+X[$i] in $T1.
#
# For $i==0 the whole block is first loaded into @X with eight ldx from
# the aligned-down $inp; unless $tmp31 is zero, each register is then
# shifted and merged with its successor (a ninth dword is fetched into
# $T1 for the last one).  Two 32-bit words share one register: even $i
# takes the upper half (srlx by 32), odd $i adds the register as is.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for(my $j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    # $i/2 is fractional for odd $i; the array subscript truncates it.
    if ($i&1) {
	$code.="\tadd	@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
    }
} if ($SZ==4);

########### SHA512
# $Xload->($i,$a..$h): emit the code that assembles 64-bit X[$i] for
# round $i (0..15) from 32-bit halves staged in %l0-%l7, stores it to
# the X[16] area on the stack and leaves h+X[$i] in $T1.
#
# @pair holds the two %l registers carrying X[$i] and the first register
# of X[$i+1], which supplies the bits shifted in when the input is
# misaligned.  Input loads are issued ahead of use: $i==0 preloads eight
# words, rounds below 12 refill the pair just consumed, $i==12 fetches
# one extra word only when $tmp31 is non-zero (annulled branch), and
# $i==15 preloads from the stack the words the first 16+ round reads.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".(($i*2)%8),"%l".(($i*2)%8+1),"%l".((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
___
$code.=<<___ if ($i==12);
	brnz,a	$tmp31,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

########### common
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h): emit one round.
#
# For $i<16 the variant-specific $Xload supplies $T1=h+X[$i]; for later
# rounds the caller ($BODY_16_XX) has left the new X value in $T1 and h
# is added here.  The round then accumulates Ch(e,f,g), Sigma1(e) and
# K[$i] into $T1, rebuilds h as Sigma0(a)+Maj(a,b,c), and finishes with
# d+=T1 and h+=T1.  K[$i] stays in $tmp2 afterwards; the main loop
# tests its low 12 bits against $lastK.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	$Xload->(@_);
    } else {
	$code.="\tadd	$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}

########### SHA256
# $BODY_16_XX->($i,@V): emit the message-schedule update for round $i
# (16 and up) followed by the common round body.
#
# X[] is packed two 32-bit words per register of @X, even indices in the
# upper half.  $xi names the register holding the word currently being
# read: either the packed register itself (word in the low half) or
# $tmp32 after an srlx by 32 (word in the high half).  The new word is
# sigma0(X[i+1])+sigma1(X[i+14])+X[i]+X[i+9]; it is left in $T1 and
# written back into the half of @X[($i/2)%8] that held X[i].
$BODY_16_XX = sub {
my $i=$_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=$X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=$X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=$X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp2,$T1,$T1
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=$X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$T1,$T1
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    BODY_00_15(@_);
} if ($SZ==4);

########### SHA512
# $BODY_16_XX->($i,@V): emit the message-schedule update for round $i
# (16 and up) followed by the common round body.
#
# The four inputs arrive as 32-bit halves in fixed register pairs:
# %l0/%l1 = X[i], %l2/%l3 = X[i+1], %l4/%l5 = X[i+9], %l6/%l7 = X[i+14].
# Each pair is glued into a 64-bit value and immediately reloaded from
# the stack with the halves the next round needs.  The new word,
# sigma0(X[i+1])+sigma1(X[i+14])+X[i+9]+X[i], is stored to slot $i%16
# of the stack area and left in $T1.
$BODY_16_XX = sub {
my $i=$_[0];

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
___
    BODY_00_15(@_);
} if ($SZ==8);

# ---- module text: .register declarations (64-bit only), K table ----
$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}

# ---- function prologue ----
# $tmp31 receives the misalignment of $inp in bits, $inp is rounded
# down to a multiple of $align, and $len is turned from a block count
# into the end-of-input pointer.  $Ktbl is computed PC-relative from
# the address that `call .+8` leaves in %o7.
$code.=<<___;
.size	K${label},.-K${label}
.globl	sha${label}_block_data_order
sha${label}_block_data_order:
	save	%sp,`-$frame-$locals`,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___

# ---- rounds ----
# Rounds 0..15 are unrolled once.  Rounds 16..31 are unrolled once as
# the body of the .L16_xx loop, which advances $Ktbl by 16 entries per
# pass and repeats until the K value fetched by the last round has low
# 12 bits equal to $lastK.  @V is rotated after every round so that
# each round sees the working variables under their new roles.
for ($i=0;$i<16;$i++)	{ BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ $BODY_16_XX->($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___

# ---- add the working variables back into the context ----
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___

# ---- next block / epilogue ----
# The rewind of $Ktbl sits in the delay slot of the loop branch.
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate every `...` expression left in the generated text, then emit.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";