#!/usr/bin/env perl
# sha512-sparcv9.pl revision 221304ee937bc0910948a8be1320cb8cc4eb6d36

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x better than 32-bit code. X[16] resides on stack, but access
# to it is scheduled for L2 latency and staged through 32 least
# significant bits of %l0-%l7. The latter is done to achieve 32-/64-bit
# ABI duality. Nevertheless it's ~40% faster than SHA256, which is
# pretty good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*)	Unlike pre-T1 UltraSPARC, loads on T1 are executed strictly
#	in-order, i.e. a load instruction has to complete before the
#	next instruction in the given thread is executed, even if the
#	latter is not dependent on the load result! This means that on
#	T1 two 32-bit loads are always slower than one 64-bit load.
#	Once again this is unlike pre-T1 UltraSPARC, where, if scheduled
#	appropriately, 2x32-bit loads can be as fast as 1x64-bit ones.

# ABI selection: any argument matching -m64 or -xarch=v9 requests 64-bit
# code, otherwise 32-bit code is generated. Note that the scan covers
# all of @ARGV, i.e. it runs before the output name is shifted off.
$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# $bias and $frame are added to %sp in every stack-relative address the
# generator emits; $frame is also part of the `save` adjustment.
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

# The first argument is the output file name. It also selects the
# flavour below: a name containing "512" yields SHA-512, anything else
# yields SHA-256.
$output=shift;
# Redirect STDOUT only when a name was actually given, so running the
# script without arguments still writes to the inherited STDOUT. The
# three-argument form keeps characters in the name from being read as
# an open mode, and a failed open now aborts instead of silently
# sending the assembly to the wrong place.
if (defined($output) && length($output)) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

if ($output =~ /512/) {
	$label="512";
	$SZ=8;			# bytes per word
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;		# low 12 bits of the last K512 entry, ends the round loop
	$rounds=80;
	$align=4;		# input pointer is rounded down to this boundary

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;			# bytes per word
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;		# low 12 bits of the last K256 entry, ends the round loop
	$rounds=64;
	$align=8;		# input pointer is rounded down to this boundary

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
# Scratch registers shared by both flavours.
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

# Incoming arguments (ctx, inp, len) followed by registers the generated
# code uses for the K table pointer and the two alignment shift counts.
$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";

########### SHA256
# $Xload->($i,$a,...,$h) appends the message-word prologue for round $i
# (called by BODY_00_15 for rounds 0..15). Only $i and $h are used.
#
# Round 0 first emits the loads of the 64-byte block into the eight
# 64-bit @X registers plus, when $tmp31 (the misalignment shift count)
# is non-zero, the shift/or sequence that merges in a ninth word.
# Every round then emits T1 = h + X[i]; even-numbered words sit in the
# upper 32 bits of their register and are extracted with srlx first.
# The closure is installed only when generating SHA-256 ($SZ==4).
$Xload = sub {
    my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
	# Loads and the aligned/unaligned decision.
	$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
	# Shift each following register and fold its spilled bits into
	# the previous one.
	for ($j=0; $j<7; $j++) {
	    $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
	}
	$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    # Two words per register: register index is $i/2 (truncated).
    my $word=$X[$i/2];
    if ($i%2==0) {
	$code.="\tsrlx	$word,32,$T1\n";
	$code.="\tadd	$h,$T1,$T1\n";
    } else {
	$code.="\tadd	$word,$h,$T1\n";
    }
} if ($SZ==4);

########### SHA512
# $Xload->($i,$a,...,$h) appends the message-word prologue for round $i
# (called by BODY_00_15 for rounds 0..15). Only $i and $h are used.
#
# The input is read with 32-bit loads into %l0-%l7. For each round the
# pair (%l[2i%8], %l[2i%8+1]) plus the first register of the next pair
# are combined, using the $tmp31/$tmp32 shift counts, into one 64-bit
# word that is stored to the X[16] area on the stack and added to h to
# form T1. Round 15 additionally emits the stack loads of the X[]
# halves that the first $BODY_16_XX round reads from %l0-%l7.
# The closure is installed only when generating SHA-512 ($SZ==8).
$Xload = sub {
    my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
    my $first=($i*2)%8;
    my @pair=("%l$first","%l".($first+1),"%l".((($i+1)*2)%8));

    if ($i==0) {
	# Prime %l0-%l7 with the first 32 bytes of input.
	$code.=<<___;
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
___
    }
    if ($i<15) {
	# Assemble X[$i]; for $i<12 the consumed pair is refilled with
	# input 32 bytes further on.
	$code.=<<___;
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
___
    }
    if ($i==12) {
	# The word past the block is fetched only when $tmp31 is non-zero
	# (annulled delay slot).
	$code.=<<___;
	brnz,a	$tmp31,.+8
	ld	[$inp+128],%l0
___
    }
    if ($i==15) {
	$code.=<<___;
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
___
    }
} if ($SZ==8);

########### common
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h) appends one compression round
# to $code. The eight trailing arguments are the register names holding
# the working variables for this round.
#
# For $i<16 the flavour-specific $Xload closure emits the code that
# leaves h+X[i] in $T1; for later rounds the caller has already put the
# schedule word in $T1 and only h is added here. The common part then
# accumulates Sigma1(e), Ch(e,f,g) and K[$i] into $T1, rebuilds h as
# Sigma0(a)+Maj(a,b,c), and finishes with d+=T1 and h+=T1. Rotates are
# emitted as shift-right/shift-left pairs combined with xor.
sub BODY_00_15 {
    my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i>=16) {
	$code.="\tadd	$h,$T1,$T1\n";
    } else {
	$Xload->(@_);
    }

    $code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}

########### SHA256
# $BODY_16_XX->($i,$a,...,$h) appends the message-schedule update for
# round $i (16 and up) and then the common round body.
#
# The sixteen 32-bit X[] words live two per 64-bit @X register: the
# even-numbered word in the upper half (extracted into $tmp32 with
# srlx), the odd-numbered word in the lower half (used in place by the
# 32-bit shifts). The emitted code computes
#   T1 = sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] + X[i]
# and writes the result back into X[i]'s half of its register before
# handing over to BODY_00_15. All indices are taken modulo 16.
# The closure is installed only when generating SHA-256 ($SZ==4).
$BODY_16_XX = sub {
    my ($i)=@_;
    my $odd=$i&1;
    my $xi;

    # Operand of sigma0: X[i+1].
    my $reg=$X[(($i+1)/2)%8];
    if ($odd) {
	$xi=$tmp32;
	$code.="\tsrlx	$reg,32,$xi\n";
    } else {
	$xi=$reg;
    }
    $code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___

    # Operand of sigma1: X[i+14].
    $reg=$X[(($i+14)/2)%8];
    if ($odd) {
	$xi=$reg;
    } else {
	$xi=$tmp32;
	$code.="\tsrlx	$reg,32,$xi\n";
    }
    $code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___

    # Add X[i+9] and X[i], then write the new word back into the half
    # of the register that X[i] occupies.
    if ($odd) {
	$xi=$X[($i/2)%8];
	$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp2,$T1,$T1
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=$X[(($i+9)/2)%8];
	$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$T1,$T1
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    BODY_00_15(@_);
} if ($SZ==4);

########### SHA512
# $BODY_16_XX->($i,$a,...,$h) appends the message-schedule update for
# round $i (16 and up) and then the common round body.
#
# On entry to the emitted code %l0-%l7 hold, as 32-bit halves, the
# words X[i] (%l0:%l1), X[i+1] (%l2:%l3), X[i+9] (%l4:%l5) and X[i+14]
# (%l6:%l7). Each pair is joined into a 64-bit value, and as soon as a
# pair has been consumed it is reloaded from the stack with the halves
# needed by round $i+1. The result
#   T1 = sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] + X[i]
# is stored to slot ($i%16) of the X[16] area before BODY_00_15 runs.
# The closure is installed only when generating SHA-512 ($SZ==8).
#
# The original also built a local @pair of register names with string
# eval on every call; the template below only names %l0-%l7 directly,
# so that dead computation has been dropped.
$BODY_16_XX = sub {
    my ($i)=@_;

    $code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
___
    BODY_00_15(@_);
} if ($SZ==8);

# Start of the emitted file: for the 64-bit ABI declare %g2/%g3 as
# scratch registers, then open the text section and emit the round
# constant table K256 or K512 (SHA-512 constants are written as pairs
# of 32-bit halves, most significant half first).
if ($bits==64) {
    $code.=<<___;
.register	%g2,#scratch
.register	%g3,#scratch
___
}
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
# SHA-256 round constants.
$code.=<<___ if ($SZ==4);
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
# SHA-512 round constants (every flavour other than SHA-256).
$code.=<<___ unless ($SZ==4);
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
# Emit sha${label}_block_data_order(ctx, inp, len).
#
# Prologue: reserve the frame (plus X[16] for SHA-512), split the input
# pointer into an aligned base and a misalignment expressed in bits
# ($tmp31), and turn len (in blocks) into the end-of-input address.
$code.=<<___;
.size	K${label},.-K${label}
.globl	sha${label}_block_data_order
sha${label}_block_data_order:
	save	%sp,`-$frame-$locals`,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
if ($SZ==8) {		# SHA512: complementary shift count, 32-$tmp31
    $code.=<<___;
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
}
# Locate the K table relative to .Lpic and load the hash state.
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
# Rounds 0..15 are emitted once; rounds 16..31 form the body of the
# .L16_xx loop. After every round @V is rotated by one position so that
# the register which just received the new value plays the role of "a"
# in the next round.
for ($i=0; $i<32; $i++) {
    $code.=".L16_xx:\n" if ($i==16);
    if ($i<16)	{ BODY_00_15($i,@V); }
    else	{ $BODY_16_XX->($i,@V); }
    unshift(@V,pop(@V));
}
# $tmp2 still holds the K word loaded by the last round; its low 12 bits
# are compared with $lastK to detect the end of the table. The delay
# slot advances $Ktbl by 16 entries for the next pass.
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
if ($SZ==4) {		# SHA256: add the previous state and store it back
    $code.=<<___;
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
}
if ($SZ==8) {		# SHA512: same, with the old state read as 32-bit halves
    $code.=<<___;
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
}
# Advance to the next block; loop until the end address is reached,
# rewinding the K table pointer in the branch delay slot.
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
___

# Final pass: every `...` span left in the template is evaluated as a
# Perl expression and replaced by its value (offsets, shift counts,
# conditional instructions), then the finished assembly is written out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

# Check both the write and the close: buffered output may only fail when
# it is flushed, and an unnoticed failure (e.g. a full disk) would leave
# a truncated assembly file behind while the script exits successfully.
print $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";
