#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 performance improvement over compiler-generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in the SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] into 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x better than 32-bit code. X[16] resides on stack, but access
# to it is scheduled for L2 latency and staged through the 32 least
# significant bits of %l0-%l7. The latter is done to achieve 32-/64-bit
# ABI duality. Nevertheless it's ~40% faster than SHA256, which is
# pretty good [the optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because the 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4x as many threads as there are physical cores, and
# that it leaves gcc [3.4] behind by more than 4x! Compared to SHA256,
# single-thread performance is only 10% better, but overall throughput
# at the maximum thread count for a given CPU exceeds the
# corresponding SHA256 figure by 30% [again, the optimal coefficient
# is 50%].
#
# (*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
#	in-order, i.e. a load instruction has to complete before the
#	next instruction in the given thread is executed, even if the
#	latter does not depend on the load result! This means that on
#	T1 two 32-bit loads are always slower than one 64-bit load.
#	Once again this is unlike pre-T1 UltraSPARC, where, if
#	scheduled appropriately, 2x32-bit loads can be as fast as
#	1x64-bit ones.
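
# SPARCv9 has no rotate instruction, so every ROTR(x,n) in the
# Sigma/sigma functions below is synthesized from a shift pair; a
# minimal sketch of the pattern, for the 64-bit case:
#
#	srlx	x,n,lo		! x >> n
#	sllx	x,64-n,hi	! the bits that wrap around
#	xor	hi,lo,r		! r = ROTR(x,n)
#
# which is why the tables @Sigma0/@Sigma1/@sigma0/@sigma1 below hold
# plain shift counts that the round body pairs with their 64-n
# [or 32-n] complements.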

$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }
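# The 64-bit SPARC V9 ABI biases %sp and %fp by 2047 bytes, hence
# $bias; explicit stack references below are written as
# [%sp+$bias+$frame+...] so that the same template serves both ABIs.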

$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";

########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd	@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
    }
} if ($SZ==4);
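
# A note on the SHA256 Xload above: X[16] is fully register-resident,
# two big-endian 32-bit words per 64-bit register, X[2j] in the upper
# half of @X[j] and X[2j+1] in the lower. Misaligned input is repaired
# once per block by loading a ninth 64-bit word and funnel-shifting
# adjacent register pairs with sllx/srlx.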

########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
___
$code.=<<___ if ($i==12);
	brnz,a	$tmp31,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
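
# A note on the SHA512 Xload above: input is fetched with 32-bit ld
# only [32-/64-bit ABI duality], staged through %l0-%l7, merged into
# 64-bit values and spilled to the X[16] ring buffer on the stack.
# The sllx/srlx pair with $tmp31/$tmp32 folds the misalignment repair
# into the same merge, and the annulled brnz at i==12 picks up the
# extra word only when input actually was misaligned.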

########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd	$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}
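
# BODY_00_15 is a straight transcription of the FIPS 180-4 round,
# sketched here in C-like pseudocode:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	d += T1; h = T1 + T2
#
# with the h=g, g=f, ... renaming done at generation time by rotating
# @V between rounds rather than by moving registers. Ch(e,f,g) is
# computed as ((f^g)&e)^g and Maj(a,b,c) as (a&b)|(c&(a|b)), both
# equivalent to the standard boolean forms.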

########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);
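
# Both Xupdate flavours implement the FIPS 180-4 message schedule over
# a 16-entry ring buffer, sketched here in C-like pseudocode with
# indices taken modulo 16:
#
#	X[i] += sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1])
#
# For SHA256 the freshly computed word also has to be re-inserted
# into the proper half of its packed 64-bit register, hence the
# srl/sllx/or fix-up at the tail of each branch above.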

########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
___
    &BODY_00_15(@_);
} if ($SZ==8);
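
# In the SHA512 Xupdate the four schedule inputs arrive as 32-bit
# halves in %l0-%l7 [loaded by the previous round], are merged with
# sllx/or into 64-bit values, and the loads feeding the *next* round
# are interleaved with the arithmetic to hide the L2 latency noted in
# the preamble.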

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size	K${label},.-K${label}
.globl	sha${label}_block_data_order
sha${label}_block_data_order:
	save	%sp,`-$frame-$locals`,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
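
# Prologue notes: $len arrives as a block count and is scaled to a
# byte count [16*$SZ bytes per block], then turned into an end
# pointer. $tmp31 becomes the input misalignment in bits [byte offset
# times 8], $inp is rounded down to an aligned address, and Xload
# compensates with the $tmp31/$tmp32 funnel shifts.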
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl
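	! (call .+8 deposits the address of the call instruction itself
	! in %o7, with the add executing in its delay slot, so
	! %o7+K${label}-.Lpic resolves to the K${label} table: the
	! usual SPARC position-independent address computation.)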

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
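	! The constant table doubles as the loop counter: $tmp2 still
	! holds K[i] from the preceding round, and the low 12 bits of
	! the final constant serve as the sentinel that ends the
	! 16-round passes.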
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

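# Backticked expressions embedded in the code templates above are
# generation-time arithmetic; the substitution below replaces each
# one with the result of eval'ing its contents before the assembly
# is printed.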
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;