1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedure for PA-RISC.
11
12# June 2009.
13#
14# SHA256 performance is >75% better than gcc 3.2 generated code on
15# PA-7100LC. Compared to code generated by vendor compiler this
16# implementation is almost 70% faster in 64-bit build, but delivers
17# virtually same performance in 32-bit build on PA-8600.
18#
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
# PA-7100LC, PA-RISC 1.1 processor. The implementation then detects if
# the code is executed on PA-RISC 2.0 processor and switches to 64-bit
# code path delivering adequate performance even in "blended" 32-bit
# build. Though 64-bit code is not any faster than code generated by
# vendor compiler on PA-8600...
25#
26# Special thanks to polarhome.com for providing HP-UX account.
27
# Command line: <flavour> <output file>.
# The flavour selects the ABI (anything matching /64/ is 64-bit) and the
# output file name both receives the generated assembly and selects
# SHA-512 versus SHA-256 (it is matched against /512/ further down).
$flavour = shift;
$output = shift;
# A missing name or a failed open used to leave STDOUT closed, so the
# generated code was silently discarded; fail loudly instead.  The
# three-argument form keeps the file name from being parsed for a mode.
die "usage: $0 <flavour> <output file>\n"
	unless (defined($output) && length($output));
open STDOUT,">",$output or die "can't open $output: $!";
31
# ABI selection: a flavour containing "64" selects the 64-bit settings,
# anything else the 32-bit ones.
#   $LEVEL         argument of the .LEVEL directive
#   $SIZE_T        size in bytes of one saved-register/argument slot
#   $FRAME_MARKER  size in bytes of the frame marker
#   $SAVED_RP      distance below the entry %sp at which %r2 is saved
#   $PUSH,$PUSHMA  store and store-with-modify mnemonics (prologue)
#   $POP,$POPMB    load and load-with-modify mnemonics (epilogue)
($LEVEL,$SIZE_T,$FRAME_MARKER,$SAVED_RP,$PUSH,$PUSHMA,$POP,$POPMB) =
	($flavour =~ /64/)
	? ("2.0W",8,80,16,"std","std,ma","ldd","ldd,mb")
	: ("1.0", 4,48,20,"stw","stwm",  "ldw","ldwm");
51
# Hash selection, driven by the output file name: a name containing
# "512" generates SHA-512, anything else SHA-256.
#   $func        exported symbol name
#   $SZ          size in bytes of one hash word
#   $rounds      number of rounds (and of round constants in the table)
#   $LAST10BITS  low 10 bits of the last word in the constant table,
#                used by the generated code to recognize the table's end
#   $LD,$LDM,$ST load, load-with-modify and store mnemonics for a word
#   @Sigma0,@Sigma1,@sigma0,@sigma1
#                rotate/shift amounts of the four sigma functions
if ($output !~ /512/) {
	($func,$SZ,$rounds,$LAST10BITS)=("sha256_block_data_order",4,64,0x0f2);
	($LD,$LDM,$ST)=("ldw","ldwm","stw");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
} else {
	($func,$SZ,$rounds,$LAST10BITS)=("sha512_block_data_order",8,80,0x017);
	($LD,$LDM,$ST)=("ldd","ldd,ma","std");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
}
77
# Stack frame layout.  The frame holds 16 saved registers, the frame
# marker [+ argument transfer] and the local variables (16 hash words
# plus 32 bytes).  $XOFF ends up as the distance between %sp and the
# local variables.
$XOFF  = 16*$SZ+32;
$FRAME = 16*$SIZE_T+$FRAME_MARKER+$XOFF;
$XOFF += $FRAME_MARKER;

# Incoming arguments.  Their registers double as scratch registers
# below, so the generated code saves them in the frame first:
# $ctx is zapped by $a0, $inp by $a1 and $num by $t0.
($ctx,$inp,$num) = ("%r26","%r25","%r24");

# Scratch registers and the pointer into the round constant table.
($a0,$a1,$t0,$t1,$Tbl) = ("%r26","%r25","%r24","%r29","%r31");

# Registers holding the eight working variables.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%r$_",17..23,28);

# Registers holding the 16 message words; the extra 17th entry ($inp)
# receives the additional word needed to align unaligned input.
@X=(map("%r$_",1..16),$inp);
# Append one round of the compression function to $code.
#   $i     - round index; @X[$i%16] is the message word added to h
#   $a..$h - registers currently holding the eight working variables
# On entry $t1 is expected to hold the round constant: it is added to h
# before $t1 is reused as scratch.  For $i<15 the next constant is
# fetched from $Tbl into $t1 near the end of the round; callers that
# pass a larger $i have to load it themselves.  $a0, $a1, $t0 and $t1
# are used as scratch.
# _ror is a made-up mnemonic and the backquoted pieces are Perl
# expressions; both are resolved by the post-processing loop at the end
# of this file.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
	_ror	$e,$Sigma1[0],$a0
	and	$f,$e,$t0
	_ror	$e,$Sigma1[1],$a1
	addl	$t1,$h,$h
	andcm	$g,$e,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
	or	$t0,$t1,$t1		; Ch(e,f,g)
	addl	@X[$i%16],$h,$h
	xor	$a0,$a1,$a1		; Sigma1(e)
	addl	$t1,$h,$h
	_ror	$a,$Sigma0[0],$a0
	addl	$a1,$h,$h

	_ror	$a,$Sigma0[1],$a1
	and	$a,$b,$t0
	and	$a,$c,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$t1,$t0,$t0
	and	$b,$c,$t1
	xor	$a0,$a1,$a1		; Sigma0(a)
	addl	$h,$d,$d
	xor	$t1,$t0,$t0		; Maj(a,b,c)
	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
	addl	$a1,$h,$h
	addl	$t0,$h,$h

___
}
132
# Append one round including the message schedule update to $code.
#   $i     - round index, 16 or above; it is reduced by 16 and then
#            used to index @X directly
#   $a..$h - registers currently holding the eight working variables
# The generated code computes
#   X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])    (indices mod 16)
# and fetches the next round constant into $t1.  For the last round of
# a 16-round group (reduced $i == 15) it additionally compares the low
# 10 bits of that constant with $LAST10BITS and conditionally adds 1 to
# $Tbl to signal the end of the table.  The round proper is then emitted
# by ROUND_00_15, which is called with an index >= 16 so that it does
# not load a constant of its own.
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
	_ror	@X[($i+1)%16],$sigma0[0],$a0
	_ror	@X[($i+1)%16],$sigma0[1],$a1
	addl	@X[($i+9)%16],@X[$i],@X[$i]
	_ror	@X[($i+14)%16],$sigma1[0],$t0
	_ror	@X[($i+14)%16],$sigma1[1],$t1
	xor	$a1,$a0,$a0
	_shr	@X[($i+1)%16],$sigma0[2],$a1
	xor	$t1,$t0,$t0
	_shr	@X[($i+14)%16],$sigma1[2],$t1
	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
	$LDM	$SZ($Tbl),$t1
	addl	$a0,@X[$i],@X[$i]
	addl	$t0,@X[$i],@X[$i]
___
# Only emitted for the last round of the group: end-of-table detection.
$code.=<<___ if ($i==15);
	extru	$t1,31,10,$a1
	comiclr,<> $LAST10BITS,$a1,%r0
	ldo	1($Tbl),$Tbl		; signal end of $Tbl
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
159
# Start of the generated module: assembler level, code subspace and the
# 64-byte aligned label L$table that the round constants follow.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.ALIGN	64
L\$table
___
# SHA-512 round constants: 80 64-bit values, each emitted as two 32-bit
# words (two constants per line).  The low 10 bits of the very last word
# (0x4a475817) are what $LAST10BITS is set to for SHA-512.
$code.=<<___ if ($SZ==8);
	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
# SHA-256 round constants: 64 32-bit values, four per line.  The low
# 10 bits of the last one (0xc67178f2) are what $LAST10BITS is set to
# for SHA-256.
$code.=<<___ if ($SZ==4);
	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
# Function entry point.  The generated code
#  - saves %r2 and %r3..%r18 in a newly allocated frame of $FRAME bytes,
#  - turns $num from a block count into a pointer to the end of the
#    input (a block is 16*$SZ bytes),
#  - saves the three arguments in the frame, because their registers
#    are reused as scratch by the round code,
#  - derives the address of L$table from the address deposited by blr,
#    after clearing its two low (privilege level) bits.
$code.=<<___;

	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	64
$func
	.PROC
	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)

	_shl	$num,`log(16*$SZ)/log(2)`,$num
	addl	$inp,$num,$num		; $num to point at the end of $inp

	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)

	blr	%r0,$Tbl
	ldi	3,$t1
L\$pic
	andcm	$Tbl,$t1,$Tbl		; wipe privilege level
	ldo	L\$table-L\$pic($Tbl),$Tbl
___
# SHA-512 in a 32-bit build only: run-time processor check that sends
# PA-RISC 1.x processors to the 32-bit code at L$parisc1.  The extrd is
# a PA-RISC 2.0 instruction which, per the emitted comment, still
# executes on PA-RISC 1.0; it is hand-encoded by the post-processing
# code at the end of this file.
# NOTE(review): presumably the ,*= completer nullifies the following
# branch on a 2.0 processor - confirm against the architecture manual.
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
	ldi	31,$t1
	mtctl	$t1,%cr11
	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
	b	L\$parisc1
	nop
___
# Load the eight working variables from the context and program %sar
# with the alignment factor derived from the low bits of $inp:
# 8*$SZ - 8*(offset of $inp within a word).
# L$oop is the head of the per-block loop: it fetches the first round
# constant into $t1 and computes the word-aligned input address in $t0.
$code.=<<___;
	$LD	`0*$SZ`($ctx),$A	; load context
	$LD	`1*$SZ`($ctx),$B
	$LD	`2*$SZ`($ctx),$C
	$LD	`3*$SZ`($ctx),$D
	$LD	`4*$SZ`($ctx),$E
	$LD	`5*$SZ`($ctx),$F
	$LD	`6*$SZ`($ctx),$G
	$LD	`7*$SZ`($ctx),$H

	extru	$inp,31,`log($SZ)/log(2)`,$t0
	sh3addl	$t0,%r0,$t0
	subi	`8*$SZ`,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop
	ldi	`$SZ-1`,$t0
	$LDM	$SZ($Tbl),$t1
	andcm	$inp,$t0,$t0		; align $inp
___
	# The first 15 words are loaded unconditionally ...
	for ($i=0;$i<15;$i++) {		# load input block
	$code.="\t$LD	`$SZ*$i`($t0),@X[$i]\n";		}
# ... the 16th is loaded next to the branch that skips the alignment
# code when $inp equals its aligned value, and the 17th (into @X[16],
# i.e. $inp's register) only when alignment is needed.
$code.=<<___;
	cmpb,*=	$inp,$t0,L\$aligned
	$LD	`$SZ*15`($t0),@X[15]
	$LD	`$SZ*16`($t0),@X[16]
___
	# Each word is combined with its successor through the made-up
	# _align instruction, which is translated to a %sar-controlled
	# shift by the post-processing loop.
	for ($i=0;$i<16;$i++) {		# align data
	$code.="\t_align	@X[$i],@X[$i+1],@X[$i]\n";	}
$code.=<<___;
L\$aligned
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___
307
# Rounds 0..15 are emitted once.  After every round @V is rotated by one
# position so that the register roles shift (a becomes b, and so on).
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
L\$rounds
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___
# Sixteen rounds with message schedule update form the body of the
# L$rounds loop, which the generated code repeats until the end of the
# constant table has been signalled through the low bit of $Tbl.
for(;$i<32;$i++)	{ &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
# End of block: restore the arguments saved by the prologue, rewind $Tbl
# (the extra -1 undoes the end-of-table signal), add the previous
# context to the working variables, store the result, advance $inp by
# one block and loop while $inp differs from $num.  The updated $inp is
# saved back to the frame by the last instruction.
$code.=<<___;
	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
	nop

	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl

	$LD	`0*$SZ`($ctx),@X[0]	; load context
	$LD	`1*$SZ`($ctx),@X[1]
	$LD	`2*$SZ`($ctx),@X[2]
	$LD	`3*$SZ`($ctx),@X[3]
	$LD	`4*$SZ`($ctx),@X[4]
	$LD	`5*$SZ`($ctx),@X[5]
	addl	@X[0],$A,$A
	$LD	`6*$SZ`($ctx),@X[6]
	addl	@X[1],$B,$B
	$LD	`7*$SZ`($ctx),@X[7]
	ldo	`16*$SZ`($inp),$inp	; advance $inp

	$ST	$A,`0*$SZ`($ctx)	; save context
	addl	@X[2],$C,$C
	$ST	$B,`1*$SZ`($ctx)
	addl	@X[3],$D,$D
	$ST	$C,`2*$SZ`($ctx)
	addl	@X[4],$E,$E
	$ST	$D,`3*$SZ`($ctx)
	addl	@X[5],$F,$F
	$ST	$E,`4*$SZ`($ctx)
	addl	@X[6],$G,$G
	$ST	$F,`5*$SZ`($ctx)
	addl	@X[7],$H,$H
	$ST	$G,`6*$SZ`($ctx)
	$ST	$H,`7*$SZ`($ctx)

	cmpb,*<>,n $inp,$num,L\$oop
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
___
# Everything down to the matching "}}" is generated only for SHA-512 in
# a 32-bit build.  It provides a second code path, entered at
# L$parisc1, in which every 64-bit quantity is handled as a pair of
# 32-bit registers (high word, low word).
if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
{{
# The code emitted so far falls through to this point, so it first has
# to jump over the 32-bit path to the common epilogue.
$code.=<<___;
	b	L\$done
	nop

	.ALIGN	64
L\$parisc1
___

# Register allocation for this path replaces the one used above: the
# eight working variables occupy %r1..%r16 as hi/lo pairs ...
@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) =
   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
# ... these eight are scratch registers ...
$a0 ="%r17";
$a1 ="%r18";
$a2 ="%r19";
$a3 ="%r20";
$t0 ="%r21";
$t1 ="%r22";
$t2 ="%r28";
$t3 ="%r29";
$Tbl="%r31";

# ... and only two message words (current and next, as hi/lo pairs) are
# kept in registers; the 16-word schedule itself lives in the frame at
# -$XOFF(%sp).
@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx
378
# Append one round for the 32-bit code path to $code.
#   $i                 - round index (0..15)
#   $ahi,$alo..$hhi,$hlo - hi/lo register pairs currently holding the
#                        eight working variables
#   $flag              - true when called from ROUND_16_xx_pa1, which
#                        has already loaded X[i+1]; when false, X[i+1]
#                        is loaded from the frame here
# The first pair of @X holds X[i] on entry and is reused for K[i] once
# X[i] has been added to h; the second pair receives X[i+1].  @X is
# rotated by two entries at the end so that the next call finds X[i+1]
# in the first pair.  64-bit additions are done as add/addc pairs and
# 64-bit rotations as pairs of shd instructions; shd counts above 31
# are fixed up by the post-processing loop at the end of this file.
# For $i==15 with $flag set, the low 10 bits of the low word of K[i] are
# compared with $LAST10BITS to decide whether to branch back to
# L$rounds_pa1.
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;

$code.=<<___ if (!$flag);
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
___
$code.=<<___;
	shd	$ehi,$elo,$Sigma1[0],$t0
	 add	$Xlo,$hlo,$hlo
	shd	$elo,$ehi,$Sigma1[0],$t1
	 addc	$Xhi,$hhi,$hhi		; h += X[i]
	shd	$ehi,$elo,$Sigma1[1],$t2
	 ldwm	8($Tbl),$Xhi
	shd	$elo,$ehi,$Sigma1[1],$t3
	 ldw	-4($Tbl),$Xlo		; load K[i]
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	 and	$flo,$elo,$a0
	 and	$fhi,$ehi,$a1
	shd	$ehi,$elo,$Sigma1[2],$t2
	 andcm	$glo,$elo,$a2
	shd	$elo,$ehi,$Sigma1[2],$t3
	 andcm	$ghi,$ehi,$a3
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma1(e)
	add	$Xlo,$hlo,$hlo
	 xor	$a2,$a0,$a0
	addc	$Xhi,$hhi,$hhi		; h += K[i]
	 xor	$a3,$a1,$a1		; Ch(e,f,g)

	 add	$t0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[0],$t0
	 addc	$t1,$hhi,$hhi		; h += Sigma1(e)
	shd	$alo,$ahi,$Sigma0[0],$t1
	 add	$a0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[1],$t2
	 addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
	shd	$alo,$ahi,$Sigma0[1],$t3

	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	shd	$ahi,$alo,$Sigma0[2],$t2
	and	$alo,$blo,$a0
	shd	$alo,$ahi,$Sigma0[2],$t3
	and	$ahi,$bhi,$a1
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma0(a)

	and	$alo,$clo,$a2
	and	$ahi,$chi,$a3
	xor	$a2,$a0,$a0
	 add	$hlo,$dlo,$dlo
	xor	$a3,$a1,$a1
	 addc	$hhi,$dhi,$dhi		; d += h
	and	$blo,$clo,$a2
	 add	$t0,$hlo,$hlo
	and	$bhi,$chi,$a3
	 addc	$t1,$hhi,$hhi		; h += Sigma0(a)
	xor	$a2,$a0,$a0
	 add	$a0,$hlo,$hlo
	xor	$a3,$a1,$a1		; Maj(a,b,c)
	 addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)

___
# Loop control, emitted only after the last round of the L$rounds_pa1
# loop body.
$code.=<<___ if ($i==15 && $flag);
	extru	$Xlo,31,10,$Xlo
	comiclr,= $LAST10BITS,$Xlo,%r0
	b	L\$rounds_pa1
	nop
___
push(@X,shift(@X)); push(@X,shift(@X));
}
454
# Append one round including the message schedule update for the 32-bit
# code path to $code.
#   first argument - round index, 16 or above (reduced by 16 here)
#   remaining ones - the sixteen working-variable registers, which are
#                    passed on to ROUND_00_15_pa1 unchanged
# The first pair of @X holds X[i]; X[i+1] is loaded into the second
# pair and X[i+9], X[i+14] into scratch registers, all from the frame.
# The generated code computes
#   X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])    (indices mod 16)
# writes X[i] back to the frame and then continues with the round
# proper, emitted by ROUND_00_15_pa1 with its $flag argument set.
# Note that the emitted comment on the last xor says sigma0, although
# the value computed there is sigma1 of X[i+14] (it uses @sigma1).
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
$code.=<<___;
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
	 add	$a0,$Xlo,$Xlo
	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
	 addc	$a1,$Xhi,$Xhi
	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
	xor	$t2,$t0,$t0
	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
	xor	$t3,$t1,$t1
	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
	xor	$t2,$t0,$t0
	 shd	$a3,$a2,$sigma1[0],$a0
	xor	$t3,$t1,$t1		; sigma0(X[i+1)&0x0f])
	 shd	$a2,$a3,$sigma1[0],$a1
	add	$t0,$Xlo,$Xlo
	 shd	$a3,$a2,$sigma1[1],$t2
	addc	$t1,$Xhi,$Xhi
	 shd	$a2,$a3,$sigma1[1],$t3
	xor	$t2,$a0,$a0
	shd	$a3,$a2,$sigma1[2],$t2
	xor	$t3,$a1,$a1
	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
	xor	$t2,$a0,$a0
	xor	$t3,$a1,$a1		; sigma0(X[i+14)&0x0f])
	add	$a0,$Xlo,$Xlo
	addc	$a1,$Xhi,$Xhi

	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
&ROUND_00_15_pa1($i,@_,1);
}
# 32-bit code path: load the working variables as hi/lo pairs and
# program %sar with the alignment factor 32 - 8*(offset of $inp within a
# 32-bit word).
# L$oop_pa1 is the head of the per-block loop.  Aligned input is handled
# at L$aligned_pa1; otherwise $inp is rounded down to a word boundary
# and 33 input words are funnel-shifted (vshd) into 32 aligned ones,
# which are stored in the frame at -$XOFF(%sp).
# Only aligned words 0, 1 and 2 are produced here.  The generated loop
# that follows aligns word i+1 in its iteration i, starting at i=2, and
# its very first instruction sequence therefore ends in
# "vshd $t3,$a0,$t3".  Emitting that same instruction here as well, as
# this code used to, shifted word 3 twice and corrupted it for unaligned
# input.
$code.=<<___;
	ldw	`0*4`($ctx),$Ahi		; load context
	ldw	`1*4`($ctx),$Alo
	ldw	`2*4`($ctx),$Bhi
	ldw	`3*4`($ctx),$Blo
	ldw	`4*4`($ctx),$Chi
	ldw	`5*4`($ctx),$Clo
	ldw	`6*4`($ctx),$Dhi
	ldw	`7*4`($ctx),$Dlo
	ldw	`8*4`($ctx),$Ehi
	ldw	`9*4`($ctx),$Elo
	ldw	`10*4`($ctx),$Fhi
	ldw	`11*4`($ctx),$Flo
	ldw	`12*4`($ctx),$Ghi
	ldw	`13*4`($ctx),$Glo
	ldw	`14*4`($ctx),$Hhi
	ldw	`15*4`($ctx),$Hlo

	extru	$inp,31,2,$t0
	sh3addl	$t0,%r0,$t0
	subi	32,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop_pa1
	extru	$inp,31,2,$a3
	comib,=	0,$a3,L\$aligned_pa1
	sub	$inp,$a3,$inp

	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	vshd	$X[0],$X[1],$X[0]
	vshd	$X[1],$t2,$X[1]
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	vshd	$t2,$t3,$t2
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
___
# Remainder of the unaligned input collection.  @t lists the registers
# holding input words 2..9 and is rotated after every step, so $t[0] is
# always the word to be stored next.
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
# While input remains (words 10..32 are loaded here): store $t[0] as
# word $i, load input word 8+$i into the register just freed and
# funnel-shift $t[1]:$t[2] into $t[1].
for ($i=2;$i<=(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
# Everything is loaded by now: keep storing and shifting ...
for (;$i<(128/4-1);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
# ... and store the last word in the delay slot of the branch that
# skips the aligned variant.
$code.=<<___;
	b	L\$collected_pa1
	stw	$t[0],`-$XOFF+$i*4`(%sp)

___
}
# Aligned input: the 32 words of the block are simply copied to the
# frame at -$XOFF(%sp).  Words 0 and 1 are also left in $X[0] and $X[1],
# which is where the first round expects X[0].
$code.=<<___;
L\$aligned_pa1
	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
___
{
# @t lists the registers holding words 2..9 and is rotated after every
# step, so $t[0] is always the word to be stored next.
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
# Store word $i and reload the freed register with word 8+$i
# (words 10..31 are loaded here).
for ($i=2;$i<(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
___
push(@t,shift(@t));
}
# Store the eight words still held in registers.
for (;$i<128/4;$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
___
push(@t,shift(@t));
}
# Both the aligned and the unaligned variant continue here.
$code.="L\$collected_pa1\n";
}
598
# Rounds 0..15 are emitted once, followed by the 16-round body of the
# L$rounds_pa1 loop.  @V is rotated by two entries (one hi/lo pair)
# after every round so that the register roles shift.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.="L\$rounds_pa1\n";
for(;$i<32;$i++)	{ &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
602
# End of block on the 32-bit path: restore the arguments saved by the
# prologue, rewind $Tbl by the size of the constant table, add the
# previous context to the working variables (add/addc pairs, low word
# first), store the result and advance $inp by one block.  The loop is
# left for L$done when $inp has reached $num; otherwise the updated $inp
# is saved back to the frame in the delay slot of the branch to
# L$oop_pa1.
$code.=<<___;
	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl

	ldw	`0*4`($ctx),$t1		; update context
	ldw	`1*4`($ctx),$t0
	ldw	`2*4`($ctx),$t3
	ldw	`3*4`($ctx),$t2
	ldw	`4*4`($ctx),$a1
	ldw	`5*4`($ctx),$a0
	ldw	`6*4`($ctx),$a3
	add	$t0,$Alo,$Alo
	ldw	`7*4`($ctx),$a2
	addc	$t1,$Ahi,$Ahi
	ldw	`8*4`($ctx),$t1
	add	$t2,$Blo,$Blo
	ldw	`9*4`($ctx),$t0
	addc	$t3,$Bhi,$Bhi
	ldw	`10*4`($ctx),$t3
	add	$a0,$Clo,$Clo
	ldw	`11*4`($ctx),$t2
	addc	$a1,$Chi,$Chi
	ldw	`12*4`($ctx),$a1
	add	$a2,$Dlo,$Dlo
	ldw	`13*4`($ctx),$a0
	addc	$a3,$Dhi,$Dhi
	ldw	`14*4`($ctx),$a3
	add	$t0,$Elo,$Elo
	ldw	`15*4`($ctx),$a2
	addc	$t1,$Ehi,$Ehi
	stw	$Ahi,`0*4`($ctx)
	add	$t2,$Flo,$Flo
	stw	$Alo,`1*4`($ctx)
	addc	$t3,$Fhi,$Fhi
	stw	$Bhi,`2*4`($ctx)
	add	$a0,$Glo,$Glo
	stw	$Blo,`3*4`($ctx)
	addc	$a1,$Ghi,$Ghi
	stw	$Chi,`4*4`($ctx)
	add	$a2,$Hlo,$Hlo
	stw	$Clo,`5*4`($ctx)
	addc	$a3,$Hhi,$Hhi
	stw	$Dhi,`6*4`($ctx)
	ldo	`16*$SZ`($inp),$inp	; advance $inp
	stw	$Dlo,`7*4`($ctx)
	stw	$Ehi,`8*4`($ctx)
	stw	$Elo,`9*4`($ctx)
	stw	$Fhi,`10*4`($ctx)
	stw	$Flo,`11*4`($ctx)
	stw	$Ghi,`12*4`($ctx)
	stw	$Glo,`13*4`($ctx)
	stw	$Hhi,`14*4`($ctx)
	comb,=	$inp,$num,L\$done
	stw	$Hlo,`15*4`($ctx)
	b	L\$oop_pa1
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
L\$done
___
}}
# Common epilogue: reload the return pointer and %r4..%r18 from the
# frame, return through %r2 and, in the delay slot of the return,
# restore %r3 with a load that also releases the frame.  The module ends
# with an identification string.
$code.=<<___;
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
687
688# Explicitly encode PA-RISC 2.0 instructions used in this module, so
689# that it can be compiled with .LEVEL 1.0. It should be noted that I
690# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
691# directive...
692
# Hand-assemble an "ldd" (PA-RISC 2.0 doubleword load) instruction.
#   $mod  - completer as written in the source ("", ",ma" or ",mb")
#   $args - operand string, expected as "disp(%rB),%rT"
# Returns a ".WORD" line carrying the encoded instruction, with the
# original text appended as a comment.  Operands in any other form are
# returned as a plain, unencoded instruction line.
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    return "\t".$orig
	unless ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/); # format 3 suffices

    my ($disp,$base,$target) = ($1,$2,$3);
    my $opcode = (0x14<<26)|($base<<21)|($target<<16);
    # displacement: bits 3..12 go to the im10a field, bit 13 is the sign
    $opcode |= (($disp&0x1FF8)<<1)|(($disp>>13)&1);
    $opcode |= (1<<3) if ($mod =~ /^,m/);	# modify base register
    $opcode |= (1<<2) if ($mod =~ /^,mb/);	# ... before the access
    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
705
# Hand-assemble a "std" (PA-RISC 2.0 doubleword store) instruction.
#   $mod  - completer as written in the source ("", ",ma" or ",mb")
#   $args - operand string, expected as "%rR,disp(%rB)"
# Returns a ".WORD" line carrying the encoded instruction, with the
# original text appended as a comment.  Operands in any other form are
# returned as a plain, unencoded instruction line.
# The ,ma/,mb completers are encoded exactly as in the ldd encoder (std
# uses the same instruction format); they used to be silently dropped
# here, which would have turned "std,ma" into a plain store.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
	$opcode|=(1<<3) if ($mod =~ /^,m/);	# modify base register
	$opcode|=(1<<2) if ($mod =~ /^,mb/);	# ... before the access
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};
716
# Hand-assemble an "extrd" (PA-RISC 2.0 extract doubleword) instruction.
#   $mod  - completers as written in the source
#   $args - operand string, either "%rR,pos,len,%rT" (fixed position)
#           or "%rR,%sar,len,%rT" (position taken from %sar)
# Returns a ".WORD" line carrying the encoded instruction, with the
# original text appended as a comment.  Operands in any other form are
# returned as a plain, unencoded instruction line.
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";
  my $opcode;

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my ($src,$pos,$dst) = ($1,$2,$4);
	my $len = 32-$3;
	$opcode  = (0x36<<26)|($src<<21)|($dst<<16);
	$opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my ($src,$dst) = ($1,$3);
	my $len = 32-$2;
	$opcode  = (0x34<<26)|($src<<21)|($dst<<16)|(2<<11)|(1<<9);
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);	# "=" condition
    }
    else
    {	return "\t".$orig;	}

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
738
# Hand-assemble a "shrpd" (PA-RISC 2.0 shift right pair doubleword)
# instruction.
#   $mod  - completers as written in the source
#   $args - operand string, either "%rA,%rB,count,%rT" (fixed count) or
#           "%rA,%rB,%sar,%rT" (count taken from %sar)
# Returns a ".WORD" line carrying the encoded instruction, with the
# original text appended as a comment.  Operands in any other form are
# returned as a plain, unencoded instruction line.
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";
  my $opcode;

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	my ($first,$second,$dst) = ($1,$2,$4);
	my $cpos = 63-$3;
	$opcode  = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$dst;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
    }
    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
    {	my ($first,$second,$dst) = ($1,$2,$3);
	$opcode  = (0x34<<26)|($second<<21)|($first<<16)|(1<<9)|$dst;
    }
    else
    {	return "\t".$orig;	}

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
755
# Produce the output line for one instruction.
#   $mnemonic - instruction name without completers
#   $mod      - completers, including the leading comma, or ""
#   $args     - operand string
# If a variable named after the mnemonic holds a code reference (one of
# the hand-written encoders above), the line is produced by that
# encoder; otherwise the instruction is emitted as plain text.  The
# variable is looked up by name, hence the string eval.
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my $encoder = eval("\$$mnemonic");

    return &$encoder($mod,$args) if (ref($encoder) eq 'CODE');

    return "\t$mnemonic$mod\t$args";
}
762
# Post-process the accumulated template line by line and print the
# result to STDOUT.
foreach (split("\n",$code)) {
	# evaluate the Perl expressions embedded between backquotes
	s/\`([^\`]*)\`/eval $1/ge;

	# The substitutions below are chained with "or", so at most one
	# of them is applied to a line.  First one: an shd with a count
	# above 31 gets its source registers swapped and the count
	# reduced by 32.
	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
	# translate made up instructions: _ror, _shr, _align, _shl
	s/_ror(\s+)(%r[0-9]+),/
		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or

	s/_shr(\s+%r[0-9]+),([0-9]+),/
		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or

	s/_align(\s+%r[0-9]+,%r[0-9]+),/
		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or

	s/_shl(\s+%r[0-9]+),([0-9]+),/
		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;

	# 32-bit builds only: give the hand-written encoders a chance to
	# replace PA-RISC 2.0 instructions with .WORD directives ...
	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);

	# ... and turn cmpb,* into comb.
	s/cmpb,\*/comb,/ if ($SIZE_T==4);

	print $_,"\n";
}
790
# Output is buffered, so a write error (e.g. a full disk) may only
# surface here; report it instead of leaving a truncated assembly file
# behind unnoticed.
close STDOUT or die "error closing STDOUT: $!";
792