1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input(*), except on page boundaries
11# (see below for details). Otherwise straightforward implementation
12# with X vector in register bank. The module is big-endian [which is
# not a big deal as there are no little-endian targets left around].
14#
15# (*) this means that this module is inappropriate for PPC403? Does
16#     anybody know if pre-POWER3 can sustain unaligned load?
17
18# 			-m64	-m32
19# ----------------------------------
20# PPC970,gcc-4.0.0	+76%	+59%
21# Power6,xlc-7		+68%	+33%
22
# Command line: <flavour> [<extra argument>]
#
# The first argument selects the target flavour.  Anything matching /64/
# selects the 64-bit settings, anything matching /32/ the 32-bit ones, and
# every other value is rejected.  The second argument is handed verbatim to
# ppc-xlate.pl (presumably the output file name -- confirm against
# ppc-xlate.pl).
$flavour = shift;

# Per-flavour parameters used by the assembly templates further down:
#   $SIZE_T  size in bytes of one register save slot in the stack frame
#   $LRSAVE  offset of the link register save slot (used as $FRAME+$LRSAVE)
#   $UCMP    unsigned compare mnemonic
#   $STU     store-with-update mnemonic used to allocate the frame
#   $POP     load mnemonic used to restore registers
#   $PUSH    store mnemonic used to save registers
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Look for ppc-xlate.pl next to this script first, then in ../../perlasm/
# relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The open must be checked with low-precedence "or": with "||" the test would
# apply to the (always true) command string and a failed open would go
# unnoticed.
open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
47
# Stack frame geometry, in bytes.  $LOCALS is the offset from the stack
# pointer of the scratch area that the unaligned path copies one 64-byte
# input block into; $FRAME is the total amount subtracted from the stack
# pointer on entry.
$LOCALS=$SIZE_T*6;
$FRAME=$SIZE_T*24+64;

# Fixed register roles r0..r6: round constant, stack pointer, TOC, the three
# function arguments (context, input pointer, block count) and one temporary.
($K,$sp,$toc,$ctx,$inp,$num,$t1)=map("r$_",0..6);
# Second temporary lives in a register that the prologue saves.
$t0="r15";

# Working variables a..e plus one spare; the round generators receive this
# list, which the caller rotates after every round.
@V=map("r$_",7..12);
($A,$B,$C,$D,$E,$T)=@V;

# Sixteen-word message schedule window, kept entirely in registers.
@X=map("r$_",16..31);
70
# Append the instructions for one SHA-1 round of the 0..19 group to $code.
#
#   $step      round number
#   $wa..$we   registers currently holding the working variables a..e
#   $wf        spare register that receives
#              e + K + X[step] + rotl(a,5) + ((b&c)|(~b&d))
#
# The emitted code also rotates $wb left by 30 in place and overwrites $we
# with rotl(a,5).
#
# Rounds below 15 additionally load message word step+1 from the input
# (round 0 loads word 0 as well).  From round 15 on the next schedule word is
# computed in place instead: X[j] = rotl(X[j]^X[j+2]^X[j+8]^X[j+13],1) with
# j=step+1 and all indices taken modulo 16.
sub BODY_00_19 {
	my ($step,$wa,$wb,$wc,$wd,$we,$wf)=@_;
	my $next=$step+1;
	my ($Xi,$Xn)=@X[$step%16,$next%16];

	if ($step<15) {
		# Very first round: nothing has fetched word 0 yet.
		$code.="\tlwz\t$Xi,`$step*4`($inp)\n" if ($step==0);
		$code.=<<___;
	lwz	$Xn,`$next*4`($inp)
	add	$wf,$K,$we
	rotlwi	$we,$wa,5
	add	$wf,$wf,$Xi
	and	$t0,$wc,$wb
	add	$wf,$wf,$we
	andc	$t1,$wd,$wb
	rotlwi	$wb,$wb,30
	or	$t0,$t0,$t1
	add	$wf,$wf,$t0
___
	} else {
		my ($X2,$X8,$X13)=@X[map { ($next+$_)%16 } (2,8,13)];
		$code.=<<___;
	add	$wf,$K,$we
	rotlwi	$we,$wa,5
	xor	$Xn,$Xn,$X2
	add	$wf,$wf,$Xi
	and	$t0,$wc,$wb
	xor	$Xn,$Xn,$X8
	add	$wf,$wf,$we
	andc	$t1,$wd,$wb
	rotlwi	$wb,$wb,30
	or	$t0,$t0,$t1
	xor	$Xn,$Xn,$X13
	add	$wf,$wf,$t0
	rotlwi	$Xn,$Xn,1
___
	}
}
105
# Append the instructions for one "parity" SHA-1 round to $code.  The caller
# uses this generator for rounds 20..39 and again for rounds 60..79.
#
#   $step      round number
#   $wa..$we   registers currently holding the working variables a..e
#   $wf        spare register that receives
#              e + K + X[step] + rotl(a,5) + (b^c^d)
#
# The emitted code also rotates $wb left by 30 in place and overwrites $we
# with rotl(a,5).
#
# For every round before 79 the next schedule word is updated in place:
# X[j] = rotl(X[j]^X[j+2]^X[j+8]^X[j+13],1) with j=step+1, indices modulo 16.
# Round 79 needs no further schedule word; its slots are used to load the
# five context words from $ctx into r16..r20 instead.  Any other round number
# emits nothing.
sub BODY_20_39 {
	my ($step,$wa,$wb,$wc,$wd,$we,$wf)=@_;
	my $next=$step+1;
	my $Xi=$X[$step%16];

	if ($step<79) {
		my ($Xn,$X2,$X8,$X13)=@X[map { ($next+$_)%16 } (0,2,8,13)];
		$code.=<<___;
	add	$wf,$K,$we
	rotlwi	$we,$wa,5
	xor	$Xn,$Xn,$X2
	add	$wf,$wf,$Xi
	xor	$t0,$wb,$wc
	xor	$Xn,$Xn,$X8
	add	$wf,$wf,$we
	rotlwi	$wb,$wb,30
	xor	$t0,$t0,$wd
	xor	$Xn,$Xn,$X13
	add	$wf,$wf,$t0
	rotlwi	$Xn,$Xn,1
___
	} elsif ($step==79) {
		$code.=<<___;
	add	$wf,$K,$we
	rotlwi	$we,$wa,5
	lwz	r16,0($ctx)
	add	$wf,$wf,$Xi
	xor	$t0,$wb,$wc
	lwz	r17,4($ctx)
	add	$wf,$wf,$we
	rotlwi	$wb,$wb,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$wd
	lwz	r19,12($ctx)
	add	$wf,$wf,$t0
	lwz	r20,16($ctx)
___
	}
}
139
# Append the instructions for one SHA-1 round of the 40..59 group to $code.
#
#   $step      round number
#   $wa..$we   registers currently holding the working variables a..e
#   $wf        spare register that receives
#              e + K + X[step] + rotl(a,5) + ((b&c)|((b|c)&d))
#
# The emitted code also rotates $wb left by 30 in place, overwrites $we with
# rotl(a,5) and updates the next schedule word in place:
# X[j] = rotl(X[j]^X[j+2]^X[j+8]^X[j+13],1) with j=step+1, indices modulo 16.
sub BODY_40_59 {
	my ($step,$wa,$wb,$wc,$wd,$we,$wf)=@_;
	my $next=$step+1;
	my $Xi=$X[$step%16];
	my ($Xn,$X2,$X8,$X13)=@X[map { ($next+$_)%16 } (0,2,8,13)];

	# One list element per emitted instruction; each becomes "\t<insn>\n".
	my @insns=(
		"add\t$wf,$K,$we",
		"rotlwi\t$we,$wa,5",
		"xor\t$Xn,$Xn,$X2",
		"add\t$wf,$wf,$Xi",
		"and\t$t0,$wb,$wc",
		"xor\t$Xn,$Xn,$X8",
		"add\t$wf,$wf,$we",
		"or\t$t1,$wb,$wc",
		"rotlwi\t$wb,$wb,30",
		"xor\t$Xn,$Xn,$X13",
		"and\t$t1,$t1,$wd",
		"or\t$t0,$t0,$t1",
		"rotlwi\t$Xn,$Xn,1",
		"add\t$wf,$wf,$t0",
	);
	$code.=join("",map("\t".$_."\n",@insns));
}
160
# Public entry point .sha1_block_data_order($ctx,$inp,$num).
#
# The generated prologue allocates $FRAME bytes of stack, saves the link
# register and r15..r31, and loads the five 32-bit state words from $ctx into
# $A..$E.  If the two low bits of $inp are clear, all $num blocks are handed
# to Lsha1_block_private in one call; otherwise the Lunaligned path (explained
# by the assembly comment in the template) is taken.  The epilogue restores
# the saved registers and releases the frame.
#
# The text between backticks is arithmetic that is evaluated at the very end
# of this script, not by the assembler.
$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
___
# Save r15..r31 in the topmost slots of the frame: r31 sits one slot below the
# top, r15 seventeen slots below it.
foreach my $gpr (15..31) {
	$code.="\t$PUSH\tr$gpr,`$FRAME-$SIZE_T*".(32-$gpr)."`($sp)\n";
}
$code.=<<___;
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
	b	Ldone

; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$LOCALS	; spot within the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
	li	$t1,1
	addi	$inp,$sp,$LOCALS
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
___
# Restore r15..r31 from the slots they were saved to.
foreach my $gpr (15..31) {
	$code.="\t$POP\tr$gpr,`$FRAME-$SIZE_T*".(32-$gpr)."`($sp)\n";
}
$code.=<<___;
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,3,0
	.long	0
___
269
# Lsha1_block_private is an internal routine with its own calling
# convention: on entry the SHA-1 state is already sitting in $A..$E and the
# count register holds the number of 64-byte blocks to process.
$code.=".align\t4\nLsha1_block_private:\n";

# The 80 rounds come in four groups of 20.  Each table row gives the upper
# and lower 16 bits of the group's round constant (loaded into $K with
# lis/ori), the generator for its rounds, and the round number at which the
# group ends.  @V is rotated by one position after every round so that the
# generators always receive the registers in a,b,c,d,e,spare order.
$i=0;
foreach my $group (
	["0x5a82","0x7999",\&BODY_00_19,20],	# K_00_19
	["0x6ed9","0xeba1",\&BODY_20_39,40],	# K_20_39
	["0x8f1b","0xbcdc",\&BODY_40_59,60],	# K_40_59
	["0xca62","0xc1d6",\&BODY_20_39,80],	# K_60_79
) {
	my ($hi,$lo,$emit,$end)=@$group;
	$code.="\tlis\t$K,$hi\n\tori\t$K,$K,$lo\n";
	while ($i<$end) {
		$emit->($i,@V);
		unshift(@V,pop(@V));
		$i++;
	}
}

{
	# Round 79 loaded the previous state words into r16..r20.  After 80
	# rotations of @V the new a..e values live in $E,$T,$A,$B,$C; add them
	# in, store the result back to the context and move it to $A..$E for
	# the next block.
	my @sum=map("r$_",16..20);
	my @final=($E,$T,$A,$B,$C);
	my @home=($A,$B,$C,$D,$E);

	foreach my $n (0..4) {
		$code.="\tadd\t$sum[$n],$sum[$n],$final[$n]\n";
	}
	foreach my $n (0..4) {
		$code.="\tstw\t$sum[$n],".(4*$n)."($ctx)\n";
		$code.="\tmr\t$home[$n],$sum[$n]\n";
	}
}

# Advance the input pointer by one block and loop while the count register
# is non-zero; the identification string closes the module.
$code.=<<___;
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
323
# Replace every `...` span in the template with the result of evaluating the
# Perl expression inside it (frame offsets and similar arithmetic), then send
# the finished assembly to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# STDOUT is a pipe into ppc-xlate.pl.  Buffered write errors and a non-zero
# exit of the translator are only reported by close, so its result has to be
# checked; otherwise a truncated or empty output would look like success.
# When only the child's exit status is at fault $! is 0, hence the fallback
# to $?.
close STDOUT
	or die($! ? "error closing STDOUT: $!"
		  : "error closing STDOUT: translator wait status $?");
327