sha1-ppc.pl revision 221304ee937bc0910948a8be1320cb8cc4eb6d36
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input(*), except on page boundaries
11# (see below for details). Otherwise straightforward implementation
12# with X vector in register bank. The module is big-endian [which is
13# not big deal as there're no little-endian targets left around].
14#
15# (*) this means that this module is inappropriate for PPC403? Does
16#     anybody know if pre-POWER3 can sustain unaligned load?
17
18# 			-m64	-m32
19# ----------------------------------
20# PPC970,gcc-4.0.0	+76%	+59%
21# Power6,xlc-7		+68%	+33%
22
23$flavour = shift;
24
25if ($flavour =~ /64/) {
26	$SIZE_T	=8;
27	$UCMP	="cmpld";
28	$STU	="stdu";
29	$POP	="ld";
30	$PUSH	="std";
31} elsif ($flavour =~ /32/) {
32	$SIZE_T	=4;
33	$UCMP	="cmplw";
34	$STU	="stwu";
35	$POP	="lwz";
36	$PUSH	="stw";
37} else { die "nonsense $flavour"; }
38
39$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
41( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
42die "can't locate ppc-xlate.pl";
43
44open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
45
46$FRAME=24*$SIZE_T;
47
48$K  ="r0";
49$sp ="r1";
50$toc="r2";
51$ctx="r3";
52$inp="r4";
53$num="r5";
54$t0 ="r15";
55$t1 ="r6";
56
57$A  ="r7";
58$B  ="r8";
59$C  ="r9";
60$D  ="r10";
61$E  ="r11";
62$T  ="r12";
63
64@V=($A,$B,$C,$D,$E,$T);
65@X=("r16","r17","r18","r19","r20","r21","r22","r23",
66    "r24","r25","r26","r27","r28","r29","r30","r31");
67
68sub BODY_00_19 {
69my ($i,$a,$b,$c,$d,$e,$f)=@_;
70my $j=$i+1;
71$code.=<<___ if ($i==0);
72	lwz	@X[$i],`$i*4`($inp)
73___
74$code.=<<___ if ($i<15);
75	lwz	@X[$j],`$j*4`($inp)
76	add	$f,$K,$e
77	rotlwi	$e,$a,5
78	add	$f,$f,@X[$i]
79	and	$t0,$c,$b
80	add	$f,$f,$e
81	andc	$t1,$d,$b
82	rotlwi	$b,$b,30
83	or	$t0,$t0,$t1
84	add	$f,$f,$t0
85___
86$code.=<<___ if ($i>=15);
87	add	$f,$K,$e
88	rotlwi	$e,$a,5
89	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
90	add	$f,$f,@X[$i%16]
91	and	$t0,$c,$b
92	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
93	add	$f,$f,$e
94	andc	$t1,$d,$b
95	rotlwi	$b,$b,30
96	or	$t0,$t0,$t1
97	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
98	add	$f,$f,$t0
99	rotlwi	@X[$j%16],@X[$j%16],1
100___
101}
102
103sub BODY_20_39 {
104my ($i,$a,$b,$c,$d,$e,$f)=@_;
105my $j=$i+1;
106$code.=<<___ if ($i<79);
107	add	$f,$K,$e
108	rotlwi	$e,$a,5
109	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
110	add	$f,$f,@X[$i%16]
111	xor	$t0,$b,$c
112	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
113	add	$f,$f,$e
114	rotlwi	$b,$b,30
115	xor	$t0,$t0,$d
116	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
117	add	$f,$f,$t0
118	rotlwi	@X[$j%16],@X[$j%16],1
119___
120$code.=<<___ if ($i==79);
121	add	$f,$K,$e
122	rotlwi	$e,$a,5
123	lwz	r16,0($ctx)
124	add	$f,$f,@X[$i%16]
125	xor	$t0,$b,$c
126	lwz	r17,4($ctx)
127	add	$f,$f,$e
128	rotlwi	$b,$b,30
129	lwz	r18,8($ctx)
130	xor	$t0,$t0,$d
131	lwz	r19,12($ctx)
132	add	$f,$f,$t0
133	lwz	r20,16($ctx)
134___
135}
136
137sub BODY_40_59 {
138my ($i,$a,$b,$c,$d,$e,$f)=@_;
139my $j=$i+1;
140$code.=<<___;
141	add	$f,$K,$e
142	rotlwi	$e,$a,5
143	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
144	add	$f,$f,@X[$i%16]
145	and	$t0,$b,$c
146	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
147	add	$f,$f,$e
148	or	$t1,$b,$c
149	rotlwi	$b,$b,30
150	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
151	and	$t1,$t1,$d
152	or	$t0,$t0,$t1
153	rotlwi	@X[$j%16],@X[$j%16],1
154	add	$f,$f,$t0
155___
156}
157
158$code=<<___;
159.machine	"any"
160.text
161
162.globl	.sha1_block_data_order
163.align	4
164.sha1_block_data_order:
165	mflr	r0
166	$STU	$sp,`-($FRAME+64)`($sp)
167	$PUSH	r0,`$FRAME-$SIZE_T*18`($sp)
168	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
169	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
170	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
171	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
172	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
173	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
174	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
175	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
176	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
177	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
178	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
179	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
180	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
181	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
182	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
183	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
184	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
185	lwz	$A,0($ctx)
186	lwz	$B,4($ctx)
187	lwz	$C,8($ctx)
188	lwz	$D,12($ctx)
189	lwz	$E,16($ctx)
190	andi.	r0,$inp,3
191	bne	Lunaligned
192Laligned:
193	mtctr	$num
194	bl	Lsha1_block_private
195Ldone:
196	$POP	r0,`$FRAME-$SIZE_T*18`($sp)
197	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
198	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
199	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
200	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
201	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
202	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
203	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
204	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
205	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
206	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
207	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
208	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
209	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
210	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
211	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
212	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
213	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
214	mtlr	r0
215	addi	$sp,$sp,`$FRAME+64`
216	blr
217___
218
219# PowerPC specification allows an implementation to be ill-behaved
220# upon unaligned access which crosses page boundary. "Better safe
221# than sorry" principle makes me treat it specially. But I don't
222# look for particular offending word, but rather for 64-byte input
223# block which crosses the boundary. Once found that block is aligned
224# and hashed separately...
225$code.=<<___;
226.align	4
227Lunaligned:
228	subfic	$t1,$inp,4096
229	andi.	$t1,$t1,4095	; distance to closest page boundary
230	srwi.	$t1,$t1,6	; t1/=64
231	beq	Lcross_page
232	$UCMP	$num,$t1
233	ble-	Laligned	; didn't cross the page boundary
234	mtctr	$t1
235	subfc	$num,$t1,$num
236	bl	Lsha1_block_private
237Lcross_page:
238	li	$t1,16
239	mtctr	$t1
240	addi	r20,$sp,$FRAME	; spot below the frame
241Lmemcpy:
242	lbz	r16,0($inp)
243	lbz	r17,1($inp)
244	lbz	r18,2($inp)
245	lbz	r19,3($inp)
246	addi	$inp,$inp,4
247	stb	r16,0(r20)
248	stb	r17,1(r20)
249	stb	r18,2(r20)
250	stb	r19,3(r20)
251	addi	r20,r20,4
252	bdnz	Lmemcpy
253
254	$PUSH	$inp,`$FRAME-$SIZE_T*19`($sp)
255	li	$t1,1
256	addi	$inp,$sp,$FRAME
257	mtctr	$t1
258	bl	Lsha1_block_private
259	$POP	$inp,`$FRAME-$SIZE_T*19`($sp)
260	addic.	$num,$num,-1
261	bne-	Lunaligned
262	b	Ldone
263___
264
265# This is private block function, which uses tailored calling
266# interface, namely upon entry SHA_CTX is pre-loaded to given
267# registers and counter register contains amount of chunks to
268# digest...
269$code.=<<___;
270.align	4
271Lsha1_block_private:
272___
273$code.=<<___;	# load K_00_19
274	lis	$K,0x5a82
275	ori	$K,$K,0x7999
276___
277for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
278$code.=<<___;	# load K_20_39
279	lis	$K,0x6ed9
280	ori	$K,$K,0xeba1
281___
282for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
283$code.=<<___;	# load K_40_59
284	lis	$K,0x8f1b
285	ori	$K,$K,0xbcdc
286___
287for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
288$code.=<<___;	# load K_60_79
289	lis	$K,0xca62
290	ori	$K,$K,0xc1d6
291___
292for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
293$code.=<<___;
294	add	r16,r16,$E
295	add	r17,r17,$T
296	add	r18,r18,$A
297	add	r19,r19,$B
298	add	r20,r20,$C
299	stw	r16,0($ctx)
300	mr	$A,r16
301	stw	r17,4($ctx)
302	mr	$B,r17
303	stw	r18,8($ctx)
304	mr	$C,r18
305	stw	r19,12($ctx)
306	mr	$D,r19
307	stw	r20,16($ctx)
308	mr	$E,r20
309	addi	$inp,$inp,`16*4`
310	bdnz-	Lsha1_block_private
311	blr
312___
313$code.=<<___;
314.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
315___
316
317$code =~ s/\`([^\`]*)\`/eval $1/gem;
318print $code;
319close STDOUT;
320