# sha256-armv4.pl revision 221304ee937bc0910948a8be1320cb8cc4eb6d36
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 block procedure for ARMv4. May 2007.
11
12# Performance is ~2x better than gcc 3.4 generated code and in "abso-
13# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14# byte.
15
# Redirect STDOUT to the output file named by the first command-line
# argument (the build system passes e.g. sha256-armv4.S); all later
# prints of the generated assembly then go to that file.
# Fix: use 3-arg open (the interpolated 2-arg form is mode-injectable)
# and fail loudly instead of silently generating nothing.
$output=shift;
open STDOUT,">",$output or die "can't open $output: $!";
18
# Register allocation.  $t0/$t1 deliberately alias $ctx/$len: ctx and
# len are only needed at entry (both are saved on the stack by the
# prologue), which frees r0/r2 as scratch registers during the rounds.
$ctx="r0";	$t0="r0";
$inp="r1";
$len="r2";	$t1="r2";
$T1="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);	# working variables a..h, rotated once per round
$t2="r12";
$Ktbl="r14";			# running pointer into the K256 constant table

# Rotate/shift amounts for the SHA-256 Sigma0/Sigma1 (big) and
# sigma0/sigma1 (small) functions; for the small sigmas the third
# element is a logical shift (lsr), not a rotate.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
39
# Emit one round of the SHA-256 compression function.  For rounds
# 0..15 the 32-bit big-endian message word X[i] is first assembled
# byte by byte (ldrb copes with unaligned input).  Arguments: round
# index $i plus the 8 working variables, pre-rotated by the caller.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

# Byte-wise big-endian load of X[i]; the post-indexed ldrb advances
# $inp by 4.  After the final load (i==15) $inp has moved past the
# whole 64-byte block and is saved at sp+17*4 for the end-of-input
# comparison in the outer loop.
$code.=<<___ if ($i<16);
	ldrb	$T1,[$inp,#3]			@ $i
	ldrb	$t2,[$inp,#2]
	ldrb	$t1,[$inp,#1]
	ldrb	$t0,[$inp],#4
	orr	$T1,$T1,$t2,lsl#8
	orr	$T1,$T1,$t1,lsl#16
	orr	$T1,$T1,$t0,lsl#24
	`"str	$inp,[sp,#17*4]"	if ($i==15)`
___
# Common round body (FIPS 180-4):
#   T1 = X[i] + K[i] + h + Sigma1(e) + Ch(e,f,g)
#   h  = Sigma0(a) + Maj(a,b,c);  d += T1;  h += T1
# X[i] is also stored into the 16-word circular buffer at sp for the
# message-schedule computation in later rounds.
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	str	$T1,[sp,#`$i%16`*4]
	mov	$t0,$e,ror#$Sigma1[0]
	eor	$t0,$t0,$e,ror#$Sigma1[1]
	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
	add	$T1,$T1,$t0
	eor	$t1,$f,$g
	and	$t1,$t1,$e
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	add	$T1,$T1,$t1
	add	$T1,$T1,$h
	add	$T1,$T1,$t2
	mov	$h,$a,ror#$Sigma0[0]
	eor	$h,$h,$a,ror#$Sigma0[1]
	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
	orr	$t0,$a,$b
	and	$t0,$t0,$c
	and	$t1,$a,$b
	orr	$t0,$t0,$t1			@ Maj(a,b,c)
	add	$h,$h,$t0
	add	$d,$d,$T1
	add	$h,$h,$T1
___
}
78
# Emit one round for rounds 16..63: first compute the message-schedule
# word  X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
# (all indices taken mod 16 in the circular buffer at sp), then fall
# through to the shared round body in BODY_00_15.  $inp is free as
# scratch here; its saved copy lives at sp+17*4.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	ldr	$t1,[sp,#`($i+1)%16`*4]	@ $i
	ldr	$t2,[sp,#`($i+14)%16`*4]
	ldr	$T1,[sp,#`($i+0)%16`*4]
	ldr	$inp,[sp,#`($i+9)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	mov	$t1,$t2,ror#$sigma1[0]
	eor	$t1,$t1,$t2,ror#$sigma1[1]
	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
	add	$T1,$T1,$t0
	add	$T1,$T1,$t1
	add	$T1,$T1,$inp
___
	&BODY_00_15(@_);
}
99
# Static preamble: the K256 round-constant table (placed immediately
# before the entry point so it can be located PC-relatively from the
# prologue) followed by the function prologue.  Stack frame after the
# prologue: sp+0..15*4 = circular X[] buffer, sp+16*4 = saved ctx,
# sp+17*4 = saved inp (running), sp+18*4 = saved inp+len.
$code=<<___;
.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r12,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256		@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
___
# Fully unroll rounds 0..15, then emit rounds 16..31 once under the
# .Lrounds_16_xx label.  @V is rotated one position per round so the
# same round text serves every round; after 16 rounds the register
# assignment is back where it started, so the 16..31 block doubles as
# a loop body executed three times to cover rounds 16..63.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Loop termination trick: $t2 (r12) still holds the round constant
# loaded last; 0xf2 is the low byte of K256[63] (0xc67178f2), i.e. the
# round loop exits exactly when the table has been consumed.  The
# epilogue then adds the working variables back into the context,
# advances inp, and either loops for the next block or unwinds the
# frame (16 X words + saved ctx/inp/len = 19 words) and returns.
$code.=<<___;
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2
	bne	.Lrounds_16_xx

	ldr	$T1,[sp,#16*4]		@ pull ctx
	ldr	$t0,[$T1,#0]
	ldr	$t1,[$T1,#4]
	ldr	$t2,[$T1,#8]
	add	$A,$A,$t0
	ldr	$t0,[$T1,#12]
	add	$B,$B,$t1
	ldr	$t1,[$T1,#16]
	add	$C,$C,$t2
	ldr	$t2,[$T1,#20]
	add	$D,$D,$t0
	ldr	$t0,[$T1,#24]
	add	$E,$E,$t1
	ldr	$t1,[$T1,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$T1,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size   sha256_block_data_order,.-sha256_block_data_order
.asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
177
# Post-process and emit the generated text: expand every `...` span by
# evaluating it as Perl (this resolves the computed stack offsets and
# the conditional "str" in BODY_00_15), then rewrite "bx lr" as its
# raw instruction encoding so the output also assembles with
# -march=armv4 (assemblers reject the mnemonic pre-ARMv4T).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Fix: check close() — STDOUT is a buffered write handle redirected to
# the output file, and write errors (e.g. ENOSPC) only surface here.
close STDOUT or die "error closing STDOUT: $!";	# enforce flush
182