1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 block procedure for ARMv4. May 2007.
11
# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].
15
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 22% improvement on
19# Cortex A8 core and ~20 cycles per processed byte.
20
# Locate the output file name on the command line.  Arguments are taken
# one at a time; anything that does not look like a plain "name.ext" file
# name is discarded, and the scan stops at the first match or when the
# arguments run out (leaving $output false).
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT into that file.  The three-argument form of open keeps
# the name from being interpreted as part of the mode, and a failure is
# fatal so that a build cannot carry on with a missing or stale output.
# Without a usable name the inherited STDOUT is left alone.
if (defined($output) && length($output)) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}
23
# Register allocation for the generated ARM code.  $t0 shares r0 with
# $ctx and $t1 shares r2 with $len: the generated entry code pushes both
# arguments on the stack, which frees their registers for scratch use.
($ctx,$inp,$len,$T1)=qw(r0 r1 r2 r3);
($t0,$t1,$t2)=($ctx,$len,"r12");

# Registers holding the eight working variables a..h.  @V lists them in
# that order and is rotated by the round generator after every round.
@V=qw(r4 r5 r6 r7 r8 r9 r10 r11);
($A,$B,$C,$D,$E,$F,$G,$H)=@V;

# Pointer that walks through the K256 constant table.
$Ktbl="r14";

# Bit counts used by the SHA-256 mixing functions.  The round code applies
# all three Sigma amounts as rotates; for the lower-case sigma functions
# the first two are rotates and the third is a logical shift right.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
44
# Append the assembly for one SHA-256 round to $code.
#
#   $round     round number; for rounds below 16 the code first assembles
#              the next message word from four input bytes (most
#              significant byte first) and advances $inp by 4
#   $ra..$rh   registers currently holding working variables a..h
#
# The emitted round stores the message word held in $T1 into slot
# round%16 of the stack window, accumulates
#     T1 += Sigma1(e) + h + Ch(e,f,g) + K[round]
# and finishes with d += T1 and h = Sigma0(a) + T1 + Maj(a,b,c).
# Round 15 additionally saves the advanced input pointer at [sp,#17*4].
# Back-ticked expressions are appended verbatim; they are evaluated when
# the completed $code is post-processed.
sub BODY_00_15 {
my ($round,$ra,$rb,$rc,$rd,$re,$rf,$rg,$rh) = @_;

if ($round<16) {
$code.=<<___;
	ldrb	$T1,[$inp,#3]			@ $round
	ldrb	$t2,[$inp,#2]
	ldrb	$t1,[$inp,#1]
	ldrb	$t0,[$inp],#4
	orr	$T1,$T1,$t2,lsl#8
	orr	$T1,$T1,$t1,lsl#16
	orr	$T1,$T1,$t0,lsl#24
	`"str	$inp,[sp,#17*4]"	if ($round==15)`
___
}

$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	mov	$t0,$re,ror#$Sigma1[0]
	str	$T1,[sp,#`$round%16`*4]
	eor	$t0,$t0,$re,ror#$Sigma1[1]
	eor	$t1,$rf,$rg
	eor	$t0,$t0,$re,ror#$Sigma1[2]	@ Sigma1(e)
	and	$t1,$t1,$re
	add	$T1,$T1,$t0
	eor	$t1,$t1,$rg			@ Ch(e,f,g)
	add	$T1,$T1,$rh
	mov	$rh,$ra,ror#$Sigma0[0]
	add	$T1,$T1,$t1
	eor	$rh,$rh,$ra,ror#$Sigma0[1]
	add	$T1,$T1,$t2
	eor	$rh,$rh,$ra,ror#$Sigma0[2]		@ Sigma0(a)
	orr	$t0,$ra,$rb
	and	$t1,$ra,$rb
	and	$t0,$t0,$rc
	add	$rh,$rh,$T1
	orr	$t0,$t0,$t1			@ Maj(a,b,c)
	add	$rd,$rd,$T1
	add	$rh,$rh,$t0
___
}
83
# Append the assembly for one round with round number 16 or above.
#
# Takes the same arguments as BODY_00_15 (round number followed by the
# registers holding a..h).  The emitted code first extends the message
# schedule inside the 16-word stack window, all slot numbers mod 16:
#     T1 = X[i] + sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])
# using $inp as a scratch register, and then the regular round body from
# BODY_00_15 follows, which stores T1 back into slot i.
sub BODY_16_XX {
my ($round) = @_;	# the register arguments are only forwarded

$code.=<<___;
	ldr	$t1,[sp,#`($round+1)%16`*4]		@ $round
	ldr	$t2,[sp,#`($round+14)%16`*4]
	ldr	$T1,[sp,#`($round+0)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	ldr	$inp,[sp,#`($round+9)%16`*4]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	mov	$t1,$t2,ror#$sigma1[0]
	add	$T1,$T1,$t0
	eor	$t1,$t1,$t2,ror#$sigma1[1]
	add	$T1,$T1,$inp
	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
	add	$T1,$T1,$t1
___
	return BODY_00_15(@_);
}
104
# Start of the generated assembly: the K256 table holding the 64 SHA-256
# round constants, followed directly by the entry sequence of
# sha256_block_data_order.  The entry code depends on that layout: it
# derives its own address from pc and steps back 256 bytes (64 words) to
# find K256.
#
# On entry $ctx (r0), $inp (r1) and $len (r2) hold the arguments; $len is
# scaled by 64 and added to $inp so that it points at the end of the
# input.  The three argument registers are pushed together with r4-r12
# and lr, the eight state words are loaded from $ctx, and 16 words of
# stack are reserved for the message schedule window X[16].  .Loop is the
# per-block loop head; the round code is appended after it.
$code=<<___;
.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r12,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256		@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
___
# Rounds 0..15: emitted once, each one also fetching a message word from
# the input.  After every round @V is rotated by one position so that the
# register written as "h" is passed as "a" to the following round.
$i=0;
while ($i<16) {
	BODY_00_15($i,@V);
	@V=@V[-1,0..$#V-1];
	$i++;
}

# Rounds 16..31: sixteen schedule-extending rounds placed after a label.
# The generated code branches back to this label until the last K256
# word has been consumed.
$code.=".Lrounds_16_xx:\n";
while ($i<32) {
	BODY_16_XX($i,@V);
	@V=@V[-1,0..$#V-1];
	$i++;
}
# Tail of the generated function.
#
# $t2 still holds the K256 word loaded by the last round; its low byte is
# compared with 0xf2, the low byte of the final table entry, and the
# 16-round body is repeated until that entry has been used.
#
# The context pointer is then reloaded from the stack, the eight saved
# state words are added to the working variables and the sums are stored
# back.  The input pointer and end pointer are reloaded as well; Ktbl is
# rewound by 256 bytes and .Loop is re-entered while input remains.
#
# On exit the X[16] window and the three saved argument words are
# dropped, r4-r12 and lr are restored, and the return uses "moveq pc,lr"
# when bit 0 of lr is clear and "bx lr" otherwise.
$code.=<<___;
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2
	bne	.Lrounds_16_xx

	ldr	$T1,[sp,#16*4]		@ pull ctx
	ldr	$t0,[$T1,#0]
	ldr	$t1,[$T1,#4]
	ldr	$t2,[$T1,#8]
	add	$A,$A,$t0
	ldr	$t0,[$T1,#12]
	add	$B,$B,$t1
	ldr	$t1,[$T1,#16]
	add	$C,$C,$t2
	ldr	$t2,[$T1,#20]
	add	$D,$D,$t0
	ldr	$t0,[$T1,#24]
	add	$E,$E,$t1
	ldr	$t1,[$T1,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$T1,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size   sha256_block_data_order,.-sha256_block_data_order
.asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
182
# Post-process the accumulated text and write it out.

# Every `...` span is replaced by the result of evaluating its contents
# as Perl.  This folds the computed stack-slot numbers and resolves the
# conditional store that the round generator emits for round 15.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4

# Write errors (for example a full disk) may only surface when the buffer
# is flushed, so both the print and the close are checked; a truncated
# file must not be mistaken for a successful run.
print $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";	# enforce flush
187