sha256-armv4.pl revision 3f9e6ada2c9f7183a41081263585e6a70bbd9f59
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
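# Rotate (and, for the lower-case sigmas, final shift) amounts of the
# corresponding SHA-256 functions from FIPS 180-4:
#
#   Sigma0(x) = ROTR2(x)  xor ROTR13(x) xor ROTR22(x)
#   Sigma1(x) = ROTR6(x)  xor ROTR11(x) xor ROTR25(x)
#   sigma0(x) = ROTR7(x)  xor ROTR18(x) xor SHR3(x)
#   sigma1(x) = ROTR17(x) xor ROTR19(x) xor SHR10(x)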

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	rev	$t1,$t1
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
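# Each call above emits one scalar round: h += X[i] + K256[i] + Sigma1(e)
# + Ch(e,f,g), then d += h and h += Sigma0(a) + Maj(a,b,c).  Ch(e,f,g) is
# computed as ((f^g)&e)^g, and Maj(a,b,c) as ((a^b)&(b^c))^b, with b^c
# carried over from the previous round in $t3.  The resulting Maj is not
# added to h here; it is deferred to the "h+=Maj(a,b,c) from the past"
# add at the top of the following round.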

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
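@ K256 above is the standard SHA-256 round-constant table: the first 32
@ bits of the fractional parts of the cube roots of the first 64 primes.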
.word	0				@ terminator
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
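# Dlo/Dhi map a NEON quad register to its low and high double halves,
# e.g. Dlo("q1") is "d2" and Dhi("q1") is "d3".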

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
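# Any otherwise-undefined sub call below is turned into an instruction:
# the sub name becomes the mnemonic (with "_" mapped to ".") and the last
# argument gets a "#" prefix if it is numeric.  As an illustration,
# &vshr_u32($T2,$T0,$sigma0[0]) appends "\tvshr.u32\tq10,q8,#7\n" to $code.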

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
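# body_00_15 returns one scalar round as a list of Perl strings; Xupdate
# and Xpreload eval() those strings a few at a time between the NEON
# message-schedule instructions, interleaving the integer rounds with
# the vector code.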

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	mov	$t2,sp
	sub	sp,sp,#16*4+16		@ alloca
	sub	$Ktbl,r3,#256+32	@ K256
	bic	sp,sp,#15		@ align for 128-bit stores

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_ARCH__>=7
.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,r3,#sha256_block_data_order-K256

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm   OPENSSL_armcap_P,4,4
___

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
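# unsha256() hand-assembles the SHA-256 crypto-extension instructions for
# assemblers that do not know them.  As an illustration of the output
# format (encoding computed from the table above), unsha256("sha256h",
# "q0,q1,q2") would return ".byte 0x44,0x0c,0x02,0xf3 @ sha256h q0,q1,q2".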

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush
