#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
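# Typical invocation (the argument scan above just picks the first argv entry
# that looks like a file name and sends all generated code there); the output
# name is arbitrary, e.g.:
#
#	perl sha256-armv4.pl sha256-armv4.S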

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
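
# For reference, these arrays hold the rotate/shift amounts of the standard
# FIPS 180-4 functions; a minimal pure-Perl sketch of what they encode (not
# used by this module, ror() denoting a 32-bit rotate right) would be:
#
#	sub Sigma0 { my $x=shift; ror($x,2)  ^ ror($x,13) ^ ror($x,22) }
#	sub Sigma1 { my $x=shift; ror($x,6)  ^ ror($x,11) ^ ror($x,25) }
#	sub sigma0 { my $x=shift; ror($x,7)  ^ ror($x,18) ^ ($x>>3)    }
#	sub sigma1 { my $x=shift; ror($x,17) ^ ror($x,19) ^ ($x>>10)   }
#
# i.e. the last element of @sigma0/@sigma1 is a logical shift, not a rotate.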

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	rev	$t1,$t1
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
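
# The net effect of one BODY_00_15 invocation is the textbook SHA-256 round
# (with the Maj() addition deferred to the following round, hence the
# "from the past" comments above):
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	d += T1; h = T1 + T2
#
# where Ch(e,f,g) is computed as ((f^g)&e)^g and Maj(a,b,c) as
# ((a^b)&(b^c))^b, so only a single b^c value has to be carried between
# rounds.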

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
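
# BODY_16_XX expands the message schedule in place in the 16-word circular
# buffer on the stack, i.e. the standard recurrence (mod 2^32)
#
#	X[i%16] += sigma1(X[(i+14)%16]) + X[(i+9)%16] + sigma0(X[(i+1)%16])
#
# and then falls through to the common round body above.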

$code=<<___;
#if defined(__arm__)
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
.align	5

.global	sha256_block_data_order
.hidden	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
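
# The AUTOLOAD thunk turns a call such as
#
#	&vshr_u32	($T2,$T0,$sigma0[0]);
#
# into the literal text "\tvshr.u32\tq10,q8,#7\n" appended to $code:
# underscores in the sub name become dots and a purely numeric last
# argument gets the '#' immediate prefix.
#
# Dlo()/Dhi() above map a q register onto its low/high d-register alias,
# e.g. Dhi("q3") yields "d7".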

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
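
# Xupdate() interleaves the NEON message-schedule update for four X[] words
# with four scalar round bodies supplied via $body, so that the integer
# pipeline keeps retiring rounds while NEON prepares the next X[i..i+3]+K256
# quadword in the stack area addressed by $Xfer.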

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
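
# body_00_15() returns the scalar round as a list of perl snippets, one
# instruction per element, which Xupdate()/Xpreload() eval one at a time
# in between the NEON instructions above.  For instance its first element,
# '&add ($h,$h,$t1)', goes through the same AUTOLOAD thunk and emits
# "\tadd\tr11,r11,r2\n" on the first round (where $h is "r11" and $t1 "r2").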

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon
.align	4
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	mov	$t2,sp
	sub	sp,sp,#16*4+16		@ alloca
	sub	$Ktbl,r3,#256+32	@ K256
	bic	sp,sp,#15		@ align for 128-bit stores

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
#endif
___
}}}
$code.=<<___;
.size   sha256_block_data_order,.-sha256_block_data_order
.asciz  "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm   OPENSSL_armcap_P,4,4

#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
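# Everything between backticks is evaluated at generation time, e.g.
# "ror#`$Sigma1[1]-$Sigma1[0]`" in the round body above becomes "ror#5".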
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush