# sha512-armv4.pl revision 43c12e3d4f9bbbbd4a8ba7b149686437514bc6b6
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# Byte order [in]dependence. =========================================
#
# The caller is expected to maintain a specific *dword* order in h[0-7],
# namely with the most significant dword at the *lower* address, which
# is reflected in the two parameters below. The *byte* order within
# these dwords is, in turn, whatever the *native* byte order of the
# current platform is.
# Byte offsets, inside each 8-byte h[] element of the context, of the
# most significant ($hi) and least significant ($lo) 32-bit half.
$hi=0;
$lo=4;
# ====================================================================

# Consume command-line arguments until one looks like a plain file name
# ("name.ext"); anything in front of it is skipped.  If the arguments
# run out first, $output is left false.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found, and refuse
# to go on if the file cannot be created.  Without a file name the
# generated code simply goes to the inherited STDOUT.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Register allocation used by the generated code.
#
# r0-r2 carry the three arguments of sha512_block_data_order: the hash
# context pointer, the input pointer, and the number of 128-byte blocks
# (turned into an end-of-input pointer in the prologue).
$ctx="r0";
$inp="r1";
$len="r2";
# 64-bit quantities are kept as lo/hi register pairs: T is the running
# round sum, A and E are the "a" and "e" working variables.
$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
# Scratch registers.
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
# Pointer to the current K512 entry; bit 0 doubles as an end-of-loop flag.
$Ktbl="r14";
############	r15 is program counter

# Byte offsets of the 8-byte slots for a..h.  The same offsets are used
# for the caller's context (relative to $ctx) and for the stack frame
# (relative to sp); $Xoff is the frame slot of the current message word.
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

# BODY_00_15($magic) -- append the assembly for one SHA-512 round to $code.
#
# The emitted code expects the current message word in $Tlo/$Thi, "a" in
# $Alo/$Ahi, "e" in $Elo/$Ehi, and b, c, d, f, g, h in their stack-frame
# slots.  It computes
#	T += Sigma1(e) + h + Ch(e,f,g) + K[i]
#	d += T			(left in $Elo/$Ehi)
#	T += Sigma0(a)
#	Maj(a,b,c) + T		(left in $Alo/$Ahi)
# after saving the incoming e and a to their frame slots.  Finally sp is
# lowered by 8 and $Ktbl advanced by 8, so the next round sees every
# frame slot one position further along.
#
# $magic is the low byte of K[i].lo for the last round of the loop being
# generated: when the constant just loaded matches it, bit 0 of $Ktbl is
# set, which the caller's loop tests with "tst $Ktbl,#1".
#
# No prototype is declared because the sub takes one argument; callers
# using the "&BODY_00_15(...)" form keep working unchanged.
sub BODY_00_15 {
my ($magic) = @_;
$code.=<<___;
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	mov	$t1,$Ehi,lsr#14
	eor	$t0,$t0,$Ehi,lsl#18
	eor	$t1,$t1,$Elo,lsl#18
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#4]		@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#0]		@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	adc	$Ehi,$Ehi,$Thi		@ d += T

	and	$t0,$t2,#0xff
	teq	$t0,#$magic
	orreq	$Ktbl,$Ktbl,#1

	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	and	$t0,$Alo,$t2
	orr	$Alo,$Alo,$t2
	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	and	$Ahi,$Ahi,$t2
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	adds	$Alo,$Alo,$Tlo
	adc	$Ahi,$Ahi,$Thi		@ h += T

	sub	sp,sp,#8
	add	$Ktbl,$Ktbl,#8
___
}
# Start of the generated assembly: the K512 round-constant table, the
# function prologue, and the loader for rounds 0..15.
#
# K512 holds the 80 64-bit round constants, each stored as the high
# 32-bit word followed by the low one (640 bytes in total).  The
# function locates the table as "own address - 640", so nothing may be
# placed between the table and the function label.
#
# The prologue turns $len into an end-of-input pointer, saves r4-r12 and
# lr, and reserves a frame of nine 8-byte slots (a..h plus the current
# message word).  .Loop copies b, c, d, f, g, h from the context into
# the frame while a and e stay in registers.  .L00_15 assembles the next
# 64-bit message word from the input one byte at a time, most
# significant byte first, and stores it in the X slot.
$code=<<___;
.text
.code	32
.type	K512,%object
.align	5
K512:
.word	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.word	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.word	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.word	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.word	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.word	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.word	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.word	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.word	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.word	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.word	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.word	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.word	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.word	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.word	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.word	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.word	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.word	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.word	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.word	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.word	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.word	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.word	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.word	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.word	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.word	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.word	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.word	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.word	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.word	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.word	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.word	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.word	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.word	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.word	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.word	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.word	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.word	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.word	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.word	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
.size	K512,.-K512

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#640		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
	str	$Tlo,[sp,#$Xoff+0]
	str	$Thi,[sp,#$Xoff+4]
___
# Rounds 0..15: one round body, executed in a loop.  0x94 is the low
# byte of K[15].lo (0xcf692694), so the round sets the flag bit in $Ktbl
# on the 16th pass; the flag is cleared again once the loop is left.
# The "&" call form is used because it bypasses prototype checking.
	&BODY_00_15(0x94);
# Message schedule for rounds 16..79.  sp drops by 8 after every round,
# so the word stored k rounds ago sits at offset $Xoff+8*k from sp.  The
# code below forms
#	X[i] = sigma0(X[i-15]) + sigma1(X[i-2]) + X[i-7] + X[i-16]
# in $Tlo/$Thi and stores it in the X slot.  The back-quoted offset
# expressions are evaluated by the post-processing pass at the end of
# the script.
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	bic	$Ktbl,$Ktbl,#1

.L16_79:
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]

	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	mov	$Thi,$t1,lsr#1
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1

	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
	str	$Tlo,[sp,#$Xoff+0]
	str	$Thi,[sp,#$Xoff+4]
___
# Rounds 16..79: the same round body, looping until the constant whose
# low byte is 0x17 (K[79].lo = 0x4a475817) has been consumed.  The "&"
# call form is used because it bypasses prototype checking.
	&BODY_00_15(0x17);
# After the last round a and e are in registers and b, c, d, f, g, h in
# their frame slots; each is added to the matching context word and
# written back.  The updated g and h are left in $t0..$t3, where the top
# of .Loop expects them.  sp and $Ktbl are then rewound by the 640 bytes
# (80 rounds x 8) they moved, and the whole sequence repeats until $inp
# reaches the end pointer in $len.  The epilogue releases the frame,
# restores the saved registers and returns.
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	adc	$t1,$Ahi,$t1
	adds	$t2,$Tlo,$t2
	adc	$t3,$Thi,$t3
	str	$t0, [$ctx,#$Aoff+$lo]
	str	$t1, [$ctx,#$Aoff+$hi]
	str	$t2, [$ctx,#$Boff+$lo]
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	adc	$t1,$Ahi,$t1
	adds	$t2,$Tlo,$t2
	adc	$t3,$Thi,$t3
	str	$t0, [$ctx,#$Coff+$lo]
	str	$t1, [$ctx,#$Coff+$hi]
	str	$t2, [$ctx,#$Doff+$lo]
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	adc	$Ehi,$Ehi,$t1
	adds	$t2,$Tlo,$t2
	adc	$t3,$Thi,$t3
	str	$Elo,[$ctx,#$Eoff+$lo]
	str	$Ehi,[$ctx,#$Eoff+$hi]
	str	$t2, [$ctx,#$Foff+$lo]
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	adc	$t1,$Ahi,$t1
	adds	$t2,$Tlo,$t2
	adc	$t3,$Thi,$t3
	str	$t0, [$ctx,#$Goff+$lo]
	str	$t1, [$ctx,#$Goff+$hi]
	str	$t2, [$ctx,#$Hoff+$lo]
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size   sha512_block_data_order,.-sha512_block_data_order
.asciz  "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Evaluate one back-quoted expression taken from the generated text and
# return its value.  A malformed expression aborts the run instead of
# silently leaving a hole in the emitted assembly.
sub _eval_expr {
	my ($expr) = @_;
	my $value = eval $expr;
	die "can't evaluate `$expr`: $@" if $@;
	return $value;
}

# Replace every `...` span in the generated text by the value of the
# Perl expression it contains (used for computed stack offsets).
$code =~ s/\`([^\`]*)\`/_eval_expr($1)/gem;
# Emit "bx lr" as a raw instruction word rather than as a mnemonic.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Closing flushes the buffered output; a failure here means the
# generated file is incomplete, so report it instead of exiting quietly.
close STDOUT or die "error closing STDOUT: $!";
