#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte.

# Byte order [in]dependence. =========================================
#
# The caller is expected to maintain a specific *dword* order in h[0-7],
# namely with the most significant dword at the *lower* address, which
# is reflected in the two parameters below. The *byte* order within
# these dwords is, in turn, whatever the *native* byte order of the
# current platform is.
$hi=0;	# byte offset of the most significant dword inside each 64-bit h[] element
$lo=4;	# byte offset of the least significant dword inside each 64-bit h[] element
# ====================================================================

# The first command-line argument names the file that receives the
# generated assembly. STDOUT is redirected to it so that the final
# "print $code" lands in that file. The three-argument open keeps the
# file name from being interpreted as a mode, and a failure is fatal
# instead of silently producing no output. When no file name is given,
# STDOUT is left untouched and the assembly goes to standard output.
$output=shift;
if (defined $output && length $output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# Register allocation used by the generated code.
$ctx="r0";	# pointer to the hash state h[0-7]
$inp="r1";	# input pointer, advanced 8 bytes per loaded word
$len="r2";	# block count on entry, turned into the end-of-input pointer
$Tlo="r3";	# T, low dword  (holds X[i] on entry to a round)
$Thi="r4";	# T, high dword
$Alo="r5";	# working variable a, low dword
$Ahi="r6";	# working variable a, high dword
$Elo="r7";	# working variable e, low dword
$Ehi="r8";	# working variable e, high dword
$t0="r9";	# scratch
$t1="r10";	# scratch
$t2="r11";	# scratch
$t3="r12";	# scratch
############	r13 is stack pointer
$Ktbl="r14";	# pointer into K512; bit 0 doubles as the loop-exit flag
############	r15 is program counter

# Byte offsets of the 64-bit working variables a..h, used both for the
# stack frame and for the context, followed by the X[] slot of the
# current round in the stack frame.
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

# BODY_00_15($magic) - append the assembly for one SHA-512 round to $code.
#
# The emitted code expects the message word X[i] in $Tlo:$Thi, a in
# $Alo:$Ahi, e in $Elo:$Ehi, and b, c, d, f, g, h in the stack frame at
# sp+$Boff .. sp+$Hoff. It computes
#	T  = X[i] + Sigma1(e) + h + Ch(e,f,g) + K[i]
#	e' = d + T			(left in $Elo:$Ehi)
#	a' = T + Sigma0(a) + Maj(a,b,c)	(left in $Alo:$Ahi)
# and stores the old a and e into the frame. The frame is then slid
# down by 8 bytes, which renames a..g to b..h for the next round
# without copying, and $Ktbl is advanced to the next constant.
#
# $magic is the low byte of the low dword of the last K512 constant the
# calling loop must consume. When the constant just used matches it,
# bit 0 of $Ktbl is set so the caller can test it and leave the loop.
#
# The sub is declared without a prototype because it takes an argument;
# both BODY_00_15(...) and &BODY_00_15(...) call forms work.
sub BODY_00_15 {
my ($magic) = @_;
$code.=<<___;
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	mov	$t1,$Ehi,lsr#14
	eor	$t0,$t0,$Ehi,lsl#18
	eor	$t1,$t1,$Elo,lsl#18
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3		@ T += h

	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	ldr	$t3,[sp,#$Goff+4]	@ g.hi
	str	$Elo,[sp,#$Eoff+0]
	str	$Ehi,[sp,#$Eoff+4]
	str	$Alo,[sp,#$Aoff+0]
	str	$Ahi,[sp,#$Aoff+4]

	eor	$t0,$t0,$t2
	eor	$t1,$t1,$t3
	and	$t0,$t0,$Elo
	and	$t1,$t1,$Ehi
	eor	$t0,$t0,$t2
	eor	$t1,$t1,$t3		@ Ch(e,f,g)

	ldr	$t2,[$Ktbl,#4]		@ K[i].lo
	ldr	$t3,[$Ktbl,#0]		@ K[i].hi
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi

	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	adc	$Ehi,$Ehi,$Thi		@ d += T

	and	$t0,$t2,#0xff
	teq	$t0,#$magic
	orreq	$Ktbl,$Ktbl,#1

	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	and	$t0,$Alo,$t2
	orr	$Alo,$Alo,$t2
	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	and	$Ahi,$Ahi,$t2
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	adds	$Alo,$Alo,$Tlo
	adc	$Ahi,$Ahi,$Thi		@ h += T

	sub	sp,sp,#8
	add	$Ktbl,$Ktbl,#8
___
}
# Start the generated assembly text.
#
# K512 holds the 80 SHA-512 round constants, each emitted as two words,
# high dword first. The function entry follows the table directly, so
# the prologue finds K512 by subtracting the table size (640 bytes)
# from its own address.
#
# Prologue: turn $len into an end-of-input pointer (128 bytes per
# block), save r4-r12 and lr, reserve a 9*8-byte frame (a..h plus one
# X[] slot), then copy the context into the frame with a and e kept in
# registers.
#
# .L00_15 assembles one 64-bit input word from eight bytes, first byte
# most significant, so the load is independent of alignment and host
# byte order, and stores it in the X[] slot before the round body.
$code=<<___;
.text
.code	32
.type	K512,%object
.align	5
K512:
.word	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.word	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.word	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.word	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.word	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.word	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.word	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.word	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.word	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.word	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.word	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.word	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.word	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.word	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.word	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.word	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.word	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.word	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.word	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.word	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.word	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.word	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.word	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.word	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.word	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.word	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.word	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.word	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.word	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.word	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.word	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.word	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.word	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.word	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.word	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.word	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.word	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.word	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.word	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.word	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
.size	K512,.-K512

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#640		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
	str	$Tlo,[sp,#$Xoff+0]
	str	$Thi,[sp,#$Xoff+4]
___
	# Round body for rounds 0..15. 0x94 is the low byte of the low dword
	# of the 16th K512 entry (0xcf692694), which ends this loop. The &
	# call form is kept so that the call compiles regardless of any
	# prototype declared on BODY_00_15.
	&BODY_00_15(0x94);
# Close the .L00_15 loop: repeat until the round body has set bit 0 of
# $Ktbl, then clear that flag again.
#
# .L16_79 computes the next message-schedule word from four earlier
# X[] slots in the sliding frame: sigma0 of the slot 15 entries back,
# plus sigma1 of the slot 2 entries back, plus the slots 7 and 16
# entries back. The result is stored in the X[] slot for the current
# round and left in $Tlo:$Thi for the round body. The `...` offset
# expressions are evaluated to plain numbers by the post-processing
# pass just before the text is printed.
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	bic	$Ktbl,$Ktbl,#1

.L16_79:
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]

	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	mov	$Thi,$t1,lsr#1
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1

	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
	str	$Tlo,[sp,#$Xoff+0]
	str	$Thi,[sp,#$Xoff+4]
___
	# Round body for rounds 16..79. 0x17 is the low byte of the low dword
	# of the last K512 entry (0x4a475817), which ends this loop. The &
	# call form is kept so that the call compiles regardless of any
	# prototype declared on BODY_00_15.
	&BODY_00_15(0x17);
# Close the .L16_79 loop and finish the block.
#
# After the last round a and e are in registers and b, c, d, f, g, h
# are in the frame. Each is added to the matching context word and
# written back. The new e stays in $Elo:$Ehi, and the new g and h stay
# in $t0..$t3, which is what the top of .Loop expects for the next
# block. sp and $Ktbl are then rewound by the 640 bytes (80 rounds * 8)
# that the round bodies moved them, and the outer loop repeats until
# $inp reaches the end pointer kept in $len.
#
# The return sequence uses "moveq pc,lr" when bit 0 of lr is clear and
# "bx lr" otherwise, as described by the trailing assembly comments.
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	adc	$t1,$Ahi,$t1
	adds	$t2,$Tlo,$t2
	adc	$t3,$Thi,$t3
	str	$t0, [$ctx,#$Aoff+$lo]
	str	$t1, [$ctx,#$Aoff+$hi]
	str	$t2, [$ctx,#$Boff+$lo]
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	adc	$t1,$Ahi,$t1
	adds	$t2,$Tlo,$t2
	adc	$t3,$Thi,$t3
	str	$t0, [$ctx,#$Coff+$lo]
	str	$t1, [$ctx,#$Coff+$hi]
	str	$t2, [$ctx,#$Doff+$lo]
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	adc	$Ehi,$Ehi,$t1
	adds	$t2,$Tlo,$t2
	adc	$t3,$Thi,$t3
	str	$Elo,[$ctx,#$Eoff+$lo]
	str	$Ehi,[$ctx,#$Eoff+$hi]
	str	$t2, [$ctx,#$Foff+$lo]
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	adc	$t1,$Ahi,$t1
	adds	$t2,$Tlo,$t2
	adc	$t3,$Thi,$t3
	str	$t0, [$ctx,#$Goff+$lo]
	str	$t1, [$ctx,#$Goff+$hi]
	str	$t2, [$ctx,#$Hoff+$lo]
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size   sha512_block_data_order,.-sha512_block_data_order
.asciz  "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Post-process and emit the assembly text.
#
# 1. Every `...` span holds a Perl arithmetic expression (the frame
#    offsets were already interpolated as numbers); replace it with its
#    evaluated value so the assembler sees plain immediates.
# 2. Replace the "bx lr" mnemonic with a literal .word so the file can
#    be assembled with -march=armv4.
# 3. Write the result and close STDOUT explicitly. Buffered write errors
#    such as a full disk only surface at close, so both steps are fatal
#    on failure instead of silently leaving a truncated output file.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";	# enforce flush