195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#!/usr/bin/env perl
295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# ====================================================================
495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# project. The module is, however, dual licensed under OpenSSL and
695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# CRYPTOGAMS licenses depending on where you obtain it. For further
795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# details see http://www.openssl.org/~appro/cryptogams/.
895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# ====================================================================
995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
1095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# SHA512 block procedure for ARMv4. September 2007.
1195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
1295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# This code is ~4.5 (four and a half) times faster than code generated
1395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
1495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Xscale PXA250 core].
1595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
1695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# July 2010.
1795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
1895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Rescheduling for dual-issue pipeline resulted in 6% improvement on
1995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Cortex A8 core and ~40 cycles per processed byte.
2095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
2195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# February 2011.
2295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
2395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Profiler-assisted and platform-specific optimization resulted in 7%
2495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# improvement on Cortex A8 core and ~38 cycles per byte.
2595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
2695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# March 2011.
2795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
2895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Add NEON implementation. On Cortex A8 it was measured to process
2995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# one byte in 23.3 cycles or ~60% faster than integer-only code.
3095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
3195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# August 2012.
3295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
3395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Improve NEON performance by 12% on Snapdragon S4. In absolute
3495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# terms it's 22.6 cycles per byte, which is a disappointing result.
3595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Technical writers asserted that 3-way S4 pipeline can sustain
3695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# multiple NEON instructions per cycle, but dual NEON issue could
3795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# not be observed, and for NEON-only sequences IPC(*) was found to
3895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# be limited by 1:-( 0.33 and 0.66 were measured for sequences with
3995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# ILPs(*) of 1 and 2 respectively. This in turn means that you can
4095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# even find yourself striving, as I did here, for achieving IPC
4195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# adequate to one delivered by Cortex A8 [for reference, it's
4295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# 0.5 for ILP of 1, and 1 for higher ILPs].
4395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
4495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# (*) ILP, instruction-level parallelism, how many instructions
4595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#     *can* execute at the same time. IPC, instructions per cycle,
4695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#     indicates how many instructions actually execute.
4795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
4895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Byte order [in]dependence. =========================================
4995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
5095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Originally caller was expected to maintain specific *dword* order in
5195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# h[0-7], namely with most significant dword at *lower* address, which
5295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# was reflected in below two parameters as 0 and 4. Now caller is
5395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# expected to maintain native byte order for whole 64-bit values.
# $hi/$lo are spliced into address expressions as the literal tokens "HI"
# and "LO"; the generated file #defines those to 0/4 or 4/0 depending on
# __ARMEL__, so the word order is resolved by the C preprocessor, not here.
5495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$hi="HI";
5595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$lo="LO";
5695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# ====================================================================
5795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Consume command-line arguments until one looks like a bare file name
# (word characters or '-', then a single ".ext") or the list is exhausted;
# that argument is taken as the output file.
5895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleywhile (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the selected file. The result of open is not checked.
5995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyopen STDOUT,">$output";
6095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Symbolic names for the ARM registers used by the generated integer-only
# code.  $ctx is the base for loads/stores of the eight 64-bit state words,
# $inp is the input pointer (advanced by 8 per message word) and $len is
# turned into an end-of-input pointer (inp + len*128) at function entry.
6195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$ctx="r0";	# parameter block
6295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$inp="r1";
6395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$len="r2";
6495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Each 64-bit quantity lives in a lo/hi register pair: T is the per-round
# accumulator (it holds the message word on entry to a round), A and E are
# the working variables a and e, and t0-t3 are scratch registers.
6595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Tlo="r3";
6695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Thi="r4";
6795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Alo="r5";
6895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Ahi="r6";
6995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Elo="r7";
7095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Ehi="r8";
7195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$t0="r9";
7295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$t1="r10";
7395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$t2="r11";
7495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$t3="r12";
7595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley############	r13 is stack pointer
# r14 (lr) doubles as the pointer into the K512 constant table; the round
# code also uses its bit 0 as an "end of round range reached" flag.
7695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Ktbl="r14";
7795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley############	r15 is program counter
7895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Byte offsets of the 8-byte slots for the working variables a..h.  The same
# offsets index both the context block ($ctx) and the stack frame (sp);
# $Xoff is the stack offset at which the current message word is stored.
7995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Aoff=8*0;
8095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Boff=8*1;
8195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Coff=8*2;
8295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Doff=8*3;
8395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Eoff=8*4;
8495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Foff=8*5;
8595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Goff=8*6;
8695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Hoff=8*7;
8795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Xoff=8*8;
8895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
8995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub BODY_00_15() {
# Append the assembly for one SHA-512 round to $code.
#
# Argument:
#   $magic - byte value compared against the low 8 bits of K[i].lo.  When
#            they match, the emitted code sets bit 0 of $Ktbl to mark the
#            last round of the current range.  The call sites pass 0x94 and
#            0x17, the low bytes of the 16th and 80th K512 entries.
#
# State expected by the emitted code:
#   $Tlo/$Thi hold the message word for this round (saved at [sp,#$Xoff]),
#   $Elo/$Ehi hold e, $Alo/$Ahi hold a; b, c, d, f, g and h are read from
#   their stack slots.
#
# State left by the emitted code:
#   $Elo/$Ehi = d + T and $Alo/$Ahi = Maj(a,b,c) + Sigma0(a) + T, i.e. the
#   next round's e and a.  The old a and e are stored to their slots and sp
#   is lowered by 8, so a value written at offset N is afterwards visible at
#   offset N+8 (a becomes b, e becomes f, the saved message word moves one
#   slot up).  $Ktbl is advanced by 8 to the next constant.  The flags
#   reflect the final `tst $Ktbl,#1`, which the call sites consume with
#   conditional instructions (Z clear once the magic byte has been seen).
#
# Every 64-bit addition is an adds/adc pair over the lo/hi halves.  Ch is
# evaluated as ((f^g)&e)^g and Maj as ((a|b)&c)|(a&b).
9095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $magic = shift;
# Everything up to the closing ___ is emitted text, not Perl: the '@' lines
# are assembler comments that end up in the generated file.
9195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
9295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
9395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
9495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
9595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$t0,$Elo,lsr#14
9695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Tlo,[sp,#$Xoff+0]
9795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$t1,$Ehi,lsr#14
9895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Thi,[sp,#$Xoff+4]
9995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Ehi,lsl#18
10095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
10195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Elo,lsl#18
10295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
10395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Elo,lsr#18
10495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Ehi,lsr#18
10595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Ehi,lsl#14
10695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Elo,lsl#14
10795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Ehi,lsr#9
10895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Elo,lsr#9
10995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Elo,lsl#23
11095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
11195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Tlo,$Tlo,$t0
11295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t0,[sp,#$Foff+0]	@ f.lo
11395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
11495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1,[sp,#$Foff+4]	@ f.hi
11595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Tlo,$Tlo,$t2
11695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2,[sp,#$Goff+0]	@ g.lo
11795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Thi,$Thi,$t3		@ T += h
11895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3,[sp,#$Goff+4]	@ g.hi
11995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
12095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$t2
12195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Elo,[sp,#$Eoff+0]
12295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$t3
12395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Ehi,[sp,#$Eoff+4]
12495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	$t0,$t0,$Elo
12595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Alo,[sp,#$Aoff+0]
12695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	$t1,$t1,$Ehi
12795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Ahi,[sp,#$Aoff+4]
12895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$t2
12995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
13095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$t3		@ Ch(e,f,g)
13195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
13295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
13395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Tlo,$Tlo,$t0
13495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
13595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
13695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
13795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Tlo,$Tlo,$t2
13895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	$t0,$t2,#0xff
13995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Thi,$Thi,$t3		@ T += K[i]
14095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Elo,$Elo,$Tlo
14195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2,[sp,#$Boff+0]	@ b.lo
14295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Ehi,$Ehi,$Thi		@ d += T
14395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	teq	$t0,#$magic
14495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
14595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3,[sp,#$Coff+0]	@ c.lo
14695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orreq	$Ktbl,$Ktbl,#1
14795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
14895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
14995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
15095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$t0,$Alo,lsr#28
15195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$t1,$Ahi,lsr#28
15295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Ahi,lsl#4
15395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Alo,lsl#4
15495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Ahi,lsr#2
15595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Alo,lsr#2
15695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Alo,lsl#30
15795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Ahi,lsl#30
15895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Ahi,lsr#7
15995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Alo,lsr#7
16095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$Alo,lsl#25
16195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
16295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Tlo,$Tlo,$t0
16395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	$t0,$Alo,$t2
16495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)
16595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
16695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1,[sp,#$Boff+4]	@ b.hi
16795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Alo,$Alo,$t2
16895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2,[sp,#$Coff+4]	@ c.hi
16995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	$Alo,$Alo,$t3
17095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	$t3,$Ahi,$t1
17195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Ahi,$Ahi,$t1
17295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
17395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	$Ahi,$Ahi,$t2
17495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Alo,$Alo,$Tlo
17595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
17695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	sp,sp,#8
17795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Ahi,$Ahi,$Thi		@ h += T
17895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	tst	$Ktbl,#1
17995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	$Ktbl,$Ktbl,#8
18095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
18195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
18295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code=<<___;
18395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if defined(__arm__)
18495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#include "arm_arch.h"
18595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#ifdef __ARMEL__
18695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# define LO 0
18795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# define HI 4
18895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
18995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#else
19095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# define HI 0
19195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# define LO 4
19295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
19395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
19495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
19595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.text
19695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.code	32
19795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.type	K512,%object
19895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	5
19995c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyK512:
20095c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
20195c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
20295c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
20395c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
20495c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
20595c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
20695c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
20795c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
20895c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
20995c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
21095c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
21195c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
21295c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
21395c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
21495c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
21595c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
21695c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
21795c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
21895c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
21995c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
22095c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
22195c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
22295c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
22395c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
22495c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
22595c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
22695c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
22795c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
22895c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
22995c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
23095c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
23195c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
23295c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
23395c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
23495c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
23595c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
23695c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
23795c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
23895c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
23995c29f3cd1f6c08c6c0927868683392eea727ccAdam LangleyWORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
24095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.size	K512,.-K512
24195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LOPENSSL_armcap:
24295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.word	OPENSSL_armcap_P-sha512_block_data_order
24395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.skip	32-4
24495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
24595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.global	sha512_block_data_order
246eb7d2ed1fe8a33b3e3871502ba7e12efaf94360cAdam Langley.hidden	sha512_block_data_order
24795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.type	sha512_block_data_order,%function
24895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysha512_block_data_order:
24995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	r3,pc,#8		@ sha512_block_data_order
25095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
25195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if __ARM_ARCH__>=7
25295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	r12,.LOPENSSL_armcap
25395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
25495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	tst	r12,#1
25595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.LNEON
25695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
25795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	stmdb	sp!,{r4-r12,lr}
25895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	$Ktbl,r3,#672		@ K512
25995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	sp,sp,#9*8
26095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
26195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Elo,[$ctx,#$Eoff+$lo]
26295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Ehi,[$ctx,#$Eoff+$hi]
26395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t0, [$ctx,#$Goff+$lo]
26495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1, [$ctx,#$Goff+$hi]
26595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2, [$ctx,#$Hoff+$lo]
26695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3, [$ctx,#$Hoff+$hi]
26795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Loop:
26895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t0, [sp,#$Goff+0]
26995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t1, [sp,#$Goff+4]
27095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t2, [sp,#$Hoff+0]
27195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t3, [sp,#$Hoff+4]
27295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Alo,[$ctx,#$Aoff+$lo]
27395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Ahi,[$ctx,#$Aoff+$hi]
27495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Tlo,[$ctx,#$Boff+$lo]
27595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Thi,[$ctx,#$Boff+$hi]
27695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t0, [$ctx,#$Coff+$lo]
27795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1, [$ctx,#$Coff+$hi]
27895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2, [$ctx,#$Doff+$lo]
27995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3, [$ctx,#$Doff+$hi]
28095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Tlo,[sp,#$Boff+0]
28195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Thi,[sp,#$Boff+4]
28295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t0, [sp,#$Coff+0]
28395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t1, [sp,#$Coff+4]
28495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t2, [sp,#$Doff+0]
28595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t3, [sp,#$Doff+4]
28695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Tlo,[$ctx,#$Foff+$lo]
28795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Thi,[$ctx,#$Foff+$hi]
28895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Tlo,[sp,#$Foff+0]
28995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Thi,[sp,#$Foff+4]
29095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
29195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.L00_15:
29295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if __ARM_ARCH__<7
29395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldrb	$Tlo,[$inp,#7]
29495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldrb	$t0, [$inp,#6]
29595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldrb	$t1, [$inp,#5]
29695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldrb	$t2, [$inp,#4]
29795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldrb	$Thi,[$inp,#3]
29895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldrb	$t3, [$inp,#2]
29995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Tlo,$Tlo,$t0,lsl#8
30095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldrb	$t0, [$inp,#1]
30195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Tlo,$Tlo,$t1,lsl#16
30295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldrb	$t1, [$inp],#8
30395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Tlo,$Tlo,$t2,lsl#24
30495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Thi,$Thi,$t3,lsl#8
30595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Thi,$Thi,$t0,lsl#16
30695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	orr	$Thi,$Thi,$t1,lsl#24
30795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#else
30895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Tlo,[$inp,#4]
30995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Thi,[$inp],#8
31095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#ifdef __ARMEL__
31195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	rev	$Tlo,$Tlo
31295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	rev	$Thi,$Thi
31395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
31495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
31595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
31695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&BODY_00_15(0x94);
31795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
31895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	tst	$Ktbl,#1
31995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	beq	.L00_15
32095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
32195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
32295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bic	$Ktbl,$Ktbl,#1
32395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.L16_79:
32495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
32595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
32695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
32795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$Tlo,$t0,lsr#1
32895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
32995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$Thi,$t1,lsr#1
33095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
33195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$Tlo,$Tlo,$t1,lsl#31
33295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$Thi,$Thi,$t0,lsl#31
33395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$Tlo,$Tlo,$t0,lsr#8
33495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$Thi,$Thi,$t1,lsr#8
33595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$Tlo,$Tlo,$t1,lsl#24
33695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$Thi,$Thi,$t0,lsl#24
33795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$Tlo,$Tlo,$t0,lsr#7
33895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$Thi,$Thi,$t1,lsr#7
33995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$Tlo,$Tlo,$t1,lsl#25
34095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
34195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
34295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
34395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
34495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$t0,$t2,lsr#19
34595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$t1,$t3,lsr#19
34695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$t3,lsl#13
34795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$t2,lsl#13
34895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$t3,lsr#29
34995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$t2,lsr#29
35095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$t2,lsl#3
35195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$t3,lsl#3
35295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$t2,lsr#6
35395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t1,$t1,$t3,lsr#6
35495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
35595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	eor	$t0,$t0,$t3,lsl#26
35695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
35795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
35895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Tlo,$Tlo,$t0
35995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t0,[sp,#`$Xoff+8*16`+0]
36095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Thi,$Thi,$t1
36195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
36295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1,[sp,#`$Xoff+8*16`+4]
36395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Tlo,$Tlo,$t2
36495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Thi,$Thi,$t3
36595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Tlo,$Tlo,$t0
36695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Thi,$Thi,$t1
36795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Emit the round body for the 16..79 loop of the integer-only code path.
# NOTE(review): BODY_00_15 is defined outside this chunk; 0x17 is presumably the
# marker it uses to recognise the last round -- confirm against its definition.
36895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&BODY_00_15(0x17);
# Tail of the integer-only block function.  The template below:
#   * while the Z flag is set, reloads the 64-bit word at sp+$Xoff+8*15 into
#     $t0/$t1 and branches back to .L16_79; otherwise clears bit 0 of $Ktbl
#     (presumably an "end of rounds" marker kept in the table pointer -- the
#     code that sets the flag and the bit is not visible here);
#   * adds the eight working variables to the 64-bit words stored in the
#     context, one adds/adc pair per word.  A and E come from registers
#     ($Alo/$Ahi, $Elo/$Ehi); B, C, D, F, G and H are read back from their
#     stack slots.  $lo/$hi pick the 32-bit halves inside each context word and
#     are defined outside this chunk;
#   * releases 640 bytes of stack, rewinds $Ktbl by the same 640 bytes and
#     loops to .Loop until $inp equals $len;
#   * drops the 8*9-byte frame and returns: directly through pc when
#     __ARM_ARCH__>=5, otherwise through lr with a bit-0 test so that the same
#     code returns correctly to both ARM and Thumb callers.
# The backtick expressions are evaluated by the substitution at the end of the
# file, which also turns "bx lr" into a .word so the output assembles for ARMv4.
36995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
37095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
37195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
37295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	beq	.L16_79
37395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bic	$Ktbl,$Ktbl,#1
37495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
37595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Tlo,[sp,#$Boff+0]
37695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Thi,[sp,#$Boff+4]
37795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t0, [$ctx,#$Aoff+$lo]
37895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1, [$ctx,#$Aoff+$hi]
37995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2, [$ctx,#$Boff+$lo]
38095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3, [$ctx,#$Boff+$hi]
38195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$t0,$Alo,$t0
38295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t0, [$ctx,#$Aoff+$lo]
38395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$t1,$Ahi,$t1
38495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t1, [$ctx,#$Aoff+$hi]
38595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$t2,$Tlo,$t2
38695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t2, [$ctx,#$Boff+$lo]
38795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$t3,$Thi,$t3
38895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t3, [$ctx,#$Boff+$hi]
38995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
39095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Alo,[sp,#$Coff+0]
39195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Ahi,[sp,#$Coff+4]
39295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Tlo,[sp,#$Doff+0]
39395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Thi,[sp,#$Doff+4]
39495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t0, [$ctx,#$Coff+$lo]
39595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1, [$ctx,#$Coff+$hi]
39695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2, [$ctx,#$Doff+$lo]
39795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3, [$ctx,#$Doff+$hi]
39895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$t0,$Alo,$t0
39995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t0, [$ctx,#$Coff+$lo]
40095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$t1,$Ahi,$t1
40195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t1, [$ctx,#$Coff+$hi]
40295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$t2,$Tlo,$t2
40395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t2, [$ctx,#$Doff+$lo]
40495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$t3,$Thi,$t3
40595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t3, [$ctx,#$Doff+$hi]
40695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
40795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Tlo,[sp,#$Foff+0]
40895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Thi,[sp,#$Foff+4]
40995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t0, [$ctx,#$Eoff+$lo]
41095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1, [$ctx,#$Eoff+$hi]
41195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2, [$ctx,#$Foff+$lo]
41295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3, [$ctx,#$Foff+$hi]
41395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$Elo,$Elo,$t0
41495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Elo,[$ctx,#$Eoff+$lo]
41595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$Ehi,$Ehi,$t1
41695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$Ehi,[$ctx,#$Eoff+$hi]
41795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$t2,$Tlo,$t2
41895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t2, [$ctx,#$Foff+$lo]
41995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$t3,$Thi,$t3
42095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t3, [$ctx,#$Foff+$hi]
42195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
42295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Alo,[sp,#$Goff+0]
42395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Ahi,[sp,#$Goff+4]
42495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Tlo,[sp,#$Hoff+0]
42595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$Thi,[sp,#$Hoff+4]
42695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t0, [$ctx,#$Goff+$lo]
42795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t1, [$ctx,#$Goff+$hi]
42895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t2, [$ctx,#$Hoff+$lo]
42995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	$t3, [$ctx,#$Hoff+$hi]
43095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$t0,$Alo,$t0
43195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t0, [$ctx,#$Goff+$lo]
43295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$t1,$Ahi,$t1
43395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t1, [$ctx,#$Goff+$hi]
43495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adds	$t2,$Tlo,$t2
43595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t2, [$ctx,#$Hoff+$lo]
43695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	adc	$t3,$Thi,$t3
43795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	str	$t3, [$ctx,#$Hoff+$hi]
43895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
43995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	sp,sp,#640
44095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	$Ktbl,$Ktbl,#640
44195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
44295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	teq	$inp,$len
44395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.Loop
44495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
44595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	sp,sp,#8*9		@ destroy frame
44695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if __ARM_ARCH__>=5
44795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldmia	sp!,{r4-r12,pc}
44895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#else
44995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldmia	sp!,{r4-r12,lr}
45095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	tst	lr,#1
45195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	moveq	pc,lr			@ be binary compatible with V4, yet
45295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bx	lr			@ interoperable with Thumb ISA:-)
45395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
45495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
45595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
45695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{
# Bit counts for the four SHA-512 mixing functions as used by the NEON
# templates below.  All three Sigma0/Sigma1 counts are emitted as a
# vshr.u64 + vsli.64 pair, i.e. 64-bit rotate-rights.  For sigma0/sigma1 only
# the first two counts are rotates; the third is emitted as a plain vshr.u64
# (logical right shift).
45795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @Sigma0=(28,34,39);
45895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @Sigma1=(14,18,41);
45995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @sigma0=(1, 8, 7);
46095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @sigma1=(19,61,6);
46195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Core registers used by the NEON path: pointer into the round-constant table
# (post-incremented by every round) and the counter for the 16..79 loop.
46295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $Ktbl="r3";
46395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
46495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# @X: d0-d15 hold the sixteen 64-bit message-schedule words.
# @V: d16-d23 hold the eight working variables; the inner list assignment also
# sets the package variables $A..$H individually, which the driver templates
# interpolate (e.g. the {$A-$H} register lists).
46595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @X=map("d$_",(0..15));
46695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
46795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# NEON_00_15($i, $a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the NEON instructions for one SHA-512 round to $code.
#   $i      - round index; selects the message word @X[$i%16] and is also
#             interpolated into the "#if $i..." lines of the template
#   $a..$h  - names of the d registers that currently play the roles of the
#             eight working variables
# Nothing is returned; the only effect is the text appended to $code.
#
# The sub is declared with an empty prototype but every caller uses the
# &NEON_00_15(...) form, which bypasses prototype checking.
#
# The final "h += Maj" of a round is not emitted here (it is left commented out
# at the end of the template).  $Maj (d30) is instead added at the start of the
# following round to that round's $a, which is the same register after the
# caller rotates the register list.
46895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub NEON_00_15() {
46995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $i=shift;
47095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
# Seven temporaries take d24..d30; the eighth value of the map (d31) is unused.
47195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
47295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Round prologue: the three right shifts that start Sigma1(e), the load of the
# next input word (rounds 0..15) and the deferred Maj addition.  It is skipped
# for even rounds >= 16 because NEON_16_79 has already emitted the equivalent
# instructions interleaved with its schedule update.  The "#if $i..." lines are
# written to the output with $i interpolated; presumably the generated file is
# run through the C preprocessor, which then selects the right variant.
47395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___ if ($i<16 || $i&1);
47495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
47595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if $i<16
47695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
47795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
47895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t1,$e,#@Sigma1[1]
47995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if $i>0
48095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vadd.i64	$a,$Maj			@ h+=Maj from the past
48195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
48295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t2,$e,#@Sigma1[2]
48395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Round body, emitted for every round: load K[i], finish the Sigma1 rotates with
# vsli, byte-swap the freshly loaded word on little-endian builds, then form
#   T1  = h + Ch(e,f,g) + Sigma1(e) + K[i] + X[i]
#   d  += T1
#   h   = Sigma0(a)            (Maj is added one round later, see above)
#   Maj = Maj(a,b,c) + T1
# The backtick expressions (64-n) are evaluated at the end of the file.
48495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
48595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
48695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t0,$e,#`64-@Sigma1[0]`
48795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t1,$e,#`64-@Sigma1[1]`
48895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$Ch,$e
48995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t2,$e,#`64-@Sigma1[2]`
49095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if $i<16 && defined(__ARMEL__)
49195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vrev64.8	@X[$i],@X[$i]
49295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
49395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$t1,$t0
49495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
49595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t0,$a,#@Sigma0[0]
49695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$t2,$t1			@ Sigma1(e)
49795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	$T1,$Ch,$h
49895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t1,$a,#@Sigma0[1]
49995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t0,$a,#`64-@Sigma0[0]`
50095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	$T1,$t2
50195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t2,$a,#@Sigma0[2]
50295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	$K,@X[$i%16]
50395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t1,$a,#`64-@Sigma0[1]`
50495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$Maj,$a,$b
50595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t2,$a,#`64-@Sigma0[2]`
50695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$h,$t0,$t1
50795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	$T1,$K
50895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
50995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$h,$t2			@ Sigma0(a)
51095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	$d,$T1
51195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	$Maj,$T1
51295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ vadd.i64	$h,$Maj
51395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
51495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
51595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# NEON_16_79($i, $a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the code for one of rounds 16..79 to $code.  Takes the same arguments
# as NEON_00_15 and returns nothing.
#
# The message schedule is updated two words at a time on 128-bit q registers,
# so the update is emitted only on even rounds; odd rounds are handed straight
# to NEON_00_15.  On even rounds the template below computes, for both lanes,
#   X[i] += sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1])
# and interleaves the instructions NEON_00_15 would otherwise emit as its round
# prologue (the deferred "h += Maj" and the three Sigma1 right shifts of $e);
# NEON_00_15 skips that prologue for even rounds >= 16.
#
# As with NEON_00_15, the empty prototype is bypassed by the &-form calls.
51695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub NEON_16_79() {
51795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $i=shift;
51895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
51995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyif ($i&1)	{ &NEON_00_15($i,@_); return; }
52095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
52195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# 2x-vectorized, therefore runs every 2nd round
52295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @X=map("q$_",(0..7));			# view @X as 128-bit vector
52395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
52495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
# Element access ($_[4]) rather than the one-element slice @_[4]: same value,
# but the slice form triggers "Scalar value ... better written as" under -w.
52595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $e=$_[4];					# $e from NEON_00_15
# From here on $i indexes q registers (pairs of schedule words).
52695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$i /= 2;
52795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
52895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
52995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
53095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vadd.i64	@_[0],d30			@ h+=Maj from the past
53195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
53295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
53395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
53495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
53595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$s1,$t0
53695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t0,$s0,#@sigma0[0]
53795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$s1,$t1				@ sigma1(X[i+14])
53895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$t1,$s0,#@sigma0[1]
53995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	@X[$i%8],$s1
54095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$s1,$s0,#@sigma0[2]
54195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t0,$s0,#`64-@sigma0[0]`
54295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vsli.64		$t1,$s0,#`64-@sigma0[1]`
54395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
54495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$s1,$t0
54595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
54695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	@X[$i%8],$s0
54795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
54895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$s1,$t1				@ sigma0(X[i+1])
54995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
55095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	@X[$i%8],$s1
55195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Emit the rest of the round; 2*$i restores the original (even) round index.
55295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&NEON_00_15(2*$i,@_);
55395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
55495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# NEON code path, assembled only when __ARM_ARCH__>=7.  Prologue at .LNEON:
# save d8-d15, derive the round-constant table pointer from r3 (r3's value on
# entry is set up outside this chunk -- presumably by the dispatch code that
# branches here), and load the eight 64-bit context words into d16-d23.
55595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
55695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if __ARM_ARCH__>=7
55795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.fpu	neon
55895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
55995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	4
56095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON:
56195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	dmb				@ errata #451034 on early Cortex A8
56295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vstmdb	sp!,{d8-d15}		@ ABI specification says so
56395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	$Ktbl,r3,#672		@ K512
56495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vldmia	$ctx,{$A-$H}		@ load context
56595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Loop_neon:
56695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Rounds 0..15, fully unrolled.  After each round the register-name list is
# rotated by one (last name moved to the front), so the variables shift roles
# without any register moves.  $i is a package variable here and keeps its
# value (16) for the next loop.
56795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyfor($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
# Rounds 16..79: a 16-round body executed four times.  The counter is
# decremented at the top of the body and tested by the bne at the bottom, so
# the emitted round code in between must leave the condition flags untouched.
56895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
56995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov		$cnt,#4
57095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.L16_79_neon:
57195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	subs		$cnt,#1
57295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Sixteen rounds' worth of code (Perl-side indices 16..31), again rotating the
# register-name list after every round.
57395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyfor(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
# Block epilogue: apply the Maj term still pending from the last round, reload
# the old context into d24-d31 (q12-q15), add it to the working variables in
# d16-d23 (q8-q11) and store the result.  $Ktbl is rewound by 640 bytes
# (80 rounds, one 8-byte constant each) and the loop repeats until $inp equals
# $len.  Finally d8-d15 are restored and the routine returns through lr.
57495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
57595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne		.L16_79_neon
57695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
57795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vadd.i64	$A,d30		@ h+=Maj from the past
57895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vldmia		$ctx,{d24-d31}	@ load context to temp
57995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	q8,q12		@ vectorized accumulate
58095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	q9,q13
58195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	q10,q14
58295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.i64	q11,q15
58395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vstmia		$ctx,{$A-$H}	@ save context
58495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	teq		$inp,$len
58595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub		$Ktbl,#640	@ rewind K512
58695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne		.Loop_neon
58795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
58895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vldmia	sp!,{d8-d15}		@ epilogue
58995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bx	lr
59095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
59195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
59295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
# File trailer: size of sha512_block_data_order, an identification string, and
# a 4-byte, 4-aligned common symbol OPENSSL_armcap_P.  The closing #endif
# matches a conditional opened earlier in the generated file (outside this
# chunk).
59395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
59495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.size	sha512_block_data_order,.-sha512_block_data_order
59595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
59695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	2
59795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.comm	OPENSSL_armcap_P,4,4
59895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
59995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
60095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
60195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Post-process the accumulated text: replace every `expr` with the value of the
# Perl expression between the backticks (the templates use this for offsets and
# shift counts), then replace "bx lr" with its raw instruction word.
60295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code =~ s/\`([^\`]*)\`/eval $1/gem;
60395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
60495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyprint $code;
# close() flushes the buffered output and returns false if that fails; abort
# then, so a truncated assembly file is never mistaken for a successful run.
60595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyclose STDOUT or die "error closing STDOUT: $!"; # enforce flush
606