195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#!/usr/bin/env perl
295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# ====================================================================
495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# project. The module is, however, dual licensed under OpenSSL and
695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# CRYPTOGAMS licenses depending on where you obtain it. For further
795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# details see http://www.openssl.org/~appro/cryptogams/.
895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# ====================================================================
995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
1095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# January 2007.
1195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
1295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Montgomery multiplication for ARMv4.
1395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
1495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Performance improvement naturally varies among CPU implementations
1595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# and compilers. The code was observed to provide +65-35% improvement
1695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# [depending on key length, less for longer keys] on ARM920T, and
1795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
1895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# base and compiler generated code with in-lined umull and even umlal
1995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# instructions. The latter means that this code didn't really have an
2095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# "advantage" of utilizing some "secret" instruction.
2195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
2295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# The code is interoperable with Thumb ISA and is rather compact, less
2395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# than 1/2KB. Windows CE port would be trivial, as it's exclusively
2495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# about decorations, ABI and instruction syntax are identical.
2595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
2695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# November 2013
2795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#
2895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
2995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# performance improvement on Cortex-A8 is ~45-100% depending on key
3095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
3195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# On Snapdragon S4 improvement was measured to vary from ~70% to
3295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
3395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# rather because original integer-only code seems to perform
3495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# suboptimally on S4. Situation on Cortex-A9 is unfortunately
3595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
3895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# for execution on all NEON-capable processors, because gain on
3995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# others outweighs the marginal loss on Cortex-A9.
4095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Pick the output file name from the command line: consume arguments until
# one looks like "name.ext" (a word character, then word characters or
# dashes, a dot and an extension).  $output is left false when no argument
# qualifies.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the selected file.  Without a file name the generated
# assembly keeps going to the inherited STDOUT.  A file that was named but
# cannot be opened is now a hard error; the result of open used to be
# ignored, so such a failure went unnoticed.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}
4395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Register map for the integer-only code path.
#   r0      $num         starts as num argument, but holds &tp[num-1]
#   r1      $ap
#   r2      $bp/$bi/$rp  one register shared by all three names
#   r3      $np
#   r4-r8   $tp,$aj,$nj,$tj,$n0
#   r9      unused: reserved by ELF as platform specific, e.g. TLS pointer
#   r10     $alo         sl, gcc uses it to keep @GOT
#   r11     $ahi         fp
#   r12     $nlo         ip
#   r13     unused: stack pointer
#   r14     $nhi         lr
#   r15     unused: program counter
($num,$ap,$bp,$np,$tp,$aj,$nj,$tj,$n0)=map("r$_",(0..8));
$bi=$rp=$bp;
($alo,$ahi,$nlo,$nhi)=("r10","r11","r12","r14");

#### argument block layout relative to &tp[num-1], a.k.a. $num
# Slots 12..15 (in words) hold rp, bp, n0 and num; ap and np are not in
# the list because they permanently reside in r1 and r3.  The num slot
# doubles as storage for the end-of-bp marker ($_bpend).
($_rp,$_bp,$_n0,$_num)=map("$num,#$_*4",(12..15));
$_bpend=$_num;
6895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Integer-only Montgomery multiplication, emitted as bn_mul_mont.
#
# Arguments as used by the code below:
#   r0      rp   result vector (saved on entry, reloaded via $_rp)
#   r1      ap   first operand
#   r2      bp   second operand (saved on entry, walked via $_bp)
#   r3      np   modulus
#   [sp]    pointer to n0; the word it points at is loaded and cached in
#           the same stack slot ($_n0)
#   [sp,#4] num  vector length in 32-bit words
# Return value in r0: 0 when num<2 (nothing is computed), otherwise 1.
#
# When built with __ARM_ARCH__>=7, a num that is a multiple of 8 is handed
# to bn_mul8x_mont_neon if bit 0 of OPENSSL_armcap_P is set.
#
# Stack frame once set up, from low to high addresses: tp[0..num] scratch
# (num+1 words), saved r4-r12 and lr, saved r0 and r2, then the caller's
# stack arguments.  $num is repurposed to point at &tp[num-1], which is
# what the $_rp/$_bp/$_n0/$_num offsets are relative to.
#
# The last step subtracts np from tp into rp and keeps the difference
# unless the subtraction borrowed.  That choice is made word by word with
# a conditional move, so both tp[j] and rp[j] are always read and rp[j] is
# always written: the memory access pattern no longer depends on the
# borrow.  tp is overwritten on the way out.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif

.global	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num-1], outer loop end marker

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num-1]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry, C clear means borrow
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	@ Per-word select between tp[j] and the difference already in rp[j].
	@ Both words are always loaded and rp[j] is always stored, only the
	@ value kept depends on the carry flag left by the sbcs above.
.Lcopy:	ldr	$tj,[$tp]		@ tp[j]
	ldr	$aj,[$rp]		@ rp[j], i.e. tp[j]-np[j]
	str	sp,[$tp],#4		@ zap tp
	movcc	$aj,$tj			@ borrow: keep tp[j]
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[0..num]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
___
23995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{
# Dlo("qN"): name of the even-numbered D register of quad register qN,
# i.e. "d".(2*N).  Returns "" when the argument contains no "q" followed
# by a register number.  Called as &Dlo(...), which bypasses the empty
# prototype.
sub Dlo() {
	my $qreg = shift;
	return "" unless $qreg =~ m|q([1]?[0-9])|;
	return "d".($1*2);
}
# Dhi("qN"): name of the odd-numbered D register of quad register qN,
# i.e. "d".(2*N+1).  Returns "" when the argument contains no "q" followed
# by a register number.  Called as &Dhi(...), which bypasses the empty
# prototype.
sub Dhi() {
	my $qreg = shift;
	return "" unless $qreg =~ m|q([1]?[0-9])|;
	return "d".($1*2+1);
}
24295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# NEON-path register names.
#   d0-d3    $A0-$A3
#   d4-d7    $N0-$N3
#   q4,q5    $Z, $Temp ($zero/$temp are the D registers Dlo returns for them)
#   q6-q13   $A0xB-$A7xB
#   d28-d30  $Bi, $Ni, $M0
my ($A0,$A1,$A2,$A3,$N0,$N1,$N2,$N3)=map("d$_",(0..7));
my ($Z,$Temp,$A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(4..13));
my ($Bi,$Ni,$M0)=("d28","d29","d30");
my ($zero,$temp)=map(&Dlo($_),($Z,$Temp));

# Integer registers r0-r9.  $n0 and $num are lexical here and shadow the
# integer-path variables of the same name for the rest of this scope.
# NOTE(review): $outer is r9, which the integer-path register list marks
# as platform-reserved; confirm this is acceptable on all targets.
my ($rptr,$aptr,$bptr,$nptr,$n0,$num,$tinptr,$toutptr,$inner,$outer)=map("r$_",(0..9));
25395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
25495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
25595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if __ARM_ARCH__>=7
25695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.fpu	neon
25795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
25895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.type	bn_mul8x_mont_neon,%function
25995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	5
26095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleybn_mul8x_mont_neon:
26195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	ip,sp
26295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	stmdb	sp!,{r4-r11}
26395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vstmdb	sp!,{d8-d15}		@ ABI specification says so
26495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldmia	ip,{r4-r5}		@ load rest of parameter block
26595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
26695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub		$toutptr,sp,#16
26795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.32		{${Bi}[0]}, [$bptr,:32]!
26895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub		$toutptr,$toutptr,$num,lsl#4
26995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
27095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and		$toutptr,$toutptr,#-64
27195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.32		{${M0}[0]}, [$n0,:32]
27295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov		sp,$toutptr			@ alloca
27395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$zero,$zero,$zero
27495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	subs		$inner,$num,#8
27595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		$Bi,$zero
27695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
27795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A0xB,$Bi,${A0}[0]
27895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A1xB,$Bi,${A0}[1]
27995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A2xB,$Bi,${A1}[0]
28095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
28195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A3xB,$Bi,${A1}[1]
28295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
28395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
28495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$zero,$zero,$zero
28595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmul.u32	$Ni,$temp,$M0
28695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
28795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A4xB,$Bi,${A2}[0]
28895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.32	{$N0-$N3}, [$nptr]!
28995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A5xB,$Bi,${A2}[1]
29095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A6xB,$Bi,${A3}[0]
29195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		$Ni,$zero
29295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A7xB,$Bi,${A3}[1]
29395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
29495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.LNEON_1st
29595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
29695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	@ special case for num=8, everything is in register bank...
29795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
29895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
29995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub		$outer,$num,#1
30095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
30195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
30295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
30395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
30495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
30595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$Temp,$A0xB
30695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
30795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A0xB,$A1xB
30895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
30995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A1xB,$A2xB
31095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
31195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A2xB,$A3xB
31295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A3xB,$A4xB
31395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,$temp,#16
31495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A4xB,$A5xB
31595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A5xB,$A6xB
31695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
31795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A6xB,$A7xB
31895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$A7xB,$A7xB
31995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,$temp,#16
32095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
32195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	b	.LNEON_outer8
32295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
32395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	4
32495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_outer8:
32595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.32		{${Bi}[0]}, [$bptr,:32]!
32695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$zero,$zero,$zero
32795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		$Bi,$zero
32895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
32995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
33095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A0xB,$Bi,${A0}[0]
33195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A1xB,$Bi,${A0}[1]
33295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A2xB,$Bi,${A1}[0]
33395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
33495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A3xB,$Bi,${A1}[1]
33595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
33695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
33795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$zero,$zero,$zero
33895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	subs		$outer,$outer,#1
33995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmul.u32	$Ni,$temp,$M0
34095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
34195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A4xB,$Bi,${A2}[0]
34295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A5xB,$Bi,${A2}[1]
34395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A6xB,$Bi,${A3}[0]
34495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		$Ni,$zero
34595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A7xB,$Bi,${A3}[1]
34695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
34795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
34895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
34995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
35095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
35195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
35295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
35395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$Temp,$A0xB
35495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
35595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A0xB,$A1xB
35695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
35795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A1xB,$A2xB
35895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
35995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A2xB,$A3xB
36095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A3xB,$A4xB
36195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,$temp,#16
36295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A4xB,$A5xB
36395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A5xB,$A6xB
36495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
36595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmov		$A6xB,$A7xB
36695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$A7xB,$A7xB
36795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,$temp,#16
36895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
36995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.LNEON_outer8
37095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
37195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
37295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov		$toutptr,sp
37395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
37495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov		$inner,$num
37595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
37695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add		$tinptr,sp,#16
37795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
37895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
37995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
38095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	b	.LNEON_tail2
38195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
38295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	4
38395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_1st:
38495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
38595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.32	{$A0-$A3}, [$aptr]!
38695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
38795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	subs		$inner,$inner,#8
38895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
38995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
39095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
39195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
39295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.32	{$N0-$N1}, [$nptr]!
39395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
39495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
39595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
39695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
39795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
39895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
39995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A0xB,$Bi,${A0}[0]
40095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.32	{$N2-$N3}, [$nptr]!
40195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A1xB,$Bi,${A0}[1]
40295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
40395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A2xB,$Bi,${A1}[0]
40495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A3xB,$Bi,${A1}[1]
40595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
40695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
40795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A4xB,$Bi,${A2}[0]
40895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A5xB,$Bi,${A2}[1]
40995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A6xB,$Bi,${A3}[0]
41095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmull.u32	$A7xB,$Bi,${A3}[1]
41195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
41295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.LNEON_1st
41395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
41495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
41595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add		$tinptr,sp,#16
41695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
41795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
41895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
41995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$Temp}, [sp,:128]
42095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
42195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub		$outer,$num,#1
42295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
42395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
42495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
42595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
42695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,$temp,#16
42795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A0xB},       [$tinptr, :128]!
42895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
42995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
43095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
43195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
43295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
43395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
43495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$Z,$Z,$Z
43595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
43695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
43795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$Z},          [$toutptr,:128]
43895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,$temp,#16
43995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
44095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	b		.LNEON_outer
44195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
44295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	4
44395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_outer:
44495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.32		{${Bi}[0]}, [$bptr,:32]!
44595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
44695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.32		{$A0-$A3},  [$aptr]!
44795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$zero,$zero,$zero
44895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov		$toutptr,sp
44995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		$Bi,$zero
45095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub		$inner,$num,#8
45195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
45295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
45395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A0xB,$Bi,${A0}[0]
45495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
45595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A1xB,$Bi,${A0}[1]
45695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A2xB,$Bi,${A1}[0]
45795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
45895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A3xB,$Bi,${A1}[1]
45995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
46095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
46195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor		$zero,$zero,$zero
46295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
46395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A7xB},[$tinptr,:128]!
46495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmul.u32	$Ni,$temp,$M0
46595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
46695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A4xB,$Bi,${A2}[0]
46795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.32	{$N0-$N3}, [$nptr]!
46895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A5xB,$Bi,${A2}[1]
46995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A6xB,$Bi,${A3}[0]
47095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		$Ni,$zero
47195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A7xB,$Bi,${A3}[1]
47295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
47395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_inner:
47495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
47595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.32	{$A0-$A3}, [$aptr]!
47695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
47795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 subs		$inner,$inner,#8
47895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
47995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
48095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
48195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
48295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
48395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A0xB},       [$tinptr, :128]!
48495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
48595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
48695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
48795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
48895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
48995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
49095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
49195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A0xB,$Bi,${A0}[0]
49295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
49395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A1xB,$Bi,${A0}[1]
49495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
49595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A2xB,$Bi,${A1}[0]
49695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
49795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A3xB,$Bi,${A1}[1]
49895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.32	{$N0-$N3}, [$nptr]!
49995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
50095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A4xB,$Bi,${A2}[0]
50195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A7xB},       [$tinptr, :128]!
50295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A5xB,$Bi,${A2}[1]
50395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A6xB,$Bi,${A3}[0]
50495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A7xB,$Bi,${A3}[1]
50595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
50695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.LNEON_inner
50795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
50895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
50995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add		$tinptr,sp,#16
51095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
51195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
51295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
51395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$Temp}, [sp,:128]
51495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
51595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	subs		$outer,$outer,#1
51695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
51795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
51895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
51995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
52095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A0xB},       [$tinptr, :128]!
52195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,$temp,#16
52295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
52395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
52495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
52595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
52695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
52795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
52895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
52995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
53095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,$temp,#16
53195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
53295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.LNEON_outer
53395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
53495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov		$toutptr,sp
53595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov		$inner,$num
53695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
53795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_tail:
53895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
53995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
54095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
54195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
54295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
54395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
54495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.64		{$A7xB},       [$tinptr, :128]!
54595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
54695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
54795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_tail2:
54895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
54995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
55095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
55195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
55295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
55395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`
55495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
55595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
55695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
55795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
55895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
55995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
56095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`
56195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
56295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
56395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
56495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
56595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
56695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
56795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`
56895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
56995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
57095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
57195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
57295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
57395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
57495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`
57595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
57695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
57795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
57895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
57995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
58095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
58195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`
58295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
58395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
58495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
58595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
58695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
58795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.64		{$A0xB}, [$tinptr, :128]!
58895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
58995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`
59095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
59195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
59295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
59395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
59495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
59595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
59695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
59795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
59895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	subs		$inner,$inner,#8
59995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
60095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
60195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.LNEON_tail
60295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
60395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
60495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
60595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	subs	$aptr,sp,#0				@ clear carry flag
60695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	$bptr,sp,$num,lsl#2
60795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
60895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_sub:
60995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldmia	$aptr!, {r4-r7}
61095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldmia	$nptr!, {r8-r11}
61195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sbcs	r8, r4,r8
61295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sbcs	r9, r5,r9
61395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sbcs	r10,r6,r10
61495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sbcs	r11,r7,r11
61595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	teq	$aptr,$bptr				@ preserves carry
61695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	stmia	$rptr!, {r8-r11}
61795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.LNEON_sub
61895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
61995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldr	r10, [$aptr]				@ load top-most bit
62095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor	q0,q0,q0
62195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	r11,$bptr,sp				@ this is num*4
62295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	veor	q1,q1,q1
62395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$aptr,sp
62495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	$rptr,$rptr,r11				@ rewind $rptr
62595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$nptr,$bptr				@ second 3/4th of frame
62695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sbcs	r10,r10,#0				@ result is carry flag
62795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
62895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_copy_n_zap:
62995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldmia	$aptr!, {r4-r7}
63095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldmia	$rptr,  {r8-r11}
63195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movcc	r8, r4
63295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
63395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movcc	r9, r5
63495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movcc	r10,r6
63595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
63695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movcc	r11,r7
63795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldmia	$aptr, {r4-r7}
63895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	stmia	$rptr!, {r8-r11}
63995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	$aptr,$aptr,#16
64095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ldmia	$rptr, {r8-r11}
64195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movcc	r8, r4
64295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
64395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movcc	r9, r5
64495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movcc	r10,r6
64595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
64695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movcc	r11,r7
64795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	teq	$aptr,$bptr				@ preserves carry
64895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	stmia	$rptr!, {r8-r11}
64995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bne	.LNEON_copy_n_zap
65095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
65195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	sp,ip,#96
65295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley        vldmia  sp!,{d8-d15}
65395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley        ldmia   sp!,{r4-r11}
65495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	bx	lr
65595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
65695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif
65795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
65895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
# Trailer appended after the generated routines: an identification string
# and, when __ARM_ARCH__>=7, a 4-byte, 4-byte-aligned common symbol named
# OPENSSL_armcap_P.
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Every `...` span in the accumulated text is evaluated as Perl and replaced
# by the result (this is how calls such as &Dlo("...")/&Dhi("...") embedded
# in the assembly text get expanded).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Output is buffered, so a failed write (full disk, closed pipe, ...) may only
# be reported when the handle is closed.  Fail loudly instead of exiting 0
# with a truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";
671