#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because the gain on
# others outweighs the marginal loss on Cortex-A9.

# Command line:  [flavour] output-file
#
# The first argument is taken as the output file if it looks like a file
# name ("name.ext"); otherwise it is the perlasm flavour and the remaining
# arguments are scanned for the first one that looks like a file name.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Directory part of this script's path, including the trailing
    # separator; empty when the script was invoked without a directory.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    # Look for the translator next to this script first, then in the
    # shared perlasm directory.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Pipe everything printed to STDOUT through arm-xlate.pl.  The output
    # name is quoted so that it survives the shell, and is only passed
    # when one was actually given.
    my $xlate_cmd = "\"$^X\" $xlate $flavour";
    $xlate_cmd .= " \"$output\"" if defined $output && length $output;
    open STDOUT, "| $xlate_cmd"
        or die "can't call $xlate: $!";
} elsif (defined $output && length $output) {
    # No translation requested: write the generated text straight to the
    # output file.
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
# With neither a flavour nor an output file, STDOUT is left untouched and
# the generated text goes to the caller's standard output.

# Register allocation for the integer-only code path.  These names are
# interpolated into the assembly template assigned to $code below.
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";	# r2 is reused: bp pointer, bp[i] word, later rp
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
#
# Word offsets from $num, as set up by the bn_mul_mont prologue:
#    0      tp[num-1]
#    1      tp[num]            (the "extra dword" of the alloca)
#    2..11  saved r4-r12,lr    (10 registers)
#   12      saved r0 = rp
#   13      saved r2 = bp
#   14      first stack argument:  pointer to n0
#   15      second stack argument: num (slot reused for the bp end marker)
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;

# Assembly template for bn_mul_mont, the integer-only Montgomery
# multiplication routine.
#
# Register/stack arguments as used by the code below:
#   r0 = rp, r1 = ap, r2 = bp, r3 = np,
#   [sp,#0] = pointer to n0, [sp,#4] = num (in 32-bit words).
# Returns r0 = 1 on success, or r0 = 0 without doing any work when num < 2.
#
# When built with __ARM_MAX_ARCH__ >= 7, calls whose num is a multiple of 8
# are forwarded to bn_mul8x_mont_neon if bit 0 of OPENSSL_armcap_P is set.
# The capability word is addressed PC-relatively: .LOPENSSL_armcap holds its
# offset from the local label .Lbn_mul_mont, so the base address is taken
# from that same local label.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,.Lbn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
	ldr	r0,[r0]
#endif
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num-1]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num-1]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
260d9e397b599b13d642138480a28c14db7a136bf0Adam Langley{
261d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
262d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
263d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
264d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
265d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
266d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($Z,$Temp)=("q4","q5");
267d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
268d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($Bi,$Ni,$M0)=map("d$_",(28..31));
269d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $zero=&Dlo($Z);
270d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $temp=&Dlo($Temp);
271d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
272d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
273d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
274d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
275d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
276e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley#if __ARM_MAX_ARCH__>=7
277e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley.arch	armv7-a
278d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.fpu	neon
279d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
280d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bn_mul8x_mont_neon,%function
281d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	5
282d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybn_mul8x_mont_neon:
283d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	ip,sp
284d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	stmdb	sp!,{r4-r11}
285d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vstmdb	sp!,{d8-d15}		@ ABI specification says so
286d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ldmia	ip,{r4-r5}		@ load rest of parameter block
287d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
288d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub		$toutptr,sp,#16
289d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.32		{${Bi}[0]}, [$bptr,:32]!
290d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub		$toutptr,$toutptr,$num,lsl#4
291d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
292d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	and		$toutptr,$toutptr,#-64
293d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.32		{${M0}[0]}, [$n0,:32]
294d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov		sp,$toutptr			@ alloca
295d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor		$zero,$zero,$zero
296d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	subs		$inner,$num,#8
297d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		$Bi,$zero
298d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
299d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A0xB,$Bi,${A0}[0]
300d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A1xB,$Bi,${A0}[1]
301d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A2xB,$Bi,${A1}[0]
302d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
303d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A3xB,$Bi,${A1}[1]
304d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
305d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
306d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor		$zero,$zero,$zero
307d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmul.u32	$Ni,$temp,$M0
308d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
309d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A4xB,$Bi,${A2}[0]
310d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.32	{$N0-$N3}, [$nptr]!
311d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A5xB,$Bi,${A2}[1]
312d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A6xB,$Bi,${A3}[0]
313d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		$Ni,$zero
314d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A7xB,$Bi,${A3}[1]
315d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
316d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bne	.LNEON_1st
317d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
318d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	@ special case for num=8, everything is in register bank...
319d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
320d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
321d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub		$outer,$num,#1
322d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
323d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
324d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
325d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
326d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
327d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$Temp,$A0xB
328d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
329d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A0xB,$A1xB
330d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
331d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A1xB,$A2xB
332d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
333d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A2xB,$A3xB
334d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A3xB,$A4xB
335d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,$temp,#16
336d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A4xB,$A5xB
337d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A5xB,$A6xB
338d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
339d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A6xB,$A7xB
340d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor		$A7xB,$A7xB
341d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,$temp,#16
342d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
343d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	b	.LNEON_outer8
344d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
345d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	4
346d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_outer8:
347d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.32		{${Bi}[0]}, [$bptr,:32]!
348d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor		$zero,$zero,$zero
349d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		$Bi,$zero
350d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
351d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
352d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A0xB,$Bi,${A0}[0]
353d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A1xB,$Bi,${A0}[1]
354d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A2xB,$Bi,${A1}[0]
355d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
356d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A3xB,$Bi,${A1}[1]
357d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
358d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
359d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor		$zero,$zero,$zero
360d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	subs		$outer,$outer,#1
361d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmul.u32	$Ni,$temp,$M0
362d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
363d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A4xB,$Bi,${A2}[0]
364d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A5xB,$Bi,${A2}[1]
365d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A6xB,$Bi,${A3}[0]
366d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		$Ni,$zero
367d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A7xB,$Bi,${A3}[1]
368d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
369d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
370d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
371d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
372d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
373d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
374d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
375d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$Temp,$A0xB
376d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
377d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A0xB,$A1xB
378d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
379d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A1xB,$A2xB
380d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
381d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A2xB,$A3xB
382d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A3xB,$A4xB
383d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,$temp,#16
384d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A4xB,$A5xB
385d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A5xB,$A6xB
386d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
387d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmov		$A6xB,$A7xB
388d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor		$A7xB,$A7xB
389d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,$temp,#16
390d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
391d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bne	.LNEON_outer8
392d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
393d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
394d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov		$toutptr,sp
395d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
396d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov		$inner,$num
397d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
398d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add		$tinptr,sp,#16
399d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
400d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
401d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
402d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	b	.LNEON_tail2
403d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
404d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	4
405d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_1st:
406d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
407d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.32	{$A0-$A3}, [$aptr]!
408d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
409d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	subs		$inner,$inner,#8
410d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
411d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
412d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
413d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
414d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.32	{$N0-$N1}, [$nptr]!
415d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
416d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
417d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
418d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
419d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
420d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
421d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A0xB,$Bi,${A0}[0]
422d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.32	{$N2-$N3}, [$nptr]!
423d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A1xB,$Bi,${A0}[1]
424d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
425d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A2xB,$Bi,${A1}[0]
426d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A3xB,$Bi,${A1}[1]
427d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
428d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
429d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A4xB,$Bi,${A2}[0]
430d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A5xB,$Bi,${A2}[1]
431d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A6xB,$Bi,${A3}[0]
432d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmull.u32	$A7xB,$Bi,${A3}[1]
433d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
434d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bne	.LNEON_1st
435d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
436d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
437d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add		$tinptr,sp,#16
438d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
439d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
440d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
441d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$Temp}, [sp,:128]
442d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
443d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub		$outer,$num,#1
444d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
445d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
446d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
447d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
448d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,$temp,#16
449d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A0xB},       [$tinptr, :128]!
450d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
451d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
452d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
453d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
454d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
455d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
456d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor		$Z,$Z,$Z
457d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
458d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
459d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$Z},          [$toutptr,:128]
460d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,$temp,#16
461d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
462d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	b		.LNEON_outer
463d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
464d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	4
465d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_outer:
466d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.32		{${Bi}[0]}, [$bptr,:32]!
467d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
468d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.32		{$A0-$A3},  [$aptr]!
469d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor		$zero,$zero,$zero
470d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov		$toutptr,sp
471d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		$Bi,$zero
472d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub		$inner,$num,#8
473d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
474d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
475d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A0xB,$Bi,${A0}[0]
476d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
477d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A1xB,$Bi,${A0}[1]
478d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A2xB,$Bi,${A1}[0]
479d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
480d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A3xB,$Bi,${A1}[1]
481d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
482d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
483d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor		$zero,$zero,$zero
484d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
485d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A7xB},[$tinptr,:128]!
486d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmul.u32	$Ni,$temp,$M0
487d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
488d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A4xB,$Bi,${A2}[0]
489d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.32	{$N0-$N3}, [$nptr]!
490d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A5xB,$Bi,${A2}[1]
491d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A6xB,$Bi,${A3}[0]
492d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		$Ni,$zero
493d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A7xB,$Bi,${A3}[1]
494d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
495d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_inner:
496d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
497d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.32	{$A0-$A3}, [$aptr]!
498d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
499d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 subs		$inner,$inner,#8
500d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
501d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
502d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
503d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
504d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
505d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A0xB},       [$tinptr, :128]!
506d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
507d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
508d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
509d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
510d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
511d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
512d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
513d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A0xB,$Bi,${A0}[0]
514d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
515d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A1xB,$Bi,${A0}[1]
516d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
517d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A2xB,$Bi,${A1}[0]
518d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
519d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A3xB,$Bi,${A1}[1]
520d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.32	{$N0-$N3}, [$nptr]!
521d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
522d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A4xB,$Bi,${A2}[0]
523d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A7xB},       [$tinptr, :128]!
524d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A5xB,$Bi,${A2}[1]
525d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A6xB,$Bi,${A3}[0]
526d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A7xB,$Bi,${A3}[1]
527d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
528d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bne	.LNEON_inner
529d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
530d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A0xB,$Ni,${N0}[0]
531d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add		$tinptr,sp,#16
532d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A1xB,$Ni,${N0}[1]
533d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
534d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A2xB,$Ni,${N1}[0]
535d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$Temp}, [sp,:128]
536d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A3xB,$Ni,${N1}[1]
537d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	subs		$outer,$outer,#1
538d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
539d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A4xB,$Ni,${N2}[0]
540d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
541d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A5xB,$Ni,${N2}[1]
542d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A0xB},       [$tinptr, :128]!
543d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,$temp,#16
544d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
545d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A6xB,$Ni,${N3}[0]
546d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
547d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vmlal.u32	$A7xB,$Ni,${N3}[1]
548d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
549d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
550d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
551d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
552d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,$temp,#16
553d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
554d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bne	.LNEON_outer
555d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
556d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov		$toutptr,sp
557d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov		$inner,$num
558d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
559d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_tail:
560d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
561d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
562d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
563d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
564d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
565d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
566d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.64		{$A7xB},       [$tinptr, :128]!
567d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
568d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
569d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_tail2:
570d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
571d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
572d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
573d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
574d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
575d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`
576d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
577d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
578d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
579d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
580d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
581d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
582d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`
583d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
584d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
585d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
586d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
587d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
588d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
589d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`
590d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
591d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
592d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
593d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
594d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
595d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
596d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`
597d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
598d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
599d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
600d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
601d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
602d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
603d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`
604d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
605d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
606d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
607d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
608d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
609d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.64		{$A0xB}, [$tinptr, :128]!
610d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
611d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`
612d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
613d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
614d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
615d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
616d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
617d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
618d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
619d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
620d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	subs		$inner,$inner,#8
621d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
622d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
623d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bne	.LNEON_tail
624d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
625d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
626d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
627d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	subs	$aptr,sp,#0				@ clear carry flag
628d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	$bptr,sp,$num,lsl#2
629d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
630d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_sub:
631d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ldmia	$aptr!, {r4-r7}
632d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ldmia	$nptr!, {r8-r11}
633d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sbcs	r8, r4,r8
634d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sbcs	r9, r5,r9
635d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sbcs	r10,r6,r10
636d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sbcs	r11,r7,r11
637d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	teq	$aptr,$bptr				@ preserves carry
638d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	stmia	$rptr!, {r8-r11}
639d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bne	.LNEON_sub
640d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
641d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ldr	r10, [$aptr]				@ load top-most bit
642d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor	q0,q0,q0
643d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	r11,$bptr,sp				@ this is num*4
644d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	veor	q1,q1,q1
645d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$aptr,sp
646d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	$rptr,$rptr,r11				@ rewind $rptr
647d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$nptr,$bptr				@ second 3/4th of frame
648d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sbcs	r10,r10,#0				@ result is carry flag
649d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
650d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_copy_n_zap:
651d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ldmia	$aptr!, {r4-r7}
652d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ldmia	$rptr,  {r8-r11}
653d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movcc	r8, r4
654d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
655d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movcc	r9, r5
656d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movcc	r10,r6
657d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
658d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movcc	r11,r7
659d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ldmia	$aptr, {r4-r7}
660d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	stmia	$rptr!, {r8-r11}
661d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	$aptr,$aptr,#16
662d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ldmia	$rptr, {r8-r11}
663d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movcc	r8, r4
664d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
665d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movcc	r9, r5
666d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movcc	r10,r6
667d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
668d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movcc	r11,r7
669d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	teq	$aptr,$bptr				@ preserves carry
670d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	stmia	$rptr!, {r8-r11}
671d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bne	.LNEON_copy_n_zap
672d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
673d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	sp,ip,#96
674d9e397b599b13d642138480a28c14db7a136bf0Adam Langley        vldmia  sp!,{d8-d15}
675d9e397b599b13d642138480a28c14db7a136bf0Adam Langley        ldmia   sp!,{r4-r11}
676e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley	ret						@ bx lr
677d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
678d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#endif
679d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
680d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
# Trailer appended after all generated routines: an identification string,
# followed (only when __ARM_MAX_ARCH__>=7) by the OPENSSL_armcap_P common
# symbol, which is also marked .hidden.
# The "\@" in the banner stops Perl from interpolating "@openssl" as an
# array inside this interpolating heredoc.
681d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
682d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
683d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	2
684e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley#if __ARM_MAX_ARCH__>=7
685d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.comm	OPENSSL_armcap_P,4,4
686e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley.hidden	OPENSSL_armcap_P
687d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#endif
688d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
689d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Post-process the accumulated assembly text in $code and write it to STDOUT.
#
# Step 1: evaluate every `...` span as Perl and splice the result into the
#         text; this is how the &Dlo()/&Dhi() calls in the templates above
#         are expanded.
690d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code =~ s/\`([^\`]*)\`/eval $1/gem;
# Step 2: replace each literal "bx lr" with its raw instruction word.
691d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
# Step 3: expand "ret" to "bx lr". This must run after step 2 so that these
#         returns stay a real instruction rather than the .word encoding.
692e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley$code =~ s/\bret\b/bx	lr/gm;
693d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyprint $code;
# Buffered write errors (full disk, closed pipe) only surface when the handle
# is flushed at close; fail loudly instead of leaving a truncated output file
# behind a zero exit status.
694d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyclose STDOUT or die "error closing STDOUT: $!";
695