#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.
# Command-line handling.  Arguments are an optional perlasm flavour and
# an optional output file.  An argument shaped like "name.ext" is taken
# to be the output file; a first argument of any other shape is the
# flavour, and following arguments are skipped until one looks like a
# file name.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl next to this script, then in ../../perlasm/.
    # $dir is empty when $0 carries no directory component.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed from here on is piped through the translator.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
} elsif ($output) {
    # No translation requested: write straight to the output file.
    # Without an output file STDOUT is deliberately left untouched.
    open STDOUT,'>',$output
	or die "can't open $output: $!";
}

# Register aliases used by the integer-only code path.
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
#
# After the prologue below has pushed {r0,r2}, pushed {r4-r12,lr} and
# reserved 4*num+4 bytes for tp[], the words above $num are:
#	+1*4		tp[num]
#	+2*4..+11*4	saved r4-r12,lr
#	+12*4		r0 as pushed on entry (rp)
#	+13*4		r2 as pushed on entry (bp)
#	+14*4		caller's stack word 0 (pointer to n0, later replaced
#			by the n0 value itself)
#	+15*4		caller's stack word 1 (num, later replaced by the
#			end-of-bp marker used to terminate the outer loop)
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;

# Integer-only Montgomery multiplication.  The emitted bn_mul_mont
#   - branches to bn_mul8x_mont_neon when num is a multiple of 8 and bit 0
#     of OPENSSL_armcap_P is set (only if __ARM_MAX_ARCH__>=7),
#   - returns 0 in r0 without doing any work when num < 2,
#   - otherwise runs the .L1st/.Louter/.Linner loops, performs the final
#     conditional subtraction (.Lsub/.Lcopy), overwrites tp[] and
#     returns 1 in r0.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
	ldr	r0,[r0]
#endif
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
# Map a NEON quad register name "qN" to the name of its low (d2N) or
# high (d2N+1) double-word half; any other input yields "".  Declared
# without a prototype because both subs consume one argument.
sub Dlo { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }

# Register aliases for the NEON code path.
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
# (the variable list of the following declaration is on the next line)
my
($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13)); 268d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($Bi,$Ni,$M0)=map("d$_",(28..31)); 269d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $zero=&Dlo($Z); 270d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $temp=&Dlo($Temp); 271d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 272d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); 273d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); 274d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 275d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___; 276e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley#if __ARM_MAX_ARCH__>=7 277e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley.arch armv7-a 278d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.fpu neon 279d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 280d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type bn_mul8x_mont_neon,%function 281d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align 5 282d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybn_mul8x_mont_neon: 283d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mov ip,sp 284d9e397b599b13d642138480a28c14db7a136bf0Adam Langley stmdb sp!,{r4-r11} 285d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vstmdb sp!,{d8-d15} @ ABI specification says so 286d9e397b599b13d642138480a28c14db7a136bf0Adam Langley ldmia ip,{r4-r5} @ load rest of parameter block 287d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 288d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $toutptr,sp,#16 289d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {${Bi}[0]}, [$bptr,:32]! 290d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $toutptr,$toutptr,$num,lsl#4 291d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {$A0-$A3}, [$aptr]! 
@ can't specify :32 :-( 292d9e397b599b13d642138480a28c14db7a136bf0Adam Langley and $toutptr,$toutptr,#-64 293d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {${M0}[0]}, [$n0,:32] 294d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mov sp,$toutptr @ alloca 295d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor $zero,$zero,$zero 296d9e397b599b13d642138480a28c14db7a136bf0Adam Langley subs $inner,$num,#8 297d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 $Bi,$zero 298d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 299d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A0xB,$Bi,${A0}[0] 300d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A1xB,$Bi,${A0}[1] 301d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A2xB,$Bi,${A1}[0] 302d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshl.i64 $temp,`&Dhi("$A0xB")`,#16 303d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A3xB,$Bi,${A1}[1] 304d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 305d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 $temp,$temp,`&Dlo("$A0xB")` 306d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor $zero,$zero,$zero 307d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmul.u32 $Ni,$temp,$M0 308d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 309d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A4xB,$Bi,${A2}[0] 310d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {$N0-$N3}, [$nptr]! 
311d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A5xB,$Bi,${A2}[1] 312d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A6xB,$Bi,${A3}[0] 313d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 $Ni,$zero 314d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A7xB,$Bi,${A3}[1] 315d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 316d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bne .LNEON_1st 317d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 318d9e397b599b13d642138480a28c14db7a136bf0Adam Langley @ special case for num=8, everything is in register bank... 319d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 320d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 321d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $outer,$num,#1 322d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 323d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 324d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 325d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 326d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 327d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $Temp,$A0xB 328d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 329d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A0xB,$A1xB 330d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 331d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A1xB,$A2xB 332d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 333d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A2xB,$A3xB 334d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A3xB,$A4xB 335d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,$temp,#16 336d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A4xB,$A5xB 
337d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A5xB,$A6xB 338d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 $temp,$temp,`&Dhi("$Temp")` 339d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A6xB,$A7xB 340d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor $A7xB,$A7xB 341d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,$temp,#16 342d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 343d9e397b599b13d642138480a28c14db7a136bf0Adam Langley b .LNEON_outer8 344d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 345d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align 4 346d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_outer8: 347d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {${Bi}[0]}, [$bptr,:32]! 348d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor $zero,$zero,$zero 349d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 $Bi,$zero 350d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp 351d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 352d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A0xB,$Bi,${A0}[0] 353d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A1xB,$Bi,${A0}[1] 354d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A2xB,$Bi,${A1}[0] 355d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshl.i64 $temp,`&Dhi("$A0xB")`,#16 356d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A3xB,$Bi,${A1}[1] 357d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 358d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 $temp,$temp,`&Dlo("$A0xB")` 359d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor $zero,$zero,$zero 360d9e397b599b13d642138480a28c14db7a136bf0Adam Langley subs $outer,$outer,#1 361d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmul.u32 $Ni,$temp,$M0 362d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 
363d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A4xB,$Bi,${A2}[0] 364d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A5xB,$Bi,${A2}[1] 365d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A6xB,$Bi,${A3}[0] 366d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 $Ni,$zero 367d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A7xB,$Bi,${A3}[1] 368d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 369d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 370d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 371d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 372d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 373d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 374d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 375d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $Temp,$A0xB 376d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 377d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A0xB,$A1xB 378d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 379d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A1xB,$A2xB 380d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 381d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A2xB,$A3xB 382d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A3xB,$A4xB 383d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,$temp,#16 384d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A4xB,$A5xB 385d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A5xB,$A6xB 386d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 $temp,$temp,`&Dhi("$Temp")` 387d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmov $A6xB,$A7xB 388d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor 
$A7xB,$A7xB 389d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,$temp,#16 390d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 391d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bne .LNEON_outer8 392d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 393d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp 394d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mov $toutptr,sp 395d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dlo("$A0xB")`,#16 396d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mov $inner,$num 397d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp 398d9e397b599b13d642138480a28c14db7a136bf0Adam Langley add $tinptr,sp,#16 399d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dhi("$A0xB")`,#16 400d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` 401d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 402d9e397b599b13d642138480a28c14db7a136bf0Adam Langley b .LNEON_tail2 403d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 404d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align 4 405d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_1st: 406d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 407d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {$A0-$A3}, [$aptr]! 408d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 409d9e397b599b13d642138480a28c14db7a136bf0Adam Langley subs $inner,$inner,#8 410d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 411d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 412d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 413d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 414d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {$N0-$N1}, [$nptr]! 
415d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 416d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! 417d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 418d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 419d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! 420d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 421d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A0xB,$Bi,${A0}[0] 422d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {$N2-$N3}, [$nptr]! 423d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A1xB,$Bi,${A0}[1] 424d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! 425d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A2xB,$Bi,${A1}[0] 426d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A3xB,$Bi,${A1}[1] 427d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! 
428d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 429d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A4xB,$Bi,${A2}[0] 430d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A5xB,$Bi,${A2}[1] 431d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A6xB,$Bi,${A3}[0] 432d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmull.u32 $A7xB,$Bi,${A3}[1] 433d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 434d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bne .LNEON_1st 435d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 436d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 437d9e397b599b13d642138480a28c14db7a136bf0Adam Langley add $tinptr,sp,#16 438d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 439d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr 440d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 441d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$Temp}, [sp,:128] 442d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 443d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $outer,$num,#1 444d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 445d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 446d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! 447d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 448d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,$temp,#16 449d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A0xB}, [$tinptr, :128]! 450d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 451d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! 
452d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 453d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 454d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! 455d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 $temp,$temp,`&Dhi("$Temp")` 456d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor $Z,$Z,$Z 457d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! 458d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! 459d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$Z}, [$toutptr,:128] 460d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,$temp,#16 461d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 462d9e397b599b13d642138480a28c14db7a136bf0Adam Langley b .LNEON_outer 463d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 464d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align 4 465d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_outer: 466d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {${Bi}[0]}, [$bptr,:32]! 467d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr 468d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {$A0-$A3}, [$aptr]! 469d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor $zero,$zero,$zero 470d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mov $toutptr,sp 471d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 $Bi,$zero 472d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $inner,$num,#8 473d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp 474d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 475d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A0xB,$Bi,${A0}[0] 476d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A3xB-$A4xB},[$tinptr,:256]! 
477d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A1xB,$Bi,${A0}[1] 478d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A2xB,$Bi,${A1}[0] 479d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A5xB-$A6xB},[$tinptr,:256]! 480d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A3xB,$Bi,${A1}[1] 481d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 482d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshl.i64 $temp,`&Dhi("$A0xB")`,#16 483d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor $zero,$zero,$zero 484d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 $temp,$temp,`&Dlo("$A0xB")` 485d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A7xB},[$tinptr,:128]! 486d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmul.u32 $Ni,$temp,$M0 487d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 488d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A4xB,$Bi,${A2}[0] 489d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {$N0-$N3}, [$nptr]! 490d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A5xB,$Bi,${A2}[1] 491d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A6xB,$Bi,${A3}[0] 492d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 $Ni,$zero 493d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A7xB,$Bi,${A3}[1] 494d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 495d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_inner: 496d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 497d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {$A0-$A3}, [$aptr]! 
498d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 499d9e397b599b13d642138480a28c14db7a136bf0Adam Langley subs $inner,$inner,#8 500d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 501d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 502d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! 503d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 504d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 505d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A0xB}, [$tinptr, :128]! 506d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 507d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! 508d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 509d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! 510d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 511d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! 512d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 513d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A0xB,$Bi,${A0}[0] 514d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! 515d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A1xB,$Bi,${A0}[1] 516d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! 517d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A2xB,$Bi,${A1}[0] 518d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! 519d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A3xB,$Bi,${A1}[1] 520d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.32 {$N0-$N3}, [$nptr]! 
521d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 522d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A4xB,$Bi,${A2}[0] 523d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A7xB}, [$tinptr, :128]! 524d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A5xB,$Bi,${A2}[1] 525d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A6xB,$Bi,${A3}[0] 526d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A7xB,$Bi,${A3}[1] 527d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 528d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bne .LNEON_inner 529d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 530d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 531d9e397b599b13d642138480a28c14db7a136bf0Adam Langley add $tinptr,sp,#16 532d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 533d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr 534d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 535d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$Temp}, [sp,:128] 536d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 537d9e397b599b13d642138480a28c14db7a136bf0Adam Langley subs $outer,$outer,#1 538d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 539d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 540d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! 541d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 542d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A0xB}, [$tinptr, :128]! 543d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,$temp,#16 544d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! 
545d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 546d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! 547d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 548d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 549d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! 550d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 $temp,$temp,`&Dhi("$Temp")` 551d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! 552d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,$temp,#16 553d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 554d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bne .LNEON_outer 555d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 556d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mov $toutptr,sp 557d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mov $inner,$num 558d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 559d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_tail: 560d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp 561d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! 562d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dlo("$A0xB")`,#16 563d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp 564d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! 565d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dhi("$A0xB")`,#16 566d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A7xB}, [$tinptr, :128]! 
567d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` 568d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 569d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_tail2: 570d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp 571d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]! 572d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dlo("$A1xB")`,#16 573d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp 574d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dhi("$A1xB")`,#16 575d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")` 576d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 577d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp 578d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]! 579d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dlo("$A2xB")`,#16 580d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp 581d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dhi("$A2xB")`,#16 582d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")` 583d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 584d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp 585d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]! 
586d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dlo("$A3xB")`,#16 587d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp 588d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dhi("$A3xB")`,#16 589d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")` 590d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 591d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp 592d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]! 593d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dlo("$A4xB")`,#16 594d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp 595d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dhi("$A4xB")`,#16 596d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")` 597d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 598d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp 599d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]! 600d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dlo("$A5xB")`,#16 601d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp 602d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dhi("$A5xB")`,#16 603d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")` 604d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 605d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp 606d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]! 
607d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dlo("$A6xB")`,#16 608d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp 609d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A0xB}, [$tinptr, :128]! 610d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dhi("$A6xB")`,#16 611d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")` 612d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 613d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp 614d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]! 615d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dlo("$A7xB")`,#16 616d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp 617d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! 618d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vshr.u64 $temp,`&Dhi("$A7xB")`,#16 619d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")` 620d9e397b599b13d642138480a28c14db7a136bf0Adam Langley subs $inner,$inner,#8 621d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]! 
622d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 623d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bne .LNEON_tail 624d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 625d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit 626d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr 627d9e397b599b13d642138480a28c14db7a136bf0Adam Langley subs $aptr,sp,#0 @ clear carry flag 628d9e397b599b13d642138480a28c14db7a136bf0Adam Langley add $bptr,sp,$num,lsl#2 629d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 630d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_sub: 631d9e397b599b13d642138480a28c14db7a136bf0Adam Langley ldmia $aptr!, {r4-r7} 632d9e397b599b13d642138480a28c14db7a136bf0Adam Langley ldmia $nptr!, {r8-r11} 633d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sbcs r8, r4,r8 634d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sbcs r9, r5,r9 635d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sbcs r10,r6,r10 636d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sbcs r11,r7,r11 637d9e397b599b13d642138480a28c14db7a136bf0Adam Langley teq $aptr,$bptr @ preserves carry 638d9e397b599b13d642138480a28c14db7a136bf0Adam Langley stmia $rptr!, {r8-r11} 639d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bne .LNEON_sub 640d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 641d9e397b599b13d642138480a28c14db7a136bf0Adam Langley ldr r10, [$aptr] @ load top-most bit 642d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor q0,q0,q0 643d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub r11,$bptr,sp @ this is num*4 644d9e397b599b13d642138480a28c14db7a136bf0Adam Langley veor q1,q1,q1 645d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mov $aptr,sp 646d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $rptr,$rptr,r11 @ rewind $rptr 647d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mov $nptr,$bptr @ second 3/4th of frame 
648d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sbcs r10,r10,#0 @ result is carry flag 649d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 650d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LNEON_copy_n_zap: 651d9e397b599b13d642138480a28c14db7a136bf0Adam Langley ldmia $aptr!, {r4-r7} 652d9e397b599b13d642138480a28c14db7a136bf0Adam Langley ldmia $rptr, {r8-r11} 653d9e397b599b13d642138480a28c14db7a136bf0Adam Langley movcc r8, r4 654d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {q0-q1}, [$nptr,:256]! @ wipe 655d9e397b599b13d642138480a28c14db7a136bf0Adam Langley movcc r9, r5 656d9e397b599b13d642138480a28c14db7a136bf0Adam Langley movcc r10,r6 657d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {q0-q1}, [$nptr,:256]! @ wipe 658d9e397b599b13d642138480a28c14db7a136bf0Adam Langley movcc r11,r7 659d9e397b599b13d642138480a28c14db7a136bf0Adam Langley ldmia $aptr, {r4-r7} 660d9e397b599b13d642138480a28c14db7a136bf0Adam Langley stmia $rptr!, {r8-r11} 661d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub $aptr,$aptr,#16 662d9e397b599b13d642138480a28c14db7a136bf0Adam Langley ldmia $rptr, {r8-r11} 663d9e397b599b13d642138480a28c14db7a136bf0Adam Langley movcc r8, r4 664d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {q0-q1}, [$aptr,:256]! @ wipe 665d9e397b599b13d642138480a28c14db7a136bf0Adam Langley movcc r9, r5 666d9e397b599b13d642138480a28c14db7a136bf0Adam Langley movcc r10,r6 667d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vst1.64 {q0-q1}, [$nptr,:256]! 
@ wipe 668d9e397b599b13d642138480a28c14db7a136bf0Adam Langley movcc r11,r7 669d9e397b599b13d642138480a28c14db7a136bf0Adam Langley teq $aptr,$bptr @ preserves carry 670d9e397b599b13d642138480a28c14db7a136bf0Adam Langley stmia $rptr!, {r8-r11} 671d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bne .LNEON_copy_n_zap 672d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 673d9e397b599b13d642138480a28c14db7a136bf0Adam Langley sub sp,ip,#96 674d9e397b599b13d642138480a28c14db7a136bf0Adam Langley vldmia sp!,{d8-d15} 675d9e397b599b13d642138480a28c14db7a136bf0Adam Langley ldmia sp!,{r4-r11} 676e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley ret @ bx lr 677d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon 678d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#endif 679d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___ 680d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 681d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___; 682d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" 683d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align 2 684e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley#if __ARM_MAX_ARCH__>=7 685d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.comm OPENSSL_armcap_P,4,4 686e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley.hidden OPENSSL_armcap_P 687d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#endif 688d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___ 689d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 690d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code =~ s/\`([^\`]*)\`/eval $1/gem; 691d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 692e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley$code =~ s/\bret\b/bx lr/gm; 693d9e397b599b13d642138480a28c14db7a136bf0Adam 
Langleyprint $code; 694d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyclose STDOUT; 695