195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#!/usr/bin/env perl 295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# ==================================================================== 495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# project. The module is, however, dual licensed under OpenSSL and 695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# CRYPTOGAMS licenses depending on where you obtain it. For further 795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# details see http://www.openssl.org/~appro/cryptogams/. 895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# ==================================================================== 995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 1095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# January 2007. 1195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 1295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Montgomery multiplication for ARMv4. 1395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# 1495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Performance improvement naturally varies among CPU implementations 1595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# and compilers. The code was observed to provide +65-35% improvement 1695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# [depending on key length, less for longer keys] on ARM920T, and 1795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code 1895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# base and compiler generated code with in-lined umull and even umlal 1995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# instructions. The latter means that this code didn't really have an 2095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# "advantage" of utilizing some "secret" instruction. 
2195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# 2295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# The code is interoperable with Thumb ISA and is rather compact, less 2395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# than 1/2KB. A Windows CE port would be trivial, as it's exclusively 2495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# about decorations; the ABI and instruction syntax are identical. 2595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 2695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# November 2013 2795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# 2895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# Add NEON code path, which handles lengths divisible by 8. RSA/DSA 2995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# performance improvement on Cortex-A8 is ~45-100% depending on key 3095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# length, more for longer keys. On Cortex-A15 the span is ~10-105%. 3195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# On Snapdragon S4 improvement was measured to vary from ~70% to an 3295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is 3395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# rather because the original integer-only code seems to perform 3495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# suboptimally on S4. The situation on Cortex-A9 is unfortunately 3595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# different. It's being looked into, but the trouble is that 3695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# performance for vectors longer than 256 bits is actually a couple 3795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# of percent worse than for integer-only code. The code is chosen 3895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# for execution on all NEON-capable processors, because the gain on 3995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# others outweighs the marginal loss on Cortex-A9. 
4095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 4195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleywhile (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 4295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyopen STDOUT,">$output"; 4395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 4495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$num="r0"; # starts as num argument, but holds &tp[num-1] 4595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$ap="r1"; 4695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$bp="r2"; $bi="r2"; $rp="r2"; 4795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$np="r3"; 4895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$tp="r4"; 4995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$aj="r5"; 5095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$nj="r6"; 5195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$tj="r7"; 5295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$n0="r8"; 5395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer 5495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$alo="r10"; # sl, gcc uses it to keep @GOT 5595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$ahi="r11"; # fp 5695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$nlo="r12"; # ip 5795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley########### # r13 is stack pointer 5895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$nhi="r14"; # lr 5995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley########### # r15 is program counter 6095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 6195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#### argument block layout relative to &tp[num-1], a.k.a. 
$num 6295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$_rp="$num,#12*4"; 6395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# ap permanently resides in r1 6495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$_bp="$num,#13*4"; 6595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# np permanently resides in r3 6695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$_n0="$num,#14*4"; 6795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$_num="$num,#15*4"; $_bpend=$_num; 6895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 6995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code=<<___; 7095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#include "arm_arch.h" 7195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 7295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.text 7395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.code 32 7495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 7595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if __ARM_ARCH__>=7 7695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align 5 7795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LOPENSSL_armcap: 7895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.word OPENSSL_armcap_P-bn_mul_mont 7995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif 8095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 8195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.global bn_mul_mont 82eb7d2ed1fe8a33b3e3871502ba7e12efaf94360cAdam Langley.hidden bn_mul_mont 8395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.type bn_mul_mont,%function 8495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 8595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align 5 8695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleybn_mul_mont: 8795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr ip,[sp,#4] @ load num 8895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley stmdb sp!,{r0,r2} @ sp points at argument block 8995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if __ARM_ARCH__>=7 
9095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley tst ip,#7 9195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .Lialu 9295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adr r0,bn_mul_mont 9395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr r2,.LOPENSSL_armcap 9495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr r0,[r0,r2] 9595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley tst r0,#1 @ NEON available? 9695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia sp, {r0,r2} 9795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley beq .Lialu 9895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley add sp,sp,#8 9995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley b bn_mul8x_mont_neon 10095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align 4 10195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Lialu: 10295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif 10395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley cmp ip,#2 10495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $num,ip @ load num 10595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley movlt r0,#0 10695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley addlt sp,sp,#2*4 10795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley blt .Labrt 10895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 10995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley stmdb sp!,{r4-r12,lr} @ save 10 registers 11095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 11195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $num,$num,lsl#2 @ rescale $num for byte count 11295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub sp,sp,$num @ alloca(4*num) 11395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub sp,sp,#4 @ +extra dword 11495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $num,$num,#4 @ "num=num-1" 11595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley add $tp,$bp,$num @ &bp[num-1] 11695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 11795c29f3cd1f6c08c6c0927868683392eea727ccAdam 
Langley add $num,sp,$num @ $num to point at &tp[num-1] 11895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $n0,[$_n0] @ &n0 11995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $bi,[$bp] @ bp[0] 12095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $aj,[$ap],#4 @ ap[0],ap++ 12195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $nj,[$np],#4 @ np[0],np++ 12295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $n0,[$n0] @ *n0 12395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $tp,[$_bpend] @ save &bp[num] 12495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 12595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] 12695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $n0,[$_n0] @ save n0 value 12795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mul $n0,$alo,$n0 @ "tp[0]"*n0 12895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $nlo,#0 12995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" 13095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $tp,sp 13195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 13295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.L1st: 13395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $aj,[$ap],#4 @ ap[j],ap++ 13495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $alo,$ahi 13595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $nj,[$np],#4 @ np[j],np++ 13695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $ahi,#0 13795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] 13895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $nhi,#0 13995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 14095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adds $nlo,$nlo,$alo 14195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $nlo,[$tp],#4 @ tp[j-1]=,tp++ 14295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adc $nlo,$nhi,#0 
14395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley cmp $tp,$num 14495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .L1st 14595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 14695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adds $nlo,$nlo,$ahi 14795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $tp,[$_bp] @ restore bp 14895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $nhi,#0 14995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $n0,[$_n0] @ restore n0 15095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adc $nhi,$nhi,#0 15195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $nlo,[$num] @ tp[num-1]= 15295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $nhi,[$num,#4] @ tp[num]= 15395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 15495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Louter: 15595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $tj,$num,sp @ "original" $num-1 value 15695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $ap,$ap,$tj @ "rewind" ap to &ap[1] 15795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $bi,[$tp,#4]! 
@ *(++bp) 15895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $np,$np,$tj @ "rewind" np to &np[1] 15995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $aj,[$ap,#-4] @ ap[0] 16095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $alo,[sp] @ tp[0] 16195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $nj,[$np,#-4] @ np[0] 16295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $tj,[sp,#4] @ tp[1] 16395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 16495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $ahi,#0 16595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] 16695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $tp,[$_bp] @ save bp 16795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mul $n0,$alo,$n0 16895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $nlo,#0 16995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" 17095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $tp,sp 17195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 17295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Linner: 17395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $aj,[$ap],#4 @ ap[j],ap++ 17495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adds $alo,$ahi,$tj @ +=tp[j] 17595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $nj,[$np],#4 @ np[j],np++ 17695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $ahi,#0 17795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] 17895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $nhi,#0 17995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 18095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adc $ahi,$ahi,#0 18195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $tj,[$tp,#8] @ tp[j+1] 18295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adds $nlo,$nlo,$alo 18395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 
str $nlo,[$tp],#4 @ tp[j-1]=,tp++ 18495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adc $nlo,$nhi,#0 18595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley cmp $tp,$num 18695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .Linner 18795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 18895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adds $nlo,$nlo,$ahi 18995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $nhi,#0 19095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $tp,[$_bp] @ restore bp 19195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adc $nhi,$nhi,#0 19295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $n0,[$_n0] @ restore n0 19395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adds $nlo,$nlo,$tj 19495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $tj,[$_bpend] @ restore &bp[num] 19595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley adc $nhi,$nhi,#0 19695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $nlo,[$num] @ tp[num-1]= 19795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $nhi,[$num,#4] @ tp[num]= 19895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 19995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley cmp $tp,$tj 20095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .Louter 20195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 20295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $rp,[$_rp] @ pull rp 20395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley add $num,$num,#4 @ $num to point at &tp[num] 20495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $aj,$num,sp @ "original" num value 20595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $tp,sp @ "rewind" $tp 20695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $ap,$tp @ "borrow" $ap 20795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $np,$np,$aj @ "rewind" $np to &np[0] 20895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 20995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley subs $tj,$tj,$tj @ "clear" 
carry flag 21095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Lsub: ldr $tj,[$tp],#4 21195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr $nj,[$np],#4 21295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sbcs $tj,$tj,$nj @ tp[j]-np[j] 21395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $tj,[$rp],#4 @ rp[j]= 21495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley teq $tp,$num @ preserve carry 21595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .Lsub 21695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sbcs $nhi,$nhi,#0 @ upmost carry 21795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $tp,sp @ "rewind" $tp 21895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $rp,$rp,$aj @ "rewind" $rp 21995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 22095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley and $ap,$tp,$nhi 22195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bic $np,$rp,$nhi 22295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley orr $ap,$ap,$np @ ap=borrow?tp:rp 22395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 22495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh 22595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str sp,[$tp],#4 @ zap tp 22695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley str $tj,[$rp],#4 22795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley cmp $tp,$num 22895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .Lcopy 22995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 23095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley add sp,$num,#4 @ skip over tp[num+1] 23195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia sp!,{r4-r12,lr} @ restore registers 23295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley add sp,sp,#2*4 @ skip over {r0,r2} 23395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov r0,#1 23495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Labrt: tst lr,#1 23595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley moveq 
pc,lr @ be binary compatible with V4, yet 23695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bx lr @ interoperable with Thumb ISA:-) 23795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.size bn_mul_mont,.-bn_mul_mont 23895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___ 23995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ 24095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } 24195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } 24295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 24395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); 24495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); 24595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($Z,$Temp)=("q4","q5"); 24695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13)); 24795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($Bi,$Ni,$M0)=map("d$_",(28..31)); 24895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $zero=&Dlo($Z); 24995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $temp=&Dlo($Temp); 25095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 25195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); 25295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); 25395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 25495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___; 25595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if __ARM_ARCH__>=7 25695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.fpu neon 25795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 25895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.type bn_mul8x_mont_neon,%function 25995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align 5 
26095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleybn_mul8x_mont_neon: 26195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov ip,sp 26295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley stmdb sp!,{r4-r11} 26395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vstmdb sp!,{d8-d15} @ ABI specification says so 26495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia ip,{r4-r5} @ load rest of parameter block 26595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 26695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $toutptr,sp,#16 26795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {${Bi}[0]}, [$bptr,:32]! 26895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $toutptr,$toutptr,$num,lsl#4 26995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( 27095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley and $toutptr,$toutptr,#-64 27195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {${M0}[0]}, [$n0,:32] 27295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov sp,$toutptr @ alloca 27395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor $zero,$zero,$zero 27495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley subs $inner,$num,#8 27595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 $Bi,$zero 27695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 27795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A0xB,$Bi,${A0}[0] 27895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A1xB,$Bi,${A0}[1] 27995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A2xB,$Bi,${A1}[0] 28095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshl.i64 $temp,`&Dhi("$A0xB")`,#16 28195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A3xB,$Bi,${A1}[1] 28295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 28395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 $temp,$temp,`&Dlo("$A0xB")` 28495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor 
$zero,$zero,$zero 28595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmul.u32 $Ni,$temp,$M0 28695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 28795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A4xB,$Bi,${A2}[0] 28895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {$N0-$N3}, [$nptr]! 28995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A5xB,$Bi,${A2}[1] 29095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A6xB,$Bi,${A3}[0] 29195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 $Ni,$zero 29295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A7xB,$Bi,${A3}[1] 29395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 29495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .LNEON_1st 29595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 29695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley @ special case for num=8, everything is in register bank... 29795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 29895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 29995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $outer,$num,#1 30095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 30195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 30295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 30395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 30495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 30595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $Temp,$A0xB 30695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 30795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A0xB,$A1xB 30895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 30995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A1xB,$A2xB 31095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 
$A7xB,$Ni,${N3}[1] 31195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A2xB,$A3xB 31295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A3xB,$A4xB 31395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,$temp,#16 31495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A4xB,$A5xB 31595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A5xB,$A6xB 31695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 $temp,$temp,`&Dhi("$Temp")` 31795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A6xB,$A7xB 31895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor $A7xB,$A7xB 31995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,$temp,#16 32095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 32195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley b .LNEON_outer8 32295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 32395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align 4 32495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_outer8: 32595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {${Bi}[0]}, [$bptr,:32]! 
32695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor $zero,$zero,$zero 32795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 $Bi,$zero 32895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp 32995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 33095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A0xB,$Bi,${A0}[0] 33195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A1xB,$Bi,${A0}[1] 33295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A2xB,$Bi,${A1}[0] 33395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshl.i64 $temp,`&Dhi("$A0xB")`,#16 33495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A3xB,$Bi,${A1}[1] 33595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 33695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 $temp,$temp,`&Dlo("$A0xB")` 33795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor $zero,$zero,$zero 33895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley subs $outer,$outer,#1 33995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmul.u32 $Ni,$temp,$M0 34095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 34195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A4xB,$Bi,${A2}[0] 34295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A5xB,$Bi,${A2}[1] 34395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A6xB,$Bi,${A3}[0] 34495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 $Ni,$zero 34595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A7xB,$Bi,${A3}[1] 34695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 34795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 34895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 34995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 35095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 
35195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 35295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 35395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $Temp,$A0xB 35495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 35595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A0xB,$A1xB 35695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 35795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A1xB,$A2xB 35895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 35995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A2xB,$A3xB 36095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A3xB,$A4xB 36195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,$temp,#16 36295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A4xB,$A5xB 36395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A5xB,$A6xB 36495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 $temp,$temp,`&Dhi("$Temp")` 36595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmov $A6xB,$A7xB 36695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor $A7xB,$A7xB 36795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,$temp,#16 36895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 36995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .LNEON_outer8 37095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 37195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp 37295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $toutptr,sp 37395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dlo("$A0xB")`,#16 37495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $inner,$num 37595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp 37695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley add $tinptr,sp,#16 
37795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dhi("$A0xB")`,#16 37895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` 37995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 38095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley b .LNEON_tail2 38195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 38295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align 4 38395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_1st: 38495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 38595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {$A0-$A3}, [$aptr]! 38695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 38795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley subs $inner,$inner,#8 38895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 38995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 39095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 39195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 39295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {$N0-$N1}, [$nptr]! 39395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 39495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! 39595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 39695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 39795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! 39895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 39995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A0xB,$Bi,${A0}[0] 40095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {$N2-$N3}, [$nptr]! 
40195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A1xB,$Bi,${A0}[1] 40295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! 40395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A2xB,$Bi,${A1}[0] 40495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A3xB,$Bi,${A1}[1] 40595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! 40695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 40795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A4xB,$Bi,${A2}[0] 40895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A5xB,$Bi,${A2}[1] 40995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A6xB,$Bi,${A3}[0] 41095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmull.u32 $A7xB,$Bi,${A3}[1] 41195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 41295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .LNEON_1st 41395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 41495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 41595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley add $tinptr,sp,#16 41695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 41795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr 41895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 41995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$Temp}, [sp,:128] 42095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 42195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $outer,$num,#1 42295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 42395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 42495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! 
42595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 42695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,$temp,#16 42795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A0xB}, [$tinptr, :128]! 42895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 42995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! 43095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 43195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 43295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! 43395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 $temp,$temp,`&Dhi("$Temp")` 43495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor $Z,$Z,$Z 43595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! 43695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! 43795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$Z}, [$toutptr,:128] 43895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,$temp,#16 43995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 44095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley b .LNEON_outer 44195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 44295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align 4 44395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_outer: 44495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {${Bi}[0]}, [$bptr,:32]! 44595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr 44695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {$A0-$A3}, [$aptr]! 
44795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor $zero,$zero,$zero 44895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $toutptr,sp 44995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 $Bi,$zero 45095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $inner,$num,#8 45195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp 45295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 45395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A0xB,$Bi,${A0}[0] 45495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A3xB-$A4xB},[$tinptr,:256]! 45595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A1xB,$Bi,${A0}[1] 45695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A2xB,$Bi,${A1}[0] 45795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A5xB-$A6xB},[$tinptr,:256]! 45895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A3xB,$Bi,${A1}[1] 45995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 46095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshl.i64 $temp,`&Dhi("$A0xB")`,#16 46195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor $zero,$zero,$zero 46295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 $temp,$temp,`&Dlo("$A0xB")` 46395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A7xB},[$tinptr,:128]! 46495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmul.u32 $Ni,$temp,$M0 46595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 46695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A4xB,$Bi,${A2}[0] 46795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {$N0-$N3}, [$nptr]! 
46895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A5xB,$Bi,${A2}[1] 46995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A6xB,$Bi,${A3}[0] 47095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 $Ni,$zero 47195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A7xB,$Bi,${A3}[1] 47295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 47395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_inner: 47495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 47595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {$A0-$A3}, [$aptr]! 47695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 47795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley subs $inner,$inner,#8 47895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 47995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A3xB,$Ni,${N1}[1] 48095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! 48195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 48295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 48395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A0xB}, [$tinptr, :128]! 48495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 48595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! 48695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 48795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! 48895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 48995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! 
49095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 49195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A0xB,$Bi,${A0}[0] 49295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! 49395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A1xB,$Bi,${A0}[1] 49495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! 49595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A2xB,$Bi,${A1}[0] 49695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! 49795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A3xB,$Bi,${A1}[1] 49895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.32 {$N0-$N3}, [$nptr]! 49995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 50095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A4xB,$Bi,${A2}[0] 50195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A7xB}, [$tinptr, :128]! 50295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A5xB,$Bi,${A2}[1] 50395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A6xB,$Bi,${A3}[0] 50495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A7xB,$Bi,${A3}[1] 50595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 50695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .LNEON_inner 50795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 50895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A0xB,$Ni,${N0}[0] 50995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley add $tinptr,sp,#16 51095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A1xB,$Ni,${N0}[1] 51195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr 51295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A2xB,$Ni,${N1}[0] 51395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$Temp}, [sp,:128] 51495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 
$A3xB,$Ni,${N1}[1] 51595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley subs $outer,$outer,#1 51695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 51795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A4xB,$Ni,${N2}[0] 51895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! 51995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A5xB,$Ni,${N2}[1] 52095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A0xB}, [$tinptr, :128]! 52195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,$temp,#16 52295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! 52395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A6xB,$Ni,${N3}[0] 52495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! 52595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vmlal.u32 $A7xB,$Ni,${N3}[1] 52695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 52795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! 52895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 $temp,$temp,`&Dhi("$Temp")` 52995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! 
53095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,$temp,#16 53195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 53295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .LNEON_outer 53395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 53495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $toutptr,sp 53595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $inner,$num 53695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 53795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_tail: 53895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp 53995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! 54095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dlo("$A0xB")`,#16 54195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp 54295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! 54395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dhi("$A0xB")`,#16 54495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A7xB}, [$tinptr, :128]! 54595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` 54695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 54795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_tail2: 54895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp 54995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]! 
55095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dlo("$A1xB")`,#16 55195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp 55295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dhi("$A1xB")`,#16 55395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")` 55495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 55595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp 55695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]! 55795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dlo("$A2xB")`,#16 55895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp 55995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dhi("$A2xB")`,#16 56095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")` 56195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 56295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp 56395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]! 56495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dlo("$A3xB")`,#16 56595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp 56695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dhi("$A3xB")`,#16 56795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")` 56895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 56995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp 57095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]! 
57195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dlo("$A4xB")`,#16 57295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp 57395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dhi("$A4xB")`,#16 57495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")` 57595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 57695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp 57795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]! 57895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dlo("$A5xB")`,#16 57995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp 58095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dhi("$A5xB")`,#16 58195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")` 58295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 58395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp 58495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]! 58595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dlo("$A6xB")`,#16 58695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp 58795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A0xB}, [$tinptr, :128]! 58895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dhi("$A6xB")`,#16 58995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")` 59095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 59195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp 59295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]! 
59395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dlo("$A7xB")`,#16 59495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp 59595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! 59695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vshr.u64 $temp,`&Dhi("$A7xB")`,#16 59795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")` 59895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley subs $inner,$inner,#8 59995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]! 60095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 60195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .LNEON_tail 60295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 60395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit 60495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr 60595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley subs $aptr,sp,#0 @ clear carry flag 60695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley add $bptr,sp,$num,lsl#2 60795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 60895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_sub: 60995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia $aptr!, {r4-r7} 61095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia $nptr!, {r8-r11} 61195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sbcs r8, r4,r8 61295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sbcs r9, r5,r9 61395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sbcs r10,r6,r10 61495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sbcs r11,r7,r11 61595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley teq $aptr,$bptr @ preserves carry 61695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley stmia $rptr!, {r8-r11} 61795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne 
.LNEON_sub 61895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 61995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldr r10, [$aptr] @ load top-most bit 62095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor q0,q0,q0 62195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub r11,$bptr,sp @ this is num*4 62295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley veor q1,q1,q1 62395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $aptr,sp 62495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $rptr,$rptr,r11 @ rewind $rptr 62595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley mov $nptr,$bptr @ second 3/4th of frame 62695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sbcs r10,r10,#0 @ result is carry flag 62795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 62895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.LNEON_copy_n_zap: 62995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia $aptr!, {r4-r7} 63095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia $rptr, {r8-r11} 63195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley movcc r8, r4 63295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {q0-q1}, [$nptr,:256]! @ wipe 63395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley movcc r9, r5 63495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley movcc r10,r6 63595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {q0-q1}, [$nptr,:256]! @ wipe 63695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley movcc r11,r7 63795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia $aptr, {r4-r7} 63895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley stmia $rptr!, {r8-r11} 63995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub $aptr,$aptr,#16 64095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia $rptr, {r8-r11} 64195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley movcc r8, r4 64295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {q0-q1}, [$aptr,:256]! 
@ wipe 64395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley movcc r9, r5 64495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley movcc r10,r6 64595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vst1.64 {q0-q1}, [$nptr,:256]! @ wipe 64695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley movcc r11,r7 64795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley teq $aptr,$bptr @ preserves carry 64895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley stmia $rptr!, {r8-r11} 64995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bne .LNEON_copy_n_zap 65095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 65195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley sub sp,ip,#96 65295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley vldmia sp!,{d8-d15} 65395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley ldmia sp!,{r4-r11} 65495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley bx lr 65595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon 65695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif 65795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___ 65895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley} 65995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___; 66095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" 66195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align 2 66295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#if __ARM_ARCH__>=7 66395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.comm OPENSSL_armcap_P,4,4 66495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#endif 66595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___ 66695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley 66795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code =~ s/\`([^\`]*)\`/eval $1/gem; 66895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 
# Write out the fully post-processed assembly text accumulated in $code.
print $code;

# Closing STDOUT flushes any buffered output; a failure here (e.g. disk full
# or an I/O error) would otherwise go unnoticed and leave a truncated result
# while the script still exits successfully.  Abort with the OS error instead.
# NOTE(review): STDOUT is presumably redirected to the output file earlier in
# the script -- confirm against the part of the file not shown here.
close STDOUT or die "error closing STDOUT: $!";