// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// func mulWW(x, y Word) (z1, z0 Word)
//
// Full 64x64 -> 128 bit unsigned multiply.
// z1 receives the high 64 bits of x*y, z0 the low 64 bits.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   x+0(FP), R4
	MOVD   y+8(FP), R5
	MULHDU R4, R5, R6  // R6 = high 64 bits of the unsigned product
	MULLD  R4, R5, R7  // R7 = low 64 bits of the product
	MOVD   R6, z1+16(FP)
	MOVD   R7, z0+24(FP)
	RET

// addVV has no assembly body in this file: it branches straight to
// addVV_g (presumably the portable Go implementation in arith.go),
// which reads the arguments from the caller's frame.
TEXT ·addVV(SB), NOSPLIT, $0
	BR ·addVV_g(SB)

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
//
// The loop runs z_len times; the final borrow (0 or 1) is returned in c.
// Register use:
//   R7  = len(z)      R8  = &x[0]     R9  = &y[0]     R10 = &z[0]
//   R5  = i           R6  = i*8 (byte offset)
//   R29 = constant 1  R28 = constant 8
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R7
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R10

	MOVD $0, R4  // c = 0
	MOVD $0, R5  // i = 0
	MOVD $1, R29 // constant 1: loop increment, also used by the final XOR
	MOVD $8, R28 // constant 8: word size, used to scale i into a byte offset

	// 0 - 0 produces a carry-out, so this SETS CA to 1. For the
	// subtract-extended instruction CA=1 means "no borrow pending".
	SUBC R0, R0
	JMP  sublend

// amd64 saves and restores CF, but I believe they only have to do that because all of
// their math operations clobber it - we should just be able to recover it at the end.
// (Nothing between one SUBE and the next below is a carrying operation.)
subloop:
	MULLD R5, R28, R6   // R6 = i*8
	MOVD  (R8)(R6), R11 // x[i]
	MOVD  (R9)(R6), R12 // y[i]

	SUBE R12, R11, R15  // R15 = x[i] - y[i] - borrow; CA updated
	MOVD R15, (R10)(R6) // z[i] = R15

	ADD R29, R5 // i++

sublend:
	CMP R5, R7
	BLT subloop // while i < len(z)

	ADDZE R4          // R4 = 0 + CA
	XOR   R29, R4     // c = CA ^ 1: CA is the inverted borrow
	MOVD  R4, c+72(FP)
	RET

// The following entry points have no assembly body in this file: each one
// branches straight to its *_g counterpart (presumably the portable Go
// implementation in arith.go), which reads the arguments from the caller's
// frame.
TEXT ·addVW(SB), NOSPLIT, $0
	BR ·addVW_g(SB)

TEXT ·subVW(SB), NOSPLIT, $0
	BR ·subVW_g(SB)

TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// For each i in [0, len(z)): (c, z[i]) = x[i]*y + c, starting with c = r.
// The final carry word is returned in c.
// Register use:
//   R10 = &z[0]   R8 = &x[0]   R9 = y   R4 = c   R11 = len(z)
//   R3  = i       R5 = i*8     R18 = constant 8   R19 = constant 1
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4     // c = r
	MOVD z_len+8(FP), R11
	MOVD $0, R3           // i = 0
	MOVD $8, R18
	MOVD $1, R19

	JMP e5

l5:
	MULLD  R18, R3, R5   // R5 = i*8
	MOVD   (R8)(R5), R20 // R20 = x[i]
	MULLD  R9, R20, R6   // R6 = low 64 bits of x[i]*y
	MULHDU R9, R20, R7   // R7 = high 64 bits of x[i]*y
	ADDC   R4, R6        // low += c, carry-out in CA
	ADDZE  R7            // high += CA
	MOVD   R6, (R10)(R5) // z[i] = low
	MOVD   R7, R4        // c = high
	ADD    R19, R3       // i++

e5:
	CMP R3, R11
	BLT l5 // while i < len(z)

	MOVD R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
//
// For each i in [0, len(z)): (c, z[i]) = x[i]*y + z[i] + c, starting with
// c = 0. The final carry word is returned in c.
//
// Elements are processed two at a time in the "unrolled" loop for the even
// prefix of the vector (len(z) with its lowest bit cleared); the "loop"
// section then handles whatever is left, one element per iteration.
// Register use:
//   R10 = &z[0]   R8 = &x[0]   R9 = y   R22 = len(z)   R23 = len(z) &^ 1
//   R5  = i       R4 = c       R19 = i*8   R25 = (i+1)*8
//   R28 = constant 8   R24 = constant 2
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R22

	MOVD $0, R5   // i = 0
	MOVD $0, R4   // c = 0
	MOVD $8, R28
	MOVD $-2, R23
	AND  R22, R23 // mask the last bit of z.len
	MOVD $2, R24
	CMP  R23, R24
	BGE  unrolled // at least one full pair to process
	JMP  end

unrolled:
	MOVD  $8, R19        // no (RA)(RB*8) on power
	MULLD R5, R19        // R19 = i*8
	MOVD  (R10)(R19), R11 // R11 = z[i]
	MOVD  (R8)(R19), R16  // R16 = x[i]
	ADD   R28, R19, R25   // R25 = (i+1)*8
	MOVD  (R10)(R25), R17 // R17 = z[i+1]
	MOVD  (R8)(R25), R18  // R18 = x[i+1]

	MULLD  R9, R16, R12 // R12 = low  64 bits of x[i]*y
	MULHDU R9, R16, R14 // R14 = high 64 bits of x[i]*y
	MULLD  R9, R18, R6  // R6  = low  64 bits of x[i+1]*y
	MULHDU R9, R18, R7  // R7  = high 64 bits of x[i+1]*y
	ADDC   R4, R12      // low(i) += incoming carry word
	ADDZE  R14
	ADDC   R11, R12     // z[i] = (x[i]*y) + z[i] + carry
	ADDZE  R14          // carry = high order bits + add carry
	MOVD   R12, (R10)(R19)
	ADDC   R14, R6      // low(i+1) += carry word out of element i
	ADDZE  R7
	ADDC   R17, R6      // low(i+1) += z[i+1]
	ADDZE  R7
	MOVD   R6, (R10)(R25) // z[i+1] = low(i+1)
	MOVD   R7, R4         // c = high(i+1)

	ADD R24, R5 // i += 2
	CMP R5, R23
	BLT unrolled
	JMP end

loop:
	MOVD   $8, R19
	MULLD  R5, R19         // R19 = i*8
	MOVD   (R10)(R19), R11 // R11 = z[i]
	MOVD   (R8)(R19), R16  // R16 = x[i]
	MULLD  R9, R16, R12    // low  64 bits of x[i]*y
	MULHDU R9, R16, R14    // high 64 bits of x[i]*y
	ADDC   R4, R12         // low += incoming carry word
	ADDZE  R14
	ADDC   R11, R12        // low += z[i]
	ADDZE  R14
	MOVD   R12, (R10)(R19) // z[i] = low
	MOVD   R14, R4         // c = high

	MOVD $1, R15
	ADD  R15, R5 // i++

end:
	CMP R5, R22
	BLT loop // finish any element not covered by the unrolled loop

	MOVD R4, c+56(FP)
	RET

// divWVW has no assembly body in this file: it branches straight to
// divWVW_g (presumably the portable Go implementation in arith.go).
TEXT ·divWVW(SB), NOSPLIT, $0
	BR ·divWVW_g(SB)

// func bitLen(x Word) int
//
// Stores 64 minus the number of leading zero bits of x into the result
// slot n+8(FP).
TEXT ·bitLen(SB), NOSPLIT, $0
	MOVD   x+0(FP), R4
	CNTLZD R4, R4     // R4 = count of leading zero bits in x
	MOVD   $64, R5
	SUB    R4, R5     // R5 = 64 - R4
	MOVD   R5, n+8(FP)
	RET