// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
5efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
6efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
7efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen#include "textflag.h"
8efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
11efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// func mulWW(x, y Word) (z1, z0 Word)
// Full unsigned 64x64 -> 128-bit product of x and y:
// z1 receives the upper word, z0 the lower word.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   y+8(FP), R4
	MOVD   x+0(FP), R3
	MULLD  R3, R4, R5    // R5 = low 64 bits of x*y
	MULHDU R3, R4, R6    // R6 = high 64 bits of x*y (unsigned)
	MOVD   R5, z0+24(FP)
	MOVD   R6, z1+16(FP)
	RET
21efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// addVV has no hand-written ppc64 body here: control transfers straight
// to addVV_g (presumably the portable Go version from arith.go), which
// sees the caller's arguments and returns directly to the caller.
TEXT ·addVV(SB), NOSPLIT, $0
	JMP ·addVV_g(SB)
24efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] - borrow for every i in [0, len(z)); c is the final
// borrow (0 or 1).
//
// The borrow is kept in XER[CA] across the whole loop. PowerPC subtraction
// uses the inverted sense: CA = 1 means "no borrow". Only SUBE may change CA
// inside the loop; the MOVD, ADD-immediate, CMP and branch instructions used
// for loop control leave it untouched, so no save/restore is needed.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD z+0(FP), R10     // R10 = &z[0]
	MOVD z_len+8(FP), R7  // R7  = len(z)
	MOVD x+24(FP), R8     // R8  = &x[0]
	MOVD y+48(FP), R9     // R9  = &y[0]

	MOVD $0, R4           // c accumulator, filled in from CA after the loop
	MOVD $0, R5           // R5 = byte offset of element i
	SLD  $3, R7, R7       // R7 = len(z)*8, the end offset

	SUBC R0, R0           // 0 - 0: leaves R0 = 0 and sets CA = 1 (no borrow pending)
	JMP  sublend

subloop:
	MOVD (R8)(R5), R11    // x[i]
	MOVD (R9)(R5), R12    // y[i]

	SUBE R12, R11, R15    // R15 = x[i] - y[i] - (1 - CA); CA = 0 on borrow
	MOVD R15, (R10)(R5)   // z[i]

	ADD  $8, R5           // next element; does not touch CA

sublend:
	CMP R5, R7
	BLT subloop

	ADDZE R4              // R4 = 0 + CA
	XOR   $1, R4          // c = 1 - CA: convert "no borrow" flag into a borrow
	MOVD  R4, c+72(FP)
	RET
61efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// addVW has no hand-written ppc64 body here: control transfers straight
// to addVW_g (presumably the portable Go version from arith.go).
TEXT ·addVW(SB), NOSPLIT, $0
	JMP ·addVW_g(SB)
64efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// subVW has no hand-written ppc64 body here: control transfers straight
// to subVW_g (presumably the portable Go version from arith.go).
TEXT ·subVW(SB), NOSPLIT, $0
	JMP ·subVW_g(SB)
67efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// shlVU has no hand-written ppc64 body here: control transfers straight
// to shlVU_g (presumably the portable Go version from arith.go).
TEXT ·shlVU(SB), NOSPLIT, $0
	JMP ·shlVU_g(SB)
70efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// shrVU has no hand-written ppc64 body here: control transfers straight
// to shrVU_g (presumably the portable Go version from arith.go).
TEXT ·shrVU(SB), NOSPLIT, $0
	JMP ·shrVU_g(SB)
73efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For every i in [0, len(z)): z[i] = low word of (x[i]*y + c), and c becomes
// the high word of that sum. c starts out as r; the final c is returned.
//
// Elements are addressed through a running byte offset (i*8) rather than
// rescaling the index with a multiply on every iteration.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10      // R10 = &z[0]
	MOVD z_len+8(FP), R11  // R11 = len(z)
	MOVD x+24(FP), R8      // R8  = &x[0]
	MOVD y+48(FP), R9      // R9  = y
	MOVD r+56(FP), R4      // R4  = c, seeded with r
	MOVD $0, R5            // R5  = byte offset of element i
	SLD  $3, R11, R11      // R11 = len(z)*8, the end offset

	JMP e5

l5:
	MOVD   (R8)(R5), R20   // x[i]
	MULLD  R9, R20, R6     // R6 = low word of x[i]*y
	MULHDU R9, R20, R7     // R7 = high word of x[i]*y
	ADDC   R4, R6          // low += c, carry-out in CA
	ADDZE  R7              // high += CA
	MOVD   R6, (R10)(R5)   // z[i] = low
	MOVD   R7, R4          // c = high
	ADD    $8, R5          // next element

e5:
	CMP R5, R11
	BLT l5

	MOVD R4, c+64(FP)
	RET
104efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// func addMulVVW(z, x []Word, y Word) (c Word)
// For every i in [0, len(z)): z[i] = low word of (x[i]*y + z[i] + c), and c
// becomes the high word of that sum. c starts at 0; the final c is returned.
//
// The main loop handles two words per iteration; a one-word tail loop picks
// up the last element when len(z) is odd. Elements are addressed through a
// running byte offset (i*8) so no per-iteration multiply is needed.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10       // R10 = &z[0]
	MOVD z_len+8(FP), R22   // R22 = len(z)
	MOVD x+24(FP), R8       // R8  = &x[0]
	MOVD y+48(FP), R9       // R9  = y

	MOVD $0, R4             // c = 0
	MOVD $0, R5             // R5  = byte offset of element i
	SLD  $3, R22, R22       // R22 = len(z)*8, the end offset
	MOVD $-16, R23
	AND  R22, R23           // R23 = (len(z) &^ 1)*8, end offset of the paired part
	CMP  R5, R23
	BGE  end                // fewer than two words: tail loop only

unrolled:
	// All four loads are issued before either store, as z and x may overlap.
	ADD   $8, R5, R25       // R25 = byte offset of element i+1
	MOVD  (R10)(R5), R11    // R11 = z[i]
	MOVD  (R8)(R5), R16     // R16 = x[i]
	MOVD  (R10)(R25), R17   // R17 = z[i+1]
	MOVD  (R8)(R25), R18    // R18 = x[i+1]

	MULLD  R9, R16, R12     // R12 = low  word of x[i]*y
	MULHDU R9, R16, R14     // R14 = high word of x[i]*y
	MULLD  R9, R18, R6      // R6  = low  word of x[i+1]*y
	MULHDU R9, R18, R7      // R7  = high word of x[i+1]*y
	ADDC   R4, R12          // low += c
	ADDZE  R14              // high += CA
	ADDC   R11, R12         // low += z[i]
	ADDZE  R14              // high += CA; R14 is the carry into word i+1
	MOVD   R12, (R10)(R5)   // z[i]
	ADDC   R14, R6          // low' += carry from word i
	ADDZE  R7
	ADDC   R17, R6          // low' += z[i+1]
	ADDZE  R7
	MOVD   R6, (R10)(R25)   // z[i+1]
	MOVD   R7, R4           // c = high'

	ADD $16, R5             // advance two elements
	CMP R5, R23
	BLT unrolled
	JMP end

loop:
	MOVD   (R10)(R5), R11   // z[i]
	MOVD   (R8)(R5), R16    // x[i]
	MULLD  R9, R16, R12     // low  word of x[i]*y
	MULHDU R9, R16, R14     // high word of x[i]*y
	ADDC   R4, R12          // low += c
	ADDZE  R14
	ADDC   R11, R12         // low += z[i]
	ADDZE  R14
	MOVD   R12, (R10)(R5)   // z[i]
	MOVD   R14, R4          // c = high

	ADD $8, R5              // next element

end:
	CMP R5, R22
	BLT loop

	MOVD R4, c+56(FP)
	RET
175efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// divWVW has no hand-written ppc64 body here: control transfers straight
// to divWVW_g (presumably the portable Go version from arith.go).
TEXT ·divWVW(SB), NOSPLIT, $0
	JMP ·divWVW_g(SB)
178efea46b87b2dcc66da02dfbf66fd901d24c7a09Dan Willemsen
// func bitLen(x Word) int
// Result is 64 minus the number of leading zero bits in x, i.e. the
// position of the highest set bit counted from 1.
TEXT ·bitLen(SB), NOSPLIT, $0
	MOVD   x+0(FP), R3
	CNTLZD R3, R4        // R4 = count of leading zero bits in x
	MOVD   $64, R3
	SUB    R4, R3, R3    // R3 = 64 - R4
	MOVD   R3, n+8(FP)
	RET
187