// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
// +build !math_big_pure_go,s390x
6
#include "textflag.h"
8
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
11
// hasVectorFacility writes a single byte result to ret+0(FP): 1 if the
// vector facility is reported as installed and a vector register can be
// written and read back, 0 otherwise.
// (Presumably declared in Go as func hasVectorFacility() bool -- confirm
// against the Go-side declaration.)
//
// The 24-byte frame is used as the STFLE result buffer. STFLE stores R0+1
// doublewords, so R0 = 2 requests 3 doublewords (facility bits 0-191).
// The byte at z-8(SP) is offset 16 of that buffer, i.e. facility bits
// 128-135; mask 0x40 selects facility bit 129.
TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
	MOVD	$x-24(SP), R1
	XC	$24, 0(R1), 0(R1) // clear the storage
	MOVD	$2, R0            // R0 is the number of double words stored -1
	WORD	$0xB2B01000       // STFLE 0(R1)
	XOR	R0, R0            // reset the value of R0
	MOVBZ	z-8(SP), R1       // third doubleword, first byte: facility bits 128-135
	AND	$0x40, R1         // isolate facility bit 129
	BEQ	novector
vectorinstalled:
	// check if the vector instruction has been enabled:
	// insert 0xF into byte element 0 of V16 and read it back.
	VLEIB	$0, $0xF, V16
	VLGVB	$0, V16, R1
	CMPBNE	R1, $0xF, novector
	MOVB	$1, ret+0(FP) // have vx
	RET
novector:
	MOVB	$0, ret+0(FP) // no vx
	RET
31
// func mulWW(x, y Word) (z1, z0 Word)
//
// Multiplies x by y and stores two result words: z1 at 16(FP) and z0 at
// 24(FP).
//
// NOTE(review): the results are read from R10 and R11, which are not named
// as operands of MULHDU. This relies on the assembler's expansion of MULHDU
// leaving the high word in R10 and the low word in R11 -- confirm against
// the toolchain version this file is built with.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVD	x+0(FP), R3
	MOVD	y+8(FP), R4
	MULHDU	R3, R4
	MOVD	R10, z1+16(FP)
	MOVD	R11, z0+24(FP)
	RET
39
// func divWW(x1, x0, y Word) (q, r Word)
//
// Divides the two-word value x1:x0 by y using a hand-encoded DLGR.
// The dividend is placed in the register pair R10:R11 (x1 in R10, x0 in
// R11); after the divide the quotient is taken from R11 and the remainder
// from R10.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVD	x1+0(FP), R10
	MOVD	x0+8(FP), R11
	MOVD	y+16(FP), R5
	WORD	$0xb98700a5 // dlgr r10,r5
	MOVD	R11, q+24(FP)
	MOVD	R10, r+32(FP)
	RET
49
// Register mapping used by the addVV implementations (left-hand names look
// like the amd64 registers this code was ported from -- TODO confirm):
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
//
// func addVV(z, x, y []Word) (c Word)
//
// addVV is a trampoline: it loads the 8-byte code pointer stored in
// addvectorfacility and branches to it. It has no frame of its own, so the
// target sees the caller's arguments unchanged.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	addvectorfacility+0x00(SB),R1
	BR	(R1)
57
// addVV_check selects the addVV implementation. It reads the byte ·hasVX,
// stores the address of ·addVV_vec (hasVX == 1) or ·addVV_novec (otherwise)
// into addvectorfacility so that later calls through the trampoline skip
// this check, and then branches to the selected implementation.
TEXT ·addVV_check(SB),NOSPLIT, $0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
	MOVD	$addvectorfacility+0x00(SB), R1
	MOVD	$·addVV_novec(SB), R2
	MOVD	R2, 0(R1)
	//MOVD	$·addVV_novec(SB), 0(R1)
	BR	·addVV_novec(SB)
vectorimpl:
	MOVD	$addvectorfacility+0x00(SB), R1
	MOVD	$·addVV_vec(SB), R2
	MOVD	R2, 0(R1)
	//MOVD	$·addVV_vec(SB), 0(R1)
	BR	·addVV_vec(SB)
72
// addvectorfacility is an 8-byte slot holding the code address that ·addVV
// branches to. It initially holds the address of ·addVV_check.
GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
75
// func addVV_vec(z, x, y []Word) (c Word)
//
// Vector-facility implementation of addVV: z[i] = x[i] + y[i] with carry
// propagation over len(z) words; the final carry (0 or 1) is returned in c.
//
// Structure:
//   UU1  - 16 words per iteration using 128-bit vector add-with-carry
//   U1   - 4 words per iteration using ADDC/ADDE
//   L1   - 1 word per iteration
//
// Register use: R2 = z, R8 = x, R9 = y, R3 = remaining count (biased),
// R10 = byte offset i*8, R0 = 0. In the scalar loops R4 holds the saved
// carry as 0 or -1 ("ADDC R4, R4" recreates the carry flag from it); it is
// negated back to 0/1 at E1. R5/R6/R7 are running x/y/z pointers in the
// vector loop and scratch in the scalar loops.
TEXT ·addVV_vec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R2

	MOVD	$0, R4		// c = 0
	MOVD	$0, R0		// make sure it's zero
	MOVD	$0, R10		// i = 0


	// s/JL/JMP/ below to disable the unrolled loop
	SUB	$4, R3			// n -= 4
	BLT	v1			// if n < 0 goto v1
	SUB	$12, R3			// n -= 12 (R3 is now len - 16)
	BLT	A1			// if n < 0 goto A1

	MOVD	R8, R5
	MOVD	R9, R6
	MOVD	R2, R7
	// n >= 0
	// regular loop body unrolled 16x
	VZERO	V0			// c = 0
UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
	ADD	$64, R5
	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order


	VLM	0(R6), V9, V12		// 64-bytes into V9..V12
	ADD	$64, R6
	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order

	// Each pair: VACCCQ produces the carry out, VACQ the 128-bit sum,
	// both taking the previous carry vector as the third operand.
	VACCCQ	V1, V9, V0, V25
	VACQ	V1, V9, V0, V17
	VACCCQ	V2, V10, V25, V26
	VACQ	V2, V10, V25, V18


	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
	VLM	0(R6), V13, V14		// 32-bytes into V13..V14
	ADD	$32, R5
	ADD	$32, R6

	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order

	VACCCQ	V3, V11, V26, V27
	VACQ	V3, V11, V26, V19
	VACCCQ	V4, V12, V27, V28
	VACQ	V4, V12, V27, V20

	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
	VLM	0(R6), V15, V16		// 32-bytes into V15..V16
	ADD	$32, R5
	ADD	$32, R6

	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order

	VACCCQ	V5, V13, V28, V29
	VACQ	V5, V13, V28, V21
	VACCCQ	V6, V14, V29, V30
	VACQ	V6, V14, V29, V22

	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order

	VACCCQ	V7, V15, V30, V31
	VACQ	V7, V15, V30, V23
	VACCCQ	V8, V16, V31, V0	//V0 has carry-over
	VACQ	V8, V16, V31, V24

	// flip the result doublewords back to memory order
	VPDI	$0x4,V17,V17,V17
	VPDI	$0x4,V18,V18,V18
	VPDI	$0x4,V19,V19,V19
	VPDI	$0x4,V20,V20,V20
	VPDI	$0x4,V21,V21,V21
	VPDI	$0x4,V22,V22,V22
	VPDI	$0x4,V23,V23,V23
	VPDI	$0x4,V24,V24,V24
	VSTM	V17, V24, 0(R7)		// 128-bytes into z
	ADD	$128, R7
	ADD	$128, R10	// i += 16
	SUB	$16,  R3	// n -= 16
	BGE	UU1		// if n >= 0 goto UU1
	VLGVG	$1, V0, R4	// put cf into R4
	NEG	R4, R4		// save cf (as 0 / -1, the form the scalar loops expect)

A1:	ADD	$12, R3		// n += 12 (R3 is now remaining - 4)


	// s/JL/JMP/ below to disable the unrolled loop
	BLT	v1		// if n < 0 goto v1

U1:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R4		// restore CF
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	8(R9)(R10*1), R11
	ADDE	R11, R6
	MOVD	16(R9)(R10*1), R11
	ADDE	R11, R7
	MOVD	24(R9)(R10*1), R11
	ADDE	R11, R1
	MOVD	R0, R4
	ADDE	R4, R4		// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)


	ADD	$32, R10	// i += 4
	SUB	$4,  R3		// n -= 4
	BGE	U1		// if n >= 0 goto U1

v1:	ADD	$4, R3		// n += 4
	BLE	E1		// if n <= 0 goto E1

L1:	// n > 0
	ADDC	R4, R4		// restore CF
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	ADDE	R4, R4		// save CF
	NEG	R4, R4

	ADD	$8, R10		// i++
	SUB	$1, R3		// n--
	BGT	L1		// if n > 0 goto L1

E1:	NEG	R4, R4		// 0 / -1 -> 0 / 1
	MOVD	R4, c+72(FP)	// return c
	RET
226
// func addVV_novec(z, x, y []Word) (c Word)
//
// Scalar implementation of addVV: z[i] = x[i] + y[i] with carry propagation
// over len(z) words; the final carry (0 or 1) is returned in c.
//
// Register use: R2 = z, R8 = x, R9 = y, R3 = remaining count (biased by -4
// while in the unrolled loop), R10 = byte offset i*8, R0 = 0.
// Between iterations R4 holds the saved carry as 0 or -1, so that
// "ADDC R4, R4" recreates the carry flag; it is negated to 0/1 at E1n.
TEXT ·addVV_novec(SB),NOSPLIT,$0
novec:
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R2

	MOVD	$0, R4		// c = 0
	MOVD	$0, R0		// make sure it's zero
	MOVD	$0, R10		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB	$4, R3		// n -= 4
	BLT	v1n		// if n < 0 goto v1n
U1n:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R4		// restore CF
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	8(R9)(R10*1), R11
	ADDE	R11, R6
	MOVD	16(R9)(R10*1), R11
	ADDE	R11, R7
	MOVD	24(R9)(R10*1), R11
	ADDE	R11, R1
	MOVD	R0, R4
	ADDE	R4, R4		// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)


	ADD	$32, R10	// i += 4
	SUB	$4,  R3		// n -= 4
	BGE	U1n		// if n >= 0 goto U1n

v1n:	ADD	$4, R3		// n += 4
	BLE	E1n		// if n <= 0 goto E1n

L1n:	// n > 0
	ADDC	R4, R4		// restore CF
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	ADDE	R4, R4		// save CF
	NEG	R4, R4

	ADD	$8, R10		// i++
	SUB	$1, R3		// n--
	BGT	L1n		// if n > 0 goto L1n

E1n:	NEG	R4, R4		// 0 / -1 -> 0 / 1
	MOVD	R4, c+72(FP)	// return c
	RET
289
290
// func subVV(z, x, y []Word) (c Word)
//
// subVV is a trampoline: it loads the 8-byte code pointer stored in
// subvectorfacility and branches to it. It has no frame of its own, so the
// target sees the caller's arguments unchanged.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	subvectorfacility+0x00(SB),R1
	BR	(R1)
294
// subVV_check selects the subVV implementation. It reads the byte ·hasVX,
// stores the address of ·subVV_vec (hasVX == 1) or ·subVV_novec (otherwise)
// into subvectorfacility so that later calls through the trampoline skip
// this check, and then branches to the selected implementation.
TEXT ·subVV_check(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
	MOVD	$subvectorfacility+0x00(SB), R1
	MOVD	$·subVV_novec(SB), R2
	MOVD	R2, 0(R1)
	//MOVD	$·subVV_novec(SB), 0(R1)
	BR	·subVV_novec(SB)
vectorimpl:
	MOVD	$subvectorfacility+0x00(SB), R1
	MOVD	$·subVV_vec(SB), R2
	MOVD	R2, 0(R1)
	//MOVD	$·subVV_vec(SB), 0(R1)
	BR	·subVV_vec(SB)
309
// subvectorfacility is an 8-byte slot holding the code address that ·subVV
// branches to. It initially holds the address of ·subVV_check.
GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)
312
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV_vec(z, x, y []Word) (c Word)
// (same as addVV_vec except for SUBC/SUBE instead of ADDC/ADDE and the
// vector subtract-with-borrow instructions)
//
// Vector-facility implementation of subVV: z[i] = x[i] - y[i] with borrow
// propagation over len(z) words; the final borrow (0 or 1) is returned in c.
//
// Borrow representation: in the vector loop V0 holds the borrow indication
// where 1 means "no borrow" (it is seeded with 1). In the scalar loops R4
// holds 0 (no borrow) or -1 (borrow); "SUBC R4, R11" with R11 = 0 recreates
// the flag from it, and E1 negates it to 0/1.
TEXT ·subVV_vec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R2
	MOVD	$0, R4		// c = 0
	MOVD	$0, R0		// make sure it's zero
	MOVD	$0, R10		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB	$4, R3		// n -= 4
	BLT	v1		// if n < 0 goto v1
	SUB	$12, R3		// n -= 12 (R3 is now len - 16)
	BLT	A1		// if n < 0 goto A1

	MOVD	R8, R5
	MOVD	R9, R6
	MOVD	R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO	V0		// cf = 0
	MOVD	$1, R4		// for 390 subtraction cf starts as 1 (no borrow)
	VLVGG	$1, R4, V0	//put carry into V0

UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
	ADD	$64, R5
	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order


	VLM	0(R6), V9, V12		// 64-bytes into V9..V12
	ADD	$64, R6
	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order

	// Each pair: VSBCBIQ produces the borrow indication out, VSBIQ the
	// 128-bit difference, both taking the previous indication as input.
	VSBCBIQ	V1, V9, V0, V25
	VSBIQ	V1, V9, V0, V17
	VSBCBIQ	V2, V10, V25, V26
	VSBIQ	V2, V10, V25, V18


	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
	VLM	0(R6), V13, V14		// 32-bytes into V13..V14
	ADD	$32, R5
	ADD	$32, R6

	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order

	VSBCBIQ	V3, V11, V26, V27
	VSBIQ	V3, V11, V26, V19
	VSBCBIQ	V4, V12, V27, V28
	VSBIQ	V4, V12, V27, V20

	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
	VLM	0(R6), V15, V16		// 32-bytes into V15..V16
	ADD	$32, R5
	ADD	$32, R6

	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order

	VSBCBIQ	V5, V13, V28, V29
	VSBIQ	V5, V13, V28, V21
	VSBCBIQ	V6, V14, V29, V30
	VSBIQ	V6, V14, V29, V22

	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order

	VSBCBIQ	V7, V15, V30, V31
	VSBIQ	V7, V15, V30, V23
	VSBCBIQ	V8, V16, V31, V0	//V0 has carry-over
	VSBIQ	V8, V16, V31, V24

	// flip the result doublewords back to memory order
	VPDI	$0x4,V17,V17,V17
	VPDI	$0x4,V18,V18,V18
	VPDI	$0x4,V19,V19,V19
	VPDI	$0x4,V20,V20,V20
	VPDI	$0x4,V21,V21,V21
	VPDI	$0x4,V22,V22,V22
	VPDI	$0x4,V23,V23,V23
	VPDI	$0x4,V24,V24,V24
	VSTM	V17, V24, 0(R7)		// 128-bytes into z
	ADD	$128, R7
	ADD	$128, R10	// i += 16
	SUB	$16,  R3	// n -= 16
	BGE	UU1		// if n >= 0 goto UU1
	VLGVG	$1, V0, R4	// put cf into R4
	SUB	$1, R4		// save cf: 1 (no borrow) -> 0, 0 (borrow) -> -1

A1:	ADD	$12, R3		// n += 12 (R3 is now remaining - 4)
	BLT	v1		// if n < 0 goto v1

U1:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	MOVD	R0, R11
	SUBC	R4, R11		// restore CF
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	8(R9)(R10*1), R11
	SUBE	R11, R6
	MOVD	16(R9)(R10*1), R11
	SUBE	R11, R7
	MOVD	24(R9)(R10*1), R11
	SUBE	R11, R1
	MOVD	R0, R4
	SUBE	R4, R4		// save CF
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10	// i += 4
	SUB	$4,  R3		// n -= 4
	BGE	U1		// if n >= 0 goto U1

v1:	ADD	$4, R3		// n += 4
	BLE	E1		// if n <= 0 goto E1

L1:	// n > 0
	MOVD	R0, R11
	SUBC	R4, R11		// restore CF
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	SUBE	R4, R4		// save CF

	ADD	$8, R10		// i++
	SUB	$1, R3		// n--
	BGT	L1		// if n > 0 goto L1

E1:	NEG	R4, R4		// 0 / -1 -> 0 / 1
	MOVD	R4, c+72(FP)	// return c
	RET
464
465
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV_novec(z, x, y []Word) (c Word)
// (same as addVV_novec except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// Scalar implementation of subVV: z[i] = x[i] - y[i] with borrow
// propagation over len(z) words; the final borrow (0 or 1) is returned in c.
// Between iterations R4 holds 0 (no borrow) or -1 (borrow); "SUBC R4, R11"
// with R11 = 0 recreates the flag from it, and E1 negates it to 0/1.
TEXT ·subVV_novec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R2

	MOVD	$0, R4		// c = 0
	MOVD	$0, R0		// make sure it's zero
	MOVD	$0, R10		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB	$4, R3		// n -= 4
	BLT	v1		// if n < 0 goto v1

U1:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	MOVD	R0, R11
	SUBC	R4, R11		// restore CF
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	8(R9)(R10*1), R11
	SUBE	R11, R6
	MOVD	16(R9)(R10*1), R11
	SUBE	R11, R7
	MOVD	24(R9)(R10*1), R11
	SUBE	R11, R1
	MOVD	R0, R4
	SUBE	R4, R4		// save CF
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)


	ADD	$32, R10	// i += 4
	SUB	$4,  R3		// n -= 4
	BGE	U1		// if n >= 0 goto U1

v1:	ADD	$4, R3		// n += 4
	BLE	E1		// if n <= 0 goto E1

L1:	// n > 0
	MOVD	R0, R11
	SUBC	R4, R11		// restore CF
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	SUBE	R4, R4		// save CF

	ADD	$8, R10		// i++
	SUB	$1, R3		// n--
	BGT	L1		// if n > 0 goto L1

E1:	NEG	R4, R4		// 0 / -1 -> 0 / 1
	MOVD	R4, c+72(FP)	// return c
	RET
531
// func addVW(z, x []Word, y Word) (c Word)
//
// addVW is a trampoline: it loads the 8-byte code pointer stored in
// addwvectorfacility and branches to it. It has no frame of its own, so the
// target sees the caller's arguments unchanged.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	addwvectorfacility+0x00(SB),R1
	BR	(R1)
535
// addVW_check selects the addVW implementation. It reads the byte ·hasVX,
// stores the address of ·addVW_vec (hasVX == 1) or ·addVW_novec (otherwise)
// into addwvectorfacility so that later calls through the trampoline skip
// this check, and then branches to the selected implementation.
TEXT ·addVW_check(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
	MOVD	$addwvectorfacility+0x00(SB), R1
	MOVD	$·addVW_novec(SB), R2
	MOVD	R2, 0(R1)
	//MOVD	$·addVW_novec(SB), 0(R1)
	BR	·addVW_novec(SB)
vectorimpl:
	MOVD	$addwvectorfacility+0x00(SB), R1
	MOVD	$·addVW_vec(SB), R2
	MOVD	R2, 0(R1)
	//MOVD	$·addVW_vec(SB), 0(R1)
	BR	·addVW_vec(SB)
550
// addwvectorfacility is an 8-byte slot holding the code address that ·addVW
// branches to. It initially holds the address of ·addVW_check.
GLOBL addwvectorfacility+0x00(SB), NOPTR, $8
DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB)
553
554
// func addVW_vec(z, x []Word, y Word) (c Word)
//
// Vector-facility implementation of addVW: adds the single word y into x,
// propagating the carry through len(z) words, writing the result to z and
// returning the final carry in c.
//
// R4 holds the word still to be added: y on entry, afterwards the carry
// (0 or 1) out of the previous chunk. In the vector loop y is placed in the
// low doubleword of V9 for the first 128-bit add only; V9 is then zeroed so
// that every later add merely propagates the carry held in V25..V31 / V0.
TEXT ·addVW_vec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4	// c = y
	MOVD	z+0(FP), R2

	MOVD	$0, R0		// make sure it's zero
	MOVD	$0, R10		// i = 0
	MOVD	R8, R5		// running x pointer for the vector loop
	MOVD	R2, R7		// running z pointer for the vector loop

	// s/JL/JMP/ below to disable the unrolled loop
	SUB	$4, R3			// n -= 4
	BLT	v10			// if n < 0 goto v10
	SUB	$12, R3			// n -= 12 (R3 is now len - 16)
	BLT	A10			// if n < 0 goto A10

	// n >= 0
	// regular loop body unrolled 16x

	VZERO	V0			// prepare V0 to be final carry register
	VZERO	V9			// to ensure upper half is zero
	VLVGG	$1, R4, V9
UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
	ADD	$64, R5
	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order


	VACCCQ	V1, V9, V0, V25
	VACQ	V1, V9, V0, V17
	VZERO	V9			// y has been consumed; only carries remain
	VACCCQ	V2, V9, V25, V26
	VACQ	V2, V9, V25, V18


	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
	ADD	$32, R5

	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order

	VACCCQ	V3, V9, V26, V27
	VACQ	V3, V9, V26, V19
	VACCCQ	V4, V9, V27, V28
	VACQ	V4, V9, V27, V20

	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
	ADD	$32, R5

	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order

	VACCCQ	V5, V9, V28, V29
	VACQ	V5, V9, V28, V21
	VACCCQ	V6, V9, V29, V30
	VACQ	V6, V9, V29, V22

	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order

	VACCCQ	V7, V9, V30, V31
	VACQ	V7, V9, V30, V23
	VACCCQ	V8, V9, V31, V0	//V0 has carry-over
	VACQ	V8, V9, V31, V24

	// flip the result doublewords back to memory order
	VPDI	$0x4,V17,V17,V17
	VPDI	$0x4,V18,V18,V18
	VPDI	$0x4,V19,V19,V19
	VPDI	$0x4,V20,V20,V20
	VPDI	$0x4,V21,V21,V21
	VPDI	$0x4,V22,V22,V22
	VPDI	$0x4,V23,V23,V23
	VPDI	$0x4,V24,V24,V24
	VSTM	V17, V24, 0(R7)		// 128-bytes into z
	ADD	$128, R7
	ADD	$128, R10		// i += 16
	SUB	$16,  R3		// n -= 16
	BGE	UU1		// if n >= 0 goto UU1
	VLGVG	$1, V0, R4	// put cf into R4 in case we branch to v10

A10:	ADD	$12, R3		// n += 12 (R3 is now remaining - 4)


	// s/JL/JMP/ below to disable the unrolled loop

	BLT	v10		// if n < 0 goto v10


U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R5
	ADDE	R0, R6
	ADDE	R0, R7
	ADDE	R0, R1
	ADDE	R0, R0		// R0 = 0 + 0 + CF
	MOVD	R0, R4		// save CF
	SUB	R0, R0		// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10	// i += 4 -> i +=32
	SUB	$4, R3		// n -= 4
	BGE	U4		// if n >= 0 goto U4

v10:	ADD	$4, R3		// n += 4
	BLE	E10		// if n <= 0 goto E10


L4:	// n > 0
	MOVD	0(R8)(R10*1), R5
	ADDC	R4, R5
	ADDE	R0, R0		// R0 = 0 + 0 + CF
	MOVD	R0, R4		// save CF
	SUB	R0, R0		// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)

	ADD	$8, R10		// i++
	SUB	$1, R3		// n--
	BGT	L4		// if n > 0 goto L4

E10:	MOVD	R4, c+56(FP)	// return c

	RET
686
687
// func addVW_novec(z, x []Word, y Word) (c Word)
//
// Scalar implementation of addVW: adds the single word y into x,
// propagating the carry through len(z) words, writing the result to z and
// returning the final carry in c. R4 holds the word still to be added: y on
// entry, afterwards the carry (0 or 1) out of the previous chunk.
TEXT ·addVW_novec(SB),NOSPLIT,$0
//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4	// c = y
	MOVD	z+0(FP), R2
	MOVD	$0, R0		// make sure it's 0
	MOVD	$0, R10		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB	$4, R3		// n -= 4
	BLT	v4		// if n < 0 (fewer than 4 words) goto v4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R5
	ADDE	R0, R6
	ADDE	R0, R7
	ADDE	R0, R1
	ADDE	R0, R0		// R0 = 0 + 0 + CF
	MOVD	R0, R4		// save CF
	SUB	R0, R0		// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10	// i += 4 -> i +=32
	SUB	$4, R3		// n -= 4
	BGE	U4		// if n >= 0 goto U4

v4:	ADD	$4, R3		// n += 4
	BLE	E4		// if n <= 0 goto E4

L4:	// n > 0
	MOVD	0(R8)(R10*1), R5
	ADDC	R4, R5
	ADDE	R0, R0		// R0 = 0 + 0 + CF
	MOVD	R0, R4		// save CF
	SUB	R0, R0		// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)

	ADD	$8, R10		// i++
	SUB	$1, R3		// n--
	BGT	L4		// if n > 0 goto L4

E4:	MOVD	R4, c+56(FP)	// return c

	RET
741
// func subVW(z, x []Word, y Word) (c Word)
//
// subVW is a trampoline: it loads the 8-byte code pointer stored in
// subwvectorfacility and branches to it. It has no frame of its own, so the
// target sees the caller's arguments unchanged.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	subwvectorfacility+0x00(SB),R1
	BR	(R1)
745
// subVW_check selects the subVW implementation. It reads the byte ·hasVX,
// stores the address of ·subVW_vec (hasVX == 1) or ·subVW_novec (otherwise)
// into subwvectorfacility so that later calls through the trampoline skip
// this check, and then branches to the selected implementation.
TEXT ·subVW_check(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
	MOVD	$subwvectorfacility+0x00(SB), R1
	MOVD	$·subVW_novec(SB), R2
	MOVD	R2, 0(R1)
	//MOVD	$·subVW_novec(SB), 0(R1)
	BR	·subVW_novec(SB)
vectorimpl:
	MOVD	$subwvectorfacility+0x00(SB), R1
	MOVD	$·subVW_vec(SB), R2
	MOVD	R2, 0(R1)
	//MOVD	$·subVW_vec(SB), 0(R1)
	BR	·subVW_vec(SB)
760
// subwvectorfacility is an 8-byte slot holding the code address that ·subVW
// branches to. It initially holds the address of ·subVW_check.
GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)
763
// func subVW_vec(z, x []Word, y Word) (c Word)
//
// Vector-facility implementation of subVW: subtracts the single word y from
// x, propagating the borrow through len(z) words, writing the result to z
// and returning the final borrow in c.
//
// R4 holds the word still to be subtracted: y on entry, afterwards the
// borrow (0 or 1) out of the previous chunk. In the vector loop V0 holds
// the borrow indication where 1 means "no borrow" (it is seeded with 1);
// y sits in the low doubleword of V9 for the first 128-bit subtract only.
TEXT ·subVW_vec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4	// c = y
	MOVD	z+0(FP), R2

	MOVD	$0, R0		// make sure it's zero
	MOVD	$0, R10		// i = 0
	MOVD	R8, R5		// running x pointer for the vector loop
	MOVD	R2, R7		// running z pointer for the vector loop

	// s/JL/JMP/ below to disable the unrolled loop
	SUB	$4, R3			// n -= 4
	BLT	v11			// if n < 0 goto v11
	SUB	$12, R3			// n -= 12 (R3 is now len - 16)
	BLT	A11			// if n < 0 goto A11

	VZERO	V0
	MOVD	$1, R6			// prepare V0 to be final carry register
	VLVGG	$1, R6, V0		// borrow is initially "no borrow"
	VZERO	V9			// to ensure upper half is zero
	VLVGG	$1, R4, V9

	// n >= 0
	// regular loop body unrolled 16x


UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
	ADD	$64, R5
	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order


	VSBCBIQ	V1, V9, V0, V25
	VSBIQ	V1, V9, V0, V17
	VZERO	V9			// y has been consumed; only borrows remain
	VSBCBIQ	V2, V9, V25, V26
	VSBIQ	V2, V9, V25, V18

	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
	ADD	$32, R5

	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order


	VSBCBIQ	V3, V9, V26, V27
	VSBIQ	V3, V9, V26, V19
	VSBCBIQ	V4, V9, V27, V28
	VSBIQ	V4, V9, V27, V20

	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
	ADD	$32, R5

	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order

	VSBCBIQ	V5, V9, V28, V29
	VSBIQ	V5, V9, V28, V21
	VSBCBIQ	V6, V9, V29, V30
	VSBIQ	V6, V9, V29, V22

	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order

	VSBCBIQ	V7, V9, V30, V31
	VSBIQ	V7, V9, V30, V23
	VSBCBIQ	V8, V9, V31, V0	// V0 has carry-over
	VSBIQ	V8, V9, V31, V24

	// flip the result doublewords back to memory order
	VPDI	$0x4,V17,V17,V17
	VPDI	$0x4,V18,V18,V18
	VPDI	$0x4,V19,V19,V19
	VPDI	$0x4,V20,V20,V20
	VPDI	$0x4,V21,V21,V21
	VPDI	$0x4,V22,V22,V22
	VPDI	$0x4,V23,V23,V23
	VPDI	$0x4,V24,V24,V24
	VSTM	V17, V24, 0(R7)		// 128-bytes into z
	ADD	$128, R7
	ADD	$128, R10		// i += 16
	SUB	$16,  R3		// n -= 16
	BGE	UU1			// if n >= 0 goto UU1
	VLGVG	$1, V0, R4		// put cf into R4 in case we branch to v11
	SUB	$1, R4			// save cf: 1 (no borrow) -> 0, 0 (borrow) -> -1
	NEG	R4, R4			// 0 / -1 -> 0 / 1, the borrow word the scalar loops subtract
A11:	ADD	$12, R3			// n += 12 (R3 is now remaining - 4)

	BLT	v11			// if n < 0 goto v11

	// n >= 0
	// regular loop body unrolled 4x

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	SUBC	R4, R5 //SLGR  -> SUBC
	SUBE	R0, R6 //SLBGR -> SUBE
	SUBE	R0, R7
	SUBE	R0, R1
	SUBE	R4, R4		// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10	// i += 4 -> i +=32
	SUB	$4, R3		// n -= 4
	BGE	U4		// if n >= 0 goto U4

v11:	ADD	$4, R3		// n += 4
	BLE	E11		// if n <= 0 goto E11

L4:	// n > 0

	MOVD	0(R8)(R10*1), R5
	SUBC	R4, R5
	SUBE	R4, R4		// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)

	ADD	$8, R10		// i++
	SUB	$1, R3		// n--
	BGT	L4		// if n > 0 goto L4

E11:	MOVD	R4, c+56(FP)	// return c

	RET
897
//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
// func subVW_novec(z, x []Word, y Word) (c Word)
// (same as addVW_novec except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// Scalar implementation of subVW: subtracts the single word y from x,
// propagating the borrow through len(z) words, writing the result to z and
// returning the final borrow in c. R4 holds the word still to be
// subtracted: y on entry, afterwards the borrow (0 or 1) of the previous
// chunk ("SUBE R4, R4" yields 0 / -1, NEG turns that into 0 / 1).
TEXT ·subVW_novec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4	// c = y
	MOVD	z+0(FP), R2
	MOVD	$0, R0		// make sure it's 0
	MOVD	$0, R10		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB	$4, R3		// n -= 4
	BLT	v4		// if n < 0 (fewer than 4 words) goto v4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	SUBC	R4, R5 //SLGR  -> SUBC
	SUBE	R0, R6 //SLBGR -> SUBE
	SUBE	R0, R7
	SUBE	R0, R1
	SUBE	R4, R4		// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10	// i += 4 -> i +=32
	SUB	$4, R3		// n -= 4
	BGE	U4		// if n >= 0 goto U4

v4:	ADD	$4, R3		// n += 4
	BLE	E4		// if n <= 0 goto E4

L4:	// n > 0
	MOVD	0(R8)(R10*1), R5
	SUBC	R4, R5
	SUBE	R4, R4		// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)

	ADD	$8, R10		// i++
	SUB	$1, R3		// n--
	BGT	L4		// if n > 0 goto L4

E4:	MOVD	R4, c+56(FP)	// return c

	RET
951
// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the len(z)-word vector x left by s bits into z and returns in c
// the bits shifted out of the top word. If len(z) == 0 it only stores c = 0.
//
// Three paths:
//   general - 0 < s < 64: walks from the top word down,
//             z[i] = x[i]<<s | x[i-1]>>(64-s), c = x[n-1]>>(64-s)
//   Z80     - s == 0: plain word copy walking upwards from index 0, c = 0
//   Z864    - s == 64: z[i] = x[i-1] walking downwards, z[0] = 0, c = x[n-1]
//
// Register use (general path): R2 = z, R8 = x, R4 = s, R7 = 64-s,
// R5 = byte offset of the current word (starts at (n-1)*8), R10 = w1,
// R3/R6 = scratch, R0 = 0.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R5
	MOVD	$0, R0
	SUB	$1, R5             // n--
	BLT	X8b                // n < 0        (n <= 0)

	// n > 0
	MOVD	s+48(FP), R4
	CMPBEQ	R0, R4, Z80	   //handle 0 case beq
	MOVD	$64, R6
	CMPBEQ	R6, R4, Z864	   //handle 64 case beq
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5             // R5 = (n-1)*8
	SUB	R4, R6, R7         // R7 = 64 - s
	MOVD	(R8)(R5*1), R10    // w1 = x[n-1]
	SRD	R7, R10, R3
	MOVD	R3, c+56(FP)       // c = x[n-1] >> (64-s)

	BR	E8

	// i > 0
L8:	MOVD	R10, R3             // w = w1
	MOVD	-8(R8)(R5*1), R10   // w1 = x[i-1]

	SLD	R4,  R3             // w<<s | w1>>ŝ
	SRD	R7, R10, R6
	OR 	R6, R3
	MOVD	R3, (R2)(R5*1)      // z[i] = w<<s | w1>>ŝ
	SUB	$8, R5              // i--

E8:	CMPBGT	R5, R0, L8	    // i > 0

	// i == 0
X8a:	SLD	R4, R10             // w1<<s
	MOVD	R10, (R2)           // z[0] = w1<<s
	RET

X8b:	MOVD	R0, c+56(FP)
	RET

	// s == 0: copy x to z, c = 0
Z80:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5             // R5 = (n-1)*8

	MOVD	(R8), R10          // w1 = x[0]
	MOVD	$0, R3
	MOVD	R3, c+56(FP)       // c = 0

	MOVD	$0, R1             // i = 0
	BR	E8Z

	// i < n-1
L8Z:	MOVD	R10, R3            // w = w1
	MOVD	8(R8)(R1*1), R10   // w1 = x[i+1]

	MOVD	R3, (R2)(R1*1)     // z[i] = w
	ADD 	$8, R1             // i++

E8Z:	CMPBLT	R1, R5, L8Z        // i < n-1

	// i >= n-1
	MOVD	R10, (R2)(R5*1)    // z[n-1] = w1
	RET

	// s == 64: shift by one whole word
Z864:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5             // R5 = (n-1)*8
	MOVD	(R8)(R5*1), R3     // w1 = x[n-1]
	MOVD	R3, c+56(FP)       // c = x[n-1]

	BR	E864

	// i > 0
L864:	MOVD	-8(R8)(R5*1), R3

	MOVD	R3, (R2)(R5*1)     // z[i] = x[i-1]
	SUB	$8, R5             // i--

E864:	CMPBGT	R5, R0, L864       // i > 0

	MOVD	R0, (R2)           // z[0] = 0
	RET
1037
1038
// CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6
// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the len(z)-word vector x right by s bits into z and returns in c
// the bits shifted out of the bottom word. If len(z) == 0 it only stores
// c = 0.
//
// Three paths, all walking upwards from index 0:
//   general - 0 < s < 64: z[i] = x[i]>>s | x[i+1]<<(64-s), c = x[0]<<(64-s)
//   ZB0     - s == 0: plain word copy, c = 0
//   ZB64    - s == 64: z[i] = x[i+1], z[n-1] = 0, c = x[0]
//
// Register use: R2 = z, R8 = x, R4 = s, R7 = 64-s, R1 = byte offset i*8,
// R5 = (n-1)*8, R10 = w1, R3/R6 = scratch, R0 = 0.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R5
	MOVD	$0, R0
	SUB	$1, R5             // n--
	BLT	X9b                // n < 0        (n <= 0)

	// n > 0
	MOVD	s+48(FP), R4
	CMPBEQ	R0, R4, ZB0	//handle 0 case beq
	MOVD	$64, R6
	CMPBEQ 	R6, R4, ZB64	//handle 64 case beq
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5		// R5 = (n-1)*8
	SUB	R4, R6, R7	// R7 = 64 - s
	MOVD	(R8), R10	// w1 = x[0]
	SLD	R7, R10, R3
	MOVD	R3, c+56(FP)	// c = x[0] << (64-s)

	MOVD	$0, R1		// i = 0
	BR 	E9

	// i < n-1
L9:	MOVD	R10, R3		// w = w1
	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]

	SRD	R4,  R3		// w>>s | w1<<ŝ
	SLD	R7, R10, R6
	OR	R6, R3
	MOVD	R3, (R2)(R1*1)	// z[i] = w>>s | w1<<ŝ
	ADD	$8, R1		// i++

E9:	CMPBLT	R1, R5, L9	// i < n-1

	// i >= n-1
X9a:	SRD	R4, R10		// w1>>s
	MOVD	R10, (R2)(R5*1)	// z[n-1] = w1>>s
	RET

X9b:	MOVD	R0, c+56(FP)
	RET

	// s == 0: copy x to z, c = 0
ZB0:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5		// R5 = (n-1)*8

	MOVD	(R8), R10	// w1 = x[0]
	MOVD	$0, R3
	MOVD	R3, c+56(FP)	// c = 0

	MOVD	$0, R1		// i = 0
	BR	E9Z

	// i < n-1
L9Z:	MOVD	R10, R3		// w = w1
	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]

	MOVD	R3, (R2)(R1*1)	// z[i] = w
	ADD	$8, R1		// i++

E9Z:	CMPBLT	R1, R5, L9Z	// i < n-1

	// i >= n-1
	MOVD	R10, (R2)(R5*1)	// z[n-1] = w1
	RET

	// s == 64: shift by one whole word
ZB64:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5		// R5 = (n-1)*8
	MOVD	(R8), R3	// w1 = x[0]
	MOVD	R3, c+56(FP)	// c = x[0]

	MOVD	$0, R1		// i = 0
	BR	E964

	// i < n-1
L964:	MOVD	8(R8)(R1*1), R3	// w1 = x[i+1]

	MOVD	R3, (R2)(R1*1)	// z[i] = x[i+1]
	ADD	$8, R1		// i++

E964:	CMPBLT	R1, R5, L964	// i < n-1

	// i >= n-1
	MOVD	$0, R10
	MOVD	R10, (R2)(R5*1)    // z[n-1] = 0
	RET
1128
// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// For each of the len(z) words: multiplies x[i] by y, adds the running
// carry word (initially r), stores the low word to z[i] and keeps the high
// word as the next carry. The final carry is returned in c.
//
// Register use: R2 = z, R8 = x, R9 = y, R4 = carry word, R5 = n,
// R1 = byte offset i*8, R7 = word index i, R0 = 0.
//
// NOTE(review): the low product word is read from R11, which is not named
// as an operand of MULHDU; the high word is taken from R6. This relies on
// the assembler's expansion of MULHDU -- confirm against the toolchain
// version this file is built with.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	r+56(FP), R4	// c = r
	MOVD	z_len+8(FP), R5
	MOVD	$0, R1		// i*8 = 0
	MOVD	$0, R7		// i = 0
	MOVD	$0, R0		// make sure it's zero
	BR	E5

L5:	MOVD	(R8)(R1*1), R6
	MULHDU	R9, R6
	ADDC	R4, R11 	//add to low order bits
	ADDE	R0, R6		// propagate the carry into the high word
	MOVD	R11, (R2)(R1*1)
	MOVD	R6, R4		// c = high word
	ADD	$8, R1		// i*8 + 8
	ADD	$1, R7		// i++

E5:	CMPBLT	R7, R5, L5	// i < n

	MOVD	R4, c+64(FP)
	RET
1155
// func addMulVVW(z, x []Word, y Word) (c Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i
//
// For each of the len(z) words: z[i] += x[i]*y + carry, where the carry out
// of each step is the high word of the two-word sum. The final carry is
// returned in c.
//
// Structure: A6 handles two words per iteration while i < (n &^ 1); L6
// handles the remaining word (or everything when n < 2).
//
// Register use: R2 = z, R8 = x, R9 = y, R4 = carry word, R5 = n,
// R12 = n rounded down to even, R1 = byte offset i*8, R7 = word index i,
// R10 = z[i] scratch, R0 = 0.
//
// NOTE(review): the low product word is read from R11, which is not named
// as an operand of MULHDU; the high word is taken from R6. This relies on
// the assembler's expansion of MULHDU -- confirm against the toolchain
// version this file is built with.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z_len+8(FP), R5

	MOVD	$0, R1		// i*8 = 0
	MOVD	$0, R7		// i = 0
	MOVD	$0, R0		// make sure it's zero
	MOVD	$0, R4		// c = 0

	MOVD	R5, R12
	AND	$-2, R12	// R12 = n with the low bit cleared
	CMPBGE	R5, $2, A6
	BR	E6

A6:	MOVD	(R8)(R1*1), R6
	MULHDU	R9, R6
	MOVD	(R2)(R1*1), R10
	ADDC	R10, R11	//add to low order bits
	ADDE	R0, R6
	ADDC	R4, R11
	ADDE	R0, R6
	MOVD	R6, R4
	MOVD	R11, (R2)(R1*1)

	MOVD	(8)(R8)(R1*1), R6
	MULHDU	R9, R6
	MOVD	(8)(R2)(R1*1), R10
	ADDC	R10, R11	//add to low order bits
	ADDE	R0, R6
	ADDC	R4, R11
	ADDE	R0, R6
	MOVD	R6, R4
	MOVD	R11, (8)(R2)(R1*1)

	ADD	$16, R1		// i*8 + 16
	ADD	$2, R7		// i += 2

	CMPBLT	R7, R12, A6
	BR	E6

L6:	MOVD	(R8)(R1*1), R6
	MULHDU	R9, R6
	MOVD	(R2)(R1*1), R10
	ADDC	R10, R11	//add to low order bits
	ADDE	R0, R6
	ADDC	R4, R11
	ADDE	R0, R6
	MOVD	R6, R4
	MOVD	R11, (R2)(R1*1)

	ADD	$8, R1		// i*8 + 8
	ADD	$1, R7		// i++

E6:	CMPBLT	R7, R5, L6	// i < n

	MOVD	R4, c+56(FP)
	RET
1217
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i
//
// Walks from index len(z)-1 down to 0. At each step the two-word value
// r:x[i] (R10:R11) is divided by y with a hand-encoded DLGR; the quotient
// (R11) is stored to z[i] and the remainder (R10) carries into the next
// step. r starts as xn and its final value is returned.
//
// Register use: R2 = z, R8 = x, R9 = y, R10 = running remainder,
// R11 = x[i] in / quotient out, R7 = word index i, R1 = byte offset i*8.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	xn+24(FP), R10	// r = xn
	MOVD	x+32(FP), R8
	MOVD	y+56(FP), R9
	MOVD	z_len+8(FP), R7	// i = len(z)
	SLD	$3, R7, R1		// i*8
	MOVD	$0, R0		// make sure it's zero
	BR	E7

L7:	MOVD	(R8)(R1*1), R11
	WORD	$0xB98700A9	//DLGR R10,R9
	MOVD	R11, (R2)(R1*1)

E7:	SUB	$1, R7		// i--
	SUB	$8, R1
	BGE	L7		// i >= 0

	MOVD	R10, r+64(FP)
	RET
1240