// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back.  move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
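	//
	// As a sketch (hypothetical Go helper, not the runtime's code), a
	// 9..16-byte move in the style of move_9through16 below looks like:
	//
	//	func move9through16(dst, src []byte) {
	//		n := len(src) // 9 <= n <= 16
	//		var lo, hi [8]byte
	//		copy(lo[:], src[:8])   // load the first 8 bytes
	//		copy(hi[:], src[n-8:]) // load the last 8 bytes (overlaps lo when n < 16)
	//		copy(dst[:8], lo[:])   // store only after both loads are done,
	//		copy(dst[n-8:], hi[:]) // so overlapping dst/src cannot be clobbered
	//	}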
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch
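	//
	// In rough Go terms (control-flow sketch only), the dispatch above is:
	//
	//	switch {
	//	case n == 0:           // move_0
	//	case n <= 2:           // move_1or2
	//	case n <= 4:           // move_3or4
	//	case n < 8:            // move_5through7
	//	case n == 8:           // move_8
	//	case n <= 16:          // move_9through16
	//	case n <= 32:          // move_17through32
	//	case n <= 64:          // move_33through64
	//	case n <= 128:         // move_65through128
	//	case n <= 256:         // move_129through256
	//	}
	//	// n > 256 falls through to the REP or AVX paths below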

	TESTB	$1, runtime·useRepMovs(SB)
	JZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back
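	// In Go terms (sketch), the decision made here together with the overlap
	// check under the back label amounts to:
	//
	//	if uintptr(src) > uintptr(dst) || uintptr(src)+n <= uintptr(dst) {
	//		// forward copying is safe: dst is below src, or the regions don't overlap
	//	} else {
	//		// dst overlaps the tail of src: copy backwards
	//	}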

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	TESTL	$(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
	JEQ	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail
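	// Sketch of fwdBy8 in Go (hypothetical helper, ignoring the ERMS/REP MOVSB
	// variant): copy whole 8-byte words, then let the small-size code above
	// finish the 0..7 byte remainder.
	//
	//	func forwardBy8(dst, src []byte) {
	//		n := len(src)
	//		for off := 0; off+8 <= n; off += 8 {
	//			copy(dst[off:off+8], src[off:off+8]) // REP MOVSQ does this n/8 times
	//		}
	//		// n%8 bytes remain; handled by the "tail" dispatch
	//	}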

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail
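	// Sketch of the backward REP MOVSQ path in Go (hypothetical helper): copy
	// 8-byte words from the end of the region toward the start, then let the
	// small-size code handle the leading 0..7 bytes, which is safe because
	// those cases load everything into registers before storing.
	//
	//	func backwardBy8(dst, src []byte) {
	//		n := len(src)
	//		for end := n; end >= n%8+8; end -= 8 {
	//			copy(dst[end-8:end], src[end-8:end]) // STD + REP MOVSQ, descending
	//		}
	//		// the first n%8 bytes remain; handled by the "tail" dispatch
	//	}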

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_3or4:
	CMPQ	BX, $4
	JB	move_3
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail
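	// Sketch of move_256through2048 in Go (hypothetical helper): peel off
	// 256-byte blocks (sixteen 16-byte unaligned SSE loads and stores per
	// block in the assembly), then jump back to the small-size dispatch.
	//
	//	func move256through2048(dst, src []byte) {
	//		n, off := len(src), 0
	//		for ; n-off >= 256; off += 256 {
	//			copy(dst[off:off+256], src[off:off+256])
	//		}
	//		// 0..255 bytes remain at dst[off:], src[off:]; handled by "tail"
	//	}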

avxUnaligned:
	// There are two implementations of the move algorithm:
	// the first is for non-overlapping memory regions and copies forward;
	// the second is for overlapping regions and copies backward.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward
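	// In Go terms (sketch), this unsigned comparison is:
	//
	//	if uintptr(dst)-uintptr(src) < n {
	//		// dst lies inside [src, src+n): a forward copy would clobber
	//		// source bytes before they are read, so copy backward.
	//	}
	//
	// When dst is below src the subtraction wraps around to a huge value,
	// so that case correctly falls through to the forward path.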

	// A non-temporal copy is better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of the body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// For small sizes it is important to keep the processor's pipeline busy,
	// because the cost of copying the unaligned head and tail is comparable
	// to the cost of the main loop; that is why the loads and stores below are
	// interleaved with the address arithmetic. A cleaner implementation of the
	// same algorithm, where the cost of the unaligned parts is negligible,
	// follows the gobble_big_data_fwd label.
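	//
	// A simplified Go sketch of steps 1-5 (hypothetical helper, not the
	// runtime's code; it assumes len(src) > 256 and len(dst) >= len(src),
	// keeps the head and tail in local buffers instead of registers, and uses
	// unsafe.Pointer only to mirror the R11 alignment computation):
	//
	//	func copyHeadBodyTail(dst, src []byte) {
	//		n := len(src)
	//		var tail [128]byte
	//		var head [32]byte
	//		copy(tail[:], src[n-128:]) // 1. unaligned save of the 128-byte tail
	//		copy(head[:], src[:32])    // 2. unaligned save of the 32-byte head
	//		// R11: distance from dst to the next 32-byte boundary (1..32)
	//		adv := 32 - int(uintptr(unsafe.Pointer(&dst[0]))&31)
	//		// 3. copy the body; dst[adv:] is 32-byte aligned, which lets the
	//		// assembly use aligned VMOVDQA stores in its 128-byte loop
	//		for off := adv; off+128 <= n; off += 128 {
	//			copy(dst[off:off+128], src[off:off+128])
	//		}
	//		copy(dst[:32], head[:])    // 4. put the head in its place
	//		copy(dst[n-128:], tail[:]) // 5. put the tail in its place
	//	}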
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly. We use negative offsets here.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align the destination address.
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value, since the unaligned head is copied separately.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in its place after the main body is copied.
	// Now handle the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust the source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Destination-aligned copying of the body starts here.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal move instructions.
	// The details of this algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// The prefetch distances were chosen empirically.
	// Prefetching follows the approach in section 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules, so we
	// need an SFENCE here to make the copied data visible in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as the forward case.
	// First we load the unaligned tail, which sits at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Let's store the unaligned data.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
