/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */

#include <private/bionic_asm.h>

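/* Overall strategy, as a rough C-level sketch (illustrative only; the labels
 * and cut-off constants are explained where they appear below, and "n" is the
 * byte count, already scaled by 4 for wmemmove):
 *
 *	if (dst < src) {
 *		// "Downwards" move: copy forwards, handing off to memcpy
 *		// once dst is at least 16 bytes below src.
 *	} else if (dst >= src + n) {
 *		// No overlap at all: plain memcpy.
 *	} else {
 *		// dst overlaps the tail of the source: copy backwards,
 *		// starting from src + n and dst + n.
 *	}
 */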
/* Parameters and result.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14
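/* The A, B, C and D register pairs each hold one 16-byte block, so the bulk
 * copy loops below move 64 bytes per iteration using ldp/stp pairs.  */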

#if defined(WMEMMOVE)
ENTRY(wmemmove)
	lsl	count, count, #2	/* Scale the wchar_t count to a byte count (wchar_t is 4 bytes).  */
#else
ENTRY(memmove)
#endif
	cmp	dstin, src
	b.lo	.Ldownwards	/* dst is below src: move "downwards", i.e. copy forwards.  */
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap: tail-call memcpy, x0-x2 already hold dst, src and the byte count.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
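	/* Worked example (illustrative): for count = 0x37, tmp1 = 0x30, so the
	 * three 16-byte ldp/stp pairs below move 48 bytes and the remaining
	 * 7 bytes fall through to .Ltail15up.  */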
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
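	/* Each of the low four bits of count selects one chunk: bit 3 an
	 * 8-byte move, bit 2 a 4-byte move, bit 1 a 2-byte move and bit 0 a
	 * single byte, all walking backwards from the current src/dst.  */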
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
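	/* The ldp/stp pairs below transfer 16 bytes at a time; once src is
	 * 16-byte aligned, no load straddles a cache line, leaving at most
	 * the stores unaligned.  */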
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
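	/* tmp2 = src & 15 is decomposed bit by bit: 8, 4, 2 and 1 byte moves,
	 * each with writeback, until src sits on a 16-byte boundary.  */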
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be no more than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f	/* count is negative here, but its low 6 bits still hold the residual byte count.  */
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
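	/* The loop is software-pipelined: 64 bytes are preloaded into the A-D
	 * pairs, then each iteration stores the previously loaded block while
	 * loading the next one, 64 bytes lower each time.  count was
	 * pre-biased by 128 above, so the loop runs while it stays >= 0; the
	 * final block still held in registers is stored after the loop, and
	 * any residue of up to 63 bytes goes to .Ltail63up.  */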
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
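	/* In C terms: if (dst <= src - 16) tail-call memcpy.  The 16-byte
	 * margin presumably matches the largest chunk memcpy may re-copy, so
	 * its forward copy never reads bytes already clobbered by its own
	 * stores.  Anything closer is handled by the code below instead.  */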
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume additional data
	   being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment: (-src) & 15 is the distance up to the next 16-byte boundary.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
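	/* Mirror of the alignment head in the upward path, but walking
	 * forwards with post-increment addressing; afterwards src is 16-byte
	 * aligned for the bulk copy.  */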
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be no more than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
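	/* Forward mirror of the loop above, again software-pipelined.  dst is
	 * pre-biased by -16 and the preload leaves src with the same bias, so
	 * every access in the loop uses fixed offsets #16..#64, with writeback
	 * only on the last one.  After the loop the final block is stored and
	 * src/dst are readjusted before falling into .Ltail63down.  */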
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 48, i.e. 64 minus the 16-byte pre-bias.  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
#if defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif
