/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */
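
/* This file builds memmove() by default; compiling it with -DBCOPY or
 * -DWMEMMOVE instead produces bcopy() or wmemmove() from the same code.
 *
 * Rough C sketch of the strategy below (illustrative only, not the exact
 * implementation):
 *
 *   if (dst < src) {                     // "downwards" move
 *     if (dst <= src - 16) return memcpy(dst, src, count);
 *     copy forwards in place;            // .Ldownwards
 *   } else if (dst >= src + count) {
 *     return memcpy(dst, src, count);    // no overlap at all
 *   } else {
 *     copy backwards from src + count;   // dst overlaps the tail of src
 *   }
 */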

#include <private/bionic_asm.h>

/* Parameters and result.  */
#ifdef BCOPY
#define origdstin	x1
#define origsrc	x0
#endif
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

#ifdef BCOPY
ENTRY(bcopy)
	/* bcopy() takes (src, dst, n), so swap the first two arguments so
	 * that x0/x1 hold (dst, src) as memmove and the tail-calls to
	 * memcpy below expect.  */
	mov	tmp1, origsrc
	mov	origsrc, origdstin
	mov	origdstin, tmp1
#elif defined(WMEMMOVE)
ENTRY(wmemmove)
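	/* Scale the element count to a byte count (wchar_t is 4 bytes).  */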
	lsl	count, count, #2
#else
ENTRY(memmove)
#endif
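	/* Pick a direction: dst below src goes to the forward path at
	 * .Ldownwards; a dst at or above src + count cannot overlap, so
	 * tail-call memcpy; anything else copies backwards from the end.  */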
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
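	/* tmp1 is 48, 32 or 16: fall through to move three 16-byte blocks,
	 * or skip to 1: for two blocks or 2: for one.  */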
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
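	/* Software-pipelined: each pass stores the 64 bytes loaded on the
	 * previous pass while loading the next 64, hiding load latency.  */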
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
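	/* Drain: store the final 64 bytes loaded above.  */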
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is at least 16 bytes below SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
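	/* tmp1 is 48, 32 or 16: fall through to move three 16-byte blocks,
	 * or skip to 1: for two blocks or 2: for one.  */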
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
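	/* src and dst now sit 16 bytes below the next block, so the loop
	 * below can use #16..#64 offsets with a single writeback each.  */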
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
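	/* Drain: store the final 64 bytes loaded above.  */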
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
#ifdef BCOPY
END(bcopy)
#elif defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif
