memmove.S revision 8167dd7cb98e87ffe9b40e4993c330b244ca2234
/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */

#include <private/bionic_asm.h>

/* Parameters and result.  */
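/* When built as bcopy, the arguments are (src, dst, len) rather than
 * memmove's (dst, src, len), so the first two parameter registers are
 * swapped here.  */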
#ifdef BCOPY
#define dstin	x1
#define src	x0
#else
#define dstin	x0
#define src	x1
#endif
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

#ifdef BCOPY
ENTRY(bcopy)
#elif defined(WMEMMOVE)
ENTRY(wmemmove)
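	/* wmemmove() counts in wchar_t units; wchar_t is 4 bytes (see the
	 * assumptions above), so convert to a byte count up front.  */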
	lsl	count, count, #2
#else
ENTRY(memmove)
#endif
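	/* Dispatch: if DST is below SRC, the forward-copying code at
	 * .Ldownwards is used; if DST starts at or beyond SRC + COUNT the
	 * buffers do not overlap and memcpy is tail-called; otherwise this
	 * is an overlapping upwards move, handled below.  */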
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
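	/* tmp1 is 16, 32 or 48.  Enter the ladder below so exactly that many
	 * bytes are copied: 48 runs all three ldp/stp pairs, 32 enters at 1:
	 * and 16 at 2:.  */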
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
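	/* Bits 3..0 of count select the remaining transfers: 8, 4, 2 and
	 * finally 1 byte, each stepping SRC and DST downwards.  */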
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
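	/* SRC points just past the remaining data, so SRC & 15 is exactly
	 * the number of bytes to move before SRC reaches a 16-byte boundary
	 * going downwards.  */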
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be fewer than 64 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63up
2:
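	/* Bias COUNT by 128: 64 bytes for the block loaded ahead of the main
	 * loop and 64 for the block stored after it.  If fewer than 128
	 * bytes remain (but at least 64), move a single 64-byte block and
	 * drop into the tail.  */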
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
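	/* Software-pipelined loop: the 64 bytes loaded above are stored by
	 * the next iteration while the following 64 bytes are loaded; the
	 * final block is stored after the loop exits.  */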
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */
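	/* DST is less than 16 bytes below SRC here, so copy forwards
	 * ourselves, never storing over bytes that have not yet been
	 * loaded.  */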

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
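	/* (-SRC) & 15 is the number of bytes to move before SRC reaches a
	 * 16-byte boundary going upwards.  */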
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be fewer than 64 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 48: 64 less the 16-byte pre-bias.  */
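	/* As in the upwards loop, loads stay one 64-byte block ahead of the
	 * stores.  The pre-biased SRC and DST let the last access of each
	 * group use writeback ([.., #64]!) while keeping fixed #16..#64
	 * offsets.  */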
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
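	/* Adjust SRC and DST (undoing the pre-bias on DST) so both point at
	 * the first byte not yet moved before falling into the tail.  */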
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
#ifdef BCOPY
END(bcopy)
#elif defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif