/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dst (alignment handled by the hardware).
 * If dest < src or the buffers do not overlap, call memcpy; otherwise
 * copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

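/*
 * dstin is left untouched and returned in x0; dst is the working
 * destination pointer. A_l/A_h .. D_l/D_h carry the four 16-byte
 * blocks in flight during the bulk copy loop.
 */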
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

ENTRY(memmove)
	cmp	dstin, src
	b.lo	memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	add	dst, dstin, count
	add	src, src, count
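	/*
	* src and dst now point one byte past the end of their buffers;
	* the copy proceeds backwards using pre-indexed accesses.
	*/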
	cmp	count, #16
	b.lo	.Ltail15  /* Probably unaligned accesses.  */

	ands	tmp2, src, #15     /* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy the leading unaligned bytes so that src becomes aligned first.
	* The cost of these extra instructions is acceptable, and it ensures
	* that the following accesses use aligned addresses.
	*/
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
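	/* src is now 16-byte aligned.  */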
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f		/* 32 bytes to copy.  */
	b.lt	2f		/* 16 bytes to copy.  */
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
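	/* Copy the remaining 0-15 bytes, testing one bit of count per chunk.  */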
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
	* to the tail.
	*/
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

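	/* The bottom 6 bits of count hold the bytes still to copy.  */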
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	* Critical loop. Start at a new cache line boundary. Assuming
	* 64 bytes per line this ensures the entire loop fits in a
	* single line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load 64 bytes of data.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	* Interleave the load of the next 64-byte block with the store of
	* the previously loaded 64 bytes of data.
	*/
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
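	/* Store the last 64 bytes loaded by the final loop iteration.  */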
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPROC(memmove)