/*
 * Copyright (c) 2009
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memcpy.S
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#include <private/bionic_asm.h>


/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */
/*
 * Endian-neutral names for the partial-word load/store instructions.
 *
 *   LWHI/SWHI  access the part of a word that starts at the given (lower)
 *              address and runs up to the next word boundary.
 *   LWLO/SWLO  access the part of a word that ends at the given (higher)
 *              address.
 *
 * A full unaligned word at p is therefore read with
 *   LWHI r,0(p) ; LWLO r,3(p)
 * on either endianness.
 */
#if defined(__MIPSEB__)
#  define LWHI	lwl		/* high part is left in big-endian	*/
#  define SWHI	swl		/* high part is left in big-endian	*/
#  define LWLO	lwr		/* low part is right in big-endian	*/
#  define SWLO	swr		/* low part is right in big-endian	*/
#elif defined(__MIPSEL__)
#  define LWHI	lwr		/* high part is right in little-endian	*/
#  define SWHI	swr		/* high part is right in little-endian	*/
#  define LWLO	lwl		/* low part is left in little-endian	*/
#  define SWLO	swl		/* low part is left in little-endian	*/
#else
#  error "memcpy.S: neither __MIPSEB__ nor __MIPSEL__ is defined"
#endif

/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:      a0 = dst, a1 = src, a2 = n (bytes)
 * Out:     v0 = original dst on the paths handled here; when the call is
 *          forwarded to memmove the result is whatever memmove returns.
 * Scratch: AT, v1, a3, t0-t9.  a0, a1 and a2 are modified.
 *
 * Outline:
 *   1. If |dst - src| < n the regions overlap: tail-jump to memmove.
 *   2. n < 8: copy byte by byte (.Llast8).
 *   3. dst and src share the same low two address bits: align both with one
 *      partial-word copy, then move 64-byte chunks, one optional 32-byte
 *      chunk, whole words, and finally the trailing bytes, using lw/sw.
 *   4. Otherwise (.Lunaligned): align dst only, and run the same chunk
 *      schedule reading src with LWHI/LWLO pairs and writing with sw.
 *
 * The code runs under ".set noreorder": the instruction after every branch
 * or jump is its delay slot and executes whether or not the branch is
 * taken.  Delay-slot instructions are marked with one extra leading space.
 *
 * NOTE(review): LEAF/END and the register names come from
 * <private/bionic_asm.h>; any gp setup done by LEAF is not visible here.
 */
LEAF(memcpy,0)

	.set	noreorder
	.set	noat
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android like Skia
 * rely on such usage. We call memmove to handle such cases.
 */
	subu	t0,a0,a1	# t0 = dst - src
	sra	AT,t0,31	# AT = 0 or -1 (sign mask of the difference)
	xor	t1,t0,AT
	subu	t0,t1,AT	# t0 = |dst - src|
	sltu	AT,t0,a2	# AT = 1 when the distance is smaller than n
	beq	AT,zero,.Lmemcpy
	 la	t9,memmove	# delay slot; t9 is only read by the jr below
				# NOTE(review): la is an assembler macro; confirm
				# it is a single instruction in this build
	jr	t9
	 nop
.Lmemcpy:
	slti	AT,a2,8
	bne	AT,zero,.Llast8
	 move	v0,a0	# memcpy returns the dst pointer

# Test if the src and dst are word-aligned, or can be made word-aligned
	xor	t8,a1,a0
	andi	t8,t8,0x3		# t8 is a0/a1 word-displacement

	bne	t8,zero,.Lunaligned
	 negu	a3,a0

	andi	a3,a3,0x3	# we need to copy a3 bytes to make a0/a1 aligned
	beq	a3,zero,.Lchk16w # when a3=0 then the dst (a0) is word-aligned
	 subu	a2,a2,a3	# now a2 is the remaining bytes count

	LWHI	t8,0(a1)	# partial word: the a3 bytes up to the word boundary
	addu	a1,a1,a3
	SWHI	t8,0(a0)
	addu	a0,a0,a3

# Now the dst/src are mutually word-aligned with word-aligned addresses
.Lchk16w:
	andi	t8,a2,0x3f	# any whole 64-byte chunks?
				# t8 is the byte count after 64-byte chunks

	beq	a2,t8,.Lchk8w	# if a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk after it
	 subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

	addu	t0,a0,a2	# t0 is the "past the end" address

# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
# the "t0-32" address
# This means: for x=128 the last "safe" a0 address is "t0-160"
# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address

	pref	0,0(a1)		# bring the first line of src, addr 0
	pref	0,32(a1)	# bring the second line of src, addr 32
	pref	0,64(a1)	# bring the third line of src, addr 64
	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
# In case the a0 > t9 don't use "pref 30" at all
	sgtu	v1,a0,t9
	bgtz	v1,.Lloop16w	# skip "pref 30,64(a0)" for too short arrays
	 nop
# otherwise, start with using pref30
	pref	30,64(a0)
.Lloop16w:
	pref	0,96(a1)
	lw	t0,0(a1)
	bgtz	v1,.Lskip_pref30_96	# skip "pref 30,96(a0)"
	 lw	t1,4(a1)
	pref	30,96(a0)	# continue setting up the dest, addr 96
.Lskip_pref30_96:
	lw	t2,8(a1)
	lw	t3,12(a1)
	lw	t4,16(a1)
	lw	t5,20(a1)
	lw	t6,24(a1)
	lw	t7,28(a1)
	pref	0,128(a1)	# bring the next lines of src, addr 128

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)

	lw	t0,32(a1)
	bgtz	v1,.Lskip_pref30_128	# skip "pref 30,128(a0)"
	 lw	t1,36(a1)
	pref	30,128(a0)	# continue setting up the dest, addr 128
.Lskip_pref30_128:
	lw	t2,40(a1)
	lw	t3,44(a1)
	lw	t4,48(a1)
	lw	t5,52(a1)
	lw	t6,56(a1)
	lw	t7,60(a1)
	pref	0, 160(a1)	# bring the next lines of src, addr 160

	sw	t0,32(a0)
	sw	t1,36(a0)
	sw	t2,40(a0)
	sw	t3,44(a0)
	sw	t4,48(a0)
	sw	t5,52(a0)
	sw	t6,56(a0)
	sw	t7,60(a0)

	addiu	a0,a0,64	# adding 64 to dest
	sgtu	v1,a0,t9	# refresh the "pref 30 no longer safe" flag
	bne	a0,a3,.Lloop16w
	 addiu	a1,a1,64	# adding 64 to src
	move	a2,t8		# a2 = bytes left after the 64-byte chunks

# Here we have src and dest word-aligned but less than 64-bytes to go

.Lchk8w:
	pref 0, 0x0(a1)
	andi	t8,a2,0x1f	# is there a 32-byte chunk?
				# the t8 is the remainder count past 32-bytes
	beq	a2,t8,.Lchk1w	# when a2=t8, no 32-byte chunk
	 nop

	lw	t0,0(a1)
	lw	t1,4(a1)
	lw	t2,8(a1)
	lw	t3,12(a1)
	lw	t4,16(a1)
	lw	t5,20(a1)
	lw	t6,24(a1)
	lw	t7,28(a1)
	addiu	a1,a1,32

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
	beq	a2,t8,.Llast8
	 subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

# copying in words (4-byte chunks)
.LwordCopy_loop:
	lw	t3,0(a1)	# the first t3 may be equal t0 ... optimize?
	addiu	a1,a1,4
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	 sw	t3,-4(a0)	# delay slot: a0 was already advanced, hence -4

# For the last (<8) bytes
.Llast8:
	blez	a2,.Lleave
	 addu	a3,a0,a2	# a3 is the last dst address
.Llast8loop:
	lb	v1,0(a1)
	addiu	a1,a1,1
	addiu	a0,a0,1
	bne	a0,a3,.Llast8loop
	 sb	v1,-1(a0)	# delay slot: a0 was already advanced, hence -1

.Lleave:
	j	ra
	 nop

#
# UNALIGNED case
#

.Lunaligned:
	# got here with a3="negu a0"
	andi	a3,a3,0x3	# test if the a0 is word aligned
	beqz	a3,.Lua_chk16w
	 subu	a2,a2,a3	# bytes left after initial a3 bytes

	LWHI	v1,0(a1)	# read one full unaligned word from src ...
	LWLO	v1,3(a1)
	addu	a1,a1,a3	# a3 may be here 1, 2 or 3
	SWHI	v1,0(a0)	# ... but store only the bytes up to the dst word boundary
	addu	a0,a0,a3	# below the dst will be word aligned (NOTE1)

.Lua_chk16w:
	andi	t8,a2,0x3f	# any whole 64-byte chunks?
				# t8 is the byte count after 64-byte chunks
	beq	a2,t8,.Lua_chk8w # if a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk after it
	 subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

	addu	t0,a0,a2	# t0 is the "past the end" address

	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address

	pref	0,0(a1)		# bring the first line of src, addr 0
	pref	0,32(a1)	# bring the second line of src, addr 32
	pref	0,64(a1)	# bring the third line of src, addr 64
	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
# In case the a0 > t9 don't use "pref 30" at all
	sgtu	v1,a0,t9
	bgtz	v1,.Lua_loop16w	# skip "pref 30,64(a0)" for too short arrays
	 nop
# otherwise, start with using pref30
	pref	30,64(a0)
.Lua_loop16w:
	pref	0,96(a1)
	LWHI	t0,0(a1)
	LWLO	t0,3(a1)
	LWHI	t1,4(a1)
	bgtz	v1,.Lua_skip_pref30_96
	 LWLO	t1,7(a1)
	pref	30,96(a0)	# continue setting up the dest, addr 96
.Lua_skip_pref30_96:
	LWHI	t2,8(a1)
	LWLO	t2,11(a1)
	LWHI	t3,12(a1)
	LWLO	t3,15(a1)
	LWHI	t4,16(a1)
	LWLO	t4,19(a1)
	LWHI	t5,20(a1)
	LWLO	t5,23(a1)
	LWHI	t6,24(a1)
	LWLO	t6,27(a1)
	LWHI	t7,28(a1)
	LWLO	t7,31(a1)
	pref	0,128(a1)	# bring the next lines of src, addr 128

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)

	LWHI	t0,32(a1)
	LWLO	t0,35(a1)
	LWHI	t1,36(a1)
	bgtz	v1,.Lua_skip_pref30_128
	 LWLO	t1,39(a1)
	pref	30,128(a0)	# continue setting up the dest, addr 128
.Lua_skip_pref30_128:
	LWHI	t2,40(a1)
	LWLO	t2,43(a1)
	LWHI	t3,44(a1)
	LWLO	t3,47(a1)
	LWHI	t4,48(a1)
	LWLO	t4,51(a1)
	LWHI	t5,52(a1)
	LWLO	t5,55(a1)
	LWHI	t6,56(a1)
	LWLO	t6,59(a1)
	LWHI	t7,60(a1)
	LWLO	t7,63(a1)
	pref	0, 160(a1)	# bring the next lines of src, addr 160

	sw	t0,32(a0)
	sw	t1,36(a0)
	sw	t2,40(a0)
	sw	t3,44(a0)
	sw	t4,48(a0)
	sw	t5,52(a0)
	sw	t6,56(a0)
	sw	t7,60(a0)

	addiu	a0,a0,64	# adding 64 to dest
	sgtu	v1,a0,t9	# refresh the "pref 30 no longer safe" flag
	bne	a0,a3,.Lua_loop16w
	 addiu	a1,a1,64	# adding 64 to src
	move	a2,t8		# a2 = bytes left after the 64-byte chunks

# Here dst is word-aligned (src is not) and less than 64 bytes are left to go

.Lua_chk8w:
	pref 0, 0x0(a1)
	andi	t8,a2,0x1f	# is there a 32-byte chunk?
				# the t8 is the remainder count
	beq	a2,t8,.Lua_chk1w # when a2=t8, no 32-byte chunk
	 nop

	LWHI	t0,0(a1)
	LWLO	t0,3(a1)
	LWHI	t1,4(a1)
	LWLO	t1,7(a1)
	LWHI	t2,8(a1)
	LWLO	t2,11(a1)
	LWHI	t3,12(a1)
	LWLO	t3,15(a1)
	LWHI	t4,16(a1)
	LWLO	t4,19(a1)
	LWHI	t5,20(a1)
	LWLO	t5,23(a1)
	LWHI	t6,24(a1)
	LWLO	t6,27(a1)
	LWHI	t7,28(a1)
	LWLO	t7,31(a1)
	addiu	a1,a1,32

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)
	addiu	a0,a0,32

.Lua_chk1w:
	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
	beq	a2,t8,.Lua_smallCopy
	 subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

# copying in words (4-byte chunks)
.Lua_wordCopy_loop:
	LWHI	v1,0(a1)
	LWLO	v1,3(a1)
	addiu	a1,a1,4
	addiu	a0,a0,4		# note: dst=a0 is word aligned here, see NOTE1
	bne	a0,a3,.Lua_wordCopy_loop
	 sw	v1,-4(a0)	# delay slot: a0 was already advanced, hence -4

# Now less than 4 bytes (value in a2) left to copy
.Lua_smallCopy:
	beqz	a2,.Lleave
	 addu	a3,a0,a2	# a3 is the last dst address
.Lua_smallCopy_loop:
	lb	v1,0(a1)
	addiu	a1,a1,1
	addiu	a0,a0,1
	bne	a0,a3,.Lua_smallCopy_loop
	 sb	v1,-1(a0)	# delay slot: a0 was already advanced, hence -1

	j	ra
	 nop

	.set	at
	.set	reorder

END(memcpy)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/
