/*
 * Copyright (c) 2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "pixman-mips-dspr2-asm.h"

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

#ifdef EB
#  define LWHI	lwl		/* high part is left in big-endian */
#  define SWHI	swl		/* high part is left in big-endian */
#  define LWLO	lwr		/* low part is right in big-endian */
#  define SWLO	swr		/* low part is right in big-endian */
#else
#  define LWHI	lwr		/* high part is right in little-endian */
#  define SWHI	swr		/* high part is right in little-endian */
#  define LWLO	lwl		/* low part is left in little-endian */
#  define SWLO	swl		/* low part is left in little-endian */
#endif
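
/*
 * Illustrative sketch (added note, not part of the original code): the two
 * partial loads are always paired so that together they cover all four
 * bytes of a possibly unaligned word, e.g.
 *
 *	LWHI	t0, 0(a1)	<- bytes on the "high" side of the word
 *	LWLO	t0, 3(a1)	<- the remaining bytes on the "low" side
 *
 * This expands to lwl/lwr on big-endian and lwr/lwl on little-endian
 * targets, so t0 ends up holding the same value a plain "lw" would return
 * for an aligned address. SWHI/SWLO form the matching unaligned store pair.
 */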

LEAF_MIPS32R2(pixman_mips_fast_memcpy)

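/*
 * Added note: with the o32 calling convention the arguments are
 * a0 = dst, a1 = src, a2 = byte count, and memcpy's return value (dst)
 * is kept in v0. The LEAF_MIPS32R2 macro is assumed to select
 * ".set noreorder", so every branch below has an explicit delay slot.
 * Counts below 8 bytes go straight to the byte-copy tail at $last8.
 */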
	slti	AT, a2, 8
	bne	AT, zero, $last8
	move	v0, a0	/* memcpy returns the dst pointer */

/* Test if the src and dst are word-aligned, or can be made word-aligned */
	xor	t8, a1, a0
	andi	t8, t8, 0x3		/* t8 is the a0/a1 word-displacement */

	bne	t8, zero, $unaligned
	negu	a3, a0

	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
	beq	a3, zero, $chk16w	/* when a3==0 the dst (a0) is word-aligned */
	subu	a2, a2, a3	/* now a2 is the remaining bytes count */

	LWHI	t8, 0(a1)
	addu	a1, a1, a3
	SWHI	t8, 0(a0)
	addu	a0, a0, a3
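
/*
 * Added note: the four instructions above copy the leading 1-3 bytes with
 * a single partial load/store pair; this works because src and dst share
 * the same misalignment, as tested above.
 */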

/* Now both dst and src are word-aligned */
$chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count left after the 64-byte chunks */

	beq	a2, t8, $chk8w	/* if a2==t8, there are no 64-byte chunks */
				/* there will be at most one 32-byte chunk after them */
	subu	a3, a2, t8	/* subtract the remainder from a2 */
				/* here a3 counts the bytes in 16-word (64-byte) chunks */
	addu	a3, a0, a3	/* now a3 is the final dst after the 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

/*
 * When the loop exercises "pref 30, x(a0)", a0+x must not be past the
 * "t0-32" address.
 * This means: for x=128 the last "safe" a0 address is "t0-160";
 * alternatively, for x=64 the last "safe" a0 address is "t0-96".
 * The current version uses "pref 30, 128(a0)", so "t0-160" is the limit.
 */
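/*
 * Added derivation of the limit above: "pref 30" (prepare-for-store)
 * touches the entire 32-byte cache line holding a0+x, so that line must
 * lie completely inside the destination buffer:
 *	a0 + x + 32 <= t0   =>   a0 <= t0 - (x + 32)
 * With x = 128 this gives a0 <= t0 - 160, which is the t9 computed below.
 */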
	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref	0, 0(a1)	/* bring the first line of src, addr 0 */
	pref	0, 32(a1)	/* bring the second line of src, addr 32 */
	pref	0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* if a0 > t9, don't use "pref 30" at all */
	sgtu	v1, a0, t9
	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	nop
/* otherwise, start using pref 30 */
	pref	30, 64(a0)
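
/*
 * Added note: each iteration of the loop below copies 64 bytes (16 words)
 * in two 32-byte load/store bursts, prefetching the source with "pref 0"
 * and preparing destination lines with "pref 30" while that is still safe
 * (v1 == 0).
 */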
$loop16w:
	pref	0, 96(a1)
	lw	t0, 0(a1)
	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
	lw	t1, 4(a1)
	pref	30, 96(a0)	/* continue setting up the dest, addr 96 */
$skip_pref30_96:
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
	pref	0, 128(a1)	/* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	lw	t0, 32(a1)
	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
	lw	t1, 36(a1)
	pref	30, 128(a0)	/* continue setting up the dest, addr 128 */
$skip_pref30_128:
	lw	t2, 40(a1)
	lw	t3, 44(a1)
	lw	t4, 48(a1)
	lw	t5, 52(a1)
	lw	t6, 56(a1)
	lw	t7, 60(a1)
	pref	0, 160(a1)	/* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9
	bne	a0, a3, $loop16w
	addiu	a1, a1, 64	/* adding 64 to src */
	move	a2, t8

/* Here we have src and dst word-aligned but less than 64 bytes to go */

$chk8w:
	pref	0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* t8 is the remainder count past the 32-byte chunk */
	beq	a2, t8, $chk1w	/* when a2==t8, there is no 32-byte chunk */
	nop

	lw	t0, 0(a1)
	lw	t1, 4(a1)
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past the 1-word chunks */
	beq	a2, t8, $last8
	subu	a3, t8, a2	/* a3 is the count of bytes in 1-word chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1-word chunks */

/* copying in words (4-byte chunks) */
$wordCopy_loop:
	lw	t3, 0(a1)	/* the first t3 may equal t0 ... optimize? */
	addiu	a1, a1, 4
	addiu	a0, a0, 4
	bne	a0, a3, $wordCopy_loop
	sw	t3, -4(a0)	/* store sits in the branch delay slot */

/* For the last (<8) bytes */
$last8:
	blez	a2, leave
	addu	a3, a0, a2	/* a3 is the last dst address */
$last8loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $last8loop
	sb	v1, -1(a0)

leave:	j	ra
	nop

/*
 * UNALIGNED case
 */

$unaligned:
	/* got here with a3="negu a0" */
	andi	a3, a3, 0x3	/* test if a0 is word-aligned */
	beqz	a3, $ua_chk16w
	subu	a2, a2, a3	/* bytes left after the initial a3 bytes */

	LWHI	v1, 0(a1)
	LWLO	v1, 3(a1)
	addu	a1, a1, a3	/* a3 may be 1, 2 or 3 here */
	SWHI	v1, 0(a0)
	addu	a0, a0, a3	/* below this point the dst is word-aligned (NOTE1) */
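
/*
 * Added note: here src is not word-aligned, so a full word is read with
 * the LWHI/LWLO pair and SWHI then stores only the a3 bytes needed to
 * bring dst up to a word boundary; src keeps its misalignment relative to
 * dst for the rest of the routine.
 */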

$ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count left after the 64-byte chunks */
	beq	a2, t8, $ua_chk8w	/* if a2==t8, there are no 64-byte chunks */
				/* there will be at most one 32-byte chunk after them */
	subu	a3, a2, t8	/* subtract the remainder from a2 */
				/* here a3 counts the bytes in 16-word (64-byte) chunks */
	addu	a3, a0, a3	/* now a3 is the final dst after the 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref	0, 0(a1)	/* bring the first line of src, addr 0 */
	pref	0, 32(a1)	/* bring the second line of src, addr 32 */
	pref	0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* if a0 > t9, don't use "pref 30" at all */
	sgtu	v1, a0, t9
	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	nop
/* otherwise, start using pref 30 */
	pref	30, 64(a0)
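
/*
 * Added note: the unaligned main loop mirrors $loop16w above, except that
 * every source word is assembled with an LWHI/LWLO pair (two partial loads
 * per word); the destination stores remain plain "sw" because dst was
 * word-aligned in the header above.
 */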
$ua_loop16w:
	pref	0, 96(a1)
	LWHI	t0, 0(a1)
	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	bgtz	v1, $ua_skip_pref30_96
	LWLO	t1, 7(a1)
	pref	30, 96(a0)	/* continue setting up the dest, addr 96 */
$ua_skip_pref30_96:
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
	pref	0, 128(a1)	/* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	LWHI	t0, 32(a1)
	LWLO	t0, 35(a1)
	LWHI	t1, 36(a1)
	bgtz	v1, $ua_skip_pref30_128
	LWLO	t1, 39(a1)
	pref	30, 128(a0)	/* continue setting up the dest, addr 128 */
$ua_skip_pref30_128:
	LWHI	t2, 40(a1)
	LWLO	t2, 43(a1)
	LWHI	t3, 44(a1)
	LWLO	t3, 47(a1)
	LWHI	t4, 48(a1)
	LWLO	t4, 51(a1)
	LWHI	t5, 52(a1)
	LWLO	t5, 55(a1)
	LWHI	t6, 56(a1)
	LWLO	t6, 59(a1)
	LWHI	t7, 60(a1)
	LWLO	t7, 63(a1)
	pref	0, 160(a1)	/* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9
	bne	a0, a3, $ua_loop16w
	addiu	a1, a1, 64	/* adding 64 to src */
	move	a2, t8

/* Here the dst is word-aligned but there are less than 64 bytes to go */

$ua_chk8w:
	pref	0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* t8 is the remainder count past the 32-byte chunk */
	beq	a2, t8, $ua_chk1w	/* when a2==t8, there is no 32-byte chunk */

	LWHI	t0, 0(a1)
	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	LWLO	t1, 7(a1)
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$ua_chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past the 1-word chunks */
	beq	a2, t8, $ua_smallCopy
	subu	a3, t8, a2	/* a3 is the count of bytes in 1-word chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1-word chunks */

/* copying in words (4-byte chunks) */
$ua_wordCopy_loop:
	LWHI	v1, 0(a1)
	LWLO	v1, 3(a1)
	addiu	a1, a1, 4
	addiu	a0, a0, 4	/* note: dst=a0 is word-aligned here, see NOTE1 */
	bne	a0, a3, $ua_wordCopy_loop
	sw	v1, -4(a0)

/* Now less than 4 bytes (value in a2) left to copy */
$ua_smallCopy:
	beqz	a2, leave
	addu	a3, a0, a2	/* a3 is the last dst address */
$ua_smallCopy_loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $ua_smallCopy_loop
	sb	v1, -1(a0)

	j	ra
	nop

END(pixman_mips_fast_memcpy)
