/*
 * Copyright (c) 2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "pixman-mips-dspr2-asm.h"

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * Endian-neutral names for the partial-word load/store instructions.
 *
 * An unaligned 32-bit word at address p is read with the pair
 *     LWHI reg, 0(p)
 *     LWLO reg, 3(p)
 * i.e. the *HI form is always applied to the lowest byte address of the
 * word and the *LO form to the highest one.  Which of lwl/lwr (swl/swr)
 * that corresponds to depends on the byte order, selected by EB.
 */
#ifdef EB
#  define LWHI	lwl		/* high part is left in big-endian */
#  define SWHI	swl		/* high part is left in big-endian */
#  define LWLO	lwr		/* low part is right in big-endian */
#  define SWLO	swr		/* low part is right in big-endian */
#else
#  define LWHI	lwr		/* high part is right in little-endian */
#  define SWHI	swr		/* high part is right in little-endian */
#  define LWLO	lwl		/* low part is left in little-endian */
#  define SWLO	swl		/* low part is left in little-endian */
#endif

/*
 * pixman_mips_fast_memcpy -- memcpy-style forward copy.
 *
 * In:     a0 = dst, a1 = src, a2 = number of bytes to copy
 * Out:    v0 = the dst pointer as passed in
 * Uses:   AT, v1, a0-a3, t0-t9; no stack, no calls
 *
 * Every branch and jump below is followed by an explicit delay-slot
 * instruction (written with one extra space of indentation), so the code
 * must be assembled without instruction reordering and without assembler
 * use of AT.  NOTE(review): LEAF_MIPS32R2 in pixman-mips-dspr2-asm.h is
 * presumably what selects that mode -- confirm there.
 *
 * Structure:
 *   - fewer than 8 bytes:          byte loop ($last8)
 *   - (src ^ dst) & 3 == 0:        align both with one partial-word copy,
 *                                  then 64-byte loop, one optional 32-byte
 *                                  chunk, word loop, byte tail
 *   - otherwise ($unaligned):      align dst only, then the same stages
 *                                  with every source word read through an
 *                                  LWHI/LWLO pair
 *
 * The prefetch offsets (32, 64, 96, ...) follow the original comments,
 * which speak of 32-byte "lines".  NOTE(review): "pref 30" is used on the
 * destination only while the prefetched line lies wholly inside the
 * destination buffer; that reasoning assumes 32-byte lines.
 */
LEAF_MIPS32R2(pixman_mips_fast_memcpy)

	slti	AT, a2, 8
	bne	AT, zero, $last8	/* fewer than 8 bytes: byte loop only */
	 move	v0, a0	/* memcpy returns the dst pointer (runs on both paths) */

/* Test if the src and dst are word-aligned, or can be made word-aligned */
	xor	t8, a1, a0
	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */

	bne	t8, zero, $unaligned
	 negu	a3, a0		/* (-dst) & 3 = bytes up to the next word boundary */

	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
	beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */
	 subu	a2, a2, a3	/* now a2 is the remaining byte count */

	/* One partial-word load/store moves the bytes from a1/a0 up to the
	 * next word boundary, i.e. exactly the a3 head bytes. */
	LWHI	t8, 0(a1)
	addu	a1, a1, a3
	SWHI	t8, 0(a0)
	addu	a0, a0, a3

/* Now the dst/src are mutually word-aligned with word-aligned addresses */
$chk16w:
	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count after 64-byte chunks */

	beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */
				/* There will be at most 1 32-byte chunk after it */
	 subu	a3, a2, t8	/* subtract the remainder from a2 */
				/* Here a3 counts bytes in 16w chunks */
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

/*
 * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
 * the "t0-32" address
 * This means: for x=128 the last "safe" a0 address is "t0-160"
 * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
 * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
 */
	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref	0, 0(a1)	/* bring the first line of src, addr 0 */
	pref	0, 32(a1)	/* bring the second line of src, addr 32 */
	pref	0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
	sgtu	v1, a0, t9	/* v1 = 1 while "pref 30" must be skipped */
	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	 nop
/* otherwise, start with using pref30 */
	pref	30, 64(a0)

/* Aligned main loop: 64 bytes per iteration, as two 8-word halves.
 * t0 is reused as a data register from here on; the end address it held
 * is no longer needed once t9 has been computed. */
$loop16w:
	pref	0, 96(a1)
	lw	t0, 0(a1)
	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
	 lw	t1, 4(a1)
	pref	30, 96(a0)	/* continue setting up the dest, addr 96 */
$skip_pref30_96:
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
	pref	0, 128(a1)	/* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	lw	t0, 32(a1)
	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
	 lw	t1, 36(a1)
	pref	30, 128(a0)	/* continue setting up the dest, addr 128 */
$skip_pref30_128:
	lw	t2, 40(a1)
	lw	t3, 44(a1)
	lw	t4, 48(a1)
	lw	t5, 52(a1)
	lw	t6, 56(a1)
	lw	t7, 60(a1)
	pref	0, 160(a1)	/* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9	/* re-evaluate the "pref 30" guard for the next pass */
	bne	a0, a3, $loop16w
	 addiu	a1, a1, 64	/* adding 64 to src */
	move	a2, t8		/* a2 = bytes left after the 64-byte chunks */

/* Here we have src and dest word-aligned but less than 64-bytes to go */

$chk8w:
	pref	0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* t8 is the remainder count past 32 bytes */
	beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */
	 nop

	lw	t0, 0(a1)
	lw	t1, 4(a1)
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
	beq	a2, t8, $last8	/* no whole word left */
	 subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$wordCopy_loop:
	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
	addiu	a1, a1, 4
	addiu	a0, a0, 4
	bne	a0, a3, $wordCopy_loop
	 sw	t3, -4(a0)	/* a0 was already advanced, hence -4 */

/* For the last (<8) bytes */
$last8:
	blez	a2, $leave	/* nothing (or a non-positive count) left */
	 addu	a3, a0, a2	/* a3 is the dst address past the last byte */
$last8loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $last8loop
	 sb	v1, -1(a0)	/* a0 was already advanced, hence -1 */

$leave:
	j	ra
	 nop

/*
 * UNALIGNED case: src and dst differ in their low two address bits, so only
 * dst can be word-aligned.  Source words are read with LWHI/LWLO pairs and
 * stored with plain sw.
 */

$unaligned:
	/* got here with a3="negu a0" */
	andi	a3, a3, 0x3	/* test if the a0 is word aligned */
	beqz	a3, $ua_chk16w
	 subu	a2, a2, a3	/* bytes left after initial a3 bytes */

	LWHI	v1, 0(a1)	/* read one full (unaligned) source word ... */
	LWLO	v1, 3(a1)
	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
	SWHI	v1, 0(a0)	/* ... but store only up to dst's word boundary */
	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */

$ua_chk16w:
	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count after 64-byte chunks */
	beq	a2, t8, $ua_chk8w	/* if a2==t8, no 64-byte chunks */
				/* There will be at most 1 32-byte chunk after it */
	 subu	a3, a2, t8	/* subtract the remainder from a2 */
				/* Here a3 counts bytes in 16w chunks */
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref	0, 0(a1)	/* bring the first line of src, addr 0 */
	pref	0, 32(a1)	/* bring the second line of src, addr 32 */
	pref	0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
	sgtu	v1, a0, t9	/* v1 = 1 while "pref 30" must be skipped */
	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	 nop
/* otherwise,  start with using pref30 */
	pref	30, 64(a0)

/* Unaligned main loop: 64 bytes per iteration, as two 8-word halves */
$ua_loop16w:
	pref	0, 96(a1)
	LWHI	t0, 0(a1)
	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	bgtz	v1, $ua_skip_pref30_96	/* skip "pref 30, 96(a0)" */
	 LWLO	t1, 7(a1)
	pref	30, 96(a0)	/* continue setting up the dest, addr 96 */
$ua_skip_pref30_96:
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
	pref	0, 128(a1)	/* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	LWHI	t0, 32(a1)
	LWLO	t0, 35(a1)
	LWHI	t1, 36(a1)
	bgtz	v1, $ua_skip_pref30_128	/* skip "pref 30, 128(a0)" */
	 LWLO	t1, 39(a1)
	pref	30, 128(a0)	/* continue setting up the dest, addr 128 */
$ua_skip_pref30_128:
	LWHI	t2, 40(a1)
	LWLO	t2, 43(a1)
	LWHI	t3, 44(a1)
	LWLO	t3, 47(a1)
	LWHI	t4, 48(a1)
	LWLO	t4, 51(a1)
	LWHI	t5, 52(a1)
	LWLO	t5, 55(a1)
	LWHI	t6, 56(a1)
	LWLO	t6, 59(a1)
	LWHI	t7, 60(a1)
	LWLO	t7, 63(a1)
	pref	0, 160(a1)	/* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9	/* re-evaluate the "pref 30" guard for the next pass */
	bne	a0, a3, $ua_loop16w
	 addiu	a1, a1, 64	/* adding 64 to src */
	move	a2, t8		/* a2 = bytes left after the 64-byte chunks */

/* Here dst is word-aligned (src is not) and less than 64 bytes are left */

$ua_chk8w:
	pref	0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* t8 is the remainder count */
	beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */
	/* Delay slot: the first load of the 32-byte chunk.  It therefore also
	 * executes when the branch is taken; t0 is not read on that path. */
	 LWHI	t0, 0(a1)

	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	LWLO	t1, 7(a1)
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$ua_chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
	beq	a2, t8, $ua_smallCopy	/* no whole word left */
	 subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$ua_wordCopy_loop:
	LWHI	v1, 0(a1)
	LWLO	v1, 3(a1)
	addiu	a1, a1, 4
	addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
	bne	a0, a3, $ua_wordCopy_loop
	 sw	v1, -4(a0)	/* a0 was already advanced, hence -4 */

/* Now less than 4 bytes (value in a2) left to copy */
$ua_smallCopy:
	beqz	a2, $leave
	 addu	a3, a0, a2	/* a3 is the dst address past the last byte */
$ua_smallCopy_loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $ua_smallCopy_loop
	 sb	v1, -1(a0)	/* a0 was already advanced, hence -1 */

	j	ra
	 nop

END(pixman_mips_fast_memcpy)