/*
 * Copyright (c) 2009
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#include <private/bionic_asm.h>

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * SWHI / SWLO name the partial-word store used for the high and the low
 * part of a word.  Which of swl/swr that is depends on the byte order, so
 * exactly one of __MIPSEB__ / __MIPSEL__ has to be defined; anything else
 * is rejected here instead of surfacing later as an unknown opcode.
 */
#if defined(__MIPSEB__)
#  define SWHI	swl		/* high part is left in big-endian	*/
#  define SWLO	swr		/* low part is right in big-endian	*/
#elif defined(__MIPSEL__)
#  define SWHI	swr		/* high part is right in little-endian	*/
#  define SWLO	swl		/* low part is left in little-endian	*/
#else
#  error "memset.S: neither __MIPSEB__ nor __MIPSEL__ is defined"
#endif

/* Unless XGPROF or XPROF is defined, SETUP_GP is redefined to nothing. */
#if !(defined(XGPROF) || defined(XPROF))
#undef SETUP_GP
#define SETUP_GP
#endif

/*
 * DBG prefixes the parameter-check instructions below.  With NDEBUG it
 * expands to the comment character, so those lines are commented out;
 * otherwise it expands to nothing and the checks are assembled.
 */
#ifdef NDEBUG
#define DBG #
#else
#define DBG
#endif

/*
 * void _memset16(uint16_t* dst, uint16_t value, size_t size);
 *
 *   a0 = dst   (must be halfword aligned)
 *   a1 = value (its low 16 bits are replicated into both register halves)
 *   a2 = size in bytes (must be even)
 *
 * Scratch: t0 (past-the-end address), t1, and t3 on the pre-R2 path;
 * t2 is used by the DBG checks only.
 *
 * One halfword is stored first when dst is not word aligned.  If four or
 * more bytes remain after that, control passes to the word-store code at
 * .Laligned (inside memset in this file) with a0 word aligned, a1 holding
 * the replicated pattern, a2 the remaining byte count and t0 the
 * past-the-end address.  Exactly two remaining bytes get one more
 * halfword store; anything else returns directly.
 */

LEAF(_memset16,0)
	.set noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,1			# a0 must be halfword aligned
DBG	tne	t0,zero
DBG	andi	t2,a2,1			# a2 must be even
DBG	tne	t2,zero

#ifdef FIXARGS
	/* ensure count is even */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a2,zero,0,1
#else
	ori	a2,1
	xori	a2,1
#endif
#endif

	/* replicate the halfword into both halves of a1 */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a1,a1,16,16
#else
	andi	a1,0xffff
	sll	t3,a1,16
	or	a1,t3
#endif

	beqz	a2,.Ldone
	 andi	t1,a0,2			# delay slot: is dst only halfword aligned?
	beqz	t1,.Lalignok
	 addu	t0,a0,a2		# t0 is the "past the end" address
	sh	a1,0(a0)		# store one halfword to get aligned
	addu	a0,2
	subu	a2,2
.Lalignok:
	slti	t1,a2,4			# .Laligned for 4 or more bytes; the signed
					# compare also keeps a negative a2 out of it
	beqz	t1,.Laligned
	 xori	t1,a2,2			# t1 == 0 only when one halfword is left;
					# a single instruction, so it fits the slot
	bnez	t1,.Ldone
	 nop
	sh	a1,0(a0)
.Ldone:
	j	ra
	 nop
	.set reorder
END(_memset16)

/*
 * void _memset32(uint32_t* dst, uint32_t value, size_t size);
 *
 *   a0 = dst   (must be word aligned)
 *   a1 = value (stored unchanged)
 *   a2 = size in bytes (must be a multiple of 4)
 *
 * A non-zero size is handled by the word-store code at .Laligned (inside
 * memset in this file), entered with t0 = past-the-end address.  A zero
 * size returns immediately.  t0 and t2 are also used by the DBG checks.
 */

LEAF(_memset32,0)
	.set noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,3			# a0 must be word aligned
DBG	tne	t0,zero
DBG	andi	t2,a2,3			# a2 must be a multiple of 4 bytes
DBG	tne	t2,zero

#ifdef FIXARGS
	/* ensure count is a multiple of 4 */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a2,zero,0,2		# bare register names, as everywhere else in this file
#else
	ori	a2,3
	xori	a2,3
#endif
#endif

	bnez	a2,.Laligned		# any work to do?
	 addu	t0,a0,a2		# t0 is the "past the end" address

	j	ra
	 nop
	.set reorder
END(_memset32)

/*
 * memset
 *
 *   In:   a0 = dst
 *         a1 = fill value (only the low byte ends up in memory)
 *         a2 = byte count
 *   Out:  v0 = the original dst
 *   Uses: AT, v1, a3, t0, t4-t8; a0, a1 and a2 are modified.
 *
 * Counts below 4 are written one byte at a time (.Llast4).  Otherwise the
 * fill byte is replicated across a1, one SWHI store covers the bytes up to
 * the first word boundary, and the word-aligned remainder is written as
 * 64-byte chunks, at most one 32-byte chunk, and single words.  A final
 * SWLO store at t0-1 covers the last 0-3 bytes.
 *
 * .Laligned is also branched to by _memset16 and _memset32 in this file.
 * It needs a0 word aligned, a1 = fill word, a2 = byte count and
 * t0 = past-the-end address.
 */
LEAF(memset,0)

	.set	noreorder
	.set	noat

	addu	t0,a0,a2		# t0 is the "past the end" address
	sltiu	AT,a2,4			# is a2 less than 4? (the count is unsigned)
	bne	AT,zero,.Llast4		# if yes, go to last4
	 move	v0,a0			# memset returns the dst pointer

	beq	a1,zero,.Lset0		# an all-zero a1 needs no replication
	 subu	v1,zero,a0		# v1 = -dst, masked to 0..3 at .Lset0

	/* smear byte into 32 bit word */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins     a1, a1, 8, 8        # Replicate fill byte into half-word.
	ins     a1, a1, 16, 16      # Replicate fill byte into word.
#else
	and	a1,0xff
	sll	AT,a1,8
	or	a1,AT
	sll	AT,a1,16
	or	a1,AT
#endif

.Lset0:
	andi	v1,v1,0x3		# word-unaligned address?
	beq	v1,zero,.Laligned	# v1 is the unalignment count
	 subu	a2,a2,v1		# delay slot: drop those bytes from the count
	SWHI	a1,0(a0)		# one partial store up to the word boundary
	addu	a0,a0,v1

/* Here we have the "word-aligned" a0 (until the "last4") */
.Laligned:
	andi	t8,a2,0x3f	# any 64-byte chunks?
				# t8 is the byte count past 64-byte chunks
	beq	a2,t8,.Lchk8w	# when a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk then
	 subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

/*
 * Find out if there are any 64-byte chunks after which at least 96 bytes
 * will still be left.  The value "96" is calculated as the buffer needed
 * for the "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)"
 * after incrementing "a0" by 64.
 * For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk.
 */
	sltiu	v1,a2,160
	bgtz	v1,.Lloop16w_nopref30	# skip "pref 30,0(a0)"
	 subu	t7,a2,96	# subtract "pref 30 unsafe" region
		# below we have at least 1 64-byte chunk which is "pref 30 safe"
	andi	t6,t7,0x3f	# t6 is past "64-byte safe chunks" remainder
	subu	t5,t7,t6	# subtract from t7 the remainder
				# Here t5 counts bytes in 16w "safe" chunks
	addu	t4,a0,t5	# Now t4 is the dst after 64-byte "safe" chunks

/*
 * "pref 30,0(a0)" is deliberately not issued here, because a0 may be in
 * the "middle" of a cache line.
 * From here on we are in the region where "pref 30,64(a0)" is safe.
 */
.Lloop16w:
	addiu	a0,a0,64
	pref	30,-32(a0)	# continue setting up the dest, addr 64-32
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	nop
	nop			# the extra nop instructions help to balance
	nop			# cycles needed for "store" + "fill" + "evict"
	nop			# For 64byte store there are needed 8 fill
	nop			# and 8 evict cycles, i.e. at least 32 instr.
	nop
	nop
	pref	30,0(a0)	# continue setting up the dest, addr 64-0
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	sw	a1,-4(a0)
	nop
	nop
	nop
	nop			# NOTE: adding 14 nop-s instead of 12 nop-s
	nop			# gives better results for "fast" memory
	nop
	bne	a0,t4,.Lloop16w
	 nop

	beq	a0,a3,.Lchk8w	# maybe no more 64-byte chunks?
	 nop			# this "delayed slot" is useless ...

.Lloop16w_nopref30:	# there could be up to 3 "64-byte nopref30" chunks
	addiu	a0,a0,64
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	bne	a0,a3,.Lloop16w_nopref30
	 sw	a1,-4(a0)	# delay slot: last word of the chunk

.Lchk8w:		# t8 here is the byte count past 64-byte chunks

	andi	t7,t8,0x1f	# is there a 32-byte chunk?
				# the t7 is the remainder count past 32-bytes
	beq	t8,t7,.Lchk1w	# when t8==t7, no 32-byte chunk
	 move	a2,t7		# delay slot: a2 = bytes left past the 32-byte chunk

	sw	a1,0(a0)
	sw	a1,4(a0)
	sw	a1,8(a0)
	sw	a1,12(a0)
	sw	a1,16(a0)
	sw	a1,20(a0)
	sw	a1,24(a0)
	sw	a1,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	t8,a2,0x3	# now t8 is the remainder past 1w chunks
	beq	a2,t8,.Llast4aligned
	 subu	a3,a2,t8	# a3 is the count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

/* storing in words (4-byte chunks) */
.LwordCopy_loop:
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	 sw	a1,-4(a0)

/*
 * store last 0-3 bytes
 * this will repeat the last store if the memset finishes on a word boundary
 */
.Llast4aligned:
	j	ra
	 SWLO	a1,-1(t0)

/* Byte-at-a-time path, taken when the count is below 4. */
.Llast4:
	beq	a0,t0,.Llast4e	# nothing to store when the count is 0
.Llast4l:
	 addiu	a0,a0,1		# delay slot of the beq above and head of the loop
	bne	a0,t0,.Llast4l
	 sb	a1,-1(a0)
.Llast4e:
	j	ra
	 nop

	.set	at
	.set	reorder

END(memset)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/