/* memset.S revision 851e68a2402fa414544e66650e09dfdaac813e51 */
/*
 * Copyright (c) 2009
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#include <private/bionic_asm.h>

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * SWHI / SWLO select the partial-word store (swl or swr) for the current
 * byte order. memset uses SWHI for the unaligned head of the buffer and
 * SWLO for its tail. Exactly one of __MIPSEB__ / __MIPSEL__ must be
 * defined; otherwise the build stops here instead of failing later on an
 * undefined SWHI.
 */
#if defined(__MIPSEB__)
#  define SWHI	swl		/* high part is left in big-endian	*/
#  define SWLO	swr		/* low part is right in big-endian	*/
#elif defined(__MIPSEL__)
#  define SWHI	swr		/* high part is right in little-endian	*/
#  define SWLO	swl		/* low part is left in little-endian	*/
#else
#  error "memset.S: neither __MIPSEB__ nor __MIPSEL__ is defined"
#endif

/*
 * Unless a profiling build (XGPROF or XPROF) is requested, SETUP_GP is
 * redefined to expand to nothing.
 */
#if !(defined(XGPROF) || defined(XPROF))
#undef SETUP_GP
#define SETUP_GP
#endif

/*
 * DBG prefixes the argument-check instructions of _memset16/_memset32.
 * With NDEBUG it expands to the assembler comment character, so the rest
 * of the line is ignored; otherwise it expands to nothing and the checks
 * are assembled.
 */
#ifdef NDEBUG
#define DBG #
#else
#define DBG
#endif

/*
 * void _memset16(uint16_t* dst, uint16_t value, size_t size);
 *
 * Fills size bytes starting at dst with the 16-bit pattern value.
 *
 * In:   a0 = dst   (must be halfword aligned)
 *       a1 = value (low 16 bits are used)
 *       a2 = size  (byte count, must be even)
 * Uses: t0 = dst + size ("past the end" address), t1, t3 as scratch;
 *       the debug argument checks also write t0 and t2.
 *
 * After an optional leading halfword store to reach word alignment, runs
 * of 4 or more bytes are handed to .Laligned inside memset, which also
 * performs the return to the caller. Shorter runs are finished here.
 */

LEAF(_memset16,0)
	.set noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,1			# a0 must be halfword aligned
DBG	tne	t0,zero
DBG	andi	t2,a2,1			# a2 must be even
DBG	tne	t2,zero

#ifdef FIXARGS
	/* ensure count is even */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a2,zero,0,1
#else
	ori	a2,1
	xori	a2,1
#endif
#endif

	/* replicate the halfword into both halves of a1 */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a1,a1,16,16
#else
	andi	a1,0xffff
	sll	t3,a1,16
	or	a1,t3
#endif

	beqz	a2,.Ldone		# nothing to do for size 0
	 andi	t1,a0,2			# t1 != 0: dst is not word aligned
	beqz	t1,.Lalignok
	 addu	t0,a0,a2		# t0 is the "past the end" address
	sh	a1,0(a0)		# store one halfword to get aligned
	addu	a0,2
	subu	a2,2
.Lalignok:
	slti	t1,a2,4			# .Laligned for 4 or more bytes
	beqz	t1,.Laligned
	/*
	 * Delay slot: must be a single machine instruction. t1 becomes zero
	 * only when a2 == 2, i.e. when exactly one more halfword is left.
	 */
	 xori	t1,a2,2
	bnez	t1,.Ldone
	 nop
	sh	a1,0(a0)
.Ldone:
	j	ra
	 nop
	.set reorder
END(_memset16)

/*
 * void _memset32(uint32_t* dst, uint32_t value, size_t size);
 *
 * Fills size bytes starting at dst with the 32-bit pattern value.
 *
 * In:   a0 = dst   (must be word aligned)
 *       a1 = value
 *       a2 = size  (byte count, must be a multiple of 4)
 * Uses: t0 = dst + size ("past the end" address);
 *       the debug argument checks also write t0 and t2.
 *
 * For a non-zero size the work is done by .Laligned inside memset, which
 * also performs the return to the caller.
 */

LEAF(_memset32,0)
	.set noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,3			# a0 must be word aligned
DBG	tne	t0,zero
DBG	andi	t2,a2,3			# a2 must be a multiple of 4 bytes
DBG	tne	t2,zero

#ifdef FIXARGS
	/* ensure count is a multiple of 4 */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a2,zero,0,2
#else
	ori	a2,3
	xori	a2,3
#endif
#endif

	bnez	a2,.Laligned		# any work to do?
	 addu	t0,a0,a2		# t0 is the "past the end" address

	j	ra			# size was 0: return immediately
	 nop
	.set reorder
END(_memset32)

/*
 * void* memset(void* dst, int value, size_t size);
 *
 * In:   a0 = dst, a1 = fill value (only the low byte is stored),
 *       a2 = byte count
 * Out:  v0 = original dst
 * Uses: t0 = dst + size ("past the end" address),
 *       AT, v1, a3, t4-t8 as scratch
 *
 * Stages: byte loop for fewer than 4 bytes; otherwise an unaligned head
 * store (SWHI), 64-byte chunks (with and without "pref 30"), at most one
 * 32-byte chunk, single words, and a final unaligned tail store (SWLO).
 *
 * _memset16 and _memset32 branch to .Laligned with a0 word aligned, a1
 * holding the full 32-bit pattern, a2 the remaining byte count and t0 the
 * "past the end" address.
 */
LEAF(memset,0)

	.set	noreorder
	.set	noat

	addu	t0,a0,a2		# t0 is the "past the end" address
	slti	AT,a2,4			# is a2 less than 4? (signed compare)
	bne	AT,zero,.Llast4		# if yes, go to last4
	 move	v0,a0			# memset returns the dst pointer

	beq	a1,zero,.Lset0		# a zero fill needs no smearing
	 subu	v1,zero,a0		# v1 = -dst, masked to 0..3 at .Lset0

	/* smear byte into 32 bit word */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins     a1, a1, 8, 8        # Replicate fill byte into half-word.
	ins     a1, a1, 16, 16      # Replicate fill byte into word.
#else
	and	a1,0xff
	sll	AT,a1,8
	or	a1,AT
	sll	AT,a1,16
	or	a1,AT
#endif

.Lset0:
	andi	v1,v1,0x3		# word-unaligned address?
	beq	v1,zero,.Laligned	# v1 is the unalignment count
	 subu	a2,a2,v1		# a2 = bytes left after the head
	SWHI	a1,0(a0)		# store the 1-3 head bytes
	addu	a0,a0,v1		# a0 is word aligned from here on

/* Here we have the "word-aligned" a0 (until the "last4") */
.Laligned:
	andi	t8,a2,0x3f	# any 64-byte chunks?
				# t8 is the byte count past 64-byte chunks
	beq	a2,t8,.Lchk8w	# when a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk then
	 subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

/*
 * Find out, if there are any 64-byte chunks after which will be still at least
 * 96 bytes left. The value "96" is calculated as needed buffer for
 * "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after
 * incrementing "a0" by 64.
 * For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk.
 */
	sltiu	v1,a2,160
	bgtz	v1,.Lloop16w_nopref30	# skip "pref 30,0(a0)"
	 subu	t7,a2,96	# subtract "pref 30 unsafe" region
		# below we have at least 1 64-byte chunk which is "pref 30 safe"
	andi	t6,t7,0x3f	# t6 is past "64-byte safe chunks" remainder
	subu	t5,t7,t6	# subtract from t7 the remainder
				# Here t5 counts bytes in 16w "safe" chunks
	addu	t4,a0,t5	# Now t4 is the dst after 64-byte "safe" chunks

/*
 * Do not use "pref 30,0(a0)" for a0 in a "middle" of a cache line:
 *	pref	30,0(a0)
 * Here we are in the region, where it is safe to use "pref 30,64(a0)"
 */
.Lloop16w:
	addiu	a0,a0,64
	pref	30,-32(a0)	# continue setting up the dest, addr 64-32
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	nop
	nop			# the extra nop instructions help to balance
	nop			# cycles needed for "store" + "fill" + "evict"
	nop			# For 64byte store there are needed 8 fill
	nop			# and 8 evict cycles, i.e. at least 32 instr.
	nop
	nop
	pref	30,0(a0)	# continue setting up the dest, addr 64-0
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	sw	a1,-4(a0)
	nop
	nop
	nop
	nop			# NOTE: adding 14 nop-s instead of 12 nop-s
	nop			# gives better results for "fast" memory
	nop
	bne	a0,t4,.Lloop16w
	 nop

	beq	a0,a3,.Lchk8w	# maybe no more 64-byte chunks?
	 nop			# this "delayed slot" is useless ...

.Lloop16w_nopref30:	# there could be up to 3 "64-byte nopref30" chunks
	addiu	a0,a0,64
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	bne	a0,a3,.Lloop16w_nopref30
	 sw	a1,-4(a0)	# 16th word of the chunk, stored in the delay slot

.Lchk8w:		# t8 here is the byte count past 64-byte chunks

	andi	t7,t8,0x1f	# is there a 32-byte chunk?
				# the t7 is the remainder count past 32-bytes
	beq	t8,t7,.Lchk1w	# when t8==t7, no 32-byte chunk
	 move	a2,t7		# a2 = bytes left after the 32-byte chunk

	sw	a1,0(a0)
	sw	a1,4(a0)
	sw	a1,8(a0)
	sw	a1,12(a0)
	sw	a1,16(a0)
	sw	a1,20(a0)
	sw	a1,24(a0)
	sw	a1,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	t8,a2,0x3	# now t8 is the remainder past 1w chunks
	beq	a2,t8,.Llast4aligned
	 subu	a3,a2,t8	# a3 is the count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

/* storing in words (4-byte chunks) */
.LwordCopy_loop:
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	 sw	a1,-4(a0)

/*
 * store last 0-3 bytes
 * this will repeat the last store if the memset finishes on a word boundary
 */
.Llast4aligned:
	j	ra
	 SWLO	a1,-1(t0)

/* Byte loop for fewer than 4 bytes: store until a0 reaches t0. */
.Llast4:
	beq	a0,t0,.Llast4e	# size 0: nothing to store
.Llast4l:
	 addiu	a0,a0,1		# delay slot of the beq above and loop entry
	bne	a0,t0,.Llast4l
	 sb	a1,-1(a0)
.Llast4e:
	j	ra
	 nop

	.set	at
	.set	reorder

END(memset)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/
323