1/*
2 * Copyright (c) 2009
3 *      MIPS Technologies, Inc., California.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14 *    contributors may be used to endorse or promote products derived from
15 *    this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30/************************************************************************
31 *
32 *  memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
33 *  Version: "043009"
34 *
35 ************************************************************************/
36
37
38/************************************************************************
39 *  Include files
40 ************************************************************************/
41
42#include "machine/asm.h"
43
44/*
45 * This routine could be optimized for MIPS64. The current code only
46 * uses MIPS32 instructions.
47 */
48
/*
 * SWHI is the partial-word store used to fill the unaligned head of the
 * buffer: "swl" on big-endian targets, "swr" on little-endian targets.
 */
#if defined(__MIPSEB__)
#  define SWHI	swl		/* high part is left in big-endian	*/
#endif

#if defined(__MIPSEL__)
#  define SWHI	swr		/* high part is right in little-endian	*/
#endif

/* Fail early and clearly instead of with an unknown-opcode error later. */
#if !defined(SWHI)
#  error "SWHI is undefined: neither __MIPSEB__ nor __MIPSEL__ is set"
#endif
56
/* Unless XGPROF or XPROF is defined, make SETUP_GP expand to nothing. */
#if !defined(XGPROF) && !defined(XPROF)
#  undef SETUP_GP
#  define SETUP_GP
#endif
61
/*
 * memset_cmips - fill a2 bytes starting at a0 with the low byte of a1.
 *
 * Register use (as established by the code below):
 *   a0  in:  destination pointer; advanced as bytes are stored
 *   a1  in:  fill value; its low byte is replicated into all four bytes
 *   a2  in:  byte count, treated as unsigned
 *   v0  out: the original destination pointer
 *   t0       one-past-the-end address (a0 + a2 at entry)
 *   AT, v1, a3, t4-t8: scratch
 *
 * Strategy: counts below 4 go straight to the byte loop.  Otherwise the
 * head is word-aligned with one partial-word store (SWHI), then the body
 * is written in 64-byte chunks (using "pref 30" only while at least 96
 * bytes still remain after the chunk), one optional 32-byte chunk,
 * single words, and finally a byte loop for the tail.
 *
 * The code runs under ".set noreorder": the instruction following every
 * branch is its delay slot and executes whether or not the branch is
 * taken.  The nop padding inside .Lloop16w is deliberate timing padding
 * (see the comments there); do not remove or reorder it casually.
 */
LEAF(memset_cmips,0)

	.set	noreorder
	.set	noat

	addu	t0,a0,a2		# t0 is the "past the end" address
	sltiu	AT,a2,4			# fewer than 4 bytes? (count is unsigned)
	bne	AT,zero,.Llast4		# then only the byte loop is needed
	move	v0,a0			# (delay slot) memset returns the dst pointer

	beq	a1,zero,.Lset0		# a zero fill value needs no smearing
	subu	v1,zero,a0		# (delay slot) v1 = -dst, used for alignment

	# smear byte into 32 bit word
#if (__mips==32) && (__mips_isa_rev>=2)
	ins     a1, a1, 8, 8        # Replicate fill byte into half-word.
	ins     a1, a1, 16, 16      # Replicate fill byte into word.
#else
	and	a1,0xff
	sll	AT,a1,8
	or	a1,AT
	sll	AT,a1,16
	or	a1,AT
#endif

.Lset0:	andi	v1,v1,0x3		# v1 = bytes up to the next word boundary
	beq	v1,zero,.Laligned	# v1 is the unalignment count
	subu	a2,a2,v1		# (delay slot) drop the head bytes from a2
	SWHI	a1,0(a0)		# partial-word store covers the head bytes
	addu	a0,a0,v1		# a0 is word-aligned from here on

# Here we have the "word-aligned" a0 (until the "last4")
.Laligned:
	andi	t8,a2,0x3f	# any 64-byte chunks?
				# t8 is the byte count past 64-byte chunks
	beq	a2,t8,.Lchk8w	# when a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk then
	subu	a3,a2,t8	# (delay slot) subtract the remainder from a2
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

# The "pref 30" loop below issues its prefetches ahead of the stores, so it
# is only used for 64-byte chunks after which at least 96 bytes of the
# buffer still remain.  The value 96 is the margin the original authors
# chose for the prefetch running ahead of the data being written.
# For "a2" below 160 there is no such "pref 30 safe" 64-byte chunk.
#
	sltiu	v1,a2,160
	bgtz	v1,.Lloop16w_nopref30	# skip the "pref 30" loop entirely
	subu	t7,a2,96	# (delay slot) subtract "pref 30 unsafe" region
		# below we have at least 1 64-byte chunk which is "pref 30 safe"
	andi	t6,t7,0x3f	# t6 is the remainder past the "safe" chunks
	subu	t5,t7,t6	# subtract the remainder from t7
				# Here t5 counts bytes in 16w "safe" chunks
	addu	t4,a0,t5	# Now t4 is the dst after 64-byte "safe" chunks

# Do not use "pref 30,0(a0)" for a0 in a "middle" of a cache line
#	pref	30,0(a0)
# Here we are in the region where the prefetches below are safe to issue
.Lloop16w:
	addiu	a0,a0,64
	pref	30,-32(a0)	# prepare the second half of this chunk
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	nop
	nop			# the extra nop instructions help to balance
	nop			# cycles needed for "store" + "fill" + "evict"
	nop			# For 64byte store there are needed 8 fill
	nop			# and 8 evict cycles, i.e. at least 32 instr.
	nop
	nop
	pref	30,0(a0)	# prepare the start of the next chunk
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	sw	a1,-4(a0)
	nop
	nop
	nop
	nop			# NOTE: adding 14 nop-s instead of 12 nop-s
	nop			# gives better results for "fast" memory
	nop
	bne	a0,t4,.Lloop16w
	nop			# (delay slot) the 14th padding nop

	# NOTE(review): at least 96 bytes remain after the loop above, so
	# a0 != a3 here and this branch looks never taken; kept as is.
	beq	a0,a3,.Lchk8w	# maybe no more 64-byte chunks?
	nop			# this delay slot is unused

.Lloop16w_nopref30:	# the last 1 or 2 64-byte chunks, without "pref 30"
	addiu	a0,a0,64
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	bne	a0,a3,.Lloop16w_nopref30
	sw	a1,-4(a0)	# (delay slot) last word of the chunk

.Lchk8w:		# t8 here is the byte count past 64-byte chunks

	andi	t7,t8,0x1f	# is there a 32-byte chunk?
				# t7 is the remainder count past 32 bytes
	beq	t8,t7,.Lchk1w	# when t8==t7, no 32-byte chunk
	move	a2,t7		# (delay slot) a2 = bytes left after this step

	sw	a1,0(a0)
	sw	a1,4(a0)
	sw	a1,8(a0)
	sw	a1,12(a0)
	sw	a1,16(a0)
	sw	a1,20(a0)
	sw	a1,24(a0)
	sw	a1,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	t8,a2,0x3	# now t8 is the remainder past 1w chunks
	beq	a2,t8,.Llast4	# fewer than 4 bytes left: no whole words
	subu	a3,a2,t8	# (delay slot) a3 = count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

# storing whole words (4-byte chunks)
.LwordCopy_loop:
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	sw	a1,-4(a0)	# (delay slot) store the word just stepped over

# Byte loop for the tail.  The addiu at .Llast4l doubles as the delay slot
# of the beq at .Llast4; when that branch is taken the extra increment is
# harmless because a0 is not used again before returning.
.Llast4:beq	a0,t0,.Llast4e
.Llast4l:addiu	a0,a0,1
	bne	a0,t0,.Llast4l
	sb	a1,-1(a0)	# (delay slot) store the byte just stepped over

.Llast4e:
	j	ra
	nop			# (delay slot)

	.set	at
	.set	reorder

END(memset_cmips)
222
223
224/************************************************************************
225 *  Implementation : Static functions
226 ************************************************************************/
227
228