/* memset.S revision 372f19e9e27c1333c0fc1e83b53d365051e81612 */
/* Copyright (c) 2012-2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */
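
/* Note: this version does not implement MAYBE_VIRT; DCZID_EL0 is read
   again on every call that reaches L(try_zva) below.  Per the Arm
   Architecture Reference Manual, DCZID_EL0 bits [3:0] give log2 of the
   DC ZVA block size in words, so the size in bytes is 4 << bits, and
   bit 4 (DZP) set means DC ZVA is prohibited.  A minimal C sketch of
   the same decode, illustrative only and not part of this file:

     unsigned long dczid;
     __asm__("mrs %0, dczid_el0" : "=r"(dczid));
     unsigned long zva_size = (dczid & 16) ? 0 : (4UL << (dczid & 15));
 */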

#define dstin		x0
#define val		x1
#define valw		w1
#define count		x2
#define dst		x3
#define dstend		x4
#define tmp1		x5
#define tmp1w		w5
#define tmp2		x6
#define tmp2w		w6
#define zva_len		x7
#define zva_lenw	w7

#define L(l) .L ## l

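/* __memset_chk(dst, byte, n, dst_len) is the _FORTIFY_SOURCE entry
   point: the compiler replaces memset(dst, byte, n) with a call that
   also passes the known size of the destination object, so an
   oversized fill aborts instead of silently corrupting memory.  A
   sketch of the equivalent C logic (illustrative only):

     void* __memset_chk(void* dst, int byte, size_t n, size_t dst_len) {
       if (n > dst_len) __memset_chk_fail();  // does not return
       return memset(dst, byte, n);
     }
 */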
ENTRY(__memset_chk)
  cmp count, dst
  bls memset

  // Preserve for accurate backtrace.
  stp x29, x30, [sp, -16]!
  .cfi_def_cfa_offset 16
  .cfi_rel_offset x29, 0
  .cfi_rel_offset x30, 8

  bl __memset_chk_fail
END(__memset_chk)

ENTRY(memset)

	dup	v0.16B, valw
	add	dstend, dstin, count
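	/* dstend points one past the last byte to set.  The paths below
	   store from both ends of the buffer with overlapping stores, so
	   no per-byte loop is ever needed.  */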

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]

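	/* count < 16 here.  Test count's bits from high to low: if bit 3
	   is set, two overlapping 8-byte stores (one from each end) cover
	   8..15 bytes; bit 2 likewise covers 4..7 with 4-byte stores; a
	   byte store plus a 2-byte store at the end cover 1..3.  */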
	/* Set 0..15 bytes.  */
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	nop
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
	nop
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
	cmp	count, 256
	ccmp	valw, 0, 0, cs
	b.eq	L(try_zva)
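	/* DC ZVA can only write zeroes, so the cmp/ccmp pair above takes
	   L(try_zva) only when count >= 256 and the fill value is zero;
	   everything else falls through to the plain store loop.  */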
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	add	dst, dst, 16
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
L(tail64):
	subs	count, count, 64
	b.hi	1b
2:	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(try_zva):
	mrs	tmp1, dczid_el0
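	/* DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits 3:0
	   hold log2(words) of the ZVA block, i.e. size = 4 << bits, so the
	   value 4 below means 64 bytes and 5 means 128 bytes.  */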
	tbnz	tmp1w, 4, L(no_zva)
	and	tmp1w, tmp1w, 15
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)

	/* Write the first and last 64 byte aligned block using stp rather
	   than using DC ZVA.  This is faster on some cores.
	 */
L(zva_64):
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	sub	count, dstend, dst	/* Count is now 128 too large.	*/
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
	nop
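	/* Worked example (illustrative): if dstend - dst == 320 at the
	   "Count is now 128 too large" line, count becomes 320 - 256 = 64,
	   the loop below issues one DC ZVA covering [dst, dst+64) (dst has
	   already been advanced past the 128 stored bytes), and the four
	   trailing stp cover the remaining 128 bytes.  */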
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	1b
	stp	q0, q0, [dst, 0]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(zva_128):
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
	b.ne	L(zva_other)

	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127
	sub	count, dstend, dst	/* Count is now 128 too large.	*/
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
1:	dc	zva, dst
	add	dst, dst, 128
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

L(zva_other):
	mov	tmp2w, 4
	lsl	zva_lenw, tmp2w, tmp1w
	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
	cmp	count, tmp1
	blo	L(no_zva)
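	/* Below the zva_len + 64 threshold the alignment prologue alone
	   could cover the whole buffer, so plain stores are cheaper than
	   setting up DC ZVA.  */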

	sub	tmp2, zva_len, 1
	add	tmp1, dst, zva_len
	add	dst, dst, 16
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
	beq	2f
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	subs	count, count, 64
	b.hi	1b
2:	mov	dst, tmp1
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
	subs	count, count, zva_len
	b.lo	4f
3:	dc	zva, dst
	add	dst, dst, zva_len
	subs	count, count, zva_len
	b.hs	3b
4:	add	count, count, zva_len
	b	L(tail64)

END(memset)
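
/* Usage note: a quick host-side check of an AArch64 memset build can
   exercise the size-class boundaries above (15/16, 96/97, 255/256).
   A minimal C harness, illustrative only:

     #include <string.h>
     #include <assert.h>
     int main(void) {
       static const size_t sizes[] = { 0, 1, 15, 16, 17, 96, 97, 255, 256, 4096 };
       unsigned char buf[8192];
       for (unsigned i = 0; i < sizeof(sizes)/sizeof(sizes[0]); i++) {
         memset(buf, 0xa5, sizeof(buf));          // poison the buffer
         memset(buf + 64, 0, sizes[i]);           // zero fill (DC ZVA eligible)
         for (size_t j = 0; j < sizes[i]; j++) assert(buf[64 + j] == 0);
         assert(buf[63] == 0xa5 && buf[64 + sizes[i]] == 0xa5);  // no overrun
       }
       return 0;
     }
 */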