memset.S revision 7e4fa560999d07064d219a16ebb50d3691dd1b63
/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC ZVA instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them on each call.  */

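/* Note (added for clarity): the "dc zva" block size is discovered from
   DCZID_EL0.  Bits [3:0] hold log2 of the block size in words, so the
   size in bytes is 4 << DCZID_EL0[3:0]; bit 4 (DZP) set means DC ZVA is
   prohibited and the store-based path must be used instead.  The decode
   below (mov #4 / and #15 / lsl) follows that layout.  */
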
#define dstin		x0
#define val		w1
#define count		x2
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9

ENTRY(memset)

	mov	dst, dstin		/* Preserve return value.  */
	ands	A_lw, val, #255
#ifndef DONT_USE_DC
	b.eq	.Lzero_mem
#endif
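	/* Replicate the fill byte across all 64 bits of A_l: the two 32-bit
	   ORRs duplicate it into 16 and then 32 bits, and the final 64-bit
	   ORR copies the low word into the high word.  */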
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
.Ltail63:
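	/* 16..63 byte tail: tmp1 = count & 0x30 is the number of whole
	   16-byte chunks remaining (0, 16, 32 or 48 bytes).  dst is advanced
	   by tmp1 up front and the STPs store at negative offsets, so the
	   two branches simply pick how many of the three STPs execute as a
	   fall-through chain.  */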
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
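	/* Each TBZ tests one bit of count and, when set, performs the
	   matching power-of-two store (8, 4, 2, then 1 bytes); post-index
	   writeback advances dst between stores, and the final byte store
	   needs no writeback.  */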
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lnot_short:
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	sub	dst, dst, #16		/* Pre-bias.  */
	sub	count, count, #64
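	/* dst is pre-biased by 16 so the loop's final STP can use pre-index
	   writeback ([dst, #64]!) to store the last 16 bytes of each 64-byte
	   block and advance dst in one instruction; count is pre-decremented
	   so the SUBS/B.GE pair checks whether another full block remains.  */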
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
	ret

#ifndef DONT_USE_DC
	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
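	/* Note (added for clarity): the cached word has three states:
	   0 means "not probed yet", ~0 (tested via bit 31) means "DC ZVA
	   prohibited", and any other value is the block length in bytes.
	   Caching presumably avoids re-reading DCZID_EL0, which a
	   hypervisor may trap, on every call.  */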
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Lnot_short
	cbnz	zva_len, .Lzero_by_line
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Lnot_short
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to zero after alignment.  */
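	/* The CCMP makes this a combined test: if the first compare found
	   tmp1 >= 64 it compares tmp1 against zva_len_x, otherwise it forces
	   NZCV to 0b1000 (N set) so the following B.LT is taken.  The ZVA
	   path is therefore used only when at least 64 bytes and at least
	   one full ZVA block remain after alignment.  */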
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
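	/* The loop below clears the tmp2 alignment slack in 64-byte chunks,
	   running up to 64 bytes past the ZVA boundary; adding the (now
	   negative) tmp2 afterwards pulls dst back to the boundary.  The
	   overrun stays inside the buffer because the check above guarantees
	   at least 64 bytes remain beyond that point.  */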
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
1:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
END(memset)

#ifdef MAYBE_VIRT
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif
#endif /* DONT_USE_DC */