/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation.  All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */
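/* The ZVA block size used below is not a build-time constant: it is read
   from the DCZID_EL0 system register at run time.  MAYBE_VIRT exists
   because such system register reads can be comparatively expensive when
   running under a hypervisor, so the value is worth caching per process.  */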

#define dstin		x0
#define val		w1
#define count		x2
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9

#define QA_l		q0

ENTRY(memset)

	mov	dst, dstin		/* Preserve return value.  */
	ands	A_lw, val, #255
#ifndef DONT_USE_DC
	b.eq	.Lzero_mem
#endif
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
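	/* The orr sequence above replicates the low byte across all eight
	 * bytes of A_l, e.g. val == 0xAB yields A_lw == 0xABABABAB and
	 * A_l == 0xABABABABABABABAB.  */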
.Ltail_maybe_long:
	cmp	count, #256
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
.Ltail255:
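	/* If bits 7:6 of count are non-zero, store 64, 128 or 192 bytes by
	 * branching into the store ladder below: falling through stores
	 * 192 bytes, entering at 1: stores 128 and entering at 2: stores
	 * 64.  Whatever is left is handled by .Ltail63.  */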
	ands	tmp1, count, #0xC0
	b.eq	.Ltail63
	dup	v0.4s, A_lw
	cmp	tmp1w, #0x80
	b.eq	1f
	b.lt	2f
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
1:
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
2:
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
.Ltail63:
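	/* Same dispatch trick for bits 5:4 of count, except dst is advanced
	 * first so the stores can use fixed negative offsets and all three
	 * entry points share the final store at [dst, #-16].  */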
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret
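	/* .Ltail15 writes a final 16-byte store ending exactly at the end
	 * of the buffer; the bytes it rewrites were already set, so the
	 * overlap is harmless.  Buffers shorter than 16 bytes go through
	 * .Ltail15tiny instead, which makes no such assumption.  */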

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume that any earlier memory
	   has been set.  */
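	/* Test count bit by bit (8, 4, 2, 1) and emit a store of matching
	 * width for each bit that is set; e.g. count == 11 (0b1011) stores
	 * 8 + 2 + 1 bytes.  */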
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lnot_short:
	dup	v0.4s, A_lw
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
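	/* Example: if dst % 16 == 5 then tmp2 == 11; the unaligned 16-byte
	 * store below covers those 11 bytes plus 5 more that the following
	 * aligned stores will simply write again.  */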
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be fewer than 256 bytes to go now.  */
	cmp	count, #255
	b.le	.Ltail255
2:
	cmp	count, #2097152
	b.gt	3f
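	/* Fills larger than 2 MiB take the loop at 3: below, which uses
	 * non-temporal stnp stores, presumably to avoid displacing the
	 * working set from the caches.  */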
1:
	sub	count, count, #256
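	/* count is biased down by 256 here so that the subs/b.ge pair at
	 * the bottom of the loop exits once fewer than 256 bytes remain.
	 * The tail code only inspects the low bits of count, so the bias
	 * is harmless there.  */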
2:
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	subs	count, count, #256
	b.ge	2b
	tst	count, #0xff
	b.ne	.Ltail255
	ret
3:
	sub	count, count, #64
4:
	subs	count, count, #64
	stnp	QA_l, QA_l, [dst]
	stnp	QA_l, QA_l, [dst, #32]
	add	dst, dst, #64
	b.ge	4b
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

#ifndef DONT_USE_DC
	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
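	/* "dc zva, Xt" zeroes one naturally aligned block of memory in a
	 * single instruction.  The block size is (4 << DCZID_EL0[3:0])
	 * bytes, and DCZID_EL0 bit 4 set means the instruction must not be
	 * used; both facts are relied on below.  */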
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
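	/* The cached word starts out as 0 (not probed yet); after the first
	 * call it holds either ~0, meaning ZVA is unusable (detected via
	 * the sign bit below), or the ZVA block length in bytes.  */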
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Lnot_short
	cbnz	zva_len, .Lzero_by_line
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Lnot_short
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif
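	/* zva_len is therefore 4 << DCZID_EL0[3:0] bytes; for the typical
	 * field value of 4 this is 4 << 4 == 64 bytes.  */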

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to set after alignment.  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
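	/* The ccmp compares tmp1 with zva_len_x only if tmp1 >= 64;
	 * otherwise it forces NZCV to 0b1000 (N set) so the branch below is
	 * taken.  Net effect: fall back to .Lnot_short unless tmp1 >= 64
	 * and tmp1 >= zva_len_x.  */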
	b.lt	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
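	/* The loop above keeps storing 64-byte chunks until tmp2 (the
	 * distance to the next ZVA boundary) goes negative, so dst may
	 * overshoot the boundary by up to 64 bytes; the add just above
	 * backs dst up onto the boundary exactly, and count already
	 * excludes the pre-boundary bytes.  */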
1:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
END(memset)

#ifdef MAYBE_VIRT
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif
#endif /* DONT_USE_DC */