/* memset.S revision 784609317d49e854813f1797d7a53cf7d4379643 */
/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */

#define dstin		x0
#define val		w1
#define count		x2
#define dst_count x3 /* for __memset_chk */
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9

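/* __memset_chk is the run-time check used by _FORTIFY_SOURCE: dst_count is
   the size the compiler knows for the destination buffer.  If count fits,
   tail-call the ordinary memset; otherwise save a frame for an accurate
   backtrace and divert to __memset_chk_fail.  */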
ENTRY(__memset_chk)
  cmp count, dst_count
  bls memset

  // Preserve for accurate backtrace.
  stp x29, x30, [sp, -16]!
  .cfi_def_cfa_offset 16
  .cfi_rel_offset x29, 0
  .cfi_rel_offset x30, 8

  bl __memset_chk_fail
END(__memset_chk)

ENTRY(memset)

	mov	dst, dstin		/* Preserve return value.  */
	ands	A_lw, val, #255
	b.eq	.Lzero_mem
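	/* Replicate the fill byte into every byte of A_l
	   (e.g. 0x2a becomes 0x2a2a2a2a2a2a2a2a).  */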
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
.Ltail63:
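	/* Store the 16-byte chunks of the remainder: count & 0x30 selects how
	   many of the three stp instructions below execute, then .Ltail15
	   finishes the last count & 15 bytes.  */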
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
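	/* Write the final count & 15 bytes with a single 16-byte store that
	   ends at the last byte to be set.  The store may reach back before
	   dst, which is safe: every path that lands here has already written
	   at least the 16 bytes just below dst.  */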
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lnot_short:
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be 63 or fewer bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	sub	dst, dst, #16		/* Pre-bias.  */
	sub	count, count, #64
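	/* Main loop: 64 bytes per iteration; the pre-indexed store at
	   [dst, #64]! advances dst, and the pre-biased dst and count leave at
	   most 63 bytes for the tail code once the loop exits.  */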
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
	ret

	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short
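	/* DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits [3:0]
	   hold log2 of the zero block size in words, so the block size in
	   bytes (zva_len below) is 4 << DCZID_EL0[3:0].  */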
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Lnot_short
	cbnz	zva_len, .Lzero_by_line
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Lnot_short
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to zero after alignment.  */
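	/* The ccmp below only compares tmp1 against zva_len_x when tmp1 >= 64;
	   otherwise it sets NZCV to 0b1000 (N set), so the b.lt is taken and
	   we fall back to the plain store loop.  */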
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/* We know that there are at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
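	/* tmp2 is the number of bytes needed to reach ZVA alignment.  Zero
	   them 64 at a time (possibly overshooting), then step dst back by
	   the overshoot, which is the now-negative tmp2.  */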
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
1:
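	/* dst is now ZVA-aligned.  Pre-bias count by one block so the DC ZVA
	   loop below only runs while a whole block remains.  */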
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
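	/* count went negative in the loop; masking with zva_bits_x recovers
	   the sub-block remainder (count modulo the block size).  */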
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
END(memset)

#ifdef MAYBE_VIRT
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif