/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation.  All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.

   NOTE(review): in this file the only branch to .Lzero_mem (at the top
   of memset) is written with a leading '#', so the DC path appears to be
   unreachable whether or not DONT_USE_DC is defined -- confirm.  */

/* Register roles used by memset below.  */
#define dstin		x0	/* Destination argument; never written, so it
				   is still the return value on exit.  */
#define val		w1	/* Fill value argument (low 8 bits are used).  */
#define count		x2	/* Number of bytes still to set.  */
#define tmp1		x3	/* Scratch.  */
#define tmp1w		w3	/* 32-bit view of tmp1.  */
#define tmp2		x4	/* Scratch.  */
#define tmp2w		w4	/* 32-bit view of tmp2.  */
#define zva_len_x	x5	/* DC ZVA block length in bytes.  */
#define zva_len		w5	/* 32-bit view of zva_len_x.  */
#define zva_bits_x	x6	/* zva_len_x - 1, used as an alignment mask.  */

#define A_l		x7	/* Fill byte replicated across 64 bits.  */
#define A_lw		w7	/* 32-bit view of A_l.  */
#define dst		x8	/* Working destination pointer.  */
#define tmp3w		w9	/* Scratch (DC ZVA length computation).  */

#define QA_l		q0	/* 128-bit fill pattern (written via v0).  */

/*
 * void *memset(void *dst, int c, size_t count)
 *
 * In:    dstin (x0) = destination, val (w1) = fill value (only bits 7:0 are
 *        used), count (x2) = number of bytes to set.
 * Out:   x0 = destination.  dstin is never written; all pointer arithmetic
 *        is done on the working copy dst.
 * Temps: tmp1, tmp2, A_l, dst and v0 on the store paths; zva_len_x,
 *        zva_bits_x and tmp3w additionally on the DC ZVA path.
 *
 * Dispatch on count:
 *   count <= 15          .Ltail15tiny  stores of 8, 4, 2 and 1 bytes selected
 *                                      by the bits of count
 *   16 <= count <= 255   .Ltail255 -> .Ltail63 -> .Ltail15
 *   count >= 256         .Lnot_short   256-byte store loop, or a 64-byte
 *                                      non-temporal store loop above 2 MiB
 *
 * NOTE(review): the length checks use signed conditions (ge, le, gt, lt), so
 * a count with bit 63 set is treated as negative -- confirm that such
 * lengths never reach this routine.
 */
ENTRY(memset)

	mov	dst, dstin		/* Preserve return value.  */
	ands	A_lw, val, #255		/* A_lw = fill byte; Z set when it is 0.  */
	/* NOTE(review): the branch below starts with '#' and is not a valid
	 * preprocessor directive.  It looks like it was disabled on purpose
	 * (GNU as treats a leading '#' as a line comment on AArch64), which
	 * leaves .Lzero_mem unreachable.  Confirm before re-enabling.  */
#ifndef DONT_USE_DC
#	b.eq	.Lzero_mem
#endif
	orr	A_lw, A_lw, A_lw, lsl #8	/* Replicate the byte to 16 bits,  */
	orr	A_lw, A_lw, A_lw, lsl #16	/* then to 32 bits,  */
	orr	A_l, A_l, A_l, lsl #32		/* then to 64 bits.  */
.Ltail_maybe_long:
	cmp	count, #256
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
.Ltail255:
	/* Store (count & 0xC0) bytes, i.e. 0, 64, 128 or 192, using 32-byte
	 * pair stores with post-increment.  */
	ands	tmp1, count, #0xC0
	b.eq	.Ltail63
	dup	v0.4s, A_lw		/* Fill all 128 bits of q0.  */
	cmp	tmp1w, #0x80
	b.eq	1f			/* 128 bytes: four stores.  */
	b.lt	2f			/* 64 bytes: two stores.  */
	stp	QA_l, QA_l, [dst], #32	/* 192 bytes: six stores.  */
	stp	QA_l, QA_l, [dst], #32
1:
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
2:
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
.Ltail63:
	/* Store (count & 0x30) bytes, i.e. 0, 16, 32 or 48.  dst is advanced
	 * first and the stores use negative offsets from the new dst.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	/* Finish with a single 16-byte store that ends at dst + (count & 15);
	 * it may overlap bytes that have already been written.  */
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Bulk path.  .Lnot_short itself starts on a 64-byte boundary; the
	 * store loops further down are not individually aligned.
	 *
	 * Precondition: count >= 256.  When dst is already 16-byte aligned
	 * the code below stores at least 256 bytes without re-checking.  */
	.p2align 6
.Lnot_short:
	dup	v0.4s, A_lw		/* Fill all 128 bits of q0.  */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15		/* tmp2 = bytes needed to reach alignment.  */
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be fewer than 256 bytes to go now.  */
	cmp	count, #255
	b.le	.Ltail255
2:
	cmp	count, #2097152		/* Above 2 MiB use the non-temporal loop.  */
	b.gt	3f
1:
	/* Pre-bias count by one block so that subs/b.ge leaves the loop once
	 * fewer than 256 bytes remain.  The low 8 bits of count are not
	 * affected by the bias and still give the number of tail bytes.  */
	sub	count, count, #256
2:
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	subs	count, count, #256
	b.ge	2b
	tst	count, #0xff
	b.ne	.Ltail255
	ret
3:
	/* Same scheme with 64-byte blocks and non-temporal pair stores; the
	 * low 6 bits of count give the number of tail bytes.  */
	sub	count, count, #64
4:
	subs	count, count, #64
	stnp	QA_l, QA_l, [dst]
	stnp	QA_l, QA_l, [dst, #32]
	add	dst, dst, #64
	b.ge	4b
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

#ifndef DONT_USE_DC
	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 *
	 * Every fallback from this path goes to .Ltail_maybe_long and not
	 * directly to .Lnot_short: count can be below 256 here, and
	 * .Lnot_short stores 256 bytes unconditionally once dst is 16-byte
	 * aligned.  .Ltail_maybe_long re-checks the length first.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	/* Align dst to 16 bytes with one overlapping 16-byte store.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Ltail_maybe_long
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.
	 * .Lcache_clear holds 0 (not probed yet), a value with bit 31 set
	 * (ZVA unavailable), or the ZVA block length in bytes.  */
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Ltail_maybe_long
	cbnz	zva_len, .Lzero_by_line
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Ltail_maybe_long
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len	/* zva_len = 4 << (dczid_el0 & 15).  */
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Ltail_maybe_long
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len	/* zva_len = 4 << (dczid_el0 & 15).  */
#endif

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Ltail_maybe_long	/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to copy after alignment.  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Ltail_maybe_long
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
1:
	/* Pre-bias count by one ZVA block so subs/b.ge ends the loop once
	 * less than a full block remains.  */
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x	/* Bytes left after the last block.  */
	b.ne	.Ltail_maybe_long
	ret
END(memset)

#ifdef MAYBE_VIRT
	/* Cached DC ZVA state, one 32-bit word, zero-initialised in .bss.  */
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif
#endif /* DONT_USE_DC */