/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation.  All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

#define QA_l	q0
#define QA_h	q1
#define QB_l	q2
#define QB_h	q3
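
/* The A_l/A_h .. D_l/D_h pairs name general-purpose registers used for
   16-byte ldp/stp transfers (only the A pair is used in this version);
   QA_l/QA_h and QB_l/QB_h name SIMD quad registers, so each ldp/stp of a
   pair moves 32 bytes in the bulk copy loops.  */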

	mov	dst, dstin
	cmp	count, #64
	b.ge	.Lcpy_not_short
	cmp	count, #15
	b.le	.Ltail15tiny
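	/* Here 16 <= count <= 63: counts of 64 or more took the branch to
	 * .Lcpy_not_short and counts of 15 or less went to .Ltail15tiny,
	 * so we fall through into the small-copy tail code.  */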

	/* Deal with small copies quickly by dropping straight into the
	 * exit block.  */
.Ltail63:
	/* Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
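	/* tmp1 = count & 0x30 is the number of bytes in whole 16-byte
	 * blocks still to copy (0, 16, 32 or 48).  Advance both pointers
	 * by that amount, then enter the ladder below at the right depth
	 * and copy the blocks back from negative offsets.  */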
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]

.Ltail15:
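	/* If any of the final 15 bytes remain, finish with one unaligned
	 * 16-byte load/store pair that ends exactly at the end of the
	 * buffer.  At least 16 bytes have already been copied on every
	 * path that reaches here, so the bytes this re-copies were already
	 * stored with the same values.  */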
	ands	count, count, #15
	b.eq	1f
	add	src, src, count
	ldp	A_l, A_h, [src, #-16]
	add	dst, dst, count
	stp	A_l, A_h, [dst, #-16]
1:
	ret

.Ltail15tiny:
	/* Copy up to 15 bytes of data.  Does not assume additional data
	   being copied.  */
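	/* Test bits 3, 2, 1 and 0 of count in turn and copy 8, 4, 2 or 1
	 * bytes for each bit that is set, so no byte outside the requested
	 * range is touched.  */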
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lcpy_not_short:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
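	/* (-src) & 15 is the number of bytes needed to reach the next
	 * 16-byte boundary; the ands also sets the flags, so the
	 * already-aligned case can branch straight to the main copy.  */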
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Copy more data than needed; it's faster than jumping
	 * around copying sub-Quadword quantities.  We know that
	 * it can't overrun.  */
	ldp	A_l, A_h, [src]
	add	src, src, tmp2
	stp	A_l, A_h, [dst]
	add	dst, dst, tmp2
	/* There may be fewer than 64 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
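	/* Either src was already 16-byte aligned, or the head copy above
	 * brought it to alignment, re-writing up to 15 destination bytes
	 * with the same data (harmless, since the source and destination
	 * of a memcpy may not overlap).  */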
2:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/* Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.  */
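	/* Each SIMD ldp/stp below moves 32 bytes (two 16-byte q registers),
	 * so the two pairs copy 64 bytes in total.  */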
	ldp	QA_l, QA_h, [src]
	ldp	QB_l, QB_h, [src, #32]
	stp	QA_l, QA_h, [dst]
	stp	QB_l, QB_h, [dst, #32]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lcpy_body_large:
	cmp	count, #65536
	b.hi	.Lcpy_body_huge
	/* There are at least 128 bytes to copy.  */
	ldp	QA_l, QA_h, [src, #0]
	sub	dst, dst, #32		/* Pre-bias.  */
	ldp	QB_l, QB_h, [src, #32]!	/* src += 32 - Pre-bias.  */
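	/* Software-pipelined loop: each iteration stores the 64 bytes
	 * loaded on the previous iteration while loading the next 64, so
	 * loads and stores can overlap.  The pre-bias on dst and the
	 * writeback addressing keep the offsets constant inside the
	 * loop.  */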
1:
	stp	QA_l, QA_h, [dst, #32]
	ldp	QA_l, QA_h, [src, #32]
	stp	QB_l, QB_h, [dst, #64]!
	ldp	QB_l, QB_h, [src, #64]!

	subs	count, count, #64
	b.ge	1b

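	/* Store the final 64 bytes loaded by the last loop iteration, then
	 * step src and dst past the copied data (undoing the pre-bias)
	 * before checking for a tail.  */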
	stp	QA_l, QA_h, [dst, #32]
	stp	QB_l, QB_h, [dst, #64]
	add	src, src, #32
	add	dst, dst, #64 + 32
	tst	count, #0x3f
	b.ne	.Ltail63
	ret
.Lcpy_body_huge:
	/* There are at least 128 bytes to copy.  */
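	/* Same structure as .Lcpy_body_large, but the stores use stnp, the
	 * non-temporal hint: data from a very large copy is unlikely to be
	 * reused soon, so this is presumably intended to limit cache
	 * pollution on this core.  */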
	ldp	QA_l, QA_h, [src, #0]
	sub	dst, dst, #32		/* Pre-bias.  */
	ldp	QB_l, QB_h, [src, #32]!
1:
	stnp	QA_l, QA_h, [dst, #32]
	stnp	QB_l, QB_h, [dst, #64]
	ldp	QA_l, QA_h, [src, #32]
	ldp	QB_l, QB_h, [src, #64]!
	add	dst, dst, #64

	subs	count, count, #64
	b.ge	1b

	stnp	QA_l, QA_h, [dst, #32]
	stnp	QB_l, QB_h, [dst, #64]
	add	src, src, #32
	add	dst, dst, #64 + 32
	tst	count, #0x3f
	b.ne	.Ltail63
	ret
