/***************************************************************************
 Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of The Linux Foundation nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
  ***************************************************************************/
27
/* Assumes neon instructions and a cache line size of 64 bytes. */

#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
 * Tunables for the copy loops in this file.
 *
 * PLDOFFS   - prefetch lead, counted in PLDSIZE-byte steps; used to form the
 *             pld/ldr offsets ahead of the source pointer.
 * PLDTHRESH - copies of this many 64-byte blocks or fewer go straight to the
 *             loop that does no prefetching.
 * BBTHRESH  - block count (4096 bytes / 64) above which the prefetch lead is
 *             derived from the source and destination addresses instead of
 *             the fixed PLDOFFS lead.
 * PLDSIZE   - bytes covered by one prefetch step; equals the 64-byte cache
 *             line assumed above.
 */
#define PLDOFFS	(10)
#define PLDTHRESH (PLDOFFS)
#define BBTHRESH (4096/64)
#define PLDSIZE (64)

/* The fixed-lead path subtracts PLDOFFS from the block count and touches the
 * line at (PLDOFFS-1) steps, so a lead below one step is rejected. */
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif

/* The fixed-lead path runs (blocks - PLDOFFS) iterations; the threshold must
 * therefore guarantee blocks > PLDOFFS. */
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

	.text
	.fpu    neon
48
/*
 * .L_memcpy_base - NEON copy body. It is a local label and it returns with
 * "pop {r0, pc}", so it is meant to be reached from an enclosing routine.
 *
 * Register use, as established by the instructions below:
 *   r0  - destination pointer; advanced by every store
 *   r1  - source pointer; advanced by every load
 *   r2  - byte count
 *   r3  - scratch; also the throw-away target of the "ldr" prefetch touches
 *   r9  - iteration count of the double-prefetch loop (saved/restored here)
 *   r10 - 64-byte-aligned prefetch pointer running ahead of r1
 *         (saved/restored here)
 *   r12 - number of 64-byte blocks left for the current loop
 *   lr  - scratch holding the prefetch lead in bytes; it is clobbered, so the
 *         return address must already be on the stack
 *   q0-q3, q8-q11 - data in flight
 *
 * Stack on entry: the exit pops {r0, pc}, so two words (saved r0 below the
 * return address) are assumed to be pushed already - presumably by the
 * including file; confirm there. The .cfi_* directives likewise presume an
 * enclosing .cfi_startproc.
 *
 * NOTE(review): every size test uses a signed condition (blt/ble), so a count
 * with bit 31 set is treated as "less than 4".
 */
.L_memcpy_base:
	/* Dispatch on size; the flags of the last compare are still live at
	 * the branch target (see .L_neon_16). */
	cmp	r2, #4
	blt	.L_neon_lt4
	cmp	r2, #16
	blt	.L_neon_lt16
	cmp	r2, #32
	blt	.L_neon_16
	cmp	r2, #64
	blt	.L_neon_copy_32_a

	/* r12 = number of whole 64-byte blocks. PLDTHRESH blocks or fewer are
	 * copied by the loop that does no prefetching. */
	mov	r12, r2, lsr #6
	cmp	r12, #PLDTHRESH
	ble	.L_neon_copy_64_loop_nopld

	/* The prefetching paths need two more registers. */
	push	{r9, r10}
	.cfi_adjust_cfa_offset 8
	.cfi_rel_offset r9, 0
	.cfi_rel_offset r10, 4

	/* BBTHRESH blocks or fewer: use the fixed lead set up at
	 * .L_neon_prime_pump. */
	cmp	r12, #BBTHRESH
	ble	.L_neon_prime_pump

	/*
	 * Larger copies: derive the prefetch lead in bytes from the relative
	 * position of the two buffers:
	 *   lr = ((r0 + 0x400) - (r1 + PLDOFFS*PLDSIZE)) mod 2048
	 *        + PLDOFFS*PLDSIZE
	 * The lsl/lsr #21 pair keeps only the low 11 bits of the difference.
	 */
	add	lr, r0, #0x400
	add	r9, r1, #(PLDOFFS*PLDSIZE)
	sub	lr, lr, r9
	lsl	lr, lr, #21
	lsr	lr, lr, #21
	add	lr, lr, #(PLDOFFS*PLDSIZE)
	/* A copy no longer than the lead (in blocks) uses the fixed lead. */
	cmp	r12, lr, lsr #6
	ble	.L_neon_prime_pump

	/*
	 * r9 = lead in blocks - PLDOFFS. "gt" always holds here because the
	 * ble above was not taken; the IT block is kept as written. rsbs sets
	 * the flags for the following ble, which falls back to the fixed lead
	 * when the computed lead does not exceed PLDOFFS blocks.
	 */
	itt	gt
	movgt	r9, #(PLDOFFS)
	rsbsgt	r9, r9, lr, lsr #6
	ble	.L_neon_prime_pump

	/* r10 = source address one lead ahead, rounded down to 64 bytes. */
	add	r10, r1, lr
	bic	r10, #0x3F

	/* Hold back the final lr/64 blocks; .L_neon_pop_before_nopld reloads
	 * r12 from lr and the no-prefetch loop copies them. */
	sub	r12, r12, lr, lsr #6

	/* Split what is left: r9 = min(r9, r12) blocks for the double-prefetch
	 * loop, r12 = the remainder (itee: one "le" slot, two "gt" slots). */
	cmp	r9, r12
	itee	le
	suble	r12, r12, r9
	movgt	r9, r12
	movgt	r12, #0

	/* Prefetch the line one step short of the loop's pld offset. */
	pld	[r1, #((PLDOFFS-1)*PLDSIZE)]
.L_neon_copy_64_loop_outer_doublepld:
	/* Per 64-byte block: pld PLDOFFS steps ahead of r1, and touch the far
	 * line at r10 with a load whose result is discarded. */
	pld	[r1, #((PLDOFFS)*PLDSIZE)]
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	ldr	r3, [r10]
	subs	r9, r9, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_outer_doublepld
	/* Nothing left before the held-back tail? */
	cmp	r12, #0
	beq	.L_neon_pop_before_nopld

	/* Fewer than 8192 blocks (512 KiB) left: continue with the ldr-touch
	 * loop; otherwise use the pld-based loop below. */
	cmp	r12, #(512*1024/64)
	blt	.L_neon_copy_64_loop_outer

.L_neon_copy_64_loop_ddr:
	/* Per 64-byte block: copy, and pld the line at r10. */
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	pld	[r10]
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_ddr
	b	.L_neon_pop_before_nopld

.L_neon_prime_pump:
	/*
	 * Fixed lead: lr = PLDOFFS*PLDSIZE bytes, r10 = aligned source address
	 * that far ahead, and r12 loses the PLDOFFS blocks that the
	 * no-prefetch loop copies at the end. r12 exceeded PLDTHRESH to get
	 * here, so at least one block remains for the loop below. The ldr
	 * touches the line just before r10; its result is discarded.
	 */
	mov	lr, #(PLDOFFS*PLDSIZE)
	add	r10, r1, #(PLDOFFS*PLDSIZE)
	bic	r10, #0x3F
	sub	r12, r12, #PLDOFFS
	ldr	r3, [r10, #(-1*PLDSIZE)]

.L_neon_copy_64_loop_outer:
	/* Per 64-byte block: copy, and touch the line at r10 with a load. */
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	ldr	r3, [r10]
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_outer

.L_neon_pop_before_nopld:
	/* r12 = the lr/64 blocks held back earlier; then drop r9/r10. */
	mov	r12, lr, lsr #6
	pop	{r9, r10}
	.cfi_adjust_cfa_offset -8
	.cfi_restore r9
	.cfi_restore r10

.L_neon_copy_64_loop_nopld:
	/* Copy r12 blocks of 64 bytes with no prefetching. */
	vld1.32	{q8, q9}, [r1]!
	vld1.32	{q10, q11}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q8, q9}, [r0]!
	vst1.32	{q10, q11}, [r0]!
	bne	.L_neon_copy_64_loop_nopld
	/* Keep only the sub-64-byte remainder of the count. */
	ands	r2, r2, #0x3f
	beq	.L_neon_exit

.L_neon_copy_32_a:
	/* lsl #27 moves bit 5 of r2 into C and bit 4 into N. C clear means
	 * there is no 32-byte piece to copy. */
	movs	r3, r2, lsl #27
	bcc	.L_neon_16
	vld1.32	{q0,q1}, [r1]!
	vst1.32	{q0,q1}, [r0]!

.L_neon_16:
	/*
	 * N set means copy 16 bytes. N comes either from the movs above
	 * (bit 4 of r2) or, when entered from the "cmp r2, #32 / blt" at the
	 * top, from that compare, where 16 <= r2 < 32 makes the result
	 * negative.
	 */
	bpl	.L_neon_lt16
	vld1.32	{q8}, [r1]!
	vst1.32	{q8}, [r0]!
	ands	r2, r2, #0x0f
	beq	.L_neon_exit

.L_neon_lt16:
	/* lsl #29 moves bit 3 of r2 into C and bit 2 into N. C set means copy
	 * 8 bytes. */
	movs	r3, r2, lsl #29
	bcc	1f
	vld1.8	{d0}, [r1]!
	vst1.8	{d0}, [r0]!
1:
	/*
	 * Copy 4 bytes when the "ge" test fails. "ge" is N == V, and movs with
	 * a shift does not write V, so V is whatever the earlier cmp/subs left
	 * behind; with V clear this is simply "bit 2 of r2 set".
	 * The vld4.8/vst4.8 pair moves one byte through lane 0 of each of
	 * d0-d3.
	 */
	bge	.L_neon_lt4
	vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!
	vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [r0]!

.L_neon_lt4:
	/* lsl #31 moves bit 1 of r2 into C (copy a halfword) and bit 0 into N
	 * (copy a byte). */
	movs	r2, r2, lsl #31
	itt	cs
	ldrhcs	r3, [r1], #2
	strhcs	r3, [r0], #2
	itt	mi
	ldrbmi	r3, [r1]
	strbmi	r3, [r0]

.L_neon_exit:
	/* Reload r0 and return through the two words assumed to be on the
	 * stack at entry. */
	pop	{r0, pc}
192