1!   Copyright (C) 2008-2012 Imagination Technologies Ltd.
2
3	.text
4	.global	_memcpy
5	.type	_memcpy,function
! _memcpy: copy cnt bytes from src to dst, working upwards through memory,
! and return the original dst. No check for overlapping buffers is made.
!
! Strategy, as implemented below:
!   - fewer than 16 bytes: plain byte loop;
!   - otherwise copy single bytes until the destination is 8 byte aligned;
!       - if the source is then also 8 byte aligned, copy 32 bytes per loop
!         iteration;
!       - if the source is mis-aligned by 1-7 bytes, load aligned 8 byte
!         groups and shift/merge them into 8 byte stores;
!   - whatever tail is left over is finished by the byte loop.
!
! Inputs:
6! D1Ar1 dst
7! D0Ar2 src
8! D1Ar3 cnt
! Output:
9! D0Re0 dst
!
! Registers written by the code below: D0Re0, D1Re0, D1Ar1, D0Ar2, D1Ar3,
! D0Ar4, D1Ar5, D0Ar6, D0FrT, A0.2, A0.3, A1.2 and TXRPT.
!
! NOTE(review): every loop here ends in BR after TXRPT has been loaded with
! (iterations - 1); this presumably is the Meta hardware-loop idiom (BR
! branches back while the repeat count has not run out) - confirm against the
! Meta architecture manual.
! NOTE(review): CMP/BGE and SUBS/BLT look like signed tests, so a cnt with the
! top bit set would presumably copy nothing - confirm.
10_memcpy:
11	CMP 	D1Ar3, #16		! result is tested by the BGE below
12	MOV 	A1.2, D0Ar2		! source pointer (D0Ar2 is reused as scratch later)
13	MOV 	A0.2, D1Ar1		! destination pointer (D1Ar1 is reused as scratch later)
14	MOV 	A0.3, D1Ar1		! keep the original dst for the return value
15! If there are fewer than 16 bytes to copy use the byte copy loop
16	BGE 	$Llong_copy
17
18$Lbyte_copy:
19! Simply copy a byte at a time
! On entry: D1Ar3 = bytes to copy, A1.2 = source, A0.2 = destination.
20	SUBS	TXRPT, D1Ar3, #1	! repeat count = cnt - 1
21	BLT	$Lend			! cnt - 1 < 0: nothing to copy
22$Lloop_byte:
23	GETB 	D1Re0, [A1.2++]		! load one byte, advance source
24	SETB 	[A0.2++], D1Re0		! store it, advance destination
25	BR	$Lloop_byte
26
27$Lend:
28! Finally set return value and return
29	MOV 	D0Re0, A0.3		! return value = original dst saved at entry
30	MOV 	PC, D1RtP		! return by loading PC from D1RtP
31
32$Llong_copy:
33	ANDS 	D1Ar5, D1Ar1, #7	! test destination alignment (D1Ar5 = dst & 7)
34	BZ	$Laligned_dst
35
36! The destination address is not 8 byte aligned. We will copy bytes from
37! the source to the destination until the remaining data has an 8 byte
38! destination address alignment (i.e we should never copy more than 7
39! bytes here).
! D1Ar5 starts at (dst & 7) and counts up to 8, so 8 - (dst & 7) bytes are
! copied.
40$Lalign_dst:
41	GETB 	D0Re0, [A1.2++]
42	ADD 	D1Ar5, D1Ar5, #1	! dest is aligned when D1Ar5 reaches #8
43	SUB 	D1Ar3, D1Ar3, #1	! decrement count of remaining bytes
44	SETB 	[A0.2++], D0Re0
45	CMP 	D1Ar5, #8
46	BNE 	$Lalign_dst
47
48! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
49! blocks, then jump to the unaligned copy loop or fall through to the aligned
50! copy loop as appropriate.
51$Laligned_dst:
52	MOV	D0Ar4, A1.2		! current source address, for the alignment test
53	LSR 	D1Ar5, D1Ar3, #3	! D1Ar5 = number of 8 byte blocks
54	ANDS 	D0Ar4, D0Ar4, #7	! test source alignment
55	BNZ 	$Lunaligned_copy	! if unaligned, use unaligned copy loop
56
57! Both source and destination are 8 byte aligned - the easy case.
58$Laligned_copy:
59	LSRS	D1Ar5, D1Ar3, #5	! D1Ar5 = number of 32 byte blocks
60	BZ	$Lbyte_copy		! fewer than 32 bytes: byte loop copies all of D1Ar3
61	SUB	TXRPT, D1Ar5, #1	! repeat count = 32 byte blocks - 1
62
! Each iteration moves 32 bytes as two 16 byte halves (two 8 byte loads
! followed by two 8 byte stores). D1Ar5 is free to be used as data here
! because its block count has already been copied into TXRPT.
63$Laligned_32:
64	GETL 	D0Re0, D1Re0, [A1.2++]
65	GETL 	D0Ar6, D1Ar5, [A1.2++]
66	SETL 	[A0.2++], D0Re0, D1Re0
67	SETL 	[A0.2++], D0Ar6, D1Ar5
68	GETL 	D0Re0, D1Re0, [A1.2++]
69	GETL 	D0Ar6, D1Ar5, [A1.2++]
70	SETL 	[A0.2++], D0Re0, D1Re0
71	SETL 	[A0.2++], D0Ar6, D1Ar5
72	BR	$Laligned_32
73
74! If there are any remaining bytes use the byte copy loop, otherwise we are done
75	ANDS 	D1Ar3, D1Ar3, #0x1f	! D1Ar3 = tail length (0-31 bytes)
76	BNZ	$Lbyte_copy
77	B	$Lend
78
79! The destination is 8 byte aligned but the source is not, and there are 8
80! or more bytes to be copied.
! On entry D1Ar5 = number of 8 byte blocks (computed at $Laligned_dst).
81$Lunaligned_copy:
82! Adjust the source pointer (A1.2) to the 8 byte boundary before its
83! current value
84	MOV 	D0Ar4, A1.2
85	MOV 	D0Ar6, A1.2		! keep the unrounded address for the subtraction below
86	ANDMB 	D0Ar4, D0Ar4, #0xfff8	! clear the low three address bits
87	MOV 	A1.2, D0Ar4
88! Save the number of bytes of mis-alignment in D0Ar4 for use later
89	SUBS 	D0Ar6, D0Ar6, D0Ar4	! D0Ar6 = mis-alignment in bytes; sets flags for the BZ
90	MOV	D0Ar4, D0Ar6		! D0Ar4 = mis-alignment, needed again at $Lunaligned_end
91! if there is no mis-alignment after all, use the aligned copy loop
92	BZ 	$Laligned_copy
93
94! prefetch 8 bytes
! D0Re0/D1Re0 now hold the aligned 8 byte group that contains the first
! source byte still to be copied. A1.2 is not advanced by this load.
95	GETL 	D0Re0, D1Re0, [A1.2]
96
97	SUB	TXRPT, D1Ar5, #1	! repeat count = 8 byte blocks - 1 (before D1Ar5 is reused)
98
99! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly
100! 4 bytes, and more than 4 bytes.
101	CMP 	D0Ar6, #4		! both branches below test this one compare
102	BLT 	$Lunaligned_1_2_3	! use 1-3 byte mis-alignment loop
103	BZ 	$Lunaligned_4		! use 4 byte mis-alignment loop
104
105! The mis-alignment is more than 4 bytes
106$Lunaligned_5_6_7:
107	SUB 	D0Ar6, D0Ar6, #4	! offset within the second word: 1-3 bytes
108! Calculate the bit offsets required for the shift operations necessary
109! to align the data.
110! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
111	MULW 	D0Ar6, D0Ar6, #8
112	MOV	D1Ar5, #32
113	SUB	D1Ar5, D1Ar5, D0Ar6
114! Move data 4 bytes before we enter the main loop
115	MOV 	D0Re0, D1Re0		! only the second prefetched word holds wanted bytes
116
! Loop invariant: D0Re0 = second word of the previously loaded group.
! Each pass loads the next group into D0Ar2 (first word) / D1Ar1 (second
! word) and stores
!   first  = (D0Re0 >> D0Ar6) + (D0Ar2 << D1Ar5)
!   second = (D0Ar2 >> D0Ar6) + (D1Ar1 << D1Ar5)
117$Lloop_5_6_7:
118	GETL 	D0Ar2, D1Ar1, [++A1.2]	! step A1.2 on, then load the next 8 source bytes
119! form 64-bit data in D0Re0, D1Re0
120	LSR 	D0Re0, D0Re0, D0Ar6
121	MOV 	D1Re0, D0Ar2
122	LSL 	D1Re0, D1Re0, D1Ar5
123	ADD 	D0Re0, D0Re0, D1Re0	! first output word complete
124
125	LSR 	D0Ar2, D0Ar2, D0Ar6
126	LSL 	D1Re0, D1Ar1, D1Ar5
127	ADD 	D1Re0, D1Re0, D0Ar2	! second output word complete
128
129	SETL 	[A0.2++], D0Re0, D1Re0
130	MOV 	D0Re0, D1Ar1		! carry the second loaded word into the next pass
131	BR	$Lloop_5_6_7
132
133	B 	$Lunaligned_end
134
135$Lunaligned_1_2_3:
136! Calculate the bit offsets required for the shift operations necessary
137! to align the data.
138! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
139	MULW 	D0Ar6, D0Ar6, #8
140	MOV	D1Ar5, #32
141	SUB	D1Ar5, D1Ar5, D0Ar6
142
! Loop invariant: D0Re0/D1Re0 = the most recently loaded aligned group.
! Each pass stores
!   first  = (D0Re0 >> D0Ar6) + (D1Re0 << D1Ar5)
!   second = (D1Re0 >> D0Ar6) + (first word of next group << D1Ar5)
143$Lloop_1_2_3:
144! form 64-bit data in D0Re0,D1Re0
145	LSR 	D0Re0, D0Re0, D0Ar6
146	LSL 	D1Ar1, D1Re0, D1Ar5
147	ADD 	D0Re0, D0Re0, D1Ar1	! first output word complete
148	MOV	D0Ar2, D1Re0
149	LSR 	D0FrT, D0Ar2, D0Ar6	! D0FrT = low part of the second output word
150	GETL 	D0Ar2, D1Ar1, [++A1.2]	! step A1.2 on, then load the next 8 source bytes
151
152	MOV 	D1Re0, D0Ar2
153	LSL 	D1Re0, D1Re0, D1Ar5
154	ADD 	D1Re0, D1Re0, D0FrT	! second output word complete
155
156	SETL 	[A0.2++], D0Re0, D1Re0
157	MOV 	D0Re0, D0Ar2		! the group just loaded becomes the current
158	MOV 	D1Re0, D1Ar1		! group for the next pass
159	BR	$Lloop_1_2_3
160
161	B 	$Lunaligned_end
162
163! The 4 byte mis-alignment case - this does not require any shifting, just a
164! shuffling of registers.
! Each store pairs the second word of the previous load with the first word
! of the next load.
165$Lunaligned_4:
166	MOV 	D0Re0, D1Re0		! start from the second prefetched word
167$Lloop_4:
168	GETL 	D0Ar2, D1Ar1, [++A1.2]
169	MOV 	D1Re0, D0Ar2
170	SETL 	[A0.2++], D0Re0, D1Re0
171	MOV 	D0Re0, D1Ar1		! carry the second loaded word into the next pass
172	BR	$Lloop_4
173
174$Lunaligned_end:
175! If there are no remaining bytes to copy, we are done.
176	ANDS 	D1Ar3, D1Ar3, #7	! D1Ar3 = tail length (0-7 bytes)
177	BZ	$Lend
178! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
179! address of the remaining bytes, and branch to the byte copy loop.
180	MOV 	D0Ar6, A1.2		! A1.2 = address of the last aligned group loaded
181	ADD 	D1Ar5, D0Ar4, D0Ar6	! add back the mis-alignment saved in D0Ar4
182	MOV 	A1.2, D1Ar5
183	B	$Lbyte_copy
184
185	.size _memcpy,.-_memcpy
186