1/*
2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#include <asm/processor.h>
10#include <asm/ppc_asm.h>
11
/*
 * memcpy entry point and dispatch.
 *
 * Register use as seen in this routine:
 *   r3 = destination pointer (saved at 48(r1), reloaded as the return value)
 *   r4 = source pointer
 *   r5 = number of bytes to copy
 *
 * The prologue sets up state that the copy paths in this file rely on:
 *   cr7 = low four bits of the length (the 8/4/2/1-byte pieces to copy)
 *   cr1 = unsigned comparison of the length against 16
 *   r6  = bytes needed to bring the destination up to an 8-byte boundary
 *         (0..7); cr0 records whether that value is zero
 *
 * Lengths below 16 go to .Lshort_copy.  Otherwise execution either branches
 * to .Ldst_unaligned (r6 non-zero) or falls through into .Ldst_aligned.
 *
 * NOTE(review): BEGIN_FTR_SECTION, FTR_SECTION_ELSE and the various
 * FTR_SECTION_END macros are defined outside this file; they presumably
 * pick one of the two alternatives from the CPU feature bits -- confirm
 * against their definition.  The slot at 48(r1) is presumably scratch space
 * in the caller's frame -- confirm against the ABI in use.
 */
12	.align	7
13_GLOBAL(memcpy)
14BEGIN_FTR_SECTION
15	std	r3,48(r1)	/* save destination pointer for return value */
16FTR_SECTION_ELSE
	/* alternative: hand the whole copy to memcpy_power7 (not defined in this part of the file) */
17	b	memcpy_power7
18ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
	/* cr7 = low four bits of the length, tested later with bf/bt cr7*4+n */
19	PPC_MTOCRF(0x01,r5)
	/* cr1 = length compared with 16 (unsigned) */
20	cmpldi	cr1,r5,16
21	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	/* keep only those low 3 bits; the record form also sets cr0 for the bne below */
22	andi.	r6,r6,7
	/* cache-touch hint for the start of the source buffer */
23	dcbt	0,r4
	/* fewer than 16 bytes: use the simple bit-driven copy */
24	blt	cr1,.Lshort_copy
25/* Below we want to nop out the bne if we're on a CPU that has the
26   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
27   cleared.
28   At the time of writing the only CPU that has this combination of bits
29   set is Power6. */
30BEGIN_FTR_SECTION
31	nop
32FTR_SECTION_ELSE
	/* cr0 comes from the andi. above: taken when the destination is not 8-byte aligned */
33	bne	.Ldst_unaligned
34ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
35                    CPU_FTR_UNALIGNED_LD_STD)
	/* execution continues at the label that follows */
/*
 * .Ldst_aligned: doubleword copy loop followed by the 0..7 byte tail.
 *
 * Reached by fall-through from the entry code and by the branch at the end
 * of .Ldst_unaligned.  Expected on entry:
 *   r3 = destination, r4 = source, r5 = bytes still to copy
 *   cr7 = low four bits of r5, cr1 = r5 compared with 16
 *
 * r3 and r4 are biased so that the loop can use the update forms ldu/stdu
 * with fixed displacements.  Each loop iteration moves 16 bytes, with the
 * loads running one doubleword ahead of the stores.  When the 8s bit of the
 * length is set there is an odd number of doublewords; that case adjusts
 * the pointers by 8 and enters the loop at 1: (or goes straight to 3: when
 * fewer than 16 bytes remain).
 *
 * Returns the original destination pointer, reloaded from 48(r1).
 */
36.Ldst_aligned:
	/* bias r3: the stores below address the destination as 8(r3) and stdu 16(r3) */
37	addi	r3,r3,-16
38BEGIN_FTR_SECTION
	/* r0 = source address & 7; non-zero means the source is not 8-byte aligned */
39	andi.	r0,r4,7
40	bne	.Lsrc_unaligned
41END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	/* r7 = number of 16-byte loop iterations */
42	srdi	r7,r5,4
	/* preload the first source doubleword */
43	ld	r9,0(r4)
	/* bias r4 so the loop can use ldu 16(r4) */
44	addi	r4,r4,-8
45	mtctr	r7
	/* r5 = tail byte count (0..7); cr0.eq, tested by the beq after 3:, means no tail */
46	andi.	r5,r5,7
	/* 8s bit clear: even number of doublewords, enter the loop at 2: */
47	bf	cr7*4+0,2f
	/* odd number of doublewords: shift both pointers by 8 and treat the preloaded value as pending in r8 */
48	addi	r3,r3,8
49	addi	r4,r4,8
50	mr	r8,r9
	/* fewer than 16 bytes in total: only that one doubleword to store */
51	blt	cr1,3f
521:	ld	r9,8(r4)
53	std	r8,8(r3)
542:	ldu	r8,16(r4)
55	stdu	r9,16(r3)
56	bdnz	1b
	/* store the last pending doubleword */
573:	std	r8,8(r3)
	/* no tail bytes: go to the return sequence */
58	beq	3f
	/* remove the bias so r3 points at the first tail byte of the destination */
59	addi	r3,r3,16
	/*
	 * Tail: copy 4, 2 and 1 bytes as selected by the remaining cr7 bits.
	 * The source is addressed as 8(r4) because r4 still points at the last
	 * doubleword that was loaded.
	 */
60.Ldo_tail:
61	bf	cr7*4+1,1f
62	lwz	r9,8(r4)
63	addi	r4,r4,4
64	stw	r9,0(r3)
65	addi	r3,r3,4
661:	bf	cr7*4+2,2f
67	lhz	r9,8(r4)
68	addi	r4,r4,2
69	sth	r9,0(r3)
70	addi	r3,r3,2
712:	bf	cr7*4+3,3f
72	lbz	r9,8(r4)
73	stb	r9,0(r3)
743:	ld	r3,48(r1)	/* return dest pointer */
75	blr
76
/*
 * .Lsrc_unaligned: copy when the source is not 8-byte aligned.
 *
 * Reached from .Ldst_aligned with:
 *   r0 = source address & 7 (non-zero)
 *   r3 = destination - 16 (bias applied at .Ldst_aligned)
 *   r4 = source, r5 = bytes to copy, cr7 = low four bits of r5
 *
 * The source pointer is rounded down to an 8-byte boundary so that only
 * aligned doublewords are loaded.  Each stored doubleword is built from two
 * consecutive loads: (s[k] << r10) | (s[k+1] >> r11), where r10 is the
 * misalignment in bits and r11 = 64 - r10.  The main loop at 1:/2: produces
 * two doublewords per iteration.  Two entry sequences set up its register
 * state, one for an even doubleword count and one (at 0:) for an odd count.
 *
 * After the last full doubleword, the remaining 0..7 bytes are gathered in
 * the high end of r9 and written with rotate-then-store steps selected by
 * cr7.
 *
 * NOTE(review): the shift directions look like they rely on big-endian
 * doubleword loads (lower addresses in the more significant bytes) --
 * confirm before using this path on another byte order.
 */
77.Lsrc_unaligned:
	/* r6 = number of whole doublewords to store (length / 8) */
78	srdi	r6,r5,3
79	addi	r5,r5,-16
	/* round the source pointer down to an 8-byte boundary */
80	subf	r4,r0,r4
	/* r7 = (length - 16) / 16, the count for the loop at 1:/2: */
81	srdi	r7,r5,4
	/* r10 = source misalignment in bits (left-shift amount) */
82	sldi	r10,r0,3
	/* cr6 = doubleword count compared with 3, used by the early exits below */
83	cmpdi	cr6,r6,3
	/* r5 = tail byte count (0..7); cr0.eq, tested by the beq after 5:, means no tail */
84	andi.	r5,r5,7
85	mtctr	r7
	/* r11 = 64 - r10 (right-shift amount) */
86	subfic	r11,r10,64
	/* r5 = tail bytes + source misalignment, compared with 8 after 5: */
87	add	r5,r5,r0
88
	/* 8s bit of the length set: odd doubleword count, use the entry at 0: */
89	bt	cr7*4+0,0f
90
91	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
92	ld	r0,8(r4)
93	sld	r6,r9,r10
94	ldu	r9,16(r4)
95	srd	r7,r0,r11
96	sld	r8,r0,r10
97	or	r7,r7,r6
	/* fewer than 3 doublewords: skip the loop and store the two results at 4: and 5: */
98	blt	cr6,4f
99	ld	r0,8(r4)
100	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
101	b	2f
102
1030:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
104	ldu	r9,8(r4)
105	sld	r8,r0,r10
	/* extra -8 bias so the single store at 5: (24(r3)) lands on the first destination doubleword */
106	addi	r3,r3,-8
	/* fewer than 3 doublewords: only one to store, done at 5: */
107	blt	cr6,5f
108	ld	r0,8(r4)
109	srd	r12,r9,r11
110	sld	r6,r9,r10
111	ldu	r9,16(r4)
112	or	r12,r8,r12
113	srd	r7,r0,r11
114	sld	r8,r0,r10
115	addi	r3,r3,16
	/* exactly 3 doublewords: skip the loop and flush at 3: */
116	beq	cr6,3f
117
118	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1191:	or	r7,r7,r6
120	ld	r0,8(r4)
121	std	r12,8(r3)
1222:	srd	r12,r9,r11
123	sld	r6,r9,r10
124	ldu	r9,16(r4)
125	or	r12,r8,r12
126	stdu	r7,16(r3)
127	srd	r7,r0,r11
128	sld	r8,r0,r10
129	bdnz	1b
130
	/* flush the doublewords still pending in r12 and r7, then build and store the last full one */
1313:	std	r12,8(r3)
132	or	r7,r7,r6
1334:	std	r7,16(r3)
1345:	srd	r12,r9,r11
135	or	r12,r8,r12
136	std	r12,24(r3)
	/* no tail bytes: the next 4: label in this file is the return sequence at the end of .Lshort_copy */
137	beq	4f
	/* do the tail bytes extend past the doubleword already held in r9? */
138	cmpwi	cr1,r5,8
	/* step r3 past the doubleword just stored at 24(r3) */
139	addi	r3,r3,32
	/* move the not-yet-stored bytes of r9 to its high end */
140	sld	r9,r9,r10
141	ble	cr1,6f
	/* tail spills into one more source doubleword: load it and merge its leading bytes */
142	ld	r0,8(r4)
143	srd	r7,r0,r11
144	or	r9,r7,r9
1456:
	/* each step rotates the next 4/2/1 bytes into the low end of r9 and stores them */
146	bf	cr7*4+1,1f
147	rotldi	r9,r9,32
148	stw	r9,0(r3)
149	addi	r3,r3,4
1501:	bf	cr7*4+2,2f
151	rotldi	r9,r9,16
152	sth	r9,0(r3)
153	addi	r3,r3,2
1542:	bf	cr7*4+3,3f
155	rotldi	r9,r9,8
156	stb	r9,0(r3)
1573:	ld	r3,48(r1)	/* return dest pointer */
158	blr
159
/*
 * .Ldst_unaligned: bring the destination up to an 8-byte boundary.
 *
 * Entered from the memcpy entry code with r6 = number of bytes (1..7)
 * between the destination and the next 8-byte boundary, r3/r4/r5 =
 * destination/source/length.  Copies those r6 bytes as a 1-, 2- and/or
 * 4-byte piece selected by the bits of r6, using r7 as the running offset
 * into both buffers.  It then re-derives the state .Ldst_aligned expects
 * (cr7 and cr1 from the reduced length, r3 and r4 advanced by r6) and
 * branches there.
 */
160.Ldst_unaligned:
161	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7
	/* r5 = bytes that will remain once the destination is aligned */
162	subf	r5,r6,r5
	/* r7 = offset of the next piece to copy */
163	li	r7,0
	/* refresh cr1 for the remaining length; .Ldst_aligned tests it */
164	cmpldi	cr1,r5,16
	/* 1s bit: copy one byte */
165	bf	cr7*4+3,1f
166	lbz	r0,0(r4)
167	stb	r0,0(r3)
168	addi	r7,r7,1
	/* 2s bit: copy a halfword at offset r7 */
1691:	bf	cr7*4+2,2f
170	lhzx	r0,r7,r4
171	sthx	r0,r7,r3
172	addi	r7,r7,2
	/* 4s bit: copy a word at offset r7 */
1732:	bf	cr7*4+1,3f
174	lwzx	r0,r7,r4
175	stwx	r0,r7,r3
	/* cr7 = low four bits of the remaining length, as the aligned path expects */
1763:	PPC_MTOCRF(0x01,r5)
	/* step both pointers past the bytes just copied */
177	add	r4,r6,r4
178	add	r3,r6,r3
179	b	.Ldst_aligned
180
/*
 * .Lshort_copy: copy fewer than 16 bytes.
 *
 * Entered from the memcpy entry code with r3 = destination, r4 = source
 * and cr7 = low four bits of the length.  Each cr7 bit selects one piece:
 * 8 bytes (moved as two words), then 4, 2 and 1.  Both pointers advance
 * after every piece except the last.
 *
 * The return sequence at 4: reloads the original destination pointer from
 * 48(r1).  It is also the target of the "beq 4f" in .Lsrc_unaligned,
 * because it is the next 4: label after that branch.
 */
181.Lshort_copy:
	/* 8s bit: copy eight bytes using two word loads and stores */
182	bf	cr7*4+0,1f
183	lwz	r0,0(r4)
184	lwz	r9,4(r4)
185	addi	r4,r4,8
186	stw	r0,0(r3)
187	stw	r9,4(r3)
188	addi	r3,r3,8
	/* 4s bit: copy one word */
1891:	bf	cr7*4+1,2f
190	lwz	r0,0(r4)
191	addi	r4,r4,4
192	stw	r0,0(r3)
193	addi	r3,r3,4
	/* 2s bit: copy one halfword */
1942:	bf	cr7*4+2,3f
195	lhz	r0,0(r4)
196	addi	r4,r4,2
197	sth	r0,0(r3)
198	addi	r3,r3,2
	/* 1s bit: copy the final byte */
1993:	bf	cr7*4+3,4f
200	lbz	r0,0(r4)
201	stb	r0,0(r3)
2024:	ld	r3,48(r1)	/* return dest pointer */
203	blr
204