1; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
3; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
5
6;===------------------------------------------------------------------------===;
7; Global Address Space
8;===------------------------------------------------------------------------===;
9; FUNC-LABEL: {{^}}store_i1:
10; EG: MEM_RAT MSKOR
11; SI: buffer_store_byte
; Stores the constant i1 value `true` through %out, a pointer into address
; space 1. There is no value argument; the stored bit is an immediate.
12define void @store_i1(i1 addrspace(1)* %out) {
13entry:
14  store i1 true, i1 addrspace(1)* %out
15  ret void
16}
17
18; i8 store
19; FUNC-LABEL: {{^}}store_i8:
20; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
21
22; IG 0: Get the byte index and truncate the value
23; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
24; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
25; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
26; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
27
28
29; IG 1: Truncate the calculated shift amount for the mask
30
31; IG 2: Shift the value and the mask
32; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
33; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
34; EG-NEXT: 255
35; IG 3: Initialize the Y and Z channels to zero
36;       XXX: An optimal scheduler should merge this into one of the previous IGs.
37; EG: MOV T[[RW_GPR]].Y, 0.0
38; EG: MOV * T[[RW_GPR]].Z, 0.0
39
40; SI: buffer_store_byte
41
; Stores the i8 argument %in, unmodified, through %out (address space 1).
42define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
43entry:
44  store i8 %in, i8 addrspace(1)* %out
45  ret void
46}
47
48; i16 store
49; FUNC-LABEL: {{^}}store_i16:
50; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
51
52; IG 0: Get the byte index and truncate the value
53
54
55; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
56; EG-NEXT: 3(4.203895e-45),
57
58; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
59; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
60
61; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
62; IG 1: Truncate the calculated shift amount for the mask
63
64; IG 2: Shift the value and the mask
65; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
66; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
67; EG-NEXT: 65535
68; IG 3: Initialize the Y and Z channels to zero
69;       XXX: An optimal scheduler should merge this into one of the previous IGs.
70; EG: MOV T[[RW_GPR]].Y, 0.0
71; EG: MOV * T[[RW_GPR]].Z, 0.0
72
73; SI: buffer_store_short
; Stores the i16 argument %in, unmodified, through %out (address space 1).
74define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
75entry:
76  store i16 %in, i16 addrspace(1)* %out
77  ret void
78}
79
80; FUNC-LABEL: {{^}}store_i24:
81; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
82; SI-DAG: buffer_store_byte
83; SI-DAG: buffer_store_short
; Stores an i24 value (a width that is not a power of two) through %out in
; address space 1. The patterns above expect a 16-bit right shift together
; with one byte store and one short store.
84define void @store_i24(i24 addrspace(1)* %out, i24 %in) {
85entry:
86  store i24 %in, i24 addrspace(1)* %out
87  ret void
88}
89
90; FUNC-LABEL: {{^}}store_i25:
91; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
92; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
93; SI: buffer_store_dword [[VAND]]
; Stores an i25 value through %out in address space 1. The patterns above
; expect the value to be masked with 0x1ffffff (25 set bits) and then written
; with a single dword store.
94define void @store_i25(i25 addrspace(1)* %out, i25 %in) {
95entry:
96  store i25 %in, i25 addrspace(1)* %out
97  ret void
98}
99
100; FUNC-LABEL: {{^}}store_v2i8:
101; EG: MEM_RAT MSKOR
102; EG-NOT: MEM_RAT MSKOR
103
104; SI: buffer_store_short
; Truncates each lane of the <2 x i32> argument to i8 and stores the resulting
; <2 x i8> vector (16 bits in total) through %out in address space 1.
105define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
106entry:
107  %0 = trunc <2 x i32> %in to <2 x i8>
108  store <2 x i8> %0, <2 x i8> addrspace(1)* %out
109  ret void
110}
111
112
113; FUNC-LABEL: {{^}}store_v2i16:
114; EG: MEM_RAT_CACHELESS STORE_RAW
115
116; CM: MEM_RAT_CACHELESS STORE_DWORD
117
118; SI: buffer_store_dword
; Truncates each lane of the <2 x i32> argument to i16 and stores the resulting
; <2 x i16> vector (32 bits in total) through %out in address space 1.
119define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
120entry:
121  %0 = trunc <2 x i32> %in to <2 x i16>
122  store <2 x i16> %0, <2 x i16> addrspace(1)* %out
123  ret void
124}
125
126; FUNC-LABEL: {{^}}store_v4i8:
127; EG: MEM_RAT_CACHELESS STORE_RAW
128
129; CM: MEM_RAT_CACHELESS STORE_DWORD
130
131; SI: buffer_store_dword
; Truncates each lane of the <4 x i32> argument to i8 and stores the resulting
; <4 x i8> vector (32 bits in total) through %out in address space 1.
132define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
133entry:
134  %0 = trunc <4 x i32> %in to <4 x i8>
135  store <4 x i8> %0, <4 x i8> addrspace(1)* %out
136  ret void
137}
138
139; floating-point store
140; FUNC-LABEL: {{^}}store_f32:
141; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
142
143; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
144
145; SI: buffer_store_dword
146
; Stores the float argument %in, unmodified, through %out (address space 1).
147define void @store_f32(float addrspace(1)* %out, float %in) {
148  store float %in, float addrspace(1)* %out
149  ret void
150}
151
152; FUNC-LABEL: {{^}}store_v4i16:
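; FIXME: the pattern on the next line has no check prefix (the RUN lines only
; enable the SI, EG, CM and FUNC prefixes), so FileCheck never evaluates it.
153; MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW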
154
155; SI: buffer_store_dwordx2
; Truncates each lane of the <4 x i32> argument to i16 and stores the resulting
; <4 x i16> vector (64 bits in total) through %out in address space 1.
156define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
157entry:
158  %0 = trunc <4 x i32> %in to <4 x i16>
159  store <4 x i16> %0, <4 x i16> addrspace(1)* %out
160  ret void
161}
162
163; vec2 floating-point stores
164; FUNC-LABEL: {{^}}store_v2f32:
165; EG: MEM_RAT_CACHELESS STORE_RAW
166
167; CM: MEM_RAT_CACHELESS STORE_DWORD
168
169; SI: buffer_store_dwordx2
170
; Builds a <2 x float> from the two scalar arguments and stores it through
; %out in address space 1.
171define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
172entry:
; Start from an all-zero vector; both lanes are overwritten below, so the
; zero initializer never reaches memory.
173  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
174  %1 = insertelement <2 x float> %0, float %b, i32 1
175  store <2 x float> %1, <2 x float> addrspace(1)* %out
176  ret void
177}
178
179; FUNC-LABEL: {{^}}store_v4i32:
180; EG: MEM_RAT_CACHELESS STORE_RAW
181; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
182
183; CM: MEM_RAT_CACHELESS STORE_DWORD
184; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
185
186; SI: buffer_store_dwordx4
; Stores the <4 x i32> argument %in (128 bits) through %out in address space 1.
; The patterns above require exactly one store instruction on the EG and CM
; runs (one match followed by a "-NOT" of the same pattern).
187define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
188entry:
189  store <4 x i32> %in, <4 x i32> addrspace(1)* %out
190  ret void
191}
192
193; FUNC-LABEL: {{^}}store_i64_i8:
194; EG: MEM_RAT MSKOR
195; SI: buffer_store_byte
; Truncates the i64 argument %in to i8 and stores that byte through %out in
; address space 1.
196define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
197entry:
198  %0 = trunc i64 %in to i8
199  store i8 %0, i8 addrspace(1)* %out
200  ret void
201}
202
203; FUNC-LABEL: {{^}}store_i64_i16:
204; EG: MEM_RAT MSKOR
205; SI: buffer_store_short
; Truncates the i64 argument %in to i16 and stores that halfword through %out
; in address space 1.
206define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
207entry:
208  %0 = trunc i64 %in to i16
209  store i16 %0, i16 addrspace(1)* %out
210  ret void
211}
212
213;===------------------------------------------------------------------------===;
214; Local Address Space
215;===------------------------------------------------------------------------===;
216
217; FUNC-LABEL: {{^}}store_local_i1:
218; EG: LDS_BYTE_WRITE
219; SI: ds_write_b8
; Stores the constant i1 value `true` through %out, a pointer into address
; space 3 (the address space this section of the file labels as local).
220define void @store_local_i1(i1 addrspace(3)* %out) {
221entry:
222  store i1 true, i1 addrspace(3)* %out
223  ret void
224}
225
226; FUNC-LABEL: {{^}}store_local_i8:
227; EG: LDS_BYTE_WRITE
228
229; SI: ds_write_b8
; Stores the i8 argument %in, unmodified, through %out (address space 3).
230define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
231  store i8 %in, i8 addrspace(3)* %out
232  ret void
233}
234
235; FUNC-LABEL: {{^}}store_local_i16:
236; EG: LDS_SHORT_WRITE
237
238; SI: ds_write_b16
; Stores the i16 argument %in, unmodified, through %out (address space 3).
239define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
240  store i16 %in, i16 addrspace(3)* %out
241  ret void
242}
243
244; FUNC-LABEL: {{^}}store_local_v2i16:
245; EG: LDS_WRITE
246
247; CM: LDS_WRITE
248
249; SI: ds_write_b32
; Stores the <2 x i16> argument %in (32 bits) through %out in address space 3.
; Unlike the address-space-1 vector tests in this file, the value is passed
; directly as <2 x i16> rather than being truncated from a wider vector.
250define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
251entry:
252  store <2 x i16> %in, <2 x i16> addrspace(3)* %out
253  ret void
254}
255
256; FUNC-LABEL: {{^}}store_local_v4i8:
257; EG: LDS_WRITE
258
259; CM: LDS_WRITE
260
261; SI: ds_write_b32
; Stores the <4 x i8> argument %in (32 bits) through %out in address space 3.
262define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
263entry:
264  store <4 x i8> %in, <4 x i8> addrspace(3)* %out
265  ret void
266}
267
268; FUNC-LABEL: {{^}}store_local_v2i32:
269; EG: LDS_WRITE
270; EG: LDS_WRITE
271
272; CM: LDS_WRITE
273; CM: LDS_WRITE
274
275; SI: ds_write_b64
; Stores the <2 x i32> argument %in (64 bits) through %out in address space 3.
; The patterns above expect two LDS writes on the EG and CM runs and a single
; 64-bit DS write on the SI runs.
276define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
277entry:
278  store <2 x i32> %in, <2 x i32> addrspace(3)* %out
279  ret void
280}
281
282; FUNC-LABEL: {{^}}store_local_v4i32:
283; EG: LDS_WRITE
284; EG: LDS_WRITE
285; EG: LDS_WRITE
286; EG: LDS_WRITE
287
288; CM: LDS_WRITE
289; CM: LDS_WRITE
290; CM: LDS_WRITE
291; CM: LDS_WRITE
292
293; SI: ds_write2_b64
; Stores the <4 x i32> argument %in (128 bits) through %out in address space 3.
; The store carries no explicit alignment, in contrast to
; @store_local_v4i32_align4, which is otherwise the same test.
294define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
295entry:
296  store <4 x i32> %in, <4 x i32> addrspace(3)* %out
297  ret void
298}
299
300; FUNC-LABEL: {{^}}store_local_v4i32_align4:
301; EG: LDS_WRITE
302; EG: LDS_WRITE
303; EG: LDS_WRITE
304; EG: LDS_WRITE
305
306; CM: LDS_WRITE
307; CM: LDS_WRITE
308; CM: LDS_WRITE
309; CM: LDS_WRITE
310
311; SI: ds_write2_b32
312; SI: ds_write2_b32
; Same store as @store_local_v4i32, but with an explicit `align 4` on the
; store. The SI patterns above expect two ds_write2_b32 instructions here
; instead of the single ds_write2_b64 expected for the variant without an
; explicit alignment.
313define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
314entry:
315  store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4
316  ret void
317}
318
319; FUNC-LABEL: {{^}}store_local_i64_i8:
320; EG: LDS_BYTE_WRITE
321; SI: ds_write_b8
; Truncates the i64 argument %in to i8 and stores that byte through %out in
; address space 3.
322define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
323entry:
324  %0 = trunc i64 %in to i8
325  store i8 %0, i8 addrspace(3)* %out
326  ret void
327}
328
329; FUNC-LABEL: {{^}}store_local_i64_i16:
330; EG: LDS_SHORT_WRITE
331; SI: ds_write_b16
; Truncates the i64 argument %in to i16 and stores that halfword through %out
; in address space 3.
332define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
333entry:
334  %0 = trunc i64 %in to i16
335  store i16 %0, i16 addrspace(3)* %out
336  ret void
337}
338
339; The stores in this function are combined by the optimizer to create a
340; 64-bit store with 32-bit alignment.  This is legal for SI and the legalizer
341; should not try to split the 64-bit store back into 2 32-bit stores.
342;
343; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
344; be two 32-bit stores.
345
346; FUNC-LABEL: {{^}}vecload2:
347; EG: MEM_RAT_CACHELESS STORE_RAW
348
349; CM: MEM_RAT_CACHELESS STORE_DWORD
350
351; SI: buffer_store_dwordx2
; Copies two consecutive i32 values from %mem (address space 2) to two
; consecutive i32 slots starting at %out (address space 1). Every load and
; store is marked `align 4`. Both pointers are `nocapture`, and the function
; uses attribute group #0.
352define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
353entry:
; Load elements 0 and 1 of %mem.
354  %0 = load i32, i32 addrspace(2)* %mem, align 4
355  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
356  %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4
; Write them to elements 0 and 1 of %out; these two adjacent 32-bit stores are
; the ones the SI pattern above expects to see emitted as one dwordx2 store.
357  store i32 %0, i32 addrspace(1)* %out, align 4
358  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
359  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
360  ret void
361}
362
363; When i128 was a legal type, this program generated "cannot select" errors:
364
365; FUNC-LABEL: {{^}}"i128-const-store":
366; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 1
367
368; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X
369
370; SI: buffer_store_dwordx4
; Writes the constants 1, 1, 2, 2 to four consecutive i32 slots starting at
; %out (address space 1), each store marked `align 4`. Together the stores
; cover 128 contiguous bits; the patterns above expect a single four-dword
; store on the SI runs.
371define void @i128-const-store(i32 addrspace(1)* %out) {
372entry:
; Elements 0 and 1 receive the value 1.
373  store i32 1, i32 addrspace(1)* %out, align 4
374  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
375  store i32 1, i32 addrspace(1)* %arrayidx2, align 4
; Elements 2 and 3 receive the value 2.
376  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
377  store i32 2, i32 addrspace(1)* %arrayidx4, align 4
378  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
379  store i32 2, i32 addrspace(1)* %arrayidx6, align 4
380  ret void
381}
382
; Attribute group #0, referenced by @vecload2: marks the function nounwind.
383attributes #0 = { nounwind }
384