; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
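
; Stores to the private address space (addrspace(5)). On the R600 subtargets
; (EG/CM prefixes) private memory is accessed indirectly through the register
; file: MOVA_INT loads the AR index register and MOV reads or writes through
; T(0 + AR.x), so sub-dword stores become dword read-modify-write sequences.
; On the GCN targets (both RUN lines above share the SI prefix) private
; stores select to buffer_store_{byte,short,dword} instructions.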

; FUNC-LABEL: {{^}}store_i1:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
define amdgpu_kernel void @store_i1(i1 addrspace(5)* %out) {
entry:
  store i1 true, i1 addrspace(5)* %out
  ret void
}

; i8 store
; FUNC-LABEL: {{^}}store_i8:
; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
; EG-NEXT: 2
; EG: MOVA_INT * AR.x (MASKED)
; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x

; IG 0: Get the byte index and truncate the value
; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG-NEXT: 3(4.203895e-45)

; EG: LSHL * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], literal.x, PV.W
; EG-NEXT: 255(3.573311e-43)

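; Clear the old byte in the loaded dword with the inverted mask, then OR in
; the shifted value: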
; EG: NOT_INT
; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
; TODO: Is the reload necessary?
; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
; EG: MOV * T(0 + AR.x).X+, [[RES]]

; SI: buffer_store_byte

define amdgpu_kernel void @store_i8(i8 addrspace(5)* %out, i8 %in) {
entry:
  store i8 %in, i8 addrspace(5)* %out
  ret void
}

; i16 store
; FUNC-LABEL: {{^}}store_i16:
; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
; EG-NEXT: 2
; EG: MOVA_INT * AR.x (MASKED)
; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x

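; The i16 kernel argument is fetched with a 16-bit vertex read: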
; EG: VTX_READ_16

; IG 0: Get the byte index and truncate the value
; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG-NEXT: 3(4.203895e-45)

; EG: NOT_INT
; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
; TODO: Is the reload necessary?
; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
; EG: MOV * T(0 + AR.x).X+, [[RES]]

; SI: buffer_store_short
define amdgpu_kernel void @store_i16(i16 addrspace(5)* %out, i16 %in) {
entry:
  store i16 %in, i16 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i24:
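; On SI an i24 store is split into a 16-bit store of the low half and an
; 8-bit store of the third byte, extracted with the shift by 16 below.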
; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_store_short

; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store can be eliminated
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store can be eliminated
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
define amdgpu_kernel void @store_i24(i24 addrspace(5)* %out, i24 %in) {
entry:
  store i24 %in, i24 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i25:
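; An i25 store is widened to a single dword store of the value masked to the
; low 25 bits (0x1ffffff), so no read-modify-write sequence is needed.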
; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
; SI: buffer_store_dword [[VAND]]

; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT
define amdgpu_kernel void @store_i25(i25 addrspace(5)* %out, i25 %in) {
entry:
  store i25 %in, i25 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i8:
; v2i8 is naturally 2B aligned, treat as i16
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT

; SI: buffer_store_short
define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i8_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(5)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}store_v2i16:
; v2i16 is naturally 4B aligned, treat as i32
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT

; SI: buffer_store_dword
define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i16_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_short
; SI: buffer_store_short
define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(5)* %out, align 2
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT

; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}store_v8i8_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(5)* %out, <8 x i32> %in) {
entry:
  %0 = trunc <8 x i32> %in to <8 x i8>
  store <8 x i8> %0, <8 x i8> addrspace(5)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8_halfaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store cannot be eliminated,
;       since they might access different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_short
; SI: buffer_store_short
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 2
  ret void
}

; floating-point store
; FUNC-LABEL: {{^}}store_f32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_dword

define amdgpu_kernel void @store_f32(float addrspace(5)* %out, float %in) {
  store float %in, float addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i16:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i16>
  store <4 x i16> %0, <4 x i16> addrspace(5)* %out
  ret void
}

; vec2 floating-point stores
; FUNC-LABEL: {{^}}store_v2f32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword

define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(5)* %out, float %a, float %b) {
entry:
  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
  %1 = insertelement <2 x float> %0, float %b, i32 1
  store <2 x float> %1, <2 x float> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v3i32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI-DAG: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword

define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(5)* %out, <3 x i32> %a) nounwind {
  store <3 x i32> %a, <3 x i32> addrspace(5)* %out, align 16
  ret void
}

; FUNC-LABEL: {{^}}store_v4i32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(5)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i32_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(5)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(5)* %out, align 4
  ret void
}

; v4f32 store
; FUNC-LABEL: {{^}}store_v4f32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(5)* %out, <4 x float> addrspace(5)* %in) {
  %1 = load <4 x float>, <4 x float> addrspace(5)* %in
  store <4 x float> %1, <4 x float> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i64_i8:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
define amdgpu_kernel void @store_i64_i8(i8 addrspace(5)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i8
  store i8 %0, i8 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i64_i16:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_short
define amdgpu_kernel void @store_i64_i16(i16 addrspace(5)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i16
  store i16 %0, i16 addrspace(5)* %out
  ret void
}

; The stores in this function are combined by the optimizer into a single
; 64-bit store with 32-bit alignment.  This is legal, and the legalizer
; should not try to split the 64-bit store back into two 32-bit stores.

; FUNC-LABEL: {{^}}vecload2:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @vecload2(i32 addrspace(5)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 {
entry:
  %0 = load i32, i32 addrspace(4)* %mem, align 4
  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1
  %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(5)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1
  store i32 %1, i32 addrspace(5)* %arrayidx1, align 4
  ret void
}

; When i128 was a legal type, this program generated "cannot select" errors:

; FUNC-LABEL: {{^}}"i128-const-store":
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @i128-const-store(i32 addrspace(5)* %out) {
entry:
  store i32 1, i32 addrspace(5)* %out, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1
  store i32 1, i32 addrspace(5)* %arrayidx2, align 4
  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 2
  store i32 2, i32 addrspace(5)* %arrayidx4, align 4
  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 3
  store i32 2, i32 addrspace(5)* %arrayidx6, align 4
  ret void
}

attributes #0 = { nounwind }