; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s
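
; Extraction of i8 elements from small vectors should stay scalar: the packed
; <N x i8> kernel argument should be read with scalar dword loads, and
; individual bytes should be produced with shifts rather than per-element
; memory accesses.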

; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
; GCN: s_load_dword [[LOAD:s[0-9]+]]
; GCN: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_byte [[V_LOAD]]
define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
  %p0 = extractelement <1 x i8> %foo, i32 0
  store i8 %p0, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
; GCN: s_load_dword s
; GCN-NOT: {{flat|buffer|global}}
; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; VI: v_lshrrev_b16_e64 v{{[0-9]+}}, 8, s{{[0-9]+}}
; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
  %p0 = extractelement <2 x i8> %foo, i32 0
  %p1 = extractelement <2 x i8> %foo, i32 1
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v3i8:
; GCN: s_load_dword s
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
  %p0 = extractelement <3 x i8> %foo, i32 0
  %p1 = extractelement <3 x i8> %foo, i32 2
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v4i8:
; GCN: s_load_dword s
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
  %p0 = extractelement <4 x i8> %foo, i32 0
  %p1 = extractelement <4 x i8> %foo, i32 2
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

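; The wider-vector tests below only use elements 0 and 2, so a single scalar
; dword of the argument should be enough; the surrounding GCN-NOT load
; patterns check that no extra loads are emitted.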
; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
  %p0 = extractelement <8 x i8> %foo, i32 0
  %p1 = extractelement <8 x i8> %foo, i32 2
  store volatile i8 %p1, i8 addrspace(1)* null
  store volatile i8 %p0, i8 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v16i8:
; GCN: s_load_dword [[LOAD0:s[0-9]+]]
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
; GCN: buffer_store_byte [[V_ELT2]]
; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
  %p0 = extractelement <16 x i8> %foo, i32 0
  %p1 = extractelement <16 x i8> %foo, i32 2
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
; GCN: buffer_store_byte [[V_ELT2]]
; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
  %p0 = extractelement <32 x i8> %foo, i32 0
  %p1 = extractelement <32 x i8> %foo, i32 2
  store volatile i8 %p1, i8 addrspace(1)* null
  store volatile i8 %p0, i8 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}extract_vector_elt_v64i8:
; GCN: s_load_dword [[LOAD0:s[0-9]+]]
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
; GCN: buffer_store_byte [[V_ELT2]]
; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
  %p0 = extractelement <64 x i8> %foo, i32 0
  %p1 = extractelement <64 x i8> %foo, i32 2
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p1, i8 addrspace(1)* %out
  store volatile i8 %p0, i8 addrspace(1)* %out1
  ret void
}

; FIXME: SI generates much worse code for this that's a pain to match.

; FIXME: 16-bit and 32-bit shifts are not combined after legalization due to
; isTypeDesirableForOp in SimplifyDemandedBits.

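; In the dynamic-index tests below, the selected element lives at bit offset
; idx * 8, so the index should be scaled by a left shift of 3 and the packed
; vector right-shifted by the scaled amount to bring the byte down to bit 0.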
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
; VI-NOT: {{flat|buffer|global}}
; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]]
; VI: buffer_store_byte [[ELT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
  %elt = extractelement <2 x i8> %foo, i32 %idx
  store volatile i8 %elt, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
; VI-NOT: {{flat|buffer|global}}
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]]
; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]]
; VI: buffer_store_byte [[V_ELT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
  %p0 = extractelement <3 x i8> %foo, i32 %idx
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p0, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30
; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0

; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC4]], [[SCALED_IDX]]

; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
; VI: buffer_store_byte [[V_EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
  %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
  %p0 = extractelement <4 x i8> %vec, i32 %idx
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p0, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10
; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0

; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]]
; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]]
; VI: buffer_store_byte [[V_EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
  %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
  %p0 = extractelement <8 x i8> %vec, i32 %idx
  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store volatile i8 %p0, i8 addrspace(1)* %out
  ret void
}

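; The reduce_load tests check that a wide vector load from a constant address
; is shrunk to just the dwords that contain the extracted elements.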
; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0123:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dword s
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt2 = extractelement <8 x i8> %load, i32 2
  %elt3 = extractelement <8 x i8> %load, i32 3
  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt2, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt3, i8 addrspace(1)* undef, align 1
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0145:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dwordx2
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_45:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 4{{$}}
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}}
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
  ret void
}

; FIXME: Ought to be able to eliminate the high half of the load.
; GCN-LABEL: {{^}}reduce_load_vector_v16i8_extract_0145:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dwordx4
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(4)* null
  %elt0 = extractelement <16 x i8> %load, i32 0
  %elt1 = extractelement <16 x i8> %load, i32 1
  %elt4 = extractelement <16 x i8> %load, i32 4
  %elt5 = extractelement <16 x i8> %load, i32 5
  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
  ret void
}

attributes #0 = { nounwind }