1; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
2; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
4; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
5
6; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
7; GCN: v_min_i32_e32
8
9; EG: MIN_INT
10define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
11  %tid = call i32 @llvm.r600.read.tidig.x()
12  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
13  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
14  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
15  %a = load i32, i32 addrspace(1)* %a.gep, align 4
16  %b = load i32, i32 addrspace(1)* %b.gep, align 4
17  %cmp = icmp sle i32 %a, %b
18  %val = select i1 %cmp, i32 %a, i32 %b
19  store i32 %val, i32 addrspace(1)* %out.gep, align 4
20  ret void
21}
22
23; FUNC-LABEL: {{^}}s_test_imin_sle_i32:
24; GCN: s_min_i32
25
26; EG: MIN_INT
; Uniform kernel-argument operands: the min should stay on the scalar unit (s_min_i32).
define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %cmp = icmp sle i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}
33
34; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32:
35; GCN: s_min_i32
36
37; EG: MIN_INT
; <1 x i32> should degenerate to the scalar case: one s_min_i32.
define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
  %cmp = icmp sle <1 x i32> %a, %b
  %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
  store <1 x i32> %val, <1 x i32> addrspace(1)* %out
  ret void
}
44
45; FUNC-LABEL: {{^}}s_test_imin_sle_v4i32:
46; GCN: s_min_i32
47; GCN: s_min_i32
48; GCN: s_min_i32
49; GCN: s_min_i32
50
51; EG: MIN_INT
52; EG: MIN_INT
53; EG: MIN_INT
54; EG: MIN_INT
; Vector min scalarizes: four independent s_min_i32 ops expected.
define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
  %cmp = icmp sle <4 x i32> %a, %b
  %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %val, <4 x i32> addrspace(1)* %out
  ret void
}
61
62; FUNC-LABEL: {{^}}s_test_imin_sle_i8:
63; GCN: s_load_dword
64; GCN: s_load_dword
65; GCN: s_sext_i32_i8
66; GCN: s_sext_i32_i8
67; GCN: s_min_i32
; i8 min: args are sign-extended (s_sext_i32_i8) then min'd as i32.
; The [8 x i32] fillers push %a and %b into separate kernarg dwords.
define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
  %cmp = icmp sle i8 %a, %b
  %val = select i1 %cmp, i8 %a, i8 %b
  store i8 %val, i8 addrspace(1)* %out
  ret void
}
74
75; FIXME: Why vector and sdwa for last element?
76; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
77; GCN: s_load_dword s
78; GCN: s_load_dword s
79; GCN-NOT: _load_
80
81; SI: s_min_i32
82; SI: s_min_i32
83; SI: s_min_i32
84; SI: s_min_i32
85
86; VI: s_min_i32
87; VI: s_min_i32
88; VI: s_min_i32
89; VI: v_min_i32_sdwa
90
91; GFX9: v_min_i16
92; GFX9: v_min_i16
93; GFX9: v_min_i16
94; GFX9: v_min_i16
95
96; EG: MIN_INT
97; EG: MIN_INT
98; EG: MIN_INT
99; EG: MIN_INT
; <4 x i8> min packed in a dword; lowering differs per target (see per-prefix
; checks above). The [8 x i32] fillers separate the kernarg dwords.
define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
  %cmp = icmp sle <4 x i8> %a, %b
  %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out
  ret void
}
106
107; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16:
108; GCN: s_load_dword s
109; GCN: s_load_dword s
110
111; SI: s_ashr_i32
112; SI: s_sext_i32_i16
113; SI: s_ashr_i32
114; SI: s_sext_i32_i16
115; SI: s_min_i32
116; SI: s_min_i32
117
118; VI: s_sext_i32_i16
119; VI: s_sext_i32_i16
120; VI: s_min_i32
121; VI: s_min_i32
122
123; GFX9: v_pk_min_i16
124
125; EG: MIN_INT
126; EG: MIN_INT
; Uniform <2 x i16>: pre-GFX9 scalarizes with sign-extends; GFX9 uses packed v_pk_min_i16.
define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
  %cmp = icmp sle <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
  ret void
}
133
134; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
135; SI-NOT: buffer_load
136; SI: s_min_i32
137; SI: s_min_i32
138; SI: s_min_i32
139; SI: s_min_i32
140
141; VI: s_min_i32
142; VI: s_min_i32
143; VI: s_min_i32
144; VI: s_min_i32
145
146; GFX9: v_pk_min_i16
147; GFX9: v_pk_min_i16
148
149; EG: MIN_INT
150; EG: MIN_INT
151; EG: MIN_INT
152; EG: MIN_INT
; Uniform <4 x i16>: four scalar mins on SI/VI, two packed v_pk_min_i16 on GFX9.
define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
  %cmp = icmp sle <4 x i16> %a, %b
  %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
  store <4 x i16> %val, <4 x i16> addrspace(1)* %out
  ret void
}
159
160; FUNC-LABEL: @v_test_imin_slt_i32
161; GCN: v_min_i32_e32
162
163; EG: MIN_INT
; Same as v_test_imin_sle_i32 but with strict slt; still expects v_min_i32_e32.
define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %cmp = icmp slt i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out.gep, align 4
  ret void
}
176
177; FUNC-LABEL: @v_test_imin_slt_i16
178; SI: v_min_i32_e32
179
180; GFX89: v_min_i16_e32
181
182; EG: MIN_INT
; i16 divergent min: SI promotes to v_min_i32; VI/GFX9 have a native v_min_i16.
define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid

  %a = load i16, i16 addrspace(1)* %a.gep
  %b = load i16, i16 addrspace(1)* %b.gep
  %cmp = icmp slt i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %out.gep
  ret void
}
196
197; FUNC-LABEL: @s_test_imin_slt_i32
198; GCN: s_min_i32
199
200; EG: MIN_INT
; Uniform slt + select folds to s_min_i32.
define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %cmp = icmp slt i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}
207
208; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32:
209; GCN: s_min_i32
210; GCN: s_min_i32
211
212; EG: MIN_INT
213; EG: MIN_INT
; Uniform <2 x i32>: scalarizes into two s_min_i32 ops.
define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
  %cmp = icmp slt <2 x i32> %a, %b
  %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out
  ret void
}
220
221; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32:
222; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
223
224; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
; min against a constant: 8 should be emitted as an inline operand of s_min_i32
; (EG puts it in a literal slot).
define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
  %cmp = icmp slt i32 %a, 8
  %val = select i1 %cmp, i32 %a, i32 8
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}
231
232; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32:
233; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
234
235; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
; Same as the slt-immediate case but with sle; still expects s_min_i32 with the
; constant 8 as an operand.
define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
  %cmp = icmp sle i32 %a, 8
  %val = select i1 %cmp, i32 %a, i32 8
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}
242
243; FUNC-LABEL: @v_test_umin_ule_i32
244; GCN: v_min_u32_e32
245
246; EG: MIN_UINT
; Unsigned variant: ule + select on divergent i32s should become v_min_u32_e32.
define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %cmp = icmp ule i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out.gep, align 4
  ret void
}
259
260; FUNC-LABEL: @v_test_umin_ule_v3i32
261; GCN: v_min_u32_e32
262; GCN: v_min_u32_e32
263; GCN: v_min_u32_e32
264; GCN-NOT: v_min_u32_e32
265; GCN: s_endpgm
266
267; EG: MIN_UINT
268; EG: MIN_UINT
269; EG: MIN_UINT
; <3 x i32>: exactly three v_min_u32_e32 ops — the GCN-NOT above guards against a
; spurious fourth min from widening to v4.
define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid

  %a = load <3 x i32>, <3 x i32> addrspace(1)* %a.gep
  %b = load <3 x i32>, <3 x i32> addrspace(1)* %b.gep
  %cmp = icmp ule <3 x i32> %a, %b
  %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
  store <3 x i32> %val, <3 x i32> addrspace(1)* %out.gep
  ret void
}
283
284; FIXME: Reduce unused packed component to scalar
285; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}}
286; SI: v_min_u32_e32
287; SI: v_min_u32_e32
288; SI: v_min_u32_e32
289; SI-NOT: v_min_u32_e32
290
291; VI: v_min_u16_e32
292; VI: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
293; VI: v_min_u16_e32
294; VI-NOT: v_min_u16
295
296; GFX9: v_pk_min_u16
297; GFX9: v_pk_min_u16
298
299; GCN: s_endpgm
300
301; EG: MIN_UINT
302; EG: MIN_UINT
303; EG: MIN_UINT
; <3 x i16>: odd-sized packed vector. SI scalarizes to 32-bit mins; VI mixes
; v_min_u16 + sdwa; GFX9 uses packed mins (see the FIXME above about the unused
; fourth packed lane).
define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid

  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.gep
  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.gep
  %cmp = icmp ule <3 x i16> %a, %b
  %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
  store <3 x i16> %val, <3 x i16> addrspace(1)* %out.gep
  ret void
}
317
318; FUNC-LABEL: @s_test_umin_ule_i32
319; GCN: s_min_u32
320
321; EG: MIN_UINT
; Uniform unsigned min: expects s_min_u32.
define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %cmp = icmp ule i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}
328
329; FUNC-LABEL: @v_test_umin_ult_i32
330; GCN: v_min_u32_e32
331
332; EG: MIN_UINT
; Strict ult + select on divergent i32s should also become v_min_u32_e32.
define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  %a = load i32, i32 addrspace(1)* %a.gep, align 4
  %b = load i32, i32 addrspace(1)* %b.gep, align 4
  %cmp = icmp ult i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out.gep, align 4
  ret void
}
345
346; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
347; SI: {{buffer|flat|global}}_load_ubyte
348; SI: {{buffer|flat|global}}_load_ubyte
349; SI: v_min_u32_e32
350
351; GFX89: {{flat|global}}_load_ubyte
352; GFX89: {{flat|global}}_load_ubyte
353; GFX89: v_min_u16_e32
354
355; EG: MIN_UINT
; i8 unsigned min: zero-extending byte loads feed v_min_u32 (SI) or v_min_u16
; (VI/GFX9); no explicit mask instructions should be needed.
define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %tid

  %a = load i8, i8 addrspace(1)* %a.gep, align 1
  %b = load i8, i8 addrspace(1)* %b.gep, align 1
  %cmp = icmp ult i8 %a, %b
  %val = select i1 %cmp, i8 %a, i8 %b
  store i8 %val, i8 addrspace(1)* %out.gep, align 1
  ret void
}
369
370; FUNC-LABEL: @s_test_umin_ult_i32
371; GCN: s_min_u32
372
373; EG: MIN_UINT
; Uniform ult + select folds to s_min_u32.
define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %cmp = icmp ult i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}
380
381; FUNC-LABEL: @v_test_umin_ult_i32_multi_use
382; SI-NOT: v_min
383; GCN: v_cmp_lt_u32
384; SI-NEXT: v_cndmask_b32
385; SI-NOT: v_min
386; GCN: s_endpgm
387
388; EG-NOT: MIN_UINT
; The compare result is also stored, so the cmp+select must NOT be folded into a
; min — checks above require v_cmp_lt_u32 + v_cndmask_b32 and forbid v_min.
define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
  %a = load i32, i32 addrspace(1)* %aptr, align 4
  %b = load i32, i32 addrspace(1)* %bptr, align 4
  %cmp = icmp ult i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out0, align 4
  store i1 %cmp, i1 addrspace(1)* %out1   ; second use of %cmp blocks the min fold
  ret void
}
398
399; FUNC-LABEL: @v_test_umin_ult_i16_multi_use
400; GCN-NOT: v_min
401; GCN: v_cmp_lt_u32
402; GCN-NEXT: v_cndmask_b32
403; GCN-NOT: v_min
404; GCN: s_endpgm
405
406; EG-NOT: MIN_UINT
; i16 version of the multi-use case: the extra store of %cmp must keep the
; lowering as cmp + cndmask rather than a min.
define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
  %a = load i16, i16 addrspace(1)* %aptr, align 2
  %b = load i16, i16 addrspace(1)* %bptr, align 2
  %cmp = icmp ult i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %out0, align 2
  store i1 %cmp, i1 addrspace(1)* %out1   ; second use of %cmp blocks the min fold
  ret void
}
416
417
418; FUNC-LABEL: @s_test_umin_ult_v1i32
419; GCN: s_min_u32
420
421; EG: MIN_UINT
; <1 x i32> unsigned min degenerates to a single s_min_u32.
define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
  %cmp = icmp ult <1 x i32> %a, %b
  %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
  store <1 x i32> %val, <1 x i32> addrspace(1)* %out
  ret void
}
428
429; FUNC-LABEL: {{^}}s_test_umin_ult_v8i32:
430; GCN: s_min_u32
431; GCN: s_min_u32
432; GCN: s_min_u32
433; GCN: s_min_u32
434; GCN: s_min_u32
435; GCN: s_min_u32
436; GCN: s_min_u32
437; GCN: s_min_u32
438
439; EG: MIN_UINT
440; EG: MIN_UINT
441; EG: MIN_UINT
442; EG: MIN_UINT
443; EG: MIN_UINT
444; EG: MIN_UINT
445; EG: MIN_UINT
446; EG: MIN_UINT
; Wide uniform vector: eight independent s_min_u32 ops expected.
define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 {
  %cmp = icmp ult <8 x i32> %a, %b
  %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
  store <8 x i32> %val, <8 x i32> addrspace(1)* %out
  ret void
}
453
454; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16:
455; GCN-NOT: {{buffer|flat|global}}_load
456; SI: s_min_u32
457; SI: s_min_u32
458; SI: s_min_u32
459; SI: s_min_u32
460; SI: s_min_u32
461; SI: s_min_u32
462; SI: s_min_u32
463; SI: s_min_u32
464
465; VI: s_min_u32
466; VI: s_min_u32
467; VI: s_min_u32
468; VI: s_min_u32
469; VI: s_min_u32
470; VI: s_min_u32
471; VI: s_min_u32
472; VI: s_min_u32
473
474; EG: MIN_UINT
475; EG: MIN_UINT
476; EG: MIN_UINT
477; EG: MIN_UINT
478; EG: MIN_UINT
479; EG: MIN_UINT
480; EG: MIN_UINT
481; EG: MIN_UINT
; <8 x i16> with uniform kernarg operands: should lower without any vector-memory
; loads (GCN-NOT check above), scalarizing to eight s_min_u32 ops on SI/VI.
define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 {
  %cmp = icmp ult <8 x i16> %a, %b
  %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
  store <8 x i16> %val, <8 x i16> addrspace(1)* %out
  ret void
}
488
; Make sure the redundant 'and' (the zero-extension mask after the min) is removed.
490; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
491; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
492; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
493; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
494; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
495; GCN: buffer_store_dword [[VMIN]]
496
497; EG: MIN_UINT
; Both min operands are zext'ed i16, so the min result already fits in 16 bits;
; SimplifyDemandedBits should delete the trailing 'and ..., 65535' mask — the
; checks expect a bare s_min_u32 feeding the store.
define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
  %a.ext = zext i16 %a to i32
  %b.ext = zext i16 %b to i32
  %cmp = icmp ult i32 %a.ext, %b.ext
  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
  %mask = and i32 %val, 65535           ; redundant given the zexts; must fold away
  store i32 %mask, i32 addrspace(1)* %out
  ret void
}
507
508; Make sure redundant sign_extend_inreg removed.
509
510; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
511; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
512; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
513; GCN-DAG: s_sext_i32_i16 [[EXT_A:s[0-9]+]], [[A]]
514; GCN-DAG: s_sext_i32_i16 [[EXT_B:s[0-9]+]], [[B]]
515
516; GCN: s_min_i32 [[MIN:s[0-9]+]], [[EXT_A]], [[EXT_B]]
517; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
518; GCN: buffer_store_dword [[VMIN]]
519
520; EG: MIN_INT
; Both min operands are sext'ed i16, so the shl/ashr pair (a sign_extend_inreg
; idiom) after the min is redundant and should be eliminated; checks expect
; s_min_i32 feeding the store directly.
define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
  %a.ext = sext i16 %a to i32
  %b.ext = sext i16 %b to i32
  %cmp = icmp slt i32 %a.ext, %b.ext
  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
  %shl = shl i32 %val, 16               ; shl+ashr == sign_extend_inreg i16
  %sextinreg = ashr i32 %shl, 16
  store i32 %sextinreg, i32 addrspace(1)* %out
  ret void
}
531
532; FUNC-LABEL: {{^}}s_test_imin_sle_i16:
533; GCN: s_min_i32
534
535; EG: MIN_INT
; Uniform i16 signed min: expects a single s_min_i32.
define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 {
  %cmp = icmp sle i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %out
  ret void
}
542
543; 64 bit
544; FUNC-LABEL: {{^}}test_umin_ult_i64
545; GCN: s_endpgm
546
547; EG: MIN_UINT
548; EG: MIN_UINT
; 64-bit min: GCN has no 64-bit min instruction, so the check only requires the
; kernel to compile (s_endpgm); EG expands via paired MIN_UINTs.
define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %tmp = icmp ult i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}
555
556; FUNC-LABEL: {{^}}test_umin_ule_i64
557; GCN: s_endpgm
558
559; EG: MIN_UINT
560; EG: MIN_UINT
; 64-bit unsigned min, ule form; compile-only check on GCN (s_endpgm).
define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %tmp = icmp ule i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}
567
568; FUNC-LABEL: {{^}}test_imin_slt_i64
569; GCN: s_endpgm
570
571; EG-DAG: MIN_UINT
572; EG-DAG: MIN_INT
; 64-bit signed min, slt form; compile-only on GCN. EG expands with a
; MIN_INT/MIN_UINT pair (signed high word, unsigned low word).
define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %tmp = icmp slt i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}
579
580; FUNC-LABEL: {{^}}test_imin_sle_i64
581; GCN: s_endpgm
582
583; EG-DAG: MIN_UINT
584; EG-DAG: MIN_INT
; 64-bit signed min, sle form; compile-only on GCN, MIN_INT/MIN_UINT pair on EG.
define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
  %tmp = icmp sle i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}
591
592; FUNC-LABEL: {{^}}v_test_imin_sle_v2i16:
593; SI: v_min_i32
594; SI: v_min_i32
595
596; VI: v_min_i16
597; VI: v_min_i16
598
599; GFX9: v_pk_min_i16
600
601; EG: MIN_INT
602; EG: MIN_INT
; Divergent <2 x i16> signed min: SI scalarizes to 32-bit mins, VI uses two
; v_min_i16, GFX9 a single packed v_pk_min_i16.
define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
  %cmp = icmp sle <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
  ret void
}
615
616; FIXME: i16 min
617; FUNC-LABEL: {{^}}v_test_imin_ule_v2i16:
618; SI: v_min_u32
619; SI: v_min_u32
620
621; VI: v_min_u16
622; VI: v_min_u16
623
624; GFX9: v_pk_min_u16
625
626; EG: MIN_UINT
627; EG: MIN_UINT
; Divergent <2 x i16> unsigned min: SI scalarizes to v_min_u32, VI uses two
; v_min_u16, GFX9 a single packed v_pk_min_u16.
define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
  %cmp = icmp ule <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
  ret void
}
640
641declare i32 @llvm.r600.read.tidig.x() #1
642
643attributes #0 = { nounwind }
644attributes #1 = { nounwind readnone }
645