; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
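; Check that (and x, (xor y, -1)) folds to s_andn2_b32/s_andn2_b64 when the
; operands are scalar. VALU cases expand to v_xor + v_and instead, with s_not
; used when only the inverted operand is scalar.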

define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_andn2_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  ret i32 %and
}

define amdgpu_ps i32 @s_andn2_i32_commute(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_andn2_i32_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %not.src1, %src0
  ret i32 %and
}

define amdgpu_ps { i32, i32 } @s_andn2_i32_multi_use(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_andn2_i32_multi_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b32 s1, s3
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  %insert.0 = insertvalue { i32, i32 } undef, i32 %and, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %not.src1, 1
  ret { i32, i32 } %insert.1
}

define amdgpu_ps { i32, i32 } @s_andn2_i32_multi_foldable_use(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) {
; GCN-LABEL: s_andn2_i32_multi_foldable_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s4
; GCN-NEXT:    s_andn2_b32 s1, s3, s4
; GCN-NEXT:    ; return to shader part epilog
  %not.src2 = xor i32 %src2, -1
  %and0 = and i32 %src0, %not.src2
  %and1 = and i32 %src1, %not.src2
  %insert.0 = insertvalue { i32, i32 } undef, i32 %and0, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %and1, 1
  ret { i32, i32 } %insert.1
}

define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
; GCN-LABEL: v_andn2_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
; GCN-NEXT:    v_and_b32_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  ret i32 %and
}

define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
; GCN-LABEL: v_andn2_i32_sv:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_xor_b32_e32 v0, -1, v0
; GCN-NEXT:    v_and_b32_e32 v0, s2, v0
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  %cast = bitcast i32 %and to float
  ret float %cast
}

define amdgpu_ps float @v_andn2_i32_vs(i32 %src0, i32 inreg %src1) {
; GCN-LABEL: v_andn2_i32_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b32 s0, s2
; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  %cast = bitcast i32 %and to float
  ret float %cast
}

define amdgpu_ps i64 @s_andn2_i64(i64 inreg %src0, i64 inreg %src1) {
; GCN-LABEL: s_andn2_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  ret i64 %and
}

define amdgpu_ps i64 @s_andn2_i64_commute(i64 inreg %src0, i64 inreg %src1) {
; GCN-LABEL: s_andn2_i64_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %not.src1, %src0
  ret i64 %and
}

define amdgpu_ps { i64, i64 } @s_andn2_i64_multi_foldable_use(i64 inreg %src0, i64 inreg %src1, i64 inreg %src2) {
; GCN-LABEL: s_andn2_i64_multi_foldable_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[6:7]
; GCN-NEXT:    s_andn2_b64 s[2:3], s[4:5], s[6:7]
; GCN-NEXT:    ; return to shader part epilog
  %not.src2 = xor i64 %src2, -1
  %and0 = and i64 %src0, %not.src2
  %and1 = and i64 %src1, %not.src2
  %insert.0 = insertvalue { i64, i64 } undef, i64 %and0, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %and1, 1
  ret { i64, i64 } %insert.1
}

define amdgpu_ps { i64, i64 } @s_andn2_i64_multi_use(i64 inreg %src0, i64 inreg %src1) {
; GCN-LABEL: s_andn2_i64_multi_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    s_mov_b32 s2, s6
; GCN-NEXT:    s_mov_b32 s3, s7
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  %insert.0 = insertvalue { i64, i64 } undef, i64 %and, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %not.src1, 1
  ret { i64, i64 } %insert.1
}

define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v2, -1, v2
; GCN-NEXT:    v_xor_b32_e32 v3, -1, v3
; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
; GCN-NEXT:    v_and_b32_e32 v1, v1, v3
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  ret i64 %and
}

define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64_sv:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_xor_b32_e32 v0, -1, v0
; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
; GCN-NEXT:    v_and_b32_e32 v0, s2, v0
; GCN-NEXT:    v_and_b32_e32 v1, s3, v1
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  %cast = bitcast i64 %and to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @v_andn2_i64_vs(i64 %src0, i64 inreg %src1) {
; GCN-LABEL: v_andn2_i64_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b64 s[0:1], s[2:3]
; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
; GCN-NEXT:    v_and_b32_e32 v1, s1, v1
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  %cast = bitcast i64 %and to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x i32> @s_andn2_v2i32(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GCN-LABEL: s_andn2_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
  %and = and <2 x i32> %src0, %not.src1
  ret <2 x i32> %and
}

define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GCN-LABEL: s_andn2_v2i32_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
  %and = and <2 x i32> %not.src1, %src0
  ret <2 x i32> %and
}
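
; SALU bitwise operations only exist in 32-bit and 64-bit widths, so the i16
; cases select the _b32 forms; the zext cases use v_bfe_u32 to clear the high
; bits of the 32-bit result.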
define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_andn2_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  ret i16 %and
}

define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_andn2_i16_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %not.src1, %src0
  ret i16 %and
}

define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_andn2_i16_multi_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_xor_b32 s1, s3, -1
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  %insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0
  %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %not.src1, 1
  ret { i16, i16 } %insert.1
}

define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
; GCN-LABEL: s_andn2_i16_multi_foldable_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s4
; GCN-NEXT:    s_andn2_b32 s1, s3, s4
; GCN-NEXT:    ; return to shader part epilog
  %not.src2 = xor i16 %src2, -1
  %and0 = and i16 %src0, %not.src2
  %and1 = and i16 %src1, %not.src2
  %insert.0 = insertvalue { i16, i16 } undef, i16 %and0, 0
  %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %and1, 1
  ret { i16, i16 } %insert.1
}

define i16 @v_andn2_i16(i16 %src0, i16 %src1) {
; GCN-LABEL: v_andn2_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
; GCN-NEXT:    v_and_b32_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  ret i16 %and
}

define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) {
; GCN-LABEL: v_andn2_i16_sv:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_xor_b32_e32 v0, -1, v0
; GCN-NEXT:    v_and_b32_e32 v0, s2, v0
; GCN-NEXT:    v_bfe_u32 v0, v0, 0, 16
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  %zext = zext i16 %and to i32
  %cast.zext = bitcast i32 %zext to float
  ret float %cast.zext
}

define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
; GCN-LABEL: v_andn2_i16_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_xor_b32 s0, s2, -1
; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
; GCN-NEXT:    v_bfe_u32 v0, v0, 0, 16
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  %zext = zext i16 %and to i32
  %cast.zext = bitcast i32 %zext to float
  ret float %cast.zext
}
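
; GFX6 has no packed 16-bit operations: <2 x i16> is built out of 32-bit
; pieces with shift/mask/or sequences. GFX9 operates on the packed value
; directly, so the andn2 fold still applies there.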
define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_and_b32 s2, s2, s1
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
; GFX6-NEXT:    s_and_b32 s1, s4, s1
; GFX6-NEXT:    s_or_b32 s1, s2, s1
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_and_b32 s0, s0, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_andn2_b32 s0, s2, s3
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %and = and <2 x i16> %src0, %not.src1
  %cast = bitcast <2 x i16> %and to i32
  ret i32 %cast
}

define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_commute:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_and_b32 s2, s2, s1
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
; GFX6-NEXT:    s_and_b32 s1, s4, s1
; GFX6-NEXT:    s_or_b32 s1, s2, s1
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_and_b32 s0, s1, s0
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_commute:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_andn2_b32 s0, s2, s3
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %and = and <2 x i16> %not.src1, %src0
  %cast = bitcast <2 x i16> %and to i32
  ret i32 %cast
}

define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_multi_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_and_b32 s2, s2, s1
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
; GFX6-NEXT:    s_and_b32 s1, s4, s1
; GFX6-NEXT:    s_or_b32 s1, s2, s1
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_and_b32 s0, s0, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_xor_b32 s1, s3, -1
; GFX9-NEXT:    s_andn2_b32 s0, s2, s3
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %and = and <2 x i16> %src0, %not.src1

  %cast.0 = bitcast <2 x i16> %and to i32
  %cast.1 = bitcast <2 x i16> %not.src1 to i32
  %insert.0 = insertvalue { i32, i32 } undef, i32 %cast.0, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
  ret { i32, i32 } %insert.1
}

define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_and_b32 s2, s2, s1
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_and_b32 s3, s4, s1
; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
; GFX6-NEXT:    s_and_b32 s1, s6, s1
; GFX6-NEXT:    s_or_b32 s1, s3, s1
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_and_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s1, s2, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_andn2_b32 s0, s2, s4
; GFX9-NEXT:    s_andn2_b32 s1, s3, s4
; GFX9-NEXT:    ; return to shader part epilog
  %not.src2 = xor <2 x i16> %src2, <i16 -1, i16 -1>
  %and0 = and <2 x i16> %src0, %not.src2
  %and1 = and <2 x i16> %src1, %not.src2

  %cast.0 = bitcast <2 x i16> %and0 to i32
  %cast.1 = bitcast <2 x i16> %and1 to i32
  %insert.0 = insertvalue { i32, i32 } undef, i32 %cast.0, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
  ret { i32, i32 } %insert.1
}

define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
; GFX6-LABEL: v_andn2_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffff
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, v0, v4
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_andn2_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT:    v_and_b32_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %and = and <2 x i16> %src0, %not.src1
  ret <2 x i16> %and
}

; FIXME: <3 x i16> cases are disabled below.
; define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
;   %and = and <3 x i16> %src0, %not.src1
;   %cast = bitcast <3 x i16> %and to i48
;   ret i48 %cast
; }

; define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
;   %and = and <3 x i16> %not.src1, %src0
;   %cast = bitcast <3 x i16> %and to i48
;   ret i48 %cast
; }

; define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
;   %and = and <3 x i16> %src0, %not.src1
;
;   %cast.0 = bitcast <3 x i16> %and to i48
;   %cast.1 = bitcast <3 x i16> %not.src1 to i48
;   %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
;   %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
;   ret { i48, i48 } %insert.1
; }

; define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
;   %and = and <3 x i16> %src0, %not.src1
;   ret <3 x i16> %and
; }
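
; <4 x i16> is handled as a 64-bit value, so the scalar cases operate on
; SGPR pairs and the VALU case on two VGPRs.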
define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_mov_b32 s3, 0xffff
; GFX6-NEXT:    s_and_b32 s1, s2, s3
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s2, s4, s3
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_and_b32 s4, s6, s3
; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s4
; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
; GFX6-NEXT:    s_and_b32 s3, s8, s3
; GFX6-NEXT:    s_or_b32 s3, s4, s3
; GFX6-NEXT:    s_mov_b32 s4, -1
; GFX6-NEXT:    s_mov_b32 s5, s4
; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s0, -1
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
  %and = and <4 x i16> %src0, %not.src1
  %cast = bitcast <4 x i16> %and to i64
  ret i64 %cast
}

define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v4i16_commute:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_mov_b32 s3, 0xffff
; GFX6-NEXT:    s_and_b32 s1, s2, s3
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s2, s4, s3
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_and_b32 s4, s6, s3
; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s4
; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
; GFX6-NEXT:    s_and_b32 s3, s8, s3
; GFX6-NEXT:    s_or_b32 s3, s4, s3
; GFX6-NEXT:    s_mov_b32 s4, -1
; GFX6-NEXT:    s_mov_b32 s5, s4
; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v4i16_commute:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s0, -1
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
  %and = and <4 x i16> %not.src1, %src0
  %cast = bitcast <4 x i16> %and to i64
  ret i64 %cast
}

define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v4i16_multi_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_mov_b32 s3, 0xffff
; GFX6-NEXT:    s_and_b32 s1, s2, s3
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s2, s4, s3
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_and_b32 s4, s6, s3
; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s4
; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
; GFX6-NEXT:    s_and_b32 s3, s8, s3
; GFX6-NEXT:    s_or_b32 s3, s4, s3
; GFX6-NEXT:    s_mov_b32 s4, -1
; GFX6-NEXT:    s_mov_b32 s5, s4
; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v4i16_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s0, -1
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
; GFX9-NEXT:    s_mov_b32 s2, s4
; GFX9-NEXT:    s_mov_b32 s3, s5
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
  %and = and <4 x i16> %src0, %not.src1

  %cast.0 = bitcast <4 x i16> %and to i64
  %cast.1 = bitcast <4 x i16> %not.src1 to i64
  %insert.0 = insertvalue { i64, i64 } undef, i64 %cast.0, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
  ret { i64, i64 } %insert.1
}

define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
; GFX6-LABEL: s_andn2_v4i16_multi_foldable_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s14, 0xffff
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_and_b32 s1, s2, s14
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_and_b32 s2, s4, s14
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_and_b32 s3, s6, s14
; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
; GFX6-NEXT:    s_and_b32 s4, s8, s14
; GFX6-NEXT:    s_or_b32 s3, s3, s4
; GFX6-NEXT:    s_lshl_b32 s4, s11, 16
; GFX6-NEXT:    s_and_b32 s5, s10, s14
; GFX6-NEXT:    s_or_b32 s4, s4, s5
; GFX6-NEXT:    s_lshl_b32 s5, s13, 16
; GFX6-NEXT:    s_and_b32 s6, s12, s14
; GFX6-NEXT:    s_or_b32 s5, s5, s6
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, s6
; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v4i16_multi_foldable_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s0, -1
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[6:7]
; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
; GFX9-NEXT:    ; return to shader part epilog
  %not.src2 = xor <4 x i16> %src2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %and0 = and <4 x i16> %src0, %not.src2
  %and1 = and <4 x i16> %src1, %not.src2

  %cast.0 = bitcast <4 x i16> %and0 to i64
  %cast.1 = bitcast <4 x i16> %and1 to i64
  %insert.0 = insertvalue { i64, i64 } undef, i64 %cast.0, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
  ret { i64, i64 } %insert.1
}

define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
; GFX6-LABEL: v_andn2_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v8, 0xffff
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, v0, v8
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_and_b32_e32 v3, v4, v8
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
; GFX6-NEXT:    v_and_b32_e32 v4, v6, v8
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v3
; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
; GFX6-NEXT:    v_and_b32_e32 v2, v1, v3
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_andn2_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
; GFX9-NEXT:    v_and_b32_e32 v0, v0, v2
; GFX9-NEXT:    v_and_b32_e32 v1, v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
  %and = and <4 x i16> %src0, %not.src1
  ret <4 x i16> %and
}