; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDWA,GCN %s

; An i32 add whose second operand is the first one shifted right by 16.
; With the peephole disabled the separate shift and plain add must remain;
; with it enabled the shift is expected to fold into the add as src1_sel:WORD_1.
; GCN-LABEL: {{^}}add_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
; The optional group carries its own leading underscore, so the negative check
; below covers both v_add_u32_sdwa and v_add_co_u32_sdwa.
; NOSDWA-NOT: v_add{{(_co)?}}_u32_sdwa

; VI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9: v_add_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load i32, i32 addrspace(1)* %in, align 4
  %shr = lshr i32 %a, 16
  %add = add i32 %a, %shr
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; An i32 subtract of the loaded value from its own upper half (lshr by 16).
; With the peephole disabled the separate shift must remain; with it enabled
; the shifted operand is expected to become a WORD_1 source select.
; GCN-LABEL: {{^}}sub_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_subrev_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
; The optional group carries its own leading underscore, so the negative check
; below covers both v_subrev_u32_sdwa and v_subrev_co_u32_sdwa.
; NOSDWA-NOT: v_subrev{{(_co)?}}_u32_sdwa

; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9: v_sub_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD

define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load i32, i32 addrspace(1)* %in, align 4
  %shr = lshr i32 %a, 16
  %sub = sub i32 %shr, %a
  store i32 %sub, i32 addrspace(1)* %out, align 4
  ret void
}

; An i32 multiply of two values that are both shifted right by 16 first.
; With the peephole enabled both shifts are expected to fold into the multiply
; as WORD_1 source selects; with it disabled both shifts must remain.
; GCN-LABEL: {{^}}mul_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST0]], v[[DST1]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
  %a = load i32, i32 addrspace(1)* %in1, align 4
  %b = load i32, i32 addrspace(1)* %in2, align 4
  %shra = lshr i32 %a, 16
  %shrb = lshr i32 %b, 16
  %mul = mul i32 %shra, %shrb
  store i32 %mul, i32 addrspace(1)* %out, align 4
  ret void
}

; A scalar i16 multiply. Every configuration is expected to emit a plain
; v_mul_lo_i32 and no SDWA form of v_mul_u32_u24.
; GCN-LABEL: {{^}}mul_i16:
; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
entry:
  %a = load i16, i16 addrspace(1)* %ina, align 4
  %b = load i16, i16 addrspace(1)* %inb, align 4
  %mul = mul i16 %a, %b
  store i16 %mul, i16 addrspace(1)* %out, align 4
  ret void
}

; A <2 x i16> multiply. With the peephole disabled the high halves are expected
; to go through shift / multiply / shift / or. On fiji with the peephole the
; low and high halves are expected as two SDWA multiplies combined by an SDWA
; or; on gfx900 a single packed multiply is expected.
; GCN-LABEL: {{^}}mul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
  %mul = mul <2 x i16> %a, %b
  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; A <4 x i16> multiply. On fiji with the peephole, four SDWA multiplies
; (two low-half, two high-half) combined pairwise by two SDWA ors are expected;
; on gfx900 two packed multiplies are expected.
; GCN-LABEL: {{^}}mul_v4i16:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
entry:
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
  %mul = mul <4 x i16> %a, %b
  store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
  ret void
}

; A <8 x i16> multiply. On fiji with the peephole, eight SDWA multiplies
; combined pairwise by four SDWA ors are expected; on gfx900 four packed
; multiplies are expected.
; GCN-LABEL: {{^}}mul_v8i16:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
entry:
  %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
  %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
  %mul = mul <8 x i16> %a, %b
  store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
  ret void
}

; A scalar half multiply. Every configuration is expected to emit a plain
; v_mul_f16_e32 and no SDWA form of it.
; GCN-LABEL: {{^}}mul_half:
; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa
; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_f16_sdwa

define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
entry:
  %a = load half, half addrspace(1)* %ina, align 4
  %b = load half, half addrspace(1)* %inb, align 4
  %mul = fmul half %a, %b
  store half %mul, half addrspace(1)* %out, align 4
  ret void
}

; A <2 x half> multiply. On fiji with the peephole the high half is expected as
; an SDWA multiply writing WORD_1 and the low half as a plain multiply, joined
; by a plain or; on gfx900 a single packed multiply is expected.
; GCN-LABEL: {{^}}mul_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_f16_sdwa

; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]]

; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
entry:
  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
  %mul = fmul <2 x half> %a, %b
  store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; A <4 x half> multiply. On fiji with the peephole two high-half SDWA
; multiplies and two plain ors are expected; on gfx900 two packed multiplies
; are expected.
; GCN-LABEL: {{^}}mul_v4half:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa

; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
entry:
  %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
  %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
  %mul = fmul <4 x half> %a, %b
  store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
  ret void
}

; A <8 x half> multiply. On fiji with the peephole four high-half SDWA
; multiplies and four plain ors are expected; on gfx900 four packed multiplies
; are expected.
; GCN-LABEL: {{^}}mul_v8half:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa

; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
entry:
  %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
  %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
  %mul = fmul <8 x half> %a, %b
  store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
  ret void
}

; A scalar i8 multiply. Every configuration is expected to emit a plain
; v_mul_lo_i32 and no SDWA form of v_mul_u32_u24.
; GCN-LABEL: {{^}}mul_i8:
; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) {
entry:
  %a = load i8, i8 addrspace(1)* %ina, align 4
  %b = load i8, i8 addrspace(1)* %inb, align 4
  %mul = mul i8 %a, %b
  store i8 %mul, i8 addrspace(1)* %out, align 4
  ret void
}

; A <2 x i8> multiply. With the peephole disabled the second byte is expected
; to go through 16-bit shifts by 8 around the multiply. On fiji with the
; peephole an SDWA multiply selecting BYTE_1 is expected; on gfx900 an SDWA
; 16-bit multiply on BYTE_1, a plain 16-bit multiply and an SDWA or are expected.
; GCN-LABEL: {{^}}mul_v2i8:
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1

; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD

define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
entry:
  %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
  %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
  %mul = mul <2 x i8> %a, %b
  store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4
  ret void
}

; A <4 x i8> multiply. With the peephole enabled at least three SDWA multiplies
; are expected (v_mul_u32_u24_sdwa on fiji, v_mul_lo_u16_sdwa on gfx900); the
; operand selects are deliberately left unchecked here.
; GCN-LABEL: {{^}}mul_v4i8:
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa

; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa

define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
entry:
  %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
  %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
  %mul = mul <4 x i8> %a, %b
  store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; A <8 x i8> multiply. With the peephole enabled at least six SDWA multiplies
; are expected (v_mul_u32_u24_sdwa on fiji, v_mul_lo_u16_sdwa on gfx900); the
; operand selects are deliberately left unchecked here.
; GCN-LABEL: {{^}}mul_v8i8:
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa

; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa

define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
entry:
  %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
  %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
  %mul = mul <8 x i8> %a, %b
  store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4
  ret void
}

; A signed <2 x i16> to <2 x half> conversion. With the peephole disabled the
; halves are expected to be sign-extended by v_bfe_i32 / v_ashrrev_i32 before
; conversion; with it enabled two SDWA conversions with sext() sources on
; WORD_0 and WORD_1 are expected.
; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16:
; NOSDWA-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; NOSDWA-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_cvt_f32_i32_sdwa

; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1

define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x i16> addrspace(1)* %a) {
entry:
  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
  %r.val = sitofp <2 x i16> %a.val to <2 x half>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}


; A <2 x half> multiply followed by an add of one multiplicand. On fiji with
; the peephole an SDWA v_mac_f16 reading WORD_1 of both sources is expected,
; followed by a shift left by 16 of its result; on gfx900 a packed multiply
; followed by a packed add that reuses the multiply source is expected.
; GCN-LABEL: {{^}}mac_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mac_f16_sdwa

; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]

; GFX9: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]]
; GFX9: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]]

define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
entry:
  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
  %mul = fmul <2 x half> %a, %b
  %mac = fadd <2 x half> %mul, %b
  store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; A <2 x i16> multiply by the constant vector <123, 321> (0x7b, 0x141).
; On fiji with the peephole each constant is expected in its own VGPR and used
; as a DWORD-selected src1 of an SDWA multiply; on gfx900 the packed constant
; 0x141007b is expected in an SGPR feeding one packed multiply.
; GCN-LABEL: {{^}}immediate_mul_v2i16:
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; VI-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
; VI-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD

; GFX9: s_mov_b32 s[[IMM:[0-9]+]], 0x141007b
; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, s[[IMM]]

define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
  %mul = mul <2 x i16> %a, <i16 123, i16 321>
  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; Double use of same src - should not convert it
; Two chained <2 x i16> multiplies that both use %b. On fiji with the peephole
; the checked SDWA multiply keeps src1 as a full DWORD; on gfx900 two packed
; multiplies are expected, the second consuming the result of the first.
; GCN-LABEL: {{^}}mulmul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD

; GFX9: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}}

define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
  %mul = mul <2 x i16> %a, %b
  %mul2 = mul <2 x i16> %mul, %b
  store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; A <2 x i16> add placed in its own basic block, separate from the loads and
; the store. On fiji with the peephole an SDWA add on WORD_1 is still expected;
; on gfx900 a packed add is expected.
; GCN-LABEL: {{^}}add_bb_v2i16:
; The optional group carries its own leading underscore, so the negative check
; below covers both v_add_u32_sdwa and v_add_co_u32_sdwa.
; NOSDWA-NOT: v_add{{(_co)?}}_u32_sdwa

; VI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
  br label %add_label
add_label:
  %add = add <2 x i16> %a, %b
  br label %store_label
store_label:
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
  ret void
}


; Check that "pulling out" SDWA operands works correctly.
; The kernel loads an <8 x i8>, splits it into eight scalars, rebuilds it
; through <2 x i8> and <4 x i8> intermediates with identity shuffles, and
; stores the result. With the peephole enabled, SDWA and/or instructions are
; expected alongside plain 16-bit shifts by 8.
; GCN-LABEL: {{^}}pulled_out_test:
; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_and_b32_sdwa
; NOSDWA-NOT: v_or_b32_sdwa

; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD

define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) {
entry:
  ; NOTE(review): `ashr exact` of the constant 15 by 32 shifts out non-zero
  ; bits, which the LangRef defines as poison. Presumably left over from test
  ; reduction; confirm before changing it, since the index feeds both GEPs.
  %idxprom = ashr exact i64 15, 32
  %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
  %tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8

  ; Split the loaded vector into its eight bytes.
  %tmp1 = extractelement <8 x i8> %tmp, i32 0
  %tmp2 = extractelement <8 x i8> %tmp, i32 1
  %tmp3 = extractelement <8 x i8> %tmp, i32 2
  %tmp4 = extractelement <8 x i8> %tmp, i32 3
  %tmp5 = extractelement <8 x i8> %tmp, i32 4
  %tmp6 = extractelement <8 x i8> %tmp, i32 5
  %tmp7 = extractelement <8 x i8> %tmp, i32 6
  %tmp8 = extractelement <8 x i8> %tmp, i32 7

  ; Re-pair adjacent bytes into four <2 x i8> vectors.
  %tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0
  %tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1
  %tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0
  %tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1
  %tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0
  %tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1
  %tmp15 = insertelement <2 x i8> undef, i8 %tmp7, i32 0
  %tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1

  ; Concatenate the pairs back into the original element order.
  %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

  %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
  store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
  ret void
}

; An infinite loop in which an inline-asm v_and_b32 defines the value that
; feeds an `or` with 65536. All three configurations are expected to emit the
; same s_mov / v_and / v_or sequence.
; NOTE(review): judging by the name, presumably a regression test for a crash
; in the SDWA peephole when the defining instruction is inline asm - confirm.
; GCN-LABEL: {{^}}sdwa_crash_inlineasm_def:
; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff
; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x10000,
define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
bb:
  br label %bb1

bb1:                                              ; preds = %bb11, %bb
  %tmp = phi <2 x i32> [ %tmp12, %bb11 ], [ undef, %bb ]
  br i1 true, label %bb2, label %bb11

bb2:                                              ; preds = %bb1
  %tmp3 = call i32 asm "v_and_b32_e32 $0, $1, $2", "=v,s,v"(i32 65535, i32 undef) #1
  %tmp5 = or i32 %tmp3, 65536
  %tmp6 = insertelement <2 x i32> %tmp, i32 %tmp5, i64 0
  br label %bb11

bb11:                                             ; preds = %bb2, %bb1
  %tmp12 = phi <2 x i32> [ %tmp6, %bb2 ], [ %tmp, %bb1 ]
  br label %bb1
}
