1; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s
2; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,SDWA,GCN %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_10,SDWA,GCN %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX9_10,SDWA,GCN %s
5
; Kernel add_shr_i32 adds a loaded i32 to its own upper half (the value
; shifted right by 16). With the peephole disabled the shift and the add must
; stay separate and no SDWA add may appear; with it enabled the shift is
; expected to fold into the add as src1_sel:WORD_1.
; The negative check accepts both the plain and the _co spelling of the add.
6; GCN-LABEL: {{^}}add_shr_i32:
7; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
8; NOSDWA: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
9; NOSDWA-NOT: v_add{{(_co)?}}_u32_sdwa
10
11; VI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12; GFX9: v_add_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13; GFX10: v_add_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
14
15define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
16  %a = load i32, i32 addrspace(1)* %in, align 4
17  %shr = lshr i32 %a, 16
18  %add = add i32 %a, %shr
19  store i32 %add, i32 addrspace(1)* %out, align 4
20  ret void
21}
22
; Kernel sub_shr_i32 subtracts a loaded i32 from its own upper half
; (%shr - %a, where %shr is the value shifted right by 16). With the peephole
; disabled no SDWA subtract may appear; with it enabled the shift is expected
; to fold into the subtract as a WORD_1 source select.
; The negative check accepts both the plain and the _co spelling of subrev.
23; GCN-LABEL: {{^}}sub_shr_i32:
24; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
25; NOSDWA: v_subrev_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
26; NOSDWA-NOT: v_subrev{{(_co)?}}_u32_sdwa
27
28; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
29; GFX9: v_sub_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
30; GFX10: v_sub_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
31define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
32  %a = load i32, i32 addrspace(1)* %in, align 4
33  %shr = lshr i32 %a, 16
34  %sub = sub i32 %shr, %a
35  store i32 %sub, i32 addrspace(1)* %out, align 4
36  ret void
37}
38
39; GCN-LABEL: {{^}}mul_shr_i32:
40; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
41; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
42; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST0]], v[[DST1]]
43; NOSDWA-NOT: v_mul_u32_u24_sdwa
44
45; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
46
; Multiplies the upper 16 bits of two loaded i32 values (each shifted right
; by 16) and stores the i32 product to %out.
47define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) #0 {
48  %a = load i32, i32 addrspace(1)* %in1, align 4
49  %b = load i32, i32 addrspace(1)* %in2, align 4
50  %shra = lshr i32 %a, 16
51  %shrb = lshr i32 %b, 16
52  %mul = mul i32 %shra, %shrb
53  store i32 %mul, i32 addrspace(1)* %out, align 4
54  ret void
55}
56
57; GCN-LABEL: {{^}}mul_i16:
58; NOSDWA: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
59; NOSDWA-NOT: v_mul_u32_u24_sdwa
60; GFX89: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
61; GFX10: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
62; SDWA-NOT: v_mul_u32_u24_sdwa
63
; Scalar i16 multiply of two loaded values. The checks above expect that no
; v_mul_u32_u24_sdwa is formed for this case in any configuration.
64define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) #0 {
65entry:
66  %a = load i16, i16 addrspace(1)* %ina, align 4
67  %b = load i16, i16 addrspace(1)* %inb, align 4
68  %mul = mul i16 %a, %b
69  store i16 %mul, i16 addrspace(1)* %out, align 4
70  ret void
71}
72
73; GCN-LABEL: {{^}}mul_v2i16:
74; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
75; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
76; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
77; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
78; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
79; NOSDWA-NOT: v_mul_u32_u24_sdwa
80
81; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
82; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
83; VI: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
84
85; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
86
; Element-wise multiply of two loaded <2 x i16> vectors; the <2 x i16> result
; is stored to %out.
87define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
88entry:
89  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
90  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
91  %mul = mul <2 x i16> %a, %b
92  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
93  ret void
94}
95
96; GCN-LABEL: {{^}}mul_v4i16:
97; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
98; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
99; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
100; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
101; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
102; NOSDWA-NOT: v_mul_u32_u24_sdwa
103
104; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
105; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
106; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
107; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
108; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
109; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
110
111; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
112; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
113
; Element-wise multiply of two loaded <4 x i16> vectors; the <4 x i16> result
; is stored to %out.
114define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) #0 {
115entry:
116  %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
117  %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
118  %mul = mul <4 x i16> %a, %b
119  store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
120  ret void
121}
122
123; GCN-LABEL: {{^}}mul_v8i16:
124; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
125; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
126; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
127; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
128; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
129; NOSDWA-NOT: v_mul_u32_u24_sdwa
130
131; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
132; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
133; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
134; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
135; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
136; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
137; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
138; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
139; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
140; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
141; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
142; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
143
144; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
145; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
146; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
147; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
148
; Element-wise multiply of two loaded <8 x i16> vectors; the <8 x i16> result
; is stored to %out.
149define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) #0 {
150entry:
151  %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
152  %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
153  %mul = mul <8 x i16> %a, %b
154  store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
155  ret void
156}
157
158; GCN-LABEL: {{^}}mul_half:
159; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
160; NOSDWA-NOT: v_mul_f16_sdwa
161; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
162; SDWA-NOT: v_mul_f16_sdwa
163
; Scalar half-precision fmul of two loaded values. The checks above expect the
; plain e32 multiply and no v_mul_f16_sdwa in any configuration.
164define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) #0 {
165entry:
166  %a = load half, half addrspace(1)* %ina, align 4
167  %b = load half, half addrspace(1)* %inb, align 4
168  %mul = fmul half %a, %b
169  store half %mul, half addrspace(1)* %out, align 4
170  ret void
171}
172
173; GCN-LABEL: {{^}}mul_v2half:
174; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
175; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
176; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
177; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
178; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
179; NOSDWA-NOT: v_mul_f16_sdwa
180
181; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
182; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
183; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]]
184
185; GFX9_10: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
186
; Element-wise fmul of two loaded <2 x half> vectors; the <2 x half> result
; is stored to %out.
187define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 {
188entry:
189  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
190  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
191  %mul = fmul <2 x half> %a, %b
192  store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
193  ret void
194}
195
196; GCN-LABEL: {{^}}mul_v4half:
197; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
198; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
199; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
200; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
201; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
202; NOSDWA-NOT: v_mul_f16_sdwa
203
204; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
205; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
206; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
207; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
208
209; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
210; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
211
; Element-wise fmul of two loaded <4 x half> vectors; the <4 x half> result
; is stored to %out.
212define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) #0 {
213entry:
214  %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
215  %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
216  %mul = fmul <4 x half> %a, %b
217  store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
218  ret void
219}
220
221; GCN-LABEL: {{^}}mul_v8half:
222; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
223; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
224; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
225; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
226; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
227; NOSDWA-NOT: v_mul_f16_sdwa
228
229; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
230; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
231; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
232; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
233; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
234; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
235; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
236; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
237
238; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
239; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
240; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
241; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
242
; Element-wise fmul of two loaded <8 x half> vectors; the <8 x half> result
; is stored to %out.
243define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) #0 {
244entry:
245  %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
246  %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
247  %mul = fmul <8 x half> %a, %b
248  store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
249  ret void
250}
251
252; GCN-LABEL: {{^}}mul_i8:
253; NOSDWA: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
254; NOSDWA-NOT: v_mul_u32_u24_sdwa
255; GFX89: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
256; GFX10: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
257; SDWA-NOT: v_mul_u32_u24_sdwa
258
; Scalar i8 multiply of two loaded values. The checks above expect that no
; v_mul_u32_u24_sdwa is formed for this case in any configuration.
259define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) #0 {
260entry:
261  %a = load i8, i8 addrspace(1)* %ina, align 4
262  %b = load i8, i8 addrspace(1)* %inb, align 4
263  %mul = mul i8 %a, %b
264  store i8 %mul, i8 addrspace(1)* %out, align 4
265  ret void
266}
267
268; GCN-LABEL: {{^}}mul_v2i8:
269; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
270; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
271; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
272; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
273; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
274; NOSDWA-NOT: v_mul_u32_u24_sdwa
275
276; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
277
278; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
279; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
280
281; GFX10-DAG: v_mul_lo_u16_e64
282; GFX10-DAG: v_mul_lo_u16_e64
283
284; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
285
286; GFX10: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, v
287; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; Element-wise multiply of two loaded <2 x i8> vectors; the <2 x i8> result
; is stored to %out.
288define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) #0 {
289entry:
290  %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
291  %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
292  %mul = mul <2 x i8> %a, %b
293  store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4
294  ret void
295}
296
297; GCN-LABEL: {{^}}mul_v4i8:
298; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
299; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
300; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
301; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
302; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
303; NOSDWA-NOT: v_mul_u32_u24_sdwa
304
305; VI-DAG: v_mul_u32_u24_sdwa
306; VI-DAG: v_mul_u32_u24_sdwa
307; VI-DAG: v_mul_u32_u24_sdwa
308
309; GFX9-DAG: v_mul_lo_u16_sdwa
310; GFX9-DAG: v_mul_lo_u16_sdwa
311; GFX9-DAG: v_mul_lo_u16_sdwa
312
313; GFX10-DAG: v_mul_lo_u16_e64
314; GFX10-DAG: v_mul_lo_u16_e64
315; GFX10-DAG: v_mul_lo_u16_e64
316; GFX10-DAG: v_mul_lo_u16_e64
317
; Element-wise multiply of two loaded <4 x i8> vectors; the <4 x i8> result
; is stored to %out.
318define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) #0 {
319entry:
320  %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
321  %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
322  %mul = mul <4 x i8> %a, %b
323  store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4
324  ret void
325}
326
327; GCN-LABEL: {{^}}mul_v8i8:
328; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
329; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
330; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
331; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
332; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
333; NOSDWA-NOT: v_mul_u32_u24_sdwa
334
335; VI-DAG: v_mul_u32_u24_sdwa
336; VI-DAG: v_mul_u32_u24_sdwa
337; VI-DAG: v_mul_u32_u24_sdwa
338; VI-DAG: v_mul_u32_u24_sdwa
339; VI-DAG: v_mul_u32_u24_sdwa
340; VI-DAG: v_mul_u32_u24_sdwa
341
342; GFX9-DAG: v_mul_lo_u16_sdwa
343; GFX9-DAG: v_mul_lo_u16_sdwa
344; GFX9-DAG: v_mul_lo_u16_sdwa
345; GFX9-DAG: v_mul_lo_u16_sdwa
346; GFX9-DAG: v_mul_lo_u16_sdwa
347; GFX9-DAG: v_mul_lo_u16_sdwa
348
349; GFX10-DAG: v_mul_lo_u16_e64
350; GFX10-DAG: v_mul_lo_u16_e64
351; GFX10-DAG: v_mul_lo_u16_e64
352; GFX10-DAG: v_mul_lo_u16_e64
353; GFX10-DAG: v_mul_lo_u16_e64
354; GFX10-DAG: v_mul_lo_u16_e64
355; GFX10-DAG: v_mul_lo_u16_e64
356; GFX10-DAG: v_mul_lo_u16_e64
357
; Element-wise multiply of two loaded <8 x i8> vectors; the <8 x i8> result
; is stored to %out.
358define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) #0 {
359entry:
360  %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
361  %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
362  %mul = mul <8 x i8> %a, %b
363  store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4
364  ret void
365}
366
367; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16:
368; NOSDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}
369; NOSDWA-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
370; NOSDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}
371; NOSDWA-NOT: v_cvt_f16_i16_sdwa
372
373; SDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}
374; SDWA-DAG: v_cvt_f16_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}} dst_sel:{{(WORD_1|DWORD)?}} dst_unused:UNUSED_PAD src0_sel:WORD_1
375
376; FIXME: Should be able to avoid the or instruction.
; Loads a <2 x i16> from %a, converts each element to half with sitofp, and
; stores the <2 x half> result to %r. Unlike the other kernels in this file,
; the load and store here carry no explicit alignment.
377define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
378    <2 x half> addrspace(1)* %r,
379    <2 x i16> addrspace(1)* %a) #0 {
380entry:
381  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
382  %r.val = sitofp <2 x i16> %a.val to <2 x half>
383  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
384  ret void
385}
386
387
388; GCN-LABEL: {{^}}mac_v2half:
389; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
390; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
391; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]]
392; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
393; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
394; NOSDWA-NOT: v_mac_f16_sdwa
395
396; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
397; VI: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
398
399; GFX9_10: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]]
400; GFX9_10: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]]
401
; Computes (%a * %b) + %b on <2 x half>: an fmul whose result feeds an fadd
; that reuses %b as the addend. The result is stored to %out.
402define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 {
403entry:
404  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
405  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
406  %mul = fmul <2 x half> %a, %b
407  %mac = fadd <2 x half> %mul, %b
408  store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
409  ret void
410}
411
412; GCN-LABEL: {{^}}immediate_mul_v2i16:
413; NOSDWA-NOT: v_mul_u32_u24_sdwa
414; VI-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
415; VI-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
416; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
417; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
418
419; GFX9: s_mov_b32 s[[IMM:[0-9]+]], 0x141007b
420; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, s[[IMM]]
421
422; GFX10: v_pk_mul_lo_u16 v{{[0-9]+}}, 0x141007b, v{{[0-9]+}}
423
; Multiplies a loaded <2 x i16> by the constant vector <123, 321> and stores
; the result to %out.
424define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
425entry:
426  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
; 123 is 0x7b and 321 is 0x141, the immediates the checks above look for
; (0x141007b when both lanes are packed into one 32-bit value).
427  %mul = mul <2 x i16> %a, <i16 123, i16 321>
428  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
429  ret void
430}
431
432; Double use of same src - should not convert it
433; GCN-LABEL: {{^}}mulmul_v2i16:
434; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
435; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
436; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
437; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
438; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
439; NOSDWA-NOT: v_mul_u32_u24_sdwa
440
441; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
442
443; GFX9_10: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
444; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}}
445
; Two chained <2 x i16> multiplies: %mul = %a * %b, then %mul2 = %mul * %b.
; %b is an operand of both multiplies, which is the "double use of same src"
; case named in the comment above the checks.
446define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
447entry:
448  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
449  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
450  %mul = mul <2 x i16> %a, %b
451  %mul2 = mul <2 x i16> %mul, %b
452  store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4
453  ret void
454}
455
; Kernel add_bb_v2i16 performs a <2 x i16> add whose operands are loaded in
; the entry block, while the add and the store each sit in their own basic
; block, so operands and result cross block boundaries in the IR.
; With the peephole disabled no SDWA add may appear; the negative check
; accepts both the plain and the _co spelling of the add.
456; GCN-LABEL: {{^}}add_bb_v2i16:
457; NOSDWA-NOT: v_add{{(_co)?}}_u32_sdwa
458
459; VI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
460
461; GFX9_10: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
462
463define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
464entry:
465  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
466  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
467  br label %add_label
468add_label:
469  %add = add <2 x i16> %a, %b
470  br label %store_label
471store_label:
472  store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
473  ret void
474}
475
476
477; Check that "pulling out" SDWA operands works correctly.
478; GCN-LABEL: {{^}}pulled_out_test:
479; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
480; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
481; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
482; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
483; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
484; NOSDWA-NOT: v_and_b32_sdwa
485; NOSDWA-NOT: v_or_b32_sdwa
486
487; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
488; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
489; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
490;
491; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
492;
493; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
494; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
495; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
496;
497; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
498;
499; GFX89: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
500;
501; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
502; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
503; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
504; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
505
; Loads an <8 x i8> from %sourceA, takes it apart into its eight bytes,
; rebuilds them as four <2 x i8> pairs, concatenates the pairs back into an
; <8 x i8> in the original element order, and stores that to %destValues.
; The value stored is therefore element-for-element the value loaded; the
; extract/insert/shuffle chain exists only to shape the code being compiled.
506define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) #0 {
507entry:
; NOTE(review): both operands are constants, and shifting 15 right by 32
; discards set bits, which the `exact` flag makes poison per the LangRef.
; Presumably left over from reducing a larger test case - confirm intent.
508  %idxprom = ashr exact i64 15, 32
509  %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
510  %tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8
511
; Split the loaded vector into its eight individual bytes.
512  %tmp1 = extractelement <8 x i8> %tmp, i32 0
513  %tmp2 = extractelement <8 x i8> %tmp, i32 1
514  %tmp3 = extractelement <8 x i8> %tmp, i32 2
515  %tmp4 = extractelement <8 x i8> %tmp, i32 3
516  %tmp5 = extractelement <8 x i8> %tmp, i32 4
517  %tmp6 = extractelement <8 x i8> %tmp, i32 5
518  %tmp7 = extractelement <8 x i8> %tmp, i32 6
519  %tmp8 = extractelement <8 x i8> %tmp, i32 7
520
; Re-pair adjacent bytes: (0,1), (2,3), (4,5), (6,7).
521  %tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0
522  %tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1
523  %tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0
524  %tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1
525  %tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0
526  %tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1
527  %tmp15 = insertelement <2 x i8> undef, i8 %tmp7, i32 0
528  %tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1
529
; Concatenate pairs into two <4 x i8> halves, then into the full <8 x i8>;
; every shuffle mask is the identity sequence, so element order is kept.
530  %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
531  %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
532  %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
533
; The destination is indexed with the same %idxprom as the source.
534  %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
535  store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
536  ret void
537}
538
539; GCN-LABEL: {{^}}sdwa_crash_inlineasm_def:
540; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff
541; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
542;
543; TODO: Why is the constant not peepholed into the v_or_b32_e32?
544;
545; NOSDWA: s_mov_b32 [[CONST:s[0-9]+]], 0x10000
546; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, s0,
547; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000,
; Endless loop (bb1 -> bb2 -> bb11 -> bb1, no ret) in which a value defined by
; inline asm is OR-ed with 65536 (0x10000), inserted into element 0 of a
; <2 x i32> carried around the loop, and stored with a volatile store.
; Judging by the name this is a regression test for a crash when an SDWA
; candidate operand is defined by inline asm - confirm against history.
548define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
549bb:
550  br label %bb1
551
552bb1:                                              ; preds = %bb11, %bb
553  %tmp = phi <2 x i32> [ %tmp12, %bb11 ], [ undef, %bb ]
; The condition is the constant true, so the bb11 edge is never taken from here.
554  br i1 true, label %bb2, label %bb11
555
556bb2:                                              ; preds = %bb1
; The asm receives 65535 (0xffff) in an SGPR and an undef VGPR input.
; NOTE(review): attribute group #1 is not defined in the visible part of this
; file; only #0 is. Confirm it is intentional.
557  %tmp3 = call i32 asm "v_and_b32_e32 $0, $1, $2", "=v,s,v"(i32 65535, i32 undef) #1
558  %tmp5 = or i32 %tmp3, 65536
559  %tmp6 = insertelement <2 x i32> %tmp, i32 %tmp5, i64 0
560  br label %bb11
561
562bb11:                                             ; preds = %bb1, %bb2
563  %tmp12 = phi <2 x i32> [ %tmp6, %bb2 ], [ %tmp, %bb1 ]
; The store target is an undef pointer; volatile keeps the store from being dropped.
564  store volatile <2 x i32> %tmp12, <2 x i32> addrspace(1)* undef
565  br label %bb1
566}
567
; Attribute group applied to every kernel visible in this file. It sets the
; "denormal-fp-math" string attribute to preserve-sign for both comma-separated
; fields (output mode, then input mode, per the LangRef).
568attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
569