; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,SDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_10,SDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX9_10,SDWA,GCN %s

; Tests for the AMDGPU SDWA peephole. Four configurations are checked:
;   * fiji with the peephole disabled (prefixes NOSDWA, GCN)
;   * fiji with the peephole enabled (prefixes VI, GFX89, SDWA, GCN)
;   * gfx900 with the peephole enabled (prefixes GFX9, GFX9_10, SDWA, GCN)
;   * gfx1010 with the peephole enabled (prefixes GFX10, GFX9_10, SDWA, GCN)
; Note that the GFX89 prefix is only enabled on the fiji run line.

; 32-bit add whose second operand is the first operand shifted right by 16.
; GCN-LABEL: {{^}}add_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
; The optional _co infix is spelled without an extra underscore so that this
; negative check can actually match a real mnemonic.
; NOSDWA-NOT: v_add{{(_co)?}}_u32_sdwa

; VI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9: v_add_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10: v_add_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %a = load i32, i32 addrspace(1)* %in, align 4
  %shr = lshr i32 %a, 16
  %add = add i32 %a, %shr
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; 32-bit sub whose first operand is the second operand shifted right by 16.
; GCN-LABEL: {{^}}sub_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_subrev_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
; The optional _co infix is spelled without an extra underscore so that this
; negative check can actually match a real mnemonic.
; NOSDWA-NOT: v_subrev{{(_co)?}}_u32_sdwa

; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9: v_sub_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10: v_sub_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %a = load i32, i32 addrspace(1)* %in, align 4
  %shr = lshr i32 %a, 16
  %sub = sub i32 %shr, %a
  store i32 %sub, i32 addrspace(1)* %out, align 4
  ret void
}

; 32-bit mul of two values that are both shifted right by 16.
; GCN-LABEL: {{^}}mul_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST0]], v[[DST1]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) #0 {
  %a = load i32, i32 addrspace(1)* %in1, align 4
  %b = load i32, i32 addrspace(1)* %in2, align 4
  %shra = lshr i32 %a, 16
  %shrb = lshr i32 %b, 16
  %mul = mul i32 %shra, %shrb
  store i32 %mul, i32 addrspace(1)* %out, align 4
  ret void
}

; Scalar i16 mul; no SDWA multiply is expected in any configuration.
; GCN-LABEL: {{^}}mul_i16:
; NOSDWA: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; GFX89: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX10: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) #0 {
entry:
  %a = load i16, i16 addrspace(1)* %ina, align 4
  %b = load i16, i16 addrspace(1)* %inb, align 4
  %mul = mul i16 %a, %b
  store i16 %mul, i16 addrspace(1)* %out, align 4
  ret void
}

; <2 x i16> mul.
; GCN-LABEL: {{^}}mul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
  %mul = mul <2 x i16> %a, %b
  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; <4 x i16> mul.
; GCN-LABEL: {{^}}mul_v4i16:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) #0 {
entry:
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
  %mul = mul <4 x i16> %a, %b
  store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
  ret void
}

; <8 x i16> mul.
; GCN-LABEL: {{^}}mul_v8i16:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) #0 {
entry:
  %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
  %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
  %mul = mul <8 x i16> %a, %b
  store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
  ret void
}

; Scalar half fmul; no SDWA multiply is expected in any configuration.
; GCN-LABEL: {{^}}mul_half:
; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa
; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_f16_sdwa

define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) #0 {
entry:
  %a = load half, half addrspace(1)* %ina, align 4
  %b = load half, half addrspace(1)* %inb, align 4
  %mul = fmul half %a, %b
  store half %mul, half addrspace(1)* %out, align 4
  ret void
}

; <2 x half> fmul.
; GCN-LABEL: {{^}}mul_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_f16_sdwa

; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]]

; GFX9_10: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 {
entry:
  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
  %mul = fmul <2 x half> %a, %b
  store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; <4 x half> fmul.
; GCN-LABEL: {{^}}mul_v4half:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa

; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) #0 {
entry:
  %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
  %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
  %mul = fmul <4 x half> %a, %b
  store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
  ret void
}

; <8 x half> fmul.
; GCN-LABEL: {{^}}mul_v8half:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa

; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) #0 {
entry:
  %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
  %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
  %mul = fmul <8 x half> %a, %b
  store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
  ret void
}

; Scalar i8 mul; no SDWA multiply is expected in any configuration.
; GCN-LABEL: {{^}}mul_i8:
; NOSDWA: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; GFX89: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX10: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) #0 {
entry:
  %a = load i8, i8 addrspace(1)* %ina, align 4
  %b = load i8, i8 addrspace(1)* %inb, align 4
  %mul = mul i8 %a, %b
  store i8 %mul, i8 addrspace(1)* %out, align 4
  ret void
}

; <2 x i8> mul.
; GCN-LABEL: {{^}}mul_v2i8:
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1

; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64

; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD

; GFX10: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, v
; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) #0 {
entry:
  %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
  %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
  %mul = mul <2 x i8> %a, %b
  store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4
  ret void
}

; <4 x i8> mul.
; GCN-LABEL: {{^}}mul_v4i8:
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa

; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa

; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64

define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) #0 {
entry:
  %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
  %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
  %mul = mul <4 x i8> %a, %b
  store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; <8 x i8> mul.
; GCN-LABEL: {{^}}mul_v8i8:
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa

; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa

; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64

define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) #0 {
entry:
  %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
  %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
  %mul = mul <8 x i8> %a, %b
  store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4
  ret void
}

; Signed <2 x i16> to <2 x half> conversion.
; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16:
; NOSDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_cvt_f16_i16_sdwa

; SDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-DAG: v_cvt_f16_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}} dst_sel:{{(WORD_1|DWORD)?}} dst_unused:UNUSED_PAD src0_sel:WORD_1

; FIXME: Should be able to avoid or
define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x i16> addrspace(1)* %a) #0 {
entry:
  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
  %r.val = sitofp <2 x i16> %a.val to <2 x half>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}


; <2 x half> fmul followed by an fadd that reuses one of the fmul operands.
; GCN-LABEL: {{^}}mac_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mac_f16_sdwa

; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]

; GFX9_10: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]]
; GFX9_10: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]]

define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 {
entry:
  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
  %mul = fmul <2 x half> %a, %b
  %mac = fadd <2 x half> %mul, %b
  store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; <2 x i16> mul by the constant vector <123, 321> (0x7b and 0x141).
; GCN-LABEL: {{^}}immediate_mul_v2i16:
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; VI-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
; VI-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD

; GFX9: s_mov_b32 s[[IMM:[0-9]+]], 0x141007b
; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, s[[IMM]]

; GFX10: v_pk_mul_lo_u16 v{{[0-9]+}}, 0x141007b, v{{[0-9]+}}

define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
  %mul = mul <2 x i16> %a, <i16 123, i16 321>
  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; Double use of same src - should not convert it
; GCN-LABEL: {{^}}mulmul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD

; GFX9_10: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}}

define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
  %mul = mul <2 x i16> %a, %b
  %mul2 = mul <2 x i16> %mul, %b
  store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; <2 x i16> add where the loads, the add and the store sit in separate blocks.
; GCN-LABEL: {{^}}add_bb_v2i16:
; The optional _co infix is spelled without an extra underscore so that this
; negative check can actually match a real mnemonic.
; NOSDWA-NOT: v_add{{(_co)?}}_u32_sdwa

; VI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

; GFX9_10: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
  br label %add_label
add_label:
  %add = add <2 x i16> %a, %b
  br label %store_label
store_label:
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
  ret void
}


; Check that "pulling out" SDWA operands works correctly.
; GCN-LABEL: {{^}}pulled_out_test:
; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_and_b32_sdwa
; NOSDWA-NOT: v_or_b32_sdwa

; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
;
; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
;
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
;
; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
;
; GFX89: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
;
; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD

define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) #0 {
entry:
  %idxprom = ashr exact i64 15, 32
  %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
  %tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8

  ; Split the loaded vector into its eight bytes.
  %tmp1 = extractelement <8 x i8> %tmp, i32 0
  %tmp2 = extractelement <8 x i8> %tmp, i32 1
  %tmp3 = extractelement <8 x i8> %tmp, i32 2
  %tmp4 = extractelement <8 x i8> %tmp, i32 3
  %tmp5 = extractelement <8 x i8> %tmp, i32 4
  %tmp6 = extractelement <8 x i8> %tmp, i32 5
  %tmp7 = extractelement <8 x i8> %tmp, i32 6
  %tmp8 = extractelement <8 x i8> %tmp, i32 7

  ; Rebuild four <2 x i8> pairs from consecutive bytes.
  %tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0
  %tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1
  %tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0
  %tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1
  %tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0
  %tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1
  %tmp15 = insertelement <2 x i8> undef, i8 %tmp7, i32 0
  %tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1

  ; Concatenate the pairs back into a single <8 x i8> in the original order.
  %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

  %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
  store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
  ret void
}

; An inline asm v_and_b32_e32 result is or'ed with 65536 inside a loop.
; GCN-LABEL: {{^}}sdwa_crash_inlineasm_def:
; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff
; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
;
; TODO: Why is the constant not peepholed into the v_or_b32_e32?
;
; The v_or operand is tied to the register captured from the s_mov instead of
; a hard-coded s0, so the check does not depend on register allocation.
; NOSDWA: s_mov_b32 [[CONST:s[0-9]+]], 0x10000
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, [[CONST]],
; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000,
define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
bb:
  br label %bb1

bb1:                                              ; preds = %bb11, %bb
  %tmp = phi <2 x i32> [ %tmp12, %bb11 ], [ undef, %bb ]
  br i1 true, label %bb2, label %bb11

bb2:                                              ; preds = %bb1
  %tmp3 = call i32 asm "v_and_b32_e32 $0, $1, $2", "=v,s,v"(i32 65535, i32 undef) #1
  %tmp5 = or i32 %tmp3, 65536
  %tmp6 = insertelement <2 x i32> %tmp, i32 %tmp5, i64 0
  br label %bb11

bb11:                                             ; preds = %bb10, %bb2
  %tmp12 = phi <2 x i32> [ %tmp6, %bb2 ], [ %tmp, %bb1 ]
  store volatile <2 x i32> %tmp12, <2 x i32> addrspace(1)* undef
  br label %bb1
}

attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }