; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s

; idot8_acc32: signed dot product of two <8 x i4> vectors with an i32
; accumulator. Every i4 lane of %src1 and %src2 is sign-extended to i32, the
; lanes are multiplied pairwise, and the eight products are added to the i32
; value loaded from %dst, which is then stored back to %dst.
; Per the assertions below, the gfx906 and gfx10xx runs select a single
; v_dot8_i32_i4, while the gfx700, gfx803 and gfx900 runs expand to a chain of
; s_bfe_i32 / v_mad_i32_i24 instructions.
define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc32:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; GFX7-NEXT: s_mov_b32 s26, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
; GFX7-NEXT: s_add_u32 s24, s24, s3
; GFX7-NEXT: s_addc_u32 s25, s25, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000
; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
; GFX7-NEXT: s_bfe_i32 s9, s5, 0x40004
; GFX7-NEXT: v_mov_b32_e32 v0, s7
; GFX7-NEXT: v_mov_b32_e32 v1, s20
; GFX7-NEXT: v_mad_i32_i24 v0, s6, v0, v1
; GFX7-NEXT: s_bfe_i32 s8, s4, 0x40004
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: s_bfe_i32 s11, s5, 0x40008
; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008
; GFX7-NEXT: v_mov_b32_e32 v1, s11
; GFX7-NEXT: s_bfe_i32 s13, s5, 0x4000c
; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0
; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40010
; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0
; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40010
; GFX7-NEXT: v_mov_b32_e32 v1, s15
; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40014
; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40018
; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0
; GFX7-NEXT: s_bfe_i32 s16, s4, 0x40014
; GFX7-NEXT: v_mov_b32_e32 v1, s17
; GFX7-NEXT: s_bfe_i32 s18, s4, 0x40018
; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s19
; GFX7-NEXT: s_ashr_i32 s5, s5, 28
; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0
; GFX7-NEXT: s_ashr_i32 s4, s4, 28
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot8_acc32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s22, -1
; GFX8-NEXT: s_mov_b32 s23, 0xe80000
; GFX8-NEXT: s_add_u32 s20, s20, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX8-NEXT: s_addc_u32 s21, s21, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000
; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000
; GFX8-NEXT: s_bfe_i32 s7, s3, 0x40004
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s18
; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1
; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: s_bfe_i32 s9, s3, 0x40008
; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0
; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: s_bfe_i32 s11, s3, 0x4000c
; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0
; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40010
; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0
; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40018
; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0
; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX8-NEXT: v_mov_b32_e32 v1, s15
; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NEXT: s_ashr_i32 s3, s3, 28
; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0
; GFX8-NEXT: s_ashr_i32 s2, s2, 28
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: idot8_acc32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_add_u32 s20, s20, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000
; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000
; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mad_i32_i24 v1, s4, v1, v2
; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008
; GFX9-NEXT: v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v2, s9
; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c
; GFX9-NEXT: v_mad_i32_i24 v1, s8, v2, v1
; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010
; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1
; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX9-NEXT: v_mov_b32_e32 v2, s13
; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018
; GFX9-NEXT: v_mad_i32_i24 v1, s12, v2, v1
; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX9-NEXT: v_mov_b32_e32 v2, s15
; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX9-NEXT: v_mad_i32_i24 v1, s14, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s17
; GFX9-NEXT: s_ashr_i32 s3, s3, 28
; GFX9-NEXT: v_mad_i32_i24 v1, s16, v2, v1
; GFX9-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_mov_b32 s10, -1
; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2
; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc32:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_dot8_i32_i4 v0, s0, s1, v0
; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2

  ; For each lane 0..7: extract the i4 element from both vectors, sign-extend
  ; each to i32 and multiply the pair.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; Add the eight products to the accumulator loaded from %dst, then store the
  ; sum back to %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  store i32 %add8, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Once the unnecessary zero extensions of the elements are removed, the
; pattern recognizer will kick in.
; idot8_acc16: signed dot product of two <8 x i4> vectors with an i16
; accumulator. Every i4 lane of %src1 and %src2 is sign-extended to i16, the
; lanes are multiplied pairwise, and the eight products are added to the i16
; value loaded from %dst, which is then stored back to %dst.
; Per the assertions below, none of the tested subtargets selects
; v_dot8_i32_i4 for this function; all of them emit a multiply-add chain.
define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; GFX7-NEXT: s_mov_b32 s26, -1
; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
; GFX7-NEXT: s_add_u32 s24, s24, s3
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_addc_u32 s25, s25, 0
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000
; GFX7-NEXT: s_bfe_i32 s10, s5, 0x40004
; GFX7-NEXT: s_and_b32 s7, s7, s8
; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40004
; GFX7-NEXT: s_bfe_i32 s12, s5, 0x40008
; GFX7-NEXT: s_and_b32 s10, s10, s8
; GFX7-NEXT: s_and_b32 s6, s6, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008
; GFX7-NEXT: s_bfe_i32 s14, s5, 0x4000c
; GFX7-NEXT: s_and_b32 s12, s12, s8
; GFX7-NEXT: s_and_b32 s9, s9, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: s_bfe_i32 s13, s4, 0x4000c
; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40010
; GFX7-NEXT: s_and_b32 s14, s14, s8
; GFX7-NEXT: s_and_b32 s11, s11, s8
; GFX7-NEXT: v_mov_b32_e32 v3, s12
; GFX7-NEXT: s_bfe_i32 s15, s4, 0x40010
; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40014
; GFX7-NEXT: s_and_b32 s16, s16, s8
; GFX7-NEXT: s_and_b32 s13, s13, s8
; GFX7-NEXT: v_mov_b32_e32 v4, s14
; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40018
; GFX7-NEXT: s_bfe_i32 s17, s4, 0x40014
; GFX7-NEXT: s_and_b32 s18, s18, s8
; GFX7-NEXT: s_and_b32 s15, s15, s8
; GFX7-NEXT: v_mov_b32_e32 v5, s16
; GFX7-NEXT: s_bfe_i32 s19, s4, 0x40018
; GFX7-NEXT: s_ashr_i32 s5, s5, 28
; GFX7-NEXT: s_and_b32 s20, s20, s8
; GFX7-NEXT: s_and_b32 s17, s17, s8
; GFX7-NEXT: v_mov_b32_e32 v6, s18
; GFX7-NEXT: s_ashr_i32 s4, s4, 28
; GFX7-NEXT: s_and_b32 s19, s19, s8
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: v_mov_b32_e32 v7, s20
; GFX7-NEXT: s_and_b32 s4, s4, s8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s13, v4, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s15, v5, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s17, v6, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s19, v7, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot8_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s18, -1
; GFX8-NEXT: s_mov_b32 s19, 0xe80000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_add_u32 s16, s16, s3
; GFX8-NEXT: s_addc_u32 s17, s17, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40000
; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000
; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004
; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008
; GFX8-NEXT: v_mov_b32_e32 v6, s5
; GFX8-NEXT: s_lshr_b32 s2, s0, 12
; GFX8-NEXT: s_lshr_b32 s3, s1, 12
; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40004
; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40008
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: v_mov_b32_e32 v7, s7
; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s3
; GFX8-NEXT: v_mul_i32_i24_e32 v3, s8, v3
; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40010
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40014
; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40010
; GFX8-NEXT: v_mov_b32_e32 v8, s11
; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40018
; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40014
; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40018
; GFX8-NEXT: s_ashr_i32 s1, s1, 28
; GFX8-NEXT: v_mov_b32_e32 v10, s15
; GFX8-NEXT: s_ashr_i32 s0, s0, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s6, v7, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s10, v8, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s12, v9, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s14, v10, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: idot8_acc16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_add_u32 s20, s20, s3
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000
; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000
; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004
; GFX9-NEXT: s_bfe_i32 s11, s3, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NEXT: s_lshr_b32 s4, s2, 12
; GFX9-NEXT: s_lshr_b32 s5, s3, 12
; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004
; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: v_mov_b32_e32 v6, s9
; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4
; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s5
; GFX9-NEXT: v_mul_i32_i24_e32 v2, s10, v2
; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010
; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX9-NEXT: v_mov_b32_e32 v7, s13
; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018
; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX9-NEXT: v_mov_b32_e32 v8, s15
; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX9-NEXT: s_ashr_i32 s3, s3, 28
; GFX9-NEXT: v_mov_b32_e32 v9, s17
; GFX9-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_i32_i24 v1, s6, v5, v1
; GFX9-NEXT: v_mad_i32_i24 v1, s8, v6, v1
; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_mad_u32_u24 v1, v3, v4, v1
; GFX9-NEXT: v_mad_i32_i24 v1, s12, v7, v1
; GFX9-NEXT: v_mad_i32_i24 v1, s14, v8, v1
; GFX9-NEXT: v_mad_i32_i24 v1, s16, v9, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc16:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000
; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000
; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12
; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12
; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11
; GFX9-DL-NEXT: v_mov_b32_e32 v6, s9
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s10, v2
; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018
; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15
; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28
; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v5, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v6, v1
; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v3, v4, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v7, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v8, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v9, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v2, v1
; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc16:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s14, -1
; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12
; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3
; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28
; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1
; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i16 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2

  ; For each lane 0..7: extract the i4 element from both vectors, sign-extend
  ; each to i16 and multiply the pair.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i16
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i16
  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i16
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i16
  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i16
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i16
  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i16
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i16
  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i16
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i16
  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i16
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i16
  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i16
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i16
  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i16
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i16
  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7

  ; Add the eight products to the accumulator loaded from %dst, then store the
  ; sum back to %dst.
  %acc = load i16, i16 addrspace(1)* %dst, align 4
  %add1 = add i16 %mul0, %acc
  %add2 = add i16 %add1, %mul1
  %add3 = add i16 %add2, %mul2
  %add4 = add i16 %add3, %mul3
  %add5 = add i16 %add4, %mul4
  %add6 = add i16 %add5, %mul5
  %add7 = add i16 %add6, %mul6
  %add8 = add i16 %add7, %mul7

  store i16 %add8, i16 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Support this pattern.
define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc8:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; GFX7-NEXT: s_mov_b32 s26, -1
; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
; GFX7-NEXT: s_add_u32 s24, s24, s3
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_addc_u32 s25, s25, 0
; GFX7-NEXT: s_movk_i32 s8, 0xff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000
; GFX7-NEXT: s_bfe_i32 s10, s5, 0x40004
; GFX7-NEXT: s_and_b32 s7, s7, s8
; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40004
; GFX7-NEXT: s_bfe_i32 s12, s5, 0x40008
; GFX7-NEXT:
s_and_b32 s10, s10, s8 669; GFX7-NEXT: s_and_b32 s6, s6, s8 670; GFX7-NEXT: v_mov_b32_e32 v1, s7 671; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008 672; GFX7-NEXT: s_bfe_i32 s14, s5, 0x4000c 673; GFX7-NEXT: s_and_b32 s12, s12, s8 674; GFX7-NEXT: s_and_b32 s9, s9, s8 675; GFX7-NEXT: v_mov_b32_e32 v2, s10 676; GFX7-NEXT: s_bfe_i32 s13, s4, 0x4000c 677; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40010 678; GFX7-NEXT: s_and_b32 s14, s14, s8 679; GFX7-NEXT: s_and_b32 s11, s11, s8 680; GFX7-NEXT: v_mov_b32_e32 v3, s12 681; GFX7-NEXT: s_bfe_i32 s15, s4, 0x40010 682; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40014 683; GFX7-NEXT: s_and_b32 s16, s16, s8 684; GFX7-NEXT: s_and_b32 s13, s13, s8 685; GFX7-NEXT: v_mov_b32_e32 v4, s14 686; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40018 687; GFX7-NEXT: s_bfe_i32 s17, s4, 0x40014 688; GFX7-NEXT: s_and_b32 s18, s18, s8 689; GFX7-NEXT: s_and_b32 s15, s15, s8 690; GFX7-NEXT: v_mov_b32_e32 v5, s16 691; GFX7-NEXT: s_bfe_i32 s19, s4, 0x40018 692; GFX7-NEXT: s_ashr_i32 s5, s5, 28 693; GFX7-NEXT: s_and_b32 s20, s20, s8 694; GFX7-NEXT: s_and_b32 s17, s17, s8 695; GFX7-NEXT: v_mov_b32_e32 v6, s18 696; GFX7-NEXT: s_ashr_i32 s4, s4, 28 697; GFX7-NEXT: s_and_b32 s19, s19, s8 698; GFX7-NEXT: s_and_b32 s5, s5, s8 699; GFX7-NEXT: v_mov_b32_e32 v7, s20 700; GFX7-NEXT: s_and_b32 s4, s4, s8 701; GFX7-NEXT: s_waitcnt vmcnt(0) 702; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 703; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 704; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 705; GFX7-NEXT: v_mad_u32_u24 v0, s13, v4, v0 706; GFX7-NEXT: v_mad_u32_u24 v0, s15, v5, v0 707; GFX7-NEXT: v_mad_u32_u24 v0, s17, v6, v0 708; GFX7-NEXT: v_mad_u32_u24 v0, s19, v7, v0 709; GFX7-NEXT: v_mov_b32_e32 v1, s5 710; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 711; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 712; GFX7-NEXT: s_endpgm 713; 714; GFX8-LABEL: idot8_acc8: 715; GFX8: ; %bb.0: ; %entry 716; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 717; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 718; GFX8-NEXT: s_mov_b32 
s20, SCRATCH_RSRC_DWORD0 719; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 720; GFX8-NEXT: s_mov_b32 s22, -1 721; GFX8-NEXT: s_mov_b32 s23, 0xe80000 722; GFX8-NEXT: s_waitcnt lgkmcnt(0) 723; GFX8-NEXT: v_mov_b32_e32 v0, s0 724; GFX8-NEXT: v_mov_b32_e32 v1, s1 725; GFX8-NEXT: flat_load_ubyte v2, v[0:1] 726; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 727; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 728; GFX8-NEXT: s_add_u32 s20, s20, s3 729; GFX8-NEXT: s_addc_u32 s21, s21, 0 730; GFX8-NEXT: s_movk_i32 s0, 0xff 731; GFX8-NEXT: s_waitcnt lgkmcnt(0) 732; GFX8-NEXT: s_lshr_b32 s3, s1, 12 733; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000 734; GFX8-NEXT: s_lshr_b32 s4, s2, 12 735; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 736; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40008 737; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000 738; GFX8-NEXT: v_mov_b32_e32 v6, s6 739; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s3 740; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 741; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004 742; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008 743; GFX8-NEXT: v_mov_b32_e32 v3, s10 744; GFX8-NEXT: v_mov_b32_e32 v7, s8 745; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 746; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 747; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 748; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 749; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 750; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 751; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 752; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40010 753; GFX8-NEXT: v_mov_b32_e32 v8, s12 754; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 755; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40014 756; GFX8-NEXT: v_mov_b32_e32 v9, s14 757; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40018 758; GFX8-NEXT: s_ashr_i32 s2, s2, 28 759; GFX8-NEXT: v_mov_b32_e32 v10, s16 760; GFX8-NEXT: s_ashr_i32 s1, s1, 28 761; GFX8-NEXT: s_waitcnt vmcnt(0) 762; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 763; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 764; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 765; GFX8-NEXT: 
v_mad_u32_u24 v2, v4, v5, v2 766; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 767; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 768; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 769; GFX8-NEXT: v_mov_b32_e32 v3, s2 770; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 771; GFX8-NEXT: flat_store_byte v[0:1], v2 772; GFX8-NEXT: s_endpgm 773; 774; GFX9-LABEL: idot8_acc8: 775; GFX9: ; %bb.0: ; %entry 776; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 777; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 778; GFX9-NEXT: v_mov_b32_e32 v0, 0 779; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 780; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 781; GFX9-NEXT: s_mov_b32 s22, -1 782; GFX9-NEXT: s_waitcnt lgkmcnt(0) 783; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] 784; GFX9-NEXT: s_mov_b32 s23, 0xe00000 785; GFX9-NEXT: s_add_u32 s20, s20, s3 786; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 787; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 788; GFX9-NEXT: s_addc_u32 s21, s21, 0 789; GFX9-NEXT: s_movk_i32 s2, 0xff 790; GFX9-NEXT: s_waitcnt lgkmcnt(0) 791; GFX9-NEXT: s_lshr_b32 s5, s3, 12 792; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40000 793; GFX9-NEXT: s_lshr_b32 s6, s4, 12 794; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40004 795; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40008 796; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000 797; GFX9-NEXT: v_mov_b32_e32 v5, s8 798; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s5 799; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s6 800; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004 801; GFX9-NEXT: s_bfe_i32 s11, s3, 0x40008 802; GFX9-NEXT: v_mov_b32_e32 v2, s12 803; GFX9-NEXT: v_mov_b32_e32 v6, s10 804; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 805; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 806; GFX9-NEXT: v_mul_i32_i24_e32 v2, s11, v2 807; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 808; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 809; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 810; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 811; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 812; GFX9-NEXT: v_mov_b32_e32 v7, s14 813; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 814; 
GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 815; GFX9-NEXT: v_mov_b32_e32 v8, s16 816; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 817; GFX9-NEXT: s_ashr_i32 s4, s4, 28 818; GFX9-NEXT: v_mov_b32_e32 v9, s18 819; GFX9-NEXT: s_ashr_i32 s3, s3, 28 820; GFX9-NEXT: s_waitcnt vmcnt(0) 821; GFX9-NEXT: v_mad_i32_i24 v1, s7, v5, v1 822; GFX9-NEXT: v_mad_i32_i24 v1, s9, v6, v1 823; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 824; GFX9-NEXT: v_mad_u32_u24 v1, v3, v4, v1 825; GFX9-NEXT: v_mad_i32_i24 v1, s13, v7, v1 826; GFX9-NEXT: v_mad_i32_i24 v1, s15, v8, v1 827; GFX9-NEXT: v_mad_i32_i24 v1, s17, v9, v1 828; GFX9-NEXT: v_mov_b32_e32 v2, s4 829; GFX9-NEXT: v_mad_i32_i24 v1, s3, v2, v1 830; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 831; GFX9-NEXT: s_endpgm 832; 833; GFX9-DL-LABEL: idot8_acc8: 834; GFX9-DL: ; %bb.0: ; %entry 835; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 836; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 837; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 838; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 839; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 840; GFX9-DL-NEXT: s_mov_b32 s22, -1 841; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 842; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] 843; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 844; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 845; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 846; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 847; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 848; GFX9-DL-NEXT: s_movk_i32 s2, 0xff 849; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 850; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12 851; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40000 852; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 12 853; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40004 854; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40008 855; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000 856; GFX9-DL-NEXT: v_mov_b32_e32 v5, s8 857; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s5 858; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s6 859; GFX9-DL-NEXT: s_bfe_i32 s9, 
s3, 0x40004 860; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008 861; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 862; GFX9-DL-NEXT: v_mov_b32_e32 v6, s10 863; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 864; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 865; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s11, v2 866; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40010 867; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 868; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 869; GFX9-DL-NEXT: s_bfe_i32 s16, s4, 0x40014 870; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 871; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 872; GFX9-DL-NEXT: s_bfe_i32 s18, s4, 0x40018 873; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 874; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16 875; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 876; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 877; GFX9-DL-NEXT: v_mov_b32_e32 v9, s18 878; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 879; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 880; GFX9-DL-NEXT: v_mad_i32_i24 v1, s7, v5, v1 881; GFX9-DL-NEXT: v_mad_i32_i24 v1, s9, v6, v1 882; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 883; GFX9-DL-NEXT: v_mad_u32_u24 v1, v3, v4, v1 884; GFX9-DL-NEXT: v_mad_i32_i24 v1, s13, v7, v1 885; GFX9-DL-NEXT: v_mad_i32_i24 v1, s15, v8, v1 886; GFX9-DL-NEXT: v_mad_i32_i24 v1, s17, v9, v1 887; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 888; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 889; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 890; GFX9-DL-NEXT: s_endpgm 891; 892; GFX10-DL-LABEL: idot8_acc8: 893; GFX10-DL: ; %bb.0: ; %entry 894; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 895; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 896; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 897; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 898; GFX10-DL-NEXT: s_mov_b32 s14, -1 899; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 900; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 901; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 902; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 903; GFX10-DL-NEXT: s_waitcnt 
lgkmcnt(0) 904; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] 905; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 906; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 907; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 908; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 909; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 910; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 911; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 912; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2 913; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3 914; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 915; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 916; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 917; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 918; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10 919; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 920; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 921; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 922; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 923; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 924; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1 925; GFX10-DL-NEXT: s_movk_i32 s2, 0xff 926; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 927; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 928; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 929; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 930; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1 931; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 932; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 933; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 934; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 935; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 936; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 937; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 938; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 939; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 940; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 941; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] 942; GFX10-DL-NEXT: s_endpgm 943 <8 x i4> addrspace(1)* %src2, 944 i8 addrspace(1)* nocapture %dst) { 945entry: 946 %vec1 = load <8 x i4>, <8 x i4> 
addrspace(1)* %src1 947 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 948 949 %v1e0 = extractelement <8 x i4> %vec1, i64 0 950 %cv1e0 = sext i4 %v1e0 to i8 951 %v2e0 = extractelement <8 x i4> %vec2, i64 0 952 %cv2e0 = sext i4 %v2e0 to i8 953 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0 954 955 %v1e1 = extractelement <8 x i4> %vec1, i64 1 956 %cv1e1 = sext i4 %v1e1 to i8 957 %v2e1 = extractelement <8 x i4> %vec2, i64 1 958 %cv2e1 = sext i4 %v2e1 to i8 959 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1 960 961 %v1e2 = extractelement <8 x i4> %vec1, i64 2 962 %cv1e2 = sext i4 %v1e2 to i8 963 %v2e2 = extractelement <8 x i4> %vec2, i64 2 964 %cv2e2 = sext i4 %v2e2 to i8 965 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2 966 967 %v1e3 = extractelement <8 x i4> %vec1, i64 3 968 %cv1e3 = sext i4 %v1e3 to i8 969 %v2e3 = extractelement <8 x i4> %vec2, i64 3 970 %cv2e3 = sext i4 %v2e3 to i8 971 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3 972 973 %v1e4 = extractelement <8 x i4> %vec1, i64 4 974 %cv1e4 = sext i4 %v1e4 to i8 975 %v2e4 = extractelement <8 x i4> %vec2, i64 4 976 %cv2e4 = sext i4 %v2e4 to i8 977 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4 978 979 %v1e5 = extractelement <8 x i4> %vec1, i64 5 980 %cv1e5 = sext i4 %v1e5 to i8 981 %v2e5 = extractelement <8 x i4> %vec2, i64 5 982 %cv2e5 = sext i4 %v2e5 to i8 983 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5 984 985 %v1e6 = extractelement <8 x i4> %vec1, i64 6 986 %cv1e6 = sext i4 %v1e6 to i8 987 %v2e6 = extractelement <8 x i4> %vec2, i64 6 988 %cv2e6 = sext i4 %v2e6 to i8 989 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6 990 991 %v1e7 = extractelement <8 x i4> %vec1, i64 7 992 %cv1e7 = sext i4 %v1e7 to i8 993 %v2e7 = extractelement <8 x i4> %vec2, i64 7 994 %cv2e7 = sext i4 %v2e7 to i8 995 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7 996 997 %acc = load i8, i8 addrspace(1)* %dst, align 4 998 %add1 = add i8 %mul0, %acc 999 %add2 = add i8 %add1, %mul1 1000 %add3 = add i8 %add2, %mul2 1001 %add4 = add i8 %add3, %mul3 1002 %add5 = add i8 %add4, %mul4 1003 %add6 = add i8 %add5, 
%mul5
  %add7 = add i8 %add6, %mul6
  %add8 = add i8 %add7, %mul7

  store i8 %add8, i8 addrspace(1)* %dst, align 4
  ret void
}

; Make sure the pattern is not recognized if there are multiple uses of the
; intermediate multiplications.
;
; In this function %mul0 is an operand of both %add and %add1, so it has more
; than one use. None of the check blocks below contains v_dot8_i32_i4: every
; run, including the GFX9-DL and GFX10-DL ones, expects a chain of
; v_mad_i32_i24 followed by a separate add.
define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_multiuses_mul1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; GFX7-NEXT: s_mov_b32 s26, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
; GFX7-NEXT: s_add_u32 s24, s24, s3
; GFX7-NEXT: s_addc_u32 s25, s25, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000
; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
; GFX7-NEXT: v_mov_b32_e32 v0, s7
; GFX7-NEXT: v_mov_b32_e32 v1, s20
; GFX7-NEXT: v_mad_i32_i24 v1, s6, v0, v1
; GFX7-NEXT: s_bfe_i32 s9, s5, 0x40004
; GFX7-NEXT: s_bfe_i32 s8, s4, 0x40004
; GFX7-NEXT: s_bfe_i32 s11, s5, 0x40008
; GFX7-NEXT: v_mad_i32_i24 v0, s6, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mad_i32_i24 v0, s8, v2, v0
; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008
; GFX7-NEXT: v_mov_b32_e32 v2, s11
; GFX7-NEXT: s_bfe_i32 s13, s5, 0x4000c
; GFX7-NEXT: v_mad_i32_i24 v0, s10, v2, v0
; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c
; GFX7-NEXT: v_mov_b32_e32 v2, s13
; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40010
; GFX7-NEXT: v_mad_i32_i24 v0, s12, v2, v0
; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40010
; GFX7-NEXT: v_mov_b32_e32 v2, s15
; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40014
; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40018
; GFX7-NEXT: v_mad_i32_i24 v0, s14, v2, v0
; GFX7-NEXT: s_bfe_i32 s16, s4, 0x40014
; GFX7-NEXT: v_mov_b32_e32 v2, s17
; GFX7-NEXT: s_bfe_i32 s18, s4, 0x40018
; GFX7-NEXT: v_mad_i32_i24 v0, s16, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v2, s19
; GFX7-NEXT: s_ashr_i32 s5, s5, 28
; GFX7-NEXT: v_mad_i32_i24 v0, s18, v2, v0
; GFX7-NEXT: s_ashr_i32 s4, s4, 28
; GFX7-NEXT: v_mov_b32_e32 v2, s5
; GFX7-NEXT: v_mad_i32_i24 v0, s4, v2, v0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot8_multiuses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s22, -1
; GFX8-NEXT: s_mov_b32 s23, 0xe80000
; GFX8-NEXT: s_add_u32 s20, s20, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX8-NEXT: s_addc_u32 s21, s21, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000
; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s18
; GFX8-NEXT: v_mad_i32_i24 v1, s4, v0, v1
; GFX8-NEXT: s_bfe_i32 s7, s3, 0x40004
; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004
; GFX8-NEXT: s_bfe_i32 s9, s3, 0x40008
; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NEXT: v_mad_i32_i24 v0, s6, v2, v0
; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008
; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: s_bfe_i32 s11, s3, 0x4000c
; GFX8-NEXT: v_mad_i32_i24 v0, s8, v2, v0
; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c
; GFX8-NEXT: v_mov_b32_e32 v2, s11
; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40010
; GFX8-NEXT: v_mad_i32_i24 v0, s10, v2, v0
; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX8-NEXT: v_mov_b32_e32 v2, s13
; GFX8-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40018
; GFX8-NEXT: v_mad_i32_i24 v0, s12, v2, v0
; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX8-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX8-NEXT: v_mad_i32_i24 v0, s14, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s17
; GFX8-NEXT: s_ashr_i32 s3, s3, 28
; GFX8-NEXT: v_mad_i32_i24 v0, s16, v2, v0
; GFX8-NEXT: s_ashr_i32 s2, s2, 28
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_mad_i32_i24 v0, s2, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: idot8_multiuses_mul1:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_add_u32 s20, s20, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000
; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v2
; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004
; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004
; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008
; GFX9-NEXT: v_mad_i32_i24 v1, s4, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: v_mad_i32_i24 v1, s6, v3, v1
; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c
; GFX9-NEXT: v_mad_i32_i24 v1, s8, v3, v1
; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010
; GFX9-NEXT: v_mad_i32_i24 v1, s10, v3, v1
; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX9-NEXT: v_mov_b32_e32 v3, s13
; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018
; GFX9-NEXT: v_mad_i32_i24 v1, s12, v3, v1
; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX9-NEXT: v_mad_i32_i24 v1, s14, v3, v1
; GFX9-NEXT: v_mov_b32_e32 v3, s17
; GFX9-NEXT: s_ashr_i32 s3, s3, 28
; GFX9-NEXT: v_mad_i32_i24 v1, s16, v3, v1
; GFX9-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_mad_i32_i24 v1, s2, v3, v1
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_multiuses_mul1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000
; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v2
; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40008
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v1, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v3, v1
; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x4000c
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v3, v1
; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11
; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s10, v3, v1
; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13
; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v3, v1
; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s15
; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v3, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v3, v1
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v3, v1
; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_multiuses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40000
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v0
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40004
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40004
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40008
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x4000c
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x4000c
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28
; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
                                                <8 x i4> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2

  ; Element-wise products: each i4 lane of both vectors is sign-extended to i32
  ; and multiplied.
  ; NOTE(review): every multiply is flagged nuw although both operands are
  ; sign-extended from i4 -- confirm that the flag is intended.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; Accumulation chain seeded with the i32 loaded from %dst. %mul0 appears
  ; twice (%add and %add1); presumably this is the extra use the test is about.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul0, %acc
  %add1 = add i32 %mul0, %add
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add
i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  %res = add i32 %add, %add8
  store i32 %res, i32 addrspace(1)* %dst, align 4
  ret void
}

; Vector-multiply form of the i32-accumulator dot product: both <8 x i4>
; operands are sign-extended to <8 x i32> and multiplied as whole vectors, then
; the eight product lanes are extracted and summed with the i32 loaded from
; %dst. The GFX9-DL and GFX10-DL check blocks below expect a single
; v_dot8_i32_i4 for this; the GFX7, GFX8 and GFX9 blocks expect a chain of
; v_mad_i32_i24.
define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc32_vecMul:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; GFX7-NEXT: s_mov_b32 s26, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
; GFX7-NEXT: s_add_u32 s24, s24, s3
; GFX7-NEXT: s_addc_u32 s25, s25, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_ashr_i32 s13, s5, 28
; GFX7-NEXT: s_bfe_i32 s14, s5, 0x40018
; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40014
; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40010
; GFX7-NEXT: s_bfe_i32 s17, s5, 0x4000c
; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40008
; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004
; GFX7-NEXT: s_bfe_i32 s5, s5, 0x40000
; GFX7-NEXT: s_ashr_i32 s6, s4, 28
; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018
; GFX7-NEXT: s_bfe_i32 s8, s4, 0x40014
; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40010
; GFX7-NEXT: s_bfe_i32 s10, s4, 0x4000c
; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008
; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40004
; GFX7-NEXT: s_bfe_i32 s4, s4, 0x40000
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: v_mov_b32_e32 v1, s20
; GFX7-NEXT: v_mad_i32_i24 v0, s4, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s19
; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s18
; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s17
; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s16
; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s15
; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s14
; GFX7-NEXT: v_mad_i32_i24 v0, s7, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot8_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s22, -1
; GFX8-NEXT: s_mov_b32 s23, 0xe80000
; GFX8-NEXT: s_add_u32 s20, s20, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX8-NEXT: s_addc_u32 s21, s21, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s4, s2, 28
; GFX8-NEXT: s_ashr_i32 s11, s3, 28
; GFX8-NEXT: s_bfe_i32 s12, s3, 0x40018
; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40014
; GFX8-NEXT: s_bfe_i32 s14, s3, 0x40010
; GFX8-NEXT: s_bfe_i32 s15, s3, 0x4000c
; GFX8-NEXT: s_bfe_i32 s16, s3, 0x40008
; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40004
; GFX8-NEXT: s_bfe_i32 s3, s3, 0x40000
; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40018
; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40014
; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40010
; GFX8-NEXT: s_bfe_i32 s8, s2, 0x4000c
; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008
; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40004
; GFX8-NEXT: s_bfe_i32 s2, s2, 0x40000
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s18
; GFX8-NEXT: v_mad_i32_i24 v0, s2, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s16
; GFX8-NEXT: v_mad_i32_i24 v0, s9, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s15
; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s14
; GFX8-NEXT: v_mad_i32_i24 v0, s7, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s12
; GFX8-NEXT: v_mad_i32_i24 v0, s5, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: idot8_acc32_vecMul:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_add_u32 s20, s20, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s2, 28
; GFX9-NEXT: s_ashr_i32 s11, s3, 28
; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40018
; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40014
; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40010
; GFX9-NEXT: s_bfe_i32 s15, s3, 0x4000c
; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40008
; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40004
; GFX9-NEXT: s_bfe_i32 s3, s3, 0x40000
; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40018
; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40014
; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40010
; GFX9-NEXT: s_bfe_i32 s8, s2, 0x4000c
; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008
; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40004
; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40000
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mad_i32_i24 v1, s2, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s17
; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s16
; GFX9-NEXT: v_mad_i32_i24 v1, s9, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s15
; GFX9-NEXT: v_mad_i32_i24 v1, s8, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mad_i32_i24 v1, s7, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s13
; GFX9-NEXT: v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s12
; GFX9-NEXT: v_mad_i32_i24 v1, s5, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: v_mad_i32_i24 v1, s4, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_mov_b32 s10, -1
; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2
; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_dot8_i32_i4 v0, s0, s1, v0
; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2

  ; Widen all eight i4 lanes at once instead of element by element.
  %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
  %cvec2 = sext <8 x i4> %vec2 to <8 x i32>

  ; One vector multiply; the individual products are extracted afterwards.
  %mul = mul <8 x i32> %cvec1, %cvec2
  %mul0 = extractelement <8 x i32> %mul, i64 0
  %mul1 = extractelement <8 x i32> %mul, i64 1
  %mul2 = extractelement <8 x i32> %mul, i64 2
  %mul3 = extractelement <8 x i32> %mul, i64 3
  %mul4 = extractelement <8 x i32> %mul, i64 4
  %mul5 = extractelement <8 x i32> %mul, i64 5
  %mul6 = extractelement <8 x i32> %mul, i64 6
  %mul7 = extractelement <8 x i32> %mul, i64 7

  ; Linear accumulation chain seeded with the i32 loaded from %dst; every
  ; product is used exactly once.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  store i32 %add8, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Support this pattern.
define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc16_vecMul:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; GFX7-NEXT: s_mov_b32 s26, -1
; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
; GFX7-NEXT: s_add_u32 s24, s24, s3
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_addc_u32 s25, s25, 0
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_ashr_i32 s6, s4, 28
; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018
; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40014
; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40010
; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40008
; GFX7-NEXT: s_bfe_i32 s19, s5, 0x4000c
; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40000
; GFX7-NEXT: s_ashr_i32 s14, s5, 28
; GFX7-NEXT: s_bfe_i32 s5, s5, 0x40004
; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018
; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014
; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010
; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008
; GFX7-NEXT: v_mov_b32_e32 v4, s18
; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c
; GFX7-NEXT: v_mov_b32_e32 v3, s19
; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40000
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: s_bfe_i32 s4, s4, 0x40004
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1
; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2
; GFX7-NEXT: v_mul_i32_i24_e32 v3, s12, v3
; GFX7-NEXT: v_mul_i32_i24_e32 v4, s11, v4
; GFX7-NEXT:
v_lshlrev_b32_e32 v1, 16, v1 1629; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 1630; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1631; GFX7-NEXT: v_and_b32_e32 v4, s8, v4 1632; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 1633; GFX7-NEXT: v_or_b32_e32 v2, v2, v1 1634; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16 1635; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 1636; GFX7-NEXT: v_mov_b32_e32 v5, s17 1637; GFX7-NEXT: v_mov_b32_e32 v6, s16 1638; GFX7-NEXT: v_mov_b32_e32 v7, s15 1639; GFX7-NEXT: s_waitcnt vmcnt(0) 1640; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1641; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 1642; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 1643; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 1644; GFX7-NEXT: v_mad_i32_i24 v0, s10, v5, v0 1645; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0 1646; GFX7-NEXT: v_mad_i32_i24 v0, s7, v7, v0 1647; GFX7-NEXT: v_mov_b32_e32 v1, s14 1648; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0 1649; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 1650; GFX7-NEXT: s_endpgm 1651; 1652; GFX8-LABEL: idot8_acc16_vecMul: 1653; GFX8: ; %bb.0: ; %entry 1654; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1655; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1656; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 1657; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 1658; GFX8-NEXT: s_mov_b32 s18, -1 1659; GFX8-NEXT: s_mov_b32 s19, 0xe80000 1660; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1661; GFX8-NEXT: v_mov_b32_e32 v0, s0 1662; GFX8-NEXT: v_mov_b32_e32 v1, s1 1663; GFX8-NEXT: flat_load_ushort v2, v[0:1] 1664; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1665; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 1666; GFX8-NEXT: s_add_u32 s16, s16, s3 1667; GFX8-NEXT: s_addc_u32 s17, s17, 0 1668; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40000 1670; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40000 1671; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40018 1672; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40014 1673; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 1674; GFX8-NEXT: s_bfe_i32 s13, s1, 0x4000c 1675; GFX8-NEXT: s_bfe_i32 
s14, s1, 0x40004 1676; GFX8-NEXT: s_ashr_i32 s9, s1, 28 1677; GFX8-NEXT: s_bfe_i32 s1, s1, 0x40008 1678; GFX8-NEXT: v_mov_b32_e32 v4, s15 1679; GFX8-NEXT: s_ashr_i32 s2, s0, 28 1680; GFX8-NEXT: s_bfe_i32 s3, s0, 0x40018 1681; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40014 1682; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40010 1683; GFX8-NEXT: s_bfe_i32 s6, s0, 0x4000c 1684; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 1685; GFX8-NEXT: s_bfe_i32 s0, s0, 0x40008 1686; GFX8-NEXT: v_mov_b32_e32 v3, s1 1687; GFX8-NEXT: v_mov_b32_e32 v5, s14 1688; GFX8-NEXT: v_mul_i32_i24_e32 v3, s0, v3 1689; GFX8-NEXT: v_mov_b32_e32 v6, s13 1690; GFX8-NEXT: v_mov_b32_e32 v7, s12 1691; GFX8-NEXT: v_mov_b32_e32 v8, s11 1692; GFX8-NEXT: v_mov_b32_e32 v9, s10 1693; GFX8-NEXT: s_waitcnt vmcnt(0) 1694; GFX8-NEXT: v_mad_i32_i24 v2, s8, v4, v2 1695; GFX8-NEXT: v_mad_i32_i24 v2, s7, v5, v2 1696; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1697; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2 1698; GFX8-NEXT: v_mad_i32_i24 v2, s5, v7, v2 1699; GFX8-NEXT: v_mad_i32_i24 v2, s4, v8, v2 1700; GFX8-NEXT: v_mad_i32_i24 v2, s3, v9, v2 1701; GFX8-NEXT: v_mov_b32_e32 v3, s9 1702; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 1703; GFX8-NEXT: flat_store_short v[0:1], v2 1704; GFX8-NEXT: s_endpgm 1705; 1706; GFX9-LABEL: idot8_acc16_vecMul: 1707; GFX9: ; %bb.0: ; %entry 1708; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1709; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1710; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1711; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1712; GFX9-NEXT: s_mov_b32 s22, -1 1713; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1714; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 1715; GFX9-NEXT: s_mov_b32 s23, 0xe00000 1716; GFX9-NEXT: s_add_u32 s20, s20, s3 1717; GFX9-NEXT: s_addc_u32 s21, s21, 0 1718; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 1719; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1720; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 1721; GFX9-NEXT: s_lshr_b32 s4, s2, 28 1722; 
GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 1723; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 1724; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 1725; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c 1726; GFX9-NEXT: s_and_b32 s11, s2, 15 1727; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 1728; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 1729; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] 1730; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 1731; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] 1732; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8 1733; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] 1734; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 1735; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 1736; GFX9-NEXT: s_lshr_b32 s12, s6, 28 1737; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 1738; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 1739; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 1740; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c 1741; GFX9-NEXT: s_and_b32 s17, s6, 15 1742; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 1743; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] 1744; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6 1745; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] 1746; GFX9-NEXT: v_mov_b32_e32 v0, 0 1747; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] 1748; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] 1749; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v5 1750; GFX9-NEXT: global_load_ushort v5, v0, s[0:1] 1751; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16 1752; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] 1753; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14 1754; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] 1755; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] 1756; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] 1757; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 1758; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s12 1759; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] 1760; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] 1761; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1] 
1762; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 1763; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] 1764; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] 1765; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 1766; GFX9-NEXT: s_waitcnt vmcnt(0) 1767; GFX9-NEXT: v_add_u32_e32 v5, v1, v5 1768; GFX9-NEXT: v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1769; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1770; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1771; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1772; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1773; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 1774; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1775; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1776; GFX9-NEXT: s_endpgm 1777; 1778; GFX9-DL-LABEL: idot8_acc16_vecMul: 1779; GFX9-DL: ; %bb.0: ; %entry 1780; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1781; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1782; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1783; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1784; GFX9-DL-NEXT: s_mov_b32 s22, -1 1785; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1786; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 1787; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 1788; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 1789; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 1790; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 1791; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1792; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 1793; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 1794; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 1795; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 1796; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 1797; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c 1798; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 1799; GFX9-DL-NEXT: s_bfe_u32 s2, 
s2, 0x40004 1800; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 1801; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] 1802; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 1803; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] 1804; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8 1805; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] 1806; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s3, s4 1807; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 1808; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 1809; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 1810; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 1811; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 1812; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c 1813; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 1814; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 1815; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] 1816; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6 1817; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] 1818; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1819; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] 1820; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] 1821; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 1822; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1] 1823; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16 1824; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] 1825; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14 1826; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] 1827; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] 1828; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] 1829; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 1830; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s12 1831; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] 1832; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] 1833; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1] 1834; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 1835; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] 1836; GFX9-DL-NEXT: 
v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] 1837; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 1838; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1839; GFX9-DL-NEXT: v_add_u32_e32 v5, v1, v5 1840; GFX9-DL-NEXT: v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1841; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1842; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1843; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 1844; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1845; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4 1846; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1847; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] 1848; GFX9-DL-NEXT: s_endpgm 1849; 1850; GFX10-DL-LABEL: idot8_acc16_vecMul: 1851; GFX10-DL: ; %bb.0: ; %entry 1852; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1853; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1854; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1855; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1856; GFX10-DL-NEXT: s_mov_b32 s14, -1 1857; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 1858; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 1859; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1860; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 1861; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1862; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] 1863; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1864; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 1865; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1866; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 1867; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28 1868; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 1869; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40014 1870; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 1871; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c 1872; GFX10-DL-NEXT: s_and_b32 s10, s0, 15 
1873; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 1874; GFX10-DL-NEXT: s_and_b32 s11, s1, 15 1875; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0 1876; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40004 1877; GFX10-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] 1878; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s10 1879; GFX10-DL-NEXT: s_bfe_u32 s11, s1, 0x4000c 1880; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] 1881; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40008 1882; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] 1883; GFX10-DL-NEXT: s_pack_ll_b32_b16 s8, s8, s9 1884; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s11 1885; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] 1886; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1] 1887; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] 1888; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010 1889; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 1890; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 1891; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1] 1892; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] 1893; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 1894; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 1895; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] 1896; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] 1897; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 1898; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 1899; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 28 1900; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s2, s3 1901; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] 1902; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0 1903; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] 1904; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1905; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 1906; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1907; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1] 1908; GFX10-DL-NEXT: 
v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1] 1909; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1910; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 1911; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] 1912; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1913; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1] 1914; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 1915; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 1916; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1917; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3 1918; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1919; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] 1920; GFX10-DL-NEXT: s_endpgm 1921 <8 x i4> addrspace(1)* %src2, 1922 i16 addrspace(1)* nocapture %dst) { 1923entry: 1924 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 1925 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 1926 1927 %cvec1 = sext <8 x i4> %vec1 to <8 x i16> 1928 %cvec2 = sext <8 x i4> %vec2 to <8 x i16> 1929 1930 %mul = mul <8 x i16> %cvec1, %cvec2 1931 %mul0 = extractelement <8 x i16> %mul, i64 0 1932 %mul1 = extractelement <8 x i16> %mul, i64 1 1933 %mul2 = extractelement <8 x i16> %mul, i64 2 1934 %mul3 = extractelement <8 x i16> %mul, i64 3 1935 %mul4 = extractelement <8 x i16> %mul, i64 4 1936 %mul5 = extractelement <8 x i16> %mul, i64 5 1937 %mul6 = extractelement <8 x i16> %mul, i64 6 1938 %mul7 = extractelement <8 x i16> %mul, i64 7 1939 1940 %acc = load i16, i16 addrspace(1)* %dst, align 4 1941 %add1 = add i16 %mul0, %acc 1942 %add2 = add i16 %add1, %mul1 1943 %add3 = add i16 %add2, %mul2 1944 %add4 = add i16 %add3, %mul3 1945 %add5 = add i16 %add4, %mul4 1946 %add6 = add i16 %add5, %mul5 1947 %add7 = add i16 %add6, %mul6 1948 %add8 = add i16 %add7, %mul7 
1949 1950 store i16 %add8, i16 addrspace(1)* %dst, align 4 1951 ret void 1952} 1953 1954; TODO: Support this pattern. 1955define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, 1956; GFX7-LABEL: idot8_acc8_vecMul: 1957; GFX7: ; %bb.0: ; %entry 1958; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 1959; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1960; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1961; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 1962; GFX7-NEXT: s_mov_b32 s26, -1 1963; GFX7-NEXT: s_mov_b32 s27, 0xe8f000 1964; GFX7-NEXT: s_add_u32 s24, s24, s3 1965; GFX7-NEXT: s_mov_b32 s3, 0xf000 1966; GFX7-NEXT: s_mov_b32 s2, -1 1967; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1968; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 1969; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 1970; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 1971; GFX7-NEXT: s_addc_u32 s25, s25, 0 1972; GFX7-NEXT: s_movk_i32 s8, 0xff 1973; GFX7-NEXT: s_mov_b32 s9, 0xffff 1974; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1975; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 1976; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40000 1977; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40004 1978; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40008 1979; GFX7-NEXT: s_bfe_i32 s18, s5, 0x4000c 1980; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40010 1981; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40014 1982; GFX7-NEXT: s_bfe_i32 s21, s5, 0x40018 1983; GFX7-NEXT: s_ashr_i32 s5, s5, 28 1984; GFX7-NEXT: v_mov_b32_e32 v8, s15 1985; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40004 1986; GFX7-NEXT: v_mov_b32_e32 v7, s16 1987; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008 1988; GFX7-NEXT: v_mov_b32_e32 v6, s17 1989; GFX7-NEXT: s_bfe_i32 s11, s4, 0x4000c 1990; GFX7-NEXT: v_mov_b32_e32 v5, s18 1991; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40010 1992; GFX7-NEXT: v_mov_b32_e32 v4, s19 1993; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40014 1994; GFX7-NEXT: v_mov_b32_e32 v3, s20 1995; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40018 1996; GFX7-NEXT: v_mov_b32_e32 v2, s21 1997; GFX7-NEXT: s_ashr_i32 s4, s4, 28 1998; GFX7-NEXT: v_mov_b32_e32 v1, s5 
1999; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 2000; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 2001; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 2002; GFX7-NEXT: v_mul_i32_i24_e32 v9, s12, v4 2003; GFX7-NEXT: v_mul_i32_i24_e32 v5, s11, v5 2004; GFX7-NEXT: v_mul_i32_i24_e32 v6, s10, v6 2005; GFX7-NEXT: v_mul_i32_i24_e32 v7, s7, v7 2006; GFX7-NEXT: v_mul_i32_i24_e32 v8, s6, v8 2007; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 2008; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 2009; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 2010; GFX7-NEXT: v_and_b32_e32 v9, s8, v9 2011; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 2012; GFX7-NEXT: v_and_b32_e32 v6, s8, v6 2013; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 2014; GFX7-NEXT: v_and_b32_e32 v8, s8, v8 2015; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 2016; GFX7-NEXT: v_or_b32_e32 v2, v9, v3 2017; GFX7-NEXT: v_or_b32_e32 v3, v6, v5 2018; GFX7-NEXT: v_or_b32_e32 v5, v8, v7 2019; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2020; GFX7-NEXT: v_and_b32_e32 v2, s9, v2 2021; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2022; GFX7-NEXT: v_and_b32_e32 v5, s9, v5 2023; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 2024; GFX7-NEXT: v_or_b32_e32 v2, v5, v3 2025; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 8 2026; GFX7-NEXT: v_alignbit_b32 v5, v1, v2, 16 2027; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 2028; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 2029; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 2030; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1 2031; GFX7-NEXT: s_waitcnt vmcnt(0) 2032; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2033; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 2034; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 2035; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 2036; GFX7-NEXT: v_mad_i32_i24 v0, s12, v4, v0 2037; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 2038; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 2039; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2040; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 2041; GFX7-NEXT: s_endpgm 2042; 2043; GFX8-LABEL: idot8_acc8_vecMul: 2044; GFX8: ; %bb.0: ; %entry 2045; 
GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2046; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2047; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2048; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2049; GFX8-NEXT: s_mov_b32 s22, -1 2050; GFX8-NEXT: s_mov_b32 s23, 0xe80000 2051; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2052; GFX8-NEXT: v_mov_b32_e32 v0, s0 2053; GFX8-NEXT: v_mov_b32_e32 v1, s1 2054; GFX8-NEXT: flat_load_ubyte v2, v[0:1] 2055; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 2056; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 2057; GFX8-NEXT: s_add_u32 s20, s20, s3 2058; GFX8-NEXT: s_addc_u32 s21, s21, 0 2059; GFX8-NEXT: s_mov_b32 s0, 0xffff 2060; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2061; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004 2062; GFX8-NEXT: s_bfe_i32 s9, s1, 0x4000c 2063; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40004 2064; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40000 2065; GFX8-NEXT: s_bfe_i32 s16, s2, 0x4000c 2066; GFX8-NEXT: s_bfe_i32 s3, s1, 0x40014 2067; GFX8-NEXT: s_ashr_i32 s5, s1, 28 2068; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40014 2069; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40010 2070; GFX8-NEXT: s_ashr_i32 s12, s2, 28 2071; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40018 2072; GFX8-NEXT: s_bfe_i32 s2, s2, 0x40008 2073; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40000 2074; GFX8-NEXT: v_mov_b32_e32 v4, s16 2075; GFX8-NEXT: v_mov_b32_e32 v5, s9 2076; GFX8-NEXT: v_mov_b32_e32 v6, s15 2077; GFX8-NEXT: v_mov_b32_e32 v7, s14 2078; GFX8-NEXT: v_mov_b32_e32 v8, s7 2079; GFX8-NEXT: v_mul_i32_i24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2080; GFX8-NEXT: v_mul_i32_i24_e32 v5, s8, v6 2081; GFX8-NEXT: v_mul_i32_i24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2082; GFX8-NEXT: s_bfe_i32 s4, s1, 0x40010 2083; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40018 2084; GFX8-NEXT: v_mov_b32_e32 v9, s13 2085; GFX8-NEXT: s_bfe_i32 s1, s1, 0x40008 2086; GFX8-NEXT: v_mov_b32_e32 v3, s2 2087; GFX8-NEXT: v_mov_b32_e32 v10, s12 2088; GFX8-NEXT: v_mov_b32_e32 v11, s5 
2089; GFX8-NEXT: v_mov_b32_e32 v12, s11 2090; GFX8-NEXT: v_mov_b32_e32 v13, s10 2091; GFX8-NEXT: v_mov_b32_e32 v14, s3 2092; GFX8-NEXT: v_mul_i32_i24_e32 v3, s1, v3 2093; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2094; GFX8-NEXT: v_mul_i32_i24_e32 v7, s6, v9 2095; GFX8-NEXT: v_mul_i32_i24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2096; GFX8-NEXT: v_mul_i32_i24_e32 v9, s4, v12 2097; GFX8-NEXT: v_mul_i32_i24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2098; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 2099; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2100; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2101; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2102; GFX8-NEXT: v_and_b32_e32 v4, s0, v9 2103; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 2104; GFX8-NEXT: v_or_b32_e32 v6, v4, v7 2105; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 2106; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6 2107; GFX8-NEXT: s_waitcnt vmcnt(0) 2108; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 2109; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 2110; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 2111; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 2112; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2113; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 2114; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2115; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 2116; GFX8-NEXT: flat_store_byte v[0:1], v2 2117; GFX8-NEXT: s_endpgm 2118; 2119; GFX9-LABEL: idot8_acc8_vecMul: 2120; GFX9: ; 
%bb.0: ; %entry 2121; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2122; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2123; GFX9-NEXT: v_mov_b32_e32 v0, 0 2124; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2125; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2126; GFX9-NEXT: s_mov_b32 s22, -1 2127; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2128; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] 2129; GFX9-NEXT: s_mov_b32 s23, 0xe00000 2130; GFX9-NEXT: s_add_u32 s20, s20, s3 2131; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 2132; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 2133; GFX9-NEXT: s_addc_u32 s21, s21, 0 2134; GFX9-NEXT: s_mov_b32 s2, 0xffff 2135; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2136; GFX9-NEXT: s_lshr_b32 s9, s3, 4 2137; GFX9-NEXT: s_lshr_b32 s16, s4, 4 2138; GFX9-NEXT: v_lshlrev_b16_e64 v2, 12, s3 2139; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 2140; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9 2141; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s16 2142; GFX9-NEXT: s_lshr_b32 s10, s3, 12 2143; GFX9-NEXT: s_lshr_b32 s11, s3, 8 2144; GFX9-NEXT: s_lshr_b32 s17, s4, 12 2145; GFX9-NEXT: s_lshr_b32 s18, s4, 8 2146; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s11 2147; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s10 2148; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s18 2149; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s17 2150; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2151; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 2152; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2153; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2154; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 2155; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2156; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2157; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 2158; GFX9-NEXT: v_mul_lo_u16_e32 v2, v2, v3 2159; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2160; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2161; GFX9-NEXT: s_lshr_b32 s5, s3, 20 2162; GFX9-NEXT: s_lshr_b32 s6, 
s3, 16 2163; GFX9-NEXT: s_lshr_b32 s12, s4, 20 2164; GFX9-NEXT: s_lshr_b32 s13, s4, 16 2165; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2166; GFX9-NEXT: v_mul_lo_u16_e32 v4, v4, v11 2167; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6 2168; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5 2169; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s13 2170; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12 2171; GFX9-NEXT: s_lshr_b32 s7, s3, 28 2172; GFX9-NEXT: s_lshr_b32 s8, s3, 24 2173; GFX9-NEXT: s_lshr_b32 s14, s4, 28 2174; GFX9-NEXT: s_lshr_b32 s15, s4, 24 2175; GFX9-NEXT: v_and_b32_e32 v2, s2, v2 2176; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2177; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8 2178; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7 2179; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15 2180; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s14 2181; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 2182; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 2183; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 2184; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 2185; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 2186; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2187; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 2188; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2189; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 2190; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2191; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v16 2192; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v4 2193; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2194; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2195; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14 2196; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2197; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 2198; GFX9-NEXT: 
v_or_b32_e32 v5, v3, v7 2199; GFX9-NEXT: s_waitcnt vmcnt(0) 2200; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 2201; GFX9-NEXT: v_add_u32_e32 v1, v1, v6 2202; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 2203; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2204; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 2205; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v5 2206; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 2207; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2208; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2209; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 2210; GFX9-NEXT: s_endpgm 2211; 2212; GFX9-DL-LABEL: idot8_acc8_vecMul: 2213; GFX9-DL: ; %bb.0: ; %entry 2214; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2215; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2216; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2217; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2218; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2219; GFX9-DL-NEXT: s_mov_b32 s22, -1 2220; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2221; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] 2222; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 2223; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 2224; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 2225; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 2226; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 2227; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff 2228; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2229; GFX9-DL-NEXT: s_lshr_b32 s9, s3, 4 2230; GFX9-DL-NEXT: s_lshr_b32 s16, s4, 4 2231; GFX9-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s3 2232; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 2233; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 2234; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 2235; GFX9-DL-NEXT: s_lshr_b32 s10, s3, 12 2236; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 8 2237; GFX9-DL-NEXT: s_lshr_b32 s17, s4, 12 2238; 
GFX9-DL-NEXT: s_lshr_b32 s18, s4, 8 2239; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11 2240; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 2241; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18 2242; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s17 2243; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2244; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 2245; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2246; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2247; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 2248; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2249; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2250; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 2251; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, v2, v3 2252; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2253; GFX9-DL-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2254; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 20 2255; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 2256; GFX9-DL-NEXT: s_lshr_b32 s12, s4, 20 2257; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 16 2258; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2259; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, v4, v11 2260; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 2261; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 2262; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 2263; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12 2264; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 28 2265; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 24 2266; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 2267; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 24 2268; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2 2269; GFX9-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2270; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 2271; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 2272; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 2273; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 2274; GFX9-DL-NEXT: 
v_or_b32_e32 v4, v2, v4 2275; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 2276; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 2277; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 2278; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 2279; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2280; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 2281; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2282; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 2283; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2284; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v16 2285; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v4 2286; GFX9-DL-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2287; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2288; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14 2289; GFX9-DL-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2290; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 2291; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v7 2292; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2293; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 2294; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v6 2295; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 2296; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2297; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 2298; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v5 2299; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 2300; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2301; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2302; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 2303; GFX9-DL-NEXT: s_endpgm 2304; 2305; GFX10-DL-LABEL: idot8_acc8_vecMul: 2306; GFX10-DL: ; %bb.0: ; 
%entry 2307; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 2308; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2309; GFX10-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2310; GFX10-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2311; GFX10-DL-NEXT: s_mov_b32 s22, -1 2312; GFX10-DL-NEXT: s_mov_b32 s23, 0x31c16000 2313; GFX10-DL-NEXT: s_add_u32 s20, s20, s3 2314; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2315; GFX10-DL-NEXT: s_addc_u32 s21, s21, 0 2316; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2317; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] 2318; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 2319; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 2320; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff 2321; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2322; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4 2323; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 4 2324; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 2325; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16 2326; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 12 2327; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 12 2328; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s0 2329; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1 2330; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s17 2331; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 2332; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6 2333; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 2334; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 8 2335; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 8 2336; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11 2337; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18 2338; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 2339; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 2340; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v5 2341; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v12 2342; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13 2343; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 2344; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11 2345; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v2, v3 2346; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v6 2347; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v19, 
v13 2348; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20 2349; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 16 2350; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 28 2351; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 24 2352; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 20 2353; GFX10-DL-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2354; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 2355; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 2356; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 2357; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s3 2358; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s12 2359; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v11 2360; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 2361; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 16 2362; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 28 2363; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s13 2364; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v7 2365; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v8 2366; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 2367; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2368; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 2369; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 2370; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v10 2371; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v12 2372; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 24 2373; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6 2374; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v15 2375; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 2376; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3 2377; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9 2378; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v8, v6 2379; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v10 2380; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v14 2381; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v3 2382; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2383; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 2384; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v4 2385; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v5, v11 2386; GFX10-DL-NEXT: 
v_lshlrev_b16_e64 v5, 8, v7 2387; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v8 2388; GFX10-DL-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2389; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2390; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 2391; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 2392; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2393; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v4 2394; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 2395; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v3 2396; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 2397; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2398; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2399; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] 2400; GFX10-DL-NEXT: s_endpgm 2401 <8 x i4> addrspace(1)* %src2, 2402 i8 addrspace(1)* nocapture %dst) { 2403entry: 2404 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 2405 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 2406 2407 %cvec1 = sext <8 x i4> %vec1 to <8 x i8> 2408 %cvec2 = sext <8 x i4> %vec2 to <8 x i8> 2409 2410 %mul = mul <8 x i8> %cvec1, %cvec2 2411 %mul0 = extractelement <8 x i8> %mul, i64 0 2412 %mul1 = extractelement <8 x i8> %mul, i64 1 2413 %mul2 = extractelement <8 x i8> %mul, i64 2 2414 %mul3 = extractelement <8 x i8> %mul, i64 3 2415 %mul4 = extractelement <8 x i8> %mul, i64 4 2416 %mul5 = extractelement <8 x i8> %mul, i64 5 2417 %mul6 = extractelement <8 x i8> %mul, i64 6 2418 %mul7 = extractelement <8 x i8> %mul, i64 7 2419 2420 %acc = load i8, i8 addrspace(1)* %dst, align 4 2421 %add1 = add i8 %mul0, %acc 2422 %add2 = add i8 %add1, %mul1 2423 %add3 = add i8 %add2, %mul2 2424 %add4 = 
add i8 %add3, %mul3 2425 %add5 = add i8 %add4, %mul4 2426 %add6 = add i8 %add5, %mul5 2427 %add7 = add i8 %add6, %mul6 2428 %add8 = add i8 %add7, %mul7 2429 2430 store i8 %add8, i8 addrspace(1)* %dst, align 4 2431 ret void 2432} 2433